1028ba5dfSTristan Ye /* -*- mode: c; c-basic-offset: 8; -*- 2028ba5dfSTristan Ye * vim: noexpandtab sw=8 ts=8 sts=0: 3028ba5dfSTristan Ye * 4028ba5dfSTristan Ye * move_extents.c 5028ba5dfSTristan Ye * 6028ba5dfSTristan Ye * Copyright (C) 2011 Oracle. All rights reserved. 7028ba5dfSTristan Ye * 8028ba5dfSTristan Ye * This program is free software; you can redistribute it and/or 9028ba5dfSTristan Ye * modify it under the terms of the GNU General Public 10028ba5dfSTristan Ye * License version 2 as published by the Free Software Foundation. 11028ba5dfSTristan Ye * 12028ba5dfSTristan Ye * This program is distributed in the hope that it will be useful, 13028ba5dfSTristan Ye * but WITHOUT ANY WARRANTY; without even the implied warranty of 14028ba5dfSTristan Ye * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15028ba5dfSTristan Ye * General Public License for more details. 16028ba5dfSTristan Ye */ 17028ba5dfSTristan Ye #include <linux/fs.h> 18028ba5dfSTristan Ye #include <linux/types.h> 19028ba5dfSTristan Ye #include <linux/mount.h> 20028ba5dfSTristan Ye #include <linux/swap.h> 21028ba5dfSTristan Ye 22028ba5dfSTristan Ye #include <cluster/masklog.h> 23028ba5dfSTristan Ye 24028ba5dfSTristan Ye #include "ocfs2.h" 25028ba5dfSTristan Ye #include "ocfs2_ioctl.h" 26028ba5dfSTristan Ye 27028ba5dfSTristan Ye #include "alloc.h" 28028ba5dfSTristan Ye #include "aops.h" 29028ba5dfSTristan Ye #include "dlmglue.h" 30028ba5dfSTristan Ye #include "extent_map.h" 31028ba5dfSTristan Ye #include "inode.h" 32028ba5dfSTristan Ye #include "journal.h" 33028ba5dfSTristan Ye #include "suballoc.h" 34028ba5dfSTristan Ye #include "uptodate.h" 35028ba5dfSTristan Ye #include "super.h" 36028ba5dfSTristan Ye #include "dir.h" 37028ba5dfSTristan Ye #include "buffer_head_io.h" 38028ba5dfSTristan Ye #include "sysfile.h" 39028ba5dfSTristan Ye #include "suballoc.h" 40028ba5dfSTristan Ye #include "refcounttree.h" 41028ba5dfSTristan Ye #include "move_extents.h" 42028ba5dfSTristan Ye 43028ba5dfSTristan Ye struct ocfs2_move_extents_context { 44028ba5dfSTristan Ye struct inode *inode; 45028ba5dfSTristan Ye struct file *file; 46028ba5dfSTristan Ye int auto_defrag; 474dfa66bdSTristan Ye int partial; 48028ba5dfSTristan Ye int credits; 49028ba5dfSTristan Ye u32 new_phys_cpos; 50028ba5dfSTristan Ye u32 clusters_moved; 51028ba5dfSTristan Ye u64 refcount_loc; 52028ba5dfSTristan Ye struct ocfs2_move_extents *range; 53028ba5dfSTristan Ye struct ocfs2_extent_tree et; 54028ba5dfSTristan Ye struct ocfs2_alloc_context *meta_ac; 55028ba5dfSTristan Ye struct ocfs2_alloc_context *data_ac; 56028ba5dfSTristan Ye struct ocfs2_cached_dealloc_ctxt dealloc; 57028ba5dfSTristan Ye }; 58de474ee8STristan Ye 598f603e56STristan Ye static int __ocfs2_move_extent(handle_t *handle, 608f603e56STristan Ye struct ocfs2_move_extents_context *context, 618f603e56STristan Ye u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, 628f603e56STristan Ye int ext_flags) 638f603e56STristan Ye { 648f603e56STristan Ye int ret = 0, index; 658f603e56STristan Ye struct inode *inode = context->inode; 668f603e56STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 678f603e56STristan Ye struct ocfs2_extent_rec *rec, replace_rec; 688f603e56STristan Ye struct ocfs2_path *path = NULL; 698f603e56STristan Ye struct ocfs2_extent_list *el; 708f603e56STristan Ye u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 718f603e56STristan Ye u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 728f603e56STristan Ye 738f603e56STristan Ye ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, 748f603e56STristan Ye p_cpos, new_p_cpos, len); 758f603e56STristan Ye if (ret) { 768f603e56STristan Ye mlog_errno(ret); 778f603e56STristan Ye goto out; 788f603e56STristan Ye } 798f603e56STristan Ye 808f603e56STristan Ye memset(&replace_rec, 0, sizeof(replace_rec)); 818f603e56STristan Ye replace_rec.e_cpos = cpu_to_le32(cpos); 828f603e56STristan Ye replace_rec.e_leaf_clusters = cpu_to_le16(len); 838f603e56STristan Ye replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 848f603e56STristan Ye new_p_cpos)); 858f603e56STristan Ye 868f603e56STristan Ye path = ocfs2_new_path_from_et(&context->et); 878f603e56STristan Ye if (!path) { 888f603e56STristan Ye ret = -ENOMEM; 898f603e56STristan Ye mlog_errno(ret); 908f603e56STristan Ye goto out; 918f603e56STristan Ye } 928f603e56STristan Ye 938f603e56STristan Ye ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); 948f603e56STristan Ye if (ret) { 958f603e56STristan Ye mlog_errno(ret); 968f603e56STristan Ye goto out; 978f603e56STristan Ye } 988f603e56STristan Ye 998f603e56STristan Ye el = path_leaf_el(path); 1008f603e56STristan Ye 1018f603e56STristan Ye index = ocfs2_search_extent_list(el, cpos); 1028f603e56STristan Ye if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 1038f603e56STristan Ye ocfs2_error(inode->i_sb, 1048f603e56STristan Ye "Inode %llu has an extent at cpos %u which can no " 1058f603e56STristan Ye "longer be found.\n", 1068f603e56STristan Ye (unsigned long long)ino, cpos); 1078f603e56STristan Ye ret = -EROFS; 1088f603e56STristan Ye goto out; 1098f603e56STristan Ye } 1108f603e56STristan Ye 1118f603e56STristan Ye rec = &el->l_recs[index]; 1128f603e56STristan Ye 1138f603e56STristan Ye BUG_ON(ext_flags != rec->e_flags); 1148f603e56STristan Ye /* 1158f603e56STristan Ye * after moving/defraging to new location, the extent is not going 1168f603e56STristan Ye * to be refcounted anymore. 1178f603e56STristan Ye */ 1188f603e56STristan Ye replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; 1198f603e56STristan Ye 1208f603e56STristan Ye ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 1218f603e56STristan Ye context->et.et_root_bh, 1228f603e56STristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 1238f603e56STristan Ye if (ret) { 1248f603e56STristan Ye mlog_errno(ret); 1258f603e56STristan Ye goto out; 1268f603e56STristan Ye } 1278f603e56STristan Ye 1288f603e56STristan Ye ret = ocfs2_split_extent(handle, &context->et, path, index, 1298f603e56STristan Ye &replace_rec, context->meta_ac, 1308f603e56STristan Ye &context->dealloc); 1318f603e56STristan Ye if (ret) { 1328f603e56STristan Ye mlog_errno(ret); 1338f603e56STristan Ye goto out; 1348f603e56STristan Ye } 1358f603e56STristan Ye 1368f603e56STristan Ye ocfs2_journal_dirty(handle, context->et.et_root_bh); 1378f603e56STristan Ye 1388f603e56STristan Ye context->new_phys_cpos = new_p_cpos; 1398f603e56STristan Ye 1408f603e56STristan Ye /* 1418f603e56STristan Ye * need I to append truncate log for old clusters? 1428f603e56STristan Ye */ 1438f603e56STristan Ye if (old_blkno) { 1448f603e56STristan Ye if (ext_flags & OCFS2_EXT_REFCOUNTED) 1458f603e56STristan Ye ret = ocfs2_decrease_refcount(inode, handle, 1468f603e56STristan Ye ocfs2_blocks_to_clusters(osb->sb, 1478f603e56STristan Ye old_blkno), 1488f603e56STristan Ye len, context->meta_ac, 1498f603e56STristan Ye &context->dealloc, 1); 1508f603e56STristan Ye else 1518f603e56STristan Ye ret = ocfs2_truncate_log_append(osb, handle, 1528f603e56STristan Ye old_blkno, len); 1538f603e56STristan Ye } 1548f603e56STristan Ye 1558f603e56STristan Ye out: 1568f603e56STristan Ye return ret; 1578f603e56STristan Ye } 1588f603e56STristan Ye 159de474ee8STristan Ye /* 160de474ee8STristan Ye * lock allocators, and reserving appropriate number of bits for 161de474ee8STristan Ye * meta blocks and data clusters. 162de474ee8STristan Ye * 163de474ee8STristan Ye * in some cases, we don't need to reserve clusters, just let data_ac 164de474ee8STristan Ye * be NULL. 165de474ee8STristan Ye */ 166de474ee8STristan Ye static int ocfs2_lock_allocators_move_extents(struct inode *inode, 167de474ee8STristan Ye struct ocfs2_extent_tree *et, 168de474ee8STristan Ye u32 clusters_to_move, 169de474ee8STristan Ye u32 extents_to_split, 170de474ee8STristan Ye struct ocfs2_alloc_context **meta_ac, 171de474ee8STristan Ye struct ocfs2_alloc_context **data_ac, 172de474ee8STristan Ye int extra_blocks, 173de474ee8STristan Ye int *credits) 174de474ee8STristan Ye { 175de474ee8STristan Ye int ret, num_free_extents; 176de474ee8STristan Ye unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 177de474ee8STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 178de474ee8STristan Ye 179de474ee8STristan Ye num_free_extents = ocfs2_num_free_extents(osb, et); 180de474ee8STristan Ye if (num_free_extents < 0) { 181de474ee8STristan Ye ret = num_free_extents; 182de474ee8STristan Ye mlog_errno(ret); 183de474ee8STristan Ye goto out; 184de474ee8STristan Ye } 185de474ee8STristan Ye 186de474ee8STristan Ye if (!num_free_extents || 187de474ee8STristan Ye (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) 188de474ee8STristan Ye extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); 189de474ee8STristan Ye 190de474ee8STristan Ye ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); 191de474ee8STristan Ye if (ret) { 192de474ee8STristan Ye mlog_errno(ret); 193de474ee8STristan Ye goto out; 194de474ee8STristan Ye } 195de474ee8STristan Ye 196de474ee8STristan Ye if (data_ac) { 197de474ee8STristan Ye ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); 198de474ee8STristan Ye if (ret) { 199de474ee8STristan Ye mlog_errno(ret); 200de474ee8STristan Ye goto out; 201de474ee8STristan Ye } 202de474ee8STristan Ye } 203de474ee8STristan Ye 204de474ee8STristan Ye *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, 205de474ee8STristan Ye clusters_to_move + 2); 206de474ee8STristan Ye 207de474ee8STristan Ye mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", 208de474ee8STristan Ye extra_blocks, clusters_to_move, *credits); 209de474ee8STristan Ye out: 210de474ee8STristan Ye if (ret) { 211de474ee8STristan Ye if (*meta_ac) { 212de474ee8STristan Ye ocfs2_free_alloc_context(*meta_ac); 213de474ee8STristan Ye *meta_ac = NULL; 214de474ee8STristan Ye } 215de474ee8STristan Ye } 216de474ee8STristan Ye 217de474ee8STristan Ye return ret; 218de474ee8STristan Ye } 219202ee5faSTristan Ye 220202ee5faSTristan Ye /* 221202ee5faSTristan Ye * Using one journal handle to guarantee the data consistency in case 222202ee5faSTristan Ye * crash happens anywhere. 223dda54e76STristan Ye * 224dda54e76STristan Ye * XXX: defrag can end up with finishing partial extent as requested, 225dda54e76STristan Ye * due to not enough contiguous clusters can be found in allocator. 226202ee5faSTristan Ye */ 227202ee5faSTristan Ye static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, 2284dfa66bdSTristan Ye u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) 229202ee5faSTristan Ye { 2304dfa66bdSTristan Ye int ret, credits = 0, extra_blocks = 0, partial = context->partial; 231202ee5faSTristan Ye handle_t *handle; 232202ee5faSTristan Ye struct inode *inode = context->inode; 233202ee5faSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 234202ee5faSTristan Ye struct inode *tl_inode = osb->osb_tl_inode; 235202ee5faSTristan Ye struct ocfs2_refcount_tree *ref_tree = NULL; 236202ee5faSTristan Ye u32 new_phys_cpos, new_len; 237202ee5faSTristan Ye u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 238202ee5faSTristan Ye 2394dfa66bdSTristan Ye if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 240202ee5faSTristan Ye 241202ee5faSTristan Ye BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 242202ee5faSTristan Ye OCFS2_HAS_REFCOUNT_FL)); 243202ee5faSTristan Ye 244202ee5faSTristan Ye BUG_ON(!context->refcount_loc); 245202ee5faSTristan Ye 246202ee5faSTristan Ye ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 247202ee5faSTristan Ye &ref_tree, NULL); 248202ee5faSTristan Ye if (ret) { 249202ee5faSTristan Ye mlog_errno(ret); 250202ee5faSTristan Ye return ret; 251202ee5faSTristan Ye } 252202ee5faSTristan Ye 253202ee5faSTristan Ye ret = ocfs2_prepare_refcount_change_for_del(inode, 254202ee5faSTristan Ye context->refcount_loc, 255202ee5faSTristan Ye phys_blkno, 2564dfa66bdSTristan Ye *len, 257202ee5faSTristan Ye &credits, 258202ee5faSTristan Ye &extra_blocks); 259202ee5faSTristan Ye if (ret) { 260202ee5faSTristan Ye mlog_errno(ret); 261202ee5faSTristan Ye goto out; 262202ee5faSTristan Ye } 263202ee5faSTristan Ye } 264202ee5faSTristan Ye 2654dfa66bdSTristan Ye ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, 266202ee5faSTristan Ye &context->meta_ac, 267202ee5faSTristan Ye &context->data_ac, 268202ee5faSTristan Ye extra_blocks, &credits); 269202ee5faSTristan Ye if (ret) { 270202ee5faSTristan Ye mlog_errno(ret); 271202ee5faSTristan Ye goto out; 272202ee5faSTristan Ye } 273202ee5faSTristan Ye 274202ee5faSTristan Ye /* 275202ee5faSTristan Ye * should be using allocation reservation strategy there? 276202ee5faSTristan Ye * 277202ee5faSTristan Ye * if (context->data_ac) 278202ee5faSTristan Ye * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; 279202ee5faSTristan Ye */ 280202ee5faSTristan Ye 281202ee5faSTristan Ye mutex_lock(&tl_inode->i_mutex); 282202ee5faSTristan Ye 283202ee5faSTristan Ye if (ocfs2_truncate_log_needs_flush(osb)) { 284202ee5faSTristan Ye ret = __ocfs2_flush_truncate_log(osb); 285202ee5faSTristan Ye if (ret < 0) { 286202ee5faSTristan Ye mlog_errno(ret); 287202ee5faSTristan Ye goto out_unlock_mutex; 288202ee5faSTristan Ye } 289202ee5faSTristan Ye } 290202ee5faSTristan Ye 291202ee5faSTristan Ye handle = ocfs2_start_trans(osb, credits); 292202ee5faSTristan Ye if (IS_ERR(handle)) { 293202ee5faSTristan Ye ret = PTR_ERR(handle); 294202ee5faSTristan Ye mlog_errno(ret); 295202ee5faSTristan Ye goto out_unlock_mutex; 296202ee5faSTristan Ye } 297202ee5faSTristan Ye 2984dfa66bdSTristan Ye ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, 299202ee5faSTristan Ye &new_phys_cpos, &new_len); 300202ee5faSTristan Ye if (ret) { 301202ee5faSTristan Ye mlog_errno(ret); 302202ee5faSTristan Ye goto out_commit; 303202ee5faSTristan Ye } 304202ee5faSTristan Ye 305202ee5faSTristan Ye /* 3064dfa66bdSTristan Ye * allowing partial extent moving is kind of 'pros and cons', it makes 3074dfa66bdSTristan Ye * whole defragmentation less likely to fail, on the contrary, the bad 3084dfa66bdSTristan Ye * thing is it may make the fs even more fragmented after moving, let 3094dfa66bdSTristan Ye * userspace make a good decision here. 310202ee5faSTristan Ye */ 3114dfa66bdSTristan Ye if (new_len != *len) { 3124dfa66bdSTristan Ye mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); 3134dfa66bdSTristan Ye if (!partial) { 314202ee5faSTristan Ye context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 315202ee5faSTristan Ye ret = -ENOSPC; 316202ee5faSTristan Ye goto out_commit; 317202ee5faSTristan Ye } 3184dfa66bdSTristan Ye } 319202ee5faSTristan Ye 320202ee5faSTristan Ye mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, 321202ee5faSTristan Ye phys_cpos, new_phys_cpos); 322202ee5faSTristan Ye 3234dfa66bdSTristan Ye ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, 324202ee5faSTristan Ye new_phys_cpos, ext_flags); 325202ee5faSTristan Ye if (ret) 326202ee5faSTristan Ye mlog_errno(ret); 327202ee5faSTristan Ye 3284dfa66bdSTristan Ye if (partial && (new_len != *len)) 3294dfa66bdSTristan Ye *len = new_len; 3304dfa66bdSTristan Ye 331202ee5faSTristan Ye /* 332202ee5faSTristan Ye * Here we should write the new page out first if we are 333202ee5faSTristan Ye * in write-back mode. 334202ee5faSTristan Ye */ 3354dfa66bdSTristan Ye ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); 336202ee5faSTristan Ye if (ret) 337202ee5faSTristan Ye mlog_errno(ret); 338202ee5faSTristan Ye 339202ee5faSTristan Ye out_commit: 340202ee5faSTristan Ye ocfs2_commit_trans(osb, handle); 341202ee5faSTristan Ye 342202ee5faSTristan Ye out_unlock_mutex: 343202ee5faSTristan Ye mutex_unlock(&tl_inode->i_mutex); 344202ee5faSTristan Ye 345202ee5faSTristan Ye if (context->data_ac) { 346202ee5faSTristan Ye ocfs2_free_alloc_context(context->data_ac); 347202ee5faSTristan Ye context->data_ac = NULL; 348202ee5faSTristan Ye } 349202ee5faSTristan Ye 350202ee5faSTristan Ye if (context->meta_ac) { 351202ee5faSTristan Ye ocfs2_free_alloc_context(context->meta_ac); 352202ee5faSTristan Ye context->meta_ac = NULL; 353202ee5faSTristan Ye } 354202ee5faSTristan Ye 355202ee5faSTristan Ye out: 356202ee5faSTristan Ye if (ref_tree) 357202ee5faSTristan Ye ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 358202ee5faSTristan Ye 359202ee5faSTristan Ye return ret; 360202ee5faSTristan Ye } 3611c06b912STristan Ye 3621c06b912STristan Ye /* 3631c06b912STristan Ye * find the victim alloc group, where #blkno fits. 3641c06b912STristan Ye */ 3651c06b912STristan Ye static int ocfs2_find_victim_alloc_group(struct inode *inode, 3661c06b912STristan Ye u64 vict_blkno, 3671c06b912STristan Ye int type, int slot, 3681c06b912STristan Ye int *vict_bit, 3691c06b912STristan Ye struct buffer_head **ret_bh) 3701c06b912STristan Ye { 3716aea6f50STristan Ye int ret, i, bits_per_unit = 0; 3721c06b912STristan Ye u64 blkno; 3731c06b912STristan Ye char namebuf[40]; 3741c06b912STristan Ye 3751c06b912STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3761c06b912STristan Ye struct buffer_head *ac_bh = NULL, *gd_bh = NULL; 3771c06b912STristan Ye struct ocfs2_chain_list *cl; 3781c06b912STristan Ye struct ocfs2_chain_rec *rec; 3791c06b912STristan Ye struct ocfs2_dinode *ac_dinode; 3801c06b912STristan Ye struct ocfs2_group_desc *bg; 3811c06b912STristan Ye 3821c06b912STristan Ye ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); 3831c06b912STristan Ye ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, 3841c06b912STristan Ye strlen(namebuf), &blkno); 3851c06b912STristan Ye if (ret) { 3861c06b912STristan Ye ret = -ENOENT; 3871c06b912STristan Ye goto out; 3881c06b912STristan Ye } 3891c06b912STristan Ye 3901c06b912STristan Ye ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); 3911c06b912STristan Ye if (ret) { 3921c06b912STristan Ye mlog_errno(ret); 3931c06b912STristan Ye goto out; 3941c06b912STristan Ye } 3951c06b912STristan Ye 3961c06b912STristan Ye ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; 3971c06b912STristan Ye cl = &(ac_dinode->id2.i_chain); 3981c06b912STristan Ye rec = &(cl->cl_recs[0]); 3991c06b912STristan Ye 4001c06b912STristan Ye if (type == GLOBAL_BITMAP_SYSTEM_INODE) 4016aea6f50STristan Ye bits_per_unit = osb->s_clustersize_bits - 4026aea6f50STristan Ye inode->i_sb->s_blocksize_bits; 4031c06b912STristan Ye /* 4041c06b912STristan Ye * 'vict_blkno' was out of the valid range. 4051c06b912STristan Ye */ 4061c06b912STristan Ye if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 4076aea6f50STristan Ye (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 4086aea6f50STristan Ye bits_per_unit))) { 4091c06b912STristan Ye ret = -EINVAL; 4101c06b912STristan Ye goto out; 4111c06b912STristan Ye } 4121c06b912STristan Ye 4131c06b912STristan Ye for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { 4141c06b912STristan Ye 4151c06b912STristan Ye rec = &(cl->cl_recs[i]); 4161c06b912STristan Ye if (!rec) 4171c06b912STristan Ye continue; 4181c06b912STristan Ye 4191c06b912STristan Ye bg = NULL; 4201c06b912STristan Ye 4211c06b912STristan Ye do { 4221c06b912STristan Ye if (!bg) 4231c06b912STristan Ye blkno = le64_to_cpu(rec->c_blkno); 4241c06b912STristan Ye else 4251c06b912STristan Ye blkno = le64_to_cpu(bg->bg_next_group); 4261c06b912STristan Ye 4271c06b912STristan Ye if (gd_bh) { 4281c06b912STristan Ye brelse(gd_bh); 4291c06b912STristan Ye gd_bh = NULL; 4301c06b912STristan Ye } 4311c06b912STristan Ye 4321c06b912STristan Ye ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); 4331c06b912STristan Ye if (ret) { 4341c06b912STristan Ye mlog_errno(ret); 4351c06b912STristan Ye goto out; 4361c06b912STristan Ye } 4371c06b912STristan Ye 4381c06b912STristan Ye bg = (struct ocfs2_group_desc *)gd_bh->b_data; 4391c06b912STristan Ye 4401c06b912STristan Ye if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + 4411c06b912STristan Ye le16_to_cpu(bg->bg_bits))) { 4421c06b912STristan Ye 4431c06b912STristan Ye *ret_bh = gd_bh; 4446aea6f50STristan Ye *vict_bit = (vict_blkno - blkno) >> 4456aea6f50STristan Ye bits_per_unit; 4461c06b912STristan Ye mlog(0, "find the victim group: #%llu, " 4471c06b912STristan Ye "total_bits: %u, vict_bit: %u\n", 4481c06b912STristan Ye blkno, le16_to_cpu(bg->bg_bits), 4491c06b912STristan Ye *vict_bit); 4501c06b912STristan Ye goto out; 4511c06b912STristan Ye } 4521c06b912STristan Ye 4531c06b912STristan Ye } while (le64_to_cpu(bg->bg_next_group)); 4541c06b912STristan Ye } 4551c06b912STristan Ye 4561c06b912STristan Ye ret = -EINVAL; 4571c06b912STristan Ye out: 4581c06b912STristan Ye brelse(ac_bh); 4591c06b912STristan Ye 4601c06b912STristan Ye /* 4611c06b912STristan Ye * caller has to release the gd_bh properly. 4621c06b912STristan Ye */ 4631c06b912STristan Ye return ret; 4641c06b912STristan Ye } 46599e4c750STristan Ye 46699e4c750STristan Ye /* 46799e4c750STristan Ye * XXX: helper to validate and adjust moving goal. 46899e4c750STristan Ye */ 46999e4c750STristan Ye static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, 47099e4c750STristan Ye struct ocfs2_move_extents *range) 47199e4c750STristan Ye { 47299e4c750STristan Ye int ret, goal_bit = 0; 47399e4c750STristan Ye 47499e4c750STristan Ye struct buffer_head *gd_bh = NULL; 475ea5e1675STristan Ye struct ocfs2_group_desc *bg = NULL; 47699e4c750STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 47799e4c750STristan Ye int c_to_b = 1 << (osb->s_clustersize_bits - 47899e4c750STristan Ye inode->i_sb->s_blocksize_bits); 47999e4c750STristan Ye 48099e4c750STristan Ye /* 481ea5e1675STristan Ye * make goal become cluster aligned. 482ea5e1675STristan Ye */ 483ea5e1675STristan Ye range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, 484ea5e1675STristan Ye range->me_goal); 485ea5e1675STristan Ye /* 486ea5e1675STristan Ye * moving goal is not allowd to start with a group desc blok(#0 blk) 487ea5e1675STristan Ye * let's compromise to the latter cluster. 488ea5e1675STristan Ye */ 489ea5e1675STristan Ye if (range->me_goal == le64_to_cpu(bg->bg_blkno)) 490ea5e1675STristan Ye range->me_goal += c_to_b; 491ea5e1675STristan Ye 492ea5e1675STristan Ye /* 49399e4c750STristan Ye * validate goal sits within global_bitmap, and return the victim 49499e4c750STristan Ye * group desc 49599e4c750STristan Ye */ 49699e4c750STristan Ye ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, 49799e4c750STristan Ye GLOBAL_BITMAP_SYSTEM_INODE, 49899e4c750STristan Ye OCFS2_INVALID_SLOT, 49999e4c750STristan Ye &goal_bit, &gd_bh); 50099e4c750STristan Ye if (ret) 50199e4c750STristan Ye goto out; 50299e4c750STristan Ye 50399e4c750STristan Ye bg = (struct ocfs2_group_desc *)gd_bh->b_data; 50499e4c750STristan Ye 50599e4c750STristan Ye /* 50699e4c750STristan Ye * movement is not gonna cross two groups. 50799e4c750STristan Ye */ 50899e4c750STristan Ye if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < 50999e4c750STristan Ye range->me_len) { 51099e4c750STristan Ye ret = -EINVAL; 51199e4c750STristan Ye goto out; 51299e4c750STristan Ye } 51399e4c750STristan Ye /* 51499e4c750STristan Ye * more exact validations/adjustments will be performed later during 51599e4c750STristan Ye * moving operation for each extent range. 51699e4c750STristan Ye */ 51799e4c750STristan Ye mlog(0, "extents get ready to be moved to #%llu block\n", 51899e4c750STristan Ye range->me_goal); 51999e4c750STristan Ye 52099e4c750STristan Ye out: 52199e4c750STristan Ye brelse(gd_bh); 52299e4c750STristan Ye 52399e4c750STristan Ye return ret; 52499e4c750STristan Ye } 525e6b5859cSTristan Ye 526e6b5859cSTristan Ye static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, 527e6b5859cSTristan Ye int *goal_bit, u32 move_len, u32 max_hop, 528e6b5859cSTristan Ye u32 *phys_cpos) 529e6b5859cSTristan Ye { 530e6b5859cSTristan Ye int i, used, last_free_bits = 0, base_bit = *goal_bit; 531e6b5859cSTristan Ye struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 532e6b5859cSTristan Ye u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 533e6b5859cSTristan Ye le64_to_cpu(gd->bg_blkno)); 534e6b5859cSTristan Ye 535e6b5859cSTristan Ye for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { 536e6b5859cSTristan Ye 537e6b5859cSTristan Ye used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); 538e6b5859cSTristan Ye if (used) { 539e6b5859cSTristan Ye /* 540e6b5859cSTristan Ye * we even tried searching the free chunk by jumping 541e6b5859cSTristan Ye * a 'max_hop' distance, but still failed. 542e6b5859cSTristan Ye */ 543e6b5859cSTristan Ye if ((i - base_bit) > max_hop) { 544e6b5859cSTristan Ye *phys_cpos = 0; 545e6b5859cSTristan Ye break; 546e6b5859cSTristan Ye } 547e6b5859cSTristan Ye 548e6b5859cSTristan Ye if (last_free_bits) 549e6b5859cSTristan Ye last_free_bits = 0; 550e6b5859cSTristan Ye 551e6b5859cSTristan Ye continue; 552e6b5859cSTristan Ye } else 553e6b5859cSTristan Ye last_free_bits++; 554e6b5859cSTristan Ye 555e6b5859cSTristan Ye if (last_free_bits == move_len) { 556e6b5859cSTristan Ye *goal_bit = i; 557e6b5859cSTristan Ye *phys_cpos = base_cpos + i; 558e6b5859cSTristan Ye break; 559e6b5859cSTristan Ye } 560e6b5859cSTristan Ye } 561e6b5859cSTristan Ye 562e6b5859cSTristan Ye mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 563e6b5859cSTristan Ye } 5648473aa8aSTristan Ye 5658473aa8aSTristan Ye static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 5668473aa8aSTristan Ye handle_t *handle, 5678473aa8aSTristan Ye struct buffer_head *di_bh, 5688473aa8aSTristan Ye u32 num_bits, 5698473aa8aSTristan Ye u16 chain) 5708473aa8aSTristan Ye { 5718473aa8aSTristan Ye int ret; 5728473aa8aSTristan Ye u32 tmp_used; 5738473aa8aSTristan Ye struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 5748473aa8aSTristan Ye struct ocfs2_chain_list *cl = 5758473aa8aSTristan Ye (struct ocfs2_chain_list *) &di->id2.i_chain; 5768473aa8aSTristan Ye 5778473aa8aSTristan Ye ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 5788473aa8aSTristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 5798473aa8aSTristan Ye if (ret < 0) { 5808473aa8aSTristan Ye mlog_errno(ret); 5818473aa8aSTristan Ye goto out; 5828473aa8aSTristan Ye } 5838473aa8aSTristan Ye 5848473aa8aSTristan Ye tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 5858473aa8aSTristan Ye di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 5868473aa8aSTristan Ye le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 5878473aa8aSTristan Ye ocfs2_journal_dirty(handle, di_bh); 5888473aa8aSTristan Ye 5898473aa8aSTristan Ye out: 5908473aa8aSTristan Ye return ret; 5918473aa8aSTristan Ye } 5928473aa8aSTristan Ye 5938473aa8aSTristan Ye static inline int ocfs2_block_group_set_bits(handle_t *handle, 5948473aa8aSTristan Ye struct inode *alloc_inode, 5958473aa8aSTristan Ye struct ocfs2_group_desc *bg, 5968473aa8aSTristan Ye struct buffer_head *group_bh, 5978473aa8aSTristan Ye unsigned int bit_off, 5988473aa8aSTristan Ye unsigned int num_bits) 5998473aa8aSTristan Ye { 6008473aa8aSTristan Ye int status; 6018473aa8aSTristan Ye void *bitmap = bg->bg_bitmap; 6028473aa8aSTristan Ye int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 6038473aa8aSTristan Ye 6048473aa8aSTristan Ye /* All callers get the descriptor via 6058473aa8aSTristan Ye * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 6068473aa8aSTristan Ye BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 6078473aa8aSTristan Ye BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 6088473aa8aSTristan Ye 6098473aa8aSTristan Ye mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 6108473aa8aSTristan Ye num_bits); 6118473aa8aSTristan Ye 6128473aa8aSTristan Ye if (ocfs2_is_cluster_bitmap(alloc_inode)) 6138473aa8aSTristan Ye journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 6148473aa8aSTristan Ye 6158473aa8aSTristan Ye status = ocfs2_journal_access_gd(handle, 6168473aa8aSTristan Ye INODE_CACHE(alloc_inode), 6178473aa8aSTristan Ye group_bh, 6188473aa8aSTristan Ye journal_type); 6198473aa8aSTristan Ye if (status < 0) { 6208473aa8aSTristan Ye mlog_errno(status); 6218473aa8aSTristan Ye goto bail; 6228473aa8aSTristan Ye } 6238473aa8aSTristan Ye 6248473aa8aSTristan Ye le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 6258473aa8aSTristan Ye if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 6268473aa8aSTristan Ye ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 6278473aa8aSTristan Ye " count %u but claims %u are freed. num_bits %d", 6288473aa8aSTristan Ye (unsigned long long)le64_to_cpu(bg->bg_blkno), 6298473aa8aSTristan Ye le16_to_cpu(bg->bg_bits), 6308473aa8aSTristan Ye le16_to_cpu(bg->bg_free_bits_count), num_bits); 6318473aa8aSTristan Ye return -EROFS; 6328473aa8aSTristan Ye } 6338473aa8aSTristan Ye while (num_bits--) 6348473aa8aSTristan Ye ocfs2_set_bit(bit_off++, bitmap); 6358473aa8aSTristan Ye 6368473aa8aSTristan Ye ocfs2_journal_dirty(handle, group_bh); 6378473aa8aSTristan Ye 6388473aa8aSTristan Ye bail: 6398473aa8aSTristan Ye return status; 6408473aa8aSTristan Ye } 641e0847717STristan Ye 642e0847717STristan Ye static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 643e0847717STristan Ye u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 644e0847717STristan Ye u32 len, int ext_flags) 645e0847717STristan Ye { 646e0847717STristan Ye int ret, credits = 0, extra_blocks = 0, goal_bit = 0; 647e0847717STristan Ye handle_t *handle; 648e0847717STristan Ye struct inode *inode = context->inode; 649e0847717STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 650e0847717STristan Ye struct inode *tl_inode = osb->osb_tl_inode; 651e0847717STristan Ye struct inode *gb_inode = NULL; 652e0847717STristan Ye struct buffer_head *gb_bh = NULL; 653e0847717STristan Ye struct buffer_head *gd_bh = NULL; 654e0847717STristan Ye struct ocfs2_group_desc *gd; 655e0847717STristan Ye struct ocfs2_refcount_tree *ref_tree = NULL; 656e0847717STristan Ye u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, 657e0847717STristan Ye context->range->me_threshold); 658e0847717STristan Ye u64 phys_blkno, new_phys_blkno; 659e0847717STristan Ye 660e0847717STristan Ye phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 661e0847717STristan Ye 662e0847717STristan Ye if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { 663e0847717STristan Ye 664e0847717STristan Ye BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 665e0847717STristan Ye OCFS2_HAS_REFCOUNT_FL)); 666e0847717STristan Ye 667e0847717STristan Ye BUG_ON(!context->refcount_loc); 668e0847717STristan Ye 669e0847717STristan Ye ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 670e0847717STristan Ye &ref_tree, NULL); 671e0847717STristan Ye if (ret) { 672e0847717STristan Ye mlog_errno(ret); 673e0847717STristan Ye return ret; 674e0847717STristan Ye } 675e0847717STristan Ye 676e0847717STristan Ye ret = ocfs2_prepare_refcount_change_for_del(inode, 677e0847717STristan Ye context->refcount_loc, 678e0847717STristan Ye phys_blkno, 679e0847717STristan Ye len, 680e0847717STristan Ye &credits, 681e0847717STristan Ye &extra_blocks); 682e0847717STristan Ye if (ret) { 683e0847717STristan Ye mlog_errno(ret); 684e0847717STristan Ye goto out; 685e0847717STristan Ye } 686e0847717STristan Ye } 687e0847717STristan Ye 688e0847717STristan Ye ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, 689e0847717STristan Ye &context->meta_ac, 690e0847717STristan Ye NULL, extra_blocks, &credits); 691e0847717STristan Ye if (ret) { 692e0847717STristan Ye mlog_errno(ret); 693e0847717STristan Ye goto out; 694e0847717STristan Ye } 695e0847717STristan Ye 696e0847717STristan Ye /* 697e0847717STristan Ye * need to count 2 extra credits for global_bitmap inode and 698e0847717STristan Ye * group descriptor. 699e0847717STristan Ye */ 700e0847717STristan Ye credits += OCFS2_INODE_UPDATE_CREDITS + 1; 701e0847717STristan Ye 702e0847717STristan Ye /* 703e0847717STristan Ye * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() 704e0847717STristan Ye * logic, while we still need to lock the global_bitmap. 705e0847717STristan Ye */ 706e0847717STristan Ye gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 707e0847717STristan Ye OCFS2_INVALID_SLOT); 708e0847717STristan Ye if (!gb_inode) { 709e0847717STristan Ye mlog(ML_ERROR, "unable to get global_bitmap inode\n"); 710e0847717STristan Ye ret = -EIO; 711e0847717STristan Ye goto out; 712e0847717STristan Ye } 713e0847717STristan Ye 714e0847717STristan Ye mutex_lock(&gb_inode->i_mutex); 715e0847717STristan Ye 716e0847717STristan Ye ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); 717e0847717STristan Ye if (ret) { 718e0847717STristan Ye mlog_errno(ret); 719e0847717STristan Ye goto out_unlock_gb_mutex; 720e0847717STristan Ye } 721e0847717STristan Ye 722e0847717STristan Ye mutex_lock(&tl_inode->i_mutex); 723e0847717STristan Ye 724e0847717STristan Ye handle = ocfs2_start_trans(osb, credits); 725e0847717STristan Ye if (IS_ERR(handle)) { 726e0847717STristan Ye ret = PTR_ERR(handle); 727e0847717STristan Ye mlog_errno(ret); 728e0847717STristan Ye goto out_unlock_tl_inode; 729e0847717STristan Ye } 730e0847717STristan Ye 731e0847717STristan Ye new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); 732e0847717STristan Ye ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, 733e0847717STristan Ye GLOBAL_BITMAP_SYSTEM_INODE, 734e0847717STristan Ye OCFS2_INVALID_SLOT, 735e0847717STristan Ye &goal_bit, &gd_bh); 736e0847717STristan Ye if (ret) { 737e0847717STristan Ye mlog_errno(ret); 738e0847717STristan Ye goto out_commit; 739e0847717STristan Ye } 740e0847717STristan Ye 741e0847717STristan Ye /* 742e0847717STristan Ye * probe the victim cluster group to find a proper 743e0847717STristan Ye * region to fit wanted movement, it even will perfrom 744e0847717STristan Ye * a best-effort attempt by compromising to a threshold 745e0847717STristan Ye * around the goal. 746e0847717STristan Ye */ 747e0847717STristan Ye ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 748e0847717STristan Ye new_phys_cpos); 749e0847717STristan Ye if (!new_phys_cpos) { 750e0847717STristan Ye ret = -ENOSPC; 751e0847717STristan Ye goto out_commit; 752e0847717STristan Ye } 753e0847717STristan Ye 754e0847717STristan Ye ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, 755e0847717STristan Ye *new_phys_cpos, ext_flags); 756e0847717STristan Ye if (ret) { 757e0847717STristan Ye mlog_errno(ret); 758e0847717STristan Ye goto out_commit; 759e0847717STristan Ye } 760e0847717STristan Ye 761e0847717STristan Ye gd = (struct ocfs2_group_desc *)gd_bh->b_data; 762e0847717STristan Ye ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, 763e0847717STristan Ye le16_to_cpu(gd->bg_chain)); 764e0847717STristan Ye if (ret) { 765e0847717STristan Ye mlog_errno(ret); 766e0847717STristan Ye goto out_commit; 767e0847717STristan Ye } 768e0847717STristan Ye 769e0847717STristan Ye ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 770e0847717STristan Ye goal_bit, len); 771e0847717STristan Ye if (ret) 772e0847717STristan Ye mlog_errno(ret); 773e0847717STristan Ye 774e0847717STristan Ye /* 775e0847717STristan Ye * Here we should write the new page out first if we are 776e0847717STristan Ye * in write-back mode. 777e0847717STristan Ye */ 778e0847717STristan Ye ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); 779e0847717STristan Ye if (ret) 780e0847717STristan Ye mlog_errno(ret); 781e0847717STristan Ye 782e0847717STristan Ye out_commit: 783e0847717STristan Ye ocfs2_commit_trans(osb, handle); 784e0847717STristan Ye brelse(gd_bh); 785e0847717STristan Ye 786e0847717STristan Ye out_unlock_tl_inode: 787e0847717STristan Ye mutex_unlock(&tl_inode->i_mutex); 788e0847717STristan Ye 789e0847717STristan Ye ocfs2_inode_unlock(gb_inode, 1); 790e0847717STristan Ye out_unlock_gb_mutex: 791e0847717STristan Ye mutex_unlock(&gb_inode->i_mutex); 792e0847717STristan Ye brelse(gb_bh); 793e0847717STristan Ye iput(gb_inode); 794e0847717STristan Ye 795e0847717STristan Ye out: 796e0847717STristan Ye if (context->meta_ac) { 797e0847717STristan Ye ocfs2_free_alloc_context(context->meta_ac); 798e0847717STristan Ye context->meta_ac = NULL; 799e0847717STristan Ye } 800e0847717STristan Ye 801e0847717STristan Ye if (ref_tree) 802e0847717STristan Ye ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 803e0847717STristan Ye 804e0847717STristan Ye return ret; 805e0847717STristan Ye } 806ee16cc03STristan Ye 807ee16cc03STristan Ye /* 808ee16cc03STristan Ye * Helper to calculate the defraging length in one run according to threshold. 809ee16cc03STristan Ye */ 810ee16cc03STristan Ye static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, 811ee16cc03STristan Ye u32 threshold, int *skip) 812ee16cc03STristan Ye { 813ee16cc03STristan Ye if ((*alloc_size + *len_defraged) < threshold) { 814ee16cc03STristan Ye /* 815ee16cc03STristan Ye * proceed defragmentation until we meet the thresh 816ee16cc03STristan Ye */ 817ee16cc03STristan Ye *len_defraged += *alloc_size; 818ee16cc03STristan Ye } else if (*len_defraged == 0) { 819ee16cc03STristan Ye /* 820ee16cc03STristan Ye * XXX: skip a large extent. 821ee16cc03STristan Ye */ 822ee16cc03STristan Ye *skip = 1; 823ee16cc03STristan Ye } else { 824ee16cc03STristan Ye /* 825ee16cc03STristan Ye * split this extent to coalesce with former pieces as 826ee16cc03STristan Ye * to reach the threshold. 827ee16cc03STristan Ye * 828ee16cc03STristan Ye * we're done here with one cycle of defragmentation 829ee16cc03STristan Ye * in a size of 'thresh', resetting 'len_defraged' 830ee16cc03STristan Ye * forces a new defragmentation. 831ee16cc03STristan Ye */ 832ee16cc03STristan Ye *alloc_size = threshold - *len_defraged; 833ee16cc03STristan Ye *len_defraged = 0; 834ee16cc03STristan Ye } 835ee16cc03STristan Ye } 83653069d4eSTristan Ye 83753069d4eSTristan Ye static int __ocfs2_move_extents_range(struct buffer_head *di_bh, 83853069d4eSTristan Ye struct ocfs2_move_extents_context *context) 83953069d4eSTristan Ye { 84053069d4eSTristan Ye int ret = 0, flags, do_defrag, skip = 0; 84153069d4eSTristan Ye u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; 84253069d4eSTristan Ye u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; 84353069d4eSTristan Ye 84453069d4eSTristan Ye struct inode *inode = context->inode; 84553069d4eSTristan Ye struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 84653069d4eSTristan Ye struct ocfs2_move_extents *range = context->range; 84753069d4eSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 84853069d4eSTristan Ye 84953069d4eSTristan Ye if ((inode->i_size == 0) || (range->me_len == 0)) 85053069d4eSTristan Ye return 0; 85153069d4eSTristan Ye 85253069d4eSTristan Ye if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 85353069d4eSTristan Ye return 0; 85453069d4eSTristan Ye 85553069d4eSTristan Ye context->refcount_loc = le64_to_cpu(di->i_refcount_loc); 85653069d4eSTristan Ye 85753069d4eSTristan Ye ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); 85853069d4eSTristan Ye ocfs2_init_dealloc_ctxt(&context->dealloc); 85953069d4eSTristan Ye 86053069d4eSTristan Ye /* 86153069d4eSTristan Ye * TO-DO XXX: 86253069d4eSTristan Ye * 86353069d4eSTristan Ye * - xattr extents. 86453069d4eSTristan Ye */ 86553069d4eSTristan Ye 86653069d4eSTristan Ye do_defrag = context->auto_defrag; 86753069d4eSTristan Ye 86853069d4eSTristan Ye /* 86953069d4eSTristan Ye * extents moving happens in unit of clusters, for the sake 87053069d4eSTristan Ye * of simplicity, we may ignore two clusters where 'byte_start' 87153069d4eSTristan Ye * and 'byte_start + len' were within. 87253069d4eSTristan Ye */ 87353069d4eSTristan Ye move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); 87453069d4eSTristan Ye len_to_move = (range->me_start + range->me_len) >> 87553069d4eSTristan Ye osb->s_clustersize_bits; 87653069d4eSTristan Ye if (len_to_move >= move_start) 87753069d4eSTristan Ye len_to_move -= move_start; 87853069d4eSTristan Ye else 87953069d4eSTristan Ye len_to_move = 0; 88053069d4eSTristan Ye 881dda54e76STristan Ye if (do_defrag) { 88253069d4eSTristan Ye defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; 883dda54e76STristan Ye if (defrag_thresh <= 1) 884dda54e76STristan Ye goto done; 885dda54e76STristan Ye } else 88653069d4eSTristan Ye new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 88753069d4eSTristan Ye range->me_goal); 88853069d4eSTristan Ye 88953069d4eSTristan Ye mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " 89053069d4eSTristan Ye "thresh: %u\n", 89153069d4eSTristan Ye (unsigned long long)OCFS2_I(inode)->ip_blkno, 89253069d4eSTristan Ye (unsigned long long)range->me_start, 89353069d4eSTristan Ye (unsigned long long)range->me_len, 89453069d4eSTristan Ye move_start, len_to_move, defrag_thresh); 89553069d4eSTristan Ye 89653069d4eSTristan Ye cpos = move_start; 89753069d4eSTristan Ye while (len_to_move) { 89853069d4eSTristan Ye ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, 89953069d4eSTristan Ye &flags); 90053069d4eSTristan Ye if (ret) { 90153069d4eSTristan Ye mlog_errno(ret); 90253069d4eSTristan Ye goto out; 90353069d4eSTristan Ye } 90453069d4eSTristan Ye 90553069d4eSTristan Ye if (alloc_size > len_to_move) 90653069d4eSTristan Ye alloc_size = len_to_move; 90753069d4eSTristan Ye 90853069d4eSTristan Ye /* 90953069d4eSTristan Ye * XXX: how to deal with a hole: 91053069d4eSTristan Ye * 91153069d4eSTristan Ye * - skip the hole of course 91253069d4eSTristan Ye * - force a new defragmentation 91353069d4eSTristan Ye */ 91453069d4eSTristan Ye if (!phys_cpos) { 91553069d4eSTristan Ye if (do_defrag) 91653069d4eSTristan Ye len_defraged = 0; 91753069d4eSTristan Ye 91853069d4eSTristan Ye goto next; 91953069d4eSTristan Ye } 92053069d4eSTristan Ye 92153069d4eSTristan Ye if (do_defrag) { 92253069d4eSTristan Ye ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, 92353069d4eSTristan Ye defrag_thresh, &skip); 92453069d4eSTristan Ye /* 92553069d4eSTristan Ye * skip large extents 92653069d4eSTristan Ye */ 92753069d4eSTristan Ye if (skip) { 92853069d4eSTristan Ye skip = 0; 92953069d4eSTristan Ye goto next; 93053069d4eSTristan Ye } 93153069d4eSTristan Ye 93253069d4eSTristan Ye mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " 93353069d4eSTristan Ye "alloc_size: %u, len_defraged: %u\n", 93453069d4eSTristan Ye cpos, phys_cpos, alloc_size, len_defraged); 93553069d4eSTristan Ye 93653069d4eSTristan Ye ret = ocfs2_defrag_extent(context, cpos, phys_cpos, 9374dfa66bdSTristan Ye &alloc_size, flags); 93853069d4eSTristan Ye } else { 93953069d4eSTristan Ye ret = ocfs2_move_extent(context, cpos, phys_cpos, 94053069d4eSTristan Ye &new_phys_cpos, alloc_size, 94153069d4eSTristan Ye flags); 94253069d4eSTristan Ye 94353069d4eSTristan Ye new_phys_cpos += alloc_size; 94453069d4eSTristan Ye } 94553069d4eSTristan Ye 94653069d4eSTristan Ye if (ret < 0) { 94753069d4eSTristan Ye mlog_errno(ret); 94853069d4eSTristan Ye goto out; 94953069d4eSTristan Ye } 95053069d4eSTristan Ye 95153069d4eSTristan Ye context->clusters_moved += alloc_size; 95253069d4eSTristan Ye next: 95353069d4eSTristan Ye cpos += alloc_size; 95453069d4eSTristan Ye len_to_move -= alloc_size; 95553069d4eSTristan Ye } 95653069d4eSTristan Ye 957dda54e76STristan Ye done: 95853069d4eSTristan Ye range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; 95953069d4eSTristan Ye 96053069d4eSTristan Ye out: 96153069d4eSTristan Ye range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, 96253069d4eSTristan Ye context->clusters_moved); 96353069d4eSTristan Ye range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, 96453069d4eSTristan Ye context->new_phys_cpos); 96553069d4eSTristan Ye 96653069d4eSTristan Ye ocfs2_schedule_truncate_log_flush(osb, 1); 96753069d4eSTristan Ye ocfs2_run_deallocs(osb, &context->dealloc); 96853069d4eSTristan Ye 96953069d4eSTristan Ye return ret; 97053069d4eSTristan Ye } 97153069d4eSTristan Ye 97253069d4eSTristan Ye static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) 97353069d4eSTristan Ye { 97453069d4eSTristan Ye int status; 97553069d4eSTristan Ye handle_t *handle; 97653069d4eSTristan Ye struct inode *inode = context->inode; 97753069d4eSTristan Ye struct ocfs2_dinode *di; 97853069d4eSTristan Ye struct buffer_head *di_bh = NULL; 97953069d4eSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 98053069d4eSTristan Ye 98153069d4eSTristan Ye if (!inode) 98253069d4eSTristan Ye return -ENOENT; 98353069d4eSTristan Ye 98453069d4eSTristan Ye if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 98553069d4eSTristan Ye return -EROFS; 98653069d4eSTristan Ye 98753069d4eSTristan Ye mutex_lock(&inode->i_mutex); 98853069d4eSTristan Ye 98953069d4eSTristan Ye /* 99053069d4eSTristan Ye * This prevents concurrent writes from other nodes 99153069d4eSTristan Ye */ 99253069d4eSTristan Ye status = ocfs2_rw_lock(inode, 1); 99353069d4eSTristan Ye if (status) { 99453069d4eSTristan Ye mlog_errno(status); 99553069d4eSTristan Ye goto out; 99653069d4eSTristan Ye } 99753069d4eSTristan Ye 99853069d4eSTristan Ye status = ocfs2_inode_lock(inode, &di_bh, 1); 99953069d4eSTristan Ye if (status) { 100053069d4eSTristan Ye mlog_errno(status); 100153069d4eSTristan Ye goto out_rw_unlock; 100253069d4eSTristan Ye } 100353069d4eSTristan Ye 100453069d4eSTristan Ye /* 100553069d4eSTristan Ye * rememer ip_xattr_sem also needs to be held if necessary 100653069d4eSTristan Ye */ 100753069d4eSTristan Ye down_write(&OCFS2_I(inode)->ip_alloc_sem); 100853069d4eSTristan Ye 100953069d4eSTristan Ye status = __ocfs2_move_extents_range(di_bh, context); 101053069d4eSTristan Ye 101153069d4eSTristan Ye up_write(&OCFS2_I(inode)->ip_alloc_sem); 101253069d4eSTristan Ye if (status) { 101353069d4eSTristan Ye mlog_errno(status); 101453069d4eSTristan Ye goto out_inode_unlock; 101553069d4eSTristan Ye } 101653069d4eSTristan Ye 101753069d4eSTristan Ye /* 101853069d4eSTristan Ye * We update ctime for these changes 101953069d4eSTristan Ye */ 102053069d4eSTristan Ye handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 102153069d4eSTristan Ye if (IS_ERR(handle)) { 102253069d4eSTristan Ye status = PTR_ERR(handle); 102353069d4eSTristan Ye mlog_errno(status); 102453069d4eSTristan Ye goto out_inode_unlock; 102553069d4eSTristan Ye } 102653069d4eSTristan Ye 102753069d4eSTristan Ye status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 102853069d4eSTristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 102953069d4eSTristan Ye if (status) { 103053069d4eSTristan Ye mlog_errno(status); 103153069d4eSTristan Ye goto out_commit; 103253069d4eSTristan Ye } 103353069d4eSTristan Ye 103453069d4eSTristan Ye di = (struct ocfs2_dinode *)di_bh->b_data; 103553069d4eSTristan Ye inode->i_ctime = CURRENT_TIME; 103653069d4eSTristan Ye di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 103753069d4eSTristan Ye di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 103853069d4eSTristan Ye 103953069d4eSTristan Ye ocfs2_journal_dirty(handle, di_bh); 104053069d4eSTristan Ye 104153069d4eSTristan Ye out_commit: 104253069d4eSTristan Ye ocfs2_commit_trans(osb, handle); 104353069d4eSTristan Ye 104453069d4eSTristan Ye out_inode_unlock: 104553069d4eSTristan Ye brelse(di_bh); 104653069d4eSTristan Ye ocfs2_inode_unlock(inode, 1); 104753069d4eSTristan Ye out_rw_unlock: 104853069d4eSTristan Ye ocfs2_rw_unlock(inode, 1); 104953069d4eSTristan Ye out: 105053069d4eSTristan Ye mutex_unlock(&inode->i_mutex); 105153069d4eSTristan Ye 105253069d4eSTristan Ye return status; 105353069d4eSTristan Ye } 105453069d4eSTristan Ye 105553069d4eSTristan Ye int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) 105653069d4eSTristan Ye { 105753069d4eSTristan Ye int status; 105853069d4eSTristan Ye 105953069d4eSTristan Ye struct inode *inode = filp->f_path.dentry->d_inode; 106053069d4eSTristan Ye struct ocfs2_move_extents range; 106153069d4eSTristan Ye struct ocfs2_move_extents_context *context = NULL; 106253069d4eSTristan Ye 106353069d4eSTristan Ye status = mnt_want_write(filp->f_path.mnt); 106453069d4eSTristan Ye if (status) 106553069d4eSTristan Ye return status; 106653069d4eSTristan Ye 106753069d4eSTristan Ye if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) 106853069d4eSTristan Ye goto out; 106953069d4eSTristan Ye 107053069d4eSTristan Ye if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 107153069d4eSTristan Ye status = -EPERM; 107253069d4eSTristan Ye goto out; 107353069d4eSTristan Ye } 107453069d4eSTristan Ye 107553069d4eSTristan Ye context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); 107653069d4eSTristan Ye if (!context) { 107753069d4eSTristan Ye status = -ENOMEM; 107853069d4eSTristan Ye mlog_errno(status); 107953069d4eSTristan Ye goto out; 108053069d4eSTristan Ye } 108153069d4eSTristan Ye 108253069d4eSTristan Ye context->inode = inode; 108353069d4eSTristan Ye context->file = filp; 108453069d4eSTristan Ye 108553069d4eSTristan Ye if (argp) { 108653069d4eSTristan Ye if (copy_from_user(&range, (struct ocfs2_move_extents *)argp, 108753069d4eSTristan Ye sizeof(range))) { 108853069d4eSTristan Ye status = -EFAULT; 108953069d4eSTristan Ye goto out; 109053069d4eSTristan Ye } 109153069d4eSTristan Ye } else { 109253069d4eSTristan Ye status = -EINVAL; 109353069d4eSTristan Ye goto out; 109453069d4eSTristan Ye } 109553069d4eSTristan Ye 109653069d4eSTristan Ye if (range.me_start > i_size_read(inode)) 109753069d4eSTristan Ye goto out; 109853069d4eSTristan Ye 109953069d4eSTristan Ye if (range.me_start + range.me_len > i_size_read(inode)) 110053069d4eSTristan Ye range.me_len = i_size_read(inode) - range.me_start; 110153069d4eSTristan Ye 110253069d4eSTristan Ye context->range = ⦥ 110353069d4eSTristan Ye 110453069d4eSTristan Ye if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { 110553069d4eSTristan Ye context->auto_defrag = 1; 110653069d4eSTristan Ye /* 110753069d4eSTristan Ye * ok, the default theshold for the defragmentation 110853069d4eSTristan Ye * is 1M, since our maximum clustersize was 1M also. 110953069d4eSTristan Ye * any thought? 111053069d4eSTristan Ye */ 1111dda54e76STristan Ye if (!range.me_threshold) 111253069d4eSTristan Ye range.me_threshold = 1024 * 1024; 1113dda54e76STristan Ye 1114dda54e76STristan Ye if (range.me_threshold > i_size_read(inode)) 1115dda54e76STristan Ye range.me_threshold = i_size_read(inode); 1116dda54e76STristan Ye 11174dfa66bdSTristan Ye if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) 11184dfa66bdSTristan Ye context->partial = 1; 111953069d4eSTristan Ye } else { 112053069d4eSTristan Ye /* 112153069d4eSTristan Ye * first best-effort attempt to validate and adjust the goal 112253069d4eSTristan Ye * (physical address in block), while it can't guarantee later 112353069d4eSTristan Ye * operation can succeed all the time since global_bitmap may 112453069d4eSTristan Ye * change a bit over time. 112553069d4eSTristan Ye */ 112653069d4eSTristan Ye 112753069d4eSTristan Ye status = ocfs2_validate_and_adjust_move_goal(inode, &range); 112853069d4eSTristan Ye if (status) 112953069d4eSTristan Ye goto out; 113053069d4eSTristan Ye } 113153069d4eSTristan Ye 113253069d4eSTristan Ye status = ocfs2_move_extents(context); 113353069d4eSTristan Ye if (status) 113453069d4eSTristan Ye mlog_errno(status); 113553069d4eSTristan Ye out: 113653069d4eSTristan Ye /* 113753069d4eSTristan Ye * movement/defragmentation may end up being partially completed, 113853069d4eSTristan Ye * that's the reason why we need to return userspace the finished 113953069d4eSTristan Ye * length and new_offset even if failure happens somewhere. 114053069d4eSTristan Ye */ 114153069d4eSTristan Ye if (argp) { 114253069d4eSTristan Ye if (copy_to_user((struct ocfs2_move_extents *)argp, &range, 114353069d4eSTristan Ye sizeof(range))) 114453069d4eSTristan Ye status = -EFAULT; 114553069d4eSTristan Ye } 114653069d4eSTristan Ye 114753069d4eSTristan Ye kfree(context); 114853069d4eSTristan Ye 114953069d4eSTristan Ye mnt_drop_write(filp->f_path.mnt); 115053069d4eSTristan Ye 115153069d4eSTristan Ye return status; 115253069d4eSTristan Ye } 1153