1028ba5dfSTristan Ye /* -*- mode: c; c-basic-offset: 8; -*- 2028ba5dfSTristan Ye * vim: noexpandtab sw=8 ts=8 sts=0: 3028ba5dfSTristan Ye * 4028ba5dfSTristan Ye * move_extents.c 5028ba5dfSTristan Ye * 6028ba5dfSTristan Ye * Copyright (C) 2011 Oracle. All rights reserved. 7028ba5dfSTristan Ye * 8028ba5dfSTristan Ye * This program is free software; you can redistribute it and/or 9028ba5dfSTristan Ye * modify it under the terms of the GNU General Public 10028ba5dfSTristan Ye * License version 2 as published by the Free Software Foundation. 11028ba5dfSTristan Ye * 12028ba5dfSTristan Ye * This program is distributed in the hope that it will be useful, 13028ba5dfSTristan Ye * but WITHOUT ANY WARRANTY; without even the implied warranty of 14028ba5dfSTristan Ye * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 15028ba5dfSTristan Ye * General Public License for more details. 16028ba5dfSTristan Ye */ 17028ba5dfSTristan Ye #include <linux/fs.h> 18028ba5dfSTristan Ye #include <linux/types.h> 19028ba5dfSTristan Ye #include <linux/mount.h> 20028ba5dfSTristan Ye #include <linux/swap.h> 21028ba5dfSTristan Ye 22028ba5dfSTristan Ye #include <cluster/masklog.h> 23028ba5dfSTristan Ye 24028ba5dfSTristan Ye #include "ocfs2.h" 25028ba5dfSTristan Ye #include "ocfs2_ioctl.h" 26028ba5dfSTristan Ye 27028ba5dfSTristan Ye #include "alloc.h" 28028ba5dfSTristan Ye #include "aops.h" 29028ba5dfSTristan Ye #include "dlmglue.h" 30028ba5dfSTristan Ye #include "extent_map.h" 31028ba5dfSTristan Ye #include "inode.h" 32028ba5dfSTristan Ye #include "journal.h" 33028ba5dfSTristan Ye #include "suballoc.h" 34028ba5dfSTristan Ye #include "uptodate.h" 35028ba5dfSTristan Ye #include "super.h" 36028ba5dfSTristan Ye #include "dir.h" 37028ba5dfSTristan Ye #include "buffer_head_io.h" 38028ba5dfSTristan Ye #include "sysfile.h" 39028ba5dfSTristan Ye #include "refcounttree.h" 40028ba5dfSTristan Ye #include "move_extents.h" 41028ba5dfSTristan Ye 42028ba5dfSTristan Ye struct ocfs2_move_extents_context { 43028ba5dfSTristan Ye struct inode *inode; 44028ba5dfSTristan Ye struct file *file; 45028ba5dfSTristan Ye int auto_defrag; 464dfa66bdSTristan Ye int partial; 47028ba5dfSTristan Ye int credits; 48028ba5dfSTristan Ye u32 new_phys_cpos; 49028ba5dfSTristan Ye u32 clusters_moved; 50028ba5dfSTristan Ye u64 refcount_loc; 51028ba5dfSTristan Ye struct ocfs2_move_extents *range; 52028ba5dfSTristan Ye struct ocfs2_extent_tree et; 53028ba5dfSTristan Ye struct ocfs2_alloc_context *meta_ac; 54028ba5dfSTristan Ye struct ocfs2_alloc_context *data_ac; 55028ba5dfSTristan Ye struct ocfs2_cached_dealloc_ctxt dealloc; 56028ba5dfSTristan Ye }; 57de474ee8STristan Ye 588f603e56STristan Ye static int __ocfs2_move_extent(handle_t *handle, 598f603e56STristan Ye struct ocfs2_move_extents_context *context, 608f603e56STristan Ye u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos, 618f603e56STristan Ye int ext_flags) 628f603e56STristan Ye { 638f603e56STristan Ye int ret = 0, index; 648f603e56STristan Ye struct inode *inode = context->inode; 658f603e56STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 668f603e56STristan Ye struct ocfs2_extent_rec *rec, replace_rec; 678f603e56STristan Ye struct ocfs2_path *path = NULL; 688f603e56STristan Ye struct ocfs2_extent_list *el; 698f603e56STristan Ye u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci); 708f603e56STristan Ye u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos); 718f603e56STristan Ye 728f603e56STristan Ye ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos, 738f603e56STristan Ye p_cpos, new_p_cpos, len); 748f603e56STristan Ye if (ret) { 758f603e56STristan Ye mlog_errno(ret); 768f603e56STristan Ye goto out; 778f603e56STristan Ye } 788f603e56STristan Ye 798f603e56STristan Ye memset(&replace_rec, 0, sizeof(replace_rec)); 808f603e56STristan Ye replace_rec.e_cpos = cpu_to_le32(cpos); 818f603e56STristan Ye replace_rec.e_leaf_clusters = cpu_to_le16(len); 828f603e56STristan Ye replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, 838f603e56STristan Ye new_p_cpos)); 848f603e56STristan Ye 858f603e56STristan Ye path = ocfs2_new_path_from_et(&context->et); 868f603e56STristan Ye if (!path) { 878f603e56STristan Ye ret = -ENOMEM; 888f603e56STristan Ye mlog_errno(ret); 898f603e56STristan Ye goto out; 908f603e56STristan Ye } 918f603e56STristan Ye 928f603e56STristan Ye ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos); 938f603e56STristan Ye if (ret) { 948f603e56STristan Ye mlog_errno(ret); 958f603e56STristan Ye goto out; 968f603e56STristan Ye } 978f603e56STristan Ye 988f603e56STristan Ye el = path_leaf_el(path); 998f603e56STristan Ye 1008f603e56STristan Ye index = ocfs2_search_extent_list(el, cpos); 1018f603e56STristan Ye if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 1028f603e56STristan Ye ocfs2_error(inode->i_sb, 1038f603e56STristan Ye "Inode %llu has an extent at cpos %u which can no " 1048f603e56STristan Ye "longer be found.\n", 1058f603e56STristan Ye (unsigned long long)ino, cpos); 1068f603e56STristan Ye ret = -EROFS; 1078f603e56STristan Ye goto out; 1088f603e56STristan Ye } 1098f603e56STristan Ye 1108f603e56STristan Ye rec = &el->l_recs[index]; 1118f603e56STristan Ye 1128f603e56STristan Ye BUG_ON(ext_flags != rec->e_flags); 1138f603e56STristan Ye /* 1148f603e56STristan Ye * after moving/defraging to new location, the extent is not going 1158f603e56STristan Ye * to be refcounted anymore. 1168f603e56STristan Ye */ 1178f603e56STristan Ye replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED; 1188f603e56STristan Ye 1198f603e56STristan Ye ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), 1208f603e56STristan Ye context->et.et_root_bh, 1218f603e56STristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 1228f603e56STristan Ye if (ret) { 1238f603e56STristan Ye mlog_errno(ret); 1248f603e56STristan Ye goto out; 1258f603e56STristan Ye } 1268f603e56STristan Ye 1278f603e56STristan Ye ret = ocfs2_split_extent(handle, &context->et, path, index, 1288f603e56STristan Ye &replace_rec, context->meta_ac, 1298f603e56STristan Ye &context->dealloc); 1308f603e56STristan Ye if (ret) { 1318f603e56STristan Ye mlog_errno(ret); 1328f603e56STristan Ye goto out; 1338f603e56STristan Ye } 1348f603e56STristan Ye 1358f603e56STristan Ye ocfs2_journal_dirty(handle, context->et.et_root_bh); 1368f603e56STristan Ye 1378f603e56STristan Ye context->new_phys_cpos = new_p_cpos; 1388f603e56STristan Ye 1398f603e56STristan Ye /* 1408f603e56STristan Ye * need I to append truncate log for old clusters? 1418f603e56STristan Ye */ 1428f603e56STristan Ye if (old_blkno) { 1438f603e56STristan Ye if (ext_flags & OCFS2_EXT_REFCOUNTED) 1448f603e56STristan Ye ret = ocfs2_decrease_refcount(inode, handle, 1458f603e56STristan Ye ocfs2_blocks_to_clusters(osb->sb, 1468f603e56STristan Ye old_blkno), 1478f603e56STristan Ye len, context->meta_ac, 1488f603e56STristan Ye &context->dealloc, 1); 1498f603e56STristan Ye else 1508f603e56STristan Ye ret = ocfs2_truncate_log_append(osb, handle, 1518f603e56STristan Ye old_blkno, len); 1528f603e56STristan Ye } 1538f603e56STristan Ye 1548f603e56STristan Ye out: 1558f603e56STristan Ye return ret; 1568f603e56STristan Ye } 1578f603e56STristan Ye 158de474ee8STristan Ye /* 159de474ee8STristan Ye * lock allocators, and reserving appropriate number of bits for 160de474ee8STristan Ye * meta blocks and data clusters. 161de474ee8STristan Ye * 162de474ee8STristan Ye * in some cases, we don't need to reserve clusters, just let data_ac 163de474ee8STristan Ye * be NULL. 164de474ee8STristan Ye */ 165de474ee8STristan Ye static int ocfs2_lock_allocators_move_extents(struct inode *inode, 166de474ee8STristan Ye struct ocfs2_extent_tree *et, 167de474ee8STristan Ye u32 clusters_to_move, 168de474ee8STristan Ye u32 extents_to_split, 169de474ee8STristan Ye struct ocfs2_alloc_context **meta_ac, 170de474ee8STristan Ye struct ocfs2_alloc_context **data_ac, 171de474ee8STristan Ye int extra_blocks, 172de474ee8STristan Ye int *credits) 173de474ee8STristan Ye { 174de474ee8STristan Ye int ret, num_free_extents; 175de474ee8STristan Ye unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move; 176de474ee8STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 177de474ee8STristan Ye 178de474ee8STristan Ye num_free_extents = ocfs2_num_free_extents(osb, et); 179de474ee8STristan Ye if (num_free_extents < 0) { 180de474ee8STristan Ye ret = num_free_extents; 181de474ee8STristan Ye mlog_errno(ret); 182de474ee8STristan Ye goto out; 183de474ee8STristan Ye } 184de474ee8STristan Ye 185de474ee8STristan Ye if (!num_free_extents || 186de474ee8STristan Ye (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) 187de474ee8STristan Ye extra_blocks += ocfs2_extend_meta_needed(et->et_root_el); 188de474ee8STristan Ye 189de474ee8STristan Ye ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac); 190de474ee8STristan Ye if (ret) { 191de474ee8STristan Ye mlog_errno(ret); 192de474ee8STristan Ye goto out; 193de474ee8STristan Ye } 194de474ee8STristan Ye 195de474ee8STristan Ye if (data_ac) { 196de474ee8STristan Ye ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac); 197de474ee8STristan Ye if (ret) { 198de474ee8STristan Ye mlog_errno(ret); 199de474ee8STristan Ye goto out; 200de474ee8STristan Ye } 201de474ee8STristan Ye } 202de474ee8STristan Ye 203de474ee8STristan Ye *credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el, 204de474ee8STristan Ye clusters_to_move + 2); 205de474ee8STristan Ye 206de474ee8STristan Ye mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n", 207de474ee8STristan Ye extra_blocks, clusters_to_move, *credits); 208de474ee8STristan Ye out: 209de474ee8STristan Ye if (ret) { 210de474ee8STristan Ye if (*meta_ac) { 211de474ee8STristan Ye ocfs2_free_alloc_context(*meta_ac); 212de474ee8STristan Ye *meta_ac = NULL; 213de474ee8STristan Ye } 214de474ee8STristan Ye } 215de474ee8STristan Ye 216de474ee8STristan Ye return ret; 217de474ee8STristan Ye } 218202ee5faSTristan Ye 219202ee5faSTristan Ye /* 220202ee5faSTristan Ye * Using one journal handle to guarantee the data consistency in case 221202ee5faSTristan Ye * crash happens anywhere. 222dda54e76STristan Ye * 223dda54e76STristan Ye * XXX: defrag can end up with finishing partial extent as requested, 224dda54e76STristan Ye * due to not enough contiguous clusters can be found in allocator. 225202ee5faSTristan Ye */ 226202ee5faSTristan Ye static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, 2274dfa66bdSTristan Ye u32 cpos, u32 phys_cpos, u32 *len, int ext_flags) 228202ee5faSTristan Ye { 2294dfa66bdSTristan Ye int ret, credits = 0, extra_blocks = 0, partial = context->partial; 230202ee5faSTristan Ye handle_t *handle; 231202ee5faSTristan Ye struct inode *inode = context->inode; 232202ee5faSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 233202ee5faSTristan Ye struct inode *tl_inode = osb->osb_tl_inode; 234202ee5faSTristan Ye struct ocfs2_refcount_tree *ref_tree = NULL; 235202ee5faSTristan Ye u32 new_phys_cpos, new_len; 236202ee5faSTristan Ye u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 237202ee5faSTristan Ye 2384dfa66bdSTristan Ye if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { 239202ee5faSTristan Ye 240202ee5faSTristan Ye BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 241202ee5faSTristan Ye OCFS2_HAS_REFCOUNT_FL)); 242202ee5faSTristan Ye 243202ee5faSTristan Ye BUG_ON(!context->refcount_loc); 244202ee5faSTristan Ye 245202ee5faSTristan Ye ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 246202ee5faSTristan Ye &ref_tree, NULL); 247202ee5faSTristan Ye if (ret) { 248202ee5faSTristan Ye mlog_errno(ret); 249202ee5faSTristan Ye return ret; 250202ee5faSTristan Ye } 251202ee5faSTristan Ye 252202ee5faSTristan Ye ret = ocfs2_prepare_refcount_change_for_del(inode, 253202ee5faSTristan Ye context->refcount_loc, 254202ee5faSTristan Ye phys_blkno, 2554dfa66bdSTristan Ye *len, 256202ee5faSTristan Ye &credits, 257202ee5faSTristan Ye &extra_blocks); 258202ee5faSTristan Ye if (ret) { 259202ee5faSTristan Ye mlog_errno(ret); 260202ee5faSTristan Ye goto out; 261202ee5faSTristan Ye } 262202ee5faSTristan Ye } 263202ee5faSTristan Ye 2644dfa66bdSTristan Ye ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1, 265202ee5faSTristan Ye &context->meta_ac, 266202ee5faSTristan Ye &context->data_ac, 267202ee5faSTristan Ye extra_blocks, &credits); 268202ee5faSTristan Ye if (ret) { 269202ee5faSTristan Ye mlog_errno(ret); 270202ee5faSTristan Ye goto out; 271202ee5faSTristan Ye } 272202ee5faSTristan Ye 273202ee5faSTristan Ye /* 274202ee5faSTristan Ye * should be using allocation reservation strategy there? 275202ee5faSTristan Ye * 276202ee5faSTristan Ye * if (context->data_ac) 277202ee5faSTristan Ye * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; 278202ee5faSTristan Ye */ 279202ee5faSTristan Ye 280202ee5faSTristan Ye mutex_lock(&tl_inode->i_mutex); 281202ee5faSTristan Ye 282202ee5faSTristan Ye if (ocfs2_truncate_log_needs_flush(osb)) { 283202ee5faSTristan Ye ret = __ocfs2_flush_truncate_log(osb); 284202ee5faSTristan Ye if (ret < 0) { 285202ee5faSTristan Ye mlog_errno(ret); 286202ee5faSTristan Ye goto out_unlock_mutex; 287202ee5faSTristan Ye } 288202ee5faSTristan Ye } 289202ee5faSTristan Ye 290202ee5faSTristan Ye handle = ocfs2_start_trans(osb, credits); 291202ee5faSTristan Ye if (IS_ERR(handle)) { 292202ee5faSTristan Ye ret = PTR_ERR(handle); 293202ee5faSTristan Ye mlog_errno(ret); 294202ee5faSTristan Ye goto out_unlock_mutex; 295202ee5faSTristan Ye } 296202ee5faSTristan Ye 2974dfa66bdSTristan Ye ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len, 298202ee5faSTristan Ye &new_phys_cpos, &new_len); 299202ee5faSTristan Ye if (ret) { 300202ee5faSTristan Ye mlog_errno(ret); 301202ee5faSTristan Ye goto out_commit; 302202ee5faSTristan Ye } 303202ee5faSTristan Ye 304202ee5faSTristan Ye /* 3054dfa66bdSTristan Ye * allowing partial extent moving is kind of 'pros and cons', it makes 3064dfa66bdSTristan Ye * whole defragmentation less likely to fail, on the contrary, the bad 3074dfa66bdSTristan Ye * thing is it may make the fs even more fragmented after moving, let 3084dfa66bdSTristan Ye * userspace make a good decision here. 309202ee5faSTristan Ye */ 3104dfa66bdSTristan Ye if (new_len != *len) { 3114dfa66bdSTristan Ye mlog(0, "len_claimed: %u, len: %u\n", new_len, *len); 3124dfa66bdSTristan Ye if (!partial) { 313202ee5faSTristan Ye context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE; 314202ee5faSTristan Ye ret = -ENOSPC; 315202ee5faSTristan Ye goto out_commit; 316202ee5faSTristan Ye } 3174dfa66bdSTristan Ye } 318202ee5faSTristan Ye 319202ee5faSTristan Ye mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos, 320202ee5faSTristan Ye phys_cpos, new_phys_cpos); 321202ee5faSTristan Ye 3224dfa66bdSTristan Ye ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos, 323202ee5faSTristan Ye new_phys_cpos, ext_flags); 324202ee5faSTristan Ye if (ret) 325202ee5faSTristan Ye mlog_errno(ret); 326202ee5faSTristan Ye 3274dfa66bdSTristan Ye if (partial && (new_len != *len)) 3284dfa66bdSTristan Ye *len = new_len; 3294dfa66bdSTristan Ye 330202ee5faSTristan Ye /* 331202ee5faSTristan Ye * Here we should write the new page out first if we are 332202ee5faSTristan Ye * in write-back mode. 333202ee5faSTristan Ye */ 3344dfa66bdSTristan Ye ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len); 335202ee5faSTristan Ye if (ret) 336202ee5faSTristan Ye mlog_errno(ret); 337202ee5faSTristan Ye 338202ee5faSTristan Ye out_commit: 339202ee5faSTristan Ye ocfs2_commit_trans(osb, handle); 340202ee5faSTristan Ye 341202ee5faSTristan Ye out_unlock_mutex: 342202ee5faSTristan Ye mutex_unlock(&tl_inode->i_mutex); 343202ee5faSTristan Ye 344202ee5faSTristan Ye if (context->data_ac) { 345202ee5faSTristan Ye ocfs2_free_alloc_context(context->data_ac); 346202ee5faSTristan Ye context->data_ac = NULL; 347202ee5faSTristan Ye } 348202ee5faSTristan Ye 349202ee5faSTristan Ye if (context->meta_ac) { 350202ee5faSTristan Ye ocfs2_free_alloc_context(context->meta_ac); 351202ee5faSTristan Ye context->meta_ac = NULL; 352202ee5faSTristan Ye } 353202ee5faSTristan Ye 354202ee5faSTristan Ye out: 355202ee5faSTristan Ye if (ref_tree) 356202ee5faSTristan Ye ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 357202ee5faSTristan Ye 358202ee5faSTristan Ye return ret; 359202ee5faSTristan Ye } 3601c06b912STristan Ye 3611c06b912STristan Ye /* 3621c06b912STristan Ye * find the victim alloc group, where #blkno fits. 3631c06b912STristan Ye */ 3641c06b912STristan Ye static int ocfs2_find_victim_alloc_group(struct inode *inode, 3651c06b912STristan Ye u64 vict_blkno, 3661c06b912STristan Ye int type, int slot, 3671c06b912STristan Ye int *vict_bit, 3681c06b912STristan Ye struct buffer_head **ret_bh) 3691c06b912STristan Ye { 3706aea6f50STristan Ye int ret, i, bits_per_unit = 0; 3711c06b912STristan Ye u64 blkno; 3721c06b912STristan Ye char namebuf[40]; 3731c06b912STristan Ye 3741c06b912STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 3751c06b912STristan Ye struct buffer_head *ac_bh = NULL, *gd_bh = NULL; 3761c06b912STristan Ye struct ocfs2_chain_list *cl; 3771c06b912STristan Ye struct ocfs2_chain_rec *rec; 3781c06b912STristan Ye struct ocfs2_dinode *ac_dinode; 3791c06b912STristan Ye struct ocfs2_group_desc *bg; 3801c06b912STristan Ye 3811c06b912STristan Ye ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot); 3821c06b912STristan Ye ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf, 3831c06b912STristan Ye strlen(namebuf), &blkno); 3841c06b912STristan Ye if (ret) { 3851c06b912STristan Ye ret = -ENOENT; 3861c06b912STristan Ye goto out; 3871c06b912STristan Ye } 3881c06b912STristan Ye 3891c06b912STristan Ye ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh); 3901c06b912STristan Ye if (ret) { 3911c06b912STristan Ye mlog_errno(ret); 3921c06b912STristan Ye goto out; 3931c06b912STristan Ye } 3941c06b912STristan Ye 3951c06b912STristan Ye ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data; 3961c06b912STristan Ye cl = &(ac_dinode->id2.i_chain); 3971c06b912STristan Ye rec = &(cl->cl_recs[0]); 3981c06b912STristan Ye 3991c06b912STristan Ye if (type == GLOBAL_BITMAP_SYSTEM_INODE) 4006aea6f50STristan Ye bits_per_unit = osb->s_clustersize_bits - 4016aea6f50STristan Ye inode->i_sb->s_blocksize_bits; 4021c06b912STristan Ye /* 4031c06b912STristan Ye * 'vict_blkno' was out of the valid range. 4041c06b912STristan Ye */ 4051c06b912STristan Ye if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || 4066aea6f50STristan Ye (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << 4076aea6f50STristan Ye bits_per_unit))) { 4081c06b912STristan Ye ret = -EINVAL; 4091c06b912STristan Ye goto out; 4101c06b912STristan Ye } 4111c06b912STristan Ye 4121c06b912STristan Ye for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) { 4131c06b912STristan Ye 4141c06b912STristan Ye rec = &(cl->cl_recs[i]); 4151c06b912STristan Ye if (!rec) 4161c06b912STristan Ye continue; 4171c06b912STristan Ye 4181c06b912STristan Ye bg = NULL; 4191c06b912STristan Ye 4201c06b912STristan Ye do { 4211c06b912STristan Ye if (!bg) 4221c06b912STristan Ye blkno = le64_to_cpu(rec->c_blkno); 4231c06b912STristan Ye else 4241c06b912STristan Ye blkno = le64_to_cpu(bg->bg_next_group); 4251c06b912STristan Ye 4261c06b912STristan Ye if (gd_bh) { 4271c06b912STristan Ye brelse(gd_bh); 4281c06b912STristan Ye gd_bh = NULL; 4291c06b912STristan Ye } 4301c06b912STristan Ye 4311c06b912STristan Ye ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh); 4321c06b912STristan Ye if (ret) { 4331c06b912STristan Ye mlog_errno(ret); 4341c06b912STristan Ye goto out; 4351c06b912STristan Ye } 4361c06b912STristan Ye 4371c06b912STristan Ye bg = (struct ocfs2_group_desc *)gd_bh->b_data; 4381c06b912STristan Ye 4391c06b912STristan Ye if (vict_blkno < (le64_to_cpu(bg->bg_blkno) + 4401c06b912STristan Ye le16_to_cpu(bg->bg_bits))) { 4411c06b912STristan Ye 4421c06b912STristan Ye *ret_bh = gd_bh; 4436aea6f50STristan Ye *vict_bit = (vict_blkno - blkno) >> 4446aea6f50STristan Ye bits_per_unit; 4451c06b912STristan Ye mlog(0, "find the victim group: #%llu, " 4461c06b912STristan Ye "total_bits: %u, vict_bit: %u\n", 4471c06b912STristan Ye blkno, le16_to_cpu(bg->bg_bits), 4481c06b912STristan Ye *vict_bit); 4491c06b912STristan Ye goto out; 4501c06b912STristan Ye } 4511c06b912STristan Ye 4521c06b912STristan Ye } while (le64_to_cpu(bg->bg_next_group)); 4531c06b912STristan Ye } 4541c06b912STristan Ye 4551c06b912STristan Ye ret = -EINVAL; 4561c06b912STristan Ye out: 4571c06b912STristan Ye brelse(ac_bh); 4581c06b912STristan Ye 4591c06b912STristan Ye /* 4601c06b912STristan Ye * caller has to release the gd_bh properly. 4611c06b912STristan Ye */ 4621c06b912STristan Ye return ret; 4631c06b912STristan Ye } 46499e4c750STristan Ye 46599e4c750STristan Ye /* 46699e4c750STristan Ye * XXX: helper to validate and adjust moving goal. 46799e4c750STristan Ye */ 46899e4c750STristan Ye static int ocfs2_validate_and_adjust_move_goal(struct inode *inode, 46999e4c750STristan Ye struct ocfs2_move_extents *range) 47099e4c750STristan Ye { 47199e4c750STristan Ye int ret, goal_bit = 0; 47299e4c750STristan Ye 47399e4c750STristan Ye struct buffer_head *gd_bh = NULL; 474ea5e1675STristan Ye struct ocfs2_group_desc *bg = NULL; 47599e4c750STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 47699e4c750STristan Ye int c_to_b = 1 << (osb->s_clustersize_bits - 47799e4c750STristan Ye inode->i_sb->s_blocksize_bits); 47899e4c750STristan Ye 47999e4c750STristan Ye /* 480ea5e1675STristan Ye * make goal become cluster aligned. 481ea5e1675STristan Ye */ 482ea5e1675STristan Ye range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb, 483ea5e1675STristan Ye range->me_goal); 484ea5e1675STristan Ye /* 485ea5e1675STristan Ye * moving goal is not allowd to start with a group desc blok(#0 blk) 486ea5e1675STristan Ye * let's compromise to the latter cluster. 487ea5e1675STristan Ye */ 488ea5e1675STristan Ye if (range->me_goal == le64_to_cpu(bg->bg_blkno)) 489ea5e1675STristan Ye range->me_goal += c_to_b; 490ea5e1675STristan Ye 491ea5e1675STristan Ye /* 49299e4c750STristan Ye * validate goal sits within global_bitmap, and return the victim 49399e4c750STristan Ye * group desc 49499e4c750STristan Ye */ 49599e4c750STristan Ye ret = ocfs2_find_victim_alloc_group(inode, range->me_goal, 49699e4c750STristan Ye GLOBAL_BITMAP_SYSTEM_INODE, 49799e4c750STristan Ye OCFS2_INVALID_SLOT, 49899e4c750STristan Ye &goal_bit, &gd_bh); 49999e4c750STristan Ye if (ret) 50099e4c750STristan Ye goto out; 50199e4c750STristan Ye 50299e4c750STristan Ye bg = (struct ocfs2_group_desc *)gd_bh->b_data; 50399e4c750STristan Ye 50499e4c750STristan Ye /* 50599e4c750STristan Ye * movement is not gonna cross two groups. 50699e4c750STristan Ye */ 50799e4c750STristan Ye if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize < 50899e4c750STristan Ye range->me_len) { 50999e4c750STristan Ye ret = -EINVAL; 51099e4c750STristan Ye goto out; 51199e4c750STristan Ye } 51299e4c750STristan Ye /* 51399e4c750STristan Ye * more exact validations/adjustments will be performed later during 51499e4c750STristan Ye * moving operation for each extent range. 51599e4c750STristan Ye */ 51699e4c750STristan Ye mlog(0, "extents get ready to be moved to #%llu block\n", 51799e4c750STristan Ye range->me_goal); 51899e4c750STristan Ye 51999e4c750STristan Ye out: 52099e4c750STristan Ye brelse(gd_bh); 52199e4c750STristan Ye 52299e4c750STristan Ye return ret; 52399e4c750STristan Ye } 524e6b5859cSTristan Ye 525e6b5859cSTristan Ye static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh, 526e6b5859cSTristan Ye int *goal_bit, u32 move_len, u32 max_hop, 527e6b5859cSTristan Ye u32 *phys_cpos) 528e6b5859cSTristan Ye { 529e6b5859cSTristan Ye int i, used, last_free_bits = 0, base_bit = *goal_bit; 530e6b5859cSTristan Ye struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; 531e6b5859cSTristan Ye u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 532e6b5859cSTristan Ye le64_to_cpu(gd->bg_blkno)); 533e6b5859cSTristan Ye 534e6b5859cSTristan Ye for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) { 535e6b5859cSTristan Ye 536e6b5859cSTristan Ye used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap); 537e6b5859cSTristan Ye if (used) { 538e6b5859cSTristan Ye /* 539e6b5859cSTristan Ye * we even tried searching the free chunk by jumping 540e6b5859cSTristan Ye * a 'max_hop' distance, but still failed. 541e6b5859cSTristan Ye */ 542e6b5859cSTristan Ye if ((i - base_bit) > max_hop) { 543e6b5859cSTristan Ye *phys_cpos = 0; 544e6b5859cSTristan Ye break; 545e6b5859cSTristan Ye } 546e6b5859cSTristan Ye 547e6b5859cSTristan Ye if (last_free_bits) 548e6b5859cSTristan Ye last_free_bits = 0; 549e6b5859cSTristan Ye 550e6b5859cSTristan Ye continue; 551e6b5859cSTristan Ye } else 552e6b5859cSTristan Ye last_free_bits++; 553e6b5859cSTristan Ye 554e6b5859cSTristan Ye if (last_free_bits == move_len) { 555e6b5859cSTristan Ye *goal_bit = i; 556e6b5859cSTristan Ye *phys_cpos = base_cpos + i; 557e6b5859cSTristan Ye break; 558e6b5859cSTristan Ye } 559e6b5859cSTristan Ye } 560e6b5859cSTristan Ye 561e6b5859cSTristan Ye mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos); 562e6b5859cSTristan Ye } 5638473aa8aSTristan Ye 5648473aa8aSTristan Ye static int ocfs2_alloc_dinode_update_counts(struct inode *inode, 5658473aa8aSTristan Ye handle_t *handle, 5668473aa8aSTristan Ye struct buffer_head *di_bh, 5678473aa8aSTristan Ye u32 num_bits, 5688473aa8aSTristan Ye u16 chain) 5698473aa8aSTristan Ye { 5708473aa8aSTristan Ye int ret; 5718473aa8aSTristan Ye u32 tmp_used; 5728473aa8aSTristan Ye struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; 5738473aa8aSTristan Ye struct ocfs2_chain_list *cl = 5748473aa8aSTristan Ye (struct ocfs2_chain_list *) &di->id2.i_chain; 5758473aa8aSTristan Ye 5768473aa8aSTristan Ye ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 5778473aa8aSTristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 5788473aa8aSTristan Ye if (ret < 0) { 5798473aa8aSTristan Ye mlog_errno(ret); 5808473aa8aSTristan Ye goto out; 5818473aa8aSTristan Ye } 5828473aa8aSTristan Ye 5838473aa8aSTristan Ye tmp_used = le32_to_cpu(di->id1.bitmap1.i_used); 5848473aa8aSTristan Ye di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used); 5858473aa8aSTristan Ye le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits); 5868473aa8aSTristan Ye ocfs2_journal_dirty(handle, di_bh); 5878473aa8aSTristan Ye 5888473aa8aSTristan Ye out: 5898473aa8aSTristan Ye return ret; 5908473aa8aSTristan Ye } 5918473aa8aSTristan Ye 5928473aa8aSTristan Ye static inline int ocfs2_block_group_set_bits(handle_t *handle, 5938473aa8aSTristan Ye struct inode *alloc_inode, 5948473aa8aSTristan Ye struct ocfs2_group_desc *bg, 5958473aa8aSTristan Ye struct buffer_head *group_bh, 5968473aa8aSTristan Ye unsigned int bit_off, 5978473aa8aSTristan Ye unsigned int num_bits) 5988473aa8aSTristan Ye { 5998473aa8aSTristan Ye int status; 6008473aa8aSTristan Ye void *bitmap = bg->bg_bitmap; 6018473aa8aSTristan Ye int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; 6028473aa8aSTristan Ye 6038473aa8aSTristan Ye /* All callers get the descriptor via 6048473aa8aSTristan Ye * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ 6058473aa8aSTristan Ye BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); 6068473aa8aSTristan Ye BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); 6078473aa8aSTristan Ye 6088473aa8aSTristan Ye mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, 6098473aa8aSTristan Ye num_bits); 6108473aa8aSTristan Ye 6118473aa8aSTristan Ye if (ocfs2_is_cluster_bitmap(alloc_inode)) 6128473aa8aSTristan Ye journal_type = OCFS2_JOURNAL_ACCESS_UNDO; 6138473aa8aSTristan Ye 6148473aa8aSTristan Ye status = ocfs2_journal_access_gd(handle, 6158473aa8aSTristan Ye INODE_CACHE(alloc_inode), 6168473aa8aSTristan Ye group_bh, 6178473aa8aSTristan Ye journal_type); 6188473aa8aSTristan Ye if (status < 0) { 6198473aa8aSTristan Ye mlog_errno(status); 6208473aa8aSTristan Ye goto bail; 6218473aa8aSTristan Ye } 6228473aa8aSTristan Ye 6238473aa8aSTristan Ye le16_add_cpu(&bg->bg_free_bits_count, -num_bits); 6248473aa8aSTristan Ye if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { 6258473aa8aSTristan Ye ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit" 6268473aa8aSTristan Ye " count %u but claims %u are freed. num_bits %d", 6278473aa8aSTristan Ye (unsigned long long)le64_to_cpu(bg->bg_blkno), 6288473aa8aSTristan Ye le16_to_cpu(bg->bg_bits), 6298473aa8aSTristan Ye le16_to_cpu(bg->bg_free_bits_count), num_bits); 6308473aa8aSTristan Ye return -EROFS; 6318473aa8aSTristan Ye } 6328473aa8aSTristan Ye while (num_bits--) 6338473aa8aSTristan Ye ocfs2_set_bit(bit_off++, bitmap); 6348473aa8aSTristan Ye 6358473aa8aSTristan Ye ocfs2_journal_dirty(handle, group_bh); 6368473aa8aSTristan Ye 6378473aa8aSTristan Ye bail: 6388473aa8aSTristan Ye return status; 6398473aa8aSTristan Ye } 640e0847717STristan Ye 641e0847717STristan Ye static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, 642e0847717STristan Ye u32 cpos, u32 phys_cpos, u32 *new_phys_cpos, 643e0847717STristan Ye u32 len, int ext_flags) 644e0847717STristan Ye { 645e0847717STristan Ye int ret, credits = 0, extra_blocks = 0, goal_bit = 0; 646e0847717STristan Ye handle_t *handle; 647e0847717STristan Ye struct inode *inode = context->inode; 648e0847717STristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 649e0847717STristan Ye struct inode *tl_inode = osb->osb_tl_inode; 650e0847717STristan Ye struct inode *gb_inode = NULL; 651e0847717STristan Ye struct buffer_head *gb_bh = NULL; 652e0847717STristan Ye struct buffer_head *gd_bh = NULL; 653e0847717STristan Ye struct ocfs2_group_desc *gd; 654e0847717STristan Ye struct ocfs2_refcount_tree *ref_tree = NULL; 655e0847717STristan Ye u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb, 656e0847717STristan Ye context->range->me_threshold); 657e0847717STristan Ye u64 phys_blkno, new_phys_blkno; 658e0847717STristan Ye 659e0847717STristan Ye phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 660e0847717STristan Ye 661e0847717STristan Ye if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { 662e0847717STristan Ye 663e0847717STristan Ye BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & 664e0847717STristan Ye OCFS2_HAS_REFCOUNT_FL)); 665e0847717STristan Ye 666e0847717STristan Ye BUG_ON(!context->refcount_loc); 667e0847717STristan Ye 668e0847717STristan Ye ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, 669e0847717STristan Ye &ref_tree, NULL); 670e0847717STristan Ye if (ret) { 671e0847717STristan Ye mlog_errno(ret); 672e0847717STristan Ye return ret; 673e0847717STristan Ye } 674e0847717STristan Ye 675e0847717STristan Ye ret = ocfs2_prepare_refcount_change_for_del(inode, 676e0847717STristan Ye context->refcount_loc, 677e0847717STristan Ye phys_blkno, 678e0847717STristan Ye len, 679e0847717STristan Ye &credits, 680e0847717STristan Ye &extra_blocks); 681e0847717STristan Ye if (ret) { 682e0847717STristan Ye mlog_errno(ret); 683e0847717STristan Ye goto out; 684e0847717STristan Ye } 685e0847717STristan Ye } 686e0847717STristan Ye 687e0847717STristan Ye ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1, 688e0847717STristan Ye &context->meta_ac, 689e0847717STristan Ye NULL, extra_blocks, &credits); 690e0847717STristan Ye if (ret) { 691e0847717STristan Ye mlog_errno(ret); 692e0847717STristan Ye goto out; 693e0847717STristan Ye } 694e0847717STristan Ye 695e0847717STristan Ye /* 696e0847717STristan Ye * need to count 2 extra credits for global_bitmap inode and 697e0847717STristan Ye * group descriptor. 698e0847717STristan Ye */ 699e0847717STristan Ye credits += OCFS2_INODE_UPDATE_CREDITS + 1; 700e0847717STristan Ye 701e0847717STristan Ye /* 702e0847717STristan Ye * ocfs2_move_extent() didn't reserve any clusters in lock_allocators() 703e0847717STristan Ye * logic, while we still need to lock the global_bitmap. 704e0847717STristan Ye */ 705e0847717STristan Ye gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE, 706e0847717STristan Ye OCFS2_INVALID_SLOT); 707e0847717STristan Ye if (!gb_inode) { 708e0847717STristan Ye mlog(ML_ERROR, "unable to get global_bitmap inode\n"); 709e0847717STristan Ye ret = -EIO; 710e0847717STristan Ye goto out; 711e0847717STristan Ye } 712e0847717STristan Ye 713e0847717STristan Ye mutex_lock(&gb_inode->i_mutex); 714e0847717STristan Ye 715e0847717STristan Ye ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); 716e0847717STristan Ye if (ret) { 717e0847717STristan Ye mlog_errno(ret); 718e0847717STristan Ye goto out_unlock_gb_mutex; 719e0847717STristan Ye } 720e0847717STristan Ye 721e0847717STristan Ye mutex_lock(&tl_inode->i_mutex); 722e0847717STristan Ye 723e0847717STristan Ye handle = ocfs2_start_trans(osb, credits); 724e0847717STristan Ye if (IS_ERR(handle)) { 725e0847717STristan Ye ret = PTR_ERR(handle); 726e0847717STristan Ye mlog_errno(ret); 727e0847717STristan Ye goto out_unlock_tl_inode; 728e0847717STristan Ye } 729e0847717STristan Ye 730e0847717STristan Ye new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos); 731e0847717STristan Ye ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno, 732e0847717STristan Ye GLOBAL_BITMAP_SYSTEM_INODE, 733e0847717STristan Ye OCFS2_INVALID_SLOT, 734e0847717STristan Ye &goal_bit, &gd_bh); 735e0847717STristan Ye if (ret) { 736e0847717STristan Ye mlog_errno(ret); 737e0847717STristan Ye goto out_commit; 738e0847717STristan Ye } 739e0847717STristan Ye 740e0847717STristan Ye /* 741e0847717STristan Ye * probe the victim cluster group to find a proper 742e0847717STristan Ye * region to fit wanted movement, it even will perfrom 743e0847717STristan Ye * a best-effort attempt by compromising to a threshold 744e0847717STristan Ye * around the goal. 745e0847717STristan Ye */ 746e0847717STristan Ye ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop, 747e0847717STristan Ye new_phys_cpos); 7483d75be7cSDan Carpenter if (!*new_phys_cpos) { 749e0847717STristan Ye ret = -ENOSPC; 750e0847717STristan Ye goto out_commit; 751e0847717STristan Ye } 752e0847717STristan Ye 753e0847717STristan Ye ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos, 754e0847717STristan Ye *new_phys_cpos, ext_flags); 755e0847717STristan Ye if (ret) { 756e0847717STristan Ye mlog_errno(ret); 757e0847717STristan Ye goto out_commit; 758e0847717STristan Ye } 759e0847717STristan Ye 760e0847717STristan Ye gd = (struct ocfs2_group_desc *)gd_bh->b_data; 761e0847717STristan Ye ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len, 762e0847717STristan Ye le16_to_cpu(gd->bg_chain)); 763e0847717STristan Ye if (ret) { 764e0847717STristan Ye mlog_errno(ret); 765e0847717STristan Ye goto out_commit; 766e0847717STristan Ye } 767e0847717STristan Ye 768e0847717STristan Ye ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh, 769e0847717STristan Ye goal_bit, len); 770e0847717STristan Ye if (ret) 771e0847717STristan Ye mlog_errno(ret); 772e0847717STristan Ye 773e0847717STristan Ye /* 774e0847717STristan Ye * Here we should write the new page out first if we are 775e0847717STristan Ye * in write-back mode. 776e0847717STristan Ye */ 777e0847717STristan Ye ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len); 778e0847717STristan Ye if (ret) 779e0847717STristan Ye mlog_errno(ret); 780e0847717STristan Ye 781e0847717STristan Ye out_commit: 782e0847717STristan Ye ocfs2_commit_trans(osb, handle); 783e0847717STristan Ye brelse(gd_bh); 784e0847717STristan Ye 785e0847717STristan Ye out_unlock_tl_inode: 786e0847717STristan Ye mutex_unlock(&tl_inode->i_mutex); 787e0847717STristan Ye 788e0847717STristan Ye ocfs2_inode_unlock(gb_inode, 1); 789e0847717STristan Ye out_unlock_gb_mutex: 790e0847717STristan Ye mutex_unlock(&gb_inode->i_mutex); 791e0847717STristan Ye brelse(gb_bh); 792e0847717STristan Ye iput(gb_inode); 793e0847717STristan Ye 794e0847717STristan Ye out: 795e0847717STristan Ye if (context->meta_ac) { 796e0847717STristan Ye ocfs2_free_alloc_context(context->meta_ac); 797e0847717STristan Ye context->meta_ac = NULL; 798e0847717STristan Ye } 799e0847717STristan Ye 800e0847717STristan Ye if (ref_tree) 801e0847717STristan Ye ocfs2_unlock_refcount_tree(osb, ref_tree, 1); 802e0847717STristan Ye 803e0847717STristan Ye return ret; 804e0847717STristan Ye } 805ee16cc03STristan Ye 806ee16cc03STristan Ye /* 807ee16cc03STristan Ye * Helper to calculate the defraging length in one run according to threshold. 808ee16cc03STristan Ye */ 809ee16cc03STristan Ye static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged, 810ee16cc03STristan Ye u32 threshold, int *skip) 811ee16cc03STristan Ye { 812ee16cc03STristan Ye if ((*alloc_size + *len_defraged) < threshold) { 813ee16cc03STristan Ye /* 814ee16cc03STristan Ye * proceed defragmentation until we meet the thresh 815ee16cc03STristan Ye */ 816ee16cc03STristan Ye *len_defraged += *alloc_size; 817ee16cc03STristan Ye } else if (*len_defraged == 0) { 818ee16cc03STristan Ye /* 819ee16cc03STristan Ye * XXX: skip a large extent. 820ee16cc03STristan Ye */ 821ee16cc03STristan Ye *skip = 1; 822ee16cc03STristan Ye } else { 823ee16cc03STristan Ye /* 824ee16cc03STristan Ye * split this extent to coalesce with former pieces as 825ee16cc03STristan Ye * to reach the threshold. 826ee16cc03STristan Ye * 827ee16cc03STristan Ye * we're done here with one cycle of defragmentation 828ee16cc03STristan Ye * in a size of 'thresh', resetting 'len_defraged' 829ee16cc03STristan Ye * forces a new defragmentation. 830ee16cc03STristan Ye */ 831ee16cc03STristan Ye *alloc_size = threshold - *len_defraged; 832ee16cc03STristan Ye *len_defraged = 0; 833ee16cc03STristan Ye } 834ee16cc03STristan Ye } 83553069d4eSTristan Ye 83653069d4eSTristan Ye static int __ocfs2_move_extents_range(struct buffer_head *di_bh, 83753069d4eSTristan Ye struct ocfs2_move_extents_context *context) 83853069d4eSTristan Ye { 83953069d4eSTristan Ye int ret = 0, flags, do_defrag, skip = 0; 84053069d4eSTristan Ye u32 cpos, phys_cpos, move_start, len_to_move, alloc_size; 84153069d4eSTristan Ye u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0; 84253069d4eSTristan Ye 84353069d4eSTristan Ye struct inode *inode = context->inode; 84453069d4eSTristan Ye struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 84553069d4eSTristan Ye struct ocfs2_move_extents *range = context->range; 84653069d4eSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 84753069d4eSTristan Ye 84853069d4eSTristan Ye if ((inode->i_size == 0) || (range->me_len == 0)) 84953069d4eSTristan Ye return 0; 85053069d4eSTristan Ye 85153069d4eSTristan Ye if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) 85253069d4eSTristan Ye return 0; 85353069d4eSTristan Ye 85453069d4eSTristan Ye context->refcount_loc = le64_to_cpu(di->i_refcount_loc); 85553069d4eSTristan Ye 85653069d4eSTristan Ye ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh); 85753069d4eSTristan Ye ocfs2_init_dealloc_ctxt(&context->dealloc); 85853069d4eSTristan Ye 85953069d4eSTristan Ye /* 86053069d4eSTristan Ye * TO-DO XXX: 86153069d4eSTristan Ye * 86253069d4eSTristan Ye * - xattr extents. 86353069d4eSTristan Ye */ 86453069d4eSTristan Ye 86553069d4eSTristan Ye do_defrag = context->auto_defrag; 86653069d4eSTristan Ye 86753069d4eSTristan Ye /* 86853069d4eSTristan Ye * extents moving happens in unit of clusters, for the sake 86953069d4eSTristan Ye * of simplicity, we may ignore two clusters where 'byte_start' 87053069d4eSTristan Ye * and 'byte_start + len' were within. 87153069d4eSTristan Ye */ 87253069d4eSTristan Ye move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start); 87353069d4eSTristan Ye len_to_move = (range->me_start + range->me_len) >> 87453069d4eSTristan Ye osb->s_clustersize_bits; 87553069d4eSTristan Ye if (len_to_move >= move_start) 87653069d4eSTristan Ye len_to_move -= move_start; 87753069d4eSTristan Ye else 87853069d4eSTristan Ye len_to_move = 0; 87953069d4eSTristan Ye 880dda54e76STristan Ye if (do_defrag) { 88153069d4eSTristan Ye defrag_thresh = range->me_threshold >> osb->s_clustersize_bits; 882dda54e76STristan Ye if (defrag_thresh <= 1) 883dda54e76STristan Ye goto done; 884dda54e76STristan Ye } else 88553069d4eSTristan Ye new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb, 88653069d4eSTristan Ye range->me_goal); 88753069d4eSTristan Ye 88853069d4eSTristan Ye mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, " 88953069d4eSTristan Ye "thresh: %u\n", 89053069d4eSTristan Ye (unsigned long long)OCFS2_I(inode)->ip_blkno, 89153069d4eSTristan Ye (unsigned long long)range->me_start, 89253069d4eSTristan Ye (unsigned long long)range->me_len, 89353069d4eSTristan Ye move_start, len_to_move, defrag_thresh); 89453069d4eSTristan Ye 89553069d4eSTristan Ye cpos = move_start; 89653069d4eSTristan Ye while (len_to_move) { 89753069d4eSTristan Ye ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size, 89853069d4eSTristan Ye &flags); 89953069d4eSTristan Ye if (ret) { 90053069d4eSTristan Ye mlog_errno(ret); 90153069d4eSTristan Ye goto out; 90253069d4eSTristan Ye } 90353069d4eSTristan Ye 90453069d4eSTristan Ye if (alloc_size > len_to_move) 90553069d4eSTristan Ye alloc_size = len_to_move; 90653069d4eSTristan Ye 90753069d4eSTristan Ye /* 90853069d4eSTristan Ye * XXX: how to deal with a hole: 90953069d4eSTristan Ye * 91053069d4eSTristan Ye * - skip the hole of course 91153069d4eSTristan Ye * - force a new defragmentation 91253069d4eSTristan Ye */ 91353069d4eSTristan Ye if (!phys_cpos) { 91453069d4eSTristan Ye if (do_defrag) 91553069d4eSTristan Ye len_defraged = 0; 91653069d4eSTristan Ye 91753069d4eSTristan Ye goto next; 91853069d4eSTristan Ye } 91953069d4eSTristan Ye 92053069d4eSTristan Ye if (do_defrag) { 92153069d4eSTristan Ye ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged, 92253069d4eSTristan Ye defrag_thresh, &skip); 92353069d4eSTristan Ye /* 92453069d4eSTristan Ye * skip large extents 92553069d4eSTristan Ye */ 92653069d4eSTristan Ye if (skip) { 92753069d4eSTristan Ye skip = 0; 92853069d4eSTristan Ye goto next; 92953069d4eSTristan Ye } 93053069d4eSTristan Ye 93153069d4eSTristan Ye mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, " 93253069d4eSTristan Ye "alloc_size: %u, len_defraged: %u\n", 93353069d4eSTristan Ye cpos, phys_cpos, alloc_size, len_defraged); 93453069d4eSTristan Ye 93553069d4eSTristan Ye ret = ocfs2_defrag_extent(context, cpos, phys_cpos, 9364dfa66bdSTristan Ye &alloc_size, flags); 93753069d4eSTristan Ye } else { 93853069d4eSTristan Ye ret = ocfs2_move_extent(context, cpos, phys_cpos, 93953069d4eSTristan Ye &new_phys_cpos, alloc_size, 94053069d4eSTristan Ye flags); 94153069d4eSTristan Ye 94253069d4eSTristan Ye new_phys_cpos += alloc_size; 94353069d4eSTristan Ye } 94453069d4eSTristan Ye 94553069d4eSTristan Ye if (ret < 0) { 94653069d4eSTristan Ye mlog_errno(ret); 94753069d4eSTristan Ye goto out; 94853069d4eSTristan Ye } 94953069d4eSTristan Ye 95053069d4eSTristan Ye context->clusters_moved += alloc_size; 95153069d4eSTristan Ye next: 95253069d4eSTristan Ye cpos += alloc_size; 95353069d4eSTristan Ye len_to_move -= alloc_size; 95453069d4eSTristan Ye } 95553069d4eSTristan Ye 956dda54e76STristan Ye done: 95753069d4eSTristan Ye range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE; 95853069d4eSTristan Ye 95953069d4eSTristan Ye out: 96053069d4eSTristan Ye range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb, 96153069d4eSTristan Ye context->clusters_moved); 96253069d4eSTristan Ye range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb, 96353069d4eSTristan Ye context->new_phys_cpos); 96453069d4eSTristan Ye 96553069d4eSTristan Ye ocfs2_schedule_truncate_log_flush(osb, 1); 96653069d4eSTristan Ye ocfs2_run_deallocs(osb, &context->dealloc); 96753069d4eSTristan Ye 96853069d4eSTristan Ye return ret; 96953069d4eSTristan Ye } 97053069d4eSTristan Ye 97153069d4eSTristan Ye static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) 97253069d4eSTristan Ye { 97353069d4eSTristan Ye int status; 97453069d4eSTristan Ye handle_t *handle; 97553069d4eSTristan Ye struct inode *inode = context->inode; 97653069d4eSTristan Ye struct ocfs2_dinode *di; 97753069d4eSTristan Ye struct buffer_head *di_bh = NULL; 97853069d4eSTristan Ye struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 97953069d4eSTristan Ye 98053069d4eSTristan Ye if (!inode) 98153069d4eSTristan Ye return -ENOENT; 98253069d4eSTristan Ye 98353069d4eSTristan Ye if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) 98453069d4eSTristan Ye return -EROFS; 98553069d4eSTristan Ye 98653069d4eSTristan Ye mutex_lock(&inode->i_mutex); 98753069d4eSTristan Ye 98853069d4eSTristan Ye /* 98953069d4eSTristan Ye * This prevents concurrent writes from other nodes 99053069d4eSTristan Ye */ 99153069d4eSTristan Ye status = ocfs2_rw_lock(inode, 1); 99253069d4eSTristan Ye if (status) { 99353069d4eSTristan Ye mlog_errno(status); 99453069d4eSTristan Ye goto out; 99553069d4eSTristan Ye } 99653069d4eSTristan Ye 99753069d4eSTristan Ye status = ocfs2_inode_lock(inode, &di_bh, 1); 99853069d4eSTristan Ye if (status) { 99953069d4eSTristan Ye mlog_errno(status); 100053069d4eSTristan Ye goto out_rw_unlock; 100153069d4eSTristan Ye } 100253069d4eSTristan Ye 100353069d4eSTristan Ye /* 100453069d4eSTristan Ye * rememer ip_xattr_sem also needs to be held if necessary 100553069d4eSTristan Ye */ 100653069d4eSTristan Ye down_write(&OCFS2_I(inode)->ip_alloc_sem); 100753069d4eSTristan Ye 100853069d4eSTristan Ye status = __ocfs2_move_extents_range(di_bh, context); 100953069d4eSTristan Ye 101053069d4eSTristan Ye up_write(&OCFS2_I(inode)->ip_alloc_sem); 101153069d4eSTristan Ye if (status) { 101253069d4eSTristan Ye mlog_errno(status); 101353069d4eSTristan Ye goto out_inode_unlock; 101453069d4eSTristan Ye } 101553069d4eSTristan Ye 101653069d4eSTristan Ye /* 101753069d4eSTristan Ye * We update ctime for these changes 101853069d4eSTristan Ye */ 101953069d4eSTristan Ye handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 102053069d4eSTristan Ye if (IS_ERR(handle)) { 102153069d4eSTristan Ye status = PTR_ERR(handle); 102253069d4eSTristan Ye mlog_errno(status); 102353069d4eSTristan Ye goto out_inode_unlock; 102453069d4eSTristan Ye } 102553069d4eSTristan Ye 102653069d4eSTristan Ye status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, 102753069d4eSTristan Ye OCFS2_JOURNAL_ACCESS_WRITE); 102853069d4eSTristan Ye if (status) { 102953069d4eSTristan Ye mlog_errno(status); 103053069d4eSTristan Ye goto out_commit; 103153069d4eSTristan Ye } 103253069d4eSTristan Ye 103353069d4eSTristan Ye di = (struct ocfs2_dinode *)di_bh->b_data; 103453069d4eSTristan Ye inode->i_ctime = CURRENT_TIME; 103553069d4eSTristan Ye di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); 103653069d4eSTristan Ye di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 103753069d4eSTristan Ye 103853069d4eSTristan Ye ocfs2_journal_dirty(handle, di_bh); 103953069d4eSTristan Ye 104053069d4eSTristan Ye out_commit: 104153069d4eSTristan Ye ocfs2_commit_trans(osb, handle); 104253069d4eSTristan Ye 104353069d4eSTristan Ye out_inode_unlock: 104453069d4eSTristan Ye brelse(di_bh); 104553069d4eSTristan Ye ocfs2_inode_unlock(inode, 1); 104653069d4eSTristan Ye out_rw_unlock: 104753069d4eSTristan Ye ocfs2_rw_unlock(inode, 1); 104853069d4eSTristan Ye out: 104953069d4eSTristan Ye mutex_unlock(&inode->i_mutex); 105053069d4eSTristan Ye 105153069d4eSTristan Ye return status; 105253069d4eSTristan Ye } 105353069d4eSTristan Ye 105453069d4eSTristan Ye int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp) 105553069d4eSTristan Ye { 105653069d4eSTristan Ye int status; 105753069d4eSTristan Ye 105853069d4eSTristan Ye struct inode *inode = filp->f_path.dentry->d_inode; 105953069d4eSTristan Ye struct ocfs2_move_extents range; 106053069d4eSTristan Ye struct ocfs2_move_extents_context *context = NULL; 106153069d4eSTristan Ye 1062a561be71SAl Viro status = mnt_want_write_file(filp); 106353069d4eSTristan Ye if (status) 106453069d4eSTristan Ye return status; 106553069d4eSTristan Ye 106653069d4eSTristan Ye if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) 106753069d4eSTristan Ye goto out; 106853069d4eSTristan Ye 106953069d4eSTristan Ye if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { 107053069d4eSTristan Ye status = -EPERM; 107153069d4eSTristan Ye goto out; 107253069d4eSTristan Ye } 107353069d4eSTristan Ye 107453069d4eSTristan Ye context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS); 107553069d4eSTristan Ye if (!context) { 107653069d4eSTristan Ye status = -ENOMEM; 107753069d4eSTristan Ye mlog_errno(status); 107853069d4eSTristan Ye goto out; 107953069d4eSTristan Ye } 108053069d4eSTristan Ye 108153069d4eSTristan Ye context->inode = inode; 108253069d4eSTristan Ye context->file = filp; 108353069d4eSTristan Ye 108453069d4eSTristan Ye if (argp) { 1085f6a56903SAl Viro if (copy_from_user(&range, argp, sizeof(range))) { 108653069d4eSTristan Ye status = -EFAULT; 108753069d4eSTristan Ye goto out; 108853069d4eSTristan Ye } 108953069d4eSTristan Ye } else { 109053069d4eSTristan Ye status = -EINVAL; 109153069d4eSTristan Ye goto out; 109253069d4eSTristan Ye } 109353069d4eSTristan Ye 109453069d4eSTristan Ye if (range.me_start > i_size_read(inode)) 109553069d4eSTristan Ye goto out; 109653069d4eSTristan Ye 109753069d4eSTristan Ye if (range.me_start + range.me_len > i_size_read(inode)) 109853069d4eSTristan Ye range.me_len = i_size_read(inode) - range.me_start; 109953069d4eSTristan Ye 110053069d4eSTristan Ye context->range = ⦥ 110153069d4eSTristan Ye 110253069d4eSTristan Ye if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) { 110353069d4eSTristan Ye context->auto_defrag = 1; 110453069d4eSTristan Ye /* 110553069d4eSTristan Ye * ok, the default theshold for the defragmentation 110653069d4eSTristan Ye * is 1M, since our maximum clustersize was 1M also. 110753069d4eSTristan Ye * any thought? 110853069d4eSTristan Ye */ 1109dda54e76STristan Ye if (!range.me_threshold) 111053069d4eSTristan Ye range.me_threshold = 1024 * 1024; 1111dda54e76STristan Ye 1112dda54e76STristan Ye if (range.me_threshold > i_size_read(inode)) 1113dda54e76STristan Ye range.me_threshold = i_size_read(inode); 1114dda54e76STristan Ye 11154dfa66bdSTristan Ye if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG) 11164dfa66bdSTristan Ye context->partial = 1; 111753069d4eSTristan Ye } else { 111853069d4eSTristan Ye /* 111953069d4eSTristan Ye * first best-effort attempt to validate and adjust the goal 112053069d4eSTristan Ye * (physical address in block), while it can't guarantee later 112153069d4eSTristan Ye * operation can succeed all the time since global_bitmap may 112253069d4eSTristan Ye * change a bit over time. 112353069d4eSTristan Ye */ 112453069d4eSTristan Ye 112553069d4eSTristan Ye status = ocfs2_validate_and_adjust_move_goal(inode, &range); 112653069d4eSTristan Ye if (status) 112753069d4eSTristan Ye goto out; 112853069d4eSTristan Ye } 112953069d4eSTristan Ye 113053069d4eSTristan Ye status = ocfs2_move_extents(context); 113153069d4eSTristan Ye if (status) 113253069d4eSTristan Ye mlog_errno(status); 113353069d4eSTristan Ye out: 113453069d4eSTristan Ye /* 113553069d4eSTristan Ye * movement/defragmentation may end up being partially completed, 113653069d4eSTristan Ye * that's the reason why we need to return userspace the finished 113753069d4eSTristan Ye * length and new_offset even if failure happens somewhere. 113853069d4eSTristan Ye */ 113953069d4eSTristan Ye if (argp) { 1140f6a56903SAl Viro if (copy_to_user(argp, &range, sizeof(range))) 114153069d4eSTristan Ye status = -EFAULT; 114253069d4eSTristan Ye } 114353069d4eSTristan Ye 114453069d4eSTristan Ye kfree(context); 114553069d4eSTristan Ye 11462a79f17eSAl Viro mnt_drop_write_file(filp); 114753069d4eSTristan Ye 114853069d4eSTristan Ye return status; 114953069d4eSTristan Ye } 1150