xref: /openbmc/linux/fs/ocfs2/move_extents.c (revision 10fc3a18)
11802d0beSThomas Gleixner // SPDX-License-Identifier: GPL-2.0-only
2fa60ce2cSMasahiro Yamada /*
3028ba5dfSTristan Ye  * move_extents.c
4028ba5dfSTristan Ye  *
5028ba5dfSTristan Ye  * Copyright (C) 2011 Oracle.  All rights reserved.
6028ba5dfSTristan Ye  */
7028ba5dfSTristan Ye #include <linux/fs.h>
8028ba5dfSTristan Ye #include <linux/types.h>
9028ba5dfSTristan Ye #include <linux/mount.h>
10028ba5dfSTristan Ye #include <linux/swap.h>
11028ba5dfSTristan Ye 
12028ba5dfSTristan Ye #include <cluster/masklog.h>
13028ba5dfSTristan Ye 
14028ba5dfSTristan Ye #include "ocfs2.h"
15028ba5dfSTristan Ye #include "ocfs2_ioctl.h"
16028ba5dfSTristan Ye 
17028ba5dfSTristan Ye #include "alloc.h"
186194ae42SLarry Chen #include "localalloc.h"
19028ba5dfSTristan Ye #include "aops.h"
20028ba5dfSTristan Ye #include "dlmglue.h"
21028ba5dfSTristan Ye #include "extent_map.h"
22028ba5dfSTristan Ye #include "inode.h"
23028ba5dfSTristan Ye #include "journal.h"
24028ba5dfSTristan Ye #include "suballoc.h"
25028ba5dfSTristan Ye #include "uptodate.h"
26028ba5dfSTristan Ye #include "super.h"
27028ba5dfSTristan Ye #include "dir.h"
28028ba5dfSTristan Ye #include "buffer_head_io.h"
29028ba5dfSTristan Ye #include "sysfile.h"
30028ba5dfSTristan Ye #include "refcounttree.h"
31028ba5dfSTristan Ye #include "move_extents.h"
32028ba5dfSTristan Ye 
33028ba5dfSTristan Ye struct ocfs2_move_extents_context {
34028ba5dfSTristan Ye 	struct inode *inode;
35028ba5dfSTristan Ye 	struct file *file;
36028ba5dfSTristan Ye 	int auto_defrag;
374dfa66bdSTristan Ye 	int partial;
38028ba5dfSTristan Ye 	int credits;
39028ba5dfSTristan Ye 	u32 new_phys_cpos;
40028ba5dfSTristan Ye 	u32 clusters_moved;
41028ba5dfSTristan Ye 	u64 refcount_loc;
42028ba5dfSTristan Ye 	struct ocfs2_move_extents *range;
43028ba5dfSTristan Ye 	struct ocfs2_extent_tree et;
44028ba5dfSTristan Ye 	struct ocfs2_alloc_context *meta_ac;
45028ba5dfSTristan Ye 	struct ocfs2_alloc_context *data_ac;
46028ba5dfSTristan Ye 	struct ocfs2_cached_dealloc_ctxt dealloc;
47028ba5dfSTristan Ye };
48de474ee8STristan Ye 
/*
 * Move the data of one extent to a new physical location and rewire the
 * extent tree accordingly, all under the caller's journal handle.
 *
 * The clusters [cpos, cpos + len) are first duplicated from p_cpos to
 * new_p_cpos, then the record covering cpos is replaced (through a split)
 * by one pointing at new_p_cpos, and finally the old clusters are given
 * back: through the refcount tree if the extent was refcounted, through the
 * truncate log otherwise.
 *
 * @ext_flags must match the flags of the on-disk record found at @cpos; a
 * mismatch is reported as filesystem corruption.
 *
 * Returns 0 on success or a negative errno.  context->new_phys_cpos is
 * updated once the record has been replaced.
 */
static int __ocfs2_move_extent(handle_t *handle,
			       struct ocfs2_move_extents_context *context,
			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
			       int ext_flags)
{
	int ret = 0, index;
	struct inode *inode = context->inode;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_rec *rec, replace_rec;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el;
	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);

	ret = ocfs2_duplicate_clusters_by_page(handle, inode, cpos,
					       p_cpos, new_p_cpos, len);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	memset(&replace_rec, 0, sizeof(replace_rec));
	replace_rec.e_cpos = cpu_to_le32(cpos);
	replace_rec.e_leaf_clusters = cpu_to_le16(len);
	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
								   new_p_cpos));

	path = ocfs2_new_path_from_et(&context->et);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1) {
		ret = ocfs2_error(inode->i_sb,
				  "Inode %llu has an extent at cpos %u which can no longer be found\n",
				  (unsigned long long)ino, cpos);
		goto out;
	}

	rec = &el->l_recs[index];

	/*
	 * The record comes from disk: if its flags disagree with what the
	 * caller looked up, flag the filesystem as corrupted and unwind
	 * instead of taking the whole machine down.
	 */
	if (ext_flags != rec->e_flags) {
		ret = ocfs2_error(inode->i_sb,
				  "Inode %llu has corrupted extent %d with flags 0x%x at cpos %u\n",
				  (unsigned long long)ino, index,
				  rec->e_flags, cpos);
		goto out;
	}

	/*
	 * after moving/defraging to new location, the extent is not going
	 * to be refcounted anymore.
	 */
	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;

	ret = ocfs2_split_extent(handle, &context->et, path, index,
				 &replace_rec, context->meta_ac,
				 &context->dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	context->new_phys_cpos = new_p_cpos;

	/*
	 * Release the old clusters: drop a reference for a refcounted
	 * extent, otherwise queue them on the truncate log.
	 */
	if (old_blkno) {
		if (ext_flags & OCFS2_EXT_REFCOUNTED)
			ret = ocfs2_decrease_refcount(inode, handle,
					ocfs2_blocks_to_clusters(osb->sb,
								 old_blkno),
					len, context->meta_ac,
					&context->dealloc, 1);
		else
			ret = ocfs2_truncate_log_append(osb, handle,
							old_blkno, len);
	}

	ocfs2_update_inode_fsync_trans(handle, inode, 0);
out:
	ocfs2_free_path(path);
	return ret;
}
1388f603e56STristan Ye 
139de474ee8STristan Ye /*
140e21e5744SLarry Chen  * lock allocator, and reserve appropriate number of bits for
141e21e5744SLarry Chen  * meta blocks.
142de474ee8STristan Ye  */
ocfs2_lock_meta_allocator_move_extents(struct inode * inode,struct ocfs2_extent_tree * et,u32 clusters_to_move,u32 extents_to_split,struct ocfs2_alloc_context ** meta_ac,int extra_blocks,int * credits)143e21e5744SLarry Chen static int ocfs2_lock_meta_allocator_move_extents(struct inode *inode,
144de474ee8STristan Ye 					struct ocfs2_extent_tree *et,
145de474ee8STristan Ye 					u32 clusters_to_move,
146de474ee8STristan Ye 					u32 extents_to_split,
147de474ee8STristan Ye 					struct ocfs2_alloc_context **meta_ac,
148de474ee8STristan Ye 					int extra_blocks,
149de474ee8STristan Ye 					int *credits)
150de474ee8STristan Ye {
151de474ee8STristan Ye 	int ret, num_free_extents;
152de474ee8STristan Ye 	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
153de474ee8STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
154de474ee8STristan Ye 
155964f14a0SJun Piao 	num_free_extents = ocfs2_num_free_extents(et);
156de474ee8STristan Ye 	if (num_free_extents < 0) {
157de474ee8STristan Ye 		ret = num_free_extents;
158de474ee8STristan Ye 		mlog_errno(ret);
159de474ee8STristan Ye 		goto out;
160de474ee8STristan Ye 	}
161de474ee8STristan Ye 
162de474ee8STristan Ye 	if (!num_free_extents ||
163de474ee8STristan Ye 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
164de474ee8STristan Ye 		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
165de474ee8STristan Ye 
166de474ee8STristan Ye 	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
167de474ee8STristan Ye 	if (ret) {
168de474ee8STristan Ye 		mlog_errno(ret);
169de474ee8STristan Ye 		goto out;
170de474ee8STristan Ye 	}
171de474ee8STristan Ye 
172de474ee8STristan Ye 
17306f9da6eSGoldwyn Rodrigues 	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el);
174de474ee8STristan Ye 
175de474ee8STristan Ye 	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
176de474ee8STristan Ye 	     extra_blocks, clusters_to_move, *credits);
177de474ee8STristan Ye out:
178de474ee8STristan Ye 	if (ret) {
179de474ee8STristan Ye 		if (*meta_ac) {
180de474ee8STristan Ye 			ocfs2_free_alloc_context(*meta_ac);
181de474ee8STristan Ye 			*meta_ac = NULL;
182de474ee8STristan Ye 		}
183de474ee8STristan Ye 	}
184de474ee8STristan Ye 
185de474ee8STristan Ye 	return ret;
186de474ee8STristan Ye }
187202ee5faSTristan Ye 
188202ee5faSTristan Ye /*
189202ee5faSTristan Ye  * Using one journal handle to guarantee the data consistency in case
190202ee5faSTristan Ye  * crash happens anywhere.
191dda54e76STristan Ye  *
192dda54e76STristan Ye  *  XXX: defrag can end up with finishing partial extent as requested,
193dda54e76STristan Ye  * due to not enough contiguous clusters can be found in allocator.
194202ee5faSTristan Ye  */
ocfs2_defrag_extent(struct ocfs2_move_extents_context * context,u32 cpos,u32 phys_cpos,u32 * len,int ext_flags)195202ee5faSTristan Ye static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
1964dfa66bdSTristan Ye 			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
197202ee5faSTristan Ye {
1984dfa66bdSTristan Ye 	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
199202ee5faSTristan Ye 	handle_t *handle;
200202ee5faSTristan Ye 	struct inode *inode = context->inode;
201202ee5faSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
202202ee5faSTristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
203202ee5faSTristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
204202ee5faSTristan Ye 	u32 new_phys_cpos, new_len;
205202ee5faSTristan Ye 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
2066194ae42SLarry Chen 	int need_free = 0;
207202ee5faSTristan Ye 
2084dfa66bdSTristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
20984e40080SDarrick J. Wong 		BUG_ON(!ocfs2_is_refcount_inode(inode));
210202ee5faSTristan Ye 		BUG_ON(!context->refcount_loc);
211202ee5faSTristan Ye 
212202ee5faSTristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
213202ee5faSTristan Ye 					       &ref_tree, NULL);
214202ee5faSTristan Ye 		if (ret) {
215202ee5faSTristan Ye 			mlog_errno(ret);
216202ee5faSTristan Ye 			return ret;
217202ee5faSTristan Ye 		}
218202ee5faSTristan Ye 
219202ee5faSTristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
220202ee5faSTristan Ye 							context->refcount_loc,
221202ee5faSTristan Ye 							phys_blkno,
2224dfa66bdSTristan Ye 							*len,
223202ee5faSTristan Ye 							&credits,
224202ee5faSTristan Ye 							&extra_blocks);
225202ee5faSTristan Ye 		if (ret) {
226202ee5faSTristan Ye 			mlog_errno(ret);
227202ee5faSTristan Ye 			goto out;
228202ee5faSTristan Ye 		}
229202ee5faSTristan Ye 	}
230202ee5faSTristan Ye 
231e21e5744SLarry Chen 	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
232e21e5744SLarry Chen 						*len, 1,
233202ee5faSTristan Ye 						&context->meta_ac,
234202ee5faSTristan Ye 						extra_blocks, &credits);
235202ee5faSTristan Ye 	if (ret) {
236202ee5faSTristan Ye 		mlog_errno(ret);
237202ee5faSTristan Ye 		goto out;
238202ee5faSTristan Ye 	}
239202ee5faSTristan Ye 
240202ee5faSTristan Ye 	/*
241202ee5faSTristan Ye 	 * should be using allocation reservation strategy there?
242202ee5faSTristan Ye 	 *
243202ee5faSTristan Ye 	 * if (context->data_ac)
244202ee5faSTristan Ye 	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
245202ee5faSTristan Ye 	 */
246202ee5faSTristan Ye 
2475955102cSAl Viro 	inode_lock(tl_inode);
248202ee5faSTristan Ye 
249202ee5faSTristan Ye 	if (ocfs2_truncate_log_needs_flush(osb)) {
250202ee5faSTristan Ye 		ret = __ocfs2_flush_truncate_log(osb);
251202ee5faSTristan Ye 		if (ret < 0) {
252202ee5faSTristan Ye 			mlog_errno(ret);
253202ee5faSTristan Ye 			goto out_unlock_mutex;
254202ee5faSTristan Ye 		}
255202ee5faSTristan Ye 	}
256202ee5faSTristan Ye 
257e21e5744SLarry Chen 	/*
258e21e5744SLarry Chen 	 * Make sure ocfs2_reserve_cluster is called after
259e21e5744SLarry Chen 	 * __ocfs2_flush_truncate_log, otherwise, dead lock may happen.
260e21e5744SLarry Chen 	 *
261e21e5744SLarry Chen 	 * If ocfs2_reserve_cluster is called
262e21e5744SLarry Chen 	 * before __ocfs2_flush_truncate_log, dead lock on global bitmap
263e21e5744SLarry Chen 	 * may happen.
264e21e5744SLarry Chen 	 *
265e21e5744SLarry Chen 	 */
266e21e5744SLarry Chen 	ret = ocfs2_reserve_clusters(osb, *len, &context->data_ac);
267e21e5744SLarry Chen 	if (ret) {
268e21e5744SLarry Chen 		mlog_errno(ret);
269e21e5744SLarry Chen 		goto out_unlock_mutex;
270e21e5744SLarry Chen 	}
271e21e5744SLarry Chen 
272202ee5faSTristan Ye 	handle = ocfs2_start_trans(osb, credits);
273202ee5faSTristan Ye 	if (IS_ERR(handle)) {
274202ee5faSTristan Ye 		ret = PTR_ERR(handle);
275202ee5faSTristan Ye 		mlog_errno(ret);
276202ee5faSTristan Ye 		goto out_unlock_mutex;
277202ee5faSTristan Ye 	}
278202ee5faSTristan Ye 
2794dfa66bdSTristan Ye 	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
280202ee5faSTristan Ye 				     &new_phys_cpos, &new_len);
281202ee5faSTristan Ye 	if (ret) {
282202ee5faSTristan Ye 		mlog_errno(ret);
283202ee5faSTristan Ye 		goto out_commit;
284202ee5faSTristan Ye 	}
285202ee5faSTristan Ye 
286202ee5faSTristan Ye 	/*
2874dfa66bdSTristan Ye 	 * allowing partial extent moving is kind of 'pros and cons', it makes
2884dfa66bdSTristan Ye 	 * whole defragmentation less likely to fail, on the contrary, the bad
2894dfa66bdSTristan Ye 	 * thing is it may make the fs even more fragmented after moving, let
2904dfa66bdSTristan Ye 	 * userspace make a good decision here.
291202ee5faSTristan Ye 	 */
2924dfa66bdSTristan Ye 	if (new_len != *len) {
2934dfa66bdSTristan Ye 		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
2944dfa66bdSTristan Ye 		if (!partial) {
295202ee5faSTristan Ye 			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
296202ee5faSTristan Ye 			ret = -ENOSPC;
2976194ae42SLarry Chen 			need_free = 1;
298202ee5faSTristan Ye 			goto out_commit;
299202ee5faSTristan Ye 		}
3004dfa66bdSTristan Ye 	}
301202ee5faSTristan Ye 
302202ee5faSTristan Ye 	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
303202ee5faSTristan Ye 	     phys_cpos, new_phys_cpos);
304202ee5faSTristan Ye 
3054dfa66bdSTristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
306202ee5faSTristan Ye 				  new_phys_cpos, ext_flags);
307202ee5faSTristan Ye 	if (ret)
308202ee5faSTristan Ye 		mlog_errno(ret);
309202ee5faSTristan Ye 
3104dfa66bdSTristan Ye 	if (partial && (new_len != *len))
3114dfa66bdSTristan Ye 		*len = new_len;
3124dfa66bdSTristan Ye 
313202ee5faSTristan Ye 	/*
314202ee5faSTristan Ye 	 * Here we should write the new page out first if we are
315202ee5faSTristan Ye 	 * in write-back mode.
316202ee5faSTristan Ye 	 */
3174dfa66bdSTristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
318202ee5faSTristan Ye 	if (ret)
319202ee5faSTristan Ye 		mlog_errno(ret);
320202ee5faSTristan Ye 
321202ee5faSTristan Ye out_commit:
3226194ae42SLarry Chen 	if (need_free && context->data_ac) {
3236194ae42SLarry Chen 		struct ocfs2_alloc_context *data_ac = context->data_ac;
3246194ae42SLarry Chen 
3256194ae42SLarry Chen 		if (context->data_ac->ac_which == OCFS2_AC_USE_LOCAL)
3266194ae42SLarry Chen 			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
3276194ae42SLarry Chen 					new_phys_cpos, new_len);
3286194ae42SLarry Chen 		else
3296194ae42SLarry Chen 			ocfs2_free_clusters(handle,
3306194ae42SLarry Chen 					data_ac->ac_inode,
3316194ae42SLarry Chen 					data_ac->ac_bh,
3326194ae42SLarry Chen 					ocfs2_clusters_to_blocks(osb->sb, new_phys_cpos),
3336194ae42SLarry Chen 					new_len);
3346194ae42SLarry Chen 	}
3356194ae42SLarry Chen 
336202ee5faSTristan Ye 	ocfs2_commit_trans(osb, handle);
337202ee5faSTristan Ye 
338202ee5faSTristan Ye out_unlock_mutex:
3395955102cSAl Viro 	inode_unlock(tl_inode);
340202ee5faSTristan Ye 
341202ee5faSTristan Ye 	if (context->data_ac) {
342202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->data_ac);
343202ee5faSTristan Ye 		context->data_ac = NULL;
344202ee5faSTristan Ye 	}
345202ee5faSTristan Ye 
346202ee5faSTristan Ye 	if (context->meta_ac) {
347202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
348202ee5faSTristan Ye 		context->meta_ac = NULL;
349202ee5faSTristan Ye 	}
350202ee5faSTristan Ye 
351202ee5faSTristan Ye out:
352202ee5faSTristan Ye 	if (ref_tree)
353202ee5faSTristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
354202ee5faSTristan Ye 
355202ee5faSTristan Ye 	return ret;
356202ee5faSTristan Ye }
3571c06b912STristan Ye 
3581c06b912STristan Ye /*
3591c06b912STristan Ye  * find the victim alloc group, where #blkno fits.
3601c06b912STristan Ye  */
ocfs2_find_victim_alloc_group(struct inode * inode,u64 vict_blkno,int type,int slot,int * vict_bit,struct buffer_head ** ret_bh)3611c06b912STristan Ye static int ocfs2_find_victim_alloc_group(struct inode *inode,
3621c06b912STristan Ye 					 u64 vict_blkno,
3631c06b912STristan Ye 					 int type, int slot,
3641c06b912STristan Ye 					 int *vict_bit,
3651c06b912STristan Ye 					 struct buffer_head **ret_bh)
3661c06b912STristan Ye {
3676aea6f50STristan Ye 	int ret, i, bits_per_unit = 0;
3681c06b912STristan Ye 	u64 blkno;
3691c06b912STristan Ye 	char namebuf[40];
3701c06b912STristan Ye 
3711c06b912STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3721c06b912STristan Ye 	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
3731c06b912STristan Ye 	struct ocfs2_chain_list *cl;
3741c06b912STristan Ye 	struct ocfs2_chain_rec *rec;
3751c06b912STristan Ye 	struct ocfs2_dinode *ac_dinode;
3761c06b912STristan Ye 	struct ocfs2_group_desc *bg;
3771c06b912STristan Ye 
3781c06b912STristan Ye 	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
3791c06b912STristan Ye 	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
3801c06b912STristan Ye 					 strlen(namebuf), &blkno);
3811c06b912STristan Ye 	if (ret) {
3821c06b912STristan Ye 		ret = -ENOENT;
3831c06b912STristan Ye 		goto out;
3841c06b912STristan Ye 	}
3851c06b912STristan Ye 
3861c06b912STristan Ye 	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
3871c06b912STristan Ye 	if (ret) {
3881c06b912STristan Ye 		mlog_errno(ret);
3891c06b912STristan Ye 		goto out;
3901c06b912STristan Ye 	}
3911c06b912STristan Ye 
3921c06b912STristan Ye 	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
3931c06b912STristan Ye 	cl = &(ac_dinode->id2.i_chain);
3941c06b912STristan Ye 	rec = &(cl->cl_recs[0]);
3951c06b912STristan Ye 
3961c06b912STristan Ye 	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
3976aea6f50STristan Ye 		bits_per_unit = osb->s_clustersize_bits -
3986aea6f50STristan Ye 					inode->i_sb->s_blocksize_bits;
3991c06b912STristan Ye 	/*
4001c06b912STristan Ye 	 * 'vict_blkno' was out of the valid range.
4011c06b912STristan Ye 	 */
4021c06b912STristan Ye 	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
4037fa05c6eSJoseph Qi 	    (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
4046aea6f50STristan Ye 				bits_per_unit))) {
4051c06b912STristan Ye 		ret = -EINVAL;
4061c06b912STristan Ye 		goto out;
4071c06b912STristan Ye 	}
4081c06b912STristan Ye 
4091c06b912STristan Ye 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
4101c06b912STristan Ye 
4111c06b912STristan Ye 		rec = &(cl->cl_recs[i]);
4121c06b912STristan Ye 		if (!rec)
4131c06b912STristan Ye 			continue;
4141c06b912STristan Ye 
4151c06b912STristan Ye 		bg = NULL;
4161c06b912STristan Ye 
4171c06b912STristan Ye 		do {
4181c06b912STristan Ye 			if (!bg)
4191c06b912STristan Ye 				blkno = le64_to_cpu(rec->c_blkno);
4201c06b912STristan Ye 			else
4211c06b912STristan Ye 				blkno = le64_to_cpu(bg->bg_next_group);
4221c06b912STristan Ye 
4231c06b912STristan Ye 			if (gd_bh) {
4241c06b912STristan Ye 				brelse(gd_bh);
4251c06b912STristan Ye 				gd_bh = NULL;
4261c06b912STristan Ye 			}
4271c06b912STristan Ye 
4281c06b912STristan Ye 			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
4291c06b912STristan Ye 			if (ret) {
4301c06b912STristan Ye 				mlog_errno(ret);
4311c06b912STristan Ye 				goto out;
4321c06b912STristan Ye 			}
4331c06b912STristan Ye 
4341c06b912STristan Ye 			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
4351c06b912STristan Ye 
4361c06b912STristan Ye 			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
437236b9254SHeming Zhao via Ocfs2-devel 						(le16_to_cpu(bg->bg_bits) << bits_per_unit))) {
4381c06b912STristan Ye 
4391c06b912STristan Ye 				*ret_bh = gd_bh;
4406aea6f50STristan Ye 				*vict_bit = (vict_blkno - blkno) >>
4416aea6f50STristan Ye 							bits_per_unit;
4421c06b912STristan Ye 				mlog(0, "find the victim group: #%llu, "
4431c06b912STristan Ye 				     "total_bits: %u, vict_bit: %u\n",
4441c06b912STristan Ye 				     blkno, le16_to_cpu(bg->bg_bits),
4451c06b912STristan Ye 				     *vict_bit);
4461c06b912STristan Ye 				goto out;
4471c06b912STristan Ye 			}
4481c06b912STristan Ye 
4491c06b912STristan Ye 		} while (le64_to_cpu(bg->bg_next_group));
4501c06b912STristan Ye 	}
4511c06b912STristan Ye 
4521c06b912STristan Ye 	ret = -EINVAL;
4531c06b912STristan Ye out:
4541c06b912STristan Ye 	brelse(ac_bh);
4551c06b912STristan Ye 
4561c06b912STristan Ye 	/*
4571c06b912STristan Ye 	 * caller has to release the gd_bh properly.
4581c06b912STristan Ye 	 */
4591c06b912STristan Ye 	return ret;
4601c06b912STristan Ye }
46199e4c750STristan Ye 
46299e4c750STristan Ye /*
46399e4c750STristan Ye  * XXX: helper to validate and adjust moving goal.
46499e4c750STristan Ye  */
ocfs2_validate_and_adjust_move_goal(struct inode * inode,struct ocfs2_move_extents * range)46599e4c750STristan Ye static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
46699e4c750STristan Ye 					       struct ocfs2_move_extents *range)
46799e4c750STristan Ye {
46899e4c750STristan Ye 	int ret, goal_bit = 0;
46999e4c750STristan Ye 
47099e4c750STristan Ye 	struct buffer_head *gd_bh = NULL;
4717f4804d4SDan Carpenter 	struct ocfs2_group_desc *bg;
47299e4c750STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
47399e4c750STristan Ye 	int c_to_b = 1 << (osb->s_clustersize_bits -
47499e4c750STristan Ye 					inode->i_sb->s_blocksize_bits);
47599e4c750STristan Ye 
47699e4c750STristan Ye 	/*
477ea5e1675STristan Ye 	 * make goal become cluster aligned.
478ea5e1675STristan Ye 	 */
479ea5e1675STristan Ye 	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
480ea5e1675STristan Ye 						      range->me_goal);
481ea5e1675STristan Ye 	/*
48299e4c750STristan Ye 	 * validate goal sits within global_bitmap, and return the victim
48399e4c750STristan Ye 	 * group desc
48499e4c750STristan Ye 	 */
48599e4c750STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
48699e4c750STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
48799e4c750STristan Ye 					    OCFS2_INVALID_SLOT,
48899e4c750STristan Ye 					    &goal_bit, &gd_bh);
48999e4c750STristan Ye 	if (ret)
49099e4c750STristan Ye 		goto out;
49199e4c750STristan Ye 
49299e4c750STristan Ye 	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
49399e4c750STristan Ye 
49499e4c750STristan Ye 	/*
4957f4804d4SDan Carpenter 	 * moving goal is not allowd to start with a group desc blok(#0 blk)
4967f4804d4SDan Carpenter 	 * let's compromise to the latter cluster.
4977f4804d4SDan Carpenter 	 */
4987f4804d4SDan Carpenter 	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
4997f4804d4SDan Carpenter 		range->me_goal += c_to_b;
5007f4804d4SDan Carpenter 
5017f4804d4SDan Carpenter 	/*
50299e4c750STristan Ye 	 * movement is not gonna cross two groups.
50399e4c750STristan Ye 	 */
50499e4c750STristan Ye 	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
50599e4c750STristan Ye 								range->me_len) {
50699e4c750STristan Ye 		ret = -EINVAL;
50799e4c750STristan Ye 		goto out;
50899e4c750STristan Ye 	}
50999e4c750STristan Ye 	/*
51099e4c750STristan Ye 	 * more exact validations/adjustments will be performed later during
51199e4c750STristan Ye 	 * moving operation for each extent range.
51299e4c750STristan Ye 	 */
51399e4c750STristan Ye 	mlog(0, "extents get ready to be moved to #%llu block\n",
51499e4c750STristan Ye 	     range->me_goal);
51599e4c750STristan Ye 
51699e4c750STristan Ye out:
51799e4c750STristan Ye 	brelse(gd_bh);
51899e4c750STristan Ye 
51999e4c750STristan Ye 	return ret;
52099e4c750STristan Ye }
521e6b5859cSTristan Ye 
/*
 * Scan the group descriptor in @bh, starting at bit *@goal_bit, for a run of
 * @move_len consecutive free bits.
 *
 * When a run is found, *@goal_bit is set to the first bit of the run and
 * *@phys_cpos to the matching physical cluster.  When none is found, either
 * because a used bit was met more than @max_hop bits past the starting
 * point or because the end of the group was reached, *@phys_cpos is set to
 * 0 so the caller can tell the probe failed.
 */
static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
				    int *goal_bit, u32 move_len, u32 max_hop,
				    u32 *phys_cpos)
{
	int i, used, found = 0, last_free_bits = 0, base_bit = *goal_bit;
	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
						 le64_to_cpu(gd->bg_blkno));

	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {

		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
		if (used) {
			/*
			 * we even tried searching the free chunk by jumping
			 * a 'max_hop' distance, but still failed.
			 */
			if ((i - base_bit) > max_hop)
				break;

			/* a used bit ends the current free run */
			last_free_bits = 0;
			continue;
		}

		last_free_bits++;

		if (last_free_bits == move_len) {
			/*
			 * Bit i is the last bit of the run, so the run
			 * starts last_free_bits - 1 bits earlier.
			 */
			int start = i - last_free_bits + 1;

			*goal_bit = start;
			*phys_cpos = base_cpos + start;
			found = 1;
			break;
		}
	}

	/* never let the caller's initial goal pass for a successful probe */
	if (!found)
		*phys_cpos = 0;

	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
}
5618473aa8aSTristan Ye 
ocfs2_move_extent(struct ocfs2_move_extents_context * context,u32 cpos,u32 phys_cpos,u32 * new_phys_cpos,u32 len,int ext_flags)562e0847717STristan Ye static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
563e0847717STristan Ye 			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
564e0847717STristan Ye 			     u32 len, int ext_flags)
565e0847717STristan Ye {
566e0847717STristan Ye 	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
567e0847717STristan Ye 	handle_t *handle;
568e0847717STristan Ye 	struct inode *inode = context->inode;
569e0847717STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
570e0847717STristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
571e0847717STristan Ye 	struct inode *gb_inode = NULL;
572e0847717STristan Ye 	struct buffer_head *gb_bh = NULL;
573e0847717STristan Ye 	struct buffer_head *gd_bh = NULL;
574e0847717STristan Ye 	struct ocfs2_group_desc *gd;
575e0847717STristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
576e0847717STristan Ye 	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
577e0847717STristan Ye 						    context->range->me_threshold);
578e0847717STristan Ye 	u64 phys_blkno, new_phys_blkno;
579e0847717STristan Ye 
580e0847717STristan Ye 	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
581e0847717STristan Ye 
582e0847717STristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
58384e40080SDarrick J. Wong 		BUG_ON(!ocfs2_is_refcount_inode(inode));
584e0847717STristan Ye 		BUG_ON(!context->refcount_loc);
585e0847717STristan Ye 
586e0847717STristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
587e0847717STristan Ye 					       &ref_tree, NULL);
588e0847717STristan Ye 		if (ret) {
589e0847717STristan Ye 			mlog_errno(ret);
590e0847717STristan Ye 			return ret;
591e0847717STristan Ye 		}
592e0847717STristan Ye 
593e0847717STristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
594e0847717STristan Ye 							context->refcount_loc,
595e0847717STristan Ye 							phys_blkno,
596e0847717STristan Ye 							len,
597e0847717STristan Ye 							&credits,
598e0847717STristan Ye 							&extra_blocks);
599e0847717STristan Ye 		if (ret) {
600e0847717STristan Ye 			mlog_errno(ret);
601e0847717STristan Ye 			goto out;
602e0847717STristan Ye 		}
603e0847717STristan Ye 	}
604e0847717STristan Ye 
605e21e5744SLarry Chen 	ret = ocfs2_lock_meta_allocator_move_extents(inode, &context->et,
606e21e5744SLarry Chen 						len, 1,
607e0847717STristan Ye 						&context->meta_ac,
608e21e5744SLarry Chen 						extra_blocks, &credits);
609e0847717STristan Ye 	if (ret) {
610e0847717STristan Ye 		mlog_errno(ret);
611e0847717STristan Ye 		goto out;
612e0847717STristan Ye 	}
613e0847717STristan Ye 
614e0847717STristan Ye 	/*
615e0847717STristan Ye 	 * need to count 2 extra credits for global_bitmap inode and
616e0847717STristan Ye 	 * group descriptor.
617e0847717STristan Ye 	 */
618e0847717STristan Ye 	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
619e0847717STristan Ye 
620e0847717STristan Ye 	/*
621e0847717STristan Ye 	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
622e0847717STristan Ye 	 * logic, while we still need to lock the global_bitmap.
623e0847717STristan Ye 	 */
624e0847717STristan Ye 	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
625e0847717STristan Ye 					       OCFS2_INVALID_SLOT);
626e0847717STristan Ye 	if (!gb_inode) {
627e0847717STristan Ye 		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
628e0847717STristan Ye 		ret = -EIO;
629e0847717STristan Ye 		goto out;
630e0847717STristan Ye 	}
631e0847717STristan Ye 
6325955102cSAl Viro 	inode_lock(gb_inode);
633e0847717STristan Ye 
634e0847717STristan Ye 	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
635e0847717STristan Ye 	if (ret) {
636e0847717STristan Ye 		mlog_errno(ret);
637e0847717STristan Ye 		goto out_unlock_gb_mutex;
638e0847717STristan Ye 	}
639e0847717STristan Ye 
6405955102cSAl Viro 	inode_lock(tl_inode);
641e0847717STristan Ye 
642e0847717STristan Ye 	handle = ocfs2_start_trans(osb, credits);
643e0847717STristan Ye 	if (IS_ERR(handle)) {
644e0847717STristan Ye 		ret = PTR_ERR(handle);
645e0847717STristan Ye 		mlog_errno(ret);
646e0847717STristan Ye 		goto out_unlock_tl_inode;
647e0847717STristan Ye 	}
648e0847717STristan Ye 
649e0847717STristan Ye 	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
650e0847717STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
651e0847717STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
652e0847717STristan Ye 					    OCFS2_INVALID_SLOT,
653e0847717STristan Ye 					    &goal_bit, &gd_bh);
654e0847717STristan Ye 	if (ret) {
655e0847717STristan Ye 		mlog_errno(ret);
656e0847717STristan Ye 		goto out_commit;
657e0847717STristan Ye 	}
658e0847717STristan Ye 
659e0847717STristan Ye 	/*
660e0847717STristan Ye 	 * probe the victim cluster group to find a proper
661e0847717STristan Ye 	 * region to fit wanted movement, it even will perfrom
662e0847717STristan Ye 	 * a best-effort attempt by compromising to a threshold
663e0847717STristan Ye 	 * around the goal.
664e0847717STristan Ye 	 */
665e0847717STristan Ye 	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
666e0847717STristan Ye 				new_phys_cpos);
6673d75be7cSDan Carpenter 	if (!*new_phys_cpos) {
668e0847717STristan Ye 		ret = -ENOSPC;
669e0847717STristan Ye 		goto out_commit;
670e0847717STristan Ye 	}
671e0847717STristan Ye 
672e0847717STristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
673e0847717STristan Ye 				  *new_phys_cpos, ext_flags);
674e0847717STristan Ye 	if (ret) {
675e0847717STristan Ye 		mlog_errno(ret);
676e0847717STristan Ye 		goto out_commit;
677e0847717STristan Ye 	}
678e0847717STristan Ye 
679e0847717STristan Ye 	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
680e0847717STristan Ye 	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
681e0847717STristan Ye 					       le16_to_cpu(gd->bg_chain));
682e0847717STristan Ye 	if (ret) {
683e0847717STristan Ye 		mlog_errno(ret);
684e0847717STristan Ye 		goto out_commit;
685e0847717STristan Ye 	}
686e0847717STristan Ye 
687e0847717STristan Ye 	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
688e0847717STristan Ye 					 goal_bit, len);
689db66c715SYounger Liu 	if (ret) {
690db66c715SYounger Liu 		ocfs2_rollback_alloc_dinode_counts(gb_inode, gb_bh, len,
691db66c715SYounger Liu 					       le16_to_cpu(gd->bg_chain));
692e0847717STristan Ye 		mlog_errno(ret);
693db66c715SYounger Liu 	}
694e0847717STristan Ye 
695e0847717STristan Ye 	/*
696e0847717STristan Ye 	 * Here we should write the new page out first if we are
697e0847717STristan Ye 	 * in write-back mode.
698e0847717STristan Ye 	 */
699e0847717STristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
700e0847717STristan Ye 	if (ret)
701e0847717STristan Ye 		mlog_errno(ret);
702e0847717STristan Ye 
703e0847717STristan Ye out_commit:
704e0847717STristan Ye 	ocfs2_commit_trans(osb, handle);
705e0847717STristan Ye 	brelse(gd_bh);
706e0847717STristan Ye 
707e0847717STristan Ye out_unlock_tl_inode:
7085955102cSAl Viro 	inode_unlock(tl_inode);
709e0847717STristan Ye 
710e0847717STristan Ye 	ocfs2_inode_unlock(gb_inode, 1);
711e0847717STristan Ye out_unlock_gb_mutex:
7125955102cSAl Viro 	inode_unlock(gb_inode);
713e0847717STristan Ye 	brelse(gb_bh);
714e0847717STristan Ye 	iput(gb_inode);
715e0847717STristan Ye 
716e0847717STristan Ye out:
717e0847717STristan Ye 	if (context->meta_ac) {
718e0847717STristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
719e0847717STristan Ye 		context->meta_ac = NULL;
720e0847717STristan Ye 	}
721e0847717STristan Ye 
722e0847717STristan Ye 	if (ref_tree)
723e0847717STristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
724e0847717STristan Ye 
725e0847717STristan Ye 	return ret;
726e0847717STristan Ye }
727ee16cc03STristan Ye 
/*
 * Work out how much of the current extent joins the running
 * defragmentation cycle, based on the configured threshold.
 *
 * @alloc_size:   in: cluster count of the current extent;
 *                out: trimmed to the part that completes the cycle when
 *                the extent has to be split, otherwise untouched.
 * @len_defraged: clusters gathered so far in this cycle; grows while the
 *                threshold is not reached and is reset to 0 once a cycle
 *                is completed by a split.
 * @threshold:    cycle size in clusters.
 * @skip:         set to 1 when the extent should be left alone; never
 *                cleared here, the caller resets it.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 extent_len = *alloc_size;
	u32 gathered = *len_defraged;

	/* Below the threshold: absorb the whole extent and keep collecting. */
	if (extent_len + gathered < threshold) {
		*len_defraged = gathered + extent_len;
		return;
	}

	/*
	 * XXX: nothing collected yet and this extent alone reaches the
	 * threshold, so it is treated as large enough and skipped.
	 */
	if (gathered == 0) {
		*skip = 1;
		return;
	}

	/*
	 * Split the extent: take just enough clusters to coalesce with the
	 * earlier pieces up to the threshold.  That finishes one cycle, so
	 * the running total is reset to force a new one.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
75753069d4eSTristan Ye 
__ocfs2_move_extents_range(struct buffer_head * di_bh,struct ocfs2_move_extents_context * context)75853069d4eSTristan Ye static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
75953069d4eSTristan Ye 				struct ocfs2_move_extents_context *context)
76053069d4eSTristan Ye {
76153069d4eSTristan Ye 	int ret = 0, flags, do_defrag, skip = 0;
76253069d4eSTristan Ye 	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
76353069d4eSTristan Ye 	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
76453069d4eSTristan Ye 
76553069d4eSTristan Ye 	struct inode *inode = context->inode;
76653069d4eSTristan Ye 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
76753069d4eSTristan Ye 	struct ocfs2_move_extents *range = context->range;
76853069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
76953069d4eSTristan Ye 
770f17c20ddSJunxiao Bi 	if ((i_size_read(inode) == 0) || (range->me_len == 0))
77153069d4eSTristan Ye 		return 0;
77253069d4eSTristan Ye 
77353069d4eSTristan Ye 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
77453069d4eSTristan Ye 		return 0;
77553069d4eSTristan Ye 
77653069d4eSTristan Ye 	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
77753069d4eSTristan Ye 
77853069d4eSTristan Ye 	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
77953069d4eSTristan Ye 	ocfs2_init_dealloc_ctxt(&context->dealloc);
78053069d4eSTristan Ye 
78153069d4eSTristan Ye 	/*
78253069d4eSTristan Ye 	 * TO-DO XXX:
78353069d4eSTristan Ye 	 *
78453069d4eSTristan Ye 	 * - xattr extents.
78553069d4eSTristan Ye 	 */
78653069d4eSTristan Ye 
78753069d4eSTristan Ye 	do_defrag = context->auto_defrag;
78853069d4eSTristan Ye 
78953069d4eSTristan Ye 	/*
79053069d4eSTristan Ye 	 * extents moving happens in unit of clusters, for the sake
79153069d4eSTristan Ye 	 * of simplicity, we may ignore two clusters where 'byte_start'
79253069d4eSTristan Ye 	 * and 'byte_start + len' were within.
79353069d4eSTristan Ye 	 */
79453069d4eSTristan Ye 	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
79553069d4eSTristan Ye 	len_to_move = (range->me_start + range->me_len) >>
79653069d4eSTristan Ye 						osb->s_clustersize_bits;
79753069d4eSTristan Ye 	if (len_to_move >= move_start)
79853069d4eSTristan Ye 		len_to_move -= move_start;
79953069d4eSTristan Ye 	else
80053069d4eSTristan Ye 		len_to_move = 0;
80153069d4eSTristan Ye 
802dda54e76STristan Ye 	if (do_defrag) {
80353069d4eSTristan Ye 		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
804dda54e76STristan Ye 		if (defrag_thresh <= 1)
805dda54e76STristan Ye 			goto done;
806dda54e76STristan Ye 	} else
80753069d4eSTristan Ye 		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
80853069d4eSTristan Ye 							 range->me_goal);
80953069d4eSTristan Ye 
81053069d4eSTristan Ye 	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
81153069d4eSTristan Ye 	     "thresh: %u\n",
81253069d4eSTristan Ye 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
81353069d4eSTristan Ye 	     (unsigned long long)range->me_start,
81453069d4eSTristan Ye 	     (unsigned long long)range->me_len,
81553069d4eSTristan Ye 	     move_start, len_to_move, defrag_thresh);
81653069d4eSTristan Ye 
81753069d4eSTristan Ye 	cpos = move_start;
81853069d4eSTristan Ye 	while (len_to_move) {
81953069d4eSTristan Ye 		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
82053069d4eSTristan Ye 					 &flags);
82153069d4eSTristan Ye 		if (ret) {
82253069d4eSTristan Ye 			mlog_errno(ret);
82353069d4eSTristan Ye 			goto out;
82453069d4eSTristan Ye 		}
82553069d4eSTristan Ye 
82653069d4eSTristan Ye 		if (alloc_size > len_to_move)
82753069d4eSTristan Ye 			alloc_size = len_to_move;
82853069d4eSTristan Ye 
82953069d4eSTristan Ye 		/*
83053069d4eSTristan Ye 		 * XXX: how to deal with a hole:
83153069d4eSTristan Ye 		 *
83253069d4eSTristan Ye 		 * - skip the hole of course
83353069d4eSTristan Ye 		 * - force a new defragmentation
83453069d4eSTristan Ye 		 */
83553069d4eSTristan Ye 		if (!phys_cpos) {
83653069d4eSTristan Ye 			if (do_defrag)
83753069d4eSTristan Ye 				len_defraged = 0;
83853069d4eSTristan Ye 
83953069d4eSTristan Ye 			goto next;
84053069d4eSTristan Ye 		}
84153069d4eSTristan Ye 
84253069d4eSTristan Ye 		if (do_defrag) {
84353069d4eSTristan Ye 			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
84453069d4eSTristan Ye 						     defrag_thresh, &skip);
84553069d4eSTristan Ye 			/*
84653069d4eSTristan Ye 			 * skip large extents
84753069d4eSTristan Ye 			 */
84853069d4eSTristan Ye 			if (skip) {
84953069d4eSTristan Ye 				skip = 0;
85053069d4eSTristan Ye 				goto next;
85153069d4eSTristan Ye 			}
85253069d4eSTristan Ye 
85353069d4eSTristan Ye 			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
85453069d4eSTristan Ye 			     "alloc_size: %u, len_defraged: %u\n",
85553069d4eSTristan Ye 			     cpos, phys_cpos, alloc_size, len_defraged);
85653069d4eSTristan Ye 
85753069d4eSTristan Ye 			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
8584dfa66bdSTristan Ye 						  &alloc_size, flags);
85953069d4eSTristan Ye 		} else {
86053069d4eSTristan Ye 			ret = ocfs2_move_extent(context, cpos, phys_cpos,
86153069d4eSTristan Ye 						&new_phys_cpos, alloc_size,
86253069d4eSTristan Ye 						flags);
86353069d4eSTristan Ye 
86453069d4eSTristan Ye 			new_phys_cpos += alloc_size;
86553069d4eSTristan Ye 		}
86653069d4eSTristan Ye 
86753069d4eSTristan Ye 		if (ret < 0) {
86853069d4eSTristan Ye 			mlog_errno(ret);
86953069d4eSTristan Ye 			goto out;
87053069d4eSTristan Ye 		}
87153069d4eSTristan Ye 
87253069d4eSTristan Ye 		context->clusters_moved += alloc_size;
87353069d4eSTristan Ye next:
87453069d4eSTristan Ye 		cpos += alloc_size;
87553069d4eSTristan Ye 		len_to_move -= alloc_size;
87653069d4eSTristan Ye 	}
87753069d4eSTristan Ye 
878dda54e76STristan Ye done:
87953069d4eSTristan Ye 	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
88053069d4eSTristan Ye 
88153069d4eSTristan Ye out:
88253069d4eSTristan Ye 	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
88353069d4eSTristan Ye 						      context->clusters_moved);
88453069d4eSTristan Ye 	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
88553069d4eSTristan Ye 						       context->new_phys_cpos);
88653069d4eSTristan Ye 
88753069d4eSTristan Ye 	ocfs2_schedule_truncate_log_flush(osb, 1);
88853069d4eSTristan Ye 	ocfs2_run_deallocs(osb, &context->dealloc);
88953069d4eSTristan Ye 
89053069d4eSTristan Ye 	return ret;
89153069d4eSTristan Ye }
89253069d4eSTristan Ye 
ocfs2_move_extents(struct ocfs2_move_extents_context * context)89353069d4eSTristan Ye static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
89453069d4eSTristan Ye {
89553069d4eSTristan Ye 	int status;
89653069d4eSTristan Ye 	handle_t *handle;
89753069d4eSTristan Ye 	struct inode *inode = context->inode;
89853069d4eSTristan Ye 	struct ocfs2_dinode *di;
89953069d4eSTristan Ye 	struct buffer_head *di_bh = NULL;
90053069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
90153069d4eSTristan Ye 
90253069d4eSTristan Ye 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
90353069d4eSTristan Ye 		return -EROFS;
90453069d4eSTristan Ye 
9055955102cSAl Viro 	inode_lock(inode);
90653069d4eSTristan Ye 
90753069d4eSTristan Ye 	/*
90853069d4eSTristan Ye 	 * This prevents concurrent writes from other nodes
90953069d4eSTristan Ye 	 */
91053069d4eSTristan Ye 	status = ocfs2_rw_lock(inode, 1);
91153069d4eSTristan Ye 	if (status) {
91253069d4eSTristan Ye 		mlog_errno(status);
91353069d4eSTristan Ye 		goto out;
91453069d4eSTristan Ye 	}
91553069d4eSTristan Ye 
91653069d4eSTristan Ye 	status = ocfs2_inode_lock(inode, &di_bh, 1);
91753069d4eSTristan Ye 	if (status) {
91853069d4eSTristan Ye 		mlog_errno(status);
91953069d4eSTristan Ye 		goto out_rw_unlock;
92053069d4eSTristan Ye 	}
92153069d4eSTristan Ye 
92253069d4eSTristan Ye 	/*
92353069d4eSTristan Ye 	 * rememer ip_xattr_sem also needs to be held if necessary
92453069d4eSTristan Ye 	 */
92553069d4eSTristan Ye 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
92653069d4eSTristan Ye 
92753069d4eSTristan Ye 	status = __ocfs2_move_extents_range(di_bh, context);
92853069d4eSTristan Ye 
92953069d4eSTristan Ye 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
93053069d4eSTristan Ye 	if (status) {
93153069d4eSTristan Ye 		mlog_errno(status);
93253069d4eSTristan Ye 		goto out_inode_unlock;
93353069d4eSTristan Ye 	}
93453069d4eSTristan Ye 
93553069d4eSTristan Ye 	/*
93653069d4eSTristan Ye 	 * We update ctime for these changes
93753069d4eSTristan Ye 	 */
93853069d4eSTristan Ye 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
93953069d4eSTristan Ye 	if (IS_ERR(handle)) {
94053069d4eSTristan Ye 		status = PTR_ERR(handle);
94153069d4eSTristan Ye 		mlog_errno(status);
94253069d4eSTristan Ye 		goto out_inode_unlock;
94353069d4eSTristan Ye 	}
94453069d4eSTristan Ye 
94553069d4eSTristan Ye 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
94653069d4eSTristan Ye 					 OCFS2_JOURNAL_ACCESS_WRITE);
94753069d4eSTristan Ye 	if (status) {
94853069d4eSTristan Ye 		mlog_errno(status);
94953069d4eSTristan Ye 		goto out_commit;
95053069d4eSTristan Ye 	}
95153069d4eSTristan Ye 
95253069d4eSTristan Ye 	di = (struct ocfs2_dinode *)di_bh->b_data;
9536861de97SJeff Layton 	inode_set_ctime_current(inode);
954*10fc3a18SJeff Layton 	di->i_ctime = cpu_to_le64(inode_get_ctime_sec(inode));
955*10fc3a18SJeff Layton 	di->i_ctime_nsec = cpu_to_le32(inode_get_ctime_nsec(inode));
9566fdb702dSDarrick J. Wong 	ocfs2_update_inode_fsync_trans(handle, inode, 0);
95753069d4eSTristan Ye 
95853069d4eSTristan Ye 	ocfs2_journal_dirty(handle, di_bh);
95953069d4eSTristan Ye 
96053069d4eSTristan Ye out_commit:
96153069d4eSTristan Ye 	ocfs2_commit_trans(osb, handle);
96253069d4eSTristan Ye 
96353069d4eSTristan Ye out_inode_unlock:
96453069d4eSTristan Ye 	brelse(di_bh);
96553069d4eSTristan Ye 	ocfs2_inode_unlock(inode, 1);
96653069d4eSTristan Ye out_rw_unlock:
96753069d4eSTristan Ye 	ocfs2_rw_unlock(inode, 1);
96853069d4eSTristan Ye out:
9695955102cSAl Viro 	inode_unlock(inode);
97053069d4eSTristan Ye 
97153069d4eSTristan Ye 	return status;
97253069d4eSTristan Ye }
97353069d4eSTristan Ye 
/*
 * ioctl entry point for moving or defragmenting the extents of a file.
 *
 * @filp: file the ioctl was issued on; must be a regular file opened for
 *        writing and must not be immutable or append-only.
 * @argp: user pointer to a struct ocfs2_move_extents.  It is read as the
 *        request and written back with the results (flags, moved length,
 *        new offset) once the request has passed range validation, even
 *        if the operation itself fails part-way.
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL  @argp is NULL or me_start lies beyond end of file
 *   -EPERM   not a regular file, not writable, immutable or append-only
 *   -ENOMEM  context allocation failed
 *   -EFAULT  copying the request or the result failed
 * plus whatever mnt_want_write_file(), goal validation or the move
 * itself return.
 */
int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
{
	int status;
	u64 size;

	struct inode *inode = file_inode(filp);
	struct ocfs2_move_extents range;
	struct ocfs2_move_extents_context *context;

	if (!argp)
		return -EINVAL;

	status = mnt_want_write_file(filp);
	if (status)
		return status;

	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE)) {
		status = -EPERM;
		goto out_drop;
	}

	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
		status = -EPERM;
		goto out_drop;
	}

	context = kzalloc(sizeof(*context), GFP_NOFS);
	if (!context) {
		status = -ENOMEM;
		mlog_errno(status);
		goto out_drop;
	}

	context->inode = inode;
	context->file = filp;

	if (copy_from_user(&range, argp, sizeof(range))) {
		status = -EFAULT;
		goto out_free;
	}

	/*
	 * No inode lock is held yet, so the size can change under us.
	 * Sample it once so that every check below works on the same value.
	 */
	size = i_size_read(inode);

	if (range.me_start > size) {
		status = -EINVAL;
		goto out_free;
	}

	/*
	 * Clamp the range to end of file.  me_start <= size holds here, so
	 * the subtraction cannot underflow, and comparing lengths instead of
	 * computing me_start + me_len keeps a huge user-supplied me_len from
	 * wrapping around and slipping past the clamp.
	 */
	if (range.me_len > size - range.me_start)
		range.me_len = size - range.me_start;

	context->range = &range;

	/*
	 * The default threshold for defragmentation is 1M, which is also
	 * the maximum cluster size.  It never exceeds the file size.
	 */
	if (!range.me_threshold)
		range.me_threshold = 1024 * 1024;

	if (range.me_threshold > size)
		range.me_threshold = size;

	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
		context->auto_defrag = 1;

		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
			context->partial = 1;
	} else {
		/*
		 * first best-effort attempt to validate and adjust the goal
		 * (physical address in block), while it can't guarantee later
		 * operation can succeed all the time since global_bitmap may
		 * change a bit over time.
		 */

		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
		if (status)
			goto out_copy;
	}

	status = ocfs2_move_extents(context);
	if (status)
		mlog_errno(status);
out_copy:
	/*
	 * movement/defragmentation may end up being partially completed,
	 * that's the reason why we need to return userspace the finished
	 * length and new_offset even if failure happens somewhere.
	 */
	if (copy_to_user(argp, &range, sizeof(range)))
		status = -EFAULT;

out_free:
	kfree(context);
out_drop:
	mnt_drop_write_file(filp);

	return status;
}
1072