xref: /openbmc/linux/fs/ocfs2/move_extents.c (revision f6a56903)
1028ba5dfSTristan Ye /* -*- mode: c; c-basic-offset: 8; -*-
2028ba5dfSTristan Ye  * vim: noexpandtab sw=8 ts=8 sts=0:
3028ba5dfSTristan Ye  *
4028ba5dfSTristan Ye  * move_extents.c
5028ba5dfSTristan Ye  *
6028ba5dfSTristan Ye  * Copyright (C) 2011 Oracle.  All rights reserved.
7028ba5dfSTristan Ye  *
8028ba5dfSTristan Ye  * This program is free software; you can redistribute it and/or
9028ba5dfSTristan Ye  * modify it under the terms of the GNU General Public
10028ba5dfSTristan Ye  * License version 2 as published by the Free Software Foundation.
11028ba5dfSTristan Ye  *
12028ba5dfSTristan Ye  * This program is distributed in the hope that it will be useful,
13028ba5dfSTristan Ye  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14028ba5dfSTristan Ye  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15028ba5dfSTristan Ye  * General Public License for more details.
16028ba5dfSTristan Ye  */
17028ba5dfSTristan Ye #include <linux/fs.h>
18028ba5dfSTristan Ye #include <linux/types.h>
19028ba5dfSTristan Ye #include <linux/mount.h>
20028ba5dfSTristan Ye #include <linux/swap.h>
21028ba5dfSTristan Ye 
22028ba5dfSTristan Ye #include <cluster/masklog.h>
23028ba5dfSTristan Ye 
24028ba5dfSTristan Ye #include "ocfs2.h"
25028ba5dfSTristan Ye #include "ocfs2_ioctl.h"
26028ba5dfSTristan Ye 
27028ba5dfSTristan Ye #include "alloc.h"
28028ba5dfSTristan Ye #include "aops.h"
29028ba5dfSTristan Ye #include "dlmglue.h"
30028ba5dfSTristan Ye #include "extent_map.h"
31028ba5dfSTristan Ye #include "inode.h"
32028ba5dfSTristan Ye #include "journal.h"
33028ba5dfSTristan Ye #include "suballoc.h"
34028ba5dfSTristan Ye #include "uptodate.h"
35028ba5dfSTristan Ye #include "super.h"
36028ba5dfSTristan Ye #include "dir.h"
37028ba5dfSTristan Ye #include "buffer_head_io.h"
38028ba5dfSTristan Ye #include "sysfile.h"
39028ba5dfSTristan Ye #include "refcounttree.h"
40028ba5dfSTristan Ye #include "move_extents.h"
41028ba5dfSTristan Ye 
42028ba5dfSTristan Ye struct ocfs2_move_extents_context {
43028ba5dfSTristan Ye 	struct inode *inode;
44028ba5dfSTristan Ye 	struct file *file;
45028ba5dfSTristan Ye 	int auto_defrag;
464dfa66bdSTristan Ye 	int partial;
47028ba5dfSTristan Ye 	int credits;
48028ba5dfSTristan Ye 	u32 new_phys_cpos;
49028ba5dfSTristan Ye 	u32 clusters_moved;
50028ba5dfSTristan Ye 	u64 refcount_loc;
51028ba5dfSTristan Ye 	struct ocfs2_move_extents *range;
52028ba5dfSTristan Ye 	struct ocfs2_extent_tree et;
53028ba5dfSTristan Ye 	struct ocfs2_alloc_context *meta_ac;
54028ba5dfSTristan Ye 	struct ocfs2_alloc_context *data_ac;
55028ba5dfSTristan Ye 	struct ocfs2_cached_dealloc_ctxt dealloc;
56028ba5dfSTristan Ye };
57de474ee8STristan Ye 
588f603e56STristan Ye static int __ocfs2_move_extent(handle_t *handle,
598f603e56STristan Ye 			       struct ocfs2_move_extents_context *context,
608f603e56STristan Ye 			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
618f603e56STristan Ye 			       int ext_flags)
628f603e56STristan Ye {
638f603e56STristan Ye 	int ret = 0, index;
648f603e56STristan Ye 	struct inode *inode = context->inode;
658f603e56STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
668f603e56STristan Ye 	struct ocfs2_extent_rec *rec, replace_rec;
678f603e56STristan Ye 	struct ocfs2_path *path = NULL;
688f603e56STristan Ye 	struct ocfs2_extent_list *el;
698f603e56STristan Ye 	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
708f603e56STristan Ye 	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
718f603e56STristan Ye 
728f603e56STristan Ye 	ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
738f603e56STristan Ye 					       p_cpos, new_p_cpos, len);
748f603e56STristan Ye 	if (ret) {
758f603e56STristan Ye 		mlog_errno(ret);
768f603e56STristan Ye 		goto out;
778f603e56STristan Ye 	}
788f603e56STristan Ye 
798f603e56STristan Ye 	memset(&replace_rec, 0, sizeof(replace_rec));
808f603e56STristan Ye 	replace_rec.e_cpos = cpu_to_le32(cpos);
818f603e56STristan Ye 	replace_rec.e_leaf_clusters = cpu_to_le16(len);
828f603e56STristan Ye 	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
838f603e56STristan Ye 								   new_p_cpos));
848f603e56STristan Ye 
858f603e56STristan Ye 	path = ocfs2_new_path_from_et(&context->et);
868f603e56STristan Ye 	if (!path) {
878f603e56STristan Ye 		ret = -ENOMEM;
888f603e56STristan Ye 		mlog_errno(ret);
898f603e56STristan Ye 		goto out;
908f603e56STristan Ye 	}
918f603e56STristan Ye 
928f603e56STristan Ye 	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
938f603e56STristan Ye 	if (ret) {
948f603e56STristan Ye 		mlog_errno(ret);
958f603e56STristan Ye 		goto out;
968f603e56STristan Ye 	}
978f603e56STristan Ye 
988f603e56STristan Ye 	el = path_leaf_el(path);
998f603e56STristan Ye 
1008f603e56STristan Ye 	index = ocfs2_search_extent_list(el, cpos);
1018f603e56STristan Ye 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
1028f603e56STristan Ye 		ocfs2_error(inode->i_sb,
1038f603e56STristan Ye 			    "Inode %llu has an extent at cpos %u which can no "
1048f603e56STristan Ye 			    "longer be found.\n",
1058f603e56STristan Ye 			    (unsigned long long)ino, cpos);
1068f603e56STristan Ye 		ret = -EROFS;
1078f603e56STristan Ye 		goto out;
1088f603e56STristan Ye 	}
1098f603e56STristan Ye 
1108f603e56STristan Ye 	rec = &el->l_recs[index];
1118f603e56STristan Ye 
1128f603e56STristan Ye 	BUG_ON(ext_flags != rec->e_flags);
1138f603e56STristan Ye 	/*
1148f603e56STristan Ye 	 * after moving/defraging to new location, the extent is not going
1158f603e56STristan Ye 	 * to be refcounted anymore.
1168f603e56STristan Ye 	 */
1178f603e56STristan Ye 	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
1188f603e56STristan Ye 
1198f603e56STristan Ye 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
1208f603e56STristan Ye 				      context->et.et_root_bh,
1218f603e56STristan Ye 				      OCFS2_JOURNAL_ACCESS_WRITE);
1228f603e56STristan Ye 	if (ret) {
1238f603e56STristan Ye 		mlog_errno(ret);
1248f603e56STristan Ye 		goto out;
1258f603e56STristan Ye 	}
1268f603e56STristan Ye 
1278f603e56STristan Ye 	ret = ocfs2_split_extent(handle, &context->et, path, index,
1288f603e56STristan Ye 				 &replace_rec, context->meta_ac,
1298f603e56STristan Ye 				 &context->dealloc);
1308f603e56STristan Ye 	if (ret) {
1318f603e56STristan Ye 		mlog_errno(ret);
1328f603e56STristan Ye 		goto out;
1338f603e56STristan Ye 	}
1348f603e56STristan Ye 
1358f603e56STristan Ye 	ocfs2_journal_dirty(handle, context->et.et_root_bh);
1368f603e56STristan Ye 
1378f603e56STristan Ye 	context->new_phys_cpos = new_p_cpos;
1388f603e56STristan Ye 
1398f603e56STristan Ye 	/*
1408f603e56STristan Ye 	 * need I to append truncate log for old clusters?
1418f603e56STristan Ye 	 */
1428f603e56STristan Ye 	if (old_blkno) {
1438f603e56STristan Ye 		if (ext_flags & OCFS2_EXT_REFCOUNTED)
1448f603e56STristan Ye 			ret = ocfs2_decrease_refcount(inode, handle,
1458f603e56STristan Ye 					ocfs2_blocks_to_clusters(osb->sb,
1468f603e56STristan Ye 								 old_blkno),
1478f603e56STristan Ye 					len, context->meta_ac,
1488f603e56STristan Ye 					&context->dealloc, 1);
1498f603e56STristan Ye 		else
1508f603e56STristan Ye 			ret = ocfs2_truncate_log_append(osb, handle,
1518f603e56STristan Ye 							old_blkno, len);
1528f603e56STristan Ye 	}
1538f603e56STristan Ye 
1548f603e56STristan Ye out:
1558f603e56STristan Ye 	return ret;
1568f603e56STristan Ye }
1578f603e56STristan Ye 
158de474ee8STristan Ye /*
159de474ee8STristan Ye  * lock allocators, and reserving appropriate number of bits for
160de474ee8STristan Ye  * meta blocks and data clusters.
161de474ee8STristan Ye  *
162de474ee8STristan Ye  * in some cases, we don't need to reserve clusters, just let data_ac
163de474ee8STristan Ye  * be NULL.
164de474ee8STristan Ye  */
165de474ee8STristan Ye static int ocfs2_lock_allocators_move_extents(struct inode *inode,
166de474ee8STristan Ye 					struct ocfs2_extent_tree *et,
167de474ee8STristan Ye 					u32 clusters_to_move,
168de474ee8STristan Ye 					u32 extents_to_split,
169de474ee8STristan Ye 					struct ocfs2_alloc_context **meta_ac,
170de474ee8STristan Ye 					struct ocfs2_alloc_context **data_ac,
171de474ee8STristan Ye 					int extra_blocks,
172de474ee8STristan Ye 					int *credits)
173de474ee8STristan Ye {
174de474ee8STristan Ye 	int ret, num_free_extents;
175de474ee8STristan Ye 	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
176de474ee8STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
177de474ee8STristan Ye 
178de474ee8STristan Ye 	num_free_extents = ocfs2_num_free_extents(osb, et);
179de474ee8STristan Ye 	if (num_free_extents < 0) {
180de474ee8STristan Ye 		ret = num_free_extents;
181de474ee8STristan Ye 		mlog_errno(ret);
182de474ee8STristan Ye 		goto out;
183de474ee8STristan Ye 	}
184de474ee8STristan Ye 
185de474ee8STristan Ye 	if (!num_free_extents ||
186de474ee8STristan Ye 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
187de474ee8STristan Ye 		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
188de474ee8STristan Ye 
189de474ee8STristan Ye 	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
190de474ee8STristan Ye 	if (ret) {
191de474ee8STristan Ye 		mlog_errno(ret);
192de474ee8STristan Ye 		goto out;
193de474ee8STristan Ye 	}
194de474ee8STristan Ye 
195de474ee8STristan Ye 	if (data_ac) {
196de474ee8STristan Ye 		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
197de474ee8STristan Ye 		if (ret) {
198de474ee8STristan Ye 			mlog_errno(ret);
199de474ee8STristan Ye 			goto out;
200de474ee8STristan Ye 		}
201de474ee8STristan Ye 	}
202de474ee8STristan Ye 
203de474ee8STristan Ye 	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
204de474ee8STristan Ye 					      clusters_to_move + 2);
205de474ee8STristan Ye 
206de474ee8STristan Ye 	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
207de474ee8STristan Ye 	     extra_blocks, clusters_to_move, *credits);
208de474ee8STristan Ye out:
209de474ee8STristan Ye 	if (ret) {
210de474ee8STristan Ye 		if (*meta_ac) {
211de474ee8STristan Ye 			ocfs2_free_alloc_context(*meta_ac);
212de474ee8STristan Ye 			*meta_ac = NULL;
213de474ee8STristan Ye 		}
214de474ee8STristan Ye 	}
215de474ee8STristan Ye 
216de474ee8STristan Ye 	return ret;
217de474ee8STristan Ye }
218202ee5faSTristan Ye 
219202ee5faSTristan Ye /*
220202ee5faSTristan Ye  * Using one journal handle to guarantee the data consistency in case
221202ee5faSTristan Ye  * crash happens anywhere.
222dda54e76STristan Ye  *
223dda54e76STristan Ye  *  XXX: defrag can end up with finishing partial extent as requested,
224dda54e76STristan Ye  * due to not enough contiguous clusters can be found in allocator.
225202ee5faSTristan Ye  */
226202ee5faSTristan Ye static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
2274dfa66bdSTristan Ye 			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
228202ee5faSTristan Ye {
2294dfa66bdSTristan Ye 	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
230202ee5faSTristan Ye 	handle_t *handle;
231202ee5faSTristan Ye 	struct inode *inode = context->inode;
232202ee5faSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
233202ee5faSTristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
234202ee5faSTristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
235202ee5faSTristan Ye 	u32 new_phys_cpos, new_len;
236202ee5faSTristan Ye 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
237202ee5faSTristan Ye 
2384dfa66bdSTristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
239202ee5faSTristan Ye 
240202ee5faSTristan Ye 		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
241202ee5faSTristan Ye 			 OCFS2_HAS_REFCOUNT_FL));
242202ee5faSTristan Ye 
243202ee5faSTristan Ye 		BUG_ON(!context->refcount_loc);
244202ee5faSTristan Ye 
245202ee5faSTristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
246202ee5faSTristan Ye 					       &ref_tree, NULL);
247202ee5faSTristan Ye 		if (ret) {
248202ee5faSTristan Ye 			mlog_errno(ret);
249202ee5faSTristan Ye 			return ret;
250202ee5faSTristan Ye 		}
251202ee5faSTristan Ye 
252202ee5faSTristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
253202ee5faSTristan Ye 							context->refcount_loc,
254202ee5faSTristan Ye 							phys_blkno,
2554dfa66bdSTristan Ye 							*len,
256202ee5faSTristan Ye 							&credits,
257202ee5faSTristan Ye 							&extra_blocks);
258202ee5faSTristan Ye 		if (ret) {
259202ee5faSTristan Ye 			mlog_errno(ret);
260202ee5faSTristan Ye 			goto out;
261202ee5faSTristan Ye 		}
262202ee5faSTristan Ye 	}
263202ee5faSTristan Ye 
2644dfa66bdSTristan Ye 	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
265202ee5faSTristan Ye 						 &context->meta_ac,
266202ee5faSTristan Ye 						 &context->data_ac,
267202ee5faSTristan Ye 						 extra_blocks, &credits);
268202ee5faSTristan Ye 	if (ret) {
269202ee5faSTristan Ye 		mlog_errno(ret);
270202ee5faSTristan Ye 		goto out;
271202ee5faSTristan Ye 	}
272202ee5faSTristan Ye 
273202ee5faSTristan Ye 	/*
274202ee5faSTristan Ye 	 * should be using allocation reservation strategy there?
275202ee5faSTristan Ye 	 *
276202ee5faSTristan Ye 	 * if (context->data_ac)
277202ee5faSTristan Ye 	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
278202ee5faSTristan Ye 	 */
279202ee5faSTristan Ye 
280202ee5faSTristan Ye 	mutex_lock(&tl_inode->i_mutex);
281202ee5faSTristan Ye 
282202ee5faSTristan Ye 	if (ocfs2_truncate_log_needs_flush(osb)) {
283202ee5faSTristan Ye 		ret = __ocfs2_flush_truncate_log(osb);
284202ee5faSTristan Ye 		if (ret < 0) {
285202ee5faSTristan Ye 			mlog_errno(ret);
286202ee5faSTristan Ye 			goto out_unlock_mutex;
287202ee5faSTristan Ye 		}
288202ee5faSTristan Ye 	}
289202ee5faSTristan Ye 
290202ee5faSTristan Ye 	handle = ocfs2_start_trans(osb, credits);
291202ee5faSTristan Ye 	if (IS_ERR(handle)) {
292202ee5faSTristan Ye 		ret = PTR_ERR(handle);
293202ee5faSTristan Ye 		mlog_errno(ret);
294202ee5faSTristan Ye 		goto out_unlock_mutex;
295202ee5faSTristan Ye 	}
296202ee5faSTristan Ye 
2974dfa66bdSTristan Ye 	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
298202ee5faSTristan Ye 				     &new_phys_cpos, &new_len);
299202ee5faSTristan Ye 	if (ret) {
300202ee5faSTristan Ye 		mlog_errno(ret);
301202ee5faSTristan Ye 		goto out_commit;
302202ee5faSTristan Ye 	}
303202ee5faSTristan Ye 
304202ee5faSTristan Ye 	/*
3054dfa66bdSTristan Ye 	 * allowing partial extent moving is kind of 'pros and cons', it makes
3064dfa66bdSTristan Ye 	 * whole defragmentation less likely to fail, on the contrary, the bad
3074dfa66bdSTristan Ye 	 * thing is it may make the fs even more fragmented after moving, let
3084dfa66bdSTristan Ye 	 * userspace make a good decision here.
309202ee5faSTristan Ye 	 */
3104dfa66bdSTristan Ye 	if (new_len != *len) {
3114dfa66bdSTristan Ye 		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
3124dfa66bdSTristan Ye 		if (!partial) {
313202ee5faSTristan Ye 			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
314202ee5faSTristan Ye 			ret = -ENOSPC;
315202ee5faSTristan Ye 			goto out_commit;
316202ee5faSTristan Ye 		}
3174dfa66bdSTristan Ye 	}
318202ee5faSTristan Ye 
319202ee5faSTristan Ye 	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
320202ee5faSTristan Ye 	     phys_cpos, new_phys_cpos);
321202ee5faSTristan Ye 
3224dfa66bdSTristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
323202ee5faSTristan Ye 				  new_phys_cpos, ext_flags);
324202ee5faSTristan Ye 	if (ret)
325202ee5faSTristan Ye 		mlog_errno(ret);
326202ee5faSTristan Ye 
3274dfa66bdSTristan Ye 	if (partial && (new_len != *len))
3284dfa66bdSTristan Ye 		*len = new_len;
3294dfa66bdSTristan Ye 
330202ee5faSTristan Ye 	/*
331202ee5faSTristan Ye 	 * Here we should write the new page out first if we are
332202ee5faSTristan Ye 	 * in write-back mode.
333202ee5faSTristan Ye 	 */
3344dfa66bdSTristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
335202ee5faSTristan Ye 	if (ret)
336202ee5faSTristan Ye 		mlog_errno(ret);
337202ee5faSTristan Ye 
338202ee5faSTristan Ye out_commit:
339202ee5faSTristan Ye 	ocfs2_commit_trans(osb, handle);
340202ee5faSTristan Ye 
341202ee5faSTristan Ye out_unlock_mutex:
342202ee5faSTristan Ye 	mutex_unlock(&tl_inode->i_mutex);
343202ee5faSTristan Ye 
344202ee5faSTristan Ye 	if (context->data_ac) {
345202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->data_ac);
346202ee5faSTristan Ye 		context->data_ac = NULL;
347202ee5faSTristan Ye 	}
348202ee5faSTristan Ye 
349202ee5faSTristan Ye 	if (context->meta_ac) {
350202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
351202ee5faSTristan Ye 		context->meta_ac = NULL;
352202ee5faSTristan Ye 	}
353202ee5faSTristan Ye 
354202ee5faSTristan Ye out:
355202ee5faSTristan Ye 	if (ref_tree)
356202ee5faSTristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
357202ee5faSTristan Ye 
358202ee5faSTristan Ye 	return ret;
359202ee5faSTristan Ye }
3601c06b912STristan Ye 
3611c06b912STristan Ye /*
3621c06b912STristan Ye  * find the victim alloc group, where #blkno fits.
3631c06b912STristan Ye  */
3641c06b912STristan Ye static int ocfs2_find_victim_alloc_group(struct inode *inode,
3651c06b912STristan Ye 					 u64 vict_blkno,
3661c06b912STristan Ye 					 int type, int slot,
3671c06b912STristan Ye 					 int *vict_bit,
3681c06b912STristan Ye 					 struct buffer_head **ret_bh)
3691c06b912STristan Ye {
3706aea6f50STristan Ye 	int ret, i, bits_per_unit = 0;
3711c06b912STristan Ye 	u64 blkno;
3721c06b912STristan Ye 	char namebuf[40];
3731c06b912STristan Ye 
3741c06b912STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3751c06b912STristan Ye 	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
3761c06b912STristan Ye 	struct ocfs2_chain_list *cl;
3771c06b912STristan Ye 	struct ocfs2_chain_rec *rec;
3781c06b912STristan Ye 	struct ocfs2_dinode *ac_dinode;
3791c06b912STristan Ye 	struct ocfs2_group_desc *bg;
3801c06b912STristan Ye 
3811c06b912STristan Ye 	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
3821c06b912STristan Ye 	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
3831c06b912STristan Ye 					 strlen(namebuf), &blkno);
3841c06b912STristan Ye 	if (ret) {
3851c06b912STristan Ye 		ret = -ENOENT;
3861c06b912STristan Ye 		goto out;
3871c06b912STristan Ye 	}
3881c06b912STristan Ye 
3891c06b912STristan Ye 	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
3901c06b912STristan Ye 	if (ret) {
3911c06b912STristan Ye 		mlog_errno(ret);
3921c06b912STristan Ye 		goto out;
3931c06b912STristan Ye 	}
3941c06b912STristan Ye 
3951c06b912STristan Ye 	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
3961c06b912STristan Ye 	cl = &(ac_dinode->id2.i_chain);
3971c06b912STristan Ye 	rec = &(cl->cl_recs[0]);
3981c06b912STristan Ye 
3991c06b912STristan Ye 	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
4006aea6f50STristan Ye 		bits_per_unit = osb->s_clustersize_bits -
4016aea6f50STristan Ye 					inode->i_sb->s_blocksize_bits;
4021c06b912STristan Ye 	/*
4031c06b912STristan Ye 	 * 'vict_blkno' was out of the valid range.
4041c06b912STristan Ye 	 */
4051c06b912STristan Ye 	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
4066aea6f50STristan Ye 	    (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
4076aea6f50STristan Ye 				bits_per_unit))) {
4081c06b912STristan Ye 		ret = -EINVAL;
4091c06b912STristan Ye 		goto out;
4101c06b912STristan Ye 	}
4111c06b912STristan Ye 
4121c06b912STristan Ye 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
4131c06b912STristan Ye 
4141c06b912STristan Ye 		rec = &(cl->cl_recs[i]);
4151c06b912STristan Ye 		if (!rec)
4161c06b912STristan Ye 			continue;
4171c06b912STristan Ye 
4181c06b912STristan Ye 		bg = NULL;
4191c06b912STristan Ye 
4201c06b912STristan Ye 		do {
4211c06b912STristan Ye 			if (!bg)
4221c06b912STristan Ye 				blkno = le64_to_cpu(rec->c_blkno);
4231c06b912STristan Ye 			else
4241c06b912STristan Ye 				blkno = le64_to_cpu(bg->bg_next_group);
4251c06b912STristan Ye 
4261c06b912STristan Ye 			if (gd_bh) {
4271c06b912STristan Ye 				brelse(gd_bh);
4281c06b912STristan Ye 				gd_bh = NULL;
4291c06b912STristan Ye 			}
4301c06b912STristan Ye 
4311c06b912STristan Ye 			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
4321c06b912STristan Ye 			if (ret) {
4331c06b912STristan Ye 				mlog_errno(ret);
4341c06b912STristan Ye 				goto out;
4351c06b912STristan Ye 			}
4361c06b912STristan Ye 
4371c06b912STristan Ye 			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
4381c06b912STristan Ye 
4391c06b912STristan Ye 			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
4401c06b912STristan Ye 						le16_to_cpu(bg->bg_bits))) {
4411c06b912STristan Ye 
4421c06b912STristan Ye 				*ret_bh = gd_bh;
4436aea6f50STristan Ye 				*vict_bit = (vict_blkno - blkno) >>
4446aea6f50STristan Ye 							bits_per_unit;
4451c06b912STristan Ye 				mlog(0, "find the victim group: #%llu, "
4461c06b912STristan Ye 				     "total_bits: %u, vict_bit: %u\n",
4471c06b912STristan Ye 				     blkno, le16_to_cpu(bg->bg_bits),
4481c06b912STristan Ye 				     *vict_bit);
4491c06b912STristan Ye 				goto out;
4501c06b912STristan Ye 			}
4511c06b912STristan Ye 
4521c06b912STristan Ye 		} while (le64_to_cpu(bg->bg_next_group));
4531c06b912STristan Ye 	}
4541c06b912STristan Ye 
4551c06b912STristan Ye 	ret = -EINVAL;
4561c06b912STristan Ye out:
4571c06b912STristan Ye 	brelse(ac_bh);
4581c06b912STristan Ye 
4591c06b912STristan Ye 	/*
4601c06b912STristan Ye 	 * caller has to release the gd_bh properly.
4611c06b912STristan Ye 	 */
4621c06b912STristan Ye 	return ret;
4631c06b912STristan Ye }
46499e4c750STristan Ye 
46599e4c750STristan Ye /*
46699e4c750STristan Ye  * XXX: helper to validate and adjust moving goal.
46799e4c750STristan Ye  */
46899e4c750STristan Ye static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
46999e4c750STristan Ye 					       struct ocfs2_move_extents *range)
47099e4c750STristan Ye {
47199e4c750STristan Ye 	int ret, goal_bit = 0;
47299e4c750STristan Ye 
47399e4c750STristan Ye 	struct buffer_head *gd_bh = NULL;
474ea5e1675STristan Ye 	struct ocfs2_group_desc *bg = NULL;
47599e4c750STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
47699e4c750STristan Ye 	int c_to_b = 1 << (osb->s_clustersize_bits -
47799e4c750STristan Ye 					inode->i_sb->s_blocksize_bits);
47899e4c750STristan Ye 
47999e4c750STristan Ye 	/*
480ea5e1675STristan Ye 	 * make goal become cluster aligned.
481ea5e1675STristan Ye 	 */
482ea5e1675STristan Ye 	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
483ea5e1675STristan Ye 						      range->me_goal);
484ea5e1675STristan Ye 	/*
485ea5e1675STristan Ye 	 * moving goal is not allowd to start with a group desc blok(#0 blk)
486ea5e1675STristan Ye 	 * let's compromise to the latter cluster.
487ea5e1675STristan Ye 	 */
488ea5e1675STristan Ye 	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
489ea5e1675STristan Ye 		range->me_goal += c_to_b;
490ea5e1675STristan Ye 
491ea5e1675STristan Ye 	/*
49299e4c750STristan Ye 	 * validate goal sits within global_bitmap, and return the victim
49399e4c750STristan Ye 	 * group desc
49499e4c750STristan Ye 	 */
49599e4c750STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
49699e4c750STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
49799e4c750STristan Ye 					    OCFS2_INVALID_SLOT,
49899e4c750STristan Ye 					    &goal_bit, &gd_bh);
49999e4c750STristan Ye 	if (ret)
50099e4c750STristan Ye 		goto out;
50199e4c750STristan Ye 
50299e4c750STristan Ye 	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
50399e4c750STristan Ye 
50499e4c750STristan Ye 	/*
50599e4c750STristan Ye 	 * movement is not gonna cross two groups.
50699e4c750STristan Ye 	 */
50799e4c750STristan Ye 	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
50899e4c750STristan Ye 								range->me_len) {
50999e4c750STristan Ye 		ret = -EINVAL;
51099e4c750STristan Ye 		goto out;
51199e4c750STristan Ye 	}
51299e4c750STristan Ye 	/*
51399e4c750STristan Ye 	 * more exact validations/adjustments will be performed later during
51499e4c750STristan Ye 	 * moving operation for each extent range.
51599e4c750STristan Ye 	 */
51699e4c750STristan Ye 	mlog(0, "extents get ready to be moved to #%llu block\n",
51799e4c750STristan Ye 	     range->me_goal);
51899e4c750STristan Ye 
51999e4c750STristan Ye out:
52099e4c750STristan Ye 	brelse(gd_bh);
52199e4c750STristan Ye 
52299e4c750STristan Ye 	return ret;
52399e4c750STristan Ye }
524e6b5859cSTristan Ye 
525e6b5859cSTristan Ye static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
526e6b5859cSTristan Ye 				    int *goal_bit, u32 move_len, u32 max_hop,
527e6b5859cSTristan Ye 				    u32 *phys_cpos)
528e6b5859cSTristan Ye {
529e6b5859cSTristan Ye 	int i, used, last_free_bits = 0, base_bit = *goal_bit;
530e6b5859cSTristan Ye 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
531e6b5859cSTristan Ye 	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
532e6b5859cSTristan Ye 						 le64_to_cpu(gd->bg_blkno));
533e6b5859cSTristan Ye 
534e6b5859cSTristan Ye 	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
535e6b5859cSTristan Ye 
536e6b5859cSTristan Ye 		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
537e6b5859cSTristan Ye 		if (used) {
538e6b5859cSTristan Ye 			/*
539e6b5859cSTristan Ye 			 * we even tried searching the free chunk by jumping
540e6b5859cSTristan Ye 			 * a 'max_hop' distance, but still failed.
541e6b5859cSTristan Ye 			 */
542e6b5859cSTristan Ye 			if ((i - base_bit) > max_hop) {
543e6b5859cSTristan Ye 				*phys_cpos = 0;
544e6b5859cSTristan Ye 				break;
545e6b5859cSTristan Ye 			}
546e6b5859cSTristan Ye 
547e6b5859cSTristan Ye 			if (last_free_bits)
548e6b5859cSTristan Ye 				last_free_bits = 0;
549e6b5859cSTristan Ye 
550e6b5859cSTristan Ye 			continue;
551e6b5859cSTristan Ye 		} else
552e6b5859cSTristan Ye 			last_free_bits++;
553e6b5859cSTristan Ye 
554e6b5859cSTristan Ye 		if (last_free_bits == move_len) {
555e6b5859cSTristan Ye 			*goal_bit = i;
556e6b5859cSTristan Ye 			*phys_cpos = base_cpos + i;
557e6b5859cSTristan Ye 			break;
558e6b5859cSTristan Ye 		}
559e6b5859cSTristan Ye 	}
560e6b5859cSTristan Ye 
561e6b5859cSTristan Ye 	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
562e6b5859cSTristan Ye }
5638473aa8aSTristan Ye 
5648473aa8aSTristan Ye static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
5658473aa8aSTristan Ye 				       handle_t *handle,
5668473aa8aSTristan Ye 				       struct buffer_head *di_bh,
5678473aa8aSTristan Ye 				       u32 num_bits,
5688473aa8aSTristan Ye 				       u16 chain)
5698473aa8aSTristan Ye {
5708473aa8aSTristan Ye 	int ret;
5718473aa8aSTristan Ye 	u32 tmp_used;
5728473aa8aSTristan Ye 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
5738473aa8aSTristan Ye 	struct ocfs2_chain_list *cl =
5748473aa8aSTristan Ye 				(struct ocfs2_chain_list *) &di->id2.i_chain;
5758473aa8aSTristan Ye 
5768473aa8aSTristan Ye 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
5778473aa8aSTristan Ye 				      OCFS2_JOURNAL_ACCESS_WRITE);
5788473aa8aSTristan Ye 	if (ret < 0) {
5798473aa8aSTristan Ye 		mlog_errno(ret);
5808473aa8aSTristan Ye 		goto out;
5818473aa8aSTristan Ye 	}
5828473aa8aSTristan Ye 
5838473aa8aSTristan Ye 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
5848473aa8aSTristan Ye 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
5858473aa8aSTristan Ye 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
5868473aa8aSTristan Ye 	ocfs2_journal_dirty(handle, di_bh);
5878473aa8aSTristan Ye 
5888473aa8aSTristan Ye out:
5898473aa8aSTristan Ye 	return ret;
5908473aa8aSTristan Ye }
5918473aa8aSTristan Ye 
5928473aa8aSTristan Ye static inline int ocfs2_block_group_set_bits(handle_t *handle,
5938473aa8aSTristan Ye 					     struct inode *alloc_inode,
5948473aa8aSTristan Ye 					     struct ocfs2_group_desc *bg,
5958473aa8aSTristan Ye 					     struct buffer_head *group_bh,
5968473aa8aSTristan Ye 					     unsigned int bit_off,
5978473aa8aSTristan Ye 					     unsigned int num_bits)
5988473aa8aSTristan Ye {
5998473aa8aSTristan Ye 	int status;
6008473aa8aSTristan Ye 	void *bitmap = bg->bg_bitmap;
6018473aa8aSTristan Ye 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
6028473aa8aSTristan Ye 
6038473aa8aSTristan Ye 	/* All callers get the descriptor via
6048473aa8aSTristan Ye 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
6058473aa8aSTristan Ye 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
6068473aa8aSTristan Ye 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
6078473aa8aSTristan Ye 
6088473aa8aSTristan Ye 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
6098473aa8aSTristan Ye 	     num_bits);
6108473aa8aSTristan Ye 
6118473aa8aSTristan Ye 	if (ocfs2_is_cluster_bitmap(alloc_inode))
6128473aa8aSTristan Ye 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
6138473aa8aSTristan Ye 
6148473aa8aSTristan Ye 	status = ocfs2_journal_access_gd(handle,
6158473aa8aSTristan Ye 					 INODE_CACHE(alloc_inode),
6168473aa8aSTristan Ye 					 group_bh,
6178473aa8aSTristan Ye 					 journal_type);
6188473aa8aSTristan Ye 	if (status < 0) {
6198473aa8aSTristan Ye 		mlog_errno(status);
6208473aa8aSTristan Ye 		goto bail;
6218473aa8aSTristan Ye 	}
6228473aa8aSTristan Ye 
6238473aa8aSTristan Ye 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
6248473aa8aSTristan Ye 	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
6258473aa8aSTristan Ye 		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
6268473aa8aSTristan Ye 			    " count %u but claims %u are freed. num_bits %d",
6278473aa8aSTristan Ye 			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
6288473aa8aSTristan Ye 			    le16_to_cpu(bg->bg_bits),
6298473aa8aSTristan Ye 			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
6308473aa8aSTristan Ye 		return -EROFS;
6318473aa8aSTristan Ye 	}
6328473aa8aSTristan Ye 	while (num_bits--)
6338473aa8aSTristan Ye 		ocfs2_set_bit(bit_off++, bitmap);
6348473aa8aSTristan Ye 
6358473aa8aSTristan Ye 	ocfs2_journal_dirty(handle, group_bh);
6368473aa8aSTristan Ye 
6378473aa8aSTristan Ye bail:
6388473aa8aSTristan Ye 	return status;
6398473aa8aSTristan Ye }
640e0847717STristan Ye 
641e0847717STristan Ye static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
642e0847717STristan Ye 			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
643e0847717STristan Ye 			     u32 len, int ext_flags)
644e0847717STristan Ye {
645e0847717STristan Ye 	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
646e0847717STristan Ye 	handle_t *handle;
647e0847717STristan Ye 	struct inode *inode = context->inode;
648e0847717STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
649e0847717STristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
650e0847717STristan Ye 	struct inode *gb_inode = NULL;
651e0847717STristan Ye 	struct buffer_head *gb_bh = NULL;
652e0847717STristan Ye 	struct buffer_head *gd_bh = NULL;
653e0847717STristan Ye 	struct ocfs2_group_desc *gd;
654e0847717STristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
655e0847717STristan Ye 	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
656e0847717STristan Ye 						    context->range->me_threshold);
657e0847717STristan Ye 	u64 phys_blkno, new_phys_blkno;
658e0847717STristan Ye 
659e0847717STristan Ye 	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
660e0847717STristan Ye 
661e0847717STristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
662e0847717STristan Ye 
663e0847717STristan Ye 		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
664e0847717STristan Ye 			 OCFS2_HAS_REFCOUNT_FL));
665e0847717STristan Ye 
666e0847717STristan Ye 		BUG_ON(!context->refcount_loc);
667e0847717STristan Ye 
668e0847717STristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
669e0847717STristan Ye 					       &ref_tree, NULL);
670e0847717STristan Ye 		if (ret) {
671e0847717STristan Ye 			mlog_errno(ret);
672e0847717STristan Ye 			return ret;
673e0847717STristan Ye 		}
674e0847717STristan Ye 
675e0847717STristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
676e0847717STristan Ye 							context->refcount_loc,
677e0847717STristan Ye 							phys_blkno,
678e0847717STristan Ye 							len,
679e0847717STristan Ye 							&credits,
680e0847717STristan Ye 							&extra_blocks);
681e0847717STristan Ye 		if (ret) {
682e0847717STristan Ye 			mlog_errno(ret);
683e0847717STristan Ye 			goto out;
684e0847717STristan Ye 		}
685e0847717STristan Ye 	}
686e0847717STristan Ye 
687e0847717STristan Ye 	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
688e0847717STristan Ye 						 &context->meta_ac,
689e0847717STristan Ye 						 NULL, extra_blocks, &credits);
690e0847717STristan Ye 	if (ret) {
691e0847717STristan Ye 		mlog_errno(ret);
692e0847717STristan Ye 		goto out;
693e0847717STristan Ye 	}
694e0847717STristan Ye 
695e0847717STristan Ye 	/*
696e0847717STristan Ye 	 * need to count 2 extra credits for global_bitmap inode and
697e0847717STristan Ye 	 * group descriptor.
698e0847717STristan Ye 	 */
699e0847717STristan Ye 	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
700e0847717STristan Ye 
701e0847717STristan Ye 	/*
702e0847717STristan Ye 	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
703e0847717STristan Ye 	 * logic, while we still need to lock the global_bitmap.
704e0847717STristan Ye 	 */
705e0847717STristan Ye 	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
706e0847717STristan Ye 					       OCFS2_INVALID_SLOT);
707e0847717STristan Ye 	if (!gb_inode) {
708e0847717STristan Ye 		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
709e0847717STristan Ye 		ret = -EIO;
710e0847717STristan Ye 		goto out;
711e0847717STristan Ye 	}
712e0847717STristan Ye 
713e0847717STristan Ye 	mutex_lock(&gb_inode->i_mutex);
714e0847717STristan Ye 
715e0847717STristan Ye 	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
716e0847717STristan Ye 	if (ret) {
717e0847717STristan Ye 		mlog_errno(ret);
718e0847717STristan Ye 		goto out_unlock_gb_mutex;
719e0847717STristan Ye 	}
720e0847717STristan Ye 
721e0847717STristan Ye 	mutex_lock(&tl_inode->i_mutex);
722e0847717STristan Ye 
723e0847717STristan Ye 	handle = ocfs2_start_trans(osb, credits);
724e0847717STristan Ye 	if (IS_ERR(handle)) {
725e0847717STristan Ye 		ret = PTR_ERR(handle);
726e0847717STristan Ye 		mlog_errno(ret);
727e0847717STristan Ye 		goto out_unlock_tl_inode;
728e0847717STristan Ye 	}
729e0847717STristan Ye 
730e0847717STristan Ye 	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
731e0847717STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
732e0847717STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
733e0847717STristan Ye 					    OCFS2_INVALID_SLOT,
734e0847717STristan Ye 					    &goal_bit, &gd_bh);
735e0847717STristan Ye 	if (ret) {
736e0847717STristan Ye 		mlog_errno(ret);
737e0847717STristan Ye 		goto out_commit;
738e0847717STristan Ye 	}
739e0847717STristan Ye 
740e0847717STristan Ye 	/*
741e0847717STristan Ye 	 * probe the victim cluster group to find a proper
742e0847717STristan Ye 	 * region to fit wanted movement, it even will perfrom
743e0847717STristan Ye 	 * a best-effort attempt by compromising to a threshold
744e0847717STristan Ye 	 * around the goal.
745e0847717STristan Ye 	 */
746e0847717STristan Ye 	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
747e0847717STristan Ye 				new_phys_cpos);
7483d75be7cSDan Carpenter 	if (!*new_phys_cpos) {
749e0847717STristan Ye 		ret = -ENOSPC;
750e0847717STristan Ye 		goto out_commit;
751e0847717STristan Ye 	}
752e0847717STristan Ye 
753e0847717STristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
754e0847717STristan Ye 				  *new_phys_cpos, ext_flags);
755e0847717STristan Ye 	if (ret) {
756e0847717STristan Ye 		mlog_errno(ret);
757e0847717STristan Ye 		goto out_commit;
758e0847717STristan Ye 	}
759e0847717STristan Ye 
760e0847717STristan Ye 	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
761e0847717STristan Ye 	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
762e0847717STristan Ye 					       le16_to_cpu(gd->bg_chain));
763e0847717STristan Ye 	if (ret) {
764e0847717STristan Ye 		mlog_errno(ret);
765e0847717STristan Ye 		goto out_commit;
766e0847717STristan Ye 	}
767e0847717STristan Ye 
768e0847717STristan Ye 	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
769e0847717STristan Ye 					 goal_bit, len);
770e0847717STristan Ye 	if (ret)
771e0847717STristan Ye 		mlog_errno(ret);
772e0847717STristan Ye 
773e0847717STristan Ye 	/*
774e0847717STristan Ye 	 * Here we should write the new page out first if we are
775e0847717STristan Ye 	 * in write-back mode.
776e0847717STristan Ye 	 */
777e0847717STristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
778e0847717STristan Ye 	if (ret)
779e0847717STristan Ye 		mlog_errno(ret);
780e0847717STristan Ye 
781e0847717STristan Ye out_commit:
782e0847717STristan Ye 	ocfs2_commit_trans(osb, handle);
783e0847717STristan Ye 	brelse(gd_bh);
784e0847717STristan Ye 
785e0847717STristan Ye out_unlock_tl_inode:
786e0847717STristan Ye 	mutex_unlock(&tl_inode->i_mutex);
787e0847717STristan Ye 
788e0847717STristan Ye 	ocfs2_inode_unlock(gb_inode, 1);
789e0847717STristan Ye out_unlock_gb_mutex:
790e0847717STristan Ye 	mutex_unlock(&gb_inode->i_mutex);
791e0847717STristan Ye 	brelse(gb_bh);
792e0847717STristan Ye 	iput(gb_inode);
793e0847717STristan Ye 
794e0847717STristan Ye out:
795e0847717STristan Ye 	if (context->meta_ac) {
796e0847717STristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
797e0847717STristan Ye 		context->meta_ac = NULL;
798e0847717STristan Ye 	}
799e0847717STristan Ye 
800e0847717STristan Ye 	if (ref_tree)
801e0847717STristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
802e0847717STristan Ye 
803e0847717STristan Ye 	return ret;
804e0847717STristan Ye }
805ee16cc03STristan Ye 
/*
 * Decide how much of the current extent takes part in the running
 * defragmentation cycle.
 *
 * @alloc_size:   in: clusters in the current extent; out: possibly trimmed
 *                so that the cycle ends exactly at @threshold.
 * @len_defraged: clusters already gathered in this cycle; updated in place.
 * @threshold:    size of one defragmentation cycle, in clusters.
 * @skip:         set to 1 when the extent alone reaches the threshold and
 *                nothing has been gathered yet; never cleared here.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;
	u32 total = *alloc_size + gathered;

	/* still below the threshold: keep accumulating */
	if (total < threshold) {
		*len_defraged = total;
		return;
	}

	/* XXX: a large extent with nothing gathered before it is skipped */
	if (!gathered) {
		*skip = 1;
		return;
	}

	/*
	 * Take only as much of this extent as is needed to fill the cycle
	 * up to the threshold, then reset the counter so that the next
	 * extent starts a new cycle.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
83553069d4eSTristan Ye 
83653069d4eSTristan Ye static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
83753069d4eSTristan Ye 				struct ocfs2_move_extents_context *context)
83853069d4eSTristan Ye {
83953069d4eSTristan Ye 	int ret = 0, flags, do_defrag, skip = 0;
84053069d4eSTristan Ye 	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
84153069d4eSTristan Ye 	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
84253069d4eSTristan Ye 
84353069d4eSTristan Ye 	struct inode *inode = context->inode;
84453069d4eSTristan Ye 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
84553069d4eSTristan Ye 	struct ocfs2_move_extents *range = context->range;
84653069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
84753069d4eSTristan Ye 
84853069d4eSTristan Ye 	if ((inode->i_size == 0) || (range->me_len == 0))
84953069d4eSTristan Ye 		return 0;
85053069d4eSTristan Ye 
85153069d4eSTristan Ye 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
85253069d4eSTristan Ye 		return 0;
85353069d4eSTristan Ye 
85453069d4eSTristan Ye 	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
85553069d4eSTristan Ye 
85653069d4eSTristan Ye 	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
85753069d4eSTristan Ye 	ocfs2_init_dealloc_ctxt(&context->dealloc);
85853069d4eSTristan Ye 
85953069d4eSTristan Ye 	/*
86053069d4eSTristan Ye 	 * TO-DO XXX:
86153069d4eSTristan Ye 	 *
86253069d4eSTristan Ye 	 * - xattr extents.
86353069d4eSTristan Ye 	 */
86453069d4eSTristan Ye 
86553069d4eSTristan Ye 	do_defrag = context->auto_defrag;
86653069d4eSTristan Ye 
86753069d4eSTristan Ye 	/*
86853069d4eSTristan Ye 	 * extents moving happens in unit of clusters, for the sake
86953069d4eSTristan Ye 	 * of simplicity, we may ignore two clusters where 'byte_start'
87053069d4eSTristan Ye 	 * and 'byte_start + len' were within.
87153069d4eSTristan Ye 	 */
87253069d4eSTristan Ye 	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
87353069d4eSTristan Ye 	len_to_move = (range->me_start + range->me_len) >>
87453069d4eSTristan Ye 						osb->s_clustersize_bits;
87553069d4eSTristan Ye 	if (len_to_move >= move_start)
87653069d4eSTristan Ye 		len_to_move -= move_start;
87753069d4eSTristan Ye 	else
87853069d4eSTristan Ye 		len_to_move = 0;
87953069d4eSTristan Ye 
880dda54e76STristan Ye 	if (do_defrag) {
88153069d4eSTristan Ye 		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
882dda54e76STristan Ye 		if (defrag_thresh <= 1)
883dda54e76STristan Ye 			goto done;
884dda54e76STristan Ye 	} else
88553069d4eSTristan Ye 		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
88653069d4eSTristan Ye 							 range->me_goal);
88753069d4eSTristan Ye 
88853069d4eSTristan Ye 	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
88953069d4eSTristan Ye 	     "thresh: %u\n",
89053069d4eSTristan Ye 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
89153069d4eSTristan Ye 	     (unsigned long long)range->me_start,
89253069d4eSTristan Ye 	     (unsigned long long)range->me_len,
89353069d4eSTristan Ye 	     move_start, len_to_move, defrag_thresh);
89453069d4eSTristan Ye 
89553069d4eSTristan Ye 	cpos = move_start;
89653069d4eSTristan Ye 	while (len_to_move) {
89753069d4eSTristan Ye 		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
89853069d4eSTristan Ye 					 &flags);
89953069d4eSTristan Ye 		if (ret) {
90053069d4eSTristan Ye 			mlog_errno(ret);
90153069d4eSTristan Ye 			goto out;
90253069d4eSTristan Ye 		}
90353069d4eSTristan Ye 
90453069d4eSTristan Ye 		if (alloc_size > len_to_move)
90553069d4eSTristan Ye 			alloc_size = len_to_move;
90653069d4eSTristan Ye 
90753069d4eSTristan Ye 		/*
90853069d4eSTristan Ye 		 * XXX: how to deal with a hole:
90953069d4eSTristan Ye 		 *
91053069d4eSTristan Ye 		 * - skip the hole of course
91153069d4eSTristan Ye 		 * - force a new defragmentation
91253069d4eSTristan Ye 		 */
91353069d4eSTristan Ye 		if (!phys_cpos) {
91453069d4eSTristan Ye 			if (do_defrag)
91553069d4eSTristan Ye 				len_defraged = 0;
91653069d4eSTristan Ye 
91753069d4eSTristan Ye 			goto next;
91853069d4eSTristan Ye 		}
91953069d4eSTristan Ye 
92053069d4eSTristan Ye 		if (do_defrag) {
92153069d4eSTristan Ye 			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
92253069d4eSTristan Ye 						     defrag_thresh, &skip);
92353069d4eSTristan Ye 			/*
92453069d4eSTristan Ye 			 * skip large extents
92553069d4eSTristan Ye 			 */
92653069d4eSTristan Ye 			if (skip) {
92753069d4eSTristan Ye 				skip = 0;
92853069d4eSTristan Ye 				goto next;
92953069d4eSTristan Ye 			}
93053069d4eSTristan Ye 
93153069d4eSTristan Ye 			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
93253069d4eSTristan Ye 			     "alloc_size: %u, len_defraged: %u\n",
93353069d4eSTristan Ye 			     cpos, phys_cpos, alloc_size, len_defraged);
93453069d4eSTristan Ye 
93553069d4eSTristan Ye 			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
9364dfa66bdSTristan Ye 						  &alloc_size, flags);
93753069d4eSTristan Ye 		} else {
93853069d4eSTristan Ye 			ret = ocfs2_move_extent(context, cpos, phys_cpos,
93953069d4eSTristan Ye 						&new_phys_cpos, alloc_size,
94053069d4eSTristan Ye 						flags);
94153069d4eSTristan Ye 
94253069d4eSTristan Ye 			new_phys_cpos += alloc_size;
94353069d4eSTristan Ye 		}
94453069d4eSTristan Ye 
94553069d4eSTristan Ye 		if (ret < 0) {
94653069d4eSTristan Ye 			mlog_errno(ret);
94753069d4eSTristan Ye 			goto out;
94853069d4eSTristan Ye 		}
94953069d4eSTristan Ye 
95053069d4eSTristan Ye 		context->clusters_moved += alloc_size;
95153069d4eSTristan Ye next:
95253069d4eSTristan Ye 		cpos += alloc_size;
95353069d4eSTristan Ye 		len_to_move -= alloc_size;
95453069d4eSTristan Ye 	}
95553069d4eSTristan Ye 
956dda54e76STristan Ye done:
95753069d4eSTristan Ye 	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
95853069d4eSTristan Ye 
95953069d4eSTristan Ye out:
96053069d4eSTristan Ye 	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
96153069d4eSTristan Ye 						      context->clusters_moved);
96253069d4eSTristan Ye 	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
96353069d4eSTristan Ye 						       context->new_phys_cpos);
96453069d4eSTristan Ye 
96553069d4eSTristan Ye 	ocfs2_schedule_truncate_log_flush(osb, 1);
96653069d4eSTristan Ye 	ocfs2_run_deallocs(osb, &context->dealloc);
96753069d4eSTristan Ye 
96853069d4eSTristan Ye 	return ret;
96953069d4eSTristan Ye }
97053069d4eSTristan Ye 
97153069d4eSTristan Ye static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
97253069d4eSTristan Ye {
97353069d4eSTristan Ye 	int status;
97453069d4eSTristan Ye 	handle_t *handle;
97553069d4eSTristan Ye 	struct inode *inode = context->inode;
97653069d4eSTristan Ye 	struct ocfs2_dinode *di;
97753069d4eSTristan Ye 	struct buffer_head *di_bh = NULL;
97853069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
97953069d4eSTristan Ye 
98053069d4eSTristan Ye 	if (!inode)
98153069d4eSTristan Ye 		return -ENOENT;
98253069d4eSTristan Ye 
98353069d4eSTristan Ye 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
98453069d4eSTristan Ye 		return -EROFS;
98553069d4eSTristan Ye 
98653069d4eSTristan Ye 	mutex_lock(&inode->i_mutex);
98753069d4eSTristan Ye 
98853069d4eSTristan Ye 	/*
98953069d4eSTristan Ye 	 * This prevents concurrent writes from other nodes
99053069d4eSTristan Ye 	 */
99153069d4eSTristan Ye 	status = ocfs2_rw_lock(inode, 1);
99253069d4eSTristan Ye 	if (status) {
99353069d4eSTristan Ye 		mlog_errno(status);
99453069d4eSTristan Ye 		goto out;
99553069d4eSTristan Ye 	}
99653069d4eSTristan Ye 
99753069d4eSTristan Ye 	status = ocfs2_inode_lock(inode, &di_bh, 1);
99853069d4eSTristan Ye 	if (status) {
99953069d4eSTristan Ye 		mlog_errno(status);
100053069d4eSTristan Ye 		goto out_rw_unlock;
100153069d4eSTristan Ye 	}
100253069d4eSTristan Ye 
100353069d4eSTristan Ye 	/*
100453069d4eSTristan Ye 	 * rememer ip_xattr_sem also needs to be held if necessary
100553069d4eSTristan Ye 	 */
100653069d4eSTristan Ye 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
100753069d4eSTristan Ye 
100853069d4eSTristan Ye 	status = __ocfs2_move_extents_range(di_bh, context);
100953069d4eSTristan Ye 
101053069d4eSTristan Ye 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
101153069d4eSTristan Ye 	if (status) {
101253069d4eSTristan Ye 		mlog_errno(status);
101353069d4eSTristan Ye 		goto out_inode_unlock;
101453069d4eSTristan Ye 	}
101553069d4eSTristan Ye 
101653069d4eSTristan Ye 	/*
101753069d4eSTristan Ye 	 * We update ctime for these changes
101853069d4eSTristan Ye 	 */
101953069d4eSTristan Ye 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
102053069d4eSTristan Ye 	if (IS_ERR(handle)) {
102153069d4eSTristan Ye 		status = PTR_ERR(handle);
102253069d4eSTristan Ye 		mlog_errno(status);
102353069d4eSTristan Ye 		goto out_inode_unlock;
102453069d4eSTristan Ye 	}
102553069d4eSTristan Ye 
102653069d4eSTristan Ye 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
102753069d4eSTristan Ye 					 OCFS2_JOURNAL_ACCESS_WRITE);
102853069d4eSTristan Ye 	if (status) {
102953069d4eSTristan Ye 		mlog_errno(status);
103053069d4eSTristan Ye 		goto out_commit;
103153069d4eSTristan Ye 	}
103253069d4eSTristan Ye 
103353069d4eSTristan Ye 	di = (struct ocfs2_dinode *)di_bh->b_data;
103453069d4eSTristan Ye 	inode->i_ctime = CURRENT_TIME;
103553069d4eSTristan Ye 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
103653069d4eSTristan Ye 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
103753069d4eSTristan Ye 
103853069d4eSTristan Ye 	ocfs2_journal_dirty(handle, di_bh);
103953069d4eSTristan Ye 
104053069d4eSTristan Ye out_commit:
104153069d4eSTristan Ye 	ocfs2_commit_trans(osb, handle);
104253069d4eSTristan Ye 
104353069d4eSTristan Ye out_inode_unlock:
104453069d4eSTristan Ye 	brelse(di_bh);
104553069d4eSTristan Ye 	ocfs2_inode_unlock(inode, 1);
104653069d4eSTristan Ye out_rw_unlock:
104753069d4eSTristan Ye 	ocfs2_rw_unlock(inode, 1);
104853069d4eSTristan Ye out:
104953069d4eSTristan Ye 	mutex_unlock(&inode->i_mutex);
105053069d4eSTristan Ye 
105153069d4eSTristan Ye 	return status;
105253069d4eSTristan Ye }
105353069d4eSTristan Ye 
105453069d4eSTristan Ye int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
105553069d4eSTristan Ye {
105653069d4eSTristan Ye 	int status;
105753069d4eSTristan Ye 
105853069d4eSTristan Ye 	struct inode *inode = filp->f_path.dentry->d_inode;
105953069d4eSTristan Ye 	struct ocfs2_move_extents range;
106053069d4eSTristan Ye 	struct ocfs2_move_extents_context *context = NULL;
106153069d4eSTristan Ye 
1062a561be71SAl Viro 	status = mnt_want_write_file(filp);
106353069d4eSTristan Ye 	if (status)
106453069d4eSTristan Ye 		return status;
106553069d4eSTristan Ye 
106653069d4eSTristan Ye 	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
106753069d4eSTristan Ye 		goto out;
106853069d4eSTristan Ye 
106953069d4eSTristan Ye 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
107053069d4eSTristan Ye 		status = -EPERM;
107153069d4eSTristan Ye 		goto out;
107253069d4eSTristan Ye 	}
107353069d4eSTristan Ye 
107453069d4eSTristan Ye 	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
107553069d4eSTristan Ye 	if (!context) {
107653069d4eSTristan Ye 		status = -ENOMEM;
107753069d4eSTristan Ye 		mlog_errno(status);
107853069d4eSTristan Ye 		goto out;
107953069d4eSTristan Ye 	}
108053069d4eSTristan Ye 
108153069d4eSTristan Ye 	context->inode = inode;
108253069d4eSTristan Ye 	context->file = filp;
108353069d4eSTristan Ye 
108453069d4eSTristan Ye 	if (argp) {
1085f6a56903SAl Viro 		if (copy_from_user(&range, argp, sizeof(range))) {
108653069d4eSTristan Ye 			status = -EFAULT;
108753069d4eSTristan Ye 			goto out;
108853069d4eSTristan Ye 		}
108953069d4eSTristan Ye 	} else {
109053069d4eSTristan Ye 		status = -EINVAL;
109153069d4eSTristan Ye 		goto out;
109253069d4eSTristan Ye 	}
109353069d4eSTristan Ye 
109453069d4eSTristan Ye 	if (range.me_start > i_size_read(inode))
109553069d4eSTristan Ye 		goto out;
109653069d4eSTristan Ye 
109753069d4eSTristan Ye 	if (range.me_start + range.me_len > i_size_read(inode))
109853069d4eSTristan Ye 			range.me_len = i_size_read(inode) - range.me_start;
109953069d4eSTristan Ye 
110053069d4eSTristan Ye 	context->range = &range;
110153069d4eSTristan Ye 
110253069d4eSTristan Ye 	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
110353069d4eSTristan Ye 		context->auto_defrag = 1;
110453069d4eSTristan Ye 		/*
110553069d4eSTristan Ye 		 * ok, the default theshold for the defragmentation
110653069d4eSTristan Ye 		 * is 1M, since our maximum clustersize was 1M also.
110753069d4eSTristan Ye 		 * any thought?
110853069d4eSTristan Ye 		 */
1109dda54e76STristan Ye 		if (!range.me_threshold)
111053069d4eSTristan Ye 			range.me_threshold = 1024 * 1024;
1111dda54e76STristan Ye 
1112dda54e76STristan Ye 		if (range.me_threshold > i_size_read(inode))
1113dda54e76STristan Ye 			range.me_threshold = i_size_read(inode);
1114dda54e76STristan Ye 
11154dfa66bdSTristan Ye 		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
11164dfa66bdSTristan Ye 			context->partial = 1;
111753069d4eSTristan Ye 	} else {
111853069d4eSTristan Ye 		/*
111953069d4eSTristan Ye 		 * first best-effort attempt to validate and adjust the goal
112053069d4eSTristan Ye 		 * (physical address in block), while it can't guarantee later
112153069d4eSTristan Ye 		 * operation can succeed all the time since global_bitmap may
112253069d4eSTristan Ye 		 * change a bit over time.
112353069d4eSTristan Ye 		 */
112453069d4eSTristan Ye 
112553069d4eSTristan Ye 		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
112653069d4eSTristan Ye 		if (status)
112753069d4eSTristan Ye 			goto out;
112853069d4eSTristan Ye 	}
112953069d4eSTristan Ye 
113053069d4eSTristan Ye 	status = ocfs2_move_extents(context);
113153069d4eSTristan Ye 	if (status)
113253069d4eSTristan Ye 		mlog_errno(status);
113353069d4eSTristan Ye out:
113453069d4eSTristan Ye 	/*
113553069d4eSTristan Ye 	 * movement/defragmentation may end up being partially completed,
113653069d4eSTristan Ye 	 * that's the reason why we need to return userspace the finished
113753069d4eSTristan Ye 	 * length and new_offset even if failure happens somewhere.
113853069d4eSTristan Ye 	 */
113953069d4eSTristan Ye 	if (argp) {
1140f6a56903SAl Viro 		if (copy_to_user(argp, &range, sizeof(range)))
114153069d4eSTristan Ye 			status = -EFAULT;
114253069d4eSTristan Ye 	}
114353069d4eSTristan Ye 
114453069d4eSTristan Ye 	kfree(context);
114553069d4eSTristan Ye 
11462a79f17eSAl Viro 	mnt_drop_write_file(filp);
114753069d4eSTristan Ye 
114853069d4eSTristan Ye 	return status;
114953069d4eSTristan Ye }
1150