xref: /openbmc/linux/fs/ocfs2/move_extents.c (revision 3d75be7c)
1028ba5dfSTristan Ye /* -*- mode: c; c-basic-offset: 8; -*-
2028ba5dfSTristan Ye  * vim: noexpandtab sw=8 ts=8 sts=0:
3028ba5dfSTristan Ye  *
4028ba5dfSTristan Ye  * move_extents.c
5028ba5dfSTristan Ye  *
6028ba5dfSTristan Ye  * Copyright (C) 2011 Oracle.  All rights reserved.
7028ba5dfSTristan Ye  *
8028ba5dfSTristan Ye  * This program is free software; you can redistribute it and/or
9028ba5dfSTristan Ye  * modify it under the terms of the GNU General Public
10028ba5dfSTristan Ye  * License version 2 as published by the Free Software Foundation.
11028ba5dfSTristan Ye  *
12028ba5dfSTristan Ye  * This program is distributed in the hope that it will be useful,
13028ba5dfSTristan Ye  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14028ba5dfSTristan Ye  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15028ba5dfSTristan Ye  * General Public License for more details.
16028ba5dfSTristan Ye  */
17028ba5dfSTristan Ye #include <linux/fs.h>
18028ba5dfSTristan Ye #include <linux/types.h>
19028ba5dfSTristan Ye #include <linux/mount.h>
20028ba5dfSTristan Ye #include <linux/swap.h>
21028ba5dfSTristan Ye 
22028ba5dfSTristan Ye #include <cluster/masklog.h>
23028ba5dfSTristan Ye 
24028ba5dfSTristan Ye #include "ocfs2.h"
25028ba5dfSTristan Ye #include "ocfs2_ioctl.h"
26028ba5dfSTristan Ye 
27028ba5dfSTristan Ye #include "alloc.h"
28028ba5dfSTristan Ye #include "aops.h"
29028ba5dfSTristan Ye #include "dlmglue.h"
30028ba5dfSTristan Ye #include "extent_map.h"
31028ba5dfSTristan Ye #include "inode.h"
32028ba5dfSTristan Ye #include "journal.h"
33028ba5dfSTristan Ye #include "suballoc.h"
34028ba5dfSTristan Ye #include "uptodate.h"
35028ba5dfSTristan Ye #include "super.h"
36028ba5dfSTristan Ye #include "dir.h"
37028ba5dfSTristan Ye #include "buffer_head_io.h"
38028ba5dfSTristan Ye #include "sysfile.h"
39028ba5dfSTristan Ye #include "suballoc.h"
40028ba5dfSTristan Ye #include "refcounttree.h"
41028ba5dfSTristan Ye #include "move_extents.h"
42028ba5dfSTristan Ye 
43028ba5dfSTristan Ye struct ocfs2_move_extents_context {
44028ba5dfSTristan Ye 	struct inode *inode;
45028ba5dfSTristan Ye 	struct file *file;
46028ba5dfSTristan Ye 	int auto_defrag;
474dfa66bdSTristan Ye 	int partial;
48028ba5dfSTristan Ye 	int credits;
49028ba5dfSTristan Ye 	u32 new_phys_cpos;
50028ba5dfSTristan Ye 	u32 clusters_moved;
51028ba5dfSTristan Ye 	u64 refcount_loc;
52028ba5dfSTristan Ye 	struct ocfs2_move_extents *range;
53028ba5dfSTristan Ye 	struct ocfs2_extent_tree et;
54028ba5dfSTristan Ye 	struct ocfs2_alloc_context *meta_ac;
55028ba5dfSTristan Ye 	struct ocfs2_alloc_context *data_ac;
56028ba5dfSTristan Ye 	struct ocfs2_cached_dealloc_ctxt dealloc;
57028ba5dfSTristan Ye };
58de474ee8STristan Ye 
598f603e56STristan Ye static int __ocfs2_move_extent(handle_t *handle,
608f603e56STristan Ye 			       struct ocfs2_move_extents_context *context,
618f603e56STristan Ye 			       u32 cpos, u32 len, u32 p_cpos, u32 new_p_cpos,
628f603e56STristan Ye 			       int ext_flags)
638f603e56STristan Ye {
648f603e56STristan Ye 	int ret = 0, index;
658f603e56STristan Ye 	struct inode *inode = context->inode;
668f603e56STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
678f603e56STristan Ye 	struct ocfs2_extent_rec *rec, replace_rec;
688f603e56STristan Ye 	struct ocfs2_path *path = NULL;
698f603e56STristan Ye 	struct ocfs2_extent_list *el;
708f603e56STristan Ye 	u64 ino = ocfs2_metadata_cache_owner(context->et.et_ci);
718f603e56STristan Ye 	u64 old_blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cpos);
728f603e56STristan Ye 
738f603e56STristan Ye 	ret = ocfs2_duplicate_clusters_by_page(handle, context->file, cpos,
748f603e56STristan Ye 					       p_cpos, new_p_cpos, len);
758f603e56STristan Ye 	if (ret) {
768f603e56STristan Ye 		mlog_errno(ret);
778f603e56STristan Ye 		goto out;
788f603e56STristan Ye 	}
798f603e56STristan Ye 
808f603e56STristan Ye 	memset(&replace_rec, 0, sizeof(replace_rec));
818f603e56STristan Ye 	replace_rec.e_cpos = cpu_to_le32(cpos);
828f603e56STristan Ye 	replace_rec.e_leaf_clusters = cpu_to_le16(len);
838f603e56STristan Ye 	replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb,
848f603e56STristan Ye 								   new_p_cpos));
858f603e56STristan Ye 
868f603e56STristan Ye 	path = ocfs2_new_path_from_et(&context->et);
878f603e56STristan Ye 	if (!path) {
888f603e56STristan Ye 		ret = -ENOMEM;
898f603e56STristan Ye 		mlog_errno(ret);
908f603e56STristan Ye 		goto out;
918f603e56STristan Ye 	}
928f603e56STristan Ye 
938f603e56STristan Ye 	ret = ocfs2_find_path(INODE_CACHE(inode), path, cpos);
948f603e56STristan Ye 	if (ret) {
958f603e56STristan Ye 		mlog_errno(ret);
968f603e56STristan Ye 		goto out;
978f603e56STristan Ye 	}
988f603e56STristan Ye 
998f603e56STristan Ye 	el = path_leaf_el(path);
1008f603e56STristan Ye 
1018f603e56STristan Ye 	index = ocfs2_search_extent_list(el, cpos);
1028f603e56STristan Ye 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
1038f603e56STristan Ye 		ocfs2_error(inode->i_sb,
1048f603e56STristan Ye 			    "Inode %llu has an extent at cpos %u which can no "
1058f603e56STristan Ye 			    "longer be found.\n",
1068f603e56STristan Ye 			    (unsigned long long)ino, cpos);
1078f603e56STristan Ye 		ret = -EROFS;
1088f603e56STristan Ye 		goto out;
1098f603e56STristan Ye 	}
1108f603e56STristan Ye 
1118f603e56STristan Ye 	rec = &el->l_recs[index];
1128f603e56STristan Ye 
1138f603e56STristan Ye 	BUG_ON(ext_flags != rec->e_flags);
1148f603e56STristan Ye 	/*
1158f603e56STristan Ye 	 * after moving/defraging to new location, the extent is not going
1168f603e56STristan Ye 	 * to be refcounted anymore.
1178f603e56STristan Ye 	 */
1188f603e56STristan Ye 	replace_rec.e_flags = ext_flags & ~OCFS2_EXT_REFCOUNTED;
1198f603e56STristan Ye 
1208f603e56STristan Ye 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode),
1218f603e56STristan Ye 				      context->et.et_root_bh,
1228f603e56STristan Ye 				      OCFS2_JOURNAL_ACCESS_WRITE);
1238f603e56STristan Ye 	if (ret) {
1248f603e56STristan Ye 		mlog_errno(ret);
1258f603e56STristan Ye 		goto out;
1268f603e56STristan Ye 	}
1278f603e56STristan Ye 
1288f603e56STristan Ye 	ret = ocfs2_split_extent(handle, &context->et, path, index,
1298f603e56STristan Ye 				 &replace_rec, context->meta_ac,
1308f603e56STristan Ye 				 &context->dealloc);
1318f603e56STristan Ye 	if (ret) {
1328f603e56STristan Ye 		mlog_errno(ret);
1338f603e56STristan Ye 		goto out;
1348f603e56STristan Ye 	}
1358f603e56STristan Ye 
1368f603e56STristan Ye 	ocfs2_journal_dirty(handle, context->et.et_root_bh);
1378f603e56STristan Ye 
1388f603e56STristan Ye 	context->new_phys_cpos = new_p_cpos;
1398f603e56STristan Ye 
1408f603e56STristan Ye 	/*
1418f603e56STristan Ye 	 * need I to append truncate log for old clusters?
1428f603e56STristan Ye 	 */
1438f603e56STristan Ye 	if (old_blkno) {
1448f603e56STristan Ye 		if (ext_flags & OCFS2_EXT_REFCOUNTED)
1458f603e56STristan Ye 			ret = ocfs2_decrease_refcount(inode, handle,
1468f603e56STristan Ye 					ocfs2_blocks_to_clusters(osb->sb,
1478f603e56STristan Ye 								 old_blkno),
1488f603e56STristan Ye 					len, context->meta_ac,
1498f603e56STristan Ye 					&context->dealloc, 1);
1508f603e56STristan Ye 		else
1518f603e56STristan Ye 			ret = ocfs2_truncate_log_append(osb, handle,
1528f603e56STristan Ye 							old_blkno, len);
1538f603e56STristan Ye 	}
1548f603e56STristan Ye 
1558f603e56STristan Ye out:
1568f603e56STristan Ye 	return ret;
1578f603e56STristan Ye }
1588f603e56STristan Ye 
159de474ee8STristan Ye /*
160de474ee8STristan Ye  * lock allocators, and reserving appropriate number of bits for
161de474ee8STristan Ye  * meta blocks and data clusters.
162de474ee8STristan Ye  *
163de474ee8STristan Ye  * in some cases, we don't need to reserve clusters, just let data_ac
164de474ee8STristan Ye  * be NULL.
165de474ee8STristan Ye  */
166de474ee8STristan Ye static int ocfs2_lock_allocators_move_extents(struct inode *inode,
167de474ee8STristan Ye 					struct ocfs2_extent_tree *et,
168de474ee8STristan Ye 					u32 clusters_to_move,
169de474ee8STristan Ye 					u32 extents_to_split,
170de474ee8STristan Ye 					struct ocfs2_alloc_context **meta_ac,
171de474ee8STristan Ye 					struct ocfs2_alloc_context **data_ac,
172de474ee8STristan Ye 					int extra_blocks,
173de474ee8STristan Ye 					int *credits)
174de474ee8STristan Ye {
175de474ee8STristan Ye 	int ret, num_free_extents;
176de474ee8STristan Ye 	unsigned int max_recs_needed = 2 * extents_to_split + clusters_to_move;
177de474ee8STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
178de474ee8STristan Ye 
179de474ee8STristan Ye 	num_free_extents = ocfs2_num_free_extents(osb, et);
180de474ee8STristan Ye 	if (num_free_extents < 0) {
181de474ee8STristan Ye 		ret = num_free_extents;
182de474ee8STristan Ye 		mlog_errno(ret);
183de474ee8STristan Ye 		goto out;
184de474ee8STristan Ye 	}
185de474ee8STristan Ye 
186de474ee8STristan Ye 	if (!num_free_extents ||
187de474ee8STristan Ye 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed))
188de474ee8STristan Ye 		extra_blocks += ocfs2_extend_meta_needed(et->et_root_el);
189de474ee8STristan Ye 
190de474ee8STristan Ye 	ret = ocfs2_reserve_new_metadata_blocks(osb, extra_blocks, meta_ac);
191de474ee8STristan Ye 	if (ret) {
192de474ee8STristan Ye 		mlog_errno(ret);
193de474ee8STristan Ye 		goto out;
194de474ee8STristan Ye 	}
195de474ee8STristan Ye 
196de474ee8STristan Ye 	if (data_ac) {
197de474ee8STristan Ye 		ret = ocfs2_reserve_clusters(osb, clusters_to_move, data_ac);
198de474ee8STristan Ye 		if (ret) {
199de474ee8STristan Ye 			mlog_errno(ret);
200de474ee8STristan Ye 			goto out;
201de474ee8STristan Ye 		}
202de474ee8STristan Ye 	}
203de474ee8STristan Ye 
204de474ee8STristan Ye 	*credits += ocfs2_calc_extend_credits(osb->sb, et->et_root_el,
205de474ee8STristan Ye 					      clusters_to_move + 2);
206de474ee8STristan Ye 
207de474ee8STristan Ye 	mlog(0, "reserve metadata_blocks: %d, data_clusters: %u, credits: %d\n",
208de474ee8STristan Ye 	     extra_blocks, clusters_to_move, *credits);
209de474ee8STristan Ye out:
210de474ee8STristan Ye 	if (ret) {
211de474ee8STristan Ye 		if (*meta_ac) {
212de474ee8STristan Ye 			ocfs2_free_alloc_context(*meta_ac);
213de474ee8STristan Ye 			*meta_ac = NULL;
214de474ee8STristan Ye 		}
215de474ee8STristan Ye 	}
216de474ee8STristan Ye 
217de474ee8STristan Ye 	return ret;
218de474ee8STristan Ye }
219202ee5faSTristan Ye 
220202ee5faSTristan Ye /*
221202ee5faSTristan Ye  * Using one journal handle to guarantee the data consistency in case
222202ee5faSTristan Ye  * crash happens anywhere.
223dda54e76STristan Ye  *
224dda54e76STristan Ye  *  XXX: defrag can end up with finishing partial extent as requested,
225dda54e76STristan Ye  * due to not enough contiguous clusters can be found in allocator.
226202ee5faSTristan Ye  */
227202ee5faSTristan Ye static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
2284dfa66bdSTristan Ye 			       u32 cpos, u32 phys_cpos, u32 *len, int ext_flags)
229202ee5faSTristan Ye {
2304dfa66bdSTristan Ye 	int ret, credits = 0, extra_blocks = 0, partial = context->partial;
231202ee5faSTristan Ye 	handle_t *handle;
232202ee5faSTristan Ye 	struct inode *inode = context->inode;
233202ee5faSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
234202ee5faSTristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
235202ee5faSTristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
236202ee5faSTristan Ye 	u32 new_phys_cpos, new_len;
237202ee5faSTristan Ye 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
238202ee5faSTristan Ye 
2394dfa66bdSTristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) {
240202ee5faSTristan Ye 
241202ee5faSTristan Ye 		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
242202ee5faSTristan Ye 			 OCFS2_HAS_REFCOUNT_FL));
243202ee5faSTristan Ye 
244202ee5faSTristan Ye 		BUG_ON(!context->refcount_loc);
245202ee5faSTristan Ye 
246202ee5faSTristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
247202ee5faSTristan Ye 					       &ref_tree, NULL);
248202ee5faSTristan Ye 		if (ret) {
249202ee5faSTristan Ye 			mlog_errno(ret);
250202ee5faSTristan Ye 			return ret;
251202ee5faSTristan Ye 		}
252202ee5faSTristan Ye 
253202ee5faSTristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
254202ee5faSTristan Ye 							context->refcount_loc,
255202ee5faSTristan Ye 							phys_blkno,
2564dfa66bdSTristan Ye 							*len,
257202ee5faSTristan Ye 							&credits,
258202ee5faSTristan Ye 							&extra_blocks);
259202ee5faSTristan Ye 		if (ret) {
260202ee5faSTristan Ye 			mlog_errno(ret);
261202ee5faSTristan Ye 			goto out;
262202ee5faSTristan Ye 		}
263202ee5faSTristan Ye 	}
264202ee5faSTristan Ye 
2654dfa66bdSTristan Ye 	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, *len, 1,
266202ee5faSTristan Ye 						 &context->meta_ac,
267202ee5faSTristan Ye 						 &context->data_ac,
268202ee5faSTristan Ye 						 extra_blocks, &credits);
269202ee5faSTristan Ye 	if (ret) {
270202ee5faSTristan Ye 		mlog_errno(ret);
271202ee5faSTristan Ye 		goto out;
272202ee5faSTristan Ye 	}
273202ee5faSTristan Ye 
274202ee5faSTristan Ye 	/*
275202ee5faSTristan Ye 	 * should be using allocation reservation strategy there?
276202ee5faSTristan Ye 	 *
277202ee5faSTristan Ye 	 * if (context->data_ac)
278202ee5faSTristan Ye 	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
279202ee5faSTristan Ye 	 */
280202ee5faSTristan Ye 
281202ee5faSTristan Ye 	mutex_lock(&tl_inode->i_mutex);
282202ee5faSTristan Ye 
283202ee5faSTristan Ye 	if (ocfs2_truncate_log_needs_flush(osb)) {
284202ee5faSTristan Ye 		ret = __ocfs2_flush_truncate_log(osb);
285202ee5faSTristan Ye 		if (ret < 0) {
286202ee5faSTristan Ye 			mlog_errno(ret);
287202ee5faSTristan Ye 			goto out_unlock_mutex;
288202ee5faSTristan Ye 		}
289202ee5faSTristan Ye 	}
290202ee5faSTristan Ye 
291202ee5faSTristan Ye 	handle = ocfs2_start_trans(osb, credits);
292202ee5faSTristan Ye 	if (IS_ERR(handle)) {
293202ee5faSTristan Ye 		ret = PTR_ERR(handle);
294202ee5faSTristan Ye 		mlog_errno(ret);
295202ee5faSTristan Ye 		goto out_unlock_mutex;
296202ee5faSTristan Ye 	}
297202ee5faSTristan Ye 
2984dfa66bdSTristan Ye 	ret = __ocfs2_claim_clusters(handle, context->data_ac, 1, *len,
299202ee5faSTristan Ye 				     &new_phys_cpos, &new_len);
300202ee5faSTristan Ye 	if (ret) {
301202ee5faSTristan Ye 		mlog_errno(ret);
302202ee5faSTristan Ye 		goto out_commit;
303202ee5faSTristan Ye 	}
304202ee5faSTristan Ye 
305202ee5faSTristan Ye 	/*
3064dfa66bdSTristan Ye 	 * allowing partial extent moving is kind of 'pros and cons', it makes
3074dfa66bdSTristan Ye 	 * whole defragmentation less likely to fail, on the contrary, the bad
3084dfa66bdSTristan Ye 	 * thing is it may make the fs even more fragmented after moving, let
3094dfa66bdSTristan Ye 	 * userspace make a good decision here.
310202ee5faSTristan Ye 	 */
3114dfa66bdSTristan Ye 	if (new_len != *len) {
3124dfa66bdSTristan Ye 		mlog(0, "len_claimed: %u, len: %u\n", new_len, *len);
3134dfa66bdSTristan Ye 		if (!partial) {
314202ee5faSTristan Ye 			context->range->me_flags &= ~OCFS2_MOVE_EXT_FL_COMPLETE;
315202ee5faSTristan Ye 			ret = -ENOSPC;
316202ee5faSTristan Ye 			goto out_commit;
317202ee5faSTristan Ye 		}
3184dfa66bdSTristan Ye 	}
319202ee5faSTristan Ye 
320202ee5faSTristan Ye 	mlog(0, "cpos: %u, phys_cpos: %u, new_phys_cpos: %u\n", cpos,
321202ee5faSTristan Ye 	     phys_cpos, new_phys_cpos);
322202ee5faSTristan Ye 
3234dfa66bdSTristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, new_len, phys_cpos,
324202ee5faSTristan Ye 				  new_phys_cpos, ext_flags);
325202ee5faSTristan Ye 	if (ret)
326202ee5faSTristan Ye 		mlog_errno(ret);
327202ee5faSTristan Ye 
3284dfa66bdSTristan Ye 	if (partial && (new_len != *len))
3294dfa66bdSTristan Ye 		*len = new_len;
3304dfa66bdSTristan Ye 
331202ee5faSTristan Ye 	/*
332202ee5faSTristan Ye 	 * Here we should write the new page out first if we are
333202ee5faSTristan Ye 	 * in write-back mode.
334202ee5faSTristan Ye 	 */
3354dfa66bdSTristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, *len);
336202ee5faSTristan Ye 	if (ret)
337202ee5faSTristan Ye 		mlog_errno(ret);
338202ee5faSTristan Ye 
339202ee5faSTristan Ye out_commit:
340202ee5faSTristan Ye 	ocfs2_commit_trans(osb, handle);
341202ee5faSTristan Ye 
342202ee5faSTristan Ye out_unlock_mutex:
343202ee5faSTristan Ye 	mutex_unlock(&tl_inode->i_mutex);
344202ee5faSTristan Ye 
345202ee5faSTristan Ye 	if (context->data_ac) {
346202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->data_ac);
347202ee5faSTristan Ye 		context->data_ac = NULL;
348202ee5faSTristan Ye 	}
349202ee5faSTristan Ye 
350202ee5faSTristan Ye 	if (context->meta_ac) {
351202ee5faSTristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
352202ee5faSTristan Ye 		context->meta_ac = NULL;
353202ee5faSTristan Ye 	}
354202ee5faSTristan Ye 
355202ee5faSTristan Ye out:
356202ee5faSTristan Ye 	if (ref_tree)
357202ee5faSTristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
358202ee5faSTristan Ye 
359202ee5faSTristan Ye 	return ret;
360202ee5faSTristan Ye }
3611c06b912STristan Ye 
3621c06b912STristan Ye /*
3631c06b912STristan Ye  * find the victim alloc group, where #blkno fits.
3641c06b912STristan Ye  */
3651c06b912STristan Ye static int ocfs2_find_victim_alloc_group(struct inode *inode,
3661c06b912STristan Ye 					 u64 vict_blkno,
3671c06b912STristan Ye 					 int type, int slot,
3681c06b912STristan Ye 					 int *vict_bit,
3691c06b912STristan Ye 					 struct buffer_head **ret_bh)
3701c06b912STristan Ye {
3716aea6f50STristan Ye 	int ret, i, bits_per_unit = 0;
3721c06b912STristan Ye 	u64 blkno;
3731c06b912STristan Ye 	char namebuf[40];
3741c06b912STristan Ye 
3751c06b912STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
3761c06b912STristan Ye 	struct buffer_head *ac_bh = NULL, *gd_bh = NULL;
3771c06b912STristan Ye 	struct ocfs2_chain_list *cl;
3781c06b912STristan Ye 	struct ocfs2_chain_rec *rec;
3791c06b912STristan Ye 	struct ocfs2_dinode *ac_dinode;
3801c06b912STristan Ye 	struct ocfs2_group_desc *bg;
3811c06b912STristan Ye 
3821c06b912STristan Ye 	ocfs2_sprintf_system_inode_name(namebuf, sizeof(namebuf), type, slot);
3831c06b912STristan Ye 	ret = ocfs2_lookup_ino_from_name(osb->sys_root_inode, namebuf,
3841c06b912STristan Ye 					 strlen(namebuf), &blkno);
3851c06b912STristan Ye 	if (ret) {
3861c06b912STristan Ye 		ret = -ENOENT;
3871c06b912STristan Ye 		goto out;
3881c06b912STristan Ye 	}
3891c06b912STristan Ye 
3901c06b912STristan Ye 	ret = ocfs2_read_blocks_sync(osb, blkno, 1, &ac_bh);
3911c06b912STristan Ye 	if (ret) {
3921c06b912STristan Ye 		mlog_errno(ret);
3931c06b912STristan Ye 		goto out;
3941c06b912STristan Ye 	}
3951c06b912STristan Ye 
3961c06b912STristan Ye 	ac_dinode = (struct ocfs2_dinode *)ac_bh->b_data;
3971c06b912STristan Ye 	cl = &(ac_dinode->id2.i_chain);
3981c06b912STristan Ye 	rec = &(cl->cl_recs[0]);
3991c06b912STristan Ye 
4001c06b912STristan Ye 	if (type == GLOBAL_BITMAP_SYSTEM_INODE)
4016aea6f50STristan Ye 		bits_per_unit = osb->s_clustersize_bits -
4026aea6f50STristan Ye 					inode->i_sb->s_blocksize_bits;
4031c06b912STristan Ye 	/*
4041c06b912STristan Ye 	 * 'vict_blkno' was out of the valid range.
4051c06b912STristan Ye 	 */
4061c06b912STristan Ye 	if ((vict_blkno < le64_to_cpu(rec->c_blkno)) ||
4076aea6f50STristan Ye 	    (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) <<
4086aea6f50STristan Ye 				bits_per_unit))) {
4091c06b912STristan Ye 		ret = -EINVAL;
4101c06b912STristan Ye 		goto out;
4111c06b912STristan Ye 	}
4121c06b912STristan Ye 
4131c06b912STristan Ye 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i++) {
4141c06b912STristan Ye 
4151c06b912STristan Ye 		rec = &(cl->cl_recs[i]);
4161c06b912STristan Ye 		if (!rec)
4171c06b912STristan Ye 			continue;
4181c06b912STristan Ye 
4191c06b912STristan Ye 		bg = NULL;
4201c06b912STristan Ye 
4211c06b912STristan Ye 		do {
4221c06b912STristan Ye 			if (!bg)
4231c06b912STristan Ye 				blkno = le64_to_cpu(rec->c_blkno);
4241c06b912STristan Ye 			else
4251c06b912STristan Ye 				blkno = le64_to_cpu(bg->bg_next_group);
4261c06b912STristan Ye 
4271c06b912STristan Ye 			if (gd_bh) {
4281c06b912STristan Ye 				brelse(gd_bh);
4291c06b912STristan Ye 				gd_bh = NULL;
4301c06b912STristan Ye 			}
4311c06b912STristan Ye 
4321c06b912STristan Ye 			ret = ocfs2_read_blocks_sync(osb, blkno, 1, &gd_bh);
4331c06b912STristan Ye 			if (ret) {
4341c06b912STristan Ye 				mlog_errno(ret);
4351c06b912STristan Ye 				goto out;
4361c06b912STristan Ye 			}
4371c06b912STristan Ye 
4381c06b912STristan Ye 			bg = (struct ocfs2_group_desc *)gd_bh->b_data;
4391c06b912STristan Ye 
4401c06b912STristan Ye 			if (vict_blkno < (le64_to_cpu(bg->bg_blkno) +
4411c06b912STristan Ye 						le16_to_cpu(bg->bg_bits))) {
4421c06b912STristan Ye 
4431c06b912STristan Ye 				*ret_bh = gd_bh;
4446aea6f50STristan Ye 				*vict_bit = (vict_blkno - blkno) >>
4456aea6f50STristan Ye 							bits_per_unit;
4461c06b912STristan Ye 				mlog(0, "find the victim group: #%llu, "
4471c06b912STristan Ye 				     "total_bits: %u, vict_bit: %u\n",
4481c06b912STristan Ye 				     blkno, le16_to_cpu(bg->bg_bits),
4491c06b912STristan Ye 				     *vict_bit);
4501c06b912STristan Ye 				goto out;
4511c06b912STristan Ye 			}
4521c06b912STristan Ye 
4531c06b912STristan Ye 		} while (le64_to_cpu(bg->bg_next_group));
4541c06b912STristan Ye 	}
4551c06b912STristan Ye 
4561c06b912STristan Ye 	ret = -EINVAL;
4571c06b912STristan Ye out:
4581c06b912STristan Ye 	brelse(ac_bh);
4591c06b912STristan Ye 
4601c06b912STristan Ye 	/*
4611c06b912STristan Ye 	 * caller has to release the gd_bh properly.
4621c06b912STristan Ye 	 */
4631c06b912STristan Ye 	return ret;
4641c06b912STristan Ye }
46599e4c750STristan Ye 
46699e4c750STristan Ye /*
46799e4c750STristan Ye  * XXX: helper to validate and adjust moving goal.
46899e4c750STristan Ye  */
46999e4c750STristan Ye static int ocfs2_validate_and_adjust_move_goal(struct inode *inode,
47099e4c750STristan Ye 					       struct ocfs2_move_extents *range)
47199e4c750STristan Ye {
47299e4c750STristan Ye 	int ret, goal_bit = 0;
47399e4c750STristan Ye 
47499e4c750STristan Ye 	struct buffer_head *gd_bh = NULL;
475ea5e1675STristan Ye 	struct ocfs2_group_desc *bg = NULL;
47699e4c750STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
47799e4c750STristan Ye 	int c_to_b = 1 << (osb->s_clustersize_bits -
47899e4c750STristan Ye 					inode->i_sb->s_blocksize_bits);
47999e4c750STristan Ye 
48099e4c750STristan Ye 	/*
481ea5e1675STristan Ye 	 * make goal become cluster aligned.
482ea5e1675STristan Ye 	 */
483ea5e1675STristan Ye 	range->me_goal = ocfs2_block_to_cluster_start(inode->i_sb,
484ea5e1675STristan Ye 						      range->me_goal);
485ea5e1675STristan Ye 	/*
486ea5e1675STristan Ye 	 * moving goal is not allowd to start with a group desc blok(#0 blk)
487ea5e1675STristan Ye 	 * let's compromise to the latter cluster.
488ea5e1675STristan Ye 	 */
489ea5e1675STristan Ye 	if (range->me_goal == le64_to_cpu(bg->bg_blkno))
490ea5e1675STristan Ye 		range->me_goal += c_to_b;
491ea5e1675STristan Ye 
492ea5e1675STristan Ye 	/*
49399e4c750STristan Ye 	 * validate goal sits within global_bitmap, and return the victim
49499e4c750STristan Ye 	 * group desc
49599e4c750STristan Ye 	 */
49699e4c750STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, range->me_goal,
49799e4c750STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
49899e4c750STristan Ye 					    OCFS2_INVALID_SLOT,
49999e4c750STristan Ye 					    &goal_bit, &gd_bh);
50099e4c750STristan Ye 	if (ret)
50199e4c750STristan Ye 		goto out;
50299e4c750STristan Ye 
50399e4c750STristan Ye 	bg = (struct ocfs2_group_desc *)gd_bh->b_data;
50499e4c750STristan Ye 
50599e4c750STristan Ye 	/*
50699e4c750STristan Ye 	 * movement is not gonna cross two groups.
50799e4c750STristan Ye 	 */
50899e4c750STristan Ye 	if ((le16_to_cpu(bg->bg_bits) - goal_bit) * osb->s_clustersize <
50999e4c750STristan Ye 								range->me_len) {
51099e4c750STristan Ye 		ret = -EINVAL;
51199e4c750STristan Ye 		goto out;
51299e4c750STristan Ye 	}
51399e4c750STristan Ye 	/*
51499e4c750STristan Ye 	 * more exact validations/adjustments will be performed later during
51599e4c750STristan Ye 	 * moving operation for each extent range.
51699e4c750STristan Ye 	 */
51799e4c750STristan Ye 	mlog(0, "extents get ready to be moved to #%llu block\n",
51899e4c750STristan Ye 	     range->me_goal);
51999e4c750STristan Ye 
52099e4c750STristan Ye out:
52199e4c750STristan Ye 	brelse(gd_bh);
52299e4c750STristan Ye 
52399e4c750STristan Ye 	return ret;
52499e4c750STristan Ye }
525e6b5859cSTristan Ye 
526e6b5859cSTristan Ye static void ocfs2_probe_alloc_group(struct inode *inode, struct buffer_head *bh,
527e6b5859cSTristan Ye 				    int *goal_bit, u32 move_len, u32 max_hop,
528e6b5859cSTristan Ye 				    u32 *phys_cpos)
529e6b5859cSTristan Ye {
530e6b5859cSTristan Ye 	int i, used, last_free_bits = 0, base_bit = *goal_bit;
531e6b5859cSTristan Ye 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
532e6b5859cSTristan Ye 	u32 base_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
533e6b5859cSTristan Ye 						 le64_to_cpu(gd->bg_blkno));
534e6b5859cSTristan Ye 
535e6b5859cSTristan Ye 	for (i = base_bit; i < le16_to_cpu(gd->bg_bits); i++) {
536e6b5859cSTristan Ye 
537e6b5859cSTristan Ye 		used = ocfs2_test_bit(i, (unsigned long *)gd->bg_bitmap);
538e6b5859cSTristan Ye 		if (used) {
539e6b5859cSTristan Ye 			/*
540e6b5859cSTristan Ye 			 * we even tried searching the free chunk by jumping
541e6b5859cSTristan Ye 			 * a 'max_hop' distance, but still failed.
542e6b5859cSTristan Ye 			 */
543e6b5859cSTristan Ye 			if ((i - base_bit) > max_hop) {
544e6b5859cSTristan Ye 				*phys_cpos = 0;
545e6b5859cSTristan Ye 				break;
546e6b5859cSTristan Ye 			}
547e6b5859cSTristan Ye 
548e6b5859cSTristan Ye 			if (last_free_bits)
549e6b5859cSTristan Ye 				last_free_bits = 0;
550e6b5859cSTristan Ye 
551e6b5859cSTristan Ye 			continue;
552e6b5859cSTristan Ye 		} else
553e6b5859cSTristan Ye 			last_free_bits++;
554e6b5859cSTristan Ye 
555e6b5859cSTristan Ye 		if (last_free_bits == move_len) {
556e6b5859cSTristan Ye 			*goal_bit = i;
557e6b5859cSTristan Ye 			*phys_cpos = base_cpos + i;
558e6b5859cSTristan Ye 			break;
559e6b5859cSTristan Ye 		}
560e6b5859cSTristan Ye 	}
561e6b5859cSTristan Ye 
562e6b5859cSTristan Ye 	mlog(0, "found phys_cpos: %u to fit the wanted moving.\n", *phys_cpos);
563e6b5859cSTristan Ye }
5648473aa8aSTristan Ye 
5658473aa8aSTristan Ye static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
5668473aa8aSTristan Ye 				       handle_t *handle,
5678473aa8aSTristan Ye 				       struct buffer_head *di_bh,
5688473aa8aSTristan Ye 				       u32 num_bits,
5698473aa8aSTristan Ye 				       u16 chain)
5708473aa8aSTristan Ye {
5718473aa8aSTristan Ye 	int ret;
5728473aa8aSTristan Ye 	u32 tmp_used;
5738473aa8aSTristan Ye 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
5748473aa8aSTristan Ye 	struct ocfs2_chain_list *cl =
5758473aa8aSTristan Ye 				(struct ocfs2_chain_list *) &di->id2.i_chain;
5768473aa8aSTristan Ye 
5778473aa8aSTristan Ye 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
5788473aa8aSTristan Ye 				      OCFS2_JOURNAL_ACCESS_WRITE);
5798473aa8aSTristan Ye 	if (ret < 0) {
5808473aa8aSTristan Ye 		mlog_errno(ret);
5818473aa8aSTristan Ye 		goto out;
5828473aa8aSTristan Ye 	}
5838473aa8aSTristan Ye 
5848473aa8aSTristan Ye 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
5858473aa8aSTristan Ye 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
5868473aa8aSTristan Ye 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
5878473aa8aSTristan Ye 	ocfs2_journal_dirty(handle, di_bh);
5888473aa8aSTristan Ye 
5898473aa8aSTristan Ye out:
5908473aa8aSTristan Ye 	return ret;
5918473aa8aSTristan Ye }
5928473aa8aSTristan Ye 
5938473aa8aSTristan Ye static inline int ocfs2_block_group_set_bits(handle_t *handle,
5948473aa8aSTristan Ye 					     struct inode *alloc_inode,
5958473aa8aSTristan Ye 					     struct ocfs2_group_desc *bg,
5968473aa8aSTristan Ye 					     struct buffer_head *group_bh,
5978473aa8aSTristan Ye 					     unsigned int bit_off,
5988473aa8aSTristan Ye 					     unsigned int num_bits)
5998473aa8aSTristan Ye {
6008473aa8aSTristan Ye 	int status;
6018473aa8aSTristan Ye 	void *bitmap = bg->bg_bitmap;
6028473aa8aSTristan Ye 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
6038473aa8aSTristan Ye 
6048473aa8aSTristan Ye 	/* All callers get the descriptor via
6058473aa8aSTristan Ye 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
6068473aa8aSTristan Ye 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
6078473aa8aSTristan Ye 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
6088473aa8aSTristan Ye 
6098473aa8aSTristan Ye 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
6108473aa8aSTristan Ye 	     num_bits);
6118473aa8aSTristan Ye 
6128473aa8aSTristan Ye 	if (ocfs2_is_cluster_bitmap(alloc_inode))
6138473aa8aSTristan Ye 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
6148473aa8aSTristan Ye 
6158473aa8aSTristan Ye 	status = ocfs2_journal_access_gd(handle,
6168473aa8aSTristan Ye 					 INODE_CACHE(alloc_inode),
6178473aa8aSTristan Ye 					 group_bh,
6188473aa8aSTristan Ye 					 journal_type);
6198473aa8aSTristan Ye 	if (status < 0) {
6208473aa8aSTristan Ye 		mlog_errno(status);
6218473aa8aSTristan Ye 		goto bail;
6228473aa8aSTristan Ye 	}
6238473aa8aSTristan Ye 
6248473aa8aSTristan Ye 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
6258473aa8aSTristan Ye 	if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) {
6268473aa8aSTristan Ye 		ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit"
6278473aa8aSTristan Ye 			    " count %u but claims %u are freed. num_bits %d",
6288473aa8aSTristan Ye 			    (unsigned long long)le64_to_cpu(bg->bg_blkno),
6298473aa8aSTristan Ye 			    le16_to_cpu(bg->bg_bits),
6308473aa8aSTristan Ye 			    le16_to_cpu(bg->bg_free_bits_count), num_bits);
6318473aa8aSTristan Ye 		return -EROFS;
6328473aa8aSTristan Ye 	}
6338473aa8aSTristan Ye 	while (num_bits--)
6348473aa8aSTristan Ye 		ocfs2_set_bit(bit_off++, bitmap);
6358473aa8aSTristan Ye 
6368473aa8aSTristan Ye 	ocfs2_journal_dirty(handle, group_bh);
6378473aa8aSTristan Ye 
6388473aa8aSTristan Ye bail:
6398473aa8aSTristan Ye 	return status;
6408473aa8aSTristan Ye }
641e0847717STristan Ye 
642e0847717STristan Ye static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
643e0847717STristan Ye 			     u32 cpos, u32 phys_cpos, u32 *new_phys_cpos,
644e0847717STristan Ye 			     u32 len, int ext_flags)
645e0847717STristan Ye {
646e0847717STristan Ye 	int ret, credits = 0, extra_blocks = 0, goal_bit = 0;
647e0847717STristan Ye 	handle_t *handle;
648e0847717STristan Ye 	struct inode *inode = context->inode;
649e0847717STristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
650e0847717STristan Ye 	struct inode *tl_inode = osb->osb_tl_inode;
651e0847717STristan Ye 	struct inode *gb_inode = NULL;
652e0847717STristan Ye 	struct buffer_head *gb_bh = NULL;
653e0847717STristan Ye 	struct buffer_head *gd_bh = NULL;
654e0847717STristan Ye 	struct ocfs2_group_desc *gd;
655e0847717STristan Ye 	struct ocfs2_refcount_tree *ref_tree = NULL;
656e0847717STristan Ye 	u32 move_max_hop = ocfs2_blocks_to_clusters(inode->i_sb,
657e0847717STristan Ye 						    context->range->me_threshold);
658e0847717STristan Ye 	u64 phys_blkno, new_phys_blkno;
659e0847717STristan Ye 
660e0847717STristan Ye 	phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
661e0847717STristan Ye 
662e0847717STristan Ye 	if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) {
663e0847717STristan Ye 
664e0847717STristan Ye 		BUG_ON(!(OCFS2_I(inode)->ip_dyn_features &
665e0847717STristan Ye 			 OCFS2_HAS_REFCOUNT_FL));
666e0847717STristan Ye 
667e0847717STristan Ye 		BUG_ON(!context->refcount_loc);
668e0847717STristan Ye 
669e0847717STristan Ye 		ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1,
670e0847717STristan Ye 					       &ref_tree, NULL);
671e0847717STristan Ye 		if (ret) {
672e0847717STristan Ye 			mlog_errno(ret);
673e0847717STristan Ye 			return ret;
674e0847717STristan Ye 		}
675e0847717STristan Ye 
676e0847717STristan Ye 		ret = ocfs2_prepare_refcount_change_for_del(inode,
677e0847717STristan Ye 							context->refcount_loc,
678e0847717STristan Ye 							phys_blkno,
679e0847717STristan Ye 							len,
680e0847717STristan Ye 							&credits,
681e0847717STristan Ye 							&extra_blocks);
682e0847717STristan Ye 		if (ret) {
683e0847717STristan Ye 			mlog_errno(ret);
684e0847717STristan Ye 			goto out;
685e0847717STristan Ye 		}
686e0847717STristan Ye 	}
687e0847717STristan Ye 
688e0847717STristan Ye 	ret = ocfs2_lock_allocators_move_extents(inode, &context->et, len, 1,
689e0847717STristan Ye 						 &context->meta_ac,
690e0847717STristan Ye 						 NULL, extra_blocks, &credits);
691e0847717STristan Ye 	if (ret) {
692e0847717STristan Ye 		mlog_errno(ret);
693e0847717STristan Ye 		goto out;
694e0847717STristan Ye 	}
695e0847717STristan Ye 
696e0847717STristan Ye 	/*
697e0847717STristan Ye 	 * need to count 2 extra credits for global_bitmap inode and
698e0847717STristan Ye 	 * group descriptor.
699e0847717STristan Ye 	 */
700e0847717STristan Ye 	credits += OCFS2_INODE_UPDATE_CREDITS + 1;
701e0847717STristan Ye 
702e0847717STristan Ye 	/*
703e0847717STristan Ye 	 * ocfs2_move_extent() didn't reserve any clusters in lock_allocators()
704e0847717STristan Ye 	 * logic, while we still need to lock the global_bitmap.
705e0847717STristan Ye 	 */
706e0847717STristan Ye 	gb_inode = ocfs2_get_system_file_inode(osb, GLOBAL_BITMAP_SYSTEM_INODE,
707e0847717STristan Ye 					       OCFS2_INVALID_SLOT);
708e0847717STristan Ye 	if (!gb_inode) {
709e0847717STristan Ye 		mlog(ML_ERROR, "unable to get global_bitmap inode\n");
710e0847717STristan Ye 		ret = -EIO;
711e0847717STristan Ye 		goto out;
712e0847717STristan Ye 	}
713e0847717STristan Ye 
714e0847717STristan Ye 	mutex_lock(&gb_inode->i_mutex);
715e0847717STristan Ye 
716e0847717STristan Ye 	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
717e0847717STristan Ye 	if (ret) {
718e0847717STristan Ye 		mlog_errno(ret);
719e0847717STristan Ye 		goto out_unlock_gb_mutex;
720e0847717STristan Ye 	}
721e0847717STristan Ye 
722e0847717STristan Ye 	mutex_lock(&tl_inode->i_mutex);
723e0847717STristan Ye 
724e0847717STristan Ye 	handle = ocfs2_start_trans(osb, credits);
725e0847717STristan Ye 	if (IS_ERR(handle)) {
726e0847717STristan Ye 		ret = PTR_ERR(handle);
727e0847717STristan Ye 		mlog_errno(ret);
728e0847717STristan Ye 		goto out_unlock_tl_inode;
729e0847717STristan Ye 	}
730e0847717STristan Ye 
731e0847717STristan Ye 	new_phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, *new_phys_cpos);
732e0847717STristan Ye 	ret = ocfs2_find_victim_alloc_group(inode, new_phys_blkno,
733e0847717STristan Ye 					    GLOBAL_BITMAP_SYSTEM_INODE,
734e0847717STristan Ye 					    OCFS2_INVALID_SLOT,
735e0847717STristan Ye 					    &goal_bit, &gd_bh);
736e0847717STristan Ye 	if (ret) {
737e0847717STristan Ye 		mlog_errno(ret);
738e0847717STristan Ye 		goto out_commit;
739e0847717STristan Ye 	}
740e0847717STristan Ye 
741e0847717STristan Ye 	/*
742e0847717STristan Ye 	 * probe the victim cluster group to find a proper
743e0847717STristan Ye 	 * region to fit wanted movement, it even will perfrom
744e0847717STristan Ye 	 * a best-effort attempt by compromising to a threshold
745e0847717STristan Ye 	 * around the goal.
746e0847717STristan Ye 	 */
747e0847717STristan Ye 	ocfs2_probe_alloc_group(inode, gd_bh, &goal_bit, len, move_max_hop,
748e0847717STristan Ye 				new_phys_cpos);
7493d75be7cSDan Carpenter 	if (!*new_phys_cpos) {
750e0847717STristan Ye 		ret = -ENOSPC;
751e0847717STristan Ye 		goto out_commit;
752e0847717STristan Ye 	}
753e0847717STristan Ye 
754e0847717STristan Ye 	ret = __ocfs2_move_extent(handle, context, cpos, len, phys_cpos,
755e0847717STristan Ye 				  *new_phys_cpos, ext_flags);
756e0847717STristan Ye 	if (ret) {
757e0847717STristan Ye 		mlog_errno(ret);
758e0847717STristan Ye 		goto out_commit;
759e0847717STristan Ye 	}
760e0847717STristan Ye 
761e0847717STristan Ye 	gd = (struct ocfs2_group_desc *)gd_bh->b_data;
762e0847717STristan Ye 	ret = ocfs2_alloc_dinode_update_counts(gb_inode, handle, gb_bh, len,
763e0847717STristan Ye 					       le16_to_cpu(gd->bg_chain));
764e0847717STristan Ye 	if (ret) {
765e0847717STristan Ye 		mlog_errno(ret);
766e0847717STristan Ye 		goto out_commit;
767e0847717STristan Ye 	}
768e0847717STristan Ye 
769e0847717STristan Ye 	ret = ocfs2_block_group_set_bits(handle, gb_inode, gd, gd_bh,
770e0847717STristan Ye 					 goal_bit, len);
771e0847717STristan Ye 	if (ret)
772e0847717STristan Ye 		mlog_errno(ret);
773e0847717STristan Ye 
774e0847717STristan Ye 	/*
775e0847717STristan Ye 	 * Here we should write the new page out first if we are
776e0847717STristan Ye 	 * in write-back mode.
777e0847717STristan Ye 	 */
778e0847717STristan Ye 	ret = ocfs2_cow_sync_writeback(inode->i_sb, context->inode, cpos, len);
779e0847717STristan Ye 	if (ret)
780e0847717STristan Ye 		mlog_errno(ret);
781e0847717STristan Ye 
782e0847717STristan Ye out_commit:
783e0847717STristan Ye 	ocfs2_commit_trans(osb, handle);
784e0847717STristan Ye 	brelse(gd_bh);
785e0847717STristan Ye 
786e0847717STristan Ye out_unlock_tl_inode:
787e0847717STristan Ye 	mutex_unlock(&tl_inode->i_mutex);
788e0847717STristan Ye 
789e0847717STristan Ye 	ocfs2_inode_unlock(gb_inode, 1);
790e0847717STristan Ye out_unlock_gb_mutex:
791e0847717STristan Ye 	mutex_unlock(&gb_inode->i_mutex);
792e0847717STristan Ye 	brelse(gb_bh);
793e0847717STristan Ye 	iput(gb_inode);
794e0847717STristan Ye 
795e0847717STristan Ye out:
796e0847717STristan Ye 	if (context->meta_ac) {
797e0847717STristan Ye 		ocfs2_free_alloc_context(context->meta_ac);
798e0847717STristan Ye 		context->meta_ac = NULL;
799e0847717STristan Ye 	}
800e0847717STristan Ye 
801e0847717STristan Ye 	if (ref_tree)
802e0847717STristan Ye 		ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
803e0847717STristan Ye 
804e0847717STristan Ye 	return ret;
805e0847717STristan Ye }
806ee16cc03STristan Ye 
/*
 * Decide how much of the current extent takes part in this defrag round.
 *
 * @alloc_size:   in: length of the current extent; out: shortened when only
 *                part of it is needed to reach the threshold.
 * @len_defraged: running total gathered so far in the current round;
 *                reset to 0 once a round is completed.
 * @threshold:    size of one defragmentation round.
 * @skip:         set to 1 when the extent should be left alone; never
 *                cleared here, the caller resets it.
 */
static void ocfs2_calc_extent_defrag_len(u32 *alloc_size, u32 *len_defraged,
					 u32 threshold, int *skip)
{
	u32 gathered = *len_defraged;

	/* still below the threshold: swallow the whole extent and go on */
	if ((*alloc_size + gathered) < threshold) {
		*len_defraged = gathered + *alloc_size;
		return;
	}

	/* XXX: nothing gathered yet and the extent is already large, skip it */
	if (gathered == 0) {
		*skip = 1;
		return;
	}

	/*
	 * Take just enough of this extent to top the round up to
	 * 'threshold', then zero the running total so the next call
	 * starts a fresh round.
	 */
	*alloc_size = threshold - gathered;
	*len_defraged = 0;
}
83653069d4eSTristan Ye 
83753069d4eSTristan Ye static int __ocfs2_move_extents_range(struct buffer_head *di_bh,
83853069d4eSTristan Ye 				struct ocfs2_move_extents_context *context)
83953069d4eSTristan Ye {
84053069d4eSTristan Ye 	int ret = 0, flags, do_defrag, skip = 0;
84153069d4eSTristan Ye 	u32 cpos, phys_cpos, move_start, len_to_move, alloc_size;
84253069d4eSTristan Ye 	u32 len_defraged = 0, defrag_thresh = 0, new_phys_cpos = 0;
84353069d4eSTristan Ye 
84453069d4eSTristan Ye 	struct inode *inode = context->inode;
84553069d4eSTristan Ye 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
84653069d4eSTristan Ye 	struct ocfs2_move_extents *range = context->range;
84753069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
84853069d4eSTristan Ye 
84953069d4eSTristan Ye 	if ((inode->i_size == 0) || (range->me_len == 0))
85053069d4eSTristan Ye 		return 0;
85153069d4eSTristan Ye 
85253069d4eSTristan Ye 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
85353069d4eSTristan Ye 		return 0;
85453069d4eSTristan Ye 
85553069d4eSTristan Ye 	context->refcount_loc = le64_to_cpu(di->i_refcount_loc);
85653069d4eSTristan Ye 
85753069d4eSTristan Ye 	ocfs2_init_dinode_extent_tree(&context->et, INODE_CACHE(inode), di_bh);
85853069d4eSTristan Ye 	ocfs2_init_dealloc_ctxt(&context->dealloc);
85953069d4eSTristan Ye 
86053069d4eSTristan Ye 	/*
86153069d4eSTristan Ye 	 * TO-DO XXX:
86253069d4eSTristan Ye 	 *
86353069d4eSTristan Ye 	 * - xattr extents.
86453069d4eSTristan Ye 	 */
86553069d4eSTristan Ye 
86653069d4eSTristan Ye 	do_defrag = context->auto_defrag;
86753069d4eSTristan Ye 
86853069d4eSTristan Ye 	/*
86953069d4eSTristan Ye 	 * extents moving happens in unit of clusters, for the sake
87053069d4eSTristan Ye 	 * of simplicity, we may ignore two clusters where 'byte_start'
87153069d4eSTristan Ye 	 * and 'byte_start + len' were within.
87253069d4eSTristan Ye 	 */
87353069d4eSTristan Ye 	move_start = ocfs2_clusters_for_bytes(osb->sb, range->me_start);
87453069d4eSTristan Ye 	len_to_move = (range->me_start + range->me_len) >>
87553069d4eSTristan Ye 						osb->s_clustersize_bits;
87653069d4eSTristan Ye 	if (len_to_move >= move_start)
87753069d4eSTristan Ye 		len_to_move -= move_start;
87853069d4eSTristan Ye 	else
87953069d4eSTristan Ye 		len_to_move = 0;
88053069d4eSTristan Ye 
881dda54e76STristan Ye 	if (do_defrag) {
88253069d4eSTristan Ye 		defrag_thresh = range->me_threshold >> osb->s_clustersize_bits;
883dda54e76STristan Ye 		if (defrag_thresh <= 1)
884dda54e76STristan Ye 			goto done;
885dda54e76STristan Ye 	} else
88653069d4eSTristan Ye 		new_phys_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
88753069d4eSTristan Ye 							 range->me_goal);
88853069d4eSTristan Ye 
88953069d4eSTristan Ye 	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u, "
89053069d4eSTristan Ye 	     "thresh: %u\n",
89153069d4eSTristan Ye 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
89253069d4eSTristan Ye 	     (unsigned long long)range->me_start,
89353069d4eSTristan Ye 	     (unsigned long long)range->me_len,
89453069d4eSTristan Ye 	     move_start, len_to_move, defrag_thresh);
89553069d4eSTristan Ye 
89653069d4eSTristan Ye 	cpos = move_start;
89753069d4eSTristan Ye 	while (len_to_move) {
89853069d4eSTristan Ye 		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &alloc_size,
89953069d4eSTristan Ye 					 &flags);
90053069d4eSTristan Ye 		if (ret) {
90153069d4eSTristan Ye 			mlog_errno(ret);
90253069d4eSTristan Ye 			goto out;
90353069d4eSTristan Ye 		}
90453069d4eSTristan Ye 
90553069d4eSTristan Ye 		if (alloc_size > len_to_move)
90653069d4eSTristan Ye 			alloc_size = len_to_move;
90753069d4eSTristan Ye 
90853069d4eSTristan Ye 		/*
90953069d4eSTristan Ye 		 * XXX: how to deal with a hole:
91053069d4eSTristan Ye 		 *
91153069d4eSTristan Ye 		 * - skip the hole of course
91253069d4eSTristan Ye 		 * - force a new defragmentation
91353069d4eSTristan Ye 		 */
91453069d4eSTristan Ye 		if (!phys_cpos) {
91553069d4eSTristan Ye 			if (do_defrag)
91653069d4eSTristan Ye 				len_defraged = 0;
91753069d4eSTristan Ye 
91853069d4eSTristan Ye 			goto next;
91953069d4eSTristan Ye 		}
92053069d4eSTristan Ye 
92153069d4eSTristan Ye 		if (do_defrag) {
92253069d4eSTristan Ye 			ocfs2_calc_extent_defrag_len(&alloc_size, &len_defraged,
92353069d4eSTristan Ye 						     defrag_thresh, &skip);
92453069d4eSTristan Ye 			/*
92553069d4eSTristan Ye 			 * skip large extents
92653069d4eSTristan Ye 			 */
92753069d4eSTristan Ye 			if (skip) {
92853069d4eSTristan Ye 				skip = 0;
92953069d4eSTristan Ye 				goto next;
93053069d4eSTristan Ye 			}
93153069d4eSTristan Ye 
93253069d4eSTristan Ye 			mlog(0, "#Defrag: cpos: %u, phys_cpos: %u, "
93353069d4eSTristan Ye 			     "alloc_size: %u, len_defraged: %u\n",
93453069d4eSTristan Ye 			     cpos, phys_cpos, alloc_size, len_defraged);
93553069d4eSTristan Ye 
93653069d4eSTristan Ye 			ret = ocfs2_defrag_extent(context, cpos, phys_cpos,
9374dfa66bdSTristan Ye 						  &alloc_size, flags);
93853069d4eSTristan Ye 		} else {
93953069d4eSTristan Ye 			ret = ocfs2_move_extent(context, cpos, phys_cpos,
94053069d4eSTristan Ye 						&new_phys_cpos, alloc_size,
94153069d4eSTristan Ye 						flags);
94253069d4eSTristan Ye 
94353069d4eSTristan Ye 			new_phys_cpos += alloc_size;
94453069d4eSTristan Ye 		}
94553069d4eSTristan Ye 
94653069d4eSTristan Ye 		if (ret < 0) {
94753069d4eSTristan Ye 			mlog_errno(ret);
94853069d4eSTristan Ye 			goto out;
94953069d4eSTristan Ye 		}
95053069d4eSTristan Ye 
95153069d4eSTristan Ye 		context->clusters_moved += alloc_size;
95253069d4eSTristan Ye next:
95353069d4eSTristan Ye 		cpos += alloc_size;
95453069d4eSTristan Ye 		len_to_move -= alloc_size;
95553069d4eSTristan Ye 	}
95653069d4eSTristan Ye 
957dda54e76STristan Ye done:
95853069d4eSTristan Ye 	range->me_flags |= OCFS2_MOVE_EXT_FL_COMPLETE;
95953069d4eSTristan Ye 
96053069d4eSTristan Ye out:
96153069d4eSTristan Ye 	range->me_moved_len = ocfs2_clusters_to_bytes(osb->sb,
96253069d4eSTristan Ye 						      context->clusters_moved);
96353069d4eSTristan Ye 	range->me_new_offset = ocfs2_clusters_to_bytes(osb->sb,
96453069d4eSTristan Ye 						       context->new_phys_cpos);
96553069d4eSTristan Ye 
96653069d4eSTristan Ye 	ocfs2_schedule_truncate_log_flush(osb, 1);
96753069d4eSTristan Ye 	ocfs2_run_deallocs(osb, &context->dealloc);
96853069d4eSTristan Ye 
96953069d4eSTristan Ye 	return ret;
97053069d4eSTristan Ye }
97153069d4eSTristan Ye 
97253069d4eSTristan Ye static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
97353069d4eSTristan Ye {
97453069d4eSTristan Ye 	int status;
97553069d4eSTristan Ye 	handle_t *handle;
97653069d4eSTristan Ye 	struct inode *inode = context->inode;
97753069d4eSTristan Ye 	struct ocfs2_dinode *di;
97853069d4eSTristan Ye 	struct buffer_head *di_bh = NULL;
97953069d4eSTristan Ye 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
98053069d4eSTristan Ye 
98153069d4eSTristan Ye 	if (!inode)
98253069d4eSTristan Ye 		return -ENOENT;
98353069d4eSTristan Ye 
98453069d4eSTristan Ye 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
98553069d4eSTristan Ye 		return -EROFS;
98653069d4eSTristan Ye 
98753069d4eSTristan Ye 	mutex_lock(&inode->i_mutex);
98853069d4eSTristan Ye 
98953069d4eSTristan Ye 	/*
99053069d4eSTristan Ye 	 * This prevents concurrent writes from other nodes
99153069d4eSTristan Ye 	 */
99253069d4eSTristan Ye 	status = ocfs2_rw_lock(inode, 1);
99353069d4eSTristan Ye 	if (status) {
99453069d4eSTristan Ye 		mlog_errno(status);
99553069d4eSTristan Ye 		goto out;
99653069d4eSTristan Ye 	}
99753069d4eSTristan Ye 
99853069d4eSTristan Ye 	status = ocfs2_inode_lock(inode, &di_bh, 1);
99953069d4eSTristan Ye 	if (status) {
100053069d4eSTristan Ye 		mlog_errno(status);
100153069d4eSTristan Ye 		goto out_rw_unlock;
100253069d4eSTristan Ye 	}
100353069d4eSTristan Ye 
100453069d4eSTristan Ye 	/*
100553069d4eSTristan Ye 	 * rememer ip_xattr_sem also needs to be held if necessary
100653069d4eSTristan Ye 	 */
100753069d4eSTristan Ye 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
100853069d4eSTristan Ye 
100953069d4eSTristan Ye 	status = __ocfs2_move_extents_range(di_bh, context);
101053069d4eSTristan Ye 
101153069d4eSTristan Ye 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
101253069d4eSTristan Ye 	if (status) {
101353069d4eSTristan Ye 		mlog_errno(status);
101453069d4eSTristan Ye 		goto out_inode_unlock;
101553069d4eSTristan Ye 	}
101653069d4eSTristan Ye 
101753069d4eSTristan Ye 	/*
101853069d4eSTristan Ye 	 * We update ctime for these changes
101953069d4eSTristan Ye 	 */
102053069d4eSTristan Ye 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
102153069d4eSTristan Ye 	if (IS_ERR(handle)) {
102253069d4eSTristan Ye 		status = PTR_ERR(handle);
102353069d4eSTristan Ye 		mlog_errno(status);
102453069d4eSTristan Ye 		goto out_inode_unlock;
102553069d4eSTristan Ye 	}
102653069d4eSTristan Ye 
102753069d4eSTristan Ye 	status = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
102853069d4eSTristan Ye 					 OCFS2_JOURNAL_ACCESS_WRITE);
102953069d4eSTristan Ye 	if (status) {
103053069d4eSTristan Ye 		mlog_errno(status);
103153069d4eSTristan Ye 		goto out_commit;
103253069d4eSTristan Ye 	}
103353069d4eSTristan Ye 
103453069d4eSTristan Ye 	di = (struct ocfs2_dinode *)di_bh->b_data;
103553069d4eSTristan Ye 	inode->i_ctime = CURRENT_TIME;
103653069d4eSTristan Ye 	di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
103753069d4eSTristan Ye 	di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
103853069d4eSTristan Ye 
103953069d4eSTristan Ye 	ocfs2_journal_dirty(handle, di_bh);
104053069d4eSTristan Ye 
104153069d4eSTristan Ye out_commit:
104253069d4eSTristan Ye 	ocfs2_commit_trans(osb, handle);
104353069d4eSTristan Ye 
104453069d4eSTristan Ye out_inode_unlock:
104553069d4eSTristan Ye 	brelse(di_bh);
104653069d4eSTristan Ye 	ocfs2_inode_unlock(inode, 1);
104753069d4eSTristan Ye out_rw_unlock:
104853069d4eSTristan Ye 	ocfs2_rw_unlock(inode, 1);
104953069d4eSTristan Ye out:
105053069d4eSTristan Ye 	mutex_unlock(&inode->i_mutex);
105153069d4eSTristan Ye 
105253069d4eSTristan Ye 	return status;
105353069d4eSTristan Ye }
105453069d4eSTristan Ye 
105553069d4eSTristan Ye int ocfs2_ioctl_move_extents(struct file *filp, void __user *argp)
105653069d4eSTristan Ye {
105753069d4eSTristan Ye 	int status;
105853069d4eSTristan Ye 
105953069d4eSTristan Ye 	struct inode *inode = filp->f_path.dentry->d_inode;
106053069d4eSTristan Ye 	struct ocfs2_move_extents range;
106153069d4eSTristan Ye 	struct ocfs2_move_extents_context *context = NULL;
106253069d4eSTristan Ye 
106353069d4eSTristan Ye 	status = mnt_want_write(filp->f_path.mnt);
106453069d4eSTristan Ye 	if (status)
106553069d4eSTristan Ye 		return status;
106653069d4eSTristan Ye 
106753069d4eSTristan Ye 	if ((!S_ISREG(inode->i_mode)) || !(filp->f_mode & FMODE_WRITE))
106853069d4eSTristan Ye 		goto out;
106953069d4eSTristan Ye 
107053069d4eSTristan Ye 	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
107153069d4eSTristan Ye 		status = -EPERM;
107253069d4eSTristan Ye 		goto out;
107353069d4eSTristan Ye 	}
107453069d4eSTristan Ye 
107553069d4eSTristan Ye 	context = kzalloc(sizeof(struct ocfs2_move_extents_context), GFP_NOFS);
107653069d4eSTristan Ye 	if (!context) {
107753069d4eSTristan Ye 		status = -ENOMEM;
107853069d4eSTristan Ye 		mlog_errno(status);
107953069d4eSTristan Ye 		goto out;
108053069d4eSTristan Ye 	}
108153069d4eSTristan Ye 
108253069d4eSTristan Ye 	context->inode = inode;
108353069d4eSTristan Ye 	context->file = filp;
108453069d4eSTristan Ye 
108553069d4eSTristan Ye 	if (argp) {
108653069d4eSTristan Ye 		if (copy_from_user(&range, (struct ocfs2_move_extents *)argp,
108753069d4eSTristan Ye 				   sizeof(range))) {
108853069d4eSTristan Ye 			status = -EFAULT;
108953069d4eSTristan Ye 			goto out;
109053069d4eSTristan Ye 		}
109153069d4eSTristan Ye 	} else {
109253069d4eSTristan Ye 		status = -EINVAL;
109353069d4eSTristan Ye 		goto out;
109453069d4eSTristan Ye 	}
109553069d4eSTristan Ye 
109653069d4eSTristan Ye 	if (range.me_start > i_size_read(inode))
109753069d4eSTristan Ye 		goto out;
109853069d4eSTristan Ye 
109953069d4eSTristan Ye 	if (range.me_start + range.me_len > i_size_read(inode))
110053069d4eSTristan Ye 			range.me_len = i_size_read(inode) - range.me_start;
110153069d4eSTristan Ye 
110253069d4eSTristan Ye 	context->range = &range;
110353069d4eSTristan Ye 
110453069d4eSTristan Ye 	if (range.me_flags & OCFS2_MOVE_EXT_FL_AUTO_DEFRAG) {
110553069d4eSTristan Ye 		context->auto_defrag = 1;
110653069d4eSTristan Ye 		/*
110753069d4eSTristan Ye 		 * ok, the default theshold for the defragmentation
110853069d4eSTristan Ye 		 * is 1M, since our maximum clustersize was 1M also.
110953069d4eSTristan Ye 		 * any thought?
111053069d4eSTristan Ye 		 */
1111dda54e76STristan Ye 		if (!range.me_threshold)
111253069d4eSTristan Ye 			range.me_threshold = 1024 * 1024;
1113dda54e76STristan Ye 
1114dda54e76STristan Ye 		if (range.me_threshold > i_size_read(inode))
1115dda54e76STristan Ye 			range.me_threshold = i_size_read(inode);
1116dda54e76STristan Ye 
11174dfa66bdSTristan Ye 		if (range.me_flags & OCFS2_MOVE_EXT_FL_PART_DEFRAG)
11184dfa66bdSTristan Ye 			context->partial = 1;
111953069d4eSTristan Ye 	} else {
112053069d4eSTristan Ye 		/*
112153069d4eSTristan Ye 		 * first best-effort attempt to validate and adjust the goal
112253069d4eSTristan Ye 		 * (physical address in block), while it can't guarantee later
112353069d4eSTristan Ye 		 * operation can succeed all the time since global_bitmap may
112453069d4eSTristan Ye 		 * change a bit over time.
112553069d4eSTristan Ye 		 */
112653069d4eSTristan Ye 
112753069d4eSTristan Ye 		status = ocfs2_validate_and_adjust_move_goal(inode, &range);
112853069d4eSTristan Ye 		if (status)
112953069d4eSTristan Ye 			goto out;
113053069d4eSTristan Ye 	}
113153069d4eSTristan Ye 
113253069d4eSTristan Ye 	status = ocfs2_move_extents(context);
113353069d4eSTristan Ye 	if (status)
113453069d4eSTristan Ye 		mlog_errno(status);
113553069d4eSTristan Ye out:
113653069d4eSTristan Ye 	/*
113753069d4eSTristan Ye 	 * movement/defragmentation may end up being partially completed,
113853069d4eSTristan Ye 	 * that's the reason why we need to return userspace the finished
113953069d4eSTristan Ye 	 * length and new_offset even if failure happens somewhere.
114053069d4eSTristan Ye 	 */
114153069d4eSTristan Ye 	if (argp) {
114253069d4eSTristan Ye 		if (copy_to_user((struct ocfs2_move_extents *)argp, &range,
114353069d4eSTristan Ye 				sizeof(range)))
114453069d4eSTristan Ye 			status = -EFAULT;
114553069d4eSTristan Ye 	}
114653069d4eSTristan Ye 
114753069d4eSTristan Ye 	kfree(context);
114853069d4eSTristan Ye 
114953069d4eSTristan Ye 	mnt_drop_write(filp->f_path.mnt);
115053069d4eSTristan Ye 
115153069d4eSTristan Ye 	return status;
115253069d4eSTristan Ye }
1153