xref: /openbmc/linux/fs/ocfs2/alloc.c (revision 5e404e9e)
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * alloc.c
 *
 * Extent allocs and frees
 *
 * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/quotaops.h>

#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>

#include "ocfs2.h"

#include "alloc.h"
#include "aops.h"
#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
#include "journal.h"
#include "localalloc.h"
#include "suballoc.h"
#include "sysfile.h"
#include "file.h"
#include "super.h"
#include "uptodate.h"
#include "xattr.h"

#include "buffer_head_io.h"


/*
 * Operations for a specific extent tree type.
 *
 * To implement an on-disk btree (extent tree) type in ocfs2, add
 * an ocfs2_extent_tree_operations structure and the matching
 * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
 * for the allocation portion of the extent tree.
 */
struct ocfs2_extent_tree_operations {
	/*
	 * last_eb_blk is the block number of the rightmost leaf extent
	 * block.  Most on-disk structures containing an extent tree store
	 * this value for fast access.  The ->eo_set_last_eb_blk() and
	 * ->eo_get_last_eb_blk() operations access this value.  They are
	 * both required.
	 */
	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
				   u64 blkno);
	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);

	/*
	 * The on-disk structure usually keeps track of how many total
	 * clusters are stored in this extent tree.  This function updates
	 * that value.  new_clusters is the delta, and must be
	 * added to the total.  Required.
	 */
	void (*eo_update_clusters)(struct ocfs2_extent_tree *et,
				   u32 new_clusters);

	/*
	 * If this extent tree is supported by an extent map, insert
	 * a record into the map.
	 */
	void (*eo_extent_map_insert)(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec);

	/*
	 * If this extent tree is supported by an extent map, truncate the
	 * map to clusters.
	 */
	void (*eo_extent_map_truncate)(struct ocfs2_extent_tree *et,
				       u32 clusters);

	/*
	 * If ->eo_insert_check() exists, it is called before rec is
	 * inserted into the extent tree.  It is optional.
	 */
	int (*eo_insert_check)(struct ocfs2_extent_tree *et,
			       struct ocfs2_extent_rec *rec);
	int (*eo_sanity_check)(struct ocfs2_extent_tree *et);

	/*
	 * --------------------------------------------------------------
	 * The remaining are internal to ocfs2_extent_tree and don't have
	 * accessor functions
	 */

	/*
	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
	 * It is required.
	 */
	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);

	/*
	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
	 * it exists.  If it does not, et->et_max_leaf_clusters is set
	 * to 0 (unlimited).  Optional.
	 */
	void (*eo_fill_max_leaf_clusters)(struct ocfs2_extent_tree *et);
};
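
/*
 * Illustrative sketch only, not part of the on-disk format: a new
 * extent tree type would be wired up roughly as below.  The "frob"
 * structure and its fields are hypothetical; a real implementation
 * supplies at least the required eo_set_last_eb_blk,
 * eo_get_last_eb_blk, eo_update_clusters and eo_fill_root_el
 * callbacks, plus a matching ocfs2_init_frob_extent_tree() that
 * hands these ops to __ocfs2_init_extent_tree() (defined below).
 *
 *	static void ocfs2_frob_fill_root_el(struct ocfs2_extent_tree *et)
 *	{
 *		struct ocfs2_frob_block *fb = et->et_object;
 *
 *		et->et_root_el = &fb->fb_list;
 *	}
 *
 *	static struct ocfs2_extent_tree_operations ocfs2_frob_et_ops = {
 *		.eo_set_last_eb_blk	= ocfs2_frob_set_last_eb_blk,
 *		.eo_get_last_eb_blk	= ocfs2_frob_get_last_eb_blk,
 *		.eo_update_clusters	= ocfs2_frob_update_clusters,
 *		.eo_fill_root_el	= ocfs2_frob_fill_root_el,
 *	};
 */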


/*
 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
 * in the methods.
 */
static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
					 u64 blkno);
static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
					 u32 clusters);
static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
					   struct ocfs2_extent_rec *rec);
static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
					     u32 clusters);
static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec);
static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_dinode_update_clusters,
	.eo_extent_map_insert	= ocfs2_dinode_extent_map_insert,
	.eo_extent_map_truncate	= ocfs2_dinode_extent_map_truncate,
	.eo_insert_check	= ocfs2_dinode_insert_check,
	.eo_sanity_check	= ocfs2_dinode_sanity_check,
	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
};

static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
					 u64 blkno)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	di->i_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	return le64_to_cpu(di->i_last_eb_blk);
}

static void ocfs2_dinode_update_clusters(struct ocfs2_extent_tree *et,
					 u32 clusters)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
	struct ocfs2_dinode *di = et->et_object;

	le32_add_cpu(&di->i_clusters, clusters);
	spin_lock(&oi->ip_lock);
	oi->ip_clusters = le32_to_cpu(di->i_clusters);
	spin_unlock(&oi->ip_lock);
}

static void ocfs2_dinode_extent_map_insert(struct ocfs2_extent_tree *et,
					   struct ocfs2_extent_rec *rec)
{
	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

	ocfs2_extent_map_insert_rec(inode, rec);
}

static void ocfs2_dinode_extent_map_truncate(struct ocfs2_extent_tree *et,
					     u32 clusters)
{
	struct inode *inode = &cache_info_to_inode(et->et_ci)->vfs_inode;

	ocfs2_extent_map_trunc(inode, clusters);
}

static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
				     struct ocfs2_extent_rec *rec)
{
	struct ocfs2_inode_info *oi = cache_info_to_inode(et->et_ci);
	struct ocfs2_super *osb = OCFS2_SB(oi->vfs_inode.i_sb);

	BUG_ON(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL);
	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
			(oi->ip_clusters != le32_to_cpu(rec->e_cpos)),
			"Device %s, asking for sparse allocation: inode %llu, "
			"cpos %u, clusters %u\n",
			osb->dev_str,
			(unsigned long long)oi->ip_blkno,
			le32_to_cpu(rec->e_cpos), oi->ip_clusters);

	return 0;
}

static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	return 0;
}

static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dinode *di = et->et_object;

	et->et_root_el = &di->id2.i_list;
}


static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	et->et_root_el = &vb->vb_xv->xr_list;
}

static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
					      u64 blkno)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}

static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
					      u32 clusters)
{
	struct ocfs2_xattr_value_buf *vb = et->et_object;

	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}

static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
};

static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_block *xb = et->et_object;

	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
}

static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct ocfs2_extent_tree *et)
{
	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);

	et->et_max_leaf_clusters =
		ocfs2_clusters_for_bytes(sb, OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
}

static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
					     u64 blkno)
{
	struct ocfs2_xattr_block *xb = et->et_object;
	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

	xt->xt_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_xattr_block *xb = et->et_object;
	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;

	return le64_to_cpu(xt->xt_last_eb_blk);
}

static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
					     u32 clusters)
{
	struct ocfs2_xattr_block *xb = et->et_object;

	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
}

static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
};

static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
					  u64 blkno)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
}

static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	return le64_to_cpu(dx_root->dr_last_eb_blk);
}

static void ocfs2_dx_root_update_clusters(struct ocfs2_extent_tree *et,
					  u32 clusters)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	le32_add_cpu(&dx_root->dr_clusters, clusters);
}

static int ocfs2_dx_root_sanity_check(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));

	return 0;
}

static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
{
	struct ocfs2_dx_root_block *dx_root = et->et_object;

	et->et_root_el = &dx_root->dr_list;
}

static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
};

static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
				     struct ocfs2_caching_info *ci,
				     struct buffer_head *bh,
				     ocfs2_journal_access_func access,
				     void *obj,
				     struct ocfs2_extent_tree_operations *ops)
{
	et->et_ops = ops;
	et->et_root_bh = bh;
	et->et_ci = ci;
	et->et_root_journal_access = access;
	if (!obj)
		obj = (void *)bh->b_data;
	et->et_object = obj;

	et->et_ops->eo_fill_root_el(et);
	if (!et->et_ops->eo_fill_max_leaf_clusters)
		et->et_max_leaf_clusters = 0;
	else
		et->et_ops->eo_fill_max_leaf_clusters(et);
}

void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
				   struct ocfs2_caching_info *ci,
				   struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_di,
				 NULL, &ocfs2_dinode_et_ops);
}

void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
				       struct ocfs2_caching_info *ci,
				       struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_xb,
				 NULL, &ocfs2_xattr_tree_et_ops);
}

void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
					struct ocfs2_caching_info *ci,
					struct ocfs2_xattr_value_buf *vb)
{
	__ocfs2_init_extent_tree(et, ci, vb->vb_bh, vb->vb_access, vb,
				 &ocfs2_xattr_value_et_ops);
}

void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
				    struct ocfs2_caching_info *ci,
				    struct buffer_head *bh)
{
	__ocfs2_init_extent_tree(et, ci, bh, ocfs2_journal_access_dr,
				 NULL, &ocfs2_dx_root_et_ops);
}

static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
					    u64 new_last_eb_blk)
{
	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
}

static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	return et->et_ops->eo_get_last_eb_blk(et);
}

static inline void ocfs2_et_update_clusters(struct ocfs2_extent_tree *et,
					    u32 clusters)
{
	et->et_ops->eo_update_clusters(et, clusters);
}

static inline void ocfs2_et_extent_map_insert(struct ocfs2_extent_tree *et,
					      struct ocfs2_extent_rec *rec)
{
	if (et->et_ops->eo_extent_map_insert)
		et->et_ops->eo_extent_map_insert(et, rec);
}

static inline void ocfs2_et_extent_map_truncate(struct ocfs2_extent_tree *et,
						u32 clusters)
{
	if (et->et_ops->eo_extent_map_truncate)
		et->et_ops->eo_extent_map_truncate(et, clusters);
}

static inline int ocfs2_et_root_journal_access(handle_t *handle,
					       struct ocfs2_extent_tree *et,
					       int type)
{
	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
					  type);
}

static inline int ocfs2_et_insert_check(struct ocfs2_extent_tree *et,
					struct ocfs2_extent_rec *rec)
{
	int ret = 0;

	if (et->et_ops->eo_insert_check)
		ret = et->et_ops->eo_insert_check(et, rec);
	return ret;
}

static inline int ocfs2_et_sanity_check(struct ocfs2_extent_tree *et)
{
	int ret = 0;

	if (et->et_ops->eo_sanity_check)
		ret = et->et_ops->eo_sanity_check(et);
	return ret;
}

static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
					 struct ocfs2_extent_block *eb);

/*
 * Structures which describe a path through a btree, and functions to
 * manipulate them.
 *
 * The idea here is to be as generic as possible with the tree
 * manipulation code.
 */
struct ocfs2_path_item {
	struct buffer_head		*bh;
	struct ocfs2_extent_list	*el;
};

#define OCFS2_MAX_PATH_DEPTH	5

struct ocfs2_path {
	int				p_tree_depth;
	ocfs2_journal_access_func	p_root_access;
	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
};

#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_root_access(_path) ((_path)->p_root_access)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)

static int ocfs2_find_path(struct ocfs2_caching_info *ci,
			   struct ocfs2_path *path, u32 cpos);
static void ocfs2_adjust_rightmost_records(handle_t *handle,
					   struct ocfs2_extent_tree *et,
					   struct ocfs2_path *path,
					   struct ocfs2_extent_rec *insert_rec);
/*
 * Reset the actual path elements so that we can re-use the structure
 * to build another path. Generally, this involves freeing the buffer
 * heads.
 */
static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
{
	int i, start = 0, depth = 0;
	struct ocfs2_path_item *node;

	if (keep_root)
		start = 1;

	for(i = start; i < path_num_items(path); i++) {
		node = &path->p_node[i];

		brelse(node->bh);
		node->bh = NULL;
		node->el = NULL;
	}

	/*
	 * Tree depth may change during truncate, or insert. If we're
	 * keeping the root extent list, then make sure that our path
	 * structure reflects the proper depth.
	 */
	if (keep_root)
		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
	else
		path_root_access(path) = NULL;

	path->p_tree_depth = depth;
}

static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (path) {
		ocfs2_reinit_path(path, 0);
		kfree(path);
	}
}

/*
 * Copy all the elements of src into dest. After this call, src could
 * be freed without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
	int i;

	BUG_ON(path_root_bh(dest) != path_root_bh(src));
	BUG_ON(path_root_el(dest) != path_root_el(src));
	BUG_ON(path_root_access(dest) != path_root_access(src));

	ocfs2_reinit_path(dest, 1);

	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
		dest->p_node[i].bh = src->p_node[i].bh;
		dest->p_node[i].el = src->p_node[i].el;

		if (dest->p_node[i].bh)
			get_bh(dest->p_node[i].bh);
	}
}

/*
 * Make the *dest path the same as src and re-initialize src path to
 * have a root only.
 */
static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
{
	int i;

	BUG_ON(path_root_bh(dest) != path_root_bh(src));
	BUG_ON(path_root_access(dest) != path_root_access(src));

	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
		brelse(dest->p_node[i].bh);

		dest->p_node[i].bh = src->p_node[i].bh;
		dest->p_node[i].el = src->p_node[i].el;

		src->p_node[i].bh = NULL;
		src->p_node[i].el = NULL;
	}
}

/*
 * Insert an extent block at given index.
 *
 * This will not take an additional reference on eb_bh.
 */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
					struct buffer_head *eb_bh)
{
	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

	/*
	 * Right now, no root bh is an extent block, so this helps
	 * catch code errors with dinode trees. The assertion can be
	 * safely removed if we ever need to insert extent block
	 * structures at the root.
	 */
	BUG_ON(index == 0);

	path->p_node[index].bh = eb_bh;
	path->p_node[index].el = &eb->h_list;
}

static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
					 struct ocfs2_extent_list *root_el,
					 ocfs2_journal_access_func access)
{
	struct ocfs2_path *path;

	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);

	path = kzalloc(sizeof(*path), GFP_NOFS);
	if (path) {
		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
		get_bh(root_bh);
		path_root_bh(path) = root_bh;
		path_root_el(path) = root_el;
		path_root_access(path) = access;
	}

	return path;
}

static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	return ocfs2_new_path(path_root_bh(path), path_root_el(path),
			      path_root_access(path));
}

static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
{
	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
			      et->et_root_journal_access);
}

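/*
 * For reference, a typical path lifecycle is sketched below (cpos, el
 * and ret are illustrative locals); this mirrors what
 * ocfs2_adjust_rightmost_branch() does later in this file:
 *
 *	path = ocfs2_new_path_from_et(et);
 *	if (!path)
 *		return -ENOMEM;
 *
 *	ret = ocfs2_find_path(et->et_ci, path, cpos);
 *	if (ret == 0) {
 *		el = path_leaf_el(path);
 *		... read or modify the leaf records ...
 *	}
 *
 *	ocfs2_free_path(path);
 */
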
/*
 * Journal the buffer at depth idx.  All idx>0 are extent blocks;
 * idx == 0 uses the root_access function.
 *
 * I don't like the way this function's name looks next to
 * ocfs2_journal_access_path(), but I don't have a better one.
 */
static int ocfs2_path_bh_journal_access(handle_t *handle,
					struct ocfs2_caching_info *ci,
					struct ocfs2_path *path,
					int idx)
{
	ocfs2_journal_access_func access = path_root_access(path);

	if (!access)
		access = ocfs2_journal_access;

	if (idx)
		access = ocfs2_journal_access_eb;

	return access(handle, ci, path->p_node[idx].bh,
		      OCFS2_JOURNAL_ACCESS_WRITE);
}

/*
 * Convenience function to journal all components in a path.
 */
static int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
				     handle_t *handle,
				     struct ocfs2_path *path)
{
	int i, ret = 0;

	if (!path)
		goto out;

	for(i = 0; i < path_num_items(path); i++) {
		ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	return ret;
}

/*
 * Return the index of the extent record which contains cluster #v_cluster.
 * -1 is returned if it was not found.
 *
 * Should work fine on interior and exterior nodes.
 */
int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
{
	int ret = -1;
	int i;
	struct ocfs2_extent_rec *rec;
	u32 rec_end, rec_start, clusters;

	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
		rec = &el->l_recs[i];

		rec_start = le32_to_cpu(rec->e_cpos);
		clusters = ocfs2_rec_clusters(el, rec);

		rec_end = rec_start + clusters;

		if (v_cluster >= rec_start && v_cluster < rec_end) {
			ret = i;
			break;
		}
	}

	return ret;
}

enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,
	CONTIG_RIGHT,
	CONTIG_LEFTRIGHT,
};


/*
 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
 * ocfs2_extent_contig only work properly against leaf nodes!
 */
static int ocfs2_block_extent_contig(struct super_block *sb,
				     struct ocfs2_extent_rec *ext,
				     u64 blkno)
{
	u64 blk_end = le64_to_cpu(ext->e_blkno);

	blk_end += ocfs2_clusters_to_blocks(sb,
				    le16_to_cpu(ext->e_leaf_clusters));

	return blkno == blk_end;
}

static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
				  struct ocfs2_extent_rec *right)
{
	u32 left_range;

	left_range = le32_to_cpu(left->e_cpos) +
		le16_to_cpu(left->e_leaf_clusters);

	return (left_range == le32_to_cpu(right->e_cpos));
}

static enum ocfs2_contig_type
	ocfs2_extent_contig(struct super_block *sb,
			    struct ocfs2_extent_rec *ext,
			    struct ocfs2_extent_rec *insert_rec)
{
	u64 blkno = le64_to_cpu(insert_rec->e_blkno);

	/*
	 * Refuse to coalesce extent records with different flag
	 * fields - we don't want to mix unwritten extents with user
	 * data.
	 */
	if (ext->e_flags != insert_rec->e_flags)
		return CONTIG_NONE;

	if (ocfs2_extents_adjacent(ext, insert_rec) &&
	    ocfs2_block_extent_contig(sb, ext, blkno))
		return CONTIG_RIGHT;

	blkno = le64_to_cpu(ext->e_blkno);
	if (ocfs2_extents_adjacent(insert_rec, ext) &&
	    ocfs2_block_extent_contig(sb, insert_rec, blkno))
		return CONTIG_LEFT;

	return CONTIG_NONE;
}

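/*
 * A worked (hypothetical) example of the checks above: with 4k
 * clusters and 512-byte blocks (8 blocks per cluster), an existing
 * record {e_cpos = 10, e_leaf_clusters = 2, e_blkno = 800} and an
 * insert record {e_cpos = 12, e_leaf_clusters = 1, e_blkno = 816}
 * with identical flags are logically adjacent (10 + 2 == 12) and
 * physically contiguous (800 + 2 * 8 == 816), so ocfs2_extent_contig()
 * returns CONTIG_RIGHT and the two can be merged into one record.
 */
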
/*
 * NOTE: We can have pretty much any combination of contiguousness and
 * appending.
 *
 * The usefulness of APPEND_TAIL is more in that it lets us know that
 * we'll have to update the path to that leaf.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};

enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};

struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;
	enum ocfs2_append_type	ins_appending;
	enum ocfs2_contig_type	ins_contig;
	int			ins_contig_index;
	int			ins_tree_depth;
};

struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;
	int			c_has_empty_extent;
	int			c_split_covers_rec;
};

static int ocfs2_validate_extent_block(struct super_block *sb,
				       struct buffer_head *bh)
{
	int rc;
	struct ocfs2_extent_block *eb =
		(struct ocfs2_extent_block *)bh->b_data;

	mlog(0, "Validating extent block %llu\n",
	     (unsigned long long)bh->b_blocknr);

	BUG_ON(!buffer_uptodate(bh));

	/*
	 * If the ecc fails, we return the error but otherwise
	 * leave the filesystem running.  We know any error is
	 * local to this block.
	 */
	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
	if (rc) {
		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
		     (unsigned long long)bh->b_blocknr);
		return rc;
	}

	/*
	 * Errors after here are fatal.
	 */

	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		ocfs2_error(sb,
			    "Extent block #%llu has bad signature %.*s",
			    (unsigned long long)bh->b_blocknr, 7,
			    eb->h_signature);
		return -EINVAL;
	}

	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
		ocfs2_error(sb,
			    "Extent block #%llu has an invalid h_blkno "
			    "of %llu",
			    (unsigned long long)bh->b_blocknr,
			    (unsigned long long)le64_to_cpu(eb->h_blkno));
		return -EINVAL;
	}

	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
		ocfs2_error(sb,
			    "Extent block #%llu has an invalid "
			    "h_fs_generation of #%u",
			    (unsigned long long)bh->b_blocknr,
			    le32_to_cpu(eb->h_fs_generation));
		return -EINVAL;
	}

	return 0;
}

int ocfs2_read_extent_block(struct ocfs2_caching_info *ci, u64 eb_blkno,
			    struct buffer_head **bh)
{
	int rc;
	struct buffer_head *tmp = *bh;

	rc = ocfs2_read_block(ci, eb_blkno, &tmp,
			      ocfs2_validate_extent_block);

	/* If ocfs2_read_block() got us a new bh, pass it up. */
	if (!rc && !*bh)
		*bh = tmp;

	return rc;
}


/*
 * How many free extents have we got before we need more metadata?
 */
int ocfs2_num_free_extents(struct ocfs2_super *osb,
			   struct ocfs2_extent_tree *et)
{
	int retval;
	struct ocfs2_extent_list *el = NULL;
	struct ocfs2_extent_block *eb;
	struct buffer_head *eb_bh = NULL;
	u64 last_eb_blk = 0;

	mlog_entry_void();

	el = et->et_root_el;
	last_eb_blk = ocfs2_et_get_last_eb_blk(et);

	if (last_eb_blk) {
		retval = ocfs2_read_extent_block(et->et_ci, last_eb_blk,
						 &eb_bh);
		if (retval < 0) {
			mlog_errno(retval);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	}

	BUG_ON(el->l_tree_depth != 0);

	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
bail:
	brelse(eb_bh);

	mlog_exit(retval);
	return retval;
}

/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 */
static int ocfs2_create_new_meta_bhs(handle_t *handle,
				     struct ocfs2_extent_tree *et,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_super *osb =
		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = count; i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(et->et_ci, bhs[i]);

			status = ocfs2_journal_access_eb(handle, et->et_ci,
							 bhs[i],
							 OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}

/*
 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
 *
 * Returns the sum of the rightmost extent rec logical offset and
 * cluster count.
 *
 * ocfs2_add_branch() uses this to determine what logical cluster
 * value should be populated into the leftmost new branch records.
 *
 * ocfs2_shift_tree_depth() uses this to determine the # clusters
 * value for the new topmost tree record.
 */
static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el)
{
	int i;

	i = le16_to_cpu(el->l_next_free_rec) - 1;

	return le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
}

/*
 * Change range of the branches in the rightmost path according to the leaf
 * extent block's rightmost record.
 */
static int ocfs2_adjust_rightmost_branch(handle_t *handle,
					 struct ocfs2_extent_tree *et)
{
	int status;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	path = ocfs2_new_path_from_et(et);
	if (!path) {
		status = -ENOMEM;
		return status;
	}

	status = ocfs2_find_path(et->et_ci, path, UINT_MAX);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_extend_trans(handle, path_num_items(path) +
				    handle->h_buffer_credits);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	status = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto out;
	}

	el = path_leaf_el(path);
	rec = &el->l_recs[le16_to_cpu(el->l_next_free_rec) - 1];

	ocfs2_adjust_rightmost_records(handle, et, path, rec);

out:
	ocfs2_free_path(path);
	return status;
}

/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the root
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * The new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 */
static int ocfs2_add_branch(handle_t *handle,
			    struct ocfs2_extent_tree *et,
			    struct buffer_head *eb_bh,
			    struct buffer_head **last_eb_bh,
			    struct ocfs2_alloc_context *meta_ac)
{
	int status, new_blocks, i;
	u64 next_blkno, new_last_eb_blk;
	struct buffer_head *bh;
	struct buffer_head **new_eb_bhs = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *eb_el;
	struct ocfs2_extent_list  *el;
	u32 new_cpos, root_end;

	mlog_entry_void();

	BUG_ON(!last_eb_bh || !*last_eb_bh);

	if (eb_bh) {
		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
		el = &eb->h_list;
	} else
		el = et->et_root_el;

	/* we never add a branch to a leaf. */
	BUG_ON(!el->l_tree_depth);

	new_blocks = le16_to_cpu(el->l_tree_depth);

	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);

	/*
	 * If there is a gap between the root end and the real end
	 * of the rightmost leaf block, we need to remove the gap
	 * between new_cpos and root_end first so that the tree
	 * is consistent after we add a new branch (it will start
	 * at new_cpos).
	 */
	if (root_end > new_cpos) {
		mlog(0, "adjust the cluster end from %u to %u\n",
		     root_end, new_cpos);
		status = ocfs2_adjust_rightmost_branch(handle, et);
		if (status) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* allocate the number of new eb blocks we need */
	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
			     GFP_KERNEL);
	if (!new_eb_bhs) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_create_new_meta_bhs(handle, et, new_blocks,
					   meta_ac, new_eb_bhs);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
	 * linked with the rest of the tree.
	 * conversely, new_eb_bhs[0] is the new bottommost leaf.
	 *
	 * when we leave the loop, new_last_eb_blk will point to the
	 * newest leaf, and next_blkno will point to the topmost extent
	 * block. */
	next_blkno = new_last_eb_blk = 0;
	for(i = 0; i < new_blocks; i++) {
		bh = new_eb_bhs[i];
		eb = (struct ocfs2_extent_block *) bh->b_data;
		/* ocfs2_create_new_meta_bhs() should create it right! */
		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
		eb_el = &eb->h_list;

		status = ocfs2_journal_access_eb(handle, et->et_ci, bh,
						 OCFS2_JOURNAL_ACCESS_CREATE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb->h_next_leaf_blk = 0;
		eb_el->l_tree_depth = cpu_to_le16(i);
		eb_el->l_next_free_rec = cpu_to_le16(1);
		/*
		 * This actually counts as an empty extent as
		 * the cluster count is 0.
		 */
		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
		/*
		 * eb_el isn't always an interior node, but even leaf
		 * nodes want a zero'd flags and reserved field so
		 * this gets the whole 32 bits regardless of use.
		 */
		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
		if (!eb_el->l_tree_depth)
			new_last_eb_blk = le64_to_cpu(eb->h_blkno);

		status = ocfs2_journal_dirty(handle, bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		next_blkno = le64_to_cpu(eb->h_blkno);
	}

	/* This is a bit hairy. We want to update up to three blocks
	 * here without leaving any of them in an inconsistent state
	 * in case of error. We don't have to worry about
	 * journal_dirty erroring as it won't unless we've aborted the
	 * handle (in which case we would never be here) so reserving
	 * the write with journal_access is all we need to do. */
	status = ocfs2_journal_access_eb(handle, et->et_ci, *last_eb_bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}
	if (eb_bh) {
		status = ocfs2_journal_access_eb(handle, et->et_ci, eb_bh,
						 OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Link the new branch into the rest of the tree (el will
	 * either be on the root_bh, or the extent block passed in). */
	i = le16_to_cpu(el->l_next_free_rec);
	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
	el->l_recs[i].e_int_clusters = 0;
	le16_add_cpu(&el->l_next_free_rec, 1);

	/* fe needs a new last extent block pointer, as does the
	 * next_leaf on the previously last-extent-block. */
	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);

	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);

	status = ocfs2_journal_dirty(handle, *last_eb_bh);
	if (status < 0)
		mlog_errno(status);
	status = ocfs2_journal_dirty(handle, et->et_root_bh);
	if (status < 0)
		mlog_errno(status);
	if (eb_bh) {
		status = ocfs2_journal_dirty(handle, eb_bh);
		if (status < 0)
			mlog_errno(status);
	}

	/*
	 * Some callers want to track the rightmost leaf so pass it
	 * back here.
	 */
	brelse(*last_eb_bh);
	get_bh(new_eb_bhs[0]);
	*last_eb_bh = new_eb_bhs[0];

	status = 0;
bail:
	if (new_eb_bhs) {
		for (i = 0; i < new_blocks; i++)
			brelse(new_eb_bhs[i]);
		kfree(new_eb_bhs);
	}

	mlog_exit(status);
	return status;
}

/*
 * Adds another level to the allocation tree.  Returns the new extent
 * block so you can add a branch to it after this call.
 */
static int ocfs2_shift_tree_depth(handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *root_el;
	struct ocfs2_extent_list  *eb_el;

	mlog_entry_void();

	status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	/* ocfs2_create_new_meta_bhs() should create it right! */
	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

	eb_el = &eb->h_list;
	root_el = et->et_root_el;

	status = ocfs2_journal_access_eb(handle, et->et_ci, new_eb_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the root extent list data into the new extent block */
	eb_el->l_tree_depth = root_el->l_tree_depth;
	eb_el->l_next_free_rec = root_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = root_el->l_recs[i];

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_et_root_journal_access(handle, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update root_bh now */
	le16_add_cpu(&root_el->l_tree_depth, 1);
	root_el->l_recs[0].e_cpos = 0;
	root_el->l_recs[0].e_blkno = eb->h_blkno;
	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	root_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (root_el->l_tree_depth == cpu_to_le16(1))
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

	status = ocfs2_journal_dirty(handle, et->et_root_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}

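/*
 * Pictorially, the depth shift above copies the root's records into a
 * freshly allocated extent block and leaves the root with a single
 * record pointing at it (a hypothetical two-record root shown):
 *
 *	before:	root (depth 1): [A][B]
 *
 *	after:	root (depth 2): [X]
 *		eb X (depth 1): [A][B]
 *
 * If the tree was previously at depth 0, X is also the new rightmost
 * leaf and so becomes last_eb_blk.
 */
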
/*
 * Should only be called when there is no space left in any of the
 * leaf nodes. What we want to do is find the lowest tree depth
 * non-leaf extent block with room for new records. There are three
 * valid results of this search:
 *
 * 1) a lowest extent block is found, then we pass it back in
 *    *lowest_eb_bh and return '0'
 *
 * 2) the search fails to find anything, but the root_el has room. We
 *    pass NULL back in *lowest_eb_bh, but still return '0'
 *
 * 3) the search fails to find anything AND the root_el is full, in
 *    which case we return > 0
 *
 * return status < 0 indicates an error.
 */
static int ocfs2_find_branch_target(struct ocfs2_extent_tree *et,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	mlog_entry_void();

	*target_bh = NULL;

	el = et->et_root_el;

	while(le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu has empty "
				    "extent list (next_free_rec == 0)",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
			status = -EIO;
			goto bail;
		}
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
				    "Owner %llu has extent "
				    "list where extent # %d has no physical "
				    "block start",
				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), i);
			status = -EIO;
			goto bail;
		}

		brelse(bh);
		bh = NULL;

		status = ocfs2_read_extent_block(et->et_ci, blkno, &bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;

		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	el = et->et_root_el;
	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	brelse(bh);

	mlog_exit(status);
	return status;
}

/*
 * Grow a b-tree so that it has more records.
 *
 * We might shift the tree depth in which case existing paths should
 * be considered invalid.
 *
 * Tree depth after the grow is returned via *final_depth.
 *
 * *last_eb_bh will be updated by ocfs2_add_branch().
 */
static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et,
			   int *final_depth, struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_extent_list *el = et->et_root_el;
	int depth = le16_to_cpu(el->l_tree_depth);
	struct buffer_head *bh = NULL;

	BUG_ON(meta_ac == NULL);

	shift = ocfs2_find_branch_target(et, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		mlog(0, "need to shift tree depth (current = %d)\n", depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(handle, et, meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf. At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "add branch. bh = %p\n", bh);
	ret = ocfs2_add_branch(handle, et, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out:
	if (final_depth)
		*final_depth = depth;
	brelse(bh);
	return ret;
}

/*
 * Shift each record in the extent list one slot to the right, opening
 * up a slot at the head of the list.  The caller must ensure there is
 * room - we BUG_ON() rather than push the rightmost record off the end.
 */
static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);
	int count = le16_to_cpu(el->l_count);
	unsigned int num_bytes;

	BUG_ON(!next_free);
	/* This will cause us to go off the end of our extent list. */
	BUG_ON(next_free >= count);

	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;

	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
}

static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement below to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;
}

static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
{
	int size, num_recs = le16_to_cpu(el->l_next_free_rec);

	BUG_ON(num_recs == 0);

	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		num_recs--;
		size = num_recs * sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[0], &el->l_recs[1], size);
		memset(&el->l_recs[num_recs], 0,
		       sizeof(struct ocfs2_extent_rec));
		el->l_next_free_rec = cpu_to_le16(num_recs);
	}
}

/*
 * Create an empty extent record.
 *
 * l_next_free_rec may be updated.
 *
 * If an empty extent already exists, do nothing.
 */
static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
{
	int next_free = le16_to_cpu(el->l_next_free_rec);

	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (next_free == 0)
		goto set_and_inc;

	if (ocfs2_is_empty_extent(&el->l_recs[0]))
		return;

	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
			"Asked to create an empty extent in a full list:\n"
			"count = %u, tree depth = %u",
			le16_to_cpu(el->l_count),
			le16_to_cpu(el->l_tree_depth));

	ocfs2_shift_records_right(el);

set_and_inc:
	le16_add_cpu(&el->l_next_free_rec, 1);
	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
}

/*
 * For a rotation which involves two leaf nodes, the "root node" is
 * the lowest level tree node which contains a path to both leaves. The
 * resulting set of information can be used to form a complete "subtree".
 *
 * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
 * index contains the subtree root - this can be the root index itself
 * in a worst-case rotation.
 *
 * The array index of the subtree root is passed back.
 */
static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et,
				   struct ocfs2_path *left,
				   struct ocfs2_path *right)
{
	int i = 0;

	/*
	 * Check that the caller passed in two paths from the same tree.
	 */
	BUG_ON(path_root_bh(left) != path_root_bh(right));

	do {
		i++;

		/*
		 * The caller didn't pass two adjacent paths.
		 */
		mlog_bug_on_msg(i > left->p_tree_depth,
				"Owner %llu, left depth %u, right depth %u\n"
				"left leaf blk %llu, right leaf blk %llu\n",
				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
				left->p_tree_depth, right->p_tree_depth,
				(unsigned long long)path_leaf_bh(left)->b_blocknr,
				(unsigned long long)path_leaf_bh(right)->b_blocknr);
	} while (left->p_node[i].bh->b_blocknr ==
		 right->p_node[i].bh->b_blocknr);

	return i - 1;
}

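/*
 * A small (hypothetical) example of the above: given two adjacent
 * paths
 *
 *	left:	root -> X -> A		right:	root -> X -> B
 *
 * the paths first diverge at array index 2 (leaves A and B), so the
 * subtree root is X and the returned index is 1.  In the worst case
 * the paths diverge immediately below the root and 0 is returned.
 */
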
1748 typedef void (path_insert_t)(void *, struct buffer_head *);
1749 
1750 /*
1751  * Traverse a btree path in search of cpos, starting at root_el.
1752  *
1753  * This code can be called with a cpos larger than the tree, in which
1754  * case it will return the rightmost path.
1755  */
1756 static int __ocfs2_find_path(struct ocfs2_caching_info *ci,
1757 			     struct ocfs2_extent_list *root_el, u32 cpos,
1758 			     path_insert_t *func, void *data)
1759 {
1760 	int i, ret = 0;
1761 	u32 range;
1762 	u64 blkno;
1763 	struct buffer_head *bh = NULL;
1764 	struct ocfs2_extent_block *eb;
1765 	struct ocfs2_extent_list *el;
1766 	struct ocfs2_extent_rec *rec;
1767 
1768 	el = root_el;
1769 	while (el->l_tree_depth) {
1770 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1771 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1772 				    "Owner %llu has empty extent list at "
1773 				    "depth %u\n",
1774 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1775 				    le16_to_cpu(el->l_tree_depth));
1776 			ret = -EROFS;
1777 			goto out;
1778 
1779 		}
1780 
1781 		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1782 			rec = &el->l_recs[i];
1783 
1784 			/*
1785 			 * In the case that cpos is off the allocation
1786 			 * tree, this should just wind up returning the
1787 			 * rightmost record.
1788 			 */
1789 			range = le32_to_cpu(rec->e_cpos) +
1790 				ocfs2_rec_clusters(el, rec);
1791 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1792 			    break;
1793 		}
1794 
1795 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1796 		if (blkno == 0) {
1797 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1798 				    "Owner %llu has bad blkno in extent list "
1799 				    "at depth %u (index %d)\n",
1800 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1801 				    le16_to_cpu(el->l_tree_depth), i);
1802 			ret = -EROFS;
1803 			goto out;
1804 		}
1805 
1806 		brelse(bh);
1807 		bh = NULL;
1808 		ret = ocfs2_read_extent_block(ci, blkno, &bh);
1809 		if (ret) {
1810 			mlog_errno(ret);
1811 			goto out;
1812 		}
1813 
1814 		eb = (struct ocfs2_extent_block *) bh->b_data;
1815 		el = &eb->h_list;
1816 
1817 		if (le16_to_cpu(el->l_next_free_rec) >
1818 		    le16_to_cpu(el->l_count)) {
1819 			ocfs2_error(ocfs2_metadata_cache_get_super(ci),
1820 				    "Owner %llu has bad count in extent list "
1821 				    "at block %llu (next free=%u, count=%u)\n",
1822 				    (unsigned long long)ocfs2_metadata_cache_owner(ci),
1823 				    (unsigned long long)bh->b_blocknr,
1824 				    le16_to_cpu(el->l_next_free_rec),
1825 				    le16_to_cpu(el->l_count));
1826 			ret = -EROFS;
1827 			goto out;
1828 		}
1829 
1830 		if (func)
1831 			func(data, bh);
1832 	}
1833 
1834 out:
1835 	/*
1836 	 * Catch any trailing bh that the loop didn't handle.
1837 	 */
1838 	brelse(bh);
1839 
1840 	return ret;
1841 }
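
/*
 * Example of the descent above: at an interior node holding records
 * { (e_cpos 0, 100 clusters), (e_cpos 100, 50), (e_cpos 150, 50) },
 * a search for cpos 120 breaks out at index 1 since 100 <= 120 < 150.
 * A search for cpos 500 never passes the range test, so the loop runs
 * off the end and leaves i at the last record - which is how an
 * oversized cpos winds up returning the rightmost path.
 */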
1842 
1843 /*
1844  * Given an initialized path (that is, it has a valid root extent
1845  * list), this function will traverse the btree in search of the path
1846  * which would contain cpos.
1847  *
1848  * The path traveled is recorded in the path structure.
1849  *
1850  * Note that this will not do any comparisons on leaf node extent
1851  * records, so it will work fine in the case that we just added a tree
1852  * branch.
1853  */
1854 struct find_path_data {
1855 	int index;
1856 	struct ocfs2_path *path;
1857 };
1858 static void find_path_ins(void *data, struct buffer_head *bh)
1859 {
1860 	struct find_path_data *fp = data;
1861 
1862 	get_bh(bh);
1863 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1864 	fp->index++;
1865 }
1866 static int ocfs2_find_path(struct ocfs2_caching_info *ci,
1867 			   struct ocfs2_path *path, u32 cpos)
1868 {
1869 	struct find_path_data data;
1870 
1871 	data.index = 1;
1872 	data.path = path;
1873 	return __ocfs2_find_path(ci, path_root_el(path), cpos,
1874 				 find_path_ins, &data);
1875 }
1876 
1877 static void find_leaf_ins(void *data, struct buffer_head *bh)
1878 {
1879 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)bh->b_data;
1880 	struct ocfs2_extent_list *el = &eb->h_list;
1881 	struct buffer_head **ret = data;
1882 
1883 	/* We want to retain only the leaf block. */
1884 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1885 		get_bh(bh);
1886 		*ret = bh;
1887 	}
1888 }
1889 /*
1890  * Find the leaf block in the tree which would contain cpos. No
1891  * checking of the actual leaf is done.
1892  *
1893  * Some paths want to call this instead of allocating a path structure
1894  * and calling ocfs2_find_path().
1895  *
1896  * This function doesn't handle non-btree extent lists.
1897  */
1898 int ocfs2_find_leaf(struct ocfs2_caching_info *ci,
1899 		    struct ocfs2_extent_list *root_el, u32 cpos,
1900 		    struct buffer_head **leaf_bh)
1901 {
1902 	int ret;
1903 	struct buffer_head *bh = NULL;
1904 
1905 	ret = __ocfs2_find_path(ci, root_el, cpos, find_leaf_ins, &bh);
1906 	if (ret) {
1907 		mlog_errno(ret);
1908 		goto out;
1909 	}
1910 
1911 	*leaf_bh = bh;
1912 out:
1913 	return ret;
1914 }
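
/*
 * Typical use of ocfs2_find_leaf() (a sketch only - the surrounding
 * identifiers are illustrative):
 *
 *	struct buffer_head *leaf_bh = NULL;
 *
 *	ret = ocfs2_find_leaf(ci, root_el, cpos, &leaf_bh);
 *	if (!ret) {
 *		eb = (struct ocfs2_extent_block *)leaf_bh->b_data;
 *		... examine eb->h_list ...
 *		brelse(leaf_bh);
 *	}
 *
 * find_leaf_ins() took an extra reference via get_bh(), so the caller
 * owns the returned buffer_head and must brelse() it.
 */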
1915 
1916 /*
1917  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1918  *
1919  * Basically, we've moved stuff around at the bottom of the tree and
1920  * we need to fix up the extent records above the changes to reflect
1921  * the new changes.
1922  *
1923  * left_rec: the record on the left.
1924  * left_child_el: is the child list pointed to by left_rec
1925  * right_rec: the record to the right of left_rec
1926  * right_child_el: is the child list pointed to by right_rec
1927  *
1928  * By definition, this only works on interior nodes.
1929  */
1930 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1931 				  struct ocfs2_extent_list *left_child_el,
1932 				  struct ocfs2_extent_rec *right_rec,
1933 				  struct ocfs2_extent_list *right_child_el)
1934 {
1935 	u32 left_clusters, right_end;
1936 
1937 	/*
1938 	 * Interior nodes never have holes. Their cpos is the cpos of
1939 	 * the leftmost record in their child list. Their cluster
1940 	 * count covers the full theoretical range of their child list
1941 	 * - the range between their cpos and the cpos of the record
1942 	 * immediately to their right.
1943 	 */
1944 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1945 	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1946 		BUG_ON(right_child_el->l_tree_depth);
1947 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1948 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1949 	}
1950 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
1951 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1952 
1953 	/*
1954 	 * Calculate the rightmost cluster count boundary before
1955 	 * moving cpos - we will need to adjust clusters after
1956 	 * updating e_cpos to keep the same highest cluster count.
1957 	 */
1958 	right_end = le32_to_cpu(right_rec->e_cpos);
1959 	right_end += le32_to_cpu(right_rec->e_int_clusters);
1960 
1961 	right_rec->e_cpos = left_rec->e_cpos;
1962 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
1963 
1964 	right_end -= le32_to_cpu(right_rec->e_cpos);
1965 	right_rec->e_int_clusters = cpu_to_le32(right_end);
1966 }
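
/*
 * Worked example: say left_rec begins at cpos 0 and the first record
 * of the child list to its right now begins at cpos 100 (e.g. after a
 * record was rotated over from the left), while right_rec used to read
 * (e_cpos 110, e_int_clusters 40) - a right edge of 150.  The code
 * above sets left_rec->e_int_clusters to 100 - 0 = 100, moves
 * right_rec->e_cpos down to 100, and recomputes e_int_clusters as
 * 150 - 100 = 50, keeping the right edge at 150.
 */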
1967 
1968 /*
1969  * Adjust the adjacent root node records involved in a
1970  * rotation. left_el_blkno is passed in as a key so that we can easily
1971  * find it's index in the root list.
1972  */
1973 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1974 				      struct ocfs2_extent_list *left_el,
1975 				      struct ocfs2_extent_list *right_el,
1976 				      u64 left_el_blkno)
1977 {
1978 	int i;
1979 
1980 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1981 	       le16_to_cpu(left_el->l_tree_depth));
1982 
1983 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1984 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1985 			break;
1986 	}
1987 
1988 	/*
1989 	 * The path walking code should have never returned a root and
1990 	 * two paths which are not adjacent.
1991 	 */
1992 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1993 
1994 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1995 				      &root_el->l_recs[i + 1], right_el);
1996 }
1997 
1998 /*
1999  * We've changed a leaf block (in right_path) and need to reflect that
2000  * change back up the subtree.
2001  *
2002  * This happens in multiple places:
2003  *   - When we've moved an extent record from the left path leaf to the right
2004  *     path leaf to make room for an empty extent in the left path leaf.
2005  *   - When our insert into the right path leaf is at the leftmost edge
2006  *     and requires an update of the path immediately to its left. This
2007  *     can occur at the end of some types of rotation and appending inserts.
2008  *   - When we've adjusted the last extent record in the left path leaf and the
2009  *     1st extent record in the right path leaf during cross extent block merge.
2010  */
2011 static void ocfs2_complete_edge_insert(handle_t *handle,
2012 				       struct ocfs2_path *left_path,
2013 				       struct ocfs2_path *right_path,
2014 				       int subtree_index)
2015 {
2016 	int ret, i, idx;
2017 	struct ocfs2_extent_list *el, *left_el, *right_el;
2018 	struct ocfs2_extent_rec *left_rec, *right_rec;
2019 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2020 
2021 	/*
2022 	 * Update the counts and position values within all the
2023 	 * interior nodes to reflect the leaf rotation we just did.
2024 	 *
2025 	 * The root node is handled below the loop.
2026 	 *
2027 	 * We begin the loop with right_el and left_el pointing to the
2028 	 * leaf lists and work our way up.
2029 	 *
2030 	 * NOTE: within this loop, left_el and right_el always refer
2031 	 * to the *child* lists.
2032 	 */
2033 	left_el = path_leaf_el(left_path);
2034 	right_el = path_leaf_el(right_path);
2035 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2036 		mlog(0, "Adjust records at index %u\n", i);
2037 
2038 		/*
2039 		 * One nice property of knowing that all of these
2040 		 * nodes are below the root is that we only deal with
2041 		 * the leftmost right node record and the rightmost
2042 		 * left node record.
2043 		 */
2044 		el = left_path->p_node[i].el;
2045 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2046 		left_rec = &el->l_recs[idx];
2047 
2048 		el = right_path->p_node[i].el;
2049 		right_rec = &el->l_recs[0];
2050 
2051 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2052 					      right_el);
2053 
2054 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2055 		if (ret)
2056 			mlog_errno(ret);
2057 
2058 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2059 		if (ret)
2060 			mlog_errno(ret);
2061 
2062 		/*
2063 		 * Setup our list pointers now so that the current
2064 		 * parents become children in the next iteration.
2065 		 */
2066 		left_el = left_path->p_node[i].el;
2067 		right_el = right_path->p_node[i].el;
2068 	}
2069 
2070 	/*
2071 	 * At the root node, adjust the two adjacent records which
2072 	 * begin our path to the leaves.
2073 	 */
2074 
2075 	el = left_path->p_node[subtree_index].el;
2076 	left_el = left_path->p_node[subtree_index + 1].el;
2077 	right_el = right_path->p_node[subtree_index + 1].el;
2078 
2079 	ocfs2_adjust_root_records(el, left_el, right_el,
2080 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2081 
2082 	root_bh = left_path->p_node[subtree_index].bh;
2083 
2084 	ret = ocfs2_journal_dirty(handle, root_bh);
2085 	if (ret)
2086 		mlog_errno(ret);
2087 }
2088 
2089 static int ocfs2_rotate_subtree_right(handle_t *handle,
2090 				      struct ocfs2_extent_tree *et,
2091 				      struct ocfs2_path *left_path,
2092 				      struct ocfs2_path *right_path,
2093 				      int subtree_index)
2094 {
2095 	int ret, i;
2096 	struct buffer_head *right_leaf_bh;
2097 	struct buffer_head *left_leaf_bh = NULL;
2098 	struct buffer_head *root_bh;
2099 	struct ocfs2_extent_list *right_el, *left_el;
2100 	struct ocfs2_extent_rec move_rec;
2101 
2102 	left_leaf_bh = path_leaf_bh(left_path);
2103 	left_el = path_leaf_el(left_path);
2104 
2105 	if (left_el->l_next_free_rec != left_el->l_count) {
2106 		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
2107 			    "Inode %llu has non-full interior leaf node %llu "
2108 			    "(next free = %u)",
2109 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2110 			    (unsigned long long)left_leaf_bh->b_blocknr,
2111 			    le16_to_cpu(left_el->l_next_free_rec));
2112 		return -EROFS;
2113 	}
2114 
2115 	/*
2116 	 * This extent block may already have an empty record, so we
2117 	 * return early if so.
2118 	 */
2119 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2120 		return 0;
2121 
2122 	root_bh = left_path->p_node[subtree_index].bh;
2123 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2124 
2125 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2126 					   subtree_index);
2127 	if (ret) {
2128 		mlog_errno(ret);
2129 		goto out;
2130 	}
2131 
2132 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2133 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2134 						   right_path, i);
2135 		if (ret) {
2136 			mlog_errno(ret);
2137 			goto out;
2138 		}
2139 
2140 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2141 						   left_path, i);
2142 		if (ret) {
2143 			mlog_errno(ret);
2144 			goto out;
2145 		}
2146 	}
2147 
2148 	right_leaf_bh = path_leaf_bh(right_path);
2149 	right_el = path_leaf_el(right_path);
2150 
2151 	/* This is a code error, not disk corruption. */
2152 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2153 			"because rightmost leaf block %llu is empty\n",
2154 			(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2155 			(unsigned long long)right_leaf_bh->b_blocknr);
2156 
2157 	ocfs2_create_empty_extent(right_el);
2158 
2159 	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
2160 	if (ret) {
2161 		mlog_errno(ret);
2162 		goto out;
2163 	}
2164 
2165 	/* Do the copy now. */
2166 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2167 	move_rec = left_el->l_recs[i];
2168 	right_el->l_recs[0] = move_rec;
2169 
2170 	/*
2171 	 * Clear out the record we just copied and shift everything
2172 	 * over, leaving an empty extent in the left leaf.
2173 	 *
2174 	 * We temporarily subtract from next_free_rec so that the
2175 	 * shift will lose the tail record (which is now defunct).
2176 	 */
2177 	le16_add_cpu(&left_el->l_next_free_rec, -1);
2178 	ocfs2_shift_records_right(left_el);
2179 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2180 	le16_add_cpu(&left_el->l_next_free_rec, 1);
2181 
2182 	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
2183 	if (ret) {
2184 		mlog_errno(ret);
2185 		goto out;
2186 	}
2187 
2188 	ocfs2_complete_edge_insert(handle, left_path, right_path,
2189 				   subtree_index);
2190 
2191 out:
2192 	return ret;
2193 }
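
/*
 * Leaf-level sketch of the rotation above: given a full left leaf
 * { A, B, C } and a right leaf { D, E }, ocfs2_create_empty_extent()
 * opens slot 0 in the right leaf, the copy fills it with C, and the
 * shift in the left leaf yields { (empty), A, B } / { C, D, E }.
 */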
2194 
2195 /*
2196  * Given a full path, determine what cpos value would return us a path
2197  * containing the leaf immediately to the left of the current one.
2198  *
2199  * Will return zero if the path passed in is already the leftmost path.
2200  */
2201 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2202 					 struct ocfs2_path *path, u32 *cpos)
2203 {
2204 	int i, j, ret = 0;
2205 	u64 blkno;
2206 	struct ocfs2_extent_list *el;
2207 
2208 	BUG_ON(path->p_tree_depth == 0);
2209 
2210 	*cpos = 0;
2211 
2212 	blkno = path_leaf_bh(path)->b_blocknr;
2213 
2214 	/* Start at the tree node just above the leaf and work our way up. */
2215 	i = path->p_tree_depth - 1;
2216 	while (i >= 0) {
2217 		el = path->p_node[i].el;
2218 
2219 		/*
2220 		 * Find the extent record just before the one in our
2221 		 * path.
2222 		 */
2223 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2224 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2225 				if (j == 0) {
2226 					if (i == 0) {
2227 						/*
2228 						 * We've determined that the
2229 						 * path specified is already
2230 						 * the leftmost one - return a
2231 						 * cpos of zero.
2232 						 */
2233 						goto out;
2234 					}
2235 					/*
2236 					 * The leftmost record points to our
2237 					 * leaf - we need to travel up the
2238 					 * tree one level.
2239 					 */
2240 					goto next_node;
2241 				}
2242 
2243 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2244 				*cpos = *cpos + ocfs2_rec_clusters(el,
2245 							   &el->l_recs[j - 1]);
2246 				*cpos = *cpos - 1;
2247 				goto out;
2248 			}
2249 		}
2250 
2251 		/*
2252 		 * If we got here, we never found a valid node where
2253 		 * the tree indicated one should be.
2254 		 */
2255 		ocfs2_error(sb,
2256 			    "Invalid extent tree at extent block %llu\n",
2257 			    (unsigned long long)blkno);
2258 		ret = -EROFS;
2259 		goto out;
2260 
2261 next_node:
2262 		blkno = path->p_node[i].bh->b_blocknr;
2263 		i--;
2264 	}
2265 
2266 out:
2267 	return ret;
2268 }
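
/*
 * Example: if the record just before ours in some ancestor node reads
 * (e_cpos 100, 50 clusters), this returns *cpos = 100 + 50 - 1 = 149,
 * the last cluster covered by the left neighbor.  Any cpos inside
 * that record would do - it only has to land a subsequent search in
 * the adjacent leaf.
 */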
2269 
2270 /*
2271  * Extend the transaction by enough credits to complete the rotation,
2272  * and still leave at least the original number of credits allocated
2273  * to this transaction.
2274  */
2275 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2276 					   int op_credits,
2277 					   struct ocfs2_path *path)
2278 {
2279 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2280 
2281 	if (handle->h_buffer_credits < credits)
2282 		return ocfs2_extend_trans(handle, credits);
2283 
2284 	return 0;
2285 }
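
/*
 * Credit math, by example: a rotation in a tree of depth 4 with the
 * subtree root at depth 1 touches (4 - 1) * 2 = 6 blocks along the
 * two paths plus the subtree root itself - hence the "* 2 + 1" above,
 * with op_credits preserved on top.
 */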
2286 
2287 /*
2288  * Trap the case where we're inserting into the theoretical range past
2289  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2290  * whose cpos is less than ours into the right leaf.
2291  *
2292  * It's only necessary to look at the rightmost record of the left
2293  * leaf because the logic that calls us should ensure that the
2294  * theoretical ranges in the path components above the leaves are
2295  * correct.
2296  */
2297 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2298 						 u32 insert_cpos)
2299 {
2300 	struct ocfs2_extent_list *left_el;
2301 	struct ocfs2_extent_rec *rec;
2302 	int next_free;
2303 
2304 	left_el = path_leaf_el(left_path);
2305 	next_free = le16_to_cpu(left_el->l_next_free_rec);
2306 	rec = &left_el->l_recs[next_free - 1];
2307 
2308 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
2309 		return 1;
2310 	return 0;
2311 }
2312 
2313 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2314 {
2315 	int next_free = le16_to_cpu(el->l_next_free_rec);
2316 	unsigned int range;
2317 	struct ocfs2_extent_rec *rec;
2318 
2319 	if (next_free == 0)
2320 		return 0;
2321 
2322 	rec = &el->l_recs[0];
2323 	if (ocfs2_is_empty_extent(rec)) {
2324 		/* Empty list. */
2325 		if (next_free == 1)
2326 			return 0;
2327 		rec = &el->l_recs[1];
2328 	}
2329 
2330 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2331 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2332 		return 1;
2333 	return 0;
2334 }
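
/*
 * Example: for a leaf { (empty), (e_cpos 100, 20 clusters), ... } the
 * check above skips the empty slot and returns 1 for any cpos in
 * [100, 120), and 0 otherwise (including the degenerate lists handled
 * up front).
 */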
2335 
2336 /*
2337  * Rotate all the records in a btree right one record, starting at insert_cpos.
2338  *
2339  * The path to the rightmost leaf should be passed in.
2340  *
2341  * The array is assumed to be large enough to hold an entire path (tree depth).
2342  *
2343  * Upon successful return from this function:
2344  *
2345  * - The 'right_path' array will contain a path to the leaf block
2346  *   whose range contains e_cpos.
2347  * - That leaf block will have a single empty extent in list index 0.
2348  * - In the case that the rotation requires a post-insert update,
2349  *   *ret_left_path will contain a valid path which can be passed to
2350  *   ocfs2_insert_path().
2351  */
2352 static int ocfs2_rotate_tree_right(handle_t *handle,
2353 				   struct ocfs2_extent_tree *et,
2354 				   enum ocfs2_split_type split,
2355 				   u32 insert_cpos,
2356 				   struct ocfs2_path *right_path,
2357 				   struct ocfs2_path **ret_left_path)
2358 {
2359 	int ret, start, orig_credits = handle->h_buffer_credits;
2360 	u32 cpos;
2361 	struct ocfs2_path *left_path = NULL;
2362 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2363 
2364 	*ret_left_path = NULL;
2365 
2366 	left_path = ocfs2_new_path_from_path(right_path);
2367 	if (!left_path) {
2368 		ret = -ENOMEM;
2369 		mlog_errno(ret);
2370 		goto out;
2371 	}
2372 
2373 	ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2374 	if (ret) {
2375 		mlog_errno(ret);
2376 		goto out;
2377 	}
2378 
2379 	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2380 
2381 	/*
2382 	 * What we want to do here is:
2383 	 *
2384 	 * 1) Start with the rightmost path.
2385 	 *
2386 	 * 2) Determine a path to the leaf block directly to the left
2387 	 *    of that leaf.
2388 	 *
2389 	 * 3) Determine the 'subtree root' - the lowest level tree node
2390 	 *    which contains a path to both leaves.
2391 	 *
2392 	 * 4) Rotate the subtree.
2393 	 *
2394 	 * 5) Find the next subtree by considering the left path to be
2395 	 *    the new right path.
2396 	 *
2397 	 * The check at the top of this while loop also accepts
2398 	 * insert_cpos == cpos because cpos is only a _theoretical_
2399 	 * value to get us the left path - insert_cpos might very well
2400 	 * be filling that hole.
2401 	 *
2402 	 * Stop at a cpos of '0' because we either started at the
2403 	 * leftmost branch (i.e., a tree with one branch and a
2404 	 * rotation inside of it), or we've gone as far as we can in
2405 	 * rotating subtrees.
2406 	 */
2407 	while (cpos && insert_cpos <= cpos) {
2408 		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2409 		     insert_cpos, cpos);
2410 
2411 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
2412 		if (ret) {
2413 			mlog_errno(ret);
2414 			goto out;
2415 		}
2416 
2417 		mlog_bug_on_msg(path_leaf_bh(left_path) ==
2418 				path_leaf_bh(right_path),
2419 				"Owner %llu: error during insert of %u "
2420 				"(left path cpos %u) results in two identical "
2421 				"paths ending at %llu\n",
2422 				(unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2423 				insert_cpos, cpos,
2424 				(unsigned long long)
2425 				path_leaf_bh(left_path)->b_blocknr);
2426 
2427 		if (split == SPLIT_NONE &&
2428 		    ocfs2_rotate_requires_path_adjustment(left_path,
2429 							  insert_cpos)) {
2430 
2431 			/*
2432 			 * We've rotated the tree as much as we
2433 			 * should. The rest is up to
2434 			 * ocfs2_insert_path() to complete, after the
2435 			 * record insertion. We indicate this
2436 			 * situation by returning the left path.
2437 			 *
2438 			 * The reason we don't adjust the records here
2439 			 * before the record insert is that an error
2440 			 * later might break the rule where a parent
2441 			 * record e_cpos will reflect the actual
2442 			 * e_cpos of the 1st nonempty record of the
2443 			 * child list.
2444 			 */
2445 			*ret_left_path = left_path;
2446 			goto out_ret_path;
2447 		}
2448 
2449 		start = ocfs2_find_subtree_root(et, left_path, right_path);
2450 
2451 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2452 		     start,
2453 		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2454 		     right_path->p_tree_depth);
2455 
2456 		ret = ocfs2_extend_rotate_transaction(handle, start,
2457 						      orig_credits, right_path);
2458 		if (ret) {
2459 			mlog_errno(ret);
2460 			goto out;
2461 		}
2462 
2463 		ret = ocfs2_rotate_subtree_right(handle, et, left_path,
2464 						 right_path, start);
2465 		if (ret) {
2466 			mlog_errno(ret);
2467 			goto out;
2468 		}
2469 
2470 		if (split != SPLIT_NONE &&
2471 		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2472 						insert_cpos)) {
2473 			/*
2474 			 * A rotate moves the rightmost left leaf
2475 			 * record over to the leftmost right leaf
2476 			 * slot. If we're doing an extent split
2477 			 * instead of a real insert, then we have to
2478 			 * check that the extent to be split wasn't
2479 			 * just moved over. If it was, then we can
2480 			 * exit here, passing left_path back -
2481 			 * ocfs2_split_extent() is smart enough to
2482 			 * search both leaves.
2483 			 */
2484 			*ret_left_path = left_path;
2485 			goto out_ret_path;
2486 		}
2487 
2488 		/*
2489 		 * There is no need to re-read the next right path
2490 		 * as we know that it'll be our current left
2491 		 * path. Optimize by copying values instead.
2492 		 */
2493 		ocfs2_mv_path(right_path, left_path);
2494 
2495 		ret = ocfs2_find_cpos_for_left_leaf(sb, right_path, &cpos);
2496 		if (ret) {
2497 			mlog_errno(ret);
2498 			goto out;
2499 		}
2500 	}
2501 
2502 out:
2503 	ocfs2_free_path(left_path);
2504 
2505 out_ret_path:
2506 	return ret;
2507 }
2508 
2509 static int ocfs2_update_edge_lengths(handle_t *handle,
2510 				     struct ocfs2_extent_tree *et,
2511 				     int subtree_index, struct ocfs2_path *path)
2512 {
2513 	int i, idx, ret;
2514 	struct ocfs2_extent_rec *rec;
2515 	struct ocfs2_extent_list *el;
2516 	struct ocfs2_extent_block *eb;
2517 	u32 range;
2518 
2519 	/*
2520 	 * In the normal tree rotation process, we will never touch the
2521 	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2522 	 * doesn't reserve the credits for them either.
2523 	 *
2524 	 * But we do have a special case here which will update the rightmost
2525 	 * records for all the bh in the path.
2526 	 * So we have to allocate extra credits and access them.
2527 	 */
2528 	ret = ocfs2_extend_trans(handle,
2529 				 handle->h_buffer_credits + subtree_index);
2530 	if (ret) {
2531 		mlog_errno(ret);
2532 		goto out;
2533 	}
2534 
2535 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
2536 	if (ret) {
2537 		mlog_errno(ret);
2538 		goto out;
2539 	}
2540 
2541 	/* Path should always be rightmost. */
2542 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2543 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2544 
2545 	el = &eb->h_list;
2546 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2547 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2548 	rec = &el->l_recs[idx];
2549 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2550 
2551 	for (i = 0; i < path->p_tree_depth; i++) {
2552 		el = path->p_node[i].el;
2553 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2554 		rec = &el->l_recs[idx];
2555 
2556 		rec->e_int_clusters = cpu_to_le32(range);
2557 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2558 
2559 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2560 	}
2561 out:
2562 	return ret;
2563 }
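
/*
 * Example: if the rightmost leaf's last record ends at cpos 500, the
 * loop above rewrites the last record of every interior node on the
 * path so that e_cpos + e_int_clusters == 500, restoring the invariant
 * that each interior record spans the full theoretical range of its
 * child list.
 */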
2564 
2565 static void ocfs2_unlink_path(handle_t *handle,
2566 			      struct ocfs2_extent_tree *et,
2567 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
2568 			      struct ocfs2_path *path, int unlink_start)
2569 {
2570 	int ret, i;
2571 	struct ocfs2_extent_block *eb;
2572 	struct ocfs2_extent_list *el;
2573 	struct buffer_head *bh;
2574 
2575 	for(i = unlink_start; i < path_num_items(path); i++) {
2576 		bh = path->p_node[i].bh;
2577 
2578 		eb = (struct ocfs2_extent_block *)bh->b_data;
2579 		/*
2580 		 * Not all nodes might have had their final count
2581 		 * decremented by the caller - handle this here.
2582 		 */
2583 		el = &eb->h_list;
2584 		if (le16_to_cpu(el->l_next_free_rec) > 1) {
2585 			mlog(ML_ERROR,
2586 			     "Inode %llu, attempted to remove extent block "
2587 			     "%llu with %u records\n",
2588 			     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
2589 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
2590 			     le16_to_cpu(el->l_next_free_rec));
2591 
2592 			ocfs2_journal_dirty(handle, bh);
2593 			ocfs2_remove_from_cache(et->et_ci, bh);
2594 			continue;
2595 		}
2596 
2597 		el->l_next_free_rec = 0;
2598 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2599 
2600 		ocfs2_journal_dirty(handle, bh);
2601 
2602 		ret = ocfs2_cache_extent_block_free(dealloc, eb);
2603 		if (ret)
2604 			mlog_errno(ret);
2605 
2606 		ocfs2_remove_from_cache(et->et_ci, bh);
2607 	}
2608 }
2609 
2610 static void ocfs2_unlink_subtree(handle_t *handle,
2611 				 struct ocfs2_extent_tree *et,
2612 				 struct ocfs2_path *left_path,
2613 				 struct ocfs2_path *right_path,
2614 				 int subtree_index,
2615 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2616 {
2617 	int i;
2618 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2619 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2620 	struct ocfs2_extent_list *el;
2621 	struct ocfs2_extent_block *eb;
2622 
2623 	el = path_leaf_el(left_path);
2624 
2625 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2626 
2627 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2628 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2629 			break;
2630 
2631 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2632 
2633 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2634 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2635 
2636 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2637 	eb->h_next_leaf_blk = 0;
2638 
2639 	ocfs2_journal_dirty(handle, root_bh);
2640 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2641 
2642 	ocfs2_unlink_path(handle, et, dealloc, right_path,
2643 			  subtree_index + 1);
2644 }
2645 
2646 static int ocfs2_rotate_subtree_left(handle_t *handle,
2647 				     struct ocfs2_extent_tree *et,
2648 				     struct ocfs2_path *left_path,
2649 				     struct ocfs2_path *right_path,
2650 				     int subtree_index,
2651 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
2652 				     int *deleted)
2653 {
2654 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
2655 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2656 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2657 	struct ocfs2_extent_block *eb;
2658 
2659 	*deleted = 0;
2660 
2661 	right_leaf_el = path_leaf_el(right_path);
2662 	left_leaf_el = path_leaf_el(left_path);
2663 	root_bh = left_path->p_node[subtree_index].bh;
2664 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2665 
2666 	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2667 		return 0;
2668 
2669 	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2670 	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2671 		/*
2672 		 * It's legal for us to proceed if the right leaf is
2673 		 * the rightmost one and it has an empty extent. There
2674 		 * are two cases to handle - whether the leaf will be
2675 		 * empty after removal or not. If the leaf isn't empty
2676 		 * then just remove the empty extent up front. The
2677 		 * next block will handle empty leaves by flagging
2678 		 * them for unlink.
2679 		 *
2680 		 * Non-rightmost leaves will return -EAGAIN and the
2681 		 * caller can manually move the subtree and retry.
2682 		 */
2683 
2684 		if (eb->h_next_leaf_blk != 0ULL)
2685 			return -EAGAIN;
2686 
2687 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2688 			ret = ocfs2_journal_access_eb(handle, et->et_ci,
2689 						      path_leaf_bh(right_path),
2690 						      OCFS2_JOURNAL_ACCESS_WRITE);
2691 			if (ret) {
2692 				mlog_errno(ret);
2693 				goto out;
2694 			}
2695 
2696 			ocfs2_remove_empty_extent(right_leaf_el);
2697 		} else
2698 			right_has_empty = 1;
2699 	}
2700 
2701 	if (eb->h_next_leaf_blk == 0ULL &&
2702 	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2703 		/*
2704 		 * We have to update i_last_eb_blk during the
2705 		 * metadata delete.
2706 		 */
2707 		ret = ocfs2_et_root_journal_access(handle, et,
2708 						   OCFS2_JOURNAL_ACCESS_WRITE);
2709 		if (ret) {
2710 			mlog_errno(ret);
2711 			goto out;
2712 		}
2713 
2714 		del_right_subtree = 1;
2715 	}
2716 
2717 	/*
2718 	 * Getting here with an empty extent in the right path implies
2719 	 * that it's the rightmost path and will be deleted.
2720 	 */
2721 	BUG_ON(right_has_empty && !del_right_subtree);
2722 
2723 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
2724 					   subtree_index);
2725 	if (ret) {
2726 		mlog_errno(ret);
2727 		goto out;
2728 	}
2729 
2730 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2731 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2732 						   right_path, i);
2733 		if (ret) {
2734 			mlog_errno(ret);
2735 			goto out;
2736 		}
2737 
2738 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2739 						   left_path, i);
2740 		if (ret) {
2741 			mlog_errno(ret);
2742 			goto out;
2743 		}
2744 	}
2745 
2746 	if (!right_has_empty) {
2747 		/*
2748 		 * Only do this if we're moving a real
2749 		 * record. Otherwise, the action is delayed until
2750 		 * after removal of the right path in which case we
2751 		 * can do a simple shift to remove the empty extent.
2752 		 */
2753 		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2754 		memset(&right_leaf_el->l_recs[0], 0,
2755 		       sizeof(struct ocfs2_extent_rec));
2756 	}
2757 	if (eb->h_next_leaf_blk == 0ULL) {
2758 		/*
2759 		 * Move recs over to get rid of empty extent, decrease
2760 		 * next_free. This is allowed to remove the last
2761 		 * extent in our leaf (setting l_next_free_rec to
2762 		 * zero) - the delete code below won't care.
2763 		 */
2764 		ocfs2_remove_empty_extent(right_leaf_el);
2765 	}
2766 
2767 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2768 	if (ret)
2769 		mlog_errno(ret);
2770 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2771 	if (ret)
2772 		mlog_errno(ret);
2773 
2774 	if (del_right_subtree) {
2775 		ocfs2_unlink_subtree(handle, et, left_path, right_path,
2776 				     subtree_index, dealloc);
2777 		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
2778 						left_path);
2779 		if (ret) {
2780 			mlog_errno(ret);
2781 			goto out;
2782 		}
2783 
2784 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2785 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2786 
2787 		/*
2788 		 * Removal of the extent in the left leaf was skipped
2789 		 * above so we could delete the right path
2790 		 * first.
2791 		 */
2792 		if (right_has_empty)
2793 			ocfs2_remove_empty_extent(left_leaf_el);
2794 
2795 		ret = ocfs2_journal_dirty(handle, et_root_bh);
2796 		if (ret)
2797 			mlog_errno(ret);
2798 
2799 		*deleted = 1;
2800 	} else
2801 		ocfs2_complete_edge_insert(handle, left_path, right_path,
2802 					   subtree_index);
2803 
2804 out:
2805 	return ret;
2806 }
2807 
2808 /*
2809  * Given a full path, determine what cpos value would return us a path
2810  * containing the leaf immediately to the right of the current one.
2811  *
2812  * Will return zero if the path passed in is already the rightmost path.
2813  *
2814  * This looks similar, but is subtly different to
2815  * ocfs2_find_cpos_for_left_leaf().
2816  */
2817 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2818 					  struct ocfs2_path *path, u32 *cpos)
2819 {
2820 	int i, j, ret = 0;
2821 	u64 blkno;
2822 	struct ocfs2_extent_list *el;
2823 
2824 	*cpos = 0;
2825 
2826 	if (path->p_tree_depth == 0)
2827 		return 0;
2828 
2829 	blkno = path_leaf_bh(path)->b_blocknr;
2830 
2831 	/* Start at the tree node just above the leaf and work our way up. */
2832 	i = path->p_tree_depth - 1;
2833 	while (i >= 0) {
2834 		int next_free;
2835 
2836 		el = path->p_node[i].el;
2837 
2838 		/*
2839 		 * Find the extent record just after the one in our
2840 		 * path.
2841 		 */
2842 		next_free = le16_to_cpu(el->l_next_free_rec);
2843 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2844 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2845 				if (j == (next_free - 1)) {
2846 					if (i == 0) {
2847 						/*
2848 						 * We've determined that the
2849 						 * path specified is already
2850 						 * the rightmost one - return a
2851 						 * cpos of zero.
2852 						 */
2853 						goto out;
2854 					}
2855 					/*
2856 					 * The rightmost record points to our
2857 					 * leaf - we need to travel up the
2858 					 * tree one level.
2859 					 */
2860 					goto next_node;
2861 				}
2862 
2863 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2864 				goto out;
2865 			}
2866 		}
2867 
2868 		/*
2869 		 * If we got here, we never found a valid node where
2870 		 * the tree indicated one should be.
2871 		 */
2872 		ocfs2_error(sb,
2873 			    "Invalid extent tree at extent block %llu\n",
2874 			    (unsigned long long)blkno);
2875 		ret = -EROFS;
2876 		goto out;
2877 
2878 next_node:
2879 		blkno = path->p_node[i].bh->b_blocknr;
2880 		i--;
2881 	}
2882 
2883 out:
2884 	return ret;
2885 }
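
/*
 * Example, and the subtle difference from the left-leaf variant: if
 * the record just after ours reads (e_cpos 150, ...), this returns
 * *cpos = 150 directly - the first cluster of the right neighbor -
 * whereas ocfs2_find_cpos_for_left_leaf() had to compute the last
 * cluster of the previous record.
 */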
2886 
2887 static int ocfs2_rotate_rightmost_leaf_left(handle_t *handle,
2888 					    struct ocfs2_extent_tree *et,
2889 					    struct ocfs2_path *path)
2890 {
2891 	int ret;
2892 	struct buffer_head *bh = path_leaf_bh(path);
2893 	struct ocfs2_extent_list *el = path_leaf_el(path);
2894 
2895 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2896 		return 0;
2897 
2898 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
2899 					   path_num_items(path) - 1);
2900 	if (ret) {
2901 		mlog_errno(ret);
2902 		goto out;
2903 	}
2904 
2905 	ocfs2_remove_empty_extent(el);
2906 
2907 	ret = ocfs2_journal_dirty(handle, bh);
2908 	if (ret)
2909 		mlog_errno(ret);
2910 
2911 out:
2912 	return ret;
2913 }
2914 
2915 static int __ocfs2_rotate_tree_left(handle_t *handle,
2916 				    struct ocfs2_extent_tree *et,
2917 				    int orig_credits,
2918 				    struct ocfs2_path *path,
2919 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
2920 				    struct ocfs2_path **empty_extent_path)
2921 {
2922 	int ret, subtree_root, deleted;
2923 	u32 right_cpos;
2924 	struct ocfs2_path *left_path = NULL;
2925 	struct ocfs2_path *right_path = NULL;
2926 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
2927 
2928 	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2929 
2930 	*empty_extent_path = NULL;
2931 
2932 	ret = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
2933 	if (ret) {
2934 		mlog_errno(ret);
2935 		goto out;
2936 	}
2937 
2938 	left_path = ocfs2_new_path_from_path(path);
2939 	if (!left_path) {
2940 		ret = -ENOMEM;
2941 		mlog_errno(ret);
2942 		goto out;
2943 	}
2944 
2945 	ocfs2_cp_path(left_path, path);
2946 
2947 	right_path = ocfs2_new_path_from_path(path);
2948 	if (!right_path) {
2949 		ret = -ENOMEM;
2950 		mlog_errno(ret);
2951 		goto out;
2952 	}
2953 
2954 	while (right_cpos) {
2955 		ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
2956 		if (ret) {
2957 			mlog_errno(ret);
2958 			goto out;
2959 		}
2960 
2961 		subtree_root = ocfs2_find_subtree_root(et, left_path,
2962 						       right_path);
2963 
2964 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2965 		     subtree_root,
2966 		     (unsigned long long)
2967 		     right_path->p_node[subtree_root].bh->b_blocknr,
2968 		     right_path->p_tree_depth);
2969 
2970 		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2971 						      orig_credits, left_path);
2972 		if (ret) {
2973 			mlog_errno(ret);
2974 			goto out;
2975 		}
2976 
2977 		/*
2978 		 * Caller might still want to make changes to the
2979 		 * tree root, so re-add it to the journal here.
2980 		 */
2981 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
2982 						   left_path, 0);
2983 		if (ret) {
2984 			mlog_errno(ret);
2985 			goto out;
2986 		}
2987 
2988 		ret = ocfs2_rotate_subtree_left(handle, et, left_path,
2989 						right_path, subtree_root,
2990 						dealloc, &deleted);
2991 		if (ret == -EAGAIN) {
2992 			/*
2993 			 * The rotation has to temporarily stop due to
2994 			 * the right subtree having an empty
2995 			 * extent. Pass it back to the caller for a
2996 			 * fixup.
2997 			 */
2998 			*empty_extent_path = right_path;
2999 			right_path = NULL;
3000 			goto out;
3001 		}
3002 		if (ret) {
3003 			mlog_errno(ret);
3004 			goto out;
3005 		}
3006 
3007 		/*
3008 		 * The subtree rotate might have removed records on
3009 		 * the rightmost edge. If so, then rotation is
3010 		 * complete.
3011 		 */
3012 		if (deleted)
3013 			break;
3014 
3015 		ocfs2_mv_path(left_path, right_path);
3016 
3017 		ret = ocfs2_find_cpos_for_right_leaf(sb, left_path,
3018 						     &right_cpos);
3019 		if (ret) {
3020 			mlog_errno(ret);
3021 			goto out;
3022 		}
3023 	}
3024 
3025 out:
3026 	ocfs2_free_path(right_path);
3027 	ocfs2_free_path(left_path);
3028 
3029 	return ret;
3030 }
3031 
3032 static int ocfs2_remove_rightmost_path(handle_t *handle,
3033 				struct ocfs2_extent_tree *et,
3034 				struct ocfs2_path *path,
3035 				struct ocfs2_cached_dealloc_ctxt *dealloc)
3036 {
3037 	int ret, subtree_index;
3038 	u32 cpos;
3039 	struct ocfs2_path *left_path = NULL;
3040 	struct ocfs2_extent_block *eb;
3041 	struct ocfs2_extent_list *el;
3042 
3043 
3044 	ret = ocfs2_et_sanity_check(et);
3045 	if (ret)
3046 		goto out;
3047 	/*
3048 	 * There are two ways we handle this, depending on
3049 	 * whether path is the only existing one.
3050 	 */
3051 	ret = ocfs2_extend_rotate_transaction(handle, 0,
3052 					      handle->h_buffer_credits,
3053 					      path);
3054 	if (ret) {
3055 		mlog_errno(ret);
3056 		goto out;
3057 	}
3058 
3059 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3060 	if (ret) {
3061 		mlog_errno(ret);
3062 		goto out;
3063 	}
3064 
3065 	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3066 					    path, &cpos);
3067 	if (ret) {
3068 		mlog_errno(ret);
3069 		goto out;
3070 	}
3071 
3072 	if (cpos) {
3073 		/*
3074 		 * We have a path to the left of this one - it needs
3075 		 * an update too.
3076 		 */
3077 		left_path = ocfs2_new_path_from_path(path);
3078 		if (!left_path) {
3079 			ret = -ENOMEM;
3080 			mlog_errno(ret);
3081 			goto out;
3082 		}
3083 
3084 		ret = ocfs2_find_path(et->et_ci, left_path, cpos);
3085 		if (ret) {
3086 			mlog_errno(ret);
3087 			goto out;
3088 		}
3089 
3090 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3091 		if (ret) {
3092 			mlog_errno(ret);
3093 			goto out;
3094 		}
3095 
3096 		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
3097 
3098 		ocfs2_unlink_subtree(handle, et, left_path, path,
3099 				     subtree_index, dealloc);
3100 		ret = ocfs2_update_edge_lengths(handle, et, subtree_index,
3101 						left_path);
3102 		if (ret) {
3103 			mlog_errno(ret);
3104 			goto out;
3105 		}
3106 
3107 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3108 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3109 	} else {
3110 		/*
3111 		 * 'path' is also the leftmost path which
3112 		 * means it must be the only one. This gets
3113 		 * handled differently because we want to
3114 		 * revert the root back to having extents
3115 		 * in-line.
3116 		 */
3117 		ocfs2_unlink_path(handle, et, dealloc, path, 1);
3118 
3119 		el = et->et_root_el;
3120 		el->l_tree_depth = 0;
3121 		el->l_next_free_rec = 0;
3122 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3123 
3124 		ocfs2_et_set_last_eb_blk(et, 0);
3125 	}
3126 
3127 	ocfs2_journal_dirty(handle, path_root_bh(path));
3128 
3129 out:
3130 	ocfs2_free_path(left_path);
3131 	return ret;
3132 }
3133 
3134 /*
3135  * Left rotation of btree records.
3136  *
3137  * In many ways, this is (unsurprisingly) the opposite of right
3138  * rotation. We start at some non-rightmost path containing an empty
3139  * extent in the leaf block. The code works its way to the rightmost
3140  * path by rotating records to the left in every subtree.
3141  *
3142  * This is used by any code which reduces the number of extent records
3143  * in a leaf. After removal, an empty record should be placed in the
3144  * leftmost list position.
3145  *
3146  * This won't handle a length update of the rightmost path records if
3147  * the rightmost tree leaf record is removed so the caller is
3148  * responsible for detecting and correcting that.
3149  */
3150 static int ocfs2_rotate_tree_left(handle_t *handle,
3151 				  struct ocfs2_extent_tree *et,
3152 				  struct ocfs2_path *path,
3153 				  struct ocfs2_cached_dealloc_ctxt *dealloc)
3154 {
3155 	int ret, orig_credits = handle->h_buffer_credits;
3156 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3157 	struct ocfs2_extent_block *eb;
3158 	struct ocfs2_extent_list *el;
3159 
3160 	el = path_leaf_el(path);
3161 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3162 		return 0;
3163 
3164 	if (path->p_tree_depth == 0) {
3165 rightmost_no_delete:
3166 		/*
3167 		 * Inline extents. This is trivially handled, so do
3168 		 * it up front.
3169 		 */
3170 		ret = ocfs2_rotate_rightmost_leaf_left(handle, et, path);
3171 		if (ret)
3172 			mlog_errno(ret);
3173 		goto out;
3174 	}
3175 
3176 	/*
3177 	 * Handle the rightmost branch now. There are several cases:
3178 	 *  1) simple rotation leaving records in there. That's trivial.
3179 	 *  2) rotation requiring a branch delete - there's no more
3180 	 *     records left. Two cases of this:
3181 	 *     a) There are branches to the left.
3182 	 *     b) This is also the leftmost (the only) branch.
3183 	 *
3184 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3185 	 *  2a) we need the left branch so that we can update it with the unlink
3186 	 *  2b) we need to bring the root back to inline extents.
3187 	 */
3188 
3189 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3190 	el = &eb->h_list;
3191 	if (eb->h_next_leaf_blk == 0) {
3192 		/*
3193 		 * This gets a bit tricky if we're going to delete the
3194 		 * rightmost path. Get the other cases out of the way
3195 		 * first.
3196 		 */
3197 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3198 			goto rightmost_no_delete;
3199 
3200 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3201 			ret = -EIO;
3202 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3203 				    "Owner %llu has empty extent block at %llu",
3204 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
3205 				    (unsigned long long)le64_to_cpu(eb->h_blkno));
3206 			goto out;
3207 		}
3208 
3209 		/*
3210 		 * XXX: The caller cannot trust "path" any more after
3211 		 * this as it will have been deleted. What do we do?
3212 		 *
3213 		 * In theory the rotate-for-merge code will never get
3214 		 * here because it'll always ask for a rotate in a
3215 		 * nonempty list.
3216 		 */
3217 
3218 		ret = ocfs2_remove_rightmost_path(handle, et, path,
3219 						  dealloc);
3220 		if (ret)
3221 			mlog_errno(ret);
3222 		goto out;
3223 	}
3224 
3225 	/*
3226 	 * Now we can loop, remembering the path we get from -EAGAIN
3227 	 * and restarting from there.
3228 	 */
3229 try_rotate:
3230 	ret = __ocfs2_rotate_tree_left(handle, et, orig_credits, path,
3231 				       dealloc, &restart_path);
3232 	if (ret && ret != -EAGAIN) {
3233 		mlog_errno(ret);
3234 		goto out;
3235 	}
3236 
3237 	while (ret == -EAGAIN) {
3238 		tmp_path = restart_path;
3239 		restart_path = NULL;
3240 
3241 		ret = __ocfs2_rotate_tree_left(handle, et, orig_credits,
3242 					       tmp_path, dealloc,
3243 					       &restart_path);
3244 		if (ret && ret != -EAGAIN) {
3245 			mlog_errno(ret);
3246 			goto out;
3247 		}
3248 
3249 		ocfs2_free_path(tmp_path);
3250 		tmp_path = NULL;
3251 
3252 		if (ret == 0)
3253 			goto try_rotate;
3254 	}
3255 
3256 out:
3257 	ocfs2_free_path(tmp_path);
3258 	ocfs2_free_path(restart_path);
3259 	return ret;
3260 }
3261 
3262 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3263 				int index)
3264 {
3265 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
3266 	unsigned int size;
3267 
3268 	if (rec->e_leaf_clusters == 0) {
3269 		/*
3270 		 * We consumed all of the merged-from record. An empty
3271 		 * extent cannot exist anywhere but the 1st array
3272 		 * position, so move things over if the merged-from
3273 		 * record doesn't occupy that position.
3274 		 *
3275 		 * This creates a new empty extent so the caller
3276 		 * should be smart enough to have removed any existing
3277 		 * ones.
3278 		 */
3279 		if (index > 0) {
3280 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3281 			size = index * sizeof(struct ocfs2_extent_rec);
3282 			memmove(&el->l_recs[1], &el->l_recs[0], size);
3283 		}
3284 
3285 		/*
3286 		 * Always memset - the caller doesn't check whether it
3287 		 * created an empty extent, so there could be junk in
3288 		 * the other fields.
3289 		 */
3290 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3291 	}
3292 }
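
/*
 * Example: if the record at index 2 was fully consumed by a merge,
 * the memmove above slides recs[0] and recs[1] up into slots 1 and 2
 * (overwriting the consumed record), and the memset turns slot 0 into
 * the single empty extent a leaf is allowed to carry.
 */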
3293 
3294 static int ocfs2_get_right_path(struct ocfs2_extent_tree *et,
3295 				struct ocfs2_path *left_path,
3296 				struct ocfs2_path **ret_right_path)
3297 {
3298 	int ret;
3299 	u32 right_cpos;
3300 	struct ocfs2_path *right_path = NULL;
3301 	struct ocfs2_extent_list *left_el;
3302 
3303 	*ret_right_path = NULL;
3304 
3305 	/* This function shouldn't be called for non-trees. */
3306 	BUG_ON(left_path->p_tree_depth == 0);
3307 
3308 	left_el = path_leaf_el(left_path);
3309 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3310 
3311 	ret = ocfs2_find_cpos_for_right_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3312 					     left_path, &right_cpos);
3313 	if (ret) {
3314 		mlog_errno(ret);
3315 		goto out;
3316 	}
3317 
3318 	/* This function shouldn't be called for the rightmost leaf. */
3319 	BUG_ON(right_cpos == 0);
3320 
3321 	right_path = ocfs2_new_path_from_path(left_path);
3322 	if (!right_path) {
3323 		ret = -ENOMEM;
3324 		mlog_errno(ret);
3325 		goto out;
3326 	}
3327 
3328 	ret = ocfs2_find_path(et->et_ci, right_path, right_cpos);
3329 	if (ret) {
3330 		mlog_errno(ret);
3331 		goto out;
3332 	}
3333 
3334 	*ret_right_path = right_path;
3335 out:
3336 	if (ret)
3337 		ocfs2_free_path(right_path);
3338 	return ret;
3339 }
3340 
3341 /*
3342  * Remove split_rec clusters from the record at index and merge them
3343  * onto the beginning of the record "next" to it.
3344  * For index < l_count - 1, the "next" means the extent rec at index + 1.
3345  * For index == l_count - 1, the "next" means the 1st extent rec of the
3346  * next extent block.
3347  */
3348 static int ocfs2_merge_rec_right(struct ocfs2_path *left_path,
3349 				 handle_t *handle,
3350 				 struct ocfs2_extent_tree *et,
3351 				 struct ocfs2_extent_rec *split_rec,
3352 				 int index)
3353 {
3354 	int ret, next_free, i;
3355 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3356 	struct ocfs2_extent_rec *left_rec;
3357 	struct ocfs2_extent_rec *right_rec;
3358 	struct ocfs2_extent_list *right_el;
3359 	struct ocfs2_path *right_path = NULL;
3360 	int subtree_index = 0;
3361 	struct ocfs2_extent_list *el = path_leaf_el(left_path);
3362 	struct buffer_head *bh = path_leaf_bh(left_path);
3363 	struct buffer_head *root_bh = NULL;
3364 
3365 	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3366 	left_rec = &el->l_recs[index];
3367 
3368 	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3369 	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3370 		/* We have hit a cross extent block merge. */
3371 		ret = ocfs2_get_right_path(et, left_path, &right_path);
3372 		if (ret) {
3373 			mlog_errno(ret);
3374 			goto out;
3375 		}
3376 
3377 		right_el = path_leaf_el(right_path);
3378 		next_free = le16_to_cpu(right_el->l_next_free_rec);
3379 		BUG_ON(next_free <= 0);
3380 		right_rec = &right_el->l_recs[0];
3381 		if (ocfs2_is_empty_extent(right_rec)) {
3382 			BUG_ON(next_free <= 1);
3383 			right_rec = &right_el->l_recs[1];
3384 		}
3385 
3386 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3387 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3388 		       le32_to_cpu(right_rec->e_cpos));
3389 
3390 		subtree_index = ocfs2_find_subtree_root(et, left_path,
3391 							right_path);
3392 
3393 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3394 						      handle->h_buffer_credits,
3395 						      right_path);
3396 		if (ret) {
3397 			mlog_errno(ret);
3398 			goto out;
3399 		}
3400 
3401 		root_bh = left_path->p_node[subtree_index].bh;
3402 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3403 
3404 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3405 						   subtree_index);
3406 		if (ret) {
3407 			mlog_errno(ret);
3408 			goto out;
3409 		}
3410 
3411 		for (i = subtree_index + 1;
3412 		     i < path_num_items(right_path); i++) {
3413 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3414 							   right_path, i);
3415 			if (ret) {
3416 				mlog_errno(ret);
3417 				goto out;
3418 			}
3419 
3420 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3421 							   left_path, i);
3422 			if (ret) {
3423 				mlog_errno(ret);
3424 				goto out;
3425 			}
3426 		}
3427 
3428 	} else {
3429 		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3430 		right_rec = &el->l_recs[index + 1];
3431 	}
3432 
3433 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, left_path,
3434 					   path_num_items(left_path) - 1);
3435 	if (ret) {
3436 		mlog_errno(ret);
3437 		goto out;
3438 	}
3439 
3440 	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3441 
3442 	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3443 	le64_add_cpu(&right_rec->e_blkno,
3444 		     -ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3445 					       split_clusters));
3446 	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3447 
3448 	ocfs2_cleanup_merge(el, index);
3449 
3450 	ret = ocfs2_journal_dirty(handle, bh);
3451 	if (ret)
3452 		mlog_errno(ret);
3453 
3454 	if (right_path) {
3455 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3456 		if (ret)
3457 			mlog_errno(ret);
3458 
3459 		ocfs2_complete_edge_insert(handle, left_path, right_path,
3460 					   subtree_index);
3461 	}
3462 out:
3463 	if (right_path)
3464 		ocfs2_free_path(right_path);
3465 	return ret;
3466 }
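
/*
 * Arithmetic sketch of the merge above, with split_clusters == 10:
 * left_rec gives up its last 10 clusters (e_leaf_clusters -= 10) and
 * right_rec absorbs them at its front - e_cpos and e_blkno move back
 * by 10 clusters' worth while e_leaf_clusters grows by 10 - leaving
 * the combined range and its block mapping unchanged.
 */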
3467 
3468 static int ocfs2_get_left_path(struct ocfs2_extent_tree *et,
3469 			       struct ocfs2_path *right_path,
3470 			       struct ocfs2_path **ret_left_path)
3471 {
3472 	int ret;
3473 	u32 left_cpos;
3474 	struct ocfs2_path *left_path = NULL;
3475 
3476 	*ret_left_path = NULL;
3477 
3478 	/* This function shouldn't be called for non-trees. */
3479 	BUG_ON(right_path->p_tree_depth == 0);
3480 
3481 	ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3482 					    right_path, &left_cpos);
3483 	if (ret) {
3484 		mlog_errno(ret);
3485 		goto out;
3486 	}
3487 
3488 	/* This function shouldn't be called for the leftmost leaf. */
3489 	BUG_ON(left_cpos == 0);
3490 
3491 	left_path = ocfs2_new_path_from_path(right_path);
3492 	if (!left_path) {
3493 		ret = -ENOMEM;
3494 		mlog_errno(ret);
3495 		goto out;
3496 	}
3497 
3498 	ret = ocfs2_find_path(et->et_ci, left_path, left_cpos);
3499 	if (ret) {
3500 		mlog_errno(ret);
3501 		goto out;
3502 	}
3503 
3504 	*ret_left_path = left_path;
3505 out:
3506 	if (ret)
3507 		ocfs2_free_path(left_path);
3508 	return ret;
3509 }
3510 
3511 /*
3512  * Remove split_rec clusters from the record at index and merge them
3513  * onto the tail of the record "before" it.
3514  * For index > 0, the "before" means the extent rec at index - 1.
3515  *
3516  * For index == 0, the "before" means the last record of the previous
3517  * extent block. And there is also a situation that we may need to
3518  * remove the rightmost leaf extent block in the right_path and change
3519  * the right path to indicate the new rightmost path.
3520  */
3521 static int ocfs2_merge_rec_left(struct ocfs2_path *right_path,
3522 				handle_t *handle,
3523 				struct ocfs2_extent_tree *et,
3524 				struct ocfs2_extent_rec *split_rec,
3525 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3526 				int index)
3527 {
3528 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3529 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3530 	struct ocfs2_extent_rec *left_rec;
3531 	struct ocfs2_extent_rec *right_rec;
3532 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3533 	struct buffer_head *bh = path_leaf_bh(right_path);
3534 	struct buffer_head *root_bh = NULL;
3535 	struct ocfs2_path *left_path = NULL;
3536 	struct ocfs2_extent_list *left_el;
3537 
3538 	BUG_ON(index < 0);
3539 
3540 	right_rec = &el->l_recs[index];
3541 	if (index == 0) {
3542 		/* We have hit a cross extent block merge. */
3543 		ret = ocfs2_get_left_path(et, right_path, &left_path);
3544 		if (ret) {
3545 			mlog_errno(ret);
3546 			goto out;
3547 		}
3548 
3549 		left_el = path_leaf_el(left_path);
3550 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3551 		       le16_to_cpu(left_el->l_count));
3552 
3553 		left_rec = &left_el->l_recs[
3554 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3555 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3556 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3557 		       le32_to_cpu(split_rec->e_cpos));
3558 
3559 		subtree_index = ocfs2_find_subtree_root(et, left_path,
3560 							right_path);
3561 
3562 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3563 						      handle->h_buffer_credits,
3564 						      left_path);
3565 		if (ret) {
3566 			mlog_errno(ret);
3567 			goto out;
3568 		}
3569 
3570 		root_bh = left_path->p_node[subtree_index].bh;
3571 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3572 
3573 		ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3574 						   subtree_index);
3575 		if (ret) {
3576 			mlog_errno(ret);
3577 			goto out;
3578 		}
3579 
3580 		for (i = subtree_index + 1;
3581 		     i < path_num_items(right_path); i++) {
3582 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3583 							   right_path, i);
3584 			if (ret) {
3585 				mlog_errno(ret);
3586 				goto out;
3587 			}
3588 
3589 			ret = ocfs2_path_bh_journal_access(handle, et->et_ci,
3590 							   left_path, i);
3591 			if (ret) {
3592 				mlog_errno(ret);
3593 				goto out;
3594 			}
3595 		}
3596 	} else {
3597 		left_rec = &el->l_recs[index - 1];
3598 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3599 			has_empty_extent = 1;
3600 	}
3601 
3602 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, right_path,
3603 					   path_num_items(right_path) - 1);
3604 	if (ret) {
3605 		mlog_errno(ret);
3606 		goto out;
3607 	}
3608 
3609 	if (has_empty_extent && index == 1) {
3610 		/*
3611 		 * The easy case - we can just plop the record right in.
3612 		 */
3613 		*left_rec = *split_rec;
3614 
3615 		has_empty_extent = 0;
3616 	} else
3617 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3618 
3619 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3620 	le64_add_cpu(&right_rec->e_blkno,
3621 		     ocfs2_clusters_to_blocks(ocfs2_metadata_cache_get_super(et->et_ci),
3622 					      split_clusters));
3623 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3624 
3625 	ocfs2_cleanup_merge(el, index);
3626 
3627 	ret = ocfs2_journal_dirty(handle, bh);
3628 	if (ret)
3629 		mlog_errno(ret);
3630 
3631 	if (left_path) {
3632 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3633 		if (ret)
3634 			mlog_errno(ret);
3635 
3636 		/*
3637 		 * If right_rec is empty and the extent block itself is
3638 		 * also empty, ocfs2_complete_edge_insert() can't handle
3639 		 * it, and we need to delete the right extent block.
3640 		 */
3641 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3642 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3643 
3644 			ret = ocfs2_remove_rightmost_path(handle, et,
3645 							  right_path,
3646 							  dealloc);
3647 			if (ret) {
3648 				mlog_errno(ret);
3649 				goto out;
3650 			}
3651 
3652 			/* Now the rightmost extent block has been deleted.
3653 			 * So we use the new rightmost path.
3654 			 */
3655 			ocfs2_mv_path(right_path, left_path);
3656 			left_path = NULL;
3657 		} else
3658 			ocfs2_complete_edge_insert(handle, left_path,
3659 						   right_path, subtree_index);
3660 	}
3661 out:
3662 	if (left_path)
3663 		ocfs2_free_path(left_path);
3664 	return ret;
3665 }
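
/*
 * A worked example of the merge above, with hypothetical values: given
 * left_rec = (e_cpos 0, 8 clusters), right_rec = (e_cpos 8, 4 clusters)
 * and a 2-cluster split_rec at cpos 8, the adjustment yields
 * left_rec = (0, 10) and right_rec = (10, 2), with right_rec->e_blkno
 * advanced by two clusters' worth of blocks.
 */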
3666 
3667 static int ocfs2_try_to_merge_extent(handle_t *handle,
3668 				     struct ocfs2_extent_tree *et,
3669 				     struct ocfs2_path *path,
3670 				     int split_index,
3671 				     struct ocfs2_extent_rec *split_rec,
3672 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
3673 				     struct ocfs2_merge_ctxt *ctxt)
3674 {
3675 	int ret = 0;
3676 	struct ocfs2_extent_list *el = path_leaf_el(path);
3677 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3678 
3679 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3680 
3681 	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3682 		/*
3683 		 * The merge code will need to create an empty
3684 		 * extent to take the place of the newly
3685 		 * emptied slot. Remove any pre-existing empty
3686 		 * extents - having more than one in a leaf is
3687 		 * illegal.
3688 		 */
3689 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3690 		if (ret) {
3691 			mlog_errno(ret);
3692 			goto out;
3693 		}
3694 		split_index--;
3695 		rec = &el->l_recs[split_index];
3696 	}
3697 
3698 	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3699 		/*
3700 		 * Left-right contig implies this.
3701 		 */
3702 		BUG_ON(!ctxt->c_split_covers_rec);
3703 
3704 		/*
3705 		 * Since the leftright insert always covers the entire
3706 		 * extent, this call will delete the insert record
3707 		 * entirely, resulting in an empty extent record added to
3708 		 * the extent block.
3709 		 *
3710 		 * Since the adding of an empty extent shifts
3711 		 * everything back to the right, there's no need to
3712 		 * update split_index here.
3713 		 *
3714 		 * When the split_index is zero, we need to merge it to the
3715 		 * previous extent block. It is more efficient and easier
3716 		 * if we do merge_right first and merge_left later.
3717 		 */
3718 		ret = ocfs2_merge_rec_right(path, handle, et, split_rec,
3719 					    split_index);
3720 		if (ret) {
3721 			mlog_errno(ret);
3722 			goto out;
3723 		}
3724 
3725 		/*
3726 		 * We can only get this from a logic error above.
3727 		 */
3728 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3729 
3730 		/* The merge left us with an empty extent, remove it. */
3731 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3732 		if (ret) {
3733 			mlog_errno(ret);
3734 			goto out;
3735 		}
3736 
3737 		rec = &el->l_recs[split_index];
3738 
3739 		/*
3740 		 * Note that we don't pass split_rec here on purpose -
3741 		 * we've merged it into the rec already.
3742 		 */
3743 		ret = ocfs2_merge_rec_left(path, handle, et, rec,
3744 					   dealloc, split_index);
3745 
3746 		if (ret) {
3747 			mlog_errno(ret);
3748 			goto out;
3749 		}
3750 
3751 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
3752 		/*
3753 		 * Error from this last rotate is not critical, so
3754 		 * print but don't bubble it up.
3755 		 */
3756 		if (ret)
3757 			mlog_errno(ret);
3758 		ret = 0;
3759 	} else {
3760 		/*
3761 		 * Merge a record to the left or right.
3762 		 *
3763 		 * 'contig_type' is relative to the existing record,
3764 		 * so for example, if we're "right contig", it's to
3765 		 * the record on the left (hence the left merge).
3766 		 */
3767 		if (ctxt->c_contig_type == CONTIG_RIGHT) {
3768 			ret = ocfs2_merge_rec_left(path, handle, et,
3769 						   split_rec, dealloc,
3770 						   split_index);
3771 			if (ret) {
3772 				mlog_errno(ret);
3773 				goto out;
3774 			}
3775 		} else {
3776 			ret = ocfs2_merge_rec_right(path, handle,
3777 						    et, split_rec,
3778 						    split_index);
3779 			if (ret) {
3780 				mlog_errno(ret);
3781 				goto out;
3782 			}
3783 		}
3784 
3785 		if (ctxt->c_split_covers_rec) {
3786 			/*
3787 			 * The merge may have left an empty extent in
3788 			 * our leaf. Try to rotate it away.
3789 			 */
3790 			ret = ocfs2_rotate_tree_left(handle, et, path,
3791 						     dealloc);
3792 			if (ret)
3793 				mlog_errno(ret);
3794 			ret = 0;
3795 		}
3796 	}
3797 
3798 out:
3799 	return ret;
3800 }
3801 
3802 static void ocfs2_subtract_from_rec(struct super_block *sb,
3803 				    enum ocfs2_split_type split,
3804 				    struct ocfs2_extent_rec *rec,
3805 				    struct ocfs2_extent_rec *split_rec)
3806 {
3807 	u64 len_blocks;
3808 
3809 	len_blocks = ocfs2_clusters_to_blocks(sb,
3810 				le16_to_cpu(split_rec->e_leaf_clusters));
3811 
3812 	if (split == SPLIT_LEFT) {
3813 		/*
3814 		 * Region is on the left edge of the existing
3815 		 * record.
3816 		 */
3817 		le32_add_cpu(&rec->e_cpos,
3818 			     le16_to_cpu(split_rec->e_leaf_clusters));
3819 		le64_add_cpu(&rec->e_blkno, len_blocks);
3820 		le16_add_cpu(&rec->e_leaf_clusters,
3821 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3822 	} else {
3823 		/*
3824 		 * Region is on the right edge of the existing
3825 		 * record.
3826 		 */
3827 		le16_add_cpu(&rec->e_leaf_clusters,
3828 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3829 	}
3830 }
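
/*
 * A worked example with hypothetical values: given
 * rec = (e_cpos 100, 50 clusters) and a 10-cluster split_rec,
 * SPLIT_LEFT leaves rec = (e_cpos 110, 40 clusters) with e_blkno
 * advanced by 10 clusters' worth of blocks, while SPLIT_RIGHT simply
 * leaves rec = (e_cpos 100, 40 clusters).
 */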
3831 
3832 /*
3833  * Do the final bits of extent record insertion at the target leaf
3834  * list. If this leaf is part of an allocation tree, it is assumed
3835  * that the tree above has been prepared.
3836  */
3837 static void ocfs2_insert_at_leaf(struct ocfs2_extent_tree *et,
3838 				 struct ocfs2_extent_rec *insert_rec,
3839 				 struct ocfs2_extent_list *el,
3840 				 struct ocfs2_insert_type *insert)
3841 {
3842 	int i = insert->ins_contig_index;
3843 	unsigned int range;
3844 	struct ocfs2_extent_rec *rec;
3845 
3846 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3847 
3848 	if (insert->ins_split != SPLIT_NONE) {
3849 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3850 		BUG_ON(i == -1);
3851 		rec = &el->l_recs[i];
3852 		ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
3853 					insert->ins_split, rec,
3854 					insert_rec);
3855 		goto rotate;
3856 	}
3857 
3858 	/*
3859 	 * Contiguous insert - either left or right.
3860 	 */
3861 	if (insert->ins_contig != CONTIG_NONE) {
3862 		rec = &el->l_recs[i];
3863 		if (insert->ins_contig == CONTIG_LEFT) {
3864 			rec->e_blkno = insert_rec->e_blkno;
3865 			rec->e_cpos = insert_rec->e_cpos;
3866 		}
3867 		le16_add_cpu(&rec->e_leaf_clusters,
3868 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3869 		return;
3870 	}
3871 
3872 	/*
3873 	 * Handle insert into an empty leaf.
3874 	 */
3875 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3876 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3877 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3878 		el->l_recs[0] = *insert_rec;
3879 		el->l_next_free_rec = cpu_to_le16(1);
3880 		return;
3881 	}
3882 
3883 	/*
3884 	 * Appending insert.
3885 	 */
3886 	if (insert->ins_appending == APPEND_TAIL) {
3887 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3888 		rec = &el->l_recs[i];
3889 		range = le32_to_cpu(rec->e_cpos)
3890 			+ le16_to_cpu(rec->e_leaf_clusters);
3891 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3892 
3893 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3894 				le16_to_cpu(el->l_count),
3895 				"owner %llu, depth %u, count %u, next free %u, "
3896 				"rec.cpos %u, rec.clusters %u, "
3897 				"insert.cpos %u, insert.clusters %u\n",
3898 				ocfs2_metadata_cache_owner(et->et_ci),
3899 				le16_to_cpu(el->l_tree_depth),
3900 				le16_to_cpu(el->l_count),
3901 				le16_to_cpu(el->l_next_free_rec),
3902 				le32_to_cpu(el->l_recs[i].e_cpos),
3903 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3904 				le32_to_cpu(insert_rec->e_cpos),
3905 				le16_to_cpu(insert_rec->e_leaf_clusters));
3906 		i++;
3907 		el->l_recs[i] = *insert_rec;
3908 		le16_add_cpu(&el->l_next_free_rec, 1);
3909 		return;
3910 	}
3911 
3912 rotate:
3913 	/*
3914 	 * Ok, we have to rotate.
3915 	 *
3916 	 * At this point, it is safe to assume that inserting into an
3917 	 * empty leaf and appending to a leaf have both been handled
3918 	 * above.
3919 	 *
3920 	 * This leaf needs to have space, either by the empty 1st
3921 	 * extent record, or by virtue of l_next_free_rec < l_count.
3922 	 */
3923 	ocfs2_rotate_leaf(el, insert_rec);
3924 }
3925 
3926 static void ocfs2_adjust_rightmost_records(handle_t *handle,
3927 					   struct ocfs2_extent_tree *et,
3928 					   struct ocfs2_path *path,
3929 					   struct ocfs2_extent_rec *insert_rec)
3930 {
3931 	int ret, i, next_free;
3932 	struct buffer_head *bh;
3933 	struct ocfs2_extent_list *el;
3934 	struct ocfs2_extent_rec *rec;
3935 
3936 	/*
3937 	 * Update everything except the leaf block.
3938 	 */
3939 	for (i = 0; i < path->p_tree_depth; i++) {
3940 		bh = path->p_node[i].bh;
3941 		el = path->p_node[i].el;
3942 
3943 		next_free = le16_to_cpu(el->l_next_free_rec);
3944 		if (next_free == 0) {
3945 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
3946 				    "Owner %llu has a bad extent list",
3947 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
3949 			return;
3950 		}
3951 
3952 		rec = &el->l_recs[next_free - 1];
3953 
3954 		rec->e_int_clusters = insert_rec->e_cpos;
3955 		le32_add_cpu(&rec->e_int_clusters,
3956 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3957 		le32_add_cpu(&rec->e_int_clusters,
3958 			     -le32_to_cpu(rec->e_cpos));
3959 
3960 		ret = ocfs2_journal_dirty(handle, bh);
3961 		if (ret)
3962 			mlog_errno(ret);
3963 
3964 	}
3965 }
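
/*
 * Put differently, the three le32 operations above set each interior
 * record's range to end exactly where insert_rec ends:
 *
 *	e_int_clusters = (insert_rec->e_cpos +
 *			  insert_rec->e_leaf_clusters) - rec->e_cpos;
 */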
3966 
3967 static int ocfs2_append_rec_to_path(handle_t *handle,
3968 				    struct ocfs2_extent_tree *et,
3969 				    struct ocfs2_extent_rec *insert_rec,
3970 				    struct ocfs2_path *right_path,
3971 				    struct ocfs2_path **ret_left_path)
3972 {
3973 	int ret, next_free;
3974 	struct ocfs2_extent_list *el;
3975 	struct ocfs2_path *left_path = NULL;
3976 
3977 	*ret_left_path = NULL;
3978 
3979 	/*
3980 	 * This shouldn't happen for non-trees. The extent rec cluster
3981 	 * count manipulation below only works for interior nodes.
3982 	 */
3983 	BUG_ON(right_path->p_tree_depth == 0);
3984 
3985 	/*
3986 	 * If our appending insert is at the leftmost edge of a leaf,
3987 	 * then we might need to update the rightmost records of the
3988 	 * neighboring path.
3989 	 */
3990 	el = path_leaf_el(right_path);
3991 	next_free = le16_to_cpu(el->l_next_free_rec);
3992 	if (next_free == 0 ||
3993 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3994 		u32 left_cpos;
3995 
3996 		ret = ocfs2_find_cpos_for_left_leaf(ocfs2_metadata_cache_get_super(et->et_ci),
3997 						    right_path, &left_cpos);
3998 		if (ret) {
3999 			mlog_errno(ret);
4000 			goto out;
4001 		}
4002 
4003 		mlog(0, "Append may need a left path update. cpos: %u, "
4004 		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
4005 		     left_cpos);
4006 
4007 		/*
4008 		 * No need to worry if the append is already in the
4009 		 * leftmost leaf.
4010 		 */
4011 		if (left_cpos) {
4012 			left_path = ocfs2_new_path_from_path(right_path);
4013 			if (!left_path) {
4014 				ret = -ENOMEM;
4015 				mlog_errno(ret);
4016 				goto out;
4017 			}
4018 
4019 			ret = ocfs2_find_path(et->et_ci, left_path,
4020 					      left_cpos);
4021 			if (ret) {
4022 				mlog_errno(ret);
4023 				goto out;
4024 			}
4025 
4026 			/*
4027 			 * ocfs2_insert_path() will pass the left_path to the
4028 			 * journal for us.
4029 			 */
4030 		}
4031 	}
4032 
4033 	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4034 	if (ret) {
4035 		mlog_errno(ret);
4036 		goto out;
4037 	}
4038 
4039 	ocfs2_adjust_rightmost_records(handle, et, right_path, insert_rec);
4040 
4041 	*ret_left_path = left_path;
4042 	ret = 0;
4043 out:
4044 	if (ret != 0)
4045 		ocfs2_free_path(left_path);
4046 
4047 	return ret;
4048 }
4049 
4050 static void ocfs2_split_record(struct ocfs2_extent_tree *et,
4051 			       struct ocfs2_path *left_path,
4052 			       struct ocfs2_path *right_path,
4053 			       struct ocfs2_extent_rec *split_rec,
4054 			       enum ocfs2_split_type split)
4055 {
4056 	int index;
4057 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
4058 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4059 	struct ocfs2_extent_rec *rec, *tmprec;
4060 
4061 	right_el = path_leaf_el(right_path);
4062 	if (left_path)
4063 		left_el = path_leaf_el(left_path);
4064 
4065 	el = right_el;
4066 	insert_el = right_el;
4067 	index = ocfs2_search_extent_list(el, cpos);
4068 	if (index != -1) {
4069 		if (index == 0 && left_path) {
4070 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4071 
4072 			/*
4073 			 * This typically means that the record
4074 			 * started in the left path but moved to the
4075 			 * right as a result of rotation. We either
4076 			 * move the existing record to the left, or we
4077 			 * do the later insert there.
4078 			 *
4079 			 * In this case, the left path should always
4080 			 * exist as the rotate code will have passed
4081 			 * it back for a post-insert update.
4082 			 */
4083 
4084 			if (split == SPLIT_LEFT) {
4085 				/*
4086 				 * It's a left split. Since we know
4087 				 * that the rotate code gave us an
4088 				 * empty extent in the left path, we
4089 				 * can just do the insert there.
4090 				 */
4091 				insert_el = left_el;
4092 			} else {
4093 				/*
4094 				 * Right split - we have to move the
4095 				 * existing record over to the left
4096 				 * leaf. The insert will be into the
4097 				 * newly created empty extent in the
4098 				 * right leaf.
4099 				 */
4100 				tmprec = &right_el->l_recs[index];
4101 				ocfs2_rotate_leaf(left_el, tmprec);
4102 				el = left_el;
4103 
4104 				memset(tmprec, 0, sizeof(*tmprec));
4105 				index = ocfs2_search_extent_list(left_el, cpos);
4106 				BUG_ON(index == -1);
4107 			}
4108 		}
4109 	} else {
4110 		BUG_ON(!left_path);
4111 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4112 		/*
4113 		 * Left path is easy - we can just allow the insert to
4114 		 * happen.
4115 		 */
4116 		el = left_el;
4117 		insert_el = left_el;
4118 		index = ocfs2_search_extent_list(el, cpos);
4119 		BUG_ON(index == -1);
4120 	}
4121 
4122 	rec = &el->l_recs[index];
4123 	ocfs2_subtract_from_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4124 				split, rec, split_rec);
4125 	ocfs2_rotate_leaf(insert_el, split_rec);
4126 }
4127 
4128 /*
4129  * This function only does inserts on an allocation b-tree. For tree
4130  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4131  *
4132  * right_path is the path we want to do the actual insert
4133  * in. left_path should only be passed in if we need to update that
4134  * portion of the tree after an edge insert.
4135  */
4136 static int ocfs2_insert_path(handle_t *handle,
4137 			     struct ocfs2_extent_tree *et,
4138 			     struct ocfs2_path *left_path,
4139 			     struct ocfs2_path *right_path,
4140 			     struct ocfs2_extent_rec *insert_rec,
4141 			     struct ocfs2_insert_type *insert)
4142 {
4143 	int ret, subtree_index;
4144 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4145 
4146 	if (left_path) {
4147 		int credits = handle->h_buffer_credits;
4148 
4149 		/*
4150 		 * There's a chance that left_path got passed back to
4151 		 * us without being accounted for in the
4152 		 * journal. Extend our transaction here to be sure we
4153 		 * can change those blocks.
4154 		 */
4155 		credits += left_path->p_tree_depth;
4156 
4157 		ret = ocfs2_extend_trans(handle, credits);
4158 		if (ret < 0) {
4159 			mlog_errno(ret);
4160 			goto out;
4161 		}
4162 
4163 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
4164 		if (ret < 0) {
4165 			mlog_errno(ret);
4166 			goto out;
4167 		}
4168 	}
4169 
4170 	/*
4171 	 * Pass both paths to the journal. The majority of inserts
4172 	 * will be touching all components anyway.
4173 	 */
4174 	ret = ocfs2_journal_access_path(et->et_ci, handle, right_path);
4175 	if (ret < 0) {
4176 		mlog_errno(ret);
4177 		goto out;
4178 	}
4179 
4180 	if (insert->ins_split != SPLIT_NONE) {
4181 		/*
4182 		 * We could call ocfs2_insert_at_leaf() for some types
4183 		 * of splits, but it's easier to just let one separate
4184 		 * function sort it all out.
4185 		 */
4186 		ocfs2_split_record(et, left_path, right_path,
4187 				   insert_rec, insert->ins_split);
4188 
4189 		/*
4190 		 * Split might have modified either leaf and we don't
4191 		 * have a guarantee that the later edge insert will
4192 		 * dirty this for us.
4193 		 */
4194 		if (left_path) {
4195 			ret = ocfs2_journal_dirty(handle,
4196 						  path_leaf_bh(left_path));
4197 			if (ret)
4198 				mlog_errno(ret);
		}
4199 	} else
4200 		ocfs2_insert_at_leaf(et, insert_rec, path_leaf_el(right_path),
4201 				     insert);
4202 
4203 	ret = ocfs2_journal_dirty(handle, leaf_bh);
4204 	if (ret)
4205 		mlog_errno(ret);
4206 
4207 	if (left_path) {
4208 		/*
4209 		 * The rotate code has indicated that we need to fix
4210 		 * up portions of the tree after the insert.
4211 		 *
4212 		 * XXX: Should we extend the transaction here?
4213 		 */
4214 		subtree_index = ocfs2_find_subtree_root(et, left_path,
4215 							right_path);
4216 		ocfs2_complete_edge_insert(handle, left_path, right_path,
4217 					   subtree_index);
4218 	}
4219 
4220 	ret = 0;
4221 out:
4222 	return ret;
4223 }
4224 
4225 static int ocfs2_do_insert_extent(handle_t *handle,
4226 				  struct ocfs2_extent_tree *et,
4227 				  struct ocfs2_extent_rec *insert_rec,
4228 				  struct ocfs2_insert_type *type)
4229 {
4230 	int ret, rotate = 0;
4231 	u32 cpos;
4232 	struct ocfs2_path *right_path = NULL;
4233 	struct ocfs2_path *left_path = NULL;
4234 	struct ocfs2_extent_list *el;
4235 
4236 	el = et->et_root_el;
4237 
4238 	ret = ocfs2_et_root_journal_access(handle, et,
4239 					   OCFS2_JOURNAL_ACCESS_WRITE);
4240 	if (ret) {
4241 		mlog_errno(ret);
4242 		goto out;
4243 	}
4244 
4245 	if (le16_to_cpu(el->l_tree_depth) == 0) {
4246 		ocfs2_insert_at_leaf(et, insert_rec, el, type);
4247 		goto out_update_clusters;
4248 	}
4249 
4250 	right_path = ocfs2_new_path_from_et(et);
4251 	if (!right_path) {
4252 		ret = -ENOMEM;
4253 		mlog_errno(ret);
4254 		goto out;
4255 	}
4256 
4257 	/*
4258 	 * Determine the path to start with. Rotations need the
4259 	 * rightmost path, everything else can go directly to the
4260 	 * target leaf.
4261 	 */
4262 	cpos = le32_to_cpu(insert_rec->e_cpos);
4263 	if (type->ins_appending == APPEND_NONE &&
4264 	    type->ins_contig == CONTIG_NONE) {
4265 		rotate = 1;
4266 		cpos = UINT_MAX;
4267 	}
4268 
4269 	ret = ocfs2_find_path(et->et_ci, right_path, cpos);
4270 	if (ret) {
4271 		mlog_errno(ret);
4272 		goto out;
4273 	}
4274 
4275 	/*
4276 	 * Rotations and appends need special treatment - they modify
4277 	 * parts of the tree's above them.
4278 	 *
4279 	 * Both might pass back a path immediate to the left of the
4280 	 * one being inserted to. This will be cause
4281 	 * ocfs2_insert_path() to modify the rightmost records of
4282 	 * left_path to account for an edge insert.
4283 	 *
4284 	 * XXX: When modifying this code, keep in mind that an insert
4285 	 * can wind up skipping both of these two special cases...
4286 	 */
4287 	if (rotate) {
4288 		ret = ocfs2_rotate_tree_right(handle, et, type->ins_split,
4289 					      le32_to_cpu(insert_rec->e_cpos),
4290 					      right_path, &left_path);
4291 		if (ret) {
4292 			mlog_errno(ret);
4293 			goto out;
4294 		}
4295 
4296 		/*
4297 		 * ocfs2_rotate_tree_right() might have extended the
4298 		 * transaction without re-journaling our tree root.
4299 		 */
4300 		ret = ocfs2_et_root_journal_access(handle, et,
4301 						   OCFS2_JOURNAL_ACCESS_WRITE);
4302 		if (ret) {
4303 			mlog_errno(ret);
4304 			goto out;
4305 		}
4306 	} else if (type->ins_appending == APPEND_TAIL
4307 		   && type->ins_contig != CONTIG_LEFT) {
4308 		ret = ocfs2_append_rec_to_path(handle, et, insert_rec,
4309 					       right_path, &left_path);
4310 		if (ret) {
4311 			mlog_errno(ret);
4312 			goto out;
4313 		}
4314 	}
4315 
4316 	ret = ocfs2_insert_path(handle, et, left_path, right_path,
4317 				insert_rec, type);
4318 	if (ret) {
4319 		mlog_errno(ret);
4320 		goto out;
4321 	}
4322 
4323 out_update_clusters:
4324 	if (type->ins_split == SPLIT_NONE)
4325 		ocfs2_et_update_clusters(et,
4326 					 le16_to_cpu(insert_rec->e_leaf_clusters));
4327 
4328 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4329 	if (ret)
4330 		mlog_errno(ret);
4331 
4332 out:
4333 	ocfs2_free_path(left_path);
4334 	ocfs2_free_path(right_path);
4335 
4336 	return ret;
4337 }
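
/*
 * Summary of the dispatch above: a non-appending, non-contiguous
 * insert walks the rightmost path and rotates; a tail append that is
 * not left contiguous first updates the edge via
 * ocfs2_append_rec_to_path(); in every case the record finally lands
 * through ocfs2_insert_path(), or directly through
 * ocfs2_insert_at_leaf() for depth-zero trees.
 */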
4338 
4339 static enum ocfs2_contig_type
4340 ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et,
4341 			       struct ocfs2_path *path,
4342 			       struct ocfs2_extent_list *el, int index,
4343 			       struct ocfs2_extent_rec *split_rec)
4344 {
4345 	int status;
4346 	enum ocfs2_contig_type ret = CONTIG_NONE;
4347 	u32 left_cpos, right_cpos;
4348 	struct ocfs2_extent_rec *rec = NULL;
4349 	struct ocfs2_extent_list *new_el;
4350 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
4351 	struct buffer_head *bh;
4352 	struct ocfs2_extent_block *eb;
4353 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
4354 
4355 	if (index > 0) {
4356 		rec = &el->l_recs[index - 1];
4357 	} else if (path->p_tree_depth > 0) {
4358 		status = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
4359 		if (status)
4360 			goto out;
4361 
4362 		if (left_cpos != 0) {
4363 			left_path = ocfs2_new_path_from_path(path);
4364 			if (!left_path)
4365 				goto out;
4366 
4367 			status = ocfs2_find_path(et->et_ci, left_path,
4368 						 left_cpos);
4369 			if (status)
4370 				goto out;
4371 
4372 			new_el = path_leaf_el(left_path);
4373 
4374 			if (le16_to_cpu(new_el->l_next_free_rec) !=
4375 			    le16_to_cpu(new_el->l_count)) {
4376 				bh = path_leaf_bh(left_path);
4377 				eb = (struct ocfs2_extent_block *)bh->b_data;
4378 				ocfs2_error(sb,
4379 					    "Extent block #%llu has an "
4380 					    "invalid l_next_free_rec of "
4381 					    "%d.  It should have "
4382 					    "matched the l_count of %d",
4383 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4384 					    le16_to_cpu(new_el->l_next_free_rec),
4385 					    le16_to_cpu(new_el->l_count));
4386 				status = -EINVAL;
4387 				goto out;
4388 			}
4389 			rec = &new_el->l_recs[
4390 				le16_to_cpu(new_el->l_next_free_rec) - 1];
4391 		}
4392 	}
4393 
4394 	/*
4395 	 * We're careful to check for an empty extent record here -
4396 	 * the merge code will know what to do if it sees one.
4397 	 */
4398 	if (rec) {
4399 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
4400 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4401 				ret = CONTIG_RIGHT;
4402 		} else {
4403 			ret = ocfs2_extent_contig(sb, rec, split_rec);
4404 		}
4405 	}
4406 
4407 	rec = NULL;
4408 	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4409 		rec = &el->l_recs[index + 1];
4410 	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4411 		 path->p_tree_depth > 0) {
4412 		status = ocfs2_find_cpos_for_right_leaf(sb, path, &right_cpos);
4413 		if (status)
4414 			goto out;
4415 
4416 		if (right_cpos == 0)
4417 			goto out;
4418 
4419 		right_path = ocfs2_new_path_from_path(path);
4420 		if (!right_path)
4421 			goto out;
4422 
4423 		status = ocfs2_find_path(et->et_ci, right_path, right_cpos);
4424 		if (status)
4425 			goto out;
4426 
4427 		new_el = path_leaf_el(right_path);
4428 		rec = &new_el->l_recs[0];
4429 		if (ocfs2_is_empty_extent(rec)) {
4430 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4431 				bh = path_leaf_bh(right_path);
4432 				eb = (struct ocfs2_extent_block *)bh->b_data;
4433 				ocfs2_error(sb,
4434 					    "Extent block #%llu has an "
4435 					    "invalid l_next_free_rec of %d",
4436 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4437 					    le16_to_cpu(new_el->l_next_free_rec));
4438 				status = -EINVAL;
4439 				goto out;
4440 			}
4441 			rec = &new_el->l_recs[1];
4442 		}
4443 	}
4444 
4445 	if (rec) {
4446 		enum ocfs2_contig_type contig_type;
4447 
4448 		contig_type = ocfs2_extent_contig(sb, rec, split_rec);
4449 
4450 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4451 			ret = CONTIG_LEFTRIGHT;
4452 		else if (ret == CONTIG_NONE)
4453 			ret = contig_type;
4454 	}
4455 
4456 out:
4457 	if (left_path)
4458 		ocfs2_free_path(left_path);
4459 	if (right_path)
4460 		ocfs2_free_path(right_path);
4461 
4462 	return ret;
4463 }
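
/*
 * An illustration with hypothetical values (assuming matching flags
 * and physically adjacent blocks): with leaf records (0, 4), (4, 4)
 * and (8, 4), a split_rec covering all of (4, 4) is contiguous with
 * both neighbours, so the checks above combine CONTIG_RIGHT (left
 * neighbour) and CONTIG_LEFT (right neighbour) into CONTIG_LEFTRIGHT.
 */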
4464 
4465 static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et,
4466 				     struct ocfs2_insert_type *insert,
4467 				     struct ocfs2_extent_list *el,
4468 				     struct ocfs2_extent_rec *insert_rec)
4469 {
4470 	int i;
4471 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4472 
4473 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4474 
4475 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4476 		contig_type = ocfs2_extent_contig(ocfs2_metadata_cache_get_super(et->et_ci),
4477 						  &el->l_recs[i], insert_rec);
4478 		if (contig_type != CONTIG_NONE) {
4479 			insert->ins_contig_index = i;
4480 			break;
4481 		}
4482 	}
4483 	insert->ins_contig = contig_type;
4484 
4485 	if (insert->ins_contig != CONTIG_NONE) {
4486 		struct ocfs2_extent_rec *rec =
4487 				&el->l_recs[insert->ins_contig_index];
4488 		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4489 				   le16_to_cpu(insert_rec->e_leaf_clusters);
4490 
4491 		/*
4492 		 * Caller might want us to limit the size of extents, don't
4493 		 * calculate contiguousness if we might exceed that limit.
4494 		 */
4495 		if (et->et_max_leaf_clusters &&
4496 		    (len > et->et_max_leaf_clusters))
4497 			insert->ins_contig = CONTIG_NONE;
4498 	}
4499 }
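
/*
 * Note that et_max_leaf_clusters is only non-zero for trees that cap
 * extent lengths (e.g. the xattr tree); for most trees it is zero and
 * the length check above never fires.
 */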
4500 
4501 /*
4502  * This should only be called against the rightmost leaf extent list.
4503  *
4504  * ocfs2_figure_appending_type() will figure out whether we'll have to
4505  * insert at the tail of the rightmost leaf.
4506  *
4507  * This should also work against the root extent list for trees with 0
4508  * depth. If we consider the root extent list to be the rightmost leaf node
4509  * then the logic here makes sense.
4510  */
4511 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4512 					struct ocfs2_extent_list *el,
4513 					struct ocfs2_extent_rec *insert_rec)
4514 {
4515 	int i;
4516 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4517 	struct ocfs2_extent_rec *rec;
4518 
4519 	insert->ins_appending = APPEND_NONE;
4520 
4521 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4522 
4523 	if (!el->l_next_free_rec)
4524 		goto set_tail_append;
4525 
4526 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4527 		/* Were all records empty? */
4528 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4529 			goto set_tail_append;
4530 	}
4531 
4532 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4533 	rec = &el->l_recs[i];
4534 
4535 	if (cpos >=
4536 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4537 		goto set_tail_append;
4538 
4539 	return;
4540 
4541 set_tail_append:
4542 	insert->ins_appending = APPEND_TAIL;
4543 }
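
/*
 * For example, with hypothetical values: given a rightmost record of
 * (e_cpos 96, 32 clusters), an insert_rec at cpos 128 or beyond sets
 * APPEND_TAIL; an insert at cpos 100 lands inside the record's range
 * and is not treated as an append.
 */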
4544 
4545 /*
4546  * Helper function called at the beginning of an insert.
4547  *
4548  * This computes a few things that are commonly used in the process of
4549  * inserting into the btree:
4550  *   - Whether the new extent is contiguous with an existing one.
4551  *   - The current tree depth.
4552  *   - Whether the insert is an appending one.
4553  *   - The total # of free records in the tree.
4554  *
4555  * All of the information is stored on the ocfs2_insert_type
4556  * structure.
4557  */
4558 static int ocfs2_figure_insert_type(struct ocfs2_extent_tree *et,
4559 				    struct buffer_head **last_eb_bh,
4560 				    struct ocfs2_extent_rec *insert_rec,
4561 				    int *free_records,
4562 				    struct ocfs2_insert_type *insert)
4563 {
4564 	int ret;
4565 	struct ocfs2_extent_block *eb;
4566 	struct ocfs2_extent_list *el;
4567 	struct ocfs2_path *path = NULL;
4568 	struct buffer_head *bh = NULL;
4569 
4570 	insert->ins_split = SPLIT_NONE;
4571 
4572 	el = et->et_root_el;
4573 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4574 
4575 	if (el->l_tree_depth) {
4576 		/*
4577 		 * If we have tree depth, we read in the
4578 		 * rightmost extent block ahead of time as
4579 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4580 		 * may want it later.
4581 		 */
4582 		ret = ocfs2_read_extent_block(et->et_ci,
4583 					      ocfs2_et_get_last_eb_blk(et),
4584 					      &bh);
4585 		if (ret) {
4586 			mlog_errno(ret);
4587 			goto out;
4588 		}
4589 		eb = (struct ocfs2_extent_block *) bh->b_data;
4590 		el = &eb->h_list;
4591 	}
4592 
4593 	/*
4594 	 * Unless we have a contiguous insert, we'll need to know if
4595 	 * there is room left in our allocation tree for another
4596 	 * extent record.
4597 	 *
4598 	 * XXX: This test is simplistic; we could also search for empty
4599 	 * extent records.
4600 	 */
4601 	*free_records = le16_to_cpu(el->l_count) -
4602 		le16_to_cpu(el->l_next_free_rec);
4603 
4604 	if (!insert->ins_tree_depth) {
4605 		ocfs2_figure_contig_type(et, insert, el, insert_rec);
4606 		ocfs2_figure_appending_type(insert, el, insert_rec);
4607 		return 0;
4608 	}
4609 
4610 	path = ocfs2_new_path_from_et(et);
4611 	if (!path) {
4612 		ret = -ENOMEM;
4613 		mlog_errno(ret);
4614 		goto out;
4615 	}
4616 
4617 	/*
4618 	 * In the case that we're inserting past what the tree
4619 	 * currently accounts for, ocfs2_find_path() will return for
4620 	 * us the rightmost tree path. This is accounted for below in
4621 	 * the appending code.
4622 	 */
4623 	ret = ocfs2_find_path(et->et_ci, path, le32_to_cpu(insert_rec->e_cpos));
4624 	if (ret) {
4625 		mlog_errno(ret);
4626 		goto out;
4627 	}
4628 
4629 	el = path_leaf_el(path);
4630 
4631 	/*
4632 	 * Now that we have the path, there's two things we want to determine:
4633 	 * 1) Contiguousness (also set contig_index if this is so)
4634 	 *
4635 	 * 2) Are we doing an append? We can trivially break this up
4636 	 *    into two types of appends: simple record append, or a
4637 	 *    rotate inside the tail leaf.
4638 	 */
4639 	ocfs2_figure_contig_type(et, insert, el, insert_rec);
4640 
4641 	/*
4642 	 * The insert code isn't quite ready to deal with all cases of
4643 	 * left contiguousness. Specifically, if it's an insert into
4644 	 * the 1st record in a leaf, it will require the adjustment of
4645 	 * cluster count on the last record of the path directly to its
4646 	 * left. For now, just catch that case and fool the layers
4647 	 * above us. This works just fine for tree_depth == 0, which
4648 	 * is why we allow that above.
4649 	 */
4650 	if (insert->ins_contig == CONTIG_LEFT &&
4651 	    insert->ins_contig_index == 0)
4652 		insert->ins_contig = CONTIG_NONE;
4653 
4654 	/*
4655 	 * Ok, so we can simply compare against last_eb to figure out
4656 	 * whether the path doesn't exist. This will only happen in
4657 	 * the case that we're doing a tail append, so maybe we can
4658 	 * take advantage of that information somehow.
4659 	 */
4660 	if (ocfs2_et_get_last_eb_blk(et) ==
4661 	    path_leaf_bh(path)->b_blocknr) {
4662 		/*
4663 		 * Ok, ocfs2_find_path() returned us the rightmost
4664 		 * tree path. This might be an appending insert. There are
4665 		 * two cases:
4666 		 *    1) We're doing a true append at the tail:
4667 		 *	-This might even be off the end of the leaf
4668 		 *    2) We're "appending" by rotating in the tail
4669 		 */
4670 		ocfs2_figure_appending_type(insert, el, insert_rec);
4671 	}
4672 
4673 out:
4674 	ocfs2_free_path(path);
4675 
4676 	if (ret == 0)
4677 		*last_eb_bh = bh;
4678 	else
4679 		brelse(bh);
4680 	return ret;
4681 }
4682 
4683 /*
4684  * Insert an extent into a btree.
4685  *
4686  * The caller needs to update the owning btree's cluster count.
4687  */
4688 int ocfs2_insert_extent(handle_t *handle,
4689 			struct ocfs2_extent_tree *et,
4690 			u32 cpos,
4691 			u64 start_blk,
4692 			u32 new_clusters,
4693 			u8 flags,
4694 			struct ocfs2_alloc_context *meta_ac)
4695 {
4696 	int status;
4697 	int uninitialized_var(free_records);
4698 	struct buffer_head *last_eb_bh = NULL;
4699 	struct ocfs2_insert_type insert = {0, };
4700 	struct ocfs2_extent_rec rec;
4701 
4702 	mlog(0, "add %u clusters at position %u to owner %llu\n",
4703 	     new_clusters, cpos,
4704 	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4705 
4706 	memset(&rec, 0, sizeof(rec));
4707 	rec.e_cpos = cpu_to_le32(cpos);
4708 	rec.e_blkno = cpu_to_le64(start_blk);
4709 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4710 	rec.e_flags = flags;
4711 	status = ocfs2_et_insert_check(et, &rec);
4712 	if (status) {
4713 		mlog_errno(status);
4714 		goto bail;
4715 	}
4716 
4717 	status = ocfs2_figure_insert_type(et, &last_eb_bh, &rec,
4718 					  &free_records, &insert);
4719 	if (status < 0) {
4720 		mlog_errno(status);
4721 		goto bail;
4722 	}
4723 
4724 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4725 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4726 	     "Insert.tree_depth: %d\n",
4727 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4728 	     free_records, insert.ins_tree_depth);
4729 
4730 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4731 		status = ocfs2_grow_tree(handle, et,
4732 					 &insert.ins_tree_depth, &last_eb_bh,
4733 					 meta_ac);
4734 		if (status) {
4735 			mlog_errno(status);
4736 			goto bail;
4737 		}
4738 	}
4739 
4740 	/* Finally, we can add clusters. This might rotate the tree for us. */
4741 	status = ocfs2_do_insert_extent(handle, et, &rec, &insert);
4742 	if (status < 0)
4743 		mlog_errno(status);
4744 	else
4745 		ocfs2_et_extent_map_insert(et, &rec);
4746 
4747 bail:
4748 	brelse(last_eb_bh);
4749 
4750 	mlog_exit(status);
4751 	return status;
4752 }
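
/*
 * A minimal usage sketch - the identifiers are illustrative and assume
 * the caller already holds a transaction and, if the tree might grow,
 * a metadata reservation:
 *
 *	struct ocfs2_extent_tree et;
 *
 *	ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
 *	status = ocfs2_insert_extent(handle, &et, cpos, blkno,
 *				     clusters, 0, meta_ac);
 *
 * ocfs2_add_clusters_in_btree() below is an in-tree caller.
 */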
4753 
4754 /*
4755  * Allocate and add clusters into the extent b-tree.
4756  * The new clusters (clusters_to_add) will be inserted at logical_offset.
4757  * The extent b-tree's root is specified by et, and
4758  * it is not limited to file storage. Any extent tree can use this
4759  * function if it implements the proper ocfs2_extent_tree.
4760  */
4761 int ocfs2_add_clusters_in_btree(handle_t *handle,
4762 				struct ocfs2_extent_tree *et,
4763 				u32 *logical_offset,
4764 				u32 clusters_to_add,
4765 				int mark_unwritten,
4766 				struct ocfs2_alloc_context *data_ac,
4767 				struct ocfs2_alloc_context *meta_ac,
4768 				enum ocfs2_alloc_restarted *reason_ret)
4769 {
4770 	int status = 0;
4771 	int free_extents;
4772 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
4773 	u32 bit_off, num_bits;
4774 	u64 block;
4775 	u8 flags = 0;
4776 	struct ocfs2_super *osb =
4777 		OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci));
4778 
4779 	BUG_ON(!clusters_to_add);
4780 
4781 	if (mark_unwritten)
4782 		flags = OCFS2_EXT_UNWRITTEN;
4783 
4784 	free_extents = ocfs2_num_free_extents(osb, et);
4785 	if (free_extents < 0) {
4786 		status = free_extents;
4787 		mlog_errno(status);
4788 		goto leave;
4789 	}
4790 
4791 	/* there are two cases which could cause us to EAGAIN in the
4792 	 * we-need-more-metadata case:
4793 	 * 1) we haven't reserved *any*
4794 	 * 2) we are so fragmented, we've needed to add metadata too
4795 	 *    many times. */
4796 	if (!free_extents && !meta_ac) {
4797 		mlog(0, "we haven't reserved any metadata!\n");
4798 		status = -EAGAIN;
4799 		reason = RESTART_META;
4800 		goto leave;
4801 	} else if ((!free_extents)
4802 		   && (ocfs2_alloc_context_bits_left(meta_ac)
4803 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
4804 		mlog(0, "filesystem is really fragmented...\n");
4805 		status = -EAGAIN;
4806 		reason = RESTART_META;
4807 		goto leave;
4808 	}
4809 
4810 	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4811 					clusters_to_add, &bit_off, &num_bits);
4812 	if (status < 0) {
4813 		if (status != -ENOSPC)
4814 			mlog_errno(status);
4815 		goto leave;
4816 	}
4817 
4818 	BUG_ON(num_bits > clusters_to_add);
4819 
4820 	/* reserve our write early -- insert_extent may update the tree root */
4821 	status = ocfs2_et_root_journal_access(handle, et,
4822 					      OCFS2_JOURNAL_ACCESS_WRITE);
4823 	if (status < 0) {
4824 		mlog_errno(status);
4825 		goto leave;
4826 	}
4827 
4828 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4829 	mlog(0, "Allocating %u clusters at block %u for owner %llu\n",
4830 	     num_bits, bit_off,
4831 	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci));
4832 	status = ocfs2_insert_extent(handle, et, *logical_offset, block,
4833 				     num_bits, flags, meta_ac);
4834 	if (status < 0) {
4835 		mlog_errno(status);
4836 		goto leave;
4837 	}
4838 
4839 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
4840 	if (status < 0) {
4841 		mlog_errno(status);
4842 		goto leave;
4843 	}
4844 
4845 	clusters_to_add -= num_bits;
4846 	*logical_offset += num_bits;
4847 
4848 	if (clusters_to_add) {
4849 		mlog(0, "need to alloc once more, wanted = %u\n",
4850 		     clusters_to_add);
4851 		status = -EAGAIN;
4852 		reason = RESTART_TRANS;
4853 	}
4854 
4855 leave:
4856 	mlog_exit(status);
4857 	if (reason_ret)
4858 		*reason_ret = reason;
4859 	return status;
4860 }
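
/*
 * Callers are expected to loop on this function: a return of -EAGAIN
 * with *reason_ret set to RESTART_META asks for more metadata to be
 * reserved, while RESTART_TRANS asks for the transaction to be
 * extended and the call retried with the updated *logical_offset
 * (see the extend-allocation path in file.c).
 */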
4861 
4862 static void ocfs2_make_right_split_rec(struct super_block *sb,
4863 				       struct ocfs2_extent_rec *split_rec,
4864 				       u32 cpos,
4865 				       struct ocfs2_extent_rec *rec)
4866 {
4867 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4868 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4869 
4870 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4871 
4872 	split_rec->e_cpos = cpu_to_le32(cpos);
4873 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4874 
4875 	split_rec->e_blkno = rec->e_blkno;
4876 	le64_add_cpu(&split_rec->e_blkno,
4877 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4878 
4879 	split_rec->e_flags = rec->e_flags;
4880 }
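
/*
 * An example with hypothetical values: given
 * rec = (e_cpos 20, 10 clusters) and cpos = 24, split_rec becomes
 * (e_cpos 24, 6 clusters) with e_blkno advanced by 4 clusters' worth
 * of blocks - i.e. the portion of rec from cpos through its end.
 */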
4881 
4882 static int ocfs2_split_and_insert(handle_t *handle,
4883 				  struct ocfs2_extent_tree *et,
4884 				  struct ocfs2_path *path,
4885 				  struct buffer_head **last_eb_bh,
4886 				  int split_index,
4887 				  struct ocfs2_extent_rec *orig_split_rec,
4888 				  struct ocfs2_alloc_context *meta_ac)
4889 {
4890 	int ret = 0, depth;
4891 	unsigned int insert_range, rec_range, do_leftright = 0;
4892 	struct ocfs2_extent_rec tmprec;
4893 	struct ocfs2_extent_list *rightmost_el;
4894 	struct ocfs2_extent_rec rec;
4895 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
4896 	struct ocfs2_insert_type insert;
4897 	struct ocfs2_extent_block *eb;
4898 
4899 leftright:
4900 	/*
4901 	 * Store a copy of the record on the stack - it might move
4902 	 * around as the tree is manipulated below.
4903 	 */
4904 	rec = path_leaf_el(path)->l_recs[split_index];
4905 
4906 	rightmost_el = et->et_root_el;
4907 
4908 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
4909 	if (depth) {
4910 		BUG_ON(!(*last_eb_bh));
4911 		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4912 		rightmost_el = &eb->h_list;
4913 	}
4914 
4915 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4916 	    le16_to_cpu(rightmost_el->l_count)) {
4917 		ret = ocfs2_grow_tree(handle, et,
4918 				      &depth, last_eb_bh, meta_ac);
4919 		if (ret) {
4920 			mlog_errno(ret);
4921 			goto out;
4922 		}
4923 	}
4924 
4925 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4926 	insert.ins_appending = APPEND_NONE;
4927 	insert.ins_contig = CONTIG_NONE;
4928 	insert.ins_tree_depth = depth;
4929 
4930 	insert_range = le32_to_cpu(split_rec.e_cpos) +
4931 		le16_to_cpu(split_rec.e_leaf_clusters);
4932 	rec_range = le32_to_cpu(rec.e_cpos) +
4933 		le16_to_cpu(rec.e_leaf_clusters);
4934 
4935 	if (split_rec.e_cpos == rec.e_cpos) {
4936 		insert.ins_split = SPLIT_LEFT;
4937 	} else if (insert_range == rec_range) {
4938 		insert.ins_split = SPLIT_RIGHT;
4939 	} else {
4940 		/*
4941 		 * Left/right split. We fake this as a right split
4942 		 * first and then make a second pass for the original range.
4943 		 */
4944 		insert.ins_split = SPLIT_RIGHT;
4945 
4946 		ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
4947 					   &tmprec, insert_range, &rec);
4948 
4949 		split_rec = tmprec;
4950 
4951 		BUG_ON(do_leftright);
4952 		do_leftright = 1;
4953 	}
4954 
4955 	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
4956 	if (ret) {
4957 		mlog_errno(ret);
4958 		goto out;
4959 	}
4960 
4961 	if (do_leftright == 1) {
4962 		u32 cpos;
4963 		struct ocfs2_extent_list *el;
4964 
4965 		do_leftright++;
4966 		split_rec = *orig_split_rec;
4967 
4968 		ocfs2_reinit_path(path, 1);
4969 
4970 		cpos = le32_to_cpu(split_rec.e_cpos);
4971 		ret = ocfs2_find_path(et->et_ci, path, cpos);
4972 		if (ret) {
4973 			mlog_errno(ret);
4974 			goto out;
4975 		}
4976 
4977 		el = path_leaf_el(path);
4978 		split_index = ocfs2_search_extent_list(el, cpos);
4979 		goto leftright;
4980 	}
4981 out:
4982 
4983 	return ret;
4984 }
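
/*
 * An example of the two-pass case above, with hypothetical values:
 * splitting (e_cpos 4, 4 clusters) out of rec = (0, 12) first
 * fabricates and inserts the right remainder (8, 4), shrinking rec to
 * (0, 8); the second pass then re-finds the leaf and inserts the
 * original split_rec (4, 4), which by now lies on the right edge of
 * (0, 8).
 */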
4985 
4986 static int ocfs2_replace_extent_rec(handle_t *handle,
4987 				    struct ocfs2_extent_tree *et,
4988 				    struct ocfs2_path *path,
4989 				    struct ocfs2_extent_list *el,
4990 				    int split_index,
4991 				    struct ocfs2_extent_rec *split_rec)
4992 {
4993 	int ret;
4994 
4995 	ret = ocfs2_path_bh_journal_access(handle, et->et_ci, path,
4996 					   path_num_items(path) - 1);
4997 	if (ret) {
4998 		mlog_errno(ret);
4999 		goto out;
5000 	}
5001 
5002 	el->l_recs[split_index] = *split_rec;
5003 
5004 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5005 out:
5006 	return ret;
5007 }
5008 
5009 /*
5010  * Mark part or all of the extent record at split_index in the leaf
5011  * pointed to by path as written. This removes the unwritten
5012  * extent flag.
5013  *
5014  * Care is taken to handle contiguousness so as to not grow the tree.
5015  *
5016  * meta_ac is not strictly necessary - we only truly need it if growth
5017  * of the tree is required. All other cases will degrade into a less
5018  * optimal tree layout.
5019  *
5020  * last_eb_bh should be the rightmost leaf block for any extent
5021  * btree. Since a split may grow the tree or a merge might shrink it,
5022  * the caller cannot trust the contents of that buffer after this call.
5023  *
5024  * This code is optimized for readability - several passes might be
5025  * made over certain portions of the tree. All of those blocks will
5026  * have been brought into cache (and pinned via the journal), so the
5027  * extra overhead is not expressed in terms of disk reads.
5028  */
5029 static int __ocfs2_mark_extent_written(handle_t *handle,
5030 				       struct ocfs2_extent_tree *et,
5031 				       struct ocfs2_path *path,
5032 				       int split_index,
5033 				       struct ocfs2_extent_rec *split_rec,
5034 				       struct ocfs2_alloc_context *meta_ac,
5035 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
5036 {
5037 	int ret = 0;
5038 	struct ocfs2_extent_list *el = path_leaf_el(path);
5039 	struct buffer_head *last_eb_bh = NULL;
5040 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5041 	struct ocfs2_merge_ctxt ctxt;
5042 	struct ocfs2_extent_list *rightmost_el;
5043 
5044 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
5045 		ret = -EIO;
5046 		mlog_errno(ret);
5047 		goto out;
5048 	}
5049 
5050 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5051 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5052 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5053 		ret = -EIO;
5054 		mlog_errno(ret);
5055 		goto out;
5056 	}
5057 
5058 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el,
5059 							    split_index,
5060 							    split_rec);
5061 
5062 	/*
5063 	 * The core merge / split code wants to know how much room is
5064 	 * left in this allocation tree, so we pass the
5065 	 * rightmost extent list.
5066 	 */
5067 	if (path->p_tree_depth) {
5068 		struct ocfs2_extent_block *eb;
5069 
5070 		ret = ocfs2_read_extent_block(et->et_ci,
5071 					      ocfs2_et_get_last_eb_blk(et),
5072 					      &last_eb_bh);
5073 		if (ret) {
5074 			mlog_errno(ret);
5075 			goto out;
5076 		}
5077 
5078 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5079 		rightmost_el = &eb->h_list;
5080 	} else
5081 		rightmost_el = path_root_el(path);
5082 
5083 	if (rec->e_cpos == split_rec->e_cpos &&
5084 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5085 		ctxt.c_split_covers_rec = 1;
5086 	else
5087 		ctxt.c_split_covers_rec = 0;
5088 
5089 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5090 
5091 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
5092 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
5093 	     ctxt.c_split_covers_rec);
5094 
5095 	if (ctxt.c_contig_type == CONTIG_NONE) {
5096 		if (ctxt.c_split_covers_rec)
5097 			ret = ocfs2_replace_extent_rec(handle, et, path, el,
5098 						       split_index, split_rec);
5099 		else
5100 			ret = ocfs2_split_and_insert(handle, et, path,
5101 						     &last_eb_bh, split_index,
5102 						     split_rec, meta_ac);
5103 		if (ret)
5104 			mlog_errno(ret);
5105 	} else {
5106 		ret = ocfs2_try_to_merge_extent(handle, et, path,
5107 						split_index, split_rec,
5108 						dealloc, &ctxt);
5109 		if (ret)
5110 			mlog_errno(ret);
5111 	}
5112 
5113 out:
5114 	brelse(last_eb_bh);
5115 	return ret;
5116 }
5117 
5118 /*
5119  * Mark the already-existing extent at cpos as written for len clusters.
5120  *
5121  * If the existing extent is larger than the request, initiate a
5122  * split. An attempt will be made at merging with adjacent extents.
5123  *
5124  * The caller is responsible for passing down meta_ac if we'll need it.
5125  */
5126 int ocfs2_mark_extent_written(struct inode *inode,
5127 			      struct ocfs2_extent_tree *et,
5128 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
5129 			      struct ocfs2_alloc_context *meta_ac,
5130 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
5131 {
5132 	int ret, index;
5133 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
5134 	struct ocfs2_extent_rec split_rec;
5135 	struct ocfs2_path *left_path = NULL;
5136 	struct ocfs2_extent_list *el;
5137 
5138 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5139 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5140 
5141 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5142 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5143 			    "that are being written to, but the feature bit "
5144 			    "is not set in the super block.",
5145 			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
5146 		ret = -EROFS;
5147 		goto out;
5148 	}
5149 
5150 	/*
5151 	 * XXX: This should be fixed up so that we just re-insert the
5152 	 * next extent records.
5153 	 */
5154 	ocfs2_et_extent_map_truncate(et, 0);
5155 
5156 	left_path = ocfs2_new_path_from_et(et);
5157 	if (!left_path) {
5158 		ret = -ENOMEM;
5159 		mlog_errno(ret);
5160 		goto out;
5161 	}
5162 
5163 	ret = ocfs2_find_path(et->et_ci, left_path, cpos);
5164 	if (ret) {
5165 		mlog_errno(ret);
5166 		goto out;
5167 	}
5168 	el = path_leaf_el(left_path);
5169 
5170 	index = ocfs2_search_extent_list(el, cpos);
5171 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5172 		ocfs2_error(inode->i_sb,
5173 			    "Inode %llu has an extent at cpos %u which can no "
5174 			    "longer be found.\n",
5175 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5176 		ret = -EROFS;
5177 		goto out;
5178 	}
5179 
5180 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5181 	split_rec.e_cpos = cpu_to_le32(cpos);
5182 	split_rec.e_leaf_clusters = cpu_to_le16(len);
5183 	split_rec.e_blkno = cpu_to_le64(start_blkno);
5184 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
5185 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
5186 
5187 	ret = __ocfs2_mark_extent_written(handle, et, left_path,
5188 					  index, &split_rec, meta_ac,
5189 					  dealloc);
5190 	if (ret)
5191 		mlog_errno(ret);
5192 
5193 out:
5194 	ocfs2_free_path(left_path);
5195 	return ret;
5196 }
5197 
5198 static int ocfs2_split_tree(handle_t *handle, struct ocfs2_extent_tree *et,
5199 			    struct ocfs2_path *path,
5200 			    int index, u32 new_range,
5201 			    struct ocfs2_alloc_context *meta_ac)
5202 {
5203 	int ret, depth, credits = handle->h_buffer_credits;
5204 	struct buffer_head *last_eb_bh = NULL;
5205 	struct ocfs2_extent_block *eb;
5206 	struct ocfs2_extent_list *rightmost_el, *el;
5207 	struct ocfs2_extent_rec split_rec;
5208 	struct ocfs2_extent_rec *rec;
5209 	struct ocfs2_insert_type insert;
5210 
5211 	/*
5212 	 * Setup the record to split before we grow the tree.
5213 	 */
5214 	el = path_leaf_el(path);
5215 	rec = &el->l_recs[index];
5216 	ocfs2_make_right_split_rec(ocfs2_metadata_cache_get_super(et->et_ci),
5217 				   &split_rec, new_range, rec);
5218 
5219 	depth = path->p_tree_depth;
5220 	if (depth > 0) {
5221 		ret = ocfs2_read_extent_block(et->et_ci,
5222 					      ocfs2_et_get_last_eb_blk(et),
5223 					      &last_eb_bh);
5224 		if (ret < 0) {
5225 			mlog_errno(ret);
5226 			goto out;
5227 		}
5228 
5229 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5230 		rightmost_el = &eb->h_list;
5231 	} else
5232 		rightmost_el = path_leaf_el(path);
5233 
5234 	credits += path->p_tree_depth +
5235 		   ocfs2_extend_meta_needed(et->et_root_el);
5236 	ret = ocfs2_extend_trans(handle, credits);
5237 	if (ret) {
5238 		mlog_errno(ret);
5239 		goto out;
5240 	}
5241 
5242 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5243 	    le16_to_cpu(rightmost_el->l_count)) {
5244 		ret = ocfs2_grow_tree(handle, et, &depth, &last_eb_bh,
5245 				      meta_ac);
5246 		if (ret) {
5247 			mlog_errno(ret);
5248 			goto out;
5249 		}
5250 	}
5251 
5252 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5253 	insert.ins_appending = APPEND_NONE;
5254 	insert.ins_contig = CONTIG_NONE;
5255 	insert.ins_split = SPLIT_RIGHT;
5256 	insert.ins_tree_depth = depth;
5257 
5258 	ret = ocfs2_do_insert_extent(handle, et, &split_rec, &insert);
5259 	if (ret)
5260 		mlog_errno(ret);
5261 
5262 out:
5263 	brelse(last_eb_bh);
5264 	return ret;
5265 }
5266 
5267 static int ocfs2_truncate_rec(handle_t *handle,
5268 			      struct ocfs2_extent_tree *et,
5269 			      struct ocfs2_path *path, int index,
5270 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
5271 			      u32 cpos, u32 len)
5272 {
5273 	int ret;
5274 	u32 left_cpos, rec_range, trunc_range;
5275 	int wants_rotate = 0, is_rightmost_tree_rec = 0;
5276 	struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
5277 	struct ocfs2_path *left_path = NULL;
5278 	struct ocfs2_extent_list *el = path_leaf_el(path);
5279 	struct ocfs2_extent_rec *rec;
5280 	struct ocfs2_extent_block *eb;
5281 
5282 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5283 		ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5284 		if (ret) {
5285 			mlog_errno(ret);
5286 			goto out;
5287 		}
5288 
5289 		index--;
5290 	}
5291 
5292 	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5293 	    path->p_tree_depth) {
5294 		/*
5295 		 * Check whether this is the rightmost tree record. If
5296 		 * we remove all of this record or part of its right
5297 		 * edge then an update of the record lengths above it
5298 		 * will be required.
5299 		 */
5300 		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5301 		if (eb->h_next_leaf_blk == 0)
5302 			is_rightmost_tree_rec = 1;
5303 	}
5304 
5305 	rec = &el->l_recs[index];
5306 	if (index == 0 && path->p_tree_depth &&
5307 	    le32_to_cpu(rec->e_cpos) == cpos) {
5308 		/*
5309 		 * Changing the leftmost offset (via partial or whole
5310 		 * record truncate) of an interior (or rightmost) path
5311 		 * means we have to update the subtree that is formed
5312 		 * by this leaf and the one to it's left.
5313 		 *
5314 		 * There are two cases we can skip:
5315 		 *   1) Path is the leftmost one in our btree.
5316 		 *   2) The leaf is rightmost and will be empty after
5317 		 *      we remove the extent record - the rotate code
5318 		 *      knows how to update the newly formed edge.
5319 		 */
5320 
5321 		ret = ocfs2_find_cpos_for_left_leaf(sb, path, &left_cpos);
5322 		if (ret) {
5323 			mlog_errno(ret);
5324 			goto out;
5325 		}
5326 
5327 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5328 			left_path = ocfs2_new_path_from_path(path);
5329 			if (!left_path) {
5330 				ret = -ENOMEM;
5331 				mlog_errno(ret);
5332 				goto out;
5333 			}
5334 
5335 			ret = ocfs2_find_path(et->et_ci, left_path,
5336 					      left_cpos);
5337 			if (ret) {
5338 				mlog_errno(ret);
5339 				goto out;
5340 			}
5341 		}
5342 	}
5343 
5344 	ret = ocfs2_extend_rotate_transaction(handle, 0,
5345 					      handle->h_buffer_credits,
5346 					      path);
5347 	if (ret) {
5348 		mlog_errno(ret);
5349 		goto out;
5350 	}
5351 
5352 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
5353 	if (ret) {
5354 		mlog_errno(ret);
5355 		goto out;
5356 	}
5357 
5358 	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
5359 	if (ret) {
5360 		mlog_errno(ret);
5361 		goto out;
5362 	}
5363 
5364 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5365 	trunc_range = cpos + len;
5366 
5367 	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5368 		int next_free;
5369 
5370 		memset(rec, 0, sizeof(*rec));
5371 		ocfs2_cleanup_merge(el, index);
5372 		wants_rotate = 1;
5373 
5374 		next_free = le16_to_cpu(el->l_next_free_rec);
5375 		if (is_rightmost_tree_rec && next_free > 1) {
5376 			/*
5377 			 * We skip the edge update if this path will
5378 			 * be deleted by the rotate code.
5379 			 */
5380 			rec = &el->l_recs[next_free - 1];
5381 			ocfs2_adjust_rightmost_records(handle, et, path,
5382 						       rec);
5383 		}
5384 	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
5385 		/* Remove leftmost portion of the record. */
5386 		le32_add_cpu(&rec->e_cpos, len);
5387 		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5388 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5389 	} else if (rec_range == trunc_range) {
5390 		/* Remove rightmost portion of the record */
5391 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5392 		if (is_rightmost_tree_rec)
5393 			ocfs2_adjust_rightmost_records(handle, et, path, rec);
5394 	} else {
5395 		/* Caller should have trapped this. */
5396 		mlog(ML_ERROR, "Owner %llu: Invalid record truncate: (%u, %u) "
5397 		     "(%u, %u)\n",
5398 		     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5399 		     le32_to_cpu(rec->e_cpos),
5400 		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5401 		BUG();
5402 	}
5403 
5404 	if (left_path) {
5405 		int subtree_index;
5406 
5407 		subtree_index = ocfs2_find_subtree_root(et, left_path, path);
5408 		ocfs2_complete_edge_insert(handle, left_path, path,
5409 					   subtree_index);
5410 	}
5411 
5412 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5413 
5414 	ret = ocfs2_rotate_tree_left(handle, et, path, dealloc);
5415 	if (ret) {
5416 		mlog_errno(ret);
5417 		goto out;
5418 	}
5419 
5420 out:
5421 	ocfs2_free_path(left_path);
5422 	return ret;
5423 }
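
/*
 * An example of the cases above, with hypothetical values, for
 * rec = (e_cpos 10, 10 clusters): truncating (cpos 10, len 10)
 * empties the record and lets the left rotate fill the hole;
 * (cpos 10, len 4) shrinks it to (14, 6); (cpos 16, len 4) shrinks it
 * to (10, 6); a middle range like (cpos 12, len 4) must have been
 * split by the caller first.
 */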
5424 
5425 int ocfs2_remove_extent(handle_t *handle,
5426 			struct ocfs2_extent_tree *et,
5427 			u32 cpos, u32 len,
5428 			struct ocfs2_alloc_context *meta_ac,
5429 			struct ocfs2_cached_dealloc_ctxt *dealloc)
5430 {
5431 	int ret, index;
5432 	u32 rec_range, trunc_range;
5433 	struct ocfs2_extent_rec *rec;
5434 	struct ocfs2_extent_list *el;
5435 	struct ocfs2_path *path = NULL;
5436 
5437 	/*
5438 	 * XXX: Why are we truncating to 0 instead of wherever this
5439 	 * affects us?
5440 	 */
5441 	ocfs2_et_extent_map_truncate(et, 0);
5442 
5443 	path = ocfs2_new_path_from_et(et);
5444 	if (!path) {
5445 		ret = -ENOMEM;
5446 		mlog_errno(ret);
5447 		goto out;
5448 	}
5449 
5450 	ret = ocfs2_find_path(et->et_ci, path, cpos);
5451 	if (ret) {
5452 		mlog_errno(ret);
5453 		goto out;
5454 	}
5455 
5456 	el = path_leaf_el(path);
5457 	index = ocfs2_search_extent_list(el, cpos);
5458 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5459 		ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5460 			    "Owner %llu has an extent at cpos %u which can no "
5461 			    "longer be found.\n",
5462 			    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5463 			    cpos);
5464 		ret = -EROFS;
5465 		goto out;
5466 	}
5467 
5468 	/*
5469 	 * We have 3 cases of extent removal:
5470 	 *   1) Range covers the entire extent rec
5471 	 *   2) Range begins or ends on one edge of the extent rec
5472 	 *   3) Range is in the middle of the extent rec (no shared edges)
5473 	 *
5474 	 * For case 1 we remove the extent rec and left rotate to
5475 	 * fill the hole.
5476 	 *
5477 	 * For case 2 we just shrink the existing extent rec, with a
5478 	 * tree update if the shrinking edge is also the edge of an
5479 	 * extent block.
5480 	 *
5481 	 * For case 3 we do a right split to turn the extent rec into
5482 	 * something case 2 can handle.
5483 	 */
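	/*
	 * Illustrative only: suppose the record at 'index' covers
	 * cpos 100 for 8 clusters, i.e. the half-open range [100, 108).
	 *
	 *   - remove (cpos 100, len 8): case 1, the whole record goes away
	 *   - remove (cpos 100, len 3) or (cpos 105, len 3): case 2, the
	 *     record shrinks from an edge
	 *   - remove (cpos 102, len 3): case 3, split at trunc_range (105)
	 *     first; the re-found record [100, 105) then ends on the
	 *     removed range's edge and case 2 logic applies
	 */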
5484 	rec = &el->l_recs[index];
5485 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5486 	trunc_range = cpos + len;
5487 
5488 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5489 
5490 	mlog(0, "Owner %llu, remove (cpos %u, len %u). Existing index %d "
5491 	     "(cpos %u, len %u)\n",
5492 	     (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5493 	     cpos, len, index,
5494 	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5495 
5496 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5497 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5498 					 cpos, len);
5499 		if (ret) {
5500 			mlog_errno(ret);
5501 			goto out;
5502 		}
5503 	} else {
5504 		ret = ocfs2_split_tree(handle, et, path, index,
5505 				       trunc_range, meta_ac);
5506 		if (ret) {
5507 			mlog_errno(ret);
5508 			goto out;
5509 		}
5510 
5511 		/*
5512 		 * The split could have manipulated the tree enough to
5513 		 * move the record location, so we have to look for it again.
5514 		 */
5515 		ocfs2_reinit_path(path, 1);
5516 
5517 		ret = ocfs2_find_path(et->et_ci, path, cpos);
5518 		if (ret) {
5519 			mlog_errno(ret);
5520 			goto out;
5521 		}
5522 
5523 		el = path_leaf_el(path);
5524 		index = ocfs2_search_extent_list(el, cpos);
5525 		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5526 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5527 				    "Owner %llu: split at cpos %u lost record.",
5528 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5529 				    cpos);
5530 			ret = -EROFS;
5531 			goto out;
5532 		}
5533 
5534 		/*
5535 		 * Double check our values here. If anything is fishy,
5536 		 * it's easier to catch it at the top level.
5537 		 */
5538 		rec = &el->l_recs[index];
5539 		rec_range = le32_to_cpu(rec->e_cpos) +
5540 			ocfs2_rec_clusters(el, rec);
5541 		if (rec_range != trunc_range) {
5542 			ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci),
5543 				    "Owner %llu: error after split at cpos %u "
5544 				    "trunc len %u, existing record is (%u,%u)",
5545 				    (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci),
5546 				    cpos, len, le32_to_cpu(rec->e_cpos),
5547 				    ocfs2_rec_clusters(el, rec));
5548 			ret = -EROFS;
5549 			goto out;
5550 		}
5551 
5552 		ret = ocfs2_truncate_rec(handle, et, path, index, dealloc,
5553 					 cpos, len);
5554 		if (ret) {
5555 			mlog_errno(ret);
5556 			goto out;
5557 		}
5558 	}
5559 
5560 out:
5561 	ocfs2_free_path(path);
5562 	return ret;
5563 }
5564 
5565 int ocfs2_remove_btree_range(struct inode *inode,
5566 			     struct ocfs2_extent_tree *et,
5567 			     u32 cpos, u32 phys_cpos, u32 len,
5568 			     struct ocfs2_cached_dealloc_ctxt *dealloc)
5569 {
5570 	int ret;
5571 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5572 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5573 	struct inode *tl_inode = osb->osb_tl_inode;
5574 	handle_t *handle;
5575 	struct ocfs2_alloc_context *meta_ac = NULL;
5576 
5577 	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5578 	if (ret) {
5579 		mlog_errno(ret);
5580 		return ret;
5581 	}
5582 
5583 	mutex_lock(&tl_inode->i_mutex);
5584 
5585 	if (ocfs2_truncate_log_needs_flush(osb)) {
5586 		ret = __ocfs2_flush_truncate_log(osb);
5587 		if (ret < 0) {
5588 			mlog_errno(ret);
5589 			goto out;
5590 		}
5591 	}
5592 
5593 	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5594 	if (IS_ERR(handle)) {
5595 		ret = PTR_ERR(handle);
5596 		mlog_errno(ret);
5597 		goto out;
5598 	}
5599 
5600 	ret = ocfs2_et_root_journal_access(handle, et,
5601 					   OCFS2_JOURNAL_ACCESS_WRITE);
5602 	if (ret) {
5603 		mlog_errno(ret);
5604 		goto out;
5605 	}
5606 
5607 	vfs_dq_free_space_nodirty(inode,
5608 				  ocfs2_clusters_to_bytes(inode->i_sb, len));
5609 
5610 	ret = ocfs2_remove_extent(handle, et, cpos, len, meta_ac, dealloc);
5611 	if (ret) {
5612 		mlog_errno(ret);
5613 		goto out_commit;
5614 	}
5615 
5616 	ocfs2_et_update_clusters(et, -len);
5617 
5618 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5619 	if (ret) {
5620 		mlog_errno(ret);
5621 		goto out_commit;
5622 	}
5623 
5624 	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5625 	if (ret)
5626 		mlog_errno(ret);
5627 
5628 out_commit:
5629 	ocfs2_commit_trans(osb, handle);
5630 out:
5631 	mutex_unlock(&tl_inode->i_mutex);
5632 
5633 	if (meta_ac)
5634 		ocfs2_free_alloc_context(meta_ac);
5635 
5636 	return ret;
5637 }
5638 
5639 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5640 {
5641 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5642 	struct ocfs2_dinode *di;
5643 	struct ocfs2_truncate_log *tl;
5644 
5645 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5646 	tl = &di->id2.i_dealloc;
5647 
5648 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5649 			"slot %d, invalid truncate log parameters: used = "
5650 			"%u, count = %u\n", osb->slot_num,
5651 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5652 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5653 }
5654 
5655 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5656 					   unsigned int new_start)
5657 {
5658 	unsigned int tail_index;
5659 	unsigned int current_tail;
5660 
5661 	/* No records, nothing to coalesce */
5662 	if (!le16_to_cpu(tl->tl_used))
5663 		return 0;
5664 
5665 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5666 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5667 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5668 
5669 	return current_tail == new_start;
5670 }
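
/*
 * A small worked example (numbers invented for illustration): if the
 * current tail record is { t_start = 500, t_clusters = 4 }, it covers
 * clusters [500, 504).  An append with new_start == 504 is contiguous,
 * so ocfs2_truncate_log_can_coalesce() returns 1 and
 * ocfs2_truncate_log_append() below folds the new clusters into the
 * existing record instead of consuming a fresh slot.
 */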
5671 
5672 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5673 			      handle_t *handle,
5674 			      u64 start_blk,
5675 			      unsigned int num_clusters)
5676 {
5677 	int status, index;
5678 	unsigned int start_cluster, tl_count;
5679 	struct inode *tl_inode = osb->osb_tl_inode;
5680 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5681 	struct ocfs2_dinode *di;
5682 	struct ocfs2_truncate_log *tl;
5683 
5684 	mlog_entry("start_blk = %llu, num_clusters = %u\n",
5685 		   (unsigned long long)start_blk, num_clusters);
5686 
5687 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5688 
5689 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5690 
5691 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5692 
5693 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5694 	 * by the underlying call to ocfs2_read_inode_block(), so any
5695 	 * corruption is a code bug */
5696 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5697 
5698 	tl = &di->id2.i_dealloc;
5699 	tl_count = le16_to_cpu(tl->tl_count);
5700 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5701 			tl_count == 0,
5702 			"Truncate record count on #%llu invalid: "
5703 			"wanted %u, actual %u\n",
5704 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5705 			ocfs2_truncate_recs_per_inode(osb->sb),
5706 			le16_to_cpu(tl->tl_count));
5707 
5708 	/* Caller should have known to flush before calling us. */
5709 	index = le16_to_cpu(tl->tl_used);
5710 	if (index >= tl_count) {
5711 		status = -ENOSPC;
5712 		mlog_errno(status);
5713 		goto bail;
5714 	}
5715 
5716 	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5717 					 OCFS2_JOURNAL_ACCESS_WRITE);
5718 	if (status < 0) {
5719 		mlog_errno(status);
5720 		goto bail;
5721 	}
5722 
5723 	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5724 	     "%llu (index = %d)\n", num_clusters, start_cluster,
5725 	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5726 
5727 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5728 		/*
5729 		 * Move index back to the record we are coalescing with.
5730 		 * ocfs2_truncate_log_can_coalesce() guarantees it is nonzero.
5731 		 */
5732 		index--;
5733 
5734 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5735 		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5736 		     index, le32_to_cpu(tl->tl_recs[index].t_start),
5737 		     num_clusters);
5738 	} else {
5739 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5740 		tl->tl_used = cpu_to_le16(index + 1);
5741 	}
5742 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5743 
5744 	status = ocfs2_journal_dirty(handle, tl_bh);
5745 	if (status < 0) {
5746 		mlog_errno(status);
5747 		goto bail;
5748 	}
5749 
5750 bail:
5751 	mlog_exit(status);
5752 	return status;
5753 }
5754 
5755 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5756 					 handle_t *handle,
5757 					 struct inode *data_alloc_inode,
5758 					 struct buffer_head *data_alloc_bh)
5759 {
5760 	int status = 0;
5761 	int i;
5762 	unsigned int num_clusters;
5763 	u64 start_blk;
5764 	struct ocfs2_truncate_rec rec;
5765 	struct ocfs2_dinode *di;
5766 	struct ocfs2_truncate_log *tl;
5767 	struct inode *tl_inode = osb->osb_tl_inode;
5768 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5769 
5770 	mlog_entry_void();
5771 
5772 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5773 	tl = &di->id2.i_dealloc;
5774 	i = le16_to_cpu(tl->tl_used) - 1;
5775 	while (i >= 0) {
5776 		/* Caller has given us at least enough credits to
5777 		 * update the truncate log dinode */
5778 		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5779 						 OCFS2_JOURNAL_ACCESS_WRITE);
5780 		if (status < 0) {
5781 			mlog_errno(status);
5782 			goto bail;
5783 		}
5784 
5785 		tl->tl_used = cpu_to_le16(i);
5786 
5787 		status = ocfs2_journal_dirty(handle, tl_bh);
5788 		if (status < 0) {
5789 			mlog_errno(status);
5790 			goto bail;
5791 		}
5792 
5793 		/* TODO: Perhaps we can calculate the bulk of the
5794 		 * credits up front rather than extending like
5795 		 * this. */
5796 		status = ocfs2_extend_trans(handle,
5797 					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5798 		if (status < 0) {
5799 			mlog_errno(status);
5800 			goto bail;
5801 		}
5802 
5803 		rec = tl->tl_recs[i];
5804 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5805 						    le32_to_cpu(rec.t_start));
5806 		num_clusters = le32_to_cpu(rec.t_clusters);
5807 
5808 		/* if start_blk is not set, we ignore the record as
5809 		 * invalid. */
5810 		if (start_blk) {
5811 			mlog(0, "free record %d, start = %u, clusters = %u\n",
5812 			     i, le32_to_cpu(rec.t_start), num_clusters);
5813 
5814 			status = ocfs2_free_clusters(handle, data_alloc_inode,
5815 						     data_alloc_bh, start_blk,
5816 						     num_clusters);
5817 			if (status < 0) {
5818 				mlog_errno(status);
5819 				goto bail;
5820 			}
5821 		}
5822 		i--;
5823 	}
5824 
5825 bail:
5826 	mlog_exit(status);
5827 	return status;
5828 }
5829 
5830 /* Expects you to already be holding tl_inode->i_mutex */
5831 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5832 {
5833 	int status;
5834 	unsigned int num_to_flush;
5835 	handle_t *handle;
5836 	struct inode *tl_inode = osb->osb_tl_inode;
5837 	struct inode *data_alloc_inode = NULL;
5838 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5839 	struct buffer_head *data_alloc_bh = NULL;
5840 	struct ocfs2_dinode *di;
5841 	struct ocfs2_truncate_log *tl;
5842 
5843 	mlog_entry_void();
5844 
5845 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5846 
5847 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5848 
5849 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5850 	 * by the underlying call to ocfs2_read_inode_block(), so any
5851 	 * corruption is a code bug */
5852 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5853 
5854 	tl = &di->id2.i_dealloc;
5855 	num_to_flush = le16_to_cpu(tl->tl_used);
5856 	mlog(0, "Flush %u records from truncate log #%llu\n",
5857 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5858 	if (!num_to_flush) {
5859 		status = 0;
5860 		goto out;
5861 	}
5862 
5863 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
5864 						       GLOBAL_BITMAP_SYSTEM_INODE,
5865 						       OCFS2_INVALID_SLOT);
5866 	if (!data_alloc_inode) {
5867 		status = -EINVAL;
5868 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
5869 		goto out;
5870 	}
5871 
5872 	mutex_lock(&data_alloc_inode->i_mutex);
5873 
5874 	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5875 	if (status < 0) {
5876 		mlog_errno(status);
5877 		goto out_mutex;
5878 	}
5879 
5880 	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5881 	if (IS_ERR(handle)) {
5882 		status = PTR_ERR(handle);
5883 		mlog_errno(status);
5884 		goto out_unlock;
5885 	}
5886 
5887 	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5888 					       data_alloc_bh);
5889 	if (status < 0)
5890 		mlog_errno(status);
5891 
5892 	ocfs2_commit_trans(osb, handle);
5893 
5894 out_unlock:
5895 	brelse(data_alloc_bh);
5896 	ocfs2_inode_unlock(data_alloc_inode, 1);
5897 
5898 out_mutex:
5899 	mutex_unlock(&data_alloc_inode->i_mutex);
5900 	iput(data_alloc_inode);
5901 
5902 out:
5903 	mlog_exit(status);
5904 	return status;
5905 }
5906 
5907 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5908 {
5909 	int status;
5910 	struct inode *tl_inode = osb->osb_tl_inode;
5911 
5912 	mutex_lock(&tl_inode->i_mutex);
5913 	status = __ocfs2_flush_truncate_log(osb);
5914 	mutex_unlock(&tl_inode->i_mutex);
5915 
5916 	return status;
5917 }
5918 
5919 static void ocfs2_truncate_log_worker(struct work_struct *work)
5920 {
5921 	int status;
5922 	struct ocfs2_super *osb =
5923 		container_of(work, struct ocfs2_super,
5924 			     osb_truncate_log_wq.work);
5925 
5926 	mlog_entry_void();
5927 
5928 	status = ocfs2_flush_truncate_log(osb);
5929 	if (status < 0)
5930 		mlog_errno(status);
5931 	else
5932 		ocfs2_init_inode_steal_slot(osb);
5933 
5934 	mlog_exit(status);
5935 }
5936 
5937 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5938 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5939 				       int cancel)
5940 {
5941 	if (osb->osb_tl_inode) {
5942 		/* We want to push off log flushes while truncates are
5943 		 * still running. */
5944 		if (cancel)
5945 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5946 
5947 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5948 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5949 	}
5950 }
5951 
5952 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5953 				       int slot_num,
5954 				       struct inode **tl_inode,
5955 				       struct buffer_head **tl_bh)
5956 {
5957 	int status;
5958 	struct inode *inode = NULL;
5959 	struct buffer_head *bh = NULL;
5960 
5961 	inode = ocfs2_get_system_file_inode(osb,
5962 					   TRUNCATE_LOG_SYSTEM_INODE,
5963 					   slot_num);
5964 	if (!inode) {
5965 		status = -EINVAL;
5966 		mlog(ML_ERROR, "Could not load truncate log inode!\n");
5967 		goto bail;
5968 	}
5969 
5970 	status = ocfs2_read_inode_block(inode, &bh);
5971 	if (status < 0) {
5972 		iput(inode);
5973 		mlog_errno(status);
5974 		goto bail;
5975 	}
5976 
5977 	*tl_inode = inode;
5978 	*tl_bh    = bh;
5979 bail:
5980 	mlog_exit(status);
5981 	return status;
5982 }
5983 
5984 /* Called during the 1st stage of node recovery. We stamp a clean
5985  * truncate log and pass back a copy for processing later. If the
5986  * truncate log does not require processing, *tl_copy is set to
5987  * NULL. */
5988 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5989 				      int slot_num,
5990 				      struct ocfs2_dinode **tl_copy)
5991 {
5992 	int status;
5993 	struct inode *tl_inode = NULL;
5994 	struct buffer_head *tl_bh = NULL;
5995 	struct ocfs2_dinode *di;
5996 	struct ocfs2_truncate_log *tl;
5997 
5998 	*tl_copy = NULL;
5999 
6000 	mlog(0, "recover truncate log from slot %d\n", slot_num);
6001 
6002 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
6003 	if (status < 0) {
6004 		mlog_errno(status);
6005 		goto bail;
6006 	}
6007 
6008 	di = (struct ocfs2_dinode *) tl_bh->b_data;
6009 
6010 	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
6011 	 * validated by the underlying call to ocfs2_read_inode_block(),
6012 	 * so any corruption is a code bug */
6013 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
6014 
6015 	tl = &di->id2.i_dealloc;
6016 	if (le16_to_cpu(tl->tl_used)) {
6017 		mlog(0, "We'll have %u logs to recover\n",
6018 		     le16_to_cpu(tl->tl_used));
6019 
6020 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6021 		if (!(*tl_copy)) {
6022 			status = -ENOMEM;
6023 			mlog_errno(status);
6024 			goto bail;
6025 		}
6026 
6027 		/* Assuming the write-out below goes well, this copy
6028 		 * will be passed back to recovery for processing. */
6029 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6030 
6031 		/* All we need to do to clear the truncate log is set
6032 		 * tl_used. */
6033 		tl->tl_used = 0;
6034 
6035 		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6036 		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6037 		if (status < 0) {
6038 			mlog_errno(status);
6039 			goto bail;
6040 		}
6041 	}
6042 
6043 bail:
6044 	if (tl_inode)
6045 		iput(tl_inode);
6046 	brelse(tl_bh);
6047 
6048 	if (status < 0 && (*tl_copy)) {
6049 		kfree(*tl_copy);
6050 		*tl_copy = NULL;
6051 	}
6052 
6053 	mlog_exit(status);
6054 	return status;
6055 }
6056 
6057 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6058 					 struct ocfs2_dinode *tl_copy)
6059 {
6060 	int status = 0;
6061 	int i;
6062 	unsigned int clusters, num_recs, start_cluster;
6063 	u64 start_blk;
6064 	handle_t *handle;
6065 	struct inode *tl_inode = osb->osb_tl_inode;
6066 	struct ocfs2_truncate_log *tl;
6067 
6068 	mlog_entry_void();
6069 
6070 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6071 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6072 		return -EINVAL;
6073 	}
6074 
6075 	tl = &tl_copy->id2.i_dealloc;
6076 	num_recs = le16_to_cpu(tl->tl_used);
6077 	mlog(0, "cleanup %u records from %llu\n", num_recs,
6078 	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
6079 
6080 	mutex_lock(&tl_inode->i_mutex);
6081 	for(i = 0; i < num_recs; i++) {
6082 		if (ocfs2_truncate_log_needs_flush(osb)) {
6083 			status = __ocfs2_flush_truncate_log(osb);
6084 			if (status < 0) {
6085 				mlog_errno(status);
6086 				goto bail_up;
6087 			}
6088 		}
6089 
6090 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6091 		if (IS_ERR(handle)) {
6092 			status = PTR_ERR(handle);
6093 			mlog_errno(status);
6094 			goto bail_up;
6095 		}
6096 
6097 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6098 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6099 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6100 
6101 		status = ocfs2_truncate_log_append(osb, handle,
6102 						   start_blk, clusters);
6103 		ocfs2_commit_trans(osb, handle);
6104 		if (status < 0) {
6105 			mlog_errno(status);
6106 			goto bail_up;
6107 		}
6108 	}
6109 
6110 bail_up:
6111 	mutex_unlock(&tl_inode->i_mutex);
6112 
6113 	mlog_exit(status);
6114 	return status;
6115 }
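
/*
 * A minimal sketch of how the two recovery stages above fit together.
 * The surrounding recovery code drives this; the snippet is
 * illustrative only, with error handling and locking elided:
 *
 *	struct ocfs2_dinode *tl_copy;
 *
 *	// Stage 1: stamp the dead node's log clean, keep a copy.
 *	status = ocfs2_begin_truncate_log_recovery(osb, slot, &tl_copy);
 *
 *	// Stage 2 (later, once the journal is replayed): re-queue the
 *	// copied records into our own truncate log.
 *	if (!status && tl_copy) {
 *		status = ocfs2_complete_truncate_log_recovery(osb, tl_copy);
 *		kfree(tl_copy);
 *	}
 */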
6116 
6117 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6118 {
6119 	int status;
6120 	struct inode *tl_inode = osb->osb_tl_inode;
6121 
6122 	mlog_entry_void();
6123 
6124 	if (tl_inode) {
6125 		cancel_delayed_work(&osb->osb_truncate_log_wq);
6126 		flush_workqueue(ocfs2_wq);
6127 
6128 		status = ocfs2_flush_truncate_log(osb);
6129 		if (status < 0)
6130 			mlog_errno(status);
6131 
6132 		brelse(osb->osb_tl_bh);
6133 		iput(osb->osb_tl_inode);
6134 	}
6135 
6136 	mlog_exit_void();
6137 }
6138 
6139 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6140 {
6141 	int status;
6142 	struct inode *tl_inode = NULL;
6143 	struct buffer_head *tl_bh = NULL;
6144 
6145 	mlog_entry_void();
6146 
6147 	status = ocfs2_get_truncate_log_info(osb,
6148 					     osb->slot_num,
6149 					     &tl_inode,
6150 					     &tl_bh);
6151 	if (status < 0)
6152 		mlog_errno(status);
6153 
6154 	/* ocfs2_truncate_log_shutdown keys on the existence of
6155 	 * osb->osb_tl_inode so we don't set any of the osb variables
6156 	 * until we're sure all is well. */
6157 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6158 			  ocfs2_truncate_log_worker);
6159 	osb->osb_tl_bh    = tl_bh;
6160 	osb->osb_tl_inode = tl_inode;
6161 
6162 	mlog_exit(status);
6163 	return status;
6164 }
6165 
6166 /*
6167  * Delayed de-allocation of suballocator blocks.
6168  *
6169  * Some sets of block de-allocations might involve multiple suballocator inodes.
6170  *
6171  * The locking for this can get extremely complicated, especially when
6172  * the suballocator inodes to delete from aren't known until deep
6173  * within an unrelated codepath.
6174  *
6175  * ocfs2_extent_block structures are a good example of this - an inode
6176  * btree could have been grown by any number of nodes each allocating
6177  * out of their own suballoc inode.
6178  *
6179  * These structures allow the delay of block de-allocation until a
6180  * later time, when locking of multiple cluster inodes won't cause
6181  * deadlock.
6182  */
6183 
6184 /*
6185  * Describe a single bit freed from a suballocator.  For the block
6186  * suballocators, it represents one block.  For the global cluster
6187  * allocator, it represents a run of clusters, and free_bit holds
6188  * the number of clusters.
6189  */
6190 struct ocfs2_cached_block_free {
6191 	struct ocfs2_cached_block_free		*free_next;
6192 	u64					free_blk;
6193 	unsigned int				free_bit;
6194 };
6195 
6196 struct ocfs2_per_slot_free_list {
6197 	struct ocfs2_per_slot_free_list		*f_next_suballocator;
6198 	int					f_inode_type;
6199 	int					f_slot;
6200 	struct ocfs2_cached_block_free		*f_first;
6201 };
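
/*
 * Shape of a populated dealloc context (an illustration, not code):
 *
 *   ctxt->c_first_suballocator -> [ fl: type=EXTENT_ALLOC, slot=0 ]
 *                                   f_first -> blk A -> blk B -> NULL
 *                                   f_next_suballocator
 *                                        |
 *                                        v
 *                                 [ fl: type=EXTENT_ALLOC, slot=2 ]
 *                                   f_first -> blk C -> NULL
 *   ctxt->c_global_allocator -> cluster run X -> cluster run Y -> NULL
 *
 * Each per-slot list maps to exactly one suballocator inode, so
 * ocfs2_free_cached_blocks() only ever locks a single inode at a time.
 */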
6202 
6203 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6204 				    int sysfile_type,
6205 				    int slot,
6206 				    struct ocfs2_cached_block_free *head)
6207 {
6208 	int ret;
6209 	u64 bg_blkno;
6210 	handle_t *handle;
6211 	struct inode *inode;
6212 	struct buffer_head *di_bh = NULL;
6213 	struct ocfs2_cached_block_free *tmp;
6214 
6215 	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6216 	if (!inode) {
6217 		ret = -EINVAL;
6218 		mlog_errno(ret);
6219 		goto out;
6220 	}
6221 
6222 	mutex_lock(&inode->i_mutex);
6223 
6224 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
6225 	if (ret) {
6226 		mlog_errno(ret);
6227 		goto out_mutex;
6228 	}
6229 
6230 	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6231 	if (IS_ERR(handle)) {
6232 		ret = PTR_ERR(handle);
6233 		mlog_errno(ret);
6234 		goto out_unlock;
6235 	}
6236 
6237 	while (head) {
6238 		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6239 						      head->free_bit);
6240 		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6241 		     head->free_bit, (unsigned long long)head->free_blk);
6242 
6243 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6244 					       head->free_bit, bg_blkno, 1);
6245 		if (ret) {
6246 			mlog_errno(ret);
6247 			goto out_journal;
6248 		}
6249 
6250 		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6251 		if (ret) {
6252 			mlog_errno(ret);
6253 			goto out_journal;
6254 		}
6255 
6256 		tmp = head;
6257 		head = head->free_next;
6258 		kfree(tmp);
6259 	}
6260 
6261 out_journal:
6262 	ocfs2_commit_trans(osb, handle);
6263 
6264 out_unlock:
6265 	ocfs2_inode_unlock(inode, 1);
6266 	brelse(di_bh);
6267 out_mutex:
6268 	mutex_unlock(&inode->i_mutex);
6269 	iput(inode);
6270 out:
6271 	while(head) {
6272 		/* Premature exit may have left some dangling items. */
6273 		tmp = head;
6274 		head = head->free_next;
6275 		kfree(tmp);
6276 	}
6277 
6278 	return ret;
6279 }
6280 
6281 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6282 				u64 blkno, unsigned int bit)
6283 {
6284 	int ret = 0;
6285 	struct ocfs2_cached_block_free *item;
6286 
6287 	item = kmalloc(sizeof(*item), GFP_NOFS);
6288 	if (item == NULL) {
6289 		ret = -ENOMEM;
6290 		mlog_errno(ret);
6291 		return ret;
6292 	}
6293 
6294 	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6295 	     bit, (unsigned long long)blkno);
6296 
6297 	item->free_blk = blkno;
6298 	item->free_bit = bit;
6299 	item->free_next = ctxt->c_global_allocator;
6300 
6301 	ctxt->c_global_allocator = item;
6302 	return ret;
6303 }
6304 
6305 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6306 				      struct ocfs2_cached_block_free *head)
6307 {
6308 	struct ocfs2_cached_block_free *tmp;
6309 	struct inode *tl_inode = osb->osb_tl_inode;
6310 	handle_t *handle;
6311 	int ret = 0;
6312 
6313 	mutex_lock(&tl_inode->i_mutex);
6314 
6315 	while (head) {
6316 		if (ocfs2_truncate_log_needs_flush(osb)) {
6317 			ret = __ocfs2_flush_truncate_log(osb);
6318 			if (ret < 0) {
6319 				mlog_errno(ret);
6320 				break;
6321 			}
6322 		}
6323 
6324 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6325 		if (IS_ERR(handle)) {
6326 			ret = PTR_ERR(handle);
6327 			mlog_errno(ret);
6328 			break;
6329 		}
6330 
6331 		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6332 						head->free_bit);
6333 
6334 		ocfs2_commit_trans(osb, handle);
6335 		tmp = head;
6336 		head = head->free_next;
6337 		kfree(tmp);
6338 
6339 		if (ret < 0) {
6340 			mlog_errno(ret);
6341 			break;
6342 		}
6343 	}
6344 
6345 	mutex_unlock(&tl_inode->i_mutex);
6346 
6347 	while (head) {
6348 		/* Premature exit may have left some dangling items. */
6349 		tmp = head;
6350 		head = head->free_next;
6351 		kfree(tmp);
6352 	}
6353 
6354 	return ret;
6355 }
6356 
6357 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6358 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
6359 {
6360 	int ret = 0, ret2;
6361 	struct ocfs2_per_slot_free_list *fl;
6362 
6363 	if (!ctxt)
6364 		return 0;
6365 
6366 	while (ctxt->c_first_suballocator) {
6367 		fl = ctxt->c_first_suballocator;
6368 
6369 		if (fl->f_first) {
6370 			mlog(0, "Free items: (type %u, slot %d)\n",
6371 			     fl->f_inode_type, fl->f_slot);
6372 			ret2 = ocfs2_free_cached_blocks(osb,
6373 							fl->f_inode_type,
6374 							fl->f_slot,
6375 							fl->f_first);
6376 			if (ret2)
6377 				mlog_errno(ret2);
6378 			if (!ret)
6379 				ret = ret2;
6380 		}
6381 
6382 		ctxt->c_first_suballocator = fl->f_next_suballocator;
6383 		kfree(fl);
6384 	}
6385 
6386 	if (ctxt->c_global_allocator) {
6387 		ret2 = ocfs2_free_cached_clusters(osb,
6388 						  ctxt->c_global_allocator);
6389 		if (ret2)
6390 			mlog_errno(ret2);
6391 		if (!ret)
6392 			ret = ret2;
6393 
6394 		ctxt->c_global_allocator = NULL;
6395 	}
6396 
6397 	return ret;
6398 }
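
/*
 * Typical caller pattern for this machinery, as a hedged sketch (the
 * real call sites live in the truncate and xattr paths):
 *
 *	struct ocfs2_cached_dealloc_ctxt dealloc;
 *
 *	ocfs2_init_dealloc_ctxt(&dealloc);
 *	// ... tree surgery under cluster locks; freed metadata is
 *	// queued via ocfs2_cache_extent_block_free() and friends ...
 *	ocfs2_run_deallocs(osb, &dealloc);	// after locks are dropped
 */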
6399 
6400 static struct ocfs2_per_slot_free_list *
6401 ocfs2_find_per_slot_free_list(int type,
6402 			      int slot,
6403 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
6404 {
6405 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6406 
6407 	while (fl) {
6408 		if (fl->f_inode_type == type && fl->f_slot == slot)
6409 			return fl;
6410 
6411 		fl = fl->f_next_suballocator;
6412 	}
6413 
6414 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
6415 	if (fl) {
6416 		fl->f_inode_type = type;
6417 		fl->f_slot = slot;
6418 		fl->f_first = NULL;
6419 		fl->f_next_suballocator = ctxt->c_first_suballocator;
6420 
6421 		ctxt->c_first_suballocator = fl;
6422 	}
6423 	return fl;
6424 }
6425 
6426 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6427 				     int type, int slot, u64 blkno,
6428 				     unsigned int bit)
6429 {
6430 	int ret;
6431 	struct ocfs2_per_slot_free_list *fl;
6432 	struct ocfs2_cached_block_free *item;
6433 
6434 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6435 	if (fl == NULL) {
6436 		ret = -ENOMEM;
6437 		mlog_errno(ret);
6438 		goto out;
6439 	}
6440 
6441 	item = kmalloc(sizeof(*item), GFP_NOFS);
6442 	if (item == NULL) {
6443 		ret = -ENOMEM;
6444 		mlog_errno(ret);
6445 		goto out;
6446 	}
6447 
6448 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6449 	     type, slot, bit, (unsigned long long)blkno);
6450 
6451 	item->free_blk = blkno;
6452 	item->free_bit = bit;
6453 	item->free_next = fl->f_first;
6454 
6455 	fl->f_first = item;
6456 
6457 	ret = 0;
6458 out:
6459 	return ret;
6460 }
6461 
6462 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6463 					 struct ocfs2_extent_block *eb)
6464 {
6465 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6466 					 le16_to_cpu(eb->h_suballoc_slot),
6467 					 le64_to_cpu(eb->h_blkno),
6468 					 le16_to_cpu(eb->h_suballoc_bit));
6469 }
6470 
6471 /* This function will figure out whether the current last extent
6472  * block will be deleted, and if it will, what the new last extent
6473  * block will be so we can update its h_next_leaf_blk field, as well
6474  * as the dinode's i_last_eb_blk */
6475 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6476 				       unsigned int clusters_to_del,
6477 				       struct ocfs2_path *path,
6478 				       struct buffer_head **new_last_eb)
6479 {
6480 	int next_free, ret = 0;
6481 	u32 cpos;
6482 	struct ocfs2_extent_rec *rec;
6483 	struct ocfs2_extent_block *eb;
6484 	struct ocfs2_extent_list *el;
6485 	struct buffer_head *bh = NULL;
6486 
6487 	*new_last_eb = NULL;
6488 
6489 	/* we have no tree, so of course, no last_eb. */
6490 	if (!path->p_tree_depth)
6491 		goto out;
6492 
6493 	/* Truncate to zero is a special case - it forces tree_depth to 0
6494 	 * regardless of its current value.  */
6495 	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6496 		goto out;
6497 
6498 	el = path_leaf_el(path);
6499 	BUG_ON(!el->l_next_free_rec);
6500 
6501 	/*
6502 	 * Make sure that this extent list will actually be empty
6503 	 * after we clear away the data. We can shortcut out if
6504 	 * there's more than one non-empty extent in the
6505 	 * list. Otherwise, a check of the remaining extent is
6506 	 * necessary.
6507 	 */
6508 	next_free = le16_to_cpu(el->l_next_free_rec);
6509 	rec = NULL;
6510 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6511 		if (next_free > 2)
6512 			goto out;
6513 
6514 		/* We may have a valid extent in index 1, check it. */
6515 		if (next_free == 2)
6516 			rec = &el->l_recs[1];
6517 
6518 		/*
6519 		 * Fall through - no more nonempty extents, so we want
6520 		 * to delete this leaf.
6521 		 */
6522 	} else {
6523 		if (next_free > 1)
6524 			goto out;
6525 
6526 		rec = &el->l_recs[0];
6527 	}
6528 
6529 	if (rec) {
6530 		/*
6531 		 * Check if we'll only be trimming off the end of this
6532 		 * record.
6533 		 */
6534 		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6535 			goto out;
6536 	}
6537 
6538 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6539 	if (ret) {
6540 		mlog_errno(ret);
6541 		goto out;
6542 	}
6543 
6544 	ret = ocfs2_find_leaf(INODE_CACHE(inode), path_root_el(path), cpos, &bh);
6545 	if (ret) {
6546 		mlog_errno(ret);
6547 		goto out;
6548 	}
6549 
6550 	eb = (struct ocfs2_extent_block *) bh->b_data;
6551 	el = &eb->h_list;
6552 
6553 	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6554 	 * Any corruption is a code bug. */
6555 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6556 
6557 	*new_last_eb = bh;
6558 	get_bh(*new_last_eb);
6559 	mlog(0, "returning block %llu, (cpos: %u)\n",
6560 	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6561 out:
6562 	brelse(bh);
6563 
6564 	return ret;
6565 }
6566 
6567 /*
6568  * Trim some clusters off the rightmost edge of a tree. Only called
6569  * during truncate.
6570  *
6571  * The caller needs to:
6572  *   - start journaling of each path component.
6573  *   - compute and fully set up any new last ext block
6574  */
6575 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6576 			   handle_t *handle, struct ocfs2_truncate_context *tc,
6577 			   u32 clusters_to_del, u64 *delete_start)
6578 {
6579 	int ret, i, index = path->p_tree_depth;
6580 	u32 new_edge = 0;
6581 	u64 deleted_eb = 0;
6582 	struct buffer_head *bh;
6583 	struct ocfs2_extent_list *el;
6584 	struct ocfs2_extent_rec *rec;
6585 
6586 	*delete_start = 0;
6587 
6588 	while (index >= 0) {
6589 		bh = path->p_node[index].bh;
6590 		el = path->p_node[index].el;
6591 
6592 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
6593 		     index,  (unsigned long long)bh->b_blocknr);
6594 
6595 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6596 
6597 		if (index !=
6598 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6599 			ocfs2_error(inode->i_sb,
6600 				    "Inode %lu has invalid ext. block %llu",
6601 				    inode->i_ino,
6602 				    (unsigned long long)bh->b_blocknr);
6603 			ret = -EROFS;
6604 			goto out;
6605 		}
6606 
6607 find_tail_record:
6608 		i = le16_to_cpu(el->l_next_free_rec) - 1;
6609 		rec = &el->l_recs[i];
6610 
6611 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6612 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6613 		     ocfs2_rec_clusters(el, rec),
6614 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6615 		     le16_to_cpu(el->l_next_free_rec));
6616 
6617 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6618 
6619 		if (le16_to_cpu(el->l_tree_depth) == 0) {
6620 			/*
6621 			 * If the leaf block contains a single empty
6622 			 * extent and no other records, we can just remove
6623 			 * the block.
6624 			 */
6625 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
6626 				memset(rec, 0,
6627 				       sizeof(struct ocfs2_extent_rec));
6628 				el->l_next_free_rec = cpu_to_le16(0);
6629 
6630 				goto delete;
6631 			}
6632 
6633 			/*
6634 			 * Remove any empty extents by shifting things
6635 			 * left. That should make life much easier on
6636 			 * the code below. This condition is rare
6637 			 * enough that we shouldn't see a performance
6638 			 * hit.
6639 			 */
6640 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6641 				le16_add_cpu(&el->l_next_free_rec, -1);
6642 
6643 				for(i = 0;
6644 				    i < le16_to_cpu(el->l_next_free_rec); i++)
6645 					el->l_recs[i] = el->l_recs[i + 1];
6646 
6647 				memset(&el->l_recs[i], 0,
6648 				       sizeof(struct ocfs2_extent_rec));
6649 
6650 				/*
6651 				 * We've modified our extent list. The
6652 				 * simplest way to handle this change
6653 				 * is to begin the search from the
6654 				 * start again.
6655 				 */
6656 				goto find_tail_record;
6657 			}
6658 
6659 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6660 
6661 			/*
6662 			 * We'll use "new_edge" on our way back up the
6663 			 * tree to know what our rightmost cpos is.
6664 			 */
6665 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
6666 			new_edge += le32_to_cpu(rec->e_cpos);
6667 
6668 			/*
6669 			 * The caller will use this to delete data blocks.
6670 			 */
6671 			*delete_start = le64_to_cpu(rec->e_blkno)
6672 				+ ocfs2_clusters_to_blocks(inode->i_sb,
6673 					le16_to_cpu(rec->e_leaf_clusters));
6674 
6675 			/*
6676 			 * If it's now empty, remove this record.
6677 			 */
6678 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6679 				memset(rec, 0,
6680 				       sizeof(struct ocfs2_extent_rec));
6681 				le16_add_cpu(&el->l_next_free_rec, -1);
6682 			}
6683 		} else {
6684 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6685 				memset(rec, 0,
6686 				       sizeof(struct ocfs2_extent_rec));
6687 				le16_add_cpu(&el->l_next_free_rec, -1);
6688 
6689 				goto delete;
6690 			}
6691 
6692 			/* Can this actually happen? */
6693 			if (le16_to_cpu(el->l_next_free_rec) == 0)
6694 				goto delete;
6695 
6696 			/*
6697 			 * We never actually deleted any clusters
6698 			 * because our leaf was empty. There's no
6699 			 * reason to adjust the rightmost edge then.
6700 			 */
6701 			if (new_edge == 0)
6702 				goto delete;
6703 
6704 			rec->e_int_clusters = cpu_to_le32(new_edge);
6705 			le32_add_cpu(&rec->e_int_clusters,
6706 				     -le32_to_cpu(rec->e_cpos));
6707 
6708 			 /*
6709 			  * A deleted child record should have been
6710 			  * caught above.
6711 			  */
6712 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6713 		}
6714 
6715 delete:
6716 		ret = ocfs2_journal_dirty(handle, bh);
6717 		if (ret) {
6718 			mlog_errno(ret);
6719 			goto out;
6720 		}
6721 
6722 		mlog(0, "extent list container %llu, after: record %d: "
6723 		     "(%u, %u, %llu), next = %u.\n",
6724 		     (unsigned long long)bh->b_blocknr, i,
6725 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6726 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6727 		     le16_to_cpu(el->l_next_free_rec));
6728 
6729 		/*
6730 		 * We must be careful to only attempt delete of an
6731 		 * extent block (and not the root inode block).
6732 		 */
6733 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6734 			struct ocfs2_extent_block *eb =
6735 				(struct ocfs2_extent_block *)bh->b_data;
6736 
6737 			/*
6738 			 * Save this for use when processing the
6739 			 * parent block.
6740 			 */
6741 			deleted_eb = le64_to_cpu(eb->h_blkno);
6742 
6743 			mlog(0, "deleting this extent block.\n");
6744 
6745 			ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6746 
6747 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6748 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6749 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6750 
6751 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6752 			/* An error here is not fatal. */
6753 			if (ret < 0)
6754 				mlog_errno(ret);
6755 		} else {
6756 			deleted_eb = 0;
6757 		}
6758 
6759 		index--;
6760 	}
6761 
6762 	ret = 0;
6763 out:
6764 	return ret;
6765 }
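
/*
 * Worked example for one leaf pass of ocfs2_trim_tree() (numbers are
 * invented for illustration): given a tail record of
 * (e_cpos = 96, e_leaf_clusters = 8, e_blkno = B) and
 * clusters_to_del = 3, the record shrinks to 5 leaf clusters,
 * new_edge becomes 96 + 5 = 101, and *delete_start is set to
 * B + ocfs2_clusters_to_blocks(sb, 5) - the first block the caller
 * may hand to the truncate log.
 */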
6766 
6767 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6768 			     unsigned int clusters_to_del,
6769 			     struct inode *inode,
6770 			     struct buffer_head *fe_bh,
6771 			     handle_t *handle,
6772 			     struct ocfs2_truncate_context *tc,
6773 			     struct ocfs2_path *path)
6774 {
6775 	int status;
6776 	struct ocfs2_dinode *fe;
6777 	struct ocfs2_extent_block *last_eb = NULL;
6778 	struct ocfs2_extent_list *el;
6779 	struct buffer_head *last_eb_bh = NULL;
6780 	u64 delete_blk = 0;
6781 
6782 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
6783 
6784 	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6785 					     path, &last_eb_bh);
6786 	if (status < 0) {
6787 		mlog_errno(status);
6788 		goto bail;
6789 	}
6790 
6791 	/*
6792 	 * Each component will be touched, so we might as well journal
6793 	 * here to avoid having to handle errors later.
6794 	 */
6795 	status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6796 	if (status < 0) {
6797 		mlog_errno(status);
6798 		goto bail;
6799 	}
6800 
6801 	if (last_eb_bh) {
6802 		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6803 						 OCFS2_JOURNAL_ACCESS_WRITE);
6804 		if (status < 0) {
6805 			mlog_errno(status);
6806 			goto bail;
6807 		}
6808 
6809 		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6810 	}
6811 
6812 	el = &(fe->id2.i_list);
6813 
6814 	/*
6815 	 * Lower levels depend on this never happening, but it's best
6816 	 * to check it up here before changing the tree.
6817 	 */
6818 	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6819 		ocfs2_error(inode->i_sb,
6820 			    "Inode %lu has an empty extent record, depth %u\n",
6821 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
6822 		status = -EROFS;
6823 		goto bail;
6824 	}
6825 
6826 	vfs_dq_free_space_nodirty(inode,
6827 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6828 	spin_lock(&OCFS2_I(inode)->ip_lock);
6829 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6830 				      clusters_to_del;
6831 	spin_unlock(&OCFS2_I(inode)->ip_lock);
6832 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6833 	inode->i_blocks = ocfs2_inode_sector_count(inode);
6834 
6835 	status = ocfs2_trim_tree(inode, path, handle, tc,
6836 				 clusters_to_del, &delete_blk);
6837 	if (status) {
6838 		mlog_errno(status);
6839 		goto bail;
6840 	}
6841 
6842 	if (le32_to_cpu(fe->i_clusters) == 0) {
6843 		/* trunc to zero is a special case. */
6844 		el->l_tree_depth = 0;
6845 		fe->i_last_eb_blk = 0;
6846 	} else if (last_eb)
6847 		fe->i_last_eb_blk = last_eb->h_blkno;
6848 
6849 	status = ocfs2_journal_dirty(handle, fe_bh);
6850 	if (status < 0) {
6851 		mlog_errno(status);
6852 		goto bail;
6853 	}
6854 
6855 	if (last_eb) {
6856 		/* If there will be a new last extent block, then by
6857 		 * definition, there cannot be any leaves to the right of
6858 		 * it. */
6859 		last_eb->h_next_leaf_blk = 0;
6860 		status = ocfs2_journal_dirty(handle, last_eb_bh);
6861 		if (status < 0) {
6862 			mlog_errno(status);
6863 			goto bail;
6864 		}
6865 	}
6866 
6867 	if (delete_blk) {
6868 		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6869 						   clusters_to_del);
6870 		if (status < 0) {
6871 			mlog_errno(status);
6872 			goto bail;
6873 		}
6874 	}
6875 	status = 0;
6876 bail:
6877 	brelse(last_eb_bh);
6878 	mlog_exit(status);
6879 	return status;
6880 }
6881 
6882 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6883 {
6884 	set_buffer_uptodate(bh);
6885 	mark_buffer_dirty(bh);
6886 	return 0;
6887 }
6888 
6889 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6890 				     unsigned int from, unsigned int to,
6891 				     struct page *page, int zero, u64 *phys)
6892 {
6893 	int ret, partial = 0;
6894 
6895 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6896 	if (ret)
6897 		mlog_errno(ret);
6898 
6899 	if (zero)
6900 		zero_user_segment(page, from, to);
6901 
6902 	/*
6903 	 * Need to mark the buffers we zeroed as uptodate here,
6904 	 * if they aren't already - ocfs2_map_page_blocks()
6905 	 * might have skipped some.
6906 	 */
6907 	ret = walk_page_buffers(handle, page_buffers(page),
6908 				from, to, &partial,
6909 				ocfs2_zero_func);
6910 	if (ret < 0)
6911 		mlog_errno(ret);
6912 	else if (ocfs2_should_order_data(inode)) {
6913 		ret = ocfs2_jbd2_file_inode(handle, inode);
6914 		if (ret < 0)
6915 			mlog_errno(ret);
6916 	}
6917 
6918 	if (!partial)
6919 		SetPageUptodate(page);
6920 
6921 	flush_dcache_page(page);
6922 }
6923 
6924 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6925 				     loff_t end, struct page **pages,
6926 				     int numpages, u64 phys, handle_t *handle)
6927 {
6928 	int i;
6929 	struct page *page;
6930 	unsigned int from, to = PAGE_CACHE_SIZE;
6931 	struct super_block *sb = inode->i_sb;
6932 
6933 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6934 
6935 	if (numpages == 0)
6936 		goto out;
6937 
6939 	for(i = 0; i < numpages; i++) {
6940 		page = pages[i];
6941 
6942 		from = start & (PAGE_CACHE_SIZE - 1);
6943 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6944 			to = end & (PAGE_CACHE_SIZE - 1);
6945 
6946 		BUG_ON(from > PAGE_CACHE_SIZE);
6947 		BUG_ON(to > PAGE_CACHE_SIZE);
6948 
6949 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6950 					 &phys);
6951 
6952 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6953 	}
6954 out:
6955 	if (pages)
6956 		ocfs2_unlock_and_free_pages(pages, numpages);
6957 }
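
/*
 * Offset arithmetic above, by example (assuming a PAGE_CACHE_SIZE of
 * 4096): for start = 5000 and end = 7000, both land in page index 1,
 * so from = 5000 & 4095 = 904 and to = 7000 & 4095 = 2904 - only
 * bytes [904, 2904) of that page are mapped and zeroed.
 */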
6958 
6959 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6960 				struct page **pages, int *num)
6961 {
6962 	int numpages, ret = 0;
6963 	struct super_block *sb = inode->i_sb;
6964 	struct address_space *mapping = inode->i_mapping;
6965 	unsigned long index;
6966 	loff_t last_page_bytes;
6967 
6968 	BUG_ON(start > end);
6969 
6970 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6971 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6972 
6973 	numpages = 0;
6974 	last_page_bytes = PAGE_ALIGN(end);
6975 	index = start >> PAGE_CACHE_SHIFT;
6976 	do {
6977 		pages[numpages] = grab_cache_page(mapping, index);
6978 		if (!pages[numpages]) {
6979 			ret = -ENOMEM;
6980 			mlog_errno(ret);
6981 			goto out;
6982 		}
6983 
6984 		numpages++;
6985 		index++;
6986 	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6987 
6988 out:
6989 	if (ret != 0) {
6990 		if (pages)
6991 			ocfs2_unlock_and_free_pages(pages, numpages);
6992 		numpages = 0;
6993 	}
6994 
6995 	*num = numpages;
6996 
6997 	return ret;
6998 }
6999 
7000 /*
7001  * Zero the area past i_size but still within an allocated
7002  * cluster. This avoids exposing nonzero data on subsequent file
7003  * extends.
7004  *
7005  * We need to call this before i_size is updated on the inode because
7006  * otherwise block_write_full_page() will skip writeout of pages past
7007  * i_size. The explicit range_start/range_end parameters are passed
7008  * for this reason.
7008  */
7009 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
7010 				  u64 range_start, u64 range_end)
7011 {
7012 	int ret = 0, numpages;
7013 	struct page **pages = NULL;
7014 	u64 phys;
7015 	unsigned int ext_flags;
7016 	struct super_block *sb = inode->i_sb;
7017 
7018 	/*
7019 	 * File systems which don't support sparse files zero on every
7020 	 * extend.
7021 	 */
7022 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
7023 		return 0;
7024 
7025 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
7026 			sizeof(struct page *), GFP_NOFS);
7027 	if (pages == NULL) {
7028 		ret = -ENOMEM;
7029 		mlog_errno(ret);
7030 		goto out;
7031 	}
7032 
7033 	if (range_start == range_end)
7034 		goto out;
7035 
7036 	ret = ocfs2_extent_map_get_blocks(inode,
7037 					  range_start >> sb->s_blocksize_bits,
7038 					  &phys, NULL, &ext_flags);
7039 	if (ret) {
7040 		mlog_errno(ret);
7041 		goto out;
7042 	}
7043 
7044 	/*
7045 	 * Tail is a hole, or is marked unwritten. In either case, we
7046 	 * can count on read and write to return/push zero's.
7047 	 */
7048 	 * can count on read and write to return/push zeros.
7049 		goto out;
7050 
7051 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
7052 				   &numpages);
7053 	if (ret) {
7054 		mlog_errno(ret);
7055 		goto out;
7056 	}
7057 
7058 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
7059 				 numpages, phys, handle);
7060 
7061 	/*
7062 	 * Initiate writeout of the pages we zero'd here. We don't
7063 	 * wait on them - the truncate_inode_pages() call later will
7064 	 * do that for us.
7065 	 */
7066 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
7067 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
7068 	if (ret)
7069 		mlog_errno(ret);
7070 
7071 out:
7072 	kfree(pages);
7074 
7075 	return ret;
7076 }
7077 
7078 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
7079 					     struct ocfs2_dinode *di)
7080 {
7081 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
7082 	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
7083 
7084 	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
7085 		memset(&di->id2, 0, blocksize -
7086 				    offsetof(struct ocfs2_dinode, id2) -
7087 				    xattrsize);
7088 	else
7089 		memset(&di->id2, 0, blocksize -
7090 				    offsetof(struct ocfs2_dinode, id2));
7091 }
7092 
7093 void ocfs2_dinode_new_extent_list(struct inode *inode,
7094 				  struct ocfs2_dinode *di)
7095 {
7096 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7097 	di->id2.i_list.l_tree_depth = 0;
7098 	di->id2.i_list.l_next_free_rec = 0;
7099 	di->id2.i_list.l_count = cpu_to_le16(
7100 		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
7101 }
7102 
7103 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
7104 {
7105 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7106 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7107 
7108 	spin_lock(&oi->ip_lock);
7109 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
7110 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7111 	spin_unlock(&oi->ip_lock);
7112 
7113 	/*
7114 	 * We clear the entire i_data structure here so that all
7115 	 * fields can be properly initialized.
7116 	 */
7117 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7118 
7119 	idata->id_count = cpu_to_le16(
7120 			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
7121 }
7122 
7123 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7124 					 struct buffer_head *di_bh)
7125 {
7126 	int ret, i, has_data, num_pages = 0;
7127 	handle_t *handle;
7128 	u64 uninitialized_var(block);
7129 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7130 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7131 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7132 	struct ocfs2_alloc_context *data_ac = NULL;
7133 	struct page **pages = NULL;
7134 	loff_t end = osb->s_clustersize;
7135 	struct ocfs2_extent_tree et;
7136 	int did_quota = 0;
7137 
7138 	has_data = i_size_read(inode) ? 1 : 0;
7139 
7140 	if (has_data) {
7141 		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
7142 				sizeof(struct page *), GFP_NOFS);
7143 		if (pages == NULL) {
7144 			ret = -ENOMEM;
7145 			mlog_errno(ret);
7146 			goto out;
7147 		}
7148 
7149 		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
7150 		if (ret) {
7151 			mlog_errno(ret);
7152 			goto out;
7153 		}
7154 	}
7155 
7156 	handle = ocfs2_start_trans(osb,
7157 				   ocfs2_inline_to_extents_credits(osb->sb));
7158 	if (IS_ERR(handle)) {
7159 		ret = PTR_ERR(handle);
7160 		mlog_errno(ret);
7161 		goto out_unlock;
7162 	}
7163 
7164 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7165 				      OCFS2_JOURNAL_ACCESS_WRITE);
7166 	if (ret) {
7167 		mlog_errno(ret);
7168 		goto out_commit;
7169 	}
7170 
7171 	if (has_data) {
7172 		u32 bit_off, num;
7173 		unsigned int page_end;
7174 		u64 phys;
7175 
7176 		if (vfs_dq_alloc_space_nodirty(inode,
7177 				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
7178 			ret = -EDQUOT;
7179 			goto out_commit;
7180 		}
7181 		did_quota = 1;
7182 
7183 		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
7184 					   &num);
7185 		if (ret) {
7186 			mlog_errno(ret);
7187 			goto out_commit;
7188 		}
7189 
7190 		/*
7191 		 * Save two copies, one for insert, and one that can
7192 		 * be changed by ocfs2_map_and_dirty_page() below.
7193 		 */
7194 		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7195 
7196 		/*
7197 		 * Non sparse file systems zero on extend, so no need
7198 		 * to do that now.
7199 		 */
7200 		if (!ocfs2_sparse_alloc(osb) &&
7201 		    PAGE_CACHE_SIZE < osb->s_clustersize)
7202 			end = PAGE_CACHE_SIZE;
7203 
7204 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7205 		if (ret) {
7206 			mlog_errno(ret);
7207 			goto out_commit;
7208 		}
7209 
7210 		/*
7211 		 * This should populate the 1st page for us and mark
7212 		 * it up to date.
7213 		 */
7214 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7215 		if (ret) {
7216 			mlog_errno(ret);
7217 			goto out_commit;
7218 		}
7219 
7220 		page_end = PAGE_CACHE_SIZE;
7221 		if (PAGE_CACHE_SIZE > osb->s_clustersize)
7222 			page_end = osb->s_clustersize;
7223 
7224 		for (i = 0; i < num_pages; i++)
7225 			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7226 						 pages[i], i > 0, &phys);
7227 	}
7228 
7229 	spin_lock(&oi->ip_lock);
7230 	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7231 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7232 	spin_unlock(&oi->ip_lock);
7233 
7234 	ocfs2_dinode_new_extent_list(inode, di);
7235 
7236 	ocfs2_journal_dirty(handle, di_bh);
7237 
7238 	if (has_data) {
7239 		/*
7240 		 * An error at this point should be extremely rare. If
7241 		 * this proves to be false, we could always re-build
7242 		 * the in-inode data from our pages.
7243 		 */
7244 		ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh);
7245 		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
7246 		if (ret) {
7247 			mlog_errno(ret);
7248 			goto out_commit;
7249 		}
7250 
7251 		inode->i_blocks = ocfs2_inode_sector_count(inode);
7252 	}
7253 
7254 out_commit:
7255 	if (ret < 0 && did_quota)
7256 		vfs_dq_free_space_nodirty(inode,
7257 					  ocfs2_clusters_to_bytes(osb->sb, 1));
7258 
7259 	ocfs2_commit_trans(osb, handle);
7260 
7261 out_unlock:
7262 	if (data_ac)
7263 		ocfs2_free_alloc_context(data_ac);
7264 
7265 out:
7266 	if (pages) {
7267 		ocfs2_unlock_and_free_pages(pages, num_pages);
7268 		kfree(pages);
7269 	}
7270 
7271 	return ret;
7272 }
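
/*
 * In short, the conversion above is: reserve one cluster, copy the
 * inline bytes into the first page of the page cache, clear id2 and
 * install an empty extent list on the dinode, then insert the new
 * cluster as extent 0.  If the insert were to fail (expected to be
 * extremely rare, per the comment above), the inline data could in
 * principle be rebuilt from the still-uptodate pages.
 */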
7273 
7274 /*
7275  * It is expected that, by the time you call this function,
7276  * inode->i_size and fe->i_size have been adjusted.
7277  *
7278  * WARNING: This will kfree the truncate context
7279  */
7280 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7281 			  struct inode *inode,
7282 			  struct buffer_head *fe_bh,
7283 			  struct ocfs2_truncate_context *tc)
7284 {
7285 	int status, i, credits, tl_sem = 0;
7286 	u32 clusters_to_del, new_highest_cpos, range;
7287 	struct ocfs2_extent_list *el;
7288 	handle_t *handle = NULL;
7289 	struct inode *tl_inode = osb->osb_tl_inode;
7290 	struct ocfs2_path *path = NULL;
7291 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7292 
7293 	mlog_entry_void();
7294 
7295 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7296 						     i_size_read(inode));
7297 
7298 	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7299 			      ocfs2_journal_access_di);
7300 	if (!path) {
7301 		status = -ENOMEM;
7302 		mlog_errno(status);
7303 		goto bail;
7304 	}
7305 
7306 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
7307 
7308 start:
7309 	/*
7310 	 * Check that we still have allocation to delete.
7311 	 */
7312 	if (OCFS2_I(inode)->ip_clusters == 0) {
7313 		status = 0;
7314 		goto bail;
7315 	}
7316 
7317 	/*
7318 	 * Truncate always works against the rightmost tree branch.
7319 	 */
7320 	status = ocfs2_find_path(INODE_CACHE(inode), path, UINT_MAX);
7321 	if (status) {
7322 		mlog_errno(status);
7323 		goto bail;
7324 	}
7325 
7326 	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7327 	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7328 
7329 	/*
7330 	 * By now, el will point to the extent list on the bottom most
7331 	 * portion of this tree. Only the tail record is considered in
7332 	 * each pass.
7333 	 *
7334 	 * We handle the following cases, in order:
7335 	 * - empty extent: delete the remaining branch
7336 	 * - remove the entire record
7337 	 * - remove a partial record
7338 	 * - no record needs to be removed (truncate has completed)
7339 	 */
7340 	el = path_leaf_el(path);
7341 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
7342 		ocfs2_error(inode->i_sb,
7343 			    "Inode %llu has empty extent block at %llu\n",
7344 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7345 			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
7346 		status = -EROFS;
7347 		goto bail;
7348 	}
7349 
7350 	i = le16_to_cpu(el->l_next_free_rec) - 1;
7351 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
7352 		ocfs2_rec_clusters(el, &el->l_recs[i]);
7353 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7354 		clusters_to_del = 0;
7355 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7356 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7357 	} else if (range > new_highest_cpos) {
7358 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7359 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
7360 				  new_highest_cpos;
7361 	} else {
7362 		status = 0;
7363 		goto bail;
7364 	}
7365 
7366 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7367 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7368 
7369 	mutex_lock(&tl_inode->i_mutex);
7370 	tl_sem = 1;
7371 	/* Make sure at least one truncate log record is free for use:
7372 	 * if ocfs2_truncate_log_needs_flush() says the log is full, we
7373 	 * flush to get an empty truncate log.  */
7374 	if (ocfs2_truncate_log_needs_flush(osb)) {
7375 		status = __ocfs2_flush_truncate_log(osb);
7376 		if (status < 0) {
7377 			mlog_errno(status);
7378 			goto bail;
7379 		}
7380 	}
7381 
7382 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7383 						(struct ocfs2_dinode *)fe_bh->b_data,
7384 						el);
7385 	handle = ocfs2_start_trans(osb, credits);
7386 	if (IS_ERR(handle)) {
7387 		status = PTR_ERR(handle);
7388 		handle = NULL;
7389 		mlog_errno(status);
7390 		goto bail;
7391 	}
7392 
7393 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7394 				   tc, path);
7395 	if (status < 0) {
7396 		mlog_errno(status);
7397 		goto bail;
7398 	}
7399 
7400 	mutex_unlock(&tl_inode->i_mutex);
7401 	tl_sem = 0;
7402 
7403 	ocfs2_commit_trans(osb, handle);
7404 	handle = NULL;
7405 
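	/* Reset the path so the next pass finds the new rightmost branch. */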
7406 	ocfs2_reinit_path(path, 1);
7407 
7408 	/*
7409 	 * The check above will catch the case where we've truncated
7410 	 * away all allocation.
7411 	 */
7412 	goto start;
7413 
7414 bail:
7415 
7416 	ocfs2_schedule_truncate_log_flush(osb, 1);
7417 
7418 	if (tl_sem)
7419 		mutex_unlock(&tl_inode->i_mutex);
7420 
7421 	if (handle)
7422 		ocfs2_commit_trans(osb, handle);
7423 
7424 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7425 
7426 	ocfs2_free_path(path);
7427 
7428 	/* This will brelse the cached leaf bh and kfree the context */
7429 	ocfs2_free_truncate_context(tc);
7430 
7431 	mlog_exit(status);
7432 	return status;
7433 }
7434 
7435 /*
7436  * Expects the inode to already be locked.
7437  */
7438 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7439 			   struct inode *inode,
7440 			   struct buffer_head *fe_bh,
7441 			   struct ocfs2_truncate_context **tc)
7442 {
7443 	int status;
7444 	unsigned int new_i_clusters;
7445 	struct ocfs2_dinode *fe;
7446 	struct ocfs2_extent_block *eb;
7447 	struct buffer_head *last_eb_bh = NULL;
7448 
7449 	mlog_entry_void();
7450 
7451 	*tc = NULL;
7452 
7453 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7454 						  i_size_read(inode));
7455 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
7456 
7457 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size = "
7458 	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7459 	     (unsigned long long)le64_to_cpu(fe->i_size));
7460 
7461 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7462 	if (!(*tc)) {
7463 		status = -ENOMEM;
7464 		mlog_errno(status);
7465 		goto bail;
7466 	}
7467 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7468 
7469 	if (fe->id2.i_list.l_tree_depth) {
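		/* The tree has extent blocks, so read and cache the
		 * rightmost leaf block for the truncate code. */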
7470 		status = ocfs2_read_extent_block(INODE_CACHE(inode),
7471 						 le64_to_cpu(fe->i_last_eb_blk),
7472 						 &last_eb_bh);
7473 		if (status < 0) {
7474 			mlog_errno(status);
7475 			goto bail;
7476 		}
7477 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7478 	}
7479 
7480 	(*tc)->tc_last_eb_bh = last_eb_bh;
7481 
7482 	status = 0;
7483 bail:
7484 	if (status < 0) {
7485 		if (*tc)
7486 			ocfs2_free_truncate_context(*tc);
7487 		*tc = NULL;
7488 	}
7489 	mlog_exit(status);
7490 	return status;
7491 }
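
/*
 * Purely illustrative sketch (not part of ocfs2): the call sequence
 * the two functions above expect from a caller, per their comments.
 * The helper name is made up for this example, and error handling is
 * trimmed to the essentials.
 */
static inline int ocfs2_truncate_usage_sketch(struct ocfs2_super *osb,
					      struct inode *inode,
					      struct buffer_head *fe_bh)
{
	struct ocfs2_truncate_context *tc = NULL;
	int status;

	/* The inode is assumed to already be locked, and both
	 * inode->i_size and the dinode's i_size are assumed to have
	 * been adjusted to the new (smaller) size beforehand. */
	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
	if (status < 0) {
		mlog_errno(status);
		return status;
	}

	/* ocfs2_commit_truncate() kfrees tc even on failure, so it
	 * must not be referenced again after this call. */
	return ocfs2_commit_truncate(osb, inode, fe_bh, tc);
}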
7492 
7493 /*
7494  * 'start' is inclusive, 'end' is not.
7495  */
7496 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7497 			  unsigned int start, unsigned int end, int trunc)
7498 {
7499 	int ret;
7500 	unsigned int numbytes;
7501 	handle_t *handle;
7502 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7503 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7504 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7505 
7506 	if (end > i_size_read(inode))
7507 		end = i_size_read(inode);
7508 
7509 	BUG_ON(start >= end);
7510 
7511 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7512 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7513 	    !ocfs2_supports_inline_data(osb)) {
7514 		ocfs2_error(inode->i_sb,
7515 			    "Inline data flags for inode %llu don't agree! "
7516 			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7517 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7518 			    le16_to_cpu(di->i_dyn_features),
7519 			    OCFS2_I(inode)->ip_dyn_features,
7520 			    osb->s_feature_incompat);
7521 		ret = -EROFS;
7522 		goto out;
7523 	}
7524 
7525 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7526 	if (IS_ERR(handle)) {
7527 		ret = PTR_ERR(handle);
7528 		mlog_errno(ret);
7529 		goto out;
7530 	}
7531 
7532 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7533 				      OCFS2_JOURNAL_ACCESS_WRITE);
7534 	if (ret) {
7535 		mlog_errno(ret);
7536 		goto out_commit;
7537 	}
7538 
7539 	numbytes = end - start;
7540 	memset(idata->id_data + start, 0, numbytes);
7541 
7542 	/*
7543 	 * No need to worry about the data page here - it's been
7544 	 * truncated already and inline data doesn't need it for
7545 	 * pushing zeros to disk, so we'll let readpage pick it up
7546 	 * later.
7547 	 */
7548 	if (trunc) {
7549 		i_size_write(inode, start);
7550 		di->i_size = cpu_to_le64(start);
7551 	}
7552 
7553 	inode->i_blocks = ocfs2_inode_sector_count(inode);
7554 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7555 
7556 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7557 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7558 
7559 	ocfs2_journal_dirty(handle, di_bh);
7560 
7561 out_commit:
7562 	ocfs2_commit_trans(osb, handle);
7563 
7564 out:
7565 	return ret;
7566 }
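
/*
 * Example of the [start, end) semantics above, with purely
 * illustrative values: for an inline file holding 100 bytes,
 *
 *	ocfs2_truncate_inline(inode, di_bh, 50, 100, 1);
 *
 * zeroes bytes 50..99 of the inline data area and, because trunc is
 * nonzero, shrinks i_size to 50.  Passing trunc == 0 zeroes the same
 * range but leaves i_size untouched - effectively punching a hole in
 * the inline data.
 */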
7567 
7568 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7569 {
7570 	/*
7571 	 * The caller is responsible for completing deallocation
7572 	 * before freeing the context.
7573 	 */
7574 	if (tc->tc_dealloc.c_first_suballocator != NULL)
7575 		mlog(ML_NOTICE,
7576 		     "Truncate completion has non-empty dealloc context\n");
7577 
7578 	brelse(tc->tc_last_eb_bh);
7579 
7580 	kfree(tc);
7581 }
7582