xref: /openbmc/linux/fs/ocfs2/alloc.c (revision 82ced6fd)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32 
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35 
36 #include "ocfs2.h"
37 
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 #include "xattr.h"
52 
53 #include "buffer_head_io.h"
54 
55 
56 /*
57  * Operations for a specific extent tree type.
58  *
59  * To implement an on-disk btree (extent tree) type in ocfs2, add
60  * an ocfs2_extent_tree_operations structure and the matching
61  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
62  * for the allocation portion of the extent tree.
63  */
64 struct ocfs2_extent_tree_operations {
65 	/*
66 	 * last_eb_blk is the block number of the right most leaf extent
67 	 * block.  Most on-disk structures containing an extent tree store
68 	 * this value for fast access.  The ->eo_set_last_eb_blk() and
69 	 * ->eo_get_last_eb_blk() operations access this value.  They are
70 	 *  both required.
71 	 */
72 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
73 				   u64 blkno);
74 	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
75 
76 	/*
77 	 * The on-disk structure usually keeps track of how many total
78 	 * clusters are stored in this extent tree.  This function updates
79 	 * that value.  new_clusters is the delta, and must be
80 	 * added to the total.  Required.
81 	 */
82 	void (*eo_update_clusters)(struct inode *inode,
83 				   struct ocfs2_extent_tree *et,
84 				   u32 new_clusters);
85 
86 	/*
87 	 * If ->eo_insert_check() exists, it is called before rec is
88 	 * inserted into the extent tree.  It is optional.
89 	 */
90 	int (*eo_insert_check)(struct inode *inode,
91 			       struct ocfs2_extent_tree *et,
92 			       struct ocfs2_extent_rec *rec);
93 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
94 
95 	/*
96 	 * --------------------------------------------------------------
97 	 * The remaining are internal to ocfs2_extent_tree and don't have
98 	 * accessor functions
99 	 */
100 
101 	/*
102 	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
103 	 * It is required.
104 	 */
105 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
106 
107 	/*
108 	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
109 	 * it exists.  If it does not, et->et_max_leaf_clusters is set
110 	 * to 0 (unlimited).  Optional.
111 	 */
112 	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
113 					  struct ocfs2_extent_tree *et);
114 };
115 
116 
117 /*
118  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
119  * in the methods.
120  */
121 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123 					 u64 blkno);
124 static void ocfs2_dinode_update_clusters(struct inode *inode,
125 					 struct ocfs2_extent_tree *et,
126 					 u32 clusters);
127 static int ocfs2_dinode_insert_check(struct inode *inode,
128 				     struct ocfs2_extent_tree *et,
129 				     struct ocfs2_extent_rec *rec);
130 static int ocfs2_dinode_sanity_check(struct inode *inode,
131 				     struct ocfs2_extent_tree *et);
132 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
135 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
136 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
137 	.eo_insert_check	= ocfs2_dinode_insert_check,
138 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
139 	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
140 };
141 
142 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
143 					 u64 blkno)
144 {
145 	struct ocfs2_dinode *di = et->et_object;
146 
147 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
148 	di->i_last_eb_blk = cpu_to_le64(blkno);
149 }
150 
151 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
152 {
153 	struct ocfs2_dinode *di = et->et_object;
154 
155 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
156 	return le64_to_cpu(di->i_last_eb_blk);
157 }
158 
159 static void ocfs2_dinode_update_clusters(struct inode *inode,
160 					 struct ocfs2_extent_tree *et,
161 					 u32 clusters)
162 {
163 	struct ocfs2_dinode *di = et->et_object;
164 
165 	le32_add_cpu(&di->i_clusters, clusters);
166 	spin_lock(&OCFS2_I(inode)->ip_lock);
167 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
168 	spin_unlock(&OCFS2_I(inode)->ip_lock);
169 }
170 
171 static int ocfs2_dinode_insert_check(struct inode *inode,
172 				     struct ocfs2_extent_tree *et,
173 				     struct ocfs2_extent_rec *rec)
174 {
175 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
176 
177 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 			(OCFS2_I(inode)->ip_clusters !=
180 			 le32_to_cpu(rec->e_cpos)),
181 			"Device %s, asking for sparse allocation: inode %llu, "
182 			"cpos %u, clusters %u\n",
183 			osb->dev_str,
184 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
185 			rec->e_cpos,
186 			OCFS2_I(inode)->ip_clusters);
187 
188 	return 0;
189 }
190 
191 static int ocfs2_dinode_sanity_check(struct inode *inode,
192 				     struct ocfs2_extent_tree *et)
193 {
194 	struct ocfs2_dinode *di = et->et_object;
195 
196 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
197 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
198 
199 	return 0;
200 }
201 
202 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
203 {
204 	struct ocfs2_dinode *di = et->et_object;
205 
206 	et->et_root_el = &di->id2.i_list;
207 }
208 
209 
210 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
211 {
212 	struct ocfs2_xattr_value_buf *vb = et->et_object;
213 
214 	et->et_root_el = &vb->vb_xv->xr_list;
215 }
216 
217 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
218 					      u64 blkno)
219 {
220 	struct ocfs2_xattr_value_buf *vb = et->et_object;
221 
222 	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
223 }
224 
225 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
226 {
227 	struct ocfs2_xattr_value_buf *vb = et->et_object;
228 
229 	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230 }
231 
232 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
233 					      struct ocfs2_extent_tree *et,
234 					      u32 clusters)
235 {
236 	struct ocfs2_xattr_value_buf *vb = et->et_object;
237 
238 	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
239 }
240 
241 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
242 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
243 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
244 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
245 	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
246 };
247 
248 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
249 {
250 	struct ocfs2_xattr_block *xb = et->et_object;
251 
252 	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253 }
254 
255 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
256 						    struct ocfs2_extent_tree *et)
257 {
258 	et->et_max_leaf_clusters =
259 		ocfs2_clusters_for_bytes(inode->i_sb,
260 					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261 }
262 
263 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
264 					     u64 blkno)
265 {
266 	struct ocfs2_xattr_block *xb = et->et_object;
267 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
268 
269 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
270 }
271 
272 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
273 {
274 	struct ocfs2_xattr_block *xb = et->et_object;
275 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
276 
277 	return le64_to_cpu(xt->xt_last_eb_blk);
278 }
279 
280 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
281 					     struct ocfs2_extent_tree *et,
282 					     u32 clusters)
283 {
284 	struct ocfs2_xattr_block *xb = et->et_object;
285 
286 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
287 }
288 
289 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
290 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
291 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
292 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
293 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
294 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295 };
296 
297 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 					  u64 blkno)
299 {
300 	struct ocfs2_dx_root_block *dx_root = et->et_object;
301 
302 	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303 }
304 
305 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306 {
307 	struct ocfs2_dx_root_block *dx_root = et->et_object;
308 
309 	return le64_to_cpu(dx_root->dr_last_eb_blk);
310 }
311 
312 static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 					  struct ocfs2_extent_tree *et,
314 					  u32 clusters)
315 {
316 	struct ocfs2_dx_root_block *dx_root = et->et_object;
317 
318 	le32_add_cpu(&dx_root->dr_clusters, clusters);
319 }
320 
321 static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 				      struct ocfs2_extent_tree *et)
323 {
324 	struct ocfs2_dx_root_block *dx_root = et->et_object;
325 
326 	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327 
328 	return 0;
329 }
330 
331 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332 {
333 	struct ocfs2_dx_root_block *dx_root = et->et_object;
334 
335 	et->et_root_el = &dx_root->dr_list;
336 }
337 
338 static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
340 	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
341 	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
342 	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
343 	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
344 };
345 
346 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
347 				     struct inode *inode,
348 				     struct buffer_head *bh,
349 				     ocfs2_journal_access_func access,
350 				     void *obj,
351 				     struct ocfs2_extent_tree_operations *ops)
352 {
353 	et->et_ops = ops;
354 	et->et_root_bh = bh;
355 	et->et_root_journal_access = access;
356 	if (!obj)
357 		obj = (void *)bh->b_data;
358 	et->et_object = obj;
359 
360 	et->et_ops->eo_fill_root_el(et);
361 	if (!et->et_ops->eo_fill_max_leaf_clusters)
362 		et->et_max_leaf_clusters = 0;
363 	else
364 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
365 }
366 
367 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
368 				   struct inode *inode,
369 				   struct buffer_head *bh)
370 {
371 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
372 				 NULL, &ocfs2_dinode_et_ops);
373 }
374 
375 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
376 				       struct inode *inode,
377 				       struct buffer_head *bh)
378 {
379 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
380 				 NULL, &ocfs2_xattr_tree_et_ops);
381 }
382 
383 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
384 					struct inode *inode,
385 					struct ocfs2_xattr_value_buf *vb)
386 {
387 	__ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
388 				 &ocfs2_xattr_value_et_ops);
389 }
390 
391 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
392 				    struct inode *inode,
393 				    struct buffer_head *bh)
394 {
395 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
396 				 NULL, &ocfs2_dx_root_et_ops);
397 }
398 
399 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
400 					    u64 new_last_eb_blk)
401 {
402 	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
403 }
404 
405 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
406 {
407 	return et->et_ops->eo_get_last_eb_blk(et);
408 }
409 
410 static inline void ocfs2_et_update_clusters(struct inode *inode,
411 					    struct ocfs2_extent_tree *et,
412 					    u32 clusters)
413 {
414 	et->et_ops->eo_update_clusters(inode, et, clusters);
415 }
416 
417 static inline int ocfs2_et_root_journal_access(handle_t *handle,
418 					       struct inode *inode,
419 					       struct ocfs2_extent_tree *et,
420 					       int type)
421 {
422 	return et->et_root_journal_access(handle, inode, et->et_root_bh,
423 					  type);
424 }
425 
426 static inline int ocfs2_et_insert_check(struct inode *inode,
427 					struct ocfs2_extent_tree *et,
428 					struct ocfs2_extent_rec *rec)
429 {
430 	int ret = 0;
431 
432 	if (et->et_ops->eo_insert_check)
433 		ret = et->et_ops->eo_insert_check(inode, et, rec);
434 	return ret;
435 }
436 
437 static inline int ocfs2_et_sanity_check(struct inode *inode,
438 					struct ocfs2_extent_tree *et)
439 {
440 	int ret = 0;
441 
442 	if (et->et_ops->eo_sanity_check)
443 		ret = et->et_ops->eo_sanity_check(inode, et);
444 	return ret;
445 }
446 
/* Forward declarations for helpers defined elsewhere in this file. */
static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
					 struct ocfs2_extent_block *eb);
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
450 
/*
 * Structures describing a path through a btree, and the functions
 * that manipulate them.
 *
 * The aim is to keep the tree manipulation code as generic as
 * possible.
 *
 * One ocfs2_path_item is one level of a path: a buffer head plus the
 * extent list associated with it.
 */
struct ocfs2_path_item {
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
};
462 
463 #define OCFS2_MAX_PATH_DEPTH	5
464 
465 struct ocfs2_path {
466 	int				p_tree_depth;
467 	ocfs2_journal_access_func	p_root_access;
468 	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
469 };
470 
/*
 * Accessors for struct ocfs2_path.  Each one expands to an lvalue, so
 * it can be assigned to as well as read.
 */
#define path_root_bh(_path)	((_path)->p_node[0].bh)
#define path_root_el(_path)	((_path)->p_node[0].el)
#define path_root_access(_path)	((_path)->p_root_access)
#define path_leaf_bh(_path)	((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path)	((_path)->p_node[(_path)->p_tree_depth].el)
/* Number of levels in use: the tree depth plus one for the root. */
#define path_num_items(_path)	((_path)->p_tree_depth + 1)
477 
478 /*
479  * Reset the actual path elements so that we can re-use the structure
480  * to build another path. Generally, this involves freeing the buffer
481  * heads.
482  */
483 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
484 {
485 	int i, start = 0, depth = 0;
486 	struct ocfs2_path_item *node;
487 
488 	if (keep_root)
489 		start = 1;
490 
491 	for(i = start; i < path_num_items(path); i++) {
492 		node = &path->p_node[i];
493 
494 		brelse(node->bh);
495 		node->bh = NULL;
496 		node->el = NULL;
497 	}
498 
499 	/*
500 	 * Tree depth may change during truncate, or insert. If we're
501 	 * keeping the root extent list, then make sure that our path
502 	 * structure reflects the proper depth.
503 	 */
504 	if (keep_root)
505 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
506 	else
507 		path_root_access(path) = NULL;
508 
509 	path->p_tree_depth = depth;
510 }
511 
/*
 * Release every buffer head held by path, the root included, and
 * free the structure.  A NULL path is ignored.
 */
static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
519 
520 /*
521  * All the elements of src into dest. After this call, src could be freed
522  * without affecting dest.
523  *
524  * Both paths should have the same root. Any non-root elements of dest
525  * will be freed.
526  */
527 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
528 {
529 	int i;
530 
531 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
532 	BUG_ON(path_root_el(dest) != path_root_el(src));
533 	BUG_ON(path_root_access(dest) != path_root_access(src));
534 
535 	ocfs2_reinit_path(dest, 1);
536 
537 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
538 		dest->p_node[i].bh = src->p_node[i].bh;
539 		dest->p_node[i].el = src->p_node[i].el;
540 
541 		if (dest->p_node[i].bh)
542 			get_bh(dest->p_node[i].bh);
543 	}
544 }
545 
546 /*
547  * Make the *dest path the same as src and re-initialize src path to
548  * have a root only.
549  */
550 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
551 {
552 	int i;
553 
554 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
555 	BUG_ON(path_root_access(dest) != path_root_access(src));
556 
557 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
558 		brelse(dest->p_node[i].bh);
559 
560 		dest->p_node[i].bh = src->p_node[i].bh;
561 		dest->p_node[i].el = src->p_node[i].el;
562 
563 		src->p_node[i].bh = NULL;
564 		src->p_node[i].el = NULL;
565 	}
566 }
567 
568 /*
569  * Insert an extent block at given index.
570  *
571  * This will not take an additional reference on eb_bh.
572  */
573 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
574 					struct buffer_head *eb_bh)
575 {
576 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
577 
578 	/*
579 	 * Right now, no root bh is an extent block, so this helps
580 	 * catch code errors with dinode trees. The assertion can be
581 	 * safely removed if we ever need to insert extent block
582 	 * structures at the root.
583 	 */
584 	BUG_ON(index == 0);
585 
586 	path->p_node[index].bh = eb_bh;
587 	path->p_node[index].el = &eb->h_list;
588 }
589 
590 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
591 					 struct ocfs2_extent_list *root_el,
592 					 ocfs2_journal_access_func access)
593 {
594 	struct ocfs2_path *path;
595 
596 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
597 
598 	path = kzalloc(sizeof(*path), GFP_NOFS);
599 	if (path) {
600 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
601 		get_bh(root_bh);
602 		path_root_bh(path) = root_bh;
603 		path_root_el(path) = root_el;
604 		path_root_access(path) = access;
605 	}
606 
607 	return path;
608 }
609 
/*
 * Allocate a root-only path sharing the root (buffer head, extent
 * list and access function) of an existing path.
 */
static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	struct buffer_head *root_bh = path_root_bh(path);
	struct ocfs2_extent_list *root_el = path_root_el(path);

	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
}
615 
616 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
617 {
618 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
619 			      et->et_root_journal_access);
620 }
621 
622 /*
623  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
624  * otherwise it's the root_access function.
625  *
626  * I don't like the way this function's name looks next to
627  * ocfs2_journal_access_path(), but I don't have a better one.
628  */
629 static int ocfs2_path_bh_journal_access(handle_t *handle,
630 					struct inode *inode,
631 					struct ocfs2_path *path,
632 					int idx)
633 {
634 	ocfs2_journal_access_func access = path_root_access(path);
635 
636 	if (!access)
637 		access = ocfs2_journal_access;
638 
639 	if (idx)
640 		access = ocfs2_journal_access_eb;
641 
642 	return access(handle, inode, path->p_node[idx].bh,
643 		      OCFS2_JOURNAL_ACCESS_WRITE);
644 }
645 
646 /*
647  * Convenience function to journal all components in a path.
648  */
649 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
650 				     struct ocfs2_path *path)
651 {
652 	int i, ret = 0;
653 
654 	if (!path)
655 		goto out;
656 
657 	for(i = 0; i < path_num_items(path); i++) {
658 		ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
659 		if (ret < 0) {
660 			mlog_errno(ret);
661 			goto out;
662 		}
663 	}
664 
665 out:
666 	return ret;
667 }
668 
669 /*
670  * Return the index of the extent record which contains cluster #v_cluster.
671  * -1 is returned if it was not found.
672  *
673  * Should work fine on interior and exterior nodes.
674  */
675 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
676 {
677 	int ret = -1;
678 	int i;
679 	struct ocfs2_extent_rec *rec;
680 	u32 rec_end, rec_start, clusters;
681 
682 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
683 		rec = &el->l_recs[i];
684 
685 		rec_start = le32_to_cpu(rec->e_cpos);
686 		clusters = ocfs2_rec_clusters(el, rec);
687 
688 		rec_end = rec_start + clusters;
689 
690 		if (v_cluster >= rec_start && v_cluster < rec_end) {
691 			ret = i;
692 			break;
693 		}
694 	}
695 
696 	return ret;
697 }
698 
/*
 * Result of a contiguity check between two extent records; see
 * ocfs2_extent_contig().
 */
enum ocfs2_contig_type {
	CONTIG_NONE		= 0,
	CONTIG_LEFT		= 1,
	CONTIG_RIGHT		= 2,
	CONTIG_LEFTRIGHT	= 3,
};
705 
706 
707 /*
708  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
709  * ocfs2_extent_contig only work properly against leaf nodes!
710  */
711 static int ocfs2_block_extent_contig(struct super_block *sb,
712 				     struct ocfs2_extent_rec *ext,
713 				     u64 blkno)
714 {
715 	u64 blk_end = le64_to_cpu(ext->e_blkno);
716 
717 	blk_end += ocfs2_clusters_to_blocks(sb,
718 				    le16_to_cpu(ext->e_leaf_clusters));
719 
720 	return blkno == blk_end;
721 }
722 
723 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
724 				  struct ocfs2_extent_rec *right)
725 {
726 	u32 left_range;
727 
728 	left_range = le32_to_cpu(left->e_cpos) +
729 		le16_to_cpu(left->e_leaf_clusters);
730 
731 	return (left_range == le32_to_cpu(right->e_cpos));
732 }
733 
734 static enum ocfs2_contig_type
735 	ocfs2_extent_contig(struct inode *inode,
736 			    struct ocfs2_extent_rec *ext,
737 			    struct ocfs2_extent_rec *insert_rec)
738 {
739 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
740 
741 	/*
742 	 * Refuse to coalesce extent records with different flag
743 	 * fields - we don't want to mix unwritten extents with user
744 	 * data.
745 	 */
746 	if (ext->e_flags != insert_rec->e_flags)
747 		return CONTIG_NONE;
748 
749 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
750 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
751 			return CONTIG_RIGHT;
752 
753 	blkno = le64_to_cpu(ext->e_blkno);
754 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
755 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
756 		return CONTIG_LEFT;
757 
758 	return CONTIG_NONE;
759 }
760 
/*
 * NOTE: Pretty much any combination of contiguousness and appending
 * can occur.
 *
 * APPEND_TAIL is mostly useful because it tells us that the path to
 * that leaf will have to be updated.
 */
enum ocfs2_append_type {
	APPEND_NONE	= 0,
	APPEND_TAIL	= 1,
};
772 
/* Split mode stored in ocfs2_insert_type.ins_split. */
enum ocfs2_split_type {
	SPLIT_NONE	= 0,
	SPLIT_LEFT	= 1,
	SPLIT_RIGHT	= 2,
};
778 
779 struct ocfs2_insert_type {
780 	enum ocfs2_split_type	ins_split;
781 	enum ocfs2_append_type	ins_appending;
782 	enum ocfs2_contig_type	ins_contig;
783 	int			ins_contig_index;
784 	int			ins_tree_depth;
785 };
786 
787 struct ocfs2_merge_ctxt {
788 	enum ocfs2_contig_type	c_contig_type;
789 	int			c_has_empty_extent;
790 	int			c_split_covers_rec;
791 };
792 
793 static int ocfs2_validate_extent_block(struct super_block *sb,
794 				       struct buffer_head *bh)
795 {
796 	int rc;
797 	struct ocfs2_extent_block *eb =
798 		(struct ocfs2_extent_block *)bh->b_data;
799 
800 	mlog(0, "Validating extent block %llu\n",
801 	     (unsigned long long)bh->b_blocknr);
802 
803 	BUG_ON(!buffer_uptodate(bh));
804 
805 	/*
806 	 * If the ecc fails, we return the error but otherwise
807 	 * leave the filesystem running.  We know any error is
808 	 * local to this block.
809 	 */
810 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
811 	if (rc) {
812 		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
813 		     (unsigned long long)bh->b_blocknr);
814 		return rc;
815 	}
816 
817 	/*
818 	 * Errors after here are fatal.
819 	 */
820 
821 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
822 		ocfs2_error(sb,
823 			    "Extent block #%llu has bad signature %.*s",
824 			    (unsigned long long)bh->b_blocknr, 7,
825 			    eb->h_signature);
826 		return -EINVAL;
827 	}
828 
829 	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
830 		ocfs2_error(sb,
831 			    "Extent block #%llu has an invalid h_blkno "
832 			    "of %llu",
833 			    (unsigned long long)bh->b_blocknr,
834 			    (unsigned long long)le64_to_cpu(eb->h_blkno));
835 		return -EINVAL;
836 	}
837 
838 	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
839 		ocfs2_error(sb,
840 			    "Extent block #%llu has an invalid "
841 			    "h_fs_generation of #%u",
842 			    (unsigned long long)bh->b_blocknr,
843 			    le32_to_cpu(eb->h_fs_generation));
844 		return -EINVAL;
845 	}
846 
847 	return 0;
848 }
849 
850 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
851 			    struct buffer_head **bh)
852 {
853 	int rc;
854 	struct buffer_head *tmp = *bh;
855 
856 	rc = ocfs2_read_block(inode, eb_blkno, &tmp,
857 			      ocfs2_validate_extent_block);
858 
859 	/* If ocfs2_read_block() got us a new bh, pass it up. */
860 	if (!rc && !*bh)
861 		*bh = tmp;
862 
863 	return rc;
864 }
865 
866 
867 /*
868  * How many free extents have we got before we need more meta data?
869  */
870 int ocfs2_num_free_extents(struct ocfs2_super *osb,
871 			   struct inode *inode,
872 			   struct ocfs2_extent_tree *et)
873 {
874 	int retval;
875 	struct ocfs2_extent_list *el = NULL;
876 	struct ocfs2_extent_block *eb;
877 	struct buffer_head *eb_bh = NULL;
878 	u64 last_eb_blk = 0;
879 
880 	mlog_entry_void();
881 
882 	el = et->et_root_el;
883 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
884 
885 	if (last_eb_blk) {
886 		retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
887 		if (retval < 0) {
888 			mlog_errno(retval);
889 			goto bail;
890 		}
891 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
892 		el = &eb->h_list;
893 	}
894 
895 	BUG_ON(el->l_tree_depth != 0);
896 
897 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
898 bail:
899 	brelse(eb_bh);
900 
901 	mlog_exit(retval);
902 	return retval;
903 }
904 
905 /* expects array to already be allocated
906  *
907  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
908  * l_count for you
909  */
910 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
911 				     handle_t *handle,
912 				     struct inode *inode,
913 				     int wanted,
914 				     struct ocfs2_alloc_context *meta_ac,
915 				     struct buffer_head *bhs[])
916 {
917 	int count, status, i;
918 	u16 suballoc_bit_start;
919 	u32 num_got;
920 	u64 first_blkno;
921 	struct ocfs2_extent_block *eb;
922 
923 	mlog_entry_void();
924 
925 	count = 0;
926 	while (count < wanted) {
927 		status = ocfs2_claim_metadata(osb,
928 					      handle,
929 					      meta_ac,
930 					      wanted - count,
931 					      &suballoc_bit_start,
932 					      &num_got,
933 					      &first_blkno);
934 		if (status < 0) {
935 			mlog_errno(status);
936 			goto bail;
937 		}
938 
939 		for(i = count;  i < (num_got + count); i++) {
940 			bhs[i] = sb_getblk(osb->sb, first_blkno);
941 			if (bhs[i] == NULL) {
942 				status = -EIO;
943 				mlog_errno(status);
944 				goto bail;
945 			}
946 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
947 
948 			status = ocfs2_journal_access_eb(handle, inode, bhs[i],
949 							 OCFS2_JOURNAL_ACCESS_CREATE);
950 			if (status < 0) {
951 				mlog_errno(status);
952 				goto bail;
953 			}
954 
955 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
956 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
957 			/* Ok, setup the minimal stuff here. */
958 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
959 			eb->h_blkno = cpu_to_le64(first_blkno);
960 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
961 			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
962 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
963 			eb->h_list.l_count =
964 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
965 
966 			suballoc_bit_start++;
967 			first_blkno++;
968 
969 			/* We'll also be dirtied by the caller, so
970 			 * this isn't absolutely necessary. */
971 			status = ocfs2_journal_dirty(handle, bhs[i]);
972 			if (status < 0) {
973 				mlog_errno(status);
974 				goto bail;
975 			}
976 		}
977 
978 		count += num_got;
979 	}
980 
981 	status = 0;
982 bail:
983 	if (status < 0) {
984 		for(i = 0; i < wanted; i++) {
985 			brelse(bhs[i]);
986 			bhs[i] = NULL;
987 		}
988 	}
989 	mlog_exit(status);
990 	return status;
991 }
992 
993 /*
994  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
995  *
996  * Returns the sum of the rightmost extent rec logical offset and
997  * cluster count.
998  *
999  * ocfs2_add_branch() uses this to determine what logical cluster
1000  * value should be populated into the leftmost new branch records.
1001  *
1002  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1003  * value for the new topmost tree record.
1004  */
1005 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1006 {
1007 	int i;
1008 
1009 	i = le16_to_cpu(el->l_next_free_rec) - 1;
1010 
1011 	return le32_to_cpu(el->l_recs[i].e_cpos) +
1012 		ocfs2_rec_clusters(el, &el->l_recs[i]);
1013 }
1014 
1015 /*
1016  * Add an entire tree branch to our inode. eb_bh is the extent block
1017  * to start at, if we don't want to start the branch at the dinode
1018  * structure.
1019  *
1020  * last_eb_bh is required as we have to update its next_leaf pointer
1021  * for the new last extent block.
1022  *
1023  * the new branch will be 'empty' in the sense that every block will
1024  * contain a single record with cluster count == 0.
1025  */
1026 static int ocfs2_add_branch(struct ocfs2_super *osb,
1027 			    handle_t *handle,
1028 			    struct inode *inode,
1029 			    struct ocfs2_extent_tree *et,
1030 			    struct buffer_head *eb_bh,
1031 			    struct buffer_head **last_eb_bh,
1032 			    struct ocfs2_alloc_context *meta_ac)
1033 {
1034 	int status, new_blocks, i;
1035 	u64 next_blkno, new_last_eb_blk;
1036 	struct buffer_head *bh;
1037 	struct buffer_head **new_eb_bhs = NULL;
1038 	struct ocfs2_extent_block *eb;
1039 	struct ocfs2_extent_list  *eb_el;
1040 	struct ocfs2_extent_list  *el;
1041 	u32 new_cpos;
1042 
1043 	mlog_entry_void();
1044 
1045 	BUG_ON(!last_eb_bh || !*last_eb_bh);
1046 
1047 	if (eb_bh) {
1048 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1049 		el = &eb->h_list;
1050 	} else
1051 		el = et->et_root_el;
1052 
1053 	/* we never add a branch to a leaf. */
1054 	BUG_ON(!el->l_tree_depth);
1055 
1056 	new_blocks = le16_to_cpu(el->l_tree_depth);
1057 
1058 	/* allocate the number of new eb blocks we need */
1059 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1060 			     GFP_KERNEL);
1061 	if (!new_eb_bhs) {
1062 		status = -ENOMEM;
1063 		mlog_errno(status);
1064 		goto bail;
1065 	}
1066 
1067 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
1068 					   meta_ac, new_eb_bhs);
1069 	if (status < 0) {
1070 		mlog_errno(status);
1071 		goto bail;
1072 	}
1073 
1074 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1075 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1076 
1077 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1078 	 * linked with the rest of the tree.
1079 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
1080 	 *
1081 	 * when we leave the loop, new_last_eb_blk will point to the
1082 	 * newest leaf, and next_blkno will point to the topmost extent
1083 	 * block. */
1084 	next_blkno = new_last_eb_blk = 0;
1085 	for(i = 0; i < new_blocks; i++) {
1086 		bh = new_eb_bhs[i];
1087 		eb = (struct ocfs2_extent_block *) bh->b_data;
1088 		/* ocfs2_create_new_meta_bhs() should create it right! */
1089 		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1090 		eb_el = &eb->h_list;
1091 
1092 		status = ocfs2_journal_access_eb(handle, inode, bh,
1093 						 OCFS2_JOURNAL_ACCESS_CREATE);
1094 		if (status < 0) {
1095 			mlog_errno(status);
1096 			goto bail;
1097 		}
1098 
1099 		eb->h_next_leaf_blk = 0;
1100 		eb_el->l_tree_depth = cpu_to_le16(i);
1101 		eb_el->l_next_free_rec = cpu_to_le16(1);
1102 		/*
1103 		 * This actually counts as an empty extent as
1104 		 * c_clusters == 0
1105 		 */
1106 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1107 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1108 		/*
1109 		 * eb_el isn't always an interior node, but even leaf
1110 		 * nodes want a zero'd flags and reserved field so
1111 		 * this gets the whole 32 bits regardless of use.
1112 		 */
1113 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1114 		if (!eb_el->l_tree_depth)
1115 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1116 
1117 		status = ocfs2_journal_dirty(handle, bh);
1118 		if (status < 0) {
1119 			mlog_errno(status);
1120 			goto bail;
1121 		}
1122 
1123 		next_blkno = le64_to_cpu(eb->h_blkno);
1124 	}
1125 
1126 	/* This is a bit hairy. We want to update up to three blocks
1127 	 * here without leaving any of them in an inconsistent state
1128 	 * in case of error. We don't have to worry about
1129 	 * journal_dirty erroring as it won't unless we've aborted the
1130 	 * handle (in which case we would never be here) so reserving
1131 	 * the write with journal_access is all we need to do. */
1132 	status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
1133 					 OCFS2_JOURNAL_ACCESS_WRITE);
1134 	if (status < 0) {
1135 		mlog_errno(status);
1136 		goto bail;
1137 	}
1138 	status = ocfs2_et_root_journal_access(handle, inode, et,
1139 					      OCFS2_JOURNAL_ACCESS_WRITE);
1140 	if (status < 0) {
1141 		mlog_errno(status);
1142 		goto bail;
1143 	}
1144 	if (eb_bh) {
1145 		status = ocfs2_journal_access_eb(handle, inode, eb_bh,
1146 						 OCFS2_JOURNAL_ACCESS_WRITE);
1147 		if (status < 0) {
1148 			mlog_errno(status);
1149 			goto bail;
1150 		}
1151 	}
1152 
1153 	/* Link the new branch into the rest of the tree (el will
1154 	 * either be on the root_bh, or the extent block passed in. */
1155 	i = le16_to_cpu(el->l_next_free_rec);
1156 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1157 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1158 	el->l_recs[i].e_int_clusters = 0;
1159 	le16_add_cpu(&el->l_next_free_rec, 1);
1160 
1161 	/* fe needs a new last extent block pointer, as does the
1162 	 * next_leaf on the previously last-extent-block. */
1163 	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1164 
1165 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1166 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1167 
1168 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
1169 	if (status < 0)
1170 		mlog_errno(status);
1171 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
1172 	if (status < 0)
1173 		mlog_errno(status);
1174 	if (eb_bh) {
1175 		status = ocfs2_journal_dirty(handle, eb_bh);
1176 		if (status < 0)
1177 			mlog_errno(status);
1178 	}
1179 
1180 	/*
1181 	 * Some callers want to track the rightmost leaf so pass it
1182 	 * back here.
1183 	 */
1184 	brelse(*last_eb_bh);
1185 	get_bh(new_eb_bhs[0]);
1186 	*last_eb_bh = new_eb_bhs[0];
1187 
1188 	status = 0;
1189 bail:
1190 	if (new_eb_bhs) {
1191 		for (i = 0; i < new_blocks; i++)
1192 			brelse(new_eb_bhs[i]);
1193 		kfree(new_eb_bhs);
1194 	}
1195 
1196 	mlog_exit(status);
1197 	return status;
1198 }
1199 
1200 /*
1201  * adds another level to the allocation tree.
1202  * returns back the new extent block so you can add a branch to it
1203  * after this call.
1204  */
1205 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1206 				  handle_t *handle,
1207 				  struct inode *inode,
1208 				  struct ocfs2_extent_tree *et,
1209 				  struct ocfs2_alloc_context *meta_ac,
1210 				  struct buffer_head **ret_new_eb_bh)
1211 {
1212 	int status, i;
1213 	u32 new_clusters;
1214 	struct buffer_head *new_eb_bh = NULL;
1215 	struct ocfs2_extent_block *eb;
1216 	struct ocfs2_extent_list  *root_el;
1217 	struct ocfs2_extent_list  *eb_el;
1218 
1219 	mlog_entry_void();
1220 
1221 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1222 					   &new_eb_bh);
1223 	if (status < 0) {
1224 		mlog_errno(status);
1225 		goto bail;
1226 	}
1227 
1228 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1229 	/* ocfs2_create_new_meta_bhs() should create it right! */
1230 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1231 
1232 	eb_el = &eb->h_list;
1233 	root_el = et->et_root_el;
1234 
1235 	status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
1236 					 OCFS2_JOURNAL_ACCESS_CREATE);
1237 	if (status < 0) {
1238 		mlog_errno(status);
1239 		goto bail;
1240 	}
1241 
1242 	/* copy the root extent list data into the new extent block */
1243 	eb_el->l_tree_depth = root_el->l_tree_depth;
1244 	eb_el->l_next_free_rec = root_el->l_next_free_rec;
1245 	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1246 		eb_el->l_recs[i] = root_el->l_recs[i];
1247 
1248 	status = ocfs2_journal_dirty(handle, new_eb_bh);
1249 	if (status < 0) {
1250 		mlog_errno(status);
1251 		goto bail;
1252 	}
1253 
1254 	status = ocfs2_et_root_journal_access(handle, inode, et,
1255 					      OCFS2_JOURNAL_ACCESS_WRITE);
1256 	if (status < 0) {
1257 		mlog_errno(status);
1258 		goto bail;
1259 	}
1260 
1261 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1262 
1263 	/* update root_bh now */
1264 	le16_add_cpu(&root_el->l_tree_depth, 1);
1265 	root_el->l_recs[0].e_cpos = 0;
1266 	root_el->l_recs[0].e_blkno = eb->h_blkno;
1267 	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1268 	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1269 		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1270 	root_el->l_next_free_rec = cpu_to_le16(1);
1271 
1272 	/* If this is our 1st tree depth shift, then last_eb_blk
1273 	 * becomes the allocated extent block */
1274 	if (root_el->l_tree_depth == cpu_to_le16(1))
1275 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1276 
1277 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
1278 	if (status < 0) {
1279 		mlog_errno(status);
1280 		goto bail;
1281 	}
1282 
1283 	*ret_new_eb_bh = new_eb_bh;
1284 	new_eb_bh = NULL;
1285 	status = 0;
1286 bail:
1287 	brelse(new_eb_bh);
1288 
1289 	mlog_exit(status);
1290 	return status;
1291 }
1292 
1293 /*
1294  * Should only be called when there is no space left in any of the
1295  * leaf nodes. What we want to do is find the lowest tree depth
1296  * non-leaf extent block with room for new records. There are three
1297  * valid results of this search:
1298  *
1299  * 1) a lowest extent block is found, then we pass it back in
1300  *    *lowest_eb_bh and return '0'
1301  *
1302  * 2) the search fails to find anything, but the root_el has room. We
1303  *    pass NULL back in *lowest_eb_bh, but still return '0'
1304  *
1305  * 3) the search fails to find anything AND the root_el is full, in
1306  *    which case we return > 0
1307  *
1308  * return status < 0 indicates an error.
1309  */
1310 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1311 				    struct inode *inode,
1312 				    struct ocfs2_extent_tree *et,
1313 				    struct buffer_head **target_bh)
1314 {
1315 	int status = 0, i;
1316 	u64 blkno;
1317 	struct ocfs2_extent_block *eb;
1318 	struct ocfs2_extent_list  *el;
1319 	struct buffer_head *bh = NULL;
1320 	struct buffer_head *lowest_bh = NULL;
1321 
1322 	mlog_entry_void();
1323 
1324 	*target_bh = NULL;
1325 
1326 	el = et->et_root_el;
1327 
1328 	while(le16_to_cpu(el->l_tree_depth) > 1) {
1329 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1330 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1331 				    "extent list (next_free_rec == 0)",
1332 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
1333 			status = -EIO;
1334 			goto bail;
1335 		}
1336 		i = le16_to_cpu(el->l_next_free_rec) - 1;
1337 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1338 		if (!blkno) {
1339 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1340 				    "list where extent # %d has no physical "
1341 				    "block start",
1342 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1343 			status = -EIO;
1344 			goto bail;
1345 		}
1346 
1347 		brelse(bh);
1348 		bh = NULL;
1349 
1350 		status = ocfs2_read_extent_block(inode, blkno, &bh);
1351 		if (status < 0) {
1352 			mlog_errno(status);
1353 			goto bail;
1354 		}
1355 
1356 		eb = (struct ocfs2_extent_block *) bh->b_data;
1357 		el = &eb->h_list;
1358 
1359 		if (le16_to_cpu(el->l_next_free_rec) <
1360 		    le16_to_cpu(el->l_count)) {
1361 			brelse(lowest_bh);
1362 			lowest_bh = bh;
1363 			get_bh(lowest_bh);
1364 		}
1365 	}
1366 
1367 	/* If we didn't find one and the fe doesn't have any room,
1368 	 * then return '1' */
1369 	el = et->et_root_el;
1370 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1371 		status = 1;
1372 
1373 	*target_bh = lowest_bh;
1374 bail:
1375 	brelse(bh);
1376 
1377 	mlog_exit(status);
1378 	return status;
1379 }
1380 
1381 /*
1382  * Grow a b-tree so that it has more records.
1383  *
1384  * We might shift the tree depth in which case existing paths should
1385  * be considered invalid.
1386  *
1387  * Tree depth after the grow is returned via *final_depth.
1388  *
1389  * *last_eb_bh will be updated by ocfs2_add_branch().
1390  */
1391 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1392 			   struct ocfs2_extent_tree *et, int *final_depth,
1393 			   struct buffer_head **last_eb_bh,
1394 			   struct ocfs2_alloc_context *meta_ac)
1395 {
1396 	int ret, shift;
1397 	struct ocfs2_extent_list *el = et->et_root_el;
1398 	int depth = le16_to_cpu(el->l_tree_depth);
1399 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1400 	struct buffer_head *bh = NULL;
1401 
1402 	BUG_ON(meta_ac == NULL);
1403 
1404 	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1405 	if (shift < 0) {
1406 		ret = shift;
1407 		mlog_errno(ret);
1408 		goto out;
1409 	}
1410 
1411 	/* We traveled all the way to the bottom of the allocation tree
1412 	 * and didn't find room for any more extents - we need to add
1413 	 * another tree level */
1414 	if (shift) {
1415 		BUG_ON(bh);
1416 		mlog(0, "need to shift tree depth (current = %d)\n", depth);
1417 
1418 		/* ocfs2_shift_tree_depth will return us a buffer with
1419 		 * the new extent block (so we can pass that to
1420 		 * ocfs2_add_branch). */
1421 		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1422 					     meta_ac, &bh);
1423 		if (ret < 0) {
1424 			mlog_errno(ret);
1425 			goto out;
1426 		}
1427 		depth++;
1428 		if (depth == 1) {
1429 			/*
1430 			 * Special case: we have room now if we shifted from
1431 			 * tree_depth 0, so no more work needs to be done.
1432 			 *
1433 			 * We won't be calling add_branch, so pass
1434 			 * back *last_eb_bh as the new leaf. At depth
1435 			 * zero, it should always be null so there's
1436 			 * no reason to brelse.
1437 			 */
1438 			BUG_ON(*last_eb_bh);
1439 			get_bh(bh);
1440 			*last_eb_bh = bh;
1441 			goto out;
1442 		}
1443 	}
1444 
1445 	/* call ocfs2_add_branch to add the final part of the tree with
1446 	 * the new data. */
1447 	mlog(0, "add branch. bh = %p\n", bh);
1448 	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1449 			       meta_ac);
1450 	if (ret < 0) {
1451 		mlog_errno(ret);
1452 		goto out;
1453 	}
1454 
1455 out:
1456 	if (final_depth)
1457 		*final_depth = depth;
1458 	brelse(bh);
1459 	return ret;
1460 }
1461 
1462 /*
1463  * This function will discard the rightmost extent record.
1464  */
1465 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1466 {
1467 	int next_free = le16_to_cpu(el->l_next_free_rec);
1468 	int count = le16_to_cpu(el->l_count);
1469 	unsigned int num_bytes;
1470 
1471 	BUG_ON(!next_free);
1472 	/* This will cause us to go off the end of our extent list. */
1473 	BUG_ON(next_free >= count);
1474 
1475 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1476 
1477 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1478 }
1479 
1480 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1481 			      struct ocfs2_extent_rec *insert_rec)
1482 {
1483 	int i, insert_index, next_free, has_empty, num_bytes;
1484 	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1485 	struct ocfs2_extent_rec *rec;
1486 
1487 	next_free = le16_to_cpu(el->l_next_free_rec);
1488 	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1489 
1490 	BUG_ON(!next_free);
1491 
1492 	/* The tree code before us didn't allow enough room in the leaf. */
1493 	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1494 
1495 	/*
1496 	 * The easiest way to approach this is to just remove the
1497 	 * empty extent and temporarily decrement next_free.
1498 	 */
1499 	if (has_empty) {
1500 		/*
1501 		 * If next_free was 1 (only an empty extent), this
1502 		 * loop won't execute, which is fine. We still want
1503 		 * the decrement above to happen.
1504 		 */
1505 		for(i = 0; i < (next_free - 1); i++)
1506 			el->l_recs[i] = el->l_recs[i+1];
1507 
1508 		next_free--;
1509 	}
1510 
1511 	/*
1512 	 * Figure out what the new record index should be.
1513 	 */
1514 	for(i = 0; i < next_free; i++) {
1515 		rec = &el->l_recs[i];
1516 
1517 		if (insert_cpos < le32_to_cpu(rec->e_cpos))
1518 			break;
1519 	}
1520 	insert_index = i;
1521 
1522 	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1523 	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1524 
1525 	BUG_ON(insert_index < 0);
1526 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1527 	BUG_ON(insert_index > next_free);
1528 
1529 	/*
1530 	 * No need to memmove if we're just adding to the tail.
1531 	 */
1532 	if (insert_index != next_free) {
1533 		BUG_ON(next_free >= le16_to_cpu(el->l_count));
1534 
1535 		num_bytes = next_free - insert_index;
1536 		num_bytes *= sizeof(struct ocfs2_extent_rec);
1537 		memmove(&el->l_recs[insert_index + 1],
1538 			&el->l_recs[insert_index],
1539 			num_bytes);
1540 	}
1541 
1542 	/*
1543 	 * Either we had an empty extent, and need to re-increment or
1544 	 * there was no empty extent on a non full rightmost leaf node,
1545 	 * in which case we still need to increment.
1546 	 */
1547 	next_free++;
1548 	el->l_next_free_rec = cpu_to_le16(next_free);
1549 	/*
1550 	 * Make sure none of the math above just messed up our tree.
1551 	 */
1552 	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1553 
1554 	el->l_recs[insert_index] = *insert_rec;
1555 
1556 }
1557 
1558 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1559 {
1560 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1561 
1562 	BUG_ON(num_recs == 0);
1563 
1564 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1565 		num_recs--;
1566 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1567 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1568 		memset(&el->l_recs[num_recs], 0,
1569 		       sizeof(struct ocfs2_extent_rec));
1570 		el->l_next_free_rec = cpu_to_le16(num_recs);
1571 	}
1572 }
1573 
1574 /*
1575  * Create an empty extent record .
1576  *
1577  * l_next_free_rec may be updated.
1578  *
1579  * If an empty extent already exists do nothing.
1580  */
1581 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1582 {
1583 	int next_free = le16_to_cpu(el->l_next_free_rec);
1584 
1585 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1586 
1587 	if (next_free == 0)
1588 		goto set_and_inc;
1589 
1590 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1591 		return;
1592 
1593 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1594 			"Asked to create an empty extent in a full list:\n"
1595 			"count = %u, tree depth = %u",
1596 			le16_to_cpu(el->l_count),
1597 			le16_to_cpu(el->l_tree_depth));
1598 
1599 	ocfs2_shift_records_right(el);
1600 
1601 set_and_inc:
1602 	le16_add_cpu(&el->l_next_free_rec, 1);
1603 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1604 }
1605 
1606 /*
1607  * For a rotation which involves two leaf nodes, the "root node" is
1608  * the lowest level tree node which contains a path to both leafs. This
1609  * resulting set of information can be used to form a complete "subtree"
1610  *
1611  * This function is passed two full paths from the dinode down to a
1612  * pair of adjacent leaves. It's task is to figure out which path
1613  * index contains the subtree root - this can be the root index itself
1614  * in a worst-case rotation.
1615  *
1616  * The array index of the subtree root is passed back.
1617  */
1618 static int ocfs2_find_subtree_root(struct inode *inode,
1619 				   struct ocfs2_path *left,
1620 				   struct ocfs2_path *right)
1621 {
1622 	int i = 0;
1623 
1624 	/*
1625 	 * Check that the caller passed in two paths from the same tree.
1626 	 */
1627 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1628 
1629 	do {
1630 		i++;
1631 
1632 		/*
1633 		 * The caller didn't pass two adjacent paths.
1634 		 */
1635 		mlog_bug_on_msg(i > left->p_tree_depth,
1636 				"Inode %lu, left depth %u, right depth %u\n"
1637 				"left leaf blk %llu, right leaf blk %llu\n",
1638 				inode->i_ino, left->p_tree_depth,
1639 				right->p_tree_depth,
1640 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1641 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1642 	} while (left->p_node[i].bh->b_blocknr ==
1643 		 right->p_node[i].bh->b_blocknr);
1644 
1645 	return i - 1;
1646 }
1647 
/* Per-block callback for __ocfs2_find_path(): opaque data plus the block just read. */
typedef void (path_insert_t)(void *data, struct buffer_head *bh);
1649 
1650 /*
1651  * Traverse a btree path in search of cpos, starting at root_el.
1652  *
1653  * This code can be called with a cpos larger than the tree, in which
1654  * case it will return the rightmost path.
1655  */
1656 static int __ocfs2_find_path(struct inode *inode,
1657 			     struct ocfs2_extent_list *root_el, u32 cpos,
1658 			     path_insert_t *func, void *data)
1659 {
1660 	int i, ret = 0;
1661 	u32 range;
1662 	u64 blkno;
1663 	struct buffer_head *bh = NULL;
1664 	struct ocfs2_extent_block *eb;
1665 	struct ocfs2_extent_list *el;
1666 	struct ocfs2_extent_rec *rec;
1667 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1668 
1669 	el = root_el;
1670 	while (el->l_tree_depth) {
1671 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1672 			ocfs2_error(inode->i_sb,
1673 				    "Inode %llu has empty extent list at "
1674 				    "depth %u\n",
1675 				    (unsigned long long)oi->ip_blkno,
1676 				    le16_to_cpu(el->l_tree_depth));
1677 			ret = -EROFS;
1678 			goto out;
1679 
1680 		}
1681 
1682 		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1683 			rec = &el->l_recs[i];
1684 
1685 			/*
1686 			 * In the case that cpos is off the allocation
1687 			 * tree, this should just wind up returning the
1688 			 * rightmost record.
1689 			 */
1690 			range = le32_to_cpu(rec->e_cpos) +
1691 				ocfs2_rec_clusters(el, rec);
1692 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1693 			    break;
1694 		}
1695 
1696 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1697 		if (blkno == 0) {
1698 			ocfs2_error(inode->i_sb,
1699 				    "Inode %llu has bad blkno in extent list "
1700 				    "at depth %u (index %d)\n",
1701 				    (unsigned long long)oi->ip_blkno,
1702 				    le16_to_cpu(el->l_tree_depth), i);
1703 			ret = -EROFS;
1704 			goto out;
1705 		}
1706 
1707 		brelse(bh);
1708 		bh = NULL;
1709 		ret = ocfs2_read_extent_block(inode, blkno, &bh);
1710 		if (ret) {
1711 			mlog_errno(ret);
1712 			goto out;
1713 		}
1714 
1715 		eb = (struct ocfs2_extent_block *) bh->b_data;
1716 		el = &eb->h_list;
1717 
1718 		if (le16_to_cpu(el->l_next_free_rec) >
1719 		    le16_to_cpu(el->l_count)) {
1720 			ocfs2_error(inode->i_sb,
1721 				    "Inode %llu has bad count in extent list "
1722 				    "at block %llu (next free=%u, count=%u)\n",
1723 				    (unsigned long long)oi->ip_blkno,
1724 				    (unsigned long long)bh->b_blocknr,
1725 				    le16_to_cpu(el->l_next_free_rec),
1726 				    le16_to_cpu(el->l_count));
1727 			ret = -EROFS;
1728 			goto out;
1729 		}
1730 
1731 		if (func)
1732 			func(data, bh);
1733 	}
1734 
1735 out:
1736 	/*
1737 	 * Catch any trailing bh that the loop didn't handle.
1738 	 */
1739 	brelse(bh);
1740 
1741 	return ret;
1742 }
1743 
1744 /*
1745  * Given an initialized path (that is, it has a valid root extent
1746  * list), this function will traverse the btree in search of the path
1747  * which would contain cpos.
1748  *
1749  * The path traveled is recorded in the path structure.
1750  *
1751  * Note that this will not do any comparisons on leaf node extent
1752  * records, so it will work fine in the case that we just added a tree
1753  * branch.
1754  */
1755 struct find_path_data {
1756 	int index;
1757 	struct ocfs2_path *path;
1758 };
1759 static void find_path_ins(void *data, struct buffer_head *bh)
1760 {
1761 	struct find_path_data *fp = data;
1762 
1763 	get_bh(bh);
1764 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1765 	fp->index++;
1766 }
1767 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1768 			   u32 cpos)
1769 {
1770 	struct find_path_data data;
1771 
1772 	data.index = 1;
1773 	data.path = path;
1774 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1775 				 find_path_ins, &data);
1776 }
1777 
1778 static void find_leaf_ins(void *data, struct buffer_head *bh)
1779 {
1780 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1781 	struct ocfs2_extent_list *el = &eb->h_list;
1782 	struct buffer_head **ret = data;
1783 
1784 	/* We want to retain only the leaf block. */
1785 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1786 		get_bh(bh);
1787 		*ret = bh;
1788 	}
1789 }
1790 /*
1791  * Find the leaf block in the tree which would contain cpos. No
1792  * checking of the actual leaf is done.
1793  *
1794  * Some paths want to call this instead of allocating a path structure
1795  * and calling ocfs2_find_path().
1796  *
1797  * This function doesn't handle non btree extent lists.
1798  */
1799 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1800 		    u32 cpos, struct buffer_head **leaf_bh)
1801 {
1802 	int ret;
1803 	struct buffer_head *bh = NULL;
1804 
1805 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1806 	if (ret) {
1807 		mlog_errno(ret);
1808 		goto out;
1809 	}
1810 
1811 	*leaf_bh = bh;
1812 out:
1813 	return ret;
1814 }
1815 
1816 /*
1817  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1818  *
1819  * Basically, we've moved stuff around at the bottom of the tree and
1820  * we need to fix up the extent records above the changes to reflect
1821  * the new changes.
1822  *
1823  * left_rec: the record on the left.
1824  * left_child_el: is the child list pointed to by left_rec
1825  * right_rec: the record to the right of left_rec
1826  * right_child_el: is the child list pointed to by right_rec
1827  *
1828  * By definition, this only works on interior nodes.
1829  */
1830 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1831 				  struct ocfs2_extent_list *left_child_el,
1832 				  struct ocfs2_extent_rec *right_rec,
1833 				  struct ocfs2_extent_list *right_child_el)
1834 {
1835 	u32 left_clusters, right_end;
1836 
1837 	/*
1838 	 * Interior nodes never have holes. Their cpos is the cpos of
1839 	 * the leftmost record in their child list. Their cluster
1840 	 * count covers the full theoretical range of their child list
1841 	 * - the range between their cpos and the cpos of the record
1842 	 * immediately to their right.
1843 	 */
1844 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1845 	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1846 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1847 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1848 	}
1849 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
1850 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1851 
1852 	/*
1853 	 * Calculate the rightmost cluster count boundary before
1854 	 * moving cpos - we will need to adjust clusters after
1855 	 * updating e_cpos to keep the same highest cluster count.
1856 	 */
1857 	right_end = le32_to_cpu(right_rec->e_cpos);
1858 	right_end += le32_to_cpu(right_rec->e_int_clusters);
1859 
1860 	right_rec->e_cpos = left_rec->e_cpos;
1861 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
1862 
1863 	right_end -= le32_to_cpu(right_rec->e_cpos);
1864 	right_rec->e_int_clusters = cpu_to_le32(right_end);
1865 }
1866 
1867 /*
1868  * Adjust the adjacent root node records involved in a
1869  * rotation. left_el_blkno is passed in as a key so that we can easily
1870  * find it's index in the root list.
1871  */
1872 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1873 				      struct ocfs2_extent_list *left_el,
1874 				      struct ocfs2_extent_list *right_el,
1875 				      u64 left_el_blkno)
1876 {
1877 	int i;
1878 
1879 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1880 	       le16_to_cpu(left_el->l_tree_depth));
1881 
1882 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1883 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1884 			break;
1885 	}
1886 
1887 	/*
1888 	 * The path walking code should have never returned a root and
1889 	 * two paths which are not adjacent.
1890 	 */
1891 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1892 
1893 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1894 				      &root_el->l_recs[i + 1], right_el);
1895 }
1896 
1897 /*
1898  * We've changed a leaf block (in right_path) and need to reflect that
1899  * change back up the subtree.
1900  *
1901  * This happens in multiple places:
1902  *   - When we've moved an extent record from the left path leaf to the right
1903  *     path leaf to make room for an empty extent in the left path leaf.
1904  *   - When our insert into the right path leaf is at the leftmost edge
1905  *     and requires an update of the path immediately to it's left. This
1906  *     can occur at the end of some types of rotation and appending inserts.
1907  *   - When we've adjusted the last extent record in the left path leaf and the
1908  *     1st extent record in the right path leaf during cross extent block merge.
1909  */
1910 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1911 				       struct ocfs2_path *left_path,
1912 				       struct ocfs2_path *right_path,
1913 				       int subtree_index)
1914 {
1915 	int ret, i, idx;
1916 	struct ocfs2_extent_list *el, *left_el, *right_el;
1917 	struct ocfs2_extent_rec *left_rec, *right_rec;
1918 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1919 
1920 	/*
1921 	 * Update the counts and position values within all the
1922 	 * interior nodes to reflect the leaf rotation we just did.
1923 	 *
1924 	 * The root node is handled below the loop.
1925 	 *
1926 	 * We begin the loop with right_el and left_el pointing to the
1927 	 * leaf lists and work our way up.
1928 	 *
1929 	 * NOTE: within this loop, left_el and right_el always refer
1930 	 * to the *child* lists.
1931 	 */
1932 	left_el = path_leaf_el(left_path);
1933 	right_el = path_leaf_el(right_path);
1934 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1935 		mlog(0, "Adjust records at index %u\n", i);
1936 
1937 		/*
1938 		 * One nice property of knowing that all of these
1939 		 * nodes are below the root is that we only deal with
1940 		 * the leftmost right node record and the rightmost
1941 		 * left node record.
1942 		 */
1943 		el = left_path->p_node[i].el;
1944 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1945 		left_rec = &el->l_recs[idx];
1946 
1947 		el = right_path->p_node[i].el;
1948 		right_rec = &el->l_recs[0];
1949 
1950 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1951 					      right_el);
1952 
1953 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1954 		if (ret)
1955 			mlog_errno(ret);
1956 
1957 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1958 		if (ret)
1959 			mlog_errno(ret);
1960 
1961 		/*
1962 		 * Setup our list pointers now so that the current
1963 		 * parents become children in the next iteration.
1964 		 */
1965 		left_el = left_path->p_node[i].el;
1966 		right_el = right_path->p_node[i].el;
1967 	}
1968 
1969 	/*
1970 	 * At the root node, adjust the two adjacent records which
1971 	 * begin our path to the leaves.
1972 	 */
1973 
1974 	el = left_path->p_node[subtree_index].el;
1975 	left_el = left_path->p_node[subtree_index + 1].el;
1976 	right_el = right_path->p_node[subtree_index + 1].el;
1977 
1978 	ocfs2_adjust_root_records(el, left_el, right_el,
1979 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
1980 
1981 	root_bh = left_path->p_node[subtree_index].bh;
1982 
1983 	ret = ocfs2_journal_dirty(handle, root_bh);
1984 	if (ret)
1985 		mlog_errno(ret);
1986 }
1987 
1988 static int ocfs2_rotate_subtree_right(struct inode *inode,
1989 				      handle_t *handle,
1990 				      struct ocfs2_path *left_path,
1991 				      struct ocfs2_path *right_path,
1992 				      int subtree_index)
1993 {
1994 	int ret, i;
1995 	struct buffer_head *right_leaf_bh;
1996 	struct buffer_head *left_leaf_bh = NULL;
1997 	struct buffer_head *root_bh;
1998 	struct ocfs2_extent_list *right_el, *left_el;
1999 	struct ocfs2_extent_rec move_rec;
2000 
2001 	left_leaf_bh = path_leaf_bh(left_path);
2002 	left_el = path_leaf_el(left_path);
2003 
2004 	if (left_el->l_next_free_rec != left_el->l_count) {
2005 		ocfs2_error(inode->i_sb,
2006 			    "Inode %llu has non-full interior leaf node %llu"
2007 			    "(next free = %u)",
2008 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
2009 			    (unsigned long long)left_leaf_bh->b_blocknr,
2010 			    le16_to_cpu(left_el->l_next_free_rec));
2011 		return -EROFS;
2012 	}
2013 
2014 	/*
2015 	 * This extent block may already have an empty record, so we
2016 	 * return early if so.
2017 	 */
2018 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2019 		return 0;
2020 
2021 	root_bh = left_path->p_node[subtree_index].bh;
2022 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2023 
2024 	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2025 					   subtree_index);
2026 	if (ret) {
2027 		mlog_errno(ret);
2028 		goto out;
2029 	}
2030 
2031 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2032 		ret = ocfs2_path_bh_journal_access(handle, inode,
2033 						   right_path, i);
2034 		if (ret) {
2035 			mlog_errno(ret);
2036 			goto out;
2037 		}
2038 
2039 		ret = ocfs2_path_bh_journal_access(handle, inode,
2040 						   left_path, i);
2041 		if (ret) {
2042 			mlog_errno(ret);
2043 			goto out;
2044 		}
2045 	}
2046 
2047 	right_leaf_bh = path_leaf_bh(right_path);
2048 	right_el = path_leaf_el(right_path);
2049 
2050 	/* This is a code error, not a disk corruption. */
2051 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2052 			"because rightmost leaf block %llu is empty\n",
2053 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
2054 			(unsigned long long)right_leaf_bh->b_blocknr);
2055 
2056 	ocfs2_create_empty_extent(right_el);
2057 
2058 	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
2059 	if (ret) {
2060 		mlog_errno(ret);
2061 		goto out;
2062 	}
2063 
2064 	/* Do the copy now. */
2065 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2066 	move_rec = left_el->l_recs[i];
2067 	right_el->l_recs[0] = move_rec;
2068 
2069 	/*
2070 	 * Clear out the record we just copied and shift everything
2071 	 * over, leaving an empty extent in the left leaf.
2072 	 *
2073 	 * We temporarily subtract from next_free_rec so that the
2074 	 * shift will lose the tail record (which is now defunct).
2075 	 */
2076 	le16_add_cpu(&left_el->l_next_free_rec, -1);
2077 	ocfs2_shift_records_right(left_el);
2078 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2079 	le16_add_cpu(&left_el->l_next_free_rec, 1);
2080 
2081 	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
2082 	if (ret) {
2083 		mlog_errno(ret);
2084 		goto out;
2085 	}
2086 
2087 	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2088 				subtree_index);
2089 
2090 out:
2091 	return ret;
2092 }
2093 
2094 /*
2095  * Given a full path, determine what cpos value would return us a path
2096  * containing the leaf immediately to the left of the current one.
2097  *
2098  * Will return zero if the path passed in is already the leftmost path.
2099  */
2100 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2101 					 struct ocfs2_path *path, u32 *cpos)
2102 {
2103 	int i, j, ret = 0;
2104 	u64 blkno;
2105 	struct ocfs2_extent_list *el;
2106 
2107 	BUG_ON(path->p_tree_depth == 0);
2108 
2109 	*cpos = 0;
2110 
2111 	blkno = path_leaf_bh(path)->b_blocknr;
2112 
2113 	/* Start at the tree node just above the leaf and work our way up. */
2114 	i = path->p_tree_depth - 1;
2115 	while (i >= 0) {
2116 		el = path->p_node[i].el;
2117 
2118 		/*
2119 		 * Find the extent record just before the one in our
2120 		 * path.
2121 		 */
2122 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2123 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2124 				if (j == 0) {
2125 					if (i == 0) {
2126 						/*
2127 						 * We've determined that the
2128 						 * path specified is already
2129 						 * the leftmost one - return a
2130 						 * cpos of zero.
2131 						 */
2132 						goto out;
2133 					}
2134 					/*
2135 					 * The leftmost record points to our
2136 					 * leaf - we need to travel up the
2137 					 * tree one level.
2138 					 */
2139 					goto next_node;
2140 				}
2141 
2142 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2143 				*cpos = *cpos + ocfs2_rec_clusters(el,
2144 							   &el->l_recs[j - 1]);
2145 				*cpos = *cpos - 1;
2146 				goto out;
2147 			}
2148 		}
2149 
2150 		/*
2151 		 * If we got here, we never found a valid node where
2152 		 * the tree indicated one should be.
2153 		 */
2154 		ocfs2_error(sb,
2155 			    "Invalid extent tree at extent block %llu\n",
2156 			    (unsigned long long)blkno);
2157 		ret = -EROFS;
2158 		goto out;
2159 
2160 next_node:
2161 		blkno = path->p_node[i].bh->b_blocknr;
2162 		i--;
2163 	}
2164 
2165 out:
2166 	return ret;
2167 }
2168 
2169 /*
2170  * Extend the transaction by enough credits to complete the rotation,
2171  * and still leave at least the original number of credits allocated
2172  * to this transaction.
2173  */
2174 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2175 					   int op_credits,
2176 					   struct ocfs2_path *path)
2177 {
2178 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2179 
2180 	if (handle->h_buffer_credits < credits)
2181 		return ocfs2_extend_trans(handle, credits);
2182 
2183 	return 0;
2184 }
2185 
2186 /*
2187  * Trap the case where we're inserting into the theoretical range past
2188  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2189  * whose cpos is less than ours into the right leaf.
2190  *
2191  * It's only necessary to look at the rightmost record of the left
2192  * leaf because the logic that calls us should ensure that the
2193  * theoretical ranges in the path components above the leaves are
2194  * correct.
2195  */
2196 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2197 						 u32 insert_cpos)
2198 {
2199 	struct ocfs2_extent_list *left_el;
2200 	struct ocfs2_extent_rec *rec;
2201 	int next_free;
2202 
2203 	left_el = path_leaf_el(left_path);
2204 	next_free = le16_to_cpu(left_el->l_next_free_rec);
2205 	rec = &left_el->l_recs[next_free - 1];
2206 
2207 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
2208 		return 1;
2209 	return 0;
2210 }
2211 
2212 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2213 {
2214 	int next_free = le16_to_cpu(el->l_next_free_rec);
2215 	unsigned int range;
2216 	struct ocfs2_extent_rec *rec;
2217 
2218 	if (next_free == 0)
2219 		return 0;
2220 
2221 	rec = &el->l_recs[0];
2222 	if (ocfs2_is_empty_extent(rec)) {
2223 		/* Empty list. */
2224 		if (next_free == 1)
2225 			return 0;
2226 		rec = &el->l_recs[1];
2227 	}
2228 
2229 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2230 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2231 		return 1;
2232 	return 0;
2233 }
2234 
2235 /*
2236  * Rotate all the records in a btree right one record, starting at insert_cpos.
2237  *
2238  * The path to the rightmost leaf should be passed in.
2239  *
2240  * The array is assumed to be large enough to hold an entire path (tree depth).
2241  *
2242  * Upon succesful return from this function:
2243  *
2244  * - The 'right_path' array will contain a path to the leaf block
2245  *   whose range contains e_cpos.
2246  * - That leaf block will have a single empty extent in list index 0.
2247  * - In the case that the rotation requires a post-insert update,
2248  *   *ret_left_path will contain a valid path which can be passed to
2249  *   ocfs2_insert_path().
2250  */
2251 static int ocfs2_rotate_tree_right(struct inode *inode,
2252 				   handle_t *handle,
2253 				   enum ocfs2_split_type split,
2254 				   u32 insert_cpos,
2255 				   struct ocfs2_path *right_path,
2256 				   struct ocfs2_path **ret_left_path)
2257 {
2258 	int ret, start, orig_credits = handle->h_buffer_credits;
2259 	u32 cpos;
2260 	struct ocfs2_path *left_path = NULL;
2261 
2262 	*ret_left_path = NULL;
2263 
2264 	left_path = ocfs2_new_path_from_path(right_path);
2265 	if (!left_path) {
2266 		ret = -ENOMEM;
2267 		mlog_errno(ret);
2268 		goto out;
2269 	}
2270 
2271 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2272 	if (ret) {
2273 		mlog_errno(ret);
2274 		goto out;
2275 	}
2276 
2277 	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2278 
2279 	/*
2280 	 * What we want to do here is:
2281 	 *
2282 	 * 1) Start with the rightmost path.
2283 	 *
2284 	 * 2) Determine a path to the leaf block directly to the left
2285 	 *    of that leaf.
2286 	 *
2287 	 * 3) Determine the 'subtree root' - the lowest level tree node
2288 	 *    which contains a path to both leaves.
2289 	 *
2290 	 * 4) Rotate the subtree.
2291 	 *
2292 	 * 5) Find the next subtree by considering the left path to be
2293 	 *    the new right path.
2294 	 *
2295 	 * The check at the top of this while loop also accepts
2296 	 * insert_cpos == cpos because cpos is only a _theoretical_
2297 	 * value to get us the left path - insert_cpos might very well
2298 	 * be filling that hole.
2299 	 *
2300 	 * Stop at a cpos of '0' because we either started at the
2301 	 * leftmost branch (i.e., a tree with one branch and a
2302 	 * rotation inside of it), or we've gone as far as we can in
2303 	 * rotating subtrees.
2304 	 */
2305 	while (cpos && insert_cpos <= cpos) {
2306 		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2307 		     insert_cpos, cpos);
2308 
2309 		ret = ocfs2_find_path(inode, left_path, cpos);
2310 		if (ret) {
2311 			mlog_errno(ret);
2312 			goto out;
2313 		}
2314 
2315 		mlog_bug_on_msg(path_leaf_bh(left_path) ==
2316 				path_leaf_bh(right_path),
2317 				"Inode %lu: error during insert of %u "
2318 				"(left path cpos %u) results in two identical "
2319 				"paths ending at %llu\n",
2320 				inode->i_ino, insert_cpos, cpos,
2321 				(unsigned long long)
2322 				path_leaf_bh(left_path)->b_blocknr);
2323 
2324 		if (split == SPLIT_NONE &&
2325 		    ocfs2_rotate_requires_path_adjustment(left_path,
2326 							  insert_cpos)) {
2327 
2328 			/*
2329 			 * We've rotated the tree as much as we
2330 			 * should. The rest is up to
2331 			 * ocfs2_insert_path() to complete, after the
2332 			 * record insertion. We indicate this
2333 			 * situation by returning the left path.
2334 			 *
2335 			 * The reason we don't adjust the records here
2336 			 * before the record insert is that an error
2337 			 * later might break the rule where a parent
2338 			 * record e_cpos will reflect the actual
2339 			 * e_cpos of the 1st nonempty record of the
2340 			 * child list.
2341 			 */
2342 			*ret_left_path = left_path;
2343 			goto out_ret_path;
2344 		}
2345 
2346 		start = ocfs2_find_subtree_root(inode, left_path, right_path);
2347 
2348 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2349 		     start,
2350 		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2351 		     right_path->p_tree_depth);
2352 
2353 		ret = ocfs2_extend_rotate_transaction(handle, start,
2354 						      orig_credits, right_path);
2355 		if (ret) {
2356 			mlog_errno(ret);
2357 			goto out;
2358 		}
2359 
2360 		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2361 						 right_path, start);
2362 		if (ret) {
2363 			mlog_errno(ret);
2364 			goto out;
2365 		}
2366 
2367 		if (split != SPLIT_NONE &&
2368 		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2369 						insert_cpos)) {
2370 			/*
2371 			 * A rotate moves the rightmost left leaf
2372 			 * record over to the leftmost right leaf
2373 			 * slot. If we're doing an extent split
2374 			 * instead of a real insert, then we have to
2375 			 * check that the extent to be split wasn't
2376 			 * just moved over. If it was, then we can
2377 			 * exit here, passing left_path back -
2378 			 * ocfs2_split_extent() is smart enough to
2379 			 * search both leaves.
2380 			 */
2381 			*ret_left_path = left_path;
2382 			goto out_ret_path;
2383 		}
2384 
2385 		/*
2386 		 * There is no need to re-read the next right path
2387 		 * as we know that it'll be our current left
2388 		 * path. Optimize by copying values instead.
2389 		 */
2390 		ocfs2_mv_path(right_path, left_path);
2391 
2392 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2393 						    &cpos);
2394 		if (ret) {
2395 			mlog_errno(ret);
2396 			goto out;
2397 		}
2398 	}
2399 
2400 out:
2401 	ocfs2_free_path(left_path);
2402 
2403 out_ret_path:
2404 	return ret;
2405 }
2406 
2407 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2408 				      struct ocfs2_path *path)
2409 {
2410 	int i, idx;
2411 	struct ocfs2_extent_rec *rec;
2412 	struct ocfs2_extent_list *el;
2413 	struct ocfs2_extent_block *eb;
2414 	u32 range;
2415 
2416 	/* Path should always be rightmost. */
2417 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2418 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2419 
2420 	el = &eb->h_list;
2421 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2422 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2423 	rec = &el->l_recs[idx];
2424 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2425 
2426 	for (i = 0; i < path->p_tree_depth; i++) {
2427 		el = path->p_node[i].el;
2428 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2429 		rec = &el->l_recs[idx];
2430 
2431 		rec->e_int_clusters = cpu_to_le32(range);
2432 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2433 
2434 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2435 	}
2436 }
2437 
2438 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2439 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
2440 			      struct ocfs2_path *path, int unlink_start)
2441 {
2442 	int ret, i;
2443 	struct ocfs2_extent_block *eb;
2444 	struct ocfs2_extent_list *el;
2445 	struct buffer_head *bh;
2446 
2447 	for(i = unlink_start; i < path_num_items(path); i++) {
2448 		bh = path->p_node[i].bh;
2449 
2450 		eb = (struct ocfs2_extent_block *)bh->b_data;
2451 		/*
2452 		 * Not all nodes might have had their final count
2453 		 * decremented by the caller - handle this here.
2454 		 */
2455 		el = &eb->h_list;
2456 		if (le16_to_cpu(el->l_next_free_rec) > 1) {
2457 			mlog(ML_ERROR,
2458 			     "Inode %llu, attempted to remove extent block "
2459 			     "%llu with %u records\n",
2460 			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2461 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
2462 			     le16_to_cpu(el->l_next_free_rec));
2463 
2464 			ocfs2_journal_dirty(handle, bh);
2465 			ocfs2_remove_from_cache(inode, bh);
2466 			continue;
2467 		}
2468 
2469 		el->l_next_free_rec = 0;
2470 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2471 
2472 		ocfs2_journal_dirty(handle, bh);
2473 
2474 		ret = ocfs2_cache_extent_block_free(dealloc, eb);
2475 		if (ret)
2476 			mlog_errno(ret);
2477 
2478 		ocfs2_remove_from_cache(inode, bh);
2479 	}
2480 }
2481 
2482 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2483 				 struct ocfs2_path *left_path,
2484 				 struct ocfs2_path *right_path,
2485 				 int subtree_index,
2486 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2487 {
2488 	int i;
2489 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2490 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2491 	struct ocfs2_extent_list *el;
2492 	struct ocfs2_extent_block *eb;
2493 
2494 	el = path_leaf_el(left_path);
2495 
2496 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2497 
2498 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2499 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2500 			break;
2501 
2502 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2503 
2504 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2505 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2506 
2507 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2508 	eb->h_next_leaf_blk = 0;
2509 
2510 	ocfs2_journal_dirty(handle, root_bh);
2511 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2512 
2513 	ocfs2_unlink_path(inode, handle, dealloc, right_path,
2514 			  subtree_index + 1);
2515 }
2516 
2517 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2518 				     struct ocfs2_path *left_path,
2519 				     struct ocfs2_path *right_path,
2520 				     int subtree_index,
2521 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
2522 				     int *deleted,
2523 				     struct ocfs2_extent_tree *et)
2524 {
2525 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
2526 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2527 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2528 	struct ocfs2_extent_block *eb;
2529 
2530 	*deleted = 0;
2531 
2532 	right_leaf_el = path_leaf_el(right_path);
2533 	left_leaf_el = path_leaf_el(left_path);
2534 	root_bh = left_path->p_node[subtree_index].bh;
2535 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2536 
2537 	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2538 		return 0;
2539 
2540 	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2541 	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2542 		/*
2543 		 * It's legal for us to proceed if the right leaf is
2544 		 * the rightmost one and it has an empty extent. There
2545 		 * are two cases to handle - whether the leaf will be
2546 		 * empty after removal or not. If the leaf isn't empty
2547 		 * then just remove the empty extent up front. The
2548 		 * next block will handle empty leaves by flagging
2549 		 * them for unlink.
2550 		 *
2551 		 * Non rightmost leaves will throw -EAGAIN and the
2552 		 * caller can manually move the subtree and retry.
2553 		 */
2554 
2555 		if (eb->h_next_leaf_blk != 0ULL)
2556 			return -EAGAIN;
2557 
2558 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2559 			ret = ocfs2_journal_access_eb(handle, inode,
2560 						      path_leaf_bh(right_path),
2561 						      OCFS2_JOURNAL_ACCESS_WRITE);
2562 			if (ret) {
2563 				mlog_errno(ret);
2564 				goto out;
2565 			}
2566 
2567 			ocfs2_remove_empty_extent(right_leaf_el);
2568 		} else
2569 			right_has_empty = 1;
2570 	}
2571 
2572 	if (eb->h_next_leaf_blk == 0ULL &&
2573 	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2574 		/*
2575 		 * We have to update i_last_eb_blk during the meta
2576 		 * data delete.
2577 		 */
2578 		ret = ocfs2_et_root_journal_access(handle, inode, et,
2579 						   OCFS2_JOURNAL_ACCESS_WRITE);
2580 		if (ret) {
2581 			mlog_errno(ret);
2582 			goto out;
2583 		}
2584 
2585 		del_right_subtree = 1;
2586 	}
2587 
2588 	/*
2589 	 * Getting here with an empty extent in the right path implies
2590 	 * that it's the rightmost path and will be deleted.
2591 	 */
2592 	BUG_ON(right_has_empty && !del_right_subtree);
2593 
2594 	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2595 					   subtree_index);
2596 	if (ret) {
2597 		mlog_errno(ret);
2598 		goto out;
2599 	}
2600 
2601 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2602 		ret = ocfs2_path_bh_journal_access(handle, inode,
2603 						   right_path, i);
2604 		if (ret) {
2605 			mlog_errno(ret);
2606 			goto out;
2607 		}
2608 
2609 		ret = ocfs2_path_bh_journal_access(handle, inode,
2610 						   left_path, i);
2611 		if (ret) {
2612 			mlog_errno(ret);
2613 			goto out;
2614 		}
2615 	}
2616 
2617 	if (!right_has_empty) {
2618 		/*
2619 		 * Only do this if we're moving a real
2620 		 * record. Otherwise, the action is delayed until
2621 		 * after removal of the right path in which case we
2622 		 * can do a simple shift to remove the empty extent.
2623 		 */
2624 		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2625 		memset(&right_leaf_el->l_recs[0], 0,
2626 		       sizeof(struct ocfs2_extent_rec));
2627 	}
2628 	if (eb->h_next_leaf_blk == 0ULL) {
2629 		/*
2630 		 * Move recs over to get rid of empty extent, decrease
2631 		 * next_free. This is allowed to remove the last
2632 		 * extent in our leaf (setting l_next_free_rec to
2633 		 * zero) - the delete code below won't care.
2634 		 */
2635 		ocfs2_remove_empty_extent(right_leaf_el);
2636 	}
2637 
2638 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2639 	if (ret)
2640 		mlog_errno(ret);
2641 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2642 	if (ret)
2643 		mlog_errno(ret);
2644 
2645 	if (del_right_subtree) {
2646 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2647 				     subtree_index, dealloc);
2648 		ocfs2_update_edge_lengths(inode, handle, left_path);
2649 
2650 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2651 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2652 
2653 		/*
2654 		 * Removal of the extent in the left leaf was skipped
2655 		 * above so we could delete the right path
2656 		 * 1st.
2657 		 */
2658 		if (right_has_empty)
2659 			ocfs2_remove_empty_extent(left_leaf_el);
2660 
2661 		ret = ocfs2_journal_dirty(handle, et_root_bh);
2662 		if (ret)
2663 			mlog_errno(ret);
2664 
2665 		*deleted = 1;
2666 	} else
2667 		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2668 					   subtree_index);
2669 
2670 out:
2671 	return ret;
2672 }
2673 
2674 /*
2675  * Given a full path, determine what cpos value would return us a path
2676  * containing the leaf immediately to the right of the current one.
2677  *
2678  * Will return zero if the path passed in is already the rightmost path.
2679  *
2680  * This looks similar, but is subtly different to
2681  * ocfs2_find_cpos_for_left_leaf().
2682  */
2683 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2684 					  struct ocfs2_path *path, u32 *cpos)
2685 {
2686 	int i, j, ret = 0;
2687 	u64 blkno;
2688 	struct ocfs2_extent_list *el;
2689 
2690 	*cpos = 0;
2691 
2692 	if (path->p_tree_depth == 0)
2693 		return 0;
2694 
2695 	blkno = path_leaf_bh(path)->b_blocknr;
2696 
2697 	/* Start at the tree node just above the leaf and work our way up. */
2698 	i = path->p_tree_depth - 1;
2699 	while (i >= 0) {
2700 		int next_free;
2701 
2702 		el = path->p_node[i].el;
2703 
2704 		/*
2705 		 * Find the extent record just after the one in our
2706 		 * path.
2707 		 */
2708 		next_free = le16_to_cpu(el->l_next_free_rec);
2709 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2710 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2711 				if (j == (next_free - 1)) {
2712 					if (i == 0) {
2713 						/*
2714 						 * We've determined that the
2715 						 * path specified is already
2716 						 * the rightmost one - return a
2717 						 * cpos of zero.
2718 						 */
2719 						goto out;
2720 					}
2721 					/*
2722 					 * The rightmost record points to our
2723 					 * leaf - we need to travel up the
2724 					 * tree one level.
2725 					 */
2726 					goto next_node;
2727 				}
2728 
2729 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2730 				goto out;
2731 			}
2732 		}
2733 
2734 		/*
2735 		 * If we got here, we never found a valid node where
2736 		 * the tree indicated one should be.
2737 		 */
2738 		ocfs2_error(sb,
2739 			    "Invalid extent tree at extent block %llu\n",
2740 			    (unsigned long long)blkno);
2741 		ret = -EROFS;
2742 		goto out;
2743 
2744 next_node:
2745 		blkno = path->p_node[i].bh->b_blocknr;
2746 		i--;
2747 	}
2748 
2749 out:
2750 	return ret;
2751 }
2752 
2753 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2754 					    handle_t *handle,
2755 					    struct ocfs2_path *path)
2756 {
2757 	int ret;
2758 	struct buffer_head *bh = path_leaf_bh(path);
2759 	struct ocfs2_extent_list *el = path_leaf_el(path);
2760 
2761 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2762 		return 0;
2763 
2764 	ret = ocfs2_path_bh_journal_access(handle, inode, path,
2765 					   path_num_items(path) - 1);
2766 	if (ret) {
2767 		mlog_errno(ret);
2768 		goto out;
2769 	}
2770 
2771 	ocfs2_remove_empty_extent(el);
2772 
2773 	ret = ocfs2_journal_dirty(handle, bh);
2774 	if (ret)
2775 		mlog_errno(ret);
2776 
2777 out:
2778 	return ret;
2779 }
2780 
2781 static int __ocfs2_rotate_tree_left(struct inode *inode,
2782 				    handle_t *handle, int orig_credits,
2783 				    struct ocfs2_path *path,
2784 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
2785 				    struct ocfs2_path **empty_extent_path,
2786 				    struct ocfs2_extent_tree *et)
2787 {
2788 	int ret, subtree_root, deleted;
2789 	u32 right_cpos;
2790 	struct ocfs2_path *left_path = NULL;
2791 	struct ocfs2_path *right_path = NULL;
2792 
2793 	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2794 
2795 	*empty_extent_path = NULL;
2796 
2797 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2798 					     &right_cpos);
2799 	if (ret) {
2800 		mlog_errno(ret);
2801 		goto out;
2802 	}
2803 
2804 	left_path = ocfs2_new_path_from_path(path);
2805 	if (!left_path) {
2806 		ret = -ENOMEM;
2807 		mlog_errno(ret);
2808 		goto out;
2809 	}
2810 
2811 	ocfs2_cp_path(left_path, path);
2812 
2813 	right_path = ocfs2_new_path_from_path(path);
2814 	if (!right_path) {
2815 		ret = -ENOMEM;
2816 		mlog_errno(ret);
2817 		goto out;
2818 	}
2819 
2820 	while (right_cpos) {
2821 		ret = ocfs2_find_path(inode, right_path, right_cpos);
2822 		if (ret) {
2823 			mlog_errno(ret);
2824 			goto out;
2825 		}
2826 
2827 		subtree_root = ocfs2_find_subtree_root(inode, left_path,
2828 						       right_path);
2829 
2830 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2831 		     subtree_root,
2832 		     (unsigned long long)
2833 		     right_path->p_node[subtree_root].bh->b_blocknr,
2834 		     right_path->p_tree_depth);
2835 
2836 		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2837 						      orig_credits, left_path);
2838 		if (ret) {
2839 			mlog_errno(ret);
2840 			goto out;
2841 		}
2842 
2843 		/*
2844 		 * Caller might still want to make changes to the
2845 		 * tree root, so re-add it to the journal here.
2846 		 */
2847 		ret = ocfs2_path_bh_journal_access(handle, inode,
2848 						   left_path, 0);
2849 		if (ret) {
2850 			mlog_errno(ret);
2851 			goto out;
2852 		}
2853 
2854 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2855 						right_path, subtree_root,
2856 						dealloc, &deleted, et);
2857 		if (ret == -EAGAIN) {
2858 			/*
2859 			 * The rotation has to temporarily stop due to
2860 			 * the right subtree having an empty
2861 			 * extent. Pass it back to the caller for a
2862 			 * fixup.
2863 			 */
2864 			*empty_extent_path = right_path;
2865 			right_path = NULL;
2866 			goto out;
2867 		}
2868 		if (ret) {
2869 			mlog_errno(ret);
2870 			goto out;
2871 		}
2872 
2873 		/*
2874 		 * The subtree rotate might have removed records on
2875 		 * the rightmost edge. If so, then rotation is
2876 		 * complete.
2877 		 */
2878 		if (deleted)
2879 			break;
2880 
2881 		ocfs2_mv_path(left_path, right_path);
2882 
2883 		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2884 						     &right_cpos);
2885 		if (ret) {
2886 			mlog_errno(ret);
2887 			goto out;
2888 		}
2889 	}
2890 
2891 out:
2892 	ocfs2_free_path(right_path);
2893 	ocfs2_free_path(left_path);
2894 
2895 	return ret;
2896 }
2897 
2898 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2899 				struct ocfs2_path *path,
2900 				struct ocfs2_cached_dealloc_ctxt *dealloc,
2901 				struct ocfs2_extent_tree *et)
2902 {
2903 	int ret, subtree_index;
2904 	u32 cpos;
2905 	struct ocfs2_path *left_path = NULL;
2906 	struct ocfs2_extent_block *eb;
2907 	struct ocfs2_extent_list *el;
2908 
2909 
2910 	ret = ocfs2_et_sanity_check(inode, et);
2911 	if (ret)
2912 		goto out;
2913 	/*
2914 	 * There's two ways we handle this depending on
2915 	 * whether path is the only existing one.
2916 	 */
2917 	ret = ocfs2_extend_rotate_transaction(handle, 0,
2918 					      handle->h_buffer_credits,
2919 					      path);
2920 	if (ret) {
2921 		mlog_errno(ret);
2922 		goto out;
2923 	}
2924 
2925 	ret = ocfs2_journal_access_path(inode, handle, path);
2926 	if (ret) {
2927 		mlog_errno(ret);
2928 		goto out;
2929 	}
2930 
2931 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2932 	if (ret) {
2933 		mlog_errno(ret);
2934 		goto out;
2935 	}
2936 
2937 	if (cpos) {
2938 		/*
2939 		 * We have a path to the left of this one - it needs
2940 		 * an update too.
2941 		 */
2942 		left_path = ocfs2_new_path_from_path(path);
2943 		if (!left_path) {
2944 			ret = -ENOMEM;
2945 			mlog_errno(ret);
2946 			goto out;
2947 		}
2948 
2949 		ret = ocfs2_find_path(inode, left_path, cpos);
2950 		if (ret) {
2951 			mlog_errno(ret);
2952 			goto out;
2953 		}
2954 
2955 		ret = ocfs2_journal_access_path(inode, handle, left_path);
2956 		if (ret) {
2957 			mlog_errno(ret);
2958 			goto out;
2959 		}
2960 
2961 		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2962 
2963 		ocfs2_unlink_subtree(inode, handle, left_path, path,
2964 				     subtree_index, dealloc);
2965 		ocfs2_update_edge_lengths(inode, handle, left_path);
2966 
2967 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2968 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2969 	} else {
2970 		/*
2971 		 * 'path' is also the leftmost path which
2972 		 * means it must be the only one. This gets
2973 		 * handled differently because we want to
2974 		 * revert the inode back to having extents
2975 		 * in-line.
2976 		 */
2977 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2978 
2979 		el = et->et_root_el;
2980 		el->l_tree_depth = 0;
2981 		el->l_next_free_rec = 0;
2982 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2983 
2984 		ocfs2_et_set_last_eb_blk(et, 0);
2985 	}
2986 
2987 	ocfs2_journal_dirty(handle, path_root_bh(path));
2988 
2989 out:
2990 	ocfs2_free_path(left_path);
2991 	return ret;
2992 }
2993 
2994 /*
2995  * Left rotation of btree records.
2996  *
2997  * In many ways, this is (unsurprisingly) the opposite of right
2998  * rotation. We start at some non-rightmost path containing an empty
2999  * extent in the leaf block. The code works its way to the rightmost
3000  * path by rotating records to the left in every subtree.
3001  *
3002  * This is used by any code which reduces the number of extent records
3003  * in a leaf. After removal, an empty record should be placed in the
3004  * leftmost list position.
3005  *
3006  * This won't handle a length update of the rightmost path records if
3007  * the rightmost tree leaf record is removed so the caller is
3008  * responsible for detecting and correcting that.
3009  */
3010 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
3011 				  struct ocfs2_path *path,
3012 				  struct ocfs2_cached_dealloc_ctxt *dealloc,
3013 				  struct ocfs2_extent_tree *et)
3014 {
3015 	int ret, orig_credits = handle->h_buffer_credits;
3016 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3017 	struct ocfs2_extent_block *eb;
3018 	struct ocfs2_extent_list *el;
3019 
3020 	el = path_leaf_el(path);
3021 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3022 		return 0;
3023 
3024 	if (path->p_tree_depth == 0) {
3025 rightmost_no_delete:
3026 		/*
3027 		 * Inline extents. This is trivially handled, so do
3028 		 * it up front.
3029 		 */
3030 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
3031 						       path);
3032 		if (ret)
3033 			mlog_errno(ret);
3034 		goto out;
3035 	}
3036 
3037 	/*
3038 	 * Handle rightmost branch now. There's several cases:
3039 	 *  1) simple rotation leaving records in there. That's trivial.
3040 	 *  2) rotation requiring a branch delete - there's no more
3041 	 *     records left. Two cases of this:
3042 	 *     a) There are branches to the left.
3043 	 *     b) This is also the leftmost (the only) branch.
3044 	 *
3045 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3046 	 *  2a) we need the left branch so that we can update it with the unlink
3047 	 *  2b) we need to bring the inode back to inline extents.
3048 	 */
3049 
3050 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3051 	el = &eb->h_list;
3052 	if (eb->h_next_leaf_blk == 0) {
3053 		/*
3054 		 * This gets a bit tricky if we're going to delete the
3055 		 * rightmost path. Get the other cases out of the way
3056 		 * 1st.
3057 		 */
3058 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3059 			goto rightmost_no_delete;
3060 
3061 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3062 			ret = -EIO;
3063 			ocfs2_error(inode->i_sb,
3064 				    "Inode %llu has empty extent block at %llu",
3065 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
3066 				    (unsigned long long)le64_to_cpu(eb->h_blkno));
3067 			goto out;
3068 		}
3069 
3070 		/*
3071 		 * XXX: The caller can not trust "path" any more after
3072 		 * this as it will have been deleted. What do we do?
3073 		 *
3074 		 * In theory the rotate-for-merge code will never get
3075 		 * here because it'll always ask for a rotate in a
3076 		 * nonempty list.
3077 		 */
3078 
3079 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
3080 						  dealloc, et);
3081 		if (ret)
3082 			mlog_errno(ret);
3083 		goto out;
3084 	}
3085 
3086 	/*
3087 	 * Now we can loop, remembering the path we get from -EAGAIN
3088 	 * and restarting from there.
3089 	 */
3090 try_rotate:
3091 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
3092 				       dealloc, &restart_path, et);
3093 	if (ret && ret != -EAGAIN) {
3094 		mlog_errno(ret);
3095 		goto out;
3096 	}
3097 
3098 	while (ret == -EAGAIN) {
3099 		tmp_path = restart_path;
3100 		restart_path = NULL;
3101 
3102 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
3103 					       tmp_path, dealloc,
3104 					       &restart_path, et);
3105 		if (ret && ret != -EAGAIN) {
3106 			mlog_errno(ret);
3107 			goto out;
3108 		}
3109 
3110 		ocfs2_free_path(tmp_path);
3111 		tmp_path = NULL;
3112 
3113 		if (ret == 0)
3114 			goto try_rotate;
3115 	}
3116 
3117 out:
3118 	ocfs2_free_path(tmp_path);
3119 	ocfs2_free_path(restart_path);
3120 	return ret;
3121 }
3122 
3123 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3124 				int index)
3125 {
3126 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
3127 	unsigned int size;
3128 
3129 	if (rec->e_leaf_clusters == 0) {
3130 		/*
3131 		 * We consumed all of the merged-from record. An empty
3132 		 * extent cannot exist anywhere but the 1st array
3133 		 * position, so move things over if the merged-from
3134 		 * record doesn't occupy that position.
3135 		 *
3136 		 * This creates a new empty extent so the caller
3137 		 * should be smart enough to have removed any existing
3138 		 * ones.
3139 		 */
3140 		if (index > 0) {
3141 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3142 			size = index * sizeof(struct ocfs2_extent_rec);
3143 			memmove(&el->l_recs[1], &el->l_recs[0], size);
3144 		}
3145 
3146 		/*
3147 		 * Always memset - the caller doesn't check whether it
3148 		 * created an empty extent, so there could be junk in
3149 		 * the other fields.
3150 		 */
3151 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3152 	}
3153 }
3154 
3155 static int ocfs2_get_right_path(struct inode *inode,
3156 				struct ocfs2_path *left_path,
3157 				struct ocfs2_path **ret_right_path)
3158 {
3159 	int ret;
3160 	u32 right_cpos;
3161 	struct ocfs2_path *right_path = NULL;
3162 	struct ocfs2_extent_list *left_el;
3163 
3164 	*ret_right_path = NULL;
3165 
3166 	/* This function shouldn't be called for non-trees. */
3167 	BUG_ON(left_path->p_tree_depth == 0);
3168 
3169 	left_el = path_leaf_el(left_path);
3170 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3171 
3172 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3173 					     &right_cpos);
3174 	if (ret) {
3175 		mlog_errno(ret);
3176 		goto out;
3177 	}
3178 
3179 	/* This function shouldn't be called for the rightmost leaf. */
3180 	BUG_ON(right_cpos == 0);
3181 
3182 	right_path = ocfs2_new_path_from_path(left_path);
3183 	if (!right_path) {
3184 		ret = -ENOMEM;
3185 		mlog_errno(ret);
3186 		goto out;
3187 	}
3188 
3189 	ret = ocfs2_find_path(inode, right_path, right_cpos);
3190 	if (ret) {
3191 		mlog_errno(ret);
3192 		goto out;
3193 	}
3194 
3195 	*ret_right_path = right_path;
3196 out:
3197 	if (ret)
3198 		ocfs2_free_path(right_path);
3199 	return ret;
3200 }
3201 
3202 /*
3203  * Remove split_rec clusters from the record at index and merge them
3204  * onto the beginning of the record "next" to it.
3205  * For index < l_count - 1, the next means the extent rec at index + 1.
3206  * For index == l_count - 1, the "next" means the 1st extent rec of the
3207  * next extent block.
3208  */
3209 static int ocfs2_merge_rec_right(struct inode *inode,
3210 				 struct ocfs2_path *left_path,
3211 				 handle_t *handle,
3212 				 struct ocfs2_extent_rec *split_rec,
3213 				 int index)
3214 {
3215 	int ret, next_free, i;
3216 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3217 	struct ocfs2_extent_rec *left_rec;
3218 	struct ocfs2_extent_rec *right_rec;
3219 	struct ocfs2_extent_list *right_el;
3220 	struct ocfs2_path *right_path = NULL;
3221 	int subtree_index = 0;
3222 	struct ocfs2_extent_list *el = path_leaf_el(left_path);
3223 	struct buffer_head *bh = path_leaf_bh(left_path);
3224 	struct buffer_head *root_bh = NULL;
3225 
3226 	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3227 	left_rec = &el->l_recs[index];
3228 
3229 	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3230 	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3231 		/* we meet with a cross extent block merge. */
3232 		ret = ocfs2_get_right_path(inode, left_path, &right_path);
3233 		if (ret) {
3234 			mlog_errno(ret);
3235 			goto out;
3236 		}
3237 
3238 		right_el = path_leaf_el(right_path);
3239 		next_free = le16_to_cpu(right_el->l_next_free_rec);
3240 		BUG_ON(next_free <= 0);
3241 		right_rec = &right_el->l_recs[0];
3242 		if (ocfs2_is_empty_extent(right_rec)) {
3243 			BUG_ON(next_free <= 1);
3244 			right_rec = &right_el->l_recs[1];
3245 		}
3246 
3247 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3248 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3249 		       le32_to_cpu(right_rec->e_cpos));
3250 
3251 		subtree_index = ocfs2_find_subtree_root(inode,
3252 							left_path, right_path);
3253 
3254 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3255 						      handle->h_buffer_credits,
3256 						      right_path);
3257 		if (ret) {
3258 			mlog_errno(ret);
3259 			goto out;
3260 		}
3261 
3262 		root_bh = left_path->p_node[subtree_index].bh;
3263 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3264 
3265 		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3266 						   subtree_index);
3267 		if (ret) {
3268 			mlog_errno(ret);
3269 			goto out;
3270 		}
3271 
3272 		for (i = subtree_index + 1;
3273 		     i < path_num_items(right_path); i++) {
3274 			ret = ocfs2_path_bh_journal_access(handle, inode,
3275 							   right_path, i);
3276 			if (ret) {
3277 				mlog_errno(ret);
3278 				goto out;
3279 			}
3280 
3281 			ret = ocfs2_path_bh_journal_access(handle, inode,
3282 							   left_path, i);
3283 			if (ret) {
3284 				mlog_errno(ret);
3285 				goto out;
3286 			}
3287 		}
3288 
3289 	} else {
3290 		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3291 		right_rec = &el->l_recs[index + 1];
3292 	}
3293 
3294 	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3295 					   path_num_items(left_path) - 1);
3296 	if (ret) {
3297 		mlog_errno(ret);
3298 		goto out;
3299 	}
3300 
3301 	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3302 
3303 	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3304 	le64_add_cpu(&right_rec->e_blkno,
3305 		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3306 	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3307 
3308 	ocfs2_cleanup_merge(el, index);
3309 
3310 	ret = ocfs2_journal_dirty(handle, bh);
3311 	if (ret)
3312 		mlog_errno(ret);
3313 
3314 	if (right_path) {
3315 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3316 		if (ret)
3317 			mlog_errno(ret);
3318 
3319 		ocfs2_complete_edge_insert(inode, handle, left_path,
3320 					   right_path, subtree_index);
3321 	}
3322 out:
3323 	if (right_path)
3324 		ocfs2_free_path(right_path);
3325 	return ret;
3326 }
3327 
3328 static int ocfs2_get_left_path(struct inode *inode,
3329 			       struct ocfs2_path *right_path,
3330 			       struct ocfs2_path **ret_left_path)
3331 {
3332 	int ret;
3333 	u32 left_cpos;
3334 	struct ocfs2_path *left_path = NULL;
3335 
3336 	*ret_left_path = NULL;
3337 
3338 	/* This function shouldn't be called for non-trees. */
3339 	BUG_ON(right_path->p_tree_depth == 0);
3340 
3341 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3342 					    right_path, &left_cpos);
3343 	if (ret) {
3344 		mlog_errno(ret);
3345 		goto out;
3346 	}
3347 
3348 	/* This function shouldn't be called for the leftmost leaf. */
3349 	BUG_ON(left_cpos == 0);
3350 
3351 	left_path = ocfs2_new_path_from_path(right_path);
3352 	if (!left_path) {
3353 		ret = -ENOMEM;
3354 		mlog_errno(ret);
3355 		goto out;
3356 	}
3357 
3358 	ret = ocfs2_find_path(inode, left_path, left_cpos);
3359 	if (ret) {
3360 		mlog_errno(ret);
3361 		goto out;
3362 	}
3363 
3364 	*ret_left_path = left_path;
3365 out:
3366 	if (ret)
3367 		ocfs2_free_path(left_path);
3368 	return ret;
3369 }
3370 
3371 /*
3372  * Remove split_rec clusters from the record at index and merge them
3373  * onto the tail of the record "before" it.
3374  * For index > 0, the "before" means the extent rec at index - 1.
3375  *
3376  * For index == 0, the "before" means the last record of the previous
3377  * extent block. And there is also a situation that we may need to
3378  * remove the rightmost leaf extent block in the right_path and change
3379  * the right path to indicate the new rightmost path.
3380  */
3381 static int ocfs2_merge_rec_left(struct inode *inode,
3382 				struct ocfs2_path *right_path,
3383 				handle_t *handle,
3384 				struct ocfs2_extent_rec *split_rec,
3385 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3386 				struct ocfs2_extent_tree *et,
3387 				int index)
3388 {
3389 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3390 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3391 	struct ocfs2_extent_rec *left_rec;
3392 	struct ocfs2_extent_rec *right_rec;
3393 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3394 	struct buffer_head *bh = path_leaf_bh(right_path);
3395 	struct buffer_head *root_bh = NULL;
3396 	struct ocfs2_path *left_path = NULL;
3397 	struct ocfs2_extent_list *left_el;
3398 
3399 	BUG_ON(index < 0);
3400 
3401 	right_rec = &el->l_recs[index];
3402 	if (index == 0) {
3403 		/* we meet with a cross extent block merge. */
3404 		ret = ocfs2_get_left_path(inode, right_path, &left_path);
3405 		if (ret) {
3406 			mlog_errno(ret);
3407 			goto out;
3408 		}
3409 
3410 		left_el = path_leaf_el(left_path);
3411 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3412 		       le16_to_cpu(left_el->l_count));
3413 
3414 		left_rec = &left_el->l_recs[
3415 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3416 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3417 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3418 		       le32_to_cpu(split_rec->e_cpos));
3419 
3420 		subtree_index = ocfs2_find_subtree_root(inode,
3421 							left_path, right_path);
3422 
3423 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3424 						      handle->h_buffer_credits,
3425 						      left_path);
3426 		if (ret) {
3427 			mlog_errno(ret);
3428 			goto out;
3429 		}
3430 
3431 		root_bh = left_path->p_node[subtree_index].bh;
3432 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3433 
3434 		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3435 						   subtree_index);
3436 		if (ret) {
3437 			mlog_errno(ret);
3438 			goto out;
3439 		}
3440 
3441 		for (i = subtree_index + 1;
3442 		     i < path_num_items(right_path); i++) {
3443 			ret = ocfs2_path_bh_journal_access(handle, inode,
3444 							   right_path, i);
3445 			if (ret) {
3446 				mlog_errno(ret);
3447 				goto out;
3448 			}
3449 
3450 			ret = ocfs2_path_bh_journal_access(handle, inode,
3451 							   left_path, i);
3452 			if (ret) {
3453 				mlog_errno(ret);
3454 				goto out;
3455 			}
3456 		}
3457 	} else {
3458 		left_rec = &el->l_recs[index - 1];
3459 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3460 			has_empty_extent = 1;
3461 	}
3462 
3463 	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3464 					   path_num_items(right_path) - 1);
3465 	if (ret) {
3466 		mlog_errno(ret);
3467 		goto out;
3468 	}
3469 
3470 	if (has_empty_extent && index == 1) {
3471 		/*
3472 		 * The easy case - we can just plop the record right in.
3473 		 */
3474 		*left_rec = *split_rec;
3475 
3476 		has_empty_extent = 0;
3477 	} else
3478 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3479 
3480 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3481 	le64_add_cpu(&right_rec->e_blkno,
3482 		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3483 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3484 
3485 	ocfs2_cleanup_merge(el, index);
3486 
3487 	ret = ocfs2_journal_dirty(handle, bh);
3488 	if (ret)
3489 		mlog_errno(ret);
3490 
3491 	if (left_path) {
3492 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3493 		if (ret)
3494 			mlog_errno(ret);
3495 
3496 		/*
3497 		 * In the situation that the right_rec is empty and the extent
3498 		 * block is empty also,  ocfs2_complete_edge_insert can't handle
3499 		 * it and we need to delete the right extent block.
3500 		 */
3501 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3502 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3503 
3504 			ret = ocfs2_remove_rightmost_path(inode, handle,
3505 							  right_path,
3506 							  dealloc, et);
3507 			if (ret) {
3508 				mlog_errno(ret);
3509 				goto out;
3510 			}
3511 
3512 			/* Now the rightmost extent block has been deleted.
3513 			 * So we use the new rightmost path.
3514 			 */
3515 			ocfs2_mv_path(right_path, left_path);
3516 			left_path = NULL;
3517 		} else
3518 			ocfs2_complete_edge_insert(inode, handle, left_path,
3519 						   right_path, subtree_index);
3520 	}
3521 out:
3522 	if (left_path)
3523 		ocfs2_free_path(left_path);
3524 	return ret;
3525 }
3526 
/*
 * Merge split_rec into the record(s) it is contiguous with, as
 * described by ctxt->c_contig_type (which must not be CONTIG_NONE).
 *
 * split_index is the slot of the record being split in the leaf of
 * path. Empty extents that the merge leaves behind are rotated out of
 * the leaf; a failure of that final clean-up rotation is logged but
 * not reported to the caller.
 *
 * Returns 0 or a negative errno.
 */
static int ocfs2_try_to_merge_extent(struct inode *inode,
				     handle_t *handle,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt,
				     struct ocfs2_extent_tree *et)
{
	int status;
	struct ocfs2_extent_list *leaf_el = path_leaf_el(path);

	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/*
		 * Merging a fully covered record creates an empty
		 * extent in its place. A leaf may hold only one, so
		 * get rid of the existing one first. That shifts our
		 * record one slot to the left.
		 */
		status = ocfs2_rotate_tree_left(inode, handle, path,
						dealloc, et);
		if (status) {
			mlog_errno(status);
			return status;
		}
		split_index--;
	}

	if (ctxt->c_contig_type != CONTIG_LEFTRIGHT) {
		/*
		 * One-sided merge. The contig type is relative to the
		 * existing neighbour: "right contig" means we sit to
		 * the right of a record, so we merge leftwards into
		 * it, and the other way round.
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT)
			status = ocfs2_merge_rec_left(inode, path,
						      handle, split_rec,
						      dealloc, et,
						      split_index);
		else
			status = ocfs2_merge_rec_right(inode, path,
						       handle, split_rec,
						       split_index);
		if (status) {
			mlog_errno(status);
			return status;
		}

		if (ctxt->c_split_covers_rec) {
			/*
			 * An empty extent may be left in the leaf now.
			 * Try to rotate it away, but don't fail the
			 * merge if that doesn't work.
			 */
			status = ocfs2_rotate_tree_left(inode, handle, path,
							dealloc, et);
			if (status)
				mlog_errno(status);
		}
		return 0;
	}

	/* Contiguous on both sides, which implies full coverage. */
	BUG_ON(!ctxt->c_split_covers_rec);

	/*
	 * Merge to the right first and to the left second; this is the
	 * simpler and cheaper order when split_index is zero and the
	 * left merge has to reach into the previous extent block.
	 *
	 * The right merge consumes the whole record and so adds an
	 * empty extent to the leaf. Since that pushes everything back
	 * to the right, split_index needs no adjustment.
	 */
	status = ocfs2_merge_rec_right(inode, path, handle, split_rec,
				       split_index);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* Anything else would be a logic error in the code above. */
	BUG_ON(!ocfs2_is_empty_extent(&leaf_el->l_recs[0]));

	/* Drop the empty extent the right merge produced. */
	status = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/*
	 * Deliberately pass the in-leaf record rather than split_rec:
	 * split_rec has already been folded into it.
	 */
	status = ocfs2_merge_rec_left(inode, path, handle,
				      &leaf_el->l_recs[split_index],
				      dealloc, et, split_index);
	if (status) {
		mlog_errno(status);
		return status;
	}

	/* Final clean-up rotation; not critical, so only log failures. */
	status = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
	if (status)
		mlog_errno(status);

	return 0;
}
3672 
3673 static void ocfs2_subtract_from_rec(struct super_block *sb,
3674 				    enum ocfs2_split_type split,
3675 				    struct ocfs2_extent_rec *rec,
3676 				    struct ocfs2_extent_rec *split_rec)
3677 {
3678 	u64 len_blocks;
3679 
3680 	len_blocks = ocfs2_clusters_to_blocks(sb,
3681 				le16_to_cpu(split_rec->e_leaf_clusters));
3682 
3683 	if (split == SPLIT_LEFT) {
3684 		/*
3685 		 * Region is on the left edge of the existing
3686 		 * record.
3687 		 */
3688 		le32_add_cpu(&rec->e_cpos,
3689 			     le16_to_cpu(split_rec->e_leaf_clusters));
3690 		le64_add_cpu(&rec->e_blkno, len_blocks);
3691 		le16_add_cpu(&rec->e_leaf_clusters,
3692 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3693 	} else {
3694 		/*
3695 		 * Region is on the right edge of the existing
3696 		 * record.
3697 		 */
3698 		le16_add_cpu(&rec->e_leaf_clusters,
3699 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3700 	}
3701 }
3702 
3703 /*
3704  * Do the final bits of extent record insertion at the target leaf
3705  * list. If this leaf is part of an allocation tree, it is assumed
3706  * that the tree above has been prepared.
3707  */
3708 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3709 				 struct ocfs2_extent_list *el,
3710 				 struct ocfs2_insert_type *insert,
3711 				 struct inode *inode)
3712 {
3713 	int i = insert->ins_contig_index;
3714 	unsigned int range;
3715 	struct ocfs2_extent_rec *rec;
3716 
3717 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3718 
3719 	if (insert->ins_split != SPLIT_NONE) {
3720 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3721 		BUG_ON(i == -1);
3722 		rec = &el->l_recs[i];
3723 		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3724 					insert_rec);
3725 		goto rotate;
3726 	}
3727 
3728 	/*
3729 	 * Contiguous insert - either left or right.
3730 	 */
3731 	if (insert->ins_contig != CONTIG_NONE) {
3732 		rec = &el->l_recs[i];
3733 		if (insert->ins_contig == CONTIG_LEFT) {
3734 			rec->e_blkno = insert_rec->e_blkno;
3735 			rec->e_cpos = insert_rec->e_cpos;
3736 		}
3737 		le16_add_cpu(&rec->e_leaf_clusters,
3738 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3739 		return;
3740 	}
3741 
3742 	/*
3743 	 * Handle insert into an empty leaf.
3744 	 */
3745 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3746 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3747 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3748 		el->l_recs[0] = *insert_rec;
3749 		el->l_next_free_rec = cpu_to_le16(1);
3750 		return;
3751 	}
3752 
3753 	/*
3754 	 * Appending insert.
3755 	 */
3756 	if (insert->ins_appending == APPEND_TAIL) {
3757 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3758 		rec = &el->l_recs[i];
3759 		range = le32_to_cpu(rec->e_cpos)
3760 			+ le16_to_cpu(rec->e_leaf_clusters);
3761 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3762 
3763 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3764 				le16_to_cpu(el->l_count),
3765 				"inode %lu, depth %u, count %u, next free %u, "
3766 				"rec.cpos %u, rec.clusters %u, "
3767 				"insert.cpos %u, insert.clusters %u\n",
3768 				inode->i_ino,
3769 				le16_to_cpu(el->l_tree_depth),
3770 				le16_to_cpu(el->l_count),
3771 				le16_to_cpu(el->l_next_free_rec),
3772 				le32_to_cpu(el->l_recs[i].e_cpos),
3773 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3774 				le32_to_cpu(insert_rec->e_cpos),
3775 				le16_to_cpu(insert_rec->e_leaf_clusters));
3776 		i++;
3777 		el->l_recs[i] = *insert_rec;
3778 		le16_add_cpu(&el->l_next_free_rec, 1);
3779 		return;
3780 	}
3781 
3782 rotate:
3783 	/*
3784 	 * Ok, we have to rotate.
3785 	 *
3786 	 * At this point, it is safe to assume that inserting into an
3787 	 * empty leaf and appending to a leaf have both been handled
3788 	 * above.
3789 	 *
3790 	 * This leaf needs to have space, either by the empty 1st
3791 	 * extent record, or by virtue of an l_next_rec < l_count.
3792 	 */
3793 	ocfs2_rotate_leaf(el, insert_rec);
3794 }
3795 
3796 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3797 					   handle_t *handle,
3798 					   struct ocfs2_path *path,
3799 					   struct ocfs2_extent_rec *insert_rec)
3800 {
3801 	int ret, i, next_free;
3802 	struct buffer_head *bh;
3803 	struct ocfs2_extent_list *el;
3804 	struct ocfs2_extent_rec *rec;
3805 
3806 	/*
3807 	 * Update everything except the leaf block.
3808 	 */
3809 	for (i = 0; i < path->p_tree_depth; i++) {
3810 		bh = path->p_node[i].bh;
3811 		el = path->p_node[i].el;
3812 
3813 		next_free = le16_to_cpu(el->l_next_free_rec);
3814 		if (next_free == 0) {
3815 			ocfs2_error(inode->i_sb,
3816 				    "Dinode %llu has a bad extent list",
3817 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3818 			ret = -EIO;
3819 			return;
3820 		}
3821 
3822 		rec = &el->l_recs[next_free - 1];
3823 
3824 		rec->e_int_clusters = insert_rec->e_cpos;
3825 		le32_add_cpu(&rec->e_int_clusters,
3826 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3827 		le32_add_cpu(&rec->e_int_clusters,
3828 			     -le32_to_cpu(rec->e_cpos));
3829 
3830 		ret = ocfs2_journal_dirty(handle, bh);
3831 		if (ret)
3832 			mlog_errno(ret);
3833 
3834 	}
3835 }
3836 
3837 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3838 				    struct ocfs2_extent_rec *insert_rec,
3839 				    struct ocfs2_path *right_path,
3840 				    struct ocfs2_path **ret_left_path)
3841 {
3842 	int ret, next_free;
3843 	struct ocfs2_extent_list *el;
3844 	struct ocfs2_path *left_path = NULL;
3845 
3846 	*ret_left_path = NULL;
3847 
3848 	/*
3849 	 * This shouldn't happen for non-trees. The extent rec cluster
3850 	 * count manipulation below only works for interior nodes.
3851 	 */
3852 	BUG_ON(right_path->p_tree_depth == 0);
3853 
3854 	/*
3855 	 * If our appending insert is at the leftmost edge of a leaf,
3856 	 * then we might need to update the rightmost records of the
3857 	 * neighboring path.
3858 	 */
3859 	el = path_leaf_el(right_path);
3860 	next_free = le16_to_cpu(el->l_next_free_rec);
3861 	if (next_free == 0 ||
3862 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3863 		u32 left_cpos;
3864 
3865 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3866 						    &left_cpos);
3867 		if (ret) {
3868 			mlog_errno(ret);
3869 			goto out;
3870 		}
3871 
3872 		mlog(0, "Append may need a left path update. cpos: %u, "
3873 		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3874 		     left_cpos);
3875 
3876 		/*
3877 		 * No need to worry if the append is already in the
3878 		 * leftmost leaf.
3879 		 */
3880 		if (left_cpos) {
3881 			left_path = ocfs2_new_path_from_path(right_path);
3882 			if (!left_path) {
3883 				ret = -ENOMEM;
3884 				mlog_errno(ret);
3885 				goto out;
3886 			}
3887 
3888 			ret = ocfs2_find_path(inode, left_path, left_cpos);
3889 			if (ret) {
3890 				mlog_errno(ret);
3891 				goto out;
3892 			}
3893 
3894 			/*
3895 			 * ocfs2_insert_path() will pass the left_path to the
3896 			 * journal for us.
3897 			 */
3898 		}
3899 	}
3900 
3901 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3902 	if (ret) {
3903 		mlog_errno(ret);
3904 		goto out;
3905 	}
3906 
3907 	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3908 
3909 	*ret_left_path = left_path;
3910 	ret = 0;
3911 out:
3912 	if (ret != 0)
3913 		ocfs2_free_path(left_path);
3914 
3915 	return ret;
3916 }
3917 
3918 static void ocfs2_split_record(struct inode *inode,
3919 			       struct ocfs2_path *left_path,
3920 			       struct ocfs2_path *right_path,
3921 			       struct ocfs2_extent_rec *split_rec,
3922 			       enum ocfs2_split_type split)
3923 {
3924 	int index;
3925 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
3926 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3927 	struct ocfs2_extent_rec *rec, *tmprec;
3928 
3929 	right_el = path_leaf_el(right_path);
3930 	if (left_path)
3931 		left_el = path_leaf_el(left_path);
3932 
3933 	el = right_el;
3934 	insert_el = right_el;
3935 	index = ocfs2_search_extent_list(el, cpos);
3936 	if (index != -1) {
3937 		if (index == 0 && left_path) {
3938 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3939 
3940 			/*
3941 			 * This typically means that the record
3942 			 * started in the left path but moved to the
3943 			 * right as a result of rotation. We either
3944 			 * move the existing record to the left, or we
3945 			 * do the later insert there.
3946 			 *
3947 			 * In this case, the left path should always
3948 			 * exist as the rotate code will have passed
3949 			 * it back for a post-insert update.
3950 			 */
3951 
3952 			if (split == SPLIT_LEFT) {
3953 				/*
3954 				 * It's a left split. Since we know
3955 				 * that the rotate code gave us an
3956 				 * empty extent in the left path, we
3957 				 * can just do the insert there.
3958 				 */
3959 				insert_el = left_el;
3960 			} else {
3961 				/*
3962 				 * Right split - we have to move the
3963 				 * existing record over to the left
3964 				 * leaf. The insert will be into the
3965 				 * newly created empty extent in the
3966 				 * right leaf.
3967 				 */
3968 				tmprec = &right_el->l_recs[index];
3969 				ocfs2_rotate_leaf(left_el, tmprec);
3970 				el = left_el;
3971 
3972 				memset(tmprec, 0, sizeof(*tmprec));
3973 				index = ocfs2_search_extent_list(left_el, cpos);
3974 				BUG_ON(index == -1);
3975 			}
3976 		}
3977 	} else {
3978 		BUG_ON(!left_path);
3979 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3980 		/*
3981 		 * Left path is easy - we can just allow the insert to
3982 		 * happen.
3983 		 */
3984 		el = left_el;
3985 		insert_el = left_el;
3986 		index = ocfs2_search_extent_list(el, cpos);
3987 		BUG_ON(index == -1);
3988 	}
3989 
3990 	rec = &el->l_recs[index];
3991 	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3992 	ocfs2_rotate_leaf(insert_el, split_rec);
3993 }
3994 
3995 /*
3996  * This function only does inserts on an allocation b-tree. For tree
3997  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3998  *
3999  * right_path is the path we want to do the actual insert
4000  * in. left_path should only be passed in if we need to update that
4001  * portion of the tree after an edge insert.
4002  */
4003 static int ocfs2_insert_path(struct inode *inode,
4004 			     handle_t *handle,
4005 			     struct ocfs2_path *left_path,
4006 			     struct ocfs2_path *right_path,
4007 			     struct ocfs2_extent_rec *insert_rec,
4008 			     struct ocfs2_insert_type *insert)
4009 {
4010 	int ret, subtree_index;
4011 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4012 
4013 	if (left_path) {
4014 		int credits = handle->h_buffer_credits;
4015 
4016 		/*
4017 		 * There's a chance that left_path got passed back to
4018 		 * us without being accounted for in the
4019 		 * journal. Extend our transaction here to be sure we
4020 		 * can change those blocks.
4021 		 */
4022 		credits += left_path->p_tree_depth;
4023 
4024 		ret = ocfs2_extend_trans(handle, credits);
4025 		if (ret < 0) {
4026 			mlog_errno(ret);
4027 			goto out;
4028 		}
4029 
4030 		ret = ocfs2_journal_access_path(inode, handle, left_path);
4031 		if (ret < 0) {
4032 			mlog_errno(ret);
4033 			goto out;
4034 		}
4035 	}
4036 
4037 	/*
4038 	 * Pass both paths to the journal. The majority of inserts
4039 	 * will be touching all components anyway.
4040 	 */
4041 	ret = ocfs2_journal_access_path(inode, handle, right_path);
4042 	if (ret < 0) {
4043 		mlog_errno(ret);
4044 		goto out;
4045 	}
4046 
4047 	if (insert->ins_split != SPLIT_NONE) {
4048 		/*
4049 		 * We could call ocfs2_insert_at_leaf() for some types
4050 		 * of splits, but it's easier to just let one separate
4051 		 * function sort it all out.
4052 		 */
4053 		ocfs2_split_record(inode, left_path, right_path,
4054 				   insert_rec, insert->ins_split);
4055 
4056 		/*
4057 		 * Split might have modified either leaf and we don't
4058 		 * have a guarantee that the later edge insert will
4059 		 * dirty this for us.
4060 		 */
4061 		if (left_path)
4062 			ret = ocfs2_journal_dirty(handle,
4063 						  path_leaf_bh(left_path));
4064 			if (ret)
4065 				mlog_errno(ret);
4066 	} else
4067 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
4068 				     insert, inode);
4069 
4070 	ret = ocfs2_journal_dirty(handle, leaf_bh);
4071 	if (ret)
4072 		mlog_errno(ret);
4073 
4074 	if (left_path) {
4075 		/*
4076 		 * The rotate code has indicated that we need to fix
4077 		 * up portions of the tree after the insert.
4078 		 *
4079 		 * XXX: Should we extend the transaction here?
4080 		 */
4081 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
4082 							right_path);
4083 		ocfs2_complete_edge_insert(inode, handle, left_path,
4084 					   right_path, subtree_index);
4085 	}
4086 
4087 	ret = 0;
4088 out:
4089 	return ret;
4090 }
4091 
4092 static int ocfs2_do_insert_extent(struct inode *inode,
4093 				  handle_t *handle,
4094 				  struct ocfs2_extent_tree *et,
4095 				  struct ocfs2_extent_rec *insert_rec,
4096 				  struct ocfs2_insert_type *type)
4097 {
4098 	int ret, rotate = 0;
4099 	u32 cpos;
4100 	struct ocfs2_path *right_path = NULL;
4101 	struct ocfs2_path *left_path = NULL;
4102 	struct ocfs2_extent_list *el;
4103 
4104 	el = et->et_root_el;
4105 
4106 	ret = ocfs2_et_root_journal_access(handle, inode, et,
4107 					   OCFS2_JOURNAL_ACCESS_WRITE);
4108 	if (ret) {
4109 		mlog_errno(ret);
4110 		goto out;
4111 	}
4112 
4113 	if (le16_to_cpu(el->l_tree_depth) == 0) {
4114 		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4115 		goto out_update_clusters;
4116 	}
4117 
4118 	right_path = ocfs2_new_path_from_et(et);
4119 	if (!right_path) {
4120 		ret = -ENOMEM;
4121 		mlog_errno(ret);
4122 		goto out;
4123 	}
4124 
4125 	/*
4126 	 * Determine the path to start with. Rotations need the
4127 	 * rightmost path, everything else can go directly to the
4128 	 * target leaf.
4129 	 */
4130 	cpos = le32_to_cpu(insert_rec->e_cpos);
4131 	if (type->ins_appending == APPEND_NONE &&
4132 	    type->ins_contig == CONTIG_NONE) {
4133 		rotate = 1;
4134 		cpos = UINT_MAX;
4135 	}
4136 
4137 	ret = ocfs2_find_path(inode, right_path, cpos);
4138 	if (ret) {
4139 		mlog_errno(ret);
4140 		goto out;
4141 	}
4142 
4143 	/*
4144 	 * Rotations and appends need special treatment - they modify
4145 	 * parts of the tree's above them.
4146 	 *
4147 	 * Both might pass back a path immediate to the left of the
4148 	 * one being inserted to. This will be cause
4149 	 * ocfs2_insert_path() to modify the rightmost records of
4150 	 * left_path to account for an edge insert.
4151 	 *
4152 	 * XXX: When modifying this code, keep in mind that an insert
4153 	 * can wind up skipping both of these two special cases...
4154 	 */
4155 	if (rotate) {
4156 		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4157 					      le32_to_cpu(insert_rec->e_cpos),
4158 					      right_path, &left_path);
4159 		if (ret) {
4160 			mlog_errno(ret);
4161 			goto out;
4162 		}
4163 
4164 		/*
4165 		 * ocfs2_rotate_tree_right() might have extended the
4166 		 * transaction without re-journaling our tree root.
4167 		 */
4168 		ret = ocfs2_et_root_journal_access(handle, inode, et,
4169 						   OCFS2_JOURNAL_ACCESS_WRITE);
4170 		if (ret) {
4171 			mlog_errno(ret);
4172 			goto out;
4173 		}
4174 	} else if (type->ins_appending == APPEND_TAIL
4175 		   && type->ins_contig != CONTIG_LEFT) {
4176 		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4177 					       right_path, &left_path);
4178 		if (ret) {
4179 			mlog_errno(ret);
4180 			goto out;
4181 		}
4182 	}
4183 
4184 	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4185 				insert_rec, type);
4186 	if (ret) {
4187 		mlog_errno(ret);
4188 		goto out;
4189 	}
4190 
4191 out_update_clusters:
4192 	if (type->ins_split == SPLIT_NONE)
4193 		ocfs2_et_update_clusters(inode, et,
4194 					 le16_to_cpu(insert_rec->e_leaf_clusters));
4195 
4196 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4197 	if (ret)
4198 		mlog_errno(ret);
4199 
4200 out:
4201 	ocfs2_free_path(left_path);
4202 	ocfs2_free_path(right_path);
4203 
4204 	return ret;
4205 }
4206 
4207 static enum ocfs2_contig_type
4208 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4209 			       struct ocfs2_extent_list *el, int index,
4210 			       struct ocfs2_extent_rec *split_rec)
4211 {
4212 	int status;
4213 	enum ocfs2_contig_type ret = CONTIG_NONE;
4214 	u32 left_cpos, right_cpos;
4215 	struct ocfs2_extent_rec *rec = NULL;
4216 	struct ocfs2_extent_list *new_el;
4217 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
4218 	struct buffer_head *bh;
4219 	struct ocfs2_extent_block *eb;
4220 
4221 	if (index > 0) {
4222 		rec = &el->l_recs[index - 1];
4223 	} else if (path->p_tree_depth > 0) {
4224 		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4225 						       path, &left_cpos);
4226 		if (status)
4227 			goto out;
4228 
4229 		if (left_cpos != 0) {
4230 			left_path = ocfs2_new_path_from_path(path);
4231 			if (!left_path)
4232 				goto out;
4233 
4234 			status = ocfs2_find_path(inode, left_path, left_cpos);
4235 			if (status)
4236 				goto out;
4237 
4238 			new_el = path_leaf_el(left_path);
4239 
4240 			if (le16_to_cpu(new_el->l_next_free_rec) !=
4241 			    le16_to_cpu(new_el->l_count)) {
4242 				bh = path_leaf_bh(left_path);
4243 				eb = (struct ocfs2_extent_block *)bh->b_data;
4244 				ocfs2_error(inode->i_sb,
4245 					    "Extent block #%llu has an "
4246 					    "invalid l_next_free_rec of "
4247 					    "%d.  It should have "
4248 					    "matched the l_count of %d",
4249 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4250 					    le16_to_cpu(new_el->l_next_free_rec),
4251 					    le16_to_cpu(new_el->l_count));
4252 				status = -EINVAL;
4253 				goto out;
4254 			}
4255 			rec = &new_el->l_recs[
4256 				le16_to_cpu(new_el->l_next_free_rec) - 1];
4257 		}
4258 	}
4259 
4260 	/*
4261 	 * We're careful to check for an empty extent record here -
4262 	 * the merge code will know what to do if it sees one.
4263 	 */
4264 	if (rec) {
4265 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
4266 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4267 				ret = CONTIG_RIGHT;
4268 		} else {
4269 			ret = ocfs2_extent_contig(inode, rec, split_rec);
4270 		}
4271 	}
4272 
4273 	rec = NULL;
4274 	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4275 		rec = &el->l_recs[index + 1];
4276 	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4277 		 path->p_tree_depth > 0) {
4278 		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4279 							path, &right_cpos);
4280 		if (status)
4281 			goto out;
4282 
4283 		if (right_cpos == 0)
4284 			goto out;
4285 
4286 		right_path = ocfs2_new_path_from_path(path);
4287 		if (!right_path)
4288 			goto out;
4289 
4290 		status = ocfs2_find_path(inode, right_path, right_cpos);
4291 		if (status)
4292 			goto out;
4293 
4294 		new_el = path_leaf_el(right_path);
4295 		rec = &new_el->l_recs[0];
4296 		if (ocfs2_is_empty_extent(rec)) {
4297 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4298 				bh = path_leaf_bh(right_path);
4299 				eb = (struct ocfs2_extent_block *)bh->b_data;
4300 				ocfs2_error(inode->i_sb,
4301 					    "Extent block #%llu has an "
4302 					    "invalid l_next_free_rec of %d",
4303 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4304 					    le16_to_cpu(new_el->l_next_free_rec));
4305 				status = -EINVAL;
4306 				goto out;
4307 			}
4308 			rec = &new_el->l_recs[1];
4309 		}
4310 	}
4311 
4312 	if (rec) {
4313 		enum ocfs2_contig_type contig_type;
4314 
4315 		contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4316 
4317 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4318 			ret = CONTIG_LEFTRIGHT;
4319 		else if (ret == CONTIG_NONE)
4320 			ret = contig_type;
4321 	}
4322 
4323 out:
4324 	if (left_path)
4325 		ocfs2_free_path(left_path);
4326 	if (right_path)
4327 		ocfs2_free_path(right_path);
4328 
4329 	return ret;
4330 }
4331 
4332 static void ocfs2_figure_contig_type(struct inode *inode,
4333 				     struct ocfs2_insert_type *insert,
4334 				     struct ocfs2_extent_list *el,
4335 				     struct ocfs2_extent_rec *insert_rec,
4336 				     struct ocfs2_extent_tree *et)
4337 {
4338 	int i;
4339 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4340 
4341 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4342 
4343 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4344 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4345 						  insert_rec);
4346 		if (contig_type != CONTIG_NONE) {
4347 			insert->ins_contig_index = i;
4348 			break;
4349 		}
4350 	}
4351 	insert->ins_contig = contig_type;
4352 
4353 	if (insert->ins_contig != CONTIG_NONE) {
4354 		struct ocfs2_extent_rec *rec =
4355 				&el->l_recs[insert->ins_contig_index];
4356 		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4357 				   le16_to_cpu(insert_rec->e_leaf_clusters);
4358 
4359 		/*
4360 		 * Caller might want us to limit the size of extents, don't
4361 		 * calculate contiguousness if we might exceed that limit.
4362 		 */
4363 		if (et->et_max_leaf_clusters &&
4364 		    (len > et->et_max_leaf_clusters))
4365 			insert->ins_contig = CONTIG_NONE;
4366 	}
4367 }
4368 
4369 /*
4370  * This should only be called against the righmost leaf extent list.
4371  *
4372  * ocfs2_figure_appending_type() will figure out whether we'll have to
4373  * insert at the tail of the rightmost leaf.
4374  *
4375  * This should also work against the root extent list for tree's with 0
4376  * depth. If we consider the root extent list to be the rightmost leaf node
4377  * then the logic here makes sense.
4378  */
4379 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4380 					struct ocfs2_extent_list *el,
4381 					struct ocfs2_extent_rec *insert_rec)
4382 {
4383 	int i;
4384 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4385 	struct ocfs2_extent_rec *rec;
4386 
4387 	insert->ins_appending = APPEND_NONE;
4388 
4389 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4390 
4391 	if (!el->l_next_free_rec)
4392 		goto set_tail_append;
4393 
4394 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4395 		/* Were all records empty? */
4396 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4397 			goto set_tail_append;
4398 	}
4399 
4400 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4401 	rec = &el->l_recs[i];
4402 
4403 	if (cpos >=
4404 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4405 		goto set_tail_append;
4406 
4407 	return;
4408 
4409 set_tail_append:
4410 	insert->ins_appending = APPEND_TAIL;
4411 }
4412 
4413 /*
4414  * Helper function called at the begining of an insert.
4415  *
4416  * This computes a few things that are commonly used in the process of
4417  * inserting into the btree:
4418  *   - Whether the new extent is contiguous with an existing one.
4419  *   - The current tree depth.
4420  *   - Whether the insert is an appending one.
4421  *   - The total # of free records in the tree.
4422  *
4423  * All of the information is stored on the ocfs2_insert_type
4424  * structure.
4425  */
4426 static int ocfs2_figure_insert_type(struct inode *inode,
4427 				    struct ocfs2_extent_tree *et,
4428 				    struct buffer_head **last_eb_bh,
4429 				    struct ocfs2_extent_rec *insert_rec,
4430 				    int *free_records,
4431 				    struct ocfs2_insert_type *insert)
4432 {
4433 	int ret;
4434 	struct ocfs2_extent_block *eb;
4435 	struct ocfs2_extent_list *el;
4436 	struct ocfs2_path *path = NULL;
4437 	struct buffer_head *bh = NULL;
4438 
4439 	insert->ins_split = SPLIT_NONE;
4440 
4441 	el = et->et_root_el;
4442 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4443 
4444 	if (el->l_tree_depth) {
4445 		/*
4446 		 * If we have tree depth, we read in the
4447 		 * rightmost extent block ahead of time as
4448 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4449 		 * may want it later.
4450 		 */
4451 		ret = ocfs2_read_extent_block(inode,
4452 					      ocfs2_et_get_last_eb_blk(et),
4453 					      &bh);
4454 		if (ret) {
4455 			mlog_exit(ret);
4456 			goto out;
4457 		}
4458 		eb = (struct ocfs2_extent_block *) bh->b_data;
4459 		el = &eb->h_list;
4460 	}
4461 
4462 	/*
4463 	 * Unless we have a contiguous insert, we'll need to know if
4464 	 * there is room left in our allocation tree for another
4465 	 * extent record.
4466 	 *
4467 	 * XXX: This test is simplistic, we can search for empty
4468 	 * extent records too.
4469 	 */
4470 	*free_records = le16_to_cpu(el->l_count) -
4471 		le16_to_cpu(el->l_next_free_rec);
4472 
4473 	if (!insert->ins_tree_depth) {
4474 		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4475 		ocfs2_figure_appending_type(insert, el, insert_rec);
4476 		return 0;
4477 	}
4478 
4479 	path = ocfs2_new_path_from_et(et);
4480 	if (!path) {
4481 		ret = -ENOMEM;
4482 		mlog_errno(ret);
4483 		goto out;
4484 	}
4485 
4486 	/*
4487 	 * In the case that we're inserting past what the tree
4488 	 * currently accounts for, ocfs2_find_path() will return for
4489 	 * us the rightmost tree path. This is accounted for below in
4490 	 * the appending code.
4491 	 */
4492 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4493 	if (ret) {
4494 		mlog_errno(ret);
4495 		goto out;
4496 	}
4497 
4498 	el = path_leaf_el(path);
4499 
4500 	/*
4501 	 * Now that we have the path, there's two things we want to determine:
4502 	 * 1) Contiguousness (also set contig_index if this is so)
4503 	 *
4504 	 * 2) Are we doing an append? We can trivially break this up
4505          *     into two types of appends: simple record append, or a
4506          *     rotate inside the tail leaf.
4507 	 */
4508 	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4509 
4510 	/*
4511 	 * The insert code isn't quite ready to deal with all cases of
4512 	 * left contiguousness. Specifically, if it's an insert into
4513 	 * the 1st record in a leaf, it will require the adjustment of
4514 	 * cluster count on the last record of the path directly to it's
4515 	 * left. For now, just catch that case and fool the layers
4516 	 * above us. This works just fine for tree_depth == 0, which
4517 	 * is why we allow that above.
4518 	 */
4519 	if (insert->ins_contig == CONTIG_LEFT &&
4520 	    insert->ins_contig_index == 0)
4521 		insert->ins_contig = CONTIG_NONE;
4522 
4523 	/*
4524 	 * Ok, so we can simply compare against last_eb to figure out
4525 	 * whether the path doesn't exist. This will only happen in
4526 	 * the case that we're doing a tail append, so maybe we can
4527 	 * take advantage of that information somehow.
4528 	 */
4529 	if (ocfs2_et_get_last_eb_blk(et) ==
4530 	    path_leaf_bh(path)->b_blocknr) {
4531 		/*
4532 		 * Ok, ocfs2_find_path() returned us the rightmost
4533 		 * tree path. This might be an appending insert. There are
4534 		 * two cases:
4535 		 *    1) We're doing a true append at the tail:
4536 		 *	-This might even be off the end of the leaf
4537 		 *    2) We're "appending" by rotating in the tail
4538 		 */
4539 		ocfs2_figure_appending_type(insert, el, insert_rec);
4540 	}
4541 
4542 out:
4543 	ocfs2_free_path(path);
4544 
4545 	if (ret == 0)
4546 		*last_eb_bh = bh;
4547 	else
4548 		brelse(bh);
4549 	return ret;
4550 }
4551 
4552 /*
4553  * Insert an extent into an inode btree.
4554  *
4555  * The caller needs to update fe->i_clusters
4556  */
4557 int ocfs2_insert_extent(struct ocfs2_super *osb,
4558 			handle_t *handle,
4559 			struct inode *inode,
4560 			struct ocfs2_extent_tree *et,
4561 			u32 cpos,
4562 			u64 start_blk,
4563 			u32 new_clusters,
4564 			u8 flags,
4565 			struct ocfs2_alloc_context *meta_ac)
4566 {
4567 	int status;
4568 	int uninitialized_var(free_records);
4569 	struct buffer_head *last_eb_bh = NULL;
4570 	struct ocfs2_insert_type insert = {0, };
4571 	struct ocfs2_extent_rec rec;
4572 
4573 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4574 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4575 
4576 	memset(&rec, 0, sizeof(rec));
4577 	rec.e_cpos = cpu_to_le32(cpos);
4578 	rec.e_blkno = cpu_to_le64(start_blk);
4579 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4580 	rec.e_flags = flags;
4581 	status = ocfs2_et_insert_check(inode, et, &rec);
4582 	if (status) {
4583 		mlog_errno(status);
4584 		goto bail;
4585 	}
4586 
4587 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4588 					  &free_records, &insert);
4589 	if (status < 0) {
4590 		mlog_errno(status);
4591 		goto bail;
4592 	}
4593 
4594 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4595 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4596 	     "Insert.tree_depth: %d\n",
4597 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4598 	     free_records, insert.ins_tree_depth);
4599 
4600 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4601 		status = ocfs2_grow_tree(inode, handle, et,
4602 					 &insert.ins_tree_depth, &last_eb_bh,
4603 					 meta_ac);
4604 		if (status) {
4605 			mlog_errno(status);
4606 			goto bail;
4607 		}
4608 	}
4609 
4610 	/* Finally, we can add clusters. This might rotate the tree for us. */
4611 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4612 	if (status < 0)
4613 		mlog_errno(status);
4614 	else if (et->et_ops == &ocfs2_dinode_et_ops)
4615 		ocfs2_extent_map_insert_rec(inode, &rec);
4616 
4617 bail:
4618 	brelse(last_eb_bh);
4619 
4620 	mlog_exit(status);
4621 	return status;
4622 }
4623 
4624 /*
4625  * Allcate and add clusters into the extent b-tree.
4626  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4627  * The extent b-tree's root is specified by et, and
4628  * it is not limited to the file storage. Any extent tree can use this
4629  * function if it implements the proper ocfs2_extent_tree.
4630  */
4631 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4632 				struct inode *inode,
4633 				u32 *logical_offset,
4634 				u32 clusters_to_add,
4635 				int mark_unwritten,
4636 				struct ocfs2_extent_tree *et,
4637 				handle_t *handle,
4638 				struct ocfs2_alloc_context *data_ac,
4639 				struct ocfs2_alloc_context *meta_ac,
4640 				enum ocfs2_alloc_restarted *reason_ret)
4641 {
4642 	int status = 0;
4643 	int free_extents;
4644 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
4645 	u32 bit_off, num_bits;
4646 	u64 block;
4647 	u8 flags = 0;
4648 
4649 	BUG_ON(!clusters_to_add);
4650 
4651 	if (mark_unwritten)
4652 		flags = OCFS2_EXT_UNWRITTEN;
4653 
4654 	free_extents = ocfs2_num_free_extents(osb, inode, et);
4655 	if (free_extents < 0) {
4656 		status = free_extents;
4657 		mlog_errno(status);
4658 		goto leave;
4659 	}
4660 
4661 	/* there are two cases which could cause us to EAGAIN in the
4662 	 * we-need-more-metadata case:
4663 	 * 1) we haven't reserved *any*
4664 	 * 2) we are so fragmented, we've needed to add metadata too
4665 	 *    many times. */
4666 	if (!free_extents && !meta_ac) {
4667 		mlog(0, "we haven't reserved any metadata!\n");
4668 		status = -EAGAIN;
4669 		reason = RESTART_META;
4670 		goto leave;
4671 	} else if ((!free_extents)
4672 		   && (ocfs2_alloc_context_bits_left(meta_ac)
4673 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
4674 		mlog(0, "filesystem is really fragmented...\n");
4675 		status = -EAGAIN;
4676 		reason = RESTART_META;
4677 		goto leave;
4678 	}
4679 
4680 	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4681 					clusters_to_add, &bit_off, &num_bits);
4682 	if (status < 0) {
4683 		if (status != -ENOSPC)
4684 			mlog_errno(status);
4685 		goto leave;
4686 	}
4687 
4688 	BUG_ON(num_bits > clusters_to_add);
4689 
4690 	/* reserve our write early -- insert_extent may update the tree root */
4691 	status = ocfs2_et_root_journal_access(handle, inode, et,
4692 					      OCFS2_JOURNAL_ACCESS_WRITE);
4693 	if (status < 0) {
4694 		mlog_errno(status);
4695 		goto leave;
4696 	}
4697 
4698 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4699 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4700 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4701 	status = ocfs2_insert_extent(osb, handle, inode, et,
4702 				     *logical_offset, block,
4703 				     num_bits, flags, meta_ac);
4704 	if (status < 0) {
4705 		mlog_errno(status);
4706 		goto leave;
4707 	}
4708 
4709 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
4710 	if (status < 0) {
4711 		mlog_errno(status);
4712 		goto leave;
4713 	}
4714 
4715 	clusters_to_add -= num_bits;
4716 	*logical_offset += num_bits;
4717 
4718 	if (clusters_to_add) {
4719 		mlog(0, "need to alloc once more, wanted = %u\n",
4720 		     clusters_to_add);
4721 		status = -EAGAIN;
4722 		reason = RESTART_TRANS;
4723 	}
4724 
4725 leave:
4726 	mlog_exit(status);
4727 	if (reason_ret)
4728 		*reason_ret = reason;
4729 	return status;
4730 }
4731 
4732 static void ocfs2_make_right_split_rec(struct super_block *sb,
4733 				       struct ocfs2_extent_rec *split_rec,
4734 				       u32 cpos,
4735 				       struct ocfs2_extent_rec *rec)
4736 {
4737 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4738 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4739 
4740 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4741 
4742 	split_rec->e_cpos = cpu_to_le32(cpos);
4743 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4744 
4745 	split_rec->e_blkno = rec->e_blkno;
4746 	le64_add_cpu(&split_rec->e_blkno,
4747 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4748 
4749 	split_rec->e_flags = rec->e_flags;
4750 }
4751 
4752 static int ocfs2_split_and_insert(struct inode *inode,
4753 				  handle_t *handle,
4754 				  struct ocfs2_path *path,
4755 				  struct ocfs2_extent_tree *et,
4756 				  struct buffer_head **last_eb_bh,
4757 				  int split_index,
4758 				  struct ocfs2_extent_rec *orig_split_rec,
4759 				  struct ocfs2_alloc_context *meta_ac)
4760 {
4761 	int ret = 0, depth;
4762 	unsigned int insert_range, rec_range, do_leftright = 0;
4763 	struct ocfs2_extent_rec tmprec;
4764 	struct ocfs2_extent_list *rightmost_el;
4765 	struct ocfs2_extent_rec rec;
4766 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
4767 	struct ocfs2_insert_type insert;
4768 	struct ocfs2_extent_block *eb;
4769 
4770 leftright:
4771 	/*
4772 	 * Store a copy of the record on the stack - it might move
4773 	 * around as the tree is manipulated below.
4774 	 */
4775 	rec = path_leaf_el(path)->l_recs[split_index];
4776 
4777 	rightmost_el = et->et_root_el;
4778 
4779 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
4780 	if (depth) {
4781 		BUG_ON(!(*last_eb_bh));
4782 		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4783 		rightmost_el = &eb->h_list;
4784 	}
4785 
4786 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4787 	    le16_to_cpu(rightmost_el->l_count)) {
4788 		ret = ocfs2_grow_tree(inode, handle, et,
4789 				      &depth, last_eb_bh, meta_ac);
4790 		if (ret) {
4791 			mlog_errno(ret);
4792 			goto out;
4793 		}
4794 	}
4795 
4796 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4797 	insert.ins_appending = APPEND_NONE;
4798 	insert.ins_contig = CONTIG_NONE;
4799 	insert.ins_tree_depth = depth;
4800 
4801 	insert_range = le32_to_cpu(split_rec.e_cpos) +
4802 		le16_to_cpu(split_rec.e_leaf_clusters);
4803 	rec_range = le32_to_cpu(rec.e_cpos) +
4804 		le16_to_cpu(rec.e_leaf_clusters);
4805 
4806 	if (split_rec.e_cpos == rec.e_cpos) {
4807 		insert.ins_split = SPLIT_LEFT;
4808 	} else if (insert_range == rec_range) {
4809 		insert.ins_split = SPLIT_RIGHT;
4810 	} else {
4811 		/*
4812 		 * Left/right split. We fake this as a right split
4813 		 * first and then make a second pass as a left split.
4814 		 */
4815 		insert.ins_split = SPLIT_RIGHT;
4816 
4817 		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4818 					   &rec);
4819 
4820 		split_rec = tmprec;
4821 
4822 		BUG_ON(do_leftright);
4823 		do_leftright = 1;
4824 	}
4825 
4826 	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4827 	if (ret) {
4828 		mlog_errno(ret);
4829 		goto out;
4830 	}
4831 
4832 	if (do_leftright == 1) {
4833 		u32 cpos;
4834 		struct ocfs2_extent_list *el;
4835 
4836 		do_leftright++;
4837 		split_rec = *orig_split_rec;
4838 
4839 		ocfs2_reinit_path(path, 1);
4840 
4841 		cpos = le32_to_cpu(split_rec.e_cpos);
4842 		ret = ocfs2_find_path(inode, path, cpos);
4843 		if (ret) {
4844 			mlog_errno(ret);
4845 			goto out;
4846 		}
4847 
4848 		el = path_leaf_el(path);
4849 		split_index = ocfs2_search_extent_list(el, cpos);
4850 		goto leftright;
4851 	}
4852 out:
4853 
4854 	return ret;
4855 }
4856 
4857 static int ocfs2_replace_extent_rec(struct inode *inode,
4858 				    handle_t *handle,
4859 				    struct ocfs2_path *path,
4860 				    struct ocfs2_extent_list *el,
4861 				    int split_index,
4862 				    struct ocfs2_extent_rec *split_rec)
4863 {
4864 	int ret;
4865 
4866 	ret = ocfs2_path_bh_journal_access(handle, inode, path,
4867 					   path_num_items(path) - 1);
4868 	if (ret) {
4869 		mlog_errno(ret);
4870 		goto out;
4871 	}
4872 
4873 	el->l_recs[split_index] = *split_rec;
4874 
4875 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
4876 out:
4877 	return ret;
4878 }
4879 
4880 /*
4881  * Mark part or all of the extent record at split_index in the leaf
4882  * pointed to by path as written. This removes the unwritten
4883  * extent flag.
4884  *
4885  * Care is taken to handle contiguousness so as to not grow the tree.
4886  *
4887  * meta_ac is not strictly necessary - we only truly need it if growth
4888  * of the tree is required. All other cases will degrade into a less
4889  * optimal tree layout.
4890  *
4891  * last_eb_bh should be the rightmost leaf block for any extent
4892  * btree. Since a split may grow the tree or a merge might shrink it,
4893  * the caller cannot trust the contents of that buffer after this call.
4894  *
4895  * This code is optimized for readability - several passes might be
4896  * made over certain portions of the tree. All of those blocks will
4897  * have been brought into cache (and pinned via the journal), so the
4898  * extra overhead is not expressed in terms of disk reads.
4899  */
4900 static int __ocfs2_mark_extent_written(struct inode *inode,
4901 				       struct ocfs2_extent_tree *et,
4902 				       handle_t *handle,
4903 				       struct ocfs2_path *path,
4904 				       int split_index,
4905 				       struct ocfs2_extent_rec *split_rec,
4906 				       struct ocfs2_alloc_context *meta_ac,
4907 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
4908 {
4909 	int ret = 0;
4910 	struct ocfs2_extent_list *el = path_leaf_el(path);
4911 	struct buffer_head *last_eb_bh = NULL;
4912 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4913 	struct ocfs2_merge_ctxt ctxt;
4914 	struct ocfs2_extent_list *rightmost_el;
4915 
4916 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4917 		ret = -EIO;
4918 		mlog_errno(ret);
4919 		goto out;
4920 	}
4921 
4922 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4923 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4924 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4925 		ret = -EIO;
4926 		mlog_errno(ret);
4927 		goto out;
4928 	}
4929 
4930 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4931 							    split_index,
4932 							    split_rec);
4933 
4934 	/*
4935 	 * The core merge / split code wants to know how much room is
4936 	 * left in this inodes allocation tree, so we pass the
4937 	 * rightmost extent list.
4938 	 */
4939 	if (path->p_tree_depth) {
4940 		struct ocfs2_extent_block *eb;
4941 
4942 		ret = ocfs2_read_extent_block(inode,
4943 					      ocfs2_et_get_last_eb_blk(et),
4944 					      &last_eb_bh);
4945 		if (ret) {
4946 			mlog_exit(ret);
4947 			goto out;
4948 		}
4949 
4950 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4951 		rightmost_el = &eb->h_list;
4952 	} else
4953 		rightmost_el = path_root_el(path);
4954 
4955 	if (rec->e_cpos == split_rec->e_cpos &&
4956 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4957 		ctxt.c_split_covers_rec = 1;
4958 	else
4959 		ctxt.c_split_covers_rec = 0;
4960 
4961 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4962 
4963 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4964 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4965 	     ctxt.c_split_covers_rec);
4966 
4967 	if (ctxt.c_contig_type == CONTIG_NONE) {
4968 		if (ctxt.c_split_covers_rec)
4969 			ret = ocfs2_replace_extent_rec(inode, handle,
4970 						       path, el,
4971 						       split_index, split_rec);
4972 		else
4973 			ret = ocfs2_split_and_insert(inode, handle, path, et,
4974 						     &last_eb_bh, split_index,
4975 						     split_rec, meta_ac);
4976 		if (ret)
4977 			mlog_errno(ret);
4978 	} else {
4979 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
4980 						split_index, split_rec,
4981 						dealloc, &ctxt, et);
4982 		if (ret)
4983 			mlog_errno(ret);
4984 	}
4985 
4986 out:
4987 	brelse(last_eb_bh);
4988 	return ret;
4989 }
4990 
4991 /*
4992  * Mark the already-existing extent at cpos as written for len clusters.
4993  *
4994  * If the existing extent is larger than the request, initiate a
4995  * split. An attempt will be made at merging with adjacent extents.
4996  *
4997  * The caller is responsible for passing down meta_ac if we'll need it.
4998  */
4999 int ocfs2_mark_extent_written(struct inode *inode,
5000 			      struct ocfs2_extent_tree *et,
5001 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
5002 			      struct ocfs2_alloc_context *meta_ac,
5003 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
5004 {
5005 	int ret, index;
5006 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
5007 	struct ocfs2_extent_rec split_rec;
5008 	struct ocfs2_path *left_path = NULL;
5009 	struct ocfs2_extent_list *el;
5010 
5011 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5012 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5013 
5014 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5015 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5016 			    "that are being written to, but the feature bit "
5017 			    "is not set in the super block.",
5018 			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
5019 		ret = -EROFS;
5020 		goto out;
5021 	}
5022 
5023 	/*
5024 	 * XXX: This should be fixed up so that we just re-insert the
5025 	 * next extent records.
5026 	 *
5027 	 * XXX: This is a hack on the extent tree, maybe it should be
5028 	 * an op?
5029 	 */
5030 	if (et->et_ops == &ocfs2_dinode_et_ops)
5031 		ocfs2_extent_map_trunc(inode, 0);
5032 
5033 	left_path = ocfs2_new_path_from_et(et);
5034 	if (!left_path) {
5035 		ret = -ENOMEM;
5036 		mlog_errno(ret);
5037 		goto out;
5038 	}
5039 
5040 	ret = ocfs2_find_path(inode, left_path, cpos);
5041 	if (ret) {
5042 		mlog_errno(ret);
5043 		goto out;
5044 	}
5045 	el = path_leaf_el(left_path);
5046 
5047 	index = ocfs2_search_extent_list(el, cpos);
5048 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5049 		ocfs2_error(inode->i_sb,
5050 			    "Inode %llu has an extent at cpos %u which can no "
5051 			    "longer be found.\n",
5052 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5053 		ret = -EROFS;
5054 		goto out;
5055 	}
5056 
5057 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5058 	split_rec.e_cpos = cpu_to_le32(cpos);
5059 	split_rec.e_leaf_clusters = cpu_to_le16(len);
5060 	split_rec.e_blkno = cpu_to_le64(start_blkno);
5061 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
5062 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
5063 
5064 	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
5065 					  index, &split_rec, meta_ac,
5066 					  dealloc);
5067 	if (ret)
5068 		mlog_errno(ret);
5069 
5070 out:
5071 	ocfs2_free_path(left_path);
5072 	return ret;
5073 }
5074 
5075 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
5076 			    handle_t *handle, struct ocfs2_path *path,
5077 			    int index, u32 new_range,
5078 			    struct ocfs2_alloc_context *meta_ac)
5079 {
5080 	int ret, depth, credits = handle->h_buffer_credits;
5081 	struct buffer_head *last_eb_bh = NULL;
5082 	struct ocfs2_extent_block *eb;
5083 	struct ocfs2_extent_list *rightmost_el, *el;
5084 	struct ocfs2_extent_rec split_rec;
5085 	struct ocfs2_extent_rec *rec;
5086 	struct ocfs2_insert_type insert;
5087 
5088 	/*
5089 	 * Setup the record to split before we grow the tree.
5090 	 */
5091 	el = path_leaf_el(path);
5092 	rec = &el->l_recs[index];
5093 	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
5094 
5095 	depth = path->p_tree_depth;
5096 	if (depth > 0) {
5097 		ret = ocfs2_read_extent_block(inode,
5098 					      ocfs2_et_get_last_eb_blk(et),
5099 					      &last_eb_bh);
5100 		if (ret < 0) {
5101 			mlog_errno(ret);
5102 			goto out;
5103 		}
5104 
5105 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5106 		rightmost_el = &eb->h_list;
5107 	} else
5108 		rightmost_el = path_leaf_el(path);
5109 
5110 	credits += path->p_tree_depth +
5111 		   ocfs2_extend_meta_needed(et->et_root_el);
5112 	ret = ocfs2_extend_trans(handle, credits);
5113 	if (ret) {
5114 		mlog_errno(ret);
5115 		goto out;
5116 	}
5117 
5118 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5119 	    le16_to_cpu(rightmost_el->l_count)) {
5120 		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
5121 				      meta_ac);
5122 		if (ret) {
5123 			mlog_errno(ret);
5124 			goto out;
5125 		}
5126 	}
5127 
5128 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5129 	insert.ins_appending = APPEND_NONE;
5130 	insert.ins_contig = CONTIG_NONE;
5131 	insert.ins_split = SPLIT_RIGHT;
5132 	insert.ins_tree_depth = depth;
5133 
5134 	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5135 	if (ret)
5136 		mlog_errno(ret);
5137 
5138 out:
5139 	brelse(last_eb_bh);
5140 	return ret;
5141 }
5142 
5143 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5144 			      struct ocfs2_path *path, int index,
5145 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
5146 			      u32 cpos, u32 len,
5147 			      struct ocfs2_extent_tree *et)
5148 {
5149 	int ret;
5150 	u32 left_cpos, rec_range, trunc_range;
5151 	int wants_rotate = 0, is_rightmost_tree_rec = 0;
5152 	struct super_block *sb = inode->i_sb;
5153 	struct ocfs2_path *left_path = NULL;
5154 	struct ocfs2_extent_list *el = path_leaf_el(path);
5155 	struct ocfs2_extent_rec *rec;
5156 	struct ocfs2_extent_block *eb;
5157 
5158 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5159 		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5160 		if (ret) {
5161 			mlog_errno(ret);
5162 			goto out;
5163 		}
5164 
5165 		index--;
5166 	}
5167 
5168 	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5169 	    path->p_tree_depth) {
5170 		/*
5171 		 * Check whether this is the rightmost tree record. If
5172 		 * we remove all of this record or part of its right
5173 		 * edge then an update of the record lengths above it
5174 		 * will be required.
5175 		 */
5176 		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5177 		if (eb->h_next_leaf_blk == 0)
5178 			is_rightmost_tree_rec = 1;
5179 	}
5180 
5181 	rec = &el->l_recs[index];
5182 	if (index == 0 && path->p_tree_depth &&
5183 	    le32_to_cpu(rec->e_cpos) == cpos) {
5184 		/*
5185 		 * Changing the leftmost offset (via partial or whole
5186 		 * record truncate) of an interior (or rightmost) path
5187 		 * means we have to update the subtree that is formed
5188 		 * by this leaf and the one to it's left.
5189 		 *
5190 		 * There are two cases we can skip:
5191 		 *   1) Path is the leftmost one in our inode tree.
5192 		 *   2) The leaf is rightmost and will be empty after
5193 		 *      we remove the extent record - the rotate code
5194 		 *      knows how to update the newly formed edge.
5195 		 */
5196 
5197 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5198 						    &left_cpos);
5199 		if (ret) {
5200 			mlog_errno(ret);
5201 			goto out;
5202 		}
5203 
5204 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5205 			left_path = ocfs2_new_path_from_path(path);
5206 			if (!left_path) {
5207 				ret = -ENOMEM;
5208 				mlog_errno(ret);
5209 				goto out;
5210 			}
5211 
5212 			ret = ocfs2_find_path(inode, left_path, left_cpos);
5213 			if (ret) {
5214 				mlog_errno(ret);
5215 				goto out;
5216 			}
5217 		}
5218 	}
5219 
5220 	ret = ocfs2_extend_rotate_transaction(handle, 0,
5221 					      handle->h_buffer_credits,
5222 					      path);
5223 	if (ret) {
5224 		mlog_errno(ret);
5225 		goto out;
5226 	}
5227 
5228 	ret = ocfs2_journal_access_path(inode, handle, path);
5229 	if (ret) {
5230 		mlog_errno(ret);
5231 		goto out;
5232 	}
5233 
5234 	ret = ocfs2_journal_access_path(inode, handle, left_path);
5235 	if (ret) {
5236 		mlog_errno(ret);
5237 		goto out;
5238 	}
5239 
5240 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5241 	trunc_range = cpos + len;
5242 
5243 	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5244 		int next_free;
5245 
5246 		memset(rec, 0, sizeof(*rec));
5247 		ocfs2_cleanup_merge(el, index);
5248 		wants_rotate = 1;
5249 
5250 		next_free = le16_to_cpu(el->l_next_free_rec);
5251 		if (is_rightmost_tree_rec && next_free > 1) {
5252 			/*
5253 			 * We skip the edge update if this path will
5254 			 * be deleted by the rotate code.
5255 			 */
5256 			rec = &el->l_recs[next_free - 1];
5257 			ocfs2_adjust_rightmost_records(inode, handle, path,
5258 						       rec);
5259 		}
5260 	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
5261 		/* Remove leftmost portion of the record. */
5262 		le32_add_cpu(&rec->e_cpos, len);
5263 		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5264 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5265 	} else if (rec_range == trunc_range) {
5266 		/* Remove rightmost portion of the record */
5267 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5268 		if (is_rightmost_tree_rec)
5269 			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5270 	} else {
5271 		/* Caller should have trapped this. */
5272 		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5273 		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5274 		     le32_to_cpu(rec->e_cpos),
5275 		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5276 		BUG();
5277 	}
5278 
5279 	if (left_path) {
5280 		int subtree_index;
5281 
5282 		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5283 		ocfs2_complete_edge_insert(inode, handle, left_path, path,
5284 					   subtree_index);
5285 	}
5286 
5287 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5288 
5289 	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5290 	if (ret) {
5291 		mlog_errno(ret);
5292 		goto out;
5293 	}
5294 
5295 out:
5296 	ocfs2_free_path(left_path);
5297 	return ret;
5298 }
5299 
5300 int ocfs2_remove_extent(struct inode *inode,
5301 			struct ocfs2_extent_tree *et,
5302 			u32 cpos, u32 len, handle_t *handle,
5303 			struct ocfs2_alloc_context *meta_ac,
5304 			struct ocfs2_cached_dealloc_ctxt *dealloc)
5305 {
5306 	int ret, index;
5307 	u32 rec_range, trunc_range;
5308 	struct ocfs2_extent_rec *rec;
5309 	struct ocfs2_extent_list *el;
5310 	struct ocfs2_path *path = NULL;
5311 
5312 	ocfs2_extent_map_trunc(inode, 0);
5313 
5314 	path = ocfs2_new_path_from_et(et);
5315 	if (!path) {
5316 		ret = -ENOMEM;
5317 		mlog_errno(ret);
5318 		goto out;
5319 	}
5320 
5321 	ret = ocfs2_find_path(inode, path, cpos);
5322 	if (ret) {
5323 		mlog_errno(ret);
5324 		goto out;
5325 	}
5326 
5327 	el = path_leaf_el(path);
5328 	index = ocfs2_search_extent_list(el, cpos);
5329 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5330 		ocfs2_error(inode->i_sb,
5331 			    "Inode %llu has an extent at cpos %u which can no "
5332 			    "longer be found.\n",
5333 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5334 		ret = -EROFS;
5335 		goto out;
5336 	}
5337 
5338 	/*
5339 	 * We have 3 cases of extent removal:
5340 	 *   1) Range covers the entire extent rec
5341 	 *   2) Range begins or ends on one edge of the extent rec
5342 	 *   3) Range is in the middle of the extent rec (no shared edges)
5343 	 *
5344 	 * For case 1 we remove the extent rec and left rotate to
5345 	 * fill the hole.
5346 	 *
5347 	 * For case 2 we just shrink the existing extent rec, with a
5348 	 * tree update if the shrinking edge is also the edge of an
5349 	 * extent block.
5350 	 *
5351 	 * For case 3 we do a right split to turn the extent rec into
5352 	 * something case 2 can handle.
5353 	 */
5354 	rec = &el->l_recs[index];
5355 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5356 	trunc_range = cpos + len;
5357 
5358 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5359 
5360 	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5361 	     "(cpos %u, len %u)\n",
5362 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5363 	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5364 
5365 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5366 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5367 					 cpos, len, et);
5368 		if (ret) {
5369 			mlog_errno(ret);
5370 			goto out;
5371 		}
5372 	} else {
5373 		ret = ocfs2_split_tree(inode, et, handle, path, index,
5374 				       trunc_range, meta_ac);
5375 		if (ret) {
5376 			mlog_errno(ret);
5377 			goto out;
5378 		}
5379 
5380 		/*
5381 		 * The split could have manipulated the tree enough to
5382 		 * move the record location, so we have to look for it again.
5383 		 */
5384 		ocfs2_reinit_path(path, 1);
5385 
5386 		ret = ocfs2_find_path(inode, path, cpos);
5387 		if (ret) {
5388 			mlog_errno(ret);
5389 			goto out;
5390 		}
5391 
5392 		el = path_leaf_el(path);
5393 		index = ocfs2_search_extent_list(el, cpos);
5394 		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5395 			ocfs2_error(inode->i_sb,
5396 				    "Inode %llu: split at cpos %u lost record.",
5397 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5398 				    cpos);
5399 			ret = -EROFS;
5400 			goto out;
5401 		}
5402 
5403 		/*
5404 		 * Double check our values here. If anything is fishy,
5405 		 * it's easier to catch it at the top level.
5406 		 */
5407 		rec = &el->l_recs[index];
5408 		rec_range = le32_to_cpu(rec->e_cpos) +
5409 			ocfs2_rec_clusters(el, rec);
5410 		if (rec_range != trunc_range) {
5411 			ocfs2_error(inode->i_sb,
5412 				    "Inode %llu: error after split at cpos %u"
5413 				    "trunc len %u, existing record is (%u,%u)",
5414 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5415 				    cpos, len, le32_to_cpu(rec->e_cpos),
5416 				    ocfs2_rec_clusters(el, rec));
5417 			ret = -EROFS;
5418 			goto out;
5419 		}
5420 
5421 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5422 					 cpos, len, et);
5423 		if (ret) {
5424 			mlog_errno(ret);
5425 			goto out;
5426 		}
5427 	}
5428 
5429 out:
5430 	ocfs2_free_path(path);
5431 	return ret;
5432 }
5433 
5434 int ocfs2_remove_btree_range(struct inode *inode,
5435 			     struct ocfs2_extent_tree *et,
5436 			     u32 cpos, u32 phys_cpos, u32 len,
5437 			     struct ocfs2_cached_dealloc_ctxt *dealloc)
5438 {
5439 	int ret;
5440 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5441 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5442 	struct inode *tl_inode = osb->osb_tl_inode;
5443 	handle_t *handle;
5444 	struct ocfs2_alloc_context *meta_ac = NULL;
5445 
5446 	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5447 	if (ret) {
5448 		mlog_errno(ret);
5449 		return ret;
5450 	}
5451 
5452 	mutex_lock(&tl_inode->i_mutex);
5453 
5454 	if (ocfs2_truncate_log_needs_flush(osb)) {
5455 		ret = __ocfs2_flush_truncate_log(osb);
5456 		if (ret < 0) {
5457 			mlog_errno(ret);
5458 			goto out;
5459 		}
5460 	}
5461 
5462 	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5463 	if (IS_ERR(handle)) {
5464 		ret = PTR_ERR(handle);
5465 		mlog_errno(ret);
5466 		goto out;
5467 	}
5468 
5469 	ret = ocfs2_et_root_journal_access(handle, inode, et,
5470 					   OCFS2_JOURNAL_ACCESS_WRITE);
5471 	if (ret) {
5472 		mlog_errno(ret);
5473 		goto out;
5474 	}
5475 
5476 	vfs_dq_free_space_nodirty(inode,
5477 				  ocfs2_clusters_to_bytes(inode->i_sb, len));
5478 
5479 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5480 				  dealloc);
5481 	if (ret) {
5482 		mlog_errno(ret);
5483 		goto out_commit;
5484 	}
5485 
5486 	ocfs2_et_update_clusters(inode, et, -len);
5487 
5488 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5489 	if (ret) {
5490 		mlog_errno(ret);
5491 		goto out_commit;
5492 	}
5493 
5494 	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5495 	if (ret)
5496 		mlog_errno(ret);
5497 
5498 out_commit:
5499 	ocfs2_commit_trans(osb, handle);
5500 out:
5501 	mutex_unlock(&tl_inode->i_mutex);
5502 
5503 	if (meta_ac)
5504 		ocfs2_free_alloc_context(meta_ac);
5505 
5506 	return ret;
5507 }
5508 
5509 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5510 {
5511 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5512 	struct ocfs2_dinode *di;
5513 	struct ocfs2_truncate_log *tl;
5514 
5515 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5516 	tl = &di->id2.i_dealloc;
5517 
5518 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5519 			"slot %d, invalid truncate log parameters: used = "
5520 			"%u, count = %u\n", osb->slot_num,
5521 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5522 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5523 }
5524 
5525 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5526 					   unsigned int new_start)
5527 {
5528 	unsigned int tail_index;
5529 	unsigned int current_tail;
5530 
5531 	/* No records, nothing to coalesce */
5532 	if (!le16_to_cpu(tl->tl_used))
5533 		return 0;
5534 
5535 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5536 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5537 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5538 
5539 	return current_tail == new_start;
5540 }
5541 
5542 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5543 			      handle_t *handle,
5544 			      u64 start_blk,
5545 			      unsigned int num_clusters)
5546 {
5547 	int status, index;
5548 	unsigned int start_cluster, tl_count;
5549 	struct inode *tl_inode = osb->osb_tl_inode;
5550 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5551 	struct ocfs2_dinode *di;
5552 	struct ocfs2_truncate_log *tl;
5553 
5554 	mlog_entry("start_blk = %llu, num_clusters = %u\n",
5555 		   (unsigned long long)start_blk, num_clusters);
5556 
5557 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5558 
5559 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5560 
5561 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5562 
5563 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5564 	 * by the underlying call to ocfs2_read_inode_block(), so any
5565 	 * corruption is a code bug */
5566 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5567 
5568 	tl = &di->id2.i_dealloc;
5569 	tl_count = le16_to_cpu(tl->tl_count);
5570 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5571 			tl_count == 0,
5572 			"Truncate record count on #%llu invalid "
5573 			"wanted %u, actual %u\n",
5574 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5575 			ocfs2_truncate_recs_per_inode(osb->sb),
5576 			le16_to_cpu(tl->tl_count));
5577 
5578 	/* Caller should have known to flush before calling us. */
5579 	index = le16_to_cpu(tl->tl_used);
5580 	if (index >= tl_count) {
5581 		status = -ENOSPC;
5582 		mlog_errno(status);
5583 		goto bail;
5584 	}
5585 
5586 	status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5587 					 OCFS2_JOURNAL_ACCESS_WRITE);
5588 	if (status < 0) {
5589 		mlog_errno(status);
5590 		goto bail;
5591 	}
5592 
5593 	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5594 	     "%llu (index = %d)\n", num_clusters, start_cluster,
5595 	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5596 
5597 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5598 		/*
5599 		 * Move index back to the record we are coalescing with.
5600 		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5601 		 */
5602 		index--;
5603 
5604 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5605 		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5606 		     index, le32_to_cpu(tl->tl_recs[index].t_start),
5607 		     num_clusters);
5608 	} else {
5609 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5610 		tl->tl_used = cpu_to_le16(index + 1);
5611 	}
5612 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5613 
5614 	status = ocfs2_journal_dirty(handle, tl_bh);
5615 	if (status < 0) {
5616 		mlog_errno(status);
5617 		goto bail;
5618 	}
5619 
5620 bail:
5621 	mlog_exit(status);
5622 	return status;
5623 }
5624 
5625 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5626 					 handle_t *handle,
5627 					 struct inode *data_alloc_inode,
5628 					 struct buffer_head *data_alloc_bh)
5629 {
5630 	int status = 0;
5631 	int i;
5632 	unsigned int num_clusters;
5633 	u64 start_blk;
5634 	struct ocfs2_truncate_rec rec;
5635 	struct ocfs2_dinode *di;
5636 	struct ocfs2_truncate_log *tl;
5637 	struct inode *tl_inode = osb->osb_tl_inode;
5638 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5639 
5640 	mlog_entry_void();
5641 
5642 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5643 	tl = &di->id2.i_dealloc;
5644 	i = le16_to_cpu(tl->tl_used) - 1;
5645 	while (i >= 0) {
5646 		/* Caller has given us at least enough credits to
5647 		 * update the truncate log dinode */
5648 		status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5649 						 OCFS2_JOURNAL_ACCESS_WRITE);
5650 		if (status < 0) {
5651 			mlog_errno(status);
5652 			goto bail;
5653 		}
5654 
5655 		tl->tl_used = cpu_to_le16(i);
5656 
5657 		status = ocfs2_journal_dirty(handle, tl_bh);
5658 		if (status < 0) {
5659 			mlog_errno(status);
5660 			goto bail;
5661 		}
5662 
5663 		/* TODO: Perhaps we can calculate the bulk of the
5664 		 * credits up front rather than extending like
5665 		 * this. */
5666 		status = ocfs2_extend_trans(handle,
5667 					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5668 		if (status < 0) {
5669 			mlog_errno(status);
5670 			goto bail;
5671 		}
5672 
5673 		rec = tl->tl_recs[i];
5674 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5675 						    le32_to_cpu(rec.t_start));
5676 		num_clusters = le32_to_cpu(rec.t_clusters);
5677 
5678 		/* if start_blk is not set, we ignore the record as
5679 		 * invalid. */
5680 		if (start_blk) {
5681 			mlog(0, "free record %d, start = %u, clusters = %u\n",
5682 			     i, le32_to_cpu(rec.t_start), num_clusters);
5683 
5684 			status = ocfs2_free_clusters(handle, data_alloc_inode,
5685 						     data_alloc_bh, start_blk,
5686 						     num_clusters);
5687 			if (status < 0) {
5688 				mlog_errno(status);
5689 				goto bail;
5690 			}
5691 		}
5692 		i--;
5693 	}
5694 
5695 bail:
5696 	mlog_exit(status);
5697 	return status;
5698 }
5699 
5700 /* Expects you to already be holding tl_inode->i_mutex */
5701 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5702 {
5703 	int status;
5704 	unsigned int num_to_flush;
5705 	handle_t *handle;
5706 	struct inode *tl_inode = osb->osb_tl_inode;
5707 	struct inode *data_alloc_inode = NULL;
5708 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5709 	struct buffer_head *data_alloc_bh = NULL;
5710 	struct ocfs2_dinode *di;
5711 	struct ocfs2_truncate_log *tl;
5712 
5713 	mlog_entry_void();
5714 
5715 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5716 
5717 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5718 
5719 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5720 	 * by the underlying call to ocfs2_read_inode_block(), so any
5721 	 * corruption is a code bug */
5722 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5723 
5724 	tl = &di->id2.i_dealloc;
5725 	num_to_flush = le16_to_cpu(tl->tl_used);
5726 	mlog(0, "Flush %u records from truncate log #%llu\n",
5727 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5728 	if (!num_to_flush) {
5729 		status = 0;
5730 		goto out;
5731 	}
5732 
5733 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
5734 						       GLOBAL_BITMAP_SYSTEM_INODE,
5735 						       OCFS2_INVALID_SLOT);
5736 	if (!data_alloc_inode) {
5737 		status = -EINVAL;
5738 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
5739 		goto out;
5740 	}
5741 
5742 	mutex_lock(&data_alloc_inode->i_mutex);
5743 
5744 	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5745 	if (status < 0) {
5746 		mlog_errno(status);
5747 		goto out_mutex;
5748 	}
5749 
5750 	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5751 	if (IS_ERR(handle)) {
5752 		status = PTR_ERR(handle);
5753 		mlog_errno(status);
5754 		goto out_unlock;
5755 	}
5756 
5757 	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5758 					       data_alloc_bh);
5759 	if (status < 0)
5760 		mlog_errno(status);
5761 
5762 	ocfs2_commit_trans(osb, handle);
5763 
5764 out_unlock:
5765 	brelse(data_alloc_bh);
5766 	ocfs2_inode_unlock(data_alloc_inode, 1);
5767 
5768 out_mutex:
5769 	mutex_unlock(&data_alloc_inode->i_mutex);
5770 	iput(data_alloc_inode);
5771 
5772 out:
5773 	mlog_exit(status);
5774 	return status;
5775 }
5776 
5777 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5778 {
5779 	int status;
5780 	struct inode *tl_inode = osb->osb_tl_inode;
5781 
5782 	mutex_lock(&tl_inode->i_mutex);
5783 	status = __ocfs2_flush_truncate_log(osb);
5784 	mutex_unlock(&tl_inode->i_mutex);
5785 
5786 	return status;
5787 }
5788 
5789 static void ocfs2_truncate_log_worker(struct work_struct *work)
5790 {
5791 	int status;
5792 	struct ocfs2_super *osb =
5793 		container_of(work, struct ocfs2_super,
5794 			     osb_truncate_log_wq.work);
5795 
5796 	mlog_entry_void();
5797 
5798 	status = ocfs2_flush_truncate_log(osb);
5799 	if (status < 0)
5800 		mlog_errno(status);
5801 	else
5802 		ocfs2_init_inode_steal_slot(osb);
5803 
5804 	mlog_exit(status);
5805 }
5806 
5807 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5808 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5809 				       int cancel)
5810 {
5811 	if (osb->osb_tl_inode) {
5812 		/* We want to push off log flushes while truncates are
5813 		 * still running. */
5814 		if (cancel)
5815 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5816 
5817 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5818 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5819 	}
5820 }
5821 
5822 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5823 				       int slot_num,
5824 				       struct inode **tl_inode,
5825 				       struct buffer_head **tl_bh)
5826 {
5827 	int status;
5828 	struct inode *inode = NULL;
5829 	struct buffer_head *bh = NULL;
5830 
5831 	inode = ocfs2_get_system_file_inode(osb,
5832 					   TRUNCATE_LOG_SYSTEM_INODE,
5833 					   slot_num);
5834 	if (!inode) {
5835 		status = -EINVAL;
5836 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5837 		goto bail;
5838 	}
5839 
5840 	status = ocfs2_read_inode_block(inode, &bh);
5841 	if (status < 0) {
5842 		iput(inode);
5843 		mlog_errno(status);
5844 		goto bail;
5845 	}
5846 
5847 	*tl_inode = inode;
5848 	*tl_bh    = bh;
5849 bail:
5850 	mlog_exit(status);
5851 	return status;
5852 }
5853 
5854 /* called during the 1st stage of node recovery. we stamp a clean
5855  * truncate log and pass back a copy for processing later. if the
5856  * truncate log does not require processing, a *tl_copy is set to
5857  * NULL. */
5858 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5859 				      int slot_num,
5860 				      struct ocfs2_dinode **tl_copy)
5861 {
5862 	int status;
5863 	struct inode *tl_inode = NULL;
5864 	struct buffer_head *tl_bh = NULL;
5865 	struct ocfs2_dinode *di;
5866 	struct ocfs2_truncate_log *tl;
5867 
5868 	*tl_copy = NULL;
5869 
5870 	mlog(0, "recover truncate log from slot %d\n", slot_num);
5871 
5872 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5873 	if (status < 0) {
5874 		mlog_errno(status);
5875 		goto bail;
5876 	}
5877 
5878 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5879 
5880 	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5881 	 * validated by the underlying call to ocfs2_read_inode_block(),
5882 	 * so any corruption is a code bug */
5883 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5884 
5885 	tl = &di->id2.i_dealloc;
5886 	if (le16_to_cpu(tl->tl_used)) {
5887 		mlog(0, "We'll have %u logs to recover\n",
5888 		     le16_to_cpu(tl->tl_used));
5889 
5890 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5891 		if (!(*tl_copy)) {
5892 			status = -ENOMEM;
5893 			mlog_errno(status);
5894 			goto bail;
5895 		}
5896 
5897 		/* Assuming the write-out below goes well, this copy
5898 		 * will be passed back to recovery for processing. */
5899 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5900 
5901 		/* All we need to do to clear the truncate log is set
5902 		 * tl_used. */
5903 		tl->tl_used = 0;
5904 
5905 		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5906 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
5907 		if (status < 0) {
5908 			mlog_errno(status);
5909 			goto bail;
5910 		}
5911 	}
5912 
5913 bail:
5914 	if (tl_inode)
5915 		iput(tl_inode);
5916 	brelse(tl_bh);
5917 
5918 	if (status < 0 && (*tl_copy)) {
5919 		kfree(*tl_copy);
5920 		*tl_copy = NULL;
5921 	}
5922 
5923 	mlog_exit(status);
5924 	return status;
5925 }
5926 
5927 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5928 					 struct ocfs2_dinode *tl_copy)
5929 {
5930 	int status = 0;
5931 	int i;
5932 	unsigned int clusters, num_recs, start_cluster;
5933 	u64 start_blk;
5934 	handle_t *handle;
5935 	struct inode *tl_inode = osb->osb_tl_inode;
5936 	struct ocfs2_truncate_log *tl;
5937 
5938 	mlog_entry_void();
5939 
5940 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5941 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5942 		return -EINVAL;
5943 	}
5944 
5945 	tl = &tl_copy->id2.i_dealloc;
5946 	num_recs = le16_to_cpu(tl->tl_used);
5947 	mlog(0, "cleanup %u records from %llu\n", num_recs,
5948 	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5949 
5950 	mutex_lock(&tl_inode->i_mutex);
5951 	for(i = 0; i < num_recs; i++) {
5952 		if (ocfs2_truncate_log_needs_flush(osb)) {
5953 			status = __ocfs2_flush_truncate_log(osb);
5954 			if (status < 0) {
5955 				mlog_errno(status);
5956 				goto bail_up;
5957 			}
5958 		}
5959 
5960 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5961 		if (IS_ERR(handle)) {
5962 			status = PTR_ERR(handle);
5963 			mlog_errno(status);
5964 			goto bail_up;
5965 		}
5966 
5967 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5968 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5969 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5970 
5971 		status = ocfs2_truncate_log_append(osb, handle,
5972 						   start_blk, clusters);
5973 		ocfs2_commit_trans(osb, handle);
5974 		if (status < 0) {
5975 			mlog_errno(status);
5976 			goto bail_up;
5977 		}
5978 	}
5979 
5980 bail_up:
5981 	mutex_unlock(&tl_inode->i_mutex);
5982 
5983 	mlog_exit(status);
5984 	return status;
5985 }
5986 
5987 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5988 {
5989 	int status;
5990 	struct inode *tl_inode = osb->osb_tl_inode;
5991 
5992 	mlog_entry_void();
5993 
5994 	if (tl_inode) {
5995 		cancel_delayed_work(&osb->osb_truncate_log_wq);
5996 		flush_workqueue(ocfs2_wq);
5997 
5998 		status = ocfs2_flush_truncate_log(osb);
5999 		if (status < 0)
6000 			mlog_errno(status);
6001 
6002 		brelse(osb->osb_tl_bh);
6003 		iput(osb->osb_tl_inode);
6004 	}
6005 
6006 	mlog_exit_void();
6007 }
6008 
6009 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6010 {
6011 	int status;
6012 	struct inode *tl_inode = NULL;
6013 	struct buffer_head *tl_bh = NULL;
6014 
6015 	mlog_entry_void();
6016 
6017 	status = ocfs2_get_truncate_log_info(osb,
6018 					     osb->slot_num,
6019 					     &tl_inode,
6020 					     &tl_bh);
6021 	if (status < 0)
6022 		mlog_errno(status);
6023 
6024 	/* ocfs2_truncate_log_shutdown keys on the existence of
6025 	 * osb->osb_tl_inode so we don't set any of the osb variables
6026 	 * until we're sure all is well. */
6027 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6028 			  ocfs2_truncate_log_worker);
6029 	osb->osb_tl_bh    = tl_bh;
6030 	osb->osb_tl_inode = tl_inode;
6031 
6032 	mlog_exit(status);
6033 	return status;
6034 }
6035 
6036 /*
6037  * Delayed de-allocation of suballocator blocks.
6038  *
6039  * Some sets of block de-allocations might involve multiple suballocator inodes.
6040  *
6041  * The locking for this can get extremely complicated, especially when
6042  * the suballocator inodes to delete from aren't known until deep
6043  * within an unrelated codepath.
6044  *
6045  * ocfs2_extent_block structures are a good example of this - an inode
6046  * btree could have been grown by any number of nodes each allocating
6047  * out of their own suballoc inode.
6048  *
6049  * These structures allow the delay of block de-allocation until a
6050  * later time, when locking of multiple cluster inodes won't cause
6051  * deadlock.
6052  */
6053 
6054 /*
6055  * Describe a single bit freed from a suballocator.  For the block
6056  * suballocators, it represents one block.  For the global cluster
6057  * allocator, it represents some clusters and free_bit indicates
6058  * clusters number.
6059  */
6060 struct ocfs2_cached_block_free {
6061 	struct ocfs2_cached_block_free		*free_next;
6062 	u64					free_blk;
6063 	unsigned int				free_bit;
6064 };
6065 
/*
 * List of cached block frees that all belong to one suballocator,
 * identified by its system inode type and slot.  The lists of different
 * suballocators are chained through f_next_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list, NULL at the end. */
	struct ocfs2_per_slot_free_list *f_next_suballocator;
	/* System inode type of the suballocator. */
	int f_inode_type;
	/* Slot of the suballocator. */
	int f_slot;
	/* First cached free of this suballocator. */
	struct ocfs2_cached_block_free *f_first;
};
6072 
6073 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6074 				    int sysfile_type,
6075 				    int slot,
6076 				    struct ocfs2_cached_block_free *head)
6077 {
6078 	int ret;
6079 	u64 bg_blkno;
6080 	handle_t *handle;
6081 	struct inode *inode;
6082 	struct buffer_head *di_bh = NULL;
6083 	struct ocfs2_cached_block_free *tmp;
6084 
6085 	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6086 	if (!inode) {
6087 		ret = -EINVAL;
6088 		mlog_errno(ret);
6089 		goto out;
6090 	}
6091 
6092 	mutex_lock(&inode->i_mutex);
6093 
6094 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
6095 	if (ret) {
6096 		mlog_errno(ret);
6097 		goto out_mutex;
6098 	}
6099 
6100 	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6101 	if (IS_ERR(handle)) {
6102 		ret = PTR_ERR(handle);
6103 		mlog_errno(ret);
6104 		goto out_unlock;
6105 	}
6106 
6107 	while (head) {
6108 		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6109 						      head->free_bit);
6110 		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6111 		     head->free_bit, (unsigned long long)head->free_blk);
6112 
6113 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6114 					       head->free_bit, bg_blkno, 1);
6115 		if (ret) {
6116 			mlog_errno(ret);
6117 			goto out_journal;
6118 		}
6119 
6120 		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6121 		if (ret) {
6122 			mlog_errno(ret);
6123 			goto out_journal;
6124 		}
6125 
6126 		tmp = head;
6127 		head = head->free_next;
6128 		kfree(tmp);
6129 	}
6130 
6131 out_journal:
6132 	ocfs2_commit_trans(osb, handle);
6133 
6134 out_unlock:
6135 	ocfs2_inode_unlock(inode, 1);
6136 	brelse(di_bh);
6137 out_mutex:
6138 	mutex_unlock(&inode->i_mutex);
6139 	iput(inode);
6140 out:
6141 	while(head) {
6142 		/* Premature exit may have left some dangling items. */
6143 		tmp = head;
6144 		head = head->free_next;
6145 		kfree(tmp);
6146 	}
6147 
6148 	return ret;
6149 }
6150 
6151 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6152 				u64 blkno, unsigned int bit)
6153 {
6154 	int ret = 0;
6155 	struct ocfs2_cached_block_free *item;
6156 
6157 	item = kmalloc(sizeof(*item), GFP_NOFS);
6158 	if (item == NULL) {
6159 		ret = -ENOMEM;
6160 		mlog_errno(ret);
6161 		return ret;
6162 	}
6163 
6164 	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6165 	     bit, (unsigned long long)blkno);
6166 
6167 	item->free_blk = blkno;
6168 	item->free_bit = bit;
6169 	item->free_next = ctxt->c_global_allocator;
6170 
6171 	ctxt->c_global_allocator = item;
6172 	return ret;
6173 }
6174 
6175 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6176 				      struct ocfs2_cached_block_free *head)
6177 {
6178 	struct ocfs2_cached_block_free *tmp;
6179 	struct inode *tl_inode = osb->osb_tl_inode;
6180 	handle_t *handle;
6181 	int ret = 0;
6182 
6183 	mutex_lock(&tl_inode->i_mutex);
6184 
6185 	while (head) {
6186 		if (ocfs2_truncate_log_needs_flush(osb)) {
6187 			ret = __ocfs2_flush_truncate_log(osb);
6188 			if (ret < 0) {
6189 				mlog_errno(ret);
6190 				break;
6191 			}
6192 		}
6193 
6194 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6195 		if (IS_ERR(handle)) {
6196 			ret = PTR_ERR(handle);
6197 			mlog_errno(ret);
6198 			break;
6199 		}
6200 
6201 		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6202 						head->free_bit);
6203 
6204 		ocfs2_commit_trans(osb, handle);
6205 		tmp = head;
6206 		head = head->free_next;
6207 		kfree(tmp);
6208 
6209 		if (ret < 0) {
6210 			mlog_errno(ret);
6211 			break;
6212 		}
6213 	}
6214 
6215 	mutex_unlock(&tl_inode->i_mutex);
6216 
6217 	while (head) {
6218 		/* Premature exit may have left some dangling items. */
6219 		tmp = head;
6220 		head = head->free_next;
6221 		kfree(tmp);
6222 	}
6223 
6224 	return ret;
6225 }
6226 
6227 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6228 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
6229 {
6230 	int ret = 0, ret2;
6231 	struct ocfs2_per_slot_free_list *fl;
6232 
6233 	if (!ctxt)
6234 		return 0;
6235 
6236 	while (ctxt->c_first_suballocator) {
6237 		fl = ctxt->c_first_suballocator;
6238 
6239 		if (fl->f_first) {
6240 			mlog(0, "Free items: (type %u, slot %d)\n",
6241 			     fl->f_inode_type, fl->f_slot);
6242 			ret2 = ocfs2_free_cached_blocks(osb,
6243 							fl->f_inode_type,
6244 							fl->f_slot,
6245 							fl->f_first);
6246 			if (ret2)
6247 				mlog_errno(ret2);
6248 			if (!ret)
6249 				ret = ret2;
6250 		}
6251 
6252 		ctxt->c_first_suballocator = fl->f_next_suballocator;
6253 		kfree(fl);
6254 	}
6255 
6256 	if (ctxt->c_global_allocator) {
6257 		ret2 = ocfs2_free_cached_clusters(osb,
6258 						  ctxt->c_global_allocator);
6259 		if (ret2)
6260 			mlog_errno(ret2);
6261 		if (!ret)
6262 			ret = ret2;
6263 
6264 		ctxt->c_global_allocator = NULL;
6265 	}
6266 
6267 	return ret;
6268 }
6269 
6270 static struct ocfs2_per_slot_free_list *
6271 ocfs2_find_per_slot_free_list(int type,
6272 			      int slot,
6273 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
6274 {
6275 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6276 
6277 	while (fl) {
6278 		if (fl->f_inode_type == type && fl->f_slot == slot)
6279 			return fl;
6280 
6281 		fl = fl->f_next_suballocator;
6282 	}
6283 
6284 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
6285 	if (fl) {
6286 		fl->f_inode_type = type;
6287 		fl->f_slot = slot;
6288 		fl->f_first = NULL;
6289 		fl->f_next_suballocator = ctxt->c_first_suballocator;
6290 
6291 		ctxt->c_first_suballocator = fl;
6292 	}
6293 	return fl;
6294 }
6295 
6296 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6297 				     int type, int slot, u64 blkno,
6298 				     unsigned int bit)
6299 {
6300 	int ret;
6301 	struct ocfs2_per_slot_free_list *fl;
6302 	struct ocfs2_cached_block_free *item;
6303 
6304 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6305 	if (fl == NULL) {
6306 		ret = -ENOMEM;
6307 		mlog_errno(ret);
6308 		goto out;
6309 	}
6310 
6311 	item = kmalloc(sizeof(*item), GFP_NOFS);
6312 	if (item == NULL) {
6313 		ret = -ENOMEM;
6314 		mlog_errno(ret);
6315 		goto out;
6316 	}
6317 
6318 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6319 	     type, slot, bit, (unsigned long long)blkno);
6320 
6321 	item->free_blk = blkno;
6322 	item->free_bit = bit;
6323 	item->free_next = fl->f_first;
6324 
6325 	fl->f_first = item;
6326 
6327 	ret = 0;
6328 out:
6329 	return ret;
6330 }
6331 
6332 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6333 					 struct ocfs2_extent_block *eb)
6334 {
6335 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6336 					 le16_to_cpu(eb->h_suballoc_slot),
6337 					 le64_to_cpu(eb->h_blkno),
6338 					 le16_to_cpu(eb->h_suballoc_bit));
6339 }
6340 
6341 /* This function will figure out whether the currently last extent
6342  * block will be deleted, and if it will, what the new last extent
6343  * block will be so we can update his h_next_leaf_blk field, as well
6344  * as the dinodes i_last_eb_blk */
6345 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6346 				       unsigned int clusters_to_del,
6347 				       struct ocfs2_path *path,
6348 				       struct buffer_head **new_last_eb)
6349 {
6350 	int next_free, ret = 0;
6351 	u32 cpos;
6352 	struct ocfs2_extent_rec *rec;
6353 	struct ocfs2_extent_block *eb;
6354 	struct ocfs2_extent_list *el;
6355 	struct buffer_head *bh = NULL;
6356 
6357 	*new_last_eb = NULL;
6358 
6359 	/* we have no tree, so of course, no last_eb. */
6360 	if (!path->p_tree_depth)
6361 		goto out;
6362 
6363 	/* trunc to zero special case - this makes tree_depth = 0
6364 	 * regardless of what it is.  */
6365 	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6366 		goto out;
6367 
6368 	el = path_leaf_el(path);
6369 	BUG_ON(!el->l_next_free_rec);
6370 
6371 	/*
6372 	 * Make sure that this extent list will actually be empty
6373 	 * after we clear away the data. We can shortcut out if
6374 	 * there's more than one non-empty extent in the
6375 	 * list. Otherwise, a check of the remaining extent is
6376 	 * necessary.
6377 	 */
6378 	next_free = le16_to_cpu(el->l_next_free_rec);
6379 	rec = NULL;
6380 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6381 		if (next_free > 2)
6382 			goto out;
6383 
6384 		/* We may have a valid extent in index 1, check it. */
6385 		if (next_free == 2)
6386 			rec = &el->l_recs[1];
6387 
6388 		/*
6389 		 * Fall through - no more nonempty extents, so we want
6390 		 * to delete this leaf.
6391 		 */
6392 	} else {
6393 		if (next_free > 1)
6394 			goto out;
6395 
6396 		rec = &el->l_recs[0];
6397 	}
6398 
6399 	if (rec) {
6400 		/*
6401 		 * Check it we'll only be trimming off the end of this
6402 		 * cluster.
6403 		 */
6404 		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6405 			goto out;
6406 	}
6407 
6408 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6409 	if (ret) {
6410 		mlog_errno(ret);
6411 		goto out;
6412 	}
6413 
6414 	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6415 	if (ret) {
6416 		mlog_errno(ret);
6417 		goto out;
6418 	}
6419 
6420 	eb = (struct ocfs2_extent_block *) bh->b_data;
6421 	el = &eb->h_list;
6422 
6423 	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6424 	 * Any corruption is a code bug. */
6425 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6426 
6427 	*new_last_eb = bh;
6428 	get_bh(*new_last_eb);
6429 	mlog(0, "returning block %llu, (cpos: %u)\n",
6430 	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6431 out:
6432 	brelse(bh);
6433 
6434 	return ret;
6435 }
6436 
6437 /*
6438  * Trim some clusters off the rightmost edge of a tree. Only called
6439  * during truncate.
6440  *
6441  * The caller needs to:
6442  *   - start journaling of each path component.
6443  *   - compute and fully set up any new last ext block
6444  */
6445 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6446 			   handle_t *handle, struct ocfs2_truncate_context *tc,
6447 			   u32 clusters_to_del, u64 *delete_start)
6448 {
6449 	int ret, i, index = path->p_tree_depth;
6450 	u32 new_edge = 0;
6451 	u64 deleted_eb = 0;
6452 	struct buffer_head *bh;
6453 	struct ocfs2_extent_list *el;
6454 	struct ocfs2_extent_rec *rec;
6455 
6456 	*delete_start = 0;
6457 
6458 	while (index >= 0) {
6459 		bh = path->p_node[index].bh;
6460 		el = path->p_node[index].el;
6461 
6462 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
6463 		     index,  (unsigned long long)bh->b_blocknr);
6464 
6465 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6466 
6467 		if (index !=
6468 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6469 			ocfs2_error(inode->i_sb,
6470 				    "Inode %lu has invalid ext. block %llu",
6471 				    inode->i_ino,
6472 				    (unsigned long long)bh->b_blocknr);
6473 			ret = -EROFS;
6474 			goto out;
6475 		}
6476 
6477 find_tail_record:
6478 		i = le16_to_cpu(el->l_next_free_rec) - 1;
6479 		rec = &el->l_recs[i];
6480 
6481 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6482 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6483 		     ocfs2_rec_clusters(el, rec),
6484 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6485 		     le16_to_cpu(el->l_next_free_rec));
6486 
6487 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6488 
6489 		if (le16_to_cpu(el->l_tree_depth) == 0) {
6490 			/*
6491 			 * If the leaf block contains a single empty
6492 			 * extent and no records, we can just remove
6493 			 * the block.
6494 			 */
6495 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
6496 				memset(rec, 0,
6497 				       sizeof(struct ocfs2_extent_rec));
6498 				el->l_next_free_rec = cpu_to_le16(0);
6499 
6500 				goto delete;
6501 			}
6502 
6503 			/*
6504 			 * Remove any empty extents by shifting things
6505 			 * left. That should make life much easier on
6506 			 * the code below. This condition is rare
6507 			 * enough that we shouldn't see a performance
6508 			 * hit.
6509 			 */
6510 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6511 				le16_add_cpu(&el->l_next_free_rec, -1);
6512 
6513 				for(i = 0;
6514 				    i < le16_to_cpu(el->l_next_free_rec); i++)
6515 					el->l_recs[i] = el->l_recs[i + 1];
6516 
6517 				memset(&el->l_recs[i], 0,
6518 				       sizeof(struct ocfs2_extent_rec));
6519 
6520 				/*
6521 				 * We've modified our extent list. The
6522 				 * simplest way to handle this change
6523 				 * is to being the search from the
6524 				 * start again.
6525 				 */
6526 				goto find_tail_record;
6527 			}
6528 
6529 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6530 
6531 			/*
6532 			 * We'll use "new_edge" on our way back up the
6533 			 * tree to know what our rightmost cpos is.
6534 			 */
6535 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
6536 			new_edge += le32_to_cpu(rec->e_cpos);
6537 
6538 			/*
6539 			 * The caller will use this to delete data blocks.
6540 			 */
6541 			*delete_start = le64_to_cpu(rec->e_blkno)
6542 				+ ocfs2_clusters_to_blocks(inode->i_sb,
6543 					le16_to_cpu(rec->e_leaf_clusters));
6544 
6545 			/*
6546 			 * If it's now empty, remove this record.
6547 			 */
6548 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6549 				memset(rec, 0,
6550 				       sizeof(struct ocfs2_extent_rec));
6551 				le16_add_cpu(&el->l_next_free_rec, -1);
6552 			}
6553 		} else {
6554 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6555 				memset(rec, 0,
6556 				       sizeof(struct ocfs2_extent_rec));
6557 				le16_add_cpu(&el->l_next_free_rec, -1);
6558 
6559 				goto delete;
6560 			}
6561 
6562 			/* Can this actually happen? */
6563 			if (le16_to_cpu(el->l_next_free_rec) == 0)
6564 				goto delete;
6565 
6566 			/*
6567 			 * We never actually deleted any clusters
6568 			 * because our leaf was empty. There's no
6569 			 * reason to adjust the rightmost edge then.
6570 			 */
6571 			if (new_edge == 0)
6572 				goto delete;
6573 
6574 			rec->e_int_clusters = cpu_to_le32(new_edge);
6575 			le32_add_cpu(&rec->e_int_clusters,
6576 				     -le32_to_cpu(rec->e_cpos));
6577 
6578 			 /*
6579 			  * A deleted child record should have been
6580 			  * caught above.
6581 			  */
6582 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6583 		}
6584 
6585 delete:
6586 		ret = ocfs2_journal_dirty(handle, bh);
6587 		if (ret) {
6588 			mlog_errno(ret);
6589 			goto out;
6590 		}
6591 
6592 		mlog(0, "extent list container %llu, after: record %d: "
6593 		     "(%u, %u, %llu), next = %u.\n",
6594 		     (unsigned long long)bh->b_blocknr, i,
6595 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6596 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6597 		     le16_to_cpu(el->l_next_free_rec));
6598 
6599 		/*
6600 		 * We must be careful to only attempt delete of an
6601 		 * extent block (and not the root inode block).
6602 		 */
6603 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6604 			struct ocfs2_extent_block *eb =
6605 				(struct ocfs2_extent_block *)bh->b_data;
6606 
6607 			/*
6608 			 * Save this for use when processing the
6609 			 * parent block.
6610 			 */
6611 			deleted_eb = le64_to_cpu(eb->h_blkno);
6612 
6613 			mlog(0, "deleting this extent block.\n");
6614 
6615 			ocfs2_remove_from_cache(inode, bh);
6616 
6617 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6618 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6619 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6620 
6621 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6622 			/* An error here is not fatal. */
6623 			if (ret < 0)
6624 				mlog_errno(ret);
6625 		} else {
6626 			deleted_eb = 0;
6627 		}
6628 
6629 		index--;
6630 	}
6631 
6632 	ret = 0;
6633 out:
6634 	return ret;
6635 }
6636 
6637 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6638 			     unsigned int clusters_to_del,
6639 			     struct inode *inode,
6640 			     struct buffer_head *fe_bh,
6641 			     handle_t *handle,
6642 			     struct ocfs2_truncate_context *tc,
6643 			     struct ocfs2_path *path)
6644 {
6645 	int status;
6646 	struct ocfs2_dinode *fe;
6647 	struct ocfs2_extent_block *last_eb = NULL;
6648 	struct ocfs2_extent_list *el;
6649 	struct buffer_head *last_eb_bh = NULL;
6650 	u64 delete_blk = 0;
6651 
6652 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
6653 
6654 	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6655 					     path, &last_eb_bh);
6656 	if (status < 0) {
6657 		mlog_errno(status);
6658 		goto bail;
6659 	}
6660 
6661 	/*
6662 	 * Each component will be touched, so we might as well journal
6663 	 * here to avoid having to handle errors later.
6664 	 */
6665 	status = ocfs2_journal_access_path(inode, handle, path);
6666 	if (status < 0) {
6667 		mlog_errno(status);
6668 		goto bail;
6669 	}
6670 
6671 	if (last_eb_bh) {
6672 		status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6673 						 OCFS2_JOURNAL_ACCESS_WRITE);
6674 		if (status < 0) {
6675 			mlog_errno(status);
6676 			goto bail;
6677 		}
6678 
6679 		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6680 	}
6681 
6682 	el = &(fe->id2.i_list);
6683 
6684 	/*
6685 	 * Lower levels depend on this never happening, but it's best
6686 	 * to check it up here before changing the tree.
6687 	 */
6688 	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6689 		ocfs2_error(inode->i_sb,
6690 			    "Inode %lu has an empty extent record, depth %u\n",
6691 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
6692 		status = -EROFS;
6693 		goto bail;
6694 	}
6695 
6696 	vfs_dq_free_space_nodirty(inode,
6697 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6698 	spin_lock(&OCFS2_I(inode)->ip_lock);
6699 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6700 				      clusters_to_del;
6701 	spin_unlock(&OCFS2_I(inode)->ip_lock);
6702 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6703 	inode->i_blocks = ocfs2_inode_sector_count(inode);
6704 
6705 	status = ocfs2_trim_tree(inode, path, handle, tc,
6706 				 clusters_to_del, &delete_blk);
6707 	if (status) {
6708 		mlog_errno(status);
6709 		goto bail;
6710 	}
6711 
6712 	if (le32_to_cpu(fe->i_clusters) == 0) {
6713 		/* trunc to zero is a special case. */
6714 		el->l_tree_depth = 0;
6715 		fe->i_last_eb_blk = 0;
6716 	} else if (last_eb)
6717 		fe->i_last_eb_blk = last_eb->h_blkno;
6718 
6719 	status = ocfs2_journal_dirty(handle, fe_bh);
6720 	if (status < 0) {
6721 		mlog_errno(status);
6722 		goto bail;
6723 	}
6724 
6725 	if (last_eb) {
6726 		/* If there will be a new last extent block, then by
6727 		 * definition, there cannot be any leaves to the right of
6728 		 * him. */
6729 		last_eb->h_next_leaf_blk = 0;
6730 		status = ocfs2_journal_dirty(handle, last_eb_bh);
6731 		if (status < 0) {
6732 			mlog_errno(status);
6733 			goto bail;
6734 		}
6735 	}
6736 
6737 	if (delete_blk) {
6738 		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6739 						   clusters_to_del);
6740 		if (status < 0) {
6741 			mlog_errno(status);
6742 			goto bail;
6743 		}
6744 	}
6745 	status = 0;
6746 bail:
6747 
6748 	mlog_exit(status);
6749 	return status;
6750 }
6751 
6752 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6753 {
6754 	set_buffer_uptodate(bh);
6755 	mark_buffer_dirty(bh);
6756 	return 0;
6757 }
6758 
6759 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6760 				     unsigned int from, unsigned int to,
6761 				     struct page *page, int zero, u64 *phys)
6762 {
6763 	int ret, partial = 0;
6764 
6765 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6766 	if (ret)
6767 		mlog_errno(ret);
6768 
6769 	if (zero)
6770 		zero_user_segment(page, from, to);
6771 
6772 	/*
6773 	 * Need to set the buffers we zero'd into uptodate
6774 	 * here if they aren't - ocfs2_map_page_blocks()
6775 	 * might've skipped some
6776 	 */
6777 	ret = walk_page_buffers(handle, page_buffers(page),
6778 				from, to, &partial,
6779 				ocfs2_zero_func);
6780 	if (ret < 0)
6781 		mlog_errno(ret);
6782 	else if (ocfs2_should_order_data(inode)) {
6783 		ret = ocfs2_jbd2_file_inode(handle, inode);
6784 		if (ret < 0)
6785 			mlog_errno(ret);
6786 	}
6787 
6788 	if (!partial)
6789 		SetPageUptodate(page);
6790 
6791 	flush_dcache_page(page);
6792 }
6793 
6794 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6795 				     loff_t end, struct page **pages,
6796 				     int numpages, u64 phys, handle_t *handle)
6797 {
6798 	int i;
6799 	struct page *page;
6800 	unsigned int from, to = PAGE_CACHE_SIZE;
6801 	struct super_block *sb = inode->i_sb;
6802 
6803 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6804 
6805 	if (numpages == 0)
6806 		goto out;
6807 
6808 	to = PAGE_CACHE_SIZE;
6809 	for(i = 0; i < numpages; i++) {
6810 		page = pages[i];
6811 
6812 		from = start & (PAGE_CACHE_SIZE - 1);
6813 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6814 			to = end & (PAGE_CACHE_SIZE - 1);
6815 
6816 		BUG_ON(from > PAGE_CACHE_SIZE);
6817 		BUG_ON(to > PAGE_CACHE_SIZE);
6818 
6819 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6820 					 &phys);
6821 
6822 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6823 	}
6824 out:
6825 	if (pages)
6826 		ocfs2_unlock_and_free_pages(pages, numpages);
6827 }
6828 
6829 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6830 				struct page **pages, int *num)
6831 {
6832 	int numpages, ret = 0;
6833 	struct super_block *sb = inode->i_sb;
6834 	struct address_space *mapping = inode->i_mapping;
6835 	unsigned long index;
6836 	loff_t last_page_bytes;
6837 
6838 	BUG_ON(start > end);
6839 
6840 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6841 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6842 
6843 	numpages = 0;
6844 	last_page_bytes = PAGE_ALIGN(end);
6845 	index = start >> PAGE_CACHE_SHIFT;
6846 	do {
6847 		pages[numpages] = grab_cache_page(mapping, index);
6848 		if (!pages[numpages]) {
6849 			ret = -ENOMEM;
6850 			mlog_errno(ret);
6851 			goto out;
6852 		}
6853 
6854 		numpages++;
6855 		index++;
6856 	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6857 
6858 out:
6859 	if (ret != 0) {
6860 		if (pages)
6861 			ocfs2_unlock_and_free_pages(pages, numpages);
6862 		numpages = 0;
6863 	}
6864 
6865 	*num = numpages;
6866 
6867 	return ret;
6868 }
6869 
6870 /*
6871  * Zero the area past i_size but still within an allocated
6872  * cluster. This avoids exposing nonzero data on subsequent file
6873  * extends.
6874  *
6875  * We need to call this before i_size is updated on the inode because
6876  * otherwise block_write_full_page() will skip writeout of pages past
6877  * i_size. The new_i_size parameter is passed for this reason.
6878  */
6879 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6880 				  u64 range_start, u64 range_end)
6881 {
6882 	int ret = 0, numpages;
6883 	struct page **pages = NULL;
6884 	u64 phys;
6885 	unsigned int ext_flags;
6886 	struct super_block *sb = inode->i_sb;
6887 
6888 	/*
6889 	 * File systems which don't support sparse files zero on every
6890 	 * extend.
6891 	 */
6892 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6893 		return 0;
6894 
6895 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
6896 			sizeof(struct page *), GFP_NOFS);
6897 	if (pages == NULL) {
6898 		ret = -ENOMEM;
6899 		mlog_errno(ret);
6900 		goto out;
6901 	}
6902 
6903 	if (range_start == range_end)
6904 		goto out;
6905 
6906 	ret = ocfs2_extent_map_get_blocks(inode,
6907 					  range_start >> sb->s_blocksize_bits,
6908 					  &phys, NULL, &ext_flags);
6909 	if (ret) {
6910 		mlog_errno(ret);
6911 		goto out;
6912 	}
6913 
6914 	/*
6915 	 * Tail is a hole, or is marked unwritten. In either case, we
6916 	 * can count on read and write to return/push zero's.
6917 	 */
6918 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6919 		goto out;
6920 
6921 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6922 				   &numpages);
6923 	if (ret) {
6924 		mlog_errno(ret);
6925 		goto out;
6926 	}
6927 
6928 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6929 				 numpages, phys, handle);
6930 
6931 	/*
6932 	 * Initiate writeout of the pages we zero'd here. We don't
6933 	 * wait on them - the truncate_inode_pages() call later will
6934 	 * do that for us.
6935 	 */
6936 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
6937 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
6938 	if (ret)
6939 		mlog_errno(ret);
6940 
6941 out:
6942 	if (pages)
6943 		kfree(pages);
6944 
6945 	return ret;
6946 }
6947 
6948 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6949 					     struct ocfs2_dinode *di)
6950 {
6951 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6952 	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6953 
6954 	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6955 		memset(&di->id2, 0, blocksize -
6956 				    offsetof(struct ocfs2_dinode, id2) -
6957 				    xattrsize);
6958 	else
6959 		memset(&di->id2, 0, blocksize -
6960 				    offsetof(struct ocfs2_dinode, id2));
6961 }
6962 
6963 void ocfs2_dinode_new_extent_list(struct inode *inode,
6964 				  struct ocfs2_dinode *di)
6965 {
6966 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
6967 	di->id2.i_list.l_tree_depth = 0;
6968 	di->id2.i_list.l_next_free_rec = 0;
6969 	di->id2.i_list.l_count = cpu_to_le16(
6970 		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6971 }
6972 
6973 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6974 {
6975 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
6976 	struct ocfs2_inline_data *idata = &di->id2.i_data;
6977 
6978 	spin_lock(&oi->ip_lock);
6979 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6980 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6981 	spin_unlock(&oi->ip_lock);
6982 
6983 	/*
6984 	 * We clear the entire i_data structure here so that all
6985 	 * fields can be properly initialized.
6986 	 */
6987 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
6988 
6989 	idata->id_count = cpu_to_le16(
6990 			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6991 }
6992 
6993 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
6994 					 struct buffer_head *di_bh)
6995 {
6996 	int ret, i, has_data, num_pages = 0;
6997 	handle_t *handle;
6998 	u64 uninitialized_var(block);
6999 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7000 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7001 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7002 	struct ocfs2_alloc_context *data_ac = NULL;
7003 	struct page **pages = NULL;
7004 	loff_t end = osb->s_clustersize;
7005 	struct ocfs2_extent_tree et;
7006 	int did_quota = 0;
7007 
7008 	has_data = i_size_read(inode) ? 1 : 0;
7009 
7010 	if (has_data) {
7011 		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
7012 				sizeof(struct page *), GFP_NOFS);
7013 		if (pages == NULL) {
7014 			ret = -ENOMEM;
7015 			mlog_errno(ret);
7016 			goto out;
7017 		}
7018 
7019 		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
7020 		if (ret) {
7021 			mlog_errno(ret);
7022 			goto out;
7023 		}
7024 	}
7025 
7026 	handle = ocfs2_start_trans(osb,
7027 				   ocfs2_inline_to_extents_credits(osb->sb));
7028 	if (IS_ERR(handle)) {
7029 		ret = PTR_ERR(handle);
7030 		mlog_errno(ret);
7031 		goto out_unlock;
7032 	}
7033 
7034 	ret = ocfs2_journal_access_di(handle, inode, di_bh,
7035 				      OCFS2_JOURNAL_ACCESS_WRITE);
7036 	if (ret) {
7037 		mlog_errno(ret);
7038 		goto out_commit;
7039 	}
7040 
7041 	if (has_data) {
7042 		u32 bit_off, num;
7043 		unsigned int page_end;
7044 		u64 phys;
7045 
7046 		if (vfs_dq_alloc_space_nodirty(inode,
7047 				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
7048 			ret = -EDQUOT;
7049 			goto out_commit;
7050 		}
7051 		did_quota = 1;
7052 
7053 		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
7054 					   &num);
7055 		if (ret) {
7056 			mlog_errno(ret);
7057 			goto out_commit;
7058 		}
7059 
7060 		/*
7061 		 * Save two copies, one for insert, and one that can
7062 		 * be changed by ocfs2_map_and_dirty_page() below.
7063 		 */
7064 		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7065 
7066 		/*
7067 		 * Non sparse file systems zero on extend, so no need
7068 		 * to do that now.
7069 		 */
7070 		if (!ocfs2_sparse_alloc(osb) &&
7071 		    PAGE_CACHE_SIZE < osb->s_clustersize)
7072 			end = PAGE_CACHE_SIZE;
7073 
7074 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7075 		if (ret) {
7076 			mlog_errno(ret);
7077 			goto out_commit;
7078 		}
7079 
7080 		/*
7081 		 * This should populate the 1st page for us and mark
7082 		 * it up to date.
7083 		 */
7084 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7085 		if (ret) {
7086 			mlog_errno(ret);
7087 			goto out_commit;
7088 		}
7089 
7090 		page_end = PAGE_CACHE_SIZE;
7091 		if (PAGE_CACHE_SIZE > osb->s_clustersize)
7092 			page_end = osb->s_clustersize;
7093 
7094 		for (i = 0; i < num_pages; i++)
7095 			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7096 						 pages[i], i > 0, &phys);
7097 	}
7098 
7099 	spin_lock(&oi->ip_lock);
7100 	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7101 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7102 	spin_unlock(&oi->ip_lock);
7103 
7104 	ocfs2_dinode_new_extent_list(inode, di);
7105 
7106 	ocfs2_journal_dirty(handle, di_bh);
7107 
7108 	if (has_data) {
7109 		/*
7110 		 * An error at this point should be extremely rare. If
7111 		 * this proves to be false, we could always re-build
7112 		 * the in-inode data from our pages.
7113 		 */
7114 		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
7115 		ret = ocfs2_insert_extent(osb, handle, inode, &et,
7116 					  0, block, 1, 0, NULL);
7117 		if (ret) {
7118 			mlog_errno(ret);
7119 			goto out_commit;
7120 		}
7121 
7122 		inode->i_blocks = ocfs2_inode_sector_count(inode);
7123 	}
7124 
7125 out_commit:
7126 	if (ret < 0 && did_quota)
7127 		vfs_dq_free_space_nodirty(inode,
7128 					  ocfs2_clusters_to_bytes(osb->sb, 1));
7129 
7130 	ocfs2_commit_trans(osb, handle);
7131 
7132 out_unlock:
7133 	if (data_ac)
7134 		ocfs2_free_alloc_context(data_ac);
7135 
7136 out:
7137 	if (pages) {
7138 		ocfs2_unlock_and_free_pages(pages, num_pages);
7139 		kfree(pages);
7140 	}
7141 
7142 	return ret;
7143 }
7144 
7145 /*
7146  * It is expected, that by the time you call this function,
7147  * inode->i_size and fe->i_size have been adjusted.
7148  *
7149  * WARNING: This will kfree the truncate context
7150  */
7151 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7152 			  struct inode *inode,
7153 			  struct buffer_head *fe_bh,
7154 			  struct ocfs2_truncate_context *tc)
7155 {
7156 	int status, i, credits, tl_sem = 0;
7157 	u32 clusters_to_del, new_highest_cpos, range;
7158 	struct ocfs2_extent_list *el;
7159 	handle_t *handle = NULL;
7160 	struct inode *tl_inode = osb->osb_tl_inode;
7161 	struct ocfs2_path *path = NULL;
7162 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7163 
7164 	mlog_entry_void();
7165 
7166 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7167 						     i_size_read(inode));
7168 
7169 	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7170 			      ocfs2_journal_access_di);
7171 	if (!path) {
7172 		status = -ENOMEM;
7173 		mlog_errno(status);
7174 		goto bail;
7175 	}
7176 
7177 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
7178 
7179 start:
7180 	/*
7181 	 * Check that we still have allocation to delete.
7182 	 */
7183 	if (OCFS2_I(inode)->ip_clusters == 0) {
7184 		status = 0;
7185 		goto bail;
7186 	}
7187 
7188 	/*
7189 	 * Truncate always works against the rightmost tree branch.
7190 	 */
7191 	status = ocfs2_find_path(inode, path, UINT_MAX);
7192 	if (status) {
7193 		mlog_errno(status);
7194 		goto bail;
7195 	}
7196 
7197 	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7198 	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7199 
7200 	/*
7201 	 * By now, el will point to the extent list on the bottom most
7202 	 * portion of this tree. Only the tail record is considered in
7203 	 * each pass.
7204 	 *
7205 	 * We handle the following cases, in order:
7206 	 * - empty extent: delete the remaining branch
7207 	 * - remove the entire record
7208 	 * - remove a partial record
7209 	 * - no record needs to be removed (truncate has completed)
7210 	 */
7211 	el = path_leaf_el(path);
7212 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
7213 		ocfs2_error(inode->i_sb,
7214 			    "Inode %llu has empty extent block at %llu\n",
7215 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7216 			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
7217 		status = -EROFS;
7218 		goto bail;
7219 	}
7220 
7221 	i = le16_to_cpu(el->l_next_free_rec) - 1;
7222 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
7223 		ocfs2_rec_clusters(el, &el->l_recs[i]);
7224 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7225 		clusters_to_del = 0;
7226 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7227 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7228 	} else if (range > new_highest_cpos) {
7229 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7230 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
7231 				  new_highest_cpos;
7232 	} else {
7233 		status = 0;
7234 		goto bail;
7235 	}
7236 
7237 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7238 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7239 
7240 	mutex_lock(&tl_inode->i_mutex);
7241 	tl_sem = 1;
7242 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
7243 	 * record is free for use. If there isn't any, we flush to get
7244 	 * an empty truncate log.  */
7245 	if (ocfs2_truncate_log_needs_flush(osb)) {
7246 		status = __ocfs2_flush_truncate_log(osb);
7247 		if (status < 0) {
7248 			mlog_errno(status);
7249 			goto bail;
7250 		}
7251 	}
7252 
7253 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7254 						(struct ocfs2_dinode *)fe_bh->b_data,
7255 						el);
7256 	handle = ocfs2_start_trans(osb, credits);
7257 	if (IS_ERR(handle)) {
7258 		status = PTR_ERR(handle);
7259 		handle = NULL;
7260 		mlog_errno(status);
7261 		goto bail;
7262 	}
7263 
7264 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7265 				   tc, path);
7266 	if (status < 0) {
7267 		mlog_errno(status);
7268 		goto bail;
7269 	}
7270 
7271 	mutex_unlock(&tl_inode->i_mutex);
7272 	tl_sem = 0;
7273 
7274 	ocfs2_commit_trans(osb, handle);
7275 	handle = NULL;
7276 
7277 	ocfs2_reinit_path(path, 1);
7278 
7279 	/*
7280 	 * The check above will catch the case where we've truncated
7281 	 * away all allocation.
7282 	 */
7283 	goto start;
7284 
7285 bail:
7286 
7287 	ocfs2_schedule_truncate_log_flush(osb, 1);
7288 
7289 	if (tl_sem)
7290 		mutex_unlock(&tl_inode->i_mutex);
7291 
7292 	if (handle)
7293 		ocfs2_commit_trans(osb, handle);
7294 
7295 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7296 
7297 	ocfs2_free_path(path);
7298 
7299 	/* This will drop the ext_alloc cluster lock for us */
7300 	ocfs2_free_truncate_context(tc);
7301 
7302 	mlog_exit(status);
7303 	return status;
7304 }
7305 
7306 /*
7307  * Expects the inode to already be locked.
7308  */
7309 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7310 			   struct inode *inode,
7311 			   struct buffer_head *fe_bh,
7312 			   struct ocfs2_truncate_context **tc)
7313 {
7314 	int status;
7315 	unsigned int new_i_clusters;
7316 	struct ocfs2_dinode *fe;
7317 	struct ocfs2_extent_block *eb;
7318 	struct buffer_head *last_eb_bh = NULL;
7319 
7320 	mlog_entry_void();
7321 
7322 	*tc = NULL;
7323 
7324 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7325 						  i_size_read(inode));
7326 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
7327 
7328 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7329 	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7330 	     (unsigned long long)le64_to_cpu(fe->i_size));
7331 
7332 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7333 	if (!(*tc)) {
7334 		status = -ENOMEM;
7335 		mlog_errno(status);
7336 		goto bail;
7337 	}
7338 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7339 
7340 	if (fe->id2.i_list.l_tree_depth) {
7341 		status = ocfs2_read_extent_block(inode,
7342 						 le64_to_cpu(fe->i_last_eb_blk),
7343 						 &last_eb_bh);
7344 		if (status < 0) {
7345 			mlog_errno(status);
7346 			goto bail;
7347 		}
7348 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7349 	}
7350 
7351 	(*tc)->tc_last_eb_bh = last_eb_bh;
7352 
7353 	status = 0;
7354 bail:
7355 	if (status < 0) {
7356 		if (*tc)
7357 			ocfs2_free_truncate_context(*tc);
7358 		*tc = NULL;
7359 	}
7360 	mlog_exit_void();
7361 	return status;
7362 }
7363 
7364 /*
7365  * 'start' is inclusive, 'end' is not.
7366  */
7367 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7368 			  unsigned int start, unsigned int end, int trunc)
7369 {
7370 	int ret;
7371 	unsigned int numbytes;
7372 	handle_t *handle;
7373 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7374 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7375 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7376 
7377 	if (end > i_size_read(inode))
7378 		end = i_size_read(inode);
7379 
7380 	BUG_ON(start >= end);
7381 
7382 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7383 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7384 	    !ocfs2_supports_inline_data(osb)) {
7385 		ocfs2_error(inode->i_sb,
7386 			    "Inline data flags for inode %llu don't agree! "
7387 			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7388 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7389 			    le16_to_cpu(di->i_dyn_features),
7390 			    OCFS2_I(inode)->ip_dyn_features,
7391 			    osb->s_feature_incompat);
7392 		ret = -EROFS;
7393 		goto out;
7394 	}
7395 
7396 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7397 	if (IS_ERR(handle)) {
7398 		ret = PTR_ERR(handle);
7399 		mlog_errno(ret);
7400 		goto out;
7401 	}
7402 
7403 	ret = ocfs2_journal_access_di(handle, inode, di_bh,
7404 				      OCFS2_JOURNAL_ACCESS_WRITE);
7405 	if (ret) {
7406 		mlog_errno(ret);
7407 		goto out_commit;
7408 	}
7409 
7410 	numbytes = end - start;
7411 	memset(idata->id_data + start, 0, numbytes);
7412 
7413 	/*
7414 	 * No need to worry about the data page here - it's been
7415 	 * truncated already and inline data doesn't need it for
7416 	 * pushing zero's to disk, so we'll let readpage pick it up
7417 	 * later.
7418 	 */
7419 	if (trunc) {
7420 		i_size_write(inode, start);
7421 		di->i_size = cpu_to_le64(start);
7422 	}
7423 
7424 	inode->i_blocks = ocfs2_inode_sector_count(inode);
7425 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7426 
7427 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7428 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7429 
7430 	ocfs2_journal_dirty(handle, di_bh);
7431 
7432 out_commit:
7433 	ocfs2_commit_trans(osb, handle);
7434 
7435 out:
7436 	return ret;
7437 }
7438 
7439 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7440 {
7441 	/*
7442 	 * The caller is responsible for completing deallocation
7443 	 * before freeing the context.
7444 	 */
7445 	if (tc->tc_dealloc.c_first_suballocator != NULL)
7446 		mlog(ML_NOTICE,
7447 		     "Truncate completion has non-empty dealloc context\n");
7448 
7449 	brelse(tc->tc_last_eb_bh);
7450 
7451 	kfree(tc);
7452 }
7453