xref: /openbmc/linux/fs/ocfs2/alloc.c (revision e7d4cb6bc19658646357eeff134645cd9bc3479f)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49 
50 #include "buffer_head_io.h"
51 
/*
 * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
 * the b-tree operations in ocfs2. The b-tree operations are no longer
 * limited to ocfs2_dinode: any metadata which needs to allocate clusters
 * for storage can use a b-tree by implementing its own ocfs2_extent_tree
 * and operations.
 *
 * ocfs2_extent_tree holds info for the root of the b-tree; it must have a
 * root ocfs2_extent_list and a root_bh so that they can be used in the
 * b-tree functions.
 * ocfs2_extent_tree_operations abstracts the normal operations we do on
 * the root of an extent b-tree.
 */
65 struct ocfs2_extent_tree;
66 
67 struct ocfs2_extent_tree_operations {
68 	void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
69 	u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
70 	void (*update_clusters) (struct inode *inode,
71 				 struct ocfs2_extent_tree *et,
72 				 u32 new_clusters);
73 	int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
74 };
75 
76 struct ocfs2_extent_tree {
77 	enum ocfs2_extent_tree_type type;
78 	struct ocfs2_extent_tree_operations *eops;
79 	struct buffer_head *root_bh;
80 	struct ocfs2_extent_list *root_el;
81 };
82 
83 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
84 					 u64 blkno)
85 {
86 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
87 
88 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
89 	di->i_last_eb_blk = cpu_to_le64(blkno);
90 }
91 
92 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
93 {
94 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
95 
96 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
97 	return le64_to_cpu(di->i_last_eb_blk);
98 }
99 
100 static void ocfs2_dinode_update_clusters(struct inode *inode,
101 					 struct ocfs2_extent_tree *et,
102 					 u32 clusters)
103 {
104 	struct ocfs2_dinode *di =
105 			(struct ocfs2_dinode *)et->root_bh->b_data;
106 
107 	le32_add_cpu(&di->i_clusters, clusters);
108 	spin_lock(&OCFS2_I(inode)->ip_lock);
109 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
110 	spin_unlock(&OCFS2_I(inode)->ip_lock);
111 }
112 
113 static int ocfs2_dinode_sanity_check(struct inode *inode,
114 				     struct ocfs2_extent_tree *et)
115 {
116 	int ret = 0;
117 	struct ocfs2_dinode *di;
118 
119 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
120 
121 	di = (struct ocfs2_dinode *)et->root_bh->b_data;
122 	if (!OCFS2_IS_VALID_DINODE(di)) {
123 		ret = -EIO;
124 		ocfs2_error(inode->i_sb,
125 			"Inode %llu has invalid path root",
126 			(unsigned long long)OCFS2_I(inode)->ip_blkno);
127 	}
128 
129 	return ret;
130 }
131 
132 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
133 	.set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
134 	.get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
135 	.update_clusters	= ocfs2_dinode_update_clusters,
136 	.sanity_check		= ocfs2_dinode_sanity_check,
137 };
138 
139 static struct ocfs2_extent_tree*
140 	 ocfs2_new_extent_tree(struct buffer_head *bh,
141 			       enum ocfs2_extent_tree_type et_type)
142 {
143 	struct ocfs2_extent_tree *et;
144 
145 	et = kzalloc(sizeof(*et), GFP_NOFS);
146 	if (!et)
147 		return NULL;
148 
149 	et->type = et_type;
150 	get_bh(bh);
151 	et->root_bh = bh;
152 
153 	/* current we only support dinode extent. */
154 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
155 	if (et_type == OCFS2_DINODE_EXTENT) {
156 		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
157 		et->eops = &ocfs2_dinode_et_ops;
158 	}
159 
160 	return et;
161 }
162 
163 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
164 {
165 	if (et) {
166 		brelse(et->root_bh);
167 		kfree(et);
168 	}
169 }
170 
171 static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
172 					 u64 new_last_eb_blk)
173 {
174 	et->eops->set_last_eb_blk(et, new_last_eb_blk);
175 }
176 
177 static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
178 {
179 	return et->eops->get_last_eb_blk(et);
180 }
181 
182 static inline void ocfs2_update_clusters(struct inode *inode,
183 					 struct ocfs2_extent_tree *et,
184 					 u32 clusters)
185 {
186 	et->eops->update_clusters(inode, et, clusters);
187 }
188 
189 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
190 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
191 					 struct ocfs2_extent_block *eb);
192 
193 /*
194  * Structures which describe a path through a btree, and functions to
195  * manipulate them.
196  *
197  * The idea here is to be as generic as possible with the tree
198  * manipulation code.
199  */
200 struct ocfs2_path_item {
201 	struct buffer_head		*bh;
202 	struct ocfs2_extent_list	*el;
203 };
204 
205 #define OCFS2_MAX_PATH_DEPTH	5
206 
207 struct ocfs2_path {
208 	int			p_tree_depth;
209 	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
210 };
211 
212 #define path_root_bh(_path) ((_path)->p_node[0].bh)
213 #define path_root_el(_path) ((_path)->p_node[0].el)
214 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
215 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
216 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
217 
218 /*
219  * Reset the actual path elements so that we can re-use the structure
220  * to build another path. Generally, this involves freeing the buffer
221  * heads.
222  */
223 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
224 {
225 	int i, start = 0, depth = 0;
226 	struct ocfs2_path_item *node;
227 
228 	if (keep_root)
229 		start = 1;
230 
231 	for(i = start; i < path_num_items(path); i++) {
232 		node = &path->p_node[i];
233 
234 		brelse(node->bh);
235 		node->bh = NULL;
236 		node->el = NULL;
237 	}
238 
239 	/*
240 	 * Tree depth may change during truncate, or insert. If we're
241 	 * keeping the root extent list, then make sure that our path
242 	 * structure reflects the proper depth.
243 	 */
244 	if (keep_root)
245 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
246 
247 	path->p_tree_depth = depth;
248 }
249 
250 static void ocfs2_free_path(struct ocfs2_path *path)
251 {
252 	if (path) {
253 		ocfs2_reinit_path(path, 0);
254 		kfree(path);
255 	}
256 }
257 
/*
 * Copy all the elements of src into dest. After this call, src can be
 * freed without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
265 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
266 {
267 	int i;
268 
269 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
270 	BUG_ON(path_root_el(dest) != path_root_el(src));
271 
272 	ocfs2_reinit_path(dest, 1);
273 
274 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
275 		dest->p_node[i].bh = src->p_node[i].bh;
276 		dest->p_node[i].el = src->p_node[i].el;
277 
278 		if (dest->p_node[i].bh)
279 			get_bh(dest->p_node[i].bh);
280 	}
281 }
282 
283 /*
284  * Make the *dest path the same as src and re-initialize src path to
285  * have a root only.
286  */
287 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
288 {
289 	int i;
290 
291 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
292 
293 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
294 		brelse(dest->p_node[i].bh);
295 
296 		dest->p_node[i].bh = src->p_node[i].bh;
297 		dest->p_node[i].el = src->p_node[i].el;
298 
299 		src->p_node[i].bh = NULL;
300 		src->p_node[i].el = NULL;
301 	}
302 }
303 
304 /*
305  * Insert an extent block at given index.
306  *
307  * This will not take an additional reference on eb_bh.
308  */
309 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
310 					struct buffer_head *eb_bh)
311 {
312 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
313 
314 	/*
315 	 * Right now, no root bh is an extent block, so this helps
316 	 * catch code errors with dinode trees. The assertion can be
317 	 * safely removed if we ever need to insert extent block
318 	 * structures at the root.
319 	 */
320 	BUG_ON(index == 0);
321 
322 	path->p_node[index].bh = eb_bh;
323 	path->p_node[index].el = &eb->h_list;
324 }
325 
326 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
327 					 struct ocfs2_extent_list *root_el)
328 {
329 	struct ocfs2_path *path;
330 
331 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
332 
333 	path = kzalloc(sizeof(*path), GFP_NOFS);
334 	if (path) {
335 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
336 		get_bh(root_bh);
337 		path_root_bh(path) = root_bh;
338 		path_root_el(path) = root_el;
339 	}
340 
341 	return path;
342 }
343 
344 /*
345  * Convenience function to journal all components in a path.
346  */
347 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
348 				     struct ocfs2_path *path)
349 {
350 	int i, ret = 0;
351 
352 	if (!path)
353 		goto out;
354 
355 	for(i = 0; i < path_num_items(path); i++) {
356 		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
357 					   OCFS2_JOURNAL_ACCESS_WRITE);
358 		if (ret < 0) {
359 			mlog_errno(ret);
360 			goto out;
361 		}
362 	}
363 
364 out:
365 	return ret;
366 }
367 
368 /*
369  * Return the index of the extent record which contains cluster #v_cluster.
370  * -1 is returned if it was not found.
371  *
372  * Should work fine on interior and exterior nodes.
373  */
374 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
375 {
376 	int ret = -1;
377 	int i;
378 	struct ocfs2_extent_rec *rec;
379 	u32 rec_end, rec_start, clusters;
380 
381 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
382 		rec = &el->l_recs[i];
383 
384 		rec_start = le32_to_cpu(rec->e_cpos);
385 		clusters = ocfs2_rec_clusters(el, rec);
386 
387 		rec_end = rec_start + clusters;
388 
389 		if (v_cluster >= rec_start && v_cluster < rec_end) {
390 			ret = i;
391 			break;
392 		}
393 	}
394 
395 	return ret;
396 }
397 
398 enum ocfs2_contig_type {
399 	CONTIG_NONE = 0,
400 	CONTIG_LEFT,
401 	CONTIG_RIGHT,
402 	CONTIG_LEFTRIGHT,
403 };
404 
405 
406 /*
407  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
408  * ocfs2_extent_contig only work properly against leaf nodes!
409  */
410 static int ocfs2_block_extent_contig(struct super_block *sb,
411 				     struct ocfs2_extent_rec *ext,
412 				     u64 blkno)
413 {
414 	u64 blk_end = le64_to_cpu(ext->e_blkno);
415 
416 	blk_end += ocfs2_clusters_to_blocks(sb,
417 				    le16_to_cpu(ext->e_leaf_clusters));
418 
419 	return blkno == blk_end;
420 }
421 
422 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
423 				  struct ocfs2_extent_rec *right)
424 {
425 	u32 left_range;
426 
427 	left_range = le32_to_cpu(left->e_cpos) +
428 		le16_to_cpu(left->e_leaf_clusters);
429 
430 	return (left_range == le32_to_cpu(right->e_cpos));
431 }
432 
433 static enum ocfs2_contig_type
434 	ocfs2_extent_contig(struct inode *inode,
435 			    struct ocfs2_extent_rec *ext,
436 			    struct ocfs2_extent_rec *insert_rec)
437 {
438 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
439 
440 	/*
441 	 * Refuse to coalesce extent records with different flag
442 	 * fields - we don't want to mix unwritten extents with user
443 	 * data.
444 	 */
445 	if (ext->e_flags != insert_rec->e_flags)
446 		return CONTIG_NONE;
447 
448 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
449 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
450 			return CONTIG_RIGHT;
451 
452 	blkno = le64_to_cpu(ext->e_blkno);
453 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
454 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
455 		return CONTIG_LEFT;
456 
457 	return CONTIG_NONE;
458 }
459 
460 /*
461  * NOTE: We can have pretty much any combination of contiguousness and
462  * appending.
463  *
464  * The usefulness of APPEND_TAIL is more in that it lets us know that
465  * we'll have to update the path to that leaf.
466  */
467 enum ocfs2_append_type {
468 	APPEND_NONE = 0,
469 	APPEND_TAIL,
470 };
471 
472 enum ocfs2_split_type {
473 	SPLIT_NONE = 0,
474 	SPLIT_LEFT,
475 	SPLIT_RIGHT,
476 };
477 
478 struct ocfs2_insert_type {
479 	enum ocfs2_split_type	ins_split;
480 	enum ocfs2_append_type	ins_appending;
481 	enum ocfs2_contig_type	ins_contig;
482 	int			ins_contig_index;
483 	int			ins_tree_depth;
484 };
485 
486 struct ocfs2_merge_ctxt {
487 	enum ocfs2_contig_type	c_contig_type;
488 	int			c_has_empty_extent;
489 	int			c_split_covers_rec;
490 };
491 
492 /*
493  * How many free extents have we got before we need more meta data?
494  */
495 int ocfs2_num_free_extents(struct ocfs2_super *osb,
496 			   struct inode *inode,
497 			   struct buffer_head *root_bh,
498 			   enum ocfs2_extent_tree_type type)
499 {
500 	int retval;
501 	struct ocfs2_extent_list *el = NULL;
502 	struct ocfs2_extent_block *eb;
503 	struct buffer_head *eb_bh = NULL;
504 	u64 last_eb_blk = 0;
505 
506 	mlog_entry_void();
507 
508 	if (type == OCFS2_DINODE_EXTENT) {
509 		struct ocfs2_dinode *fe =
510 				(struct ocfs2_dinode *)root_bh->b_data;
511 		if (!OCFS2_IS_VALID_DINODE(fe)) {
512 			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
513 			retval = -EIO;
514 			goto bail;
515 		}
516 
517 		if (fe->i_last_eb_blk)
518 			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
519 		el = &fe->id2.i_list;
520 	}
521 
522 	if (last_eb_blk) {
523 		retval = ocfs2_read_block(osb, last_eb_blk,
524 					  &eb_bh, OCFS2_BH_CACHED, inode);
525 		if (retval < 0) {
526 			mlog_errno(retval);
527 			goto bail;
528 		}
529 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
530 		el = &eb->h_list;
531 	}
532 
533 	BUG_ON(el->l_tree_depth != 0);
534 
535 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
536 bail:
537 	if (eb_bh)
538 		brelse(eb_bh);
539 
540 	mlog_exit(retval);
541 	return retval;
542 }
543 
544 /* expects array to already be allocated
545  *
546  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
547  * l_count for you
548  */
549 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
550 				     handle_t *handle,
551 				     struct inode *inode,
552 				     int wanted,
553 				     struct ocfs2_alloc_context *meta_ac,
554 				     struct buffer_head *bhs[])
555 {
556 	int count, status, i;
557 	u16 suballoc_bit_start;
558 	u32 num_got;
559 	u64 first_blkno;
560 	struct ocfs2_extent_block *eb;
561 
562 	mlog_entry_void();
563 
564 	count = 0;
565 	while (count < wanted) {
566 		status = ocfs2_claim_metadata(osb,
567 					      handle,
568 					      meta_ac,
569 					      wanted - count,
570 					      &suballoc_bit_start,
571 					      &num_got,
572 					      &first_blkno);
573 		if (status < 0) {
574 			mlog_errno(status);
575 			goto bail;
576 		}
577 
578 		for(i = count;  i < (num_got + count); i++) {
579 			bhs[i] = sb_getblk(osb->sb, first_blkno);
580 			if (bhs[i] == NULL) {
581 				status = -EIO;
582 				mlog_errno(status);
583 				goto bail;
584 			}
585 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
586 
587 			status = ocfs2_journal_access(handle, inode, bhs[i],
588 						      OCFS2_JOURNAL_ACCESS_CREATE);
589 			if (status < 0) {
590 				mlog_errno(status);
591 				goto bail;
592 			}
593 
594 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
595 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
596 			/* Ok, setup the minimal stuff here. */
597 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
598 			eb->h_blkno = cpu_to_le64(first_blkno);
599 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
600 			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
601 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
602 			eb->h_list.l_count =
603 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
604 
605 			suballoc_bit_start++;
606 			first_blkno++;
607 
608 			/* We'll also be dirtied by the caller, so
609 			 * this isn't absolutely necessary. */
610 			status = ocfs2_journal_dirty(handle, bhs[i]);
611 			if (status < 0) {
612 				mlog_errno(status);
613 				goto bail;
614 			}
615 		}
616 
617 		count += num_got;
618 	}
619 
620 	status = 0;
621 bail:
622 	if (status < 0) {
623 		for(i = 0; i < wanted; i++) {
624 			if (bhs[i])
625 				brelse(bhs[i]);
626 			bhs[i] = NULL;
627 		}
628 	}
629 	mlog_exit(status);
630 	return status;
631 }
632 
633 /*
634  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
635  *
636  * Returns the sum of the rightmost extent rec logical offset and
637  * cluster count.
638  *
639  * ocfs2_add_branch() uses this to determine what logical cluster
640  * value should be populated into the leftmost new branch records.
641  *
642  * ocfs2_shift_tree_depth() uses this to determine the # clusters
643  * value for the new topmost tree record.
644  */
645 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
646 {
647 	int i;
648 
649 	i = le16_to_cpu(el->l_next_free_rec) - 1;
650 
651 	return le32_to_cpu(el->l_recs[i].e_cpos) +
652 		ocfs2_rec_clusters(el, &el->l_recs[i]);
653 }
654 
/*
 * Add an entire tree branch to our inode. eb_bh is the extent block
 * to start at, if we don't want to start the branch at the dinode
 * structure.
 *
 * last_eb_bh is required as we have to update its next_leaf pointer
 * for the new last extent block.
 *
 * The new branch will be 'empty' in the sense that every block will
 * contain a single record with cluster count == 0.
 */
666 static int ocfs2_add_branch(struct ocfs2_super *osb,
667 			    handle_t *handle,
668 			    struct inode *inode,
669 			    struct ocfs2_extent_tree *et,
670 			    struct buffer_head *eb_bh,
671 			    struct buffer_head **last_eb_bh,
672 			    struct ocfs2_alloc_context *meta_ac)
673 {
674 	int status, new_blocks, i;
675 	u64 next_blkno, new_last_eb_blk;
676 	struct buffer_head *bh;
677 	struct buffer_head **new_eb_bhs = NULL;
678 	struct ocfs2_extent_block *eb;
679 	struct ocfs2_extent_list  *eb_el;
680 	struct ocfs2_extent_list  *el;
681 	u32 new_cpos;
682 
683 	mlog_entry_void();
684 
685 	BUG_ON(!last_eb_bh || !*last_eb_bh);
686 
687 	if (eb_bh) {
688 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
689 		el = &eb->h_list;
690 	} else
691 		el = et->root_el;
692 
693 	/* we never add a branch to a leaf. */
694 	BUG_ON(!el->l_tree_depth);
695 
696 	new_blocks = le16_to_cpu(el->l_tree_depth);
697 
698 	/* allocate the number of new eb blocks we need */
699 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
700 			     GFP_KERNEL);
701 	if (!new_eb_bhs) {
702 		status = -ENOMEM;
703 		mlog_errno(status);
704 		goto bail;
705 	}
706 
707 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
708 					   meta_ac, new_eb_bhs);
709 	if (status < 0) {
710 		mlog_errno(status);
711 		goto bail;
712 	}
713 
714 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
715 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
716 
717 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
718 	 * linked with the rest of the tree.
719 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
720 	 *
721 	 * when we leave the loop, new_last_eb_blk will point to the
722 	 * newest leaf, and next_blkno will point to the topmost extent
723 	 * block. */
724 	next_blkno = new_last_eb_blk = 0;
725 	for(i = 0; i < new_blocks; i++) {
726 		bh = new_eb_bhs[i];
727 		eb = (struct ocfs2_extent_block *) bh->b_data;
728 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
729 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
730 			status = -EIO;
731 			goto bail;
732 		}
733 		eb_el = &eb->h_list;
734 
735 		status = ocfs2_journal_access(handle, inode, bh,
736 					      OCFS2_JOURNAL_ACCESS_CREATE);
737 		if (status < 0) {
738 			mlog_errno(status);
739 			goto bail;
740 		}
741 
742 		eb->h_next_leaf_blk = 0;
743 		eb_el->l_tree_depth = cpu_to_le16(i);
744 		eb_el->l_next_free_rec = cpu_to_le16(1);
745 		/*
746 		 * This actually counts as an empty extent as
747 		 * c_clusters == 0
748 		 */
749 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
750 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
751 		/*
752 		 * eb_el isn't always an interior node, but even leaf
753 		 * nodes want a zero'd flags and reserved field so
754 		 * this gets the whole 32 bits regardless of use.
755 		 */
756 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
757 		if (!eb_el->l_tree_depth)
758 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
759 
760 		status = ocfs2_journal_dirty(handle, bh);
761 		if (status < 0) {
762 			mlog_errno(status);
763 			goto bail;
764 		}
765 
766 		next_blkno = le64_to_cpu(eb->h_blkno);
767 	}
768 
769 	/* This is a bit hairy. We want to update up to three blocks
770 	 * here without leaving any of them in an inconsistent state
771 	 * in case of error. We don't have to worry about
772 	 * journal_dirty erroring as it won't unless we've aborted the
773 	 * handle (in which case we would never be here) so reserving
774 	 * the write with journal_access is all we need to do. */
775 	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
776 				      OCFS2_JOURNAL_ACCESS_WRITE);
777 	if (status < 0) {
778 		mlog_errno(status);
779 		goto bail;
780 	}
781 	status = ocfs2_journal_access(handle, inode, et->root_bh,
782 				      OCFS2_JOURNAL_ACCESS_WRITE);
783 	if (status < 0) {
784 		mlog_errno(status);
785 		goto bail;
786 	}
787 	if (eb_bh) {
788 		status = ocfs2_journal_access(handle, inode, eb_bh,
789 					      OCFS2_JOURNAL_ACCESS_WRITE);
790 		if (status < 0) {
791 			mlog_errno(status);
792 			goto bail;
793 		}
794 	}
795 
796 	/* Link the new branch into the rest of the tree (el will
797 	 * either be on the root_bh, or the extent block passed in. */
798 	i = le16_to_cpu(el->l_next_free_rec);
799 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
800 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
801 	el->l_recs[i].e_int_clusters = 0;
802 	le16_add_cpu(&el->l_next_free_rec, 1);
803 
804 	/* fe needs a new last extent block pointer, as does the
805 	 * next_leaf on the previously last-extent-block. */
806 	ocfs2_set_last_eb_blk(et, new_last_eb_blk);
807 
808 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
809 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
810 
811 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
812 	if (status < 0)
813 		mlog_errno(status);
814 	status = ocfs2_journal_dirty(handle, et->root_bh);
815 	if (status < 0)
816 		mlog_errno(status);
817 	if (eb_bh) {
818 		status = ocfs2_journal_dirty(handle, eb_bh);
819 		if (status < 0)
820 			mlog_errno(status);
821 	}
822 
823 	/*
824 	 * Some callers want to track the rightmost leaf so pass it
825 	 * back here.
826 	 */
827 	brelse(*last_eb_bh);
828 	get_bh(new_eb_bhs[0]);
829 	*last_eb_bh = new_eb_bhs[0];
830 
831 	status = 0;
832 bail:
833 	if (new_eb_bhs) {
834 		for (i = 0; i < new_blocks; i++)
835 			if (new_eb_bhs[i])
836 				brelse(new_eb_bhs[i]);
837 		kfree(new_eb_bhs);
838 	}
839 
840 	mlog_exit(status);
841 	return status;
842 }
843 
844 /*
845  * adds another level to the allocation tree.
846  * returns back the new extent block so you can add a branch to it
847  * after this call.
848  */
849 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
850 				  handle_t *handle,
851 				  struct inode *inode,
852 				  struct ocfs2_extent_tree *et,
853 				  struct ocfs2_alloc_context *meta_ac,
854 				  struct buffer_head **ret_new_eb_bh)
855 {
856 	int status, i;
857 	u32 new_clusters;
858 	struct buffer_head *new_eb_bh = NULL;
859 	struct ocfs2_extent_block *eb;
860 	struct ocfs2_extent_list  *root_el;
861 	struct ocfs2_extent_list  *eb_el;
862 
863 	mlog_entry_void();
864 
865 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
866 					   &new_eb_bh);
867 	if (status < 0) {
868 		mlog_errno(status);
869 		goto bail;
870 	}
871 
872 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
873 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
874 		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
875 		status = -EIO;
876 		goto bail;
877 	}
878 
879 	eb_el = &eb->h_list;
880 	root_el = et->root_el;
881 
882 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
883 				      OCFS2_JOURNAL_ACCESS_CREATE);
884 	if (status < 0) {
885 		mlog_errno(status);
886 		goto bail;
887 	}
888 
889 	/* copy the root extent list data into the new extent block */
890 	eb_el->l_tree_depth = root_el->l_tree_depth;
891 	eb_el->l_next_free_rec = root_el->l_next_free_rec;
892 	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
893 		eb_el->l_recs[i] = root_el->l_recs[i];
894 
895 	status = ocfs2_journal_dirty(handle, new_eb_bh);
896 	if (status < 0) {
897 		mlog_errno(status);
898 		goto bail;
899 	}
900 
901 	status = ocfs2_journal_access(handle, inode, et->root_bh,
902 				      OCFS2_JOURNAL_ACCESS_WRITE);
903 	if (status < 0) {
904 		mlog_errno(status);
905 		goto bail;
906 	}
907 
908 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
909 
910 	/* update root_bh now */
911 	le16_add_cpu(&root_el->l_tree_depth, 1);
912 	root_el->l_recs[0].e_cpos = 0;
913 	root_el->l_recs[0].e_blkno = eb->h_blkno;
914 	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
915 	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
916 		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
917 	root_el->l_next_free_rec = cpu_to_le16(1);
918 
919 	/* If this is our 1st tree depth shift, then last_eb_blk
920 	 * becomes the allocated extent block */
921 	if (root_el->l_tree_depth == cpu_to_le16(1))
922 		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
923 
924 	status = ocfs2_journal_dirty(handle, et->root_bh);
925 	if (status < 0) {
926 		mlog_errno(status);
927 		goto bail;
928 	}
929 
930 	*ret_new_eb_bh = new_eb_bh;
931 	new_eb_bh = NULL;
932 	status = 0;
933 bail:
934 	if (new_eb_bh)
935 		brelse(new_eb_bh);
936 
937 	mlog_exit(status);
938 	return status;
939 }
940 
941 /*
942  * Should only be called when there is no space left in any of the
943  * leaf nodes. What we want to do is find the lowest tree depth
944  * non-leaf extent block with room for new records. There are three
945  * valid results of this search:
946  *
947  * 1) a lowest extent block is found, then we pass it back in
948  *    *lowest_eb_bh and return '0'
949  *
950  * 2) the search fails to find anything, but the root_el has room. We
951  *    pass NULL back in *lowest_eb_bh, but still return '0'
952  *
953  * 3) the search fails to find anything AND the root_el is full, in
954  *    which case we return > 0
955  *
956  * return status < 0 indicates an error.
957  */
958 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
959 				    struct inode *inode,
960 				    struct ocfs2_extent_tree *et,
961 				    struct buffer_head **target_bh)
962 {
963 	int status = 0, i;
964 	u64 blkno;
965 	struct ocfs2_extent_block *eb;
966 	struct ocfs2_extent_list  *el;
967 	struct buffer_head *bh = NULL;
968 	struct buffer_head *lowest_bh = NULL;
969 
970 	mlog_entry_void();
971 
972 	*target_bh = NULL;
973 
974 	el = et->root_el;
975 
976 	while(le16_to_cpu(el->l_tree_depth) > 1) {
977 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
978 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
979 				    "extent list (next_free_rec == 0)",
980 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
981 			status = -EIO;
982 			goto bail;
983 		}
984 		i = le16_to_cpu(el->l_next_free_rec) - 1;
985 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
986 		if (!blkno) {
987 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
988 				    "list where extent # %d has no physical "
989 				    "block start",
990 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
991 			status = -EIO;
992 			goto bail;
993 		}
994 
995 		if (bh) {
996 			brelse(bh);
997 			bh = NULL;
998 		}
999 
1000 		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
1001 					  inode);
1002 		if (status < 0) {
1003 			mlog_errno(status);
1004 			goto bail;
1005 		}
1006 
1007 		eb = (struct ocfs2_extent_block *) bh->b_data;
1008 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
1009 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
1010 			status = -EIO;
1011 			goto bail;
1012 		}
1013 		el = &eb->h_list;
1014 
1015 		if (le16_to_cpu(el->l_next_free_rec) <
1016 		    le16_to_cpu(el->l_count)) {
1017 			if (lowest_bh)
1018 				brelse(lowest_bh);
1019 			lowest_bh = bh;
1020 			get_bh(lowest_bh);
1021 		}
1022 	}
1023 
1024 	/* If we didn't find one and the fe doesn't have any room,
1025 	 * then return '1' */
1026 	el = et->root_el;
1027 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1028 		status = 1;
1029 
1030 	*target_bh = lowest_bh;
1031 bail:
1032 	if (bh)
1033 		brelse(bh);
1034 
1035 	mlog_exit(status);
1036 	return status;
1037 }
1038 
1039 /*
1040  * Grow a b-tree so that it has more records.
1041  *
1042  * We might shift the tree depth in which case existing paths should
1043  * be considered invalid.
1044  *
1045  * Tree depth after the grow is returned via *final_depth.
1046  *
1047  * *last_eb_bh will be updated by ocfs2_add_branch().
1048  */
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
			   struct ocfs2_extent_tree *et, int *final_depth,
			   struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_extent_list *el = et->root_el;
	/* Depth we will report back; bumped if we add a tree level. */
	int depth = le16_to_cpu(el->l_tree_depth);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *bh = NULL;

	/* Growing always needs to allocate at least one extent block. */
	BUG_ON(meta_ac == NULL);

	/*
	 * shift > 0 means no interior node had free space, so the tree
	 * must get deeper; shift == 0 means bh points at the lowest
	 * block with room for a new branch.
	 */
	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		mlog(0, "need to shift tree depth (current = %d)\n", depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
					     meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf. At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "add branch. bh = %p\n", bh);
	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out:
	/* Report the (possibly increased) depth even on error paths. */
	if (final_depth)
		*final_depth = depth;
	/* Drop the reference obtained from find_branch_target or
	 * shift_tree_depth; brelse(NULL) is a no-op. */
	brelse(bh);
	return ret;
}
1119 
1120 /*
1121  * This function will discard the rightmost extent record.
1122  */
1123 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1124 {
1125 	int next_free = le16_to_cpu(el->l_next_free_rec);
1126 	int count = le16_to_cpu(el->l_count);
1127 	unsigned int num_bytes;
1128 
1129 	BUG_ON(!next_free);
1130 	/* This will cause us to go off the end of our extent list. */
1131 	BUG_ON(next_free >= count);
1132 
1133 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1134 
1135 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1136 }
1137 
/*
 * Insert 'insert_rec' into leaf list 'el' at its cpos-sorted position.
 *
 * If the leaf currently begins with the empty extent, that slot is
 * consumed to make room; otherwise the leaf must have a free slot
 * (enforced by BUG_ON below).
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement below to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		/* First record starting past insert_cpos marks the slot. */
		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		/* Slide records at and after insert_index up one slot. */
		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;

}
1215 
1216 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1217 {
1218 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1219 
1220 	BUG_ON(num_recs == 0);
1221 
1222 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1223 		num_recs--;
1224 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1225 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1226 		memset(&el->l_recs[num_recs], 0,
1227 		       sizeof(struct ocfs2_extent_rec));
1228 		el->l_next_free_rec = cpu_to_le16(num_recs);
1229 	}
1230 }
1231 
1232 /*
1233  * Create an empty extent record .
1234  *
1235  * l_next_free_rec may be updated.
1236  *
1237  * If an empty extent already exists do nothing.
1238  */
1239 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1240 {
1241 	int next_free = le16_to_cpu(el->l_next_free_rec);
1242 
1243 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1244 
1245 	if (next_free == 0)
1246 		goto set_and_inc;
1247 
1248 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1249 		return;
1250 
1251 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1252 			"Asked to create an empty extent in a full list:\n"
1253 			"count = %u, tree depth = %u",
1254 			le16_to_cpu(el->l_count),
1255 			le16_to_cpu(el->l_tree_depth));
1256 
1257 	ocfs2_shift_records_right(el);
1258 
1259 set_and_inc:
1260 	le16_add_cpu(&el->l_next_free_rec, 1);
1261 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1262 }
1263 
1264 /*
1265  * For a rotation which involves two leaf nodes, the "root node" is
1266  * the lowest level tree node which contains a path to both leafs. This
1267  * resulting set of information can be used to form a complete "subtree"
1268  *
1269  * This function is passed two full paths from the dinode down to a
1270  * pair of adjacent leaves. It's task is to figure out which path
1271  * index contains the subtree root - this can be the root index itself
1272  * in a worst-case rotation.
1273  *
1274  * The array index of the subtree root is passed back.
1275  */
1276 static int ocfs2_find_subtree_root(struct inode *inode,
1277 				   struct ocfs2_path *left,
1278 				   struct ocfs2_path *right)
1279 {
1280 	int i = 0;
1281 
1282 	/*
1283 	 * Check that the caller passed in two paths from the same tree.
1284 	 */
1285 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1286 
1287 	do {
1288 		i++;
1289 
1290 		/*
1291 		 * The caller didn't pass two adjacent paths.
1292 		 */
1293 		mlog_bug_on_msg(i > left->p_tree_depth,
1294 				"Inode %lu, left depth %u, right depth %u\n"
1295 				"left leaf blk %llu, right leaf blk %llu\n",
1296 				inode->i_ino, left->p_tree_depth,
1297 				right->p_tree_depth,
1298 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1299 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1300 	} while (left->p_node[i].bh->b_blocknr ==
1301 		 right->p_node[i].bh->b_blocknr);
1302 
1303 	return i - 1;
1304 }
1305 
/*
 * Callback invoked on each extent block visited during a tree walk.
 * 'data' is caller context; the second argument is the visited
 * block's buffer_head. The callback must take its own reference
 * (get_bh) if it wants to keep the buffer - the walk releases its
 * reference after each level.
 */
typedef void (path_insert_t)(void *, struct buffer_head *);

/*
 * Traverse a btree path in search of cpos, starting at root_el.
 *
 * This code can be called with a cpos larger than the tree, in which
 * case it will return the rightmost path.
 */
static int __ocfs2_find_path(struct inode *inode,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	el = root_el;
	/* Descend one level per iteration until we reach a leaf
	 * (l_tree_depth == 0). */
	while (el->l_tree_depth) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;

		}

		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
			    break;
		}

		/* Note: if no record matched above, i indexes the last
		 * record, giving us the rightmost path. */
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Drop the previous level's buffer before reading the
		 * next one. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
				       &bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			ret = -EIO;
			goto out;
		}

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)oi->ip_blkno,
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Only hand validated blocks to the callback. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}
1407 
1408 /*
1409  * Given an initialized path (that is, it has a valid root extent
1410  * list), this function will traverse the btree in search of the path
1411  * which would contain cpos.
1412  *
1413  * The path traveled is recorded in the path structure.
1414  *
1415  * Note that this will not do any comparisons on leaf node extent
1416  * records, so it will work fine in the case that we just added a tree
1417  * branch.
1418  */
struct find_path_data {
	int index;	/* next path slot to fill (root occupies slot 0) */
	struct ocfs2_path *path;	/* path being populated by the walk */
};
1423 static void find_path_ins(void *data, struct buffer_head *bh)
1424 {
1425 	struct find_path_data *fp = data;
1426 
1427 	get_bh(bh);
1428 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1429 	fp->index++;
1430 }
1431 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1432 			   u32 cpos)
1433 {
1434 	struct find_path_data data;
1435 
1436 	data.index = 1;
1437 	data.path = path;
1438 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1439 				 find_path_ins, &data);
1440 }
1441 
1442 static void find_leaf_ins(void *data, struct buffer_head *bh)
1443 {
1444 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1445 	struct ocfs2_extent_list *el = &eb->h_list;
1446 	struct buffer_head **ret = data;
1447 
1448 	/* We want to retain only the leaf block. */
1449 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1450 		get_bh(bh);
1451 		*ret = bh;
1452 	}
1453 }
1454 /*
1455  * Find the leaf block in the tree which would contain cpos. No
1456  * checking of the actual leaf is done.
1457  *
1458  * Some paths want to call this instead of allocating a path structure
1459  * and calling ocfs2_find_path().
1460  *
1461  * This function doesn't handle non btree extent lists.
1462  */
1463 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1464 		    u32 cpos, struct buffer_head **leaf_bh)
1465 {
1466 	int ret;
1467 	struct buffer_head *bh = NULL;
1468 
1469 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1470 	if (ret) {
1471 		mlog_errno(ret);
1472 		goto out;
1473 	}
1474 
1475 	*leaf_bh = bh;
1476 out:
1477 	return ret;
1478 }
1479 
1480 /*
1481  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1482  *
1483  * Basically, we've moved stuff around at the bottom of the tree and
1484  * we need to fix up the extent records above the changes to reflect
1485  * the new changes.
1486  *
1487  * left_rec: the record on the left.
1488  * left_child_el: is the child list pointed to by left_rec
1489  * right_rec: the record to the right of left_rec
1490  * right_child_el: is the child list pointed to by right_rec
1491  *
1492  * By definition, this only works on interior nodes.
1493  */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
				  struct ocfs2_extent_list *left_child_el,
				  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/* NOTE(review): left_child_el is not referenced in this body;
	 * only right_child_el is consulted below. */

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
		/* Skip the empty extent; a real record must follow it. */
		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
	}
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	/* Shrink the right record so its end cluster stays fixed. */
	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}
1530 
1531 /*
1532  * Adjust the adjacent root node records involved in a
1533  * rotation. left_el_blkno is passed in as a key so that we can easily
1534  * find it's index in the root list.
1535  */
1536 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1537 				      struct ocfs2_extent_list *left_el,
1538 				      struct ocfs2_extent_list *right_el,
1539 				      u64 left_el_blkno)
1540 {
1541 	int i;
1542 
1543 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1544 	       le16_to_cpu(left_el->l_tree_depth));
1545 
1546 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1547 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1548 			break;
1549 	}
1550 
1551 	/*
1552 	 * The path walking code should have never returned a root and
1553 	 * two paths which are not adjacent.
1554 	 */
1555 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1556 
1557 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1558 				      &root_el->l_recs[i + 1], right_el);
1559 }
1560 
1561 /*
1562  * We've changed a leaf block (in right_path) and need to reflect that
1563  * change back up the subtree.
1564  *
1565  * This happens in multiple places:
1566  *   - When we've moved an extent record from the left path leaf to the right
1567  *     path leaf to make room for an empty extent in the left path leaf.
1568  *   - When our insert into the right path leaf is at the leftmost edge
1569  *     and requires an update of the path immediately to it's left. This
1570  *     can occur at the end of some types of rotation and appending inserts.
1571  *   - When we've adjusted the last extent record in the left path leaf and the
1572  *     1st extent record in the right path leaf during cross extent block merge.
1573  */
1574 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1575 				       struct ocfs2_path *left_path,
1576 				       struct ocfs2_path *right_path,
1577 				       int subtree_index)
1578 {
1579 	int ret, i, idx;
1580 	struct ocfs2_extent_list *el, *left_el, *right_el;
1581 	struct ocfs2_extent_rec *left_rec, *right_rec;
1582 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1583 
1584 	/*
1585 	 * Update the counts and position values within all the
1586 	 * interior nodes to reflect the leaf rotation we just did.
1587 	 *
1588 	 * The root node is handled below the loop.
1589 	 *
1590 	 * We begin the loop with right_el and left_el pointing to the
1591 	 * leaf lists and work our way up.
1592 	 *
1593 	 * NOTE: within this loop, left_el and right_el always refer
1594 	 * to the *child* lists.
1595 	 */
1596 	left_el = path_leaf_el(left_path);
1597 	right_el = path_leaf_el(right_path);
1598 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1599 		mlog(0, "Adjust records at index %u\n", i);
1600 
1601 		/*
1602 		 * One nice property of knowing that all of these
1603 		 * nodes are below the root is that we only deal with
1604 		 * the leftmost right node record and the rightmost
1605 		 * left node record.
1606 		 */
1607 		el = left_path->p_node[i].el;
1608 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1609 		left_rec = &el->l_recs[idx];
1610 
1611 		el = right_path->p_node[i].el;
1612 		right_rec = &el->l_recs[0];
1613 
1614 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1615 					      right_el);
1616 
1617 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1618 		if (ret)
1619 			mlog_errno(ret);
1620 
1621 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1622 		if (ret)
1623 			mlog_errno(ret);
1624 
1625 		/*
1626 		 * Setup our list pointers now so that the current
1627 		 * parents become children in the next iteration.
1628 		 */
1629 		left_el = left_path->p_node[i].el;
1630 		right_el = right_path->p_node[i].el;
1631 	}
1632 
1633 	/*
1634 	 * At the root node, adjust the two adjacent records which
1635 	 * begin our path to the leaves.
1636 	 */
1637 
1638 	el = left_path->p_node[subtree_index].el;
1639 	left_el = left_path->p_node[subtree_index + 1].el;
1640 	right_el = right_path->p_node[subtree_index + 1].el;
1641 
1642 	ocfs2_adjust_root_records(el, left_el, right_el,
1643 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
1644 
1645 	root_bh = left_path->p_node[subtree_index].bh;
1646 
1647 	ret = ocfs2_journal_dirty(handle, root_bh);
1648 	if (ret)
1649 		mlog_errno(ret);
1650 }
1651 
1652 static int ocfs2_rotate_subtree_right(struct inode *inode,
1653 				      handle_t *handle,
1654 				      struct ocfs2_path *left_path,
1655 				      struct ocfs2_path *right_path,
1656 				      int subtree_index)
1657 {
1658 	int ret, i;
1659 	struct buffer_head *right_leaf_bh;
1660 	struct buffer_head *left_leaf_bh = NULL;
1661 	struct buffer_head *root_bh;
1662 	struct ocfs2_extent_list *right_el, *left_el;
1663 	struct ocfs2_extent_rec move_rec;
1664 
1665 	left_leaf_bh = path_leaf_bh(left_path);
1666 	left_el = path_leaf_el(left_path);
1667 
1668 	if (left_el->l_next_free_rec != left_el->l_count) {
1669 		ocfs2_error(inode->i_sb,
1670 			    "Inode %llu has non-full interior leaf node %llu"
1671 			    "(next free = %u)",
1672 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
1673 			    (unsigned long long)left_leaf_bh->b_blocknr,
1674 			    le16_to_cpu(left_el->l_next_free_rec));
1675 		return -EROFS;
1676 	}
1677 
1678 	/*
1679 	 * This extent block may already have an empty record, so we
1680 	 * return early if so.
1681 	 */
1682 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1683 		return 0;
1684 
1685 	root_bh = left_path->p_node[subtree_index].bh;
1686 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1687 
1688 	ret = ocfs2_journal_access(handle, inode, root_bh,
1689 				   OCFS2_JOURNAL_ACCESS_WRITE);
1690 	if (ret) {
1691 		mlog_errno(ret);
1692 		goto out;
1693 	}
1694 
1695 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1696 		ret = ocfs2_journal_access(handle, inode,
1697 					   right_path->p_node[i].bh,
1698 					   OCFS2_JOURNAL_ACCESS_WRITE);
1699 		if (ret) {
1700 			mlog_errno(ret);
1701 			goto out;
1702 		}
1703 
1704 		ret = ocfs2_journal_access(handle, inode,
1705 					   left_path->p_node[i].bh,
1706 					   OCFS2_JOURNAL_ACCESS_WRITE);
1707 		if (ret) {
1708 			mlog_errno(ret);
1709 			goto out;
1710 		}
1711 	}
1712 
1713 	right_leaf_bh = path_leaf_bh(right_path);
1714 	right_el = path_leaf_el(right_path);
1715 
1716 	/* This is a code error, not a disk corruption. */
1717 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1718 			"because rightmost leaf block %llu is empty\n",
1719 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
1720 			(unsigned long long)right_leaf_bh->b_blocknr);
1721 
1722 	ocfs2_create_empty_extent(right_el);
1723 
1724 	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1725 	if (ret) {
1726 		mlog_errno(ret);
1727 		goto out;
1728 	}
1729 
1730 	/* Do the copy now. */
1731 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1732 	move_rec = left_el->l_recs[i];
1733 	right_el->l_recs[0] = move_rec;
1734 
1735 	/*
1736 	 * Clear out the record we just copied and shift everything
1737 	 * over, leaving an empty extent in the left leaf.
1738 	 *
1739 	 * We temporarily subtract from next_free_rec so that the
1740 	 * shift will lose the tail record (which is now defunct).
1741 	 */
1742 	le16_add_cpu(&left_el->l_next_free_rec, -1);
1743 	ocfs2_shift_records_right(left_el);
1744 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1745 	le16_add_cpu(&left_el->l_next_free_rec, 1);
1746 
1747 	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1748 	if (ret) {
1749 		mlog_errno(ret);
1750 		goto out;
1751 	}
1752 
1753 	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1754 				subtree_index);
1755 
1756 out:
1757 	return ret;
1758 }
1759 
1760 /*
1761  * Given a full path, determine what cpos value would return us a path
1762  * containing the leaf immediately to the left of the current one.
1763  *
1764  * Will return zero if the path passed in is already the leftmost path.
1765  */
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
					 struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	BUG_ON(path->p_tree_depth == 0);

	/* Default: leftmost path, reported as cpos 0. */
	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/* Answer: last cluster covered by the
				 * record to our left (its cpos plus its
				 * cluster count, minus one). */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		/* Continue the search one level up, looking for the
		 * block we just came from. */
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}
1834 
1835 /*
1836  * Extend the transaction by enough credits to complete the rotation,
1837  * and still leave at least the original number of credits allocated
1838  * to this transaction.
1839  */
1840 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1841 					   int op_credits,
1842 					   struct ocfs2_path *path)
1843 {
1844 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1845 
1846 	if (handle->h_buffer_credits < credits)
1847 		return ocfs2_extend_trans(handle, credits);
1848 
1849 	return 0;
1850 }
1851 
1852 /*
1853  * Trap the case where we're inserting into the theoretical range past
1854  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1855  * whose cpos is less than ours into the right leaf.
1856  *
1857  * It's only necessary to look at the rightmost record of the left
1858  * leaf because the logic that calls us should ensure that the
1859  * theoretical ranges in the path components above the leaves are
1860  * correct.
1861  */
1862 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1863 						 u32 insert_cpos)
1864 {
1865 	struct ocfs2_extent_list *left_el;
1866 	struct ocfs2_extent_rec *rec;
1867 	int next_free;
1868 
1869 	left_el = path_leaf_el(left_path);
1870 	next_free = le16_to_cpu(left_el->l_next_free_rec);
1871 	rec = &left_el->l_recs[next_free - 1];
1872 
1873 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
1874 		return 1;
1875 	return 0;
1876 }
1877 
1878 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1879 {
1880 	int next_free = le16_to_cpu(el->l_next_free_rec);
1881 	unsigned int range;
1882 	struct ocfs2_extent_rec *rec;
1883 
1884 	if (next_free == 0)
1885 		return 0;
1886 
1887 	rec = &el->l_recs[0];
1888 	if (ocfs2_is_empty_extent(rec)) {
1889 		/* Empty list. */
1890 		if (next_free == 1)
1891 			return 0;
1892 		rec = &el->l_recs[1];
1893 	}
1894 
1895 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1896 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1897 		return 1;
1898 	return 0;
1899 }
1900 
1901 /*
1902  * Rotate all the records in a btree right one record, starting at insert_cpos.
1903  *
1904  * The path to the rightmost leaf should be passed in.
1905  *
1906  * The array is assumed to be large enough to hold an entire path (tree depth).
1907  *
1908  * Upon succesful return from this function:
1909  *
1910  * - The 'right_path' array will contain a path to the leaf block
1911  *   whose range contains e_cpos.
1912  * - That leaf block will have a single empty extent in list index 0.
1913  * - In the case that the rotation requires a post-insert update,
1914  *   *ret_left_path will contain a valid path which can be passed to
1915  *   ocfs2_insert_path().
1916  */
static int ocfs2_rotate_tree_right(struct inode *inode,
				   handle_t *handle,
				   enum ocfs2_split_type split,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	int ret, start, orig_credits = handle->h_buffer_credits;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/* Scratch path, refilled for each left neighbor we visit. */
	left_path = ocfs2_new_path(path_root_bh(right_path),
				   path_root_el(right_path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
		     insert_cpos, cpos);

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Inode %lu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				inode->i_ino, insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (split == SPLIT_NONE &&
		    ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(inode, left_path, right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     start,
		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
		     right_path->p_tree_depth);

		/* Each subtree rotation dirties more buffers; top up
		 * transaction credits before touching them. */
		ret = ocfs2_extend_rotate_transaction(handle, start,
						      orig_credits, right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (split != SPLIT_NONE &&
		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
						insert_cpos)) {
			/*
			 * A rotate moves the rightmost left leaf
			 * record over to the leftmost right leaf
			 * slot. If we're doing an extent split
			 * instead of a real insert, then we have to
			 * check that the extent to be split wasn't
			 * just moved over. If it was, then we can
			 * exit here, passing left_path back -
			 * ocfs2_split_extent() is smart enough to
			 * search both leaves.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	/* Note: paths handed back via *ret_left_path jump straight to
	 * out_ret_path, so they are not freed here. */
	ocfs2_free_path(left_path);

out_ret_path:
	return ret;
}
2073 
2074 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2075 				      struct ocfs2_path *path)
2076 {
2077 	int i, idx;
2078 	struct ocfs2_extent_rec *rec;
2079 	struct ocfs2_extent_list *el;
2080 	struct ocfs2_extent_block *eb;
2081 	u32 range;
2082 
2083 	/* Path should always be rightmost. */
2084 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2085 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2086 
2087 	el = &eb->h_list;
2088 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2089 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2090 	rec = &el->l_recs[idx];
2091 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2092 
2093 	for (i = 0; i < path->p_tree_depth; i++) {
2094 		el = path->p_node[i].el;
2095 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2096 		rec = &el->l_recs[idx];
2097 
2098 		rec->e_int_clusters = cpu_to_le32(range);
2099 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2100 
2101 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2102 	}
2103 }
2104 
/*
 * Empty and release the extent blocks on @path from node index
 * @unlink_start down to the leaf. Each zeroed block is queued on
 * @dealloc for later freeing; dealloc-cache failures are logged but
 * deliberately not propagated (this function returns void).
 */
static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      struct ocfs2_path *path, int unlink_start)
{
	int ret, i;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh;

	for(i = unlink_start; i < path_num_items(path); i++) {
		bh = path->p_node[i].bh;

		eb = (struct ocfs2_extent_block *)bh->b_data;
		/*
		 * Not all nodes might have had their final count
		 * decremented by the caller - handle this here.
		 */
		el = &eb->h_list;
		if (le16_to_cpu(el->l_next_free_rec) > 1) {
			mlog(ML_ERROR,
			     "Inode %llu, attempted to remove extent block "
			     "%llu with %u records\n",
			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
			     (unsigned long long)le64_to_cpu(eb->h_blkno),
			     le16_to_cpu(el->l_next_free_rec));

			/*
			 * Block still holds live records - don't free
			 * it, just push it out of the uptodate cache.
			 */
			ocfs2_journal_dirty(handle, bh);
			ocfs2_remove_from_cache(inode, bh);
			continue;
		}

		/* Wipe the (at most one) remaining record on disk. */
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_journal_dirty(handle, bh);

		/* Queue the extent block itself for deallocation. */
		ret = ocfs2_cache_extent_block_free(dealloc, eb);
		if (ret)
			mlog_errno(ret);

		ocfs2_remove_from_cache(inode, bh);
	}
}
2148 
2149 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2150 				 struct ocfs2_path *left_path,
2151 				 struct ocfs2_path *right_path,
2152 				 int subtree_index,
2153 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2154 {
2155 	int i;
2156 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2157 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2158 	struct ocfs2_extent_list *el;
2159 	struct ocfs2_extent_block *eb;
2160 
2161 	el = path_leaf_el(left_path);
2162 
2163 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2164 
2165 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2166 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2167 			break;
2168 
2169 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2170 
2171 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2172 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2173 
2174 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2175 	eb->h_next_leaf_blk = 0;
2176 
2177 	ocfs2_journal_dirty(handle, root_bh);
2178 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2179 
2180 	ocfs2_unlink_path(inode, handle, dealloc, right_path,
2181 			  subtree_index + 1);
2182 }
2183 
/*
 * Rotate a record from the head of the right leaf into the empty
 * slot at the head of the left leaf, across the subtree rooted at
 * @subtree_index (which must be shared by both paths).
 *
 * Returns -EAGAIN when the right leaf leads with an empty extent but
 * is not the rightmost leaf - the caller must fix up the right
 * subtree and retry. Sets *deleted when the rightmost right path ran
 * out of records and was unlinked entirely; in that case @right_path
 * must no longer be used by the caller.
 */
static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
				     struct ocfs2_path *left_path,
				     struct ocfs2_path *right_path,
				     int subtree_index,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int *deleted,
				     struct ocfs2_extent_tree *et)
{
	int ret, i, del_right_subtree = 0, right_has_empty = 0;
	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
	struct ocfs2_extent_block *eb;

	*deleted = 0;

	right_leaf_el = path_leaf_el(right_path);
	left_leaf_el = path_leaf_el(left_path);
	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	/* No empty slot on the left means there's nothing to rotate into. */
	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
		return 0;

	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
		/*
		 * It's legal for us to proceed if the right leaf is
		 * the rightmost one and it has an empty extent. There
		 * are two cases to handle - whether the leaf will be
		 * empty after removal or not. If the leaf isn't empty
		 * then just remove the empty extent up front. The
		 * next block will handle empty leaves by flagging
		 * them for unlink.
		 *
		 * Non rightmost leaves will throw -EAGAIN and the
		 * caller can manually move the subtree and retry.
		 */

		if (eb->h_next_leaf_blk != 0ULL)
			return -EAGAIN;

		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
			ret = ocfs2_journal_access(handle, inode,
						   path_leaf_bh(right_path),
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ocfs2_remove_empty_extent(right_leaf_el);
		} else
			right_has_empty = 1;
	}

	if (eb->h_next_leaf_blk == 0ULL &&
	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
		/*
		 * We have to update i_last_eb_blk during the meta
		 * data delete.
		 */
		ret = ocfs2_journal_access(handle, inode, et_root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		del_right_subtree = 1;
	}

	/*
	 * Getting here with an empty extent in the right path implies
	 * that it's the rightmost path and will be deleted.
	 */
	BUG_ON(right_has_empty && !del_right_subtree);

	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Journal every node below the subtree root, on both sides. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (!right_has_empty) {
		/*
		 * Only do this if we're moving a real
		 * record. Otherwise, the action is delayed until
		 * after removal of the right path in which case we
		 * can do a simple shift to remove the empty extent.
		 */
		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
		memset(&right_leaf_el->l_recs[0], 0,
		       sizeof(struct ocfs2_extent_rec));
	}
	if (eb->h_next_leaf_blk == 0ULL) {
		/*
		 * Move recs over to get rid of empty extent, decrease
		 * next_free. This is allowed to remove the last
		 * extent in our leaf (setting l_next_free_rec to
		 * zero) - the delete code below won't care.
		 */
		ocfs2_remove_empty_extent(right_leaf_el);
	}

	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
	if (ret)
		mlog_errno(ret);
	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
	if (ret)
		mlog_errno(ret);

	if (del_right_subtree) {
		/* The right path is empty - unlink and free it. */
		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* The left leaf is the new rightmost leaf. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

		/*
		 * Removal of the extent in the left leaf was skipped
		 * above so we could delete the right path
		 * 1st.
		 */
		if (right_has_empty)
			ocfs2_remove_empty_extent(left_leaf_el);

		ret = ocfs2_journal_dirty(handle, et_root_bh);
		if (ret)
			mlog_errno(ret);

		*deleted = 1;
	} else
		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
					   subtree_index);

out:
	return ret;
}
2342 
2343 /*
2344  * Given a full path, determine what cpos value would return us a path
2345  * containing the leaf immediately to the right of the current one.
2346  *
2347  * Will return zero if the path passed in is already the rightmost path.
2348  *
2349  * This looks similar, but is subtly different to
2350  * ocfs2_find_cpos_for_left_leaf().
2351  */
2352 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2353 					  struct ocfs2_path *path, u32 *cpos)
2354 {
2355 	int i, j, ret = 0;
2356 	u64 blkno;
2357 	struct ocfs2_extent_list *el;
2358 
2359 	*cpos = 0;
2360 
2361 	if (path->p_tree_depth == 0)
2362 		return 0;
2363 
2364 	blkno = path_leaf_bh(path)->b_blocknr;
2365 
2366 	/* Start at the tree node just above the leaf and work our way up. */
2367 	i = path->p_tree_depth - 1;
2368 	while (i >= 0) {
2369 		int next_free;
2370 
2371 		el = path->p_node[i].el;
2372 
2373 		/*
2374 		 * Find the extent record just after the one in our
2375 		 * path.
2376 		 */
2377 		next_free = le16_to_cpu(el->l_next_free_rec);
2378 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2379 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2380 				if (j == (next_free - 1)) {
2381 					if (i == 0) {
2382 						/*
2383 						 * We've determined that the
2384 						 * path specified is already
2385 						 * the rightmost one - return a
2386 						 * cpos of zero.
2387 						 */
2388 						goto out;
2389 					}
2390 					/*
2391 					 * The rightmost record points to our
2392 					 * leaf - we need to travel up the
2393 					 * tree one level.
2394 					 */
2395 					goto next_node;
2396 				}
2397 
2398 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2399 				goto out;
2400 			}
2401 		}
2402 
2403 		/*
2404 		 * If we got here, we never found a valid node where
2405 		 * the tree indicated one should be.
2406 		 */
2407 		ocfs2_error(sb,
2408 			    "Invalid extent tree at extent block %llu\n",
2409 			    (unsigned long long)blkno);
2410 		ret = -EROFS;
2411 		goto out;
2412 
2413 next_node:
2414 		blkno = path->p_node[i].bh->b_blocknr;
2415 		i--;
2416 	}
2417 
2418 out:
2419 	return ret;
2420 }
2421 
2422 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2423 					    handle_t *handle,
2424 					    struct buffer_head *bh,
2425 					    struct ocfs2_extent_list *el)
2426 {
2427 	int ret;
2428 
2429 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2430 		return 0;
2431 
2432 	ret = ocfs2_journal_access(handle, inode, bh,
2433 				   OCFS2_JOURNAL_ACCESS_WRITE);
2434 	if (ret) {
2435 		mlog_errno(ret);
2436 		goto out;
2437 	}
2438 
2439 	ocfs2_remove_empty_extent(el);
2440 
2441 	ret = ocfs2_journal_dirty(handle, bh);
2442 	if (ret)
2443 		mlog_errno(ret);
2444 
2445 out:
2446 	return ret;
2447 }
2448 
/*
 * Worker for ocfs2_rotate_tree_left(): starting from @path, whose
 * leaf must lead with an empty extent, rotate records leftward one
 * neighbouring subtree at a time until the rightmost leaf is reached
 * or a rotation deletes the rightmost edge.
 *
 * On -EAGAIN from a subtree rotation, ownership of the blocking
 * right path is handed to the caller via *empty_extent_path; the
 * caller must fix it up, free it, and restart.
 */
static int __ocfs2_rotate_tree_left(struct inode *inode,
				    handle_t *handle, int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path,
				    struct ocfs2_extent_tree *et)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;

	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));

	*empty_extent_path = NULL;

	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
					     &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	left_path = ocfs2_new_path(path_root_bh(path),
				   path_root_el(path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Work on a private copy so @path stays valid for the caller. */
	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path(path_root_bh(path),
				    path_root_el(path));
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* right_cpos == 0 means left_path is already the rightmost path. */
	while (right_cpos) {
		ret = ocfs2_find_path(inode, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(inode, left_path,
						       right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Caller might still want to make changes to the
		 * tree root, so re-add it to the journal here.
		 */
		ret = ocfs2_journal_access(handle, inode,
					   path_root_bh(left_path),
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
						right_path, subtree_root,
						dealloc, &deleted, et);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}
2568 
/*
 * Unlink the rightmost path from the tree once its leaf has run out
 * of records. If a path exists to the left, the rightmost edge moves
 * there; otherwise @path was the only branch and the tree root is
 * reset to an empty, depth-zero extent list.
 *
 * NOTE(review): callers must not trust @path afterwards - its blocks
 * will have been unlinked.
 */
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
				struct ocfs2_path *path,
				struct ocfs2_cached_dealloc_ctxt *dealloc,
				struct ocfs2_extent_tree *et)
{
	int ret, subtree_index;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;


	/* Give the tree-type implementation a chance to veto. */
	ret = et->eops->sanity_check(inode, et);
	if (ret)
		goto out;
	/*
	 * There's two ways we handle this depending on
	 * whether path is the only existing one.
	 */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (cpos) {
		/*
		 * We have a path to the left of this one - it needs
		 * an update too.
		 */
		left_path = ocfs2_new_path(path_root_bh(path),
					   path_root_el(path));
		if (!left_path) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(inode, handle, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);

		ocfs2_unlink_subtree(inode, handle, left_path, path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* The left leaf becomes the new rightmost leaf. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
	} else {
		/*
		 * 'path' is also the leftmost path which
		 * means it must be the only one. This gets
		 * handled differently because we want to
		 * revert the inode back to having extents
		 * in-line.
		 */
		ocfs2_unlink_path(inode, handle, dealloc, path, 1);

		el = et->root_el;
		el->l_tree_depth = 0;
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_set_last_eb_blk(et, 0);
	}

	ocfs2_journal_dirty(handle, path_root_bh(path));

out:
	ocfs2_free_path(left_path);
	return ret;
}
2665 
2666 /*
2667  * Left rotation of btree records.
2668  *
2669  * In many ways, this is (unsurprisingly) the opposite of right
2670  * rotation. We start at some non-rightmost path containing an empty
2671  * extent in the leaf block. The code works its way to the rightmost
2672  * path by rotating records to the left in every subtree.
2673  *
2674  * This is used by any code which reduces the number of extent records
2675  * in a leaf. After removal, an empty record should be placed in the
2676  * leftmost list position.
2677  *
2678  * This won't handle a length update of the rightmost path records if
2679  * the rightmost tree leaf record is removed so the caller is
2680  * responsible for detecting and correcting that.
2681  */
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_cached_dealloc_ctxt *dealloc,
				  struct ocfs2_extent_tree *et)
{
	int ret, orig_credits = handle->h_buffer_credits;
	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	/* Nothing to do unless the leaf leads with an empty extent. */
	el = path_leaf_el(path);
	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	if (path->p_tree_depth == 0) {
rightmost_no_delete:
		/*
		 * Inline extents. This is trivially handled, so do
		 * it up front.
		 */
		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
						       path_leaf_bh(path),
						       path_leaf_el(path));
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Handle rightmost branch now. There's several cases:
	 *  1) simple rotation leaving records in there. That's trivial.
	 *  2) rotation requiring a branch delete - there's no more
	 *     records left. Two cases of this:
	 *     a) There are branches to the left.
	 *     b) This is also the leftmost (the only) branch.
	 *
	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
	 *  2a) we need the left branch so that we can update it with the unlink
	 *  2b) we need to bring the inode back to inline extents.
	 */

	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	el = &eb->h_list;
	if (eb->h_next_leaf_blk == 0) {
		/*
		 * This gets a bit tricky if we're going to delete the
		 * rightmost path. Get the other cases out of the way
		 * 1st.
		 */
		if (le16_to_cpu(el->l_next_free_rec) > 1)
			goto rightmost_no_delete;

		/* A rightmost leaf with zero records is on-disk corruption. */
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ret = -EIO;
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent block at %llu",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    (unsigned long long)le64_to_cpu(eb->h_blkno));
			goto out;
		}

		/*
		 * XXX: The caller can not trust "path" any more after
		 * this as it will have been deleted. What do we do?
		 *
		 * In theory the rotate-for-merge code will never get
		 * here because it'll always ask for a rotate in a
		 * nonempty list.
		 */

		ret = ocfs2_remove_rightmost_path(inode, handle, path,
						  dealloc, et);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Now we can loop, remembering the path we get from -EAGAIN
	 * and restarting from there.
	 */
try_rotate:
	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
				       dealloc, &restart_path, et);
	if (ret && ret != -EAGAIN) {
		mlog_errno(ret);
		goto out;
	}

	/* Each -EAGAIN hands us the blocking path; we own and free it. */
	while (ret == -EAGAIN) {
		tmp_path = restart_path;
		restart_path = NULL;

		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
					       tmp_path, dealloc,
					       &restart_path, et);
		if (ret && ret != -EAGAIN) {
			mlog_errno(ret);
			goto out;
		}

		ocfs2_free_path(tmp_path);
		tmp_path = NULL;

		/* Blocking path resolved - restart from the original path. */
		if (ret == 0)
			goto try_rotate;
	}

out:
	ocfs2_free_path(tmp_path);
	ocfs2_free_path(restart_path);
	return ret;
}
2795 
2796 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2797 				int index)
2798 {
2799 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
2800 	unsigned int size;
2801 
2802 	if (rec->e_leaf_clusters == 0) {
2803 		/*
2804 		 * We consumed all of the merged-from record. An empty
2805 		 * extent cannot exist anywhere but the 1st array
2806 		 * position, so move things over if the merged-from
2807 		 * record doesn't occupy that position.
2808 		 *
2809 		 * This creates a new empty extent so the caller
2810 		 * should be smart enough to have removed any existing
2811 		 * ones.
2812 		 */
2813 		if (index > 0) {
2814 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2815 			size = index * sizeof(struct ocfs2_extent_rec);
2816 			memmove(&el->l_recs[1], &el->l_recs[0], size);
2817 		}
2818 
2819 		/*
2820 		 * Always memset - the caller doesn't check whether it
2821 		 * created an empty extent, so there could be junk in
2822 		 * the other fields.
2823 		 */
2824 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2825 	}
2826 }
2827 
2828 static int ocfs2_get_right_path(struct inode *inode,
2829 				struct ocfs2_path *left_path,
2830 				struct ocfs2_path **ret_right_path)
2831 {
2832 	int ret;
2833 	u32 right_cpos;
2834 	struct ocfs2_path *right_path = NULL;
2835 	struct ocfs2_extent_list *left_el;
2836 
2837 	*ret_right_path = NULL;
2838 
2839 	/* This function shouldn't be called for non-trees. */
2840 	BUG_ON(left_path->p_tree_depth == 0);
2841 
2842 	left_el = path_leaf_el(left_path);
2843 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2844 
2845 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2846 					     &right_cpos);
2847 	if (ret) {
2848 		mlog_errno(ret);
2849 		goto out;
2850 	}
2851 
2852 	/* This function shouldn't be called for the rightmost leaf. */
2853 	BUG_ON(right_cpos == 0);
2854 
2855 	right_path = ocfs2_new_path(path_root_bh(left_path),
2856 				    path_root_el(left_path));
2857 	if (!right_path) {
2858 		ret = -ENOMEM;
2859 		mlog_errno(ret);
2860 		goto out;
2861 	}
2862 
2863 	ret = ocfs2_find_path(inode, right_path, right_cpos);
2864 	if (ret) {
2865 		mlog_errno(ret);
2866 		goto out;
2867 	}
2868 
2869 	*ret_right_path = right_path;
2870 out:
2871 	if (ret)
2872 		ocfs2_free_path(right_path);
2873 	return ret;
2874 }
2875 
2876 /*
2877  * Remove split_rec clusters from the record at index and merge them
2878  * onto the beginning of the record "next" to it.
2879  * For index < l_count - 1, the next means the extent rec at index + 1.
2880  * For index == l_count - 1, the "next" means the 1st extent rec of the
2881  * next extent block.
2882  */
static int ocfs2_merge_rec_right(struct inode *inode,
				 struct ocfs2_path *left_path,
				 handle_t *handle,
				 struct ocfs2_extent_rec *split_rec,
				 int index)
{
	int ret, next_free, i;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *right_el;
	struct ocfs2_path *right_path = NULL;
	int subtree_index = 0;
	struct ocfs2_extent_list *el = path_leaf_el(left_path);
	struct buffer_head *bh = path_leaf_bh(left_path);
	struct buffer_head *root_bh = NULL;

	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
	left_rec = &el->l_recs[index];

	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_right_path(inode, left_path, &right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The merge target is the first real record of the
		 * right leaf, which may sit behind an empty extent
		 * in slot 0.
		 */
		right_el = path_leaf_el(right_path);
		next_free = le16_to_cpu(right_el->l_next_free_rec);
		BUG_ON(next_free <= 0);
		right_rec = &right_el->l_recs[0];
		if (ocfs2_is_empty_extent(right_rec)) {
			BUG_ON(next_free <= 1);
			right_rec = &right_el->l_recs[1];
		}

		/* The two records must be logically contiguous. */
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(right_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		ret = ocfs2_journal_access(handle, inode, root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Journal every node below the subtree root, both sides. */
		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_journal_access(handle, inode,
						   right_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_journal_access(handle, inode,
						   left_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

	} else {
		/* Same-leaf merge: "next" is simply the record after us. */
		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
		right_rec = &el->l_recs[index + 1];
	}

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Move split_clusters from the tail of left_rec onto the
	 * front of right_rec, keeping cpos/blkno consistent.
	 */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

	/* left_rec may now be empty - shuffle it to slot 0 if so. */
	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (right_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
		if (ret)
			mlog_errno(ret);

		/* Propagate the new right-leaf cpos up the right path. */
		ocfs2_complete_edge_insert(inode, handle, left_path,
					   right_path, subtree_index);
	}
out:
	if (right_path)
		ocfs2_free_path(right_path);
	return ret;
}
3003 
3004 static int ocfs2_get_left_path(struct inode *inode,
3005 			       struct ocfs2_path *right_path,
3006 			       struct ocfs2_path **ret_left_path)
3007 {
3008 	int ret;
3009 	u32 left_cpos;
3010 	struct ocfs2_path *left_path = NULL;
3011 
3012 	*ret_left_path = NULL;
3013 
3014 	/* This function shouldn't be called for non-trees. */
3015 	BUG_ON(right_path->p_tree_depth == 0);
3016 
3017 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3018 					    right_path, &left_cpos);
3019 	if (ret) {
3020 		mlog_errno(ret);
3021 		goto out;
3022 	}
3023 
3024 	/* This function shouldn't be called for the leftmost leaf. */
3025 	BUG_ON(left_cpos == 0);
3026 
3027 	left_path = ocfs2_new_path(path_root_bh(right_path),
3028 				   path_root_el(right_path));
3029 	if (!left_path) {
3030 		ret = -ENOMEM;
3031 		mlog_errno(ret);
3032 		goto out;
3033 	}
3034 
3035 	ret = ocfs2_find_path(inode, left_path, left_cpos);
3036 	if (ret) {
3037 		mlog_errno(ret);
3038 		goto out;
3039 	}
3040 
3041 	*ret_left_path = left_path;
3042 out:
3043 	if (ret)
3044 		ocfs2_free_path(left_path);
3045 	return ret;
3046 }
3047 
3048 /*
3049  * Remove split_rec clusters from the record at index and merge them
3050  * onto the tail of the record "before" it.
3051  * For index > 0, the "before" means the extent rec at index - 1.
3052  *
3053  * For index == 0, the "before" means the last record of the previous
3054  * extent block. And there is also a situation that we may need to
3055  * remove the rightmost leaf extent block in the right_path and change
3056  * the right path to indicate the new rightmost path.
3057  */
static int ocfs2_merge_rec_left(struct inode *inode,
				struct ocfs2_path *right_path,
				handle_t *handle,
				struct ocfs2_extent_rec *split_rec,
				struct ocfs2_cached_dealloc_ctxt *dealloc,
				struct ocfs2_extent_tree *et,
				int index)
{
	int ret, i, subtree_index = 0, has_empty_extent = 0;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *el = path_leaf_el(right_path);
	struct buffer_head *bh = path_leaf_bh(right_path);
	struct buffer_head *root_bh = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *left_el;

	BUG_ON(index < 0);

	right_rec = &el->l_recs[index];
	if (index == 0) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_left_path(inode, right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The left leaf must be full, and its last record must
		 * end exactly where the record being merged begins.
		 */
		left_el = path_leaf_el(left_path);
		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
		       le16_to_cpu(left_el->l_count));

		left_rec = &left_el->l_recs[
				le16_to_cpu(left_el->l_next_free_rec) - 1];
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(split_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		/* Make sure the transaction can hold every block we
		 * may dirty below the subtree root. */
		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		/* Journal the shared subtree root and every node of
		 * both paths below it - the merge may modify any of
		 * them. */
		ret = ocfs2_journal_access(handle, inode, root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_journal_access(handle, inode,
						   right_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_journal_access(handle, inode,
						   left_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	} else {
		/* In-leaf merge: the target is simply the previous record. */
		left_rec = &el->l_recs[index - 1];
		if (ocfs2_is_empty_extent(&el->l_recs[0]))
			has_empty_extent = 1;
	}

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (has_empty_extent && index == 1) {
		/*
		 * The easy case - we can just plop the record right in.
		 */
		*left_rec = *split_rec;

		has_empty_extent = 0;
	} else
		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);

	/* Shift the merged clusters out of the right-hand record. */
	le32_add_cpu(&right_rec->e_cpos, split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);

	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (left_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
		if (ret)
			mlog_errno(ret);

		/*
		 * In the situation that the right_rec is empty and the extent
		 * block is empty also,  ocfs2_complete_edge_insert can't handle
		 * it and we need to delete the right extent block.
		 */
		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
		    le16_to_cpu(el->l_next_free_rec) == 1) {

			ret = ocfs2_remove_rightmost_path(inode, handle,
							  right_path,
							  dealloc, et);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/* Now the rightmost extent block has been deleted.
			 * So we use the new rightmost path.
			 */
			ocfs2_mv_path(right_path, left_path);
			left_path = NULL;
		} else
			ocfs2_complete_edge_insert(inode, handle, left_path,
						   right_path, subtree_index);
	}
out:
	if (left_path)
		ocfs2_free_path(left_path);
	return ret;
}
3205 
/*
 * Merge split_rec into its neighbor(s) in the leaf at the end of
 * 'path', as directed by the contiguity information in 'ctxt'.
 * A CONTIG_LEFTRIGHT merge absorbs the record into both neighbors;
 * otherwise we merge to just one side. Empty extents created along
 * the way are rotated out of the leaf.
 */
static int ocfs2_try_to_merge_extent(struct inode *inode,
				     handle_t *handle,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt,
				     struct ocfs2_extent_tree *et)

{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
		 * emptied slot. Remove any pre-existing empty
		 * extents - having more than one in a leaf is
		 * illegal.
		 */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* The rotate shifted every record one slot left. */
		split_index--;
		rec = &el->l_recs[split_index];
	}

	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
		/*
		 * Left-right contig implies this.
		 */
		BUG_ON(!ctxt->c_split_covers_rec);

		/*
		 * Since the leftright insert always covers the entire
		 * extent, this call will delete the insert record
		 * entirely, resulting in an empty extent record added to
		 * the extent block.
		 *
		 * Since the adding of an empty extent shifts
		 * everything back to the right, there's no need to
		 * update split_index here.
		 *
		 * When the split_index is zero, we need to merge it to the
		 * previous extent block. It is more efficient and easier
		 * if we do merge_right first and merge_left later.
		 */
		ret = ocfs2_merge_rec_right(inode, path,
					    handle, split_rec,
					    split_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We can only get this from logic error above.
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		rec = &el->l_recs[split_index];

		/*
		 * Note that we don't pass split_rec here on purpose -
		 * we've merged it into the rec already.
		 */
		ret = ocfs2_merge_rec_left(inode, path,
					   handle, rec,
					   dealloc, et,
					   split_index);

		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		/*
		 * Error from this last rotate is not critical, so
		 * print but don't bubble it up.
		 */
		if (ret)
			mlog_errno(ret);
		ret = 0;
	} else {
		/*
		 * Merge a record to the left or right.
		 *
		 * 'contig_type' is relative to the existing record,
		 * so for example, if we're "right contig", it's to
		 * the record on the left (hence the left merge).
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT) {
			ret = ocfs2_merge_rec_left(inode,
						   path,
						   handle, split_rec,
						   dealloc, et,
						   split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			ret = ocfs2_merge_rec_right(inode,
						    path,
						    handle, split_rec,
						    split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		if (ctxt->c_split_covers_rec) {
			/*
			 * The merge may have left an empty extent in
			 * our leaf. Try to rotate it away.
			 */
			ret = ocfs2_rotate_tree_left(inode, handle, path,
						     dealloc, et);
			if (ret)
				mlog_errno(ret);
			ret = 0;
		}
	}

out:
	return ret;
}
3351 
3352 static void ocfs2_subtract_from_rec(struct super_block *sb,
3353 				    enum ocfs2_split_type split,
3354 				    struct ocfs2_extent_rec *rec,
3355 				    struct ocfs2_extent_rec *split_rec)
3356 {
3357 	u64 len_blocks;
3358 
3359 	len_blocks = ocfs2_clusters_to_blocks(sb,
3360 				le16_to_cpu(split_rec->e_leaf_clusters));
3361 
3362 	if (split == SPLIT_LEFT) {
3363 		/*
3364 		 * Region is on the left edge of the existing
3365 		 * record.
3366 		 */
3367 		le32_add_cpu(&rec->e_cpos,
3368 			     le16_to_cpu(split_rec->e_leaf_clusters));
3369 		le64_add_cpu(&rec->e_blkno, len_blocks);
3370 		le16_add_cpu(&rec->e_leaf_clusters,
3371 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3372 	} else {
3373 		/*
3374 		 * Region is on the right edge of the existing
3375 		 * record.
3376 		 */
3377 		le16_add_cpu(&rec->e_leaf_clusters,
3378 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3379 	}
3380 }
3381 
3382 /*
3383  * Do the final bits of extent record insertion at the target leaf
3384  * list. If this leaf is part of an allocation tree, it is assumed
3385  * that the tree above has been prepared.
3386  */
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_insert_type *insert,
				 struct inode *inode)
{
	int i = insert->ins_contig_index;
	unsigned int range;
	struct ocfs2_extent_rec *rec;

	/* This helper only ever operates on a leaf list. */
	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (insert->ins_split != SPLIT_NONE) {
		/*
		 * Split insert: subtract the split region from the
		 * existing record, then rotate the new record in.
		 */
		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
		BUG_ON(i == -1);
		rec = &el->l_recs[i];
		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
					insert_rec);
		goto rotate;
	}

	/*
	 * Contiguous insert - either left or right.
	 */
	if (insert->ins_contig != CONTIG_NONE) {
		rec = &el->l_recs[i];
		if (insert->ins_contig == CONTIG_LEFT) {
			/* New extent precedes rec: move the start back. */
			rec->e_blkno = insert_rec->e_blkno;
			rec->e_cpos = insert_rec->e_cpos;
		}
		le16_add_cpu(&rec->e_leaf_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		return;
	}

	/*
	 * Handle insert into an empty leaf.
	 */
	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
		el->l_recs[0] = *insert_rec;
		el->l_next_free_rec = cpu_to_le16(1);
		return;
	}

	/*
	 * Appending insert.
	 */
	if (insert->ins_appending == APPEND_TAIL) {
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];
		range = le32_to_cpu(rec->e_cpos)
			+ le16_to_cpu(rec->e_leaf_clusters);
		/* An append must start at or past the current end. */
		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);

		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
				le16_to_cpu(el->l_count),
				"inode %lu, depth %u, count %u, next free %u, "
				"rec.cpos %u, rec.clusters %u, "
				"insert.cpos %u, insert.clusters %u\n",
				inode->i_ino,
				le16_to_cpu(el->l_tree_depth),
				le16_to_cpu(el->l_count),
				le16_to_cpu(el->l_next_free_rec),
				le32_to_cpu(el->l_recs[i].e_cpos),
				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
				le32_to_cpu(insert_rec->e_cpos),
				le16_to_cpu(insert_rec->e_leaf_clusters));
		i++;
		el->l_recs[i] = *insert_rec;
		le16_add_cpu(&el->l_next_free_rec, 1);
		return;
	}

rotate:
	/*
	 * Ok, we have to rotate.
	 *
	 * At this point, it is safe to assume that inserting into an
	 * empty leaf and appending to a leaf have both been handled
	 * above.
	 *
	 * This leaf needs to have space, either by the empty 1st
	 * extent record, or by virtue of an l_next_rec < l_count.
	 */
	ocfs2_rotate_leaf(el, insert_rec);
}
3474 
3475 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3476 					   handle_t *handle,
3477 					   struct ocfs2_path *path,
3478 					   struct ocfs2_extent_rec *insert_rec)
3479 {
3480 	int ret, i, next_free;
3481 	struct buffer_head *bh;
3482 	struct ocfs2_extent_list *el;
3483 	struct ocfs2_extent_rec *rec;
3484 
3485 	/*
3486 	 * Update everything except the leaf block.
3487 	 */
3488 	for (i = 0; i < path->p_tree_depth; i++) {
3489 		bh = path->p_node[i].bh;
3490 		el = path->p_node[i].el;
3491 
3492 		next_free = le16_to_cpu(el->l_next_free_rec);
3493 		if (next_free == 0) {
3494 			ocfs2_error(inode->i_sb,
3495 				    "Dinode %llu has a bad extent list",
3496 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3497 			ret = -EIO;
3498 			return;
3499 		}
3500 
3501 		rec = &el->l_recs[next_free - 1];
3502 
3503 		rec->e_int_clusters = insert_rec->e_cpos;
3504 		le32_add_cpu(&rec->e_int_clusters,
3505 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3506 		le32_add_cpu(&rec->e_int_clusters,
3507 			     -le32_to_cpu(rec->e_cpos));
3508 
3509 		ret = ocfs2_journal_dirty(handle, bh);
3510 		if (ret)
3511 			mlog_errno(ret);
3512 
3513 	}
3514 }
3515 
/*
 * Prepare the tree for an appending insert of insert_rec via
 * right_path. Journals the whole right path and adjusts the rightmost
 * interior records to cover the new extent. If the append lands in
 * the leftmost slot of its leaf, the neighboring left path may also
 * need its edge fixed up; in that case it is looked up and handed
 * back through *ret_left_path for ocfs2_insert_path() to finish.
 */
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, next_free;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * This shouldn't happen for non-trees. The extent rec cluster
	 * count manipulation below only works for interior nodes.
	 */
	BUG_ON(right_path->p_tree_depth == 0);

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "Append may need a left path update. cpos: %u, "
		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
		     left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path(path_root_bh(right_path),
						   path_root_el(right_path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);

	/* Ownership of left_path passes to the caller on success. */
	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}
3597 
3598 static void ocfs2_split_record(struct inode *inode,
3599 			       struct ocfs2_path *left_path,
3600 			       struct ocfs2_path *right_path,
3601 			       struct ocfs2_extent_rec *split_rec,
3602 			       enum ocfs2_split_type split)
3603 {
3604 	int index;
3605 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
3606 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3607 	struct ocfs2_extent_rec *rec, *tmprec;
3608 
3609 	right_el = path_leaf_el(right_path);;
3610 	if (left_path)
3611 		left_el = path_leaf_el(left_path);
3612 
3613 	el = right_el;
3614 	insert_el = right_el;
3615 	index = ocfs2_search_extent_list(el, cpos);
3616 	if (index != -1) {
3617 		if (index == 0 && left_path) {
3618 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3619 
3620 			/*
3621 			 * This typically means that the record
3622 			 * started in the left path but moved to the
3623 			 * right as a result of rotation. We either
3624 			 * move the existing record to the left, or we
3625 			 * do the later insert there.
3626 			 *
3627 			 * In this case, the left path should always
3628 			 * exist as the rotate code will have passed
3629 			 * it back for a post-insert update.
3630 			 */
3631 
3632 			if (split == SPLIT_LEFT) {
3633 				/*
3634 				 * It's a left split. Since we know
3635 				 * that the rotate code gave us an
3636 				 * empty extent in the left path, we
3637 				 * can just do the insert there.
3638 				 */
3639 				insert_el = left_el;
3640 			} else {
3641 				/*
3642 				 * Right split - we have to move the
3643 				 * existing record over to the left
3644 				 * leaf. The insert will be into the
3645 				 * newly created empty extent in the
3646 				 * right leaf.
3647 				 */
3648 				tmprec = &right_el->l_recs[index];
3649 				ocfs2_rotate_leaf(left_el, tmprec);
3650 				el = left_el;
3651 
3652 				memset(tmprec, 0, sizeof(*tmprec));
3653 				index = ocfs2_search_extent_list(left_el, cpos);
3654 				BUG_ON(index == -1);
3655 			}
3656 		}
3657 	} else {
3658 		BUG_ON(!left_path);
3659 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3660 		/*
3661 		 * Left path is easy - we can just allow the insert to
3662 		 * happen.
3663 		 */
3664 		el = left_el;
3665 		insert_el = left_el;
3666 		index = ocfs2_search_extent_list(el, cpos);
3667 		BUG_ON(index == -1);
3668 	}
3669 
3670 	rec = &el->l_recs[index];
3671 	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3672 	ocfs2_rotate_leaf(insert_el, split_rec);
3673 }
3674 
3675 /*
3676  * This function only does inserts on an allocation b-tree. For tree
3677  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3678  *
3679  * right_path is the path we want to do the actual insert
3680  * in. left_path should only be passed in if we need to update that
3681  * portion of the tree after an edge insert.
3682  */
3683 static int ocfs2_insert_path(struct inode *inode,
3684 			     handle_t *handle,
3685 			     struct ocfs2_path *left_path,
3686 			     struct ocfs2_path *right_path,
3687 			     struct ocfs2_extent_rec *insert_rec,
3688 			     struct ocfs2_insert_type *insert)
3689 {
3690 	int ret, subtree_index;
3691 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3692 
3693 	if (left_path) {
3694 		int credits = handle->h_buffer_credits;
3695 
3696 		/*
3697 		 * There's a chance that left_path got passed back to
3698 		 * us without being accounted for in the
3699 		 * journal. Extend our transaction here to be sure we
3700 		 * can change those blocks.
3701 		 */
3702 		credits += left_path->p_tree_depth;
3703 
3704 		ret = ocfs2_extend_trans(handle, credits);
3705 		if (ret < 0) {
3706 			mlog_errno(ret);
3707 			goto out;
3708 		}
3709 
3710 		ret = ocfs2_journal_access_path(inode, handle, left_path);
3711 		if (ret < 0) {
3712 			mlog_errno(ret);
3713 			goto out;
3714 		}
3715 	}
3716 
3717 	/*
3718 	 * Pass both paths to the journal. The majority of inserts
3719 	 * will be touching all components anyway.
3720 	 */
3721 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3722 	if (ret < 0) {
3723 		mlog_errno(ret);
3724 		goto out;
3725 	}
3726 
3727 	if (insert->ins_split != SPLIT_NONE) {
3728 		/*
3729 		 * We could call ocfs2_insert_at_leaf() for some types
3730 		 * of splits, but it's easier to just let one separate
3731 		 * function sort it all out.
3732 		 */
3733 		ocfs2_split_record(inode, left_path, right_path,
3734 				   insert_rec, insert->ins_split);
3735 
3736 		/*
3737 		 * Split might have modified either leaf and we don't
3738 		 * have a guarantee that the later edge insert will
3739 		 * dirty this for us.
3740 		 */
3741 		if (left_path)
3742 			ret = ocfs2_journal_dirty(handle,
3743 						  path_leaf_bh(left_path));
3744 			if (ret)
3745 				mlog_errno(ret);
3746 	} else
3747 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3748 				     insert, inode);
3749 
3750 	ret = ocfs2_journal_dirty(handle, leaf_bh);
3751 	if (ret)
3752 		mlog_errno(ret);
3753 
3754 	if (left_path) {
3755 		/*
3756 		 * The rotate code has indicated that we need to fix
3757 		 * up portions of the tree after the insert.
3758 		 *
3759 		 * XXX: Should we extend the transaction here?
3760 		 */
3761 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
3762 							right_path);
3763 		ocfs2_complete_edge_insert(inode, handle, left_path,
3764 					   right_path, subtree_index);
3765 	}
3766 
3767 	ret = 0;
3768 out:
3769 	return ret;
3770 }
3771 
/*
 * Top-level driver for a single extent record insert against an
 * extent tree. Picks the target path, performs any rotation or
 * append preparation needed, does the insert, and updates the
 * cluster count in the tree root.
 */
static int ocfs2_do_insert_extent(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	el = et->root_el;

	ret = ocfs2_journal_access(handle, inode, et->root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Depth zero: the root list is the leaf - insert directly. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(inode, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree's above them.
	 *
	 * Both might pass back a path immediate to the left of the
	 * one being inserted to. This will cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_journal_access(handle, inode, et->root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	/* A split re-arranges existing clusters; only a plain insert
	 * grows the tree's total cluster count. */
	if (type->ins_split == SPLIT_NONE)
		ocfs2_update_clusters(inode, et,
				      le16_to_cpu(insert_rec->e_leaf_clusters));

	ret = ocfs2_journal_dirty(handle, et->root_bh);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
3886 
/*
 * Work out how split_rec relates to its neighbors in the leaf 'el' at
 * position 'index': contiguous with the record to its left, to its
 * right, both, or neither. When index sits at the edge of the leaf,
 * the neighbor may live in the adjacent leaf, so this may walk a left
 * or right path to find it. Returns the contiguity verdict; lookup
 * failures simply degrade to CONTIG_NONE.
 */
static enum ocfs2_contig_type
ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
			       struct ocfs2_extent_list *el, int index,
			       struct ocfs2_extent_rec *split_rec)
{
	int status;
	enum ocfs2_contig_type ret = CONTIG_NONE;
	u32 left_cpos, right_cpos;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_extent_list *new_el;
	struct ocfs2_path *left_path = NULL, *right_path = NULL;
	struct buffer_head *bh;
	struct ocfs2_extent_block *eb;

	/* First find the record to our left, if any. */
	if (index > 0) {
		rec = &el->l_recs[index - 1];
	} else if (path->p_tree_depth > 0) {
		/* index 0: the left neighbor is the last record of
		 * the leaf to our left, if one exists. */
		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
						       path, &left_cpos);
		if (status)
			goto out;

		if (left_cpos != 0) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path)
				goto out;

			status = ocfs2_find_path(inode, left_path, left_cpos);
			if (status)
				goto out;

			new_el = path_leaf_el(left_path);

			/* A non-rightmost leaf should always be full;
			 * anything else is on-disk corruption. */
			if (le16_to_cpu(new_el->l_next_free_rec) !=
			    le16_to_cpu(new_el->l_count)) {
				bh = path_leaf_bh(left_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[
				le16_to_cpu(new_el->l_next_free_rec) - 1];
		}
	}

	/*
	 * We're careful to check for an empty extent record here -
	 * the merge code will know what to do if it sees one.
	 */
	if (rec) {
		if (index == 1 && ocfs2_is_empty_extent(rec)) {
			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
				ret = CONTIG_RIGHT;
		} else {
			ret = ocfs2_extent_contig(inode, rec, split_rec);
		}
	}

	/* Now find the record to our right, possibly in the next leaf. */
	rec = NULL;
	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
		rec = &el->l_recs[index + 1];
	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
		 path->p_tree_depth > 0) {
		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
							path, &right_cpos);
		if (status)
			goto out;

		if (right_cpos == 0)
			goto out;

		right_path = ocfs2_new_path(path_root_bh(path),
					    path_root_el(path));
		if (!right_path)
			goto out;

		status = ocfs2_find_path(inode, right_path, right_cpos);
		if (status)
			goto out;

		new_el = path_leaf_el(right_path);
		rec = &new_el->l_recs[0];
		if (ocfs2_is_empty_extent(rec)) {
			/* An empty first record with nothing after it
			 * is invalid in a non-leftmost leaf. */
			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
				bh = path_leaf_bh(right_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[1];
		}
	}

	if (rec) {
		enum ocfs2_contig_type contig_type;

		contig_type = ocfs2_extent_contig(inode, rec, split_rec);

		/* Contiguous on both sides upgrades to LEFTRIGHT. */
		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
			ret = CONTIG_LEFTRIGHT;
		else if (ret == CONTIG_NONE)
			ret = contig_type;
	}

out:
	if (left_path)
		ocfs2_free_path(left_path);
	if (right_path)
		ocfs2_free_path(right_path);

	return ret;
}
4002 
4003 static void ocfs2_figure_contig_type(struct inode *inode,
4004 				     struct ocfs2_insert_type *insert,
4005 				     struct ocfs2_extent_list *el,
4006 				     struct ocfs2_extent_rec *insert_rec)
4007 {
4008 	int i;
4009 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4010 
4011 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4012 
4013 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4014 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4015 						  insert_rec);
4016 		if (contig_type != CONTIG_NONE) {
4017 			insert->ins_contig_index = i;
4018 			break;
4019 		}
4020 	}
4021 	insert->ins_contig = contig_type;
4022 }
4023 
/*
 * This should only be called against the rightmost leaf extent list.
 *
 * ocfs2_figure_appending_type() will figure out whether we'll have to
 * insert at the tail of the rightmost leaf.
 *
 * This should also work against the root extent list for trees with 0
 * depth. If we consider the root extent list to be the rightmost leaf node
 * then the logic here makes sense.
 */
4034 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4035 					struct ocfs2_extent_list *el,
4036 					struct ocfs2_extent_rec *insert_rec)
4037 {
4038 	int i;
4039 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4040 	struct ocfs2_extent_rec *rec;
4041 
4042 	insert->ins_appending = APPEND_NONE;
4043 
4044 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4045 
4046 	if (!el->l_next_free_rec)
4047 		goto set_tail_append;
4048 
4049 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4050 		/* Were all records empty? */
4051 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4052 			goto set_tail_append;
4053 	}
4054 
4055 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4056 	rec = &el->l_recs[i];
4057 
4058 	if (cpos >=
4059 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4060 		goto set_tail_append;
4061 
4062 	return;
4063 
4064 set_tail_append:
4065 	insert->ins_appending = APPEND_TAIL;
4066 }
4067 
4068 /*
 * Helper function called at the beginning of an insert.
4070  *
4071  * This computes a few things that are commonly used in the process of
4072  * inserting into the btree:
4073  *   - Whether the new extent is contiguous with an existing one.
4074  *   - The current tree depth.
4075  *   - Whether the insert is an appending one.
4076  *   - The total # of free records in the tree.
4077  *
4078  * All of the information is stored on the ocfs2_insert_type
4079  * structure.
4080  */
4081 static int ocfs2_figure_insert_type(struct inode *inode,
4082 				    struct ocfs2_extent_tree *et,
4083 				    struct buffer_head **last_eb_bh,
4084 				    struct ocfs2_extent_rec *insert_rec,
4085 				    int *free_records,
4086 				    struct ocfs2_insert_type *insert)
4087 {
4088 	int ret;
4089 	struct ocfs2_extent_block *eb;
4090 	struct ocfs2_extent_list *el;
4091 	struct ocfs2_path *path = NULL;
4092 	struct buffer_head *bh = NULL;
4093 
4094 	insert->ins_split = SPLIT_NONE;
4095 
4096 	el = et->root_el;
4097 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4098 
4099 	if (el->l_tree_depth) {
4100 		/*
4101 		 * If we have tree depth, we read in the
4102 		 * rightmost extent block ahead of time as
4103 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4104 		 * may want it later.
4105 		 */
4106 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4107 				       ocfs2_get_last_eb_blk(et), &bh,
4108 				       OCFS2_BH_CACHED, inode);
4109 		if (ret) {
4110 			mlog_exit(ret);
4111 			goto out;
4112 		}
4113 		eb = (struct ocfs2_extent_block *) bh->b_data;
4114 		el = &eb->h_list;
4115 	}
4116 
4117 	/*
4118 	 * Unless we have a contiguous insert, we'll need to know if
4119 	 * there is room left in our allocation tree for another
4120 	 * extent record.
4121 	 *
4122 	 * XXX: This test is simplistic, we can search for empty
4123 	 * extent records too.
4124 	 */
4125 	*free_records = le16_to_cpu(el->l_count) -
4126 		le16_to_cpu(el->l_next_free_rec);
4127 
4128 	if (!insert->ins_tree_depth) {
4129 		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4130 		ocfs2_figure_appending_type(insert, el, insert_rec);
4131 		return 0;
4132 	}
4133 
4134 	path = ocfs2_new_path(et->root_bh, et->root_el);
4135 	if (!path) {
4136 		ret = -ENOMEM;
4137 		mlog_errno(ret);
4138 		goto out;
4139 	}
4140 
4141 	/*
4142 	 * In the case that we're inserting past what the tree
4143 	 * currently accounts for, ocfs2_find_path() will return for
4144 	 * us the rightmost tree path. This is accounted for below in
4145 	 * the appending code.
4146 	 */
4147 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4148 	if (ret) {
4149 		mlog_errno(ret);
4150 		goto out;
4151 	}
4152 
4153 	el = path_leaf_el(path);
4154 
4155 	/*
4156 	 * Now that we have the path, there's two things we want to determine:
4157 	 * 1) Contiguousness (also set contig_index if this is so)
4158 	 *
4159 	 * 2) Are we doing an append? We can trivially break this up
4160          *     into two types of appends: simple record append, or a
4161          *     rotate inside the tail leaf.
4162 	 */
4163 	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4164 
4165 	/*
4166 	 * The insert code isn't quite ready to deal with all cases of
4167 	 * left contiguousness. Specifically, if it's an insert into
4168 	 * the 1st record in a leaf, it will require the adjustment of
4169 	 * cluster count on the last record of the path directly to it's
4170 	 * left. For now, just catch that case and fool the layers
4171 	 * above us. This works just fine for tree_depth == 0, which
4172 	 * is why we allow that above.
4173 	 */
4174 	if (insert->ins_contig == CONTIG_LEFT &&
4175 	    insert->ins_contig_index == 0)
4176 		insert->ins_contig = CONTIG_NONE;
4177 
4178 	/*
4179 	 * Ok, so we can simply compare against last_eb to figure out
4180 	 * whether the path doesn't exist. This will only happen in
4181 	 * the case that we're doing a tail append, so maybe we can
4182 	 * take advantage of that information somehow.
4183 	 */
4184 	if (ocfs2_get_last_eb_blk(et) ==
4185 	    path_leaf_bh(path)->b_blocknr) {
4186 		/*
4187 		 * Ok, ocfs2_find_path() returned us the rightmost
4188 		 * tree path. This might be an appending insert. There are
4189 		 * two cases:
4190 		 *    1) We're doing a true append at the tail:
4191 		 *	-This might even be off the end of the leaf
4192 		 *    2) We're "appending" by rotating in the tail
4193 		 */
4194 		ocfs2_figure_appending_type(insert, el, insert_rec);
4195 	}
4196 
4197 out:
4198 	ocfs2_free_path(path);
4199 
4200 	if (ret == 0)
4201 		*last_eb_bh = bh;
4202 	else
4203 		brelse(bh);
4204 	return ret;
4205 }
4206 
4207 /*
4208  * Insert an extent into an inode btree.
4209  *
4210  * The caller needs to update fe->i_clusters
4211  */
4212 int ocfs2_insert_extent(struct ocfs2_super *osb,
4213 			handle_t *handle,
4214 			struct inode *inode,
4215 			struct buffer_head *root_bh,
4216 			u32 cpos,
4217 			u64 start_blk,
4218 			u32 new_clusters,
4219 			u8 flags,
4220 			struct ocfs2_alloc_context *meta_ac,
4221 			enum ocfs2_extent_tree_type et_type)
4222 {
4223 	int status;
4224 	int uninitialized_var(free_records);
4225 	struct buffer_head *last_eb_bh = NULL;
4226 	struct ocfs2_insert_type insert = {0, };
4227 	struct ocfs2_extent_rec rec;
4228 	struct ocfs2_extent_tree *et = NULL;
4229 
4230 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4231 
4232 	et = ocfs2_new_extent_tree(root_bh, et_type);
4233 	if (!et) {
4234 		status = -ENOMEM;
4235 		mlog_errno(status);
4236 		goto bail;
4237 	}
4238 
4239 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4240 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4241 
4242 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4243 			(OCFS2_I(inode)->ip_clusters != cpos),
4244 			"Device %s, asking for sparse allocation: inode %llu, "
4245 			"cpos %u, clusters %u\n",
4246 			osb->dev_str,
4247 			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4248 			OCFS2_I(inode)->ip_clusters);
4249 
4250 	memset(&rec, 0, sizeof(rec));
4251 	rec.e_cpos = cpu_to_le32(cpos);
4252 	rec.e_blkno = cpu_to_le64(start_blk);
4253 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4254 	rec.e_flags = flags;
4255 
4256 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4257 					  &free_records, &insert);
4258 	if (status < 0) {
4259 		mlog_errno(status);
4260 		goto bail;
4261 	}
4262 
4263 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4264 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4265 	     "Insert.tree_depth: %d\n",
4266 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4267 	     free_records, insert.ins_tree_depth);
4268 
4269 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4270 		status = ocfs2_grow_tree(inode, handle, et,
4271 					 &insert.ins_tree_depth, &last_eb_bh,
4272 					 meta_ac);
4273 		if (status) {
4274 			mlog_errno(status);
4275 			goto bail;
4276 		}
4277 	}
4278 
4279 	/* Finally, we can add clusters. This might rotate the tree for us. */
4280 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4281 	if (status < 0)
4282 		mlog_errno(status);
4283 	else if (et->type == OCFS2_DINODE_EXTENT)
4284 		ocfs2_extent_map_insert_rec(inode, &rec);
4285 
4286 bail:
4287 	if (last_eb_bh)
4288 		brelse(last_eb_bh);
4289 
4290 	if (et)
4291 		ocfs2_free_extent_tree(et);
4292 	mlog_exit(status);
4293 	return status;
4294 }
4295 
4296 static void ocfs2_make_right_split_rec(struct super_block *sb,
4297 				       struct ocfs2_extent_rec *split_rec,
4298 				       u32 cpos,
4299 				       struct ocfs2_extent_rec *rec)
4300 {
4301 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4302 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4303 
4304 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4305 
4306 	split_rec->e_cpos = cpu_to_le32(cpos);
4307 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4308 
4309 	split_rec->e_blkno = rec->e_blkno;
4310 	le64_add_cpu(&split_rec->e_blkno,
4311 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4312 
4313 	split_rec->e_flags = rec->e_flags;
4314 }
4315 
/*
 * Split the extent record at split_index in the leaf pointed to by
 * path so that the range in orig_split_rec becomes its own record,
 * and insert the piece(s) back into the tree.
 *
 * A "middle" split (orig_split_rec shares neither edge of the
 * existing record) is handled in two passes: a right split first,
 * then a jump back to the top (via the leftright label) for a left
 * split on the re-found record.
 *
 * Returns 0 on success, negative error code otherwise. last_eb_bh
 * may be swapped out if ocfs2_grow_tree() is called.
 */
static int ocfs2_split_and_insert(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_extent_tree *et,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	rightmost_el = et->root_el;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		/* Non-zero depth: the true rightmost list lives in
		 * the extent block at *last_eb_bh. */
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* A split insert needs a free record slot; grow the tree if
	 * the rightmost leaf is full. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, et,
				      &depth, last_eb_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_tree_depth = depth;

	/* End offsets (cpos + clusters) of the split range and of the
	 * existing record, used to classify the split below. */
	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		/* Split range shares the record's left edge. */
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		/* Split range shares the record's right edge. */
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
					   &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		/*
		 * Second pass of a middle split: the first insert may
		 * have moved the record, so re-find the path and leaf
		 * index before splitting the left edge.
		 */
		u32 cpos;
		struct ocfs2_extent_list *el;

		do_leftright++;
		split_rec = *orig_split_rec;

		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
4420 
4421 /*
4422  * Mark part or all of the extent record at split_index in the leaf
4423  * pointed to by path as written. This removes the unwritten
4424  * extent flag.
4425  *
4426  * Care is taken to handle contiguousness so as to not grow the tree.
4427  *
4428  * meta_ac is not strictly necessary - we only truly need it if growth
4429  * of the tree is required. All other cases will degrade into a less
4430  * optimal tree layout.
4431  *
4432  * last_eb_bh should be the rightmost leaf block for any extent
4433  * btree. Since a split may grow the tree or a merge might shrink it,
4434  * the caller cannot trust the contents of that buffer after this call.
4435  *
4436  * This code is optimized for readability - several passes might be
4437  * made over certain portions of the tree. All of those blocks will
4438  * have been brought into cache (and pinned via the journal), so the
4439  * extra overhead is not expressed in terms of disk reads.
4440  */
4441 static int __ocfs2_mark_extent_written(struct inode *inode,
4442 				       struct ocfs2_extent_tree *et,
4443 				       handle_t *handle,
4444 				       struct ocfs2_path *path,
4445 				       int split_index,
4446 				       struct ocfs2_extent_rec *split_rec,
4447 				       struct ocfs2_alloc_context *meta_ac,
4448 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
4449 {
4450 	int ret = 0;
4451 	struct ocfs2_extent_list *el = path_leaf_el(path);
4452 	struct buffer_head *last_eb_bh = NULL;
4453 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4454 	struct ocfs2_merge_ctxt ctxt;
4455 	struct ocfs2_extent_list *rightmost_el;
4456 
4457 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4458 		ret = -EIO;
4459 		mlog_errno(ret);
4460 		goto out;
4461 	}
4462 
4463 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4464 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4465 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4466 		ret = -EIO;
4467 		mlog_errno(ret);
4468 		goto out;
4469 	}
4470 
4471 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4472 							    split_index,
4473 							    split_rec);
4474 
4475 	/*
4476 	 * The core merge / split code wants to know how much room is
4477 	 * left in this inodes allocation tree, so we pass the
4478 	 * rightmost extent list.
4479 	 */
4480 	if (path->p_tree_depth) {
4481 		struct ocfs2_extent_block *eb;
4482 
4483 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4484 				       ocfs2_get_last_eb_blk(et),
4485 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
4486 		if (ret) {
4487 			mlog_exit(ret);
4488 			goto out;
4489 		}
4490 
4491 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4492 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4493 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4494 			ret = -EROFS;
4495 			goto out;
4496 		}
4497 
4498 		rightmost_el = &eb->h_list;
4499 	} else
4500 		rightmost_el = path_root_el(path);
4501 
4502 	if (rec->e_cpos == split_rec->e_cpos &&
4503 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4504 		ctxt.c_split_covers_rec = 1;
4505 	else
4506 		ctxt.c_split_covers_rec = 0;
4507 
4508 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4509 
4510 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4511 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4512 	     ctxt.c_split_covers_rec);
4513 
4514 	if (ctxt.c_contig_type == CONTIG_NONE) {
4515 		if (ctxt.c_split_covers_rec)
4516 			el->l_recs[split_index] = *split_rec;
4517 		else
4518 			ret = ocfs2_split_and_insert(inode, handle, path, et,
4519 						     &last_eb_bh, split_index,
4520 						     split_rec, meta_ac);
4521 		if (ret)
4522 			mlog_errno(ret);
4523 	} else {
4524 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
4525 						split_index, split_rec,
4526 						dealloc, &ctxt, et);
4527 		if (ret)
4528 			mlog_errno(ret);
4529 	}
4530 
4531 out:
4532 	brelse(last_eb_bh);
4533 	return ret;
4534 }
4535 
4536 /*
4537  * Mark the already-existing extent at cpos as written for len clusters.
4538  *
4539  * If the existing extent is larger than the request, initiate a
4540  * split. An attempt will be made at merging with adjacent extents.
4541  *
4542  * The caller is responsible for passing down meta_ac if we'll need it.
4543  */
int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
			      handle_t *handle, u32 cpos, u32 len, u32 phys,
			      struct ocfs2_alloc_context *meta_ac,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      enum ocfs2_extent_tree_type et_type)
{
	int ret, index;
	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_tree *et = NULL;

	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);

	/* Unwritten extents on disk without the feature bit set means
	 * the fs is inconsistent - refuse and go read-only. */
	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
			    "that are being written to, but the feature bit "
			    "is not set in the super block.",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
		ret = -EROFS;
		goto out;
	}

	et = ocfs2_new_extent_tree(root_bh, et_type);
	if (!et) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * XXX: This should be fixed up so that we just re-insert the
	 * next extent records.
	 */
	if (et_type == OCFS2_DINODE_EXTENT)
		ocfs2_extent_map_trunc(inode, 0);

	left_path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(inode, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	el = path_leaf_el(left_path);

	/* Locate the record covering cpos in the leaf we just found. */
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/* Describe the region to mark written: same flags as the
	 * existing record, with the unwritten bit cleared. */
	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;

	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
					  index, &split_rec, meta_ac,
					  dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	if (et)
		ocfs2_free_extent_tree(et);
	return ret;
}
4626 
/*
 * Right-split the extent record at index in the leaf at the end of
 * path, so the record ends at cluster offset new_range and the
 * remainder becomes a new record. Extends the transaction and grows
 * the tree beforehand when the insert needs it.
 */
static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	depth = path->p_tree_depth;
	if (depth > 0) {
		/* Non-zero depth: the fullness check below needs the
		 * rightmost leaf's extent list. */
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       ocfs2_get_last_eb_blk(et),
				       &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Reserve journal credits for the whole path plus a possible
	 * tree grow before any modification happens. */
	credits += path->p_tree_depth +
		   ocfs2_extend_meta_needed(et->root_el);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* No free record slot in the rightmost leaf - grow the tree. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/* Insert the right-hand piece as a SPLIT_RIGHT insert. */
	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}
4694 
/*
 * Remove the range [cpos, cpos + len) from the extent record at index
 * in the leaf at the end of path. The range must share at least one
 * edge with the record (middle-of-record removals are split first by
 * the caller, ocfs2_remove_extent()). Emptied records are rotated
 * away; edge changes on the rightmost leaf propagate up the tree.
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len,
			      struct ocfs2_extent_tree *et)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/* Rotate away a leading empty record first; our index shifts
	 * down by one as a result. */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our inode tree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* left_cpos == 0 means path is the leftmost (case 1);
		 * next_free_rec == 1 means the leaf will empty (case 2). */
		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	/* Make sure the transaction has credits for both paths before
	 * journaling and modifying anything. */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		/* Whole-record removal: zero it out and rotate later. */
		int next_free;

		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		/* Propagate the new left edge into the subtree formed
		 * with the leaf to our left. */
		int subtree_index;

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
4852 
/*
 * Remove len clusters starting at cpos from the extent tree rooted at
 * root_bh. The range must lie within a single existing extent record.
 * Ranges in the middle of a record are right-split first so that the
 * removal always happens on a record edge.
 */
int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
			u32 cpos, u32 len, handle_t *handle,
			struct ocfs2_alloc_context *meta_ac,
			struct ocfs2_cached_dealloc_ctxt *dealloc,
			enum ocfs2_extent_tree_type et_type)
{
	int ret, index;
	u32 rec_range, trunc_range;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_tree *et = NULL;

	et = ocfs2_new_extent_tree(root_bh, et_type);
	if (!et) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* The cached extent map will be stale after removal. */
	ocfs2_extent_map_trunc(inode, 0);

	path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(inode, path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * We have 3 cases of extent removal:
	 *   1) Range covers the entire extent rec
	 *   2) Range begins or ends on one edge of the extent rec
	 *   3) Range is in the middle of the extent rec (no shared edges)
	 *
	 * For case 1 we remove the extent rec and left rotate to
	 * fill the hole.
	 *
	 * For case 2 we just shrink the existing extent rec, with a
	 * tree update if the shrinking edge is also the edge of an
	 * extent block.
	 *
	 * For case 3 we do a right split to turn the extent rec into
	 * something case 2 can handle.
	 */
	rec = &el->l_recs[index];
	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);

	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
	     "(cpos %u, len %u)\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));

	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
		/* Cases 1 and 2: removal on a record edge. */
		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		/* Case 3: split the record at trunc_range first. */
		ret = ocfs2_split_tree(inode, et, handle, path, index,
				       trunc_range, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The split could have manipulated the tree enough to
		 * move the record location, so we have to look for it again.
		 */
		ocfs2_reinit_path(path, 1);

		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		index = ocfs2_search_extent_list(el, cpos);
		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: split at cpos %u lost record.",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos);
			ret = -EROFS;
			goto out;
		}

		/*
		 * Double check our values here. If anything is fishy,
		 * it's easier to catch it at the top level.
		 */
		rec = &el->l_recs[index];
		rec_range = le32_to_cpu(rec->e_cpos) +
			ocfs2_rec_clusters(el, rec);
		if (rec_range != trunc_range) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: error after split at cpos %u"
				    "trunc len %u, existing record is (%u,%u)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos, len, le32_to_cpu(rec->e_cpos),
				    ocfs2_rec_clusters(el, rec));
			ret = -EROFS;
			goto out;
		}

		/* The range now ends on the record's right edge. */
		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(path);
	if (et)
		ocfs2_free_extent_tree(et);
	return ret;
}
4996 
4997 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
4998 {
4999 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5000 	struct ocfs2_dinode *di;
5001 	struct ocfs2_truncate_log *tl;
5002 
5003 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5004 	tl = &di->id2.i_dealloc;
5005 
5006 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5007 			"slot %d, invalid truncate log parameters: used = "
5008 			"%u, count = %u\n", osb->slot_num,
5009 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5010 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5011 }
5012 
5013 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5014 					   unsigned int new_start)
5015 {
5016 	unsigned int tail_index;
5017 	unsigned int current_tail;
5018 
5019 	/* No records, nothing to coalesce */
5020 	if (!le16_to_cpu(tl->tl_used))
5021 		return 0;
5022 
5023 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5024 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5025 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5026 
5027 	return current_tail == new_start;
5028 }
5029 
/*
 * Append num_clusters starting at start_blk to this slot's truncate
 * log, coalescing with the last record when the ranges are adjacent.
 * Caller must hold tl_inode->i_mutex and must have flushed the log if
 * it was full (we return -ENOSPC otherwise).
 */
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
			      handle_t *handle,
			      u64 start_blk,
			      unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry("start_blk = %llu, num_clusters = %u\n",
		   (unsigned long long)start_blk, num_clusters);

	/* Lock must already be held: trylock succeeding would mean it
	 * wasn't. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto bail;
	}

	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	/* Journal the log block before any modification below. */
	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		/* No coalesce possible: start a fresh record. */
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	status = ocfs2_journal_dirty(handle, tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}
5112 
/*
 * Free the clusters recorded in the slot's truncate log, newest record
 * first, returning them to the global bitmap. Each record's slot is
 * released (tl_used decremented and journaled) before its clusters are
 * freed, so a crash mid-way never frees the same clusters twice.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	mlog_entry_void();

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Drop the record from the log before freeing its
		 * clusters (see function comment). */
		tl->tl_used = cpu_to_le16(i);

		status = ocfs2_journal_dirty(handle, tl_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						    le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

bail:
	mlog_exit(status);
	return status;
}
5187 
/*
 * Flush all records in the truncate log out to the global bitmap.
 *
 * Expects you to already be holding tl_inode->i_mutex - the
 * BUG_ON(mutex_trylock(...)) below fires if the mutex was free.
 * Takes the global bitmap inode's i_mutex and cluster lock, then
 * replays the log under a single (extendable) transaction.
 */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto out;
	}

	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	/* Nothing queued - trivially done. */
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	/* Commit regardless of replay status. */
	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
5264 
5265 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5266 {
5267 	int status;
5268 	struct inode *tl_inode = osb->osb_tl_inode;
5269 
5270 	mutex_lock(&tl_inode->i_mutex);
5271 	status = __ocfs2_flush_truncate_log(osb);
5272 	mutex_unlock(&tl_inode->i_mutex);
5273 
5274 	return status;
5275 }
5276 
5277 static void ocfs2_truncate_log_worker(struct work_struct *work)
5278 {
5279 	int status;
5280 	struct ocfs2_super *osb =
5281 		container_of(work, struct ocfs2_super,
5282 			     osb_truncate_log_wq.work);
5283 
5284 	mlog_entry_void();
5285 
5286 	status = ocfs2_flush_truncate_log(osb);
5287 	if (status < 0)
5288 		mlog_errno(status);
5289 	else
5290 		ocfs2_init_inode_steal_slot(osb);
5291 
5292 	mlog_exit(status);
5293 }
5294 
5295 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5296 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5297 				       int cancel)
5298 {
5299 	if (osb->osb_tl_inode) {
5300 		/* We want to push off log flushes while truncates are
5301 		 * still running. */
5302 		if (cancel)
5303 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5304 
5305 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5306 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5307 	}
5308 }
5309 
5310 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5311 				       int slot_num,
5312 				       struct inode **tl_inode,
5313 				       struct buffer_head **tl_bh)
5314 {
5315 	int status;
5316 	struct inode *inode = NULL;
5317 	struct buffer_head *bh = NULL;
5318 
5319 	inode = ocfs2_get_system_file_inode(osb,
5320 					   TRUNCATE_LOG_SYSTEM_INODE,
5321 					   slot_num);
5322 	if (!inode) {
5323 		status = -EINVAL;
5324 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5325 		goto bail;
5326 	}
5327 
5328 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5329 				  OCFS2_BH_CACHED, inode);
5330 	if (status < 0) {
5331 		iput(inode);
5332 		mlog_errno(status);
5333 		goto bail;
5334 	}
5335 
5336 	*tl_inode = inode;
5337 	*tl_bh    = bh;
5338 bail:
5339 	mlog_exit(status);
5340 	return status;
5341 }
5342 
/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, a *tl_copy is set to
 * NULL.
 *
 * The copy is kmalloc'd and owned by the caller on success; on error
 * any allocated copy is freed and *tl_copy reset to NULL. */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
		status = -EIO;
		goto bail;
	}

	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		/* Synchronously stamp the cleared log back to disk. */
		status = ocfs2_write_block(osb, tl_bh, tl_inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	if (tl_bh)
		brelse(tl_bh);

	/* Don't hand a copy back on failure. */
	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}
5415 
/*
 * Second stage of truncate log recovery: re-queue every record from a
 * dead node's copied truncate log (@tl_copy, produced by
 * ocfs2_begin_truncate_log_recovery()) onto our own truncate log via
 * ocfs2_truncate_log_append().  Our log is flushed along the way
 * whenever it fills up.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* Recovering our own log through this path would be a bug. */
	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		/* Make room in our own log before appending. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		/* Commit before checking status - one transaction per
		 * record. */
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
5475 
/*
 * Tear down the truncate log at unmount: cancel and drain any pending
 * delayed flush work, perform one final flush, then drop our
 * references to the log's buffer and inode.  A flush error is logged
 * but otherwise ignored - we're shutting down regardless.
 */
void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = osb->osb_tl_inode;

	mlog_entry_void();

	if (tl_inode) {
		cancel_delayed_work(&osb->osb_truncate_log_wq);
		flush_workqueue(ocfs2_wq);

		status = ocfs2_flush_truncate_log(osb);
		if (status < 0)
			mlog_errno(status);

		brelse(osb->osb_tl_bh);
		iput(osb->osb_tl_inode);
	}

	mlog_exit_void();
}
5497 
/*
 * Load this node's truncate log at mount time and set up the delayed
 * flush work.  On lookup failure tl_inode/tl_bh stay NULL, so the
 * osb fields below remain unset and shutdown will skip them.
 */
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	mlog_entry_void();

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
5524 
5525 /*
5526  * Delayed de-allocation of suballocator blocks.
5527  *
5528  * Some sets of block de-allocations might involve multiple suballocator inodes.
5529  *
5530  * The locking for this can get extremely complicated, especially when
5531  * the suballocator inodes to delete from aren't known until deep
5532  * within an unrelated codepath.
5533  *
5534  * ocfs2_extent_block structures are a good example of this - an inode
5535  * btree could have been grown by any number of nodes each allocating
5536  * out of their own suballoc inode.
5537  *
5538  * These structures allow the delay of block de-allocation until a
5539  * later time, when locking of multiple cluster inodes won't cause
5540  * deadlock.
5541  */
5542 
5543 /*
5544  * Describes a single block free from a suballocator
5545  */
5546 struct ocfs2_cached_block_free {
5547 	struct ocfs2_cached_block_free		*free_next;
5548 	u64					free_blk;
5549 	unsigned int				free_bit;
5550 };
5551 
/*
 * One pending-free list per (system file type, slot) pair - all
 * entries on f_first belong to the same suballocator inode.
 */
struct ocfs2_per_slot_free_list {
	struct ocfs2_per_slot_free_list		*f_next_suballocator;	/* next per-slot list */
	int					f_inode_type;		/* system file inode type */
	int					f_slot;			/* slot number */
	struct ocfs2_cached_block_free		*f_first;		/* head of pending frees */
};
5558 
/*
 * Free every cached block on @head back to the suballocator inode
 * identified by (@sysfile_type, @slot).
 *
 * Takes the suballocator inode's i_mutex and cluster lock, then frees
 * one suballoc bit per item, extending the transaction as it goes.
 * List items are kfree()'d as they're consumed; if we bail out early,
 * the trailing loop still kfree()'s the unprocessed items (their bits
 * stay allocated on disk).
 */
static int ocfs2_free_cached_items(struct ocfs2_super *osb,
				   int sysfile_type,
				   int slot,
				   struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		/* Locate the block group this item's bit lives in. */
		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
						      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Top up credits for the next iteration. */
		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
5636 
5637 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5638 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
5639 {
5640 	int ret = 0, ret2;
5641 	struct ocfs2_per_slot_free_list *fl;
5642 
5643 	if (!ctxt)
5644 		return 0;
5645 
5646 	while (ctxt->c_first_suballocator) {
5647 		fl = ctxt->c_first_suballocator;
5648 
5649 		if (fl->f_first) {
5650 			mlog(0, "Free items: (type %u, slot %d)\n",
5651 			     fl->f_inode_type, fl->f_slot);
5652 			ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5653 						       fl->f_slot, fl->f_first);
5654 			if (ret2)
5655 				mlog_errno(ret2);
5656 			if (!ret)
5657 				ret = ret2;
5658 		}
5659 
5660 		ctxt->c_first_suballocator = fl->f_next_suballocator;
5661 		kfree(fl);
5662 	}
5663 
5664 	return ret;
5665 }
5666 
5667 static struct ocfs2_per_slot_free_list *
5668 ocfs2_find_per_slot_free_list(int type,
5669 			      int slot,
5670 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
5671 {
5672 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5673 
5674 	while (fl) {
5675 		if (fl->f_inode_type == type && fl->f_slot == slot)
5676 			return fl;
5677 
5678 		fl = fl->f_next_suballocator;
5679 	}
5680 
5681 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
5682 	if (fl) {
5683 		fl->f_inode_type = type;
5684 		fl->f_slot = slot;
5685 		fl->f_first = NULL;
5686 		fl->f_next_suballocator = ctxt->c_first_suballocator;
5687 
5688 		ctxt->c_first_suballocator = fl;
5689 	}
5690 	return fl;
5691 }
5692 
5693 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5694 				     int type, int slot, u64 blkno,
5695 				     unsigned int bit)
5696 {
5697 	int ret;
5698 	struct ocfs2_per_slot_free_list *fl;
5699 	struct ocfs2_cached_block_free *item;
5700 
5701 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5702 	if (fl == NULL) {
5703 		ret = -ENOMEM;
5704 		mlog_errno(ret);
5705 		goto out;
5706 	}
5707 
5708 	item = kmalloc(sizeof(*item), GFP_NOFS);
5709 	if (item == NULL) {
5710 		ret = -ENOMEM;
5711 		mlog_errno(ret);
5712 		goto out;
5713 	}
5714 
5715 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5716 	     type, slot, bit, (unsigned long long)blkno);
5717 
5718 	item->free_blk = blkno;
5719 	item->free_bit = bit;
5720 	item->free_next = fl->f_first;
5721 
5722 	fl->f_first = item;
5723 
5724 	ret = 0;
5725 out:
5726 	return ret;
5727 }
5728 
5729 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5730 					 struct ocfs2_extent_block *eb)
5731 {
5732 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5733 					 le16_to_cpu(eb->h_suballoc_slot),
5734 					 le64_to_cpu(eb->h_blkno),
5735 					 le16_to_cpu(eb->h_suballoc_bit));
5736 }
5737 
/*
 * Figure out whether the currently-last extent block will be deleted
 * by a truncate of @clusters_to_del clusters, and if so, which leaf
 * becomes the new last extent block - so the caller can update its
 * h_next_leaf_blk field as well as the dinode's i_last_eb_blk.
 *
 * On return, *new_last_eb holds an extra-referenced buffer for the
 * new last extent block, or NULL when the current last leaf survives
 * (no tree, truncate-to-zero, or the leaf keeps at least one record).
 */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
				       unsigned int clusters_to_del,
				       struct ocfs2_path *path,
				       struct buffer_head **new_last_eb)
{
	int next_free, ret = 0;
	u32 cpos;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;

	*new_last_eb = NULL;

	/* we have no tree, so of course, no last_eb. */
	if (!path->p_tree_depth)
		goto out;

	/* trunc to zero special case - this makes tree_depth = 0
	 * regardless of what it is.  */
	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
		goto out;

	el = path_leaf_el(path);
	BUG_ON(!el->l_next_free_rec);

	/*
	 * Make sure that this extent list will actually be empty
	 * after we clear away the data. We can shortcut out if
	 * there's more than one non-empty extent in the
	 * list. Otherwise, a check of the remaining extent is
	 * necessary.
	 */
	next_free = le16_to_cpu(el->l_next_free_rec);
	rec = NULL;
	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		if (next_free > 2)
			goto out;

		/* We may have a valid extent in index 1, check it. */
		if (next_free == 2)
			rec = &el->l_recs[1];

		/*
		 * Fall through - no more nonempty extents, so we want
		 * to delete this leaf.
		 */
	} else {
		if (next_free > 1)
			goto out;

		rec = &el->l_recs[0];
	}

	if (rec) {
		/*
		 * Check that we'll only be trimming off the end of
		 * this cluster - if any of the record survives, the
		 * leaf stays and there's nothing to do.
		 */
		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
			goto out;
	}

	/* The leaf is going away - find the leaf to its left. */
	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	eb = (struct ocfs2_extent_block *) bh->b_data;
	el = &eb->h_list;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		ret = -EROFS;
		goto out;
	}

	*new_last_eb = bh;
	get_bh(*new_last_eb);
	mlog(0, "returning block %llu, (cpos: %u)\n",
	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
out:
	brelse(bh);

	return ret;
}
5834 
5835 /*
5836  * Trim some clusters off the rightmost edge of a tree. Only called
5837  * during truncate.
5838  *
5839  * The caller needs to:
5840  *   - start journaling of each path component.
5841  *   - compute and fully set up any new last ext block
5842  */
5843 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
5844 			   handle_t *handle, struct ocfs2_truncate_context *tc,
5845 			   u32 clusters_to_del, u64 *delete_start)
5846 {
5847 	int ret, i, index = path->p_tree_depth;
5848 	u32 new_edge = 0;
5849 	u64 deleted_eb = 0;
5850 	struct buffer_head *bh;
5851 	struct ocfs2_extent_list *el;
5852 	struct ocfs2_extent_rec *rec;
5853 
5854 	*delete_start = 0;
5855 
5856 	while (index >= 0) {
5857 		bh = path->p_node[index].bh;
5858 		el = path->p_node[index].el;
5859 
5860 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
5861 		     index,  (unsigned long long)bh->b_blocknr);
5862 
5863 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
5864 
5865 		if (index !=
5866 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
5867 			ocfs2_error(inode->i_sb,
5868 				    "Inode %lu has invalid ext. block %llu",
5869 				    inode->i_ino,
5870 				    (unsigned long long)bh->b_blocknr);
5871 			ret = -EROFS;
5872 			goto out;
5873 		}
5874 
5875 find_tail_record:
5876 		i = le16_to_cpu(el->l_next_free_rec) - 1;
5877 		rec = &el->l_recs[i];
5878 
5879 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
5880 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
5881 		     ocfs2_rec_clusters(el, rec),
5882 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
5883 		     le16_to_cpu(el->l_next_free_rec));
5884 
5885 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
5886 
5887 		if (le16_to_cpu(el->l_tree_depth) == 0) {
5888 			/*
5889 			 * If the leaf block contains a single empty
5890 			 * extent and no records, we can just remove
5891 			 * the block.
5892 			 */
5893 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
5894 				memset(rec, 0,
5895 				       sizeof(struct ocfs2_extent_rec));
5896 				el->l_next_free_rec = cpu_to_le16(0);
5897 
5898 				goto delete;
5899 			}
5900 
5901 			/*
5902 			 * Remove any empty extents by shifting things
5903 			 * left. That should make life much easier on
5904 			 * the code below. This condition is rare
5905 			 * enough that we shouldn't see a performance
5906 			 * hit.
5907 			 */
5908 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5909 				le16_add_cpu(&el->l_next_free_rec, -1);
5910 
5911 				for(i = 0;
5912 				    i < le16_to_cpu(el->l_next_free_rec); i++)
5913 					el->l_recs[i] = el->l_recs[i + 1];
5914 
5915 				memset(&el->l_recs[i], 0,
5916 				       sizeof(struct ocfs2_extent_rec));
5917 
5918 				/*
5919 				 * We've modified our extent list. The
5920 				 * simplest way to handle this change
5921 				 * is to being the search from the
5922 				 * start again.
5923 				 */
5924 				goto find_tail_record;
5925 			}
5926 
5927 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
5928 
5929 			/*
5930 			 * We'll use "new_edge" on our way back up the
5931 			 * tree to know what our rightmost cpos is.
5932 			 */
5933 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
5934 			new_edge += le32_to_cpu(rec->e_cpos);
5935 
5936 			/*
5937 			 * The caller will use this to delete data blocks.
5938 			 */
5939 			*delete_start = le64_to_cpu(rec->e_blkno)
5940 				+ ocfs2_clusters_to_blocks(inode->i_sb,
5941 					le16_to_cpu(rec->e_leaf_clusters));
5942 
5943 			/*
5944 			 * If it's now empty, remove this record.
5945 			 */
5946 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
5947 				memset(rec, 0,
5948 				       sizeof(struct ocfs2_extent_rec));
5949 				le16_add_cpu(&el->l_next_free_rec, -1);
5950 			}
5951 		} else {
5952 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
5953 				memset(rec, 0,
5954 				       sizeof(struct ocfs2_extent_rec));
5955 				le16_add_cpu(&el->l_next_free_rec, -1);
5956 
5957 				goto delete;
5958 			}
5959 
5960 			/* Can this actually happen? */
5961 			if (le16_to_cpu(el->l_next_free_rec) == 0)
5962 				goto delete;
5963 
5964 			/*
5965 			 * We never actually deleted any clusters
5966 			 * because our leaf was empty. There's no
5967 			 * reason to adjust the rightmost edge then.
5968 			 */
5969 			if (new_edge == 0)
5970 				goto delete;
5971 
5972 			rec->e_int_clusters = cpu_to_le32(new_edge);
5973 			le32_add_cpu(&rec->e_int_clusters,
5974 				     -le32_to_cpu(rec->e_cpos));
5975 
5976 			 /*
5977 			  * A deleted child record should have been
5978 			  * caught above.
5979 			  */
5980 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
5981 		}
5982 
5983 delete:
5984 		ret = ocfs2_journal_dirty(handle, bh);
5985 		if (ret) {
5986 			mlog_errno(ret);
5987 			goto out;
5988 		}
5989 
5990 		mlog(0, "extent list container %llu, after: record %d: "
5991 		     "(%u, %u, %llu), next = %u.\n",
5992 		     (unsigned long long)bh->b_blocknr, i,
5993 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
5994 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
5995 		     le16_to_cpu(el->l_next_free_rec));
5996 
5997 		/*
5998 		 * We must be careful to only attempt delete of an
5999 		 * extent block (and not the root inode block).
6000 		 */
6001 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6002 			struct ocfs2_extent_block *eb =
6003 				(struct ocfs2_extent_block *)bh->b_data;
6004 
6005 			/*
6006 			 * Save this for use when processing the
6007 			 * parent block.
6008 			 */
6009 			deleted_eb = le64_to_cpu(eb->h_blkno);
6010 
6011 			mlog(0, "deleting this extent block.\n");
6012 
6013 			ocfs2_remove_from_cache(inode, bh);
6014 
6015 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6016 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6017 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6018 
6019 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6020 			/* An error here is not fatal. */
6021 			if (ret < 0)
6022 				mlog_errno(ret);
6023 		} else {
6024 			deleted_eb = 0;
6025 		}
6026 
6027 		index--;
6028 	}
6029 
6030 	ret = 0;
6031 out:
6032 	return ret;
6033 }
6034 
/*
 * Remove @clusters_to_del clusters from the rightmost edge of
 * @inode's extent tree inside the transaction @handle.
 *
 * Journals every component of @path plus the (possibly new) last
 * extent block, updates the in-memory and on-disk cluster counts,
 * trims the tree, fixes up i_last_eb_blk, and finally logs the freed
 * data clusters in the truncate log for deferred deallocation.
 */
static int ocfs2_do_truncate(struct ocfs2_super *osb,
			     unsigned int clusters_to_del,
			     struct inode *inode,
			     struct buffer_head *fe_bh,
			     handle_t *handle,
			     struct ocfs2_truncate_context *tc,
			     struct ocfs2_path *path)
{
	int status;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *last_eb = NULL;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	u64 delete_blk = 0;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
					     path, &last_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Each component will be touched, so we might as well journal
	 * here to avoid having to handle errors later.
	 */
	status = ocfs2_journal_access_path(inode, handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb_bh) {
		status = ocfs2_journal_access(handle, inode, last_eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	}

	el = &(fe->id2.i_list);

	/*
	 * Lower levels depend on this never happening, but it's best
	 * to check it up here before changing the tree.
	 */
	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %lu has an empty extent record, depth %u\n",
			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
		status = -EROFS;
		goto bail;
	}

	/* Update cluster counts (in-memory under ip_lock, then
	 * on-disk) before touching the tree itself. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
				      clusters_to_del;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
	inode->i_blocks = ocfs2_inode_sector_count(inode);

	status = ocfs2_trim_tree(inode, path, handle, tc,
				 clusters_to_del, &delete_blk);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	if (le32_to_cpu(fe->i_clusters) == 0) {
		/* trunc to zero is a special case. */
		el->l_tree_depth = 0;
		fe->i_last_eb_blk = 0;
	} else if (last_eb)
		fe->i_last_eb_blk = last_eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb) {
		/* If there will be a new last extent block, then by
		 * definition, there cannot be any leaves to the right of
		 * him. */
		last_eb->h_next_leaf_blk = 0;
		status = ocfs2_journal_dirty(handle, last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Queue the freed data clusters in the truncate log. */
	if (delete_blk) {
		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
						   clusters_to_del);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}
	status = 0;
bail:

	mlog_exit(status);
	return status;
}
6147 
/*
 * walk_page_buffers() callback for writeback-mode data: just mark the
 * buffer uptodate and dirty; writeback happens via the page cache.
 */
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
6154 
/*
 * walk_page_buffers() callback for ordered-mode data: additionally
 * file the buffer on the journal's data list so it is written out
 * before the transaction commits.
 */
static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return ocfs2_journal_dirty_data(handle, bh);
}
6161 
/*
 * Map the buffers covering [from, to) in @page to disk (starting at
 * *phys), optionally zero that byte range, and dirty the buffers
 * through @handle using the ordered or writeback callback as the
 * journal mode dictates.
 *
 * NOTE(review): @phys is passed by reference to
 * ocfs2_map_page_blocks() - presumably updated as blocks are mapped
 * so consecutive calls continue from the right physical block;
 * confirm in ocfs2_map_page_blocks().
 */
static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
				     unsigned int from, unsigned int to,
				     struct page *page, int zero, u64 *phys)
{
	int ret, partial = 0;

	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
	if (ret)
		mlog_errno(ret);

	if (zero)
		zero_user_segment(page, from, to);

	/*
	 * Need to set the buffers we zero'd into uptodate
	 * here if they aren't - ocfs2_map_page_blocks()
	 * might've skipped some
	 */
	if (ocfs2_should_order_data(inode)) {
		ret = walk_page_buffers(handle,
					page_buffers(page),
					from, to, &partial,
					ocfs2_ordered_zero_func);
		if (ret < 0)
			mlog_errno(ret);
	} else {
		ret = walk_page_buffers(handle, page_buffers(page),
					from, to, &partial,
					ocfs2_writeback_zero_func);
		if (ret < 0)
			mlog_errno(ret);
	}

	/* Only mark the whole page uptodate if every buffer in the
	 * walked range was handled. */
	if (!partial)
		SetPageUptodate(page);

	flush_dcache_page(page);
}
6200 
6201 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6202 				     loff_t end, struct page **pages,
6203 				     int numpages, u64 phys, handle_t *handle)
6204 {
6205 	int i;
6206 	struct page *page;
6207 	unsigned int from, to = PAGE_CACHE_SIZE;
6208 	struct super_block *sb = inode->i_sb;
6209 
6210 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6211 
6212 	if (numpages == 0)
6213 		goto out;
6214 
6215 	to = PAGE_CACHE_SIZE;
6216 	for(i = 0; i < numpages; i++) {
6217 		page = pages[i];
6218 
6219 		from = start & (PAGE_CACHE_SIZE - 1);
6220 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6221 			to = end & (PAGE_CACHE_SIZE - 1);
6222 
6223 		BUG_ON(from > PAGE_CACHE_SIZE);
6224 		BUG_ON(to > PAGE_CACHE_SIZE);
6225 
6226 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6227 					 &phys);
6228 
6229 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6230 	}
6231 out:
6232 	if (pages)
6233 		ocfs2_unlock_and_free_pages(pages, numpages);
6234 }
6235 
/*
 * Grab and lock every page-cache page covering [start, end).  The
 * range must lie within a single cluster (BUG_ON below).  On success
 * *num is the count of locked pages in @pages; on failure all pages
 * grabbed so far are unlocked and released and *num is 0.
 */
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
				struct page **pages, int *num)
{
	int numpages, ret = 0;
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	loff_t last_page_bytes;

	BUG_ON(start > end);

	/* Caller guarantees the range stays inside one cluster. */
	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

	numpages = 0;
	last_page_bytes = PAGE_ALIGN(end);
	index = start >> PAGE_CACHE_SHIFT;
	do {
		pages[numpages] = grab_cache_page(mapping, index);
		if (!pages[numpages]) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		numpages++;
		index++;
	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));

out:
	if (ret != 0) {
		if (pages)
			ocfs2_unlock_and_free_pages(pages, numpages);
		numpages = 0;
	}

	*num = numpages;

	return ret;
}
6276 
6277 /*
6278  * Zero the area past i_size but still within an allocated
6279  * cluster. This avoids exposing nonzero data on subsequent file
6280  * extends.
6281  *
6282  * We need to call this before i_size is updated on the inode because
6283  * otherwise block_write_full_page() will skip writeout of pages past
6284  * i_size. The new_i_size parameter is passed for this reason.
6285  */
6286 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6287 				  u64 range_start, u64 range_end)
6288 {
6289 	int ret = 0, numpages;
6290 	struct page **pages = NULL;
6291 	u64 phys;
6292 	unsigned int ext_flags;
6293 	struct super_block *sb = inode->i_sb;
6294 
6295 	/*
6296 	 * File systems which don't support sparse files zero on every
6297 	 * extend.
6298 	 */
6299 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6300 		return 0;
6301 
6302 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
6303 			sizeof(struct page *), GFP_NOFS);
6304 	if (pages == NULL) {
6305 		ret = -ENOMEM;
6306 		mlog_errno(ret);
6307 		goto out;
6308 	}
6309 
6310 	if (range_start == range_end)
6311 		goto out;
6312 
6313 	ret = ocfs2_extent_map_get_blocks(inode,
6314 					  range_start >> sb->s_blocksize_bits,
6315 					  &phys, NULL, &ext_flags);
6316 	if (ret) {
6317 		mlog_errno(ret);
6318 		goto out;
6319 	}
6320 
6321 	/*
6322 	 * Tail is a hole, or is marked unwritten. In either case, we
6323 	 * can count on read and write to return/push zero's.
6324 	 */
6325 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6326 		goto out;
6327 
6328 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6329 				   &numpages);
6330 	if (ret) {
6331 		mlog_errno(ret);
6332 		goto out;
6333 	}
6334 
6335 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6336 				 numpages, phys, handle);
6337 
6338 	/*
6339 	 * Initiate writeout of the pages we zero'd here. We don't
6340 	 * wait on them - the truncate_inode_pages() call later will
6341 	 * do that for us.
6342 	 */
6343 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
6344 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
6345 	if (ret)
6346 		mlog_errno(ret);
6347 
6348 out:
6349 	if (pages)
6350 		kfree(pages);
6351 
6352 	return ret;
6353 }
6354 
6355 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6356 {
6357 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6358 
6359 	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6360 }
6361 
6362 void ocfs2_dinode_new_extent_list(struct inode *inode,
6363 				  struct ocfs2_dinode *di)
6364 {
6365 	ocfs2_zero_dinode_id2(inode, di);
6366 	di->id2.i_list.l_tree_depth = 0;
6367 	di->id2.i_list.l_next_free_rec = 0;
6368 	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
6369 }
6370 
/*
 * Mark an inode as holding inline data: set OCFS2_INLINE_DATA_FL on
 * both the in-memory inode and the on-disk dinode, then initialize the
 * dinode's id2 union as an empty ocfs2_inline_data region.
 */
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	/* ip_lock guards ip_dyn_features; mirror the new flags to disk. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/*
	 * We clear the entire i_data structure here so that all
	 * fields can be properly initialized.
	 */
	ocfs2_zero_dinode_id2(inode, di);

	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
}
6389 
/*
 * Convert an inline-data inode to the regular extent-list format.
 *
 * If the inode currently holds data (i_size > 0), one cluster is
 * claimed, the inline bytes are copied into page cache pages mapped
 * over that cluster, and a single extent record covering it is
 * inserted. In all cases the inline-data flag is cleared and the
 * dinode's id2 union is re-initialized as an empty extent list.
 *
 * Returns 0 on success, negative errno on failure.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;

	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		/*
		 * Reserve a cluster and a page array big enough to
		 * cover it before starting the transaction.
		 */
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		/* Only page 0 holds data; later pages are zero'd (i > 0). */
		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	/* Clear the inline flag in memory and on disk, under ip_lock. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
					  0, block, 1, 0,
					  NULL, OCFS2_DINODE_EXTENT);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
6527 
6528 /*
6529  * It is expected, that by the time you call this function,
6530  * inode->i_size and fe->i_size have been adjusted.
6531  *
6532  * WARNING: This will kfree the truncate context
6533  */
6534 int ocfs2_commit_truncate(struct ocfs2_super *osb,
6535 			  struct inode *inode,
6536 			  struct buffer_head *fe_bh,
6537 			  struct ocfs2_truncate_context *tc)
6538 {
6539 	int status, i, credits, tl_sem = 0;
6540 	u32 clusters_to_del, new_highest_cpos, range;
6541 	struct ocfs2_extent_list *el;
6542 	handle_t *handle = NULL;
6543 	struct inode *tl_inode = osb->osb_tl_inode;
6544 	struct ocfs2_path *path = NULL;
6545 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
6546 
6547 	mlog_entry_void();
6548 
6549 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
6550 						     i_size_read(inode));
6551 
6552 	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
6553 	if (!path) {
6554 		status = -ENOMEM;
6555 		mlog_errno(status);
6556 		goto bail;
6557 	}
6558 
6559 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
6560 
6561 start:
6562 	/*
6563 	 * Check that we still have allocation to delete.
6564 	 */
6565 	if (OCFS2_I(inode)->ip_clusters == 0) {
6566 		status = 0;
6567 		goto bail;
6568 	}
6569 
6570 	/*
6571 	 * Truncate always works against the rightmost tree branch.
6572 	 */
6573 	status = ocfs2_find_path(inode, path, UINT_MAX);
6574 	if (status) {
6575 		mlog_errno(status);
6576 		goto bail;
6577 	}
6578 
6579 	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
6580 	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
6581 
6582 	/*
6583 	 * By now, el will point to the extent list on the bottom most
6584 	 * portion of this tree. Only the tail record is considered in
6585 	 * each pass.
6586 	 *
6587 	 * We handle the following cases, in order:
6588 	 * - empty extent: delete the remaining branch
6589 	 * - remove the entire record
6590 	 * - remove a partial record
6591 	 * - no record needs to be removed (truncate has completed)
6592 	 */
6593 	el = path_leaf_el(path);
6594 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
6595 		ocfs2_error(inode->i_sb,
6596 			    "Inode %llu has empty extent block at %llu\n",
6597 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
6598 			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
6599 		status = -EROFS;
6600 		goto bail;
6601 	}
6602 
6603 	i = le16_to_cpu(el->l_next_free_rec) - 1;
6604 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
6605 		ocfs2_rec_clusters(el, &el->l_recs[i]);
6606 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
6607 		clusters_to_del = 0;
6608 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
6609 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
6610 	} else if (range > new_highest_cpos) {
6611 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
6612 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
6613 				  new_highest_cpos;
6614 	} else {
6615 		status = 0;
6616 		goto bail;
6617 	}
6618 
6619 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
6620 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
6621 
6622 	mutex_lock(&tl_inode->i_mutex);
6623 	tl_sem = 1;
6624 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
6625 	 * record is free for use. If there isn't any, we flush to get
6626 	 * an empty truncate log.  */
6627 	if (ocfs2_truncate_log_needs_flush(osb)) {
6628 		status = __ocfs2_flush_truncate_log(osb);
6629 		if (status < 0) {
6630 			mlog_errno(status);
6631 			goto bail;
6632 		}
6633 	}
6634 
6635 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
6636 						(struct ocfs2_dinode *)fe_bh->b_data,
6637 						el);
6638 	handle = ocfs2_start_trans(osb, credits);
6639 	if (IS_ERR(handle)) {
6640 		status = PTR_ERR(handle);
6641 		handle = NULL;
6642 		mlog_errno(status);
6643 		goto bail;
6644 	}
6645 
6646 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
6647 				   tc, path);
6648 	if (status < 0) {
6649 		mlog_errno(status);
6650 		goto bail;
6651 	}
6652 
6653 	mutex_unlock(&tl_inode->i_mutex);
6654 	tl_sem = 0;
6655 
6656 	ocfs2_commit_trans(osb, handle);
6657 	handle = NULL;
6658 
6659 	ocfs2_reinit_path(path, 1);
6660 
6661 	/*
6662 	 * The check above will catch the case where we've truncated
6663 	 * away all allocation.
6664 	 */
6665 	goto start;
6666 
6667 bail:
6668 
6669 	ocfs2_schedule_truncate_log_flush(osb, 1);
6670 
6671 	if (tl_sem)
6672 		mutex_unlock(&tl_inode->i_mutex);
6673 
6674 	if (handle)
6675 		ocfs2_commit_trans(osb, handle);
6676 
6677 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
6678 
6679 	ocfs2_free_path(path);
6680 
6681 	/* This will drop the ext_alloc cluster lock for us */
6682 	ocfs2_free_truncate_context(tc);
6683 
6684 	mlog_exit(status);
6685 	return status;
6686 }
6687 
6688 /*
6689  * Expects the inode to already be locked.
6690  */
6691 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
6692 			   struct inode *inode,
6693 			   struct buffer_head *fe_bh,
6694 			   struct ocfs2_truncate_context **tc)
6695 {
6696 	int status;
6697 	unsigned int new_i_clusters;
6698 	struct ocfs2_dinode *fe;
6699 	struct ocfs2_extent_block *eb;
6700 	struct buffer_head *last_eb_bh = NULL;
6701 
6702 	mlog_entry_void();
6703 
6704 	*tc = NULL;
6705 
6706 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
6707 						  i_size_read(inode));
6708 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
6709 
6710 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
6711 	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
6712 	     (unsigned long long)le64_to_cpu(fe->i_size));
6713 
6714 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
6715 	if (!(*tc)) {
6716 		status = -ENOMEM;
6717 		mlog_errno(status);
6718 		goto bail;
6719 	}
6720 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
6721 
6722 	if (fe->id2.i_list.l_tree_depth) {
6723 		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
6724 					  &last_eb_bh, OCFS2_BH_CACHED, inode);
6725 		if (status < 0) {
6726 			mlog_errno(status);
6727 			goto bail;
6728 		}
6729 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6730 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
6731 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
6732 
6733 			brelse(last_eb_bh);
6734 			status = -EIO;
6735 			goto bail;
6736 		}
6737 	}
6738 
6739 	(*tc)->tc_last_eb_bh = last_eb_bh;
6740 
6741 	status = 0;
6742 bail:
6743 	if (status < 0) {
6744 		if (*tc)
6745 			ocfs2_free_truncate_context(*tc);
6746 		*tc = NULL;
6747 	}
6748 	mlog_exit_void();
6749 	return status;
6750 }
6751 
6752 /*
6753  * 'start' is inclusive, 'end' is not.
6754  */
6755 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
6756 			  unsigned int start, unsigned int end, int trunc)
6757 {
6758 	int ret;
6759 	unsigned int numbytes;
6760 	handle_t *handle;
6761 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
6762 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
6763 	struct ocfs2_inline_data *idata = &di->id2.i_data;
6764 
6765 	if (end > i_size_read(inode))
6766 		end = i_size_read(inode);
6767 
6768 	BUG_ON(start >= end);
6769 
6770 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
6771 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
6772 	    !ocfs2_supports_inline_data(osb)) {
6773 		ocfs2_error(inode->i_sb,
6774 			    "Inline data flags for inode %llu don't agree! "
6775 			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
6776 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
6777 			    le16_to_cpu(di->i_dyn_features),
6778 			    OCFS2_I(inode)->ip_dyn_features,
6779 			    osb->s_feature_incompat);
6780 		ret = -EROFS;
6781 		goto out;
6782 	}
6783 
6784 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
6785 	if (IS_ERR(handle)) {
6786 		ret = PTR_ERR(handle);
6787 		mlog_errno(ret);
6788 		goto out;
6789 	}
6790 
6791 	ret = ocfs2_journal_access(handle, inode, di_bh,
6792 				   OCFS2_JOURNAL_ACCESS_WRITE);
6793 	if (ret) {
6794 		mlog_errno(ret);
6795 		goto out_commit;
6796 	}
6797 
6798 	numbytes = end - start;
6799 	memset(idata->id_data + start, 0, numbytes);
6800 
6801 	/*
6802 	 * No need to worry about the data page here - it's been
6803 	 * truncated already and inline data doesn't need it for
6804 	 * pushing zero's to disk, so we'll let readpage pick it up
6805 	 * later.
6806 	 */
6807 	if (trunc) {
6808 		i_size_write(inode, start);
6809 		di->i_size = cpu_to_le64(start);
6810 	}
6811 
6812 	inode->i_blocks = ocfs2_inode_sector_count(inode);
6813 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
6814 
6815 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
6816 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
6817 
6818 	ocfs2_journal_dirty(handle, di_bh);
6819 
6820 out_commit:
6821 	ocfs2_commit_trans(osb, handle);
6822 
6823 out:
6824 	return ret;
6825 }
6826 
6827 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6828 {
6829 	/*
6830 	 * The caller is responsible for completing deallocation
6831 	 * before freeing the context.
6832 	 */
6833 	if (tc->tc_dealloc.c_first_suballocator != NULL)
6834 		mlog(ML_NOTICE,
6835 		     "Truncate completion has non-empty dealloc context\n");
6836 
6837 	if (tc->tc_last_eb_bh)
6838 		brelse(tc->tc_last_eb_bh);
6839 
6840 	kfree(tc);
6841 }
6842