xref: /openbmc/linux/fs/ocfs2/alloc.c (revision f56654c4)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49 
50 #include "buffer_head_io.h"
51 
52 /*
53  * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract
54  * the b-tree operations in ocfs2. Now all the b-tree operations are not
55  * limited to ocfs2_dinode only. Any data which need to allocate clusters
56  * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree
57  * and operation.
58  *
59  * ocfs2_extent_tree contains info for the root of the b-tree, it must have a
60  * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
61  * functions.
62  * ocfs2_extent_tree_operations abstract the normal operations we do for
63  * the root of extent b-tree.
64  */
struct ocfs2_extent_tree;

/*
 * Per-root-type virtual operations.  Each extent tree root type (dinode,
 * xattr value root) supplies an implementation so the generic b-tree code
 * can manipulate the root without knowing its on-disk layout.
 */
struct ocfs2_extent_tree_operations {
	/* Record @blkno as the rightmost (last) leaf extent block. */
	void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno);
	/* Return the block number of the rightmost leaf extent block. */
	u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et);
	/* Add @new_clusters to the root's running cluster count. */
	void (*update_clusters) (struct inode *inode,
				 struct ocfs2_extent_tree *et,
				 u32 new_clusters);
	/* Validate the root structure: 0 on success, negative errno on error. */
	int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et);
};

/*
 * In-memory handle for one extent b-tree.  root_bh is the buffer holding
 * the on-disk root structure (a reference is taken at creation time);
 * root_el points into root_bh at the root extent list.  private carries
 * type-specific root data (the ocfs2_xattr_value_root for xattr trees).
 */
struct ocfs2_extent_tree {
	enum ocfs2_extent_tree_type type;
	struct ocfs2_extent_tree_operations *eops;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *root_el;
	void *private;
};
83 
84 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
85 					 u64 blkno)
86 {
87 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
88 
89 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
90 	di->i_last_eb_blk = cpu_to_le64(blkno);
91 }
92 
93 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
94 {
95 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data;
96 
97 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
98 	return le64_to_cpu(di->i_last_eb_blk);
99 }
100 
101 static void ocfs2_dinode_update_clusters(struct inode *inode,
102 					 struct ocfs2_extent_tree *et,
103 					 u32 clusters)
104 {
105 	struct ocfs2_dinode *di =
106 			(struct ocfs2_dinode *)et->root_bh->b_data;
107 
108 	le32_add_cpu(&di->i_clusters, clusters);
109 	spin_lock(&OCFS2_I(inode)->ip_lock);
110 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
111 	spin_unlock(&OCFS2_I(inode)->ip_lock);
112 }
113 
114 static int ocfs2_dinode_sanity_check(struct inode *inode,
115 				     struct ocfs2_extent_tree *et)
116 {
117 	int ret = 0;
118 	struct ocfs2_dinode *di;
119 
120 	BUG_ON(et->type != OCFS2_DINODE_EXTENT);
121 
122 	di = (struct ocfs2_dinode *)et->root_bh->b_data;
123 	if (!OCFS2_IS_VALID_DINODE(di)) {
124 		ret = -EIO;
125 		ocfs2_error(inode->i_sb,
126 			"Inode %llu has invalid path root",
127 			(unsigned long long)OCFS2_I(inode)->ip_blkno);
128 	}
129 
130 	return ret;
131 }
132 
/* Operations for a b-tree rooted in a dinode's id2.i_list. */
static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
	.set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
	.get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
	.update_clusters	= ocfs2_dinode_update_clusters,
	.sanity_check		= ocfs2_dinode_sanity_check,
};
139 
140 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
141 					      u64 blkno)
142 {
143 	struct ocfs2_xattr_value_root *xv =
144 		(struct ocfs2_xattr_value_root *)et->private;
145 
146 	xv->xr_last_eb_blk = cpu_to_le64(blkno);
147 }
148 
149 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
150 {
151 	struct ocfs2_xattr_value_root *xv =
152 		(struct ocfs2_xattr_value_root *) et->private;
153 
154 	return le64_to_cpu(xv->xr_last_eb_blk);
155 }
156 
157 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
158 					      struct ocfs2_extent_tree *et,
159 					      u32 clusters)
160 {
161 	struct ocfs2_xattr_value_root *xv =
162 		(struct ocfs2_xattr_value_root *)et->private;
163 
164 	le32_add_cpu(&xv->xr_clusters, clusters);
165 }
166 
/* Nothing is validated for an xattr value tree root; always succeeds. */
static int ocfs2_xattr_value_sanity_check(struct inode *inode,
					  struct ocfs2_extent_tree *et)
{
	return 0;
}
172 
/* Operations for a b-tree rooted in an ocfs2_xattr_value_root. */
static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = {
	.set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
	.get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
	.update_clusters	= ocfs2_xattr_value_update_clusters,
	.sanity_check		= ocfs2_xattr_value_sanity_check,
};
179 
180 static struct ocfs2_extent_tree*
181 	 ocfs2_new_extent_tree(struct buffer_head *bh,
182 			       enum ocfs2_extent_tree_type et_type,
183 			       void *private)
184 {
185 	struct ocfs2_extent_tree *et;
186 
187 	et = kzalloc(sizeof(*et), GFP_NOFS);
188 	if (!et)
189 		return NULL;
190 
191 	et->type = et_type;
192 	get_bh(bh);
193 	et->root_bh = bh;
194 	et->private = private;
195 
196 	if (et_type == OCFS2_DINODE_EXTENT) {
197 		et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list;
198 		et->eops = &ocfs2_dinode_et_ops;
199 	} else if (et_type == OCFS2_XATTR_VALUE_EXTENT) {
200 		struct ocfs2_xattr_value_root *xv =
201 			(struct ocfs2_xattr_value_root *) private;
202 		et->root_el = &xv->xr_list;
203 		et->eops = &ocfs2_xattr_et_ops;
204 	}
205 
206 	return et;
207 }
208 
209 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et)
210 {
211 	if (et) {
212 		brelse(et->root_bh);
213 		kfree(et);
214 	}
215 }
216 
/* Dispatch to the root-type-specific last-extent-block setter. */
static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et,
					 u64 new_last_eb_blk)
{
	et->eops->set_last_eb_blk(et, new_last_eb_blk);
}
222 
/* Dispatch to the root-type-specific last-extent-block getter. */
static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
	return et->eops->get_last_eb_blk(et);
}
227 
/* Dispatch to the root-type-specific cluster count update. */
static inline void ocfs2_update_clusters(struct inode *inode,
					 struct ocfs2_extent_tree *et,
					 u32 clusters)
{
	et->eops->update_clusters(inode, et, clusters);
}
234 
235 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
236 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
237 					 struct ocfs2_extent_block *eb);
238 
239 /*
240  * Structures which describe a path through a btree, and functions to
241  * manipulate them.
242  *
243  * The idea here is to be as generic as possible with the tree
244  * manipulation code.
245  */
struct ocfs2_path_item {
	struct buffer_head		*bh;	/* buffer backing this tree level */
	struct ocfs2_extent_list	*el;	/* extent list inside bh */
};

/* Maximum number of tree levels a path can record (root..leaf). */
#define OCFS2_MAX_PATH_DEPTH	5

struct ocfs2_path {
	int			p_tree_depth;	/* depth of the described tree */
	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];	/* [0] is root */
};

/* Index 0 of a path is always the root; index p_tree_depth is the leaf. */
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
263 
264 /*
265  * Reset the actual path elements so that we can re-use the structure
266  * to build another path. Generally, this involves freeing the buffer
267  * heads.
268  */
269 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
270 {
271 	int i, start = 0, depth = 0;
272 	struct ocfs2_path_item *node;
273 
274 	if (keep_root)
275 		start = 1;
276 
277 	for(i = start; i < path_num_items(path); i++) {
278 		node = &path->p_node[i];
279 
280 		brelse(node->bh);
281 		node->bh = NULL;
282 		node->el = NULL;
283 	}
284 
285 	/*
286 	 * Tree depth may change during truncate, or insert. If we're
287 	 * keeping the root extent list, then make sure that our path
288 	 * structure reflects the proper depth.
289 	 */
290 	if (keep_root)
291 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
292 
293 	path->p_tree_depth = depth;
294 }
295 
/* Release all of a path's buffer references and free it.  NULL is ok. */
static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
303 
304 /*
 * Copy all the elements of src into dest. After this call, src could be freed
306  * without affecting dest.
307  *
308  * Both paths should have the same root. Any non-root elements of dest
309  * will be freed.
310  */
311 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
312 {
313 	int i;
314 
315 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
316 	BUG_ON(path_root_el(dest) != path_root_el(src));
317 
318 	ocfs2_reinit_path(dest, 1);
319 
320 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
321 		dest->p_node[i].bh = src->p_node[i].bh;
322 		dest->p_node[i].el = src->p_node[i].el;
323 
324 		if (dest->p_node[i].bh)
325 			get_bh(dest->p_node[i].bh);
326 	}
327 }
328 
329 /*
330  * Make the *dest path the same as src and re-initialize src path to
331  * have a root only.
332  */
333 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
334 {
335 	int i;
336 
337 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
338 
339 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
340 		brelse(dest->p_node[i].bh);
341 
342 		dest->p_node[i].bh = src->p_node[i].bh;
343 		dest->p_node[i].el = src->p_node[i].el;
344 
345 		src->p_node[i].bh = NULL;
346 		src->p_node[i].el = NULL;
347 	}
348 }
349 
350 /*
351  * Insert an extent block at given index.
352  *
353  * This will not take an additional reference on eb_bh.
354  */
static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
					struct buffer_head *eb_bh)
{
	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;

	/*
	 * Right now, no root bh is an extent block, so this helps
	 * catch code errors with dinode trees. The assertion can be
	 * safely removed if we ever need to insert extent block
	 * structures at the root.
	 */
	BUG_ON(index == 0);

	/* Ownership of eb_bh's reference moves into the path here;
	 * ocfs2_reinit_path()/ocfs2_free_path() will brelse it. */
	path->p_node[index].bh = eb_bh;
	path->p_node[index].el = &eb->h_list;
}
371 
372 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
373 					 struct ocfs2_extent_list *root_el)
374 {
375 	struct ocfs2_path *path;
376 
377 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
378 
379 	path = kzalloc(sizeof(*path), GFP_NOFS);
380 	if (path) {
381 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
382 		get_bh(root_bh);
383 		path_root_bh(path) = root_bh;
384 		path_root_el(path) = root_el;
385 	}
386 
387 	return path;
388 }
389 
390 /*
391  * Convenience function to journal all components in a path.
392  */
393 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
394 				     struct ocfs2_path *path)
395 {
396 	int i, ret = 0;
397 
398 	if (!path)
399 		goto out;
400 
401 	for(i = 0; i < path_num_items(path); i++) {
402 		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
403 					   OCFS2_JOURNAL_ACCESS_WRITE);
404 		if (ret < 0) {
405 			mlog_errno(ret);
406 			goto out;
407 		}
408 	}
409 
410 out:
411 	return ret;
412 }
413 
414 /*
415  * Return the index of the extent record which contains cluster #v_cluster.
416  * -1 is returned if it was not found.
417  *
418  * Should work fine on interior and exterior nodes.
419  */
420 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
421 {
422 	int ret = -1;
423 	int i;
424 	struct ocfs2_extent_rec *rec;
425 	u32 rec_end, rec_start, clusters;
426 
427 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
428 		rec = &el->l_recs[i];
429 
430 		rec_start = le32_to_cpu(rec->e_cpos);
431 		clusters = ocfs2_rec_clusters(el, rec);
432 
433 		rec_end = rec_start + clusters;
434 
435 		if (v_cluster >= rec_start && v_cluster < rec_end) {
436 			ret = i;
437 			break;
438 		}
439 	}
440 
441 	return ret;
442 }
443 
/* How an extent record to be inserted relates to an existing record:
 * contiguous on neither side, the left, the right, or both. */
enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,
	CONTIG_RIGHT,
	CONTIG_LEFTRIGHT,
};
450 
451 
452 /*
453  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
454  * ocfs2_extent_contig only work properly against leaf nodes!
455  */
456 static int ocfs2_block_extent_contig(struct super_block *sb,
457 				     struct ocfs2_extent_rec *ext,
458 				     u64 blkno)
459 {
460 	u64 blk_end = le64_to_cpu(ext->e_blkno);
461 
462 	blk_end += ocfs2_clusters_to_blocks(sb,
463 				    le16_to_cpu(ext->e_leaf_clusters));
464 
465 	return blkno == blk_end;
466 }
467 
468 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
469 				  struct ocfs2_extent_rec *right)
470 {
471 	u32 left_range;
472 
473 	left_range = le32_to_cpu(left->e_cpos) +
474 		le16_to_cpu(left->e_leaf_clusters);
475 
476 	return (left_range == le32_to_cpu(right->e_cpos));
477 }
478 
479 static enum ocfs2_contig_type
480 	ocfs2_extent_contig(struct inode *inode,
481 			    struct ocfs2_extent_rec *ext,
482 			    struct ocfs2_extent_rec *insert_rec)
483 {
484 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
485 
486 	/*
487 	 * Refuse to coalesce extent records with different flag
488 	 * fields - we don't want to mix unwritten extents with user
489 	 * data.
490 	 */
491 	if (ext->e_flags != insert_rec->e_flags)
492 		return CONTIG_NONE;
493 
494 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
495 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
496 			return CONTIG_RIGHT;
497 
498 	blkno = le64_to_cpu(ext->e_blkno);
499 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
500 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
501 		return CONTIG_LEFT;
502 
503 	return CONTIG_NONE;
504 }
505 
506 /*
507  * NOTE: We can have pretty much any combination of contiguousness and
508  * appending.
509  *
510  * The usefulness of APPEND_TAIL is more in that it lets us know that
511  * we'll have to update the path to that leaf.
512  */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};

/* Which side of a record an extent split affects (presumably used by the
 * split/merge code later in this file — confirm against callers). */
enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};

/* Describes how a new extent record should be inserted into the tree. */
struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;
	enum ocfs2_append_type	ins_appending;
	enum ocfs2_contig_type	ins_contig;
	int			ins_contig_index;
	int			ins_tree_depth;
};

/* State consulted when deciding whether records can be merged. */
struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;
	int			c_has_empty_extent;
	int			c_split_covers_rec;
};
537 
538 /*
539  * How many free extents have we got before we need more meta data?
540  */
541 int ocfs2_num_free_extents(struct ocfs2_super *osb,
542 			   struct inode *inode,
543 			   struct buffer_head *root_bh,
544 			   enum ocfs2_extent_tree_type type,
545 			   void *private)
546 {
547 	int retval;
548 	struct ocfs2_extent_list *el = NULL;
549 	struct ocfs2_extent_block *eb;
550 	struct buffer_head *eb_bh = NULL;
551 	u64 last_eb_blk = 0;
552 
553 	mlog_entry_void();
554 
555 	if (type == OCFS2_DINODE_EXTENT) {
556 		struct ocfs2_dinode *fe =
557 				(struct ocfs2_dinode *)root_bh->b_data;
558 		if (!OCFS2_IS_VALID_DINODE(fe)) {
559 			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
560 			retval = -EIO;
561 			goto bail;
562 		}
563 
564 		if (fe->i_last_eb_blk)
565 			last_eb_blk = le64_to_cpu(fe->i_last_eb_blk);
566 		el = &fe->id2.i_list;
567 	} else if (type == OCFS2_XATTR_VALUE_EXTENT) {
568 		struct ocfs2_xattr_value_root *xv =
569 			(struct ocfs2_xattr_value_root *) private;
570 
571 		last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk);
572 		el = &xv->xr_list;
573 	}
574 
575 	if (last_eb_blk) {
576 		retval = ocfs2_read_block(osb, last_eb_blk,
577 					  &eb_bh, OCFS2_BH_CACHED, inode);
578 		if (retval < 0) {
579 			mlog_errno(retval);
580 			goto bail;
581 		}
582 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
583 		el = &eb->h_list;
584 	}
585 
586 	BUG_ON(el->l_tree_depth != 0);
587 
588 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
589 bail:
590 	if (eb_bh)
591 		brelse(eb_bh);
592 
593 	mlog_exit(retval);
594 	return retval;
595 }
596 
597 /* expects array to already be allocated
598  *
599  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
600  * l_count for you
601  */
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	/* The suballocator may hand back fewer than wanted-count blocks
	 * per claim, so keep claiming until we have them all. */
	count = 0;
	while (count < wanted) {
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Each claim returns a contiguous run: num_got blocks
		 * starting at first_blkno / suballoc_bit_start. */
		for(i = count;  i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);

			status = ocfs2_journal_access(handle, inode, bhs[i],
						      OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	/* On error, release every buffer we managed to get and NULL the
	 * array so the caller sees a clean slate. */
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			if (bhs[i])
				brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}
685 
686 /*
687  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
688  *
689  * Returns the sum of the rightmost extent rec logical offset and
690  * cluster count.
691  *
692  * ocfs2_add_branch() uses this to determine what logical cluster
693  * value should be populated into the leftmost new branch records.
694  *
695  * ocfs2_shift_tree_depth() uses this to determine the # clusters
696  * value for the new topmost tree record.
697  */
698 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
699 {
700 	int i;
701 
702 	i = le16_to_cpu(el->l_next_free_rec) - 1;
703 
704 	return le32_to_cpu(el->l_recs[i].e_cpos) +
705 		ocfs2_rec_clusters(el, &el->l_recs[i]);
706 }
707 
708 /*
709  * Add an entire tree branch to our inode. eb_bh is the extent block
710  * to start at, if we don't want to start the branch at the dinode
711  * structure.
712  *
 * last_eb_bh is required as we have to update its next_leaf pointer
714  * for the new last extent block.
715  *
716  * the new branch will be 'empty' in the sense that every block will
717  * contain a single record with cluster count == 0.
718  */
719 static int ocfs2_add_branch(struct ocfs2_super *osb,
720 			    handle_t *handle,
721 			    struct inode *inode,
722 			    struct ocfs2_extent_tree *et,
723 			    struct buffer_head *eb_bh,
724 			    struct buffer_head **last_eb_bh,
725 			    struct ocfs2_alloc_context *meta_ac)
726 {
727 	int status, new_blocks, i;
728 	u64 next_blkno, new_last_eb_blk;
729 	struct buffer_head *bh;
730 	struct buffer_head **new_eb_bhs = NULL;
731 	struct ocfs2_extent_block *eb;
732 	struct ocfs2_extent_list  *eb_el;
733 	struct ocfs2_extent_list  *el;
734 	u32 new_cpos;
735 
736 	mlog_entry_void();
737 
738 	BUG_ON(!last_eb_bh || !*last_eb_bh);
739 
740 	if (eb_bh) {
741 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
742 		el = &eb->h_list;
743 	} else
744 		el = et->root_el;
745 
746 	/* we never add a branch to a leaf. */
747 	BUG_ON(!el->l_tree_depth);
748 
749 	new_blocks = le16_to_cpu(el->l_tree_depth);
750 
751 	/* allocate the number of new eb blocks we need */
752 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
753 			     GFP_KERNEL);
754 	if (!new_eb_bhs) {
755 		status = -ENOMEM;
756 		mlog_errno(status);
757 		goto bail;
758 	}
759 
760 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
761 					   meta_ac, new_eb_bhs);
762 	if (status < 0) {
763 		mlog_errno(status);
764 		goto bail;
765 	}
766 
767 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
768 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
769 
770 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
771 	 * linked with the rest of the tree.
772 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
773 	 *
774 	 * when we leave the loop, new_last_eb_blk will point to the
775 	 * newest leaf, and next_blkno will point to the topmost extent
776 	 * block. */
777 	next_blkno = new_last_eb_blk = 0;
778 	for(i = 0; i < new_blocks; i++) {
779 		bh = new_eb_bhs[i];
780 		eb = (struct ocfs2_extent_block *) bh->b_data;
781 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
782 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
783 			status = -EIO;
784 			goto bail;
785 		}
786 		eb_el = &eb->h_list;
787 
788 		status = ocfs2_journal_access(handle, inode, bh,
789 					      OCFS2_JOURNAL_ACCESS_CREATE);
790 		if (status < 0) {
791 			mlog_errno(status);
792 			goto bail;
793 		}
794 
795 		eb->h_next_leaf_blk = 0;
796 		eb_el->l_tree_depth = cpu_to_le16(i);
797 		eb_el->l_next_free_rec = cpu_to_le16(1);
798 		/*
799 		 * This actually counts as an empty extent as
800 		 * c_clusters == 0
801 		 */
802 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
803 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
804 		/*
805 		 * eb_el isn't always an interior node, but even leaf
806 		 * nodes want a zero'd flags and reserved field so
807 		 * this gets the whole 32 bits regardless of use.
808 		 */
809 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
810 		if (!eb_el->l_tree_depth)
811 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
812 
813 		status = ocfs2_journal_dirty(handle, bh);
814 		if (status < 0) {
815 			mlog_errno(status);
816 			goto bail;
817 		}
818 
819 		next_blkno = le64_to_cpu(eb->h_blkno);
820 	}
821 
822 	/* This is a bit hairy. We want to update up to three blocks
823 	 * here without leaving any of them in an inconsistent state
824 	 * in case of error. We don't have to worry about
825 	 * journal_dirty erroring as it won't unless we've aborted the
826 	 * handle (in which case we would never be here) so reserving
827 	 * the write with journal_access is all we need to do. */
828 	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
829 				      OCFS2_JOURNAL_ACCESS_WRITE);
830 	if (status < 0) {
831 		mlog_errno(status);
832 		goto bail;
833 	}
834 	status = ocfs2_journal_access(handle, inode, et->root_bh,
835 				      OCFS2_JOURNAL_ACCESS_WRITE);
836 	if (status < 0) {
837 		mlog_errno(status);
838 		goto bail;
839 	}
840 	if (eb_bh) {
841 		status = ocfs2_journal_access(handle, inode, eb_bh,
842 					      OCFS2_JOURNAL_ACCESS_WRITE);
843 		if (status < 0) {
844 			mlog_errno(status);
845 			goto bail;
846 		}
847 	}
848 
849 	/* Link the new branch into the rest of the tree (el will
850 	 * either be on the root_bh, or the extent block passed in. */
851 	i = le16_to_cpu(el->l_next_free_rec);
852 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
853 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
854 	el->l_recs[i].e_int_clusters = 0;
855 	le16_add_cpu(&el->l_next_free_rec, 1);
856 
857 	/* fe needs a new last extent block pointer, as does the
858 	 * next_leaf on the previously last-extent-block. */
859 	ocfs2_set_last_eb_blk(et, new_last_eb_blk);
860 
861 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
862 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
863 
864 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
865 	if (status < 0)
866 		mlog_errno(status);
867 	status = ocfs2_journal_dirty(handle, et->root_bh);
868 	if (status < 0)
869 		mlog_errno(status);
870 	if (eb_bh) {
871 		status = ocfs2_journal_dirty(handle, eb_bh);
872 		if (status < 0)
873 			mlog_errno(status);
874 	}
875 
876 	/*
877 	 * Some callers want to track the rightmost leaf so pass it
878 	 * back here.
879 	 */
880 	brelse(*last_eb_bh);
881 	get_bh(new_eb_bhs[0]);
882 	*last_eb_bh = new_eb_bhs[0];
883 
884 	status = 0;
885 bail:
886 	if (new_eb_bhs) {
887 		for (i = 0; i < new_blocks; i++)
888 			if (new_eb_bhs[i])
889 				brelse(new_eb_bhs[i]);
890 		kfree(new_eb_bhs);
891 	}
892 
893 	mlog_exit(status);
894 	return status;
895 }
896 
897 /*
898  * adds another level to the allocation tree.
899  * returns back the new extent block so you can add a branch to it
900  * after this call.
901  */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *root_el;
	struct ocfs2_extent_list  *eb_el;

	mlog_entry_void();

	/* Allocate exactly one new extent block; the root's current
	 * contents will migrate into it. */
	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		status = -EIO;
		goto bail;
	}

	eb_el = &eb->h_list;
	root_el = et->root_el;

	status = ocfs2_journal_access(handle, inode, new_eb_bh,
				      OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the root extent list data into the new extent block */
	eb_el->l_tree_depth = root_el->l_tree_depth;
	eb_el->l_next_free_rec = root_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = root_el->l_recs[i];

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, inode, et->root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update root_bh now: the root becomes one level deeper and
	 * holds a single record covering everything, pointing at the
	 * new extent block. */
	le16_add_cpu(&root_el->l_tree_depth, 1);
	root_el->l_recs[0].e_cpos = 0;
	root_el->l_recs[0].e_blkno = eb->h_blkno;
	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	root_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (root_el->l_tree_depth == cpu_to_le16(1))
		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

	status = ocfs2_journal_dirty(handle, et->root_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* Hand the new block's reference to the caller. */
	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	if (new_eb_bh)
		brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}
993 
994 /*
995  * Should only be called when there is no space left in any of the
996  * leaf nodes. What we want to do is find the lowest tree depth
997  * non-leaf extent block with room for new records. There are three
998  * valid results of this search:
999  *
1000  * 1) a lowest extent block is found, then we pass it back in
1001  *    *lowest_eb_bh and return '0'
1002  *
1003  * 2) the search fails to find anything, but the root_el has room. We
1004  *    pass NULL back in *lowest_eb_bh, but still return '0'
1005  *
1006  * 3) the search fails to find anything AND the root_el is full, in
1007  *    which case we return > 0
1008  *
1009  * return status < 0 indicates an error.
1010  */
static int ocfs2_find_branch_target(struct ocfs2_super *osb,
				    struct inode *inode,
				    struct ocfs2_extent_tree *et,
				    struct buffer_head **target_bh)
{
	int status = 0, i;
	u64 blkno;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *el;
	struct buffer_head *bh = NULL;
	struct buffer_head *lowest_bh = NULL;

	mlog_entry_void();

	*target_bh = NULL;

	el = et->root_el;

	/* Walk down the rightmost edge of the tree, remembering the
	 * lowest (deepest) interior node that still has a free record
	 * slot.  Stop above the leaves: depth-1 nodes point at leaves,
	 * which by assumption are full. */
	while(le16_to_cpu(el->l_tree_depth) > 1) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
				    "extent list (next_free_rec == 0)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
			status = -EIO;
			goto bail;
		}
		/* Rightmost record gives the path toward the rightmost leaf. */
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (!blkno) {
			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
				    "list where extent # %d has no physical "
				    "block start",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
			status = -EIO;
			goto bail;
		}

		/* Drop the previous level's buffer before reading the next. */
		if (bh) {
			brelse(bh);
			bh = NULL;
		}

		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
					  inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			status = -EIO;
			goto bail;
		}
		el = &eb->h_list;

		/* This node has room; it becomes the current best
		 * (lowest) candidate.  Keep an extra reference on it. */
		if (le16_to_cpu(el->l_next_free_rec) <
		    le16_to_cpu(el->l_count)) {
			if (lowest_bh)
				brelse(lowest_bh);
			lowest_bh = bh;
			get_bh(lowest_bh);
		}
	}

	/* If we didn't find one and the fe doesn't have any room,
	 * then return '1' */
	el = et->root_el;
	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
		status = 1;

	*target_bh = lowest_bh;
bail:
	if (bh)
		brelse(bh);

	mlog_exit(status);
	return status;
}
1091 
1092 /*
1093  * Grow a b-tree so that it has more records.
1094  *
1095  * We might shift the tree depth in which case existing paths should
1096  * be considered invalid.
1097  *
1098  * Tree depth after the grow is returned via *final_depth.
1099  *
1100  * *last_eb_bh will be updated by ocfs2_add_branch().
1101  */
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
			   struct ocfs2_extent_tree *et, int *final_depth,
			   struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_extent_list *el = et->root_el;
	int depth = le16_to_cpu(el->l_tree_depth);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *bh = NULL;

	/* Growing the tree always needs metadata allocations. */
	BUG_ON(meta_ac == NULL);

	/*
	 * shift > 0: no block on the rightmost branch (nor the root)
	 * had room, so the tree must get deeper. shift == 0: bh, if
	 * non-NULL, is the lowest block with a free record slot.
	 */
	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		mlog(0, "need to shift tree depth (current = %d)\n", depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
					     meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf. At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "add branch. bh = %p\n", bh);
	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out:
	/* Report the (possibly increased) depth even on error paths. */
	if (final_depth)
		*final_depth = depth;
	brelse(bh);
	return ret;
}
1172 
/*
 * Shift every record in the list one slot to the right, so that slot
 * 0 can be reused (its old contents are left for the caller to
 * overwrite). Callers that want to drop the rightmost record
 * temporarily decrement l_next_free_rec before calling.
 */
1176 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1177 {
1178 	int next_free = le16_to_cpu(el->l_next_free_rec);
1179 	int count = le16_to_cpu(el->l_count);
1180 	unsigned int num_bytes;
1181 
1182 	BUG_ON(!next_free);
1183 	/* This will cause us to go off the end of our extent list. */
1184 	BUG_ON(next_free >= count);
1185 
1186 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1187 
1188 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1189 }
1190 
/*
 * Insert *insert_rec into leaf list 'el' at the position keeping the
 * records sorted by e_cpos, consuming the empty extent in slot 0 if
 * one is present.
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement above to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		/* Insert before the first record that starts past us. */
		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;

}
1268 
1269 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1270 {
1271 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1272 
1273 	BUG_ON(num_recs == 0);
1274 
1275 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1276 		num_recs--;
1277 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1278 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1279 		memset(&el->l_recs[num_recs], 0,
1280 		       sizeof(struct ocfs2_extent_rec));
1281 		el->l_next_free_rec = cpu_to_le16(num_recs);
1282 	}
1283 }
1284 
1285 /*
1286  * Create an empty extent record .
1287  *
1288  * l_next_free_rec may be updated.
1289  *
1290  * If an empty extent already exists do nothing.
1291  */
1292 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1293 {
1294 	int next_free = le16_to_cpu(el->l_next_free_rec);
1295 
1296 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1297 
1298 	if (next_free == 0)
1299 		goto set_and_inc;
1300 
1301 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1302 		return;
1303 
1304 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1305 			"Asked to create an empty extent in a full list:\n"
1306 			"count = %u, tree depth = %u",
1307 			le16_to_cpu(el->l_count),
1308 			le16_to_cpu(el->l_tree_depth));
1309 
1310 	ocfs2_shift_records_right(el);
1311 
1312 set_and_inc:
1313 	le16_add_cpu(&el->l_next_free_rec, 1);
1314 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1315 }
1316 
1317 /*
1318  * For a rotation which involves two leaf nodes, the "root node" is
1319  * the lowest level tree node which contains a path to both leafs. This
1320  * resulting set of information can be used to form a complete "subtree"
1321  *
1322  * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
1324  * index contains the subtree root - this can be the root index itself
1325  * in a worst-case rotation.
1326  *
1327  * The array index of the subtree root is passed back.
1328  */
1329 static int ocfs2_find_subtree_root(struct inode *inode,
1330 				   struct ocfs2_path *left,
1331 				   struct ocfs2_path *right)
1332 {
1333 	int i = 0;
1334 
1335 	/*
1336 	 * Check that the caller passed in two paths from the same tree.
1337 	 */
1338 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1339 
1340 	do {
1341 		i++;
1342 
1343 		/*
1344 		 * The caller didn't pass two adjacent paths.
1345 		 */
1346 		mlog_bug_on_msg(i > left->p_tree_depth,
1347 				"Inode %lu, left depth %u, right depth %u\n"
1348 				"left leaf blk %llu, right leaf blk %llu\n",
1349 				inode->i_ino, left->p_tree_depth,
1350 				right->p_tree_depth,
1351 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1352 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1353 	} while (left->p_node[i].bh->b_blocknr ==
1354 		 right->p_node[i].bh->b_blocknr);
1355 
1356 	return i - 1;
1357 }
1358 
1359 typedef void (path_insert_t)(void *, struct buffer_head *);
1360 
1361 /*
1362  * Traverse a btree path in search of cpos, starting at root_el.
1363  *
1364  * This code can be called with a cpos larger than the tree, in which
1365  * case it will return the rightmost path.
1366  */
static int __ocfs2_find_path(struct inode *inode,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	el = root_el;
	/* Descend one level per iteration until we reach a leaf. */
	while (el->l_tree_depth) {
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;

		}

		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
			    break;
		}

		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Done with the previous level's block. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
				       &bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			ret = -EIO;
			goto out;
		}

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)oi->ip_blkno,
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Hand every block we visit to the caller's callback. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}
1460 
1461 /*
1462  * Given an initialized path (that is, it has a valid root extent
1463  * list), this function will traverse the btree in search of the path
1464  * which would contain cpos.
1465  *
1466  * The path traveled is recorded in the path structure.
1467  *
1468  * Note that this will not do any comparisons on leaf node extent
1469  * records, so it will work fine in the case that we just added a tree
1470  * branch.
1471  */
struct find_path_data {
	int index;
	struct ocfs2_path *path;
};

/* Append each visited block to the path, taking a reference on it. */
static void find_path_ins(void *data, struct buffer_head *bh)
{
	struct find_path_data *fp = data;

	get_bh(bh);
	ocfs2_path_insert_eb(fp->path, fp->index++, bh);
}
1484 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1485 			   u32 cpos)
1486 {
1487 	struct find_path_data data;
1488 
1489 	data.index = 1;
1490 	data.path = path;
1491 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1492 				 find_path_ins, &data);
1493 }
1494 
1495 static void find_leaf_ins(void *data, struct buffer_head *bh)
1496 {
1497 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1498 	struct ocfs2_extent_list *el = &eb->h_list;
1499 	struct buffer_head **ret = data;
1500 
1501 	/* We want to retain only the leaf block. */
1502 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1503 		get_bh(bh);
1504 		*ret = bh;
1505 	}
1506 }
1507 /*
1508  * Find the leaf block in the tree which would contain cpos. No
1509  * checking of the actual leaf is done.
1510  *
1511  * Some paths want to call this instead of allocating a path structure
1512  * and calling ocfs2_find_path().
1513  *
1514  * This function doesn't handle non btree extent lists.
1515  */
1516 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1517 		    u32 cpos, struct buffer_head **leaf_bh)
1518 {
1519 	int ret;
1520 	struct buffer_head *bh = NULL;
1521 
1522 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1523 	if (ret) {
1524 		mlog_errno(ret);
1525 		goto out;
1526 	}
1527 
1528 	*leaf_bh = bh;
1529 out:
1530 	return ret;
1531 }
1532 
1533 /*
1534  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1535  *
1536  * Basically, we've moved stuff around at the bottom of the tree and
1537  * we need to fix up the extent records above the changes to reflect
1538  * the new changes.
1539  *
1540  * left_rec: the record on the left.
1541  * left_child_el: is the child list pointed to by left_rec
1542  * right_rec: the record to the right of left_rec
1543  * right_child_el: is the child list pointed to by right_rec
1544  *
1545  * By definition, this only works on interior nodes.
1546  */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
				  struct ocfs2_extent_list *left_child_el,
				  struct ocfs2_extent_rec *right_rec,
				  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
		/*
		 * Slot 0 is an empty placeholder - the first real
		 * cpos lives in slot 1.
		 */
		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
	}
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	/* The right record now begins where the left record ends. */
	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}
1583 
1584 /*
1585  * Adjust the adjacent root node records involved in a
1586  * rotation. left_el_blkno is passed in as a key so that we can easily
 * find its index in the root list.
1588  */
1589 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1590 				      struct ocfs2_extent_list *left_el,
1591 				      struct ocfs2_extent_list *right_el,
1592 				      u64 left_el_blkno)
1593 {
1594 	int i;
1595 
1596 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1597 	       le16_to_cpu(left_el->l_tree_depth));
1598 
1599 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1600 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1601 			break;
1602 	}
1603 
1604 	/*
1605 	 * The path walking code should have never returned a root and
1606 	 * two paths which are not adjacent.
1607 	 */
1608 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1609 
1610 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1611 				      &root_el->l_recs[i + 1], right_el);
1612 }
1613 
1614 /*
1615  * We've changed a leaf block (in right_path) and need to reflect that
1616  * change back up the subtree.
1617  *
1618  * This happens in multiple places:
1619  *   - When we've moved an extent record from the left path leaf to the right
1620  *     path leaf to make room for an empty extent in the left path leaf.
1621  *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to its left. This
1623  *     can occur at the end of some types of rotation and appending inserts.
1624  *   - When we've adjusted the last extent record in the left path leaf and the
1625  *     1st extent record in the right path leaf during cross extent block merge.
1626  */
1627 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1628 				       struct ocfs2_path *left_path,
1629 				       struct ocfs2_path *right_path,
1630 				       int subtree_index)
1631 {
1632 	int ret, i, idx;
1633 	struct ocfs2_extent_list *el, *left_el, *right_el;
1634 	struct ocfs2_extent_rec *left_rec, *right_rec;
1635 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1636 
1637 	/*
1638 	 * Update the counts and position values within all the
1639 	 * interior nodes to reflect the leaf rotation we just did.
1640 	 *
1641 	 * The root node is handled below the loop.
1642 	 *
1643 	 * We begin the loop with right_el and left_el pointing to the
1644 	 * leaf lists and work our way up.
1645 	 *
1646 	 * NOTE: within this loop, left_el and right_el always refer
1647 	 * to the *child* lists.
1648 	 */
1649 	left_el = path_leaf_el(left_path);
1650 	right_el = path_leaf_el(right_path);
1651 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
1652 		mlog(0, "Adjust records at index %u\n", i);
1653 
1654 		/*
1655 		 * One nice property of knowing that all of these
1656 		 * nodes are below the root is that we only deal with
1657 		 * the leftmost right node record and the rightmost
1658 		 * left node record.
1659 		 */
1660 		el = left_path->p_node[i].el;
1661 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
1662 		left_rec = &el->l_recs[idx];
1663 
1664 		el = right_path->p_node[i].el;
1665 		right_rec = &el->l_recs[0];
1666 
1667 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
1668 					      right_el);
1669 
1670 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
1671 		if (ret)
1672 			mlog_errno(ret);
1673 
1674 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
1675 		if (ret)
1676 			mlog_errno(ret);
1677 
1678 		/*
1679 		 * Setup our list pointers now so that the current
1680 		 * parents become children in the next iteration.
1681 		 */
1682 		left_el = left_path->p_node[i].el;
1683 		right_el = right_path->p_node[i].el;
1684 	}
1685 
1686 	/*
1687 	 * At the root node, adjust the two adjacent records which
1688 	 * begin our path to the leaves.
1689 	 */
1690 
1691 	el = left_path->p_node[subtree_index].el;
1692 	left_el = left_path->p_node[subtree_index + 1].el;
1693 	right_el = right_path->p_node[subtree_index + 1].el;
1694 
1695 	ocfs2_adjust_root_records(el, left_el, right_el,
1696 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
1697 
1698 	root_bh = left_path->p_node[subtree_index].bh;
1699 
1700 	ret = ocfs2_journal_dirty(handle, root_bh);
1701 	if (ret)
1702 		mlog_errno(ret);
1703 }
1704 
/*
 * Move the rightmost record of the left leaf into the leftmost slot
 * of the right leaf, leaving an empty extent at index 0 of the left
 * leaf, then fix up the interior records above both leaves.
 */
static int ocfs2_rotate_subtree_right(struct inode *inode,
				      handle_t *handle,
				      struct ocfs2_path *left_path,
				      struct ocfs2_path *right_path,
				      int subtree_index)
{
	int ret, i;
	struct buffer_head *right_leaf_bh;
	struct buffer_head *left_leaf_bh = NULL;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *right_el, *left_el;
	struct ocfs2_extent_rec move_rec;

	left_leaf_bh = path_leaf_bh(left_path);
	left_el = path_leaf_el(left_path);

	/* We only rotate out of full leaves - anything else is corruption. */
	if (left_el->l_next_free_rec != left_el->l_count) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has non-full interior leaf node %llu"
			    "(next free = %u)",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)left_leaf_bh->b_blocknr,
			    le16_to_cpu(left_el->l_next_free_rec));
		return -EROFS;
	}

	/*
	 * This extent block may already have an empty record, so we
	 * return early if so.
	 */
	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
		return 0;

	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	/*
	 * Get journal write access on every block we will modify: the
	 * subtree root and all blocks below it on both paths.
	 */
	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	right_leaf_bh = path_leaf_bh(right_path);
	right_el = path_leaf_el(right_path);

	/* This is a code error, not a disk corruption. */
	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
			"because rightmost leaf block %llu is empty\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)right_leaf_bh->b_blocknr);

	/* Open up slot 0 of the right leaf to receive the record. */
	ocfs2_create_empty_extent(right_el);

	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Do the copy now. */
	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
	move_rec = left_el->l_recs[i];
	right_el->l_recs[0] = move_rec;

	/*
	 * Clear out the record we just copied and shift everything
	 * over, leaving an empty extent in the left leaf.
	 *
	 * We temporarily subtract from next_free_rec so that the
	 * shift will lose the tail record (which is now defunct).
	 */
	le16_add_cpu(&left_el->l_next_free_rec, -1);
	ocfs2_shift_records_right(left_el);
	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&left_el->l_next_free_rec, 1);

	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Propagate the new leaf boundaries up through the interior nodes. */
	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
				subtree_index);

out:
	return ret;
}
1812 
1813 /*
1814  * Given a full path, determine what cpos value would return us a path
1815  * containing the leaf immediately to the left of the current one.
1816  *
1817  * Will return zero if the path passed in is already the leftmost path.
1818  */
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
					 struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	BUG_ON(path->p_tree_depth == 0);

	*cpos = 0;

	/* Block we descended from; updated as we climb. */
	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/* Last cluster covered by the record to our
				 * left: its cpos plus its length, minus 1. */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}
1887 
1888 /*
1889  * Extend the transaction by enough credits to complete the rotation,
1890  * and still leave at least the original number of credits allocated
1891  * to this transaction.
1892  */
1893 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1894 					   int op_credits,
1895 					   struct ocfs2_path *path)
1896 {
1897 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1898 
1899 	if (handle->h_buffer_credits < credits)
1900 		return ocfs2_extend_trans(handle, credits);
1901 
1902 	return 0;
1903 }
1904 
1905 /*
1906  * Trap the case where we're inserting into the theoretical range past
1907  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1908  * whose cpos is less than ours into the right leaf.
1909  *
1910  * It's only necessary to look at the rightmost record of the left
1911  * leaf because the logic that calls us should ensure that the
1912  * theoretical ranges in the path components above the leaves are
1913  * correct.
1914  */
1915 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1916 						 u32 insert_cpos)
1917 {
1918 	struct ocfs2_extent_list *left_el;
1919 	struct ocfs2_extent_rec *rec;
1920 	int next_free;
1921 
1922 	left_el = path_leaf_el(left_path);
1923 	next_free = le16_to_cpu(left_el->l_next_free_rec);
1924 	rec = &left_el->l_recs[next_free - 1];
1925 
1926 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
1927 		return 1;
1928 	return 0;
1929 }
1930 
1931 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1932 {
1933 	int next_free = le16_to_cpu(el->l_next_free_rec);
1934 	unsigned int range;
1935 	struct ocfs2_extent_rec *rec;
1936 
1937 	if (next_free == 0)
1938 		return 0;
1939 
1940 	rec = &el->l_recs[0];
1941 	if (ocfs2_is_empty_extent(rec)) {
1942 		/* Empty list. */
1943 		if (next_free == 1)
1944 			return 0;
1945 		rec = &el->l_recs[1];
1946 	}
1947 
1948 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1949 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1950 		return 1;
1951 	return 0;
1952 }
1953 
1954 /*
1955  * Rotate all the records in a btree right one record, starting at insert_cpos.
1956  *
1957  * The path to the rightmost leaf should be passed in.
1958  *
1959  * The array is assumed to be large enough to hold an entire path (tree depth).
1960  *
 * Upon successful return from this function:
1962  *
1963  * - The 'right_path' array will contain a path to the leaf block
1964  *   whose range contains e_cpos.
1965  * - That leaf block will have a single empty extent in list index 0.
1966  * - In the case that the rotation requires a post-insert update,
1967  *   *ret_left_path will contain a valid path which can be passed to
1968  *   ocfs2_insert_path().
1969  */
static int ocfs2_rotate_tree_right(struct inode *inode,
				   handle_t *handle,
				   enum ocfs2_split_type split,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	int ret, start, orig_credits = handle->h_buffer_credits;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/* left_path shares right_path's root block and root list. */
	left_path = ocfs2_new_path(path_root_bh(right_path),
				   path_root_el(right_path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
		     insert_cpos, cpos);

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Inode %lu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				inode->i_ino, insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (split == SPLIT_NONE &&
		    ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(inode, left_path, right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     start,
		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
		     right_path->p_tree_depth);

		/*
		 * Make sure the journal handle still has at least the
		 * caller's original credits on top of what this
		 * subtree rotation will consume.
		 */
		ret = ocfs2_extend_rotate_transaction(handle, start,
						      orig_credits, right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (split != SPLIT_NONE &&
		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
						insert_cpos)) {
			/*
			 * A rotate moves the rightmost left leaf
			 * record over to the leftmost right leaf
			 * slot. If we're doing an extent split
			 * instead of a real insert, then we have to
			 * check that the extent to be split wasn't
			 * just moved over. If it was, then we can
			 * exit here, passing left_path back -
			 * ocfs2_split_extent() is smart enough to
			 * search both leaves.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(left_path);

out_ret_path:
	/* When returning via *ret_left_path, the caller frees the path. */
	return ret;
}
2126 
/*
 * Recompute the interior record lengths along the rightmost edge of
 * the tree after the rightmost leaf has changed (e.g. after a subtree
 * delete).
 *
 * @path must be the rightmost path - we BUG if the leaf has a next
 * leaf pointer. For each interior node on the path, the last record's
 * e_int_clusters is recomputed so that it extends exactly to the end
 * of the last leaf record.
 */
static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
				      struct ocfs2_path *path)
{
	int i, idx;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	u32 range;

	/* Path should always be rightmost. */
	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	BUG_ON(eb->h_next_leaf_blk != 0ULL);

	el = &eb->h_list;
	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
	idx = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[idx];
	/* cpos just past the final leaf record - the end of the tree. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	for (i = 0; i < path->p_tree_depth; i++) {
		el = path->p_node[i].el;
		idx = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[idx];

		/* e_int_clusters = range - e_cpos, computed in LE form. */
		rec->e_int_clusters = cpu_to_le32(range);
		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));

		ocfs2_journal_dirty(handle, path->p_node[i].bh);
	}
}
2157 
/*
 * Remove the extent blocks on @path from index @unlink_start down to
 * the leaf, queueing each one on @dealloc for later freeing.
 *
 * A block which still holds more than one record is logged as an
 * error and skipped (it is only removed from the uptodate cache, not
 * queued for free) - we refuse to throw away live records.
 */
static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      struct ocfs2_path *path, int unlink_start)
{
	int ret, i;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh;

	for(i = unlink_start; i < path_num_items(path); i++) {
		bh = path->p_node[i].bh;

		eb = (struct ocfs2_extent_block *)bh->b_data;
		/*
		 * Not all nodes might have had their final count
		 * decremented by the caller - handle this here.
		 */
		el = &eb->h_list;
		if (le16_to_cpu(el->l_next_free_rec) > 1) {
			mlog(ML_ERROR,
			     "Inode %llu, attempted to remove extent block "
			     "%llu with %u records\n",
			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
			     (unsigned long long)le64_to_cpu(eb->h_blkno),
			     le16_to_cpu(el->l_next_free_rec));

			ocfs2_journal_dirty(handle, bh);
			ocfs2_remove_from_cache(inode, bh);
			continue;
		}

		/* Empty the block on disk before queueing it for free. */
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_journal_dirty(handle, bh);

		/* Best-effort: a failure here leaks the block but is
		 * otherwise harmless, so just log it. */
		ret = ocfs2_cache_extent_block_free(dealloc, eb);
		if (ret)
			mlog_errno(ret);

		ocfs2_remove_from_cache(inode, bh);
	}
}
2201 
2202 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2203 				 struct ocfs2_path *left_path,
2204 				 struct ocfs2_path *right_path,
2205 				 int subtree_index,
2206 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2207 {
2208 	int i;
2209 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2210 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2211 	struct ocfs2_extent_list *el;
2212 	struct ocfs2_extent_block *eb;
2213 
2214 	el = path_leaf_el(left_path);
2215 
2216 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2217 
2218 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2219 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2220 			break;
2221 
2222 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2223 
2224 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2225 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2226 
2227 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2228 	eb->h_next_leaf_blk = 0;
2229 
2230 	ocfs2_journal_dirty(handle, root_bh);
2231 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2232 
2233 	ocfs2_unlink_path(inode, handle, dealloc, right_path,
2234 			  subtree_index + 1);
2235 }
2236 
/*
 * Rotate one record leftward across the subtree shared by @left_path
 * and @right_path (rooted at node @subtree_index): the empty extent
 * at the head of the left leaf is filled with the first record of the
 * right leaf.
 *
 * If the right leaf is the rightmost one and this drains its last
 * record, the whole right subtree is unlinked and queued on @dealloc,
 * *deleted is set to 1 and i_last_eb_blk is pointed at the left leaf.
 *
 * Returns -EAGAIN when the right leaf is non-rightmost but starts
 * with an empty extent - the caller must fix that path up and retry.
 */
static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
				     struct ocfs2_path *left_path,
				     struct ocfs2_path *right_path,
				     int subtree_index,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int *deleted,
				     struct ocfs2_extent_tree *et)
{
	int ret, i, del_right_subtree = 0, right_has_empty = 0;
	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
	struct ocfs2_extent_block *eb;

	*deleted = 0;

	right_leaf_el = path_leaf_el(right_path);
	left_leaf_el = path_leaf_el(left_path);
	root_bh = left_path->p_node[subtree_index].bh;
	/* Both paths must share the same subtree root block. */
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	/* Nothing to rotate into - the left leaf has no empty extent. */
	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
		return 0;

	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
		/*
		 * It's legal for us to proceed if the right leaf is
		 * the rightmost one and it has an empty extent. There
		 * are two cases to handle - whether the leaf will be
		 * empty after removal or not. If the leaf isn't empty
		 * then just remove the empty extent up front. The
		 * next block will handle empty leaves by flagging
		 * them for unlink.
		 *
		 * Non rightmost leaves will throw -EAGAIN and the
		 * caller can manually move the subtree and retry.
		 */

		if (eb->h_next_leaf_blk != 0ULL)
			return -EAGAIN;

		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
			ret = ocfs2_journal_access(handle, inode,
						   path_leaf_bh(right_path),
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ocfs2_remove_empty_extent(right_leaf_el);
		} else
			right_has_empty = 1;
	}

	if (eb->h_next_leaf_blk == 0ULL &&
	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
		/*
		 * We have to update i_last_eb_blk during the meta
		 * data delete.
		 */
		ret = ocfs2_journal_access(handle, inode, et_root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		del_right_subtree = 1;
	}

	/*
	 * Getting here with an empty extent in the right path implies
	 * that it's the rightmost path and will be deleted.
	 */
	BUG_ON(right_has_empty && !del_right_subtree);

	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Get journal access on every node below the subtree root,
	 * on both paths, before modifying anything. */
	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (!right_has_empty) {
		/*
		 * Only do this if we're moving a real
		 * record. Otherwise, the action is delayed until
		 * after removal of the right path in which case we
		 * can do a simple shift to remove the empty extent.
		 */
		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
		memset(&right_leaf_el->l_recs[0], 0,
		       sizeof(struct ocfs2_extent_rec));
	}
	if (eb->h_next_leaf_blk == 0ULL) {
		/*
		 * Move recs over to get rid of empty extent, decrease
		 * next_free. This is allowed to remove the last
		 * extent in our leaf (setting l_next_free_rec to
		 * zero) - the delete code below won't care.
		 */
		ocfs2_remove_empty_extent(right_leaf_el);
	}

	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
	if (ret)
		mlog_errno(ret);
	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
	if (ret)
		mlog_errno(ret);

	if (del_right_subtree) {
		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* The left leaf is the new rightmost leaf. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

		/*
		 * Removal of the extent in the left leaf was skipped
		 * above so we could delete the right path
		 * 1st.
		 */
		if (right_has_empty)
			ocfs2_remove_empty_extent(left_leaf_el);

		ret = ocfs2_journal_dirty(handle, et_root_bh);
		if (ret)
			mlog_errno(ret);

		*deleted = 1;
	} else
		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
					   subtree_index);

out:
	return ret;
}
2395 
2396 /*
2397  * Given a full path, determine what cpos value would return us a path
2398  * containing the leaf immediately to the right of the current one.
2399  *
2400  * Will return zero if the path passed in is already the rightmost path.
2401  *
2402  * This looks similar, but is subtly different to
2403  * ocfs2_find_cpos_for_left_leaf().
2404  */
2405 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2406 					  struct ocfs2_path *path, u32 *cpos)
2407 {
2408 	int i, j, ret = 0;
2409 	u64 blkno;
2410 	struct ocfs2_extent_list *el;
2411 
2412 	*cpos = 0;
2413 
2414 	if (path->p_tree_depth == 0)
2415 		return 0;
2416 
2417 	blkno = path_leaf_bh(path)->b_blocknr;
2418 
2419 	/* Start at the tree node just above the leaf and work our way up. */
2420 	i = path->p_tree_depth - 1;
2421 	while (i >= 0) {
2422 		int next_free;
2423 
2424 		el = path->p_node[i].el;
2425 
2426 		/*
2427 		 * Find the extent record just after the one in our
2428 		 * path.
2429 		 */
2430 		next_free = le16_to_cpu(el->l_next_free_rec);
2431 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2432 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2433 				if (j == (next_free - 1)) {
2434 					if (i == 0) {
2435 						/*
2436 						 * We've determined that the
2437 						 * path specified is already
2438 						 * the rightmost one - return a
2439 						 * cpos of zero.
2440 						 */
2441 						goto out;
2442 					}
2443 					/*
2444 					 * The rightmost record points to our
2445 					 * leaf - we need to travel up the
2446 					 * tree one level.
2447 					 */
2448 					goto next_node;
2449 				}
2450 
2451 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2452 				goto out;
2453 			}
2454 		}
2455 
2456 		/*
2457 		 * If we got here, we never found a valid node where
2458 		 * the tree indicated one should be.
2459 		 */
2460 		ocfs2_error(sb,
2461 			    "Invalid extent tree at extent block %llu\n",
2462 			    (unsigned long long)blkno);
2463 		ret = -EROFS;
2464 		goto out;
2465 
2466 next_node:
2467 		blkno = path->p_node[i].bh->b_blocknr;
2468 		i--;
2469 	}
2470 
2471 out:
2472 	return ret;
2473 }
2474 
2475 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2476 					    handle_t *handle,
2477 					    struct buffer_head *bh,
2478 					    struct ocfs2_extent_list *el)
2479 {
2480 	int ret;
2481 
2482 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2483 		return 0;
2484 
2485 	ret = ocfs2_journal_access(handle, inode, bh,
2486 				   OCFS2_JOURNAL_ACCESS_WRITE);
2487 	if (ret) {
2488 		mlog_errno(ret);
2489 		goto out;
2490 	}
2491 
2492 	ocfs2_remove_empty_extent(el);
2493 
2494 	ret = ocfs2_journal_dirty(handle, bh);
2495 	if (ret)
2496 		mlog_errno(ret);
2497 
2498 out:
2499 	return ret;
2500 }
2501 
/*
 * Worker for left rotation: starting from @path (whose leaf must
 * begin with an empty extent), repeatedly rotate records left across
 * each subtree to the right until the empty extent is absorbed or the
 * rightmost edge deletes records (subtree rotate sets 'deleted').
 *
 * On -EAGAIN from the subtree rotate, ownership of the offending
 * right path is handed to the caller via *empty_extent_path for
 * fixup and restart.
 */
static int __ocfs2_rotate_tree_left(struct inode *inode,
				    handle_t *handle, int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path,
				    struct ocfs2_extent_tree *et)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;

	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));

	*empty_extent_path = NULL;

	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
					     &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	left_path = ocfs2_new_path(path_root_bh(path),
				   path_root_el(path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Work on a copy so the caller's path stays untouched. */
	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path(path_root_bh(path),
				    path_root_el(path));
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* right_cpos == 0 means left_path is already rightmost. */
	while (right_cpos) {
		ret = ocfs2_find_path(inode, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(inode, left_path,
						       right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Caller might still want to make changes to the
		 * tree root, so re-add it to the journal here.
		 */
		ret = ocfs2_journal_access(handle, inode,
					   path_root_bh(left_path),
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
						right_path, subtree_root,
						dealloc, &deleted, et);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		/* The old right path becomes the next left path. */
		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}
2621 
/*
 * Delete the rightmost path @path of the tree, whose leaf is being
 * emptied. If a path exists to the left of it, that path becomes the
 * new rightmost path; otherwise @path was the only branch and the
 * tree root is reset to an empty, depth-zero extent list (back to
 * in-line extents).
 */
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
				struct ocfs2_path *path,
				struct ocfs2_cached_dealloc_ctxt *dealloc,
				struct ocfs2_extent_tree *et)
{
	int ret, subtree_index;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;


	/* Give the tree implementation a chance to veto the delete. */
	ret = et->eops->sanity_check(inode, et);
	if (ret)
		goto out;
	/*
	 * There's two ways we handle this depending on
	 * whether path is the only existing one.
	 */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (cpos) {
		/*
		 * We have a path to the left of this one - it needs
		 * an update too.
		 */
		left_path = ocfs2_new_path(path_root_bh(path),
					   path_root_el(path));
		if (!left_path) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(inode, handle, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);

		ocfs2_unlink_subtree(inode, handle, left_path, path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* The left leaf is now the last one in the tree. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
	} else {
		/*
		 * 'path' is also the leftmost path which
		 * means it must be the only one. This gets
		 * handled differently because we want to
		 * revert the inode back to having extents
		 * in-line.
		 */
		ocfs2_unlink_path(inode, handle, dealloc, path, 1);

		el = et->root_el;
		el->l_tree_depth = 0;
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_set_last_eb_blk(et, 0);
	}

	ocfs2_journal_dirty(handle, path_root_bh(path));

out:
	ocfs2_free_path(left_path);
	return ret;
}
2718 
2719 /*
2720  * Left rotation of btree records.
2721  *
2722  * In many ways, this is (unsurprisingly) the opposite of right
2723  * rotation. We start at some non-rightmost path containing an empty
2724  * extent in the leaf block. The code works its way to the rightmost
2725  * path by rotating records to the left in every subtree.
2726  *
2727  * This is used by any code which reduces the number of extent records
2728  * in a leaf. After removal, an empty record should be placed in the
2729  * leftmost list position.
2730  *
2731  * This won't handle a length update of the rightmost path records if
2732  * the rightmost tree leaf record is removed so the caller is
2733  * responsible for detecting and correcting that.
2734  */
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_cached_dealloc_ctxt *dealloc,
				  struct ocfs2_extent_tree *et)
{
	int ret, orig_credits = handle->h_buffer_credits;
	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	/* Rotation only makes sense if the leaf starts empty. */
	el = path_leaf_el(path);
	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	if (path->p_tree_depth == 0) {
rightmost_no_delete:
		/*
		 * Inline extents. This is trivially handled, so do
		 * it up front.
		 */
		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
						       path_leaf_bh(path),
						       path_leaf_el(path));
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Handle rightmost branch now. There's several cases:
	 *  1) simple rotation leaving records in there. That's trivial.
	 *  2) rotation requiring a branch delete - there's no more
	 *     records left. Two cases of this:
	 *     a) There are branches to the left.
	 *     b) This is also the leftmost (the only) branch.
	 *
	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
	 *  2a) we need the left branch so that we can update it with the unlink
	 *  2b) we need to bring the inode back to inline extents.
	 */

	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	el = &eb->h_list;
	if (eb->h_next_leaf_blk == 0) {
		/*
		 * This gets a bit tricky if we're going to delete the
		 * rightmost path. Get the other cases out of the way
		 * 1st.
		 */
		if (le16_to_cpu(el->l_next_free_rec) > 1)
			goto rightmost_no_delete;

		/* An entirely empty extent block is corruption. */
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ret = -EIO;
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent block at %llu",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    (unsigned long long)le64_to_cpu(eb->h_blkno));
			goto out;
		}

		/*
		 * XXX: The caller can not trust "path" any more after
		 * this as it will have been deleted. What do we do?
		 *
		 * In theory the rotate-for-merge code will never get
		 * here because it'll always ask for a rotate in a
		 * nonempty list.
		 */

		ret = ocfs2_remove_rightmost_path(inode, handle, path,
						  dealloc, et);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Now we can loop, remembering the path we get from -EAGAIN
	 * and restarting from there.
	 */
try_rotate:
	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
				       dealloc, &restart_path, et);
	if (ret && ret != -EAGAIN) {
		mlog_errno(ret);
		goto out;
	}

	/* Each -EAGAIN hands back a path to restart the rotation from. */
	while (ret == -EAGAIN) {
		tmp_path = restart_path;
		restart_path = NULL;

		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
					       tmp_path, dealloc,
					       &restart_path, et);
		if (ret && ret != -EAGAIN) {
			mlog_errno(ret);
			goto out;
		}

		ocfs2_free_path(tmp_path);
		tmp_path = NULL;

		/* Success on a restart path - retry the original path. */
		if (ret == 0)
			goto try_rotate;
	}

out:
	ocfs2_free_path(tmp_path);
	ocfs2_free_path(restart_path);
	return ret;
}
2848 
2849 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2850 				int index)
2851 {
2852 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
2853 	unsigned int size;
2854 
2855 	if (rec->e_leaf_clusters == 0) {
2856 		/*
2857 		 * We consumed all of the merged-from record. An empty
2858 		 * extent cannot exist anywhere but the 1st array
2859 		 * position, so move things over if the merged-from
2860 		 * record doesn't occupy that position.
2861 		 *
2862 		 * This creates a new empty extent so the caller
2863 		 * should be smart enough to have removed any existing
2864 		 * ones.
2865 		 */
2866 		if (index > 0) {
2867 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2868 			size = index * sizeof(struct ocfs2_extent_rec);
2869 			memmove(&el->l_recs[1], &el->l_recs[0], size);
2870 		}
2871 
2872 		/*
2873 		 * Always memset - the caller doesn't check whether it
2874 		 * created an empty extent, so there could be junk in
2875 		 * the other fields.
2876 		 */
2877 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2878 	}
2879 }
2880 
2881 static int ocfs2_get_right_path(struct inode *inode,
2882 				struct ocfs2_path *left_path,
2883 				struct ocfs2_path **ret_right_path)
2884 {
2885 	int ret;
2886 	u32 right_cpos;
2887 	struct ocfs2_path *right_path = NULL;
2888 	struct ocfs2_extent_list *left_el;
2889 
2890 	*ret_right_path = NULL;
2891 
2892 	/* This function shouldn't be called for non-trees. */
2893 	BUG_ON(left_path->p_tree_depth == 0);
2894 
2895 	left_el = path_leaf_el(left_path);
2896 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2897 
2898 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2899 					     &right_cpos);
2900 	if (ret) {
2901 		mlog_errno(ret);
2902 		goto out;
2903 	}
2904 
2905 	/* This function shouldn't be called for the rightmost leaf. */
2906 	BUG_ON(right_cpos == 0);
2907 
2908 	right_path = ocfs2_new_path(path_root_bh(left_path),
2909 				    path_root_el(left_path));
2910 	if (!right_path) {
2911 		ret = -ENOMEM;
2912 		mlog_errno(ret);
2913 		goto out;
2914 	}
2915 
2916 	ret = ocfs2_find_path(inode, right_path, right_cpos);
2917 	if (ret) {
2918 		mlog_errno(ret);
2919 		goto out;
2920 	}
2921 
2922 	*ret_right_path = right_path;
2923 out:
2924 	if (ret)
2925 		ocfs2_free_path(right_path);
2926 	return ret;
2927 }
2928 
2929 /*
2930  * Remove split_rec clusters from the record at index and merge them
2931  * onto the beginning of the record "next" to it.
2932  * For index < l_count - 1, the next means the extent rec at index + 1.
2933  * For index == l_count - 1, the "next" means the 1st extent rec of the
2934  * next extent block.
2935  */
static int ocfs2_merge_rec_right(struct inode *inode,
				 struct ocfs2_path *left_path,
				 handle_t *handle,
				 struct ocfs2_extent_rec *split_rec,
				 int index)
{
	int ret, next_free, i;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *right_el;
	struct ocfs2_path *right_path = NULL;
	int subtree_index = 0;
	struct ocfs2_extent_list *el = path_leaf_el(left_path);
	struct buffer_head *bh = path_leaf_bh(left_path);
	struct buffer_head *root_bh = NULL;

	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
	left_rec = &el->l_recs[index];

	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_right_path(inode, left_path, &right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Merge into the first non-empty record of the right leaf. */
		right_el = path_leaf_el(right_path);
		next_free = le16_to_cpu(right_el->l_next_free_rec);
		BUG_ON(next_free <= 0);
		right_rec = &right_el->l_recs[0];
		if (ocfs2_is_empty_extent(right_rec)) {
			BUG_ON(next_free <= 1);
			right_rec = &right_el->l_recs[1];
		}

		/* The two records must be logically contiguous. */
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(right_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		ret = ocfs2_journal_access(handle, inode, root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Journal every node below the subtree root on both paths. */
		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_journal_access(handle, inode,
						   right_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_journal_access(handle, inode,
						   left_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

	} else {
		/* In-leaf merge: the neighbor is simply the next record. */
		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
		right_rec = &el->l_recs[index + 1];
	}

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Move split_clusters from the tail of left_rec to the head of
	 * right_rec: shrink left, then extend right backwards. */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

	/* left_rec may now be empty - move it to slot 0 if so. */
	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (right_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
		if (ret)
			mlog_errno(ret);

		ocfs2_complete_edge_insert(inode, handle, left_path,
					   right_path, subtree_index);
	}
out:
	if (right_path)
		ocfs2_free_path(right_path);
	return ret;
}
3056 
3057 static int ocfs2_get_left_path(struct inode *inode,
3058 			       struct ocfs2_path *right_path,
3059 			       struct ocfs2_path **ret_left_path)
3060 {
3061 	int ret;
3062 	u32 left_cpos;
3063 	struct ocfs2_path *left_path = NULL;
3064 
3065 	*ret_left_path = NULL;
3066 
3067 	/* This function shouldn't be called for non-trees. */
3068 	BUG_ON(right_path->p_tree_depth == 0);
3069 
3070 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3071 					    right_path, &left_cpos);
3072 	if (ret) {
3073 		mlog_errno(ret);
3074 		goto out;
3075 	}
3076 
3077 	/* This function shouldn't be called for the leftmost leaf. */
3078 	BUG_ON(left_cpos == 0);
3079 
3080 	left_path = ocfs2_new_path(path_root_bh(right_path),
3081 				   path_root_el(right_path));
3082 	if (!left_path) {
3083 		ret = -ENOMEM;
3084 		mlog_errno(ret);
3085 		goto out;
3086 	}
3087 
3088 	ret = ocfs2_find_path(inode, left_path, left_cpos);
3089 	if (ret) {
3090 		mlog_errno(ret);
3091 		goto out;
3092 	}
3093 
3094 	*ret_left_path = left_path;
3095 out:
3096 	if (ret)
3097 		ocfs2_free_path(left_path);
3098 	return ret;
3099 }
3100 
3101 /*
3102  * Remove split_rec clusters from the record at index and merge them
3103  * onto the tail of the record "before" it.
3104  * For index > 0, the "before" means the extent rec at index - 1.
3105  *
3106  * For index == 0, the "before" means the last record of the previous
3107  * extent block. And there is also a situation that we may need to
3108  * remove the rightmost leaf extent block in the right_path and change
3109  * the right path to indicate the new rightmost path.
3110  */
3111 static int ocfs2_merge_rec_left(struct inode *inode,
3112 				struct ocfs2_path *right_path,
3113 				handle_t *handle,
3114 				struct ocfs2_extent_rec *split_rec,
3115 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3116 				struct ocfs2_extent_tree *et,
3117 				int index)
3118 {
3119 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3120 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3121 	struct ocfs2_extent_rec *left_rec;
3122 	struct ocfs2_extent_rec *right_rec;
3123 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3124 	struct buffer_head *bh = path_leaf_bh(right_path);
3125 	struct buffer_head *root_bh = NULL;
3126 	struct ocfs2_path *left_path = NULL;
3127 	struct ocfs2_extent_list *left_el;
3128 
3129 	BUG_ON(index < 0);
3130 
3131 	right_rec = &el->l_recs[index];
3132 	if (index == 0) {
3133 		/* we meet with a cross extent block merge. */
3134 		ret = ocfs2_get_left_path(inode, right_path, &left_path);
3135 		if (ret) {
3136 			mlog_errno(ret);
3137 			goto out;
3138 		}
3139 
3140 		left_el = path_leaf_el(left_path);
3141 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3142 		       le16_to_cpu(left_el->l_count));
3143 
3144 		left_rec = &left_el->l_recs[
3145 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3146 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3147 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3148 		       le32_to_cpu(split_rec->e_cpos));
3149 
3150 		subtree_index = ocfs2_find_subtree_root(inode,
3151 							left_path, right_path);
3152 
3153 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3154 						      handle->h_buffer_credits,
3155 						      left_path);
3156 		if (ret) {
3157 			mlog_errno(ret);
3158 			goto out;
3159 		}
3160 
3161 		root_bh = left_path->p_node[subtree_index].bh;
3162 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3163 
3164 		ret = ocfs2_journal_access(handle, inode, root_bh,
3165 					   OCFS2_JOURNAL_ACCESS_WRITE);
3166 		if (ret) {
3167 			mlog_errno(ret);
3168 			goto out;
3169 		}
3170 
3171 		for (i = subtree_index + 1;
3172 		     i < path_num_items(right_path); i++) {
3173 			ret = ocfs2_journal_access(handle, inode,
3174 						   right_path->p_node[i].bh,
3175 						   OCFS2_JOURNAL_ACCESS_WRITE);
3176 			if (ret) {
3177 				mlog_errno(ret);
3178 				goto out;
3179 			}
3180 
3181 			ret = ocfs2_journal_access(handle, inode,
3182 						   left_path->p_node[i].bh,
3183 						   OCFS2_JOURNAL_ACCESS_WRITE);
3184 			if (ret) {
3185 				mlog_errno(ret);
3186 				goto out;
3187 			}
3188 		}
3189 	} else {
3190 		left_rec = &el->l_recs[index - 1];
3191 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3192 			has_empty_extent = 1;
3193 	}
3194 
3195 	ret = ocfs2_journal_access(handle, inode, bh,
3196 				   OCFS2_JOURNAL_ACCESS_WRITE);
3197 	if (ret) {
3198 		mlog_errno(ret);
3199 		goto out;
3200 	}
3201 
3202 	if (has_empty_extent && index == 1) {
3203 		/*
3204 		 * The easy case - we can just plop the record right in.
3205 		 */
3206 		*left_rec = *split_rec;
3207 
3208 		has_empty_extent = 0;
3209 	} else
3210 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3211 
3212 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3213 	le64_add_cpu(&right_rec->e_blkno,
3214 		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3215 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3216 
3217 	ocfs2_cleanup_merge(el, index);
3218 
3219 	ret = ocfs2_journal_dirty(handle, bh);
3220 	if (ret)
3221 		mlog_errno(ret);
3222 
3223 	if (left_path) {
3224 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3225 		if (ret)
3226 			mlog_errno(ret);
3227 
3228 		/*
3229 		 * In the situation that the right_rec is empty and the extent
3230 		 * block is empty also,  ocfs2_complete_edge_insert can't handle
3231 		 * it and we need to delete the right extent block.
3232 		 */
3233 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3234 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3235 
3236 			ret = ocfs2_remove_rightmost_path(inode, handle,
3237 							  right_path,
3238 							  dealloc, et);
3239 			if (ret) {
3240 				mlog_errno(ret);
3241 				goto out;
3242 			}
3243 
3244 			/* Now the rightmost extent block has been deleted.
3245 			 * So we use the new rightmost path.
3246 			 */
3247 			ocfs2_mv_path(right_path, left_path);
3248 			left_path = NULL;
3249 		} else
3250 			ocfs2_complete_edge_insert(inode, handle, left_path,
3251 						   right_path, subtree_index);
3252 	}
3253 out:
3254 	if (left_path)
3255 		ocfs2_free_path(left_path);
3256 	return ret;
3257 }
3258 
/*
 * Try to merge the region described by @split_rec (located at
 * @split_index in the leaf of @path) with one or both of its
 * neighboring records, as directed by ctxt->c_contig_type.  Left
 * rotations are used along the way to dispose of any empty extent
 * records the merges create.
 */
static int ocfs2_try_to_merge_extent(struct inode *inode,
				     handle_t *handle,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt,
				     struct ocfs2_extent_tree *et)

{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
		 * emptied slot. Remove any pre-existing empty
		 * extents - having more than one in a leaf is
		 * illegal.
		 */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* The rotation shifted every record left by one. */
		split_index--;
		rec = &el->l_recs[split_index];
	}

	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
		/*
		 * Left-right contig implies this.
		 */
		BUG_ON(!ctxt->c_split_covers_rec);

		/*
		 * Since the leftright insert always covers the entire
		 * extent, this call will delete the insert record
		 * entirely, resulting in an empty extent record added to
		 * the extent block.
		 *
		 * Since the adding of an empty extent shifts
		 * everything back to the right, there's no need to
		 * update split_index here.
		 *
		 * When the split_index is zero, we need to merge it to the
		 * previous extent block. It is more efficient and easier
		 * if we do merge_right first and merge_left later.
		 */
		ret = ocfs2_merge_rec_right(inode, path,
					    handle, split_rec,
					    split_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We can only get this from logic error above.
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		rec = &el->l_recs[split_index];

		/*
		 * Note that we don't pass split_rec here on purpose -
		 * we've merged it into the rec already.
		 */
		ret = ocfs2_merge_rec_left(inode, path,
					   handle, rec,
					   dealloc, et,
					   split_index);

		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		/*
		 * Error from this last rotate is not critical, so
		 * print but don't bubble it up.
		 */
		if (ret)
			mlog_errno(ret);
		ret = 0;
	} else {
		/*
		 * Merge a record to the left or right.
		 *
		 * 'contig_type' is relative to the existing record,
		 * so for example, if we're "right contig", it's to
		 * the record on the left (hence the left merge).
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT) {
			ret = ocfs2_merge_rec_left(inode,
						   path,
						   handle, split_rec,
						   dealloc, et,
						   split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			ret = ocfs2_merge_rec_right(inode,
						    path,
						    handle, split_rec,
						    split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		if (ctxt->c_split_covers_rec) {
			/*
			 * The merge may have left an empty extent in
			 * our leaf. Try to rotate it away.
			 */
			ret = ocfs2_rotate_tree_left(inode, handle, path,
						     dealloc, et);
			if (ret)
				mlog_errno(ret);
			ret = 0;
		}
	}

out:
	return ret;
}
3404 
3405 static void ocfs2_subtract_from_rec(struct super_block *sb,
3406 				    enum ocfs2_split_type split,
3407 				    struct ocfs2_extent_rec *rec,
3408 				    struct ocfs2_extent_rec *split_rec)
3409 {
3410 	u64 len_blocks;
3411 
3412 	len_blocks = ocfs2_clusters_to_blocks(sb,
3413 				le16_to_cpu(split_rec->e_leaf_clusters));
3414 
3415 	if (split == SPLIT_LEFT) {
3416 		/*
3417 		 * Region is on the left edge of the existing
3418 		 * record.
3419 		 */
3420 		le32_add_cpu(&rec->e_cpos,
3421 			     le16_to_cpu(split_rec->e_leaf_clusters));
3422 		le64_add_cpu(&rec->e_blkno, len_blocks);
3423 		le16_add_cpu(&rec->e_leaf_clusters,
3424 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3425 	} else {
3426 		/*
3427 		 * Region is on the right edge of the existing
3428 		 * record.
3429 		 */
3430 		le16_add_cpu(&rec->e_leaf_clusters,
3431 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3432 	}
3433 }
3434 
3435 /*
3436  * Do the final bits of extent record insertion at the target leaf
3437  * list. If this leaf is part of an allocation tree, it is assumed
3438  * that the tree above has been prepared.
3439  */
3440 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3441 				 struct ocfs2_extent_list *el,
3442 				 struct ocfs2_insert_type *insert,
3443 				 struct inode *inode)
3444 {
3445 	int i = insert->ins_contig_index;
3446 	unsigned int range;
3447 	struct ocfs2_extent_rec *rec;
3448 
3449 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3450 
3451 	if (insert->ins_split != SPLIT_NONE) {
3452 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3453 		BUG_ON(i == -1);
3454 		rec = &el->l_recs[i];
3455 		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3456 					insert_rec);
3457 		goto rotate;
3458 	}
3459 
3460 	/*
3461 	 * Contiguous insert - either left or right.
3462 	 */
3463 	if (insert->ins_contig != CONTIG_NONE) {
3464 		rec = &el->l_recs[i];
3465 		if (insert->ins_contig == CONTIG_LEFT) {
3466 			rec->e_blkno = insert_rec->e_blkno;
3467 			rec->e_cpos = insert_rec->e_cpos;
3468 		}
3469 		le16_add_cpu(&rec->e_leaf_clusters,
3470 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3471 		return;
3472 	}
3473 
3474 	/*
3475 	 * Handle insert into an empty leaf.
3476 	 */
3477 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3478 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3479 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3480 		el->l_recs[0] = *insert_rec;
3481 		el->l_next_free_rec = cpu_to_le16(1);
3482 		return;
3483 	}
3484 
3485 	/*
3486 	 * Appending insert.
3487 	 */
3488 	if (insert->ins_appending == APPEND_TAIL) {
3489 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3490 		rec = &el->l_recs[i];
3491 		range = le32_to_cpu(rec->e_cpos)
3492 			+ le16_to_cpu(rec->e_leaf_clusters);
3493 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3494 
3495 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3496 				le16_to_cpu(el->l_count),
3497 				"inode %lu, depth %u, count %u, next free %u, "
3498 				"rec.cpos %u, rec.clusters %u, "
3499 				"insert.cpos %u, insert.clusters %u\n",
3500 				inode->i_ino,
3501 				le16_to_cpu(el->l_tree_depth),
3502 				le16_to_cpu(el->l_count),
3503 				le16_to_cpu(el->l_next_free_rec),
3504 				le32_to_cpu(el->l_recs[i].e_cpos),
3505 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3506 				le32_to_cpu(insert_rec->e_cpos),
3507 				le16_to_cpu(insert_rec->e_leaf_clusters));
3508 		i++;
3509 		el->l_recs[i] = *insert_rec;
3510 		le16_add_cpu(&el->l_next_free_rec, 1);
3511 		return;
3512 	}
3513 
3514 rotate:
3515 	/*
3516 	 * Ok, we have to rotate.
3517 	 *
3518 	 * At this point, it is safe to assume that inserting into an
3519 	 * empty leaf and appending to a leaf have both been handled
3520 	 * above.
3521 	 *
3522 	 * This leaf needs to have space, either by the empty 1st
3523 	 * extent record, or by virtue of an l_next_rec < l_count.
3524 	 */
3525 	ocfs2_rotate_leaf(el, insert_rec);
3526 }
3527 
3528 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3529 					   handle_t *handle,
3530 					   struct ocfs2_path *path,
3531 					   struct ocfs2_extent_rec *insert_rec)
3532 {
3533 	int ret, i, next_free;
3534 	struct buffer_head *bh;
3535 	struct ocfs2_extent_list *el;
3536 	struct ocfs2_extent_rec *rec;
3537 
3538 	/*
3539 	 * Update everything except the leaf block.
3540 	 */
3541 	for (i = 0; i < path->p_tree_depth; i++) {
3542 		bh = path->p_node[i].bh;
3543 		el = path->p_node[i].el;
3544 
3545 		next_free = le16_to_cpu(el->l_next_free_rec);
3546 		if (next_free == 0) {
3547 			ocfs2_error(inode->i_sb,
3548 				    "Dinode %llu has a bad extent list",
3549 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3550 			ret = -EIO;
3551 			return;
3552 		}
3553 
3554 		rec = &el->l_recs[next_free - 1];
3555 
3556 		rec->e_int_clusters = insert_rec->e_cpos;
3557 		le32_add_cpu(&rec->e_int_clusters,
3558 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3559 		le32_add_cpu(&rec->e_int_clusters,
3560 			     -le32_to_cpu(rec->e_cpos));
3561 
3562 		ret = ocfs2_journal_dirty(handle, bh);
3563 		if (ret)
3564 			mlog_errno(ret);
3565 
3566 	}
3567 }
3568 
/*
 * Prepare the tree for an append of @insert_rec at the tail of
 * @right_path: journal the whole right path and stretch the rightmost
 * interior records to cover the new extent.
 *
 * If the append lands in an empty (or effectively empty) leaf that is
 * not the leftmost one, the leaf immediately to its left is looked up
 * and handed back in *@ret_left_path so ocfs2_insert_path() can fix up
 * its edge records later; the caller owns and must free that path.
 */
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, next_free;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * This shouldn't happen for non-trees. The extent rec cluster
	 * count manipulation below only works for interior nodes.
	 */
	BUG_ON(right_path->p_tree_depth == 0);

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "Append may need a left path update. cpos: %u, "
		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
		     left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path(path_root_bh(right_path),
						   path_root_el(right_path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);

	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}
3650 
3651 static void ocfs2_split_record(struct inode *inode,
3652 			       struct ocfs2_path *left_path,
3653 			       struct ocfs2_path *right_path,
3654 			       struct ocfs2_extent_rec *split_rec,
3655 			       enum ocfs2_split_type split)
3656 {
3657 	int index;
3658 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
3659 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3660 	struct ocfs2_extent_rec *rec, *tmprec;
3661 
3662 	right_el = path_leaf_el(right_path);;
3663 	if (left_path)
3664 		left_el = path_leaf_el(left_path);
3665 
3666 	el = right_el;
3667 	insert_el = right_el;
3668 	index = ocfs2_search_extent_list(el, cpos);
3669 	if (index != -1) {
3670 		if (index == 0 && left_path) {
3671 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3672 
3673 			/*
3674 			 * This typically means that the record
3675 			 * started in the left path but moved to the
3676 			 * right as a result of rotation. We either
3677 			 * move the existing record to the left, or we
3678 			 * do the later insert there.
3679 			 *
3680 			 * In this case, the left path should always
3681 			 * exist as the rotate code will have passed
3682 			 * it back for a post-insert update.
3683 			 */
3684 
3685 			if (split == SPLIT_LEFT) {
3686 				/*
3687 				 * It's a left split. Since we know
3688 				 * that the rotate code gave us an
3689 				 * empty extent in the left path, we
3690 				 * can just do the insert there.
3691 				 */
3692 				insert_el = left_el;
3693 			} else {
3694 				/*
3695 				 * Right split - we have to move the
3696 				 * existing record over to the left
3697 				 * leaf. The insert will be into the
3698 				 * newly created empty extent in the
3699 				 * right leaf.
3700 				 */
3701 				tmprec = &right_el->l_recs[index];
3702 				ocfs2_rotate_leaf(left_el, tmprec);
3703 				el = left_el;
3704 
3705 				memset(tmprec, 0, sizeof(*tmprec));
3706 				index = ocfs2_search_extent_list(left_el, cpos);
3707 				BUG_ON(index == -1);
3708 			}
3709 		}
3710 	} else {
3711 		BUG_ON(!left_path);
3712 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3713 		/*
3714 		 * Left path is easy - we can just allow the insert to
3715 		 * happen.
3716 		 */
3717 		el = left_el;
3718 		insert_el = left_el;
3719 		index = ocfs2_search_extent_list(el, cpos);
3720 		BUG_ON(index == -1);
3721 	}
3722 
3723 	rec = &el->l_recs[index];
3724 	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3725 	ocfs2_rotate_leaf(insert_el, split_rec);
3726 }
3727 
3728 /*
3729  * This function only does inserts on an allocation b-tree. For tree
3730  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3731  *
3732  * right_path is the path we want to do the actual insert
3733  * in. left_path should only be passed in if we need to update that
3734  * portion of the tree after an edge insert.
3735  */
3736 static int ocfs2_insert_path(struct inode *inode,
3737 			     handle_t *handle,
3738 			     struct ocfs2_path *left_path,
3739 			     struct ocfs2_path *right_path,
3740 			     struct ocfs2_extent_rec *insert_rec,
3741 			     struct ocfs2_insert_type *insert)
3742 {
3743 	int ret, subtree_index;
3744 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3745 
3746 	if (left_path) {
3747 		int credits = handle->h_buffer_credits;
3748 
3749 		/*
3750 		 * There's a chance that left_path got passed back to
3751 		 * us without being accounted for in the
3752 		 * journal. Extend our transaction here to be sure we
3753 		 * can change those blocks.
3754 		 */
3755 		credits += left_path->p_tree_depth;
3756 
3757 		ret = ocfs2_extend_trans(handle, credits);
3758 		if (ret < 0) {
3759 			mlog_errno(ret);
3760 			goto out;
3761 		}
3762 
3763 		ret = ocfs2_journal_access_path(inode, handle, left_path);
3764 		if (ret < 0) {
3765 			mlog_errno(ret);
3766 			goto out;
3767 		}
3768 	}
3769 
3770 	/*
3771 	 * Pass both paths to the journal. The majority of inserts
3772 	 * will be touching all components anyway.
3773 	 */
3774 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3775 	if (ret < 0) {
3776 		mlog_errno(ret);
3777 		goto out;
3778 	}
3779 
3780 	if (insert->ins_split != SPLIT_NONE) {
3781 		/*
3782 		 * We could call ocfs2_insert_at_leaf() for some types
3783 		 * of splits, but it's easier to just let one separate
3784 		 * function sort it all out.
3785 		 */
3786 		ocfs2_split_record(inode, left_path, right_path,
3787 				   insert_rec, insert->ins_split);
3788 
3789 		/*
3790 		 * Split might have modified either leaf and we don't
3791 		 * have a guarantee that the later edge insert will
3792 		 * dirty this for us.
3793 		 */
3794 		if (left_path)
3795 			ret = ocfs2_journal_dirty(handle,
3796 						  path_leaf_bh(left_path));
3797 			if (ret)
3798 				mlog_errno(ret);
3799 	} else
3800 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3801 				     insert, inode);
3802 
3803 	ret = ocfs2_journal_dirty(handle, leaf_bh);
3804 	if (ret)
3805 		mlog_errno(ret);
3806 
3807 	if (left_path) {
3808 		/*
3809 		 * The rotate code has indicated that we need to fix
3810 		 * up portions of the tree after the insert.
3811 		 *
3812 		 * XXX: Should we extend the transaction here?
3813 		 */
3814 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
3815 							right_path);
3816 		ocfs2_complete_edge_insert(inode, handle, left_path,
3817 					   right_path, subtree_index);
3818 	}
3819 
3820 	ret = 0;
3821 out:
3822 	return ret;
3823 }
3824 
/*
 * Top-level driver for inserting @insert_rec into the extent tree
 * rooted at @et.  @type (computed by ocfs2_figure_insert_type())
 * selects the strategy: a direct leaf insert for depth-0 trees, a
 * right rotation for non-contiguous/non-appending inserts, or an
 * append via ocfs2_append_rec_to_path().  On success the root's
 * cluster count is updated unless this is a split insert.
 */
static int ocfs2_do_insert_extent(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	el = et->root_el;

	ret = ocfs2_journal_access(handle, inode, et->root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Depth-0 tree: the root list is the leaf - insert directly. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(inode, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree's above them.
	 *
	 * Both might pass back a path immediate to the left of the
	 * one being inserted to. This will be cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_journal_access(handle, inode, et->root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	if (type->ins_split == SPLIT_NONE)
		ocfs2_update_clusters(inode, et,
				      le16_to_cpu(insert_rec->e_leaf_clusters));

	ret = ocfs2_journal_dirty(handle, et->root_bh);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
3939 
/*
 * Determine how the region described by @split_rec can merge with the
 * records neighboring @index in leaf @el.  Unlike
 * ocfs2_figure_contig_type(), this will walk to the adjacent leaves
 * when @index sits on an edge of the current leaf, so cross-extent-
 * block merges can be detected.
 *
 * Returns CONTIG_NONE, CONTIG_LEFT, CONTIG_RIGHT or CONTIG_LEFTRIGHT.
 * Lookup failures along the way simply degrade the answer toward
 * CONTIG_NONE rather than being reported.
 */
static enum ocfs2_contig_type
ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
			       struct ocfs2_extent_list *el, int index,
			       struct ocfs2_extent_rec *split_rec)
{
	int status;
	enum ocfs2_contig_type ret = CONTIG_NONE;
	u32 left_cpos, right_cpos;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_extent_list *new_el;
	struct ocfs2_path *left_path = NULL, *right_path = NULL;
	struct buffer_head *bh;
	struct ocfs2_extent_block *eb;

	if (index > 0) {
		/* Left neighbor is in this leaf. */
		rec = &el->l_recs[index - 1];
	} else if (path->p_tree_depth > 0) {
		/* index == 0: the left neighbor lives in the previous leaf. */
		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
						       path, &left_cpos);
		if (status)
			goto out;

		if (left_cpos != 0) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path)
				goto out;

			status = ocfs2_find_path(inode, left_path, left_cpos);
			if (status)
				goto out;

			new_el = path_leaf_el(left_path);

			/*
			 * A cross-block merge requires the left leaf to
			 * be full - anything else is on-disk corruption.
			 */
			if (le16_to_cpu(new_el->l_next_free_rec) !=
			    le16_to_cpu(new_el->l_count)) {
				bh = path_leaf_bh(left_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[
				le16_to_cpu(new_el->l_next_free_rec) - 1];
		}
	}

	/*
	 * We're careful to check for an empty extent record here -
	 * the merge code will know what to do if it sees one.
	 */
	if (rec) {
		if (index == 1 && ocfs2_is_empty_extent(rec)) {
			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
				ret = CONTIG_RIGHT;
		} else {
			ret = ocfs2_extent_contig(inode, rec, split_rec);
		}
	}

	/* Now find the right neighbor, possibly in the next leaf. */
	rec = NULL;
	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
		rec = &el->l_recs[index + 1];
	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
		 path->p_tree_depth > 0) {
		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
							path, &right_cpos);
		if (status)
			goto out;

		if (right_cpos == 0)
			goto out;

		right_path = ocfs2_new_path(path_root_bh(path),
					    path_root_el(path));
		if (!right_path)
			goto out;

		status = ocfs2_find_path(inode, right_path, right_cpos);
		if (status)
			goto out;

		new_el = path_leaf_el(right_path);
		rec = &new_el->l_recs[0];
		if (ocfs2_is_empty_extent(rec)) {
			/* An empty leaf with nothing after rec 0 is corrupt. */
			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
				bh = path_leaf_bh(right_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[1];
		}
	}

	if (rec) {
		enum ocfs2_contig_type contig_type;

		contig_type = ocfs2_extent_contig(inode, rec, split_rec);

		/* Combine the two directions into the final answer. */
		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
			ret = CONTIG_LEFTRIGHT;
		else if (ret == CONTIG_NONE)
			ret = contig_type;
	}

out:
	if (left_path)
		ocfs2_free_path(left_path);
	if (right_path)
		ocfs2_free_path(right_path);

	return ret;
}
4055 
4056 static void ocfs2_figure_contig_type(struct inode *inode,
4057 				     struct ocfs2_insert_type *insert,
4058 				     struct ocfs2_extent_list *el,
4059 				     struct ocfs2_extent_rec *insert_rec)
4060 {
4061 	int i;
4062 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4063 
4064 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4065 
4066 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4067 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4068 						  insert_rec);
4069 		if (contig_type != CONTIG_NONE) {
4070 			insert->ins_contig_index = i;
4071 			break;
4072 		}
4073 	}
4074 	insert->ins_contig = contig_type;
4075 }
4076 
4077 /*
4078  * This should only be called against the righmost leaf extent list.
4079  *
4080  * ocfs2_figure_appending_type() will figure out whether we'll have to
4081  * insert at the tail of the rightmost leaf.
4082  *
4083  * This should also work against the root extent list for tree's with 0
4084  * depth. If we consider the root extent list to be the rightmost leaf node
4085  * then the logic here makes sense.
4086  */
4087 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4088 					struct ocfs2_extent_list *el,
4089 					struct ocfs2_extent_rec *insert_rec)
4090 {
4091 	int i;
4092 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4093 	struct ocfs2_extent_rec *rec;
4094 
4095 	insert->ins_appending = APPEND_NONE;
4096 
4097 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4098 
4099 	if (!el->l_next_free_rec)
4100 		goto set_tail_append;
4101 
4102 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4103 		/* Were all records empty? */
4104 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4105 			goto set_tail_append;
4106 	}
4107 
4108 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4109 	rec = &el->l_recs[i];
4110 
4111 	if (cpos >=
4112 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4113 		goto set_tail_append;
4114 
4115 	return;
4116 
4117 set_tail_append:
4118 	insert->ins_appending = APPEND_TAIL;
4119 }
4120 
4121 /*
4122  * Helper function called at the begining of an insert.
4123  *
4124  * This computes a few things that are commonly used in the process of
4125  * inserting into the btree:
4126  *   - Whether the new extent is contiguous with an existing one.
4127  *   - The current tree depth.
4128  *   - Whether the insert is an appending one.
4129  *   - The total # of free records in the tree.
4130  *
4131  * All of the information is stored on the ocfs2_insert_type
4132  * structure.
4133  */
4134 static int ocfs2_figure_insert_type(struct inode *inode,
4135 				    struct ocfs2_extent_tree *et,
4136 				    struct buffer_head **last_eb_bh,
4137 				    struct ocfs2_extent_rec *insert_rec,
4138 				    int *free_records,
4139 				    struct ocfs2_insert_type *insert)
4140 {
4141 	int ret;
4142 	struct ocfs2_extent_block *eb;
4143 	struct ocfs2_extent_list *el;
4144 	struct ocfs2_path *path = NULL;
4145 	struct buffer_head *bh = NULL;
4146 
4147 	insert->ins_split = SPLIT_NONE;
4148 
4149 	el = et->root_el;
4150 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4151 
4152 	if (el->l_tree_depth) {
4153 		/*
4154 		 * If we have tree depth, we read in the
4155 		 * rightmost extent block ahead of time as
4156 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4157 		 * may want it later.
4158 		 */
4159 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4160 				       ocfs2_get_last_eb_blk(et), &bh,
4161 				       OCFS2_BH_CACHED, inode);
4162 		if (ret) {
4163 			mlog_exit(ret);
4164 			goto out;
4165 		}
4166 		eb = (struct ocfs2_extent_block *) bh->b_data;
4167 		el = &eb->h_list;
4168 	}
4169 
4170 	/*
4171 	 * Unless we have a contiguous insert, we'll need to know if
4172 	 * there is room left in our allocation tree for another
4173 	 * extent record.
4174 	 *
4175 	 * XXX: This test is simplistic, we can search for empty
4176 	 * extent records too.
4177 	 */
4178 	*free_records = le16_to_cpu(el->l_count) -
4179 		le16_to_cpu(el->l_next_free_rec);
4180 
4181 	if (!insert->ins_tree_depth) {
4182 		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4183 		ocfs2_figure_appending_type(insert, el, insert_rec);
4184 		return 0;
4185 	}
4186 
4187 	path = ocfs2_new_path(et->root_bh, et->root_el);
4188 	if (!path) {
4189 		ret = -ENOMEM;
4190 		mlog_errno(ret);
4191 		goto out;
4192 	}
4193 
4194 	/*
4195 	 * In the case that we're inserting past what the tree
4196 	 * currently accounts for, ocfs2_find_path() will return for
4197 	 * us the rightmost tree path. This is accounted for below in
4198 	 * the appending code.
4199 	 */
4200 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4201 	if (ret) {
4202 		mlog_errno(ret);
4203 		goto out;
4204 	}
4205 
4206 	el = path_leaf_el(path);
4207 
4208 	/*
4209 	 * Now that we have the path, there's two things we want to determine:
4210 	 * 1) Contiguousness (also set contig_index if this is so)
4211 	 *
4212 	 * 2) Are we doing an append? We can trivially break this up
4213          *     into two types of appends: simple record append, or a
4214          *     rotate inside the tail leaf.
4215 	 */
4216 	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4217 
4218 	/*
4219 	 * The insert code isn't quite ready to deal with all cases of
4220 	 * left contiguousness. Specifically, if it's an insert into
4221 	 * the 1st record in a leaf, it will require the adjustment of
4222 	 * cluster count on the last record of the path directly to it's
4223 	 * left. For now, just catch that case and fool the layers
4224 	 * above us. This works just fine for tree_depth == 0, which
4225 	 * is why we allow that above.
4226 	 */
4227 	if (insert->ins_contig == CONTIG_LEFT &&
4228 	    insert->ins_contig_index == 0)
4229 		insert->ins_contig = CONTIG_NONE;
4230 
4231 	/*
4232 	 * Ok, so we can simply compare against last_eb to figure out
4233 	 * whether the path doesn't exist. This will only happen in
4234 	 * the case that we're doing a tail append, so maybe we can
4235 	 * take advantage of that information somehow.
4236 	 */
4237 	if (ocfs2_get_last_eb_blk(et) ==
4238 	    path_leaf_bh(path)->b_blocknr) {
4239 		/*
4240 		 * Ok, ocfs2_find_path() returned us the rightmost
4241 		 * tree path. This might be an appending insert. There are
4242 		 * two cases:
4243 		 *    1) We're doing a true append at the tail:
4244 		 *	-This might even be off the end of the leaf
4245 		 *    2) We're "appending" by rotating in the tail
4246 		 */
4247 		ocfs2_figure_appending_type(insert, el, insert_rec);
4248 	}
4249 
4250 out:
4251 	ocfs2_free_path(path);
4252 
4253 	if (ret == 0)
4254 		*last_eb_bh = bh;
4255 	else
4256 		brelse(bh);
4257 	return ret;
4258 }
4259 
4260 /*
4261  * Insert an extent into an inode btree.
4262  *
4263  * The caller needs to update fe->i_clusters
4264  */
4265 static int ocfs2_insert_extent(struct ocfs2_super *osb,
4266 			       handle_t *handle,
4267 			       struct inode *inode,
4268 			       struct buffer_head *root_bh,
4269 			       u32 cpos,
4270 			       u64 start_blk,
4271 			       u32 new_clusters,
4272 			       u8 flags,
4273 			       struct ocfs2_alloc_context *meta_ac,
4274 			       struct ocfs2_extent_tree *et)
4275 {
4276 	int status;
4277 	int uninitialized_var(free_records);
4278 	struct buffer_head *last_eb_bh = NULL;
4279 	struct ocfs2_insert_type insert = {0, };
4280 	struct ocfs2_extent_rec rec;
4281 
4282 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4283 
4284 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4285 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4286 
4287 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4288 			(OCFS2_I(inode)->ip_clusters != cpos),
4289 			"Device %s, asking for sparse allocation: inode %llu, "
4290 			"cpos %u, clusters %u\n",
4291 			osb->dev_str,
4292 			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4293 			OCFS2_I(inode)->ip_clusters);
4294 
4295 	memset(&rec, 0, sizeof(rec));
4296 	rec.e_cpos = cpu_to_le32(cpos);
4297 	rec.e_blkno = cpu_to_le64(start_blk);
4298 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4299 	rec.e_flags = flags;
4300 
4301 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4302 					  &free_records, &insert);
4303 	if (status < 0) {
4304 		mlog_errno(status);
4305 		goto bail;
4306 	}
4307 
4308 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4309 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4310 	     "Insert.tree_depth: %d\n",
4311 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4312 	     free_records, insert.ins_tree_depth);
4313 
4314 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4315 		status = ocfs2_grow_tree(inode, handle, et,
4316 					 &insert.ins_tree_depth, &last_eb_bh,
4317 					 meta_ac);
4318 		if (status) {
4319 			mlog_errno(status);
4320 			goto bail;
4321 		}
4322 	}
4323 
4324 	/* Finally, we can add clusters. This might rotate the tree for us. */
4325 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4326 	if (status < 0)
4327 		mlog_errno(status);
4328 	else if (et->type == OCFS2_DINODE_EXTENT)
4329 		ocfs2_extent_map_insert_rec(inode, &rec);
4330 
4331 bail:
4332 	if (last_eb_bh)
4333 		brelse(last_eb_bh);
4334 
4335 	mlog_exit(status);
4336 	return status;
4337 }
4338 
4339 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb,
4340 			       handle_t *handle,
4341 			       struct inode *inode,
4342 			       struct buffer_head *root_bh,
4343 			       u32 cpos,
4344 			       u64 start_blk,
4345 			       u32 new_clusters,
4346 			       u8 flags,
4347 			       struct ocfs2_alloc_context *meta_ac)
4348 {
4349 	int status;
4350 	struct ocfs2_extent_tree *et = NULL;
4351 
4352 	et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL);
4353 	if (!et) {
4354 		status = -ENOMEM;
4355 		mlog_errno(status);
4356 		goto bail;
4357 	}
4358 
4359 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4360 				     cpos, start_blk, new_clusters,
4361 				     flags, meta_ac, et);
4362 
4363 	if (et)
4364 		ocfs2_free_extent_tree(et);
4365 bail:
4366 	return status;
4367 }
4368 
4369 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb,
4370 				    handle_t *handle,
4371 				    struct inode *inode,
4372 				    struct buffer_head *root_bh,
4373 				    u32 cpos,
4374 				    u64 start_blk,
4375 				    u32 new_clusters,
4376 				    u8 flags,
4377 				    struct ocfs2_alloc_context *meta_ac,
4378 				    void *private)
4379 {
4380 	int status;
4381 	struct ocfs2_extent_tree *et = NULL;
4382 
4383 	et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private);
4384 	if (!et) {
4385 		status = -ENOMEM;
4386 		mlog_errno(status);
4387 		goto bail;
4388 	}
4389 
4390 	status = ocfs2_insert_extent(osb, handle, inode, root_bh,
4391 				     cpos, start_blk, new_clusters,
4392 				     flags, meta_ac, et);
4393 
4394 	if (et)
4395 		ocfs2_free_extent_tree(et);
4396 bail:
4397 	return status;
4398 }
4399 
4400 /*
 * Allocate and add clusters into the extent b-tree.
 * The new clusters (clusters_to_add) will be inserted at logical_offset.
4403  * The extent b-tree's root is root_el and it should be in root_bh, and
4404  * it is not limited to the file storage. Any extent tree can use this
4405  * function if it implements the proper ocfs2_extent_tree.
4406  */
int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
				struct inode *inode,
				u32 *logical_offset,
				u32 clusters_to_add,
				int mark_unwritten,
				struct buffer_head *root_bh,
				struct ocfs2_extent_list *root_el,
				handle_t *handle,
				struct ocfs2_alloc_context *data_ac,
				struct ocfs2_alloc_context *meta_ac,
				enum ocfs2_alloc_restarted *reason_ret,
				enum ocfs2_extent_tree_type type,
				void *private)
{
	int status = 0;
	int free_extents;
	enum ocfs2_alloc_restarted reason = RESTART_NONE;
	u32 bit_off, num_bits;
	u64 block;
	u8 flags = 0;

	BUG_ON(!clusters_to_add);

	if (mark_unwritten)
		flags = OCFS2_EXT_UNWRITTEN;

	/* How many more extent records can the tree hold right now? */
	free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type,
					      private);
	if (free_extents < 0) {
		status = free_extents;
		mlog_errno(status);
		goto leave;
	}

	/* there are two cases which could cause us to EAGAIN in the
	 * we-need-more-metadata case:
	 * 1) we haven't reserved *any*
	 * 2) we are so fragmented, we've needed to add metadata too
	 *    many times. */
	if (!free_extents && !meta_ac) {
		mlog(0, "we haven't reserved any metadata!\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	} else if ((!free_extents)
		   && (ocfs2_alloc_context_bits_left(meta_ac)
		       < ocfs2_extend_meta_needed(root_el))) {
		mlog(0, "filesystem is really fragmented...\n");
		status = -EAGAIN;
		reason = RESTART_META;
		goto leave;
	}

	/* Claim up to clusters_to_add contiguous clusters; num_bits
	 * is how many we actually got. */
	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
					clusters_to_add, &bit_off, &num_bits);
	if (status < 0) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	BUG_ON(num_bits > clusters_to_add);

	/* reserve our write early -- insert_extent may update the inode */
	status = ocfs2_journal_access(handle, inode, root_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
	/* Dispatch to the tree-type-specific insert wrapper. */
	if (type == OCFS2_DINODE_EXTENT)
		status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh,
						    *logical_offset, block,
						    num_bits, flags, meta_ac);
	else
		status = ocfs2_xattr_value_insert_extent(osb, handle,
							 inode, root_bh,
							 *logical_offset,
							 block, num_bits, flags,
							 meta_ac, private);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, root_bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	clusters_to_add -= num_bits;
	*logical_offset += num_bits;

	/* Partial allocation: tell the caller to restart the
	 * transaction and call back for the remainder. */
	if (clusters_to_add) {
		mlog(0, "need to alloc once more, wanted = %u\n",
		     clusters_to_add);
		status = -EAGAIN;
		reason = RESTART_TRANS;
	}

leave:
	mlog_exit(status);
	if (reason_ret)
		*reason_ret = reason;
	return status;
}
4518 
4519 static void ocfs2_make_right_split_rec(struct super_block *sb,
4520 				       struct ocfs2_extent_rec *split_rec,
4521 				       u32 cpos,
4522 				       struct ocfs2_extent_rec *rec)
4523 {
4524 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4525 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4526 
4527 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4528 
4529 	split_rec->e_cpos = cpu_to_le32(cpos);
4530 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4531 
4532 	split_rec->e_blkno = rec->e_blkno;
4533 	le64_add_cpu(&split_rec->e_blkno,
4534 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4535 
4536 	split_rec->e_flags = rec->e_flags;
4537 }
4538 
/*
 * Split the record at split_index in the leaf of path so that the
 * region described by orig_split_rec becomes its own record.  A split
 * in the middle of a record is done as a right split followed by a
 * second, left-split pass over the re-found record.
 */
static int ocfs2_split_and_insert(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_extent_tree *et,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	rightmost_el = et->root_el;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/* A split adds a record, so grow the tree first if the
	 * rightmost leaf is already full. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, et,
				      &depth, last_eb_bh, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_tree_depth = depth;

	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	/* Classify the split by which edges (if any) it shares with
	 * the existing record. */
	if (split_rec.e_cpos == rec.e_cpos) {
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
					   &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		/* Second pass of the left/right split: the first
		 * insert may have moved the record, so re-walk the
		 * tree and re-find it before splitting off the left
		 * portion with the original split record. */
		do_leftright++;
		split_rec = *orig_split_rec;

		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
4643 
4644 /*
4645  * Mark part or all of the extent record at split_index in the leaf
4646  * pointed to by path as written. This removes the unwritten
4647  * extent flag.
4648  *
4649  * Care is taken to handle contiguousness so as to not grow the tree.
4650  *
4651  * meta_ac is not strictly necessary - we only truly need it if growth
4652  * of the tree is required. All other cases will degrade into a less
4653  * optimal tree layout.
4654  *
4655  * last_eb_bh should be the rightmost leaf block for any extent
4656  * btree. Since a split may grow the tree or a merge might shrink it,
4657  * the caller cannot trust the contents of that buffer after this call.
4658  *
4659  * This code is optimized for readability - several passes might be
4660  * made over certain portions of the tree. All of those blocks will
4661  * have been brought into cache (and pinned via the journal), so the
4662  * extra overhead is not expressed in terms of disk reads.
4663  */
4664 static int __ocfs2_mark_extent_written(struct inode *inode,
4665 				       struct ocfs2_extent_tree *et,
4666 				       handle_t *handle,
4667 				       struct ocfs2_path *path,
4668 				       int split_index,
4669 				       struct ocfs2_extent_rec *split_rec,
4670 				       struct ocfs2_alloc_context *meta_ac,
4671 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
4672 {
4673 	int ret = 0;
4674 	struct ocfs2_extent_list *el = path_leaf_el(path);
4675 	struct buffer_head *last_eb_bh = NULL;
4676 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4677 	struct ocfs2_merge_ctxt ctxt;
4678 	struct ocfs2_extent_list *rightmost_el;
4679 
4680 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4681 		ret = -EIO;
4682 		mlog_errno(ret);
4683 		goto out;
4684 	}
4685 
4686 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4687 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4688 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4689 		ret = -EIO;
4690 		mlog_errno(ret);
4691 		goto out;
4692 	}
4693 
4694 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4695 							    split_index,
4696 							    split_rec);
4697 
4698 	/*
4699 	 * The core merge / split code wants to know how much room is
4700 	 * left in this inodes allocation tree, so we pass the
4701 	 * rightmost extent list.
4702 	 */
4703 	if (path->p_tree_depth) {
4704 		struct ocfs2_extent_block *eb;
4705 
4706 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4707 				       ocfs2_get_last_eb_blk(et),
4708 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
4709 		if (ret) {
4710 			mlog_exit(ret);
4711 			goto out;
4712 		}
4713 
4714 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4715 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4716 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4717 			ret = -EROFS;
4718 			goto out;
4719 		}
4720 
4721 		rightmost_el = &eb->h_list;
4722 	} else
4723 		rightmost_el = path_root_el(path);
4724 
4725 	if (rec->e_cpos == split_rec->e_cpos &&
4726 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4727 		ctxt.c_split_covers_rec = 1;
4728 	else
4729 		ctxt.c_split_covers_rec = 0;
4730 
4731 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4732 
4733 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4734 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4735 	     ctxt.c_split_covers_rec);
4736 
4737 	if (ctxt.c_contig_type == CONTIG_NONE) {
4738 		if (ctxt.c_split_covers_rec)
4739 			el->l_recs[split_index] = *split_rec;
4740 		else
4741 			ret = ocfs2_split_and_insert(inode, handle, path, et,
4742 						     &last_eb_bh, split_index,
4743 						     split_rec, meta_ac);
4744 		if (ret)
4745 			mlog_errno(ret);
4746 	} else {
4747 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
4748 						split_index, split_rec,
4749 						dealloc, &ctxt, et);
4750 		if (ret)
4751 			mlog_errno(ret);
4752 	}
4753 
4754 out:
4755 	brelse(last_eb_bh);
4756 	return ret;
4757 }
4758 
4759 /*
4760  * Mark the already-existing extent at cpos as written for len clusters.
4761  *
4762  * If the existing extent is larger than the request, initiate a
4763  * split. An attempt will be made at merging with adjacent extents.
4764  *
4765  * The caller is responsible for passing down meta_ac if we'll need it.
4766  */
int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh,
			      handle_t *handle, u32 cpos, u32 len, u32 phys,
			      struct ocfs2_alloc_context *meta_ac,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      enum ocfs2_extent_tree_type et_type,
			      void *private)
{
	int ret, index;
	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_tree *et = NULL;

	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);

	/* Finding an unwritten extent on a filesystem without the
	 * feature bit is corruption - go read-only. */
	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
			    "that are being written to, but the feature bit "
			    "is not set in the super block.",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
		ret = -EROFS;
		goto out;
	}

	et = ocfs2_new_extent_tree(root_bh, et_type, private);
	if (!et) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * XXX: This should be fixed up so that we just re-insert the
	 * next extent records.
	 */
	if (et_type == OCFS2_DINODE_EXTENT)
		ocfs2_extent_map_trunc(inode, 0);

	left_path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Walk down to the leaf containing cpos. */
	ret = ocfs2_find_path(inode, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	el = path_leaf_el(left_path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/* Describe the written region: same flags as the existing
	 * record minus the UNWRITTEN bit. */
	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;

	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
					  index, &split_rec, meta_ac,
					  dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	if (et)
		ocfs2_free_extent_tree(et);
	return ret;
}
4850 
/*
 * Right-split the record at index in the leaf of path so that the
 * clusters from new_range onward become a separate record, growing
 * the tree first if the rightmost list is full.
 */
static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	depth = path->p_tree_depth;
	if (depth > 0) {
		/* The insert code needs the rightmost extent list to
		 * judge how much room is left in the tree. */
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       ocfs2_get_last_eb_blk(et),
				       &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Extend the transaction to cover a possible tree grow plus
	 * journaling of the path. */
	credits += path->p_tree_depth +
		   ocfs2_extend_meta_needed(et->root_el);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}
4918 
/*
 * Remove the clusters [cpos, cpos + len) from the record at index in
 * the leaf of path.  The range must share at least one edge with the
 * record (a middle removal is split first by the caller).
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len,
			      struct ocfs2_extent_tree *et)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	/* NOTE(review): wants_rotate is set below but never read. */
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		/* Rotate the existing empty record out first; our
		 * target record shifts down one slot as a result. */
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our inode tree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Journal both paths before modifying anything.  A NULL
	 * left_path is tolerated by the helper. */
	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		int next_free;

		/* Whole-record removal: empty the record and let the
		 * rotate below fill the hole. */
		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		/* Propagate the new leftmost edge up through the
		 * subtree shared with the leaf to our left. */
		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
5076 
/*
 * Remove the cluster range [cpos, cpos + len) from the extent tree
 * rooted at root_bh.  The range must lie within a single extent
 * record; a middle-of-record removal is split into an edge removal
 * first.
 */
int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh,
			u32 cpos, u32 len, handle_t *handle,
			struct ocfs2_alloc_context *meta_ac,
			struct ocfs2_cached_dealloc_ctxt *dealloc,
			enum ocfs2_extent_tree_type et_type,
			void *private)
{
	int ret, index;
	u32 rec_range, trunc_range;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *path = NULL;
	struct ocfs2_extent_tree *et = NULL;

	et = ocfs2_new_extent_tree(root_bh, et_type, private);
	if (!et) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_extent_map_trunc(inode, 0);

	path = ocfs2_new_path(et->root_bh, et->root_el);
	if (!path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_path(inode, path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	el = path_leaf_el(path);
	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * We have 3 cases of extent removal:
	 *   1) Range covers the entire extent rec
	 *   2) Range begins or ends on one edge of the extent rec
	 *   3) Range is in the middle of the extent rec (no shared edges)
	 *
	 * For case 1 we remove the extent rec and left rotate to
	 * fill the hole.
	 *
	 * For case 2 we just shrink the existing extent rec, with a
	 * tree update if the shrinking edge is also the edge of an
	 * extent block.
	 *
	 * For case 3 we do a right split to turn the extent rec into
	 * something case 2 can handle.
	 */
	rec = &el->l_recs[index];
	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);

	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
	     "(cpos %u, len %u)\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));

	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
		/* Cases 1 and 2: the range shares an edge with the
		 * record, truncate it directly. */
		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else {
		/* Case 3: split the record first, then truncate. */
		ret = ocfs2_split_tree(inode, et, handle, path, index,
				       trunc_range, meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The split could have manipulated the tree enough to
		 * move the record location, so we have to look for it again.
		 */
		ocfs2_reinit_path(path, 1);

		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		index = ocfs2_search_extent_list(el, cpos);
		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: split at cpos %u lost record.",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos);
			ret = -EROFS;
			goto out;
		}

		/*
		 * Double check our values here. If anything is fishy,
		 * it's easier to catch it at the top level.
		 */
		rec = &el->l_recs[index];
		rec_range = le32_to_cpu(rec->e_cpos) +
			ocfs2_rec_clusters(el, rec);
		if (rec_range != trunc_range) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu: error after split at cpos %u"
				    "trunc len %u, existing record is (%u,%u)",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    cpos, len, le32_to_cpu(rec->e_cpos),
				    ocfs2_rec_clusters(el, rec));
			ret = -EROFS;
			goto out;
		}

		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
					 cpos, len, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(path);
	if (et)
		ocfs2_free_extent_tree(et);
	return ret;
}
5221 
5222 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5223 {
5224 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5225 	struct ocfs2_dinode *di;
5226 	struct ocfs2_truncate_log *tl;
5227 
5228 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5229 	tl = &di->id2.i_dealloc;
5230 
5231 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5232 			"slot %d, invalid truncate log parameters: used = "
5233 			"%u, count = %u\n", osb->slot_num,
5234 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5235 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5236 }
5237 
5238 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5239 					   unsigned int new_start)
5240 {
5241 	unsigned int tail_index;
5242 	unsigned int current_tail;
5243 
5244 	/* No records, nothing to coalesce */
5245 	if (!le16_to_cpu(tl->tl_used))
5246 		return 0;
5247 
5248 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5249 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5250 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5251 
5252 	return current_tail == new_start;
5253 }
5254 
/*
 * Record a pending free of @num_clusters clusters starting at
 * @start_blk in this node's truncate log.  If the new range directly
 * follows the log's tail record it is coalesced into that record
 * instead of consuming a new slot.
 *
 * Caller must hold tl_inode->i_mutex and have started @handle with
 * enough credits to dirty the truncate log block.  Returns -ENOSPC if
 * the log is already full (caller should have flushed first), -EIO on
 * an invalid on-disk dinode, or the journal status.
 */
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
			      handle_t *handle,
			      u64 start_blk,
			      unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry("start_blk = %llu, num_clusters = %u\n",
		   (unsigned long long)start_blk, num_clusters);

	/* trylock succeeding would mean the caller did NOT hold i_mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto bail;
	}

	/* Sanity-check the on-disk capacity against what fits in an inode. */
	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		/* Grow the tail record; its t_start is already correct. */
		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		/* Start a fresh record and bump the used count. */
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	status = ocfs2_journal_dirty(handle, tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}
5337 
/*
 * Free every range recorded in the truncate log back to the global
 * bitmap, walking from the tail of the log towards the head.  For each
 * record, tl_used is shrunk and journaled *before* the clusters are
 * freed, so a crash mid-replay cannot lead to a double free of the
 * same range.
 *
 * Caller holds the bitmap inode cluster lock (@data_alloc_bh) and has
 * started @handle with at least enough credits for one log update.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	mlog_entry_void();

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Drop record i from the log before freeing its clusters. */
		tl->tl_used = cpu_to_le16(i);

		status = ocfs2_journal_dirty(handle, tl_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Work from a stack copy of the record we just dropped. */
		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						    le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

bail:
	mlog_exit(status);
	return status;
}
5412 
/* Expects you to already be holding tl_inode->i_mutex */
/*
 * Flush every pending record in this node's truncate log back to the
 * global bitmap.  Takes the bitmap inode's i_mutex and cluster lock,
 * runs the replay inside a single (extendable) transaction, and
 * releases everything in reverse order via the goto cleanup chain.
 *
 * Returns 0 on success (including the empty-log case), -EIO for an
 * invalid truncate log dinode, or a lock/journal error code.
 */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* trylock succeeding would mean the caller did NOT hold i_mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto out;
	}

	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	if (!num_to_flush) {
		/* Nothing pending - success without any locking. */
		status = 0;
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	/* Commit even on replay failure so journaled updates land. */
	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
5489 
5490 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5491 {
5492 	int status;
5493 	struct inode *tl_inode = osb->osb_tl_inode;
5494 
5495 	mutex_lock(&tl_inode->i_mutex);
5496 	status = __ocfs2_flush_truncate_log(osb);
5497 	mutex_unlock(&tl_inode->i_mutex);
5498 
5499 	return status;
5500 }
5501 
5502 static void ocfs2_truncate_log_worker(struct work_struct *work)
5503 {
5504 	int status;
5505 	struct ocfs2_super *osb =
5506 		container_of(work, struct ocfs2_super,
5507 			     osb_truncate_log_wq.work);
5508 
5509 	mlog_entry_void();
5510 
5511 	status = ocfs2_flush_truncate_log(osb);
5512 	if (status < 0)
5513 		mlog_errno(status);
5514 	else
5515 		ocfs2_init_inode_steal_slot(osb);
5516 
5517 	mlog_exit(status);
5518 }
5519 
5520 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5521 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5522 				       int cancel)
5523 {
5524 	if (osb->osb_tl_inode) {
5525 		/* We want to push off log flushes while truncates are
5526 		 * still running. */
5527 		if (cancel)
5528 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5529 
5530 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5531 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5532 	}
5533 }
5534 
5535 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5536 				       int slot_num,
5537 				       struct inode **tl_inode,
5538 				       struct buffer_head **tl_bh)
5539 {
5540 	int status;
5541 	struct inode *inode = NULL;
5542 	struct buffer_head *bh = NULL;
5543 
5544 	inode = ocfs2_get_system_file_inode(osb,
5545 					   TRUNCATE_LOG_SYSTEM_INODE,
5546 					   slot_num);
5547 	if (!inode) {
5548 		status = -EINVAL;
5549 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5550 		goto bail;
5551 	}
5552 
5553 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5554 				  OCFS2_BH_CACHED, inode);
5555 	if (status < 0) {
5556 		iput(inode);
5557 		mlog_errno(status);
5558 		goto bail;
5559 	}
5560 
5561 	*tl_inode = inode;
5562 	*tl_bh    = bh;
5563 bail:
5564 	mlog_exit(status);
5565 	return status;
5566 }
5567 
/* called during the 1st stage of node recovery. we stamp a clean
 * truncate log and pass back a copy for processing later. if the
 * truncate log does not require processing, a *tl_copy is set to
 * NULL. */
/*
 * @osb:      this node's superblock
 * @slot_num: slot whose truncate log we are recovering
 * @tl_copy:  out - kmalloc'd copy of the dead node's truncate log
 *            dinode, or NULL if the log was empty.  Caller owns (and
 *            must kfree) the copy on success.
 *
 * The on-disk log is zeroed (tl_used = 0) and written back *after* the
 * copy is taken, so the records survive in memory for replay by
 * ocfs2_complete_truncate_log_recovery().
 */
int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
				      int slot_num,
				      struct ocfs2_dinode **tl_copy)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	*tl_copy = NULL;

	mlog(0, "recover truncate log from slot %d\n", slot_num);

	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
		status = -EIO;
		goto bail;
	}

	if (le16_to_cpu(tl->tl_used)) {
		mlog(0, "We'll have %u logs to recover\n",
		     le16_to_cpu(tl->tl_used));

		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
		if (!(*tl_copy)) {
			status = -ENOMEM;
			mlog_errno(status);
			goto bail;
		}

		/* Assuming the write-out below goes well, this copy
		 * will be passed back to recovery for processing. */
		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);

		/* All we need to do to clear the truncate log is set
		 * tl_used. */
		tl->tl_used = 0;

		status = ocfs2_write_block(osb, tl_bh, tl_inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

bail:
	if (tl_inode)
		iput(tl_inode);
	if (tl_bh)
		brelse(tl_bh);

	/* On any failure, don't hand a stale copy back to the caller. */
	if (status < 0 && (*tl_copy)) {
		kfree(*tl_copy);
		*tl_copy = NULL;
	}

	mlog_exit(status);
	return status;
}
5640 
/*
 * Second stage of truncate log recovery: re-append every record from a
 * dead node's saved log (@tl_copy, produced by
 * ocfs2_begin_truncate_log_recovery()) into *this* node's truncate
 * log, flushing our own log first whenever it is full.  Each append
 * runs in its own transaction.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* Recovering our own log this way would self-deadlock/corrupt. */
	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		/* Make room in our own log before appending. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		/* Commit unconditionally; then bail on append failure. */
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
5700 
5701 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5702 {
5703 	int status;
5704 	struct inode *tl_inode = osb->osb_tl_inode;
5705 
5706 	mlog_entry_void();
5707 
5708 	if (tl_inode) {
5709 		cancel_delayed_work(&osb->osb_truncate_log_wq);
5710 		flush_workqueue(ocfs2_wq);
5711 
5712 		status = ocfs2_flush_truncate_log(osb);
5713 		if (status < 0)
5714 			mlog_errno(status);
5715 
5716 		brelse(osb->osb_tl_bh);
5717 		iput(osb->osb_tl_inode);
5718 	}
5719 
5720 	mlog_exit_void();
5721 }
5722 
/*
 * Load this node's truncate log inode and block into the superblock
 * and initialize the delayed flush work.  On lookup failure, tl_inode
 * and tl_bh remain NULL, so the stores below leave the osb fields
 * cleared and ocfs2_truncate_log_shutdown() will do nothing.
 */
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	mlog_entry_void();

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
5749 
5750 /*
5751  * Delayed de-allocation of suballocator blocks.
5752  *
5753  * Some sets of block de-allocations might involve multiple suballocator inodes.
5754  *
5755  * The locking for this can get extremely complicated, especially when
5756  * the suballocator inodes to delete from aren't known until deep
5757  * within an unrelated codepath.
5758  *
5759  * ocfs2_extent_block structures are a good example of this - an inode
5760  * btree could have been grown by any number of nodes each allocating
5761  * out of their own suballoc inode.
5762  *
5763  * These structures allow the delay of block de-allocation until a
5764  * later time, when locking of multiple cluster inodes won't cause
5765  * deadlock.
5766  */
5767 
5768 /*
5769  * Describes a single block free from a suballocator
5770  */
5771 struct ocfs2_cached_block_free {
5772 	struct ocfs2_cached_block_free		*free_next;
5773 	u64					free_blk;
5774 	unsigned int				free_bit;
5775 };
5776 
5777 struct ocfs2_per_slot_free_list {
5778 	struct ocfs2_per_slot_free_list		*f_next_suballocator;
5779 	int					f_inode_type;
5780 	int					f_slot;
5781 	struct ocfs2_cached_block_free		*f_first;
5782 };
5783 
/*
 * Free every cached block in @head against the suballocator inode
 * identified by (@sysfile_type, @slot), consuming the list.  Takes the
 * suballocator's i_mutex and cluster lock, then frees one bit per item
 * inside a transaction that is extended between items.
 *
 * List entries are always kfree'd - even when an error aborts the
 * suballoc frees early, the tail of the list is drained at "out:" so
 * nothing dangles.
 */
static int ocfs2_free_cached_items(struct ocfs2_super *osb,
				   int sysfile_type,
				   int slot,
				   struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		/* Locate the block group that owns this bit. */
		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
						      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Refill credits before freeing the next bit. */
		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
5861 
5862 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5863 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
5864 {
5865 	int ret = 0, ret2;
5866 	struct ocfs2_per_slot_free_list *fl;
5867 
5868 	if (!ctxt)
5869 		return 0;
5870 
5871 	while (ctxt->c_first_suballocator) {
5872 		fl = ctxt->c_first_suballocator;
5873 
5874 		if (fl->f_first) {
5875 			mlog(0, "Free items: (type %u, slot %d)\n",
5876 			     fl->f_inode_type, fl->f_slot);
5877 			ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5878 						       fl->f_slot, fl->f_first);
5879 			if (ret2)
5880 				mlog_errno(ret2);
5881 			if (!ret)
5882 				ret = ret2;
5883 		}
5884 
5885 		ctxt->c_first_suballocator = fl->f_next_suballocator;
5886 		kfree(fl);
5887 	}
5888 
5889 	return ret;
5890 }
5891 
5892 static struct ocfs2_per_slot_free_list *
5893 ocfs2_find_per_slot_free_list(int type,
5894 			      int slot,
5895 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
5896 {
5897 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5898 
5899 	while (fl) {
5900 		if (fl->f_inode_type == type && fl->f_slot == slot)
5901 			return fl;
5902 
5903 		fl = fl->f_next_suballocator;
5904 	}
5905 
5906 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
5907 	if (fl) {
5908 		fl->f_inode_type = type;
5909 		fl->f_slot = slot;
5910 		fl->f_first = NULL;
5911 		fl->f_next_suballocator = ctxt->c_first_suballocator;
5912 
5913 		ctxt->c_first_suballocator = fl;
5914 	}
5915 	return fl;
5916 }
5917 
5918 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5919 				     int type, int slot, u64 blkno,
5920 				     unsigned int bit)
5921 {
5922 	int ret;
5923 	struct ocfs2_per_slot_free_list *fl;
5924 	struct ocfs2_cached_block_free *item;
5925 
5926 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5927 	if (fl == NULL) {
5928 		ret = -ENOMEM;
5929 		mlog_errno(ret);
5930 		goto out;
5931 	}
5932 
5933 	item = kmalloc(sizeof(*item), GFP_NOFS);
5934 	if (item == NULL) {
5935 		ret = -ENOMEM;
5936 		mlog_errno(ret);
5937 		goto out;
5938 	}
5939 
5940 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5941 	     type, slot, bit, (unsigned long long)blkno);
5942 
5943 	item->free_blk = blkno;
5944 	item->free_bit = bit;
5945 	item->free_next = fl->f_first;
5946 
5947 	fl->f_first = item;
5948 
5949 	ret = 0;
5950 out:
5951 	return ret;
5952 }
5953 
5954 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5955 					 struct ocfs2_extent_block *eb)
5956 {
5957 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5958 					 le16_to_cpu(eb->h_suballoc_slot),
5959 					 le64_to_cpu(eb->h_blkno),
5960 					 le16_to_cpu(eb->h_suballoc_bit));
5961 }
5962 
/* This function will figure out whether the currently last extent
 * block will be deleted, and if it will, what the new last extent
 * block will be so we can update his h_next_leaf_blk field, as well
 * as the dinodes i_last_eb_blk */
/*
 * @clusters_to_del: clusters being removed from the rightmost leaf
 * @path:            path to the current rightmost leaf
 * @new_last_eb:     out - buffer head (with an extra reference) of the
 *                   new last extent block, or NULL if the current last
 *                   leaf survives the delete.  Caller must brelse().
 */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
				       unsigned int clusters_to_del,
				       struct ocfs2_path *path,
				       struct buffer_head **new_last_eb)
{
	int next_free, ret = 0;
	u32 cpos;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;

	*new_last_eb = NULL;

	/* we have no tree, so of course, no last_eb. */
	if (!path->p_tree_depth)
		goto out;

	/* trunc to zero special case - this makes tree_depth = 0
	 * regardless of what it is.  */
	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
		goto out;

	el = path_leaf_el(path);
	BUG_ON(!el->l_next_free_rec);

	/*
	 * Make sure that this extent list will actually be empty
	 * after we clear away the data. We can shortcut out if
	 * there's more than one non-empty extent in the
	 * list. Otherwise, a check of the remaining extent is
	 * necessary.
	 */
	next_free = le16_to_cpu(el->l_next_free_rec);
	rec = NULL;
	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		if (next_free > 2)
			goto out;

		/* We may have a valid extent in index 1, check it. */
		if (next_free == 2)
			rec = &el->l_recs[1];

		/*
		 * Fall through - no more nonempty extents, so we want
		 * to delete this leaf.
		 */
	} else {
		if (next_free > 1)
			goto out;

		rec = &el->l_recs[0];
	}

	if (rec) {
		/*
		 * Check if we'll only be trimming off the end of this
		 * cluster.
		 */
		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
			goto out;
	}

	/* The leaf will be emptied - its left neighbor becomes the
	 * new rightmost leaf. */
	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	eb = (struct ocfs2_extent_block *) bh->b_data;
	el = &eb->h_list;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		ret = -EROFS;
		goto out;
	}

	/* Take a reference for the caller; the brelse below drops ours. */
	*new_last_eb = bh;
	get_bh(*new_last_eb);
	mlog(0, "returning block %llu, (cpos: %u)\n",
	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
out:
	brelse(bh);

	return ret;
}
6059 
6060 /*
6061  * Trim some clusters off the rightmost edge of a tree. Only called
6062  * during truncate.
6063  *
6064  * The caller needs to:
6065  *   - start journaling of each path component.
6066  *   - compute and fully set up any new last ext block
6067  */
6068 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6069 			   handle_t *handle, struct ocfs2_truncate_context *tc,
6070 			   u32 clusters_to_del, u64 *delete_start)
6071 {
6072 	int ret, i, index = path->p_tree_depth;
6073 	u32 new_edge = 0;
6074 	u64 deleted_eb = 0;
6075 	struct buffer_head *bh;
6076 	struct ocfs2_extent_list *el;
6077 	struct ocfs2_extent_rec *rec;
6078 
6079 	*delete_start = 0;
6080 
6081 	while (index >= 0) {
6082 		bh = path->p_node[index].bh;
6083 		el = path->p_node[index].el;
6084 
6085 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
6086 		     index,  (unsigned long long)bh->b_blocknr);
6087 
6088 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6089 
6090 		if (index !=
6091 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6092 			ocfs2_error(inode->i_sb,
6093 				    "Inode %lu has invalid ext. block %llu",
6094 				    inode->i_ino,
6095 				    (unsigned long long)bh->b_blocknr);
6096 			ret = -EROFS;
6097 			goto out;
6098 		}
6099 
6100 find_tail_record:
6101 		i = le16_to_cpu(el->l_next_free_rec) - 1;
6102 		rec = &el->l_recs[i];
6103 
6104 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6105 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6106 		     ocfs2_rec_clusters(el, rec),
6107 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6108 		     le16_to_cpu(el->l_next_free_rec));
6109 
6110 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6111 
6112 		if (le16_to_cpu(el->l_tree_depth) == 0) {
6113 			/*
6114 			 * If the leaf block contains a single empty
6115 			 * extent and no records, we can just remove
6116 			 * the block.
6117 			 */
6118 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
6119 				memset(rec, 0,
6120 				       sizeof(struct ocfs2_extent_rec));
6121 				el->l_next_free_rec = cpu_to_le16(0);
6122 
6123 				goto delete;
6124 			}
6125 
6126 			/*
6127 			 * Remove any empty extents by shifting things
6128 			 * left. That should make life much easier on
6129 			 * the code below. This condition is rare
6130 			 * enough that we shouldn't see a performance
6131 			 * hit.
6132 			 */
6133 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6134 				le16_add_cpu(&el->l_next_free_rec, -1);
6135 
6136 				for(i = 0;
6137 				    i < le16_to_cpu(el->l_next_free_rec); i++)
6138 					el->l_recs[i] = el->l_recs[i + 1];
6139 
6140 				memset(&el->l_recs[i], 0,
6141 				       sizeof(struct ocfs2_extent_rec));
6142 
6143 				/*
6144 				 * We've modified our extent list. The
6145 				 * simplest way to handle this change
6146 				 * is to being the search from the
6147 				 * start again.
6148 				 */
6149 				goto find_tail_record;
6150 			}
6151 
6152 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6153 
6154 			/*
6155 			 * We'll use "new_edge" on our way back up the
6156 			 * tree to know what our rightmost cpos is.
6157 			 */
6158 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
6159 			new_edge += le32_to_cpu(rec->e_cpos);
6160 
6161 			/*
6162 			 * The caller will use this to delete data blocks.
6163 			 */
6164 			*delete_start = le64_to_cpu(rec->e_blkno)
6165 				+ ocfs2_clusters_to_blocks(inode->i_sb,
6166 					le16_to_cpu(rec->e_leaf_clusters));
6167 
6168 			/*
6169 			 * If it's now empty, remove this record.
6170 			 */
6171 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6172 				memset(rec, 0,
6173 				       sizeof(struct ocfs2_extent_rec));
6174 				le16_add_cpu(&el->l_next_free_rec, -1);
6175 			}
6176 		} else {
6177 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6178 				memset(rec, 0,
6179 				       sizeof(struct ocfs2_extent_rec));
6180 				le16_add_cpu(&el->l_next_free_rec, -1);
6181 
6182 				goto delete;
6183 			}
6184 
6185 			/* Can this actually happen? */
6186 			if (le16_to_cpu(el->l_next_free_rec) == 0)
6187 				goto delete;
6188 
6189 			/*
6190 			 * We never actually deleted any clusters
6191 			 * because our leaf was empty. There's no
6192 			 * reason to adjust the rightmost edge then.
6193 			 */
6194 			if (new_edge == 0)
6195 				goto delete;
6196 
6197 			rec->e_int_clusters = cpu_to_le32(new_edge);
6198 			le32_add_cpu(&rec->e_int_clusters,
6199 				     -le32_to_cpu(rec->e_cpos));
6200 
6201 			 /*
6202 			  * A deleted child record should have been
6203 			  * caught above.
6204 			  */
6205 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6206 		}
6207 
6208 delete:
6209 		ret = ocfs2_journal_dirty(handle, bh);
6210 		if (ret) {
6211 			mlog_errno(ret);
6212 			goto out;
6213 		}
6214 
6215 		mlog(0, "extent list container %llu, after: record %d: "
6216 		     "(%u, %u, %llu), next = %u.\n",
6217 		     (unsigned long long)bh->b_blocknr, i,
6218 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6219 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6220 		     le16_to_cpu(el->l_next_free_rec));
6221 
6222 		/*
6223 		 * We must be careful to only attempt delete of an
6224 		 * extent block (and not the root inode block).
6225 		 */
6226 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6227 			struct ocfs2_extent_block *eb =
6228 				(struct ocfs2_extent_block *)bh->b_data;
6229 
6230 			/*
6231 			 * Save this for use when processing the
6232 			 * parent block.
6233 			 */
6234 			deleted_eb = le64_to_cpu(eb->h_blkno);
6235 
6236 			mlog(0, "deleting this extent block.\n");
6237 
6238 			ocfs2_remove_from_cache(inode, bh);
6239 
6240 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6241 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6242 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6243 
6244 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6245 			/* An error here is not fatal. */
6246 			if (ret < 0)
6247 				mlog_errno(ret);
6248 		} else {
6249 			deleted_eb = 0;
6250 		}
6251 
6252 		index--;
6253 	}
6254 
6255 	ret = 0;
6256 out:
6257 	return ret;
6258 }
6259 
/*
 * Remove @clusters_to_del clusters from the rightmost branch of the
 * extent tree described by @path, within a single running transaction.
 * Updates the in-memory and on-disk cluster counts, trims the tree via
 * ocfs2_trim_tree(), and queues any freed contiguous run to the
 * truncate log.  Called once per pass of ocfs2_commit_truncate();
 * @handle must already cover the journal credits for this pass.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_do_truncate(struct ocfs2_super *osb,
			     unsigned int clusters_to_del,
			     struct inode *inode,
			     struct buffer_head *fe_bh,
			     handle_t *handle,
			     struct ocfs2_truncate_context *tc,
			     struct ocfs2_path *path)
{
	int status;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *last_eb = NULL;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	u64 delete_blk = 0;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	/*
	 * Locate the extent block which will be rightmost after this
	 * pass.  last_eb_bh may come back NULL - the code below only
	 * updates i_last_eb_blk/h_next_leaf_blk when it is set.
	 */
	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
					     path, &last_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Each component will be touched, so we might as well journal
	 * here to avoid having to handle errors later.
	 */
	status = ocfs2_journal_access_path(inode, handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb_bh) {
		status = ocfs2_journal_access(handle, inode, last_eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	}

	el = &(fe->id2.i_list);

	/*
	 * Lower levels depend on this never happening, but it's best
	 * to check it up here before changing the tree.
	 */
	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %lu has an empty extent record, depth %u\n",
			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
		status = -EROFS;
		goto bail;
	}

	/* Update both the in-memory and the on-disk cluster counts. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
				      clusters_to_del;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
	inode->i_blocks = ocfs2_inode_sector_count(inode);

	/*
	 * Do the actual tree surgery.  A freed contiguous block run, if
	 * any, comes back in delete_blk and is handed to the truncate
	 * log below.
	 */
	status = ocfs2_trim_tree(inode, path, handle, tc,
				 clusters_to_del, &delete_blk);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	if (le32_to_cpu(fe->i_clusters) == 0) {
		/* trunc to zero is a special case. */
		el->l_tree_depth = 0;
		fe->i_last_eb_blk = 0;
	} else if (last_eb)
		fe->i_last_eb_blk = last_eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb) {
		/* If there will be a new last extent block, then by
		 * definition, there cannot be any leaves to the right of
		 * him. */
		last_eb->h_next_leaf_blk = 0;
		status = ocfs2_journal_dirty(handle, last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	if (delete_blk) {
		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
						   clusters_to_del);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}
	status = 0;
bail:

	mlog_exit(status);
	return status;
}
6372 
/*
 * walk_page_buffers() callback used in writeback mode: mark the buffer
 * uptodate and dirty.  Always returns 0 so the walk continues.
 */
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
6379 
/*
 * walk_page_buffers() callback used in ordered-data mode: mark the
 * buffer uptodate and dirty, then hand it to
 * ocfs2_journal_dirty_data() and propagate its return value.
 */
static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return ocfs2_journal_dirty_data(handle, bh);
}
6386 
6387 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6388 				     unsigned int from, unsigned int to,
6389 				     struct page *page, int zero, u64 *phys)
6390 {
6391 	int ret, partial = 0;
6392 
6393 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6394 	if (ret)
6395 		mlog_errno(ret);
6396 
6397 	if (zero)
6398 		zero_user_segment(page, from, to);
6399 
6400 	/*
6401 	 * Need to set the buffers we zero'd into uptodate
6402 	 * here if they aren't - ocfs2_map_page_blocks()
6403 	 * might've skipped some
6404 	 */
6405 	if (ocfs2_should_order_data(inode)) {
6406 		ret = walk_page_buffers(handle,
6407 					page_buffers(page),
6408 					from, to, &partial,
6409 					ocfs2_ordered_zero_func);
6410 		if (ret < 0)
6411 			mlog_errno(ret);
6412 	} else {
6413 		ret = walk_page_buffers(handle, page_buffers(page),
6414 					from, to, &partial,
6415 					ocfs2_writeback_zero_func);
6416 		if (ret < 0)
6417 			mlog_errno(ret);
6418 	}
6419 
6420 	if (!partial)
6421 		SetPageUptodate(page);
6422 
6423 	flush_dcache_page(page);
6424 }
6425 
6426 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6427 				     loff_t end, struct page **pages,
6428 				     int numpages, u64 phys, handle_t *handle)
6429 {
6430 	int i;
6431 	struct page *page;
6432 	unsigned int from, to = PAGE_CACHE_SIZE;
6433 	struct super_block *sb = inode->i_sb;
6434 
6435 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6436 
6437 	if (numpages == 0)
6438 		goto out;
6439 
6440 	to = PAGE_CACHE_SIZE;
6441 	for(i = 0; i < numpages; i++) {
6442 		page = pages[i];
6443 
6444 		from = start & (PAGE_CACHE_SIZE - 1);
6445 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6446 			to = end & (PAGE_CACHE_SIZE - 1);
6447 
6448 		BUG_ON(from > PAGE_CACHE_SIZE);
6449 		BUG_ON(to > PAGE_CACHE_SIZE);
6450 
6451 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6452 					 &phys);
6453 
6454 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6455 	}
6456 out:
6457 	if (pages)
6458 		ocfs2_unlock_and_free_pages(pages, numpages);
6459 }
6460 
6461 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6462 				struct page **pages, int *num)
6463 {
6464 	int numpages, ret = 0;
6465 	struct super_block *sb = inode->i_sb;
6466 	struct address_space *mapping = inode->i_mapping;
6467 	unsigned long index;
6468 	loff_t last_page_bytes;
6469 
6470 	BUG_ON(start > end);
6471 
6472 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6473 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6474 
6475 	numpages = 0;
6476 	last_page_bytes = PAGE_ALIGN(end);
6477 	index = start >> PAGE_CACHE_SHIFT;
6478 	do {
6479 		pages[numpages] = grab_cache_page(mapping, index);
6480 		if (!pages[numpages]) {
6481 			ret = -ENOMEM;
6482 			mlog_errno(ret);
6483 			goto out;
6484 		}
6485 
6486 		numpages++;
6487 		index++;
6488 	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6489 
6490 out:
6491 	if (ret != 0) {
6492 		if (pages)
6493 			ocfs2_unlock_and_free_pages(pages, numpages);
6494 		numpages = 0;
6495 	}
6496 
6497 	*num = numpages;
6498 
6499 	return ret;
6500 }
6501 
6502 /*
6503  * Zero the area past i_size but still within an allocated
6504  * cluster. This avoids exposing nonzero data on subsequent file
6505  * extends.
6506  *
 * We need to call this before i_size is updated on the inode because
 * otherwise block_write_full_page() will skip writeout of pages past
 * i_size. The range_start/range_end parameters are passed for this
 * reason.
6510  */
6511 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6512 				  u64 range_start, u64 range_end)
6513 {
6514 	int ret = 0, numpages;
6515 	struct page **pages = NULL;
6516 	u64 phys;
6517 	unsigned int ext_flags;
6518 	struct super_block *sb = inode->i_sb;
6519 
6520 	/*
6521 	 * File systems which don't support sparse files zero on every
6522 	 * extend.
6523 	 */
6524 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6525 		return 0;
6526 
6527 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
6528 			sizeof(struct page *), GFP_NOFS);
6529 	if (pages == NULL) {
6530 		ret = -ENOMEM;
6531 		mlog_errno(ret);
6532 		goto out;
6533 	}
6534 
6535 	if (range_start == range_end)
6536 		goto out;
6537 
6538 	ret = ocfs2_extent_map_get_blocks(inode,
6539 					  range_start >> sb->s_blocksize_bits,
6540 					  &phys, NULL, &ext_flags);
6541 	if (ret) {
6542 		mlog_errno(ret);
6543 		goto out;
6544 	}
6545 
6546 	/*
6547 	 * Tail is a hole, or is marked unwritten. In either case, we
6548 	 * can count on read and write to return/push zero's.
6549 	 */
6550 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6551 		goto out;
6552 
6553 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6554 				   &numpages);
6555 	if (ret) {
6556 		mlog_errno(ret);
6557 		goto out;
6558 	}
6559 
6560 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6561 				 numpages, phys, handle);
6562 
6563 	/*
6564 	 * Initiate writeout of the pages we zero'd here. We don't
6565 	 * wait on them - the truncate_inode_pages() call later will
6566 	 * do that for us.
6567 	 */
6568 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
6569 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
6570 	if (ret)
6571 		mlog_errno(ret);
6572 
6573 out:
6574 	if (pages)
6575 		kfree(pages);
6576 
6577 	return ret;
6578 }
6579 
6580 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6581 {
6582 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6583 
6584 	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6585 }
6586 
6587 void ocfs2_dinode_new_extent_list(struct inode *inode,
6588 				  struct ocfs2_dinode *di)
6589 {
6590 	ocfs2_zero_dinode_id2(inode, di);
6591 	di->id2.i_list.l_tree_depth = 0;
6592 	di->id2.i_list.l_next_free_rec = 0;
6593 	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
6594 }
6595 
6596 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6597 {
6598 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
6599 	struct ocfs2_inline_data *idata = &di->id2.i_data;
6600 
6601 	spin_lock(&oi->ip_lock);
6602 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6603 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6604 	spin_unlock(&oi->ip_lock);
6605 
6606 	/*
6607 	 * We clear the entire i_data structure here so that all
6608 	 * fields can be properly initialized.
6609 	 */
6610 	ocfs2_zero_dinode_id2(inode, di);
6611 
6612 	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
6613 }
6614 
/*
 * Convert an inline-data inode to extent-based storage.  When the
 * inode holds data (i_size != 0), a cluster is reserved and claimed,
 * the inline bytes are read into the page cache and dirtied through
 * the journal, and a single extent record covering the new cluster is
 * inserted.  OCFS2_INLINE_DATA_FL is cleared and an empty extent list
 * is written into the dinode in all cases.
 *
 * Returns 0 on success or a negative error code.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;

	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		/* One page pointer per page in the cluster we'll fill. */
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		/*
		 * Page 0 already carries the inline data (zero arg is
		 * i > 0), the rest of the cluster is zeroed.
		 */
		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	/* Drop the inline flag in memory and on disk, then reset the list. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh,
						 0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
6751 
6752 /*
6753  * It is expected, that by the time you call this function,
6754  * inode->i_size and fe->i_size have been adjusted.
6755  *
6756  * WARNING: This will kfree the truncate context
6757  */
/*
 * Drive the truncate: repeatedly find the rightmost tree branch,
 * compute how many clusters of its tail record fall past the new
 * i_size, and remove them in a fresh transaction per pass, until no
 * allocation extends past the new size.
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	/* tl_sem: nonzero while we hold tl_inode->i_mutex (see bail:). */
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, new_highest_cpos, range;
	struct ocfs2_extent_list *el;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_path *path = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;

	mlog_entry_void();

	/* First cluster offset that must remain allocated. */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						     i_size_read(inode));

	path = ocfs2_new_path(fe_bh, &di->id2.i_list);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(inode, path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	/* range = one past the last cluster covered by the tail record. */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	range = le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
		clusters_to_del = 0;
	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
	} else if (range > new_highest_cpos) {
		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  new_highest_cpos;
	} else {
		status = 0;
		goto bail;
	}

	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log.  */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
						(struct ocfs2_dinode *)fe_bh->b_data,
						el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
				   tc, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	/* Re-find the rightmost path from scratch on the next pass. */
	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	ocfs2_run_deallocs(osb, &tc->tc_dealloc);

	ocfs2_free_path(path);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}
6911 
6912 /*
6913  * Expects the inode to already be locked.
6914  */
/*
 * Allocate and initialize the truncate context for a later
 * ocfs2_commit_truncate() call.  For trees with depth > 0, the current
 * last extent block is read and validated, and its buffer_head is
 * stashed in the context.  On error, any partially-built context is
 * freed and *tc is left NULL.
 */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc)
{
	int status;
	unsigned int new_i_clusters;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct buffer_head *last_eb_bh = NULL;

	mlog_entry_void();

	*tc = NULL;

	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						  i_size_read(inode));
	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
	     (unsigned long long)le64_to_cpu(fe->i_size));

	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
	if (!(*tc)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);

	if (fe->id2.i_list.l_tree_depth) {
		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);

			/*
			 * Release here; tc_last_eb_bh is still NULL
			 * (kzalloc), so the cleanup at bail: won't
			 * touch this buffer again.
			 */
			brelse(last_eb_bh);
			status = -EIO;
			goto bail;
		}
	}

	/* Context takes ownership of last_eb_bh (may be NULL). */
	(*tc)->tc_last_eb_bh = last_eb_bh;

	status = 0;
bail:
	if (status < 0) {
		if (*tc)
			ocfs2_free_truncate_context(*tc);
		*tc = NULL;
	}
	mlog_exit_void();
	return status;
}
6975 
6976 /*
6977  * 'start' is inclusive, 'end' is not.
6978  */
/*
 * Zero bytes [@start, @end) of an inline-data inode's in-inode data
 * area within a journalled transaction.  @end is clamped to i_size.
 * When @trunc is set, i_size is also pulled back to @start.  Timestamps
 * and block counts are refreshed on success.
 *
 * Returns 0 on success or a negative error code.
 */
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
			  unsigned int start, unsigned int end, int trunc)
{
	int ret;
	unsigned int numbytes;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	if (end > i_size_read(inode))
		end = i_size_read(inode);

	BUG_ON(start >= end);

	/*
	 * Sanity-check that memory, disk and superblock all agree that
	 * this inode uses inline data; refuse (read-only error) if not.
	 */
	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
	    !ocfs2_supports_inline_data(osb)) {
		ocfs2_error(inode->i_sb,
			    "Inline data flags for inode %llu don't agree! "
			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    le16_to_cpu(di->i_dyn_features),
			    OCFS2_I(inode)->ip_dyn_features,
			    osb->s_feature_incompat);
		ret = -EROFS;
		goto out;
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	numbytes = end - start;
	memset(idata->id_data + start, 0, numbytes);

	/*
	 * No need to worry about the data page here - it's been
	 * truncated already and inline data doesn't need it for
	 * pushing zero's to disk, so we'll let readpage pick it up
	 * later.
	 */
	if (trunc) {
		i_size_write(inode, start);
		di->i_size = cpu_to_le64(start);
	}

	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	return ret;
}
7050 
7051 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7052 {
7053 	/*
7054 	 * The caller is responsible for completing deallocation
7055 	 * before freeing the context.
7056 	 */
7057 	if (tc->tc_dealloc.c_first_suballocator != NULL)
7058 		mlog(ML_NOTICE,
7059 		     "Truncate completion has non-empty dealloc context\n");
7060 
7061 	if (tc->tc_last_eb_bh)
7062 		brelse(tc->tc_last_eb_bh);
7063 
7064 	kfree(tc);
7065 }
7066