xref: /openbmc/linux/fs/ocfs2/alloc.c (revision 811f933df1e55615fd0bb4818f31e3868a8e6e23)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "aops.h"
39 #include "dlmglue.h"
40 #include "extent_map.h"
41 #include "inode.h"
42 #include "journal.h"
43 #include "localalloc.h"
44 #include "suballoc.h"
45 #include "sysfile.h"
46 #include "file.h"
47 #include "super.h"
48 #include "uptodate.h"
49 
50 #include "buffer_head_io.h"
51 
52 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
53 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
54 					 struct ocfs2_extent_block *eb);
55 
56 /*
57  * Structures which describe a path through a btree, and functions to
58  * manipulate them.
59  *
60  * The idea here is to be as generic as possible with the tree
61  * manipulation code.
62  */
/* One level of a btree path: the on-disk block plus the extent list
 * embedded in that block's data. */
struct ocfs2_path_item {
	struct buffer_head		*bh;	/* backing block; NULL in unused slots */
	struct ocfs2_extent_list	*el;	/* extent list within bh->b_data */
};

/* Maximum number of levels (root through leaf) a path may describe. */
#define OCFS2_MAX_PATH_DEPTH	5

/* A walk from the tree root (index 0) down to a leaf (index
 * p_tree_depth). Slots past the current depth are unused. */
struct ocfs2_path {
	int			p_tree_depth;
	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
};

/* Accessors for the topmost (root) and deepest (leaf) path items. */
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
/* Number of valid items in the path, root included. */
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
80 
81 /*
82  * Reset the actual path elements so that we can re-use the structure
83  * to build another path. Generally, this involves freeing the buffer
84  * heads.
85  */
86 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
87 {
88 	int i, start = 0, depth = 0;
89 	struct ocfs2_path_item *node;
90 
91 	if (keep_root)
92 		start = 1;
93 
94 	for(i = start; i < path_num_items(path); i++) {
95 		node = &path->p_node[i];
96 
97 		brelse(node->bh);
98 		node->bh = NULL;
99 		node->el = NULL;
100 	}
101 
102 	/*
103 	 * Tree depth may change during truncate, or insert. If we're
104 	 * keeping the root extent list, then make sure that our path
105 	 * structure reflects the proper depth.
106 	 */
107 	if (keep_root)
108 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
109 
110 	path->p_tree_depth = depth;
111 }
112 
/* Release every buffer head referenced by the path, then the path itself. */
static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
120 
121 /*
122  * All the elements of src into dest. After this call, src could be freed
123  * without affecting dest.
124  *
125  * Both paths should have the same root. Any non-root elements of dest
126  * will be freed.
127  */
128 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
129 {
130 	int i;
131 
132 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
133 	BUG_ON(path_root_el(dest) != path_root_el(src));
134 
135 	ocfs2_reinit_path(dest, 1);
136 
137 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
138 		dest->p_node[i].bh = src->p_node[i].bh;
139 		dest->p_node[i].el = src->p_node[i].el;
140 
141 		if (dest->p_node[i].bh)
142 			get_bh(dest->p_node[i].bh);
143 	}
144 }
145 
146 /*
147  * Make the *dest path the same as src and re-initialize src path to
148  * have a root only.
149  */
150 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
151 {
152 	int i;
153 
154 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
155 
156 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
157 		brelse(dest->p_node[i].bh);
158 
159 		dest->p_node[i].bh = src->p_node[i].bh;
160 		dest->p_node[i].el = src->p_node[i].el;
161 
162 		src->p_node[i].bh = NULL;
163 		src->p_node[i].el = NULL;
164 	}
165 }
166 
167 /*
168  * Insert an extent block at given index.
169  *
170  * This will not take an additional reference on eb_bh.
171  */
172 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
173 					struct buffer_head *eb_bh)
174 {
175 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
176 
177 	/*
178 	 * Right now, no root bh is an extent block, so this helps
179 	 * catch code errors with dinode trees. The assertion can be
180 	 * safely removed if we ever need to insert extent block
181 	 * structures at the root.
182 	 */
183 	BUG_ON(index == 0);
184 
185 	path->p_node[index].bh = eb_bh;
186 	path->p_node[index].el = &eb->h_list;
187 }
188 
189 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
190 					 struct ocfs2_extent_list *root_el)
191 {
192 	struct ocfs2_path *path;
193 
194 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
195 
196 	path = kzalloc(sizeof(*path), GFP_NOFS);
197 	if (path) {
198 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
199 		get_bh(root_bh);
200 		path_root_bh(path) = root_bh;
201 		path_root_el(path) = root_el;
202 	}
203 
204 	return path;
205 }
206 
207 /*
208  * Allocate and initialize a new path based on a disk inode tree.
209  */
210 static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
211 {
212 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
213 	struct ocfs2_extent_list *el = &di->id2.i_list;
214 
215 	return ocfs2_new_path(di_bh, el);
216 }
217 
218 /*
219  * Convenience function to journal all components in a path.
220  */
221 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
222 				     struct ocfs2_path *path)
223 {
224 	int i, ret = 0;
225 
226 	if (!path)
227 		goto out;
228 
229 	for(i = 0; i < path_num_items(path); i++) {
230 		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
231 					   OCFS2_JOURNAL_ACCESS_WRITE);
232 		if (ret < 0) {
233 			mlog_errno(ret);
234 			goto out;
235 		}
236 	}
237 
238 out:
239 	return ret;
240 }
241 
242 /*
243  * Return the index of the extent record which contains cluster #v_cluster.
244  * -1 is returned if it was not found.
245  *
246  * Should work fine on interior and exterior nodes.
247  */
248 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
249 {
250 	int ret = -1;
251 	int i;
252 	struct ocfs2_extent_rec *rec;
253 	u32 rec_end, rec_start, clusters;
254 
255 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
256 		rec = &el->l_recs[i];
257 
258 		rec_start = le32_to_cpu(rec->e_cpos);
259 		clusters = ocfs2_rec_clusters(el, rec);
260 
261 		rec_end = rec_start + clusters;
262 
263 		if (v_cluster >= rec_start && v_cluster < rec_end) {
264 			ret = i;
265 			break;
266 		}
267 	}
268 
269 	return ret;
270 }
271 
/* How a new record abuts an existing one: not at all, at its left
 * edge, at its right edge, or on both sides. */
enum ocfs2_contig_type {
	CONTIG_NONE = 0,
	CONTIG_LEFT,
	CONTIG_RIGHT,
	CONTIG_LEFTRIGHT,
};
278 
279 
280 /*
281  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
282  * ocfs2_extent_contig only work properly against leaf nodes!
283  */
284 static int ocfs2_block_extent_contig(struct super_block *sb,
285 				     struct ocfs2_extent_rec *ext,
286 				     u64 blkno)
287 {
288 	u64 blk_end = le64_to_cpu(ext->e_blkno);
289 
290 	blk_end += ocfs2_clusters_to_blocks(sb,
291 				    le16_to_cpu(ext->e_leaf_clusters));
292 
293 	return blkno == blk_end;
294 }
295 
296 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
297 				  struct ocfs2_extent_rec *right)
298 {
299 	u32 left_range;
300 
301 	left_range = le32_to_cpu(left->e_cpos) +
302 		le16_to_cpu(left->e_leaf_clusters);
303 
304 	return (left_range == le32_to_cpu(right->e_cpos));
305 }
306 
307 static enum ocfs2_contig_type
308 	ocfs2_extent_contig(struct inode *inode,
309 			    struct ocfs2_extent_rec *ext,
310 			    struct ocfs2_extent_rec *insert_rec)
311 {
312 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
313 
314 	/*
315 	 * Refuse to coalesce extent records with different flag
316 	 * fields - we don't want to mix unwritten extents with user
317 	 * data.
318 	 */
319 	if (ext->e_flags != insert_rec->e_flags)
320 		return CONTIG_NONE;
321 
322 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
323 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
324 			return CONTIG_RIGHT;
325 
326 	blkno = le64_to_cpu(ext->e_blkno);
327 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
328 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
329 		return CONTIG_LEFT;
330 
331 	return CONTIG_NONE;
332 }
333 
/*
 * NOTE: We can have pretty much any combination of contiguousness and
 * appending.
 *
 * The usefulness of APPEND_TAIL is more in that it lets us know that
 * we'll have to update the path to that leaf.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};

/* Which side of an existing record an insert splits, if any. */
enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};

/* Pre-computed description of an insert into the extent btree. */
struct ocfs2_insert_type {
	enum ocfs2_split_type	ins_split;
	enum ocfs2_append_type	ins_appending;
	enum ocfs2_contig_type	ins_contig;
	int			ins_contig_index;	/* index of the record we're contiguous with */
	int			ins_tree_depth;
};

/* State carried through a record-merge operation.
 * NOTE(review): field semantics inferred from names; the code using
 * this struct is outside the visible chunk - confirm against callers. */
struct ocfs2_merge_ctxt {
	enum ocfs2_contig_type	c_contig_type;
	int			c_has_empty_extent;
	int			c_split_covers_rec;
};
365 
366 /*
367  * How many free extents have we got before we need more meta data?
368  */
369 int ocfs2_num_free_extents(struct ocfs2_super *osb,
370 			   struct inode *inode,
371 			   struct buffer_head *bh)
372 {
373 	int retval;
374 	struct ocfs2_extent_list *el;
375 	struct ocfs2_extent_block *eb;
376 	struct buffer_head *eb_bh = NULL;
377 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *)bh->b_data;
378 
379 	mlog_entry_void();
380 
381 	if (!OCFS2_IS_VALID_DINODE(fe)) {
382 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
383 		retval = -EIO;
384 		goto bail;
385 	}
386 
387 	if (fe->i_last_eb_blk) {
388 		retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
389 					  &eb_bh, OCFS2_BH_CACHED, inode);
390 		if (retval < 0) {
391 			mlog_errno(retval);
392 			goto bail;
393 		}
394 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
395 		el = &eb->h_list;
396 	} else
397 		el = &fe->id2.i_list;
398 
399 	BUG_ON(el->l_tree_depth != 0);
400 
401 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
402 bail:
403 	if (eb_bh)
404 		brelse(eb_bh);
405 
406 	mlog_exit(retval);
407 	return retval;
408 }
409 
/* expects array to already be allocated
 *
 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
 * l_count for you
 *
 * Claims 'wanted' metadata blocks from meta_ac, attaches a buffer
 * head for each to bhs[], journals them with ACCESS_CREATE and
 * initializes a minimal extent block in each. On error, every
 * buffer head placed in bhs[] is released and the slot re-NULLed.
 */
static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     int wanted,
				     struct ocfs2_alloc_context *meta_ac,
				     struct buffer_head *bhs[])
{
	int count, status, i;
	u16 suballoc_bit_start;
	u32 num_got;
	u64 first_blkno;
	struct ocfs2_extent_block *eb;

	mlog_entry_void();

	count = 0;
	while (count < wanted) {
		/* The allocator may satisfy the request in several
		 * smaller, contiguous chunks - loop until we have
		 * all 'wanted' blocks. */
		status = ocfs2_claim_metadata(osb,
					      handle,
					      meta_ac,
					      wanted - count,
					      &suballoc_bit_start,
					      &num_got,
					      &first_blkno);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		for(i = count;  i < (num_got + count); i++) {
			bhs[i] = sb_getblk(osb->sb, first_blkno);
			if (bhs[i] == NULL) {
				status = -EIO;
				mlog_errno(status);
				goto bail;
			}
			/* Freshly allocated block - mark it uptodate
			 * without reading it from disk. */
			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);

			status = ocfs2_journal_access(handle, inode, bhs[i],
						      OCFS2_JOURNAL_ACCESS_CREATE);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}

			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
			/* Ok, setup the minimal stuff here. */
			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
			eb->h_blkno = cpu_to_le64(first_blkno);
			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
			eb->h_list.l_count =
				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));

			/* The chunk is contiguous, so walk both the
			 * suballocator bit and the block number. */
			suballoc_bit_start++;
			first_blkno++;

			/* We'll also be dirtied by the caller, so
			 * this isn't absolutely necessary. */
			status = ocfs2_journal_dirty(handle, bhs[i]);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}

		count += num_got;
	}

	status = 0;
bail:
	/* On error, leave bhs[] in the all-NULL state the caller gave us. */
	if (status < 0) {
		for(i = 0; i < wanted; i++) {
			if (bhs[i])
				brelse(bhs[i]);
			bhs[i] = NULL;
		}
	}
	mlog_exit(status);
	return status;
}
498 
499 /*
500  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
501  *
502  * Returns the sum of the rightmost extent rec logical offset and
503  * cluster count.
504  *
505  * ocfs2_add_branch() uses this to determine what logical cluster
506  * value should be populated into the leftmost new branch records.
507  *
508  * ocfs2_shift_tree_depth() uses this to determine the # clusters
509  * value for the new topmost tree record.
510  */
511 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
512 {
513 	int i;
514 
515 	i = le16_to_cpu(el->l_next_free_rec) - 1;
516 
517 	return le32_to_cpu(el->l_recs[i].e_cpos) +
518 		ocfs2_rec_clusters(el, &el->l_recs[i]);
519 }
520 
521 /*
522  * Add an entire tree branch to our inode. eb_bh is the extent block
523  * to start at, if we don't want to start the branch at the dinode
524  * structure.
525  *
526  * last_eb_bh is required as we have to update it's next_leaf pointer
527  * for the new last extent block.
528  *
529  * the new branch will be 'empty' in the sense that every block will
530  * contain a single record with cluster count == 0.
531  */
532 static int ocfs2_add_branch(struct ocfs2_super *osb,
533 			    handle_t *handle,
534 			    struct inode *inode,
535 			    struct buffer_head *fe_bh,
536 			    struct buffer_head *eb_bh,
537 			    struct buffer_head **last_eb_bh,
538 			    struct ocfs2_alloc_context *meta_ac)
539 {
540 	int status, new_blocks, i;
541 	u64 next_blkno, new_last_eb_blk;
542 	struct buffer_head *bh;
543 	struct buffer_head **new_eb_bhs = NULL;
544 	struct ocfs2_dinode *fe;
545 	struct ocfs2_extent_block *eb;
546 	struct ocfs2_extent_list  *eb_el;
547 	struct ocfs2_extent_list  *el;
548 	u32 new_cpos;
549 
550 	mlog_entry_void();
551 
552 	BUG_ON(!last_eb_bh || !*last_eb_bh);
553 
554 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
555 
556 	if (eb_bh) {
557 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
558 		el = &eb->h_list;
559 	} else
560 		el = &fe->id2.i_list;
561 
562 	/* we never add a branch to a leaf. */
563 	BUG_ON(!el->l_tree_depth);
564 
565 	new_blocks = le16_to_cpu(el->l_tree_depth);
566 
567 	/* allocate the number of new eb blocks we need */
568 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
569 			     GFP_KERNEL);
570 	if (!new_eb_bhs) {
571 		status = -ENOMEM;
572 		mlog_errno(status);
573 		goto bail;
574 	}
575 
576 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
577 					   meta_ac, new_eb_bhs);
578 	if (status < 0) {
579 		mlog_errno(status);
580 		goto bail;
581 	}
582 
583 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
584 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
585 
586 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
587 	 * linked with the rest of the tree.
588 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
589 	 *
590 	 * when we leave the loop, new_last_eb_blk will point to the
591 	 * newest leaf, and next_blkno will point to the topmost extent
592 	 * block. */
593 	next_blkno = new_last_eb_blk = 0;
594 	for(i = 0; i < new_blocks; i++) {
595 		bh = new_eb_bhs[i];
596 		eb = (struct ocfs2_extent_block *) bh->b_data;
597 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
598 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
599 			status = -EIO;
600 			goto bail;
601 		}
602 		eb_el = &eb->h_list;
603 
604 		status = ocfs2_journal_access(handle, inode, bh,
605 					      OCFS2_JOURNAL_ACCESS_CREATE);
606 		if (status < 0) {
607 			mlog_errno(status);
608 			goto bail;
609 		}
610 
611 		eb->h_next_leaf_blk = 0;
612 		eb_el->l_tree_depth = cpu_to_le16(i);
613 		eb_el->l_next_free_rec = cpu_to_le16(1);
614 		/*
615 		 * This actually counts as an empty extent as
616 		 * c_clusters == 0
617 		 */
618 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
619 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
620 		/*
621 		 * eb_el isn't always an interior node, but even leaf
622 		 * nodes want a zero'd flags and reserved field so
623 		 * this gets the whole 32 bits regardless of use.
624 		 */
625 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
626 		if (!eb_el->l_tree_depth)
627 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
628 
629 		status = ocfs2_journal_dirty(handle, bh);
630 		if (status < 0) {
631 			mlog_errno(status);
632 			goto bail;
633 		}
634 
635 		next_blkno = le64_to_cpu(eb->h_blkno);
636 	}
637 
638 	/* This is a bit hairy. We want to update up to three blocks
639 	 * here without leaving any of them in an inconsistent state
640 	 * in case of error. We don't have to worry about
641 	 * journal_dirty erroring as it won't unless we've aborted the
642 	 * handle (in which case we would never be here) so reserving
643 	 * the write with journal_access is all we need to do. */
644 	status = ocfs2_journal_access(handle, inode, *last_eb_bh,
645 				      OCFS2_JOURNAL_ACCESS_WRITE);
646 	if (status < 0) {
647 		mlog_errno(status);
648 		goto bail;
649 	}
650 	status = ocfs2_journal_access(handle, inode, fe_bh,
651 				      OCFS2_JOURNAL_ACCESS_WRITE);
652 	if (status < 0) {
653 		mlog_errno(status);
654 		goto bail;
655 	}
656 	if (eb_bh) {
657 		status = ocfs2_journal_access(handle, inode, eb_bh,
658 					      OCFS2_JOURNAL_ACCESS_WRITE);
659 		if (status < 0) {
660 			mlog_errno(status);
661 			goto bail;
662 		}
663 	}
664 
665 	/* Link the new branch into the rest of the tree (el will
666 	 * either be on the fe, or the extent block passed in. */
667 	i = le16_to_cpu(el->l_next_free_rec);
668 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
669 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
670 	el->l_recs[i].e_int_clusters = 0;
671 	le16_add_cpu(&el->l_next_free_rec, 1);
672 
673 	/* fe needs a new last extent block pointer, as does the
674 	 * next_leaf on the previously last-extent-block. */
675 	fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk);
676 
677 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
678 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
679 
680 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
681 	if (status < 0)
682 		mlog_errno(status);
683 	status = ocfs2_journal_dirty(handle, fe_bh);
684 	if (status < 0)
685 		mlog_errno(status);
686 	if (eb_bh) {
687 		status = ocfs2_journal_dirty(handle, eb_bh);
688 		if (status < 0)
689 			mlog_errno(status);
690 	}
691 
692 	/*
693 	 * Some callers want to track the rightmost leaf so pass it
694 	 * back here.
695 	 */
696 	brelse(*last_eb_bh);
697 	get_bh(new_eb_bhs[0]);
698 	*last_eb_bh = new_eb_bhs[0];
699 
700 	status = 0;
701 bail:
702 	if (new_eb_bhs) {
703 		for (i = 0; i < new_blocks; i++)
704 			if (new_eb_bhs[i])
705 				brelse(new_eb_bhs[i]);
706 		kfree(new_eb_bhs);
707 	}
708 
709 	mlog_exit(status);
710 	return status;
711 }
712 
713 /*
714  * adds another level to the allocation tree.
715  * returns back the new extent block so you can add a branch to it
716  * after this call.
717  */
718 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
719 				  handle_t *handle,
720 				  struct inode *inode,
721 				  struct buffer_head *fe_bh,
722 				  struct ocfs2_alloc_context *meta_ac,
723 				  struct buffer_head **ret_new_eb_bh)
724 {
725 	int status, i;
726 	u32 new_clusters;
727 	struct buffer_head *new_eb_bh = NULL;
728 	struct ocfs2_dinode *fe;
729 	struct ocfs2_extent_block *eb;
730 	struct ocfs2_extent_list  *fe_el;
731 	struct ocfs2_extent_list  *eb_el;
732 
733 	mlog_entry_void();
734 
735 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
736 					   &new_eb_bh);
737 	if (status < 0) {
738 		mlog_errno(status);
739 		goto bail;
740 	}
741 
742 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
743 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
744 		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
745 		status = -EIO;
746 		goto bail;
747 	}
748 
749 	eb_el = &eb->h_list;
750 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
751 	fe_el = &fe->id2.i_list;
752 
753 	status = ocfs2_journal_access(handle, inode, new_eb_bh,
754 				      OCFS2_JOURNAL_ACCESS_CREATE);
755 	if (status < 0) {
756 		mlog_errno(status);
757 		goto bail;
758 	}
759 
760 	/* copy the fe data into the new extent block */
761 	eb_el->l_tree_depth = fe_el->l_tree_depth;
762 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
763 	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
764 		eb_el->l_recs[i] = fe_el->l_recs[i];
765 
766 	status = ocfs2_journal_dirty(handle, new_eb_bh);
767 	if (status < 0) {
768 		mlog_errno(status);
769 		goto bail;
770 	}
771 
772 	status = ocfs2_journal_access(handle, inode, fe_bh,
773 				      OCFS2_JOURNAL_ACCESS_WRITE);
774 	if (status < 0) {
775 		mlog_errno(status);
776 		goto bail;
777 	}
778 
779 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
780 
781 	/* update fe now */
782 	le16_add_cpu(&fe_el->l_tree_depth, 1);
783 	fe_el->l_recs[0].e_cpos = 0;
784 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
785 	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
786 	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
787 		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
788 	fe_el->l_next_free_rec = cpu_to_le16(1);
789 
790 	/* If this is our 1st tree depth shift, then last_eb_blk
791 	 * becomes the allocated extent block */
792 	if (fe_el->l_tree_depth == cpu_to_le16(1))
793 		fe->i_last_eb_blk = eb->h_blkno;
794 
795 	status = ocfs2_journal_dirty(handle, fe_bh);
796 	if (status < 0) {
797 		mlog_errno(status);
798 		goto bail;
799 	}
800 
801 	*ret_new_eb_bh = new_eb_bh;
802 	new_eb_bh = NULL;
803 	status = 0;
804 bail:
805 	if (new_eb_bh)
806 		brelse(new_eb_bh);
807 
808 	mlog_exit(status);
809 	return status;
810 }
811 
812 /*
813  * Should only be called when there is no space left in any of the
814  * leaf nodes. What we want to do is find the lowest tree depth
815  * non-leaf extent block with room for new records. There are three
816  * valid results of this search:
817  *
818  * 1) a lowest extent block is found, then we pass it back in
819  *    *lowest_eb_bh and return '0'
820  *
821  * 2) the search fails to find anything, but the dinode has room. We
822  *    pass NULL back in *lowest_eb_bh, but still return '0'
823  *
824  * 3) the search fails to find anything AND the dinode is full, in
825  *    which case we return > 0
826  *
827  * return status < 0 indicates an error.
828  */
829 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
830 				    struct inode *inode,
831 				    struct buffer_head *fe_bh,
832 				    struct buffer_head **target_bh)
833 {
834 	int status = 0, i;
835 	u64 blkno;
836 	struct ocfs2_dinode *fe;
837 	struct ocfs2_extent_block *eb;
838 	struct ocfs2_extent_list  *el;
839 	struct buffer_head *bh = NULL;
840 	struct buffer_head *lowest_bh = NULL;
841 
842 	mlog_entry_void();
843 
844 	*target_bh = NULL;
845 
846 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
847 	el = &fe->id2.i_list;
848 
849 	while(le16_to_cpu(el->l_tree_depth) > 1) {
850 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
851 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
852 				    "extent list (next_free_rec == 0)",
853 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
854 			status = -EIO;
855 			goto bail;
856 		}
857 		i = le16_to_cpu(el->l_next_free_rec) - 1;
858 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
859 		if (!blkno) {
860 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
861 				    "list where extent # %d has no physical "
862 				    "block start",
863 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
864 			status = -EIO;
865 			goto bail;
866 		}
867 
868 		if (bh) {
869 			brelse(bh);
870 			bh = NULL;
871 		}
872 
873 		status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED,
874 					  inode);
875 		if (status < 0) {
876 			mlog_errno(status);
877 			goto bail;
878 		}
879 
880 		eb = (struct ocfs2_extent_block *) bh->b_data;
881 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
882 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
883 			status = -EIO;
884 			goto bail;
885 		}
886 		el = &eb->h_list;
887 
888 		if (le16_to_cpu(el->l_next_free_rec) <
889 		    le16_to_cpu(el->l_count)) {
890 			if (lowest_bh)
891 				brelse(lowest_bh);
892 			lowest_bh = bh;
893 			get_bh(lowest_bh);
894 		}
895 	}
896 
897 	/* If we didn't find one and the fe doesn't have any room,
898 	 * then return '1' */
899 	if (!lowest_bh
900 	    && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count))
901 		status = 1;
902 
903 	*target_bh = lowest_bh;
904 bail:
905 	if (bh)
906 		brelse(bh);
907 
908 	mlog_exit(status);
909 	return status;
910 }
911 
/*
 * Grow a b-tree so that it has more records.
 *
 * We might shift the tree depth in which case existing paths should
 * be considered invalid.
 *
 * Tree depth after the grow is returned via *final_depth.
 *
 * *last_eb_bh will be updated by ocfs2_add_branch().
 */
static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
			   struct buffer_head *di_bh, int *final_depth,
			   struct buffer_head **last_eb_bh,
			   struct ocfs2_alloc_context *meta_ac)
{
	int ret, shift;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	int depth = le16_to_cpu(di->id2.i_list.l_tree_depth);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct buffer_head *bh = NULL;

	/* Growing always consumes metadata blocks. */
	BUG_ON(meta_ac == NULL);

	/* shift > 0 means the whole tree (dinode included) is full
	 * and another level is needed; bh is the branch point if a
	 * non-full interior node was found. */
	shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh);
	if (shift < 0) {
		ret = shift;
		mlog_errno(ret);
		goto out;
	}

	/* We traveled all the way to the bottom of the allocation tree
	 * and didn't find room for any more extents - we need to add
	 * another tree level */
	if (shift) {
		BUG_ON(bh);
		mlog(0, "need to shift tree depth (current = %d)\n", depth);

		/* ocfs2_shift_tree_depth will return us a buffer with
		 * the new extent block (so we can pass that to
		 * ocfs2_add_branch). */
		ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh,
					     meta_ac, &bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}
		depth++;
		if (depth == 1) {
			/*
			 * Special case: we have room now if we shifted from
			 * tree_depth 0, so no more work needs to be done.
			 *
			 * We won't be calling add_branch, so pass
			 * back *last_eb_bh as the new leaf. At depth
			 * zero, it should always be null so there's
			 * no reason to brelse.
			 */
			BUG_ON(*last_eb_bh);
			get_bh(bh);
			*last_eb_bh = bh;
			goto out;
		}
	}

	/* call ocfs2_add_branch to add the final part of the tree with
	 * the new data. */
	mlog(0, "add branch. bh = %p\n", bh);
	ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh,
			       meta_ac);
	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out:
	if (final_depth)
		*final_depth = depth;
	brelse(bh);
	return ret;
}
992 
993 /*
994  * This function will discard the rightmost extent record.
995  */
996 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
997 {
998 	int next_free = le16_to_cpu(el->l_next_free_rec);
999 	int count = le16_to_cpu(el->l_count);
1000 	unsigned int num_bytes;
1001 
1002 	BUG_ON(!next_free);
1003 	/* This will cause us to go off the end of our extent list. */
1004 	BUG_ON(next_free >= count);
1005 
1006 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1007 
1008 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1009 }
1010 
/*
 * Insert insert_rec into the leaf list el, keeping records ordered by
 * e_cpos. Space comes either from a free slot at the tail or from
 * consuming the empty record at index 0 - the BUG_ON below insists
 * one of the two exists.
 */
static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
			      struct ocfs2_extent_rec *insert_rec)
{
	int i, insert_index, next_free, has_empty, num_bytes;
	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
	struct ocfs2_extent_rec *rec;

	next_free = le16_to_cpu(el->l_next_free_rec);
	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);

	BUG_ON(!next_free);

	/* The tree code before us didn't allow enough room in the leaf. */
	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);

	/*
	 * The easiest way to approach this is to just remove the
	 * empty extent and temporarily decrement next_free.
	 */
	if (has_empty) {
		/*
		 * If next_free was 1 (only an empty extent), this
		 * loop won't execute, which is fine. We still want
		 * the decrement above to happen.
		 */
		for(i = 0; i < (next_free - 1); i++)
			el->l_recs[i] = el->l_recs[i+1];

		next_free--;
	}

	/*
	 * Figure out what the new record index should be: first slot
	 * whose e_cpos is greater than the inserted cpos.
	 */
	for(i = 0; i < next_free; i++) {
		rec = &el->l_recs[i];

		if (insert_cpos < le32_to_cpu(rec->e_cpos))
			break;
	}
	insert_index = i;

	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));

	BUG_ON(insert_index < 0);
	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
	BUG_ON(insert_index > next_free);

	/*
	 * No need to memmove if we're just adding to the tail.
	 */
	if (insert_index != next_free) {
		BUG_ON(next_free >= le16_to_cpu(el->l_count));

		num_bytes = next_free - insert_index;
		num_bytes *= sizeof(struct ocfs2_extent_rec);
		memmove(&el->l_recs[insert_index + 1],
			&el->l_recs[insert_index],
			num_bytes);
	}

	/*
	 * Either we had an empty extent, and need to re-increment or
	 * there was no empty extent on a non full rightmost leaf node,
	 * in which case we still need to increment.
	 */
	next_free++;
	el->l_next_free_rec = cpu_to_le16(next_free);
	/*
	 * Make sure none of the math above just messed up our tree.
	 */
	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));

	el->l_recs[insert_index] = *insert_rec;

}
1088 
1089 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1090 {
1091 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1092 
1093 	BUG_ON(num_recs == 0);
1094 
1095 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1096 		num_recs--;
1097 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1098 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1099 		memset(&el->l_recs[num_recs], 0,
1100 		       sizeof(struct ocfs2_extent_rec));
1101 		el->l_next_free_rec = cpu_to_le16(num_recs);
1102 	}
1103 }
1104 
1105 /*
1106  * Create an empty extent record .
1107  *
1108  * l_next_free_rec may be updated.
1109  *
1110  * If an empty extent already exists do nothing.
1111  */
1112 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1113 {
1114 	int next_free = le16_to_cpu(el->l_next_free_rec);
1115 
1116 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1117 
1118 	if (next_free == 0)
1119 		goto set_and_inc;
1120 
1121 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1122 		return;
1123 
1124 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1125 			"Asked to create an empty extent in a full list:\n"
1126 			"count = %u, tree depth = %u",
1127 			le16_to_cpu(el->l_count),
1128 			le16_to_cpu(el->l_tree_depth));
1129 
1130 	ocfs2_shift_records_right(el);
1131 
1132 set_and_inc:
1133 	le16_add_cpu(&el->l_next_free_rec, 1);
1134 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1135 }
1136 
1137 /*
1138  * For a rotation which involves two leaf nodes, the "root node" is
1139  * the lowest level tree node which contains a path to both leafs. This
1140  * resulting set of information can be used to form a complete "subtree"
1141  *
1142  * This function is passed two full paths from the dinode down to a
1143  * pair of adjacent leaves. It's task is to figure out which path
1144  * index contains the subtree root - this can be the root index itself
1145  * in a worst-case rotation.
1146  *
1147  * The array index of the subtree root is passed back.
1148  */
1149 static int ocfs2_find_subtree_root(struct inode *inode,
1150 				   struct ocfs2_path *left,
1151 				   struct ocfs2_path *right)
1152 {
1153 	int i = 0;
1154 
1155 	/*
1156 	 * Check that the caller passed in two paths from the same tree.
1157 	 */
1158 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1159 
1160 	do {
1161 		i++;
1162 
1163 		/*
1164 		 * The caller didn't pass two adjacent paths.
1165 		 */
1166 		mlog_bug_on_msg(i > left->p_tree_depth,
1167 				"Inode %lu, left depth %u, right depth %u\n"
1168 				"left leaf blk %llu, right leaf blk %llu\n",
1169 				inode->i_ino, left->p_tree_depth,
1170 				right->p_tree_depth,
1171 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1172 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1173 	} while (left->p_node[i].bh->b_blocknr ==
1174 		 right->p_node[i].bh->b_blocknr);
1175 
1176 	return i - 1;
1177 }
1178 
/* Per-level callback invoked by __ocfs2_find_path() for each extent
 * block read during the descent. */
typedef void (path_insert_t)(void *, struct buffer_head *);
1180 
1181 /*
1182  * Traverse a btree path in search of cpos, starting at root_el.
1183  *
1184  * This code can be called with a cpos larger than the tree, in which
1185  * case it will return the rightmost path.
1186  */
static int __ocfs2_find_path(struct inode *inode,
			     struct ocfs2_extent_list *root_el, u32 cpos,
			     path_insert_t *func, void *data)
{
	int i, ret = 0;
	u32 range;
	u64 blkno;
	struct buffer_head *bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	el = root_el;
	/* Descend one level per iteration until we hit a leaf (depth 0). */
	while (el->l_tree_depth) {
		/* An interior node with no records is on-disk corruption. */
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent list at "
				    "depth %u\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth));
			ret = -EROFS;
			goto out;

		}

		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
			rec = &el->l_recs[i];

			/*
			 * In the case that cpos is off the allocation
			 * tree, this should just wind up returning the
			 * rightmost record.
			 */
			range = le32_to_cpu(rec->e_cpos) +
				ocfs2_rec_clusters(el, rec);
			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
			    break;
		}

		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
		if (blkno == 0) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad blkno in extent list "
				    "at depth %u (index %d)\n",
				    (unsigned long long)oi->ip_blkno,
				    le16_to_cpu(el->l_tree_depth), i);
			ret = -EROFS;
			goto out;
		}

		/* Drop the previous level's buffer before reading the next. */
		brelse(bh);
		bh = NULL;
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
				       &bh, OCFS2_BH_CACHED, inode);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) bh->b_data;
		el = &eb->h_list;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
			ret = -EIO;
			goto out;
		}

		if (le16_to_cpu(el->l_next_free_rec) >
		    le16_to_cpu(el->l_count)) {
			ocfs2_error(inode->i_sb,
				    "Inode %llu has bad count in extent list "
				    "at block %llu (next free=%u, count=%u)\n",
				    (unsigned long long)oi->ip_blkno,
				    (unsigned long long)bh->b_blocknr,
				    le16_to_cpu(el->l_next_free_rec),
				    le16_to_cpu(el->l_count));
			ret = -EROFS;
			goto out;
		}

		/* Let the caller record this level of the path. */
		if (func)
			func(data, bh);
	}

out:
	/*
	 * Catch any trailing bh that the loop didn't handle.
	 */
	brelse(bh);

	return ret;
}
1280 
1281 /*
1282  * Given an initialized path (that is, it has a valid root extent
1283  * list), this function will traverse the btree in search of the path
1284  * which would contain cpos.
1285  *
1286  * The path traveled is recorded in the path structure.
1287  *
1288  * Note that this will not do any comparisons on leaf node extent
1289  * records, so it will work fine in the case that we just added a tree
1290  * branch.
1291  */
struct find_path_data {
	int index;		/* next p_node slot to fill (starts at 1; 0 is the root) */
	struct ocfs2_path *path;	/* path being populated by the walk */
};
1296 static void find_path_ins(void *data, struct buffer_head *bh)
1297 {
1298 	struct find_path_data *fp = data;
1299 
1300 	get_bh(bh);
1301 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1302 	fp->index++;
1303 }
1304 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1305 			   u32 cpos)
1306 {
1307 	struct find_path_data data;
1308 
1309 	data.index = 1;
1310 	data.path = path;
1311 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1312 				 find_path_ins, &data);
1313 }
1314 
1315 static void find_leaf_ins(void *data, struct buffer_head *bh)
1316 {
1317 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1318 	struct ocfs2_extent_list *el = &eb->h_list;
1319 	struct buffer_head **ret = data;
1320 
1321 	/* We want to retain only the leaf block. */
1322 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1323 		get_bh(bh);
1324 		*ret = bh;
1325 	}
1326 }
1327 /*
1328  * Find the leaf block in the tree which would contain cpos. No
1329  * checking of the actual leaf is done.
1330  *
1331  * Some paths want to call this instead of allocating a path structure
1332  * and calling ocfs2_find_path().
1333  *
1334  * This function doesn't handle non btree extent lists.
1335  */
1336 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1337 		    u32 cpos, struct buffer_head **leaf_bh)
1338 {
1339 	int ret;
1340 	struct buffer_head *bh = NULL;
1341 
1342 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1343 	if (ret) {
1344 		mlog_errno(ret);
1345 		goto out;
1346 	}
1347 
1348 	*leaf_bh = bh;
1349 out:
1350 	return ret;
1351 }
1352 
1353 /*
1354  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1355  *
1356  * Basically, we've moved stuff around at the bottom of the tree and
1357  * we need to fix up the extent records above the changes to reflect
1358  * the new changes.
1359  *
1360  * left_rec: the record on the left.
1361  * left_child_el: is the child list pointed to by left_rec
1362  * right_rec: the record to the right of left_rec
1363  * right_child_el: is the child list pointed to by right_rec
1364  *
1365  * By definition, this only works on interior nodes.
1366  */
static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
				  struct ocfs2_extent_list *left_child_el,
				  struct ocfs2_extent_rec *right_rec,
				  struct ocfs2_extent_list *right_child_el)
{
	u32 left_clusters, right_end;

	/*
	 * Interior nodes never have holes. Their cpos is the cpos of
	 * the leftmost record in their child list. Their cluster
	 * count covers the full theoretical range of their child list
	 * - the range between their cpos and the cpos of the record
	 * immediately to their right.
	 */
	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
		/*
		 * Slot 0 of the right child is the empty extent, so
		 * the first nonempty record (slot 1) defines the real
		 * boundary between the two children.
		 */
		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
	}
	left_clusters -= le32_to_cpu(left_rec->e_cpos);
	left_rec->e_int_clusters = cpu_to_le32(left_clusters);

	/*
	 * Calculate the rightmost cluster count boundary before
	 * moving cpos - we will need to adjust clusters after
	 * updating e_cpos to keep the same highest cluster count.
	 */
	right_end = le32_to_cpu(right_rec->e_cpos);
	right_end += le32_to_cpu(right_rec->e_int_clusters);

	/* right_rec now starts where left_rec's coverage ends... */
	right_rec->e_cpos = left_rec->e_cpos;
	le32_add_cpu(&right_rec->e_cpos, left_clusters);

	/* ...and shrinks so that its end cpos is unchanged. */
	right_end -= le32_to_cpu(right_rec->e_cpos);
	right_rec->e_int_clusters = cpu_to_le32(right_end);
}
1403 
1404 /*
1405  * Adjust the adjacent root node records involved in a
1406  * rotation. left_el_blkno is passed in as a key so that we can easily
1407  * find it's index in the root list.
1408  */
static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
				      struct ocfs2_extent_list *left_el,
				      struct ocfs2_extent_list *right_el,
				      u64 left_el_blkno)
{
	int i;

	/* The root must sit strictly above the child lists it indexes. */
	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
	       le16_to_cpu(left_el->l_tree_depth));

	/* Locate left_el's record in the root by block number. */
	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
			break;
	}

	/*
	 * The path walking code should have never returned a root and
	 * two paths which are not adjacent.
	 */
	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));

	/* Fix up records i and i+1 to reflect the rotated children. */
	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
				      &root_el->l_recs[i + 1], right_el);
}
1433 
1434 /*
1435  * We've changed a leaf block (in right_path) and need to reflect that
1436  * change back up the subtree.
1437  *
1438  * This happens in multiple places:
1439  *   - When we've moved an extent record from the left path leaf to the right
1440  *     path leaf to make room for an empty extent in the left path leaf.
1441  *   - When our insert into the right path leaf is at the leftmost edge
1442  *     and requires an update of the path immediately to it's left. This
1443  *     can occur at the end of some types of rotation and appending inserts.
1444  *   - When we've adjusted the last extent record in the left path leaf and the
1445  *     1st extent record in the right path leaf during cross extent block merge.
1446  */
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
				       struct ocfs2_path *left_path,
				       struct ocfs2_path *right_path,
				       int subtree_index)
{
	int ret, i, idx;
	struct ocfs2_extent_list *el, *left_el, *right_el;
	struct ocfs2_extent_rec *left_rec, *right_rec;
	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;

	/*
	 * Update the counts and position values within all the
	 * interior nodes to reflect the leaf rotation we just did.
	 *
	 * The root node is handled below the loop.
	 *
	 * We begin the loop with right_el and left_el pointing to the
	 * leaf lists and work our way up.
	 *
	 * NOTE: within this loop, left_el and right_el always refer
	 * to the *child* lists.
	 */
	left_el = path_leaf_el(left_path);
	right_el = path_leaf_el(right_path);
	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
		mlog(0, "Adjust records at index %u\n", i);

		/*
		 * One nice property of knowing that all of these
		 * nodes are below the root is that we only deal with
		 * the leftmost right node record and the rightmost
		 * left node record.
		 */
		el = left_path->p_node[i].el;
		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
		left_rec = &el->l_recs[idx];

		el = right_path->p_node[i].el;
		right_rec = &el->l_recs[0];

		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
					      right_el);

		/* Dirty failures are logged only - no further recovery here. */
		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		/*
		 * Setup our list pointers now so that the current
		 * parents become children in the next iteration.
		 */
		left_el = left_path->p_node[i].el;
		right_el = right_path->p_node[i].el;
	}

	/*
	 * At the root node, adjust the two adjacent records which
	 * begin our path to the leaves.
	 */

	el = left_path->p_node[subtree_index].el;
	left_el = left_path->p_node[subtree_index + 1].el;
	right_el = right_path->p_node[subtree_index + 1].el;

	ocfs2_adjust_root_records(el, left_el, right_el,
				  left_path->p_node[subtree_index + 1].bh->b_blocknr);

	/* NOTE(review): root_bh already holds this value from its
	 * initializer above; this re-assignment is redundant. */
	root_bh = left_path->p_node[subtree_index].bh;

	ret = ocfs2_journal_dirty(handle, root_bh);
	if (ret)
		mlog_errno(ret);
}
1524 
1525 static int ocfs2_rotate_subtree_right(struct inode *inode,
1526 				      handle_t *handle,
1527 				      struct ocfs2_path *left_path,
1528 				      struct ocfs2_path *right_path,
1529 				      int subtree_index)
1530 {
1531 	int ret, i;
1532 	struct buffer_head *right_leaf_bh;
1533 	struct buffer_head *left_leaf_bh = NULL;
1534 	struct buffer_head *root_bh;
1535 	struct ocfs2_extent_list *right_el, *left_el;
1536 	struct ocfs2_extent_rec move_rec;
1537 
1538 	left_leaf_bh = path_leaf_bh(left_path);
1539 	left_el = path_leaf_el(left_path);
1540 
1541 	if (left_el->l_next_free_rec != left_el->l_count) {
1542 		ocfs2_error(inode->i_sb,
1543 			    "Inode %llu has non-full interior leaf node %llu"
1544 			    "(next free = %u)",
1545 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
1546 			    (unsigned long long)left_leaf_bh->b_blocknr,
1547 			    le16_to_cpu(left_el->l_next_free_rec));
1548 		return -EROFS;
1549 	}
1550 
1551 	/*
1552 	 * This extent block may already have an empty record, so we
1553 	 * return early if so.
1554 	 */
1555 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
1556 		return 0;
1557 
1558 	root_bh = left_path->p_node[subtree_index].bh;
1559 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
1560 
1561 	ret = ocfs2_journal_access(handle, inode, root_bh,
1562 				   OCFS2_JOURNAL_ACCESS_WRITE);
1563 	if (ret) {
1564 		mlog_errno(ret);
1565 		goto out;
1566 	}
1567 
1568 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
1569 		ret = ocfs2_journal_access(handle, inode,
1570 					   right_path->p_node[i].bh,
1571 					   OCFS2_JOURNAL_ACCESS_WRITE);
1572 		if (ret) {
1573 			mlog_errno(ret);
1574 			goto out;
1575 		}
1576 
1577 		ret = ocfs2_journal_access(handle, inode,
1578 					   left_path->p_node[i].bh,
1579 					   OCFS2_JOURNAL_ACCESS_WRITE);
1580 		if (ret) {
1581 			mlog_errno(ret);
1582 			goto out;
1583 		}
1584 	}
1585 
1586 	right_leaf_bh = path_leaf_bh(right_path);
1587 	right_el = path_leaf_el(right_path);
1588 
1589 	/* This is a code error, not a disk corruption. */
1590 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
1591 			"because rightmost leaf block %llu is empty\n",
1592 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
1593 			(unsigned long long)right_leaf_bh->b_blocknr);
1594 
1595 	ocfs2_create_empty_extent(right_el);
1596 
1597 	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
1598 	if (ret) {
1599 		mlog_errno(ret);
1600 		goto out;
1601 	}
1602 
1603 	/* Do the copy now. */
1604 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
1605 	move_rec = left_el->l_recs[i];
1606 	right_el->l_recs[0] = move_rec;
1607 
1608 	/*
1609 	 * Clear out the record we just copied and shift everything
1610 	 * over, leaving an empty extent in the left leaf.
1611 	 *
1612 	 * We temporarily subtract from next_free_rec so that the
1613 	 * shift will lose the tail record (which is now defunct).
1614 	 */
1615 	le16_add_cpu(&left_el->l_next_free_rec, -1);
1616 	ocfs2_shift_records_right(left_el);
1617 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1618 	le16_add_cpu(&left_el->l_next_free_rec, 1);
1619 
1620 	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
1621 	if (ret) {
1622 		mlog_errno(ret);
1623 		goto out;
1624 	}
1625 
1626 	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
1627 				subtree_index);
1628 
1629 out:
1630 	return ret;
1631 }
1632 
1633 /*
1634  * Given a full path, determine what cpos value would return us a path
1635  * containing the leaf immediately to the left of the current one.
1636  *
1637  * Will return zero if the path passed in is already the leftmost path.
1638  */
static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
					 struct ocfs2_path *path, u32 *cpos)
{
	int i, j, ret = 0;
	u64 blkno;
	struct ocfs2_extent_list *el;

	BUG_ON(path->p_tree_depth == 0);

	*cpos = 0;

	blkno = path_leaf_bh(path)->b_blocknr;

	/* Start at the tree node just above the leaf and work our way up. */
	i = path->p_tree_depth - 1;
	while (i >= 0) {
		el = path->p_node[i].el;

		/*
		 * Find the extent record just before the one in our
		 * path.
		 */
		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
				if (j == 0) {
					if (i == 0) {
						/*
						 * We've determined that the
						 * path specified is already
						 * the leftmost one - return a
						 * cpos of zero.
						 */
						goto out;
					}
					/*
					 * The leftmost record points to our
					 * leaf - we need to travel up the
					 * tree one level.
					 */
					goto next_node;
				}

				/*
				 * Last cluster covered by the record to
				 * our left: its cpos plus its cluster
				 * count, minus one.
				 */
				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
				*cpos = *cpos + ocfs2_rec_clusters(el,
							   &el->l_recs[j - 1]);
				*cpos = *cpos - 1;
				goto out;
			}
		}

		/*
		 * If we got here, we never found a valid node where
		 * the tree indicated one should be.
		 */
		ocfs2_error(sb,
			    "Invalid extent tree at extent block %llu\n",
			    (unsigned long long)blkno);
		ret = -EROFS;
		goto out;

next_node:
		blkno = path->p_node[i].bh->b_blocknr;
		i--;
	}

out:
	return ret;
}
1707 
1708 /*
1709  * Extend the transaction by enough credits to complete the rotation,
1710  * and still leave at least the original number of credits allocated
1711  * to this transaction.
1712  */
1713 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
1714 					   int op_credits,
1715 					   struct ocfs2_path *path)
1716 {
1717 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
1718 
1719 	if (handle->h_buffer_credits < credits)
1720 		return ocfs2_extend_trans(handle, credits);
1721 
1722 	return 0;
1723 }
1724 
1725 /*
1726  * Trap the case where we're inserting into the theoretical range past
1727  * the _actual_ left leaf range. Otherwise, we'll rotate a record
1728  * whose cpos is less than ours into the right leaf.
1729  *
1730  * It's only necessary to look at the rightmost record of the left
1731  * leaf because the logic that calls us should ensure that the
1732  * theoretical ranges in the path components above the leaves are
1733  * correct.
1734  */
1735 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
1736 						 u32 insert_cpos)
1737 {
1738 	struct ocfs2_extent_list *left_el;
1739 	struct ocfs2_extent_rec *rec;
1740 	int next_free;
1741 
1742 	left_el = path_leaf_el(left_path);
1743 	next_free = le16_to_cpu(left_el->l_next_free_rec);
1744 	rec = &left_el->l_recs[next_free - 1];
1745 
1746 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
1747 		return 1;
1748 	return 0;
1749 }
1750 
1751 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
1752 {
1753 	int next_free = le16_to_cpu(el->l_next_free_rec);
1754 	unsigned int range;
1755 	struct ocfs2_extent_rec *rec;
1756 
1757 	if (next_free == 0)
1758 		return 0;
1759 
1760 	rec = &el->l_recs[0];
1761 	if (ocfs2_is_empty_extent(rec)) {
1762 		/* Empty list. */
1763 		if (next_free == 1)
1764 			return 0;
1765 		rec = &el->l_recs[1];
1766 	}
1767 
1768 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
1769 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1770 		return 1;
1771 	return 0;
1772 }
1773 
1774 /*
1775  * Rotate all the records in a btree right one record, starting at insert_cpos.
1776  *
1777  * The path to the rightmost leaf should be passed in.
1778  *
1779  * The array is assumed to be large enough to hold an entire path (tree depth).
1780  *
1781  * Upon succesful return from this function:
1782  *
1783  * - The 'right_path' array will contain a path to the leaf block
1784  *   whose range contains e_cpos.
1785  * - That leaf block will have a single empty extent in list index 0.
1786  * - In the case that the rotation requires a post-insert update,
1787  *   *ret_left_path will contain a valid path which can be passed to
1788  *   ocfs2_insert_path().
1789  */
static int ocfs2_rotate_tree_right(struct inode *inode,
				   handle_t *handle,
				   enum ocfs2_split_type split,
				   u32 insert_cpos,
				   struct ocfs2_path *right_path,
				   struct ocfs2_path **ret_left_path)
{
	/* orig_credits: entry-time snapshot, handed to
	 * ocfs2_extend_rotate_transaction() on each pass. */
	int ret, start, orig_credits = handle->h_buffer_credits;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/* left_path shares the root of right_path; it gets re-filled
	 * each time through the loop below. */
	left_path = ocfs2_new_path(path_root_bh(right_path),
				   path_root_el(right_path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);

	/*
	 * What we want to do here is:
	 *
	 * 1) Start with the rightmost path.
	 *
	 * 2) Determine a path to the leaf block directly to the left
	 *    of that leaf.
	 *
	 * 3) Determine the 'subtree root' - the lowest level tree node
	 *    which contains a path to both leaves.
	 *
	 * 4) Rotate the subtree.
	 *
	 * 5) Find the next subtree by considering the left path to be
	 *    the new right path.
	 *
	 * The check at the top of this while loop also accepts
	 * insert_cpos == cpos because cpos is only a _theoretical_
	 * value to get us the left path - insert_cpos might very well
	 * be filling that hole.
	 *
	 * Stop at a cpos of '0' because we either started at the
	 * leftmost branch (i.e., a tree with one branch and a
	 * rotation inside of it), or we've gone as far as we can in
	 * rotating subtrees.
	 */
	while (cpos && insert_cpos <= cpos) {
		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
		     insert_cpos, cpos);

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog_bug_on_msg(path_leaf_bh(left_path) ==
				path_leaf_bh(right_path),
				"Inode %lu: error during insert of %u "
				"(left path cpos %u) results in two identical "
				"paths ending at %llu\n",
				inode->i_ino, insert_cpos, cpos,
				(unsigned long long)
				path_leaf_bh(left_path)->b_blocknr);

		if (split == SPLIT_NONE &&
		    ocfs2_rotate_requires_path_adjustment(left_path,
							  insert_cpos)) {

			/*
			 * We've rotated the tree as much as we
			 * should. The rest is up to
			 * ocfs2_insert_path() to complete, after the
			 * record insertion. We indicate this
			 * situation by returning the left path.
			 *
			 * The reason we don't adjust the records here
			 * before the record insert is that an error
			 * later might break the rule where a parent
			 * record e_cpos will reflect the actual
			 * e_cpos of the 1st nonempty record of the
			 * child list.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		start = ocfs2_find_subtree_root(inode, left_path, right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     start,
		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, start,
						      orig_credits, right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
						 right_path, start);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (split != SPLIT_NONE &&
		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
						insert_cpos)) {
			/*
			 * A rotate moves the rightmost left leaf
			 * record over to the leftmost right leaf
			 * slot. If we're doing an extent split
			 * instead of a real insert, then we have to
			 * check that the extent to be split wasn't
			 * just moved over. If it was, then we can
			 * exit here, passing left_path back -
			 * ocfs2_split_extent() is smart enough to
			 * search both leaves.
			 */
			*ret_left_path = left_path;
			goto out_ret_path;
		}

		/*
		 * There is no need to re-read the next right path
		 * as we know that it'll be our current left
		 * path. Optimize by copying values instead.
		 */
		ocfs2_mv_path(right_path, left_path);

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	/* Exit taken when left_path is NOT handed back to the caller. */
	ocfs2_free_path(left_path);

out_ret_path:
	return ret;
}
1946 
/*
 * Walk the interior nodes of a rightmost path and recompute each
 * level's last record length (e_int_clusters) so that it extends to
 * the end cpos of the leaf's last extent.
 */
static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
				      struct ocfs2_path *path)
{
	int i, idx;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;
	u32 range;

	/* Path should always be rightmost. */
	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	BUG_ON(eb->h_next_leaf_blk != 0ULL);

	el = &eb->h_list;
	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
	idx = le16_to_cpu(el->l_next_free_rec) - 1;
	rec = &el->l_recs[idx];
	/* End cpos of the leaf's last extent. */
	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);

	for (i = 0; i < path->p_tree_depth; i++) {
		el = path->p_node[i].el;
		idx = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[idx];

		/* Stretch this level's last record to cover up to 'range'. */
		rec->e_int_clusters = cpu_to_le32(range);
		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));

		ocfs2_journal_dirty(handle, path->p_node[i].bh);
	}
}
1977 
/*
 * Remove the blocks on @path from level @unlink_start downward from
 * the metadata cache and queue each empty extent block for freeing in
 * @dealloc.
 */
static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      struct ocfs2_path *path, int unlink_start)
{
	int ret, i;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh;

	for(i = unlink_start; i < path_num_items(path); i++) {
		bh = path->p_node[i].bh;

		eb = (struct ocfs2_extent_block *)bh->b_data;
		/*
		 * Not all nodes might have had their final count
		 * decremented by the caller - handle this here.
		 */
		el = &eb->h_list;
		if (le16_to_cpu(el->l_next_free_rec) > 1) {
			/* Still-populated block: log, dirty and drop it
			 * from cache, but do NOT queue it for freeing. */
			mlog(ML_ERROR,
			     "Inode %llu, attempted to remove extent block "
			     "%llu with %u records\n",
			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
			     (unsigned long long)le64_to_cpu(eb->h_blkno),
			     le16_to_cpu(el->l_next_free_rec));

			ocfs2_journal_dirty(handle, bh);
			ocfs2_remove_from_cache(inode, bh);
			continue;
		}

		/* Zero is endian-neutral, so direct assignment is fine. */
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		ocfs2_journal_dirty(handle, bh);

		/* Queue the extent block for later freeing. */
		ret = ocfs2_cache_extent_block_free(dealloc, eb);
		if (ret)
			mlog_errno(ret);

		ocfs2_remove_from_cache(inode, bh);
	}
}
2021 
/*
 * Detach the right branch at @subtree_index from the tree: zero its
 * record in the subtree root and shrink the root list, clear the left
 * leaf's next-leaf pointer, then unlink and queue for freeing every
 * block on the right path below the subtree root.
 */
static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
				 struct ocfs2_path *left_path,
				 struct ocfs2_path *right_path,
				 int subtree_index,
				 struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int i;
	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_block *eb;

	el = path_leaf_el(left_path);

	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;

	/* Raw __le64 compare is fine - only equality is tested. */
	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
			break;

	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));

	/* Zero the right child's record and shrink the root list. */
	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&root_el->l_next_free_rec, -1);

	/* Terminate the leaf chain at the left leaf. */
	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
	eb->h_next_leaf_blk = 0;

	ocfs2_journal_dirty(handle, root_bh);
	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));

	ocfs2_unlink_path(inode, handle, dealloc, right_path,
			  subtree_index + 1);
}
2056 
/*
 * Rotate a record from the leaf of right_path over into the leaf of
 * left_path, which must currently start with an empty extent.
 *
 * If this drains the rightmost leaf completely, the entire right
 * subtree is unlinked, i_last_eb_blk is re-pointed at the left leaf and
 * *deleted is set to 1 so the caller knows right_path is now stale.
 *
 * Returns -EAGAIN when the right leaf holds an empty extent but is not
 * the rightmost leaf - the caller must fix that leaf up first and retry.
 */
static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
				     struct ocfs2_path *left_path,
				     struct ocfs2_path *right_path,
				     int subtree_index,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     int *deleted)
{
	int ret, i, del_right_subtree = 0, right_has_empty = 0;
	struct buffer_head *root_bh, *di_bh = path_root_bh(right_path);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
	struct ocfs2_extent_block *eb;

	*deleted = 0;

	right_leaf_el = path_leaf_el(right_path);
	left_leaf_el = path_leaf_el(left_path);
	root_bh = left_path->p_node[subtree_index].bh;
	/* Both paths must share the same subtree root buffer. */
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	/* Nothing to rotate into - the left leaf has no empty slot. */
	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
		return 0;

	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
		/*
		 * It's legal for us to proceed if the right leaf is
		 * the rightmost one and it has an empty extent. There
		 * are two cases to handle - whether the leaf will be
		 * empty after removal or not. If the leaf isn't empty
		 * then just remove the empty extent up front. The
		 * next block will handle empty leaves by flagging
		 * them for unlink.
		 *
		 * Non rightmost leaves will throw -EAGAIN and the
		 * caller can manually move the subtree and retry.
		 */

		if (eb->h_next_leaf_blk != 0ULL)
			return -EAGAIN;

		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
			ret = ocfs2_journal_access(handle, inode,
						   path_leaf_bh(right_path),
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ocfs2_remove_empty_extent(right_leaf_el);
		} else
			right_has_empty = 1;
	}

	if (eb->h_next_leaf_blk == 0ULL &&
	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
		/*
		 * We have to update i_last_eb_blk during the meta
		 * data delete.
		 */
		ret = ocfs2_journal_access(handle, inode, di_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		del_right_subtree = 1;
	}

	/*
	 * Getting here with an empty extent in the right path implies
	 * that it's the rightmost path and will be deleted.
	 */
	BUG_ON(right_has_empty && !del_right_subtree);

	/* Journal the subtree root plus every node below it, both sides. */
	ret = ocfs2_journal_access(handle, inode, root_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_journal_access(handle, inode,
					   right_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access(handle, inode,
					   left_path->p_node[i].bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	if (!right_has_empty) {
		/*
		 * Only do this if we're moving a real
		 * record. Otherwise, the action is delayed until
		 * after removal of the right path in which case we
		 * can do a simple shift to remove the empty extent.
		 */
		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
		memset(&right_leaf_el->l_recs[0], 0,
		       sizeof(struct ocfs2_extent_rec));
	}
	if (eb->h_next_leaf_blk == 0ULL) {
		/*
		 * Move recs over to get rid of empty extent, decrease
		 * next_free. This is allowed to remove the last
		 * extent in our leaf (setting l_next_free_rec to
		 * zero) - the delete code below won't care.
		 */
		ocfs2_remove_empty_extent(right_leaf_el);
	}

	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
	if (ret)
		mlog_errno(ret);
	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
	if (ret)
		mlog_errno(ret);

	if (del_right_subtree) {
		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* Left leaf is the new rightmost - record it in the dinode. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		di->i_last_eb_blk = eb->h_blkno;

		/*
		 * Removal of the extent in the left leaf was skipped
		 * above so we could delete the right path
		 * 1st.
		 */
		if (right_has_empty)
			ocfs2_remove_empty_extent(left_leaf_el);

		ret = ocfs2_journal_dirty(handle, di_bh);
		if (ret)
			mlog_errno(ret);

		*deleted = 1;
	} else
		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
					   subtree_index);

out:
	return ret;
}
2215 
2216 /*
2217  * Given a full path, determine what cpos value would return us a path
2218  * containing the leaf immediately to the right of the current one.
2219  *
2220  * Will return zero if the path passed in is already the rightmost path.
2221  *
2222  * This looks similar, but is subtly different to
2223  * ocfs2_find_cpos_for_left_leaf().
2224  */
2225 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2226 					  struct ocfs2_path *path, u32 *cpos)
2227 {
2228 	int i, j, ret = 0;
2229 	u64 blkno;
2230 	struct ocfs2_extent_list *el;
2231 
2232 	*cpos = 0;
2233 
2234 	if (path->p_tree_depth == 0)
2235 		return 0;
2236 
2237 	blkno = path_leaf_bh(path)->b_blocknr;
2238 
2239 	/* Start at the tree node just above the leaf and work our way up. */
2240 	i = path->p_tree_depth - 1;
2241 	while (i >= 0) {
2242 		int next_free;
2243 
2244 		el = path->p_node[i].el;
2245 
2246 		/*
2247 		 * Find the extent record just after the one in our
2248 		 * path.
2249 		 */
2250 		next_free = le16_to_cpu(el->l_next_free_rec);
2251 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2252 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2253 				if (j == (next_free - 1)) {
2254 					if (i == 0) {
2255 						/*
2256 						 * We've determined that the
2257 						 * path specified is already
2258 						 * the rightmost one - return a
2259 						 * cpos of zero.
2260 						 */
2261 						goto out;
2262 					}
2263 					/*
2264 					 * The rightmost record points to our
2265 					 * leaf - we need to travel up the
2266 					 * tree one level.
2267 					 */
2268 					goto next_node;
2269 				}
2270 
2271 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2272 				goto out;
2273 			}
2274 		}
2275 
2276 		/*
2277 		 * If we got here, we never found a valid node where
2278 		 * the tree indicated one should be.
2279 		 */
2280 		ocfs2_error(sb,
2281 			    "Invalid extent tree at extent block %llu\n",
2282 			    (unsigned long long)blkno);
2283 		ret = -EROFS;
2284 		goto out;
2285 
2286 next_node:
2287 		blkno = path->p_node[i].bh->b_blocknr;
2288 		i--;
2289 	}
2290 
2291 out:
2292 	return ret;
2293 }
2294 
2295 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2296 					    handle_t *handle,
2297 					    struct buffer_head *bh,
2298 					    struct ocfs2_extent_list *el)
2299 {
2300 	int ret;
2301 
2302 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2303 		return 0;
2304 
2305 	ret = ocfs2_journal_access(handle, inode, bh,
2306 				   OCFS2_JOURNAL_ACCESS_WRITE);
2307 	if (ret) {
2308 		mlog_errno(ret);
2309 		goto out;
2310 	}
2311 
2312 	ocfs2_remove_empty_extent(el);
2313 
2314 	ret = ocfs2_journal_dirty(handle, bh);
2315 	if (ret)
2316 		mlog_errno(ret);
2317 
2318 out:
2319 	return ret;
2320 }
2321 
/*
 * Worker for ocfs2_rotate_tree_left(). Starting from 'path', whose leaf
 * must begin with an empty extent, rotate records to the left one
 * subtree at a time until the rightmost leaf is reached or a subtree is
 * deleted by ocfs2_rotate_subtree_left().
 *
 * On -EAGAIN, ownership of the offending right path transfers to the
 * caller via *empty_extent_path (the caller must fix it up, then free
 * it). In all other cases *empty_extent_path is left NULL.
 */
static int __ocfs2_rotate_tree_left(struct inode *inode,
				    handle_t *handle, int orig_credits,
				    struct ocfs2_path *path,
				    struct ocfs2_cached_dealloc_ctxt *dealloc,
				    struct ocfs2_path **empty_extent_path)
{
	int ret, subtree_root, deleted;
	u32 right_cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_path *right_path = NULL;

	/* The caller must hand us a leaf with an empty 1st record. */
	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));

	*empty_extent_path = NULL;

	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
					     &right_cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* left_path starts out as a private copy of the caller's path. */
	left_path = ocfs2_new_path(path_root_bh(path),
				   path_root_el(path));
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	ocfs2_cp_path(left_path, path);

	right_path = ocfs2_new_path(path_root_bh(path),
				    path_root_el(path));
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* right_cpos == 0 means left_path is already the rightmost path. */
	while (right_cpos) {
		ret = ocfs2_find_path(inode, right_path, right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_root = ocfs2_find_subtree_root(inode, left_path,
						       right_path);

		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
		     subtree_root,
		     (unsigned long long)
		     right_path->p_node[subtree_root].bh->b_blocknr,
		     right_path->p_tree_depth);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
						      orig_credits, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Caller might still want to make changes to the
		 * tree root, so re-add it to the journal here.
		 */
		ret = ocfs2_journal_access(handle, inode,
					   path_root_bh(left_path),
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
						right_path, subtree_root,
						dealloc, &deleted);
		if (ret == -EAGAIN) {
			/*
			 * The rotation has to temporarily stop due to
			 * the right subtree having an empty
			 * extent. Pass it back to the caller for a
			 * fixup.
			 */
			*empty_extent_path = right_path;
			right_path = NULL;
			goto out;
		}
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * The subtree rotate might have removed records on
		 * the rightmost edge. If so, then rotation is
		 * complete.
		 */
		if (deleted)
			break;

		/* Advance: the old right path becomes the new left path. */
		ocfs2_mv_path(left_path, right_path);

		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
						     &right_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

out:
	ocfs2_free_path(right_path);
	ocfs2_free_path(left_path);

	return ret;
}
2440 
/*
 * Remove the rightmost branch of the tree described by 'path'.
 *
 * If a branch exists to the left, it is unlinked from the shared
 * subtree root and becomes the new rightmost path, with i_last_eb_blk
 * re-pointed at its leaf. Otherwise 'path' is the only branch and the
 * dinode is reverted to an empty in-line extent list.
 *
 * Unlinked metadata blocks are queued on 'dealloc' for later freeing.
 */
static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
				       struct ocfs2_path *path,
				       struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, subtree_index;
	u32 cpos;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	/*
	 * XXX: This code assumes that the root is an inode, which is
	 * true for now but may change as tree code gets generic.
	 */
	di = (struct ocfs2_dinode *)path_root_bh(path)->b_data;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		ret = -EIO;
		ocfs2_error(inode->i_sb,
			    "Inode %llu has invalid path root",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
		goto out;
	}

	/*
	 * There's two ways we handle this depending on
	 * whether path is the only existing one.
	 */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* cpos == 0 means there is no branch to the left of 'path'. */
	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (cpos) {
		/*
		 * We have a path to the left of this one - it needs
		 * an update too.
		 */
		left_path = ocfs2_new_path(path_root_bh(path),
					   path_root_el(path));
		if (!left_path) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_find_path(inode, left_path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_journal_access_path(inode, handle, left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);

		ocfs2_unlink_subtree(inode, handle, left_path, path,
				     subtree_index, dealloc);
		ocfs2_update_edge_lengths(inode, handle, left_path);

		/* The left leaf is the new rightmost leaf of the tree. */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
		di->i_last_eb_blk = eb->h_blkno;
	} else {
		/*
		 * 'path' is also the leftmost path which
		 * means it must be the only one. This gets
		 * handled differently because we want to
		 * revert the inode back to having extents
		 * in-line.
		 */
		ocfs2_unlink_path(inode, handle, dealloc, path, 1);

		el = &di->id2.i_list;
		el->l_tree_depth = 0;
		el->l_next_free_rec = 0;
		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));

		di->i_last_eb_blk = 0;
	}

	ocfs2_journal_dirty(handle, path_root_bh(path));

out:
	ocfs2_free_path(left_path);
	return ret;
}
2546 
2547 /*
2548  * Left rotation of btree records.
2549  *
2550  * In many ways, this is (unsurprisingly) the opposite of right
2551  * rotation. We start at some non-rightmost path containing an empty
2552  * extent in the leaf block. The code works its way to the rightmost
2553  * path by rotating records to the left in every subtree.
2554  *
2555  * This is used by any code which reduces the number of extent records
2556  * in a leaf. After removal, an empty record should be placed in the
2557  * leftmost list position.
2558  *
2559  * This won't handle a length update of the rightmost path records if
2560  * the rightmost tree leaf record is removed so the caller is
2561  * responsible for detecting and correcting that.
2562  */
static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
				  struct ocfs2_path *path,
				  struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, orig_credits = handle->h_buffer_credits;
	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;

	/* Nothing to rotate unless the leaf starts with an empty extent. */
	el = path_leaf_el(path);
	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
		return 0;

	if (path->p_tree_depth == 0) {
rightmost_no_delete:
		/*
		 * In-inode extents. This is trivially handled, so do
		 * it up front.
		 */
		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
						       path_leaf_bh(path),
						       path_leaf_el(path));
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Handle rightmost branch now. There's several cases:
	 *  1) simple rotation leaving records in there. That's trivial.
	 *  2) rotation requiring a branch delete - there's no more
	 *     records left. Two cases of this:
	 *     a) There are branches to the left.
	 *     b) This is also the leftmost (the only) branch.
	 *
	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
	 *  2a) we need the left branch so that we can update it with the unlink
	 *  2b) we need to bring the inode back to inline extents.
	 */

	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
	el = &eb->h_list;
	if (eb->h_next_leaf_blk == 0) {
		/*
		 * This gets a bit tricky if we're going to delete the
		 * rightmost path. Get the other cases out of the way
		 * 1st.
		 */
		if (le16_to_cpu(el->l_next_free_rec) > 1)
			goto rightmost_no_delete;

		/* An on-disk leaf with zero records indicates corruption. */
		if (le16_to_cpu(el->l_next_free_rec) == 0) {
			ret = -EIO;
			ocfs2_error(inode->i_sb,
				    "Inode %llu has empty extent block at %llu",
				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
				    (unsigned long long)le64_to_cpu(eb->h_blkno));
			goto out;
		}

		/*
		 * XXX: The caller can not trust "path" any more after
		 * this as it will have been deleted. What do we do?
		 *
		 * In theory the rotate-for-merge code will never get
		 * here because it'll always ask for a rotate in a
		 * nonempty list.
		 */

		ret = ocfs2_remove_rightmost_path(inode, handle, path,
						  dealloc);
		if (ret)
			mlog_errno(ret);
		goto out;
	}

	/*
	 * Now we can loop, remembering the path we get from -EAGAIN
	 * and restarting from there.
	 */
try_rotate:
	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
				       dealloc, &restart_path);
	if (ret && ret != -EAGAIN) {
		mlog_errno(ret);
		goto out;
	}

	/* Each -EAGAIN hands back a fresh path to restart rotation from. */
	while (ret == -EAGAIN) {
		tmp_path = restart_path;
		restart_path = NULL;

		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
					       tmp_path, dealloc,
					       &restart_path);
		if (ret && ret != -EAGAIN) {
			mlog_errno(ret);
			goto out;
		}

		ocfs2_free_path(tmp_path);
		tmp_path = NULL;

		/* Fixed up - go back to rotating from the original path. */
		if (ret == 0)
			goto try_rotate;
	}

out:
	ocfs2_free_path(tmp_path);
	ocfs2_free_path(restart_path);
	return ret;
}
2675 
2676 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2677 				int index)
2678 {
2679 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
2680 	unsigned int size;
2681 
2682 	if (rec->e_leaf_clusters == 0) {
2683 		/*
2684 		 * We consumed all of the merged-from record. An empty
2685 		 * extent cannot exist anywhere but the 1st array
2686 		 * position, so move things over if the merged-from
2687 		 * record doesn't occupy that position.
2688 		 *
2689 		 * This creates a new empty extent so the caller
2690 		 * should be smart enough to have removed any existing
2691 		 * ones.
2692 		 */
2693 		if (index > 0) {
2694 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
2695 			size = index * sizeof(struct ocfs2_extent_rec);
2696 			memmove(&el->l_recs[1], &el->l_recs[0], size);
2697 		}
2698 
2699 		/*
2700 		 * Always memset - the caller doesn't check whether it
2701 		 * created an empty extent, so there could be junk in
2702 		 * the other fields.
2703 		 */
2704 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2705 	}
2706 }
2707 
2708 static int ocfs2_get_right_path(struct inode *inode,
2709 				struct ocfs2_path *left_path,
2710 				struct ocfs2_path **ret_right_path)
2711 {
2712 	int ret;
2713 	u32 right_cpos;
2714 	struct ocfs2_path *right_path = NULL;
2715 	struct ocfs2_extent_list *left_el;
2716 
2717 	*ret_right_path = NULL;
2718 
2719 	/* This function shouldn't be called for non-trees. */
2720 	BUG_ON(left_path->p_tree_depth == 0);
2721 
2722 	left_el = path_leaf_el(left_path);
2723 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2724 
2725 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2726 					     &right_cpos);
2727 	if (ret) {
2728 		mlog_errno(ret);
2729 		goto out;
2730 	}
2731 
2732 	/* This function shouldn't be called for the rightmost leaf. */
2733 	BUG_ON(right_cpos == 0);
2734 
2735 	right_path = ocfs2_new_path(path_root_bh(left_path),
2736 				    path_root_el(left_path));
2737 	if (!right_path) {
2738 		ret = -ENOMEM;
2739 		mlog_errno(ret);
2740 		goto out;
2741 	}
2742 
2743 	ret = ocfs2_find_path(inode, right_path, right_cpos);
2744 	if (ret) {
2745 		mlog_errno(ret);
2746 		goto out;
2747 	}
2748 
2749 	*ret_right_path = right_path;
2750 out:
2751 	if (ret)
2752 		ocfs2_free_path(right_path);
2753 	return ret;
2754 }
2755 
2756 /*
2757  * Remove split_rec clusters from the record at index and merge them
2758  * onto the beginning of the record "next" to it.
2759  * For index < l_count - 1, the next means the extent rec at index + 1.
2760  * For index == l_count - 1, the "next" means the 1st extent rec of the
2761  * next extent block.
2762  */
static int ocfs2_merge_rec_right(struct inode *inode,
				 struct ocfs2_path *left_path,
				 handle_t *handle,
				 struct ocfs2_extent_rec *split_rec,
				 int index)
{
	int ret, next_free, i;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *right_el;
	struct ocfs2_path *right_path = NULL;
	int subtree_index = 0;
	struct ocfs2_extent_list *el = path_leaf_el(left_path);
	struct buffer_head *bh = path_leaf_bh(left_path);
	struct buffer_head *root_bh = NULL;

	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
	left_rec = &el->l_recs[index];

	/*
	 * Record being shrunk is the last one of a full leaf - the
	 * merge target lives in the next extent block.
	 */
	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_right_path(inode, left_path, &right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Target is the 1st real (non-empty) record on the right. */
		right_el = path_leaf_el(right_path);
		next_free = le16_to_cpu(right_el->l_next_free_rec);
		BUG_ON(next_free <= 0);
		right_rec = &right_el->l_recs[0];
		if (ocfs2_is_empty_extent(right_rec)) {
			BUG_ON(next_free <= 1);
			right_rec = &right_el->l_recs[1];
		}

		/* The two records must be logically contiguous. */
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(right_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		/* Journal the subtree root and all nodes below it. */
		ret = ocfs2_journal_access(handle, inode, root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_journal_access(handle, inode,
						   right_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_journal_access(handle, inode,
						   left_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

	} else {
		/* In-leaf merge: target is the next record in this leaf. */
		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
		right_rec = &el->l_recs[index + 1];
	}

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Shrink the left record, grow the right one downwards to meet it. */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

	/* If left_rec was fully consumed, tidy up the empty extent. */
	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (right_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
		if (ret)
			mlog_errno(ret);

		/* Propagate the new e_cpos up the right branch. */
		ocfs2_complete_edge_insert(inode, handle, left_path,
					   right_path, subtree_index);
	}
out:
	if (right_path)
		ocfs2_free_path(right_path);
	return ret;
}
2883 
2884 static int ocfs2_get_left_path(struct inode *inode,
2885 			       struct ocfs2_path *right_path,
2886 			       struct ocfs2_path **ret_left_path)
2887 {
2888 	int ret;
2889 	u32 left_cpos;
2890 	struct ocfs2_path *left_path = NULL;
2891 
2892 	*ret_left_path = NULL;
2893 
2894 	/* This function shouldn't be called for non-trees. */
2895 	BUG_ON(right_path->p_tree_depth == 0);
2896 
2897 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2898 					    right_path, &left_cpos);
2899 	if (ret) {
2900 		mlog_errno(ret);
2901 		goto out;
2902 	}
2903 
2904 	/* This function shouldn't be called for the leftmost leaf. */
2905 	BUG_ON(left_cpos == 0);
2906 
2907 	left_path = ocfs2_new_path(path_root_bh(right_path),
2908 				   path_root_el(right_path));
2909 	if (!left_path) {
2910 		ret = -ENOMEM;
2911 		mlog_errno(ret);
2912 		goto out;
2913 	}
2914 
2915 	ret = ocfs2_find_path(inode, left_path, left_cpos);
2916 	if (ret) {
2917 		mlog_errno(ret);
2918 		goto out;
2919 	}
2920 
2921 	*ret_left_path = left_path;
2922 out:
2923 	if (ret)
2924 		ocfs2_free_path(left_path);
2925 	return ret;
2926 }
2927 
2928 /*
2929  * Remove split_rec clusters from the record at index and merge them
2930  * onto the tail of the record "before" it.
2931  * For index > 0, the "before" means the extent rec at index - 1.
2932  *
2933  * For index == 0, the "before" means the last record of the previous
2934  * extent block. And there is also a situation that we may need to
2935  * remove the rightmost leaf extent block in the right_path and change
2936  * the right path to indicate the new rightmost path.
2937  */
static int ocfs2_merge_rec_left(struct inode *inode,
				struct ocfs2_path *right_path,
				handle_t *handle,
				struct ocfs2_extent_rec *split_rec,
				struct ocfs2_cached_dealloc_ctxt *dealloc,
				int index)
{
	int ret, i, subtree_index = 0, has_empty_extent = 0;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *el = path_leaf_el(right_path);
	struct buffer_head *bh = path_leaf_bh(right_path);
	struct buffer_head *root_bh = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *left_el;

	BUG_ON(index < 0);

	right_rec = &el->l_recs[index];
	/*
	 * index == 0: the merge target is the last record of the
	 * previous extent block, not a record in this leaf.
	 */
	if (index == 0) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_left_path(inode, right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* A cross-block left merge requires a full left leaf. */
		left_el = path_leaf_el(left_path);
		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
		       le16_to_cpu(left_el->l_count));

		/* Target is the last record of the left leaf and it
		 * must be logically contiguous with split_rec. */
		left_rec = &left_el->l_recs[
				le16_to_cpu(left_el->l_next_free_rec) - 1];
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(split_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		/* Journal the subtree root and all nodes below it. */
		ret = ocfs2_journal_access(handle, inode, root_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_journal_access(handle, inode,
						   right_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_journal_access(handle, inode,
						   left_path->p_node[i].bh,
						   OCFS2_JOURNAL_ACCESS_WRITE);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	} else {
		/* In-leaf merge: target is the previous record. */
		left_rec = &el->l_recs[index - 1];
		if (ocfs2_is_empty_extent(&el->l_recs[0]))
			has_empty_extent = 1;
	}

	ret = ocfs2_journal_access(handle, inode, bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (has_empty_extent && index == 1) {
		/*
		 * The easy case - we can just plop the record right in.
		 */
		*left_rec = *split_rec;

		has_empty_extent = 0;
	} else
		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);

	/* Shrink the right record from the front by split_clusters. */
	le32_add_cpu(&right_rec->e_cpos, split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);

	/* If right_rec was fully consumed, tidy up the empty extent. */
	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (left_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
		if (ret)
			mlog_errno(ret);

		/*
		 * In the situation that the right_rec is empty and the extent
		 * block is empty also,  ocfs2_complete_edge_insert can't handle
		 * it and we need to delete the right extent block.
		 */
		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
		    le16_to_cpu(el->l_next_free_rec) == 1) {

			ret = ocfs2_remove_rightmost_path(inode, handle,
							  right_path, dealloc);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/* Now the rightmost extent block has been deleted.
			 * So we use the new rightmost path.
			 */
			ocfs2_mv_path(right_path, left_path);
			left_path = NULL;
		} else
			ocfs2_complete_edge_insert(inode, handle, left_path,
						   right_path, subtree_index);
	}
out:
	if (left_path)
		ocfs2_free_path(left_path);
	return ret;
}
3083 
/*
 * Try to merge the record being split (at split_index in the leaf of
 * 'path') with its contiguous neighbor(s), as described by ctxt.
 *
 * Freed tree metadata is queued on 'dealloc' for later release rather
 * than freed inline. Returns 0 on success, negative error otherwise.
 */
static int ocfs2_try_to_merge_extent(struct inode *inode,
				     handle_t *handle,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt)

{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

	/* Caller guarantees there is something to merge with. */
	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
		 * emptied slot. Remove any pre-existing empty
		 * extents - having more than one in a leaf is
		 * illegal.
		 */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* The rotate shifted the leaf records left by one. */
		split_index--;
		rec = &el->l_recs[split_index];
	}

	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
		/*
		 * Left-right contig implies this.
		 */
		BUG_ON(!ctxt->c_split_covers_rec);

		/*
		 * Since the leftright insert always covers the entire
		 * extent, this call will delete the insert record
		 * entirely, resulting in an empty extent record added to
		 * the extent block.
		 *
		 * Since the adding of an empty extent shifts
		 * everything back to the right, there's no need to
		 * update split_index here.
		 *
		 * When the split_index is zero, we need to merge it to the
		 * previous extent block. It is more efficient and easier
		 * if we do merge_right first and merge_left later.
		 */
		ret = ocfs2_merge_rec_right(inode, path,
					    handle, split_rec,
					    split_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We can only get this from logic error above.
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Re-load - the rotate may have moved the record. */
		rec = &el->l_recs[split_index];

		/*
		 * Note that we don't pass split_rec here on purpose -
		 * we've merged it into the rec already.
		 */
		ret = ocfs2_merge_rec_left(inode, path,
					   handle, rec,
					   dealloc,
					   split_index);

		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc);
		/*
		 * Error from this last rotate is not critical, so
		 * print but don't bubble it up.
		 */
		if (ret)
			mlog_errno(ret);
		ret = 0;
	} else {
		/*
		 * Merge a record to the left or right.
		 *
		 * 'contig_type' is relative to the existing record,
		 * so for example, if we're "right contig", it's to
		 * the record on the left (hence the left merge).
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT) {
			ret = ocfs2_merge_rec_left(inode,
						   path,
						   handle, split_rec,
						   dealloc,
						   split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			ret = ocfs2_merge_rec_right(inode,
						    path,
						    handle, split_rec,
						    split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		if (ctxt->c_split_covers_rec) {
			/*
			 * The merge may have left an empty extent in
			 * our leaf. Try to rotate it away.
			 */
			ret = ocfs2_rotate_tree_left(inode, handle, path,
						     dealloc);
			if (ret)
				mlog_errno(ret);
			ret = 0;
		}
	}

out:
	return ret;
}
3227 
3228 static void ocfs2_subtract_from_rec(struct super_block *sb,
3229 				    enum ocfs2_split_type split,
3230 				    struct ocfs2_extent_rec *rec,
3231 				    struct ocfs2_extent_rec *split_rec)
3232 {
3233 	u64 len_blocks;
3234 
3235 	len_blocks = ocfs2_clusters_to_blocks(sb,
3236 				le16_to_cpu(split_rec->e_leaf_clusters));
3237 
3238 	if (split == SPLIT_LEFT) {
3239 		/*
3240 		 * Region is on the left edge of the existing
3241 		 * record.
3242 		 */
3243 		le32_add_cpu(&rec->e_cpos,
3244 			     le16_to_cpu(split_rec->e_leaf_clusters));
3245 		le64_add_cpu(&rec->e_blkno, len_blocks);
3246 		le16_add_cpu(&rec->e_leaf_clusters,
3247 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3248 	} else {
3249 		/*
3250 		 * Region is on the right edge of the existing
3251 		 * record.
3252 		 */
3253 		le16_add_cpu(&rec->e_leaf_clusters,
3254 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3255 	}
3256 }
3257 
3258 /*
3259  * Do the final bits of extent record insertion at the target leaf
3260  * list. If this leaf is part of an allocation tree, it is assumed
3261  * that the tree above has been prepared.
3262  */
static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
				 struct ocfs2_extent_list *el,
				 struct ocfs2_insert_type *insert,
				 struct inode *inode)
{
	int i = insert->ins_contig_index;
	unsigned int range;
	struct ocfs2_extent_rec *rec;

	/* Only valid on a leaf list. */
	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);

	if (insert->ins_split != SPLIT_NONE) {
		/*
		 * Split insert: find the existing record covering the
		 * split cpos, shave the split region off its edge, then
		 * rotate the new record into place below.
		 */
		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
		BUG_ON(i == -1);
		rec = &el->l_recs[i];
		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
					insert_rec);
		goto rotate;
	}

	/*
	 * Contiguous insert - either left or right.
	 */
	if (insert->ins_contig != CONTIG_NONE) {
		rec = &el->l_recs[i];
		if (insert->ins_contig == CONTIG_LEFT) {
			/* New region precedes the record - move its start. */
			rec->e_blkno = insert_rec->e_blkno;
			rec->e_cpos = insert_rec->e_cpos;
		}
		/* Either way, the existing record just grows. */
		le16_add_cpu(&rec->e_leaf_clusters,
			     le16_to_cpu(insert_rec->e_leaf_clusters));
		return;
	}

	/*
	 * Handle insert into an empty leaf.
	 */
	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
		el->l_recs[0] = *insert_rec;
		el->l_next_free_rec = cpu_to_le16(1);
		return;
	}

	/*
	 * Appending insert.
	 */
	if (insert->ins_appending == APPEND_TAIL) {
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];
		range = le32_to_cpu(rec->e_cpos)
			+ le16_to_cpu(rec->e_leaf_clusters);
		/* An append must start at or past the end of the last rec. */
		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);

		/* The caller must have ensured a free slot exists. */
		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
				le16_to_cpu(el->l_count),
				"inode %lu, depth %u, count %u, next free %u, "
				"rec.cpos %u, rec.clusters %u, "
				"insert.cpos %u, insert.clusters %u\n",
				inode->i_ino,
				le16_to_cpu(el->l_tree_depth),
				le16_to_cpu(el->l_count),
				le16_to_cpu(el->l_next_free_rec),
				le32_to_cpu(el->l_recs[i].e_cpos),
				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
				le32_to_cpu(insert_rec->e_cpos),
				le16_to_cpu(insert_rec->e_leaf_clusters));
		i++;
		el->l_recs[i] = *insert_rec;
		le16_add_cpu(&el->l_next_free_rec, 1);
		return;
	}

rotate:
	/*
	 * Ok, we have to rotate.
	 *
	 * At this point, it is safe to assume that inserting into an
	 * empty leaf and appending to a leaf have both been handled
	 * above.
	 *
	 * This leaf needs to have space, either by the empty 1st
	 * extent record, or by virtue of an l_next_rec < l_count.
	 */
	ocfs2_rotate_leaf(el, insert_rec);
}
3350 
3351 static inline void ocfs2_update_dinode_clusters(struct inode *inode,
3352 						struct ocfs2_dinode *di,
3353 						u32 clusters)
3354 {
3355 	le32_add_cpu(&di->i_clusters, clusters);
3356 	spin_lock(&OCFS2_I(inode)->ip_lock);
3357 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
3358 	spin_unlock(&OCFS2_I(inode)->ip_lock);
3359 }
3360 
3361 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3362 					   handle_t *handle,
3363 					   struct ocfs2_path *path,
3364 					   struct ocfs2_extent_rec *insert_rec)
3365 {
3366 	int ret, i, next_free;
3367 	struct buffer_head *bh;
3368 	struct ocfs2_extent_list *el;
3369 	struct ocfs2_extent_rec *rec;
3370 
3371 	/*
3372 	 * Update everything except the leaf block.
3373 	 */
3374 	for (i = 0; i < path->p_tree_depth; i++) {
3375 		bh = path->p_node[i].bh;
3376 		el = path->p_node[i].el;
3377 
3378 		next_free = le16_to_cpu(el->l_next_free_rec);
3379 		if (next_free == 0) {
3380 			ocfs2_error(inode->i_sb,
3381 				    "Dinode %llu has a bad extent list",
3382 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3383 			ret = -EIO;
3384 			return;
3385 		}
3386 
3387 		rec = &el->l_recs[next_free - 1];
3388 
3389 		rec->e_int_clusters = insert_rec->e_cpos;
3390 		le32_add_cpu(&rec->e_int_clusters,
3391 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3392 		le32_add_cpu(&rec->e_int_clusters,
3393 			     -le32_to_cpu(rec->e_cpos));
3394 
3395 		ret = ocfs2_journal_dirty(handle, bh);
3396 		if (ret)
3397 			mlog_errno(ret);
3398 
3399 	}
3400 }
3401 
/*
 * Prepare for an appending insert down 'right_path': journal the
 * path and widen the rightmost interior records to cover insert_rec.
 *
 * If the append lands at the leftmost edge of the target leaf, the
 * left-neighbor leaf's path is looked up and handed back through
 * *ret_left_path; ownership transfers to the caller, who must free
 * it. On error, any allocated left path is freed here.
 */
static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
				    struct ocfs2_extent_rec *insert_rec,
				    struct ocfs2_path *right_path,
				    struct ocfs2_path **ret_left_path)
{
	int ret, next_free;
	struct ocfs2_extent_list *el;
	struct ocfs2_path *left_path = NULL;

	*ret_left_path = NULL;

	/*
	 * This shouldn't happen for non-trees. The extent rec cluster
	 * count manipulation below only works for interior nodes.
	 */
	BUG_ON(right_path->p_tree_depth == 0);

	/*
	 * If our appending insert is at the leftmost edge of a leaf,
	 * then we might need to update the rightmost records of the
	 * neighboring path.
	 */
	el = path_leaf_el(right_path);
	next_free = le16_to_cpu(el->l_next_free_rec);
	if (next_free == 0 ||
	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
		u32 left_cpos;

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "Append may need a left path update. cpos: %u, "
		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
		     left_cpos);

		/*
		 * No need to worry if the append is already in the
		 * leftmost leaf.
		 */
		if (left_cpos) {
			left_path = ocfs2_new_path(path_root_bh(right_path),
						   path_root_el(right_path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			/*
			 * ocfs2_insert_path() will pass the left_path to the
			 * journal for us.
			 */
		}
	}

	ret = ocfs2_journal_access_path(inode, handle, right_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);

	*ret_left_path = left_path;
	ret = 0;
out:
	if (ret != 0)
		ocfs2_free_path(left_path);

	return ret;
}
3483 
3484 static void ocfs2_split_record(struct inode *inode,
3485 			       struct ocfs2_path *left_path,
3486 			       struct ocfs2_path *right_path,
3487 			       struct ocfs2_extent_rec *split_rec,
3488 			       enum ocfs2_split_type split)
3489 {
3490 	int index;
3491 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
3492 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
3493 	struct ocfs2_extent_rec *rec, *tmprec;
3494 
3495 	right_el = path_leaf_el(right_path);;
3496 	if (left_path)
3497 		left_el = path_leaf_el(left_path);
3498 
3499 	el = right_el;
3500 	insert_el = right_el;
3501 	index = ocfs2_search_extent_list(el, cpos);
3502 	if (index != -1) {
3503 		if (index == 0 && left_path) {
3504 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3505 
3506 			/*
3507 			 * This typically means that the record
3508 			 * started in the left path but moved to the
3509 			 * right as a result of rotation. We either
3510 			 * move the existing record to the left, or we
3511 			 * do the later insert there.
3512 			 *
3513 			 * In this case, the left path should always
3514 			 * exist as the rotate code will have passed
3515 			 * it back for a post-insert update.
3516 			 */
3517 
3518 			if (split == SPLIT_LEFT) {
3519 				/*
3520 				 * It's a left split. Since we know
3521 				 * that the rotate code gave us an
3522 				 * empty extent in the left path, we
3523 				 * can just do the insert there.
3524 				 */
3525 				insert_el = left_el;
3526 			} else {
3527 				/*
3528 				 * Right split - we have to move the
3529 				 * existing record over to the left
3530 				 * leaf. The insert will be into the
3531 				 * newly created empty extent in the
3532 				 * right leaf.
3533 				 */
3534 				tmprec = &right_el->l_recs[index];
3535 				ocfs2_rotate_leaf(left_el, tmprec);
3536 				el = left_el;
3537 
3538 				memset(tmprec, 0, sizeof(*tmprec));
3539 				index = ocfs2_search_extent_list(left_el, cpos);
3540 				BUG_ON(index == -1);
3541 			}
3542 		}
3543 	} else {
3544 		BUG_ON(!left_path);
3545 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
3546 		/*
3547 		 * Left path is easy - we can just allow the insert to
3548 		 * happen.
3549 		 */
3550 		el = left_el;
3551 		insert_el = left_el;
3552 		index = ocfs2_search_extent_list(el, cpos);
3553 		BUG_ON(index == -1);
3554 	}
3555 
3556 	rec = &el->l_recs[index];
3557 	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
3558 	ocfs2_rotate_leaf(insert_el, split_rec);
3559 }
3560 
3561 /*
3562  * This function only does inserts on an allocation b-tree. For dinode
3563  * lists, ocfs2_insert_at_leaf() is called directly.
3564  *
3565  * right_path is the path we want to do the actual insert
3566  * in. left_path should only be passed in if we need to update that
3567  * portion of the tree after an edge insert.
3568  */
3569 static int ocfs2_insert_path(struct inode *inode,
3570 			     handle_t *handle,
3571 			     struct ocfs2_path *left_path,
3572 			     struct ocfs2_path *right_path,
3573 			     struct ocfs2_extent_rec *insert_rec,
3574 			     struct ocfs2_insert_type *insert)
3575 {
3576 	int ret, subtree_index;
3577 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3578 
3579 	if (left_path) {
3580 		int credits = handle->h_buffer_credits;
3581 
3582 		/*
3583 		 * There's a chance that left_path got passed back to
3584 		 * us without being accounted for in the
3585 		 * journal. Extend our transaction here to be sure we
3586 		 * can change those blocks.
3587 		 */
3588 		credits += left_path->p_tree_depth;
3589 
3590 		ret = ocfs2_extend_trans(handle, credits);
3591 		if (ret < 0) {
3592 			mlog_errno(ret);
3593 			goto out;
3594 		}
3595 
3596 		ret = ocfs2_journal_access_path(inode, handle, left_path);
3597 		if (ret < 0) {
3598 			mlog_errno(ret);
3599 			goto out;
3600 		}
3601 	}
3602 
3603 	/*
3604 	 * Pass both paths to the journal. The majority of inserts
3605 	 * will be touching all components anyway.
3606 	 */
3607 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3608 	if (ret < 0) {
3609 		mlog_errno(ret);
3610 		goto out;
3611 	}
3612 
3613 	if (insert->ins_split != SPLIT_NONE) {
3614 		/*
3615 		 * We could call ocfs2_insert_at_leaf() for some types
3616 		 * of splits, but it's easier to just let one separate
3617 		 * function sort it all out.
3618 		 */
3619 		ocfs2_split_record(inode, left_path, right_path,
3620 				   insert_rec, insert->ins_split);
3621 
3622 		/*
3623 		 * Split might have modified either leaf and we don't
3624 		 * have a guarantee that the later edge insert will
3625 		 * dirty this for us.
3626 		 */
3627 		if (left_path)
3628 			ret = ocfs2_journal_dirty(handle,
3629 						  path_leaf_bh(left_path));
3630 			if (ret)
3631 				mlog_errno(ret);
3632 	} else
3633 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
3634 				     insert, inode);
3635 
3636 	ret = ocfs2_journal_dirty(handle, leaf_bh);
3637 	if (ret)
3638 		mlog_errno(ret);
3639 
3640 	if (left_path) {
3641 		/*
3642 		 * The rotate code has indicated that we need to fix
3643 		 * up portions of the tree after the insert.
3644 		 *
3645 		 * XXX: Should we extend the transaction here?
3646 		 */
3647 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
3648 							right_path);
3649 		ocfs2_complete_edge_insert(inode, handle, left_path,
3650 					   right_path, subtree_index);
3651 	}
3652 
3653 	ret = 0;
3654 out:
3655 	return ret;
3656 }
3657 
/*
 * Top-level dispatch for a single extent insert: journal the dinode,
 * choose and walk the target path (rotating the tree or preparing an
 * append as required by 'type'), do the leaf insert, then update the
 * dinode's cluster count and dirty it.
 */
static int ocfs2_do_insert_extent(struct inode *inode,
				  handle_t *handle,
				  struct buffer_head *di_bh,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_extent_list *el;

	di = (struct ocfs2_dinode *) di_bh->b_data;
	el = &di->id2.i_list;

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(el->l_tree_depth) == 0) {
		/* No tree - the dinode's list is the leaf. */
		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_inode_path(di_bh);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(inode, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree's above them.
	 *
	 * Both might pass back a path immediate to the left of the
	 * one being inserted to. This will be cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_journal_access(handle, inode, di_bh,
					   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	/* Splits redistribute existing clusters, so no count change. */
	if (type->ins_split == SPLIT_NONE)
		ocfs2_update_dinode_clusters(inode, di,
					     le16_to_cpu(insert_rec->e_leaf_clusters));

	ret = ocfs2_journal_dirty(handle, di_bh);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
3774 
/*
 * Work out how split_rec (destined for el->l_recs[index]) relates to
 * the records on either side of it: contiguous with neither, one, or
 * both (CONTIG_LEFTRIGHT). A neighbor may live in the adjacent leaf,
 * in which case a path is walked to reach it.
 *
 * NOTE(review): lookup/allocation failures here simply fall through
 * to 'out' and return whatever has been determined so far (at worst
 * CONTIG_NONE) - merging is treated as best-effort, not an error.
 */
static enum ocfs2_contig_type
ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
			       struct ocfs2_extent_list *el, int index,
			       struct ocfs2_extent_rec *split_rec)
{
	int status;
	enum ocfs2_contig_type ret = CONTIG_NONE;
	u32 left_cpos, right_cpos;
	struct ocfs2_extent_rec *rec = NULL;
	struct ocfs2_extent_list *new_el;
	struct ocfs2_path *left_path = NULL, *right_path = NULL;
	struct buffer_head *bh;
	struct ocfs2_extent_block *eb;

	if (index > 0) {
		/* Left neighbor is in this same leaf. */
		rec = &el->l_recs[index - 1];
	} else if (path->p_tree_depth > 0) {
		/* Left neighbor (if any) is the last record of the
		 * leaf to our left. */
		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
						       path, &left_cpos);
		if (status)
			goto out;

		if (left_cpos != 0) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path)
				goto out;

			status = ocfs2_find_path(inode, left_path, left_cpos);
			if (status)
				goto out;

			new_el = path_leaf_el(left_path);

			if (le16_to_cpu(new_el->l_next_free_rec) !=
			    le16_to_cpu(new_el->l_count)) {
				/* A non-rightmost leaf should always
				 * be full - flag corruption. */
				bh = path_leaf_bh(left_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[
				le16_to_cpu(new_el->l_next_free_rec) - 1];
		}
	}

	/*
	 * We're careful to check for an empty extent record here -
	 * the merge code will know what to do if it sees one.
	 */
	if (rec) {
		if (index == 1 && ocfs2_is_empty_extent(rec)) {
			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
				ret = CONTIG_RIGHT;
		} else {
			ret = ocfs2_extent_contig(inode, rec, split_rec);
		}
	}

	/* Now find the right neighbor, possibly in the next leaf. */
	rec = NULL;
	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
		rec = &el->l_recs[index + 1];
	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
		 path->p_tree_depth > 0) {
		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
							path, &right_cpos);
		if (status)
			goto out;

		if (right_cpos == 0)
			goto out;

		right_path = ocfs2_new_path(path_root_bh(path),
					    path_root_el(path));
		if (!right_path)
			goto out;

		status = ocfs2_find_path(inode, right_path, right_cpos);
		if (status)
			goto out;

		new_el = path_leaf_el(right_path);
		rec = &new_el->l_recs[0];
		if (ocfs2_is_empty_extent(rec)) {
			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
				/* A leaf holding only the empty
				 * extent is corrupt. */
				bh = path_leaf_bh(right_path);
				eb = (struct ocfs2_extent_block *)bh->b_data;
				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
								 eb);
				goto out;
			}
			rec = &new_el->l_recs[1];
		}
	}

	if (rec) {
		enum ocfs2_contig_type contig_type;

		contig_type = ocfs2_extent_contig(inode, rec, split_rec);

		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
			ret = CONTIG_LEFTRIGHT;
		else if (ret == CONTIG_NONE)
			ret = contig_type;
	}

out:
	if (left_path)
		ocfs2_free_path(left_path);
	if (right_path)
		ocfs2_free_path(right_path);

	return ret;
}
3890 
3891 static void ocfs2_figure_contig_type(struct inode *inode,
3892 				     struct ocfs2_insert_type *insert,
3893 				     struct ocfs2_extent_list *el,
3894 				     struct ocfs2_extent_rec *insert_rec)
3895 {
3896 	int i;
3897 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
3898 
3899 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3900 
3901 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
3902 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
3903 						  insert_rec);
3904 		if (contig_type != CONTIG_NONE) {
3905 			insert->ins_contig_index = i;
3906 			break;
3907 		}
3908 	}
3909 	insert->ins_contig = contig_type;
3910 }
3911 
3912 /*
3913  * This should only be called against the righmost leaf extent list.
3914  *
3915  * ocfs2_figure_appending_type() will figure out whether we'll have to
3916  * insert at the tail of the rightmost leaf.
3917  *
3918  * This should also work against the dinode list for tree's with 0
3919  * depth. If we consider the dinode list to be the rightmost leaf node
3920  * then the logic here makes sense.
3921  */
3922 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
3923 					struct ocfs2_extent_list *el,
3924 					struct ocfs2_extent_rec *insert_rec)
3925 {
3926 	int i;
3927 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
3928 	struct ocfs2_extent_rec *rec;
3929 
3930 	insert->ins_appending = APPEND_NONE;
3931 
3932 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3933 
3934 	if (!el->l_next_free_rec)
3935 		goto set_tail_append;
3936 
3937 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
3938 		/* Were all records empty? */
3939 		if (le16_to_cpu(el->l_next_free_rec) == 1)
3940 			goto set_tail_append;
3941 	}
3942 
3943 	i = le16_to_cpu(el->l_next_free_rec) - 1;
3944 	rec = &el->l_recs[i];
3945 
3946 	if (cpos >=
3947 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
3948 		goto set_tail_append;
3949 
3950 	return;
3951 
3952 set_tail_append:
3953 	insert->ins_appending = APPEND_TAIL;
3954 }
3955 
3956 /*
3957  * Helper function called at the begining of an insert.
3958  *
3959  * This computes a few things that are commonly used in the process of
3960  * inserting into the btree:
3961  *   - Whether the new extent is contiguous with an existing one.
3962  *   - The current tree depth.
3963  *   - Whether the insert is an appending one.
3964  *   - The total # of free records in the tree.
3965  *
3966  * All of the information is stored on the ocfs2_insert_type
3967  * structure.
3968  */
3969 static int ocfs2_figure_insert_type(struct inode *inode,
3970 				    struct buffer_head *di_bh,
3971 				    struct buffer_head **last_eb_bh,
3972 				    struct ocfs2_extent_rec *insert_rec,
3973 				    int *free_records,
3974 				    struct ocfs2_insert_type *insert)
3975 {
3976 	int ret;
3977 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
3978 	struct ocfs2_extent_block *eb;
3979 	struct ocfs2_extent_list *el;
3980 	struct ocfs2_path *path = NULL;
3981 	struct buffer_head *bh = NULL;
3982 
3983 	insert->ins_split = SPLIT_NONE;
3984 
3985 	el = &di->id2.i_list;
3986 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
3987 
3988 	if (el->l_tree_depth) {
3989 		/*
3990 		 * If we have tree depth, we read in the
3991 		 * rightmost extent block ahead of time as
3992 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
3993 		 * may want it later.
3994 		 */
3995 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
3996 				       le64_to_cpu(di->i_last_eb_blk), &bh,
3997 				       OCFS2_BH_CACHED, inode);
3998 		if (ret) {
3999 			mlog_exit(ret);
4000 			goto out;
4001 		}
4002 		eb = (struct ocfs2_extent_block *) bh->b_data;
4003 		el = &eb->h_list;
4004 	}
4005 
4006 	/*
4007 	 * Unless we have a contiguous insert, we'll need to know if
4008 	 * there is room left in our allocation tree for another
4009 	 * extent record.
4010 	 *
4011 	 * XXX: This test is simplistic, we can search for empty
4012 	 * extent records too.
4013 	 */
4014 	*free_records = le16_to_cpu(el->l_count) -
4015 		le16_to_cpu(el->l_next_free_rec);
4016 
4017 	if (!insert->ins_tree_depth) {
4018 		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4019 		ocfs2_figure_appending_type(insert, el, insert_rec);
4020 		return 0;
4021 	}
4022 
4023 	path = ocfs2_new_inode_path(di_bh);
4024 	if (!path) {
4025 		ret = -ENOMEM;
4026 		mlog_errno(ret);
4027 		goto out;
4028 	}
4029 
4030 	/*
4031 	 * In the case that we're inserting past what the tree
4032 	 * currently accounts for, ocfs2_find_path() will return for
4033 	 * us the rightmost tree path. This is accounted for below in
4034 	 * the appending code.
4035 	 */
4036 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4037 	if (ret) {
4038 		mlog_errno(ret);
4039 		goto out;
4040 	}
4041 
4042 	el = path_leaf_el(path);
4043 
4044 	/*
4045 	 * Now that we have the path, there's two things we want to determine:
4046 	 * 1) Contiguousness (also set contig_index if this is so)
4047 	 *
4048 	 * 2) Are we doing an append? We can trivially break this up
4049          *     into two types of appends: simple record append, or a
4050          *     rotate inside the tail leaf.
4051 	 */
4052 	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
4053 
4054 	/*
4055 	 * The insert code isn't quite ready to deal with all cases of
4056 	 * left contiguousness. Specifically, if it's an insert into
4057 	 * the 1st record in a leaf, it will require the adjustment of
4058 	 * cluster count on the last record of the path directly to it's
4059 	 * left. For now, just catch that case and fool the layers
4060 	 * above us. This works just fine for tree_depth == 0, which
4061 	 * is why we allow that above.
4062 	 */
4063 	if (insert->ins_contig == CONTIG_LEFT &&
4064 	    insert->ins_contig_index == 0)
4065 		insert->ins_contig = CONTIG_NONE;
4066 
4067 	/*
4068 	 * Ok, so we can simply compare against last_eb to figure out
4069 	 * whether the path doesn't exist. This will only happen in
4070 	 * the case that we're doing a tail append, so maybe we can
4071 	 * take advantage of that information somehow.
4072 	 */
4073 	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
4074 		/*
4075 		 * Ok, ocfs2_find_path() returned us the rightmost
4076 		 * tree path. This might be an appending insert. There are
4077 		 * two cases:
4078 		 *    1) We're doing a true append at the tail:
4079 		 *	-This might even be off the end of the leaf
4080 		 *    2) We're "appending" by rotating in the tail
4081 		 */
4082 		ocfs2_figure_appending_type(insert, el, insert_rec);
4083 	}
4084 
4085 out:
4086 	ocfs2_free_path(path);
4087 
4088 	if (ret == 0)
4089 		*last_eb_bh = bh;
4090 	else
4091 		brelse(bh);
4092 	return ret;
4093 }
4094 
4095 /*
4096  * Insert an extent into an inode btree.
4097  *
4098  * The caller needs to update fe->i_clusters
4099  */
4100 int ocfs2_insert_extent(struct ocfs2_super *osb,
4101 			handle_t *handle,
4102 			struct inode *inode,
4103 			struct buffer_head *fe_bh,
4104 			u32 cpos,
4105 			u64 start_blk,
4106 			u32 new_clusters,
4107 			u8 flags,
4108 			struct ocfs2_alloc_context *meta_ac)
4109 {
4110 	int status;
4111 	int uninitialized_var(free_records);
4112 	struct buffer_head *last_eb_bh = NULL;
4113 	struct ocfs2_insert_type insert = {0, };
4114 	struct ocfs2_extent_rec rec;
4115 
4116 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
4117 
4118 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4119 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4120 
4121 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
4122 			(OCFS2_I(inode)->ip_clusters != cpos),
4123 			"Device %s, asking for sparse allocation: inode %llu, "
4124 			"cpos %u, clusters %u\n",
4125 			osb->dev_str,
4126 			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
4127 			OCFS2_I(inode)->ip_clusters);
4128 
4129 	memset(&rec, 0, sizeof(rec));
4130 	rec.e_cpos = cpu_to_le32(cpos);
4131 	rec.e_blkno = cpu_to_le64(start_blk);
4132 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4133 	rec.e_flags = flags;
4134 
4135 	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
4136 					  &free_records, &insert);
4137 	if (status < 0) {
4138 		mlog_errno(status);
4139 		goto bail;
4140 	}
4141 
4142 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4143 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4144 	     "Insert.tree_depth: %d\n",
4145 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4146 	     free_records, insert.ins_tree_depth);
4147 
4148 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4149 		status = ocfs2_grow_tree(inode, handle, fe_bh,
4150 					 &insert.ins_tree_depth, &last_eb_bh,
4151 					 meta_ac);
4152 		if (status) {
4153 			mlog_errno(status);
4154 			goto bail;
4155 		}
4156 	}
4157 
4158 	/* Finally, we can add clusters. This might rotate the tree for us. */
4159 	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
4160 	if (status < 0)
4161 		mlog_errno(status);
4162 	else
4163 		ocfs2_extent_map_insert_rec(inode, &rec);
4164 
4165 bail:
4166 	if (last_eb_bh)
4167 		brelse(last_eb_bh);
4168 
4169 	mlog_exit(status);
4170 	return status;
4171 }
4172 
4173 static void ocfs2_make_right_split_rec(struct super_block *sb,
4174 				       struct ocfs2_extent_rec *split_rec,
4175 				       u32 cpos,
4176 				       struct ocfs2_extent_rec *rec)
4177 {
4178 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4179 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4180 
4181 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4182 
4183 	split_rec->e_cpos = cpu_to_le32(cpos);
4184 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4185 
4186 	split_rec->e_blkno = rec->e_blkno;
4187 	le64_add_cpu(&split_rec->e_blkno,
4188 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4189 
4190 	split_rec->e_flags = rec->e_flags;
4191 }
4192 
/*
 * Split the record at split_index in the leaf pointed to by path so
 * that the range described by orig_split_rec gets its own record, and
 * insert the new record(s) via ocfs2_do_insert_extent().
 *
 * A split in the interior of a record ("left/right" split) is done in
 * two passes: a right split first, then a jump back to 'leftright' to
 * peel off the left portion. The tree may be grown, in which case
 * *last_eb_bh is swapped for the new rightmost leaf.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_split_and_insert(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_path *path,
				  struct buffer_head *di_bh,
				  struct buffer_head **last_eb_bh,
				  int split_index,
				  struct ocfs2_extent_rec *orig_split_rec,
				  struct ocfs2_alloc_context *meta_ac)
{
	int ret = 0, depth;
	unsigned int insert_range, rec_range, do_leftright = 0;
	struct ocfs2_extent_rec tmprec;
	struct ocfs2_extent_list *rightmost_el;
	struct ocfs2_extent_rec rec;
	struct ocfs2_extent_rec split_rec = *orig_split_rec;
	struct ocfs2_insert_type insert;
	struct ocfs2_extent_block *eb;
	struct ocfs2_dinode *di;

leftright:
	/*
	 * Store a copy of the record on the stack - it might move
	 * around as the tree is manipulated below.
	 */
	rec = path_leaf_el(path)->l_recs[split_index];

	di = (struct ocfs2_dinode *)di_bh->b_data;
	rightmost_el = &di->id2.i_list;

	depth = le16_to_cpu(rightmost_el->l_tree_depth);
	if (depth) {
		BUG_ON(!(*last_eb_bh));
		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
		rightmost_el = &eb->h_list;
	}

	/*
	 * A split adds a record, so grow the tree now if the rightmost
	 * leaf is already full.
	 */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_tree_depth = depth;

	/* Cluster offsets one past the end of each range. */
	insert_range = le32_to_cpu(split_rec.e_cpos) +
		le16_to_cpu(split_rec.e_leaf_clusters);
	rec_range = le32_to_cpu(rec.e_cpos) +
		le16_to_cpu(rec.e_leaf_clusters);

	if (split_rec.e_cpos == rec.e_cpos) {
		/* Ranges share a left edge - split off the left part. */
		insert.ins_split = SPLIT_LEFT;
	} else if (insert_range == rec_range) {
		/* Ranges share a right edge - split off the right part. */
		insert.ins_split = SPLIT_RIGHT;
	} else {
		/*
		 * Left/right split. We fake this as a right split
		 * first and then make a second pass as a left split.
		 */
		insert.ins_split = SPLIT_RIGHT;

		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
					   &rec);

		split_rec = tmprec;

		BUG_ON(do_leftright);
		do_leftright = 1;
	}

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec,
				     &insert);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (do_leftright == 1) {
		u32 cpos;
		struct ocfs2_extent_list *el;

		/*
		 * Second pass of a left/right split: the insert above
		 * may have rearranged the tree, so re-find the leaf
		 * and record index before splitting the left portion.
		 */
		do_leftright++;
		split_rec = *orig_split_rec;

		ocfs2_reinit_path(path, 1);

		cpos = le32_to_cpu(split_rec.e_cpos);
		ret = ocfs2_find_path(inode, path, cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		el = path_leaf_el(path);
		split_index = ocfs2_search_extent_list(el, cpos);
		goto leftright;
	}
out:

	return ret;
}
4300 
4301 /*
4302  * Mark part or all of the extent record at split_index in the leaf
4303  * pointed to by path as written. This removes the unwritten
4304  * extent flag.
4305  *
4306  * Care is taken to handle contiguousness so as to not grow the tree.
4307  *
4308  * meta_ac is not strictly necessary - we only truly need it if growth
4309  * of the tree is required. All other cases will degrade into a less
4310  * optimal tree layout.
4311  *
4312  * last_eb_bh should be the rightmost leaf block for any inode with a
 * btree. Since a split may grow the tree or a merge might shrink it,
 * the caller cannot trust the contents of that buffer after this call.
4314  *
4315  * This code is optimized for readability - several passes might be
4316  * made over certain portions of the tree. All of those blocks will
4317  * have been brought into cache (and pinned via the journal), so the
4318  * extra overhead is not expressed in terms of disk reads.
4319  */
4320 static int __ocfs2_mark_extent_written(struct inode *inode,
4321 				       struct buffer_head *di_bh,
4322 				       handle_t *handle,
4323 				       struct ocfs2_path *path,
4324 				       int split_index,
4325 				       struct ocfs2_extent_rec *split_rec,
4326 				       struct ocfs2_alloc_context *meta_ac,
4327 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
4328 {
4329 	int ret = 0;
4330 	struct ocfs2_extent_list *el = path_leaf_el(path);
4331 	struct buffer_head *last_eb_bh = NULL;
4332 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4333 	struct ocfs2_merge_ctxt ctxt;
4334 	struct ocfs2_extent_list *rightmost_el;
4335 
4336 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4337 		ret = -EIO;
4338 		mlog_errno(ret);
4339 		goto out;
4340 	}
4341 
4342 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4343 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4344 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4345 		ret = -EIO;
4346 		mlog_errno(ret);
4347 		goto out;
4348 	}
4349 
4350 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4351 							    split_index,
4352 							    split_rec);
4353 
4354 	/*
4355 	 * The core merge / split code wants to know how much room is
4356 	 * left in this inodes allocation tree, so we pass the
4357 	 * rightmost extent list.
4358 	 */
4359 	if (path->p_tree_depth) {
4360 		struct ocfs2_extent_block *eb;
4361 		struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
4362 
4363 		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
4364 				       le64_to_cpu(di->i_last_eb_blk),
4365 				       &last_eb_bh, OCFS2_BH_CACHED, inode);
4366 		if (ret) {
4367 			mlog_exit(ret);
4368 			goto out;
4369 		}
4370 
4371 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4372 		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
4373 			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
4374 			ret = -EROFS;
4375 			goto out;
4376 		}
4377 
4378 		rightmost_el = &eb->h_list;
4379 	} else
4380 		rightmost_el = path_root_el(path);
4381 
4382 	if (rec->e_cpos == split_rec->e_cpos &&
4383 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4384 		ctxt.c_split_covers_rec = 1;
4385 	else
4386 		ctxt.c_split_covers_rec = 0;
4387 
4388 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4389 
4390 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4391 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4392 	     ctxt.c_split_covers_rec);
4393 
4394 	if (ctxt.c_contig_type == CONTIG_NONE) {
4395 		if (ctxt.c_split_covers_rec)
4396 			el->l_recs[split_index] = *split_rec;
4397 		else
4398 			ret = ocfs2_split_and_insert(inode, handle, path, di_bh,
4399 						     &last_eb_bh, split_index,
4400 						     split_rec, meta_ac);
4401 		if (ret)
4402 			mlog_errno(ret);
4403 	} else {
4404 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
4405 						split_index, split_rec,
4406 						dealloc, &ctxt);
4407 		if (ret)
4408 			mlog_errno(ret);
4409 	}
4410 
4411 out:
4412 	brelse(last_eb_bh);
4413 	return ret;
4414 }
4415 
4416 /*
4417  * Mark the already-existing extent at cpos as written for len clusters.
4418  *
4419  * If the existing extent is larger than the request, initiate a
4420  * split. An attempt will be made at merging with adjacent extents.
4421  *
4422  * The caller is responsible for passing down meta_ac if we'll need it.
4423  */
int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh,
			      handle_t *handle, u32 cpos, u32 len, u32 phys,
			      struct ocfs2_alloc_context *meta_ac,
			      struct ocfs2_cached_dealloc_ctxt *dealloc)
{
	int ret, index;
	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);

	/*
	 * Seeing an unwritten extent on a filesystem without the
	 * feature bit means on-disk corruption - go read-only.
	 */
	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
			    "that are being written to, but the feature bit "
			    "is not set in the super block.",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
		ret = -EROFS;
		goto out;
	}

	/*
	 * XXX: This should be fixed up so that we just re-insert the
	 * next extent records.
	 */
	ocfs2_extent_map_trunc(inode, 0);

	left_path = ocfs2_new_inode_path(di_bh);
	if (!left_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* Find the leaf holding the record that contains cpos. */
	ret = ocfs2_find_path(inode, left_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}
	el = path_leaf_el(left_path);

	index = ocfs2_search_extent_list(el, cpos);
	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has an extent at cpos %u which can no "
			    "longer be found.\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
		ret = -EROFS;
		goto out;
	}

	/*
	 * Build the record describing the newly-written range:
	 * same flags as the existing record, minus UNWRITTEN.
	 */
	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
	split_rec.e_cpos = cpu_to_le32(cpos);
	split_rec.e_leaf_clusters = cpu_to_le16(len);
	split_rec.e_blkno = cpu_to_le64(start_blkno);
	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;

	ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path,
					  index, &split_rec, meta_ac, dealloc);
	if (ret)
		mlog_errno(ret);

out:
	ocfs2_free_path(left_path);
	return ret;
}
4493 
/*
 * Right-split the record at index in the leaf pointed to by path at
 * cluster offset new_range, so that [new_range, end-of-record) becomes
 * its own record. Extends the transaction and may grow the tree.
 *
 * Returns 0 on success, negative error code otherwise.
 */
static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	depth = path->p_tree_depth;
	if (depth > 0) {
		/* The insert code wants the rightmost leaf's list. */
		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
				       le64_to_cpu(di->i_last_eb_blk),
				       &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/* Reserve enough credits for a possible tree grow + insert. */
	credits += path->p_tree_depth +
		   ocfs2_extend_meta_needed(&di->id2.i_list);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	brelse(last_eb_bh);
	return ret;
}
4562 
/*
 * Remove the range (cpos, len) from the record at index in the leaf
 * pointed to by path. The range must share at least one edge with the
 * record (whole-record removal or a left/right edge trim); interior
 * removals must be split first by the caller (see ocfs2_split_tree()).
 *
 * Freed metadata is queued on dealloc. The tree may be rotated left
 * to fill the hole left by a whole-record removal.
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/*
	 * Rotate away a leading empty extent first so the index below
	 * refers to a stable position.
	 */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our inode tree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path(path_root_bh(path),
						   path_root_el(path));
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	/* Get journal access on both paths before modifying anything. */
	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(inode, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* A NULL left_path is handled by ocfs2_journal_access_path(). */
	ret = ocfs2_journal_access_path(inode, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		/* Case 1: the range covers the whole record. */
		int next_free;

		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	if (left_path) {
		int subtree_index;

		/* Propagate the new left edge up through the subtree. */
		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
4719 
4720 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh,
4721 			u32 cpos, u32 len, handle_t *handle,
4722 			struct ocfs2_alloc_context *meta_ac,
4723 			struct ocfs2_cached_dealloc_ctxt *dealloc)
4724 {
4725 	int ret, index;
4726 	u32 rec_range, trunc_range;
4727 	struct ocfs2_extent_rec *rec;
4728 	struct ocfs2_extent_list *el;
4729 	struct ocfs2_path *path;
4730 
4731 	ocfs2_extent_map_trunc(inode, 0);
4732 
4733 	path = ocfs2_new_inode_path(di_bh);
4734 	if (!path) {
4735 		ret = -ENOMEM;
4736 		mlog_errno(ret);
4737 		goto out;
4738 	}
4739 
4740 	ret = ocfs2_find_path(inode, path, cpos);
4741 	if (ret) {
4742 		mlog_errno(ret);
4743 		goto out;
4744 	}
4745 
4746 	el = path_leaf_el(path);
4747 	index = ocfs2_search_extent_list(el, cpos);
4748 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4749 		ocfs2_error(inode->i_sb,
4750 			    "Inode %llu has an extent at cpos %u which can no "
4751 			    "longer be found.\n",
4752 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4753 		ret = -EROFS;
4754 		goto out;
4755 	}
4756 
4757 	/*
4758 	 * We have 3 cases of extent removal:
4759 	 *   1) Range covers the entire extent rec
4760 	 *   2) Range begins or ends on one edge of the extent rec
4761 	 *   3) Range is in the middle of the extent rec (no shared edges)
4762 	 *
4763 	 * For case 1 we remove the extent rec and left rotate to
4764 	 * fill the hole.
4765 	 *
4766 	 * For case 2 we just shrink the existing extent rec, with a
4767 	 * tree update if the shrinking edge is also the edge of an
4768 	 * extent block.
4769 	 *
4770 	 * For case 3 we do a right split to turn the extent rec into
4771 	 * something case 2 can handle.
4772 	 */
4773 	rec = &el->l_recs[index];
4774 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
4775 	trunc_range = cpos + len;
4776 
4777 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
4778 
4779 	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
4780 	     "(cpos %u, len %u)\n",
4781 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
4782 	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
4783 
4784 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
4785 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4786 					 cpos, len);
4787 		if (ret) {
4788 			mlog_errno(ret);
4789 			goto out;
4790 		}
4791 	} else {
4792 		ret = ocfs2_split_tree(inode, di_bh, handle, path, index,
4793 				       trunc_range, meta_ac);
4794 		if (ret) {
4795 			mlog_errno(ret);
4796 			goto out;
4797 		}
4798 
4799 		/*
4800 		 * The split could have manipulated the tree enough to
4801 		 * move the record location, so we have to look for it again.
4802 		 */
4803 		ocfs2_reinit_path(path, 1);
4804 
4805 		ret = ocfs2_find_path(inode, path, cpos);
4806 		if (ret) {
4807 			mlog_errno(ret);
4808 			goto out;
4809 		}
4810 
4811 		el = path_leaf_el(path);
4812 		index = ocfs2_search_extent_list(el, cpos);
4813 		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4814 			ocfs2_error(inode->i_sb,
4815 				    "Inode %llu: split at cpos %u lost record.",
4816 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
4817 				    cpos);
4818 			ret = -EROFS;
4819 			goto out;
4820 		}
4821 
4822 		/*
4823 		 * Double check our values here. If anything is fishy,
4824 		 * it's easier to catch it at the top level.
4825 		 */
4826 		rec = &el->l_recs[index];
4827 		rec_range = le32_to_cpu(rec->e_cpos) +
4828 			ocfs2_rec_clusters(el, rec);
4829 		if (rec_range != trunc_range) {
4830 			ocfs2_error(inode->i_sb,
4831 				    "Inode %llu: error after split at cpos %u"
4832 				    "trunc len %u, existing record is (%u,%u)",
4833 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
4834 				    cpos, len, le32_to_cpu(rec->e_cpos),
4835 				    ocfs2_rec_clusters(el, rec));
4836 			ret = -EROFS;
4837 			goto out;
4838 		}
4839 
4840 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
4841 					 cpos, len);
4842 		if (ret) {
4843 			mlog_errno(ret);
4844 			goto out;
4845 		}
4846 	}
4847 
4848 out:
4849 	ocfs2_free_path(path);
4850 	return ret;
4851 }
4852 
4853 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
4854 {
4855 	struct buffer_head *tl_bh = osb->osb_tl_bh;
4856 	struct ocfs2_dinode *di;
4857 	struct ocfs2_truncate_log *tl;
4858 
4859 	di = (struct ocfs2_dinode *) tl_bh->b_data;
4860 	tl = &di->id2.i_dealloc;
4861 
4862 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
4863 			"slot %d, invalid truncate log parameters: used = "
4864 			"%u, count = %u\n", osb->slot_num,
4865 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
4866 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
4867 }
4868 
4869 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
4870 					   unsigned int new_start)
4871 {
4872 	unsigned int tail_index;
4873 	unsigned int current_tail;
4874 
4875 	/* No records, nothing to coalesce */
4876 	if (!le16_to_cpu(tl->tl_used))
4877 		return 0;
4878 
4879 	tail_index = le16_to_cpu(tl->tl_used) - 1;
4880 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
4881 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
4882 
4883 	return current_tail == new_start;
4884 }
4885 
/*
 * Append a freed cluster range to this node's truncate log, coalescing
 * with the tail record when contiguous. The clusters are actually
 * returned to the global bitmap later, when the log is flushed.
 *
 * Caller must hold tl_inode->i_mutex and must have flushed the log
 * first if it was full (see ocfs2_truncate_log_needs_flush()).
 */
int ocfs2_truncate_log_append(struct ocfs2_super *osb,
			      handle_t *handle,
			      u64 start_blk,
			      unsigned int num_clusters)
{
	int status, index;
	unsigned int start_cluster, tl_count;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry("start_blk = %llu, num_clusters = %u\n",
		   (unsigned long long)start_blk, num_clusters);

	/* Assert that the caller holds the truncate log mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto bail;
	}

	tl_count = le16_to_cpu(tl->tl_count);
	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
			tl_count == 0,
			"Truncate record count on #%llu invalid "
			"wanted %u, actual %u\n",
			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
			ocfs2_truncate_recs_per_inode(osb->sb),
			le16_to_cpu(tl->tl_count));

	/* Caller should have known to flush before calling us. */
	index = le16_to_cpu(tl->tl_used);
	if (index >= tl_count) {
		status = -ENOSPC;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_journal_access(handle, tl_inode, tl_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
	     "%llu (index = %d)\n", num_clusters, start_cluster,
	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);

	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
		/*
		 * Move index back to the record we are coalescing with.
		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
		 */
		index--;

		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
		     index, le32_to_cpu(tl->tl_recs[index].t_start),
		     num_clusters);
	} else {
		/* Start a fresh record in the next free slot. */
		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
		tl->tl_used = cpu_to_le16(index + 1);
	}
	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);

	status = ocfs2_journal_dirty(handle, tl_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

bail:
	mlog_exit(status);
	return status;
}
4968 
/*
 * Walk the truncate log from the tail, freeing each record's clusters
 * back to the global bitmap. The log dinode's tl_used is decremented
 * and journaled *before* the clusters of that record are freed, so a
 * crash mid-flush can at worst leak clusters, never double-free them.
 *
 * Caller holds tl_inode->i_mutex, the allocator lock on
 * data_alloc_inode, and an open transaction with at least enough
 * credits for one truncate log dinode update.
 */
static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
					 handle_t *handle,
					 struct inode *data_alloc_inode,
					 struct buffer_head *data_alloc_bh)
{
	int status = 0;
	int i;
	unsigned int num_clusters;
	u64 start_blk;
	struct ocfs2_truncate_rec rec;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct buffer_head *tl_bh = osb->osb_tl_bh;

	mlog_entry_void();

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	i = le16_to_cpu(tl->tl_used) - 1;
	while (i >= 0) {
		/* Caller has given us at least enough credits to
		 * update the truncate log dinode */
		status = ocfs2_journal_access(handle, tl_inode, tl_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* Drop the record from the log before freeing it. */
		tl->tl_used = cpu_to_le16(i);

		status = ocfs2_journal_dirty(handle, tl_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		/* TODO: Perhaps we can calculate the bulk of the
		 * credits up front rather than extending like
		 * this. */
		status = ocfs2_extend_trans(handle,
					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		rec = tl->tl_recs[i];
		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
						    le32_to_cpu(rec.t_start));
		num_clusters = le32_to_cpu(rec.t_clusters);

		/* if start_blk is not set, we ignore the record as
		 * invalid. */
		if (start_blk) {
			mlog(0, "free record %d, start = %u, clusters = %u\n",
			     i, le32_to_cpu(rec.t_start), num_clusters);

			status = ocfs2_free_clusters(handle, data_alloc_inode,
						     data_alloc_bh, start_blk,
						     num_clusters);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		i--;
	}

bail:
	mlog_exit(status);
	return status;
}
5043 
/* Expects you to already be holding tl_inode->i_mutex */
/*
 * Flush every record in this node's truncate log, returning the freed
 * clusters to the global bitmap. Takes the global bitmap inode's
 * mutex and cluster lock, replays the records inside a transaction,
 * and releases everything in reverse order.
 *
 * Returns 0 on success (including an already-empty log), negative
 * error code otherwise.
 */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* Assert that the caller holds the truncate log mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;
	tl = &di->id2.i_dealloc;
	if (!OCFS2_IS_VALID_DINODE(di)) {
		OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
		status = -EIO;
		goto out;
	}

	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	if (!num_to_flush) {
		/* Nothing queued - trivially done. */
		status = 0;
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	/* Cluster lock the bitmap for exclusive (write) access. */
	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
5120 
5121 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5122 {
5123 	int status;
5124 	struct inode *tl_inode = osb->osb_tl_inode;
5125 
5126 	mutex_lock(&tl_inode->i_mutex);
5127 	status = __ocfs2_flush_truncate_log(osb);
5128 	mutex_unlock(&tl_inode->i_mutex);
5129 
5130 	return status;
5131 }
5132 
5133 static void ocfs2_truncate_log_worker(struct work_struct *work)
5134 {
5135 	int status;
5136 	struct ocfs2_super *osb =
5137 		container_of(work, struct ocfs2_super,
5138 			     osb_truncate_log_wq.work);
5139 
5140 	mlog_entry_void();
5141 
5142 	status = ocfs2_flush_truncate_log(osb);
5143 	if (status < 0)
5144 		mlog_errno(status);
5145 	else
5146 		ocfs2_init_inode_steal_slot(osb);
5147 
5148 	mlog_exit(status);
5149 }
5150 
5151 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5152 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5153 				       int cancel)
5154 {
5155 	if (osb->osb_tl_inode) {
5156 		/* We want to push off log flushes while truncates are
5157 		 * still running. */
5158 		if (cancel)
5159 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5160 
5161 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5162 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5163 	}
5164 }
5165 
5166 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5167 				       int slot_num,
5168 				       struct inode **tl_inode,
5169 				       struct buffer_head **tl_bh)
5170 {
5171 	int status;
5172 	struct inode *inode = NULL;
5173 	struct buffer_head *bh = NULL;
5174 
5175 	inode = ocfs2_get_system_file_inode(osb,
5176 					   TRUNCATE_LOG_SYSTEM_INODE,
5177 					   slot_num);
5178 	if (!inode) {
5179 		status = -EINVAL;
5180 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5181 		goto bail;
5182 	}
5183 
5184 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
5185 				  OCFS2_BH_CACHED, inode);
5186 	if (status < 0) {
5187 		iput(inode);
5188 		mlog_errno(status);
5189 		goto bail;
5190 	}
5191 
5192 	*tl_inode = inode;
5193 	*tl_bh    = bh;
5194 bail:
5195 	mlog_exit(status);
5196 	return status;
5197 }
5198 
5199 /* called during the 1st stage of node recovery. we stamp a clean
5200  * truncate log and pass back a copy for processing later. if the
5201  * truncate log does not require processing, a *tl_copy is set to
5202  * NULL. */
5203 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5204 				      int slot_num,
5205 				      struct ocfs2_dinode **tl_copy)
5206 {
5207 	int status;
5208 	struct inode *tl_inode = NULL;
5209 	struct buffer_head *tl_bh = NULL;
5210 	struct ocfs2_dinode *di;
5211 	struct ocfs2_truncate_log *tl;
5212 
5213 	*tl_copy = NULL;
5214 
5215 	mlog(0, "recover truncate log from slot %d\n", slot_num);
5216 
5217 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5218 	if (status < 0) {
5219 		mlog_errno(status);
5220 		goto bail;
5221 	}
5222 
5223 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5224 	tl = &di->id2.i_dealloc;
5225 	if (!OCFS2_IS_VALID_DINODE(di)) {
5226 		OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
5227 		status = -EIO;
5228 		goto bail;
5229 	}
5230 
5231 	if (le16_to_cpu(tl->tl_used)) {
5232 		mlog(0, "We'll have %u logs to recover\n",
5233 		     le16_to_cpu(tl->tl_used));
5234 
5235 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5236 		if (!(*tl_copy)) {
5237 			status = -ENOMEM;
5238 			mlog_errno(status);
5239 			goto bail;
5240 		}
5241 
5242 		/* Assuming the write-out below goes well, this copy
5243 		 * will be passed back to recovery for processing. */
5244 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5245 
5246 		/* All we need to do to clear the truncate log is set
5247 		 * tl_used. */
5248 		tl->tl_used = 0;
5249 
5250 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
5251 		if (status < 0) {
5252 			mlog_errno(status);
5253 			goto bail;
5254 		}
5255 	}
5256 
5257 bail:
5258 	if (tl_inode)
5259 		iput(tl_inode);
5260 	if (tl_bh)
5261 		brelse(tl_bh);
5262 
5263 	if (status < 0 && (*tl_copy)) {
5264 		kfree(*tl_copy);
5265 		*tl_copy = NULL;
5266 	}
5267 
5268 	mlog_exit(status);
5269 	return status;
5270 }
5271 
/*
 * 2nd stage of truncate log recovery: replay the records snapshotted
 * by ocfs2_begin_truncate_log_recovery() into our own truncate log.
 * Each record is appended under a fresh small transaction so the
 * dead node's pending cluster frees eventually happen here.
 */
int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
					 struct ocfs2_dinode *tl_copy)
{
	int status = 0;
	int i;
	unsigned int clusters, num_recs, start_cluster;
	u64 start_blk;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* Replaying our own log into itself makes no sense and would
	 * indicate a recovery logic bug. */
	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
		return -EINVAL;
	}

	tl = &tl_copy->id2.i_dealloc;
	num_recs = le16_to_cpu(tl->tl_used);
	mlog(0, "cleanup %u records from %llu\n", num_recs,
	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));

	/* i_mutex serializes all access to the local truncate log. */
	mutex_lock(&tl_inode->i_mutex);
	for(i = 0; i < num_recs; i++) {
		/* Flush first when our log is nearly full so the
		 * append below has room. */
		if (ocfs2_truncate_log_needs_flush(osb)) {
			status = __ocfs2_flush_truncate_log(osb);
			if (status < 0) {
				mlog_errno(status);
				goto bail_up;
			}
		}

		/* One transaction per record keeps credit usage bounded. */
		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
		if (IS_ERR(handle)) {
			status = PTR_ERR(handle);
			mlog_errno(status);
			goto bail_up;
		}

		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);

		status = ocfs2_truncate_log_append(osb, handle,
						   start_blk, clusters);
		/* Commit regardless - the append journals its own changes. */
		ocfs2_commit_trans(osb, handle);
		if (status < 0) {
			mlog_errno(status);
			goto bail_up;
		}
	}

bail_up:
	mutex_unlock(&tl_inode->i_mutex);

	mlog_exit(status);
	return status;
}
5331 
5332 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5333 {
5334 	int status;
5335 	struct inode *tl_inode = osb->osb_tl_inode;
5336 
5337 	mlog_entry_void();
5338 
5339 	if (tl_inode) {
5340 		cancel_delayed_work(&osb->osb_truncate_log_wq);
5341 		flush_workqueue(ocfs2_wq);
5342 
5343 		status = ocfs2_flush_truncate_log(osb);
5344 		if (status < 0)
5345 			mlog_errno(status);
5346 
5347 		brelse(osb->osb_tl_bh);
5348 		iput(osb->osb_tl_inode);
5349 	}
5350 
5351 	mlog_exit_void();
5352 }
5353 
/*
 * Mount-time setup of the per-slot truncate log. On lookup failure
 * the locals stay NULL, so the osb fields end up NULL and
 * ocfs2_truncate_log_shutdown() (which keys on osb_tl_inode) will
 * treat the log as absent.
 */
int ocfs2_truncate_log_init(struct ocfs2_super *osb)
{
	int status;
	struct inode *tl_inode = NULL;
	struct buffer_head *tl_bh = NULL;

	mlog_entry_void();

	status = ocfs2_get_truncate_log_info(osb,
					     osb->slot_num,
					     &tl_inode,
					     &tl_bh);
	if (status < 0)
		mlog_errno(status);

	/* ocfs2_truncate_log_shutdown keys on the existence of
	 * osb->osb_tl_inode so we don't set any of the osb variables
	 * until we're sure all is well. */
	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
			  ocfs2_truncate_log_worker);
	osb->osb_tl_bh    = tl_bh;
	osb->osb_tl_inode = tl_inode;

	mlog_exit(status);
	return status;
}
5380 
5381 /*
5382  * Delayed de-allocation of suballocator blocks.
5383  *
5384  * Some sets of block de-allocations might involve multiple suballocator inodes.
5385  *
5386  * The locking for this can get extremely complicated, especially when
5387  * the suballocator inodes to delete from aren't known until deep
5388  * within an unrelated codepath.
5389  *
5390  * ocfs2_extent_block structures are a good example of this - an inode
5391  * btree could have been grown by any number of nodes each allocating
5392  * out of their own suballoc inode.
5393  *
5394  * These structures allow the delay of block de-allocation until a
5395  * later time, when locking of multiple cluster inodes won't cause
5396  * deadlock.
5397  */
5398 
5399 /*
5400  * Describes a single block free from a suballocator
5401  */
struct ocfs2_cached_block_free {
	struct ocfs2_cached_block_free		*free_next;	/* next item on the per-slot list */
	u64					free_blk;	/* block number to release */
	unsigned int				free_bit;	/* bit within its suballoc group */
};
5407 
/*
 * One pending-free list per (suballocator type, slot) pair.
 */
struct ocfs2_per_slot_free_list {
	struct ocfs2_per_slot_free_list		*f_next_suballocator;	/* next per-slot list */
	int					f_inode_type;		/* system file inode type */
	int					f_slot;			/* slot number */
	struct ocfs2_cached_block_free		*f_first;		/* head of item list */
};
5414 
/*
 * Free every cached suballocator bit on @head against the system
 * file identified by (@sysfile_type, @slot). Takes the suballoc
 * inode's i_mutex and cluster lock once, then frees each item inside
 * a single journal handle, extending the handle between items. List
 * nodes are kfree()'d as they're consumed; items left over after a
 * premature exit are freed without releasing their disk bits.
 */
static int ocfs2_free_cached_items(struct ocfs2_super *osb,
				   int sysfile_type,
				   int slot,
				   struct ocfs2_cached_block_free *head)
{
	int ret;
	u64 bg_blkno;
	handle_t *handle;
	struct inode *inode;
	struct buffer_head *di_bh = NULL;
	struct ocfs2_cached_block_free *tmp;

	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
	if (!inode) {
		ret = -EINVAL;
		mlog_errno(ret);
		goto out;
	}

	mutex_lock(&inode->i_mutex);

	ret = ocfs2_inode_lock(inode, &di_bh, 1);
	if (ret) {
		mlog_errno(ret);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	while (head) {
		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
						      head->free_bit);
		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
		     head->free_bit, (unsigned long long)head->free_blk);

		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
					       head->free_bit, bg_blkno, 1);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		/* Re-up the handle's credits for the next iteration. */
		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
		if (ret) {
			mlog_errno(ret);
			goto out_journal;
		}

		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

out_journal:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(inode, 1);
	brelse(di_bh);
out_mutex:
	mutex_unlock(&inode->i_mutex);
	iput(inode);
out:
	while(head) {
		/* Premature exit may have left some dangling items. */
		tmp = head;
		head = head->free_next;
		kfree(tmp);
	}

	return ret;
}
5492 
5493 int ocfs2_run_deallocs(struct ocfs2_super *osb,
5494 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
5495 {
5496 	int ret = 0, ret2;
5497 	struct ocfs2_per_slot_free_list *fl;
5498 
5499 	if (!ctxt)
5500 		return 0;
5501 
5502 	while (ctxt->c_first_suballocator) {
5503 		fl = ctxt->c_first_suballocator;
5504 
5505 		if (fl->f_first) {
5506 			mlog(0, "Free items: (type %u, slot %d)\n",
5507 			     fl->f_inode_type, fl->f_slot);
5508 			ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
5509 						       fl->f_slot, fl->f_first);
5510 			if (ret2)
5511 				mlog_errno(ret2);
5512 			if (!ret)
5513 				ret = ret2;
5514 		}
5515 
5516 		ctxt->c_first_suballocator = fl->f_next_suballocator;
5517 		kfree(fl);
5518 	}
5519 
5520 	return ret;
5521 }
5522 
5523 static struct ocfs2_per_slot_free_list *
5524 ocfs2_find_per_slot_free_list(int type,
5525 			      int slot,
5526 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
5527 {
5528 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
5529 
5530 	while (fl) {
5531 		if (fl->f_inode_type == type && fl->f_slot == slot)
5532 			return fl;
5533 
5534 		fl = fl->f_next_suballocator;
5535 	}
5536 
5537 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
5538 	if (fl) {
5539 		fl->f_inode_type = type;
5540 		fl->f_slot = slot;
5541 		fl->f_first = NULL;
5542 		fl->f_next_suballocator = ctxt->c_first_suballocator;
5543 
5544 		ctxt->c_first_suballocator = fl;
5545 	}
5546 	return fl;
5547 }
5548 
5549 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
5550 				     int type, int slot, u64 blkno,
5551 				     unsigned int bit)
5552 {
5553 	int ret;
5554 	struct ocfs2_per_slot_free_list *fl;
5555 	struct ocfs2_cached_block_free *item;
5556 
5557 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
5558 	if (fl == NULL) {
5559 		ret = -ENOMEM;
5560 		mlog_errno(ret);
5561 		goto out;
5562 	}
5563 
5564 	item = kmalloc(sizeof(*item), GFP_NOFS);
5565 	if (item == NULL) {
5566 		ret = -ENOMEM;
5567 		mlog_errno(ret);
5568 		goto out;
5569 	}
5570 
5571 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
5572 	     type, slot, bit, (unsigned long long)blkno);
5573 
5574 	item->free_blk = blkno;
5575 	item->free_bit = bit;
5576 	item->free_next = fl->f_first;
5577 
5578 	fl->f_first = item;
5579 
5580 	ret = 0;
5581 out:
5582 	return ret;
5583 }
5584 
5585 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
5586 					 struct ocfs2_extent_block *eb)
5587 {
5588 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
5589 					 le16_to_cpu(eb->h_suballoc_slot),
5590 					 le64_to_cpu(eb->h_blkno),
5591 					 le16_to_cpu(eb->h_suballoc_bit));
5592 }
5593 
/* This function will figure out whether the currently last extent
 * block will be deleted, and if it will, what the new last extent
 * block will be so we can update its h_next_leaf_blk field, as well
 * as the dinode's i_last_eb_blk. On return *new_last_eb is either
 * NULL (last block unchanged) or a referenced bh the caller owns. */
static int ocfs2_find_new_last_ext_blk(struct inode *inode,
				       unsigned int clusters_to_del,
				       struct ocfs2_path *path,
				       struct buffer_head **new_last_eb)
{
	int next_free, ret = 0;
	u32 cpos;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *el;
	struct buffer_head *bh = NULL;

	*new_last_eb = NULL;

	/* we have no tree, so of course, no last_eb. */
	if (!path->p_tree_depth)
		goto out;

	/* trunc to zero special case - this makes tree_depth = 0
	 * regardless of what it is.  */
	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
		goto out;

	el = path_leaf_el(path);
	BUG_ON(!el->l_next_free_rec);

	/*
	 * Make sure that this extent list will actually be empty
	 * after we clear away the data. We can shortcut out if
	 * there's more than one non-empty extent in the
	 * list. Otherwise, a check of the remaining extent is
	 * necessary.
	 */
	next_free = le16_to_cpu(el->l_next_free_rec);
	rec = NULL;
	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
		if (next_free > 2)
			goto out;

		/* We may have a valid extent in index 1, check it. */
		if (next_free == 2)
			rec = &el->l_recs[1];

		/*
		 * Fall through - no more nonempty extents, so we want
		 * to delete this leaf.
		 */
	} else {
		if (next_free > 1)
			goto out;

		rec = &el->l_recs[0];
	}

	if (rec) {
		/*
		 * Check it we'll only be trimming off the end of this
		 * cluster.
		 */
		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
			goto out;
	}

	/* The current leaf goes away entirely - walk to its left
	 * neighbor, which becomes the new rightmost leaf. */
	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	eb = (struct ocfs2_extent_block *) bh->b_data;
	el = &eb->h_list;
	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
		ret = -EROFS;
		goto out;
	}

	/* Extra reference for the caller; ours is dropped below. */
	*new_last_eb = bh;
	get_bh(*new_last_eb);
	mlog(0, "returning block %llu, (cpos: %u)\n",
	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
out:
	brelse(bh);

	return ret;
}
5690 
5691 /*
5692  * Trim some clusters off the rightmost edge of a tree. Only called
5693  * during truncate.
5694  *
5695  * The caller needs to:
5696  *   - start journaling of each path component.
5697  *   - compute and fully set up any new last ext block
5698  */
5699 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
5700 			   handle_t *handle, struct ocfs2_truncate_context *tc,
5701 			   u32 clusters_to_del, u64 *delete_start)
5702 {
5703 	int ret, i, index = path->p_tree_depth;
5704 	u32 new_edge = 0;
5705 	u64 deleted_eb = 0;
5706 	struct buffer_head *bh;
5707 	struct ocfs2_extent_list *el;
5708 	struct ocfs2_extent_rec *rec;
5709 
5710 	*delete_start = 0;
5711 
5712 	while (index >= 0) {
5713 		bh = path->p_node[index].bh;
5714 		el = path->p_node[index].el;
5715 
5716 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
5717 		     index,  (unsigned long long)bh->b_blocknr);
5718 
5719 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
5720 
5721 		if (index !=
5722 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
5723 			ocfs2_error(inode->i_sb,
5724 				    "Inode %lu has invalid ext. block %llu",
5725 				    inode->i_ino,
5726 				    (unsigned long long)bh->b_blocknr);
5727 			ret = -EROFS;
5728 			goto out;
5729 		}
5730 
5731 find_tail_record:
5732 		i = le16_to_cpu(el->l_next_free_rec) - 1;
5733 		rec = &el->l_recs[i];
5734 
5735 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
5736 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
5737 		     ocfs2_rec_clusters(el, rec),
5738 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
5739 		     le16_to_cpu(el->l_next_free_rec));
5740 
5741 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
5742 
5743 		if (le16_to_cpu(el->l_tree_depth) == 0) {
5744 			/*
5745 			 * If the leaf block contains a single empty
5746 			 * extent and no records, we can just remove
5747 			 * the block.
5748 			 */
5749 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
5750 				memset(rec, 0,
5751 				       sizeof(struct ocfs2_extent_rec));
5752 				el->l_next_free_rec = cpu_to_le16(0);
5753 
5754 				goto delete;
5755 			}
5756 
5757 			/*
5758 			 * Remove any empty extents by shifting things
5759 			 * left. That should make life much easier on
5760 			 * the code below. This condition is rare
5761 			 * enough that we shouldn't see a performance
5762 			 * hit.
5763 			 */
5764 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
5765 				le16_add_cpu(&el->l_next_free_rec, -1);
5766 
5767 				for(i = 0;
5768 				    i < le16_to_cpu(el->l_next_free_rec); i++)
5769 					el->l_recs[i] = el->l_recs[i + 1];
5770 
5771 				memset(&el->l_recs[i], 0,
5772 				       sizeof(struct ocfs2_extent_rec));
5773 
5774 				/*
5775 				 * We've modified our extent list. The
5776 				 * simplest way to handle this change
5777 				 * is to being the search from the
5778 				 * start again.
5779 				 */
5780 				goto find_tail_record;
5781 			}
5782 
5783 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
5784 
5785 			/*
5786 			 * We'll use "new_edge" on our way back up the
5787 			 * tree to know what our rightmost cpos is.
5788 			 */
5789 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
5790 			new_edge += le32_to_cpu(rec->e_cpos);
5791 
5792 			/*
5793 			 * The caller will use this to delete data blocks.
5794 			 */
5795 			*delete_start = le64_to_cpu(rec->e_blkno)
5796 				+ ocfs2_clusters_to_blocks(inode->i_sb,
5797 					le16_to_cpu(rec->e_leaf_clusters));
5798 
5799 			/*
5800 			 * If it's now empty, remove this record.
5801 			 */
5802 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
5803 				memset(rec, 0,
5804 				       sizeof(struct ocfs2_extent_rec));
5805 				le16_add_cpu(&el->l_next_free_rec, -1);
5806 			}
5807 		} else {
5808 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
5809 				memset(rec, 0,
5810 				       sizeof(struct ocfs2_extent_rec));
5811 				le16_add_cpu(&el->l_next_free_rec, -1);
5812 
5813 				goto delete;
5814 			}
5815 
5816 			/* Can this actually happen? */
5817 			if (le16_to_cpu(el->l_next_free_rec) == 0)
5818 				goto delete;
5819 
5820 			/*
5821 			 * We never actually deleted any clusters
5822 			 * because our leaf was empty. There's no
5823 			 * reason to adjust the rightmost edge then.
5824 			 */
5825 			if (new_edge == 0)
5826 				goto delete;
5827 
5828 			rec->e_int_clusters = cpu_to_le32(new_edge);
5829 			le32_add_cpu(&rec->e_int_clusters,
5830 				     -le32_to_cpu(rec->e_cpos));
5831 
5832 			 /*
5833 			  * A deleted child record should have been
5834 			  * caught above.
5835 			  */
5836 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
5837 		}
5838 
5839 delete:
5840 		ret = ocfs2_journal_dirty(handle, bh);
5841 		if (ret) {
5842 			mlog_errno(ret);
5843 			goto out;
5844 		}
5845 
5846 		mlog(0, "extent list container %llu, after: record %d: "
5847 		     "(%u, %u, %llu), next = %u.\n",
5848 		     (unsigned long long)bh->b_blocknr, i,
5849 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
5850 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
5851 		     le16_to_cpu(el->l_next_free_rec));
5852 
5853 		/*
5854 		 * We must be careful to only attempt delete of an
5855 		 * extent block (and not the root inode block).
5856 		 */
5857 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
5858 			struct ocfs2_extent_block *eb =
5859 				(struct ocfs2_extent_block *)bh->b_data;
5860 
5861 			/*
5862 			 * Save this for use when processing the
5863 			 * parent block.
5864 			 */
5865 			deleted_eb = le64_to_cpu(eb->h_blkno);
5866 
5867 			mlog(0, "deleting this extent block.\n");
5868 
5869 			ocfs2_remove_from_cache(inode, bh);
5870 
5871 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
5872 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
5873 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
5874 
5875 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
5876 			/* An error here is not fatal. */
5877 			if (ret < 0)
5878 				mlog_errno(ret);
5879 		} else {
5880 			deleted_eb = 0;
5881 		}
5882 
5883 		index--;
5884 	}
5885 
5886 	ret = 0;
5887 out:
5888 	return ret;
5889 }
5890 
/*
 * Remove @clusters_to_del clusters from the rightmost edge of
 * @inode's tree within one already-started transaction: journal the
 * whole path, trim the tree, update the dinode's cluster count and
 * last-extent-block pointer, and log the freed clusters in the
 * truncate log for later release.
 */
static int ocfs2_do_truncate(struct ocfs2_super *osb,
			     unsigned int clusters_to_del,
			     struct inode *inode,
			     struct buffer_head *fe_bh,
			     handle_t *handle,
			     struct ocfs2_truncate_context *tc,
			     struct ocfs2_path *path)
{
	int status;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *last_eb = NULL;
	struct ocfs2_extent_list *el;
	struct buffer_head *last_eb_bh = NULL;
	u64 delete_blk = 0;

	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	/* Will the rightmost extent block change? last_eb_bh stays
	 * NULL when it doesn't. */
	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
					     path, &last_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Each component will be touched, so we might as well journal
	 * here to avoid having to handle errors later.
	 */
	status = ocfs2_journal_access_path(inode, handle, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb_bh) {
		status = ocfs2_journal_access(handle, inode, last_eb_bh,
					      OCFS2_JOURNAL_ACCESS_WRITE);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}

		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
	}

	el = &(fe->id2.i_list);

	/*
	 * Lower levels depend on this never happening, but it's best
	 * to check it up here before changing the tree.
	 */
	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %lu has an empty extent record, depth %u\n",
			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
		status = -EROFS;
		goto bail;
	}

	/* Update the in-memory and on-disk cluster counts together. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
				      clusters_to_del;
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
	inode->i_blocks = ocfs2_inode_sector_count(inode);

	status = ocfs2_trim_tree(inode, path, handle, tc,
				 clusters_to_del, &delete_blk);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	if (le32_to_cpu(fe->i_clusters) == 0) {
		/* trunc to zero is a special case. */
		el->l_tree_depth = 0;
		fe->i_last_eb_blk = 0;
	} else if (last_eb)
		fe->i_last_eb_blk = last_eb->h_blkno;

	status = ocfs2_journal_dirty(handle, fe_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	if (last_eb) {
		/* If there will be a new last extent block, then by
		 * definition, there cannot be any leaves to the right of
		 * him. */
		last_eb->h_next_leaf_blk = 0;
		status = ocfs2_journal_dirty(handle, last_eb_bh);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	/* Defer the actual cluster free via the truncate log. */
	if (delete_blk) {
		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
						   clusters_to_del);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}
	status = 0;
bail:

	mlog_exit(status);
	return status;
}
6003 
/*
 * walk_page_buffers() callback for writeback data mode: mark the
 * buffer uptodate and dirty so it is written out normally.
 */
static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return 0;
}
6010 
/*
 * walk_page_buffers() callback for ordered data mode: like the
 * writeback variant, but additionally adds the data buffer to the
 * journal's ordered list.
 */
static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
{
	set_buffer_uptodate(bh);
	mark_buffer_dirty(bh);
	return ocfs2_journal_dirty_data(handle, bh);
}
6017 
6018 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6019 				     unsigned int from, unsigned int to,
6020 				     struct page *page, int zero, u64 *phys)
6021 {
6022 	int ret, partial = 0;
6023 
6024 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6025 	if (ret)
6026 		mlog_errno(ret);
6027 
6028 	if (zero)
6029 		zero_user_segment(page, from, to);
6030 
6031 	/*
6032 	 * Need to set the buffers we zero'd into uptodate
6033 	 * here if they aren't - ocfs2_map_page_blocks()
6034 	 * might've skipped some
6035 	 */
6036 	if (ocfs2_should_order_data(inode)) {
6037 		ret = walk_page_buffers(handle,
6038 					page_buffers(page),
6039 					from, to, &partial,
6040 					ocfs2_ordered_zero_func);
6041 		if (ret < 0)
6042 			mlog_errno(ret);
6043 	} else {
6044 		ret = walk_page_buffers(handle, page_buffers(page),
6045 					from, to, &partial,
6046 					ocfs2_writeback_zero_func);
6047 		if (ret < 0)
6048 			mlog_errno(ret);
6049 	}
6050 
6051 	if (!partial)
6052 		SetPageUptodate(page);
6053 
6054 	flush_dcache_page(page);
6055 }
6056 
6057 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6058 				     loff_t end, struct page **pages,
6059 				     int numpages, u64 phys, handle_t *handle)
6060 {
6061 	int i;
6062 	struct page *page;
6063 	unsigned int from, to = PAGE_CACHE_SIZE;
6064 	struct super_block *sb = inode->i_sb;
6065 
6066 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6067 
6068 	if (numpages == 0)
6069 		goto out;
6070 
6071 	to = PAGE_CACHE_SIZE;
6072 	for(i = 0; i < numpages; i++) {
6073 		page = pages[i];
6074 
6075 		from = start & (PAGE_CACHE_SIZE - 1);
6076 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6077 			to = end & (PAGE_CACHE_SIZE - 1);
6078 
6079 		BUG_ON(from > PAGE_CACHE_SIZE);
6080 		BUG_ON(to > PAGE_CACHE_SIZE);
6081 
6082 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6083 					 &phys);
6084 
6085 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6086 	}
6087 out:
6088 	if (pages)
6089 		ocfs2_unlock_and_free_pages(pages, numpages);
6090 }
6091 
/*
 * Grab and lock every page cache page covering the byte range
 * [start, end), which must lie within a single cluster. On success
 * *num is the count of locked pages in @pages; the caller releases
 * them via ocfs2_unlock_and_free_pages(). On failure any pages
 * already grabbed are released and *num is 0.
 */
static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
				struct page **pages, int *num)
{
	int numpages, ret = 0;
	struct super_block *sb = inode->i_sb;
	struct address_space *mapping = inode->i_mapping;
	unsigned long index;
	loff_t last_page_bytes;

	BUG_ON(start > end);

	/* Caller contract: the range must not cross a cluster boundary. */
	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);

	numpages = 0;
	last_page_bytes = PAGE_ALIGN(end);
	index = start >> PAGE_CACHE_SHIFT;
	do {
		pages[numpages] = grab_cache_page(mapping, index);
		if (!pages[numpages]) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		numpages++;
		index++;
	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));

out:
	if (ret != 0) {
		if (pages)
			ocfs2_unlock_and_free_pages(pages, numpages);
		numpages = 0;
	}

	*num = numpages;

	return ret;
}
6132 
6133 /*
6134  * Zero the area past i_size but still within an allocated
6135  * cluster. This avoids exposing nonzero data on subsequent file
6136  * extends.
6137  *
6138  * We need to call this before i_size is updated on the inode because
6139  * otherwise block_write_full_page() will skip writeout of pages past
6140  * i_size. The new_i_size parameter is passed for this reason.
6141  */
6142 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6143 				  u64 range_start, u64 range_end)
6144 {
6145 	int ret = 0, numpages;
6146 	struct page **pages = NULL;
6147 	u64 phys;
6148 	unsigned int ext_flags;
6149 	struct super_block *sb = inode->i_sb;
6150 
6151 	/*
6152 	 * File systems which don't support sparse files zero on every
6153 	 * extend.
6154 	 */
6155 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6156 		return 0;
6157 
6158 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
6159 			sizeof(struct page *), GFP_NOFS);
6160 	if (pages == NULL) {
6161 		ret = -ENOMEM;
6162 		mlog_errno(ret);
6163 		goto out;
6164 	}
6165 
6166 	if (range_start == range_end)
6167 		goto out;
6168 
6169 	ret = ocfs2_extent_map_get_blocks(inode,
6170 					  range_start >> sb->s_blocksize_bits,
6171 					  &phys, NULL, &ext_flags);
6172 	if (ret) {
6173 		mlog_errno(ret);
6174 		goto out;
6175 	}
6176 
6177 	/*
6178 	 * Tail is a hole, or is marked unwritten. In either case, we
6179 	 * can count on read and write to return/push zero's.
6180 	 */
6181 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6182 		goto out;
6183 
6184 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6185 				   &numpages);
6186 	if (ret) {
6187 		mlog_errno(ret);
6188 		goto out;
6189 	}
6190 
6191 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6192 				 numpages, phys, handle);
6193 
6194 	/*
6195 	 * Initiate writeout of the pages we zero'd here. We don't
6196 	 * wait on them - the truncate_inode_pages() call later will
6197 	 * do that for us.
6198 	 */
6199 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
6200 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
6201 	if (ret)
6202 		mlog_errno(ret);
6203 
6204 out:
6205 	if (pages)
6206 		kfree(pages);
6207 
6208 	return ret;
6209 }
6210 
6211 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di)
6212 {
6213 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6214 
6215 	memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2));
6216 }
6217 
/*
 * Reset the dinode's id2 area to a fresh, empty, depth-zero extent
 * list sized to fill the inode block.
 */
void ocfs2_dinode_new_extent_list(struct inode *inode,
				  struct ocfs2_dinode *di)
{
	ocfs2_zero_dinode_id2(inode, di);
	di->id2.i_list.l_tree_depth = 0;
	di->id2.i_list.l_next_free_rec = 0;
	di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb));
}
6226 
/*
 * Switch the inode to inline-data mode: set the dynamic feature flag
 * (in memory and on the dinode) and reinitialize id2 as an empty
 * inline data area.
 */
void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
{
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	/* ip_lock guards ip_dyn_features updates. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	/*
	 * We clear the entire i_data structure here so that all
	 * fields can be properly initialized.
	 */
	ocfs2_zero_dinode_id2(inode, di);

	idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb));
}
6245 
/*
 * Convert an inline-data inode into one with a regular extent list.
 *
 * If the inode holds data (i_size != 0), one cluster is reserved and
 * claimed, the inline bytes are copied into the page cache pages
 * covering that cluster, the pages are dirtied within this handle, and
 * the cluster is inserted as the extent at cpos 0.  In all cases the
 * inline flag is cleared and an empty extent list installed in the
 * dinode.  Caller must hold the appropriate cluster/inode locks and
 * pass the inode's journalled-accessible dinode buffer in 'di_bh'.
 *
 * Returns 0 on success, negative errno on failure.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;

	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		/* One page pointer per page in the cluster we'll allocate. */
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		/* Reserve the single cluster before starting the handle. */
		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/* Don't map past the cluster when pages are larger than it. */
		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		/* Only page 0 carries data; later pages are zeroed (i > 0). */
		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	/* Clear the inline flag in memory and on disk, under ip_lock. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ret = ocfs2_insert_extent(osb, handle, inode, di_bh,
					  0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
6382 
6383 /*
6384  * It is expected, that by the time you call this function,
6385  * inode->i_size and fe->i_size have been adjusted.
6386  *
6387  * WARNING: This will kfree the truncate context
6388  */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, new_highest_cpos, range;
	struct ocfs2_extent_list *el;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_path *path = NULL;

	mlog_entry_void();

	/* First cluster offset that must NOT survive the truncate. */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						     i_size_read(inode));

	path = ocfs2_new_inode_path(fe_bh);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	/* Drop cached extent map entries past the new size. */
	ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(inode, path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		/* A rightmost leaf with no records indicates corruption. */
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	/* 'range' is the cpos just past the end of the tail record. */
	i = le16_to_cpu(el->l_next_free_rec) - 1;
	range = le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
		clusters_to_del = 0;
	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
		/* Whole record lies past the new size - delete it all. */
		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
	} else if (range > new_highest_cpos) {
		/* Record straddles the new size - delete the tail portion. */
		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  new_highest_cpos;
	} else {
		status = 0;
		goto bail;
	}

	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log.  */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
						(struct ocfs2_dinode *)fe_bh->b_data,
						el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
				   tc, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	/* Re-walk from the root on the next pass - the tree changed. */
	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	ocfs2_run_deallocs(osb, &tc->tc_dealloc);

	ocfs2_free_path(path);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}
6541 
6542 /*
6543  * Expects the inode to already be locked.
6544  */
int ocfs2_prepare_truncate(struct ocfs2_super *osb,
			   struct inode *inode,
			   struct buffer_head *fe_bh,
			   struct ocfs2_truncate_context **tc)
{
	int status;
	unsigned int new_i_clusters;
	struct ocfs2_dinode *fe;
	struct ocfs2_extent_block *eb;
	struct buffer_head *last_eb_bh = NULL;

	mlog_entry_void();

	/* On failure *tc stays NULL so the caller needn't clean up. */
	*tc = NULL;

	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
						  i_size_read(inode));
	fe = (struct ocfs2_dinode *) fe_bh->b_data;

	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
	     (unsigned long long)le64_to_cpu(fe->i_size));

	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
	if (!(*tc)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}
	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);

	/* For a multi-level tree, read and validate the rightmost leaf. */
	if (fe->id2.i_list.l_tree_depth) {
		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
					  &last_eb_bh, OCFS2_BH_CACHED, inode);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);

			brelse(last_eb_bh);
			status = -EIO;
			goto bail;
		}
	}

	/* Context now owns the reference; freed with the context. */
	(*tc)->tc_last_eb_bh = last_eb_bh;

	status = 0;
bail:
	if (status < 0) {
		if (*tc)
			ocfs2_free_truncate_context(*tc);
		*tc = NULL;
	}
	mlog_exit_void();
	return status;
}
6605 
6606 /*
6607  * 'start' is inclusive, 'end' is not.
6608  */
int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
			  unsigned int start, unsigned int end, int trunc)
{
	int ret;
	unsigned int numbytes;
	handle_t *handle;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_inline_data *idata = &di->id2.i_data;

	/* Never zero past the current end of data. */
	if (end > i_size_read(inode))
		end = i_size_read(inode);

	BUG_ON(start >= end);

	/*
	 * The inline flag must agree in memory, on disk, and in the
	 * superblock feature set; any mismatch indicates corruption.
	 */
	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
	    !ocfs2_supports_inline_data(osb)) {
		ocfs2_error(inode->i_sb,
			    "Inline data flags for inode %llu don't agree! "
			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    le16_to_cpu(di->i_dyn_features),
			    OCFS2_I(inode)->ip_dyn_features,
			    osb->s_feature_incompat);
		ret = -EROFS;
		goto out;
	}

	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access(handle, inode, di_bh,
				   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	/* Zero the inline bytes in [start, end). */
	numbytes = end - start;
	memset(idata->id_data + start, 0, numbytes);

	/*
	 * No need to worry about the data page here - it's been
	 * truncated already and inline data doesn't need it for
	 * pushing zero's to disk, so we'll let readpage pick it up
	 * later.
	 */
	if (trunc) {
		i_size_write(inode, start);
		di->i_size = cpu_to_le64(start);
	}

	inode->i_blocks = ocfs2_inode_sector_count(inode);
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);

	ocfs2_journal_dirty(handle, di_bh);

out_commit:
	ocfs2_commit_trans(osb, handle);

out:
	return ret;
}
6680 
6681 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
6682 {
6683 	/*
6684 	 * The caller is responsible for completing deallocation
6685 	 * before freeing the context.
6686 	 */
6687 	if (tc->tc_dealloc.c_first_suballocator != NULL)
6688 		mlog(ML_NOTICE,
6689 		     "Truncate completion has non-empty dealloc context\n");
6690 
6691 	if (tc->tc_last_eb_bh)
6692 		brelse(tc->tc_last_eb_bh);
6693 
6694 	kfree(tc);
6695 }
6696