xref: /openbmc/linux/fs/ocfs2/alloc.c (revision 13723d00e374c2a6d6ccb5af6de965e89c3e1b01)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32 
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35 
36 #include "ocfs2.h"
37 
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 
52 #include "buffer_head_io.h"
53 
54 
55 /*
56  * Operations for a specific extent tree type.
57  *
58  * To implement an on-disk btree (extent tree) type in ocfs2, add
59  * an ocfs2_extent_tree_operations structure and the matching
60  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
61  * for the allocation portion of the extent tree.
62  */
63 struct ocfs2_extent_tree_operations {
64 	/*
65 	 * last_eb_blk is the block number of the right most leaf extent
66 	 * block.  Most on-disk structures containing an extent tree store
67 	 * this value for fast access.  The ->eo_set_last_eb_blk() and
68 	 * ->eo_get_last_eb_blk() operations access this value.  They are
69 	 *  both required.
70 	 */
71 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
72 				   u64 blkno);
73 	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
74 
75 	/*
76 	 * The on-disk structure usually keeps track of how many total
77 	 * clusters are stored in this extent tree.  This function updates
78 	 * that value.  new_clusters is the delta, and must be
79 	 * added to the total.  Required.
80 	 */
81 	void (*eo_update_clusters)(struct inode *inode,
82 				   struct ocfs2_extent_tree *et,
83 				   u32 new_clusters);
84 
85 	/*
86 	 * If ->eo_insert_check() exists, it is called before rec is
87 	 * inserted into the extent tree.  It is optional.
88 	 */
89 	int (*eo_insert_check)(struct inode *inode,
90 			       struct ocfs2_extent_tree *et,
91 			       struct ocfs2_extent_rec *rec);
92 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
93 
94 	/*
95 	 * --------------------------------------------------------------
96 	 * The remaining are internal to ocfs2_extent_tree and don't have
97 	 * accessor functions
98 	 */
99 
100 	/*
101 	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
102 	 * It is required.
103 	 */
104 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
105 
106 	/*
107 	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
108 	 * it exists.  If it does not, et->et_max_leaf_clusters is set
109 	 * to 0 (unlimited).  Optional.
110 	 */
111 	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
112 					  struct ocfs2_extent_tree *et);
113 };
114 
115 
116 /*
117  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
118  * in the methods.
119  */
120 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
121 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
122 					 u64 blkno);
123 static void ocfs2_dinode_update_clusters(struct inode *inode,
124 					 struct ocfs2_extent_tree *et,
125 					 u32 clusters);
126 static int ocfs2_dinode_insert_check(struct inode *inode,
127 				     struct ocfs2_extent_tree *et,
128 				     struct ocfs2_extent_rec *rec);
129 static int ocfs2_dinode_sanity_check(struct inode *inode,
130 				     struct ocfs2_extent_tree *et);
131 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
132 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
133 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
134 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
135 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
136 	.eo_insert_check	= ocfs2_dinode_insert_check,
137 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
138 	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
139 };
140 
141 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
142 					 u64 blkno)
143 {
144 	struct ocfs2_dinode *di = et->et_object;
145 
146 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
147 	di->i_last_eb_blk = cpu_to_le64(blkno);
148 }
149 
150 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
151 {
152 	struct ocfs2_dinode *di = et->et_object;
153 
154 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
155 	return le64_to_cpu(di->i_last_eb_blk);
156 }
157 
158 static void ocfs2_dinode_update_clusters(struct inode *inode,
159 					 struct ocfs2_extent_tree *et,
160 					 u32 clusters)
161 {
162 	struct ocfs2_dinode *di = et->et_object;
163 
164 	le32_add_cpu(&di->i_clusters, clusters);
165 	spin_lock(&OCFS2_I(inode)->ip_lock);
166 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
167 	spin_unlock(&OCFS2_I(inode)->ip_lock);
168 }
169 
170 static int ocfs2_dinode_insert_check(struct inode *inode,
171 				     struct ocfs2_extent_tree *et,
172 				     struct ocfs2_extent_rec *rec)
173 {
174 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
175 
176 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
177 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
178 			(OCFS2_I(inode)->ip_clusters != rec->e_cpos),
179 			"Device %s, asking for sparse allocation: inode %llu, "
180 			"cpos %u, clusters %u\n",
181 			osb->dev_str,
182 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
183 			rec->e_cpos,
184 			OCFS2_I(inode)->ip_clusters);
185 
186 	return 0;
187 }
188 
189 static int ocfs2_dinode_sanity_check(struct inode *inode,
190 				     struct ocfs2_extent_tree *et)
191 {
192 	struct ocfs2_dinode *di = et->et_object;
193 
194 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
195 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
196 
197 	return 0;
198 }
199 
200 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
201 {
202 	struct ocfs2_dinode *di = et->et_object;
203 
204 	et->et_root_el = &di->id2.i_list;
205 }
206 
207 
208 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
209 {
210 	struct ocfs2_xattr_value_root *xv = et->et_object;
211 
212 	et->et_root_el = &xv->xr_list;
213 }
214 
215 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
216 					      u64 blkno)
217 {
218 	struct ocfs2_xattr_value_root *xv =
219 		(struct ocfs2_xattr_value_root *)et->et_object;
220 
221 	xv->xr_last_eb_blk = cpu_to_le64(blkno);
222 }
223 
224 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
225 {
226 	struct ocfs2_xattr_value_root *xv =
227 		(struct ocfs2_xattr_value_root *) et->et_object;
228 
229 	return le64_to_cpu(xv->xr_last_eb_blk);
230 }
231 
232 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
233 					      struct ocfs2_extent_tree *et,
234 					      u32 clusters)
235 {
236 	struct ocfs2_xattr_value_root *xv =
237 		(struct ocfs2_xattr_value_root *)et->et_object;
238 
239 	le32_add_cpu(&xv->xr_clusters, clusters);
240 }
241 
242 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
243 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
244 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
245 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
246 	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
247 };
248 
249 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
250 {
251 	struct ocfs2_xattr_block *xb = et->et_object;
252 
253 	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
254 }
255 
256 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
257 						    struct ocfs2_extent_tree *et)
258 {
259 	et->et_max_leaf_clusters =
260 		ocfs2_clusters_for_bytes(inode->i_sb,
261 					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
262 }
263 
264 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
265 					     u64 blkno)
266 {
267 	struct ocfs2_xattr_block *xb = et->et_object;
268 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
269 
270 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
271 }
272 
273 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
274 {
275 	struct ocfs2_xattr_block *xb = et->et_object;
276 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
277 
278 	return le64_to_cpu(xt->xt_last_eb_blk);
279 }
280 
281 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
282 					     struct ocfs2_extent_tree *et,
283 					     u32 clusters)
284 {
285 	struct ocfs2_xattr_block *xb = et->et_object;
286 
287 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
288 }
289 
290 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
291 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
292 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
293 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
294 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
295 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
296 };
297 
298 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
299 				     struct inode *inode,
300 				     struct buffer_head *bh,
301 				     ocfs2_journal_access_func access,
302 				     void *obj,
303 				     struct ocfs2_extent_tree_operations *ops)
304 {
305 	et->et_ops = ops;
306 	et->et_root_bh = bh;
307 	et->et_root_journal_access = access;
308 	if (!obj)
309 		obj = (void *)bh->b_data;
310 	et->et_object = obj;
311 
312 	et->et_ops->eo_fill_root_el(et);
313 	if (!et->et_ops->eo_fill_max_leaf_clusters)
314 		et->et_max_leaf_clusters = 0;
315 	else
316 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
317 }
318 
319 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
320 				   struct inode *inode,
321 				   struct buffer_head *bh)
322 {
323 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
324 				 NULL, &ocfs2_dinode_et_ops);
325 }
326 
327 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
328 				       struct inode *inode,
329 				       struct buffer_head *bh)
330 {
331 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
332 				 NULL, &ocfs2_xattr_tree_et_ops);
333 }
334 
335 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
336 					struct inode *inode,
337 					struct buffer_head *bh,
338 					struct ocfs2_xattr_value_root *xv)
339 {
340 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access, xv,
341 				 &ocfs2_xattr_value_et_ops);
342 }
343 
344 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
345 					    u64 new_last_eb_blk)
346 {
347 	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
348 }
349 
350 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
351 {
352 	return et->et_ops->eo_get_last_eb_blk(et);
353 }
354 
355 static inline void ocfs2_et_update_clusters(struct inode *inode,
356 					    struct ocfs2_extent_tree *et,
357 					    u32 clusters)
358 {
359 	et->et_ops->eo_update_clusters(inode, et, clusters);
360 }
361 
362 static inline int ocfs2_et_root_journal_access(handle_t *handle,
363 					       struct inode *inode,
364 					       struct ocfs2_extent_tree *et,
365 					       int type)
366 {
367 	return et->et_root_journal_access(handle, inode, et->et_root_bh,
368 					  type);
369 }
370 
371 static inline int ocfs2_et_insert_check(struct inode *inode,
372 					struct ocfs2_extent_tree *et,
373 					struct ocfs2_extent_rec *rec)
374 {
375 	int ret = 0;
376 
377 	if (et->et_ops->eo_insert_check)
378 		ret = et->et_ops->eo_insert_check(inode, et, rec);
379 	return ret;
380 }
381 
382 static inline int ocfs2_et_sanity_check(struct inode *inode,
383 					struct ocfs2_extent_tree *et)
384 {
385 	int ret = 0;
386 
387 	if (et->et_ops->eo_sanity_check)
388 		ret = et->et_ops->eo_sanity_check(inode, et);
389 	return ret;
390 }
391 
/* Forward declarations for helpers that are defined later in this file. */
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int ocfs2_cache_extent_block_free(
				struct ocfs2_cached_dealloc_ctxt *ctxt,
				struct ocfs2_extent_block *eb);
395 
396 /*
397  * Structures which describe a path through a btree, and functions to
398  * manipulate them.
399  *
400  * The idea here is to be as generic as possible with the tree
401  * manipulation code.
402  */
403 struct ocfs2_path_item {
404 	struct buffer_head		*bh;
405 	struct ocfs2_extent_list	*el;
406 };
407 
408 #define OCFS2_MAX_PATH_DEPTH	5
409 
410 struct ocfs2_path {
411 	int				p_tree_depth;
412 	ocfs2_journal_access_func	p_root_access;
413 	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
414 };
415 
416 #define path_root_bh(_path) ((_path)->p_node[0].bh)
417 #define path_root_el(_path) ((_path)->p_node[0].el)
418 #define path_root_access(_path)((_path)->p_root_access)
419 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
420 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
421 #define path_num_items(_path) ((_path)->p_tree_depth + 1)
422 
423 /*
424  * Reset the actual path elements so that we can re-use the structure
425  * to build another path. Generally, this involves freeing the buffer
426  * heads.
427  */
428 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
429 {
430 	int i, start = 0, depth = 0;
431 	struct ocfs2_path_item *node;
432 
433 	if (keep_root)
434 		start = 1;
435 
436 	for(i = start; i < path_num_items(path); i++) {
437 		node = &path->p_node[i];
438 
439 		brelse(node->bh);
440 		node->bh = NULL;
441 		node->el = NULL;
442 	}
443 
444 	/*
445 	 * Tree depth may change during truncate, or insert. If we're
446 	 * keeping the root extent list, then make sure that our path
447 	 * structure reflects the proper depth.
448 	 */
449 	if (keep_root)
450 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
451 	else
452 		path_root_access(path) = NULL;
453 
454 	path->p_tree_depth = depth;
455 }
456 
/*
 * Release every buffer head held by path, root included, and free the
 * path itself.  A NULL path is ignored.
 */
static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
464 
465 /*
466  * All the elements of src into dest. After this call, src could be freed
467  * without affecting dest.
468  *
469  * Both paths should have the same root. Any non-root elements of dest
470  * will be freed.
471  */
472 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
473 {
474 	int i;
475 
476 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
477 	BUG_ON(path_root_el(dest) != path_root_el(src));
478 	BUG_ON(path_root_access(dest) != path_root_access(src));
479 
480 	ocfs2_reinit_path(dest, 1);
481 
482 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
483 		dest->p_node[i].bh = src->p_node[i].bh;
484 		dest->p_node[i].el = src->p_node[i].el;
485 
486 		if (dest->p_node[i].bh)
487 			get_bh(dest->p_node[i].bh);
488 	}
489 }
490 
491 /*
492  * Make the *dest path the same as src and re-initialize src path to
493  * have a root only.
494  */
495 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
496 {
497 	int i;
498 
499 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
500 	BUG_ON(path_root_access(dest) != path_root_access(src));
501 
502 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
503 		brelse(dest->p_node[i].bh);
504 
505 		dest->p_node[i].bh = src->p_node[i].bh;
506 		dest->p_node[i].el = src->p_node[i].el;
507 
508 		src->p_node[i].bh = NULL;
509 		src->p_node[i].el = NULL;
510 	}
511 }
512 
513 /*
514  * Insert an extent block at given index.
515  *
516  * This will not take an additional reference on eb_bh.
517  */
518 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
519 					struct buffer_head *eb_bh)
520 {
521 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
522 
523 	/*
524 	 * Right now, no root bh is an extent block, so this helps
525 	 * catch code errors with dinode trees. The assertion can be
526 	 * safely removed if we ever need to insert extent block
527 	 * structures at the root.
528 	 */
529 	BUG_ON(index == 0);
530 
531 	path->p_node[index].bh = eb_bh;
532 	path->p_node[index].el = &eb->h_list;
533 }
534 
535 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
536 					 struct ocfs2_extent_list *root_el,
537 					 ocfs2_journal_access_func access)
538 {
539 	struct ocfs2_path *path;
540 
541 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
542 
543 	path = kzalloc(sizeof(*path), GFP_NOFS);
544 	if (path) {
545 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
546 		get_bh(root_bh);
547 		path_root_bh(path) = root_bh;
548 		path_root_el(path) = root_el;
549 		path_root_access(path) = access;
550 	}
551 
552 	return path;
553 }
554 
/*
 * Allocate a fresh root-only path that shares the root buffer, root
 * extent list and root access function of an existing path.
 */
static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	struct buffer_head *root_bh = path_root_bh(path);
	struct ocfs2_extent_list *root_el = path_root_el(path);

	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
}
560 
561 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
562 {
563 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
564 			      et->et_root_journal_access);
565 }
566 
567 /*
568  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
569  * otherwise it's the root_access function.
570  *
571  * I don't like the way this function's name looks next to
572  * ocfs2_journal_access_path(), but I don't have a better one.
573  */
574 static int ocfs2_path_bh_journal_access(handle_t *handle,
575 					struct inode *inode,
576 					struct ocfs2_path *path,
577 					int idx)
578 {
579 	ocfs2_journal_access_func access = path_root_access(path);
580 
581 	if (!access)
582 		access = ocfs2_journal_access;
583 
584 	if (idx)
585 		access = ocfs2_journal_access_eb;
586 
587 	return access(handle, inode, path->p_node[idx].bh,
588 		      OCFS2_JOURNAL_ACCESS_WRITE);
589 }
590 
591 /*
592  * Convenience function to journal all components in a path.
593  */
594 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
595 				     struct ocfs2_path *path)
596 {
597 	int i, ret = 0;
598 
599 	if (!path)
600 		goto out;
601 
602 	for(i = 0; i < path_num_items(path); i++) {
603 		ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
604 		if (ret < 0) {
605 			mlog_errno(ret);
606 			goto out;
607 		}
608 	}
609 
610 out:
611 	return ret;
612 }
613 
614 /*
615  * Return the index of the extent record which contains cluster #v_cluster.
616  * -1 is returned if it was not found.
617  *
618  * Should work fine on interior and exterior nodes.
619  */
620 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
621 {
622 	int ret = -1;
623 	int i;
624 	struct ocfs2_extent_rec *rec;
625 	u32 rec_end, rec_start, clusters;
626 
627 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
628 		rec = &el->l_recs[i];
629 
630 		rec_start = le32_to_cpu(rec->e_cpos);
631 		clusters = ocfs2_rec_clusters(el, rec);
632 
633 		rec_end = rec_start + clusters;
634 
635 		if (v_cluster >= rec_start && v_cluster < rec_end) {
636 			ret = i;
637 			break;
638 		}
639 	}
640 
641 	return ret;
642 }
643 
/*
 * How one extent record lines up against another.  See
 * ocfs2_extent_contig() for how NONE, LEFT and RIGHT are decided.
 */
enum ocfs2_contig_type {
	CONTIG_NONE = 0,	/* not contiguous, or flags differ */
	CONTIG_LEFT,		/* the tested record ends where the existing one starts */
	CONTIG_RIGHT,		/* the tested record starts where the existing one ends */
	CONTIG_LEFTRIGHT,
};
650 
651 
652 /*
653  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
654  * ocfs2_extent_contig only work properly against leaf nodes!
655  */
656 static int ocfs2_block_extent_contig(struct super_block *sb,
657 				     struct ocfs2_extent_rec *ext,
658 				     u64 blkno)
659 {
660 	u64 blk_end = le64_to_cpu(ext->e_blkno);
661 
662 	blk_end += ocfs2_clusters_to_blocks(sb,
663 				    le16_to_cpu(ext->e_leaf_clusters));
664 
665 	return blkno == blk_end;
666 }
667 
668 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
669 				  struct ocfs2_extent_rec *right)
670 {
671 	u32 left_range;
672 
673 	left_range = le32_to_cpu(left->e_cpos) +
674 		le16_to_cpu(left->e_leaf_clusters);
675 
676 	return (left_range == le32_to_cpu(right->e_cpos));
677 }
678 
679 static enum ocfs2_contig_type
680 	ocfs2_extent_contig(struct inode *inode,
681 			    struct ocfs2_extent_rec *ext,
682 			    struct ocfs2_extent_rec *insert_rec)
683 {
684 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
685 
686 	/*
687 	 * Refuse to coalesce extent records with different flag
688 	 * fields - we don't want to mix unwritten extents with user
689 	 * data.
690 	 */
691 	if (ext->e_flags != insert_rec->e_flags)
692 		return CONTIG_NONE;
693 
694 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
695 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
696 			return CONTIG_RIGHT;
697 
698 	blkno = le64_to_cpu(ext->e_blkno);
699 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
700 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
701 		return CONTIG_LEFT;
702 
703 	return CONTIG_NONE;
704 }
705 
/*
 * NOTE: Pretty much any combination of contiguousness and appending
 * is possible.
 *
 * APPEND_TAIL is mostly useful because it tells us that the path to
 * that leaf will have to be updated.
 */
enum ocfs2_append_type {
	APPEND_NONE = 0,
	APPEND_TAIL,
};
717 
/* Split mode recorded in ocfs2_insert_type.ins_split. */
enum ocfs2_split_type {
	SPLIT_NONE = 0,
	SPLIT_LEFT,
	SPLIT_RIGHT,
};
723 
724 struct ocfs2_insert_type {
725 	enum ocfs2_split_type	ins_split;
726 	enum ocfs2_append_type	ins_appending;
727 	enum ocfs2_contig_type	ins_contig;
728 	int			ins_contig_index;
729 	int			ins_tree_depth;
730 };
731 
732 struct ocfs2_merge_ctxt {
733 	enum ocfs2_contig_type	c_contig_type;
734 	int			c_has_empty_extent;
735 	int			c_split_covers_rec;
736 };
737 
738 static int ocfs2_validate_extent_block(struct super_block *sb,
739 				       struct buffer_head *bh)
740 {
741 	int rc;
742 	struct ocfs2_extent_block *eb =
743 		(struct ocfs2_extent_block *)bh->b_data;
744 
745 	mlog(0, "Validating extent block %llu\n",
746 	     (unsigned long long)bh->b_blocknr);
747 
748 	BUG_ON(!buffer_uptodate(bh));
749 
750 	/*
751 	 * If the ecc fails, we return the error but otherwise
752 	 * leave the filesystem running.  We know any error is
753 	 * local to this block.
754 	 */
755 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
756 	if (rc) {
757 		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
758 		     (unsigned long long)bh->b_blocknr);
759 		return rc;
760 	}
761 
762 	/*
763 	 * Errors after here are fatal.
764 	 */
765 
766 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
767 		ocfs2_error(sb,
768 			    "Extent block #%llu has bad signature %.*s",
769 			    (unsigned long long)bh->b_blocknr, 7,
770 			    eb->h_signature);
771 		return -EINVAL;
772 	}
773 
774 	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
775 		ocfs2_error(sb,
776 			    "Extent block #%llu has an invalid h_blkno "
777 			    "of %llu",
778 			    (unsigned long long)bh->b_blocknr,
779 			    (unsigned long long)le64_to_cpu(eb->h_blkno));
780 		return -EINVAL;
781 	}
782 
783 	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
784 		ocfs2_error(sb,
785 			    "Extent block #%llu has an invalid "
786 			    "h_fs_generation of #%u",
787 			    (unsigned long long)bh->b_blocknr,
788 			    le32_to_cpu(eb->h_fs_generation));
789 		return -EINVAL;
790 	}
791 
792 	return 0;
793 }
794 
795 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
796 			    struct buffer_head **bh)
797 {
798 	int rc;
799 	struct buffer_head *tmp = *bh;
800 
801 	rc = ocfs2_read_block(inode, eb_blkno, &tmp,
802 			      ocfs2_validate_extent_block);
803 
804 	/* If ocfs2_read_block() got us a new bh, pass it up. */
805 	if (!rc && !*bh)
806 		*bh = tmp;
807 
808 	return rc;
809 }
810 
811 
812 /*
813  * How many free extents have we got before we need more meta data?
814  */
815 int ocfs2_num_free_extents(struct ocfs2_super *osb,
816 			   struct inode *inode,
817 			   struct ocfs2_extent_tree *et)
818 {
819 	int retval;
820 	struct ocfs2_extent_list *el = NULL;
821 	struct ocfs2_extent_block *eb;
822 	struct buffer_head *eb_bh = NULL;
823 	u64 last_eb_blk = 0;
824 
825 	mlog_entry_void();
826 
827 	el = et->et_root_el;
828 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
829 
830 	if (last_eb_blk) {
831 		retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
832 		if (retval < 0) {
833 			mlog_errno(retval);
834 			goto bail;
835 		}
836 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
837 		el = &eb->h_list;
838 	}
839 
840 	BUG_ON(el->l_tree_depth != 0);
841 
842 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
843 bail:
844 	brelse(eb_bh);
845 
846 	mlog_exit(retval);
847 	return retval;
848 }
849 
850 /* expects array to already be allocated
851  *
852  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
853  * l_count for you
854  */
855 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
856 				     handle_t *handle,
857 				     struct inode *inode,
858 				     int wanted,
859 				     struct ocfs2_alloc_context *meta_ac,
860 				     struct buffer_head *bhs[])
861 {
862 	int count, status, i;
863 	u16 suballoc_bit_start;
864 	u32 num_got;
865 	u64 first_blkno;
866 	struct ocfs2_extent_block *eb;
867 
868 	mlog_entry_void();
869 
870 	count = 0;
871 	while (count < wanted) {
872 		status = ocfs2_claim_metadata(osb,
873 					      handle,
874 					      meta_ac,
875 					      wanted - count,
876 					      &suballoc_bit_start,
877 					      &num_got,
878 					      &first_blkno);
879 		if (status < 0) {
880 			mlog_errno(status);
881 			goto bail;
882 		}
883 
884 		for(i = count;  i < (num_got + count); i++) {
885 			bhs[i] = sb_getblk(osb->sb, first_blkno);
886 			if (bhs[i] == NULL) {
887 				status = -EIO;
888 				mlog_errno(status);
889 				goto bail;
890 			}
891 			ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
892 
893 			status = ocfs2_journal_access_eb(handle, inode, bhs[i],
894 							 OCFS2_JOURNAL_ACCESS_CREATE);
895 			if (status < 0) {
896 				mlog_errno(status);
897 				goto bail;
898 			}
899 
900 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
901 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
902 			/* Ok, setup the minimal stuff here. */
903 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
904 			eb->h_blkno = cpu_to_le64(first_blkno);
905 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
906 			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
907 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
908 			eb->h_list.l_count =
909 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
910 
911 			suballoc_bit_start++;
912 			first_blkno++;
913 
914 			/* We'll also be dirtied by the caller, so
915 			 * this isn't absolutely necessary. */
916 			status = ocfs2_journal_dirty(handle, bhs[i]);
917 			if (status < 0) {
918 				mlog_errno(status);
919 				goto bail;
920 			}
921 		}
922 
923 		count += num_got;
924 	}
925 
926 	status = 0;
927 bail:
928 	if (status < 0) {
929 		for(i = 0; i < wanted; i++) {
930 			brelse(bhs[i]);
931 			bhs[i] = NULL;
932 		}
933 	}
934 	mlog_exit(status);
935 	return status;
936 }
937 
938 /*
939  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
940  *
941  * Returns the sum of the rightmost extent rec logical offset and
942  * cluster count.
943  *
944  * ocfs2_add_branch() uses this to determine what logical cluster
945  * value should be populated into the leftmost new branch records.
946  *
947  * ocfs2_shift_tree_depth() uses this to determine the # clusters
948  * value for the new topmost tree record.
949  */
950 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
951 {
952 	int i;
953 
954 	i = le16_to_cpu(el->l_next_free_rec) - 1;
955 
956 	return le32_to_cpu(el->l_recs[i].e_cpos) +
957 		ocfs2_rec_clusters(el, &el->l_recs[i]);
958 }
959 
960 /*
961  * Add an entire tree branch to our inode. eb_bh is the extent block
962  * to start at, if we don't want to start the branch at the dinode
963  * structure.
964  *
965  * last_eb_bh is required as we have to update its next_leaf pointer
966  * for the new last extent block.
967  *
968  * the new branch will be 'empty' in the sense that every block will
969  * contain a single record with cluster count == 0.
970  */
971 static int ocfs2_add_branch(struct ocfs2_super *osb,
972 			    handle_t *handle,
973 			    struct inode *inode,
974 			    struct ocfs2_extent_tree *et,
975 			    struct buffer_head *eb_bh,
976 			    struct buffer_head **last_eb_bh,
977 			    struct ocfs2_alloc_context *meta_ac)
978 {
979 	int status, new_blocks, i;
980 	u64 next_blkno, new_last_eb_blk;
981 	struct buffer_head *bh;
982 	struct buffer_head **new_eb_bhs = NULL;
983 	struct ocfs2_extent_block *eb;
984 	struct ocfs2_extent_list  *eb_el;
985 	struct ocfs2_extent_list  *el;
986 	u32 new_cpos;
987 
988 	mlog_entry_void();
989 
990 	BUG_ON(!last_eb_bh || !*last_eb_bh);
991 
992 	if (eb_bh) {
993 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
994 		el = &eb->h_list;
995 	} else
996 		el = et->et_root_el;
997 
998 	/* we never add a branch to a leaf. */
999 	BUG_ON(!el->l_tree_depth);
1000 
1001 	new_blocks = le16_to_cpu(el->l_tree_depth);
1002 
1003 	/* allocate the number of new eb blocks we need */
1004 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1005 			     GFP_KERNEL);
1006 	if (!new_eb_bhs) {
1007 		status = -ENOMEM;
1008 		mlog_errno(status);
1009 		goto bail;
1010 	}
1011 
1012 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
1013 					   meta_ac, new_eb_bhs);
1014 	if (status < 0) {
1015 		mlog_errno(status);
1016 		goto bail;
1017 	}
1018 
1019 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1020 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1021 
1022 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1023 	 * linked with the rest of the tree.
1024 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
1025 	 *
1026 	 * when we leave the loop, new_last_eb_blk will point to the
1027 	 * newest leaf, and next_blkno will point to the topmost extent
1028 	 * block. */
1029 	next_blkno = new_last_eb_blk = 0;
1030 	for(i = 0; i < new_blocks; i++) {
1031 		bh = new_eb_bhs[i];
1032 		eb = (struct ocfs2_extent_block *) bh->b_data;
1033 		/* ocfs2_create_new_meta_bhs() should create it right! */
1034 		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1035 		eb_el = &eb->h_list;
1036 
1037 		status = ocfs2_journal_access_eb(handle, inode, bh,
1038 						 OCFS2_JOURNAL_ACCESS_CREATE);
1039 		if (status < 0) {
1040 			mlog_errno(status);
1041 			goto bail;
1042 		}
1043 
1044 		eb->h_next_leaf_blk = 0;
1045 		eb_el->l_tree_depth = cpu_to_le16(i);
1046 		eb_el->l_next_free_rec = cpu_to_le16(1);
1047 		/*
1048 		 * This actually counts as an empty extent as
1049 		 * c_clusters == 0
1050 		 */
1051 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1052 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1053 		/*
1054 		 * eb_el isn't always an interior node, but even leaf
1055 		 * nodes want a zero'd flags and reserved field so
1056 		 * this gets the whole 32 bits regardless of use.
1057 		 */
1058 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1059 		if (!eb_el->l_tree_depth)
1060 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1061 
1062 		status = ocfs2_journal_dirty(handle, bh);
1063 		if (status < 0) {
1064 			mlog_errno(status);
1065 			goto bail;
1066 		}
1067 
1068 		next_blkno = le64_to_cpu(eb->h_blkno);
1069 	}
1070 
1071 	/* This is a bit hairy. We want to update up to three blocks
1072 	 * here without leaving any of them in an inconsistent state
1073 	 * in case of error. We don't have to worry about
1074 	 * journal_dirty erroring as it won't unless we've aborted the
1075 	 * handle (in which case we would never be here) so reserving
1076 	 * the write with journal_access is all we need to do. */
1077 	status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
1078 					 OCFS2_JOURNAL_ACCESS_WRITE);
1079 	if (status < 0) {
1080 		mlog_errno(status);
1081 		goto bail;
1082 	}
1083 	status = ocfs2_et_root_journal_access(handle, inode, et,
1084 					      OCFS2_JOURNAL_ACCESS_WRITE);
1085 	if (status < 0) {
1086 		mlog_errno(status);
1087 		goto bail;
1088 	}
1089 	if (eb_bh) {
1090 		status = ocfs2_journal_access_eb(handle, inode, eb_bh,
1091 						 OCFS2_JOURNAL_ACCESS_WRITE);
1092 		if (status < 0) {
1093 			mlog_errno(status);
1094 			goto bail;
1095 		}
1096 	}
1097 
1098 	/* Link the new branch into the rest of the tree (el will
1099 	 * either be on the root_bh, or the extent block passed in. */
1100 	i = le16_to_cpu(el->l_next_free_rec);
1101 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1102 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1103 	el->l_recs[i].e_int_clusters = 0;
1104 	le16_add_cpu(&el->l_next_free_rec, 1);
1105 
1106 	/* fe needs a new last extent block pointer, as does the
1107 	 * next_leaf on the previously last-extent-block. */
1108 	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1109 
1110 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1111 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1112 
1113 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
1114 	if (status < 0)
1115 		mlog_errno(status);
1116 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
1117 	if (status < 0)
1118 		mlog_errno(status);
1119 	if (eb_bh) {
1120 		status = ocfs2_journal_dirty(handle, eb_bh);
1121 		if (status < 0)
1122 			mlog_errno(status);
1123 	}
1124 
1125 	/*
1126 	 * Some callers want to track the rightmost leaf so pass it
1127 	 * back here.
1128 	 */
1129 	brelse(*last_eb_bh);
1130 	get_bh(new_eb_bhs[0]);
1131 	*last_eb_bh = new_eb_bhs[0];
1132 
1133 	status = 0;
1134 bail:
1135 	if (new_eb_bhs) {
1136 		for (i = 0; i < new_blocks; i++)
1137 			brelse(new_eb_bhs[i]);
1138 		kfree(new_eb_bhs);
1139 	}
1140 
1141 	mlog_exit(status);
1142 	return status;
1143 }
1144 
/*
 * Adds another level to the allocation tree.
 *
 * One extent block is allocated from meta_ac and the root extent
 * list's tree depth, next free record count and in-use records are
 * copied into it. The root list is then rewritten so that its depth
 * is one greater and its only record points at the new block.
 *
 * Returns back the new extent block in *ret_new_eb_bh so you can add
 * a branch to it after this call. Returns 0 on success, or the
 * negative status of the failing allocation or journal call, in which
 * case *ret_new_eb_bh is not written.
 */
static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
				  handle_t *handle,
				  struct inode *inode,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_alloc_context *meta_ac,
				  struct buffer_head **ret_new_eb_bh)
{
	int status, i;
	u32 new_clusters;
	struct buffer_head *new_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list  *root_el;
	struct ocfs2_extent_list  *eb_el;

	mlog_entry_void();

	/* Exactly one new metadata block is needed for the new level. */
	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
					   &new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
	/* ocfs2_create_new_meta_bhs() should create it right! */
	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));

	eb_el = &eb->h_list;
	root_el = et->et_root_el;

	/* Reserve the new block in the journal before filling it in. */
	status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
					 OCFS2_JOURNAL_ACCESS_CREATE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* copy the root extent list data into the new extent block */
	eb_el->l_tree_depth = root_el->l_tree_depth;
	eb_el->l_next_free_rec = root_el->l_next_free_rec;
	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		eb_el->l_recs[i] = root_el->l_recs[i];

	status = ocfs2_journal_dirty(handle, new_eb_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/* The root is only modified once write access has been granted. */
	status = ocfs2_et_root_journal_access(handle, inode, et,
					      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Computed from the copied list; it becomes the cluster count
	 * of the single record left in the root.
	 */
	new_clusters = ocfs2_sum_rightmost_rec(eb_el);

	/* update root_bh now */
	le16_add_cpu(&root_el->l_tree_depth, 1);
	root_el->l_recs[0].e_cpos = 0;
	/* Both sides are already in on-disk byte order. */
	root_el->l_recs[0].e_blkno = eb->h_blkno;
	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
	/* Wipe the remaining root records; they now live in the new block. */
	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
	root_el->l_next_free_rec = cpu_to_le16(1);

	/* If this is our 1st tree depth shift, then last_eb_blk
	 * becomes the allocated extent block */
	if (root_el->l_tree_depth == cpu_to_le16(1))
		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));

	status = ocfs2_journal_dirty(handle, et->et_root_bh);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	/*
	 * Hand our reference to the caller; clearing the local keeps
	 * the brelse() below from dropping it.
	 */
	*ret_new_eb_bh = new_eb_bh;
	new_eb_bh = NULL;
	status = 0;
bail:
	brelse(new_eb_bh);

	mlog_exit(status);
	return status;
}
1237 
1238 /*
1239  * Should only be called when there is no space left in any of the
1240  * leaf nodes. What we want to do is find the lowest tree depth
1241  * non-leaf extent block with room for new records. There are three
1242  * valid results of this search:
1243  *
1244  * 1) a lowest extent block is found, then we pass it back in
1245  *    *lowest_eb_bh and return '0'
1246  *
1247  * 2) the search fails to find anything, but the root_el has room. We
1248  *    pass NULL back in *lowest_eb_bh, but still return '0'
1249  *
1250  * 3) the search fails to find anything AND the root_el is full, in
1251  *    which case we return > 0
1252  *
1253  * return status < 0 indicates an error.
1254  */
1255 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1256 				    struct inode *inode,
1257 				    struct ocfs2_extent_tree *et,
1258 				    struct buffer_head **target_bh)
1259 {
1260 	int status = 0, i;
1261 	u64 blkno;
1262 	struct ocfs2_extent_block *eb;
1263 	struct ocfs2_extent_list  *el;
1264 	struct buffer_head *bh = NULL;
1265 	struct buffer_head *lowest_bh = NULL;
1266 
1267 	mlog_entry_void();
1268 
1269 	*target_bh = NULL;
1270 
1271 	el = et->et_root_el;
1272 
1273 	while(le16_to_cpu(el->l_tree_depth) > 1) {
1274 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1275 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1276 				    "extent list (next_free_rec == 0)",
1277 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
1278 			status = -EIO;
1279 			goto bail;
1280 		}
1281 		i = le16_to_cpu(el->l_next_free_rec) - 1;
1282 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1283 		if (!blkno) {
1284 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1285 				    "list where extent # %d has no physical "
1286 				    "block start",
1287 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1288 			status = -EIO;
1289 			goto bail;
1290 		}
1291 
1292 		brelse(bh);
1293 		bh = NULL;
1294 
1295 		status = ocfs2_read_extent_block(inode, blkno, &bh);
1296 		if (status < 0) {
1297 			mlog_errno(status);
1298 			goto bail;
1299 		}
1300 
1301 		eb = (struct ocfs2_extent_block *) bh->b_data;
1302 		el = &eb->h_list;
1303 
1304 		if (le16_to_cpu(el->l_next_free_rec) <
1305 		    le16_to_cpu(el->l_count)) {
1306 			brelse(lowest_bh);
1307 			lowest_bh = bh;
1308 			get_bh(lowest_bh);
1309 		}
1310 	}
1311 
1312 	/* If we didn't find one and the fe doesn't have any room,
1313 	 * then return '1' */
1314 	el = et->et_root_el;
1315 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1316 		status = 1;
1317 
1318 	*target_bh = lowest_bh;
1319 bail:
1320 	brelse(bh);
1321 
1322 	mlog_exit(status);
1323 	return status;
1324 }
1325 
1326 /*
1327  * Grow a b-tree so that it has more records.
1328  *
1329  * We might shift the tree depth in which case existing paths should
1330  * be considered invalid.
1331  *
1332  * Tree depth after the grow is returned via *final_depth.
1333  *
1334  * *last_eb_bh will be updated by ocfs2_add_branch().
1335  */
1336 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1337 			   struct ocfs2_extent_tree *et, int *final_depth,
1338 			   struct buffer_head **last_eb_bh,
1339 			   struct ocfs2_alloc_context *meta_ac)
1340 {
1341 	int ret, shift;
1342 	struct ocfs2_extent_list *el = et->et_root_el;
1343 	int depth = le16_to_cpu(el->l_tree_depth);
1344 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1345 	struct buffer_head *bh = NULL;
1346 
1347 	BUG_ON(meta_ac == NULL);
1348 
1349 	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1350 	if (shift < 0) {
1351 		ret = shift;
1352 		mlog_errno(ret);
1353 		goto out;
1354 	}
1355 
1356 	/* We traveled all the way to the bottom of the allocation tree
1357 	 * and didn't find room for any more extents - we need to add
1358 	 * another tree level */
1359 	if (shift) {
1360 		BUG_ON(bh);
1361 		mlog(0, "need to shift tree depth (current = %d)\n", depth);
1362 
1363 		/* ocfs2_shift_tree_depth will return us a buffer with
1364 		 * the new extent block (so we can pass that to
1365 		 * ocfs2_add_branch). */
1366 		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1367 					     meta_ac, &bh);
1368 		if (ret < 0) {
1369 			mlog_errno(ret);
1370 			goto out;
1371 		}
1372 		depth++;
1373 		if (depth == 1) {
1374 			/*
1375 			 * Special case: we have room now if we shifted from
1376 			 * tree_depth 0, so no more work needs to be done.
1377 			 *
1378 			 * We won't be calling add_branch, so pass
1379 			 * back *last_eb_bh as the new leaf. At depth
1380 			 * zero, it should always be null so there's
1381 			 * no reason to brelse.
1382 			 */
1383 			BUG_ON(*last_eb_bh);
1384 			get_bh(bh);
1385 			*last_eb_bh = bh;
1386 			goto out;
1387 		}
1388 	}
1389 
1390 	/* call ocfs2_add_branch to add the final part of the tree with
1391 	 * the new data. */
1392 	mlog(0, "add branch. bh = %p\n", bh);
1393 	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1394 			       meta_ac);
1395 	if (ret < 0) {
1396 		mlog_errno(ret);
1397 		goto out;
1398 	}
1399 
1400 out:
1401 	if (final_depth)
1402 		*final_depth = depth;
1403 	brelse(bh);
1404 	return ret;
1405 }
1406 
1407 /*
1408  * This function will discard the rightmost extent record.
1409  */
1410 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1411 {
1412 	int next_free = le16_to_cpu(el->l_next_free_rec);
1413 	int count = le16_to_cpu(el->l_count);
1414 	unsigned int num_bytes;
1415 
1416 	BUG_ON(!next_free);
1417 	/* This will cause us to go off the end of our extent list. */
1418 	BUG_ON(next_free >= count);
1419 
1420 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1421 
1422 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1423 }
1424 
1425 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1426 			      struct ocfs2_extent_rec *insert_rec)
1427 {
1428 	int i, insert_index, next_free, has_empty, num_bytes;
1429 	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1430 	struct ocfs2_extent_rec *rec;
1431 
1432 	next_free = le16_to_cpu(el->l_next_free_rec);
1433 	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1434 
1435 	BUG_ON(!next_free);
1436 
1437 	/* The tree code before us didn't allow enough room in the leaf. */
1438 	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1439 
1440 	/*
1441 	 * The easiest way to approach this is to just remove the
1442 	 * empty extent and temporarily decrement next_free.
1443 	 */
1444 	if (has_empty) {
1445 		/*
1446 		 * If next_free was 1 (only an empty extent), this
1447 		 * loop won't execute, which is fine. We still want
1448 		 * the decrement above to happen.
1449 		 */
1450 		for(i = 0; i < (next_free - 1); i++)
1451 			el->l_recs[i] = el->l_recs[i+1];
1452 
1453 		next_free--;
1454 	}
1455 
1456 	/*
1457 	 * Figure out what the new record index should be.
1458 	 */
1459 	for(i = 0; i < next_free; i++) {
1460 		rec = &el->l_recs[i];
1461 
1462 		if (insert_cpos < le32_to_cpu(rec->e_cpos))
1463 			break;
1464 	}
1465 	insert_index = i;
1466 
1467 	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1468 	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1469 
1470 	BUG_ON(insert_index < 0);
1471 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1472 	BUG_ON(insert_index > next_free);
1473 
1474 	/*
1475 	 * No need to memmove if we're just adding to the tail.
1476 	 */
1477 	if (insert_index != next_free) {
1478 		BUG_ON(next_free >= le16_to_cpu(el->l_count));
1479 
1480 		num_bytes = next_free - insert_index;
1481 		num_bytes *= sizeof(struct ocfs2_extent_rec);
1482 		memmove(&el->l_recs[insert_index + 1],
1483 			&el->l_recs[insert_index],
1484 			num_bytes);
1485 	}
1486 
1487 	/*
1488 	 * Either we had an empty extent, and need to re-increment or
1489 	 * there was no empty extent on a non full rightmost leaf node,
1490 	 * in which case we still need to increment.
1491 	 */
1492 	next_free++;
1493 	el->l_next_free_rec = cpu_to_le16(next_free);
1494 	/*
1495 	 * Make sure none of the math above just messed up our tree.
1496 	 */
1497 	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1498 
1499 	el->l_recs[insert_index] = *insert_rec;
1500 
1501 }
1502 
1503 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1504 {
1505 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1506 
1507 	BUG_ON(num_recs == 0);
1508 
1509 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1510 		num_recs--;
1511 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1512 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1513 		memset(&el->l_recs[num_recs], 0,
1514 		       sizeof(struct ocfs2_extent_rec));
1515 		el->l_next_free_rec = cpu_to_le16(num_recs);
1516 	}
1517 }
1518 
1519 /*
1520  * Create an empty extent record .
1521  *
1522  * l_next_free_rec may be updated.
1523  *
1524  * If an empty extent already exists do nothing.
1525  */
1526 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1527 {
1528 	int next_free = le16_to_cpu(el->l_next_free_rec);
1529 
1530 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1531 
1532 	if (next_free == 0)
1533 		goto set_and_inc;
1534 
1535 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1536 		return;
1537 
1538 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1539 			"Asked to create an empty extent in a full list:\n"
1540 			"count = %u, tree depth = %u",
1541 			le16_to_cpu(el->l_count),
1542 			le16_to_cpu(el->l_tree_depth));
1543 
1544 	ocfs2_shift_records_right(el);
1545 
1546 set_and_inc:
1547 	le16_add_cpu(&el->l_next_free_rec, 1);
1548 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1549 }
1550 
1551 /*
1552  * For a rotation which involves two leaf nodes, the "root node" is
1553  * the lowest level tree node which contains a path to both leafs. This
1554  * resulting set of information can be used to form a complete "subtree"
1555  *
1556  * This function is passed two full paths from the dinode down to a
1557  * pair of adjacent leaves. It's task is to figure out which path
1558  * index contains the subtree root - this can be the root index itself
1559  * in a worst-case rotation.
1560  *
1561  * The array index of the subtree root is passed back.
1562  */
1563 static int ocfs2_find_subtree_root(struct inode *inode,
1564 				   struct ocfs2_path *left,
1565 				   struct ocfs2_path *right)
1566 {
1567 	int i = 0;
1568 
1569 	/*
1570 	 * Check that the caller passed in two paths from the same tree.
1571 	 */
1572 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1573 
1574 	do {
1575 		i++;
1576 
1577 		/*
1578 		 * The caller didn't pass two adjacent paths.
1579 		 */
1580 		mlog_bug_on_msg(i > left->p_tree_depth,
1581 				"Inode %lu, left depth %u, right depth %u\n"
1582 				"left leaf blk %llu, right leaf blk %llu\n",
1583 				inode->i_ino, left->p_tree_depth,
1584 				right->p_tree_depth,
1585 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1586 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1587 	} while (left->p_node[i].bh->b_blocknr ==
1588 		 right->p_node[i].bh->b_blocknr);
1589 
1590 	return i - 1;
1591 }
1592 
/*
 * Callback type used by __ocfs2_find_path(): invoked with the caller's
 * opaque data pointer and the buffer of each extent block read while
 * walking down the tree.
 */
typedef void (path_insert_t)(void *, struct buffer_head *);
1594 
1595 /*
1596  * Traverse a btree path in search of cpos, starting at root_el.
1597  *
1598  * This code can be called with a cpos larger than the tree, in which
1599  * case it will return the rightmost path.
1600  */
1601 static int __ocfs2_find_path(struct inode *inode,
1602 			     struct ocfs2_extent_list *root_el, u32 cpos,
1603 			     path_insert_t *func, void *data)
1604 {
1605 	int i, ret = 0;
1606 	u32 range;
1607 	u64 blkno;
1608 	struct buffer_head *bh = NULL;
1609 	struct ocfs2_extent_block *eb;
1610 	struct ocfs2_extent_list *el;
1611 	struct ocfs2_extent_rec *rec;
1612 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1613 
1614 	el = root_el;
1615 	while (el->l_tree_depth) {
1616 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1617 			ocfs2_error(inode->i_sb,
1618 				    "Inode %llu has empty extent list at "
1619 				    "depth %u\n",
1620 				    (unsigned long long)oi->ip_blkno,
1621 				    le16_to_cpu(el->l_tree_depth));
1622 			ret = -EROFS;
1623 			goto out;
1624 
1625 		}
1626 
1627 		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1628 			rec = &el->l_recs[i];
1629 
1630 			/*
1631 			 * In the case that cpos is off the allocation
1632 			 * tree, this should just wind up returning the
1633 			 * rightmost record.
1634 			 */
1635 			range = le32_to_cpu(rec->e_cpos) +
1636 				ocfs2_rec_clusters(el, rec);
1637 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1638 			    break;
1639 		}
1640 
1641 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1642 		if (blkno == 0) {
1643 			ocfs2_error(inode->i_sb,
1644 				    "Inode %llu has bad blkno in extent list "
1645 				    "at depth %u (index %d)\n",
1646 				    (unsigned long long)oi->ip_blkno,
1647 				    le16_to_cpu(el->l_tree_depth), i);
1648 			ret = -EROFS;
1649 			goto out;
1650 		}
1651 
1652 		brelse(bh);
1653 		bh = NULL;
1654 		ret = ocfs2_read_extent_block(inode, blkno, &bh);
1655 		if (ret) {
1656 			mlog_errno(ret);
1657 			goto out;
1658 		}
1659 
1660 		eb = (struct ocfs2_extent_block *) bh->b_data;
1661 		el = &eb->h_list;
1662 
1663 		if (le16_to_cpu(el->l_next_free_rec) >
1664 		    le16_to_cpu(el->l_count)) {
1665 			ocfs2_error(inode->i_sb,
1666 				    "Inode %llu has bad count in extent list "
1667 				    "at block %llu (next free=%u, count=%u)\n",
1668 				    (unsigned long long)oi->ip_blkno,
1669 				    (unsigned long long)bh->b_blocknr,
1670 				    le16_to_cpu(el->l_next_free_rec),
1671 				    le16_to_cpu(el->l_count));
1672 			ret = -EROFS;
1673 			goto out;
1674 		}
1675 
1676 		if (func)
1677 			func(data, bh);
1678 	}
1679 
1680 out:
1681 	/*
1682 	 * Catch any trailing bh that the loop didn't handle.
1683 	 */
1684 	brelse(bh);
1685 
1686 	return ret;
1687 }
1688 
1689 /*
1690  * Given an initialized path (that is, it has a valid root extent
1691  * list), this function will traverse the btree in search of the path
1692  * which would contain cpos.
1693  *
1694  * The path traveled is recorded in the path structure.
1695  *
1696  * Note that this will not do any comparisons on leaf node extent
1697  * records, so it will work fine in the case that we just added a tree
1698  * branch.
1699  */
1700 struct find_path_data {
1701 	int index;
1702 	struct ocfs2_path *path;
1703 };
1704 static void find_path_ins(void *data, struct buffer_head *bh)
1705 {
1706 	struct find_path_data *fp = data;
1707 
1708 	get_bh(bh);
1709 	ocfs2_path_insert_eb(fp->path, fp->index, bh);
1710 	fp->index++;
1711 }
1712 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1713 			   u32 cpos)
1714 {
1715 	struct find_path_data data;
1716 
1717 	data.index = 1;
1718 	data.path = path;
1719 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1720 				 find_path_ins, &data);
1721 }
1722 
1723 static void find_leaf_ins(void *data, struct buffer_head *bh)
1724 {
1725 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1726 	struct ocfs2_extent_list *el = &eb->h_list;
1727 	struct buffer_head **ret = data;
1728 
1729 	/* We want to retain only the leaf block. */
1730 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1731 		get_bh(bh);
1732 		*ret = bh;
1733 	}
1734 }
1735 /*
1736  * Find the leaf block in the tree which would contain cpos. No
1737  * checking of the actual leaf is done.
1738  *
1739  * Some paths want to call this instead of allocating a path structure
1740  * and calling ocfs2_find_path().
1741  *
1742  * This function doesn't handle non btree extent lists.
1743  */
1744 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1745 		    u32 cpos, struct buffer_head **leaf_bh)
1746 {
1747 	int ret;
1748 	struct buffer_head *bh = NULL;
1749 
1750 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1751 	if (ret) {
1752 		mlog_errno(ret);
1753 		goto out;
1754 	}
1755 
1756 	*leaf_bh = bh;
1757 out:
1758 	return ret;
1759 }
1760 
1761 /*
1762  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1763  *
1764  * Basically, we've moved stuff around at the bottom of the tree and
1765  * we need to fix up the extent records above the changes to reflect
1766  * the new changes.
1767  *
1768  * left_rec: the record on the left.
1769  * left_child_el: is the child list pointed to by left_rec
1770  * right_rec: the record to the right of left_rec
1771  * right_child_el: is the child list pointed to by right_rec
1772  *
1773  * By definition, this only works on interior nodes.
1774  */
1775 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1776 				  struct ocfs2_extent_list *left_child_el,
1777 				  struct ocfs2_extent_rec *right_rec,
1778 				  struct ocfs2_extent_list *right_child_el)
1779 {
1780 	u32 left_clusters, right_end;
1781 
1782 	/*
1783 	 * Interior nodes never have holes. Their cpos is the cpos of
1784 	 * the leftmost record in their child list. Their cluster
1785 	 * count covers the full theoretical range of their child list
1786 	 * - the range between their cpos and the cpos of the record
1787 	 * immediately to their right.
1788 	 */
1789 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1790 	if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) {
1791 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1792 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1793 	}
1794 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
1795 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1796 
1797 	/*
1798 	 * Calculate the rightmost cluster count boundary before
1799 	 * moving cpos - we will need to adjust clusters after
1800 	 * updating e_cpos to keep the same highest cluster count.
1801 	 */
1802 	right_end = le32_to_cpu(right_rec->e_cpos);
1803 	right_end += le32_to_cpu(right_rec->e_int_clusters);
1804 
1805 	right_rec->e_cpos = left_rec->e_cpos;
1806 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
1807 
1808 	right_end -= le32_to_cpu(right_rec->e_cpos);
1809 	right_rec->e_int_clusters = cpu_to_le32(right_end);
1810 }
1811 
1812 /*
1813  * Adjust the adjacent root node records involved in a
1814  * rotation. left_el_blkno is passed in as a key so that we can easily
1815  * find it's index in the root list.
1816  */
1817 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1818 				      struct ocfs2_extent_list *left_el,
1819 				      struct ocfs2_extent_list *right_el,
1820 				      u64 left_el_blkno)
1821 {
1822 	int i;
1823 
1824 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1825 	       le16_to_cpu(left_el->l_tree_depth));
1826 
1827 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1828 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1829 			break;
1830 	}
1831 
1832 	/*
1833 	 * The path walking code should have never returned a root and
1834 	 * two paths which are not adjacent.
1835 	 */
1836 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1837 
1838 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1839 				      &root_el->l_recs[i + 1], right_el);
1840 }
1841 
/*
 * We've changed a leaf block (in right_path) and need to reflect that
 * change back up the subtree.
 *
 * This happens in multiple places:
 *   - When we've moved an extent record from the left path leaf to the right
 *     path leaf to make room for an empty extent in the left path leaf.
 *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to its left. This
 *     can occur at the end of some types of rotation and appending inserts.
 *   - When we've adjusted the last extent record in the left path leaf and the
 *     1st extent record in the right path leaf during cross extent block merge.
 *
 * left_path / right_path: the two paths being adjusted.
 * subtree_index: path index of the node treated as the subtree root.
 *
 * Every touched buffer is passed to ocfs2_journal_dirty(); failures
 * there are only logged, as this function has no return value.
 */
static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
				       struct ocfs2_path *left_path,
				       struct ocfs2_path *right_path,
				       int subtree_index)
{
	int ret, i, idx;
	struct ocfs2_extent_list *el, *left_el, *right_el;
	struct ocfs2_extent_rec *left_rec, *right_rec;
	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;

	/*
	 * Update the counts and position values within all the
	 * interior nodes to reflect the leaf rotation we just did.
	 *
	 * The root node is handled below the loop.
	 *
	 * We begin the loop with right_el and left_el pointing to the
	 * leaf lists and work our way up.
	 *
	 * NOTE: within this loop, left_el and right_el always refer
	 * to the *child* lists.
	 */
	left_el = path_leaf_el(left_path);
	right_el = path_leaf_el(right_path);
	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
		mlog(0, "Adjust records at index %u\n", i);

		/*
		 * One nice property of knowing that all of these
		 * nodes are below the root is that we only deal with
		 * the leftmost right node record and the rightmost
		 * left node record.
		 */
		el = left_path->p_node[i].el;
		/*
		 * NOTE(review): idx is derived from the *child* list's
		 * l_next_free_rec but is used to index the parent list
		 * el. The two only agree when both lists hold the same
		 * number of records - presumably callers guarantee
		 * that; confirm before relying on it.
		 */
		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
		left_rec = &el->l_recs[idx];

		el = right_path->p_node[i].el;
		right_rec = &el->l_recs[0];

		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
					      right_el);

		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
		if (ret)
			mlog_errno(ret);

		/*
		 * Setup our list pointers now so that the current
		 * parents become children in the next iteration.
		 */
		left_el = left_path->p_node[i].el;
		right_el = right_path->p_node[i].el;
	}

	/*
	 * At the root node, adjust the two adjacent records which
	 * begin our path to the leaves.
	 */

	el = left_path->p_node[subtree_index].el;
	left_el = left_path->p_node[subtree_index + 1].el;
	right_el = right_path->p_node[subtree_index + 1].el;

	/* The left child's block number is the lookup key in the root list. */
	ocfs2_adjust_root_records(el, left_el, right_el,
				  left_path->p_node[subtree_index + 1].bh->b_blocknr);

	/* Same buffer root_bh was initialized with at its declaration. */
	root_bh = left_path->p_node[subtree_index].bh;

	ret = ocfs2_journal_dirty(handle, root_bh);
	if (ret)
		mlog_errno(ret);
}
1932 
/*
 * Rotate one record to the right across a pair of leaves.
 *
 * The last record of the left path's leaf is copied into slot 0 of
 * the right path's leaf, and the left leaf's remaining records are
 * shifted right so that it ends up with an empty extent in slot 0.
 * The interior records between the leaves and the node at
 * subtree_index are then adjusted via ocfs2_complete_edge_insert().
 *
 * Returns 0 on success, including the case where the left leaf
 * already starts with an empty extent and nothing needs doing.
 * Returns -EROFS if the left leaf is not full, or the error from a
 * failing journal call.
 */
static int ocfs2_rotate_subtree_right(struct inode *inode,
				      handle_t *handle,
				      struct ocfs2_path *left_path,
				      struct ocfs2_path *right_path,
				      int subtree_index)
{
	int ret, i;
	struct buffer_head *right_leaf_bh;
	struct buffer_head *left_leaf_bh = NULL;
	struct buffer_head *root_bh;
	struct ocfs2_extent_list *right_el, *left_el;
	struct ocfs2_extent_rec move_rec;

	left_leaf_bh = path_leaf_bh(left_path);
	left_el = path_leaf_el(left_path);

	/* A left leaf with spare room is reported as an error. */
	if (left_el->l_next_free_rec != left_el->l_count) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has non-full interior leaf node %llu"
			    "(next free = %u)",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)left_leaf_bh->b_blocknr,
			    le16_to_cpu(left_el->l_next_free_rec));
		return -EROFS;
	}

	/*
	 * This extent block may already have an empty record, so we
	 * return early if so.
	 */
	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
		return 0;

	/* Both paths must share the same buffer at the subtree root. */
	root_bh = left_path->p_node[subtree_index].bh;
	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

	/*
	 * Get journal access to the subtree root and to every node
	 * below it on both paths before anything is modified.
	 */
	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
					   subtree_index);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
		ret = ocfs2_path_bh_journal_access(handle, inode,
						   right_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_path_bh_journal_access(handle, inode,
						   left_path, i);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	right_leaf_bh = path_leaf_bh(right_path);
	right_el = path_leaf_el(right_path);

	/* This is a code error, not a disk corruption. */
	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
			"because rightmost leaf block %llu is empty\n",
			(unsigned long long)OCFS2_I(inode)->ip_blkno,
			(unsigned long long)right_leaf_bh->b_blocknr);

	/* Open up slot 0 of the right leaf to receive the moved record. */
	ocfs2_create_empty_extent(right_el);

	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Do the copy now. */
	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
	move_rec = left_el->l_recs[i];
	right_el->l_recs[0] = move_rec;

	/*
	 * Clear out the record we just copied and shift everything
	 * over, leaving an empty extent in the left leaf.
	 *
	 * We temporarily subtract from next_free_rec so that the
	 * shift will lose the tail record (which is now defunct).
	 */
	le16_add_cpu(&left_el->l_next_free_rec, -1);
	ocfs2_shift_records_right(left_el);
	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
	le16_add_cpu(&left_el->l_next_free_rec, 1);

	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Propagate the new leaf boundaries up to the subtree root. */
	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
				subtree_index);

out:
	return ret;
}
2038 
2039 /*
2040  * Given a full path, determine what cpos value would return us a path
2041  * containing the leaf immediately to the left of the current one.
2042  *
2043  * Will return zero if the path passed in is already the leftmost path.
2044  */
2045 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2046 					 struct ocfs2_path *path, u32 *cpos)
2047 {
2048 	int i, j, ret = 0;
2049 	u64 blkno;
2050 	struct ocfs2_extent_list *el;
2051 
2052 	BUG_ON(path->p_tree_depth == 0);
2053 
2054 	*cpos = 0;
2055 
2056 	blkno = path_leaf_bh(path)->b_blocknr;
2057 
2058 	/* Start at the tree node just above the leaf and work our way up. */
2059 	i = path->p_tree_depth - 1;
2060 	while (i >= 0) {
2061 		el = path->p_node[i].el;
2062 
2063 		/*
2064 		 * Find the extent record just before the one in our
2065 		 * path.
2066 		 */
2067 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2068 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2069 				if (j == 0) {
2070 					if (i == 0) {
2071 						/*
2072 						 * We've determined that the
2073 						 * path specified is already
2074 						 * the leftmost one - return a
2075 						 * cpos of zero.
2076 						 */
2077 						goto out;
2078 					}
2079 					/*
2080 					 * The leftmost record points to our
2081 					 * leaf - we need to travel up the
2082 					 * tree one level.
2083 					 */
2084 					goto next_node;
2085 				}
2086 
2087 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2088 				*cpos = *cpos + ocfs2_rec_clusters(el,
2089 							   &el->l_recs[j - 1]);
2090 				*cpos = *cpos - 1;
2091 				goto out;
2092 			}
2093 		}
2094 
2095 		/*
2096 		 * If we got here, we never found a valid node where
2097 		 * the tree indicated one should be.
2098 		 */
2099 		ocfs2_error(sb,
2100 			    "Invalid extent tree at extent block %llu\n",
2101 			    (unsigned long long)blkno);
2102 		ret = -EROFS;
2103 		goto out;
2104 
2105 next_node:
2106 		blkno = path->p_node[i].bh->b_blocknr;
2107 		i--;
2108 	}
2109 
2110 out:
2111 	return ret;
2112 }
2113 
2114 /*
2115  * Extend the transaction by enough credits to complete the rotation,
2116  * and still leave at least the original number of credits allocated
2117  * to this transaction.
2118  */
2119 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2120 					   int op_credits,
2121 					   struct ocfs2_path *path)
2122 {
2123 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2124 
2125 	if (handle->h_buffer_credits < credits)
2126 		return ocfs2_extend_trans(handle, credits);
2127 
2128 	return 0;
2129 }
2130 
2131 /*
2132  * Trap the case where we're inserting into the theoretical range past
2133  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2134  * whose cpos is less than ours into the right leaf.
2135  *
2136  * It's only necessary to look at the rightmost record of the left
2137  * leaf because the logic that calls us should ensure that the
2138  * theoretical ranges in the path components above the leaves are
2139  * correct.
2140  */
2141 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2142 						 u32 insert_cpos)
2143 {
2144 	struct ocfs2_extent_list *left_el;
2145 	struct ocfs2_extent_rec *rec;
2146 	int next_free;
2147 
2148 	left_el = path_leaf_el(left_path);
2149 	next_free = le16_to_cpu(left_el->l_next_free_rec);
2150 	rec = &left_el->l_recs[next_free - 1];
2151 
2152 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
2153 		return 1;
2154 	return 0;
2155 }
2156 
2157 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2158 {
2159 	int next_free = le16_to_cpu(el->l_next_free_rec);
2160 	unsigned int range;
2161 	struct ocfs2_extent_rec *rec;
2162 
2163 	if (next_free == 0)
2164 		return 0;
2165 
2166 	rec = &el->l_recs[0];
2167 	if (ocfs2_is_empty_extent(rec)) {
2168 		/* Empty list. */
2169 		if (next_free == 1)
2170 			return 0;
2171 		rec = &el->l_recs[1];
2172 	}
2173 
2174 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2175 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2176 		return 1;
2177 	return 0;
2178 }
2179 
2180 /*
2181  * Rotate all the records in a btree right one record, starting at insert_cpos.
2182  *
2183  * The path to the rightmost leaf should be passed in.
2184  *
2185  * The array is assumed to be large enough to hold an entire path (tree depth).
2186  *
2187  * Upon succesful return from this function:
2188  *
2189  * - The 'right_path' array will contain a path to the leaf block
2190  *   whose range contains e_cpos.
2191  * - That leaf block will have a single empty extent in list index 0.
2192  * - In the case that the rotation requires a post-insert update,
2193  *   *ret_left_path will contain a valid path which can be passed to
2194  *   ocfs2_insert_path().
2195  */
2196 static int ocfs2_rotate_tree_right(struct inode *inode,
2197 				   handle_t *handle,
2198 				   enum ocfs2_split_type split,
2199 				   u32 insert_cpos,
2200 				   struct ocfs2_path *right_path,
2201 				   struct ocfs2_path **ret_left_path)
2202 {
2203 	int ret, start, orig_credits = handle->h_buffer_credits;
2204 	u32 cpos;
2205 	struct ocfs2_path *left_path = NULL;
2206 
2207 	*ret_left_path = NULL;
2208 
2209 	left_path = ocfs2_new_path_from_path(right_path);
2210 	if (!left_path) {
2211 		ret = -ENOMEM;
2212 		mlog_errno(ret);
2213 		goto out;
2214 	}
2215 
2216 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2217 	if (ret) {
2218 		mlog_errno(ret);
2219 		goto out;
2220 	}
2221 
2222 	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2223 
2224 	/*
2225 	 * What we want to do here is:
2226 	 *
2227 	 * 1) Start with the rightmost path.
2228 	 *
2229 	 * 2) Determine a path to the leaf block directly to the left
2230 	 *    of that leaf.
2231 	 *
2232 	 * 3) Determine the 'subtree root' - the lowest level tree node
2233 	 *    which contains a path to both leaves.
2234 	 *
2235 	 * 4) Rotate the subtree.
2236 	 *
2237 	 * 5) Find the next subtree by considering the left path to be
2238 	 *    the new right path.
2239 	 *
2240 	 * The check at the top of this while loop also accepts
2241 	 * insert_cpos == cpos because cpos is only a _theoretical_
2242 	 * value to get us the left path - insert_cpos might very well
2243 	 * be filling that hole.
2244 	 *
2245 	 * Stop at a cpos of '0' because we either started at the
2246 	 * leftmost branch (i.e., a tree with one branch and a
2247 	 * rotation inside of it), or we've gone as far as we can in
2248 	 * rotating subtrees.
2249 	 */
2250 	while (cpos && insert_cpos <= cpos) {
2251 		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2252 		     insert_cpos, cpos);
2253 
2254 		ret = ocfs2_find_path(inode, left_path, cpos);
2255 		if (ret) {
2256 			mlog_errno(ret);
2257 			goto out;
2258 		}
2259 
2260 		mlog_bug_on_msg(path_leaf_bh(left_path) ==
2261 				path_leaf_bh(right_path),
2262 				"Inode %lu: error during insert of %u "
2263 				"(left path cpos %u) results in two identical "
2264 				"paths ending at %llu\n",
2265 				inode->i_ino, insert_cpos, cpos,
2266 				(unsigned long long)
2267 				path_leaf_bh(left_path)->b_blocknr);
2268 
2269 		if (split == SPLIT_NONE &&
2270 		    ocfs2_rotate_requires_path_adjustment(left_path,
2271 							  insert_cpos)) {
2272 
2273 			/*
2274 			 * We've rotated the tree as much as we
2275 			 * should. The rest is up to
2276 			 * ocfs2_insert_path() to complete, after the
2277 			 * record insertion. We indicate this
2278 			 * situation by returning the left path.
2279 			 *
2280 			 * The reason we don't adjust the records here
2281 			 * before the record insert is that an error
2282 			 * later might break the rule where a parent
2283 			 * record e_cpos will reflect the actual
2284 			 * e_cpos of the 1st nonempty record of the
2285 			 * child list.
2286 			 */
2287 			*ret_left_path = left_path;
2288 			goto out_ret_path;
2289 		}
2290 
2291 		start = ocfs2_find_subtree_root(inode, left_path, right_path);
2292 
2293 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2294 		     start,
2295 		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2296 		     right_path->p_tree_depth);
2297 
2298 		ret = ocfs2_extend_rotate_transaction(handle, start,
2299 						      orig_credits, right_path);
2300 		if (ret) {
2301 			mlog_errno(ret);
2302 			goto out;
2303 		}
2304 
2305 		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2306 						 right_path, start);
2307 		if (ret) {
2308 			mlog_errno(ret);
2309 			goto out;
2310 		}
2311 
2312 		if (split != SPLIT_NONE &&
2313 		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2314 						insert_cpos)) {
2315 			/*
2316 			 * A rotate moves the rightmost left leaf
2317 			 * record over to the leftmost right leaf
2318 			 * slot. If we're doing an extent split
2319 			 * instead of a real insert, then we have to
2320 			 * check that the extent to be split wasn't
2321 			 * just moved over. If it was, then we can
2322 			 * exit here, passing left_path back -
2323 			 * ocfs2_split_extent() is smart enough to
2324 			 * search both leaves.
2325 			 */
2326 			*ret_left_path = left_path;
2327 			goto out_ret_path;
2328 		}
2329 
2330 		/*
2331 		 * There is no need to re-read the next right path
2332 		 * as we know that it'll be our current left
2333 		 * path. Optimize by copying values instead.
2334 		 */
2335 		ocfs2_mv_path(right_path, left_path);
2336 
2337 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2338 						    &cpos);
2339 		if (ret) {
2340 			mlog_errno(ret);
2341 			goto out;
2342 		}
2343 	}
2344 
2345 out:
2346 	ocfs2_free_path(left_path);
2347 
2348 out_ret_path:
2349 	return ret;
2350 }
2351 
2352 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2353 				      struct ocfs2_path *path)
2354 {
2355 	int i, idx;
2356 	struct ocfs2_extent_rec *rec;
2357 	struct ocfs2_extent_list *el;
2358 	struct ocfs2_extent_block *eb;
2359 	u32 range;
2360 
2361 	/* Path should always be rightmost. */
2362 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2363 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2364 
2365 	el = &eb->h_list;
2366 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2367 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2368 	rec = &el->l_recs[idx];
2369 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2370 
2371 	for (i = 0; i < path->p_tree_depth; i++) {
2372 		el = path->p_node[i].el;
2373 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2374 		rec = &el->l_recs[idx];
2375 
2376 		rec->e_int_clusters = cpu_to_le32(range);
2377 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2378 
2379 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2380 	}
2381 }
2382 
2383 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2384 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
2385 			      struct ocfs2_path *path, int unlink_start)
2386 {
2387 	int ret, i;
2388 	struct ocfs2_extent_block *eb;
2389 	struct ocfs2_extent_list *el;
2390 	struct buffer_head *bh;
2391 
2392 	for(i = unlink_start; i < path_num_items(path); i++) {
2393 		bh = path->p_node[i].bh;
2394 
2395 		eb = (struct ocfs2_extent_block *)bh->b_data;
2396 		/*
2397 		 * Not all nodes might have had their final count
2398 		 * decremented by the caller - handle this here.
2399 		 */
2400 		el = &eb->h_list;
2401 		if (le16_to_cpu(el->l_next_free_rec) > 1) {
2402 			mlog(ML_ERROR,
2403 			     "Inode %llu, attempted to remove extent block "
2404 			     "%llu with %u records\n",
2405 			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2406 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
2407 			     le16_to_cpu(el->l_next_free_rec));
2408 
2409 			ocfs2_journal_dirty(handle, bh);
2410 			ocfs2_remove_from_cache(inode, bh);
2411 			continue;
2412 		}
2413 
2414 		el->l_next_free_rec = 0;
2415 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2416 
2417 		ocfs2_journal_dirty(handle, bh);
2418 
2419 		ret = ocfs2_cache_extent_block_free(dealloc, eb);
2420 		if (ret)
2421 			mlog_errno(ret);
2422 
2423 		ocfs2_remove_from_cache(inode, bh);
2424 	}
2425 }
2426 
2427 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2428 				 struct ocfs2_path *left_path,
2429 				 struct ocfs2_path *right_path,
2430 				 int subtree_index,
2431 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2432 {
2433 	int i;
2434 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2435 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2436 	struct ocfs2_extent_list *el;
2437 	struct ocfs2_extent_block *eb;
2438 
2439 	el = path_leaf_el(left_path);
2440 
2441 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2442 
2443 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2444 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2445 			break;
2446 
2447 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2448 
2449 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2450 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2451 
2452 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2453 	eb->h_next_leaf_blk = 0;
2454 
2455 	ocfs2_journal_dirty(handle, root_bh);
2456 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2457 
2458 	ocfs2_unlink_path(inode, handle, dealloc, right_path,
2459 			  subtree_index + 1);
2460 }
2461 
2462 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2463 				     struct ocfs2_path *left_path,
2464 				     struct ocfs2_path *right_path,
2465 				     int subtree_index,
2466 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
2467 				     int *deleted,
2468 				     struct ocfs2_extent_tree *et)
2469 {
2470 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
2471 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2472 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2473 	struct ocfs2_extent_block *eb;
2474 
2475 	*deleted = 0;
2476 
2477 	right_leaf_el = path_leaf_el(right_path);
2478 	left_leaf_el = path_leaf_el(left_path);
2479 	root_bh = left_path->p_node[subtree_index].bh;
2480 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2481 
2482 	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2483 		return 0;
2484 
2485 	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2486 	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2487 		/*
2488 		 * It's legal for us to proceed if the right leaf is
2489 		 * the rightmost one and it has an empty extent. There
2490 		 * are two cases to handle - whether the leaf will be
2491 		 * empty after removal or not. If the leaf isn't empty
2492 		 * then just remove the empty extent up front. The
2493 		 * next block will handle empty leaves by flagging
2494 		 * them for unlink.
2495 		 *
2496 		 * Non rightmost leaves will throw -EAGAIN and the
2497 		 * caller can manually move the subtree and retry.
2498 		 */
2499 
2500 		if (eb->h_next_leaf_blk != 0ULL)
2501 			return -EAGAIN;
2502 
2503 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2504 			ret = ocfs2_journal_access_eb(handle, inode,
2505 						      path_leaf_bh(right_path),
2506 						      OCFS2_JOURNAL_ACCESS_WRITE);
2507 			if (ret) {
2508 				mlog_errno(ret);
2509 				goto out;
2510 			}
2511 
2512 			ocfs2_remove_empty_extent(right_leaf_el);
2513 		} else
2514 			right_has_empty = 1;
2515 	}
2516 
2517 	if (eb->h_next_leaf_blk == 0ULL &&
2518 	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2519 		/*
2520 		 * We have to update i_last_eb_blk during the meta
2521 		 * data delete.
2522 		 */
2523 		ret = ocfs2_et_root_journal_access(handle, inode, et,
2524 						   OCFS2_JOURNAL_ACCESS_WRITE);
2525 		if (ret) {
2526 			mlog_errno(ret);
2527 			goto out;
2528 		}
2529 
2530 		del_right_subtree = 1;
2531 	}
2532 
2533 	/*
2534 	 * Getting here with an empty extent in the right path implies
2535 	 * that it's the rightmost path and will be deleted.
2536 	 */
2537 	BUG_ON(right_has_empty && !del_right_subtree);
2538 
2539 	ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
2540 					   subtree_index);
2541 	if (ret) {
2542 		mlog_errno(ret);
2543 		goto out;
2544 	}
2545 
2546 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2547 		ret = ocfs2_path_bh_journal_access(handle, inode,
2548 						   right_path, i);
2549 		if (ret) {
2550 			mlog_errno(ret);
2551 			goto out;
2552 		}
2553 
2554 		ret = ocfs2_path_bh_journal_access(handle, inode,
2555 						   left_path, i);
2556 		if (ret) {
2557 			mlog_errno(ret);
2558 			goto out;
2559 		}
2560 	}
2561 
2562 	if (!right_has_empty) {
2563 		/*
2564 		 * Only do this if we're moving a real
2565 		 * record. Otherwise, the action is delayed until
2566 		 * after removal of the right path in which case we
2567 		 * can do a simple shift to remove the empty extent.
2568 		 */
2569 		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2570 		memset(&right_leaf_el->l_recs[0], 0,
2571 		       sizeof(struct ocfs2_extent_rec));
2572 	}
2573 	if (eb->h_next_leaf_blk == 0ULL) {
2574 		/*
2575 		 * Move recs over to get rid of empty extent, decrease
2576 		 * next_free. This is allowed to remove the last
2577 		 * extent in our leaf (setting l_next_free_rec to
2578 		 * zero) - the delete code below won't care.
2579 		 */
2580 		ocfs2_remove_empty_extent(right_leaf_el);
2581 	}
2582 
2583 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2584 	if (ret)
2585 		mlog_errno(ret);
2586 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2587 	if (ret)
2588 		mlog_errno(ret);
2589 
2590 	if (del_right_subtree) {
2591 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2592 				     subtree_index, dealloc);
2593 		ocfs2_update_edge_lengths(inode, handle, left_path);
2594 
2595 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2596 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2597 
2598 		/*
2599 		 * Removal of the extent in the left leaf was skipped
2600 		 * above so we could delete the right path
2601 		 * 1st.
2602 		 */
2603 		if (right_has_empty)
2604 			ocfs2_remove_empty_extent(left_leaf_el);
2605 
2606 		ret = ocfs2_journal_dirty(handle, et_root_bh);
2607 		if (ret)
2608 			mlog_errno(ret);
2609 
2610 		*deleted = 1;
2611 	} else
2612 		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2613 					   subtree_index);
2614 
2615 out:
2616 	return ret;
2617 }
2618 
2619 /*
2620  * Given a full path, determine what cpos value would return us a path
2621  * containing the leaf immediately to the right of the current one.
2622  *
2623  * Will return zero if the path passed in is already the rightmost path.
2624  *
2625  * This looks similar, but is subtly different to
2626  * ocfs2_find_cpos_for_left_leaf().
2627  */
2628 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2629 					  struct ocfs2_path *path, u32 *cpos)
2630 {
2631 	int i, j, ret = 0;
2632 	u64 blkno;
2633 	struct ocfs2_extent_list *el;
2634 
2635 	*cpos = 0;
2636 
2637 	if (path->p_tree_depth == 0)
2638 		return 0;
2639 
2640 	blkno = path_leaf_bh(path)->b_blocknr;
2641 
2642 	/* Start at the tree node just above the leaf and work our way up. */
2643 	i = path->p_tree_depth - 1;
2644 	while (i >= 0) {
2645 		int next_free;
2646 
2647 		el = path->p_node[i].el;
2648 
2649 		/*
2650 		 * Find the extent record just after the one in our
2651 		 * path.
2652 		 */
2653 		next_free = le16_to_cpu(el->l_next_free_rec);
2654 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2655 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2656 				if (j == (next_free - 1)) {
2657 					if (i == 0) {
2658 						/*
2659 						 * We've determined that the
2660 						 * path specified is already
2661 						 * the rightmost one - return a
2662 						 * cpos of zero.
2663 						 */
2664 						goto out;
2665 					}
2666 					/*
2667 					 * The rightmost record points to our
2668 					 * leaf - we need to travel up the
2669 					 * tree one level.
2670 					 */
2671 					goto next_node;
2672 				}
2673 
2674 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2675 				goto out;
2676 			}
2677 		}
2678 
2679 		/*
2680 		 * If we got here, we never found a valid node where
2681 		 * the tree indicated one should be.
2682 		 */
2683 		ocfs2_error(sb,
2684 			    "Invalid extent tree at extent block %llu\n",
2685 			    (unsigned long long)blkno);
2686 		ret = -EROFS;
2687 		goto out;
2688 
2689 next_node:
2690 		blkno = path->p_node[i].bh->b_blocknr;
2691 		i--;
2692 	}
2693 
2694 out:
2695 	return ret;
2696 }
2697 
2698 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2699 					    handle_t *handle,
2700 					    struct ocfs2_path *path)
2701 {
2702 	int ret;
2703 	struct buffer_head *bh = path_leaf_bh(path);
2704 	struct ocfs2_extent_list *el = path_leaf_el(path);
2705 
2706 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2707 		return 0;
2708 
2709 	ret = ocfs2_path_bh_journal_access(handle, inode, path,
2710 					   path_num_items(path) - 1);
2711 	if (ret) {
2712 		mlog_errno(ret);
2713 		goto out;
2714 	}
2715 
2716 	ocfs2_remove_empty_extent(el);
2717 
2718 	ret = ocfs2_journal_dirty(handle, bh);
2719 	if (ret)
2720 		mlog_errno(ret);
2721 
2722 out:
2723 	return ret;
2724 }
2725 
2726 static int __ocfs2_rotate_tree_left(struct inode *inode,
2727 				    handle_t *handle, int orig_credits,
2728 				    struct ocfs2_path *path,
2729 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
2730 				    struct ocfs2_path **empty_extent_path,
2731 				    struct ocfs2_extent_tree *et)
2732 {
2733 	int ret, subtree_root, deleted;
2734 	u32 right_cpos;
2735 	struct ocfs2_path *left_path = NULL;
2736 	struct ocfs2_path *right_path = NULL;
2737 
2738 	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2739 
2740 	*empty_extent_path = NULL;
2741 
2742 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2743 					     &right_cpos);
2744 	if (ret) {
2745 		mlog_errno(ret);
2746 		goto out;
2747 	}
2748 
2749 	left_path = ocfs2_new_path_from_path(path);
2750 	if (!left_path) {
2751 		ret = -ENOMEM;
2752 		mlog_errno(ret);
2753 		goto out;
2754 	}
2755 
2756 	ocfs2_cp_path(left_path, path);
2757 
2758 	right_path = ocfs2_new_path_from_path(path);
2759 	if (!right_path) {
2760 		ret = -ENOMEM;
2761 		mlog_errno(ret);
2762 		goto out;
2763 	}
2764 
2765 	while (right_cpos) {
2766 		ret = ocfs2_find_path(inode, right_path, right_cpos);
2767 		if (ret) {
2768 			mlog_errno(ret);
2769 			goto out;
2770 		}
2771 
2772 		subtree_root = ocfs2_find_subtree_root(inode, left_path,
2773 						       right_path);
2774 
2775 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2776 		     subtree_root,
2777 		     (unsigned long long)
2778 		     right_path->p_node[subtree_root].bh->b_blocknr,
2779 		     right_path->p_tree_depth);
2780 
2781 		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2782 						      orig_credits, left_path);
2783 		if (ret) {
2784 			mlog_errno(ret);
2785 			goto out;
2786 		}
2787 
2788 		/*
2789 		 * Caller might still want to make changes to the
2790 		 * tree root, so re-add it to the journal here.
2791 		 */
2792 		ret = ocfs2_path_bh_journal_access(handle, inode,
2793 						   left_path, 0);
2794 		if (ret) {
2795 			mlog_errno(ret);
2796 			goto out;
2797 		}
2798 
2799 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2800 						right_path, subtree_root,
2801 						dealloc, &deleted, et);
2802 		if (ret == -EAGAIN) {
2803 			/*
2804 			 * The rotation has to temporarily stop due to
2805 			 * the right subtree having an empty
2806 			 * extent. Pass it back to the caller for a
2807 			 * fixup.
2808 			 */
2809 			*empty_extent_path = right_path;
2810 			right_path = NULL;
2811 			goto out;
2812 		}
2813 		if (ret) {
2814 			mlog_errno(ret);
2815 			goto out;
2816 		}
2817 
2818 		/*
2819 		 * The subtree rotate might have removed records on
2820 		 * the rightmost edge. If so, then rotation is
2821 		 * complete.
2822 		 */
2823 		if (deleted)
2824 			break;
2825 
2826 		ocfs2_mv_path(left_path, right_path);
2827 
2828 		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2829 						     &right_cpos);
2830 		if (ret) {
2831 			mlog_errno(ret);
2832 			goto out;
2833 		}
2834 	}
2835 
2836 out:
2837 	ocfs2_free_path(right_path);
2838 	ocfs2_free_path(left_path);
2839 
2840 	return ret;
2841 }
2842 
2843 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
2844 				struct ocfs2_path *path,
2845 				struct ocfs2_cached_dealloc_ctxt *dealloc,
2846 				struct ocfs2_extent_tree *et)
2847 {
2848 	int ret, subtree_index;
2849 	u32 cpos;
2850 	struct ocfs2_path *left_path = NULL;
2851 	struct ocfs2_extent_block *eb;
2852 	struct ocfs2_extent_list *el;
2853 
2854 
2855 	ret = ocfs2_et_sanity_check(inode, et);
2856 	if (ret)
2857 		goto out;
2858 	/*
2859 	 * There's two ways we handle this depending on
2860 	 * whether path is the only existing one.
2861 	 */
2862 	ret = ocfs2_extend_rotate_transaction(handle, 0,
2863 					      handle->h_buffer_credits,
2864 					      path);
2865 	if (ret) {
2866 		mlog_errno(ret);
2867 		goto out;
2868 	}
2869 
2870 	ret = ocfs2_journal_access_path(inode, handle, path);
2871 	if (ret) {
2872 		mlog_errno(ret);
2873 		goto out;
2874 	}
2875 
2876 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
2877 	if (ret) {
2878 		mlog_errno(ret);
2879 		goto out;
2880 	}
2881 
2882 	if (cpos) {
2883 		/*
2884 		 * We have a path to the left of this one - it needs
2885 		 * an update too.
2886 		 */
2887 		left_path = ocfs2_new_path_from_path(path);
2888 		if (!left_path) {
2889 			ret = -ENOMEM;
2890 			mlog_errno(ret);
2891 			goto out;
2892 		}
2893 
2894 		ret = ocfs2_find_path(inode, left_path, cpos);
2895 		if (ret) {
2896 			mlog_errno(ret);
2897 			goto out;
2898 		}
2899 
2900 		ret = ocfs2_journal_access_path(inode, handle, left_path);
2901 		if (ret) {
2902 			mlog_errno(ret);
2903 			goto out;
2904 		}
2905 
2906 		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
2907 
2908 		ocfs2_unlink_subtree(inode, handle, left_path, path,
2909 				     subtree_index, dealloc);
2910 		ocfs2_update_edge_lengths(inode, handle, left_path);
2911 
2912 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2913 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2914 	} else {
2915 		/*
2916 		 * 'path' is also the leftmost path which
2917 		 * means it must be the only one. This gets
2918 		 * handled differently because we want to
2919 		 * revert the inode back to having extents
2920 		 * in-line.
2921 		 */
2922 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
2923 
2924 		el = et->et_root_el;
2925 		el->l_tree_depth = 0;
2926 		el->l_next_free_rec = 0;
2927 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2928 
2929 		ocfs2_et_set_last_eb_blk(et, 0);
2930 	}
2931 
2932 	ocfs2_journal_dirty(handle, path_root_bh(path));
2933 
2934 out:
2935 	ocfs2_free_path(left_path);
2936 	return ret;
2937 }
2938 
2939 /*
2940  * Left rotation of btree records.
2941  *
2942  * In many ways, this is (unsurprisingly) the opposite of right
2943  * rotation. We start at some non-rightmost path containing an empty
2944  * extent in the leaf block. The code works its way to the rightmost
2945  * path by rotating records to the left in every subtree.
2946  *
2947  * This is used by any code which reduces the number of extent records
2948  * in a leaf. After removal, an empty record should be placed in the
2949  * leftmost list position.
2950  *
2951  * This won't handle a length update of the rightmost path records if
2952  * the rightmost tree leaf record is removed so the caller is
2953  * responsible for detecting and correcting that.
2954  */
2955 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
2956 				  struct ocfs2_path *path,
2957 				  struct ocfs2_cached_dealloc_ctxt *dealloc,
2958 				  struct ocfs2_extent_tree *et)
2959 {
2960 	int ret, orig_credits = handle->h_buffer_credits;
2961 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
2962 	struct ocfs2_extent_block *eb;
2963 	struct ocfs2_extent_list *el;
2964 
2965 	el = path_leaf_el(path);
2966 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2967 		return 0;
2968 
2969 	if (path->p_tree_depth == 0) {
2970 rightmost_no_delete:
2971 		/*
2972 		 * Inline extents. This is trivially handled, so do
2973 		 * it up front.
2974 		 */
2975 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
2976 						       path);
2977 		if (ret)
2978 			mlog_errno(ret);
2979 		goto out;
2980 	}
2981 
2982 	/*
2983 	 * Handle rightmost branch now. There's several cases:
2984 	 *  1) simple rotation leaving records in there. That's trivial.
2985 	 *  2) rotation requiring a branch delete - there's no more
2986 	 *     records left. Two cases of this:
2987 	 *     a) There are branches to the left.
2988 	 *     b) This is also the leftmost (the only) branch.
2989 	 *
2990 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
2991 	 *  2a) we need the left branch so that we can update it with the unlink
2992 	 *  2b) we need to bring the inode back to inline extents.
2993 	 */
2994 
2995 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2996 	el = &eb->h_list;
2997 	if (eb->h_next_leaf_blk == 0) {
2998 		/*
2999 		 * This gets a bit tricky if we're going to delete the
3000 		 * rightmost path. Get the other cases out of the way
3001 		 * 1st.
3002 		 */
3003 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3004 			goto rightmost_no_delete;
3005 
3006 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3007 			ret = -EIO;
3008 			ocfs2_error(inode->i_sb,
3009 				    "Inode %llu has empty extent block at %llu",
3010 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
3011 				    (unsigned long long)le64_to_cpu(eb->h_blkno));
3012 			goto out;
3013 		}
3014 
3015 		/*
3016 		 * XXX: The caller can not trust "path" any more after
3017 		 * this as it will have been deleted. What do we do?
3018 		 *
3019 		 * In theory the rotate-for-merge code will never get
3020 		 * here because it'll always ask for a rotate in a
3021 		 * nonempty list.
3022 		 */
3023 
3024 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
3025 						  dealloc, et);
3026 		if (ret)
3027 			mlog_errno(ret);
3028 		goto out;
3029 	}
3030 
3031 	/*
3032 	 * Now we can loop, remembering the path we get from -EAGAIN
3033 	 * and restarting from there.
3034 	 */
3035 try_rotate:
3036 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
3037 				       dealloc, &restart_path, et);
3038 	if (ret && ret != -EAGAIN) {
3039 		mlog_errno(ret);
3040 		goto out;
3041 	}
3042 
3043 	while (ret == -EAGAIN) {
3044 		tmp_path = restart_path;
3045 		restart_path = NULL;
3046 
3047 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
3048 					       tmp_path, dealloc,
3049 					       &restart_path, et);
3050 		if (ret && ret != -EAGAIN) {
3051 			mlog_errno(ret);
3052 			goto out;
3053 		}
3054 
3055 		ocfs2_free_path(tmp_path);
3056 		tmp_path = NULL;
3057 
3058 		if (ret == 0)
3059 			goto try_rotate;
3060 	}
3061 
3062 out:
3063 	ocfs2_free_path(tmp_path);
3064 	ocfs2_free_path(restart_path);
3065 	return ret;
3066 }
3067 
3068 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3069 				int index)
3070 {
3071 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
3072 	unsigned int size;
3073 
3074 	if (rec->e_leaf_clusters == 0) {
3075 		/*
3076 		 * We consumed all of the merged-from record. An empty
3077 		 * extent cannot exist anywhere but the 1st array
3078 		 * position, so move things over if the merged-from
3079 		 * record doesn't occupy that position.
3080 		 *
3081 		 * This creates a new empty extent so the caller
3082 		 * should be smart enough to have removed any existing
3083 		 * ones.
3084 		 */
3085 		if (index > 0) {
3086 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3087 			size = index * sizeof(struct ocfs2_extent_rec);
3088 			memmove(&el->l_recs[1], &el->l_recs[0], size);
3089 		}
3090 
3091 		/*
3092 		 * Always memset - the caller doesn't check whether it
3093 		 * created an empty extent, so there could be junk in
3094 		 * the other fields.
3095 		 */
3096 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3097 	}
3098 }
3099 
3100 static int ocfs2_get_right_path(struct inode *inode,
3101 				struct ocfs2_path *left_path,
3102 				struct ocfs2_path **ret_right_path)
3103 {
3104 	int ret;
3105 	u32 right_cpos;
3106 	struct ocfs2_path *right_path = NULL;
3107 	struct ocfs2_extent_list *left_el;
3108 
3109 	*ret_right_path = NULL;
3110 
3111 	/* This function shouldn't be called for non-trees. */
3112 	BUG_ON(left_path->p_tree_depth == 0);
3113 
3114 	left_el = path_leaf_el(left_path);
3115 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3116 
3117 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3118 					     &right_cpos);
3119 	if (ret) {
3120 		mlog_errno(ret);
3121 		goto out;
3122 	}
3123 
3124 	/* This function shouldn't be called for the rightmost leaf. */
3125 	BUG_ON(right_cpos == 0);
3126 
3127 	right_path = ocfs2_new_path_from_path(left_path);
3128 	if (!right_path) {
3129 		ret = -ENOMEM;
3130 		mlog_errno(ret);
3131 		goto out;
3132 	}
3133 
3134 	ret = ocfs2_find_path(inode, right_path, right_cpos);
3135 	if (ret) {
3136 		mlog_errno(ret);
3137 		goto out;
3138 	}
3139 
3140 	*ret_right_path = right_path;
3141 out:
3142 	if (ret)
3143 		ocfs2_free_path(right_path);
3144 	return ret;
3145 }
3146 
/*
 * Remove split_rec clusters from the record at index and merge them
 * onto the beginning of the record "next" to it.
 * For index < l_count - 1, the next means the extent rec at index + 1.
 * For index == l_count - 1, the "next" means the 1st extent rec of the
 * next extent block (or its 2nd rec when the 1st is an empty extent).
 *
 * left_path is the path whose leaf holds the record at index. Only
 * the cluster count of split_rec is consulted here.
 *
 * Returns 0 on success, or the non-zero status of the helper that
 * failed. Note that in the cross extent block case a failure to dirty
 * the left leaf is logged but then overwritten by the status of
 * dirtying the right leaf.
 */
static int ocfs2_merge_rec_right(struct inode *inode,
				 struct ocfs2_path *left_path,
				 handle_t *handle,
				 struct ocfs2_extent_rec *split_rec,
				 int index)
{
	int ret, next_free, i;
	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
	struct ocfs2_extent_rec *left_rec;
	struct ocfs2_extent_rec *right_rec;
	struct ocfs2_extent_list *right_el;
	struct ocfs2_path *right_path = NULL;
	int subtree_index = 0;
	struct ocfs2_extent_list *el = path_leaf_el(left_path);
	struct buffer_head *bh = path_leaf_bh(left_path);
	struct buffer_head *root_bh = NULL;

	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
	left_rec = &el->l_recs[index];

	/*
	 * The record is the last one of a completely full leaf, so its
	 * right neighbor has to be looked up in the next leaf.
	 */
	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
		/* we meet with a cross extent block merge. */
		ret = ocfs2_get_right_path(inode, left_path, &right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		right_el = path_leaf_el(right_path);
		next_free = le16_to_cpu(right_el->l_next_free_rec);
		BUG_ON(next_free <= 0);
		right_rec = &right_el->l_recs[0];
		/* Skip over an empty extent at the front of the right leaf. */
		if (ocfs2_is_empty_extent(right_rec)) {
			BUG_ON(next_free <= 1);
			right_rec = &right_el->l_recs[1];
		}

		/* The two records must be logically adjacent. */
		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
		       le16_to_cpu(left_rec->e_leaf_clusters) !=
		       le32_to_cpu(right_rec->e_cpos));

		subtree_index = ocfs2_find_subtree_root(inode,
							left_path, right_path);

		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
						      handle->h_buffer_credits,
						      right_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/* Both paths must share the same block at the subtree root. */
		root_bh = left_path->p_node[subtree_index].bh;
		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);

		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
						   subtree_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * Get journal access to every block below the subtree
		 * root, on both the right and the left path.
		 */
		for (i = subtree_index + 1;
		     i < path_num_items(right_path); i++) {
			ret = ocfs2_path_bh_journal_access(handle, inode,
							   right_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_path_bh_journal_access(handle, inode,
							   left_path, i);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

	} else {
		/* In-leaf merge: there must be a record after index. */
		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
		right_rec = &el->l_recs[index + 1];
	}

	/* The left leaf is modified in both cases. */
	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
					   path_num_items(left_path) - 1);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Shrink the left record by the split length ... */
	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);

	/* ... and grow the right record backwards by the same amount. */
	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
	le64_add_cpu(&right_rec->e_blkno,
		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);

	/* If the left record was fully consumed, turn it into slot 0's empty extent. */
	ocfs2_cleanup_merge(el, index);

	ret = ocfs2_journal_dirty(handle, bh);
	if (ret)
		mlog_errno(ret);

	if (right_path) {
		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
		if (ret)
			mlog_errno(ret);

		ocfs2_complete_edge_insert(inode, handle, left_path,
					   right_path, subtree_index);
	}
out:
	if (right_path)
		ocfs2_free_path(right_path);
	return ret;
}
3272 
3273 static int ocfs2_get_left_path(struct inode *inode,
3274 			       struct ocfs2_path *right_path,
3275 			       struct ocfs2_path **ret_left_path)
3276 {
3277 	int ret;
3278 	u32 left_cpos;
3279 	struct ocfs2_path *left_path = NULL;
3280 
3281 	*ret_left_path = NULL;
3282 
3283 	/* This function shouldn't be called for non-trees. */
3284 	BUG_ON(right_path->p_tree_depth == 0);
3285 
3286 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3287 					    right_path, &left_cpos);
3288 	if (ret) {
3289 		mlog_errno(ret);
3290 		goto out;
3291 	}
3292 
3293 	/* This function shouldn't be called for the leftmost leaf. */
3294 	BUG_ON(left_cpos == 0);
3295 
3296 	left_path = ocfs2_new_path_from_path(right_path);
3297 	if (!left_path) {
3298 		ret = -ENOMEM;
3299 		mlog_errno(ret);
3300 		goto out;
3301 	}
3302 
3303 	ret = ocfs2_find_path(inode, left_path, left_cpos);
3304 	if (ret) {
3305 		mlog_errno(ret);
3306 		goto out;
3307 	}
3308 
3309 	*ret_left_path = left_path;
3310 out:
3311 	if (ret)
3312 		ocfs2_free_path(left_path);
3313 	return ret;
3314 }
3315 
3316 /*
3317  * Remove split_rec clusters from the record at index and merge them
3318  * onto the tail of the record "before" it.
3319  * For index > 0, the "before" means the extent rec at index - 1.
3320  *
3321  * For index == 0, the "before" means the last record of the previous
3322  * extent block. And there is also a situation that we may need to
3323  * remove the rightmost leaf extent block in the right_path and change
3324  * the right path to indicate the new rightmost path.
3325  */
3326 static int ocfs2_merge_rec_left(struct inode *inode,
3327 				struct ocfs2_path *right_path,
3328 				handle_t *handle,
3329 				struct ocfs2_extent_rec *split_rec,
3330 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3331 				struct ocfs2_extent_tree *et,
3332 				int index)
3333 {
3334 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3335 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3336 	struct ocfs2_extent_rec *left_rec;
3337 	struct ocfs2_extent_rec *right_rec;
3338 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3339 	struct buffer_head *bh = path_leaf_bh(right_path);
3340 	struct buffer_head *root_bh = NULL;
3341 	struct ocfs2_path *left_path = NULL;
3342 	struct ocfs2_extent_list *left_el;
3343 
3344 	BUG_ON(index < 0);
3345 
3346 	right_rec = &el->l_recs[index];
3347 	if (index == 0) {
3348 		/* we meet with a cross extent block merge. */
3349 		ret = ocfs2_get_left_path(inode, right_path, &left_path);
3350 		if (ret) {
3351 			mlog_errno(ret);
3352 			goto out;
3353 		}
3354 
3355 		left_el = path_leaf_el(left_path);
3356 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3357 		       le16_to_cpu(left_el->l_count));
3358 
3359 		left_rec = &left_el->l_recs[
3360 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3361 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3362 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3363 		       le32_to_cpu(split_rec->e_cpos));
3364 
3365 		subtree_index = ocfs2_find_subtree_root(inode,
3366 							left_path, right_path);
3367 
3368 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3369 						      handle->h_buffer_credits,
3370 						      left_path);
3371 		if (ret) {
3372 			mlog_errno(ret);
3373 			goto out;
3374 		}
3375 
3376 		root_bh = left_path->p_node[subtree_index].bh;
3377 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3378 
3379 		ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
3380 						   subtree_index);
3381 		if (ret) {
3382 			mlog_errno(ret);
3383 			goto out;
3384 		}
3385 
3386 		for (i = subtree_index + 1;
3387 		     i < path_num_items(right_path); i++) {
3388 			ret = ocfs2_path_bh_journal_access(handle, inode,
3389 							   right_path, i);
3390 			if (ret) {
3391 				mlog_errno(ret);
3392 				goto out;
3393 			}
3394 
3395 			ret = ocfs2_path_bh_journal_access(handle, inode,
3396 							   left_path, i);
3397 			if (ret) {
3398 				mlog_errno(ret);
3399 				goto out;
3400 			}
3401 		}
3402 	} else {
3403 		left_rec = &el->l_recs[index - 1];
3404 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3405 			has_empty_extent = 1;
3406 	}
3407 
3408 	ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
3409 					   path_num_items(left_path) - 1);
3410 	if (ret) {
3411 		mlog_errno(ret);
3412 		goto out;
3413 	}
3414 
3415 	if (has_empty_extent && index == 1) {
3416 		/*
3417 		 * The easy case - we can just plop the record right in.
3418 		 */
3419 		*left_rec = *split_rec;
3420 
3421 		has_empty_extent = 0;
3422 	} else
3423 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3424 
3425 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3426 	le64_add_cpu(&right_rec->e_blkno,
3427 		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3428 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3429 
3430 	ocfs2_cleanup_merge(el, index);
3431 
3432 	ret = ocfs2_journal_dirty(handle, bh);
3433 	if (ret)
3434 		mlog_errno(ret);
3435 
3436 	if (left_path) {
3437 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3438 		if (ret)
3439 			mlog_errno(ret);
3440 
3441 		/*
3442 		 * In the situation that the right_rec is empty and the extent
3443 		 * block is empty also,  ocfs2_complete_edge_insert can't handle
3444 		 * it and we need to delete the right extent block.
3445 		 */
3446 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3447 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3448 
3449 			ret = ocfs2_remove_rightmost_path(inode, handle,
3450 							  right_path,
3451 							  dealloc, et);
3452 			if (ret) {
3453 				mlog_errno(ret);
3454 				goto out;
3455 			}
3456 
3457 			/* Now the rightmost extent block has been deleted.
3458 			 * So we use the new rightmost path.
3459 			 */
3460 			ocfs2_mv_path(right_path, left_path);
3461 			left_path = NULL;
3462 		} else
3463 			ocfs2_complete_edge_insert(inode, handle, left_path,
3464 						   right_path, subtree_index);
3465 	}
3466 out:
3467 	if (left_path)
3468 		ocfs2_free_path(left_path);
3469 	return ret;
3470 }
3471 
/*
 * Merge split_rec, which overlaps the record at split_index in the
 * leaf of path, into its neighboring record(s) as described by ctxt.
 *
 * ctxt->c_contig_type must not be CONTIG_NONE. For CONTIG_LEFTRIGHT the
 * record is merged to the right first and the result is then merged to
 * the left; otherwise a single left or right merge is done. Empty
 * extents created along the way are rotated out of the leaf.
 *
 * Returns 0 on success or the status of the first step that failed.
 * Failures of the trailing "clean up the empty extent" rotations are
 * only logged.
 */
static int ocfs2_try_to_merge_extent(struct inode *inode,
				     handle_t *handle,
				     struct ocfs2_path *path,
				     int split_index,
				     struct ocfs2_extent_rec *split_rec,
				     struct ocfs2_cached_dealloc_ctxt *dealloc,
				     struct ocfs2_merge_ctxt *ctxt,
				     struct ocfs2_extent_tree *et)

{
	int ret = 0;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];

	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);

	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
		/*
		 * The merge code will need to create an empty
		 * extent to take the place of the newly
		 * emptied slot. Remove any pre-existing empty
		 * extents - having more than one in a leaf is
		 * illegal.
		 */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
		/* Removing the empty extent shifted our record one slot left. */
		split_index--;
		rec = &el->l_recs[split_index];
	}

	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
		/*
		 * Left-right contig implies this.
		 */
		BUG_ON(!ctxt->c_split_covers_rec);

		/*
		 * Since the leftright insert always covers the entire
		 * extent, this call will delete the insert record
		 * entirely, resulting in an empty extent record added to
		 * the extent block.
		 *
		 * Since the adding of an empty extent shifts
		 * everything back to the right, there's no need to
		 * update split_index here.
		 *
		 * When the split_index is zero, we need to merge it to the
		 * previous extent block. It is more efficient and easier
		 * if we do merge_right first and merge_left later.
		 */
		ret = ocfs2_merge_rec_right(inode, path,
					    handle, split_rec,
					    split_index);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * We can only get this from logic error above.
		 */
		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));

		/* The merge left us with an empty extent, remove it. */
		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		rec = &el->l_recs[split_index];

		/*
		 * Note that we don't pass split_rec here on purpose -
		 * we've merged it into the rec already.
		 */
		ret = ocfs2_merge_rec_left(inode, path,
					   handle, rec,
					   dealloc, et,
					   split_index);

		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_rotate_tree_left(inode, handle, path,
					     dealloc, et);
		/*
		 * Error from this last rotate is not critical, so
		 * print but don't bubble it up.
		 */
		if (ret)
			mlog_errno(ret);
		ret = 0;
	} else {
		/*
		 * Merge a record to the left or right.
		 *
		 * 'contig_type' is relative to the existing record,
		 * so for example, if we're "right contig", it's to
		 * the record on the left (hence the left merge).
		 */
		if (ctxt->c_contig_type == CONTIG_RIGHT) {
			ret = ocfs2_merge_rec_left(inode,
						   path,
						   handle, split_rec,
						   dealloc, et,
						   split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		} else {
			ret = ocfs2_merge_rec_right(inode,
						    path,
						    handle, split_rec,
						    split_index);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}

		if (ctxt->c_split_covers_rec) {
			/*
			 * The merge may have left an empty extent in
			 * our leaf. Try to rotate it away.
			 */
			ret = ocfs2_rotate_tree_left(inode, handle, path,
						     dealloc, et);
			/* As above, a failure here is logged but not returned. */
			if (ret)
				mlog_errno(ret);
			ret = 0;
		}
	}

out:
	return ret;
}
3617 
3618 static void ocfs2_subtract_from_rec(struct super_block *sb,
3619 				    enum ocfs2_split_type split,
3620 				    struct ocfs2_extent_rec *rec,
3621 				    struct ocfs2_extent_rec *split_rec)
3622 {
3623 	u64 len_blocks;
3624 
3625 	len_blocks = ocfs2_clusters_to_blocks(sb,
3626 				le16_to_cpu(split_rec->e_leaf_clusters));
3627 
3628 	if (split == SPLIT_LEFT) {
3629 		/*
3630 		 * Region is on the left edge of the existing
3631 		 * record.
3632 		 */
3633 		le32_add_cpu(&rec->e_cpos,
3634 			     le16_to_cpu(split_rec->e_leaf_clusters));
3635 		le64_add_cpu(&rec->e_blkno, len_blocks);
3636 		le16_add_cpu(&rec->e_leaf_clusters,
3637 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3638 	} else {
3639 		/*
3640 		 * Region is on the right edge of the existing
3641 		 * record.
3642 		 */
3643 		le16_add_cpu(&rec->e_leaf_clusters,
3644 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3645 	}
3646 }
3647 
3648 /*
3649  * Do the final bits of extent record insertion at the target leaf
3650  * list. If this leaf is part of an allocation tree, it is assumed
3651  * that the tree above has been prepared.
3652  */
3653 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3654 				 struct ocfs2_extent_list *el,
3655 				 struct ocfs2_insert_type *insert,
3656 				 struct inode *inode)
3657 {
3658 	int i = insert->ins_contig_index;
3659 	unsigned int range;
3660 	struct ocfs2_extent_rec *rec;
3661 
3662 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3663 
3664 	if (insert->ins_split != SPLIT_NONE) {
3665 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3666 		BUG_ON(i == -1);
3667 		rec = &el->l_recs[i];
3668 		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3669 					insert_rec);
3670 		goto rotate;
3671 	}
3672 
3673 	/*
3674 	 * Contiguous insert - either left or right.
3675 	 */
3676 	if (insert->ins_contig != CONTIG_NONE) {
3677 		rec = &el->l_recs[i];
3678 		if (insert->ins_contig == CONTIG_LEFT) {
3679 			rec->e_blkno = insert_rec->e_blkno;
3680 			rec->e_cpos = insert_rec->e_cpos;
3681 		}
3682 		le16_add_cpu(&rec->e_leaf_clusters,
3683 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3684 		return;
3685 	}
3686 
3687 	/*
3688 	 * Handle insert into an empty leaf.
3689 	 */
3690 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3691 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3692 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3693 		el->l_recs[0] = *insert_rec;
3694 		el->l_next_free_rec = cpu_to_le16(1);
3695 		return;
3696 	}
3697 
3698 	/*
3699 	 * Appending insert.
3700 	 */
3701 	if (insert->ins_appending == APPEND_TAIL) {
3702 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3703 		rec = &el->l_recs[i];
3704 		range = le32_to_cpu(rec->e_cpos)
3705 			+ le16_to_cpu(rec->e_leaf_clusters);
3706 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3707 
3708 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3709 				le16_to_cpu(el->l_count),
3710 				"inode %lu, depth %u, count %u, next free %u, "
3711 				"rec.cpos %u, rec.clusters %u, "
3712 				"insert.cpos %u, insert.clusters %u\n",
3713 				inode->i_ino,
3714 				le16_to_cpu(el->l_tree_depth),
3715 				le16_to_cpu(el->l_count),
3716 				le16_to_cpu(el->l_next_free_rec),
3717 				le32_to_cpu(el->l_recs[i].e_cpos),
3718 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3719 				le32_to_cpu(insert_rec->e_cpos),
3720 				le16_to_cpu(insert_rec->e_leaf_clusters));
3721 		i++;
3722 		el->l_recs[i] = *insert_rec;
3723 		le16_add_cpu(&el->l_next_free_rec, 1);
3724 		return;
3725 	}
3726 
3727 rotate:
3728 	/*
3729 	 * Ok, we have to rotate.
3730 	 *
3731 	 * At this point, it is safe to assume that inserting into an
3732 	 * empty leaf and appending to a leaf have both been handled
3733 	 * above.
3734 	 *
3735 	 * This leaf needs to have space, either by the empty 1st
3736 	 * extent record, or by virtue of an l_next_rec < l_count.
3737 	 */
3738 	ocfs2_rotate_leaf(el, insert_rec);
3739 }
3740 
3741 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3742 					   handle_t *handle,
3743 					   struct ocfs2_path *path,
3744 					   struct ocfs2_extent_rec *insert_rec)
3745 {
3746 	int ret, i, next_free;
3747 	struct buffer_head *bh;
3748 	struct ocfs2_extent_list *el;
3749 	struct ocfs2_extent_rec *rec;
3750 
3751 	/*
3752 	 * Update everything except the leaf block.
3753 	 */
3754 	for (i = 0; i < path->p_tree_depth; i++) {
3755 		bh = path->p_node[i].bh;
3756 		el = path->p_node[i].el;
3757 
3758 		next_free = le16_to_cpu(el->l_next_free_rec);
3759 		if (next_free == 0) {
3760 			ocfs2_error(inode->i_sb,
3761 				    "Dinode %llu has a bad extent list",
3762 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3763 			ret = -EIO;
3764 			return;
3765 		}
3766 
3767 		rec = &el->l_recs[next_free - 1];
3768 
3769 		rec->e_int_clusters = insert_rec->e_cpos;
3770 		le32_add_cpu(&rec->e_int_clusters,
3771 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3772 		le32_add_cpu(&rec->e_int_clusters,
3773 			     -le32_to_cpu(rec->e_cpos));
3774 
3775 		ret = ocfs2_journal_dirty(handle, bh);
3776 		if (ret)
3777 			mlog_errno(ret);
3778 
3779 	}
3780 }
3781 
3782 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3783 				    struct ocfs2_extent_rec *insert_rec,
3784 				    struct ocfs2_path *right_path,
3785 				    struct ocfs2_path **ret_left_path)
3786 {
3787 	int ret, next_free;
3788 	struct ocfs2_extent_list *el;
3789 	struct ocfs2_path *left_path = NULL;
3790 
3791 	*ret_left_path = NULL;
3792 
3793 	/*
3794 	 * This shouldn't happen for non-trees. The extent rec cluster
3795 	 * count manipulation below only works for interior nodes.
3796 	 */
3797 	BUG_ON(right_path->p_tree_depth == 0);
3798 
3799 	/*
3800 	 * If our appending insert is at the leftmost edge of a leaf,
3801 	 * then we might need to update the rightmost records of the
3802 	 * neighboring path.
3803 	 */
3804 	el = path_leaf_el(right_path);
3805 	next_free = le16_to_cpu(el->l_next_free_rec);
3806 	if (next_free == 0 ||
3807 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3808 		u32 left_cpos;
3809 
3810 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3811 						    &left_cpos);
3812 		if (ret) {
3813 			mlog_errno(ret);
3814 			goto out;
3815 		}
3816 
3817 		mlog(0, "Append may need a left path update. cpos: %u, "
3818 		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3819 		     left_cpos);
3820 
3821 		/*
3822 		 * No need to worry if the append is already in the
3823 		 * leftmost leaf.
3824 		 */
3825 		if (left_cpos) {
3826 			left_path = ocfs2_new_path_from_path(right_path);
3827 			if (!left_path) {
3828 				ret = -ENOMEM;
3829 				mlog_errno(ret);
3830 				goto out;
3831 			}
3832 
3833 			ret = ocfs2_find_path(inode, left_path, left_cpos);
3834 			if (ret) {
3835 				mlog_errno(ret);
3836 				goto out;
3837 			}
3838 
3839 			/*
3840 			 * ocfs2_insert_path() will pass the left_path to the
3841 			 * journal for us.
3842 			 */
3843 		}
3844 	}
3845 
3846 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3847 	if (ret) {
3848 		mlog_errno(ret);
3849 		goto out;
3850 	}
3851 
3852 	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
3853 
3854 	*ret_left_path = left_path;
3855 	ret = 0;
3856 out:
3857 	if (ret != 0)
3858 		ocfs2_free_path(left_path);
3859 
3860 	return ret;
3861 }
3862 
/*
 * Insert split_rec by splitting the existing record that contains its
 * starting cpos.
 *
 * The existing record is searched for in the leaf of right_path first
 * and, failing that, in the leaf of left_path (which may be NULL when
 * no left path is involved). Once located, the existing record is
 * shrunk by split_rec via ocfs2_subtract_from_rec() and split_rec is
 * rotated into whichever leaf was chosen for the insert.
 *
 * "el" tracks the leaf holding the record to shrink, "insert_el" the
 * leaf receiving split_rec; they are not always the same leaf.
 */
static void ocfs2_split_record(struct inode *inode,
			       struct ocfs2_path *left_path,
			       struct ocfs2_path *right_path,
			       struct ocfs2_extent_rec *split_rec,
			       enum ocfs2_split_type split)
{
	int index;
	u32 cpos = le32_to_cpu(split_rec->e_cpos);
	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
	struct ocfs2_extent_rec *rec, *tmprec;

	right_el = path_leaf_el(right_path);;
	if (left_path)
		left_el = path_leaf_el(left_path);

	/* Default: both the shrink and the insert happen in the right leaf. */
	el = right_el;
	insert_el = right_el;
	index = ocfs2_search_extent_list(el, cpos);
	if (index != -1) {
		if (index == 0 && left_path) {
			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));

			/*
			 * This typically means that the record
			 * started in the left path but moved to the
			 * right as a result of rotation. We either
			 * move the existing record to the left, or we
			 * do the later insert there.
			 *
			 * In this case, the left path should always
			 * exist as the rotate code will have passed
			 * it back for a post-insert update.
			 */

			if (split == SPLIT_LEFT) {
				/*
				 * It's a left split. Since we know
				 * that the rotate code gave us an
				 * empty extent in the left path, we
				 * can just do the insert there.
				 */
				insert_el = left_el;
			} else {
				/*
				 * Right split - we have to move the
				 * existing record over to the left
				 * leaf. The insert will be into the
				 * newly created empty extent in the
				 * right leaf.
				 */
				tmprec = &right_el->l_recs[index];
				ocfs2_rotate_leaf(left_el, tmprec);
				el = left_el;

				/* Leave an empty extent behind in the right leaf. */
				memset(tmprec, 0, sizeof(*tmprec));
				/* Re-locate the moved record in its new leaf. */
				index = ocfs2_search_extent_list(left_el, cpos);
				BUG_ON(index == -1);
			}
		}
	} else {
		/* Not in the right leaf, so it has to be in the left one. */
		BUG_ON(!left_path);
		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
		/*
		 * Left path is easy - we can just allow the insert to
		 * happen.
		 */
		el = left_el;
		insert_el = left_el;
		index = ocfs2_search_extent_list(el, cpos);
		BUG_ON(index == -1);
	}

	rec = &el->l_recs[index];
	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
	ocfs2_rotate_leaf(insert_el, split_rec);
}
3939 
3940 /*
3941  * This function only does inserts on an allocation b-tree. For tree
3942  * depth = 0, ocfs2_insert_at_leaf() is called directly.
3943  *
3944  * right_path is the path we want to do the actual insert
3945  * in. left_path should only be passed in if we need to update that
3946  * portion of the tree after an edge insert.
3947  */
3948 static int ocfs2_insert_path(struct inode *inode,
3949 			     handle_t *handle,
3950 			     struct ocfs2_path *left_path,
3951 			     struct ocfs2_path *right_path,
3952 			     struct ocfs2_extent_rec *insert_rec,
3953 			     struct ocfs2_insert_type *insert)
3954 {
3955 	int ret, subtree_index;
3956 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
3957 
3958 	if (left_path) {
3959 		int credits = handle->h_buffer_credits;
3960 
3961 		/*
3962 		 * There's a chance that left_path got passed back to
3963 		 * us without being accounted for in the
3964 		 * journal. Extend our transaction here to be sure we
3965 		 * can change those blocks.
3966 		 */
3967 		credits += left_path->p_tree_depth;
3968 
3969 		ret = ocfs2_extend_trans(handle, credits);
3970 		if (ret < 0) {
3971 			mlog_errno(ret);
3972 			goto out;
3973 		}
3974 
3975 		ret = ocfs2_journal_access_path(inode, handle, left_path);
3976 		if (ret < 0) {
3977 			mlog_errno(ret);
3978 			goto out;
3979 		}
3980 	}
3981 
3982 	/*
3983 	 * Pass both paths to the journal. The majority of inserts
3984 	 * will be touching all components anyway.
3985 	 */
3986 	ret = ocfs2_journal_access_path(inode, handle, right_path);
3987 	if (ret < 0) {
3988 		mlog_errno(ret);
3989 		goto out;
3990 	}
3991 
3992 	if (insert->ins_split != SPLIT_NONE) {
3993 		/*
3994 		 * We could call ocfs2_insert_at_leaf() for some types
3995 		 * of splits, but it's easier to just let one separate
3996 		 * function sort it all out.
3997 		 */
3998 		ocfs2_split_record(inode, left_path, right_path,
3999 				   insert_rec, insert->ins_split);
4000 
4001 		/*
4002 		 * Split might have modified either leaf and we don't
4003 		 * have a guarantee that the later edge insert will
4004 		 * dirty this for us.
4005 		 */
4006 		if (left_path)
4007 			ret = ocfs2_journal_dirty(handle,
4008 						  path_leaf_bh(left_path));
4009 			if (ret)
4010 				mlog_errno(ret);
4011 	} else
4012 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
4013 				     insert, inode);
4014 
4015 	ret = ocfs2_journal_dirty(handle, leaf_bh);
4016 	if (ret)
4017 		mlog_errno(ret);
4018 
4019 	if (left_path) {
4020 		/*
4021 		 * The rotate code has indicated that we need to fix
4022 		 * up portions of the tree after the insert.
4023 		 *
4024 		 * XXX: Should we extend the transaction here?
4025 		 */
4026 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
4027 							right_path);
4028 		ocfs2_complete_edge_insert(inode, handle, left_path,
4029 					   right_path, subtree_index);
4030 	}
4031 
4032 	ret = 0;
4033 out:
4034 	return ret;
4035 }
4036 
/*
 * Insert insert_rec into the extent tree et, following the plan in
 * "type".
 *
 * A tree of depth 0 is handled by inserting straight into the root
 * extent list. Otherwise the path to the target leaf is looked up,
 * the tree is rotated right or prepared for a tail append if the
 * insert type calls for it, and ocfs2_insert_path() does the insert.
 *
 * For anything but a split, the tree's cluster count is then bumped by
 * the length of insert_rec, and the root buffer is marked dirty.
 *
 * Returns 0 on success or the status of the step that failed. On the
 * success path the returned value is the status of dirtying the root
 * buffer.
 */
static int ocfs2_do_insert_extent(struct inode *inode,
				  handle_t *handle,
				  struct ocfs2_extent_tree *et,
				  struct ocfs2_extent_rec *insert_rec,
				  struct ocfs2_insert_type *type)
{
	int ret, rotate = 0;
	u32 cpos;
	struct ocfs2_path *right_path = NULL;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el;

	el = et->et_root_el;

	ret = ocfs2_et_root_journal_access(handle, inode, et,
					   OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* No tree yet: the root extent list is itself the leaf. */
	if (le16_to_cpu(el->l_tree_depth) == 0) {
		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
		goto out_update_clusters;
	}

	right_path = ocfs2_new_path_from_et(et);
	if (!right_path) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Determine the path to start with. Rotations need the
	 * rightmost path, everything else can go directly to the
	 * target leaf.
	 */
	cpos = le32_to_cpu(insert_rec->e_cpos);
	if (type->ins_appending == APPEND_NONE &&
	    type->ins_contig == CONTIG_NONE) {
		rotate = 1;
		/* Searching for the largest cpos selects the rightmost path. */
		cpos = UINT_MAX;
	}

	ret = ocfs2_find_path(inode, right_path, cpos);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * Rotations and appends need special treatment - they modify
	 * parts of the tree above them.
	 *
	 * Both might pass back a path immediately to the left of the
	 * one being inserted to. This will cause
	 * ocfs2_insert_path() to modify the rightmost records of
	 * left_path to account for an edge insert.
	 *
	 * XXX: When modifying this code, keep in mind that an insert
	 * can wind up skipping both of these two special cases...
	 */
	if (rotate) {
		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
					      le32_to_cpu(insert_rec->e_cpos),
					      right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * ocfs2_rotate_tree_right() might have extended the
		 * transaction without re-journaling our tree root.
		 */
		ret = ocfs2_et_root_journal_access(handle, inode, et,
						   OCFS2_JOURNAL_ACCESS_WRITE);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	} else if (type->ins_appending == APPEND_TAIL
		   && type->ins_contig != CONTIG_LEFT) {
		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
					       right_path, &left_path);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
				insert_rec, type);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out_update_clusters:
	/* A split only re-arranges existing clusters; nothing new to count. */
	if (type->ins_split == SPLIT_NONE)
		ocfs2_et_update_clusters(inode, et,
					 le16_to_cpu(insert_rec->e_leaf_clusters));

	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
	if (ret)
		mlog_errno(ret);

out:
	/* Both paths may still be NULL here. */
	ocfs2_free_path(left_path);
	ocfs2_free_path(right_path);

	return ret;
}
4151 
4152 static enum ocfs2_contig_type
4153 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4154 			       struct ocfs2_extent_list *el, int index,
4155 			       struct ocfs2_extent_rec *split_rec)
4156 {
4157 	int status;
4158 	enum ocfs2_contig_type ret = CONTIG_NONE;
4159 	u32 left_cpos, right_cpos;
4160 	struct ocfs2_extent_rec *rec = NULL;
4161 	struct ocfs2_extent_list *new_el;
4162 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
4163 	struct buffer_head *bh;
4164 	struct ocfs2_extent_block *eb;
4165 
4166 	if (index > 0) {
4167 		rec = &el->l_recs[index - 1];
4168 	} else if (path->p_tree_depth > 0) {
4169 		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4170 						       path, &left_cpos);
4171 		if (status)
4172 			goto out;
4173 
4174 		if (left_cpos != 0) {
4175 			left_path = ocfs2_new_path_from_path(path);
4176 			if (!left_path)
4177 				goto out;
4178 
4179 			status = ocfs2_find_path(inode, left_path, left_cpos);
4180 			if (status)
4181 				goto out;
4182 
4183 			new_el = path_leaf_el(left_path);
4184 
4185 			if (le16_to_cpu(new_el->l_next_free_rec) !=
4186 			    le16_to_cpu(new_el->l_count)) {
4187 				bh = path_leaf_bh(left_path);
4188 				eb = (struct ocfs2_extent_block *)bh->b_data;
4189 				ocfs2_error(inode->i_sb,
4190 					    "Extent block #%llu has an "
4191 					    "invalid l_next_free_rec of "
4192 					    "%d.  It should have "
4193 					    "matched the l_count of %d",
4194 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4195 					    le16_to_cpu(new_el->l_next_free_rec),
4196 					    le16_to_cpu(new_el->l_count));
4197 				status = -EINVAL;
4198 				goto out;
4199 			}
4200 			rec = &new_el->l_recs[
4201 				le16_to_cpu(new_el->l_next_free_rec) - 1];
4202 		}
4203 	}
4204 
4205 	/*
4206 	 * We're careful to check for an empty extent record here -
4207 	 * the merge code will know what to do if it sees one.
4208 	 */
4209 	if (rec) {
4210 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
4211 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4212 				ret = CONTIG_RIGHT;
4213 		} else {
4214 			ret = ocfs2_extent_contig(inode, rec, split_rec);
4215 		}
4216 	}
4217 
4218 	rec = NULL;
4219 	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4220 		rec = &el->l_recs[index + 1];
4221 	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4222 		 path->p_tree_depth > 0) {
4223 		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4224 							path, &right_cpos);
4225 		if (status)
4226 			goto out;
4227 
4228 		if (right_cpos == 0)
4229 			goto out;
4230 
4231 		right_path = ocfs2_new_path_from_path(path);
4232 		if (!right_path)
4233 			goto out;
4234 
4235 		status = ocfs2_find_path(inode, right_path, right_cpos);
4236 		if (status)
4237 			goto out;
4238 
4239 		new_el = path_leaf_el(right_path);
4240 		rec = &new_el->l_recs[0];
4241 		if (ocfs2_is_empty_extent(rec)) {
4242 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4243 				bh = path_leaf_bh(right_path);
4244 				eb = (struct ocfs2_extent_block *)bh->b_data;
4245 				ocfs2_error(inode->i_sb,
4246 					    "Extent block #%llu has an "
4247 					    "invalid l_next_free_rec of %d",
4248 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4249 					    le16_to_cpu(new_el->l_next_free_rec));
4250 				status = -EINVAL;
4251 				goto out;
4252 			}
4253 			rec = &new_el->l_recs[1];
4254 		}
4255 	}
4256 
4257 	if (rec) {
4258 		enum ocfs2_contig_type contig_type;
4259 
4260 		contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4261 
4262 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4263 			ret = CONTIG_LEFTRIGHT;
4264 		else if (ret == CONTIG_NONE)
4265 			ret = contig_type;
4266 	}
4267 
4268 out:
4269 	if (left_path)
4270 		ocfs2_free_path(left_path);
4271 	if (right_path)
4272 		ocfs2_free_path(right_path);
4273 
4274 	return ret;
4275 }
4276 
4277 static void ocfs2_figure_contig_type(struct inode *inode,
4278 				     struct ocfs2_insert_type *insert,
4279 				     struct ocfs2_extent_list *el,
4280 				     struct ocfs2_extent_rec *insert_rec,
4281 				     struct ocfs2_extent_tree *et)
4282 {
4283 	int i;
4284 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4285 
4286 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4287 
4288 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4289 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4290 						  insert_rec);
4291 		if (contig_type != CONTIG_NONE) {
4292 			insert->ins_contig_index = i;
4293 			break;
4294 		}
4295 	}
4296 	insert->ins_contig = contig_type;
4297 
4298 	if (insert->ins_contig != CONTIG_NONE) {
4299 		struct ocfs2_extent_rec *rec =
4300 				&el->l_recs[insert->ins_contig_index];
4301 		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4302 				   le16_to_cpu(insert_rec->e_leaf_clusters);
4303 
4304 		/*
4305 		 * Caller might want us to limit the size of extents, don't
4306 		 * calculate contiguousness if we might exceed that limit.
4307 		 */
4308 		if (et->et_max_leaf_clusters &&
4309 		    (len > et->et_max_leaf_clusters))
4310 			insert->ins_contig = CONTIG_NONE;
4311 	}
4312 }
4313 
4314 /*
4315  * This should only be called against the righmost leaf extent list.
4316  *
4317  * ocfs2_figure_appending_type() will figure out whether we'll have to
4318  * insert at the tail of the rightmost leaf.
4319  *
4320  * This should also work against the root extent list for tree's with 0
4321  * depth. If we consider the root extent list to be the rightmost leaf node
4322  * then the logic here makes sense.
4323  */
4324 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4325 					struct ocfs2_extent_list *el,
4326 					struct ocfs2_extent_rec *insert_rec)
4327 {
4328 	int i;
4329 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4330 	struct ocfs2_extent_rec *rec;
4331 
4332 	insert->ins_appending = APPEND_NONE;
4333 
4334 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4335 
4336 	if (!el->l_next_free_rec)
4337 		goto set_tail_append;
4338 
4339 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4340 		/* Were all records empty? */
4341 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4342 			goto set_tail_append;
4343 	}
4344 
4345 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4346 	rec = &el->l_recs[i];
4347 
4348 	if (cpos >=
4349 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4350 		goto set_tail_append;
4351 
4352 	return;
4353 
4354 set_tail_append:
4355 	insert->ins_appending = APPEND_TAIL;
4356 }
4357 
4358 /*
4359  * Helper function called at the begining of an insert.
4360  *
4361  * This computes a few things that are commonly used in the process of
4362  * inserting into the btree:
4363  *   - Whether the new extent is contiguous with an existing one.
4364  *   - The current tree depth.
4365  *   - Whether the insert is an appending one.
4366  *   - The total # of free records in the tree.
4367  *
4368  * All of the information is stored on the ocfs2_insert_type
4369  * structure.
4370  */
4371 static int ocfs2_figure_insert_type(struct inode *inode,
4372 				    struct ocfs2_extent_tree *et,
4373 				    struct buffer_head **last_eb_bh,
4374 				    struct ocfs2_extent_rec *insert_rec,
4375 				    int *free_records,
4376 				    struct ocfs2_insert_type *insert)
4377 {
4378 	int ret;
4379 	struct ocfs2_extent_block *eb;
4380 	struct ocfs2_extent_list *el;
4381 	struct ocfs2_path *path = NULL;
4382 	struct buffer_head *bh = NULL;
4383 
4384 	insert->ins_split = SPLIT_NONE;
4385 
4386 	el = et->et_root_el;
4387 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4388 
4389 	if (el->l_tree_depth) {
4390 		/*
4391 		 * If we have tree depth, we read in the
4392 		 * rightmost extent block ahead of time as
4393 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4394 		 * may want it later.
4395 		 */
4396 		ret = ocfs2_read_extent_block(inode,
4397 					      ocfs2_et_get_last_eb_blk(et),
4398 					      &bh);
4399 		if (ret) {
4400 			mlog_exit(ret);
4401 			goto out;
4402 		}
4403 		eb = (struct ocfs2_extent_block *) bh->b_data;
4404 		el = &eb->h_list;
4405 	}
4406 
4407 	/*
4408 	 * Unless we have a contiguous insert, we'll need to know if
4409 	 * there is room left in our allocation tree for another
4410 	 * extent record.
4411 	 *
4412 	 * XXX: This test is simplistic, we can search for empty
4413 	 * extent records too.
4414 	 */
4415 	*free_records = le16_to_cpu(el->l_count) -
4416 		le16_to_cpu(el->l_next_free_rec);
4417 
4418 	if (!insert->ins_tree_depth) {
4419 		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4420 		ocfs2_figure_appending_type(insert, el, insert_rec);
4421 		return 0;
4422 	}
4423 
4424 	path = ocfs2_new_path_from_et(et);
4425 	if (!path) {
4426 		ret = -ENOMEM;
4427 		mlog_errno(ret);
4428 		goto out;
4429 	}
4430 
4431 	/*
4432 	 * In the case that we're inserting past what the tree
4433 	 * currently accounts for, ocfs2_find_path() will return for
4434 	 * us the rightmost tree path. This is accounted for below in
4435 	 * the appending code.
4436 	 */
4437 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4438 	if (ret) {
4439 		mlog_errno(ret);
4440 		goto out;
4441 	}
4442 
4443 	el = path_leaf_el(path);
4444 
4445 	/*
4446 	 * Now that we have the path, there's two things we want to determine:
4447 	 * 1) Contiguousness (also set contig_index if this is so)
4448 	 *
4449 	 * 2) Are we doing an append? We can trivially break this up
4450          *     into two types of appends: simple record append, or a
4451          *     rotate inside the tail leaf.
4452 	 */
4453 	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4454 
4455 	/*
4456 	 * The insert code isn't quite ready to deal with all cases of
4457 	 * left contiguousness. Specifically, if it's an insert into
4458 	 * the 1st record in a leaf, it will require the adjustment of
4459 	 * cluster count on the last record of the path directly to it's
4460 	 * left. For now, just catch that case and fool the layers
4461 	 * above us. This works just fine for tree_depth == 0, which
4462 	 * is why we allow that above.
4463 	 */
4464 	if (insert->ins_contig == CONTIG_LEFT &&
4465 	    insert->ins_contig_index == 0)
4466 		insert->ins_contig = CONTIG_NONE;
4467 
4468 	/*
4469 	 * Ok, so we can simply compare against last_eb to figure out
4470 	 * whether the path doesn't exist. This will only happen in
4471 	 * the case that we're doing a tail append, so maybe we can
4472 	 * take advantage of that information somehow.
4473 	 */
4474 	if (ocfs2_et_get_last_eb_blk(et) ==
4475 	    path_leaf_bh(path)->b_blocknr) {
4476 		/*
4477 		 * Ok, ocfs2_find_path() returned us the rightmost
4478 		 * tree path. This might be an appending insert. There are
4479 		 * two cases:
4480 		 *    1) We're doing a true append at the tail:
4481 		 *	-This might even be off the end of the leaf
4482 		 *    2) We're "appending" by rotating in the tail
4483 		 */
4484 		ocfs2_figure_appending_type(insert, el, insert_rec);
4485 	}
4486 
4487 out:
4488 	ocfs2_free_path(path);
4489 
4490 	if (ret == 0)
4491 		*last_eb_bh = bh;
4492 	else
4493 		brelse(bh);
4494 	return ret;
4495 }
4496 
4497 /*
4498  * Insert an extent into an inode btree.
4499  *
4500  * The caller needs to update fe->i_clusters
4501  */
4502 int ocfs2_insert_extent(struct ocfs2_super *osb,
4503 			handle_t *handle,
4504 			struct inode *inode,
4505 			struct ocfs2_extent_tree *et,
4506 			u32 cpos,
4507 			u64 start_blk,
4508 			u32 new_clusters,
4509 			u8 flags,
4510 			struct ocfs2_alloc_context *meta_ac)
4511 {
4512 	int status;
4513 	int uninitialized_var(free_records);
4514 	struct buffer_head *last_eb_bh = NULL;
4515 	struct ocfs2_insert_type insert = {0, };
4516 	struct ocfs2_extent_rec rec;
4517 
4518 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4519 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4520 
4521 	memset(&rec, 0, sizeof(rec));
4522 	rec.e_cpos = cpu_to_le32(cpos);
4523 	rec.e_blkno = cpu_to_le64(start_blk);
4524 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4525 	rec.e_flags = flags;
4526 	status = ocfs2_et_insert_check(inode, et, &rec);
4527 	if (status) {
4528 		mlog_errno(status);
4529 		goto bail;
4530 	}
4531 
4532 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4533 					  &free_records, &insert);
4534 	if (status < 0) {
4535 		mlog_errno(status);
4536 		goto bail;
4537 	}
4538 
4539 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4540 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4541 	     "Insert.tree_depth: %d\n",
4542 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4543 	     free_records, insert.ins_tree_depth);
4544 
4545 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4546 		status = ocfs2_grow_tree(inode, handle, et,
4547 					 &insert.ins_tree_depth, &last_eb_bh,
4548 					 meta_ac);
4549 		if (status) {
4550 			mlog_errno(status);
4551 			goto bail;
4552 		}
4553 	}
4554 
4555 	/* Finally, we can add clusters. This might rotate the tree for us. */
4556 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4557 	if (status < 0)
4558 		mlog_errno(status);
4559 	else if (et->et_ops == &ocfs2_dinode_et_ops)
4560 		ocfs2_extent_map_insert_rec(inode, &rec);
4561 
4562 bail:
4563 	brelse(last_eb_bh);
4564 
4565 	mlog_exit(status);
4566 	return status;
4567 }
4568 
4569 /*
4570  * Allcate and add clusters into the extent b-tree.
4571  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4572  * The extent b-tree's root is specified by et, and
4573  * it is not limited to the file storage. Any extent tree can use this
4574  * function if it implements the proper ocfs2_extent_tree.
4575  */
4576 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4577 				struct inode *inode,
4578 				u32 *logical_offset,
4579 				u32 clusters_to_add,
4580 				int mark_unwritten,
4581 				struct ocfs2_extent_tree *et,
4582 				handle_t *handle,
4583 				struct ocfs2_alloc_context *data_ac,
4584 				struct ocfs2_alloc_context *meta_ac,
4585 				enum ocfs2_alloc_restarted *reason_ret)
4586 {
4587 	int status = 0;
4588 	int free_extents;
4589 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
4590 	u32 bit_off, num_bits;
4591 	u64 block;
4592 	u8 flags = 0;
4593 
4594 	BUG_ON(!clusters_to_add);
4595 
4596 	if (mark_unwritten)
4597 		flags = OCFS2_EXT_UNWRITTEN;
4598 
4599 	free_extents = ocfs2_num_free_extents(osb, inode, et);
4600 	if (free_extents < 0) {
4601 		status = free_extents;
4602 		mlog_errno(status);
4603 		goto leave;
4604 	}
4605 
4606 	/* there are two cases which could cause us to EAGAIN in the
4607 	 * we-need-more-metadata case:
4608 	 * 1) we haven't reserved *any*
4609 	 * 2) we are so fragmented, we've needed to add metadata too
4610 	 *    many times. */
4611 	if (!free_extents && !meta_ac) {
4612 		mlog(0, "we haven't reserved any metadata!\n");
4613 		status = -EAGAIN;
4614 		reason = RESTART_META;
4615 		goto leave;
4616 	} else if ((!free_extents)
4617 		   && (ocfs2_alloc_context_bits_left(meta_ac)
4618 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
4619 		mlog(0, "filesystem is really fragmented...\n");
4620 		status = -EAGAIN;
4621 		reason = RESTART_META;
4622 		goto leave;
4623 	}
4624 
4625 	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4626 					clusters_to_add, &bit_off, &num_bits);
4627 	if (status < 0) {
4628 		if (status != -ENOSPC)
4629 			mlog_errno(status);
4630 		goto leave;
4631 	}
4632 
4633 	BUG_ON(num_bits > clusters_to_add);
4634 
4635 	/* reserve our write early -- insert_extent may update the tree root */
4636 	status = ocfs2_et_root_journal_access(handle, inode, et,
4637 					      OCFS2_JOURNAL_ACCESS_WRITE);
4638 	if (status < 0) {
4639 		mlog_errno(status);
4640 		goto leave;
4641 	}
4642 
4643 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4644 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4645 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4646 	status = ocfs2_insert_extent(osb, handle, inode, et,
4647 				     *logical_offset, block,
4648 				     num_bits, flags, meta_ac);
4649 	if (status < 0) {
4650 		mlog_errno(status);
4651 		goto leave;
4652 	}
4653 
4654 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
4655 	if (status < 0) {
4656 		mlog_errno(status);
4657 		goto leave;
4658 	}
4659 
4660 	clusters_to_add -= num_bits;
4661 	*logical_offset += num_bits;
4662 
4663 	if (clusters_to_add) {
4664 		mlog(0, "need to alloc once more, wanted = %u\n",
4665 		     clusters_to_add);
4666 		status = -EAGAIN;
4667 		reason = RESTART_TRANS;
4668 	}
4669 
4670 leave:
4671 	mlog_exit(status);
4672 	if (reason_ret)
4673 		*reason_ret = reason;
4674 	return status;
4675 }
4676 
4677 static void ocfs2_make_right_split_rec(struct super_block *sb,
4678 				       struct ocfs2_extent_rec *split_rec,
4679 				       u32 cpos,
4680 				       struct ocfs2_extent_rec *rec)
4681 {
4682 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4683 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4684 
4685 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4686 
4687 	split_rec->e_cpos = cpu_to_le32(cpos);
4688 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4689 
4690 	split_rec->e_blkno = rec->e_blkno;
4691 	le64_add_cpu(&split_rec->e_blkno,
4692 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4693 
4694 	split_rec->e_flags = rec->e_flags;
4695 }
4696 
4697 static int ocfs2_split_and_insert(struct inode *inode,
4698 				  handle_t *handle,
4699 				  struct ocfs2_path *path,
4700 				  struct ocfs2_extent_tree *et,
4701 				  struct buffer_head **last_eb_bh,
4702 				  int split_index,
4703 				  struct ocfs2_extent_rec *orig_split_rec,
4704 				  struct ocfs2_alloc_context *meta_ac)
4705 {
4706 	int ret = 0, depth;
4707 	unsigned int insert_range, rec_range, do_leftright = 0;
4708 	struct ocfs2_extent_rec tmprec;
4709 	struct ocfs2_extent_list *rightmost_el;
4710 	struct ocfs2_extent_rec rec;
4711 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
4712 	struct ocfs2_insert_type insert;
4713 	struct ocfs2_extent_block *eb;
4714 
4715 leftright:
4716 	/*
4717 	 * Store a copy of the record on the stack - it might move
4718 	 * around as the tree is manipulated below.
4719 	 */
4720 	rec = path_leaf_el(path)->l_recs[split_index];
4721 
4722 	rightmost_el = et->et_root_el;
4723 
4724 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
4725 	if (depth) {
4726 		BUG_ON(!(*last_eb_bh));
4727 		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4728 		rightmost_el = &eb->h_list;
4729 	}
4730 
4731 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4732 	    le16_to_cpu(rightmost_el->l_count)) {
4733 		ret = ocfs2_grow_tree(inode, handle, et,
4734 				      &depth, last_eb_bh, meta_ac);
4735 		if (ret) {
4736 			mlog_errno(ret);
4737 			goto out;
4738 		}
4739 	}
4740 
4741 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4742 	insert.ins_appending = APPEND_NONE;
4743 	insert.ins_contig = CONTIG_NONE;
4744 	insert.ins_tree_depth = depth;
4745 
4746 	insert_range = le32_to_cpu(split_rec.e_cpos) +
4747 		le16_to_cpu(split_rec.e_leaf_clusters);
4748 	rec_range = le32_to_cpu(rec.e_cpos) +
4749 		le16_to_cpu(rec.e_leaf_clusters);
4750 
4751 	if (split_rec.e_cpos == rec.e_cpos) {
4752 		insert.ins_split = SPLIT_LEFT;
4753 	} else if (insert_range == rec_range) {
4754 		insert.ins_split = SPLIT_RIGHT;
4755 	} else {
4756 		/*
4757 		 * Left/right split. We fake this as a right split
4758 		 * first and then make a second pass as a left split.
4759 		 */
4760 		insert.ins_split = SPLIT_RIGHT;
4761 
4762 		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4763 					   &rec);
4764 
4765 		split_rec = tmprec;
4766 
4767 		BUG_ON(do_leftright);
4768 		do_leftright = 1;
4769 	}
4770 
4771 	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4772 	if (ret) {
4773 		mlog_errno(ret);
4774 		goto out;
4775 	}
4776 
4777 	if (do_leftright == 1) {
4778 		u32 cpos;
4779 		struct ocfs2_extent_list *el;
4780 
4781 		do_leftright++;
4782 		split_rec = *orig_split_rec;
4783 
4784 		ocfs2_reinit_path(path, 1);
4785 
4786 		cpos = le32_to_cpu(split_rec.e_cpos);
4787 		ret = ocfs2_find_path(inode, path, cpos);
4788 		if (ret) {
4789 			mlog_errno(ret);
4790 			goto out;
4791 		}
4792 
4793 		el = path_leaf_el(path);
4794 		split_index = ocfs2_search_extent_list(el, cpos);
4795 		goto leftright;
4796 	}
4797 out:
4798 
4799 	return ret;
4800 }
4801 
4802 /*
4803  * Mark part or all of the extent record at split_index in the leaf
4804  * pointed to by path as written. This removes the unwritten
4805  * extent flag.
4806  *
4807  * Care is taken to handle contiguousness so as to not grow the tree.
4808  *
4809  * meta_ac is not strictly necessary - we only truly need it if growth
4810  * of the tree is required. All other cases will degrade into a less
4811  * optimal tree layout.
4812  *
4813  * last_eb_bh should be the rightmost leaf block for any extent
4814  * btree. Since a split may grow the tree or a merge might shrink it,
4815  * the caller cannot trust the contents of that buffer after this call.
4816  *
4817  * This code is optimized for readability - several passes might be
4818  * made over certain portions of the tree. All of those blocks will
4819  * have been brought into cache (and pinned via the journal), so the
4820  * extra overhead is not expressed in terms of disk reads.
4821  */
4822 static int __ocfs2_mark_extent_written(struct inode *inode,
4823 				       struct ocfs2_extent_tree *et,
4824 				       handle_t *handle,
4825 				       struct ocfs2_path *path,
4826 				       int split_index,
4827 				       struct ocfs2_extent_rec *split_rec,
4828 				       struct ocfs2_alloc_context *meta_ac,
4829 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
4830 {
4831 	int ret = 0;
4832 	struct ocfs2_extent_list *el = path_leaf_el(path);
4833 	struct buffer_head *last_eb_bh = NULL;
4834 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
4835 	struct ocfs2_merge_ctxt ctxt;
4836 	struct ocfs2_extent_list *rightmost_el;
4837 
4838 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
4839 		ret = -EIO;
4840 		mlog_errno(ret);
4841 		goto out;
4842 	}
4843 
4844 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
4845 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
4846 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
4847 		ret = -EIO;
4848 		mlog_errno(ret);
4849 		goto out;
4850 	}
4851 
4852 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
4853 							    split_index,
4854 							    split_rec);
4855 
4856 	/*
4857 	 * The core merge / split code wants to know how much room is
4858 	 * left in this inodes allocation tree, so we pass the
4859 	 * rightmost extent list.
4860 	 */
4861 	if (path->p_tree_depth) {
4862 		struct ocfs2_extent_block *eb;
4863 
4864 		ret = ocfs2_read_extent_block(inode,
4865 					      ocfs2_et_get_last_eb_blk(et),
4866 					      &last_eb_bh);
4867 		if (ret) {
4868 			mlog_exit(ret);
4869 			goto out;
4870 		}
4871 
4872 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
4873 		rightmost_el = &eb->h_list;
4874 	} else
4875 		rightmost_el = path_root_el(path);
4876 
4877 	if (rec->e_cpos == split_rec->e_cpos &&
4878 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
4879 		ctxt.c_split_covers_rec = 1;
4880 	else
4881 		ctxt.c_split_covers_rec = 0;
4882 
4883 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
4884 
4885 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
4886 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
4887 	     ctxt.c_split_covers_rec);
4888 
4889 	if (ctxt.c_contig_type == CONTIG_NONE) {
4890 		if (ctxt.c_split_covers_rec)
4891 			el->l_recs[split_index] = *split_rec;
4892 		else
4893 			ret = ocfs2_split_and_insert(inode, handle, path, et,
4894 						     &last_eb_bh, split_index,
4895 						     split_rec, meta_ac);
4896 		if (ret)
4897 			mlog_errno(ret);
4898 	} else {
4899 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
4900 						split_index, split_rec,
4901 						dealloc, &ctxt, et);
4902 		if (ret)
4903 			mlog_errno(ret);
4904 	}
4905 
4906 out:
4907 	brelse(last_eb_bh);
4908 	return ret;
4909 }
4910 
4911 /*
4912  * Mark the already-existing extent at cpos as written for len clusters.
4913  *
4914  * If the existing extent is larger than the request, initiate a
4915  * split. An attempt will be made at merging with adjacent extents.
4916  *
4917  * The caller is responsible for passing down meta_ac if we'll need it.
4918  */
4919 int ocfs2_mark_extent_written(struct inode *inode,
4920 			      struct ocfs2_extent_tree *et,
4921 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
4922 			      struct ocfs2_alloc_context *meta_ac,
4923 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
4924 {
4925 	int ret, index;
4926 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
4927 	struct ocfs2_extent_rec split_rec;
4928 	struct ocfs2_path *left_path = NULL;
4929 	struct ocfs2_extent_list *el;
4930 
4931 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
4932 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
4933 
4934 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
4935 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
4936 			    "that are being written to, but the feature bit "
4937 			    "is not set in the super block.",
4938 			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
4939 		ret = -EROFS;
4940 		goto out;
4941 	}
4942 
4943 	/*
4944 	 * XXX: This should be fixed up so that we just re-insert the
4945 	 * next extent records.
4946 	 *
4947 	 * XXX: This is a hack on the extent tree, maybe it should be
4948 	 * an op?
4949 	 */
4950 	if (et->et_ops == &ocfs2_dinode_et_ops)
4951 		ocfs2_extent_map_trunc(inode, 0);
4952 
4953 	left_path = ocfs2_new_path_from_et(et);
4954 	if (!left_path) {
4955 		ret = -ENOMEM;
4956 		mlog_errno(ret);
4957 		goto out;
4958 	}
4959 
4960 	ret = ocfs2_find_path(inode, left_path, cpos);
4961 	if (ret) {
4962 		mlog_errno(ret);
4963 		goto out;
4964 	}
4965 	el = path_leaf_el(left_path);
4966 
4967 	index = ocfs2_search_extent_list(el, cpos);
4968 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
4969 		ocfs2_error(inode->i_sb,
4970 			    "Inode %llu has an extent at cpos %u which can no "
4971 			    "longer be found.\n",
4972 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
4973 		ret = -EROFS;
4974 		goto out;
4975 	}
4976 
4977 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
4978 	split_rec.e_cpos = cpu_to_le32(cpos);
4979 	split_rec.e_leaf_clusters = cpu_to_le16(len);
4980 	split_rec.e_blkno = cpu_to_le64(start_blkno);
4981 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
4982 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
4983 
4984 	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
4985 					  index, &split_rec, meta_ac,
4986 					  dealloc);
4987 	if (ret)
4988 		mlog_errno(ret);
4989 
4990 out:
4991 	ocfs2_free_path(left_path);
4992 	return ret;
4993 }
4994 
4995 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
4996 			    handle_t *handle, struct ocfs2_path *path,
4997 			    int index, u32 new_range,
4998 			    struct ocfs2_alloc_context *meta_ac)
4999 {
5000 	int ret, depth, credits = handle->h_buffer_credits;
5001 	struct buffer_head *last_eb_bh = NULL;
5002 	struct ocfs2_extent_block *eb;
5003 	struct ocfs2_extent_list *rightmost_el, *el;
5004 	struct ocfs2_extent_rec split_rec;
5005 	struct ocfs2_extent_rec *rec;
5006 	struct ocfs2_insert_type insert;
5007 
5008 	/*
5009 	 * Setup the record to split before we grow the tree.
5010 	 */
5011 	el = path_leaf_el(path);
5012 	rec = &el->l_recs[index];
5013 	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);
5014 
5015 	depth = path->p_tree_depth;
5016 	if (depth > 0) {
5017 		ret = ocfs2_read_extent_block(inode,
5018 					      ocfs2_et_get_last_eb_blk(et),
5019 					      &last_eb_bh);
5020 		if (ret < 0) {
5021 			mlog_errno(ret);
5022 			goto out;
5023 		}
5024 
5025 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5026 		rightmost_el = &eb->h_list;
5027 	} else
5028 		rightmost_el = path_leaf_el(path);
5029 
5030 	credits += path->p_tree_depth +
5031 		   ocfs2_extend_meta_needed(et->et_root_el);
5032 	ret = ocfs2_extend_trans(handle, credits);
5033 	if (ret) {
5034 		mlog_errno(ret);
5035 		goto out;
5036 	}
5037 
5038 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
5039 	    le16_to_cpu(rightmost_el->l_count)) {
5040 		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
5041 				      meta_ac);
5042 		if (ret) {
5043 			mlog_errno(ret);
5044 			goto out;
5045 		}
5046 	}
5047 
5048 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
5049 	insert.ins_appending = APPEND_NONE;
5050 	insert.ins_contig = CONTIG_NONE;
5051 	insert.ins_split = SPLIT_RIGHT;
5052 	insert.ins_tree_depth = depth;
5053 
5054 	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
5055 	if (ret)
5056 		mlog_errno(ret);
5057 
5058 out:
5059 	brelse(last_eb_bh);
5060 	return ret;
5061 }
5062 
5063 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
5064 			      struct ocfs2_path *path, int index,
5065 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
5066 			      u32 cpos, u32 len,
5067 			      struct ocfs2_extent_tree *et)
5068 {
5069 	int ret;
5070 	u32 left_cpos, rec_range, trunc_range;
5071 	int wants_rotate = 0, is_rightmost_tree_rec = 0;
5072 	struct super_block *sb = inode->i_sb;
5073 	struct ocfs2_path *left_path = NULL;
5074 	struct ocfs2_extent_list *el = path_leaf_el(path);
5075 	struct ocfs2_extent_rec *rec;
5076 	struct ocfs2_extent_block *eb;
5077 
5078 	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
5079 		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5080 		if (ret) {
5081 			mlog_errno(ret);
5082 			goto out;
5083 		}
5084 
5085 		index--;
5086 	}
5087 
5088 	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
5089 	    path->p_tree_depth) {
5090 		/*
5091 		 * Check whether this is the rightmost tree record. If
5092 		 * we remove all of this record or part of its right
5093 		 * edge then an update of the record lengths above it
5094 		 * will be required.
5095 		 */
5096 		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
5097 		if (eb->h_next_leaf_blk == 0)
5098 			is_rightmost_tree_rec = 1;
5099 	}
5100 
5101 	rec = &el->l_recs[index];
5102 	if (index == 0 && path->p_tree_depth &&
5103 	    le32_to_cpu(rec->e_cpos) == cpos) {
5104 		/*
5105 		 * Changing the leftmost offset (via partial or whole
5106 		 * record truncate) of an interior (or rightmost) path
5107 		 * means we have to update the subtree that is formed
5108 		 * by this leaf and the one to it's left.
5109 		 *
5110 		 * There are two cases we can skip:
5111 		 *   1) Path is the leftmost one in our inode tree.
5112 		 *   2) The leaf is rightmost and will be empty after
5113 		 *      we remove the extent record - the rotate code
5114 		 *      knows how to update the newly formed edge.
5115 		 */
5116 
5117 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
5118 						    &left_cpos);
5119 		if (ret) {
5120 			mlog_errno(ret);
5121 			goto out;
5122 		}
5123 
5124 		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
5125 			left_path = ocfs2_new_path_from_path(path);
5126 			if (!left_path) {
5127 				ret = -ENOMEM;
5128 				mlog_errno(ret);
5129 				goto out;
5130 			}
5131 
5132 			ret = ocfs2_find_path(inode, left_path, left_cpos);
5133 			if (ret) {
5134 				mlog_errno(ret);
5135 				goto out;
5136 			}
5137 		}
5138 	}
5139 
5140 	ret = ocfs2_extend_rotate_transaction(handle, 0,
5141 					      handle->h_buffer_credits,
5142 					      path);
5143 	if (ret) {
5144 		mlog_errno(ret);
5145 		goto out;
5146 	}
5147 
5148 	ret = ocfs2_journal_access_path(inode, handle, path);
5149 	if (ret) {
5150 		mlog_errno(ret);
5151 		goto out;
5152 	}
5153 
5154 	ret = ocfs2_journal_access_path(inode, handle, left_path);
5155 	if (ret) {
5156 		mlog_errno(ret);
5157 		goto out;
5158 	}
5159 
5160 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5161 	trunc_range = cpos + len;
5162 
5163 	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
5164 		int next_free;
5165 
5166 		memset(rec, 0, sizeof(*rec));
5167 		ocfs2_cleanup_merge(el, index);
5168 		wants_rotate = 1;
5169 
5170 		next_free = le16_to_cpu(el->l_next_free_rec);
5171 		if (is_rightmost_tree_rec && next_free > 1) {
5172 			/*
5173 			 * We skip the edge update if this path will
5174 			 * be deleted by the rotate code.
5175 			 */
5176 			rec = &el->l_recs[next_free - 1];
5177 			ocfs2_adjust_rightmost_records(inode, handle, path,
5178 						       rec);
5179 		}
5180 	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
5181 		/* Remove leftmost portion of the record. */
5182 		le32_add_cpu(&rec->e_cpos, len);
5183 		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
5184 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5185 	} else if (rec_range == trunc_range) {
5186 		/* Remove rightmost portion of the record */
5187 		le16_add_cpu(&rec->e_leaf_clusters, -len);
5188 		if (is_rightmost_tree_rec)
5189 			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
5190 	} else {
5191 		/* Caller should have trapped this. */
5192 		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
5193 		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
5194 		     le32_to_cpu(rec->e_cpos),
5195 		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
5196 		BUG();
5197 	}
5198 
5199 	if (left_path) {
5200 		int subtree_index;
5201 
5202 		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
5203 		ocfs2_complete_edge_insert(inode, handle, left_path, path,
5204 					   subtree_index);
5205 	}
5206 
5207 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
5208 
5209 	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
5210 	if (ret) {
5211 		mlog_errno(ret);
5212 		goto out;
5213 	}
5214 
5215 out:
5216 	ocfs2_free_path(left_path);
5217 	return ret;
5218 }
5219 
5220 int ocfs2_remove_extent(struct inode *inode,
5221 			struct ocfs2_extent_tree *et,
5222 			u32 cpos, u32 len, handle_t *handle,
5223 			struct ocfs2_alloc_context *meta_ac,
5224 			struct ocfs2_cached_dealloc_ctxt *dealloc)
5225 {
5226 	int ret, index;
5227 	u32 rec_range, trunc_range;
5228 	struct ocfs2_extent_rec *rec;
5229 	struct ocfs2_extent_list *el;
5230 	struct ocfs2_path *path = NULL;
5231 
5232 	ocfs2_extent_map_trunc(inode, 0);
5233 
5234 	path = ocfs2_new_path_from_et(et);
5235 	if (!path) {
5236 		ret = -ENOMEM;
5237 		mlog_errno(ret);
5238 		goto out;
5239 	}
5240 
5241 	ret = ocfs2_find_path(inode, path, cpos);
5242 	if (ret) {
5243 		mlog_errno(ret);
5244 		goto out;
5245 	}
5246 
5247 	el = path_leaf_el(path);
5248 	index = ocfs2_search_extent_list(el, cpos);
5249 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5250 		ocfs2_error(inode->i_sb,
5251 			    "Inode %llu has an extent at cpos %u which can no "
5252 			    "longer be found.\n",
5253 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5254 		ret = -EROFS;
5255 		goto out;
5256 	}
5257 
5258 	/*
5259 	 * We have 3 cases of extent removal:
5260 	 *   1) Range covers the entire extent rec
5261 	 *   2) Range begins or ends on one edge of the extent rec
5262 	 *   3) Range is in the middle of the extent rec (no shared edges)
5263 	 *
5264 	 * For case 1 we remove the extent rec and left rotate to
5265 	 * fill the hole.
5266 	 *
5267 	 * For case 2 we just shrink the existing extent rec, with a
5268 	 * tree update if the shrinking edge is also the edge of an
5269 	 * extent block.
5270 	 *
5271 	 * For case 3 we do a right split to turn the extent rec into
5272 	 * something case 2 can handle.
5273 	 */
5274 	rec = &el->l_recs[index];
5275 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5276 	trunc_range = cpos + len;
5277 
5278 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5279 
5280 	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5281 	     "(cpos %u, len %u)\n",
5282 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5283 	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5284 
5285 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5286 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5287 					 cpos, len, et);
5288 		if (ret) {
5289 			mlog_errno(ret);
5290 			goto out;
5291 		}
5292 	} else {
5293 		ret = ocfs2_split_tree(inode, et, handle, path, index,
5294 				       trunc_range, meta_ac);
5295 		if (ret) {
5296 			mlog_errno(ret);
5297 			goto out;
5298 		}
5299 
5300 		/*
5301 		 * The split could have manipulated the tree enough to
5302 		 * move the record location, so we have to look for it again.
5303 		 */
5304 		ocfs2_reinit_path(path, 1);
5305 
5306 		ret = ocfs2_find_path(inode, path, cpos);
5307 		if (ret) {
5308 			mlog_errno(ret);
5309 			goto out;
5310 		}
5311 
5312 		el = path_leaf_el(path);
5313 		index = ocfs2_search_extent_list(el, cpos);
5314 		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5315 			ocfs2_error(inode->i_sb,
5316 				    "Inode %llu: split at cpos %u lost record.",
5317 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5318 				    cpos);
5319 			ret = -EROFS;
5320 			goto out;
5321 		}
5322 
5323 		/*
5324 		 * Double check our values here. If anything is fishy,
5325 		 * it's easier to catch it at the top level.
5326 		 */
5327 		rec = &el->l_recs[index];
5328 		rec_range = le32_to_cpu(rec->e_cpos) +
5329 			ocfs2_rec_clusters(el, rec);
5330 		if (rec_range != trunc_range) {
5331 			ocfs2_error(inode->i_sb,
5332 				    "Inode %llu: error after split at cpos %u"
5333 				    "trunc len %u, existing record is (%u,%u)",
5334 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5335 				    cpos, len, le32_to_cpu(rec->e_cpos),
5336 				    ocfs2_rec_clusters(el, rec));
5337 			ret = -EROFS;
5338 			goto out;
5339 		}
5340 
5341 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5342 					 cpos, len, et);
5343 		if (ret) {
5344 			mlog_errno(ret);
5345 			goto out;
5346 		}
5347 	}
5348 
5349 out:
5350 	ocfs2_free_path(path);
5351 	return ret;
5352 }
5353 
5354 int ocfs2_remove_btree_range(struct inode *inode,
5355 			     struct ocfs2_extent_tree *et,
5356 			     u32 cpos, u32 phys_cpos, u32 len,
5357 			     struct ocfs2_cached_dealloc_ctxt *dealloc)
5358 {
5359 	int ret;
5360 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5361 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5362 	struct inode *tl_inode = osb->osb_tl_inode;
5363 	handle_t *handle;
5364 	struct ocfs2_alloc_context *meta_ac = NULL;
5365 
5366 	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5367 	if (ret) {
5368 		mlog_errno(ret);
5369 		return ret;
5370 	}
5371 
5372 	mutex_lock(&tl_inode->i_mutex);
5373 
5374 	if (ocfs2_truncate_log_needs_flush(osb)) {
5375 		ret = __ocfs2_flush_truncate_log(osb);
5376 		if (ret < 0) {
5377 			mlog_errno(ret);
5378 			goto out;
5379 		}
5380 	}
5381 
5382 	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5383 	if (IS_ERR(handle)) {
5384 		ret = PTR_ERR(handle);
5385 		mlog_errno(ret);
5386 		goto out;
5387 	}
5388 
5389 	ret = ocfs2_et_root_journal_access(handle, inode, et,
5390 					   OCFS2_JOURNAL_ACCESS_WRITE);
5391 	if (ret) {
5392 		mlog_errno(ret);
5393 		goto out;
5394 	}
5395 
5396 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5397 				  dealloc);
5398 	if (ret) {
5399 		mlog_errno(ret);
5400 		goto out_commit;
5401 	}
5402 
5403 	ocfs2_et_update_clusters(inode, et, -len);
5404 
5405 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5406 	if (ret) {
5407 		mlog_errno(ret);
5408 		goto out_commit;
5409 	}
5410 
5411 	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5412 	if (ret)
5413 		mlog_errno(ret);
5414 
5415 out_commit:
5416 	ocfs2_commit_trans(osb, handle);
5417 out:
5418 	mutex_unlock(&tl_inode->i_mutex);
5419 
5420 	if (meta_ac)
5421 		ocfs2_free_alloc_context(meta_ac);
5422 
5423 	return ret;
5424 }
5425 
5426 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5427 {
5428 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5429 	struct ocfs2_dinode *di;
5430 	struct ocfs2_truncate_log *tl;
5431 
5432 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5433 	tl = &di->id2.i_dealloc;
5434 
5435 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5436 			"slot %d, invalid truncate log parameters: used = "
5437 			"%u, count = %u\n", osb->slot_num,
5438 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5439 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5440 }
5441 
5442 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5443 					   unsigned int new_start)
5444 {
5445 	unsigned int tail_index;
5446 	unsigned int current_tail;
5447 
5448 	/* No records, nothing to coalesce */
5449 	if (!le16_to_cpu(tl->tl_used))
5450 		return 0;
5451 
5452 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5453 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5454 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5455 
5456 	return current_tail == new_start;
5457 }
5458 
5459 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5460 			      handle_t *handle,
5461 			      u64 start_blk,
5462 			      unsigned int num_clusters)
5463 {
5464 	int status, index;
5465 	unsigned int start_cluster, tl_count;
5466 	struct inode *tl_inode = osb->osb_tl_inode;
5467 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5468 	struct ocfs2_dinode *di;
5469 	struct ocfs2_truncate_log *tl;
5470 
5471 	mlog_entry("start_blk = %llu, num_clusters = %u\n",
5472 		   (unsigned long long)start_blk, num_clusters);
5473 
5474 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5475 
5476 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5477 
5478 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5479 
5480 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5481 	 * by the underlying call to ocfs2_read_inode_block(), so any
5482 	 * corruption is a code bug */
5483 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5484 
5485 	tl = &di->id2.i_dealloc;
5486 	tl_count = le16_to_cpu(tl->tl_count);
5487 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5488 			tl_count == 0,
5489 			"Truncate record count on #%llu invalid "
5490 			"wanted %u, actual %u\n",
5491 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5492 			ocfs2_truncate_recs_per_inode(osb->sb),
5493 			le16_to_cpu(tl->tl_count));
5494 
5495 	/* Caller should have known to flush before calling us. */
5496 	index = le16_to_cpu(tl->tl_used);
5497 	if (index >= tl_count) {
5498 		status = -ENOSPC;
5499 		mlog_errno(status);
5500 		goto bail;
5501 	}
5502 
5503 	status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5504 					 OCFS2_JOURNAL_ACCESS_WRITE);
5505 	if (status < 0) {
5506 		mlog_errno(status);
5507 		goto bail;
5508 	}
5509 
5510 	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5511 	     "%llu (index = %d)\n", num_clusters, start_cluster,
5512 	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5513 
5514 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5515 		/*
5516 		 * Move index back to the record we are coalescing with.
5517 		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5518 		 */
5519 		index--;
5520 
5521 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5522 		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5523 		     index, le32_to_cpu(tl->tl_recs[index].t_start),
5524 		     num_clusters);
5525 	} else {
5526 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5527 		tl->tl_used = cpu_to_le16(index + 1);
5528 	}
5529 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5530 
5531 	status = ocfs2_journal_dirty(handle, tl_bh);
5532 	if (status < 0) {
5533 		mlog_errno(status);
5534 		goto bail;
5535 	}
5536 
5537 bail:
5538 	mlog_exit(status);
5539 	return status;
5540 }
5541 
5542 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5543 					 handle_t *handle,
5544 					 struct inode *data_alloc_inode,
5545 					 struct buffer_head *data_alloc_bh)
5546 {
5547 	int status = 0;
5548 	int i;
5549 	unsigned int num_clusters;
5550 	u64 start_blk;
5551 	struct ocfs2_truncate_rec rec;
5552 	struct ocfs2_dinode *di;
5553 	struct ocfs2_truncate_log *tl;
5554 	struct inode *tl_inode = osb->osb_tl_inode;
5555 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5556 
5557 	mlog_entry_void();
5558 
5559 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5560 	tl = &di->id2.i_dealloc;
5561 	i = le16_to_cpu(tl->tl_used) - 1;
5562 	while (i >= 0) {
5563 		/* Caller has given us at least enough credits to
5564 		 * update the truncate log dinode */
5565 		status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
5566 						 OCFS2_JOURNAL_ACCESS_WRITE);
5567 		if (status < 0) {
5568 			mlog_errno(status);
5569 			goto bail;
5570 		}
5571 
5572 		tl->tl_used = cpu_to_le16(i);
5573 
5574 		status = ocfs2_journal_dirty(handle, tl_bh);
5575 		if (status < 0) {
5576 			mlog_errno(status);
5577 			goto bail;
5578 		}
5579 
5580 		/* TODO: Perhaps we can calculate the bulk of the
5581 		 * credits up front rather than extending like
5582 		 * this. */
5583 		status = ocfs2_extend_trans(handle,
5584 					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5585 		if (status < 0) {
5586 			mlog_errno(status);
5587 			goto bail;
5588 		}
5589 
5590 		rec = tl->tl_recs[i];
5591 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5592 						    le32_to_cpu(rec.t_start));
5593 		num_clusters = le32_to_cpu(rec.t_clusters);
5594 
5595 		/* if start_blk is not set, we ignore the record as
5596 		 * invalid. */
5597 		if (start_blk) {
5598 			mlog(0, "free record %d, start = %u, clusters = %u\n",
5599 			     i, le32_to_cpu(rec.t_start), num_clusters);
5600 
5601 			status = ocfs2_free_clusters(handle, data_alloc_inode,
5602 						     data_alloc_bh, start_blk,
5603 						     num_clusters);
5604 			if (status < 0) {
5605 				mlog_errno(status);
5606 				goto bail;
5607 			}
5608 		}
5609 		i--;
5610 	}
5611 
5612 bail:
5613 	mlog_exit(status);
5614 	return status;
5615 }
5616 
5617 /* Expects you to already be holding tl_inode->i_mutex */
5618 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5619 {
5620 	int status;
5621 	unsigned int num_to_flush;
5622 	handle_t *handle;
5623 	struct inode *tl_inode = osb->osb_tl_inode;
5624 	struct inode *data_alloc_inode = NULL;
5625 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5626 	struct buffer_head *data_alloc_bh = NULL;
5627 	struct ocfs2_dinode *di;
5628 	struct ocfs2_truncate_log *tl;
5629 
5630 	mlog_entry_void();
5631 
5632 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5633 
5634 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5635 
5636 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5637 	 * by the underlying call to ocfs2_read_inode_block(), so any
5638 	 * corruption is a code bug */
5639 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5640 
5641 	tl = &di->id2.i_dealloc;
5642 	num_to_flush = le16_to_cpu(tl->tl_used);
5643 	mlog(0, "Flush %u records from truncate log #%llu\n",
5644 	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
5645 	if (!num_to_flush) {
5646 		status = 0;
5647 		goto out;
5648 	}
5649 
5650 	data_alloc_inode = ocfs2_get_system_file_inode(osb,
5651 						       GLOBAL_BITMAP_SYSTEM_INODE,
5652 						       OCFS2_INVALID_SLOT);
5653 	if (!data_alloc_inode) {
5654 		status = -EINVAL;
5655 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
5656 		goto out;
5657 	}
5658 
5659 	mutex_lock(&data_alloc_inode->i_mutex);
5660 
5661 	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
5662 	if (status < 0) {
5663 		mlog_errno(status);
5664 		goto out_mutex;
5665 	}
5666 
5667 	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5668 	if (IS_ERR(handle)) {
5669 		status = PTR_ERR(handle);
5670 		mlog_errno(status);
5671 		goto out_unlock;
5672 	}
5673 
5674 	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
5675 					       data_alloc_bh);
5676 	if (status < 0)
5677 		mlog_errno(status);
5678 
5679 	ocfs2_commit_trans(osb, handle);
5680 
5681 out_unlock:
5682 	brelse(data_alloc_bh);
5683 	ocfs2_inode_unlock(data_alloc_inode, 1);
5684 
5685 out_mutex:
5686 	mutex_unlock(&data_alloc_inode->i_mutex);
5687 	iput(data_alloc_inode);
5688 
5689 out:
5690 	mlog_exit(status);
5691 	return status;
5692 }
5693 
5694 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5695 {
5696 	int status;
5697 	struct inode *tl_inode = osb->osb_tl_inode;
5698 
5699 	mutex_lock(&tl_inode->i_mutex);
5700 	status = __ocfs2_flush_truncate_log(osb);
5701 	mutex_unlock(&tl_inode->i_mutex);
5702 
5703 	return status;
5704 }
5705 
5706 static void ocfs2_truncate_log_worker(struct work_struct *work)
5707 {
5708 	int status;
5709 	struct ocfs2_super *osb =
5710 		container_of(work, struct ocfs2_super,
5711 			     osb_truncate_log_wq.work);
5712 
5713 	mlog_entry_void();
5714 
5715 	status = ocfs2_flush_truncate_log(osb);
5716 	if (status < 0)
5717 		mlog_errno(status);
5718 	else
5719 		ocfs2_init_inode_steal_slot(osb);
5720 
5721 	mlog_exit(status);
5722 }
5723 
5724 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5725 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5726 				       int cancel)
5727 {
5728 	if (osb->osb_tl_inode) {
5729 		/* We want to push off log flushes while truncates are
5730 		 * still running. */
5731 		if (cancel)
5732 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5733 
5734 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5735 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5736 	}
5737 }
5738 
5739 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5740 				       int slot_num,
5741 				       struct inode **tl_inode,
5742 				       struct buffer_head **tl_bh)
5743 {
5744 	int status;
5745 	struct inode *inode = NULL;
5746 	struct buffer_head *bh = NULL;
5747 
5748 	inode = ocfs2_get_system_file_inode(osb,
5749 					   TRUNCATE_LOG_SYSTEM_INODE,
5750 					   slot_num);
5751 	if (!inode) {
5752 		status = -EINVAL;
5753 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5754 		goto bail;
5755 	}
5756 
5757 	status = ocfs2_read_inode_block(inode, &bh);
5758 	if (status < 0) {
5759 		iput(inode);
5760 		mlog_errno(status);
5761 		goto bail;
5762 	}
5763 
5764 	*tl_inode = inode;
5765 	*tl_bh    = bh;
5766 bail:
5767 	mlog_exit(status);
5768 	return status;
5769 }
5770 
5771 /* called during the 1st stage of node recovery. we stamp a clean
5772  * truncate log and pass back a copy for processing later. if the
5773  * truncate log does not require processing, a *tl_copy is set to
5774  * NULL. */
5775 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5776 				      int slot_num,
5777 				      struct ocfs2_dinode **tl_copy)
5778 {
5779 	int status;
5780 	struct inode *tl_inode = NULL;
5781 	struct buffer_head *tl_bh = NULL;
5782 	struct ocfs2_dinode *di;
5783 	struct ocfs2_truncate_log *tl;
5784 
5785 	*tl_copy = NULL;
5786 
5787 	mlog(0, "recover truncate log from slot %d\n", slot_num);
5788 
5789 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5790 	if (status < 0) {
5791 		mlog_errno(status);
5792 		goto bail;
5793 	}
5794 
5795 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5796 
5797 	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5798 	 * validated by the underlying call to ocfs2_read_inode_block(),
5799 	 * so any corruption is a code bug */
5800 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5801 
5802 	tl = &di->id2.i_dealloc;
5803 	if (le16_to_cpu(tl->tl_used)) {
5804 		mlog(0, "We'll have %u logs to recover\n",
5805 		     le16_to_cpu(tl->tl_used));
5806 
5807 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
5808 		if (!(*tl_copy)) {
5809 			status = -ENOMEM;
5810 			mlog_errno(status);
5811 			goto bail;
5812 		}
5813 
5814 		/* Assuming the write-out below goes well, this copy
5815 		 * will be passed back to recovery for processing. */
5816 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
5817 
5818 		/* All we need to do to clear the truncate log is set
5819 		 * tl_used. */
5820 		tl->tl_used = 0;
5821 
5822 		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
5823 		status = ocfs2_write_block(osb, tl_bh, tl_inode);
5824 		if (status < 0) {
5825 			mlog_errno(status);
5826 			goto bail;
5827 		}
5828 	}
5829 
5830 bail:
5831 	if (tl_inode)
5832 		iput(tl_inode);
5833 	brelse(tl_bh);
5834 
5835 	if (status < 0 && (*tl_copy)) {
5836 		kfree(*tl_copy);
5837 		*tl_copy = NULL;
5838 	}
5839 
5840 	mlog_exit(status);
5841 	return status;
5842 }
5843 
5844 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
5845 					 struct ocfs2_dinode *tl_copy)
5846 {
5847 	int status = 0;
5848 	int i;
5849 	unsigned int clusters, num_recs, start_cluster;
5850 	u64 start_blk;
5851 	handle_t *handle;
5852 	struct inode *tl_inode = osb->osb_tl_inode;
5853 	struct ocfs2_truncate_log *tl;
5854 
5855 	mlog_entry_void();
5856 
5857 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
5858 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
5859 		return -EINVAL;
5860 	}
5861 
5862 	tl = &tl_copy->id2.i_dealloc;
5863 	num_recs = le16_to_cpu(tl->tl_used);
5864 	mlog(0, "cleanup %u records from %llu\n", num_recs,
5865 	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
5866 
5867 	mutex_lock(&tl_inode->i_mutex);
5868 	for(i = 0; i < num_recs; i++) {
5869 		if (ocfs2_truncate_log_needs_flush(osb)) {
5870 			status = __ocfs2_flush_truncate_log(osb);
5871 			if (status < 0) {
5872 				mlog_errno(status);
5873 				goto bail_up;
5874 			}
5875 		}
5876 
5877 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
5878 		if (IS_ERR(handle)) {
5879 			status = PTR_ERR(handle);
5880 			mlog_errno(status);
5881 			goto bail_up;
5882 		}
5883 
5884 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
5885 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
5886 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
5887 
5888 		status = ocfs2_truncate_log_append(osb, handle,
5889 						   start_blk, clusters);
5890 		ocfs2_commit_trans(osb, handle);
5891 		if (status < 0) {
5892 			mlog_errno(status);
5893 			goto bail_up;
5894 		}
5895 	}
5896 
5897 bail_up:
5898 	mutex_unlock(&tl_inode->i_mutex);
5899 
5900 	mlog_exit(status);
5901 	return status;
5902 }
5903 
5904 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
5905 {
5906 	int status;
5907 	struct inode *tl_inode = osb->osb_tl_inode;
5908 
5909 	mlog_entry_void();
5910 
5911 	if (tl_inode) {
5912 		cancel_delayed_work(&osb->osb_truncate_log_wq);
5913 		flush_workqueue(ocfs2_wq);
5914 
5915 		status = ocfs2_flush_truncate_log(osb);
5916 		if (status < 0)
5917 			mlog_errno(status);
5918 
5919 		brelse(osb->osb_tl_bh);
5920 		iput(osb->osb_tl_inode);
5921 	}
5922 
5923 	mlog_exit_void();
5924 }
5925 
5926 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
5927 {
5928 	int status;
5929 	struct inode *tl_inode = NULL;
5930 	struct buffer_head *tl_bh = NULL;
5931 
5932 	mlog_entry_void();
5933 
5934 	status = ocfs2_get_truncate_log_info(osb,
5935 					     osb->slot_num,
5936 					     &tl_inode,
5937 					     &tl_bh);
5938 	if (status < 0)
5939 		mlog_errno(status);
5940 
5941 	/* ocfs2_truncate_log_shutdown keys on the existence of
5942 	 * osb->osb_tl_inode so we don't set any of the osb variables
5943 	 * until we're sure all is well. */
5944 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
5945 			  ocfs2_truncate_log_worker);
5946 	osb->osb_tl_bh    = tl_bh;
5947 	osb->osb_tl_inode = tl_inode;
5948 
5949 	mlog_exit(status);
5950 	return status;
5951 }
5952 
5953 /*
5954  * Delayed de-allocation of suballocator blocks.
5955  *
5956  * Some sets of block de-allocations might involve multiple suballocator inodes.
5957  *
5958  * The locking for this can get extremely complicated, especially when
5959  * the suballocator inodes to delete from aren't known until deep
5960  * within an unrelated codepath.
5961  *
5962  * ocfs2_extent_block structures are a good example of this - an inode
5963  * btree could have been grown by any number of nodes each allocating
5964  * out of their own suballoc inode.
5965  *
5966  * These structures allow the delay of block de-allocation until a
5967  * later time, when locking of multiple cluster inodes won't cause
5968  * deadlock.
5969  */
5970 
/*
 * Describe a single item freed from an allocator.  For the block
 * suballocators, it represents one block: free_blk is the block number
 * and free_bit its bit in the suballocator.  For the global cluster
 * allocator, it represents a run of clusters: free_blk is the starting
 * block and free_bit holds the number of clusters.
 */
struct ocfs2_cached_block_free {
	/* Next item on the same singly linked list, or NULL. */
	struct ocfs2_cached_block_free		*free_next;
	u64					free_blk;
	unsigned int				free_bit;
};
5982 
/*
 * List of cached block frees destined for one suballocator, identified
 * by its system inode type and slot.  These lists are themselves chained
 * off the dealloc context's c_first_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list in the dealloc context, or NULL. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System inode type and slot passed to ocfs2_get_system_file_inode(). */
	int					f_inode_type;
	int					f_slot;
	/* Head of the items waiting to be freed; NULL when empty. */
	struct ocfs2_cached_block_free		*f_first;
};
5989 
5990 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
5991 				    int sysfile_type,
5992 				    int slot,
5993 				    struct ocfs2_cached_block_free *head)
5994 {
5995 	int ret;
5996 	u64 bg_blkno;
5997 	handle_t *handle;
5998 	struct inode *inode;
5999 	struct buffer_head *di_bh = NULL;
6000 	struct ocfs2_cached_block_free *tmp;
6001 
6002 	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6003 	if (!inode) {
6004 		ret = -EINVAL;
6005 		mlog_errno(ret);
6006 		goto out;
6007 	}
6008 
6009 	mutex_lock(&inode->i_mutex);
6010 
6011 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
6012 	if (ret) {
6013 		mlog_errno(ret);
6014 		goto out_mutex;
6015 	}
6016 
6017 	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6018 	if (IS_ERR(handle)) {
6019 		ret = PTR_ERR(handle);
6020 		mlog_errno(ret);
6021 		goto out_unlock;
6022 	}
6023 
6024 	while (head) {
6025 		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6026 						      head->free_bit);
6027 		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6028 		     head->free_bit, (unsigned long long)head->free_blk);
6029 
6030 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6031 					       head->free_bit, bg_blkno, 1);
6032 		if (ret) {
6033 			mlog_errno(ret);
6034 			goto out_journal;
6035 		}
6036 
6037 		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6038 		if (ret) {
6039 			mlog_errno(ret);
6040 			goto out_journal;
6041 		}
6042 
6043 		tmp = head;
6044 		head = head->free_next;
6045 		kfree(tmp);
6046 	}
6047 
6048 out_journal:
6049 	ocfs2_commit_trans(osb, handle);
6050 
6051 out_unlock:
6052 	ocfs2_inode_unlock(inode, 1);
6053 	brelse(di_bh);
6054 out_mutex:
6055 	mutex_unlock(&inode->i_mutex);
6056 	iput(inode);
6057 out:
6058 	while(head) {
6059 		/* Premature exit may have left some dangling items. */
6060 		tmp = head;
6061 		head = head->free_next;
6062 		kfree(tmp);
6063 	}
6064 
6065 	return ret;
6066 }
6067 
6068 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6069 				u64 blkno, unsigned int bit)
6070 {
6071 	int ret = 0;
6072 	struct ocfs2_cached_block_free *item;
6073 
6074 	item = kmalloc(sizeof(*item), GFP_NOFS);
6075 	if (item == NULL) {
6076 		ret = -ENOMEM;
6077 		mlog_errno(ret);
6078 		return ret;
6079 	}
6080 
6081 	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6082 	     bit, (unsigned long long)blkno);
6083 
6084 	item->free_blk = blkno;
6085 	item->free_bit = bit;
6086 	item->free_next = ctxt->c_global_allocator;
6087 
6088 	ctxt->c_global_allocator = item;
6089 	return ret;
6090 }
6091 
6092 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6093 				      struct ocfs2_cached_block_free *head)
6094 {
6095 	struct ocfs2_cached_block_free *tmp;
6096 	struct inode *tl_inode = osb->osb_tl_inode;
6097 	handle_t *handle;
6098 	int ret = 0;
6099 
6100 	mutex_lock(&tl_inode->i_mutex);
6101 
6102 	while (head) {
6103 		if (ocfs2_truncate_log_needs_flush(osb)) {
6104 			ret = __ocfs2_flush_truncate_log(osb);
6105 			if (ret < 0) {
6106 				mlog_errno(ret);
6107 				break;
6108 			}
6109 		}
6110 
6111 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6112 		if (IS_ERR(handle)) {
6113 			ret = PTR_ERR(handle);
6114 			mlog_errno(ret);
6115 			break;
6116 		}
6117 
6118 		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6119 						head->free_bit);
6120 
6121 		ocfs2_commit_trans(osb, handle);
6122 		tmp = head;
6123 		head = head->free_next;
6124 		kfree(tmp);
6125 
6126 		if (ret < 0) {
6127 			mlog_errno(ret);
6128 			break;
6129 		}
6130 	}
6131 
6132 	mutex_unlock(&tl_inode->i_mutex);
6133 
6134 	while (head) {
6135 		/* Premature exit may have left some dangling items. */
6136 		tmp = head;
6137 		head = head->free_next;
6138 		kfree(tmp);
6139 	}
6140 
6141 	return ret;
6142 }
6143 
6144 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6145 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
6146 {
6147 	int ret = 0, ret2;
6148 	struct ocfs2_per_slot_free_list *fl;
6149 
6150 	if (!ctxt)
6151 		return 0;
6152 
6153 	while (ctxt->c_first_suballocator) {
6154 		fl = ctxt->c_first_suballocator;
6155 
6156 		if (fl->f_first) {
6157 			mlog(0, "Free items: (type %u, slot %d)\n",
6158 			     fl->f_inode_type, fl->f_slot);
6159 			ret2 = ocfs2_free_cached_blocks(osb,
6160 							fl->f_inode_type,
6161 							fl->f_slot,
6162 							fl->f_first);
6163 			if (ret2)
6164 				mlog_errno(ret2);
6165 			if (!ret)
6166 				ret = ret2;
6167 		}
6168 
6169 		ctxt->c_first_suballocator = fl->f_next_suballocator;
6170 		kfree(fl);
6171 	}
6172 
6173 	if (ctxt->c_global_allocator) {
6174 		ret2 = ocfs2_free_cached_clusters(osb,
6175 						  ctxt->c_global_allocator);
6176 		if (ret2)
6177 			mlog_errno(ret2);
6178 		if (!ret)
6179 			ret = ret2;
6180 
6181 		ctxt->c_global_allocator = NULL;
6182 	}
6183 
6184 	return ret;
6185 }
6186 
6187 static struct ocfs2_per_slot_free_list *
6188 ocfs2_find_per_slot_free_list(int type,
6189 			      int slot,
6190 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
6191 {
6192 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6193 
6194 	while (fl) {
6195 		if (fl->f_inode_type == type && fl->f_slot == slot)
6196 			return fl;
6197 
6198 		fl = fl->f_next_suballocator;
6199 	}
6200 
6201 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
6202 	if (fl) {
6203 		fl->f_inode_type = type;
6204 		fl->f_slot = slot;
6205 		fl->f_first = NULL;
6206 		fl->f_next_suballocator = ctxt->c_first_suballocator;
6207 
6208 		ctxt->c_first_suballocator = fl;
6209 	}
6210 	return fl;
6211 }
6212 
6213 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6214 				     int type, int slot, u64 blkno,
6215 				     unsigned int bit)
6216 {
6217 	int ret;
6218 	struct ocfs2_per_slot_free_list *fl;
6219 	struct ocfs2_cached_block_free *item;
6220 
6221 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6222 	if (fl == NULL) {
6223 		ret = -ENOMEM;
6224 		mlog_errno(ret);
6225 		goto out;
6226 	}
6227 
6228 	item = kmalloc(sizeof(*item), GFP_NOFS);
6229 	if (item == NULL) {
6230 		ret = -ENOMEM;
6231 		mlog_errno(ret);
6232 		goto out;
6233 	}
6234 
6235 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6236 	     type, slot, bit, (unsigned long long)blkno);
6237 
6238 	item->free_blk = blkno;
6239 	item->free_bit = bit;
6240 	item->free_next = fl->f_first;
6241 
6242 	fl->f_first = item;
6243 
6244 	ret = 0;
6245 out:
6246 	return ret;
6247 }
6248 
6249 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6250 					 struct ocfs2_extent_block *eb)
6251 {
6252 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6253 					 le16_to_cpu(eb->h_suballoc_slot),
6254 					 le64_to_cpu(eb->h_blkno),
6255 					 le16_to_cpu(eb->h_suballoc_bit));
6256 }
6257 
6258 /* This function will figure out whether the currently last extent
6259  * block will be deleted, and if it will, what the new last extent
6260  * block will be so we can update his h_next_leaf_blk field, as well
6261  * as the dinodes i_last_eb_blk */
6262 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6263 				       unsigned int clusters_to_del,
6264 				       struct ocfs2_path *path,
6265 				       struct buffer_head **new_last_eb)
6266 {
6267 	int next_free, ret = 0;
6268 	u32 cpos;
6269 	struct ocfs2_extent_rec *rec;
6270 	struct ocfs2_extent_block *eb;
6271 	struct ocfs2_extent_list *el;
6272 	struct buffer_head *bh = NULL;
6273 
6274 	*new_last_eb = NULL;
6275 
6276 	/* we have no tree, so of course, no last_eb. */
6277 	if (!path->p_tree_depth)
6278 		goto out;
6279 
6280 	/* trunc to zero special case - this makes tree_depth = 0
6281 	 * regardless of what it is.  */
6282 	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6283 		goto out;
6284 
6285 	el = path_leaf_el(path);
6286 	BUG_ON(!el->l_next_free_rec);
6287 
6288 	/*
6289 	 * Make sure that this extent list will actually be empty
6290 	 * after we clear away the data. We can shortcut out if
6291 	 * there's more than one non-empty extent in the
6292 	 * list. Otherwise, a check of the remaining extent is
6293 	 * necessary.
6294 	 */
6295 	next_free = le16_to_cpu(el->l_next_free_rec);
6296 	rec = NULL;
6297 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6298 		if (next_free > 2)
6299 			goto out;
6300 
6301 		/* We may have a valid extent in index 1, check it. */
6302 		if (next_free == 2)
6303 			rec = &el->l_recs[1];
6304 
6305 		/*
6306 		 * Fall through - no more nonempty extents, so we want
6307 		 * to delete this leaf.
6308 		 */
6309 	} else {
6310 		if (next_free > 1)
6311 			goto out;
6312 
6313 		rec = &el->l_recs[0];
6314 	}
6315 
6316 	if (rec) {
6317 		/*
6318 		 * Check it we'll only be trimming off the end of this
6319 		 * cluster.
6320 		 */
6321 		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6322 			goto out;
6323 	}
6324 
6325 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6326 	if (ret) {
6327 		mlog_errno(ret);
6328 		goto out;
6329 	}
6330 
6331 	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6332 	if (ret) {
6333 		mlog_errno(ret);
6334 		goto out;
6335 	}
6336 
6337 	eb = (struct ocfs2_extent_block *) bh->b_data;
6338 	el = &eb->h_list;
6339 
6340 	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6341 	 * Any corruption is a code bug. */
6342 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6343 
6344 	*new_last_eb = bh;
6345 	get_bh(*new_last_eb);
6346 	mlog(0, "returning block %llu, (cpos: %u)\n",
6347 	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6348 out:
6349 	brelse(bh);
6350 
6351 	return ret;
6352 }
6353 
/*
 * Trim some clusters off the rightmost edge of a tree. Only called
 * during truncate.
 *
 * The path is walked from its last node (the leaf, tree depth 0) back
 * towards index 0 (the root). In the leaf the tail record is shrunk
 * by clusters_to_del; in interior nodes the tail record is either
 * removed (when the child below it was just deleted) or has its
 * cluster count adjusted to the new rightmost edge. Extent blocks
 * left with no records are queued on tc->tc_dealloc for freeing.
 *
 * On return *delete_start holds the first block of the data being
 * removed, or 0 if no leaf record was trimmed.
 *
 * Returns 0 on success, -EROFS if a path node's tree depth does not
 * match its position in the path, or the error from
 * ocfs2_journal_dirty().
 *
 * The caller needs to:
 *   - start journaling of each path component.
 *   - compute and fully set up any new last ext block
 */
static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
			   handle_t *handle, struct ocfs2_truncate_context *tc,
			   u32 clusters_to_del, u64 *delete_start)
{
	int ret, i, index = path->p_tree_depth;
	u32 new_edge = 0;
	u64 deleted_eb = 0;
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
	struct ocfs2_extent_rec *rec;

	*delete_start = 0;

	while (index >= 0) {
		bh = path->p_node[index].bh;
		el = path->p_node[index].el;

		mlog(0, "traveling tree (index = %d, block = %llu)\n",
		     index,  (unsigned long long)bh->b_blocknr);

		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);

		/*
		 * The depth recorded in this list must agree with
		 * where the node sits in the path.
		 */
		if (index !=
		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
			ocfs2_error(inode->i_sb,
				    "Inode %lu has invalid ext. block %llu",
				    inode->i_ino,
				    (unsigned long long)bh->b_blocknr);
			ret = -EROFS;
			goto out;
		}

find_tail_record:
		/* Only the last used record of each list is touched. */
		i = le16_to_cpu(el->l_next_free_rec) - 1;
		rec = &el->l_recs[i];

		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
		     ocfs2_rec_clusters(el, rec),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);

		if (le16_to_cpu(el->l_tree_depth) == 0) {
			/*
			 * If the leaf block contains a single empty
			 * extent and no records, we can just remove
			 * the block.
			 */
			if (i == 0 && ocfs2_is_empty_extent(rec)) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				el->l_next_free_rec = cpu_to_le16(0);

				goto delete;
			}

			/*
			 * Remove any empty extents by shifting things
			 * left. That should make life much easier on
			 * the code below. This condition is rare
			 * enough that we shouldn't see a performance
			 * hit.
			 */
			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
				le16_add_cpu(&el->l_next_free_rec, -1);

				for(i = 0;
				    i < le16_to_cpu(el->l_next_free_rec); i++)
					el->l_recs[i] = el->l_recs[i + 1];

				/* Clear the slot vacated by the shift. */
				memset(&el->l_recs[i], 0,
				       sizeof(struct ocfs2_extent_rec));

				/*
				 * We've modified our extent list. The
				 * simplest way to handle this change
				 * is to begin the search from the
				 * start again.
				 */
				goto find_tail_record;
			}

			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);

			/*
			 * We'll use "new_edge" on our way back up the
			 * tree to know what our rightmost cpos is.
			 */
			new_edge = le16_to_cpu(rec->e_leaf_clusters);
			new_edge += le32_to_cpu(rec->e_cpos);

			/*
			 * The caller will use this to delete data blocks.
			 * It is the first block past what remains of
			 * the record.
			 */
			*delete_start = le64_to_cpu(rec->e_blkno)
				+ ocfs2_clusters_to_blocks(inode->i_sb,
					le16_to_cpu(rec->e_leaf_clusters));

			/*
			 * If it's now empty, remove this record.
			 */
			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);
			}
		} else {
			/*
			 * Interior node: if the tail record points at
			 * the extent block deleted on the previous
			 * iteration, drop the record.
			 */
			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
				memset(rec, 0,
				       sizeof(struct ocfs2_extent_rec));
				le16_add_cpu(&el->l_next_free_rec, -1);

				goto delete;
			}

			/* Can this actually happen? */
			if (le16_to_cpu(el->l_next_free_rec) == 0)
				goto delete;

			/*
			 * We never actually deleted any clusters
			 * because our leaf was empty. There's no
			 * reason to adjust the rightmost edge then.
			 */
			if (new_edge == 0)
				goto delete;

			/* Record now covers [e_cpos, new_edge). */
			rec->e_int_clusters = cpu_to_le32(new_edge);
			le32_add_cpu(&rec->e_int_clusters,
				     -le32_to_cpu(rec->e_cpos));

			 /*
			  * A deleted child record should have been
			  * caught above.
			  */
			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
		}

delete:
		ret = ocfs2_journal_dirty(handle, bh);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		mlog(0, "extent list container %llu, after: record %d: "
		     "(%u, %u, %llu), next = %u.\n",
		     (unsigned long long)bh->b_blocknr, i,
		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
		     (unsigned long long)le64_to_cpu(rec->e_blkno),
		     le16_to_cpu(el->l_next_free_rec));

		/*
		 * We must be careful to only attempt delete of an
		 * extent block (and not the root inode block).
		 */
		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
			struct ocfs2_extent_block *eb =
				(struct ocfs2_extent_block *)bh->b_data;

			/*
			 * Save this for use when processing the
			 * parent block.
			 */
			deleted_eb = le64_to_cpu(eb->h_blkno);

			mlog(0, "deleting this extent block.\n");

			ocfs2_remove_from_cache(inode, bh);

			/* An emptied list must have a fully zeroed slot 0. */
			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));

			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
			/* An error here is not fatal. */
			if (ret < 0)
				mlog_errno(ret);
		} else {
			/* Nothing deleted at this level. */
			deleted_eb = 0;
		}

		index--;
	}

	ret = 0;
out:
	return ret;
}
6553 
6554 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6555 			     unsigned int clusters_to_del,
6556 			     struct inode *inode,
6557 			     struct buffer_head *fe_bh,
6558 			     handle_t *handle,
6559 			     struct ocfs2_truncate_context *tc,
6560 			     struct ocfs2_path *path)
6561 {
6562 	int status;
6563 	struct ocfs2_dinode *fe;
6564 	struct ocfs2_extent_block *last_eb = NULL;
6565 	struct ocfs2_extent_list *el;
6566 	struct buffer_head *last_eb_bh = NULL;
6567 	u64 delete_blk = 0;
6568 
6569 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
6570 
6571 	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6572 					     path, &last_eb_bh);
6573 	if (status < 0) {
6574 		mlog_errno(status);
6575 		goto bail;
6576 	}
6577 
6578 	/*
6579 	 * Each component will be touched, so we might as well journal
6580 	 * here to avoid having to handle errors later.
6581 	 */
6582 	status = ocfs2_journal_access_path(inode, handle, path);
6583 	if (status < 0) {
6584 		mlog_errno(status);
6585 		goto bail;
6586 	}
6587 
6588 	if (last_eb_bh) {
6589 		status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
6590 						 OCFS2_JOURNAL_ACCESS_WRITE);
6591 		if (status < 0) {
6592 			mlog_errno(status);
6593 			goto bail;
6594 		}
6595 
6596 		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6597 	}
6598 
6599 	el = &(fe->id2.i_list);
6600 
6601 	/*
6602 	 * Lower levels depend on this never happening, but it's best
6603 	 * to check it up here before changing the tree.
6604 	 */
6605 	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6606 		ocfs2_error(inode->i_sb,
6607 			    "Inode %lu has an empty extent record, depth %u\n",
6608 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
6609 		status = -EROFS;
6610 		goto bail;
6611 	}
6612 
6613 	vfs_dq_free_space_nodirty(inode,
6614 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6615 	spin_lock(&OCFS2_I(inode)->ip_lock);
6616 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6617 				      clusters_to_del;
6618 	spin_unlock(&OCFS2_I(inode)->ip_lock);
6619 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6620 	inode->i_blocks = ocfs2_inode_sector_count(inode);
6621 
6622 	status = ocfs2_trim_tree(inode, path, handle, tc,
6623 				 clusters_to_del, &delete_blk);
6624 	if (status) {
6625 		mlog_errno(status);
6626 		goto bail;
6627 	}
6628 
6629 	if (le32_to_cpu(fe->i_clusters) == 0) {
6630 		/* trunc to zero is a special case. */
6631 		el->l_tree_depth = 0;
6632 		fe->i_last_eb_blk = 0;
6633 	} else if (last_eb)
6634 		fe->i_last_eb_blk = last_eb->h_blkno;
6635 
6636 	status = ocfs2_journal_dirty(handle, fe_bh);
6637 	if (status < 0) {
6638 		mlog_errno(status);
6639 		goto bail;
6640 	}
6641 
6642 	if (last_eb) {
6643 		/* If there will be a new last extent block, then by
6644 		 * definition, there cannot be any leaves to the right of
6645 		 * him. */
6646 		last_eb->h_next_leaf_blk = 0;
6647 		status = ocfs2_journal_dirty(handle, last_eb_bh);
6648 		if (status < 0) {
6649 			mlog_errno(status);
6650 			goto bail;
6651 		}
6652 	}
6653 
6654 	if (delete_blk) {
6655 		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6656 						   clusters_to_del);
6657 		if (status < 0) {
6658 			mlog_errno(status);
6659 			goto bail;
6660 		}
6661 	}
6662 	status = 0;
6663 bail:
6664 
6665 	mlog_exit(status);
6666 	return status;
6667 }
6668 
6669 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6670 {
6671 	set_buffer_uptodate(bh);
6672 	mark_buffer_dirty(bh);
6673 	return 0;
6674 }
6675 
6676 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6677 				     unsigned int from, unsigned int to,
6678 				     struct page *page, int zero, u64 *phys)
6679 {
6680 	int ret, partial = 0;
6681 
6682 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6683 	if (ret)
6684 		mlog_errno(ret);
6685 
6686 	if (zero)
6687 		zero_user_segment(page, from, to);
6688 
6689 	/*
6690 	 * Need to set the buffers we zero'd into uptodate
6691 	 * here if they aren't - ocfs2_map_page_blocks()
6692 	 * might've skipped some
6693 	 */
6694 	ret = walk_page_buffers(handle, page_buffers(page),
6695 				from, to, &partial,
6696 				ocfs2_zero_func);
6697 	if (ret < 0)
6698 		mlog_errno(ret);
6699 	else if (ocfs2_should_order_data(inode)) {
6700 		ret = ocfs2_jbd2_file_inode(handle, inode);
6701 		if (ret < 0)
6702 			mlog_errno(ret);
6703 	}
6704 
6705 	if (!partial)
6706 		SetPageUptodate(page);
6707 
6708 	flush_dcache_page(page);
6709 }
6710 
6711 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6712 				     loff_t end, struct page **pages,
6713 				     int numpages, u64 phys, handle_t *handle)
6714 {
6715 	int i;
6716 	struct page *page;
6717 	unsigned int from, to = PAGE_CACHE_SIZE;
6718 	struct super_block *sb = inode->i_sb;
6719 
6720 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6721 
6722 	if (numpages == 0)
6723 		goto out;
6724 
6725 	to = PAGE_CACHE_SIZE;
6726 	for(i = 0; i < numpages; i++) {
6727 		page = pages[i];
6728 
6729 		from = start & (PAGE_CACHE_SIZE - 1);
6730 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6731 			to = end & (PAGE_CACHE_SIZE - 1);
6732 
6733 		BUG_ON(from > PAGE_CACHE_SIZE);
6734 		BUG_ON(to > PAGE_CACHE_SIZE);
6735 
6736 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6737 					 &phys);
6738 
6739 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6740 	}
6741 out:
6742 	if (pages)
6743 		ocfs2_unlock_and_free_pages(pages, numpages);
6744 }
6745 
6746 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6747 				struct page **pages, int *num)
6748 {
6749 	int numpages, ret = 0;
6750 	struct super_block *sb = inode->i_sb;
6751 	struct address_space *mapping = inode->i_mapping;
6752 	unsigned long index;
6753 	loff_t last_page_bytes;
6754 
6755 	BUG_ON(start > end);
6756 
6757 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6758 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6759 
6760 	numpages = 0;
6761 	last_page_bytes = PAGE_ALIGN(end);
6762 	index = start >> PAGE_CACHE_SHIFT;
6763 	do {
6764 		pages[numpages] = grab_cache_page(mapping, index);
6765 		if (!pages[numpages]) {
6766 			ret = -ENOMEM;
6767 			mlog_errno(ret);
6768 			goto out;
6769 		}
6770 
6771 		numpages++;
6772 		index++;
6773 	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6774 
6775 out:
6776 	if (ret != 0) {
6777 		if (pages)
6778 			ocfs2_unlock_and_free_pages(pages, numpages);
6779 		numpages = 0;
6780 	}
6781 
6782 	*num = numpages;
6783 
6784 	return ret;
6785 }
6786 
6787 /*
6788  * Zero the area past i_size but still within an allocated
6789  * cluster. This avoids exposing nonzero data on subsequent file
6790  * extends.
6791  *
6792  * We need to call this before i_size is updated on the inode because
6793  * otherwise block_write_full_page() will skip writeout of pages past
6794  * i_size. The new_i_size parameter is passed for this reason.
6795  */
6796 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6797 				  u64 range_start, u64 range_end)
6798 {
6799 	int ret = 0, numpages;
6800 	struct page **pages = NULL;
6801 	u64 phys;
6802 	unsigned int ext_flags;
6803 	struct super_block *sb = inode->i_sb;
6804 
6805 	/*
6806 	 * File systems which don't support sparse files zero on every
6807 	 * extend.
6808 	 */
6809 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
6810 		return 0;
6811 
6812 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
6813 			sizeof(struct page *), GFP_NOFS);
6814 	if (pages == NULL) {
6815 		ret = -ENOMEM;
6816 		mlog_errno(ret);
6817 		goto out;
6818 	}
6819 
6820 	if (range_start == range_end)
6821 		goto out;
6822 
6823 	ret = ocfs2_extent_map_get_blocks(inode,
6824 					  range_start >> sb->s_blocksize_bits,
6825 					  &phys, NULL, &ext_flags);
6826 	if (ret) {
6827 		mlog_errno(ret);
6828 		goto out;
6829 	}
6830 
6831 	/*
6832 	 * Tail is a hole, or is marked unwritten. In either case, we
6833 	 * can count on read and write to return/push zero's.
6834 	 */
6835 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
6836 		goto out;
6837 
6838 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
6839 				   &numpages);
6840 	if (ret) {
6841 		mlog_errno(ret);
6842 		goto out;
6843 	}
6844 
6845 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
6846 				 numpages, phys, handle);
6847 
6848 	/*
6849 	 * Initiate writeout of the pages we zero'd here. We don't
6850 	 * wait on them - the truncate_inode_pages() call later will
6851 	 * do that for us.
6852 	 */
6853 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
6854 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
6855 	if (ret)
6856 		mlog_errno(ret);
6857 
6858 out:
6859 	if (pages)
6860 		kfree(pages);
6861 
6862 	return ret;
6863 }
6864 
6865 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
6866 					     struct ocfs2_dinode *di)
6867 {
6868 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
6869 	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
6870 
6871 	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
6872 		memset(&di->id2, 0, blocksize -
6873 				    offsetof(struct ocfs2_dinode, id2) -
6874 				    xattrsize);
6875 	else
6876 		memset(&di->id2, 0, blocksize -
6877 				    offsetof(struct ocfs2_dinode, id2));
6878 }
6879 
6880 void ocfs2_dinode_new_extent_list(struct inode *inode,
6881 				  struct ocfs2_dinode *di)
6882 {
6883 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
6884 	di->id2.i_list.l_tree_depth = 0;
6885 	di->id2.i_list.l_next_free_rec = 0;
6886 	di->id2.i_list.l_count = cpu_to_le16(
6887 		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
6888 }
6889 
6890 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
6891 {
6892 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
6893 	struct ocfs2_inline_data *idata = &di->id2.i_data;
6894 
6895 	spin_lock(&oi->ip_lock);
6896 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
6897 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
6898 	spin_unlock(&oi->ip_lock);
6899 
6900 	/*
6901 	 * We clear the entire i_data structure here so that all
6902 	 * fields can be properly initialized.
6903 	 */
6904 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
6905 
6906 	idata->id_count = cpu_to_le16(
6907 			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
6908 }
6909 
/*
 * Convert an inode from inline data to an extent list.
 *
 * If the inode has data (i_size != 0), one cluster is reserved and
 * claimed, charged to quota, the inline data is read into the first
 * page cache page, the covering pages are mapped to the new cluster
 * and dirtied, and the cluster is inserted as the first extent.
 * In every case the inline-data flag is cleared and the dinode's id2
 * area is re-initialized as an empty extent list.
 *
 * Returns 0 on success or a negative error code.
 */
int ocfs2_convert_inline_data_to_extents(struct inode *inode,
					 struct buffer_head *di_bh)
{
	int ret, i, has_data, num_pages = 0;
	handle_t *handle;
	/* Only assigned and used on the has_data paths below. */
	u64 uninitialized_var(block);
	struct ocfs2_inode_info *oi = OCFS2_I(inode);
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct page **pages = NULL;
	loff_t end = osb->s_clustersize;
	struct ocfs2_extent_tree et;
	int did_quota = 0;

	has_data = i_size_read(inode) ? 1 : 0;

	if (has_data) {
		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
				sizeof(struct page *), GFP_NOFS);
		if (pages == NULL) {
			ret = -ENOMEM;
			mlog_errno(ret);
			goto out;
		}

		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	handle = ocfs2_start_trans(osb,
				   ocfs2_inline_to_extents_credits(osb->sb));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		mlog_errno(ret);
		goto out_unlock;
	}

	ret = ocfs2_journal_access_di(handle, inode, di_bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (ret) {
		mlog_errno(ret);
		goto out_commit;
	}

	if (has_data) {
		u32 bit_off, num;
		unsigned int page_end;
		u64 phys;

		/* Charge one cluster to quota; undone at out_commit on error. */
		if (vfs_dq_alloc_space_nodirty(inode,
				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
			ret = -EDQUOT;
			goto out_commit;
		}
		did_quota = 1;

		/*
		 * NOTE(review): the error paths after this point only
		 * undo the quota charge; nothing visible here returns
		 * the claimed cluster - confirm whether that is
		 * handled elsewhere.
		 */
		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
					   &num);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * Save two copies, one for insert, and one that can
		 * be changed by ocfs2_map_and_dirty_page() below.
		 */
		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);

		/*
		 * Non sparse file systems zero on extend, so no need
		 * to do that now.
		 */
		if (!ocfs2_sparse_alloc(osb) &&
		    PAGE_CACHE_SIZE < osb->s_clustersize)
			end = PAGE_CACHE_SIZE;

		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/*
		 * This should populate the 1st page for us and mark
		 * it up to date.
		 */
		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		/* Never map past the end of the cluster within a page. */
		page_end = PAGE_CACHE_SIZE;
		if (PAGE_CACHE_SIZE > osb->s_clustersize)
			page_end = osb->s_clustersize;

		/*
		 * Page 0 holds the inline data just read, so only the
		 * pages after it are zeroed (the 'i > 0' argument).
		 */
		for (i = 0; i < num_pages; i++)
			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
						 pages[i], i > 0, &phys);
	}

	/* Clear the inline-data flag in memory and on disk. */
	spin_lock(&oi->ip_lock);
	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
	spin_unlock(&oi->ip_lock);

	ocfs2_dinode_new_extent_list(inode, di);

	/* The return value of ocfs2_journal_dirty() is not checked here. */
	ocfs2_journal_dirty(handle, di_bh);

	if (has_data) {
		/*
		 * An error at this point should be extremely rare. If
		 * this proves to be false, we could always re-build
		 * the in-inode data from our pages.
		 */
		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
		ret = ocfs2_insert_extent(osb, handle, inode, &et,
					  0, block, 1, 0, NULL);
		if (ret) {
			mlog_errno(ret);
			goto out_commit;
		}

		inode->i_blocks = ocfs2_inode_sector_count(inode);
	}

out_commit:
	/* Give back the quota charge if anything failed after it. */
	if (ret < 0 && did_quota)
		vfs_dq_free_space_nodirty(inode,
					  ocfs2_clusters_to_bytes(osb->sb, 1));

	ocfs2_commit_trans(osb, handle);

out_unlock:
	if (data_ac)
		ocfs2_free_alloc_context(data_ac);

out:
	/* num_pages is 0 unless ocfs2_grab_eof_pages() succeeded. */
	if (pages) {
		ocfs2_unlock_and_free_pages(pages, num_pages);
		kfree(pages);
	}

	return ret;
}
7061 
/*
 * Remove all allocation beyond the inode's current i_size.
 *
 * The function loops (via the 'start' label), and on each pass looks
 * at the tail record of the rightmost leaf, works out how many
 * clusters of it lie beyond the new size, and removes them in a
 * transaction of their own. It stops once the inode has no clusters
 * left or the tail record lies entirely within the new size.
 *
 * On every exit a truncate log flush is scheduled, the deallocations
 * queued on tc->tc_dealloc are run, and the path and truncate context
 * are freed.
 *
 * It is expected, that by the time you call this function,
 * inode->i_size and fe->i_size have been adjusted.
 *
 * WARNING: This will kfree the truncate context
 */
int ocfs2_commit_truncate(struct ocfs2_super *osb,
			  struct inode *inode,
			  struct buffer_head *fe_bh,
			  struct ocfs2_truncate_context *tc)
{
	/* tl_sem tracks whether tl_inode->i_mutex is currently held. */
	int status, i, credits, tl_sem = 0;
	u32 clusters_to_del, new_highest_cpos, range;
	struct ocfs2_extent_list *el;
	handle_t *handle = NULL;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct ocfs2_path *path = NULL;
	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;

	mlog_entry_void();

	/* Number of clusters needed to hold the new i_size. */
	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
						     i_size_read(inode));

	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
			      ocfs2_journal_access_di);
	if (!path) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

	ocfs2_extent_map_trunc(inode, new_highest_cpos);

start:
	/*
	 * Check that we still have allocation to delete.
	 */
	if (OCFS2_I(inode)->ip_clusters == 0) {
		status = 0;
		goto bail;
	}

	/*
	 * Truncate always works against the rightmost tree branch.
	 */
	status = ocfs2_find_path(inode, path, UINT_MAX);
	if (status) {
		mlog_errno(status);
		goto bail;
	}

	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);

	/*
	 * By now, el will point to the extent list on the bottom most
	 * portion of this tree. Only the tail record is considered in
	 * each pass.
	 *
	 * We handle the following cases, in order:
	 * - empty extent: delete the remaining branch
	 * - remove the entire record
	 * - remove a partial record
	 * - no record needs to be removed (truncate has completed)
	 */
	el = path_leaf_el(path);
	if (le16_to_cpu(el->l_next_free_rec) == 0) {
		ocfs2_error(inode->i_sb,
			    "Inode %llu has empty extent block at %llu\n",
			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
		status = -EROFS;
		goto bail;
	}

	i = le16_to_cpu(el->l_next_free_rec) - 1;
	/* 'range' is the first cpos past the end of the tail record. */
	range = le32_to_cpu(el->l_recs[i].e_cpos) +
		ocfs2_rec_clusters(el, &el->l_recs[i]);
	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
		clusters_to_del = 0;
	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
	} else if (range > new_highest_cpos) {
		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
				   le32_to_cpu(el->l_recs[i].e_cpos)) -
				  new_highest_cpos;
	} else {
		status = 0;
		goto bail;
	}

	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);

	mutex_lock(&tl_inode->i_mutex);
	tl_sem = 1;
	/* ocfs2_truncate_log_needs_flush guarantees us at least one
	 * record is free for use. If there isn't any, we flush to get
	 * an empty truncate log.  */
	if (ocfs2_truncate_log_needs_flush(osb)) {
		status = __ocfs2_flush_truncate_log(osb);
		if (status < 0) {
			mlog_errno(status);
			goto bail;
		}
	}

	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
						(struct ocfs2_dinode *)fe_bh->b_data,
						el);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		/* Make sure 'bail' does not try to commit an error pointer. */
		handle = NULL;
		mlog_errno(status);
		goto bail;
	}

	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
				   tc, path);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	mutex_unlock(&tl_inode->i_mutex);
	tl_sem = 0;

	ocfs2_commit_trans(osb, handle);
	handle = NULL;

	/* Reset the path so the next pass can look it up afresh. */
	ocfs2_reinit_path(path, 1);

	/*
	 * The check above will catch the case where we've truncated
	 * away all allocation.
	 */
	goto start;

bail:

	ocfs2_schedule_truncate_log_flush(osb, 1);

	if (tl_sem)
		mutex_unlock(&tl_inode->i_mutex);

	if (handle)
		ocfs2_commit_trans(osb, handle);

	/* The return value of ocfs2_run_deallocs() is not checked. */
	ocfs2_run_deallocs(osb, &tc->tc_dealloc);

	ocfs2_free_path(path);

	/* This will drop the ext_alloc cluster lock for us */
	ocfs2_free_truncate_context(tc);

	mlog_exit(status);
	return status;
}
7222 
7223 /*
7224  * Expects the inode to already be locked.
7225  */
7226 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7227 			   struct inode *inode,
7228 			   struct buffer_head *fe_bh,
7229 			   struct ocfs2_truncate_context **tc)
7230 {
7231 	int status;
7232 	unsigned int new_i_clusters;
7233 	struct ocfs2_dinode *fe;
7234 	struct ocfs2_extent_block *eb;
7235 	struct buffer_head *last_eb_bh = NULL;
7236 
7237 	mlog_entry_void();
7238 
7239 	*tc = NULL;
7240 
7241 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7242 						  i_size_read(inode));
7243 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
7244 
7245 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7246 	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7247 	     (unsigned long long)le64_to_cpu(fe->i_size));
7248 
7249 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7250 	if (!(*tc)) {
7251 		status = -ENOMEM;
7252 		mlog_errno(status);
7253 		goto bail;
7254 	}
7255 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7256 
7257 	if (fe->id2.i_list.l_tree_depth) {
7258 		status = ocfs2_read_extent_block(inode,
7259 						 le64_to_cpu(fe->i_last_eb_blk),
7260 						 &last_eb_bh);
7261 		if (status < 0) {
7262 			mlog_errno(status);
7263 			goto bail;
7264 		}
7265 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7266 	}
7267 
7268 	(*tc)->tc_last_eb_bh = last_eb_bh;
7269 
7270 	status = 0;
7271 bail:
7272 	if (status < 0) {
7273 		if (*tc)
7274 			ocfs2_free_truncate_context(*tc);
7275 		*tc = NULL;
7276 	}
7277 	mlog_exit_void();
7278 	return status;
7279 }
7280 
7281 /*
7282  * 'start' is inclusive, 'end' is not.
7283  */
7284 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7285 			  unsigned int start, unsigned int end, int trunc)
7286 {
7287 	int ret;
7288 	unsigned int numbytes;
7289 	handle_t *handle;
7290 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7291 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7292 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7293 
7294 	if (end > i_size_read(inode))
7295 		end = i_size_read(inode);
7296 
7297 	BUG_ON(start >= end);
7298 
7299 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7300 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7301 	    !ocfs2_supports_inline_data(osb)) {
7302 		ocfs2_error(inode->i_sb,
7303 			    "Inline data flags for inode %llu don't agree! "
7304 			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7305 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7306 			    le16_to_cpu(di->i_dyn_features),
7307 			    OCFS2_I(inode)->ip_dyn_features,
7308 			    osb->s_feature_incompat);
7309 		ret = -EROFS;
7310 		goto out;
7311 	}
7312 
7313 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7314 	if (IS_ERR(handle)) {
7315 		ret = PTR_ERR(handle);
7316 		mlog_errno(ret);
7317 		goto out;
7318 	}
7319 
7320 	ret = ocfs2_journal_access_di(handle, inode, di_bh,
7321 				      OCFS2_JOURNAL_ACCESS_WRITE);
7322 	if (ret) {
7323 		mlog_errno(ret);
7324 		goto out_commit;
7325 	}
7326 
7327 	numbytes = end - start;
7328 	memset(idata->id_data + start, 0, numbytes);
7329 
7330 	/*
7331 	 * No need to worry about the data page here - it's been
7332 	 * truncated already and inline data doesn't need it for
7333 	 * pushing zero's to disk, so we'll let readpage pick it up
7334 	 * later.
7335 	 */
7336 	if (trunc) {
7337 		i_size_write(inode, start);
7338 		di->i_size = cpu_to_le64(start);
7339 	}
7340 
7341 	inode->i_blocks = ocfs2_inode_sector_count(inode);
7342 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7343 
7344 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7345 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7346 
7347 	ocfs2_journal_dirty(handle, di_bh);
7348 
7349 out_commit:
7350 	ocfs2_commit_trans(osb, handle);
7351 
7352 out:
7353 	return ret;
7354 }
7355 
7356 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7357 {
7358 	/*
7359 	 * The caller is responsible for completing deallocation
7360 	 * before freeing the context.
7361 	 */
7362 	if (tc->tc_dealloc.c_first_suballocator != NULL)
7363 		mlog(ML_NOTICE,
7364 		     "Truncate completion has non-empty dealloc context\n");
7365 
7366 	brelse(tc->tc_last_eb_bh);
7367 
7368 	kfree(tc);
7369 }
7370