xref: /openbmc/linux/fs/ocfs2/alloc.c (revision d9a0a1f83bf083b55b3c1f16efddecc31abace61)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * alloc.c
5  *
6  * Extent allocs and frees
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/swap.h>
31 #include <linux/quotaops.h>
32 
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35 
36 #include "ocfs2.h"
37 
38 #include "alloc.h"
39 #include "aops.h"
40 #include "blockcheck.h"
41 #include "dlmglue.h"
42 #include "extent_map.h"
43 #include "inode.h"
44 #include "journal.h"
45 #include "localalloc.h"
46 #include "suballoc.h"
47 #include "sysfile.h"
48 #include "file.h"
49 #include "super.h"
50 #include "uptodate.h"
51 #include "xattr.h"
52 
53 #include "buffer_head_io.h"
54 
55 
56 /*
57  * Operations for a specific extent tree type.
58  *
59  * To implement an on-disk btree (extent tree) type in ocfs2, add
60  * an ocfs2_extent_tree_operations structure and the matching
61  * ocfs2_init_<thingy>_extent_tree() function.  That's pretty much it
62  * for the allocation portion of the extent tree.
63  */
64 struct ocfs2_extent_tree_operations {
65 	/*
66 	 * last_eb_blk is the block number of the right most leaf extent
67 	 * block.  Most on-disk structures containing an extent tree store
68 	 * this value for fast access.  The ->eo_set_last_eb_blk() and
69 	 * ->eo_get_last_eb_blk() operations access this value.  They are
70 	 *  both required.
71 	 */
72 	void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et,
73 				   u64 blkno);
74 	u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et);
75 
76 	/*
77 	 * The on-disk structure usually keeps track of how many total
78 	 * clusters are stored in this extent tree.  This function updates
79 	 * that value.  new_clusters is the delta, and must be
80 	 * added to the total.  Required.
81 	 */
82 	void (*eo_update_clusters)(struct inode *inode,
83 				   struct ocfs2_extent_tree *et,
84 				   u32 new_clusters);
85 
86 	/*
87 	 * If ->eo_insert_check() exists, it is called before rec is
88 	 * inserted into the extent tree.  It is optional.
89 	 */
90 	int (*eo_insert_check)(struct inode *inode,
91 			       struct ocfs2_extent_tree *et,
92 			       struct ocfs2_extent_rec *rec);
93 	int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et);
94 
95 	/*
96 	 * --------------------------------------------------------------
97 	 * The remaining are internal to ocfs2_extent_tree and don't have
98 	 * accessor functions
99 	 */
100 
101 	/*
102 	 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el.
103 	 * It is required.
104 	 */
105 	void (*eo_fill_root_el)(struct ocfs2_extent_tree *et);
106 
107 	/*
108 	 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if
109 	 * it exists.  If it does not, et->et_max_leaf_clusters is set
110 	 * to 0 (unlimited).  Optional.
111 	 */
112 	void (*eo_fill_max_leaf_clusters)(struct inode *inode,
113 					  struct ocfs2_extent_tree *et);
114 };
115 
116 
117 /*
118  * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check
119  * in the methods.
120  */
121 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et);
122 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
123 					 u64 blkno);
124 static void ocfs2_dinode_update_clusters(struct inode *inode,
125 					 struct ocfs2_extent_tree *et,
126 					 u32 clusters);
127 static int ocfs2_dinode_insert_check(struct inode *inode,
128 				     struct ocfs2_extent_tree *et,
129 				     struct ocfs2_extent_rec *rec);
130 static int ocfs2_dinode_sanity_check(struct inode *inode,
131 				     struct ocfs2_extent_tree *et);
132 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
134 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
135 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
136 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
137 	.eo_insert_check	= ocfs2_dinode_insert_check,
138 	.eo_sanity_check	= ocfs2_dinode_sanity_check,
139 	.eo_fill_root_el	= ocfs2_dinode_fill_root_el,
140 };
141 
142 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et,
143 					 u64 blkno)
144 {
145 	struct ocfs2_dinode *di = et->et_object;
146 
147 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
148 	di->i_last_eb_blk = cpu_to_le64(blkno);
149 }
150 
151 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et)
152 {
153 	struct ocfs2_dinode *di = et->et_object;
154 
155 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
156 	return le64_to_cpu(di->i_last_eb_blk);
157 }
158 
159 static void ocfs2_dinode_update_clusters(struct inode *inode,
160 					 struct ocfs2_extent_tree *et,
161 					 u32 clusters)
162 {
163 	struct ocfs2_dinode *di = et->et_object;
164 
165 	le32_add_cpu(&di->i_clusters, clusters);
166 	spin_lock(&OCFS2_I(inode)->ip_lock);
167 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
168 	spin_unlock(&OCFS2_I(inode)->ip_lock);
169 }
170 
171 static int ocfs2_dinode_insert_check(struct inode *inode,
172 				     struct ocfs2_extent_tree *et,
173 				     struct ocfs2_extent_rec *rec)
174 {
175 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
176 
177 	BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL);
178 	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
179 			(OCFS2_I(inode)->ip_clusters !=
180 			 le32_to_cpu(rec->e_cpos)),
181 			"Device %s, asking for sparse allocation: inode %llu, "
182 			"cpos %u, clusters %u\n",
183 			osb->dev_str,
184 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
185 			rec->e_cpos,
186 			OCFS2_I(inode)->ip_clusters);
187 
188 	return 0;
189 }
190 
191 static int ocfs2_dinode_sanity_check(struct inode *inode,
192 				     struct ocfs2_extent_tree *et)
193 {
194 	struct ocfs2_dinode *di = et->et_object;
195 
196 	BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
197 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
198 
199 	return 0;
200 }
201 
202 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
203 {
204 	struct ocfs2_dinode *di = et->et_object;
205 
206 	et->et_root_el = &di->id2.i_list;
207 }
208 
209 
210 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
211 {
212 	struct ocfs2_xattr_value_buf *vb = et->et_object;
213 
214 	et->et_root_el = &vb->vb_xv->xr_list;
215 }
216 
217 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
218 					      u64 blkno)
219 {
220 	struct ocfs2_xattr_value_buf *vb = et->et_object;
221 
222 	vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
223 }
224 
225 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
226 {
227 	struct ocfs2_xattr_value_buf *vb = et->et_object;
228 
229 	return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
230 }
231 
232 static void ocfs2_xattr_value_update_clusters(struct inode *inode,
233 					      struct ocfs2_extent_tree *et,
234 					      u32 clusters)
235 {
236 	struct ocfs2_xattr_value_buf *vb = et->et_object;
237 
238 	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
239 }
240 
241 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
242 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
243 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
244 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
245 	.eo_fill_root_el	= ocfs2_xattr_value_fill_root_el,
246 };
247 
248 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et)
249 {
250 	struct ocfs2_xattr_block *xb = et->et_object;
251 
252 	et->et_root_el = &xb->xb_attrs.xb_root.xt_list;
253 }
254 
255 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode,
256 						    struct ocfs2_extent_tree *et)
257 {
258 	et->et_max_leaf_clusters =
259 		ocfs2_clusters_for_bytes(inode->i_sb,
260 					 OCFS2_MAX_XATTR_TREE_LEAF_SIZE);
261 }
262 
263 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et,
264 					     u64 blkno)
265 {
266 	struct ocfs2_xattr_block *xb = et->et_object;
267 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
268 
269 	xt->xt_last_eb_blk = cpu_to_le64(blkno);
270 }
271 
272 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et)
273 {
274 	struct ocfs2_xattr_block *xb = et->et_object;
275 	struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root;
276 
277 	return le64_to_cpu(xt->xt_last_eb_blk);
278 }
279 
280 static void ocfs2_xattr_tree_update_clusters(struct inode *inode,
281 					     struct ocfs2_extent_tree *et,
282 					     u32 clusters)
283 {
284 	struct ocfs2_xattr_block *xb = et->et_object;
285 
286 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
287 }
288 
289 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
290 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
291 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
292 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
293 	.eo_fill_root_el	= ocfs2_xattr_tree_fill_root_el,
294 	.eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters,
295 };
296 
297 static void ocfs2_dx_root_set_last_eb_blk(struct ocfs2_extent_tree *et,
298 					  u64 blkno)
299 {
300 	struct ocfs2_dx_root_block *dx_root = et->et_object;
301 
302 	dx_root->dr_last_eb_blk = cpu_to_le64(blkno);
303 }
304 
305 static u64 ocfs2_dx_root_get_last_eb_blk(struct ocfs2_extent_tree *et)
306 {
307 	struct ocfs2_dx_root_block *dx_root = et->et_object;
308 
309 	return le64_to_cpu(dx_root->dr_last_eb_blk);
310 }
311 
312 static void ocfs2_dx_root_update_clusters(struct inode *inode,
313 					  struct ocfs2_extent_tree *et,
314 					  u32 clusters)
315 {
316 	struct ocfs2_dx_root_block *dx_root = et->et_object;
317 
318 	le32_add_cpu(&dx_root->dr_clusters, clusters);
319 }
320 
321 static int ocfs2_dx_root_sanity_check(struct inode *inode,
322 				      struct ocfs2_extent_tree *et)
323 {
324 	struct ocfs2_dx_root_block *dx_root = et->et_object;
325 
326 	BUG_ON(!OCFS2_IS_VALID_DX_ROOT(dx_root));
327 
328 	return 0;
329 }
330 
331 static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
332 {
333 	struct ocfs2_dx_root_block *dx_root = et->et_object;
334 
335 	et->et_root_el = &dx_root->dr_list;
336 }
337 
338 static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
339 	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
340 	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
341 	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
342 	.eo_sanity_check	= ocfs2_dx_root_sanity_check,
343 	.eo_fill_root_el	= ocfs2_dx_root_fill_root_el,
344 };
345 
346 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
347 				     struct inode *inode,
348 				     struct buffer_head *bh,
349 				     ocfs2_journal_access_func access,
350 				     void *obj,
351 				     struct ocfs2_extent_tree_operations *ops)
352 {
353 	et->et_ops = ops;
354 	et->et_root_bh = bh;
355 	et->et_ci = INODE_CACHE(inode);
356 	et->et_root_journal_access = access;
357 	if (!obj)
358 		obj = (void *)bh->b_data;
359 	et->et_object = obj;
360 
361 	et->et_ops->eo_fill_root_el(et);
362 	if (!et->et_ops->eo_fill_max_leaf_clusters)
363 		et->et_max_leaf_clusters = 0;
364 	else
365 		et->et_ops->eo_fill_max_leaf_clusters(inode, et);
366 }
367 
368 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
369 				   struct inode *inode,
370 				   struct buffer_head *bh)
371 {
372 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
373 				 NULL, &ocfs2_dinode_et_ops);
374 }
375 
376 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
377 				       struct inode *inode,
378 				       struct buffer_head *bh)
379 {
380 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
381 				 NULL, &ocfs2_xattr_tree_et_ops);
382 }
383 
384 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
385 					struct inode *inode,
386 					struct ocfs2_xattr_value_buf *vb)
387 {
388 	__ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
389 				 &ocfs2_xattr_value_et_ops);
390 }
391 
392 void ocfs2_init_dx_root_extent_tree(struct ocfs2_extent_tree *et,
393 				    struct inode *inode,
394 				    struct buffer_head *bh)
395 {
396 	__ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_dr,
397 				 NULL, &ocfs2_dx_root_et_ops);
398 }
399 
400 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et,
401 					    u64 new_last_eb_blk)
402 {
403 	et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk);
404 }
405 
406 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et)
407 {
408 	return et->et_ops->eo_get_last_eb_blk(et);
409 }
410 
411 static inline void ocfs2_et_update_clusters(struct inode *inode,
412 					    struct ocfs2_extent_tree *et,
413 					    u32 clusters)
414 {
415 	et->et_ops->eo_update_clusters(inode, et, clusters);
416 }
417 
418 static inline int ocfs2_et_root_journal_access(handle_t *handle,
419 					       struct ocfs2_extent_tree *et,
420 					       int type)
421 {
422 	return et->et_root_journal_access(handle, et->et_ci, et->et_root_bh,
423 					  type);
424 }
425 
426 static inline int ocfs2_et_insert_check(struct inode *inode,
427 					struct ocfs2_extent_tree *et,
428 					struct ocfs2_extent_rec *rec)
429 {
430 	int ret = 0;
431 
432 	if (et->et_ops->eo_insert_check)
433 		ret = et->et_ops->eo_insert_check(inode, et, rec);
434 	return ret;
435 }
436 
437 static inline int ocfs2_et_sanity_check(struct inode *inode,
438 					struct ocfs2_extent_tree *et)
439 {
440 	int ret = 0;
441 
442 	if (et->et_ops->eo_sanity_check)
443 		ret = et->et_ops->eo_sanity_check(inode, et);
444 	return ret;
445 }
446 
/* Forward declarations of static helpers. */
static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
static int
ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
			      struct ocfs2_extent_block *eb);
450 
451 /*
452  * Structures which describe a path through a btree, and functions to
453  * manipulate them.
454  *
455  * The idea here is to be as generic as possible with the tree
456  * manipulation code.
457  */
/* One level of a path: a buffer head and the extent list for that level. */
struct ocfs2_path_item {
	struct buffer_head *bh;
	struct ocfs2_extent_list *el;
};
462 
463 #define OCFS2_MAX_PATH_DEPTH	5
464 
465 struct ocfs2_path {
466 	int				p_tree_depth;
467 	ocfs2_journal_access_func	p_root_access;
468 	struct ocfs2_path_item		p_node[OCFS2_MAX_PATH_DEPTH];
469 };
470 
/* Root accessors: the root always lives in p_node[0]. */
#define path_root_bh(_path)	((_path)->p_node[0].bh)
#define path_root_el(_path)	((_path)->p_node[0].el)
#define path_root_access(_path)	((_path)->p_root_access)
/* Leaf accessors: the leaf lives at index p_tree_depth. */
#define path_leaf_bh(_path)	((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path)	((_path)->p_node[(_path)->p_tree_depth].el)
/* Number of nodes on the path, root included. */
#define path_num_items(_path)	((_path)->p_tree_depth + 1)
477 
478 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
479 			   u32 cpos);
480 static void ocfs2_adjust_rightmost_records(struct inode *inode,
481 					   handle_t *handle,
482 					   struct ocfs2_path *path,
483 					   struct ocfs2_extent_rec *insert_rec);
484 /*
485  * Reset the actual path elements so that we can re-use the structure
486  * to build another path. Generally, this involves freeing the buffer
487  * heads.
488  */
489 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
490 {
491 	int i, start = 0, depth = 0;
492 	struct ocfs2_path_item *node;
493 
494 	if (keep_root)
495 		start = 1;
496 
497 	for(i = start; i < path_num_items(path); i++) {
498 		node = &path->p_node[i];
499 
500 		brelse(node->bh);
501 		node->bh = NULL;
502 		node->el = NULL;
503 	}
504 
505 	/*
506 	 * Tree depth may change during truncate, or insert. If we're
507 	 * keeping the root extent list, then make sure that our path
508 	 * structure reflects the proper depth.
509 	 */
510 	if (keep_root)
511 		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
512 	else
513 		path_root_access(path) = NULL;
514 
515 	path->p_tree_depth = depth;
516 }
517 
/*
 * Release every buffer head held by path, root included, and free the
 * structure.  A NULL path is a no-op.
 */
static void ocfs2_free_path(struct ocfs2_path *path)
{
	if (!path)
		return;

	ocfs2_reinit_path(path, 0);
	kfree(path);
}
525 
/*
 * Copy all the elements of src into dest. After this call, src could be
 * freed without affecting dest.
 *
 * Both paths should have the same root. Any non-root elements of dest
 * will be freed.
 */
533 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
534 {
535 	int i;
536 
537 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
538 	BUG_ON(path_root_el(dest) != path_root_el(src));
539 	BUG_ON(path_root_access(dest) != path_root_access(src));
540 
541 	ocfs2_reinit_path(dest, 1);
542 
543 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
544 		dest->p_node[i].bh = src->p_node[i].bh;
545 		dest->p_node[i].el = src->p_node[i].el;
546 
547 		if (dest->p_node[i].bh)
548 			get_bh(dest->p_node[i].bh);
549 	}
550 }
551 
552 /*
553  * Make the *dest path the same as src and re-initialize src path to
554  * have a root only.
555  */
556 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
557 {
558 	int i;
559 
560 	BUG_ON(path_root_bh(dest) != path_root_bh(src));
561 	BUG_ON(path_root_access(dest) != path_root_access(src));
562 
563 	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
564 		brelse(dest->p_node[i].bh);
565 
566 		dest->p_node[i].bh = src->p_node[i].bh;
567 		dest->p_node[i].el = src->p_node[i].el;
568 
569 		src->p_node[i].bh = NULL;
570 		src->p_node[i].el = NULL;
571 	}
572 }
573 
574 /*
575  * Insert an extent block at given index.
576  *
577  * This will not take an additional reference on eb_bh.
578  */
579 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
580 					struct buffer_head *eb_bh)
581 {
582 	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
583 
584 	/*
585 	 * Right now, no root bh is an extent block, so this helps
586 	 * catch code errors with dinode trees. The assertion can be
587 	 * safely removed if we ever need to insert extent block
588 	 * structures at the root.
589 	 */
590 	BUG_ON(index == 0);
591 
592 	path->p_node[index].bh = eb_bh;
593 	path->p_node[index].el = &eb->h_list;
594 }
595 
596 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
597 					 struct ocfs2_extent_list *root_el,
598 					 ocfs2_journal_access_func access)
599 {
600 	struct ocfs2_path *path;
601 
602 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
603 
604 	path = kzalloc(sizeof(*path), GFP_NOFS);
605 	if (path) {
606 		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
607 		get_bh(root_bh);
608 		path_root_bh(path) = root_bh;
609 		path_root_el(path) = root_el;
610 		path_root_access(path) = access;
611 	}
612 
613 	return path;
614 }
615 
/* Allocate a new root-only path that shares the root of an existing path. */
static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
{
	struct buffer_head *root_bh = path_root_bh(path);
	struct ocfs2_extent_list *root_el = path_root_el(path);

	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
}
621 
622 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
623 {
624 	return ocfs2_new_path(et->et_root_bh, et->et_root_el,
625 			      et->et_root_journal_access);
626 }
627 
628 /*
629  * Journal the buffer at depth idx.  All idx>0 are extent_blocks,
630  * otherwise it's the root_access function.
631  *
632  * I don't like the way this function's name looks next to
633  * ocfs2_journal_access_path(), but I don't have a better one.
634  */
635 static int ocfs2_path_bh_journal_access(handle_t *handle,
636 					struct ocfs2_caching_info *ci,
637 					struct ocfs2_path *path,
638 					int idx)
639 {
640 	ocfs2_journal_access_func access = path_root_access(path);
641 
642 	if (!access)
643 		access = ocfs2_journal_access;
644 
645 	if (idx)
646 		access = ocfs2_journal_access_eb;
647 
648 	return access(handle, ci, path->p_node[idx].bh,
649 		      OCFS2_JOURNAL_ACCESS_WRITE);
650 }
651 
652 /*
653  * Convenience function to journal all components in a path.
654  */
655 static int ocfs2_journal_access_path(struct ocfs2_caching_info *ci,
656 				     handle_t *handle,
657 				     struct ocfs2_path *path)
658 {
659 	int i, ret = 0;
660 
661 	if (!path)
662 		goto out;
663 
664 	for(i = 0; i < path_num_items(path); i++) {
665 		ret = ocfs2_path_bh_journal_access(handle, ci, path, i);
666 		if (ret < 0) {
667 			mlog_errno(ret);
668 			goto out;
669 		}
670 	}
671 
672 out:
673 	return ret;
674 }
675 
676 /*
677  * Return the index of the extent record which contains cluster #v_cluster.
678  * -1 is returned if it was not found.
679  *
680  * Should work fine on interior and exterior nodes.
681  */
682 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster)
683 {
684 	int ret = -1;
685 	int i;
686 	struct ocfs2_extent_rec *rec;
687 	u32 rec_end, rec_start, clusters;
688 
689 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
690 		rec = &el->l_recs[i];
691 
692 		rec_start = le32_to_cpu(rec->e_cpos);
693 		clusters = ocfs2_rec_clusters(el, rec);
694 
695 		rec_end = rec_start + clusters;
696 
697 		if (v_cluster >= rec_start && v_cluster < rec_end) {
698 			ret = i;
699 			break;
700 		}
701 	}
702 
703 	return ret;
704 }
705 
/*
 * How a record to be inserted relates to an existing record.
 * ocfs2_extent_contig() returns CONTIG_RIGHT when the new record
 * directly follows the existing one and CONTIG_LEFT when it directly
 * precedes it.
 */
enum ocfs2_contig_type {
	CONTIG_NONE = 0,	/* not contiguous */
	CONTIG_LEFT,
	CONTIG_RIGHT,
	CONTIG_LEFTRIGHT,
};
712 
713 
714 /*
715  * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
716  * ocfs2_extent_contig only work properly against leaf nodes!
717  */
718 static int ocfs2_block_extent_contig(struct super_block *sb,
719 				     struct ocfs2_extent_rec *ext,
720 				     u64 blkno)
721 {
722 	u64 blk_end = le64_to_cpu(ext->e_blkno);
723 
724 	blk_end += ocfs2_clusters_to_blocks(sb,
725 				    le16_to_cpu(ext->e_leaf_clusters));
726 
727 	return blkno == blk_end;
728 }
729 
730 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
731 				  struct ocfs2_extent_rec *right)
732 {
733 	u32 left_range;
734 
735 	left_range = le32_to_cpu(left->e_cpos) +
736 		le16_to_cpu(left->e_leaf_clusters);
737 
738 	return (left_range == le32_to_cpu(right->e_cpos));
739 }
740 
741 static enum ocfs2_contig_type
742 	ocfs2_extent_contig(struct inode *inode,
743 			    struct ocfs2_extent_rec *ext,
744 			    struct ocfs2_extent_rec *insert_rec)
745 {
746 	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
747 
748 	/*
749 	 * Refuse to coalesce extent records with different flag
750 	 * fields - we don't want to mix unwritten extents with user
751 	 * data.
752 	 */
753 	if (ext->e_flags != insert_rec->e_flags)
754 		return CONTIG_NONE;
755 
756 	if (ocfs2_extents_adjacent(ext, insert_rec) &&
757 	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
758 			return CONTIG_RIGHT;
759 
760 	blkno = le64_to_cpu(ext->e_blkno);
761 	if (ocfs2_extents_adjacent(insert_rec, ext) &&
762 	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
763 		return CONTIG_LEFT;
764 
765 	return CONTIG_NONE;
766 }
767 
768 /*
769  * NOTE: We can have pretty much any combination of contiguousness and
770  * appending.
771  *
772  * The usefulness of APPEND_TAIL is more in that it lets us know that
773  * we'll have to update the path to that leaf.
774  */
enum ocfs2_append_type {
	APPEND_NONE = 0,	/* not an append */
	APPEND_TAIL,		/* path to the leaf will need updating */
};
779 
/* NOTE(review): presumably describes which side of a record a split keeps; confirm against users. */
enum ocfs2_split_type {
	SPLIT_NONE = 0,		/* no split */
	SPLIT_LEFT,
	SPLIT_RIGHT,
};
785 
786 struct ocfs2_insert_type {
787 	enum ocfs2_split_type	ins_split;
788 	enum ocfs2_append_type	ins_appending;
789 	enum ocfs2_contig_type	ins_contig;
790 	int			ins_contig_index;
791 	int			ins_tree_depth;
792 };
793 
794 struct ocfs2_merge_ctxt {
795 	enum ocfs2_contig_type	c_contig_type;
796 	int			c_has_empty_extent;
797 	int			c_split_covers_rec;
798 };
799 
800 static int ocfs2_validate_extent_block(struct super_block *sb,
801 				       struct buffer_head *bh)
802 {
803 	int rc;
804 	struct ocfs2_extent_block *eb =
805 		(struct ocfs2_extent_block *)bh->b_data;
806 
807 	mlog(0, "Validating extent block %llu\n",
808 	     (unsigned long long)bh->b_blocknr);
809 
810 	BUG_ON(!buffer_uptodate(bh));
811 
812 	/*
813 	 * If the ecc fails, we return the error but otherwise
814 	 * leave the filesystem running.  We know any error is
815 	 * local to this block.
816 	 */
817 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
818 	if (rc) {
819 		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
820 		     (unsigned long long)bh->b_blocknr);
821 		return rc;
822 	}
823 
824 	/*
825 	 * Errors after here are fatal.
826 	 */
827 
828 	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
829 		ocfs2_error(sb,
830 			    "Extent block #%llu has bad signature %.*s",
831 			    (unsigned long long)bh->b_blocknr, 7,
832 			    eb->h_signature);
833 		return -EINVAL;
834 	}
835 
836 	if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
837 		ocfs2_error(sb,
838 			    "Extent block #%llu has an invalid h_blkno "
839 			    "of %llu",
840 			    (unsigned long long)bh->b_blocknr,
841 			    (unsigned long long)le64_to_cpu(eb->h_blkno));
842 		return -EINVAL;
843 	}
844 
845 	if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) {
846 		ocfs2_error(sb,
847 			    "Extent block #%llu has an invalid "
848 			    "h_fs_generation of #%u",
849 			    (unsigned long long)bh->b_blocknr,
850 			    le32_to_cpu(eb->h_fs_generation));
851 		return -EINVAL;
852 	}
853 
854 	return 0;
855 }
856 
857 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
858 			    struct buffer_head **bh)
859 {
860 	int rc;
861 	struct buffer_head *tmp = *bh;
862 
863 	rc = ocfs2_read_block(INODE_CACHE(inode), eb_blkno, &tmp,
864 			      ocfs2_validate_extent_block);
865 
866 	/* If ocfs2_read_block() got us a new bh, pass it up. */
867 	if (!rc && !*bh)
868 		*bh = tmp;
869 
870 	return rc;
871 }
872 
873 
874 /*
875  * How many free extents have we got before we need more meta data?
876  */
877 int ocfs2_num_free_extents(struct ocfs2_super *osb,
878 			   struct inode *inode,
879 			   struct ocfs2_extent_tree *et)
880 {
881 	int retval;
882 	struct ocfs2_extent_list *el = NULL;
883 	struct ocfs2_extent_block *eb;
884 	struct buffer_head *eb_bh = NULL;
885 	u64 last_eb_blk = 0;
886 
887 	mlog_entry_void();
888 
889 	el = et->et_root_el;
890 	last_eb_blk = ocfs2_et_get_last_eb_blk(et);
891 
892 	if (last_eb_blk) {
893 		retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
894 		if (retval < 0) {
895 			mlog_errno(retval);
896 			goto bail;
897 		}
898 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
899 		el = &eb->h_list;
900 	}
901 
902 	BUG_ON(el->l_tree_depth != 0);
903 
904 	retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec);
905 bail:
906 	brelse(eb_bh);
907 
908 	mlog_exit(retval);
909 	return retval;
910 }
911 
912 /* expects array to already be allocated
913  *
914  * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and
915  * l_count for you
916  */
917 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
918 				     handle_t *handle,
919 				     struct inode *inode,
920 				     int wanted,
921 				     struct ocfs2_alloc_context *meta_ac,
922 				     struct buffer_head *bhs[])
923 {
924 	int count, status, i;
925 	u16 suballoc_bit_start;
926 	u32 num_got;
927 	u64 first_blkno;
928 	struct ocfs2_extent_block *eb;
929 
930 	mlog_entry_void();
931 
932 	count = 0;
933 	while (count < wanted) {
934 		status = ocfs2_claim_metadata(osb,
935 					      handle,
936 					      meta_ac,
937 					      wanted - count,
938 					      &suballoc_bit_start,
939 					      &num_got,
940 					      &first_blkno);
941 		if (status < 0) {
942 			mlog_errno(status);
943 			goto bail;
944 		}
945 
946 		for(i = count;  i < (num_got + count); i++) {
947 			bhs[i] = sb_getblk(osb->sb, first_blkno);
948 			if (bhs[i] == NULL) {
949 				status = -EIO;
950 				mlog_errno(status);
951 				goto bail;
952 			}
953 			ocfs2_set_new_buffer_uptodate(INODE_CACHE(inode),
954 						      bhs[i]);
955 
956 			status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), bhs[i],
957 							 OCFS2_JOURNAL_ACCESS_CREATE);
958 			if (status < 0) {
959 				mlog_errno(status);
960 				goto bail;
961 			}
962 
963 			memset(bhs[i]->b_data, 0, osb->sb->s_blocksize);
964 			eb = (struct ocfs2_extent_block *) bhs[i]->b_data;
965 			/* Ok, setup the minimal stuff here. */
966 			strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE);
967 			eb->h_blkno = cpu_to_le64(first_blkno);
968 			eb->h_fs_generation = cpu_to_le32(osb->fs_generation);
969 			eb->h_suballoc_slot = cpu_to_le16(osb->slot_num);
970 			eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start);
971 			eb->h_list.l_count =
972 				cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb));
973 
974 			suballoc_bit_start++;
975 			first_blkno++;
976 
977 			/* We'll also be dirtied by the caller, so
978 			 * this isn't absolutely necessary. */
979 			status = ocfs2_journal_dirty(handle, bhs[i]);
980 			if (status < 0) {
981 				mlog_errno(status);
982 				goto bail;
983 			}
984 		}
985 
986 		count += num_got;
987 	}
988 
989 	status = 0;
990 bail:
991 	if (status < 0) {
992 		for(i = 0; i < wanted; i++) {
993 			brelse(bhs[i]);
994 			bhs[i] = NULL;
995 		}
996 	}
997 	mlog_exit(status);
998 	return status;
999 }
1000 
1001 /*
1002  * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
1003  *
1004  * Returns the sum of the rightmost extent rec logical offset and
1005  * cluster count.
1006  *
1007  * ocfs2_add_branch() uses this to determine what logical cluster
1008  * value should be populated into the leftmost new branch records.
1009  *
1010  * ocfs2_shift_tree_depth() uses this to determine the # clusters
1011  * value for the new topmost tree record.
1012  */
1013 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
1014 {
1015 	int i;
1016 
1017 	i = le16_to_cpu(el->l_next_free_rec) - 1;
1018 
1019 	return le32_to_cpu(el->l_recs[i].e_cpos) +
1020 		ocfs2_rec_clusters(el, &el->l_recs[i]);
1021 }
1022 
1023 /*
1024  * Change range of the branches in the right most path according to the leaf
1025  * extent block's rightmost record.
1026  */
1027 static int ocfs2_adjust_rightmost_branch(handle_t *handle,
1028 					 struct inode *inode,
1029 					 struct ocfs2_extent_tree *et)
1030 {
1031 	int status;
1032 	struct ocfs2_path *path = NULL;
1033 	struct ocfs2_extent_list *el;
1034 	struct ocfs2_extent_rec *rec;
1035 
1036 	path = ocfs2_new_path_from_et(et);
1037 	if (!path) {
1038 		status = -ENOMEM;
1039 		return status;
1040 	}
1041 
1042 	status = ocfs2_find_path(inode, path, UINT_MAX);
1043 	if (status < 0) {
1044 		mlog_errno(status);
1045 		goto out;
1046 	}
1047 
1048 	status = ocfs2_extend_trans(handle, path_num_items(path) +
1049 				    handle->h_buffer_credits);
1050 	if (status < 0) {
1051 		mlog_errno(status);
1052 		goto out;
1053 	}
1054 
1055 	status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
1056 	if (status < 0) {
1057 		mlog_errno(status);
1058 		goto out;
1059 	}
1060 
1061 	el = path_leaf_el(path);
1062 	rec = &el->l_recs[le32_to_cpu(el->l_next_free_rec) - 1];
1063 
1064 	ocfs2_adjust_rightmost_records(inode, handle, path, rec);
1065 
1066 out:
1067 	ocfs2_free_path(path);
1068 	return status;
1069 }
1070 
1071 /*
1072  * Add an entire tree branch to our inode. eb_bh is the extent block
1073  * to start at, if we don't want to start the branch at the dinode
1074  * structure.
1075  *
 * last_eb_bh is required as we have to update its next_leaf pointer
1077  * for the new last extent block.
1078  *
1079  * the new branch will be 'empty' in the sense that every block will
1080  * contain a single record with cluster count == 0.
1081  */
1082 static int ocfs2_add_branch(struct ocfs2_super *osb,
1083 			    handle_t *handle,
1084 			    struct inode *inode,
1085 			    struct ocfs2_extent_tree *et,
1086 			    struct buffer_head *eb_bh,
1087 			    struct buffer_head **last_eb_bh,
1088 			    struct ocfs2_alloc_context *meta_ac)
1089 {
1090 	int status, new_blocks, i;
1091 	u64 next_blkno, new_last_eb_blk;
1092 	struct buffer_head *bh;
1093 	struct buffer_head **new_eb_bhs = NULL;
1094 	struct ocfs2_extent_block *eb;
1095 	struct ocfs2_extent_list  *eb_el;
1096 	struct ocfs2_extent_list  *el;
1097 	u32 new_cpos, root_end;
1098 
1099 	mlog_entry_void();
1100 
1101 	BUG_ON(!last_eb_bh || !*last_eb_bh);
1102 
1103 	if (eb_bh) {
1104 		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
1105 		el = &eb->h_list;
1106 	} else
1107 		el = et->et_root_el;
1108 
1109 	/* we never add a branch to a leaf. */
1110 	BUG_ON(!el->l_tree_depth);
1111 
1112 	new_blocks = le16_to_cpu(el->l_tree_depth);
1113 
1114 	eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data;
1115 	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
1116 	root_end = ocfs2_sum_rightmost_rec(et->et_root_el);
1117 
1118 	/*
1119 	 * If there is a gap before the root end and the real end
1120 	 * of the righmost leaf block, we need to remove the gap
1121 	 * between new_cpos and root_end first so that the tree
1122 	 * is consistent after we add a new branch(it will start
1123 	 * from new_cpos).
1124 	 */
1125 	if (root_end > new_cpos) {
1126 		mlog(0, "adjust the cluster end from %u to %u\n",
1127 		     root_end, new_cpos);
1128 		status = ocfs2_adjust_rightmost_branch(handle, inode, et);
1129 		if (status) {
1130 			mlog_errno(status);
1131 			goto bail;
1132 		}
1133 	}
1134 
1135 	/* allocate the number of new eb blocks we need */
1136 	new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *),
1137 			     GFP_KERNEL);
1138 	if (!new_eb_bhs) {
1139 		status = -ENOMEM;
1140 		mlog_errno(status);
1141 		goto bail;
1142 	}
1143 
1144 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks,
1145 					   meta_ac, new_eb_bhs);
1146 	if (status < 0) {
1147 		mlog_errno(status);
1148 		goto bail;
1149 	}
1150 
1151 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
1152 	 * linked with the rest of the tree.
1153 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
1154 	 *
1155 	 * when we leave the loop, new_last_eb_blk will point to the
1156 	 * newest leaf, and next_blkno will point to the topmost extent
1157 	 * block. */
1158 	next_blkno = new_last_eb_blk = 0;
1159 	for(i = 0; i < new_blocks; i++) {
1160 		bh = new_eb_bhs[i];
1161 		eb = (struct ocfs2_extent_block *) bh->b_data;
1162 		/* ocfs2_create_new_meta_bhs() should create it right! */
1163 		BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1164 		eb_el = &eb->h_list;
1165 
1166 		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), bh,
1167 						 OCFS2_JOURNAL_ACCESS_CREATE);
1168 		if (status < 0) {
1169 			mlog_errno(status);
1170 			goto bail;
1171 		}
1172 
1173 		eb->h_next_leaf_blk = 0;
1174 		eb_el->l_tree_depth = cpu_to_le16(i);
1175 		eb_el->l_next_free_rec = cpu_to_le16(1);
1176 		/*
1177 		 * This actually counts as an empty extent as
1178 		 * c_clusters == 0
1179 		 */
1180 		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
1181 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
1182 		/*
1183 		 * eb_el isn't always an interior node, but even leaf
1184 		 * nodes want a zero'd flags and reserved field so
1185 		 * this gets the whole 32 bits regardless of use.
1186 		 */
1187 		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
1188 		if (!eb_el->l_tree_depth)
1189 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
1190 
1191 		status = ocfs2_journal_dirty(handle, bh);
1192 		if (status < 0) {
1193 			mlog_errno(status);
1194 			goto bail;
1195 		}
1196 
1197 		next_blkno = le64_to_cpu(eb->h_blkno);
1198 	}
1199 
1200 	/* This is a bit hairy. We want to update up to three blocks
1201 	 * here without leaving any of them in an inconsistent state
1202 	 * in case of error. We don't have to worry about
1203 	 * journal_dirty erroring as it won't unless we've aborted the
1204 	 * handle (in which case we would never be here) so reserving
1205 	 * the write with journal_access is all we need to do. */
1206 	status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), *last_eb_bh,
1207 					 OCFS2_JOURNAL_ACCESS_WRITE);
1208 	if (status < 0) {
1209 		mlog_errno(status);
1210 		goto bail;
1211 	}
1212 	status = ocfs2_et_root_journal_access(handle, et,
1213 					      OCFS2_JOURNAL_ACCESS_WRITE);
1214 	if (status < 0) {
1215 		mlog_errno(status);
1216 		goto bail;
1217 	}
1218 	if (eb_bh) {
1219 		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), eb_bh,
1220 						 OCFS2_JOURNAL_ACCESS_WRITE);
1221 		if (status < 0) {
1222 			mlog_errno(status);
1223 			goto bail;
1224 		}
1225 	}
1226 
1227 	/* Link the new branch into the rest of the tree (el will
1228 	 * either be on the root_bh, or the extent block passed in. */
1229 	i = le16_to_cpu(el->l_next_free_rec);
1230 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
1231 	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
1232 	el->l_recs[i].e_int_clusters = 0;
1233 	le16_add_cpu(&el->l_next_free_rec, 1);
1234 
1235 	/* fe needs a new last extent block pointer, as does the
1236 	 * next_leaf on the previously last-extent-block. */
1237 	ocfs2_et_set_last_eb_blk(et, new_last_eb_blk);
1238 
1239 	eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
1240 	eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk);
1241 
1242 	status = ocfs2_journal_dirty(handle, *last_eb_bh);
1243 	if (status < 0)
1244 		mlog_errno(status);
1245 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
1246 	if (status < 0)
1247 		mlog_errno(status);
1248 	if (eb_bh) {
1249 		status = ocfs2_journal_dirty(handle, eb_bh);
1250 		if (status < 0)
1251 			mlog_errno(status);
1252 	}
1253 
1254 	/*
1255 	 * Some callers want to track the rightmost leaf so pass it
1256 	 * back here.
1257 	 */
1258 	brelse(*last_eb_bh);
1259 	get_bh(new_eb_bhs[0]);
1260 	*last_eb_bh = new_eb_bhs[0];
1261 
1262 	status = 0;
1263 bail:
1264 	if (new_eb_bhs) {
1265 		for (i = 0; i < new_blocks; i++)
1266 			brelse(new_eb_bhs[i]);
1267 		kfree(new_eb_bhs);
1268 	}
1269 
1270 	mlog_exit(status);
1271 	return status;
1272 }
1273 
1274 /*
1275  * adds another level to the allocation tree.
1276  * returns back the new extent block so you can add a branch to it
1277  * after this call.
1278  */
1279 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
1280 				  handle_t *handle,
1281 				  struct inode *inode,
1282 				  struct ocfs2_extent_tree *et,
1283 				  struct ocfs2_alloc_context *meta_ac,
1284 				  struct buffer_head **ret_new_eb_bh)
1285 {
1286 	int status, i;
1287 	u32 new_clusters;
1288 	struct buffer_head *new_eb_bh = NULL;
1289 	struct ocfs2_extent_block *eb;
1290 	struct ocfs2_extent_list  *root_el;
1291 	struct ocfs2_extent_list  *eb_el;
1292 
1293 	mlog_entry_void();
1294 
1295 	status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac,
1296 					   &new_eb_bh);
1297 	if (status < 0) {
1298 		mlog_errno(status);
1299 		goto bail;
1300 	}
1301 
1302 	eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
1303 	/* ocfs2_create_new_meta_bhs() should create it right! */
1304 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
1305 
1306 	eb_el = &eb->h_list;
1307 	root_el = et->et_root_el;
1308 
1309 	status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), new_eb_bh,
1310 					 OCFS2_JOURNAL_ACCESS_CREATE);
1311 	if (status < 0) {
1312 		mlog_errno(status);
1313 		goto bail;
1314 	}
1315 
1316 	/* copy the root extent list data into the new extent block */
1317 	eb_el->l_tree_depth = root_el->l_tree_depth;
1318 	eb_el->l_next_free_rec = root_el->l_next_free_rec;
1319 	for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1320 		eb_el->l_recs[i] = root_el->l_recs[i];
1321 
1322 	status = ocfs2_journal_dirty(handle, new_eb_bh);
1323 	if (status < 0) {
1324 		mlog_errno(status);
1325 		goto bail;
1326 	}
1327 
1328 	status = ocfs2_et_root_journal_access(handle, et,
1329 					      OCFS2_JOURNAL_ACCESS_WRITE);
1330 	if (status < 0) {
1331 		mlog_errno(status);
1332 		goto bail;
1333 	}
1334 
1335 	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
1336 
1337 	/* update root_bh now */
1338 	le16_add_cpu(&root_el->l_tree_depth, 1);
1339 	root_el->l_recs[0].e_cpos = 0;
1340 	root_el->l_recs[0].e_blkno = eb->h_blkno;
1341 	root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
1342 	for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
1343 		memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
1344 	root_el->l_next_free_rec = cpu_to_le16(1);
1345 
1346 	/* If this is our 1st tree depth shift, then last_eb_blk
1347 	 * becomes the allocated extent block */
1348 	if (root_el->l_tree_depth == cpu_to_le16(1))
1349 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
1350 
1351 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
1352 	if (status < 0) {
1353 		mlog_errno(status);
1354 		goto bail;
1355 	}
1356 
1357 	*ret_new_eb_bh = new_eb_bh;
1358 	new_eb_bh = NULL;
1359 	status = 0;
1360 bail:
1361 	brelse(new_eb_bh);
1362 
1363 	mlog_exit(status);
1364 	return status;
1365 }
1366 
1367 /*
1368  * Should only be called when there is no space left in any of the
1369  * leaf nodes. What we want to do is find the lowest tree depth
1370  * non-leaf extent block with room for new records. There are three
1371  * valid results of this search:
1372  *
 * 1) a lowest extent block is found, then we pass it back in
 *    *target_bh and return '0'
 *
 * 2) the search fails to find anything, but the root_el has room. We
 *    pass NULL back in *target_bh, but still return '0'
1378  *
1379  * 3) the search fails to find anything AND the root_el is full, in
1380  *    which case we return > 0
1381  *
1382  * return status < 0 indicates an error.
1383  */
1384 static int ocfs2_find_branch_target(struct ocfs2_super *osb,
1385 				    struct inode *inode,
1386 				    struct ocfs2_extent_tree *et,
1387 				    struct buffer_head **target_bh)
1388 {
1389 	int status = 0, i;
1390 	u64 blkno;
1391 	struct ocfs2_extent_block *eb;
1392 	struct ocfs2_extent_list  *el;
1393 	struct buffer_head *bh = NULL;
1394 	struct buffer_head *lowest_bh = NULL;
1395 
1396 	mlog_entry_void();
1397 
1398 	*target_bh = NULL;
1399 
1400 	el = et->et_root_el;
1401 
1402 	while(le16_to_cpu(el->l_tree_depth) > 1) {
1403 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1404 			ocfs2_error(inode->i_sb, "Dinode %llu has empty "
1405 				    "extent list (next_free_rec == 0)",
1406 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
1407 			status = -EIO;
1408 			goto bail;
1409 		}
1410 		i = le16_to_cpu(el->l_next_free_rec) - 1;
1411 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1412 		if (!blkno) {
1413 			ocfs2_error(inode->i_sb, "Dinode %llu has extent "
1414 				    "list where extent # %d has no physical "
1415 				    "block start",
1416 				    (unsigned long long)OCFS2_I(inode)->ip_blkno, i);
1417 			status = -EIO;
1418 			goto bail;
1419 		}
1420 
1421 		brelse(bh);
1422 		bh = NULL;
1423 
1424 		status = ocfs2_read_extent_block(inode, blkno, &bh);
1425 		if (status < 0) {
1426 			mlog_errno(status);
1427 			goto bail;
1428 		}
1429 
1430 		eb = (struct ocfs2_extent_block *) bh->b_data;
1431 		el = &eb->h_list;
1432 
1433 		if (le16_to_cpu(el->l_next_free_rec) <
1434 		    le16_to_cpu(el->l_count)) {
1435 			brelse(lowest_bh);
1436 			lowest_bh = bh;
1437 			get_bh(lowest_bh);
1438 		}
1439 	}
1440 
1441 	/* If we didn't find one and the fe doesn't have any room,
1442 	 * then return '1' */
1443 	el = et->et_root_el;
1444 	if (!lowest_bh && (el->l_next_free_rec == el->l_count))
1445 		status = 1;
1446 
1447 	*target_bh = lowest_bh;
1448 bail:
1449 	brelse(bh);
1450 
1451 	mlog_exit(status);
1452 	return status;
1453 }
1454 
1455 /*
1456  * Grow a b-tree so that it has more records.
1457  *
1458  * We might shift the tree depth in which case existing paths should
1459  * be considered invalid.
1460  *
1461  * Tree depth after the grow is returned via *final_depth.
1462  *
1463  * *last_eb_bh will be updated by ocfs2_add_branch().
1464  */
1465 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle,
1466 			   struct ocfs2_extent_tree *et, int *final_depth,
1467 			   struct buffer_head **last_eb_bh,
1468 			   struct ocfs2_alloc_context *meta_ac)
1469 {
1470 	int ret, shift;
1471 	struct ocfs2_extent_list *el = et->et_root_el;
1472 	int depth = le16_to_cpu(el->l_tree_depth);
1473 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1474 	struct buffer_head *bh = NULL;
1475 
1476 	BUG_ON(meta_ac == NULL);
1477 
1478 	shift = ocfs2_find_branch_target(osb, inode, et, &bh);
1479 	if (shift < 0) {
1480 		ret = shift;
1481 		mlog_errno(ret);
1482 		goto out;
1483 	}
1484 
1485 	/* We traveled all the way to the bottom of the allocation tree
1486 	 * and didn't find room for any more extents - we need to add
1487 	 * another tree level */
1488 	if (shift) {
1489 		BUG_ON(bh);
1490 		mlog(0, "need to shift tree depth (current = %d)\n", depth);
1491 
1492 		/* ocfs2_shift_tree_depth will return us a buffer with
1493 		 * the new extent block (so we can pass that to
1494 		 * ocfs2_add_branch). */
1495 		ret = ocfs2_shift_tree_depth(osb, handle, inode, et,
1496 					     meta_ac, &bh);
1497 		if (ret < 0) {
1498 			mlog_errno(ret);
1499 			goto out;
1500 		}
1501 		depth++;
1502 		if (depth == 1) {
1503 			/*
1504 			 * Special case: we have room now if we shifted from
1505 			 * tree_depth 0, so no more work needs to be done.
1506 			 *
1507 			 * We won't be calling add_branch, so pass
1508 			 * back *last_eb_bh as the new leaf. At depth
1509 			 * zero, it should always be null so there's
1510 			 * no reason to brelse.
1511 			 */
1512 			BUG_ON(*last_eb_bh);
1513 			get_bh(bh);
1514 			*last_eb_bh = bh;
1515 			goto out;
1516 		}
1517 	}
1518 
1519 	/* call ocfs2_add_branch to add the final part of the tree with
1520 	 * the new data. */
1521 	mlog(0, "add branch. bh = %p\n", bh);
1522 	ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh,
1523 			       meta_ac);
1524 	if (ret < 0) {
1525 		mlog_errno(ret);
1526 		goto out;
1527 	}
1528 
1529 out:
1530 	if (final_depth)
1531 		*final_depth = depth;
1532 	brelse(bh);
1533 	return ret;
1534 }
1535 
1536 /*
1537  * This function will discard the rightmost extent record.
1538  */
1539 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
1540 {
1541 	int next_free = le16_to_cpu(el->l_next_free_rec);
1542 	int count = le16_to_cpu(el->l_count);
1543 	unsigned int num_bytes;
1544 
1545 	BUG_ON(!next_free);
1546 	/* This will cause us to go off the end of our extent list. */
1547 	BUG_ON(next_free >= count);
1548 
1549 	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
1550 
1551 	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
1552 }
1553 
1554 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1555 			      struct ocfs2_extent_rec *insert_rec)
1556 {
1557 	int i, insert_index, next_free, has_empty, num_bytes;
1558 	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
1559 	struct ocfs2_extent_rec *rec;
1560 
1561 	next_free = le16_to_cpu(el->l_next_free_rec);
1562 	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
1563 
1564 	BUG_ON(!next_free);
1565 
1566 	/* The tree code before us didn't allow enough room in the leaf. */
1567 	BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1568 
1569 	/*
1570 	 * The easiest way to approach this is to just remove the
1571 	 * empty extent and temporarily decrement next_free.
1572 	 */
1573 	if (has_empty) {
1574 		/*
1575 		 * If next_free was 1 (only an empty extent), this
1576 		 * loop won't execute, which is fine. We still want
1577 		 * the decrement above to happen.
1578 		 */
1579 		for(i = 0; i < (next_free - 1); i++)
1580 			el->l_recs[i] = el->l_recs[i+1];
1581 
1582 		next_free--;
1583 	}
1584 
1585 	/*
1586 	 * Figure out what the new record index should be.
1587 	 */
1588 	for(i = 0; i < next_free; i++) {
1589 		rec = &el->l_recs[i];
1590 
1591 		if (insert_cpos < le32_to_cpu(rec->e_cpos))
1592 			break;
1593 	}
1594 	insert_index = i;
1595 
1596 	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
1597 	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
1598 
1599 	BUG_ON(insert_index < 0);
1600 	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
1601 	BUG_ON(insert_index > next_free);
1602 
1603 	/*
1604 	 * No need to memmove if we're just adding to the tail.
1605 	 */
1606 	if (insert_index != next_free) {
1607 		BUG_ON(next_free >= le16_to_cpu(el->l_count));
1608 
1609 		num_bytes = next_free - insert_index;
1610 		num_bytes *= sizeof(struct ocfs2_extent_rec);
1611 		memmove(&el->l_recs[insert_index + 1],
1612 			&el->l_recs[insert_index],
1613 			num_bytes);
1614 	}
1615 
1616 	/*
1617 	 * Either we had an empty extent, and need to re-increment or
1618 	 * there was no empty extent on a non full rightmost leaf node,
1619 	 * in which case we still need to increment.
1620 	 */
1621 	next_free++;
1622 	el->l_next_free_rec = cpu_to_le16(next_free);
1623 	/*
1624 	 * Make sure none of the math above just messed up our tree.
1625 	 */
1626 	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
1627 
1628 	el->l_recs[insert_index] = *insert_rec;
1629 
1630 }
1631 
1632 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el)
1633 {
1634 	int size, num_recs = le16_to_cpu(el->l_next_free_rec);
1635 
1636 	BUG_ON(num_recs == 0);
1637 
1638 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
1639 		num_recs--;
1640 		size = num_recs * sizeof(struct ocfs2_extent_rec);
1641 		memmove(&el->l_recs[0], &el->l_recs[1], size);
1642 		memset(&el->l_recs[num_recs], 0,
1643 		       sizeof(struct ocfs2_extent_rec));
1644 		el->l_next_free_rec = cpu_to_le16(num_recs);
1645 	}
1646 }
1647 
1648 /*
 * Create an empty extent record.
1650  *
1651  * l_next_free_rec may be updated.
1652  *
1653  * If an empty extent already exists do nothing.
1654  */
1655 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
1656 {
1657 	int next_free = le16_to_cpu(el->l_next_free_rec);
1658 
1659 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
1660 
1661 	if (next_free == 0)
1662 		goto set_and_inc;
1663 
1664 	if (ocfs2_is_empty_extent(&el->l_recs[0]))
1665 		return;
1666 
1667 	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
1668 			"Asked to create an empty extent in a full list:\n"
1669 			"count = %u, tree depth = %u",
1670 			le16_to_cpu(el->l_count),
1671 			le16_to_cpu(el->l_tree_depth));
1672 
1673 	ocfs2_shift_records_right(el);
1674 
1675 set_and_inc:
1676 	le16_add_cpu(&el->l_next_free_rec, 1);
1677 	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
1678 }
1679 
1680 /*
1681  * For a rotation which involves two leaf nodes, the "root node" is
 * the lowest level tree node which contains a path to both leaves. This
1683  * resulting set of information can be used to form a complete "subtree"
1684  *
1685  * This function is passed two full paths from the dinode down to a
 * pair of adjacent leaves. Its task is to figure out which path
1687  * index contains the subtree root - this can be the root index itself
1688  * in a worst-case rotation.
1689  *
1690  * The array index of the subtree root is passed back.
1691  */
1692 static int ocfs2_find_subtree_root(struct inode *inode,
1693 				   struct ocfs2_path *left,
1694 				   struct ocfs2_path *right)
1695 {
1696 	int i = 0;
1697 
1698 	/*
1699 	 * Check that the caller passed in two paths from the same tree.
1700 	 */
1701 	BUG_ON(path_root_bh(left) != path_root_bh(right));
1702 
1703 	do {
1704 		i++;
1705 
1706 		/*
1707 		 * The caller didn't pass two adjacent paths.
1708 		 */
1709 		mlog_bug_on_msg(i > left->p_tree_depth,
1710 				"Inode %lu, left depth %u, right depth %u\n"
1711 				"left leaf blk %llu, right leaf blk %llu\n",
1712 				inode->i_ino, left->p_tree_depth,
1713 				right->p_tree_depth,
1714 				(unsigned long long)path_leaf_bh(left)->b_blocknr,
1715 				(unsigned long long)path_leaf_bh(right)->b_blocknr);
1716 	} while (left->p_node[i].bh->b_blocknr ==
1717 		 right->p_node[i].bh->b_blocknr);
1718 
1719 	return i - 1;
1720 }
1721 
/* Callback type: receives the caller's opaque data and an extent block buffer. */
typedef void (path_insert_t)(void *data, struct buffer_head *bh);
1723 
1724 /*
1725  * Traverse a btree path in search of cpos, starting at root_el.
1726  *
1727  * This code can be called with a cpos larger than the tree, in which
1728  * case it will return the rightmost path.
1729  */
1730 static int __ocfs2_find_path(struct inode *inode,
1731 			     struct ocfs2_extent_list *root_el, u32 cpos,
1732 			     path_insert_t *func, void *data)
1733 {
1734 	int i, ret = 0;
1735 	u32 range;
1736 	u64 blkno;
1737 	struct buffer_head *bh = NULL;
1738 	struct ocfs2_extent_block *eb;
1739 	struct ocfs2_extent_list *el;
1740 	struct ocfs2_extent_rec *rec;
1741 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
1742 
1743 	el = root_el;
1744 	while (el->l_tree_depth) {
1745 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
1746 			ocfs2_error(inode->i_sb,
1747 				    "Inode %llu has empty extent list at "
1748 				    "depth %u\n",
1749 				    (unsigned long long)oi->ip_blkno,
1750 				    le16_to_cpu(el->l_tree_depth));
1751 			ret = -EROFS;
1752 			goto out;
1753 
1754 		}
1755 
1756 		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
1757 			rec = &el->l_recs[i];
1758 
1759 			/*
1760 			 * In the case that cpos is off the allocation
1761 			 * tree, this should just wind up returning the
1762 			 * rightmost record.
1763 			 */
1764 			range = le32_to_cpu(rec->e_cpos) +
1765 				ocfs2_rec_clusters(el, rec);
1766 			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
1767 			    break;
1768 		}
1769 
1770 		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
1771 		if (blkno == 0) {
1772 			ocfs2_error(inode->i_sb,
1773 				    "Inode %llu has bad blkno in extent list "
1774 				    "at depth %u (index %d)\n",
1775 				    (unsigned long long)oi->ip_blkno,
1776 				    le16_to_cpu(el->l_tree_depth), i);
1777 			ret = -EROFS;
1778 			goto out;
1779 		}
1780 
1781 		brelse(bh);
1782 		bh = NULL;
1783 		ret = ocfs2_read_extent_block(inode, blkno, &bh);
1784 		if (ret) {
1785 			mlog_errno(ret);
1786 			goto out;
1787 		}
1788 
1789 		eb = (struct ocfs2_extent_block *) bh->b_data;
1790 		el = &eb->h_list;
1791 
1792 		if (le16_to_cpu(el->l_next_free_rec) >
1793 		    le16_to_cpu(el->l_count)) {
1794 			ocfs2_error(inode->i_sb,
1795 				    "Inode %llu has bad count in extent list "
1796 				    "at block %llu (next free=%u, count=%u)\n",
1797 				    (unsigned long long)oi->ip_blkno,
1798 				    (unsigned long long)bh->b_blocknr,
1799 				    le16_to_cpu(el->l_next_free_rec),
1800 				    le16_to_cpu(el->l_count));
1801 			ret = -EROFS;
1802 			goto out;
1803 		}
1804 
1805 		if (func)
1806 			func(data, bh);
1807 	}
1808 
1809 out:
1810 	/*
1811 	 * Catch any trailing bh that the loop didn't handle.
1812 	 */
1813 	brelse(bh);
1814 
1815 	return ret;
1816 }
1817 
1818 /*
1819  * Given an initialized path (that is, it has a valid root extent
1820  * list), this function will traverse the btree in search of the path
1821  * which would contain cpos.
1822  *
1823  * The path traveled is recorded in the path structure.
1824  *
1825  * Note that this will not do any comparisons on leaf node extent
1826  * records, so it will work fine in the case that we just added a tree
1827  * branch.
1828  */
/* State for find_path_ins(): the path being filled and its next free slot. */
struct find_path_data {
	int index;
	struct ocfs2_path *path;
};

/* path_insert_t callback that stores each visited block in the path. */
static void find_path_ins(void *data, struct buffer_head *bh)
{
	struct find_path_data *cursor = data;

	/* get_bh() first, so the path entry holds its own reference. */
	get_bh(bh);
	ocfs2_path_insert_eb(cursor->path, cursor->index, bh);
	cursor->index += 1;
}
1841 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
1842 			   u32 cpos)
1843 {
1844 	struct find_path_data data;
1845 
1846 	data.index = 1;
1847 	data.path = path;
1848 	return __ocfs2_find_path(inode, path_root_el(path), cpos,
1849 				 find_path_ins, &data);
1850 }
1851 
1852 static void find_leaf_ins(void *data, struct buffer_head *bh)
1853 {
1854 	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
1855 	struct ocfs2_extent_list *el = &eb->h_list;
1856 	struct buffer_head **ret = data;
1857 
1858 	/* We want to retain only the leaf block. */
1859 	if (le16_to_cpu(el->l_tree_depth) == 0) {
1860 		get_bh(bh);
1861 		*ret = bh;
1862 	}
1863 }
1864 /*
1865  * Find the leaf block in the tree which would contain cpos. No
1866  * checking of the actual leaf is done.
1867  *
1868  * Some paths want to call this instead of allocating a path structure
1869  * and calling ocfs2_find_path().
1870  *
1871  * This function doesn't handle non btree extent lists.
1872  */
1873 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
1874 		    u32 cpos, struct buffer_head **leaf_bh)
1875 {
1876 	int ret;
1877 	struct buffer_head *bh = NULL;
1878 
1879 	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
1880 	if (ret) {
1881 		mlog_errno(ret);
1882 		goto out;
1883 	}
1884 
1885 	*leaf_bh = bh;
1886 out:
1887 	return ret;
1888 }
1889 
1890 /*
1891  * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
1892  *
1893  * Basically, we've moved stuff around at the bottom of the tree and
1894  * we need to fix up the extent records above the changes to reflect
1895  * the new changes.
1896  *
1897  * left_rec: the record on the left.
1898  * left_child_el: is the child list pointed to by left_rec
1899  * right_rec: the record to the right of left_rec
1900  * right_child_el: is the child list pointed to by right_rec
1901  *
1902  * By definition, this only works on interior nodes.
1903  */
1904 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
1905 				  struct ocfs2_extent_list *left_child_el,
1906 				  struct ocfs2_extent_rec *right_rec,
1907 				  struct ocfs2_extent_list *right_child_el)
1908 {
1909 	u32 left_clusters, right_end;
1910 
1911 	/*
1912 	 * Interior nodes never have holes. Their cpos is the cpos of
1913 	 * the leftmost record in their child list. Their cluster
1914 	 * count covers the full theoretical range of their child list
1915 	 * - the range between their cpos and the cpos of the record
1916 	 * immediately to their right.
1917 	 */
1918 	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
1919 	if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) {
1920 		BUG_ON(right_child_el->l_tree_depth);
1921 		BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1);
1922 		left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos);
1923 	}
1924 	left_clusters -= le32_to_cpu(left_rec->e_cpos);
1925 	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
1926 
1927 	/*
1928 	 * Calculate the rightmost cluster count boundary before
1929 	 * moving cpos - we will need to adjust clusters after
1930 	 * updating e_cpos to keep the same highest cluster count.
1931 	 */
1932 	right_end = le32_to_cpu(right_rec->e_cpos);
1933 	right_end += le32_to_cpu(right_rec->e_int_clusters);
1934 
1935 	right_rec->e_cpos = left_rec->e_cpos;
1936 	le32_add_cpu(&right_rec->e_cpos, left_clusters);
1937 
1938 	right_end -= le32_to_cpu(right_rec->e_cpos);
1939 	right_rec->e_int_clusters = cpu_to_le32(right_end);
1940 }
1941 
1942 /*
1943  * Adjust the adjacent root node records involved in a
1944  * rotation. left_el_blkno is passed in as a key so that we can easily
 * find its index in the root list.
1946  */
1947 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1948 				      struct ocfs2_extent_list *left_el,
1949 				      struct ocfs2_extent_list *right_el,
1950 				      u64 left_el_blkno)
1951 {
1952 	int i;
1953 
1954 	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
1955 	       le16_to_cpu(left_el->l_tree_depth));
1956 
1957 	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
1958 		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
1959 			break;
1960 	}
1961 
1962 	/*
1963 	 * The path walking code should have never returned a root and
1964 	 * two paths which are not adjacent.
1965 	 */
1966 	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
1967 
1968 	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
1969 				      &root_el->l_recs[i + 1], right_el);
1970 }
1971 
1972 /*
1973  * We've changed a leaf block (in right_path) and need to reflect that
1974  * change back up the subtree.
1975  *
1976  * This happens in multiple places:
1977  *   - When we've moved an extent record from the left path leaf to the right
1978  *     path leaf to make room for an empty extent in the left path leaf.
1979  *   - When our insert into the right path leaf is at the leftmost edge
 *     and requires an update of the path immediately to its left. This
1981  *     can occur at the end of some types of rotation and appending inserts.
1982  *   - When we've adjusted the last extent record in the left path leaf and the
1983  *     1st extent record in the right path leaf during cross extent block merge.
1984  */
1985 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1986 				       struct ocfs2_path *left_path,
1987 				       struct ocfs2_path *right_path,
1988 				       int subtree_index)
1989 {
1990 	int ret, i, idx;
1991 	struct ocfs2_extent_list *el, *left_el, *right_el;
1992 	struct ocfs2_extent_rec *left_rec, *right_rec;
1993 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
1994 
1995 	/*
1996 	 * Update the counts and position values within all the
1997 	 * interior nodes to reflect the leaf rotation we just did.
1998 	 *
1999 	 * The root node is handled below the loop.
2000 	 *
2001 	 * We begin the loop with right_el and left_el pointing to the
2002 	 * leaf lists and work our way up.
2003 	 *
2004 	 * NOTE: within this loop, left_el and right_el always refer
2005 	 * to the *child* lists.
2006 	 */
2007 	left_el = path_leaf_el(left_path);
2008 	right_el = path_leaf_el(right_path);
2009 	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
2010 		mlog(0, "Adjust records at index %u\n", i);
2011 
2012 		/*
2013 		 * One nice property of knowing that all of these
2014 		 * nodes are below the root is that we only deal with
2015 		 * the leftmost right node record and the rightmost
2016 		 * left node record.
2017 		 */
2018 		el = left_path->p_node[i].el;
2019 		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
2020 		left_rec = &el->l_recs[idx];
2021 
2022 		el = right_path->p_node[i].el;
2023 		right_rec = &el->l_recs[0];
2024 
2025 		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
2026 					      right_el);
2027 
2028 		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
2029 		if (ret)
2030 			mlog_errno(ret);
2031 
2032 		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
2033 		if (ret)
2034 			mlog_errno(ret);
2035 
2036 		/*
2037 		 * Setup our list pointers now so that the current
2038 		 * parents become children in the next iteration.
2039 		 */
2040 		left_el = left_path->p_node[i].el;
2041 		right_el = right_path->p_node[i].el;
2042 	}
2043 
2044 	/*
2045 	 * At the root node, adjust the two adjacent records which
2046 	 * begin our path to the leaves.
2047 	 */
2048 
2049 	el = left_path->p_node[subtree_index].el;
2050 	left_el = left_path->p_node[subtree_index + 1].el;
2051 	right_el = right_path->p_node[subtree_index + 1].el;
2052 
2053 	ocfs2_adjust_root_records(el, left_el, right_el,
2054 				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
2055 
2056 	root_bh = left_path->p_node[subtree_index].bh;
2057 
2058 	ret = ocfs2_journal_dirty(handle, root_bh);
2059 	if (ret)
2060 		mlog_errno(ret);
2061 }
2062 
2063 static int ocfs2_rotate_subtree_right(struct inode *inode,
2064 				      handle_t *handle,
2065 				      struct ocfs2_path *left_path,
2066 				      struct ocfs2_path *right_path,
2067 				      int subtree_index)
2068 {
2069 	int ret, i;
2070 	struct buffer_head *right_leaf_bh;
2071 	struct buffer_head *left_leaf_bh = NULL;
2072 	struct buffer_head *root_bh;
2073 	struct ocfs2_extent_list *right_el, *left_el;
2074 	struct ocfs2_extent_rec move_rec;
2075 
2076 	left_leaf_bh = path_leaf_bh(left_path);
2077 	left_el = path_leaf_el(left_path);
2078 
2079 	if (left_el->l_next_free_rec != left_el->l_count) {
2080 		ocfs2_error(inode->i_sb,
2081 			    "Inode %llu has non-full interior leaf node %llu"
2082 			    "(next free = %u)",
2083 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
2084 			    (unsigned long long)left_leaf_bh->b_blocknr,
2085 			    le16_to_cpu(left_el->l_next_free_rec));
2086 		return -EROFS;
2087 	}
2088 
2089 	/*
2090 	 * This extent block may already have an empty record, so we
2091 	 * return early if so.
2092 	 */
2093 	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
2094 		return 0;
2095 
2096 	root_bh = left_path->p_node[subtree_index].bh;
2097 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2098 
2099 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), right_path,
2100 					   subtree_index);
2101 	if (ret) {
2102 		mlog_errno(ret);
2103 		goto out;
2104 	}
2105 
2106 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2107 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
2108 						   right_path, i);
2109 		if (ret) {
2110 			mlog_errno(ret);
2111 			goto out;
2112 		}
2113 
2114 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
2115 						   left_path, i);
2116 		if (ret) {
2117 			mlog_errno(ret);
2118 			goto out;
2119 		}
2120 	}
2121 
2122 	right_leaf_bh = path_leaf_bh(right_path);
2123 	right_el = path_leaf_el(right_path);
2124 
2125 	/* This is a code error, not a disk corruption. */
2126 	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
2127 			"because rightmost leaf block %llu is empty\n",
2128 			(unsigned long long)OCFS2_I(inode)->ip_blkno,
2129 			(unsigned long long)right_leaf_bh->b_blocknr);
2130 
2131 	ocfs2_create_empty_extent(right_el);
2132 
2133 	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
2134 	if (ret) {
2135 		mlog_errno(ret);
2136 		goto out;
2137 	}
2138 
2139 	/* Do the copy now. */
2140 	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
2141 	move_rec = left_el->l_recs[i];
2142 	right_el->l_recs[0] = move_rec;
2143 
2144 	/*
2145 	 * Clear out the record we just copied and shift everything
2146 	 * over, leaving an empty extent in the left leaf.
2147 	 *
2148 	 * We temporarily subtract from next_free_rec so that the
2149 	 * shift will lose the tail record (which is now defunct).
2150 	 */
2151 	le16_add_cpu(&left_el->l_next_free_rec, -1);
2152 	ocfs2_shift_records_right(left_el);
2153 	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2154 	le16_add_cpu(&left_el->l_next_free_rec, 1);
2155 
2156 	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
2157 	if (ret) {
2158 		mlog_errno(ret);
2159 		goto out;
2160 	}
2161 
2162 	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2163 				subtree_index);
2164 
2165 out:
2166 	return ret;
2167 }
2168 
2169 /*
2170  * Given a full path, determine what cpos value would return us a path
2171  * containing the leaf immediately to the left of the current one.
2172  *
2173  * Will return zero if the path passed in is already the leftmost path.
2174  */
2175 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
2176 					 struct ocfs2_path *path, u32 *cpos)
2177 {
2178 	int i, j, ret = 0;
2179 	u64 blkno;
2180 	struct ocfs2_extent_list *el;
2181 
2182 	BUG_ON(path->p_tree_depth == 0);
2183 
2184 	*cpos = 0;
2185 
2186 	blkno = path_leaf_bh(path)->b_blocknr;
2187 
2188 	/* Start at the tree node just above the leaf and work our way up. */
2189 	i = path->p_tree_depth - 1;
2190 	while (i >= 0) {
2191 		el = path->p_node[i].el;
2192 
2193 		/*
2194 		 * Find the extent record just before the one in our
2195 		 * path.
2196 		 */
2197 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2198 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2199 				if (j == 0) {
2200 					if (i == 0) {
2201 						/*
2202 						 * We've determined that the
2203 						 * path specified is already
2204 						 * the leftmost one - return a
2205 						 * cpos of zero.
2206 						 */
2207 						goto out;
2208 					}
2209 					/*
2210 					 * The leftmost record points to our
2211 					 * leaf - we need to travel up the
2212 					 * tree one level.
2213 					 */
2214 					goto next_node;
2215 				}
2216 
2217 				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
2218 				*cpos = *cpos + ocfs2_rec_clusters(el,
2219 							   &el->l_recs[j - 1]);
2220 				*cpos = *cpos - 1;
2221 				goto out;
2222 			}
2223 		}
2224 
2225 		/*
2226 		 * If we got here, we never found a valid node where
2227 		 * the tree indicated one should be.
2228 		 */
2229 		ocfs2_error(sb,
2230 			    "Invalid extent tree at extent block %llu\n",
2231 			    (unsigned long long)blkno);
2232 		ret = -EROFS;
2233 		goto out;
2234 
2235 next_node:
2236 		blkno = path->p_node[i].bh->b_blocknr;
2237 		i--;
2238 	}
2239 
2240 out:
2241 	return ret;
2242 }
2243 
2244 /*
2245  * Extend the transaction by enough credits to complete the rotation,
2246  * and still leave at least the original number of credits allocated
2247  * to this transaction.
2248  */
2249 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
2250 					   int op_credits,
2251 					   struct ocfs2_path *path)
2252 {
2253 	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits;
2254 
2255 	if (handle->h_buffer_credits < credits)
2256 		return ocfs2_extend_trans(handle, credits);
2257 
2258 	return 0;
2259 }
2260 
2261 /*
2262  * Trap the case where we're inserting into the theoretical range past
2263  * the _actual_ left leaf range. Otherwise, we'll rotate a record
2264  * whose cpos is less than ours into the right leaf.
2265  *
2266  * It's only necessary to look at the rightmost record of the left
2267  * leaf because the logic that calls us should ensure that the
2268  * theoretical ranges in the path components above the leaves are
2269  * correct.
2270  */
2271 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
2272 						 u32 insert_cpos)
2273 {
2274 	struct ocfs2_extent_list *left_el;
2275 	struct ocfs2_extent_rec *rec;
2276 	int next_free;
2277 
2278 	left_el = path_leaf_el(left_path);
2279 	next_free = le16_to_cpu(left_el->l_next_free_rec);
2280 	rec = &left_el->l_recs[next_free - 1];
2281 
2282 	if (insert_cpos > le32_to_cpu(rec->e_cpos))
2283 		return 1;
2284 	return 0;
2285 }
2286 
2287 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2288 {
2289 	int next_free = le16_to_cpu(el->l_next_free_rec);
2290 	unsigned int range;
2291 	struct ocfs2_extent_rec *rec;
2292 
2293 	if (next_free == 0)
2294 		return 0;
2295 
2296 	rec = &el->l_recs[0];
2297 	if (ocfs2_is_empty_extent(rec)) {
2298 		/* Empty list. */
2299 		if (next_free == 1)
2300 			return 0;
2301 		rec = &el->l_recs[1];
2302 	}
2303 
2304 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2305 	if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
2306 		return 1;
2307 	return 0;
2308 }
2309 
2310 /*
2311  * Rotate all the records in a btree right one record, starting at insert_cpos.
2312  *
2313  * The path to the rightmost leaf should be passed in.
2314  *
2315  * The array is assumed to be large enough to hold an entire path (tree depth).
2316  *
2317  * Upon succesful return from this function:
2318  *
2319  * - The 'right_path' array will contain a path to the leaf block
2320  *   whose range contains e_cpos.
2321  * - That leaf block will have a single empty extent in list index 0.
2322  * - In the case that the rotation requires a post-insert update,
2323  *   *ret_left_path will contain a valid path which can be passed to
2324  *   ocfs2_insert_path().
2325  */
2326 static int ocfs2_rotate_tree_right(struct inode *inode,
2327 				   handle_t *handle,
2328 				   enum ocfs2_split_type split,
2329 				   u32 insert_cpos,
2330 				   struct ocfs2_path *right_path,
2331 				   struct ocfs2_path **ret_left_path)
2332 {
2333 	int ret, start, orig_credits = handle->h_buffer_credits;
2334 	u32 cpos;
2335 	struct ocfs2_path *left_path = NULL;
2336 
2337 	*ret_left_path = NULL;
2338 
2339 	left_path = ocfs2_new_path_from_path(right_path);
2340 	if (!left_path) {
2341 		ret = -ENOMEM;
2342 		mlog_errno(ret);
2343 		goto out;
2344 	}
2345 
2346 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
2347 	if (ret) {
2348 		mlog_errno(ret);
2349 		goto out;
2350 	}
2351 
2352 	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
2353 
2354 	/*
2355 	 * What we want to do here is:
2356 	 *
2357 	 * 1) Start with the rightmost path.
2358 	 *
2359 	 * 2) Determine a path to the leaf block directly to the left
2360 	 *    of that leaf.
2361 	 *
2362 	 * 3) Determine the 'subtree root' - the lowest level tree node
2363 	 *    which contains a path to both leaves.
2364 	 *
2365 	 * 4) Rotate the subtree.
2366 	 *
2367 	 * 5) Find the next subtree by considering the left path to be
2368 	 *    the new right path.
2369 	 *
2370 	 * The check at the top of this while loop also accepts
2371 	 * insert_cpos == cpos because cpos is only a _theoretical_
2372 	 * value to get us the left path - insert_cpos might very well
2373 	 * be filling that hole.
2374 	 *
2375 	 * Stop at a cpos of '0' because we either started at the
2376 	 * leftmost branch (i.e., a tree with one branch and a
2377 	 * rotation inside of it), or we've gone as far as we can in
2378 	 * rotating subtrees.
2379 	 */
2380 	while (cpos && insert_cpos <= cpos) {
2381 		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
2382 		     insert_cpos, cpos);
2383 
2384 		ret = ocfs2_find_path(inode, left_path, cpos);
2385 		if (ret) {
2386 			mlog_errno(ret);
2387 			goto out;
2388 		}
2389 
2390 		mlog_bug_on_msg(path_leaf_bh(left_path) ==
2391 				path_leaf_bh(right_path),
2392 				"Inode %lu: error during insert of %u "
2393 				"(left path cpos %u) results in two identical "
2394 				"paths ending at %llu\n",
2395 				inode->i_ino, insert_cpos, cpos,
2396 				(unsigned long long)
2397 				path_leaf_bh(left_path)->b_blocknr);
2398 
2399 		if (split == SPLIT_NONE &&
2400 		    ocfs2_rotate_requires_path_adjustment(left_path,
2401 							  insert_cpos)) {
2402 
2403 			/*
2404 			 * We've rotated the tree as much as we
2405 			 * should. The rest is up to
2406 			 * ocfs2_insert_path() to complete, after the
2407 			 * record insertion. We indicate this
2408 			 * situation by returning the left path.
2409 			 *
2410 			 * The reason we don't adjust the records here
2411 			 * before the record insert is that an error
2412 			 * later might break the rule where a parent
2413 			 * record e_cpos will reflect the actual
2414 			 * e_cpos of the 1st nonempty record of the
2415 			 * child list.
2416 			 */
2417 			*ret_left_path = left_path;
2418 			goto out_ret_path;
2419 		}
2420 
2421 		start = ocfs2_find_subtree_root(inode, left_path, right_path);
2422 
2423 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2424 		     start,
2425 		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
2426 		     right_path->p_tree_depth);
2427 
2428 		ret = ocfs2_extend_rotate_transaction(handle, start,
2429 						      orig_credits, right_path);
2430 		if (ret) {
2431 			mlog_errno(ret);
2432 			goto out;
2433 		}
2434 
2435 		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
2436 						 right_path, start);
2437 		if (ret) {
2438 			mlog_errno(ret);
2439 			goto out;
2440 		}
2441 
2442 		if (split != SPLIT_NONE &&
2443 		    ocfs2_leftmost_rec_contains(path_leaf_el(right_path),
2444 						insert_cpos)) {
2445 			/*
2446 			 * A rotate moves the rightmost left leaf
2447 			 * record over to the leftmost right leaf
2448 			 * slot. If we're doing an extent split
2449 			 * instead of a real insert, then we have to
2450 			 * check that the extent to be split wasn't
2451 			 * just moved over. If it was, then we can
2452 			 * exit here, passing left_path back -
2453 			 * ocfs2_split_extent() is smart enough to
2454 			 * search both leaves.
2455 			 */
2456 			*ret_left_path = left_path;
2457 			goto out_ret_path;
2458 		}
2459 
2460 		/*
2461 		 * There is no need to re-read the next right path
2462 		 * as we know that it'll be our current left
2463 		 * path. Optimize by copying values instead.
2464 		 */
2465 		ocfs2_mv_path(right_path, left_path);
2466 
2467 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
2468 						    &cpos);
2469 		if (ret) {
2470 			mlog_errno(ret);
2471 			goto out;
2472 		}
2473 	}
2474 
2475 out:
2476 	ocfs2_free_path(left_path);
2477 
2478 out_ret_path:
2479 	return ret;
2480 }
2481 
2482 static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle,
2483 				     int subtree_index, struct ocfs2_path *path)
2484 {
2485 	int i, idx, ret;
2486 	struct ocfs2_extent_rec *rec;
2487 	struct ocfs2_extent_list *el;
2488 	struct ocfs2_extent_block *eb;
2489 	u32 range;
2490 
2491 	/*
2492 	 * In normal tree rotation process, we will never touch the
2493 	 * tree branch above subtree_index and ocfs2_extend_rotate_transaction
2494 	 * doesn't reserve the credits for them either.
2495 	 *
2496 	 * But we do have a special case here which will update the rightmost
2497 	 * records for all the bh in the path.
2498 	 * So we have to allocate extra credits and access them.
2499 	 */
2500 	ret = ocfs2_extend_trans(handle,
2501 				 handle->h_buffer_credits + subtree_index);
2502 	if (ret) {
2503 		mlog_errno(ret);
2504 		goto out;
2505 	}
2506 
2507 	ret = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
2508 	if (ret) {
2509 		mlog_errno(ret);
2510 		goto out;
2511 	}
2512 
2513 	/* Path should always be rightmost. */
2514 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
2515 	BUG_ON(eb->h_next_leaf_blk != 0ULL);
2516 
2517 	el = &eb->h_list;
2518 	BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
2519 	idx = le16_to_cpu(el->l_next_free_rec) - 1;
2520 	rec = &el->l_recs[idx];
2521 	range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
2522 
2523 	for (i = 0; i < path->p_tree_depth; i++) {
2524 		el = path->p_node[i].el;
2525 		idx = le16_to_cpu(el->l_next_free_rec) - 1;
2526 		rec = &el->l_recs[idx];
2527 
2528 		rec->e_int_clusters = cpu_to_le32(range);
2529 		le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos));
2530 
2531 		ocfs2_journal_dirty(handle, path->p_node[i].bh);
2532 	}
2533 out:
2534 	return ret;
2535 }
2536 
2537 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle,
2538 			      struct ocfs2_cached_dealloc_ctxt *dealloc,
2539 			      struct ocfs2_path *path, int unlink_start)
2540 {
2541 	int ret, i;
2542 	struct ocfs2_extent_block *eb;
2543 	struct ocfs2_extent_list *el;
2544 	struct buffer_head *bh;
2545 
2546 	for(i = unlink_start; i < path_num_items(path); i++) {
2547 		bh = path->p_node[i].bh;
2548 
2549 		eb = (struct ocfs2_extent_block *)bh->b_data;
2550 		/*
2551 		 * Not all nodes might have had their final count
2552 		 * decremented by the caller - handle this here.
2553 		 */
2554 		el = &eb->h_list;
2555 		if (le16_to_cpu(el->l_next_free_rec) > 1) {
2556 			mlog(ML_ERROR,
2557 			     "Inode %llu, attempted to remove extent block "
2558 			     "%llu with %u records\n",
2559 			     (unsigned long long)OCFS2_I(inode)->ip_blkno,
2560 			     (unsigned long long)le64_to_cpu(eb->h_blkno),
2561 			     le16_to_cpu(el->l_next_free_rec));
2562 
2563 			ocfs2_journal_dirty(handle, bh);
2564 			ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
2565 			continue;
2566 		}
2567 
2568 		el->l_next_free_rec = 0;
2569 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
2570 
2571 		ocfs2_journal_dirty(handle, bh);
2572 
2573 		ret = ocfs2_cache_extent_block_free(dealloc, eb);
2574 		if (ret)
2575 			mlog_errno(ret);
2576 
2577 		ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
2578 	}
2579 }
2580 
2581 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle,
2582 				 struct ocfs2_path *left_path,
2583 				 struct ocfs2_path *right_path,
2584 				 int subtree_index,
2585 				 struct ocfs2_cached_dealloc_ctxt *dealloc)
2586 {
2587 	int i;
2588 	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
2589 	struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el;
2590 	struct ocfs2_extent_list *el;
2591 	struct ocfs2_extent_block *eb;
2592 
2593 	el = path_leaf_el(left_path);
2594 
2595 	eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data;
2596 
2597 	for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++)
2598 		if (root_el->l_recs[i].e_blkno == eb->h_blkno)
2599 			break;
2600 
2601 	BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec));
2602 
2603 	memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
2604 	le16_add_cpu(&root_el->l_next_free_rec, -1);
2605 
2606 	eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2607 	eb->h_next_leaf_blk = 0;
2608 
2609 	ocfs2_journal_dirty(handle, root_bh);
2610 	ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2611 
2612 	ocfs2_unlink_path(inode, handle, dealloc, right_path,
2613 			  subtree_index + 1);
2614 }
2615 
2616 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
2617 				     struct ocfs2_path *left_path,
2618 				     struct ocfs2_path *right_path,
2619 				     int subtree_index,
2620 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
2621 				     int *deleted,
2622 				     struct ocfs2_extent_tree *et)
2623 {
2624 	int ret, i, del_right_subtree = 0, right_has_empty = 0;
2625 	struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path);
2626 	struct ocfs2_extent_list *right_leaf_el, *left_leaf_el;
2627 	struct ocfs2_extent_block *eb;
2628 
2629 	*deleted = 0;
2630 
2631 	right_leaf_el = path_leaf_el(right_path);
2632 	left_leaf_el = path_leaf_el(left_path);
2633 	root_bh = left_path->p_node[subtree_index].bh;
2634 	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2635 
2636 	if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0]))
2637 		return 0;
2638 
2639 	eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data;
2640 	if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) {
2641 		/*
2642 		 * It's legal for us to proceed if the right leaf is
2643 		 * the rightmost one and it has an empty extent. There
2644 		 * are two cases to handle - whether the leaf will be
2645 		 * empty after removal or not. If the leaf isn't empty
2646 		 * then just remove the empty extent up front. The
2647 		 * next block will handle empty leaves by flagging
2648 		 * them for unlink.
2649 		 *
2650 		 * Non rightmost leaves will throw -EAGAIN and the
2651 		 * caller can manually move the subtree and retry.
2652 		 */
2653 
2654 		if (eb->h_next_leaf_blk != 0ULL)
2655 			return -EAGAIN;
2656 
2657 		if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
2658 			ret = ocfs2_journal_access_eb(handle, INODE_CACHE(inode),
2659 						      path_leaf_bh(right_path),
2660 						      OCFS2_JOURNAL_ACCESS_WRITE);
2661 			if (ret) {
2662 				mlog_errno(ret);
2663 				goto out;
2664 			}
2665 
2666 			ocfs2_remove_empty_extent(right_leaf_el);
2667 		} else
2668 			right_has_empty = 1;
2669 	}
2670 
2671 	if (eb->h_next_leaf_blk == 0ULL &&
2672 	    le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) {
2673 		/*
2674 		 * We have to update i_last_eb_blk during the meta
2675 		 * data delete.
2676 		 */
2677 		ret = ocfs2_et_root_journal_access(handle, et,
2678 						   OCFS2_JOURNAL_ACCESS_WRITE);
2679 		if (ret) {
2680 			mlog_errno(ret);
2681 			goto out;
2682 		}
2683 
2684 		del_right_subtree = 1;
2685 	}
2686 
2687 	/*
2688 	 * Getting here with an empty extent in the right path implies
2689 	 * that it's the rightmost path and will be deleted.
2690 	 */
2691 	BUG_ON(right_has_empty && !del_right_subtree);
2692 
2693 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), right_path,
2694 					   subtree_index);
2695 	if (ret) {
2696 		mlog_errno(ret);
2697 		goto out;
2698 	}
2699 
2700 	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
2701 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
2702 						   right_path, i);
2703 		if (ret) {
2704 			mlog_errno(ret);
2705 			goto out;
2706 		}
2707 
2708 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
2709 						   left_path, i);
2710 		if (ret) {
2711 			mlog_errno(ret);
2712 			goto out;
2713 		}
2714 	}
2715 
2716 	if (!right_has_empty) {
2717 		/*
2718 		 * Only do this if we're moving a real
2719 		 * record. Otherwise, the action is delayed until
2720 		 * after removal of the right path in which case we
2721 		 * can do a simple shift to remove the empty extent.
2722 		 */
2723 		ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]);
2724 		memset(&right_leaf_el->l_recs[0], 0,
2725 		       sizeof(struct ocfs2_extent_rec));
2726 	}
2727 	if (eb->h_next_leaf_blk == 0ULL) {
2728 		/*
2729 		 * Move recs over to get rid of empty extent, decrease
2730 		 * next_free. This is allowed to remove the last
2731 		 * extent in our leaf (setting l_next_free_rec to
2732 		 * zero) - the delete code below won't care.
2733 		 */
2734 		ocfs2_remove_empty_extent(right_leaf_el);
2735 	}
2736 
2737 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
2738 	if (ret)
2739 		mlog_errno(ret);
2740 	ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2741 	if (ret)
2742 		mlog_errno(ret);
2743 
2744 	if (del_right_subtree) {
2745 		ocfs2_unlink_subtree(inode, handle, left_path, right_path,
2746 				     subtree_index, dealloc);
2747 		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
2748 						left_path);
2749 		if (ret) {
2750 			mlog_errno(ret);
2751 			goto out;
2752 		}
2753 
2754 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
2755 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
2756 
2757 		/*
2758 		 * Removal of the extent in the left leaf was skipped
2759 		 * above so we could delete the right path
2760 		 * 1st.
2761 		 */
2762 		if (right_has_empty)
2763 			ocfs2_remove_empty_extent(left_leaf_el);
2764 
2765 		ret = ocfs2_journal_dirty(handle, et_root_bh);
2766 		if (ret)
2767 			mlog_errno(ret);
2768 
2769 		*deleted = 1;
2770 	} else
2771 		ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
2772 					   subtree_index);
2773 
2774 out:
2775 	return ret;
2776 }
2777 
2778 /*
2779  * Given a full path, determine what cpos value would return us a path
2780  * containing the leaf immediately to the right of the current one.
2781  *
2782  * Will return zero if the path passed in is already the rightmost path.
2783  *
2784  * This looks similar, but is subtly different to
2785  * ocfs2_find_cpos_for_left_leaf().
2786  */
2787 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb,
2788 					  struct ocfs2_path *path, u32 *cpos)
2789 {
2790 	int i, j, ret = 0;
2791 	u64 blkno;
2792 	struct ocfs2_extent_list *el;
2793 
2794 	*cpos = 0;
2795 
2796 	if (path->p_tree_depth == 0)
2797 		return 0;
2798 
2799 	blkno = path_leaf_bh(path)->b_blocknr;
2800 
2801 	/* Start at the tree node just above the leaf and work our way up. */
2802 	i = path->p_tree_depth - 1;
2803 	while (i >= 0) {
2804 		int next_free;
2805 
2806 		el = path->p_node[i].el;
2807 
2808 		/*
2809 		 * Find the extent record just after the one in our
2810 		 * path.
2811 		 */
2812 		next_free = le16_to_cpu(el->l_next_free_rec);
2813 		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
2814 			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
2815 				if (j == (next_free - 1)) {
2816 					if (i == 0) {
2817 						/*
2818 						 * We've determined that the
2819 						 * path specified is already
2820 						 * the rightmost one - return a
2821 						 * cpos of zero.
2822 						 */
2823 						goto out;
2824 					}
2825 					/*
2826 					 * The rightmost record points to our
2827 					 * leaf - we need to travel up the
2828 					 * tree one level.
2829 					 */
2830 					goto next_node;
2831 				}
2832 
2833 				*cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos);
2834 				goto out;
2835 			}
2836 		}
2837 
2838 		/*
2839 		 * If we got here, we never found a valid node where
2840 		 * the tree indicated one should be.
2841 		 */
2842 		ocfs2_error(sb,
2843 			    "Invalid extent tree at extent block %llu\n",
2844 			    (unsigned long long)blkno);
2845 		ret = -EROFS;
2846 		goto out;
2847 
2848 next_node:
2849 		blkno = path->p_node[i].bh->b_blocknr;
2850 		i--;
2851 	}
2852 
2853 out:
2854 	return ret;
2855 }
2856 
2857 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
2858 					    handle_t *handle,
2859 					    struct ocfs2_path *path)
2860 {
2861 	int ret;
2862 	struct buffer_head *bh = path_leaf_bh(path);
2863 	struct ocfs2_extent_list *el = path_leaf_el(path);
2864 
2865 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
2866 		return 0;
2867 
2868 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), path,
2869 					   path_num_items(path) - 1);
2870 	if (ret) {
2871 		mlog_errno(ret);
2872 		goto out;
2873 	}
2874 
2875 	ocfs2_remove_empty_extent(el);
2876 
2877 	ret = ocfs2_journal_dirty(handle, bh);
2878 	if (ret)
2879 		mlog_errno(ret);
2880 
2881 out:
2882 	return ret;
2883 }
2884 
2885 static int __ocfs2_rotate_tree_left(struct inode *inode,
2886 				    handle_t *handle, int orig_credits,
2887 				    struct ocfs2_path *path,
2888 				    struct ocfs2_cached_dealloc_ctxt *dealloc,
2889 				    struct ocfs2_path **empty_extent_path,
2890 				    struct ocfs2_extent_tree *et)
2891 {
2892 	int ret, subtree_root, deleted;
2893 	u32 right_cpos;
2894 	struct ocfs2_path *left_path = NULL;
2895 	struct ocfs2_path *right_path = NULL;
2896 
2897 	BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0])));
2898 
2899 	*empty_extent_path = NULL;
2900 
2901 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path,
2902 					     &right_cpos);
2903 	if (ret) {
2904 		mlog_errno(ret);
2905 		goto out;
2906 	}
2907 
2908 	left_path = ocfs2_new_path_from_path(path);
2909 	if (!left_path) {
2910 		ret = -ENOMEM;
2911 		mlog_errno(ret);
2912 		goto out;
2913 	}
2914 
2915 	ocfs2_cp_path(left_path, path);
2916 
2917 	right_path = ocfs2_new_path_from_path(path);
2918 	if (!right_path) {
2919 		ret = -ENOMEM;
2920 		mlog_errno(ret);
2921 		goto out;
2922 	}
2923 
2924 	while (right_cpos) {
2925 		ret = ocfs2_find_path(inode, right_path, right_cpos);
2926 		if (ret) {
2927 			mlog_errno(ret);
2928 			goto out;
2929 		}
2930 
2931 		subtree_root = ocfs2_find_subtree_root(inode, left_path,
2932 						       right_path);
2933 
2934 		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
2935 		     subtree_root,
2936 		     (unsigned long long)
2937 		     right_path->p_node[subtree_root].bh->b_blocknr,
2938 		     right_path->p_tree_depth);
2939 
2940 		ret = ocfs2_extend_rotate_transaction(handle, subtree_root,
2941 						      orig_credits, left_path);
2942 		if (ret) {
2943 			mlog_errno(ret);
2944 			goto out;
2945 		}
2946 
2947 		/*
2948 		 * Caller might still want to make changes to the
2949 		 * tree root, so re-add it to the journal here.
2950 		 */
2951 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
2952 						   left_path, 0);
2953 		if (ret) {
2954 			mlog_errno(ret);
2955 			goto out;
2956 		}
2957 
2958 		ret = ocfs2_rotate_subtree_left(inode, handle, left_path,
2959 						right_path, subtree_root,
2960 						dealloc, &deleted, et);
2961 		if (ret == -EAGAIN) {
2962 			/*
2963 			 * The rotation has to temporarily stop due to
2964 			 * the right subtree having an empty
2965 			 * extent. Pass it back to the caller for a
2966 			 * fixup.
2967 			 */
2968 			*empty_extent_path = right_path;
2969 			right_path = NULL;
2970 			goto out;
2971 		}
2972 		if (ret) {
2973 			mlog_errno(ret);
2974 			goto out;
2975 		}
2976 
2977 		/*
2978 		 * The subtree rotate might have removed records on
2979 		 * the rightmost edge. If so, then rotation is
2980 		 * complete.
2981 		 */
2982 		if (deleted)
2983 			break;
2984 
2985 		ocfs2_mv_path(left_path, right_path);
2986 
2987 		ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2988 						     &right_cpos);
2989 		if (ret) {
2990 			mlog_errno(ret);
2991 			goto out;
2992 		}
2993 	}
2994 
2995 out:
2996 	ocfs2_free_path(right_path);
2997 	ocfs2_free_path(left_path);
2998 
2999 	return ret;
3000 }
3001 
3002 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
3003 				struct ocfs2_path *path,
3004 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3005 				struct ocfs2_extent_tree *et)
3006 {
3007 	int ret, subtree_index;
3008 	u32 cpos;
3009 	struct ocfs2_path *left_path = NULL;
3010 	struct ocfs2_extent_block *eb;
3011 	struct ocfs2_extent_list *el;
3012 
3013 
3014 	ret = ocfs2_et_sanity_check(inode, et);
3015 	if (ret)
3016 		goto out;
3017 	/*
3018 	 * There's two ways we handle this depending on
3019 	 * whether path is the only existing one.
3020 	 */
3021 	ret = ocfs2_extend_rotate_transaction(handle, 0,
3022 					      handle->h_buffer_credits,
3023 					      path);
3024 	if (ret) {
3025 		mlog_errno(ret);
3026 		goto out;
3027 	}
3028 
3029 	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
3030 	if (ret) {
3031 		mlog_errno(ret);
3032 		goto out;
3033 	}
3034 
3035 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
3036 	if (ret) {
3037 		mlog_errno(ret);
3038 		goto out;
3039 	}
3040 
3041 	if (cpos) {
3042 		/*
3043 		 * We have a path to the left of this one - it needs
3044 		 * an update too.
3045 		 */
3046 		left_path = ocfs2_new_path_from_path(path);
3047 		if (!left_path) {
3048 			ret = -ENOMEM;
3049 			mlog_errno(ret);
3050 			goto out;
3051 		}
3052 
3053 		ret = ocfs2_find_path(inode, left_path, cpos);
3054 		if (ret) {
3055 			mlog_errno(ret);
3056 			goto out;
3057 		}
3058 
3059 		ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
3060 		if (ret) {
3061 			mlog_errno(ret);
3062 			goto out;
3063 		}
3064 
3065 		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
3066 
3067 		ocfs2_unlink_subtree(inode, handle, left_path, path,
3068 				     subtree_index, dealloc);
3069 		ret = ocfs2_update_edge_lengths(inode, handle, subtree_index,
3070 						left_path);
3071 		if (ret) {
3072 			mlog_errno(ret);
3073 			goto out;
3074 		}
3075 
3076 		eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data;
3077 		ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno));
3078 	} else {
3079 		/*
3080 		 * 'path' is also the leftmost path which
3081 		 * means it must be the only one. This gets
3082 		 * handled differently because we want to
3083 		 * revert the inode back to having extents
3084 		 * in-line.
3085 		 */
3086 		ocfs2_unlink_path(inode, handle, dealloc, path, 1);
3087 
3088 		el = et->et_root_el;
3089 		el->l_tree_depth = 0;
3090 		el->l_next_free_rec = 0;
3091 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3092 
3093 		ocfs2_et_set_last_eb_blk(et, 0);
3094 	}
3095 
3096 	ocfs2_journal_dirty(handle, path_root_bh(path));
3097 
3098 out:
3099 	ocfs2_free_path(left_path);
3100 	return ret;
3101 }
3102 
3103 /*
3104  * Left rotation of btree records.
3105  *
3106  * In many ways, this is (unsurprisingly) the opposite of right
3107  * rotation. We start at some non-rightmost path containing an empty
3108  * extent in the leaf block. The code works its way to the rightmost
3109  * path by rotating records to the left in every subtree.
3110  *
3111  * This is used by any code which reduces the number of extent records
3112  * in a leaf. After removal, an empty record should be placed in the
3113  * leftmost list position.
3114  *
3115  * This won't handle a length update of the rightmost path records if
3116  * the rightmost tree leaf record is removed so the caller is
3117  * responsible for detecting and correcting that.
3118  */
3119 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle,
3120 				  struct ocfs2_path *path,
3121 				  struct ocfs2_cached_dealloc_ctxt *dealloc,
3122 				  struct ocfs2_extent_tree *et)
3123 {
3124 	int ret, orig_credits = handle->h_buffer_credits;
3125 	struct ocfs2_path *tmp_path = NULL, *restart_path = NULL;
3126 	struct ocfs2_extent_block *eb;
3127 	struct ocfs2_extent_list *el;
3128 
3129 	el = path_leaf_el(path);
3130 	if (!ocfs2_is_empty_extent(&el->l_recs[0]))
3131 		return 0;
3132 
3133 	if (path->p_tree_depth == 0) {
3134 rightmost_no_delete:
3135 		/*
3136 		 * Inline extents. This is trivially handled, so do
3137 		 * it up front.
3138 		 */
3139 		ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
3140 						       path);
3141 		if (ret)
3142 			mlog_errno(ret);
3143 		goto out;
3144 	}
3145 
3146 	/*
3147 	 * Handle rightmost branch now. There's several cases:
3148 	 *  1) simple rotation leaving records in there. That's trivial.
3149 	 *  2) rotation requiring a branch delete - there's no more
3150 	 *     records left. Two cases of this:
3151 	 *     a) There are branches to the left.
3152 	 *     b) This is also the leftmost (the only) branch.
3153 	 *
3154 	 *  1) is handled via ocfs2_rotate_rightmost_leaf_left()
3155 	 *  2a) we need the left branch so that we can update it with the unlink
3156 	 *  2b) we need to bring the inode back to inline extents.
3157 	 */
3158 
3159 	eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
3160 	el = &eb->h_list;
3161 	if (eb->h_next_leaf_blk == 0) {
3162 		/*
3163 		 * This gets a bit tricky if we're going to delete the
3164 		 * rightmost path. Get the other cases out of the way
3165 		 * 1st.
3166 		 */
3167 		if (le16_to_cpu(el->l_next_free_rec) > 1)
3168 			goto rightmost_no_delete;
3169 
3170 		if (le16_to_cpu(el->l_next_free_rec) == 0) {
3171 			ret = -EIO;
3172 			ocfs2_error(inode->i_sb,
3173 				    "Inode %llu has empty extent block at %llu",
3174 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
3175 				    (unsigned long long)le64_to_cpu(eb->h_blkno));
3176 			goto out;
3177 		}
3178 
3179 		/*
3180 		 * XXX: The caller can not trust "path" any more after
3181 		 * this as it will have been deleted. What do we do?
3182 		 *
3183 		 * In theory the rotate-for-merge code will never get
3184 		 * here because it'll always ask for a rotate in a
3185 		 * nonempty list.
3186 		 */
3187 
3188 		ret = ocfs2_remove_rightmost_path(inode, handle, path,
3189 						  dealloc, et);
3190 		if (ret)
3191 			mlog_errno(ret);
3192 		goto out;
3193 	}
3194 
3195 	/*
3196 	 * Now we can loop, remembering the path we get from -EAGAIN
3197 	 * and restarting from there.
3198 	 */
3199 try_rotate:
3200 	ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path,
3201 				       dealloc, &restart_path, et);
3202 	if (ret && ret != -EAGAIN) {
3203 		mlog_errno(ret);
3204 		goto out;
3205 	}
3206 
3207 	while (ret == -EAGAIN) {
3208 		tmp_path = restart_path;
3209 		restart_path = NULL;
3210 
3211 		ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits,
3212 					       tmp_path, dealloc,
3213 					       &restart_path, et);
3214 		if (ret && ret != -EAGAIN) {
3215 			mlog_errno(ret);
3216 			goto out;
3217 		}
3218 
3219 		ocfs2_free_path(tmp_path);
3220 		tmp_path = NULL;
3221 
3222 		if (ret == 0)
3223 			goto try_rotate;
3224 	}
3225 
3226 out:
3227 	ocfs2_free_path(tmp_path);
3228 	ocfs2_free_path(restart_path);
3229 	return ret;
3230 }
3231 
3232 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
3233 				int index)
3234 {
3235 	struct ocfs2_extent_rec *rec = &el->l_recs[index];
3236 	unsigned int size;
3237 
3238 	if (rec->e_leaf_clusters == 0) {
3239 		/*
3240 		 * We consumed all of the merged-from record. An empty
3241 		 * extent cannot exist anywhere but the 1st array
3242 		 * position, so move things over if the merged-from
3243 		 * record doesn't occupy that position.
3244 		 *
3245 		 * This creates a new empty extent so the caller
3246 		 * should be smart enough to have removed any existing
3247 		 * ones.
3248 		 */
3249 		if (index > 0) {
3250 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
3251 			size = index * sizeof(struct ocfs2_extent_rec);
3252 			memmove(&el->l_recs[1], &el->l_recs[0], size);
3253 		}
3254 
3255 		/*
3256 		 * Always memset - the caller doesn't check whether it
3257 		 * created an empty extent, so there could be junk in
3258 		 * the other fields.
3259 		 */
3260 		memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
3261 	}
3262 }
3263 
3264 static int ocfs2_get_right_path(struct inode *inode,
3265 				struct ocfs2_path *left_path,
3266 				struct ocfs2_path **ret_right_path)
3267 {
3268 	int ret;
3269 	u32 right_cpos;
3270 	struct ocfs2_path *right_path = NULL;
3271 	struct ocfs2_extent_list *left_el;
3272 
3273 	*ret_right_path = NULL;
3274 
3275 	/* This function shouldn't be called for non-trees. */
3276 	BUG_ON(left_path->p_tree_depth == 0);
3277 
3278 	left_el = path_leaf_el(left_path);
3279 	BUG_ON(left_el->l_next_free_rec != left_el->l_count);
3280 
3281 	ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
3282 					     &right_cpos);
3283 	if (ret) {
3284 		mlog_errno(ret);
3285 		goto out;
3286 	}
3287 
3288 	/* This function shouldn't be called for the rightmost leaf. */
3289 	BUG_ON(right_cpos == 0);
3290 
3291 	right_path = ocfs2_new_path_from_path(left_path);
3292 	if (!right_path) {
3293 		ret = -ENOMEM;
3294 		mlog_errno(ret);
3295 		goto out;
3296 	}
3297 
3298 	ret = ocfs2_find_path(inode, right_path, right_cpos);
3299 	if (ret) {
3300 		mlog_errno(ret);
3301 		goto out;
3302 	}
3303 
3304 	*ret_right_path = right_path;
3305 out:
3306 	if (ret)
3307 		ocfs2_free_path(right_path);
3308 	return ret;
3309 }
3310 
3311 /*
3312  * Remove split_rec clusters from the record at index and merge them
3313  * onto the beginning of the record "next" to it.
3314  * For index < l_count - 1, the next means the extent rec at index + 1.
3315  * For index == l_count - 1, the "next" means the 1st extent rec of the
3316  * next extent block.
3317  */
3318 static int ocfs2_merge_rec_right(struct inode *inode,
3319 				 struct ocfs2_path *left_path,
3320 				 handle_t *handle,
3321 				 struct ocfs2_extent_rec *split_rec,
3322 				 int index)
3323 {
3324 	int ret, next_free, i;
3325 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3326 	struct ocfs2_extent_rec *left_rec;
3327 	struct ocfs2_extent_rec *right_rec;
3328 	struct ocfs2_extent_list *right_el;
3329 	struct ocfs2_path *right_path = NULL;
3330 	int subtree_index = 0;
3331 	struct ocfs2_extent_list *el = path_leaf_el(left_path);
3332 	struct buffer_head *bh = path_leaf_bh(left_path);
3333 	struct buffer_head *root_bh = NULL;
3334 
3335 	BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
3336 	left_rec = &el->l_recs[index];
3337 
3338 	if (index == le16_to_cpu(el->l_next_free_rec) - 1 &&
3339 	    le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
3340 		/* we meet with a cross extent block merge. */
3341 		ret = ocfs2_get_right_path(inode, left_path, &right_path);
3342 		if (ret) {
3343 			mlog_errno(ret);
3344 			goto out;
3345 		}
3346 
3347 		right_el = path_leaf_el(right_path);
3348 		next_free = le16_to_cpu(right_el->l_next_free_rec);
3349 		BUG_ON(next_free <= 0);
3350 		right_rec = &right_el->l_recs[0];
3351 		if (ocfs2_is_empty_extent(right_rec)) {
3352 			BUG_ON(next_free <= 1);
3353 			right_rec = &right_el->l_recs[1];
3354 		}
3355 
3356 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3357 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3358 		       le32_to_cpu(right_rec->e_cpos));
3359 
3360 		subtree_index = ocfs2_find_subtree_root(inode,
3361 							left_path, right_path);
3362 
3363 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3364 						      handle->h_buffer_credits,
3365 						      right_path);
3366 		if (ret) {
3367 			mlog_errno(ret);
3368 			goto out;
3369 		}
3370 
3371 		root_bh = left_path->p_node[subtree_index].bh;
3372 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3373 
3374 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), right_path,
3375 						   subtree_index);
3376 		if (ret) {
3377 			mlog_errno(ret);
3378 			goto out;
3379 		}
3380 
3381 		for (i = subtree_index + 1;
3382 		     i < path_num_items(right_path); i++) {
3383 			ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
3384 							   right_path, i);
3385 			if (ret) {
3386 				mlog_errno(ret);
3387 				goto out;
3388 			}
3389 
3390 			ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
3391 							   left_path, i);
3392 			if (ret) {
3393 				mlog_errno(ret);
3394 				goto out;
3395 			}
3396 		}
3397 
3398 	} else {
3399 		BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
3400 		right_rec = &el->l_recs[index + 1];
3401 	}
3402 
3403 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), left_path,
3404 					   path_num_items(left_path) - 1);
3405 	if (ret) {
3406 		mlog_errno(ret);
3407 		goto out;
3408 	}
3409 
3410 	le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters);
3411 
3412 	le32_add_cpu(&right_rec->e_cpos, -split_clusters);
3413 	le64_add_cpu(&right_rec->e_blkno,
3414 		     -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3415 	le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters);
3416 
3417 	ocfs2_cleanup_merge(el, index);
3418 
3419 	ret = ocfs2_journal_dirty(handle, bh);
3420 	if (ret)
3421 		mlog_errno(ret);
3422 
3423 	if (right_path) {
3424 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
3425 		if (ret)
3426 			mlog_errno(ret);
3427 
3428 		ocfs2_complete_edge_insert(inode, handle, left_path,
3429 					   right_path, subtree_index);
3430 	}
3431 out:
3432 	if (right_path)
3433 		ocfs2_free_path(right_path);
3434 	return ret;
3435 }
3436 
3437 static int ocfs2_get_left_path(struct inode *inode,
3438 			       struct ocfs2_path *right_path,
3439 			       struct ocfs2_path **ret_left_path)
3440 {
3441 	int ret;
3442 	u32 left_cpos;
3443 	struct ocfs2_path *left_path = NULL;
3444 
3445 	*ret_left_path = NULL;
3446 
3447 	/* This function shouldn't be called for non-trees. */
3448 	BUG_ON(right_path->p_tree_depth == 0);
3449 
3450 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3451 					    right_path, &left_cpos);
3452 	if (ret) {
3453 		mlog_errno(ret);
3454 		goto out;
3455 	}
3456 
3457 	/* This function shouldn't be called for the leftmost leaf. */
3458 	BUG_ON(left_cpos == 0);
3459 
3460 	left_path = ocfs2_new_path_from_path(right_path);
3461 	if (!left_path) {
3462 		ret = -ENOMEM;
3463 		mlog_errno(ret);
3464 		goto out;
3465 	}
3466 
3467 	ret = ocfs2_find_path(inode, left_path, left_cpos);
3468 	if (ret) {
3469 		mlog_errno(ret);
3470 		goto out;
3471 	}
3472 
3473 	*ret_left_path = left_path;
3474 out:
3475 	if (ret)
3476 		ocfs2_free_path(left_path);
3477 	return ret;
3478 }
3479 
3480 /*
3481  * Remove split_rec clusters from the record at index and merge them
3482  * onto the tail of the record "before" it.
3483  * For index > 0, the "before" means the extent rec at index - 1.
3484  *
3485  * For index == 0, the "before" means the last record of the previous
3486  * extent block. And there is also a situation that we may need to
3487  * remove the rightmost leaf extent block in the right_path and change
3488  * the right path to indicate the new rightmost path.
3489  */
3490 static int ocfs2_merge_rec_left(struct inode *inode,
3491 				struct ocfs2_path *right_path,
3492 				handle_t *handle,
3493 				struct ocfs2_extent_rec *split_rec,
3494 				struct ocfs2_cached_dealloc_ctxt *dealloc,
3495 				struct ocfs2_extent_tree *et,
3496 				int index)
3497 {
3498 	int ret, i, subtree_index = 0, has_empty_extent = 0;
3499 	unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
3500 	struct ocfs2_extent_rec *left_rec;
3501 	struct ocfs2_extent_rec *right_rec;
3502 	struct ocfs2_extent_list *el = path_leaf_el(right_path);
3503 	struct buffer_head *bh = path_leaf_bh(right_path);
3504 	struct buffer_head *root_bh = NULL;
3505 	struct ocfs2_path *left_path = NULL;
3506 	struct ocfs2_extent_list *left_el;
3507 
3508 	BUG_ON(index < 0);
3509 
3510 	right_rec = &el->l_recs[index];
3511 	if (index == 0) {
3512 		/* we meet with a cross extent block merge. */
3513 		ret = ocfs2_get_left_path(inode, right_path, &left_path);
3514 		if (ret) {
3515 			mlog_errno(ret);
3516 			goto out;
3517 		}
3518 
3519 		left_el = path_leaf_el(left_path);
3520 		BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
3521 		       le16_to_cpu(left_el->l_count));
3522 
3523 		left_rec = &left_el->l_recs[
3524 				le16_to_cpu(left_el->l_next_free_rec) - 1];
3525 		BUG_ON(le32_to_cpu(left_rec->e_cpos) +
3526 		       le16_to_cpu(left_rec->e_leaf_clusters) !=
3527 		       le32_to_cpu(split_rec->e_cpos));
3528 
3529 		subtree_index = ocfs2_find_subtree_root(inode,
3530 							left_path, right_path);
3531 
3532 		ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
3533 						      handle->h_buffer_credits,
3534 						      left_path);
3535 		if (ret) {
3536 			mlog_errno(ret);
3537 			goto out;
3538 		}
3539 
3540 		root_bh = left_path->p_node[subtree_index].bh;
3541 		BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
3542 
3543 		ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), right_path,
3544 						   subtree_index);
3545 		if (ret) {
3546 			mlog_errno(ret);
3547 			goto out;
3548 		}
3549 
3550 		for (i = subtree_index + 1;
3551 		     i < path_num_items(right_path); i++) {
3552 			ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
3553 							   right_path, i);
3554 			if (ret) {
3555 				mlog_errno(ret);
3556 				goto out;
3557 			}
3558 
3559 			ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode),
3560 							   left_path, i);
3561 			if (ret) {
3562 				mlog_errno(ret);
3563 				goto out;
3564 			}
3565 		}
3566 	} else {
3567 		left_rec = &el->l_recs[index - 1];
3568 		if (ocfs2_is_empty_extent(&el->l_recs[0]))
3569 			has_empty_extent = 1;
3570 	}
3571 
3572 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), right_path,
3573 					   path_num_items(right_path) - 1);
3574 	if (ret) {
3575 		mlog_errno(ret);
3576 		goto out;
3577 	}
3578 
3579 	if (has_empty_extent && index == 1) {
3580 		/*
3581 		 * The easy case - we can just plop the record right in.
3582 		 */
3583 		*left_rec = *split_rec;
3584 
3585 		has_empty_extent = 0;
3586 	} else
3587 		le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
3588 
3589 	le32_add_cpu(&right_rec->e_cpos, split_clusters);
3590 	le64_add_cpu(&right_rec->e_blkno,
3591 		     ocfs2_clusters_to_blocks(inode->i_sb, split_clusters));
3592 	le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters);
3593 
3594 	ocfs2_cleanup_merge(el, index);
3595 
3596 	ret = ocfs2_journal_dirty(handle, bh);
3597 	if (ret)
3598 		mlog_errno(ret);
3599 
3600 	if (left_path) {
3601 		ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3602 		if (ret)
3603 			mlog_errno(ret);
3604 
3605 		/*
3606 		 * In the situation that the right_rec is empty and the extent
3607 		 * block is empty also,  ocfs2_complete_edge_insert can't handle
3608 		 * it and we need to delete the right extent block.
3609 		 */
3610 		if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3611 		    le16_to_cpu(el->l_next_free_rec) == 1) {
3612 
3613 			ret = ocfs2_remove_rightmost_path(inode, handle,
3614 							  right_path,
3615 							  dealloc, et);
3616 			if (ret) {
3617 				mlog_errno(ret);
3618 				goto out;
3619 			}
3620 
3621 			/* Now the rightmost extent block has been deleted.
3622 			 * So we use the new rightmost path.
3623 			 */
3624 			ocfs2_mv_path(right_path, left_path);
3625 			left_path = NULL;
3626 		} else
3627 			ocfs2_complete_edge_insert(inode, handle, left_path,
3628 						   right_path, subtree_index);
3629 	}
3630 out:
3631 	if (left_path)
3632 		ocfs2_free_path(left_path);
3633 	return ret;
3634 }
3635 
3636 static int ocfs2_try_to_merge_extent(struct inode *inode,
3637 				     handle_t *handle,
3638 				     struct ocfs2_path *path,
3639 				     int split_index,
3640 				     struct ocfs2_extent_rec *split_rec,
3641 				     struct ocfs2_cached_dealloc_ctxt *dealloc,
3642 				     struct ocfs2_merge_ctxt *ctxt,
3643 				     struct ocfs2_extent_tree *et)
3644 
3645 {
3646 	int ret = 0;
3647 	struct ocfs2_extent_list *el = path_leaf_el(path);
3648 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
3649 
3650 	BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
3651 
3652 	if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) {
3653 		/*
3654 		 * The merge code will need to create an empty
3655 		 * extent to take the place of the newly
3656 		 * emptied slot. Remove any pre-existing empty
3657 		 * extents - having more than one in a leaf is
3658 		 * illegal.
3659 		 */
3660 		ret = ocfs2_rotate_tree_left(inode, handle, path,
3661 					     dealloc, et);
3662 		if (ret) {
3663 			mlog_errno(ret);
3664 			goto out;
3665 		}
3666 		split_index--;
3667 		rec = &el->l_recs[split_index];
3668 	}
3669 
3670 	if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) {
3671 		/*
3672 		 * Left-right contig implies this.
3673 		 */
3674 		BUG_ON(!ctxt->c_split_covers_rec);
3675 
3676 		/*
3677 		 * Since the leftright insert always covers the entire
3678 		 * extent, this call will delete the insert record
3679 		 * entirely, resulting in an empty extent record added to
3680 		 * the extent block.
3681 		 *
3682 		 * Since the adding of an empty extent shifts
3683 		 * everything back to the right, there's no need to
3684 		 * update split_index here.
3685 		 *
3686 		 * When the split_index is zero, we need to merge it to the
3687 		 * prevoius extent block. It is more efficient and easier
3688 		 * if we do merge_right first and merge_left later.
3689 		 */
3690 		ret = ocfs2_merge_rec_right(inode, path,
3691 					    handle, split_rec,
3692 					    split_index);
3693 		if (ret) {
3694 			mlog_errno(ret);
3695 			goto out;
3696 		}
3697 
3698 		/*
3699 		 * We can only get this from logic error above.
3700 		 */
3701 		BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
3702 
3703 		/* The merge left us with an empty extent, remove it. */
3704 		ret = ocfs2_rotate_tree_left(inode, handle, path,
3705 					     dealloc, et);
3706 		if (ret) {
3707 			mlog_errno(ret);
3708 			goto out;
3709 		}
3710 
3711 		rec = &el->l_recs[split_index];
3712 
3713 		/*
3714 		 * Note that we don't pass split_rec here on purpose -
3715 		 * we've merged it into the rec already.
3716 		 */
3717 		ret = ocfs2_merge_rec_left(inode, path,
3718 					   handle, rec,
3719 					   dealloc, et,
3720 					   split_index);
3721 
3722 		if (ret) {
3723 			mlog_errno(ret);
3724 			goto out;
3725 		}
3726 
3727 		ret = ocfs2_rotate_tree_left(inode, handle, path,
3728 					     dealloc, et);
3729 		/*
3730 		 * Error from this last rotate is not critical, so
3731 		 * print but don't bubble it up.
3732 		 */
3733 		if (ret)
3734 			mlog_errno(ret);
3735 		ret = 0;
3736 	} else {
3737 		/*
3738 		 * Merge a record to the left or right.
3739 		 *
3740 		 * 'contig_type' is relative to the existing record,
3741 		 * so for example, if we're "right contig", it's to
3742 		 * the record on the left (hence the left merge).
3743 		 */
3744 		if (ctxt->c_contig_type == CONTIG_RIGHT) {
3745 			ret = ocfs2_merge_rec_left(inode,
3746 						   path,
3747 						   handle, split_rec,
3748 						   dealloc, et,
3749 						   split_index);
3750 			if (ret) {
3751 				mlog_errno(ret);
3752 				goto out;
3753 			}
3754 		} else {
3755 			ret = ocfs2_merge_rec_right(inode,
3756 						    path,
3757 						    handle, split_rec,
3758 						    split_index);
3759 			if (ret) {
3760 				mlog_errno(ret);
3761 				goto out;
3762 			}
3763 		}
3764 
3765 		if (ctxt->c_split_covers_rec) {
3766 			/*
3767 			 * The merge may have left an empty extent in
3768 			 * our leaf. Try to rotate it away.
3769 			 */
3770 			ret = ocfs2_rotate_tree_left(inode, handle, path,
3771 						     dealloc, et);
3772 			if (ret)
3773 				mlog_errno(ret);
3774 			ret = 0;
3775 		}
3776 	}
3777 
3778 out:
3779 	return ret;
3780 }
3781 
3782 static void ocfs2_subtract_from_rec(struct super_block *sb,
3783 				    enum ocfs2_split_type split,
3784 				    struct ocfs2_extent_rec *rec,
3785 				    struct ocfs2_extent_rec *split_rec)
3786 {
3787 	u64 len_blocks;
3788 
3789 	len_blocks = ocfs2_clusters_to_blocks(sb,
3790 				le16_to_cpu(split_rec->e_leaf_clusters));
3791 
3792 	if (split == SPLIT_LEFT) {
3793 		/*
3794 		 * Region is on the left edge of the existing
3795 		 * record.
3796 		 */
3797 		le32_add_cpu(&rec->e_cpos,
3798 			     le16_to_cpu(split_rec->e_leaf_clusters));
3799 		le64_add_cpu(&rec->e_blkno, len_blocks);
3800 		le16_add_cpu(&rec->e_leaf_clusters,
3801 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3802 	} else {
3803 		/*
3804 		 * Region is on the right edge of the existing
3805 		 * record.
3806 		 */
3807 		le16_add_cpu(&rec->e_leaf_clusters,
3808 			     -le16_to_cpu(split_rec->e_leaf_clusters));
3809 	}
3810 }
3811 
3812 /*
3813  * Do the final bits of extent record insertion at the target leaf
3814  * list. If this leaf is part of an allocation tree, it is assumed
3815  * that the tree above has been prepared.
3816  */
3817 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
3818 				 struct ocfs2_extent_list *el,
3819 				 struct ocfs2_insert_type *insert,
3820 				 struct inode *inode)
3821 {
3822 	int i = insert->ins_contig_index;
3823 	unsigned int range;
3824 	struct ocfs2_extent_rec *rec;
3825 
3826 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
3827 
3828 	if (insert->ins_split != SPLIT_NONE) {
3829 		i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos));
3830 		BUG_ON(i == -1);
3831 		rec = &el->l_recs[i];
3832 		ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec,
3833 					insert_rec);
3834 		goto rotate;
3835 	}
3836 
3837 	/*
3838 	 * Contiguous insert - either left or right.
3839 	 */
3840 	if (insert->ins_contig != CONTIG_NONE) {
3841 		rec = &el->l_recs[i];
3842 		if (insert->ins_contig == CONTIG_LEFT) {
3843 			rec->e_blkno = insert_rec->e_blkno;
3844 			rec->e_cpos = insert_rec->e_cpos;
3845 		}
3846 		le16_add_cpu(&rec->e_leaf_clusters,
3847 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3848 		return;
3849 	}
3850 
3851 	/*
3852 	 * Handle insert into an empty leaf.
3853 	 */
3854 	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
3855 	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
3856 	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
3857 		el->l_recs[0] = *insert_rec;
3858 		el->l_next_free_rec = cpu_to_le16(1);
3859 		return;
3860 	}
3861 
3862 	/*
3863 	 * Appending insert.
3864 	 */
3865 	if (insert->ins_appending == APPEND_TAIL) {
3866 		i = le16_to_cpu(el->l_next_free_rec) - 1;
3867 		rec = &el->l_recs[i];
3868 		range = le32_to_cpu(rec->e_cpos)
3869 			+ le16_to_cpu(rec->e_leaf_clusters);
3870 		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
3871 
3872 		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
3873 				le16_to_cpu(el->l_count),
3874 				"inode %lu, depth %u, count %u, next free %u, "
3875 				"rec.cpos %u, rec.clusters %u, "
3876 				"insert.cpos %u, insert.clusters %u\n",
3877 				inode->i_ino,
3878 				le16_to_cpu(el->l_tree_depth),
3879 				le16_to_cpu(el->l_count),
3880 				le16_to_cpu(el->l_next_free_rec),
3881 				le32_to_cpu(el->l_recs[i].e_cpos),
3882 				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
3883 				le32_to_cpu(insert_rec->e_cpos),
3884 				le16_to_cpu(insert_rec->e_leaf_clusters));
3885 		i++;
3886 		el->l_recs[i] = *insert_rec;
3887 		le16_add_cpu(&el->l_next_free_rec, 1);
3888 		return;
3889 	}
3890 
3891 rotate:
3892 	/*
3893 	 * Ok, we have to rotate.
3894 	 *
3895 	 * At this point, it is safe to assume that inserting into an
3896 	 * empty leaf and appending to a leaf have both been handled
3897 	 * above.
3898 	 *
3899 	 * This leaf needs to have space, either by the empty 1st
3900 	 * extent record, or by virtue of an l_next_rec < l_count.
3901 	 */
3902 	ocfs2_rotate_leaf(el, insert_rec);
3903 }
3904 
3905 static void ocfs2_adjust_rightmost_records(struct inode *inode,
3906 					   handle_t *handle,
3907 					   struct ocfs2_path *path,
3908 					   struct ocfs2_extent_rec *insert_rec)
3909 {
3910 	int ret, i, next_free;
3911 	struct buffer_head *bh;
3912 	struct ocfs2_extent_list *el;
3913 	struct ocfs2_extent_rec *rec;
3914 
3915 	/*
3916 	 * Update everything except the leaf block.
3917 	 */
3918 	for (i = 0; i < path->p_tree_depth; i++) {
3919 		bh = path->p_node[i].bh;
3920 		el = path->p_node[i].el;
3921 
3922 		next_free = le16_to_cpu(el->l_next_free_rec);
3923 		if (next_free == 0) {
3924 			ocfs2_error(inode->i_sb,
3925 				    "Dinode %llu has a bad extent list",
3926 				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
3927 			ret = -EIO;
3928 			return;
3929 		}
3930 
3931 		rec = &el->l_recs[next_free - 1];
3932 
3933 		rec->e_int_clusters = insert_rec->e_cpos;
3934 		le32_add_cpu(&rec->e_int_clusters,
3935 			     le16_to_cpu(insert_rec->e_leaf_clusters));
3936 		le32_add_cpu(&rec->e_int_clusters,
3937 			     -le32_to_cpu(rec->e_cpos));
3938 
3939 		ret = ocfs2_journal_dirty(handle, bh);
3940 		if (ret)
3941 			mlog_errno(ret);
3942 
3943 	}
3944 }
3945 
3946 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
3947 				    struct ocfs2_extent_rec *insert_rec,
3948 				    struct ocfs2_path *right_path,
3949 				    struct ocfs2_path **ret_left_path)
3950 {
3951 	int ret, next_free;
3952 	struct ocfs2_extent_list *el;
3953 	struct ocfs2_path *left_path = NULL;
3954 
3955 	*ret_left_path = NULL;
3956 
3957 	/*
3958 	 * This shouldn't happen for non-trees. The extent rec cluster
3959 	 * count manipulation below only works for interior nodes.
3960 	 */
3961 	BUG_ON(right_path->p_tree_depth == 0);
3962 
3963 	/*
3964 	 * If our appending insert is at the leftmost edge of a leaf,
3965 	 * then we might need to update the rightmost records of the
3966 	 * neighboring path.
3967 	 */
3968 	el = path_leaf_el(right_path);
3969 	next_free = le16_to_cpu(el->l_next_free_rec);
3970 	if (next_free == 0 ||
3971 	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
3972 		u32 left_cpos;
3973 
3974 		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
3975 						    &left_cpos);
3976 		if (ret) {
3977 			mlog_errno(ret);
3978 			goto out;
3979 		}
3980 
3981 		mlog(0, "Append may need a left path update. cpos: %u, "
3982 		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
3983 		     left_cpos);
3984 
3985 		/*
3986 		 * No need to worry if the append is already in the
3987 		 * leftmost leaf.
3988 		 */
3989 		if (left_cpos) {
3990 			left_path = ocfs2_new_path_from_path(right_path);
3991 			if (!left_path) {
3992 				ret = -ENOMEM;
3993 				mlog_errno(ret);
3994 				goto out;
3995 			}
3996 
3997 			ret = ocfs2_find_path(inode, left_path, left_cpos);
3998 			if (ret) {
3999 				mlog_errno(ret);
4000 				goto out;
4001 			}
4002 
4003 			/*
4004 			 * ocfs2_insert_path() will pass the left_path to the
4005 			 * journal for us.
4006 			 */
4007 		}
4008 	}
4009 
4010 	ret = ocfs2_journal_access_path(INODE_CACHE(inode), handle, right_path);
4011 	if (ret) {
4012 		mlog_errno(ret);
4013 		goto out;
4014 	}
4015 
4016 	ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec);
4017 
4018 	*ret_left_path = left_path;
4019 	ret = 0;
4020 out:
4021 	if (ret != 0)
4022 		ocfs2_free_path(left_path);
4023 
4024 	return ret;
4025 }
4026 
4027 static void ocfs2_split_record(struct inode *inode,
4028 			       struct ocfs2_path *left_path,
4029 			       struct ocfs2_path *right_path,
4030 			       struct ocfs2_extent_rec *split_rec,
4031 			       enum ocfs2_split_type split)
4032 {
4033 	int index;
4034 	u32 cpos = le32_to_cpu(split_rec->e_cpos);
4035 	struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el;
4036 	struct ocfs2_extent_rec *rec, *tmprec;
4037 
4038 	right_el = path_leaf_el(right_path);
4039 	if (left_path)
4040 		left_el = path_leaf_el(left_path);
4041 
4042 	el = right_el;
4043 	insert_el = right_el;
4044 	index = ocfs2_search_extent_list(el, cpos);
4045 	if (index != -1) {
4046 		if (index == 0 && left_path) {
4047 			BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0]));
4048 
4049 			/*
4050 			 * This typically means that the record
4051 			 * started in the left path but moved to the
4052 			 * right as a result of rotation. We either
4053 			 * move the existing record to the left, or we
4054 			 * do the later insert there.
4055 			 *
4056 			 * In this case, the left path should always
4057 			 * exist as the rotate code will have passed
4058 			 * it back for a post-insert update.
4059 			 */
4060 
4061 			if (split == SPLIT_LEFT) {
4062 				/*
4063 				 * It's a left split. Since we know
4064 				 * that the rotate code gave us an
4065 				 * empty extent in the left path, we
4066 				 * can just do the insert there.
4067 				 */
4068 				insert_el = left_el;
4069 			} else {
4070 				/*
4071 				 * Right split - we have to move the
4072 				 * existing record over to the left
4073 				 * leaf. The insert will be into the
4074 				 * newly created empty extent in the
4075 				 * right leaf.
4076 				 */
4077 				tmprec = &right_el->l_recs[index];
4078 				ocfs2_rotate_leaf(left_el, tmprec);
4079 				el = left_el;
4080 
4081 				memset(tmprec, 0, sizeof(*tmprec));
4082 				index = ocfs2_search_extent_list(left_el, cpos);
4083 				BUG_ON(index == -1);
4084 			}
4085 		}
4086 	} else {
4087 		BUG_ON(!left_path);
4088 		BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0]));
4089 		/*
4090 		 * Left path is easy - we can just allow the insert to
4091 		 * happen.
4092 		 */
4093 		el = left_el;
4094 		insert_el = left_el;
4095 		index = ocfs2_search_extent_list(el, cpos);
4096 		BUG_ON(index == -1);
4097 	}
4098 
4099 	rec = &el->l_recs[index];
4100 	ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec);
4101 	ocfs2_rotate_leaf(insert_el, split_rec);
4102 }
4103 
4104 /*
4105  * This function only does inserts on an allocation b-tree. For tree
4106  * depth = 0, ocfs2_insert_at_leaf() is called directly.
4107  *
4108  * right_path is the path we want to do the actual insert
4109  * in. left_path should only be passed in if we need to update that
4110  * portion of the tree after an edge insert.
4111  */
4112 static int ocfs2_insert_path(struct inode *inode,
4113 			     handle_t *handle,
4114 			     struct ocfs2_path *left_path,
4115 			     struct ocfs2_path *right_path,
4116 			     struct ocfs2_extent_rec *insert_rec,
4117 			     struct ocfs2_insert_type *insert)
4118 {
4119 	int ret, subtree_index;
4120 	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
4121 
4122 	if (left_path) {
4123 		int credits = handle->h_buffer_credits;
4124 
4125 		/*
4126 		 * There's a chance that left_path got passed back to
4127 		 * us without being accounted for in the
4128 		 * journal. Extend our transaction here to be sure we
4129 		 * can change those blocks.
4130 		 */
4131 		credits += left_path->p_tree_depth;
4132 
4133 		ret = ocfs2_extend_trans(handle, credits);
4134 		if (ret < 0) {
4135 			mlog_errno(ret);
4136 			goto out;
4137 		}
4138 
4139 		ret = ocfs2_journal_access_path(INODE_CACHE(inode), handle, left_path);
4140 		if (ret < 0) {
4141 			mlog_errno(ret);
4142 			goto out;
4143 		}
4144 	}
4145 
4146 	/*
4147 	 * Pass both paths to the journal. The majority of inserts
4148 	 * will be touching all components anyway.
4149 	 */
4150 	ret = ocfs2_journal_access_path(INODE_CACHE(inode), handle, right_path);
4151 	if (ret < 0) {
4152 		mlog_errno(ret);
4153 		goto out;
4154 	}
4155 
4156 	if (insert->ins_split != SPLIT_NONE) {
4157 		/*
4158 		 * We could call ocfs2_insert_at_leaf() for some types
4159 		 * of splits, but it's easier to just let one separate
4160 		 * function sort it all out.
4161 		 */
4162 		ocfs2_split_record(inode, left_path, right_path,
4163 				   insert_rec, insert->ins_split);
4164 
4165 		/*
4166 		 * Split might have modified either leaf and we don't
4167 		 * have a guarantee that the later edge insert will
4168 		 * dirty this for us.
4169 		 */
4170 		if (left_path)
4171 			ret = ocfs2_journal_dirty(handle,
4172 						  path_leaf_bh(left_path));
4173 			if (ret)
4174 				mlog_errno(ret);
4175 	} else
4176 		ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path),
4177 				     insert, inode);
4178 
4179 	ret = ocfs2_journal_dirty(handle, leaf_bh);
4180 	if (ret)
4181 		mlog_errno(ret);
4182 
4183 	if (left_path) {
4184 		/*
4185 		 * The rotate code has indicated that we need to fix
4186 		 * up portions of the tree after the insert.
4187 		 *
4188 		 * XXX: Should we extend the transaction here?
4189 		 */
4190 		subtree_index = ocfs2_find_subtree_root(inode, left_path,
4191 							right_path);
4192 		ocfs2_complete_edge_insert(inode, handle, left_path,
4193 					   right_path, subtree_index);
4194 	}
4195 
4196 	ret = 0;
4197 out:
4198 	return ret;
4199 }
4200 
4201 static int ocfs2_do_insert_extent(struct inode *inode,
4202 				  handle_t *handle,
4203 				  struct ocfs2_extent_tree *et,
4204 				  struct ocfs2_extent_rec *insert_rec,
4205 				  struct ocfs2_insert_type *type)
4206 {
4207 	int ret, rotate = 0;
4208 	u32 cpos;
4209 	struct ocfs2_path *right_path = NULL;
4210 	struct ocfs2_path *left_path = NULL;
4211 	struct ocfs2_extent_list *el;
4212 
4213 	el = et->et_root_el;
4214 
4215 	ret = ocfs2_et_root_journal_access(handle, et,
4216 					   OCFS2_JOURNAL_ACCESS_WRITE);
4217 	if (ret) {
4218 		mlog_errno(ret);
4219 		goto out;
4220 	}
4221 
4222 	if (le16_to_cpu(el->l_tree_depth) == 0) {
4223 		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
4224 		goto out_update_clusters;
4225 	}
4226 
4227 	right_path = ocfs2_new_path_from_et(et);
4228 	if (!right_path) {
4229 		ret = -ENOMEM;
4230 		mlog_errno(ret);
4231 		goto out;
4232 	}
4233 
4234 	/*
4235 	 * Determine the path to start with. Rotations need the
4236 	 * rightmost path, everything else can go directly to the
4237 	 * target leaf.
4238 	 */
4239 	cpos = le32_to_cpu(insert_rec->e_cpos);
4240 	if (type->ins_appending == APPEND_NONE &&
4241 	    type->ins_contig == CONTIG_NONE) {
4242 		rotate = 1;
4243 		cpos = UINT_MAX;
4244 	}
4245 
4246 	ret = ocfs2_find_path(inode, right_path, cpos);
4247 	if (ret) {
4248 		mlog_errno(ret);
4249 		goto out;
4250 	}
4251 
4252 	/*
4253 	 * Rotations and appends need special treatment - they modify
4254 	 * parts of the tree's above them.
4255 	 *
4256 	 * Both might pass back a path immediate to the left of the
4257 	 * one being inserted to. This will be cause
4258 	 * ocfs2_insert_path() to modify the rightmost records of
4259 	 * left_path to account for an edge insert.
4260 	 *
4261 	 * XXX: When modifying this code, keep in mind that an insert
4262 	 * can wind up skipping both of these two special cases...
4263 	 */
4264 	if (rotate) {
4265 		ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split,
4266 					      le32_to_cpu(insert_rec->e_cpos),
4267 					      right_path, &left_path);
4268 		if (ret) {
4269 			mlog_errno(ret);
4270 			goto out;
4271 		}
4272 
4273 		/*
4274 		 * ocfs2_rotate_tree_right() might have extended the
4275 		 * transaction without re-journaling our tree root.
4276 		 */
4277 		ret = ocfs2_et_root_journal_access(handle, et,
4278 						   OCFS2_JOURNAL_ACCESS_WRITE);
4279 		if (ret) {
4280 			mlog_errno(ret);
4281 			goto out;
4282 		}
4283 	} else if (type->ins_appending == APPEND_TAIL
4284 		   && type->ins_contig != CONTIG_LEFT) {
4285 		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
4286 					       right_path, &left_path);
4287 		if (ret) {
4288 			mlog_errno(ret);
4289 			goto out;
4290 		}
4291 	}
4292 
4293 	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
4294 				insert_rec, type);
4295 	if (ret) {
4296 		mlog_errno(ret);
4297 		goto out;
4298 	}
4299 
4300 out_update_clusters:
4301 	if (type->ins_split == SPLIT_NONE)
4302 		ocfs2_et_update_clusters(inode, et,
4303 					 le16_to_cpu(insert_rec->e_leaf_clusters));
4304 
4305 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
4306 	if (ret)
4307 		mlog_errno(ret);
4308 
4309 out:
4310 	ocfs2_free_path(left_path);
4311 	ocfs2_free_path(right_path);
4312 
4313 	return ret;
4314 }
4315 
4316 static enum ocfs2_contig_type
4317 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
4318 			       struct ocfs2_extent_list *el, int index,
4319 			       struct ocfs2_extent_rec *split_rec)
4320 {
4321 	int status;
4322 	enum ocfs2_contig_type ret = CONTIG_NONE;
4323 	u32 left_cpos, right_cpos;
4324 	struct ocfs2_extent_rec *rec = NULL;
4325 	struct ocfs2_extent_list *new_el;
4326 	struct ocfs2_path *left_path = NULL, *right_path = NULL;
4327 	struct buffer_head *bh;
4328 	struct ocfs2_extent_block *eb;
4329 
4330 	if (index > 0) {
4331 		rec = &el->l_recs[index - 1];
4332 	} else if (path->p_tree_depth > 0) {
4333 		status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
4334 						       path, &left_cpos);
4335 		if (status)
4336 			goto out;
4337 
4338 		if (left_cpos != 0) {
4339 			left_path = ocfs2_new_path_from_path(path);
4340 			if (!left_path)
4341 				goto out;
4342 
4343 			status = ocfs2_find_path(inode, left_path, left_cpos);
4344 			if (status)
4345 				goto out;
4346 
4347 			new_el = path_leaf_el(left_path);
4348 
4349 			if (le16_to_cpu(new_el->l_next_free_rec) !=
4350 			    le16_to_cpu(new_el->l_count)) {
4351 				bh = path_leaf_bh(left_path);
4352 				eb = (struct ocfs2_extent_block *)bh->b_data;
4353 				ocfs2_error(inode->i_sb,
4354 					    "Extent block #%llu has an "
4355 					    "invalid l_next_free_rec of "
4356 					    "%d.  It should have "
4357 					    "matched the l_count of %d",
4358 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4359 					    le16_to_cpu(new_el->l_next_free_rec),
4360 					    le16_to_cpu(new_el->l_count));
4361 				status = -EINVAL;
4362 				goto out;
4363 			}
4364 			rec = &new_el->l_recs[
4365 				le16_to_cpu(new_el->l_next_free_rec) - 1];
4366 		}
4367 	}
4368 
4369 	/*
4370 	 * We're careful to check for an empty extent record here -
4371 	 * the merge code will know what to do if it sees one.
4372 	 */
4373 	if (rec) {
4374 		if (index == 1 && ocfs2_is_empty_extent(rec)) {
4375 			if (split_rec->e_cpos == el->l_recs[index].e_cpos)
4376 				ret = CONTIG_RIGHT;
4377 		} else {
4378 			ret = ocfs2_extent_contig(inode, rec, split_rec);
4379 		}
4380 	}
4381 
4382 	rec = NULL;
4383 	if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
4384 		rec = &el->l_recs[index + 1];
4385 	else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
4386 		 path->p_tree_depth > 0) {
4387 		status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
4388 							path, &right_cpos);
4389 		if (status)
4390 			goto out;
4391 
4392 		if (right_cpos == 0)
4393 			goto out;
4394 
4395 		right_path = ocfs2_new_path_from_path(path);
4396 		if (!right_path)
4397 			goto out;
4398 
4399 		status = ocfs2_find_path(inode, right_path, right_cpos);
4400 		if (status)
4401 			goto out;
4402 
4403 		new_el = path_leaf_el(right_path);
4404 		rec = &new_el->l_recs[0];
4405 		if (ocfs2_is_empty_extent(rec)) {
4406 			if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
4407 				bh = path_leaf_bh(right_path);
4408 				eb = (struct ocfs2_extent_block *)bh->b_data;
4409 				ocfs2_error(inode->i_sb,
4410 					    "Extent block #%llu has an "
4411 					    "invalid l_next_free_rec of %d",
4412 					    (unsigned long long)le64_to_cpu(eb->h_blkno),
4413 					    le16_to_cpu(new_el->l_next_free_rec));
4414 				status = -EINVAL;
4415 				goto out;
4416 			}
4417 			rec = &new_el->l_recs[1];
4418 		}
4419 	}
4420 
4421 	if (rec) {
4422 		enum ocfs2_contig_type contig_type;
4423 
4424 		contig_type = ocfs2_extent_contig(inode, rec, split_rec);
4425 
4426 		if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
4427 			ret = CONTIG_LEFTRIGHT;
4428 		else if (ret == CONTIG_NONE)
4429 			ret = contig_type;
4430 	}
4431 
4432 out:
4433 	if (left_path)
4434 		ocfs2_free_path(left_path);
4435 	if (right_path)
4436 		ocfs2_free_path(right_path);
4437 
4438 	return ret;
4439 }
4440 
4441 static void ocfs2_figure_contig_type(struct inode *inode,
4442 				     struct ocfs2_insert_type *insert,
4443 				     struct ocfs2_extent_list *el,
4444 				     struct ocfs2_extent_rec *insert_rec,
4445 				     struct ocfs2_extent_tree *et)
4446 {
4447 	int i;
4448 	enum ocfs2_contig_type contig_type = CONTIG_NONE;
4449 
4450 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4451 
4452 	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
4453 		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
4454 						  insert_rec);
4455 		if (contig_type != CONTIG_NONE) {
4456 			insert->ins_contig_index = i;
4457 			break;
4458 		}
4459 	}
4460 	insert->ins_contig = contig_type;
4461 
4462 	if (insert->ins_contig != CONTIG_NONE) {
4463 		struct ocfs2_extent_rec *rec =
4464 				&el->l_recs[insert->ins_contig_index];
4465 		unsigned int len = le16_to_cpu(rec->e_leaf_clusters) +
4466 				   le16_to_cpu(insert_rec->e_leaf_clusters);
4467 
4468 		/*
4469 		 * Caller might want us to limit the size of extents, don't
4470 		 * calculate contiguousness if we might exceed that limit.
4471 		 */
4472 		if (et->et_max_leaf_clusters &&
4473 		    (len > et->et_max_leaf_clusters))
4474 			insert->ins_contig = CONTIG_NONE;
4475 	}
4476 }
4477 
4478 /*
4479  * This should only be called against the righmost leaf extent list.
4480  *
4481  * ocfs2_figure_appending_type() will figure out whether we'll have to
4482  * insert at the tail of the rightmost leaf.
4483  *
4484  * This should also work against the root extent list for tree's with 0
4485  * depth. If we consider the root extent list to be the rightmost leaf node
4486  * then the logic here makes sense.
4487  */
4488 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
4489 					struct ocfs2_extent_list *el,
4490 					struct ocfs2_extent_rec *insert_rec)
4491 {
4492 	int i;
4493 	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
4494 	struct ocfs2_extent_rec *rec;
4495 
4496 	insert->ins_appending = APPEND_NONE;
4497 
4498 	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
4499 
4500 	if (!el->l_next_free_rec)
4501 		goto set_tail_append;
4502 
4503 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
4504 		/* Were all records empty? */
4505 		if (le16_to_cpu(el->l_next_free_rec) == 1)
4506 			goto set_tail_append;
4507 	}
4508 
4509 	i = le16_to_cpu(el->l_next_free_rec) - 1;
4510 	rec = &el->l_recs[i];
4511 
4512 	if (cpos >=
4513 	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
4514 		goto set_tail_append;
4515 
4516 	return;
4517 
4518 set_tail_append:
4519 	insert->ins_appending = APPEND_TAIL;
4520 }
4521 
4522 /*
4523  * Helper function called at the begining of an insert.
4524  *
4525  * This computes a few things that are commonly used in the process of
4526  * inserting into the btree:
4527  *   - Whether the new extent is contiguous with an existing one.
4528  *   - The current tree depth.
4529  *   - Whether the insert is an appending one.
4530  *   - The total # of free records in the tree.
4531  *
4532  * All of the information is stored on the ocfs2_insert_type
4533  * structure.
4534  */
4535 static int ocfs2_figure_insert_type(struct inode *inode,
4536 				    struct ocfs2_extent_tree *et,
4537 				    struct buffer_head **last_eb_bh,
4538 				    struct ocfs2_extent_rec *insert_rec,
4539 				    int *free_records,
4540 				    struct ocfs2_insert_type *insert)
4541 {
4542 	int ret;
4543 	struct ocfs2_extent_block *eb;
4544 	struct ocfs2_extent_list *el;
4545 	struct ocfs2_path *path = NULL;
4546 	struct buffer_head *bh = NULL;
4547 
4548 	insert->ins_split = SPLIT_NONE;
4549 
4550 	el = et->et_root_el;
4551 	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
4552 
4553 	if (el->l_tree_depth) {
4554 		/*
4555 		 * If we have tree depth, we read in the
4556 		 * rightmost extent block ahead of time as
4557 		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
4558 		 * may want it later.
4559 		 */
4560 		ret = ocfs2_read_extent_block(inode,
4561 					      ocfs2_et_get_last_eb_blk(et),
4562 					      &bh);
4563 		if (ret) {
4564 			mlog_exit(ret);
4565 			goto out;
4566 		}
4567 		eb = (struct ocfs2_extent_block *) bh->b_data;
4568 		el = &eb->h_list;
4569 	}
4570 
4571 	/*
4572 	 * Unless we have a contiguous insert, we'll need to know if
4573 	 * there is room left in our allocation tree for another
4574 	 * extent record.
4575 	 *
4576 	 * XXX: This test is simplistic, we can search for empty
4577 	 * extent records too.
4578 	 */
4579 	*free_records = le16_to_cpu(el->l_count) -
4580 		le16_to_cpu(el->l_next_free_rec);
4581 
4582 	if (!insert->ins_tree_depth) {
4583 		ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4584 		ocfs2_figure_appending_type(insert, el, insert_rec);
4585 		return 0;
4586 	}
4587 
4588 	path = ocfs2_new_path_from_et(et);
4589 	if (!path) {
4590 		ret = -ENOMEM;
4591 		mlog_errno(ret);
4592 		goto out;
4593 	}
4594 
4595 	/*
4596 	 * In the case that we're inserting past what the tree
4597 	 * currently accounts for, ocfs2_find_path() will return for
4598 	 * us the rightmost tree path. This is accounted for below in
4599 	 * the appending code.
4600 	 */
4601 	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
4602 	if (ret) {
4603 		mlog_errno(ret);
4604 		goto out;
4605 	}
4606 
4607 	el = path_leaf_el(path);
4608 
4609 	/*
4610 	 * Now that we have the path, there's two things we want to determine:
4611 	 * 1) Contiguousness (also set contig_index if this is so)
4612 	 *
4613 	 * 2) Are we doing an append? We can trivially break this up
4614          *     into two types of appends: simple record append, or a
4615          *     rotate inside the tail leaf.
4616 	 */
4617 	ocfs2_figure_contig_type(inode, insert, el, insert_rec, et);
4618 
4619 	/*
4620 	 * The insert code isn't quite ready to deal with all cases of
4621 	 * left contiguousness. Specifically, if it's an insert into
4622 	 * the 1st record in a leaf, it will require the adjustment of
4623 	 * cluster count on the last record of the path directly to it's
4624 	 * left. For now, just catch that case and fool the layers
4625 	 * above us. This works just fine for tree_depth == 0, which
4626 	 * is why we allow that above.
4627 	 */
4628 	if (insert->ins_contig == CONTIG_LEFT &&
4629 	    insert->ins_contig_index == 0)
4630 		insert->ins_contig = CONTIG_NONE;
4631 
4632 	/*
4633 	 * Ok, so we can simply compare against last_eb to figure out
4634 	 * whether the path doesn't exist. This will only happen in
4635 	 * the case that we're doing a tail append, so maybe we can
4636 	 * take advantage of that information somehow.
4637 	 */
4638 	if (ocfs2_et_get_last_eb_blk(et) ==
4639 	    path_leaf_bh(path)->b_blocknr) {
4640 		/*
4641 		 * Ok, ocfs2_find_path() returned us the rightmost
4642 		 * tree path. This might be an appending insert. There are
4643 		 * two cases:
4644 		 *    1) We're doing a true append at the tail:
4645 		 *	-This might even be off the end of the leaf
4646 		 *    2) We're "appending" by rotating in the tail
4647 		 */
4648 		ocfs2_figure_appending_type(insert, el, insert_rec);
4649 	}
4650 
4651 out:
4652 	ocfs2_free_path(path);
4653 
4654 	if (ret == 0)
4655 		*last_eb_bh = bh;
4656 	else
4657 		brelse(bh);
4658 	return ret;
4659 }
4660 
4661 /*
4662  * Insert an extent into an inode btree.
4663  *
4664  * The caller needs to update fe->i_clusters
4665  */
4666 int ocfs2_insert_extent(struct ocfs2_super *osb,
4667 			handle_t *handle,
4668 			struct inode *inode,
4669 			struct ocfs2_extent_tree *et,
4670 			u32 cpos,
4671 			u64 start_blk,
4672 			u32 new_clusters,
4673 			u8 flags,
4674 			struct ocfs2_alloc_context *meta_ac)
4675 {
4676 	int status;
4677 	int uninitialized_var(free_records);
4678 	struct buffer_head *last_eb_bh = NULL;
4679 	struct ocfs2_insert_type insert = {0, };
4680 	struct ocfs2_extent_rec rec;
4681 
4682 	mlog(0, "add %u clusters at position %u to inode %llu\n",
4683 	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4684 
4685 	memset(&rec, 0, sizeof(rec));
4686 	rec.e_cpos = cpu_to_le32(cpos);
4687 	rec.e_blkno = cpu_to_le64(start_blk);
4688 	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
4689 	rec.e_flags = flags;
4690 	status = ocfs2_et_insert_check(inode, et, &rec);
4691 	if (status) {
4692 		mlog_errno(status);
4693 		goto bail;
4694 	}
4695 
4696 	status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec,
4697 					  &free_records, &insert);
4698 	if (status < 0) {
4699 		mlog_errno(status);
4700 		goto bail;
4701 	}
4702 
4703 	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
4704 	     "Insert.contig_index: %d, Insert.free_records: %d, "
4705 	     "Insert.tree_depth: %d\n",
4706 	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
4707 	     free_records, insert.ins_tree_depth);
4708 
4709 	if (insert.ins_contig == CONTIG_NONE && free_records == 0) {
4710 		status = ocfs2_grow_tree(inode, handle, et,
4711 					 &insert.ins_tree_depth, &last_eb_bh,
4712 					 meta_ac);
4713 		if (status) {
4714 			mlog_errno(status);
4715 			goto bail;
4716 		}
4717 	}
4718 
4719 	/* Finally, we can add clusters. This might rotate the tree for us. */
4720 	status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert);
4721 	if (status < 0)
4722 		mlog_errno(status);
4723 	else if (et->et_ops == &ocfs2_dinode_et_ops)
4724 		ocfs2_extent_map_insert_rec(inode, &rec);
4725 
4726 bail:
4727 	brelse(last_eb_bh);
4728 
4729 	mlog_exit(status);
4730 	return status;
4731 }
4732 
4733 /*
4734  * Allcate and add clusters into the extent b-tree.
4735  * The new clusters(clusters_to_add) will be inserted at logical_offset.
4736  * The extent b-tree's root is specified by et, and
4737  * it is not limited to the file storage. Any extent tree can use this
4738  * function if it implements the proper ocfs2_extent_tree.
4739  */
4740 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
4741 				struct inode *inode,
4742 				u32 *logical_offset,
4743 				u32 clusters_to_add,
4744 				int mark_unwritten,
4745 				struct ocfs2_extent_tree *et,
4746 				handle_t *handle,
4747 				struct ocfs2_alloc_context *data_ac,
4748 				struct ocfs2_alloc_context *meta_ac,
4749 				enum ocfs2_alloc_restarted *reason_ret)
4750 {
4751 	int status = 0;
4752 	int free_extents;
4753 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
4754 	u32 bit_off, num_bits;
4755 	u64 block;
4756 	u8 flags = 0;
4757 
4758 	BUG_ON(!clusters_to_add);
4759 
4760 	if (mark_unwritten)
4761 		flags = OCFS2_EXT_UNWRITTEN;
4762 
4763 	free_extents = ocfs2_num_free_extents(osb, inode, et);
4764 	if (free_extents < 0) {
4765 		status = free_extents;
4766 		mlog_errno(status);
4767 		goto leave;
4768 	}
4769 
4770 	/* there are two cases which could cause us to EAGAIN in the
4771 	 * we-need-more-metadata case:
4772 	 * 1) we haven't reserved *any*
4773 	 * 2) we are so fragmented, we've needed to add metadata too
4774 	 *    many times. */
4775 	if (!free_extents && !meta_ac) {
4776 		mlog(0, "we haven't reserved any metadata!\n");
4777 		status = -EAGAIN;
4778 		reason = RESTART_META;
4779 		goto leave;
4780 	} else if ((!free_extents)
4781 		   && (ocfs2_alloc_context_bits_left(meta_ac)
4782 		       < ocfs2_extend_meta_needed(et->et_root_el))) {
4783 		mlog(0, "filesystem is really fragmented...\n");
4784 		status = -EAGAIN;
4785 		reason = RESTART_META;
4786 		goto leave;
4787 	}
4788 
4789 	status = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
4790 					clusters_to_add, &bit_off, &num_bits);
4791 	if (status < 0) {
4792 		if (status != -ENOSPC)
4793 			mlog_errno(status);
4794 		goto leave;
4795 	}
4796 
4797 	BUG_ON(num_bits > clusters_to_add);
4798 
4799 	/* reserve our write early -- insert_extent may update the tree root */
4800 	status = ocfs2_et_root_journal_access(handle, et,
4801 					      OCFS2_JOURNAL_ACCESS_WRITE);
4802 	if (status < 0) {
4803 		mlog_errno(status);
4804 		goto leave;
4805 	}
4806 
4807 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
4808 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
4809 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
4810 	status = ocfs2_insert_extent(osb, handle, inode, et,
4811 				     *logical_offset, block,
4812 				     num_bits, flags, meta_ac);
4813 	if (status < 0) {
4814 		mlog_errno(status);
4815 		goto leave;
4816 	}
4817 
4818 	status = ocfs2_journal_dirty(handle, et->et_root_bh);
4819 	if (status < 0) {
4820 		mlog_errno(status);
4821 		goto leave;
4822 	}
4823 
4824 	clusters_to_add -= num_bits;
4825 	*logical_offset += num_bits;
4826 
4827 	if (clusters_to_add) {
4828 		mlog(0, "need to alloc once more, wanted = %u\n",
4829 		     clusters_to_add);
4830 		status = -EAGAIN;
4831 		reason = RESTART_TRANS;
4832 	}
4833 
4834 leave:
4835 	mlog_exit(status);
4836 	if (reason_ret)
4837 		*reason_ret = reason;
4838 	return status;
4839 }
4840 
4841 static void ocfs2_make_right_split_rec(struct super_block *sb,
4842 				       struct ocfs2_extent_rec *split_rec,
4843 				       u32 cpos,
4844 				       struct ocfs2_extent_rec *rec)
4845 {
4846 	u32 rec_cpos = le32_to_cpu(rec->e_cpos);
4847 	u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters);
4848 
4849 	memset(split_rec, 0, sizeof(struct ocfs2_extent_rec));
4850 
4851 	split_rec->e_cpos = cpu_to_le32(cpos);
4852 	split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos);
4853 
4854 	split_rec->e_blkno = rec->e_blkno;
4855 	le64_add_cpu(&split_rec->e_blkno,
4856 		     ocfs2_clusters_to_blocks(sb, cpos - rec_cpos));
4857 
4858 	split_rec->e_flags = rec->e_flags;
4859 }
4860 
4861 static int ocfs2_split_and_insert(struct inode *inode,
4862 				  handle_t *handle,
4863 				  struct ocfs2_path *path,
4864 				  struct ocfs2_extent_tree *et,
4865 				  struct buffer_head **last_eb_bh,
4866 				  int split_index,
4867 				  struct ocfs2_extent_rec *orig_split_rec,
4868 				  struct ocfs2_alloc_context *meta_ac)
4869 {
4870 	int ret = 0, depth;
4871 	unsigned int insert_range, rec_range, do_leftright = 0;
4872 	struct ocfs2_extent_rec tmprec;
4873 	struct ocfs2_extent_list *rightmost_el;
4874 	struct ocfs2_extent_rec rec;
4875 	struct ocfs2_extent_rec split_rec = *orig_split_rec;
4876 	struct ocfs2_insert_type insert;
4877 	struct ocfs2_extent_block *eb;
4878 
4879 leftright:
4880 	/*
4881 	 * Store a copy of the record on the stack - it might move
4882 	 * around as the tree is manipulated below.
4883 	 */
4884 	rec = path_leaf_el(path)->l_recs[split_index];
4885 
4886 	rightmost_el = et->et_root_el;
4887 
4888 	depth = le16_to_cpu(rightmost_el->l_tree_depth);
4889 	if (depth) {
4890 		BUG_ON(!(*last_eb_bh));
4891 		eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data;
4892 		rightmost_el = &eb->h_list;
4893 	}
4894 
4895 	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
4896 	    le16_to_cpu(rightmost_el->l_count)) {
4897 		ret = ocfs2_grow_tree(inode, handle, et,
4898 				      &depth, last_eb_bh, meta_ac);
4899 		if (ret) {
4900 			mlog_errno(ret);
4901 			goto out;
4902 		}
4903 	}
4904 
4905 	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
4906 	insert.ins_appending = APPEND_NONE;
4907 	insert.ins_contig = CONTIG_NONE;
4908 	insert.ins_tree_depth = depth;
4909 
4910 	insert_range = le32_to_cpu(split_rec.e_cpos) +
4911 		le16_to_cpu(split_rec.e_leaf_clusters);
4912 	rec_range = le32_to_cpu(rec.e_cpos) +
4913 		le16_to_cpu(rec.e_leaf_clusters);
4914 
4915 	if (split_rec.e_cpos == rec.e_cpos) {
4916 		insert.ins_split = SPLIT_LEFT;
4917 	} else if (insert_range == rec_range) {
4918 		insert.ins_split = SPLIT_RIGHT;
4919 	} else {
4920 		/*
4921 		 * Left/right split. We fake this as a right split
4922 		 * first and then make a second pass as a left split.
4923 		 */
4924 		insert.ins_split = SPLIT_RIGHT;
4925 
4926 		ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range,
4927 					   &rec);
4928 
4929 		split_rec = tmprec;
4930 
4931 		BUG_ON(do_leftright);
4932 		do_leftright = 1;
4933 	}
4934 
4935 	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
4936 	if (ret) {
4937 		mlog_errno(ret);
4938 		goto out;
4939 	}
4940 
4941 	if (do_leftright == 1) {
4942 		u32 cpos;
4943 		struct ocfs2_extent_list *el;
4944 
4945 		do_leftright++;
4946 		split_rec = *orig_split_rec;
4947 
4948 		ocfs2_reinit_path(path, 1);
4949 
4950 		cpos = le32_to_cpu(split_rec.e_cpos);
4951 		ret = ocfs2_find_path(inode, path, cpos);
4952 		if (ret) {
4953 			mlog_errno(ret);
4954 			goto out;
4955 		}
4956 
4957 		el = path_leaf_el(path);
4958 		split_index = ocfs2_search_extent_list(el, cpos);
4959 		goto leftright;
4960 	}
4961 out:
4962 
4963 	return ret;
4964 }
4965 
4966 static int ocfs2_replace_extent_rec(struct inode *inode,
4967 				    handle_t *handle,
4968 				    struct ocfs2_path *path,
4969 				    struct ocfs2_extent_list *el,
4970 				    int split_index,
4971 				    struct ocfs2_extent_rec *split_rec)
4972 {
4973 	int ret;
4974 
4975 	ret = ocfs2_path_bh_journal_access(handle, INODE_CACHE(inode), path,
4976 					   path_num_items(path) - 1);
4977 	if (ret) {
4978 		mlog_errno(ret);
4979 		goto out;
4980 	}
4981 
4982 	el->l_recs[split_index] = *split_rec;
4983 
4984 	ocfs2_journal_dirty(handle, path_leaf_bh(path));
4985 out:
4986 	return ret;
4987 }
4988 
4989 /*
4990  * Mark part or all of the extent record at split_index in the leaf
4991  * pointed to by path as written. This removes the unwritten
4992  * extent flag.
4993  *
4994  * Care is taken to handle contiguousness so as to not grow the tree.
4995  *
4996  * meta_ac is not strictly necessary - we only truly need it if growth
4997  * of the tree is required. All other cases will degrade into a less
4998  * optimal tree layout.
4999  *
5000  * last_eb_bh should be the rightmost leaf block for any extent
5001  * btree. Since a split may grow the tree or a merge might shrink it,
5002  * the caller cannot trust the contents of that buffer after this call.
5003  *
5004  * This code is optimized for readability - several passes might be
5005  * made over certain portions of the tree. All of those blocks will
5006  * have been brought into cache (and pinned via the journal), so the
5007  * extra overhead is not expressed in terms of disk reads.
5008  */
5009 static int __ocfs2_mark_extent_written(struct inode *inode,
5010 				       struct ocfs2_extent_tree *et,
5011 				       handle_t *handle,
5012 				       struct ocfs2_path *path,
5013 				       int split_index,
5014 				       struct ocfs2_extent_rec *split_rec,
5015 				       struct ocfs2_alloc_context *meta_ac,
5016 				       struct ocfs2_cached_dealloc_ctxt *dealloc)
5017 {
5018 	int ret = 0;
5019 	struct ocfs2_extent_list *el = path_leaf_el(path);
5020 	struct buffer_head *last_eb_bh = NULL;
5021 	struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
5022 	struct ocfs2_merge_ctxt ctxt;
5023 	struct ocfs2_extent_list *rightmost_el;
5024 
5025 	if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) {
5026 		ret = -EIO;
5027 		mlog_errno(ret);
5028 		goto out;
5029 	}
5030 
5031 	if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) ||
5032 	    ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) <
5033 	     (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) {
5034 		ret = -EIO;
5035 		mlog_errno(ret);
5036 		goto out;
5037 	}
5038 
5039 	ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
5040 							    split_index,
5041 							    split_rec);
5042 
5043 	/*
5044 	 * The core merge / split code wants to know how much room is
5045 	 * left in this inodes allocation tree, so we pass the
5046 	 * rightmost extent list.
5047 	 */
5048 	if (path->p_tree_depth) {
5049 		struct ocfs2_extent_block *eb;
5050 
5051 		ret = ocfs2_read_extent_block(inode,
5052 					      ocfs2_et_get_last_eb_blk(et),
5053 					      &last_eb_bh);
5054 		if (ret) {
5055 			mlog_exit(ret);
5056 			goto out;
5057 		}
5058 
5059 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
5060 		rightmost_el = &eb->h_list;
5061 	} else
5062 		rightmost_el = path_root_el(path);
5063 
5064 	if (rec->e_cpos == split_rec->e_cpos &&
5065 	    rec->e_leaf_clusters == split_rec->e_leaf_clusters)
5066 		ctxt.c_split_covers_rec = 1;
5067 	else
5068 		ctxt.c_split_covers_rec = 0;
5069 
5070 	ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]);
5071 
5072 	mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n",
5073 	     split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent,
5074 	     ctxt.c_split_covers_rec);
5075 
5076 	if (ctxt.c_contig_type == CONTIG_NONE) {
5077 		if (ctxt.c_split_covers_rec)
5078 			ret = ocfs2_replace_extent_rec(inode, handle,
5079 						       path, el,
5080 						       split_index, split_rec);
5081 		else
5082 			ret = ocfs2_split_and_insert(inode, handle, path, et,
5083 						     &last_eb_bh, split_index,
5084 						     split_rec, meta_ac);
5085 		if (ret)
5086 			mlog_errno(ret);
5087 	} else {
5088 		ret = ocfs2_try_to_merge_extent(inode, handle, path,
5089 						split_index, split_rec,
5090 						dealloc, &ctxt, et);
5091 		if (ret)
5092 			mlog_errno(ret);
5093 	}
5094 
5095 out:
5096 	brelse(last_eb_bh);
5097 	return ret;
5098 }
5099 
5100 /*
5101  * Mark the already-existing extent at cpos as written for len clusters.
5102  *
5103  * If the existing extent is larger than the request, initiate a
5104  * split. An attempt will be made at merging with adjacent extents.
5105  *
5106  * The caller is responsible for passing down meta_ac if we'll need it.
5107  */
5108 int ocfs2_mark_extent_written(struct inode *inode,
5109 			      struct ocfs2_extent_tree *et,
5110 			      handle_t *handle, u32 cpos, u32 len, u32 phys,
5111 			      struct ocfs2_alloc_context *meta_ac,
5112 			      struct ocfs2_cached_dealloc_ctxt *dealloc)
5113 {
5114 	int ret, index;
5115 	u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys);
5116 	struct ocfs2_extent_rec split_rec;
5117 	struct ocfs2_path *left_path = NULL;
5118 	struct ocfs2_extent_list *el;
5119 
5120 	mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n",
5121 	     inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno);
5122 
5123 	if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) {
5124 		ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents "
5125 			    "that are being written to, but the feature bit "
5126 			    "is not set in the super block.",
5127 			    (unsigned long long)OCFS2_I(inode)->ip_blkno);
5128 		ret = -EROFS;
5129 		goto out;
5130 	}
5131 
5132 	/*
5133 	 * XXX: This should be fixed up so that we just re-insert the
5134 	 * next extent records.
5135 	 *
5136 	 * XXX: This is a hack on the extent tree, maybe it should be
5137 	 * an op?
5138 	 */
5139 	if (et->et_ops == &ocfs2_dinode_et_ops)
5140 		ocfs2_extent_map_trunc(inode, 0);
5141 
5142 	left_path = ocfs2_new_path_from_et(et);
5143 	if (!left_path) {
5144 		ret = -ENOMEM;
5145 		mlog_errno(ret);
5146 		goto out;
5147 	}
5148 
5149 	ret = ocfs2_find_path(inode, left_path, cpos);
5150 	if (ret) {
5151 		mlog_errno(ret);
5152 		goto out;
5153 	}
5154 	el = path_leaf_el(left_path);
5155 
5156 	index = ocfs2_search_extent_list(el, cpos);
5157 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5158 		ocfs2_error(inode->i_sb,
5159 			    "Inode %llu has an extent at cpos %u which can no "
5160 			    "longer be found.\n",
5161 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5162 		ret = -EROFS;
5163 		goto out;
5164 	}
5165 
5166 	memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec));
5167 	split_rec.e_cpos = cpu_to_le32(cpos);
5168 	split_rec.e_leaf_clusters = cpu_to_le16(len);
5169 	split_rec.e_blkno = cpu_to_le64(start_blkno);
5170 	split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags;
5171 	split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN;
5172 
5173 	ret = __ocfs2_mark_extent_written(inode, et, handle, left_path,
5174 					  index, &split_rec, meta_ac,
5175 					  dealloc);
5176 	if (ret)
5177 		mlog_errno(ret);
5178 
5179 out:
5180 	ocfs2_free_path(left_path);
5181 	return ret;
5182 }
5183 
/*
 * Split the leaf record at path/index by inserting a new record for its
 * right-hand part, using a SPLIT_RIGHT insert.
 *
 * new_range is handed to ocfs2_make_right_split_rec() together with the
 * existing record; presumably it is the cluster offset at which the new
 * right-hand record starts (the only caller visible here passes
 * cpos + len of the range being removed) - confirm against that helper.
 *
 * The transaction is extended before the insert, and the tree is grown
 * (using meta_ac) when the rightmost leaf has no free record slots.
 *
 * Returns 0 on success or the negative error from the failing callee.
 */
static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
			    handle_t *handle, struct ocfs2_path *path,
			    int index, u32 new_range,
			    struct ocfs2_alloc_context *meta_ac)
{
	int ret, depth, credits = handle->h_buffer_credits;
	struct buffer_head *last_eb_bh = NULL;
	struct ocfs2_extent_block *eb;
	struct ocfs2_extent_list *rightmost_el, *el;
	struct ocfs2_extent_rec split_rec;
	struct ocfs2_extent_rec *rec;
	struct ocfs2_insert_type insert;

	/*
	 * Setup the record to split before we grow the tree.
	 */
	el = path_leaf_el(path);
	rec = &el->l_recs[index];
	ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec);

	/*
	 * Locate the rightmost extent list: for a tree with depth it
	 * lives in the last extent block, otherwise the only leaf list
	 * is the one in the path.
	 */
	depth = path->p_tree_depth;
	if (depth > 0) {
		ret = ocfs2_read_extent_block(inode,
					      ocfs2_et_get_last_eb_blk(et),
					      &last_eb_bh);
		if (ret < 0) {
			mlog_errno(ret);
			goto out;
		}

		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
		rightmost_el = &eb->h_list;
	} else
		rightmost_el = path_leaf_el(path);

	/*
	 * Ask for the credits the handle already has plus the tree depth
	 * plus whatever ocfs2_extend_meta_needed() reports for the root.
	 */
	credits += path->p_tree_depth +
		   ocfs2_extend_meta_needed(et->et_root_el);
	ret = ocfs2_extend_trans(handle, credits);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* Rightmost leaf is full: grow the tree before inserting. */
	if (le16_to_cpu(rightmost_el->l_next_free_rec) ==
	    le16_to_cpu(rightmost_el->l_count)) {
		ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh,
				      meta_ac);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}
	}

	/* Plain right split: not an append, nothing contiguous to merge. */
	memset(&insert, 0, sizeof(struct ocfs2_insert_type));
	insert.ins_appending = APPEND_NONE;
	insert.ins_contig = CONTIG_NONE;
	insert.ins_split = SPLIT_RIGHT;
	insert.ins_tree_depth = depth;

	ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert);
	if (ret)
		mlog_errno(ret);

out:
	/* last_eb_bh is NULL unless it was read (or set by the grow). */
	brelse(last_eb_bh);
	return ret;
}
5251 
/*
 * Remove the cluster range (cpos, len) from the leaf record at
 * path/index.
 *
 * The range must either cover the whole record or share its left or
 * right edge; any other combination is a caller bug and hits BUG().
 *
 *  - Whole record: the record is zeroed and ocfs2_cleanup_merge() is
 *    called on the leaf.
 *  - Left edge: e_cpos and e_blkno advance by len, the length shrinks.
 *  - Right edge: only the length shrinks.
 *
 * When the affected record is the last one in the rightmost leaf, the
 * records above it are adjusted via ocfs2_adjust_rightmost_records().
 * When the leftmost offset of a leaf changes, the subtree shared with
 * the leaf to its left is updated via ocfs2_complete_edge_insert().
 * The leaf is then dirtied and the tree left-rotated.
 *
 * Returns 0 on success or a negative error code.
 */
static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
			      struct ocfs2_path *path, int index,
			      struct ocfs2_cached_dealloc_ctxt *dealloc,
			      u32 cpos, u32 len,
			      struct ocfs2_extent_tree *et)
{
	int ret;
	u32 left_cpos, rec_range, trunc_range;
	int wants_rotate = 0, is_rightmost_tree_rec = 0;
	struct super_block *sb = inode->i_sb;
	struct ocfs2_path *left_path = NULL;
	struct ocfs2_extent_list *el = path_leaf_el(path);
	struct ocfs2_extent_rec *rec;
	struct ocfs2_extent_block *eb;

	/*
	 * If the leaf starts with an empty extent and our record is not
	 * that slot, rotate left first; our record then sits one slot
	 * lower, hence the index adjustment.
	 */
	if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) {
		ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		index--;
	}

	if (index == (le16_to_cpu(el->l_next_free_rec) - 1) &&
	    path->p_tree_depth) {
		/*
		 * Check whether this is the rightmost tree record. If
		 * we remove all of this record or part of its right
		 * edge then an update of the record lengths above it
		 * will be required.
		 */
		eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data;
		if (eb->h_next_leaf_blk == 0)
			is_rightmost_tree_rec = 1;
	}

	rec = &el->l_recs[index];
	if (index == 0 && path->p_tree_depth &&
	    le32_to_cpu(rec->e_cpos) == cpos) {
		/*
		 * Changing the leftmost offset (via partial or whole
		 * record truncate) of an interior (or rightmost) path
		 * means we have to update the subtree that is formed
		 * by this leaf and the one to it's left.
		 *
		 * There are two cases we can skip:
		 *   1) Path is the leftmost one in our inode tree.
		 *   2) The leaf is rightmost and will be empty after
		 *      we remove the extent record - the rotate code
		 *      knows how to update the newly formed edge.
		 */

		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path,
						    &left_cpos);
		if (ret) {
			mlog_errno(ret);
			goto out;
		}

		/*
		 * left_path is only built when left_cpos is non-zero and
		 * the leaf holds more than one record; otherwise it stays
		 * NULL and the edge update below is skipped.
		 */
		if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
			left_path = ocfs2_new_path_from_path(path);
			if (!left_path) {
				ret = -ENOMEM;
				mlog_errno(ret);
				goto out;
			}

			ret = ocfs2_find_path(inode, left_path, left_cpos);
			if (ret) {
				mlog_errno(ret);
				goto out;
			}
		}
	}

	ret = ocfs2_extend_rotate_transaction(handle, 0,
					      handle->h_buffer_credits,
					      path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	ret = ocfs2_journal_access_path(et->et_ci, handle, path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/*
	 * NOTE(review): left_path may still be NULL here; this relies on
	 * ocfs2_journal_access_path() accepting a NULL path - confirm.
	 */
	ret = ocfs2_journal_access_path(et->et_ci, handle, left_path);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

	/* One past the last cluster of the record / of the request. */
	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
	trunc_range = cpos + len;

	if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) {
		int next_free;

		/* Whole record goes away. */
		memset(rec, 0, sizeof(*rec));
		ocfs2_cleanup_merge(el, index);
		wants_rotate = 1;

		next_free = le16_to_cpu(el->l_next_free_rec);
		if (is_rightmost_tree_rec && next_free > 1) {
			/*
			 * We skip the edge update if this path will
			 * be deleted by the rotate code.
			 */
			rec = &el->l_recs[next_free - 1];
			ocfs2_adjust_rightmost_records(inode, handle, path,
						       rec);
		}
	} else if (le32_to_cpu(rec->e_cpos) == cpos) {
		/* Remove leftmost portion of the record. */
		le32_add_cpu(&rec->e_cpos, len);
		le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len));
		le16_add_cpu(&rec->e_leaf_clusters, -len);
	} else if (rec_range == trunc_range) {
		/* Remove rightmost portion of the record */
		le16_add_cpu(&rec->e_leaf_clusters, -len);
		if (is_rightmost_tree_rec)
			ocfs2_adjust_rightmost_records(inode, handle, path, rec);
	} else {
		/* Caller should have trapped this. */
		mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) "
		     "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno,
		     le32_to_cpu(rec->e_cpos),
		     le16_to_cpu(rec->e_leaf_clusters), cpos, len);
		BUG();
	}

	/* The leaf's left edge moved: fix up the shared subtree. */
	if (left_path) {
		int subtree_index;

		subtree_index = ocfs2_find_subtree_root(inode, left_path, path);
		ocfs2_complete_edge_insert(inode, handle, left_path, path,
					   subtree_index);
	}

	ocfs2_journal_dirty(handle, path_leaf_bh(path));

	/*
	 * The rotate is issued unconditionally; wants_rotate is set above
	 * but never read in this function.
	 */
	ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et);
	if (ret) {
		mlog_errno(ret);
		goto out;
	}

out:
	ocfs2_free_path(left_path);
	return ret;
}
5408 
5409 int ocfs2_remove_extent(struct inode *inode,
5410 			struct ocfs2_extent_tree *et,
5411 			u32 cpos, u32 len, handle_t *handle,
5412 			struct ocfs2_alloc_context *meta_ac,
5413 			struct ocfs2_cached_dealloc_ctxt *dealloc)
5414 {
5415 	int ret, index;
5416 	u32 rec_range, trunc_range;
5417 	struct ocfs2_extent_rec *rec;
5418 	struct ocfs2_extent_list *el;
5419 	struct ocfs2_path *path = NULL;
5420 
5421 	ocfs2_extent_map_trunc(inode, 0);
5422 
5423 	path = ocfs2_new_path_from_et(et);
5424 	if (!path) {
5425 		ret = -ENOMEM;
5426 		mlog_errno(ret);
5427 		goto out;
5428 	}
5429 
5430 	ret = ocfs2_find_path(inode, path, cpos);
5431 	if (ret) {
5432 		mlog_errno(ret);
5433 		goto out;
5434 	}
5435 
5436 	el = path_leaf_el(path);
5437 	index = ocfs2_search_extent_list(el, cpos);
5438 	if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5439 		ocfs2_error(inode->i_sb,
5440 			    "Inode %llu has an extent at cpos %u which can no "
5441 			    "longer be found.\n",
5442 			    (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos);
5443 		ret = -EROFS;
5444 		goto out;
5445 	}
5446 
5447 	/*
5448 	 * We have 3 cases of extent removal:
5449 	 *   1) Range covers the entire extent rec
5450 	 *   2) Range begins or ends on one edge of the extent rec
5451 	 *   3) Range is in the middle of the extent rec (no shared edges)
5452 	 *
5453 	 * For case 1 we remove the extent rec and left rotate to
5454 	 * fill the hole.
5455 	 *
5456 	 * For case 2 we just shrink the existing extent rec, with a
5457 	 * tree update if the shrinking edge is also the edge of an
5458 	 * extent block.
5459 	 *
5460 	 * For case 3 we do a right split to turn the extent rec into
5461 	 * something case 2 can handle.
5462 	 */
5463 	rec = &el->l_recs[index];
5464 	rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec);
5465 	trunc_range = cpos + len;
5466 
5467 	BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range);
5468 
5469 	mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d "
5470 	     "(cpos %u, len %u)\n",
5471 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index,
5472 	     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec));
5473 
5474 	if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) {
5475 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5476 					 cpos, len, et);
5477 		if (ret) {
5478 			mlog_errno(ret);
5479 			goto out;
5480 		}
5481 	} else {
5482 		ret = ocfs2_split_tree(inode, et, handle, path, index,
5483 				       trunc_range, meta_ac);
5484 		if (ret) {
5485 			mlog_errno(ret);
5486 			goto out;
5487 		}
5488 
5489 		/*
5490 		 * The split could have manipulated the tree enough to
5491 		 * move the record location, so we have to look for it again.
5492 		 */
5493 		ocfs2_reinit_path(path, 1);
5494 
5495 		ret = ocfs2_find_path(inode, path, cpos);
5496 		if (ret) {
5497 			mlog_errno(ret);
5498 			goto out;
5499 		}
5500 
5501 		el = path_leaf_el(path);
5502 		index = ocfs2_search_extent_list(el, cpos);
5503 		if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) {
5504 			ocfs2_error(inode->i_sb,
5505 				    "Inode %llu: split at cpos %u lost record.",
5506 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5507 				    cpos);
5508 			ret = -EROFS;
5509 			goto out;
5510 		}
5511 
5512 		/*
5513 		 * Double check our values here. If anything is fishy,
5514 		 * it's easier to catch it at the top level.
5515 		 */
5516 		rec = &el->l_recs[index];
5517 		rec_range = le32_to_cpu(rec->e_cpos) +
5518 			ocfs2_rec_clusters(el, rec);
5519 		if (rec_range != trunc_range) {
5520 			ocfs2_error(inode->i_sb,
5521 				    "Inode %llu: error after split at cpos %u"
5522 				    "trunc len %u, existing record is (%u,%u)",
5523 				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
5524 				    cpos, len, le32_to_cpu(rec->e_cpos),
5525 				    ocfs2_rec_clusters(el, rec));
5526 			ret = -EROFS;
5527 			goto out;
5528 		}
5529 
5530 		ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc,
5531 					 cpos, len, et);
5532 		if (ret) {
5533 			mlog_errno(ret);
5534 			goto out;
5535 		}
5536 	}
5537 
5538 out:
5539 	ocfs2_free_path(path);
5540 	return ret;
5541 }
5542 
5543 int ocfs2_remove_btree_range(struct inode *inode,
5544 			     struct ocfs2_extent_tree *et,
5545 			     u32 cpos, u32 phys_cpos, u32 len,
5546 			     struct ocfs2_cached_dealloc_ctxt *dealloc)
5547 {
5548 	int ret;
5549 	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
5550 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
5551 	struct inode *tl_inode = osb->osb_tl_inode;
5552 	handle_t *handle;
5553 	struct ocfs2_alloc_context *meta_ac = NULL;
5554 
5555 	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
5556 	if (ret) {
5557 		mlog_errno(ret);
5558 		return ret;
5559 	}
5560 
5561 	mutex_lock(&tl_inode->i_mutex);
5562 
5563 	if (ocfs2_truncate_log_needs_flush(osb)) {
5564 		ret = __ocfs2_flush_truncate_log(osb);
5565 		if (ret < 0) {
5566 			mlog_errno(ret);
5567 			goto out;
5568 		}
5569 	}
5570 
5571 	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
5572 	if (IS_ERR(handle)) {
5573 		ret = PTR_ERR(handle);
5574 		mlog_errno(ret);
5575 		goto out;
5576 	}
5577 
5578 	ret = ocfs2_et_root_journal_access(handle, et,
5579 					   OCFS2_JOURNAL_ACCESS_WRITE);
5580 	if (ret) {
5581 		mlog_errno(ret);
5582 		goto out;
5583 	}
5584 
5585 	vfs_dq_free_space_nodirty(inode,
5586 				  ocfs2_clusters_to_bytes(inode->i_sb, len));
5587 
5588 	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
5589 				  dealloc);
5590 	if (ret) {
5591 		mlog_errno(ret);
5592 		goto out_commit;
5593 	}
5594 
5595 	ocfs2_et_update_clusters(inode, et, -len);
5596 
5597 	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
5598 	if (ret) {
5599 		mlog_errno(ret);
5600 		goto out_commit;
5601 	}
5602 
5603 	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
5604 	if (ret)
5605 		mlog_errno(ret);
5606 
5607 out_commit:
5608 	ocfs2_commit_trans(osb, handle);
5609 out:
5610 	mutex_unlock(&tl_inode->i_mutex);
5611 
5612 	if (meta_ac)
5613 		ocfs2_free_alloc_context(meta_ac);
5614 
5615 	return ret;
5616 }
5617 
5618 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
5619 {
5620 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5621 	struct ocfs2_dinode *di;
5622 	struct ocfs2_truncate_log *tl;
5623 
5624 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5625 	tl = &di->id2.i_dealloc;
5626 
5627 	mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count),
5628 			"slot %d, invalid truncate log parameters: used = "
5629 			"%u, count = %u\n", osb->slot_num,
5630 			le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count));
5631 	return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count);
5632 }
5633 
5634 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl,
5635 					   unsigned int new_start)
5636 {
5637 	unsigned int tail_index;
5638 	unsigned int current_tail;
5639 
5640 	/* No records, nothing to coalesce */
5641 	if (!le16_to_cpu(tl->tl_used))
5642 		return 0;
5643 
5644 	tail_index = le16_to_cpu(tl->tl_used) - 1;
5645 	current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start);
5646 	current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters);
5647 
5648 	return current_tail == new_start;
5649 }
5650 
5651 int ocfs2_truncate_log_append(struct ocfs2_super *osb,
5652 			      handle_t *handle,
5653 			      u64 start_blk,
5654 			      unsigned int num_clusters)
5655 {
5656 	int status, index;
5657 	unsigned int start_cluster, tl_count;
5658 	struct inode *tl_inode = osb->osb_tl_inode;
5659 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5660 	struct ocfs2_dinode *di;
5661 	struct ocfs2_truncate_log *tl;
5662 
5663 	mlog_entry("start_blk = %llu, num_clusters = %u\n",
5664 		   (unsigned long long)start_blk, num_clusters);
5665 
5666 	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
5667 
5668 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
5669 
5670 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5671 
5672 	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
5673 	 * by the underlying call to ocfs2_read_inode_block(), so any
5674 	 * corruption is a code bug */
5675 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5676 
5677 	tl = &di->id2.i_dealloc;
5678 	tl_count = le16_to_cpu(tl->tl_count);
5679 	mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
5680 			tl_count == 0,
5681 			"Truncate record count on #%llu invalid "
5682 			"wanted %u, actual %u\n",
5683 			(unsigned long long)OCFS2_I(tl_inode)->ip_blkno,
5684 			ocfs2_truncate_recs_per_inode(osb->sb),
5685 			le16_to_cpu(tl->tl_count));
5686 
5687 	/* Caller should have known to flush before calling us. */
5688 	index = le16_to_cpu(tl->tl_used);
5689 	if (index >= tl_count) {
5690 		status = -ENOSPC;
5691 		mlog_errno(status);
5692 		goto bail;
5693 	}
5694 
5695 	status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5696 					 OCFS2_JOURNAL_ACCESS_WRITE);
5697 	if (status < 0) {
5698 		mlog_errno(status);
5699 		goto bail;
5700 	}
5701 
5702 	mlog(0, "Log truncate of %u clusters starting at cluster %u to "
5703 	     "%llu (index = %d)\n", num_clusters, start_cluster,
5704 	     (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index);
5705 
5706 	if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) {
5707 		/*
5708 		 * Move index back to the record we are coalescing with.
5709 		 * ocfs2_truncate_log_can_coalesce() guarantees nonzero
5710 		 */
5711 		index--;
5712 
5713 		num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters);
5714 		mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n",
5715 		     index, le32_to_cpu(tl->tl_recs[index].t_start),
5716 		     num_clusters);
5717 	} else {
5718 		tl->tl_recs[index].t_start = cpu_to_le32(start_cluster);
5719 		tl->tl_used = cpu_to_le16(index + 1);
5720 	}
5721 	tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters);
5722 
5723 	status = ocfs2_journal_dirty(handle, tl_bh);
5724 	if (status < 0) {
5725 		mlog_errno(status);
5726 		goto bail;
5727 	}
5728 
5729 bail:
5730 	mlog_exit(status);
5731 	return status;
5732 }
5733 
5734 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
5735 					 handle_t *handle,
5736 					 struct inode *data_alloc_inode,
5737 					 struct buffer_head *data_alloc_bh)
5738 {
5739 	int status = 0;
5740 	int i;
5741 	unsigned int num_clusters;
5742 	u64 start_blk;
5743 	struct ocfs2_truncate_rec rec;
5744 	struct ocfs2_dinode *di;
5745 	struct ocfs2_truncate_log *tl;
5746 	struct inode *tl_inode = osb->osb_tl_inode;
5747 	struct buffer_head *tl_bh = osb->osb_tl_bh;
5748 
5749 	mlog_entry_void();
5750 
5751 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5752 	tl = &di->id2.i_dealloc;
5753 	i = le16_to_cpu(tl->tl_used) - 1;
5754 	while (i >= 0) {
5755 		/* Caller has given us at least enough credits to
5756 		 * update the truncate log dinode */
5757 		status = ocfs2_journal_access_di(handle, INODE_CACHE(tl_inode), tl_bh,
5758 						 OCFS2_JOURNAL_ACCESS_WRITE);
5759 		if (status < 0) {
5760 			mlog_errno(status);
5761 			goto bail;
5762 		}
5763 
5764 		tl->tl_used = cpu_to_le16(i);
5765 
5766 		status = ocfs2_journal_dirty(handle, tl_bh);
5767 		if (status < 0) {
5768 			mlog_errno(status);
5769 			goto bail;
5770 		}
5771 
5772 		/* TODO: Perhaps we can calculate the bulk of the
5773 		 * credits up front rather than extending like
5774 		 * this. */
5775 		status = ocfs2_extend_trans(handle,
5776 					    OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC);
5777 		if (status < 0) {
5778 			mlog_errno(status);
5779 			goto bail;
5780 		}
5781 
5782 		rec = tl->tl_recs[i];
5783 		start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb,
5784 						    le32_to_cpu(rec.t_start));
5785 		num_clusters = le32_to_cpu(rec.t_clusters);
5786 
5787 		/* if start_blk is not set, we ignore the record as
5788 		 * invalid. */
5789 		if (start_blk) {
5790 			mlog(0, "free record %d, start = %u, clusters = %u\n",
5791 			     i, le32_to_cpu(rec.t_start), num_clusters);
5792 
5793 			status = ocfs2_free_clusters(handle, data_alloc_inode,
5794 						     data_alloc_bh, start_blk,
5795 						     num_clusters);
5796 			if (status < 0) {
5797 				mlog_errno(status);
5798 				goto bail;
5799 			}
5800 		}
5801 		i--;
5802 	}
5803 
5804 bail:
5805 	mlog_exit(status);
5806 	return status;
5807 }
5808 
/*
 * Flush this node's truncate log: free every logged cluster range back
 * through the global bitmap inode.
 *
 * Expects you to already be holding tl_inode->i_mutex (checked with a
 * BUG_ON below).
 *
 * Takes the global bitmap inode's i_mutex and its inode lock, starts a
 * transaction and replays the log records inside it.
 *
 * Returns 0 on success or when the log is empty, -EINVAL when the global
 * bitmap inode cannot be obtained, or the negative error of a failing
 * callee.
 */
int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
{
	int status;
	unsigned int num_to_flush;
	handle_t *handle;
	struct inode *tl_inode = osb->osb_tl_inode;
	struct inode *data_alloc_inode = NULL;
	struct buffer_head *tl_bh = osb->osb_tl_bh;
	struct buffer_head *data_alloc_bh = NULL;
	struct ocfs2_dinode *di;
	struct ocfs2_truncate_log *tl;

	mlog_entry_void();

	/* If the trylock succeeds, the caller was not holding the mutex. */
	BUG_ON(mutex_trylock(&tl_inode->i_mutex));

	di = (struct ocfs2_dinode *) tl_bh->b_data;

	/* tl_bh is loaded from ocfs2_truncate_log_init().  It's validated
	 * by the underlying call to ocfs2_read_inode_block(), so any
	 * corruption is a code bug */
	BUG_ON(!OCFS2_IS_VALID_DINODE(di));

	tl = &di->id2.i_dealloc;
	num_to_flush = le16_to_cpu(tl->tl_used);
	mlog(0, "Flush %u records from truncate log #%llu\n",
	     num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
	/* Nothing logged: done, without touching the bitmap inode. */
	if (!num_to_flush) {
		status = 0;
		goto out;
	}

	data_alloc_inode = ocfs2_get_system_file_inode(osb,
						       GLOBAL_BITMAP_SYSTEM_INODE,
						       OCFS2_INVALID_SLOT);
	if (!data_alloc_inode) {
		status = -EINVAL;
		mlog(ML_ERROR, "Could not get bitmap inode!\n");
		goto out;
	}

	mutex_lock(&data_alloc_inode->i_mutex);

	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		mlog_errno(status);
		goto out_unlock;
	}

	status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode,
					       data_alloc_bh);
	if (status < 0)
		mlog_errno(status);

	/* The transaction is committed whether or not the replay worked. */
	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(data_alloc_bh);
	ocfs2_inode_unlock(data_alloc_inode, 1);

out_mutex:
	mutex_unlock(&data_alloc_inode->i_mutex);
	iput(data_alloc_inode);

out:
	mlog_exit(status);
	return status;
}
5885 
5886 int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
5887 {
5888 	int status;
5889 	struct inode *tl_inode = osb->osb_tl_inode;
5890 
5891 	mutex_lock(&tl_inode->i_mutex);
5892 	status = __ocfs2_flush_truncate_log(osb);
5893 	mutex_unlock(&tl_inode->i_mutex);
5894 
5895 	return status;
5896 }
5897 
5898 static void ocfs2_truncate_log_worker(struct work_struct *work)
5899 {
5900 	int status;
5901 	struct ocfs2_super *osb =
5902 		container_of(work, struct ocfs2_super,
5903 			     osb_truncate_log_wq.work);
5904 
5905 	mlog_entry_void();
5906 
5907 	status = ocfs2_flush_truncate_log(osb);
5908 	if (status < 0)
5909 		mlog_errno(status);
5910 	else
5911 		ocfs2_init_inode_steal_slot(osb);
5912 
5913 	mlog_exit(status);
5914 }
5915 
5916 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ)
5917 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb,
5918 				       int cancel)
5919 {
5920 	if (osb->osb_tl_inode) {
5921 		/* We want to push off log flushes while truncates are
5922 		 * still running. */
5923 		if (cancel)
5924 			cancel_delayed_work(&osb->osb_truncate_log_wq);
5925 
5926 		queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq,
5927 				   OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL);
5928 	}
5929 }
5930 
5931 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
5932 				       int slot_num,
5933 				       struct inode **tl_inode,
5934 				       struct buffer_head **tl_bh)
5935 {
5936 	int status;
5937 	struct inode *inode = NULL;
5938 	struct buffer_head *bh = NULL;
5939 
5940 	inode = ocfs2_get_system_file_inode(osb,
5941 					   TRUNCATE_LOG_SYSTEM_INODE,
5942 					   slot_num);
5943 	if (!inode) {
5944 		status = -EINVAL;
5945 		mlog(ML_ERROR, "Could not get load truncate log inode!\n");
5946 		goto bail;
5947 	}
5948 
5949 	status = ocfs2_read_inode_block(inode, &bh);
5950 	if (status < 0) {
5951 		iput(inode);
5952 		mlog_errno(status);
5953 		goto bail;
5954 	}
5955 
5956 	*tl_inode = inode;
5957 	*tl_bh    = bh;
5958 bail:
5959 	mlog_exit(status);
5960 	return status;
5961 }
5962 
5963 /* called during the 1st stage of node recovery. we stamp a clean
5964  * truncate log and pass back a copy for processing later. if the
5965  * truncate log does not require processing, a *tl_copy is set to
5966  * NULL. */
5967 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
5968 				      int slot_num,
5969 				      struct ocfs2_dinode **tl_copy)
5970 {
5971 	int status;
5972 	struct inode *tl_inode = NULL;
5973 	struct buffer_head *tl_bh = NULL;
5974 	struct ocfs2_dinode *di;
5975 	struct ocfs2_truncate_log *tl;
5976 
5977 	*tl_copy = NULL;
5978 
5979 	mlog(0, "recover truncate log from slot %d\n", slot_num);
5980 
5981 	status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh);
5982 	if (status < 0) {
5983 		mlog_errno(status);
5984 		goto bail;
5985 	}
5986 
5987 	di = (struct ocfs2_dinode *) tl_bh->b_data;
5988 
5989 	/* tl_bh is loaded from ocfs2_get_truncate_log_info().  It's
5990 	 * validated by the underlying call to ocfs2_read_inode_block(),
5991 	 * so any corruption is a code bug */
5992 	BUG_ON(!OCFS2_IS_VALID_DINODE(di));
5993 
5994 	tl = &di->id2.i_dealloc;
5995 	if (le16_to_cpu(tl->tl_used)) {
5996 		mlog(0, "We'll have %u logs to recover\n",
5997 		     le16_to_cpu(tl->tl_used));
5998 
5999 		*tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL);
6000 		if (!(*tl_copy)) {
6001 			status = -ENOMEM;
6002 			mlog_errno(status);
6003 			goto bail;
6004 		}
6005 
6006 		/* Assuming the write-out below goes well, this copy
6007 		 * will be passed back to recovery for processing. */
6008 		memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size);
6009 
6010 		/* All we need to do to clear the truncate log is set
6011 		 * tl_used. */
6012 		tl->tl_used = 0;
6013 
6014 		ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
6015 		status = ocfs2_write_block(osb, tl_bh, INODE_CACHE(tl_inode));
6016 		if (status < 0) {
6017 			mlog_errno(status);
6018 			goto bail;
6019 		}
6020 	}
6021 
6022 bail:
6023 	if (tl_inode)
6024 		iput(tl_inode);
6025 	brelse(tl_bh);
6026 
6027 	if (status < 0 && (*tl_copy)) {
6028 		kfree(*tl_copy);
6029 		*tl_copy = NULL;
6030 	}
6031 
6032 	mlog_exit(status);
6033 	return status;
6034 }
6035 
6036 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
6037 					 struct ocfs2_dinode *tl_copy)
6038 {
6039 	int status = 0;
6040 	int i;
6041 	unsigned int clusters, num_recs, start_cluster;
6042 	u64 start_blk;
6043 	handle_t *handle;
6044 	struct inode *tl_inode = osb->osb_tl_inode;
6045 	struct ocfs2_truncate_log *tl;
6046 
6047 	mlog_entry_void();
6048 
6049 	if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) {
6050 		mlog(ML_ERROR, "Asked to recover my own truncate log!\n");
6051 		return -EINVAL;
6052 	}
6053 
6054 	tl = &tl_copy->id2.i_dealloc;
6055 	num_recs = le16_to_cpu(tl->tl_used);
6056 	mlog(0, "cleanup %u records from %llu\n", num_recs,
6057 	     (unsigned long long)le64_to_cpu(tl_copy->i_blkno));
6058 
6059 	mutex_lock(&tl_inode->i_mutex);
6060 	for(i = 0; i < num_recs; i++) {
6061 		if (ocfs2_truncate_log_needs_flush(osb)) {
6062 			status = __ocfs2_flush_truncate_log(osb);
6063 			if (status < 0) {
6064 				mlog_errno(status);
6065 				goto bail_up;
6066 			}
6067 		}
6068 
6069 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6070 		if (IS_ERR(handle)) {
6071 			status = PTR_ERR(handle);
6072 			mlog_errno(status);
6073 			goto bail_up;
6074 		}
6075 
6076 		clusters = le32_to_cpu(tl->tl_recs[i].t_clusters);
6077 		start_cluster = le32_to_cpu(tl->tl_recs[i].t_start);
6078 		start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster);
6079 
6080 		status = ocfs2_truncate_log_append(osb, handle,
6081 						   start_blk, clusters);
6082 		ocfs2_commit_trans(osb, handle);
6083 		if (status < 0) {
6084 			mlog_errno(status);
6085 			goto bail_up;
6086 		}
6087 	}
6088 
6089 bail_up:
6090 	mutex_unlock(&tl_inode->i_mutex);
6091 
6092 	mlog_exit(status);
6093 	return status;
6094 }
6095 
6096 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb)
6097 {
6098 	int status;
6099 	struct inode *tl_inode = osb->osb_tl_inode;
6100 
6101 	mlog_entry_void();
6102 
6103 	if (tl_inode) {
6104 		cancel_delayed_work(&osb->osb_truncate_log_wq);
6105 		flush_workqueue(ocfs2_wq);
6106 
6107 		status = ocfs2_flush_truncate_log(osb);
6108 		if (status < 0)
6109 			mlog_errno(status);
6110 
6111 		brelse(osb->osb_tl_bh);
6112 		iput(osb->osb_tl_inode);
6113 	}
6114 
6115 	mlog_exit_void();
6116 }
6117 
6118 int ocfs2_truncate_log_init(struct ocfs2_super *osb)
6119 {
6120 	int status;
6121 	struct inode *tl_inode = NULL;
6122 	struct buffer_head *tl_bh = NULL;
6123 
6124 	mlog_entry_void();
6125 
6126 	status = ocfs2_get_truncate_log_info(osb,
6127 					     osb->slot_num,
6128 					     &tl_inode,
6129 					     &tl_bh);
6130 	if (status < 0)
6131 		mlog_errno(status);
6132 
6133 	/* ocfs2_truncate_log_shutdown keys on the existence of
6134 	 * osb->osb_tl_inode so we don't set any of the osb variables
6135 	 * until we're sure all is well. */
6136 	INIT_DELAYED_WORK(&osb->osb_truncate_log_wq,
6137 			  ocfs2_truncate_log_worker);
6138 	osb->osb_tl_bh    = tl_bh;
6139 	osb->osb_tl_inode = tl_inode;
6140 
6141 	mlog_exit(status);
6142 	return status;
6143 }
6144 
6145 /*
6146  * Delayed de-allocation of suballocator blocks.
6147  *
6148  * Some sets of block de-allocations might involve multiple suballocator inodes.
6149  *
6150  * The locking for this can get extremely complicated, especially when
6151  * the suballocator inodes to delete from aren't known until deep
6152  * within an unrelated codepath.
6153  *
6154  * ocfs2_extent_block structures are a good example of this - an inode
6155  * btree could have been grown by any number of nodes each allocating
6156  * out of their own suballoc inode.
6157  *
6158  * These structures allow the delay of block de-allocation until a
6159  * later time, when locking of multiple cluster inodes won't cause
6160  * deadlock.
6161  */
6162 
/*
 * Describe a single bit freed from a suballocator.  For the block
 * suballocators, it represents one block.  For the global cluster
 * allocator, it represents a run of clusters and free_bit holds the
 * number of clusters.
 *
 * Entries are chained into a singly linked list through free_next.
 */
struct ocfs2_cached_block_free {
	/* Next entry in the list, or NULL at the end. */
	struct ocfs2_cached_block_free		*free_next;
	/* Block number of the freed block (or start of the cluster run). */
	u64					free_blk;
	/* Suballocator bit, or cluster count for the global allocator. */
	unsigned int				free_bit;
};
6174 
/*
 * List of cached frees destined for one suballocator, identified by
 * the (system inode type, slot) pair.  These lists are themselves
 * chained together through f_next_suballocator.
 */
struct ocfs2_per_slot_free_list {
	/* Next per-suballocator list in the dealloc context. */
	struct ocfs2_per_slot_free_list		*f_next_suballocator;
	/* System inode type of the suballocator. */
	int					f_inode_type;
	/* Slot the suballocator belongs to. */
	int					f_slot;
	/* Head of the cached frees for this suballocator. */
	struct ocfs2_cached_block_free		*f_first;
};
6181 
6182 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
6183 				    int sysfile_type,
6184 				    int slot,
6185 				    struct ocfs2_cached_block_free *head)
6186 {
6187 	int ret;
6188 	u64 bg_blkno;
6189 	handle_t *handle;
6190 	struct inode *inode;
6191 	struct buffer_head *di_bh = NULL;
6192 	struct ocfs2_cached_block_free *tmp;
6193 
6194 	inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot);
6195 	if (!inode) {
6196 		ret = -EINVAL;
6197 		mlog_errno(ret);
6198 		goto out;
6199 	}
6200 
6201 	mutex_lock(&inode->i_mutex);
6202 
6203 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
6204 	if (ret) {
6205 		mlog_errno(ret);
6206 		goto out_mutex;
6207 	}
6208 
6209 	handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE);
6210 	if (IS_ERR(handle)) {
6211 		ret = PTR_ERR(handle);
6212 		mlog_errno(ret);
6213 		goto out_unlock;
6214 	}
6215 
6216 	while (head) {
6217 		bg_blkno = ocfs2_which_suballoc_group(head->free_blk,
6218 						      head->free_bit);
6219 		mlog(0, "Free bit: (bit %u, blkno %llu)\n",
6220 		     head->free_bit, (unsigned long long)head->free_blk);
6221 
6222 		ret = ocfs2_free_suballoc_bits(handle, inode, di_bh,
6223 					       head->free_bit, bg_blkno, 1);
6224 		if (ret) {
6225 			mlog_errno(ret);
6226 			goto out_journal;
6227 		}
6228 
6229 		ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE);
6230 		if (ret) {
6231 			mlog_errno(ret);
6232 			goto out_journal;
6233 		}
6234 
6235 		tmp = head;
6236 		head = head->free_next;
6237 		kfree(tmp);
6238 	}
6239 
6240 out_journal:
6241 	ocfs2_commit_trans(osb, handle);
6242 
6243 out_unlock:
6244 	ocfs2_inode_unlock(inode, 1);
6245 	brelse(di_bh);
6246 out_mutex:
6247 	mutex_unlock(&inode->i_mutex);
6248 	iput(inode);
6249 out:
6250 	while(head) {
6251 		/* Premature exit may have left some dangling items. */
6252 		tmp = head;
6253 		head = head->free_next;
6254 		kfree(tmp);
6255 	}
6256 
6257 	return ret;
6258 }
6259 
6260 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6261 				u64 blkno, unsigned int bit)
6262 {
6263 	int ret = 0;
6264 	struct ocfs2_cached_block_free *item;
6265 
6266 	item = kmalloc(sizeof(*item), GFP_NOFS);
6267 	if (item == NULL) {
6268 		ret = -ENOMEM;
6269 		mlog_errno(ret);
6270 		return ret;
6271 	}
6272 
6273 	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
6274 	     bit, (unsigned long long)blkno);
6275 
6276 	item->free_blk = blkno;
6277 	item->free_bit = bit;
6278 	item->free_next = ctxt->c_global_allocator;
6279 
6280 	ctxt->c_global_allocator = item;
6281 	return ret;
6282 }
6283 
6284 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
6285 				      struct ocfs2_cached_block_free *head)
6286 {
6287 	struct ocfs2_cached_block_free *tmp;
6288 	struct inode *tl_inode = osb->osb_tl_inode;
6289 	handle_t *handle;
6290 	int ret = 0;
6291 
6292 	mutex_lock(&tl_inode->i_mutex);
6293 
6294 	while (head) {
6295 		if (ocfs2_truncate_log_needs_flush(osb)) {
6296 			ret = __ocfs2_flush_truncate_log(osb);
6297 			if (ret < 0) {
6298 				mlog_errno(ret);
6299 				break;
6300 			}
6301 		}
6302 
6303 		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
6304 		if (IS_ERR(handle)) {
6305 			ret = PTR_ERR(handle);
6306 			mlog_errno(ret);
6307 			break;
6308 		}
6309 
6310 		ret = ocfs2_truncate_log_append(osb, handle, head->free_blk,
6311 						head->free_bit);
6312 
6313 		ocfs2_commit_trans(osb, handle);
6314 		tmp = head;
6315 		head = head->free_next;
6316 		kfree(tmp);
6317 
6318 		if (ret < 0) {
6319 			mlog_errno(ret);
6320 			break;
6321 		}
6322 	}
6323 
6324 	mutex_unlock(&tl_inode->i_mutex);
6325 
6326 	while (head) {
6327 		/* Premature exit may have left some dangling items. */
6328 		tmp = head;
6329 		head = head->free_next;
6330 		kfree(tmp);
6331 	}
6332 
6333 	return ret;
6334 }
6335 
6336 int ocfs2_run_deallocs(struct ocfs2_super *osb,
6337 		       struct ocfs2_cached_dealloc_ctxt *ctxt)
6338 {
6339 	int ret = 0, ret2;
6340 	struct ocfs2_per_slot_free_list *fl;
6341 
6342 	if (!ctxt)
6343 		return 0;
6344 
6345 	while (ctxt->c_first_suballocator) {
6346 		fl = ctxt->c_first_suballocator;
6347 
6348 		if (fl->f_first) {
6349 			mlog(0, "Free items: (type %u, slot %d)\n",
6350 			     fl->f_inode_type, fl->f_slot);
6351 			ret2 = ocfs2_free_cached_blocks(osb,
6352 							fl->f_inode_type,
6353 							fl->f_slot,
6354 							fl->f_first);
6355 			if (ret2)
6356 				mlog_errno(ret2);
6357 			if (!ret)
6358 				ret = ret2;
6359 		}
6360 
6361 		ctxt->c_first_suballocator = fl->f_next_suballocator;
6362 		kfree(fl);
6363 	}
6364 
6365 	if (ctxt->c_global_allocator) {
6366 		ret2 = ocfs2_free_cached_clusters(osb,
6367 						  ctxt->c_global_allocator);
6368 		if (ret2)
6369 			mlog_errno(ret2);
6370 		if (!ret)
6371 			ret = ret2;
6372 
6373 		ctxt->c_global_allocator = NULL;
6374 	}
6375 
6376 	return ret;
6377 }
6378 
6379 static struct ocfs2_per_slot_free_list *
6380 ocfs2_find_per_slot_free_list(int type,
6381 			      int slot,
6382 			      struct ocfs2_cached_dealloc_ctxt *ctxt)
6383 {
6384 	struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator;
6385 
6386 	while (fl) {
6387 		if (fl->f_inode_type == type && fl->f_slot == slot)
6388 			return fl;
6389 
6390 		fl = fl->f_next_suballocator;
6391 	}
6392 
6393 	fl = kmalloc(sizeof(*fl), GFP_NOFS);
6394 	if (fl) {
6395 		fl->f_inode_type = type;
6396 		fl->f_slot = slot;
6397 		fl->f_first = NULL;
6398 		fl->f_next_suballocator = ctxt->c_first_suballocator;
6399 
6400 		ctxt->c_first_suballocator = fl;
6401 	}
6402 	return fl;
6403 }
6404 
6405 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
6406 				     int type, int slot, u64 blkno,
6407 				     unsigned int bit)
6408 {
6409 	int ret;
6410 	struct ocfs2_per_slot_free_list *fl;
6411 	struct ocfs2_cached_block_free *item;
6412 
6413 	fl = ocfs2_find_per_slot_free_list(type, slot, ctxt);
6414 	if (fl == NULL) {
6415 		ret = -ENOMEM;
6416 		mlog_errno(ret);
6417 		goto out;
6418 	}
6419 
6420 	item = kmalloc(sizeof(*item), GFP_NOFS);
6421 	if (item == NULL) {
6422 		ret = -ENOMEM;
6423 		mlog_errno(ret);
6424 		goto out;
6425 	}
6426 
6427 	mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n",
6428 	     type, slot, bit, (unsigned long long)blkno);
6429 
6430 	item->free_blk = blkno;
6431 	item->free_bit = bit;
6432 	item->free_next = fl->f_first;
6433 
6434 	fl->f_first = item;
6435 
6436 	ret = 0;
6437 out:
6438 	return ret;
6439 }
6440 
6441 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt,
6442 					 struct ocfs2_extent_block *eb)
6443 {
6444 	return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE,
6445 					 le16_to_cpu(eb->h_suballoc_slot),
6446 					 le64_to_cpu(eb->h_blkno),
6447 					 le16_to_cpu(eb->h_suballoc_bit));
6448 }
6449 
6450 /* This function will figure out whether the currently last extent
6451  * block will be deleted, and if it will, what the new last extent
6452  * block will be so we can update his h_next_leaf_blk field, as well
6453  * as the dinodes i_last_eb_blk */
6454 static int ocfs2_find_new_last_ext_blk(struct inode *inode,
6455 				       unsigned int clusters_to_del,
6456 				       struct ocfs2_path *path,
6457 				       struct buffer_head **new_last_eb)
6458 {
6459 	int next_free, ret = 0;
6460 	u32 cpos;
6461 	struct ocfs2_extent_rec *rec;
6462 	struct ocfs2_extent_block *eb;
6463 	struct ocfs2_extent_list *el;
6464 	struct buffer_head *bh = NULL;
6465 
6466 	*new_last_eb = NULL;
6467 
6468 	/* we have no tree, so of course, no last_eb. */
6469 	if (!path->p_tree_depth)
6470 		goto out;
6471 
6472 	/* trunc to zero special case - this makes tree_depth = 0
6473 	 * regardless of what it is.  */
6474 	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
6475 		goto out;
6476 
6477 	el = path_leaf_el(path);
6478 	BUG_ON(!el->l_next_free_rec);
6479 
6480 	/*
6481 	 * Make sure that this extent list will actually be empty
6482 	 * after we clear away the data. We can shortcut out if
6483 	 * there's more than one non-empty extent in the
6484 	 * list. Otherwise, a check of the remaining extent is
6485 	 * necessary.
6486 	 */
6487 	next_free = le16_to_cpu(el->l_next_free_rec);
6488 	rec = NULL;
6489 	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6490 		if (next_free > 2)
6491 			goto out;
6492 
6493 		/* We may have a valid extent in index 1, check it. */
6494 		if (next_free == 2)
6495 			rec = &el->l_recs[1];
6496 
6497 		/*
6498 		 * Fall through - no more nonempty extents, so we want
6499 		 * to delete this leaf.
6500 		 */
6501 	} else {
6502 		if (next_free > 1)
6503 			goto out;
6504 
6505 		rec = &el->l_recs[0];
6506 	}
6507 
6508 	if (rec) {
6509 		/*
6510 		 * Check it we'll only be trimming off the end of this
6511 		 * cluster.
6512 		 */
6513 		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
6514 			goto out;
6515 	}
6516 
6517 	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
6518 	if (ret) {
6519 		mlog_errno(ret);
6520 		goto out;
6521 	}
6522 
6523 	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
6524 	if (ret) {
6525 		mlog_errno(ret);
6526 		goto out;
6527 	}
6528 
6529 	eb = (struct ocfs2_extent_block *) bh->b_data;
6530 	el = &eb->h_list;
6531 
6532 	/* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
6533 	 * Any corruption is a code bug. */
6534 	BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
6535 
6536 	*new_last_eb = bh;
6537 	get_bh(*new_last_eb);
6538 	mlog(0, "returning block %llu, (cpos: %u)\n",
6539 	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
6540 out:
6541 	brelse(bh);
6542 
6543 	return ret;
6544 }
6545 
6546 /*
6547  * Trim some clusters off the rightmost edge of a tree. Only called
6548  * during truncate.
6549  *
6550  * The caller needs to:
6551  *   - start journaling of each path component.
6552  *   - compute and fully set up any new last ext block
6553  */
6554 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
6555 			   handle_t *handle, struct ocfs2_truncate_context *tc,
6556 			   u32 clusters_to_del, u64 *delete_start)
6557 {
6558 	int ret, i, index = path->p_tree_depth;
6559 	u32 new_edge = 0;
6560 	u64 deleted_eb = 0;
6561 	struct buffer_head *bh;
6562 	struct ocfs2_extent_list *el;
6563 	struct ocfs2_extent_rec *rec;
6564 
6565 	*delete_start = 0;
6566 
6567 	while (index >= 0) {
6568 		bh = path->p_node[index].bh;
6569 		el = path->p_node[index].el;
6570 
6571 		mlog(0, "traveling tree (index = %d, block = %llu)\n",
6572 		     index,  (unsigned long long)bh->b_blocknr);
6573 
6574 		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
6575 
6576 		if (index !=
6577 		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
6578 			ocfs2_error(inode->i_sb,
6579 				    "Inode %lu has invalid ext. block %llu",
6580 				    inode->i_ino,
6581 				    (unsigned long long)bh->b_blocknr);
6582 			ret = -EROFS;
6583 			goto out;
6584 		}
6585 
6586 find_tail_record:
6587 		i = le16_to_cpu(el->l_next_free_rec) - 1;
6588 		rec = &el->l_recs[i];
6589 
6590 		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
6591 		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
6592 		     ocfs2_rec_clusters(el, rec),
6593 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6594 		     le16_to_cpu(el->l_next_free_rec));
6595 
6596 		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
6597 
6598 		if (le16_to_cpu(el->l_tree_depth) == 0) {
6599 			/*
6600 			 * If the leaf block contains a single empty
6601 			 * extent and no records, we can just remove
6602 			 * the block.
6603 			 */
6604 			if (i == 0 && ocfs2_is_empty_extent(rec)) {
6605 				memset(rec, 0,
6606 				       sizeof(struct ocfs2_extent_rec));
6607 				el->l_next_free_rec = cpu_to_le16(0);
6608 
6609 				goto delete;
6610 			}
6611 
6612 			/*
6613 			 * Remove any empty extents by shifting things
6614 			 * left. That should make life much easier on
6615 			 * the code below. This condition is rare
6616 			 * enough that we shouldn't see a performance
6617 			 * hit.
6618 			 */
6619 			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
6620 				le16_add_cpu(&el->l_next_free_rec, -1);
6621 
6622 				for(i = 0;
6623 				    i < le16_to_cpu(el->l_next_free_rec); i++)
6624 					el->l_recs[i] = el->l_recs[i + 1];
6625 
6626 				memset(&el->l_recs[i], 0,
6627 				       sizeof(struct ocfs2_extent_rec));
6628 
6629 				/*
6630 				 * We've modified our extent list. The
6631 				 * simplest way to handle this change
6632 				 * is to being the search from the
6633 				 * start again.
6634 				 */
6635 				goto find_tail_record;
6636 			}
6637 
6638 			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
6639 
6640 			/*
6641 			 * We'll use "new_edge" on our way back up the
6642 			 * tree to know what our rightmost cpos is.
6643 			 */
6644 			new_edge = le16_to_cpu(rec->e_leaf_clusters);
6645 			new_edge += le32_to_cpu(rec->e_cpos);
6646 
6647 			/*
6648 			 * The caller will use this to delete data blocks.
6649 			 */
6650 			*delete_start = le64_to_cpu(rec->e_blkno)
6651 				+ ocfs2_clusters_to_blocks(inode->i_sb,
6652 					le16_to_cpu(rec->e_leaf_clusters));
6653 
6654 			/*
6655 			 * If it's now empty, remove this record.
6656 			 */
6657 			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
6658 				memset(rec, 0,
6659 				       sizeof(struct ocfs2_extent_rec));
6660 				le16_add_cpu(&el->l_next_free_rec, -1);
6661 			}
6662 		} else {
6663 			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
6664 				memset(rec, 0,
6665 				       sizeof(struct ocfs2_extent_rec));
6666 				le16_add_cpu(&el->l_next_free_rec, -1);
6667 
6668 				goto delete;
6669 			}
6670 
6671 			/* Can this actually happen? */
6672 			if (le16_to_cpu(el->l_next_free_rec) == 0)
6673 				goto delete;
6674 
6675 			/*
6676 			 * We never actually deleted any clusters
6677 			 * because our leaf was empty. There's no
6678 			 * reason to adjust the rightmost edge then.
6679 			 */
6680 			if (new_edge == 0)
6681 				goto delete;
6682 
6683 			rec->e_int_clusters = cpu_to_le32(new_edge);
6684 			le32_add_cpu(&rec->e_int_clusters,
6685 				     -le32_to_cpu(rec->e_cpos));
6686 
6687 			 /*
6688 			  * A deleted child record should have been
6689 			  * caught above.
6690 			  */
6691 			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
6692 		}
6693 
6694 delete:
6695 		ret = ocfs2_journal_dirty(handle, bh);
6696 		if (ret) {
6697 			mlog_errno(ret);
6698 			goto out;
6699 		}
6700 
6701 		mlog(0, "extent list container %llu, after: record %d: "
6702 		     "(%u, %u, %llu), next = %u.\n",
6703 		     (unsigned long long)bh->b_blocknr, i,
6704 		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
6705 		     (unsigned long long)le64_to_cpu(rec->e_blkno),
6706 		     le16_to_cpu(el->l_next_free_rec));
6707 
6708 		/*
6709 		 * We must be careful to only attempt delete of an
6710 		 * extent block (and not the root inode block).
6711 		 */
6712 		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
6713 			struct ocfs2_extent_block *eb =
6714 				(struct ocfs2_extent_block *)bh->b_data;
6715 
6716 			/*
6717 			 * Save this for use when processing the
6718 			 * parent block.
6719 			 */
6720 			deleted_eb = le64_to_cpu(eb->h_blkno);
6721 
6722 			mlog(0, "deleting this extent block.\n");
6723 
6724 			ocfs2_remove_from_cache(INODE_CACHE(inode), bh);
6725 
6726 			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
6727 			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
6728 			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
6729 
6730 			ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb);
6731 			/* An error here is not fatal. */
6732 			if (ret < 0)
6733 				mlog_errno(ret);
6734 		} else {
6735 			deleted_eb = 0;
6736 		}
6737 
6738 		index--;
6739 	}
6740 
6741 	ret = 0;
6742 out:
6743 	return ret;
6744 }
6745 
6746 static int ocfs2_do_truncate(struct ocfs2_super *osb,
6747 			     unsigned int clusters_to_del,
6748 			     struct inode *inode,
6749 			     struct buffer_head *fe_bh,
6750 			     handle_t *handle,
6751 			     struct ocfs2_truncate_context *tc,
6752 			     struct ocfs2_path *path)
6753 {
6754 	int status;
6755 	struct ocfs2_dinode *fe;
6756 	struct ocfs2_extent_block *last_eb = NULL;
6757 	struct ocfs2_extent_list *el;
6758 	struct buffer_head *last_eb_bh = NULL;
6759 	u64 delete_blk = 0;
6760 
6761 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
6762 
6763 	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
6764 					     path, &last_eb_bh);
6765 	if (status < 0) {
6766 		mlog_errno(status);
6767 		goto bail;
6768 	}
6769 
6770 	/*
6771 	 * Each component will be touched, so we might as well journal
6772 	 * here to avoid having to handle errors later.
6773 	 */
6774 	status = ocfs2_journal_access_path(INODE_CACHE(inode), handle, path);
6775 	if (status < 0) {
6776 		mlog_errno(status);
6777 		goto bail;
6778 	}
6779 
6780 	if (last_eb_bh) {
6781 		status = ocfs2_journal_access_eb(handle, INODE_CACHE(inode), last_eb_bh,
6782 						 OCFS2_JOURNAL_ACCESS_WRITE);
6783 		if (status < 0) {
6784 			mlog_errno(status);
6785 			goto bail;
6786 		}
6787 
6788 		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
6789 	}
6790 
6791 	el = &(fe->id2.i_list);
6792 
6793 	/*
6794 	 * Lower levels depend on this never happening, but it's best
6795 	 * to check it up here before changing the tree.
6796 	 */
6797 	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
6798 		ocfs2_error(inode->i_sb,
6799 			    "Inode %lu has an empty extent record, depth %u\n",
6800 			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
6801 		status = -EROFS;
6802 		goto bail;
6803 	}
6804 
6805 	vfs_dq_free_space_nodirty(inode,
6806 			ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
6807 	spin_lock(&OCFS2_I(inode)->ip_lock);
6808 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
6809 				      clusters_to_del;
6810 	spin_unlock(&OCFS2_I(inode)->ip_lock);
6811 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
6812 	inode->i_blocks = ocfs2_inode_sector_count(inode);
6813 
6814 	status = ocfs2_trim_tree(inode, path, handle, tc,
6815 				 clusters_to_del, &delete_blk);
6816 	if (status) {
6817 		mlog_errno(status);
6818 		goto bail;
6819 	}
6820 
6821 	if (le32_to_cpu(fe->i_clusters) == 0) {
6822 		/* trunc to zero is a special case. */
6823 		el->l_tree_depth = 0;
6824 		fe->i_last_eb_blk = 0;
6825 	} else if (last_eb)
6826 		fe->i_last_eb_blk = last_eb->h_blkno;
6827 
6828 	status = ocfs2_journal_dirty(handle, fe_bh);
6829 	if (status < 0) {
6830 		mlog_errno(status);
6831 		goto bail;
6832 	}
6833 
6834 	if (last_eb) {
6835 		/* If there will be a new last extent block, then by
6836 		 * definition, there cannot be any leaves to the right of
6837 		 * him. */
6838 		last_eb->h_next_leaf_blk = 0;
6839 		status = ocfs2_journal_dirty(handle, last_eb_bh);
6840 		if (status < 0) {
6841 			mlog_errno(status);
6842 			goto bail;
6843 		}
6844 	}
6845 
6846 	if (delete_blk) {
6847 		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
6848 						   clusters_to_del);
6849 		if (status < 0) {
6850 			mlog_errno(status);
6851 			goto bail;
6852 		}
6853 	}
6854 	status = 0;
6855 bail:
6856 	brelse(last_eb_bh);
6857 	mlog_exit(status);
6858 	return status;
6859 }
6860 
6861 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh)
6862 {
6863 	set_buffer_uptodate(bh);
6864 	mark_buffer_dirty(bh);
6865 	return 0;
6866 }
6867 
6868 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
6869 				     unsigned int from, unsigned int to,
6870 				     struct page *page, int zero, u64 *phys)
6871 {
6872 	int ret, partial = 0;
6873 
6874 	ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0);
6875 	if (ret)
6876 		mlog_errno(ret);
6877 
6878 	if (zero)
6879 		zero_user_segment(page, from, to);
6880 
6881 	/*
6882 	 * Need to set the buffers we zero'd into uptodate
6883 	 * here if they aren't - ocfs2_map_page_blocks()
6884 	 * might've skipped some
6885 	 */
6886 	ret = walk_page_buffers(handle, page_buffers(page),
6887 				from, to, &partial,
6888 				ocfs2_zero_func);
6889 	if (ret < 0)
6890 		mlog_errno(ret);
6891 	else if (ocfs2_should_order_data(inode)) {
6892 		ret = ocfs2_jbd2_file_inode(handle, inode);
6893 		if (ret < 0)
6894 			mlog_errno(ret);
6895 	}
6896 
6897 	if (!partial)
6898 		SetPageUptodate(page);
6899 
6900 	flush_dcache_page(page);
6901 }
6902 
6903 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start,
6904 				     loff_t end, struct page **pages,
6905 				     int numpages, u64 phys, handle_t *handle)
6906 {
6907 	int i;
6908 	struct page *page;
6909 	unsigned int from, to = PAGE_CACHE_SIZE;
6910 	struct super_block *sb = inode->i_sb;
6911 
6912 	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
6913 
6914 	if (numpages == 0)
6915 		goto out;
6916 
6917 	to = PAGE_CACHE_SIZE;
6918 	for(i = 0; i < numpages; i++) {
6919 		page = pages[i];
6920 
6921 		from = start & (PAGE_CACHE_SIZE - 1);
6922 		if ((end >> PAGE_CACHE_SHIFT) == page->index)
6923 			to = end & (PAGE_CACHE_SIZE - 1);
6924 
6925 		BUG_ON(from > PAGE_CACHE_SIZE);
6926 		BUG_ON(to > PAGE_CACHE_SIZE);
6927 
6928 		ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1,
6929 					 &phys);
6930 
6931 		start = (page->index + 1) << PAGE_CACHE_SHIFT;
6932 	}
6933 out:
6934 	if (pages)
6935 		ocfs2_unlock_and_free_pages(pages, numpages);
6936 }
6937 
6938 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end,
6939 				struct page **pages, int *num)
6940 {
6941 	int numpages, ret = 0;
6942 	struct super_block *sb = inode->i_sb;
6943 	struct address_space *mapping = inode->i_mapping;
6944 	unsigned long index;
6945 	loff_t last_page_bytes;
6946 
6947 	BUG_ON(start > end);
6948 
6949 	BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits !=
6950 	       (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits);
6951 
6952 	numpages = 0;
6953 	last_page_bytes = PAGE_ALIGN(end);
6954 	index = start >> PAGE_CACHE_SHIFT;
6955 	do {
6956 		pages[numpages] = grab_cache_page(mapping, index);
6957 		if (!pages[numpages]) {
6958 			ret = -ENOMEM;
6959 			mlog_errno(ret);
6960 			goto out;
6961 		}
6962 
6963 		numpages++;
6964 		index++;
6965 	} while (index < (last_page_bytes >> PAGE_CACHE_SHIFT));
6966 
6967 out:
6968 	if (ret != 0) {
6969 		if (pages)
6970 			ocfs2_unlock_and_free_pages(pages, numpages);
6971 		numpages = 0;
6972 	}
6973 
6974 	*num = numpages;
6975 
6976 	return ret;
6977 }
6978 
6979 /*
6980  * Zero the area past i_size but still within an allocated
6981  * cluster. This avoids exposing nonzero data on subsequent file
6982  * extends.
6983  *
6984  * We need to call this before i_size is updated on the inode because
6985  * otherwise block_write_full_page() will skip writeout of pages past
6986  * i_size. The new_i_size parameter is passed for this reason.
6987  */
6988 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle,
6989 				  u64 range_start, u64 range_end)
6990 {
6991 	int ret = 0, numpages;
6992 	struct page **pages = NULL;
6993 	u64 phys;
6994 	unsigned int ext_flags;
6995 	struct super_block *sb = inode->i_sb;
6996 
6997 	/*
6998 	 * File systems which don't support sparse files zero on every
6999 	 * extend.
7000 	 */
7001 	if (!ocfs2_sparse_alloc(OCFS2_SB(sb)))
7002 		return 0;
7003 
7004 	pages = kcalloc(ocfs2_pages_per_cluster(sb),
7005 			sizeof(struct page *), GFP_NOFS);
7006 	if (pages == NULL) {
7007 		ret = -ENOMEM;
7008 		mlog_errno(ret);
7009 		goto out;
7010 	}
7011 
7012 	if (range_start == range_end)
7013 		goto out;
7014 
7015 	ret = ocfs2_extent_map_get_blocks(inode,
7016 					  range_start >> sb->s_blocksize_bits,
7017 					  &phys, NULL, &ext_flags);
7018 	if (ret) {
7019 		mlog_errno(ret);
7020 		goto out;
7021 	}
7022 
7023 	/*
7024 	 * Tail is a hole, or is marked unwritten. In either case, we
7025 	 * can count on read and write to return/push zero's.
7026 	 */
7027 	if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN)
7028 		goto out;
7029 
7030 	ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages,
7031 				   &numpages);
7032 	if (ret) {
7033 		mlog_errno(ret);
7034 		goto out;
7035 	}
7036 
7037 	ocfs2_zero_cluster_pages(inode, range_start, range_end, pages,
7038 				 numpages, phys, handle);
7039 
7040 	/*
7041 	 * Initiate writeout of the pages we zero'd here. We don't
7042 	 * wait on them - the truncate_inode_pages() call later will
7043 	 * do that for us.
7044 	 */
7045 	ret = do_sync_mapping_range(inode->i_mapping, range_start,
7046 				    range_end - 1, SYNC_FILE_RANGE_WRITE);
7047 	if (ret)
7048 		mlog_errno(ret);
7049 
7050 out:
7051 	if (pages)
7052 		kfree(pages);
7053 
7054 	return ret;
7055 }
7056 
7057 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode,
7058 					     struct ocfs2_dinode *di)
7059 {
7060 	unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits;
7061 	unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size);
7062 
7063 	if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL)
7064 		memset(&di->id2, 0, blocksize -
7065 				    offsetof(struct ocfs2_dinode, id2) -
7066 				    xattrsize);
7067 	else
7068 		memset(&di->id2, 0, blocksize -
7069 				    offsetof(struct ocfs2_dinode, id2));
7070 }
7071 
7072 void ocfs2_dinode_new_extent_list(struct inode *inode,
7073 				  struct ocfs2_dinode *di)
7074 {
7075 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7076 	di->id2.i_list.l_tree_depth = 0;
7077 	di->id2.i_list.l_next_free_rec = 0;
7078 	di->id2.i_list.l_count = cpu_to_le16(
7079 		ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di));
7080 }
7081 
7082 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di)
7083 {
7084 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7085 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7086 
7087 	spin_lock(&oi->ip_lock);
7088 	oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
7089 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7090 	spin_unlock(&oi->ip_lock);
7091 
7092 	/*
7093 	 * We clear the entire i_data structure here so that all
7094 	 * fields can be properly initialized.
7095 	 */
7096 	ocfs2_zero_dinode_id2_with_xattr(inode, di);
7097 
7098 	idata->id_count = cpu_to_le16(
7099 			ocfs2_max_inline_data_with_xattr(inode->i_sb, di));
7100 }
7101 
7102 int ocfs2_convert_inline_data_to_extents(struct inode *inode,
7103 					 struct buffer_head *di_bh)
7104 {
7105 	int ret, i, has_data, num_pages = 0;
7106 	handle_t *handle;
7107 	u64 uninitialized_var(block);
7108 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
7109 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7110 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7111 	struct ocfs2_alloc_context *data_ac = NULL;
7112 	struct page **pages = NULL;
7113 	loff_t end = osb->s_clustersize;
7114 	struct ocfs2_extent_tree et;
7115 	int did_quota = 0;
7116 
7117 	has_data = i_size_read(inode) ? 1 : 0;
7118 
7119 	if (has_data) {
7120 		pages = kcalloc(ocfs2_pages_per_cluster(osb->sb),
7121 				sizeof(struct page *), GFP_NOFS);
7122 		if (pages == NULL) {
7123 			ret = -ENOMEM;
7124 			mlog_errno(ret);
7125 			goto out;
7126 		}
7127 
7128 		ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
7129 		if (ret) {
7130 			mlog_errno(ret);
7131 			goto out;
7132 		}
7133 	}
7134 
7135 	handle = ocfs2_start_trans(osb,
7136 				   ocfs2_inline_to_extents_credits(osb->sb));
7137 	if (IS_ERR(handle)) {
7138 		ret = PTR_ERR(handle);
7139 		mlog_errno(ret);
7140 		goto out_unlock;
7141 	}
7142 
7143 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7144 				      OCFS2_JOURNAL_ACCESS_WRITE);
7145 	if (ret) {
7146 		mlog_errno(ret);
7147 		goto out_commit;
7148 	}
7149 
7150 	if (has_data) {
7151 		u32 bit_off, num;
7152 		unsigned int page_end;
7153 		u64 phys;
7154 
7155 		if (vfs_dq_alloc_space_nodirty(inode,
7156 				       ocfs2_clusters_to_bytes(osb->sb, 1))) {
7157 			ret = -EDQUOT;
7158 			goto out_commit;
7159 		}
7160 		did_quota = 1;
7161 
7162 		ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
7163 					   &num);
7164 		if (ret) {
7165 			mlog_errno(ret);
7166 			goto out_commit;
7167 		}
7168 
7169 		/*
7170 		 * Save two copies, one for insert, and one that can
7171 		 * be changed by ocfs2_map_and_dirty_page() below.
7172 		 */
7173 		block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off);
7174 
7175 		/*
7176 		 * Non sparse file systems zero on extend, so no need
7177 		 * to do that now.
7178 		 */
7179 		if (!ocfs2_sparse_alloc(osb) &&
7180 		    PAGE_CACHE_SIZE < osb->s_clustersize)
7181 			end = PAGE_CACHE_SIZE;
7182 
7183 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
7184 		if (ret) {
7185 			mlog_errno(ret);
7186 			goto out_commit;
7187 		}
7188 
7189 		/*
7190 		 * This should populate the 1st page for us and mark
7191 		 * it up to date.
7192 		 */
7193 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
7194 		if (ret) {
7195 			mlog_errno(ret);
7196 			goto out_commit;
7197 		}
7198 
7199 		page_end = PAGE_CACHE_SIZE;
7200 		if (PAGE_CACHE_SIZE > osb->s_clustersize)
7201 			page_end = osb->s_clustersize;
7202 
7203 		for (i = 0; i < num_pages; i++)
7204 			ocfs2_map_and_dirty_page(inode, handle, 0, page_end,
7205 						 pages[i], i > 0, &phys);
7206 	}
7207 
7208 	spin_lock(&oi->ip_lock);
7209 	oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL;
7210 	di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
7211 	spin_unlock(&oi->ip_lock);
7212 
7213 	ocfs2_dinode_new_extent_list(inode, di);
7214 
7215 	ocfs2_journal_dirty(handle, di_bh);
7216 
7217 	if (has_data) {
7218 		/*
7219 		 * An error at this point should be extremely rare. If
7220 		 * this proves to be false, we could always re-build
7221 		 * the in-inode data from our pages.
7222 		 */
7223 		ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
7224 		ret = ocfs2_insert_extent(osb, handle, inode, &et,
7225 					  0, block, 1, 0, NULL);
7226 		if (ret) {
7227 			mlog_errno(ret);
7228 			goto out_commit;
7229 		}
7230 
7231 		inode->i_blocks = ocfs2_inode_sector_count(inode);
7232 	}
7233 
7234 out_commit:
7235 	if (ret < 0 && did_quota)
7236 		vfs_dq_free_space_nodirty(inode,
7237 					  ocfs2_clusters_to_bytes(osb->sb, 1));
7238 
7239 	ocfs2_commit_trans(osb, handle);
7240 
7241 out_unlock:
7242 	if (data_ac)
7243 		ocfs2_free_alloc_context(data_ac);
7244 
7245 out:
7246 	if (pages) {
7247 		ocfs2_unlock_and_free_pages(pages, num_pages);
7248 		kfree(pages);
7249 	}
7250 
7251 	return ret;
7252 }
7253 
7254 /*
7255  * It is expected, that by the time you call this function,
7256  * inode->i_size and fe->i_size have been adjusted.
7257  *
7258  * WARNING: This will kfree the truncate context
7259  */
7260 int ocfs2_commit_truncate(struct ocfs2_super *osb,
7261 			  struct inode *inode,
7262 			  struct buffer_head *fe_bh,
7263 			  struct ocfs2_truncate_context *tc)
7264 {
7265 	int status, i, credits, tl_sem = 0;
7266 	u32 clusters_to_del, new_highest_cpos, range;
7267 	struct ocfs2_extent_list *el;
7268 	handle_t *handle = NULL;
7269 	struct inode *tl_inode = osb->osb_tl_inode;
7270 	struct ocfs2_path *path = NULL;
7271 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data;
7272 
7273 	mlog_entry_void();
7274 
7275 	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
7276 						     i_size_read(inode));
7277 
7278 	path = ocfs2_new_path(fe_bh, &di->id2.i_list,
7279 			      ocfs2_journal_access_di);
7280 	if (!path) {
7281 		status = -ENOMEM;
7282 		mlog_errno(status);
7283 		goto bail;
7284 	}
7285 
7286 	ocfs2_extent_map_trunc(inode, new_highest_cpos);
7287 
7288 start:
7289 	/*
7290 	 * Check that we still have allocation to delete.
7291 	 */
7292 	if (OCFS2_I(inode)->ip_clusters == 0) {
7293 		status = 0;
7294 		goto bail;
7295 	}
7296 
7297 	/*
7298 	 * Truncate always works against the rightmost tree branch.
7299 	 */
7300 	status = ocfs2_find_path(inode, path, UINT_MAX);
7301 	if (status) {
7302 		mlog_errno(status);
7303 		goto bail;
7304 	}
7305 
7306 	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
7307 	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
7308 
7309 	/*
7310 	 * By now, el will point to the extent list on the bottom most
7311 	 * portion of this tree. Only the tail record is considered in
7312 	 * each pass.
7313 	 *
7314 	 * We handle the following cases, in order:
7315 	 * - empty extent: delete the remaining branch
7316 	 * - remove the entire record
7317 	 * - remove a partial record
7318 	 * - no record needs to be removed (truncate has completed)
7319 	 */
7320 	el = path_leaf_el(path);
7321 	if (le16_to_cpu(el->l_next_free_rec) == 0) {
7322 		ocfs2_error(inode->i_sb,
7323 			    "Inode %llu has empty extent block at %llu\n",
7324 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7325 			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
7326 		status = -EROFS;
7327 		goto bail;
7328 	}
7329 
7330 	i = le16_to_cpu(el->l_next_free_rec) - 1;
7331 	range = le32_to_cpu(el->l_recs[i].e_cpos) +
7332 		ocfs2_rec_clusters(el, &el->l_recs[i]);
7333 	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
7334 		clusters_to_del = 0;
7335 	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
7336 		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
7337 	} else if (range > new_highest_cpos) {
7338 		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
7339 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
7340 				  new_highest_cpos;
7341 	} else {
7342 		status = 0;
7343 		goto bail;
7344 	}
7345 
7346 	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
7347 	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
7348 
7349 	mutex_lock(&tl_inode->i_mutex);
7350 	tl_sem = 1;
7351 	/* ocfs2_truncate_log_needs_flush guarantees us at least one
7352 	 * record is free for use. If there isn't any, we flush to get
7353 	 * an empty truncate log.  */
7354 	if (ocfs2_truncate_log_needs_flush(osb)) {
7355 		status = __ocfs2_flush_truncate_log(osb);
7356 		if (status < 0) {
7357 			mlog_errno(status);
7358 			goto bail;
7359 		}
7360 	}
7361 
7362 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
7363 						(struct ocfs2_dinode *)fe_bh->b_data,
7364 						el);
7365 	handle = ocfs2_start_trans(osb, credits);
7366 	if (IS_ERR(handle)) {
7367 		status = PTR_ERR(handle);
7368 		handle = NULL;
7369 		mlog_errno(status);
7370 		goto bail;
7371 	}
7372 
7373 	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
7374 				   tc, path);
7375 	if (status < 0) {
7376 		mlog_errno(status);
7377 		goto bail;
7378 	}
7379 
7380 	mutex_unlock(&tl_inode->i_mutex);
7381 	tl_sem = 0;
7382 
7383 	ocfs2_commit_trans(osb, handle);
7384 	handle = NULL;
7385 
7386 	ocfs2_reinit_path(path, 1);
7387 
7388 	/*
7389 	 * The check above will catch the case where we've truncated
7390 	 * away all allocation.
7391 	 */
7392 	goto start;
7393 
7394 bail:
7395 
7396 	ocfs2_schedule_truncate_log_flush(osb, 1);
7397 
7398 	if (tl_sem)
7399 		mutex_unlock(&tl_inode->i_mutex);
7400 
7401 	if (handle)
7402 		ocfs2_commit_trans(osb, handle);
7403 
7404 	ocfs2_run_deallocs(osb, &tc->tc_dealloc);
7405 
7406 	ocfs2_free_path(path);
7407 
7408 	/* This will drop the ext_alloc cluster lock for us */
7409 	ocfs2_free_truncate_context(tc);
7410 
7411 	mlog_exit(status);
7412 	return status;
7413 }
7414 
7415 /*
7416  * Expects the inode to already be locked.
7417  */
7418 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
7419 			   struct inode *inode,
7420 			   struct buffer_head *fe_bh,
7421 			   struct ocfs2_truncate_context **tc)
7422 {
7423 	int status;
7424 	unsigned int new_i_clusters;
7425 	struct ocfs2_dinode *fe;
7426 	struct ocfs2_extent_block *eb;
7427 	struct buffer_head *last_eb_bh = NULL;
7428 
7429 	mlog_entry_void();
7430 
7431 	*tc = NULL;
7432 
7433 	new_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
7434 						  i_size_read(inode));
7435 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
7436 
7437 	mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size ="
7438 	     "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters,
7439 	     (unsigned long long)le64_to_cpu(fe->i_size));
7440 
7441 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
7442 	if (!(*tc)) {
7443 		status = -ENOMEM;
7444 		mlog_errno(status);
7445 		goto bail;
7446 	}
7447 	ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
7448 
7449 	if (fe->id2.i_list.l_tree_depth) {
7450 		status = ocfs2_read_extent_block(inode,
7451 						 le64_to_cpu(fe->i_last_eb_blk),
7452 						 &last_eb_bh);
7453 		if (status < 0) {
7454 			mlog_errno(status);
7455 			goto bail;
7456 		}
7457 		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
7458 	}
7459 
7460 	(*tc)->tc_last_eb_bh = last_eb_bh;
7461 
7462 	status = 0;
7463 bail:
7464 	if (status < 0) {
7465 		if (*tc)
7466 			ocfs2_free_truncate_context(*tc);
7467 		*tc = NULL;
7468 	}
7469 	mlog_exit_void();
7470 	return status;
7471 }
7472 
7473 /*
7474  * 'start' is inclusive, 'end' is not.
7475  */
7476 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
7477 			  unsigned int start, unsigned int end, int trunc)
7478 {
7479 	int ret;
7480 	unsigned int numbytes;
7481 	handle_t *handle;
7482 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
7483 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
7484 	struct ocfs2_inline_data *idata = &di->id2.i_data;
7485 
7486 	if (end > i_size_read(inode))
7487 		end = i_size_read(inode);
7488 
7489 	BUG_ON(start >= end);
7490 
7491 	if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) ||
7492 	    !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) ||
7493 	    !ocfs2_supports_inline_data(osb)) {
7494 		ocfs2_error(inode->i_sb,
7495 			    "Inline data flags for inode %llu don't agree! "
7496 			    "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n",
7497 			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
7498 			    le16_to_cpu(di->i_dyn_features),
7499 			    OCFS2_I(inode)->ip_dyn_features,
7500 			    osb->s_feature_incompat);
7501 		ret = -EROFS;
7502 		goto out;
7503 	}
7504 
7505 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
7506 	if (IS_ERR(handle)) {
7507 		ret = PTR_ERR(handle);
7508 		mlog_errno(ret);
7509 		goto out;
7510 	}
7511 
7512 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
7513 				      OCFS2_JOURNAL_ACCESS_WRITE);
7514 	if (ret) {
7515 		mlog_errno(ret);
7516 		goto out_commit;
7517 	}
7518 
7519 	numbytes = end - start;
7520 	memset(idata->id_data + start, 0, numbytes);
7521 
7522 	/*
7523 	 * No need to worry about the data page here - it's been
7524 	 * truncated already and inline data doesn't need it for
7525 	 * pushing zero's to disk, so we'll let readpage pick it up
7526 	 * later.
7527 	 */
7528 	if (trunc) {
7529 		i_size_write(inode, start);
7530 		di->i_size = cpu_to_le64(start);
7531 	}
7532 
7533 	inode->i_blocks = ocfs2_inode_sector_count(inode);
7534 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
7535 
7536 	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
7537 	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
7538 
7539 	ocfs2_journal_dirty(handle, di_bh);
7540 
7541 out_commit:
7542 	ocfs2_commit_trans(osb, handle);
7543 
7544 out:
7545 	return ret;
7546 }
7547 
7548 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc)
7549 {
7550 	/*
7551 	 * The caller is responsible for completing deallocation
7552 	 * before freeing the context.
7553 	 */
7554 	if (tc->tc_dealloc.c_first_suballocator != NULL)
7555 		mlog(ML_NOTICE,
7556 		     "Truncate completion has non-empty dealloc context\n");
7557 
7558 	brelse(tc->tc_last_eb_bh);
7559 
7560 	kfree(tc);
7561 }
7562