xref: /openbmc/linux/fs/ocfs2/suballoc.c (revision 5ac072e1)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "blockcheck.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 #include "uptodate.h"
47 
48 #include "buffer_head_io.h"
49 
50 #define NOT_ALLOC_NEW_GROUP		0
51 #define ALLOC_NEW_GROUP			1
52 
53 #define OCFS2_MAX_INODES_TO_STEAL	1024
54 
55 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
56 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
57 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_fill(handle_t *handle,
59 				  struct inode *alloc_inode,
60 				  struct buffer_head *bg_bh,
61 				  u64 group_blkno,
62 				  u16 my_chain,
63 				  struct ocfs2_chain_list *cl);
64 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
65 				   struct inode *alloc_inode,
66 				   struct buffer_head *bh,
67 				   u64 max_block);
68 
69 static int ocfs2_cluster_group_search(struct inode *inode,
70 				      struct buffer_head *group_bh,
71 				      u32 bits_wanted, u32 min_bits,
72 				      u64 max_block,
73 				      u16 *bit_off, u16 *bits_found);
74 static int ocfs2_block_group_search(struct inode *inode,
75 				    struct buffer_head *group_bh,
76 				    u32 bits_wanted, u32 min_bits,
77 				    u64 max_block,
78 				    u16 *bit_off, u16 *bits_found);
79 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
80 				     struct ocfs2_alloc_context *ac,
81 				     handle_t *handle,
82 				     u32 bits_wanted,
83 				     u32 min_bits,
84 				     u16 *bit_off,
85 				     unsigned int *num_bits,
86 				     u64 *bg_blkno);
87 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
88 					 int nr);
89 static inline int ocfs2_block_group_set_bits(handle_t *handle,
90 					     struct inode *alloc_inode,
91 					     struct ocfs2_group_desc *bg,
92 					     struct buffer_head *group_bh,
93 					     unsigned int bit_off,
94 					     unsigned int num_bits);
95 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
96 					       struct inode *alloc_inode,
97 					       struct ocfs2_group_desc *bg,
98 					       struct buffer_head *group_bh,
99 					       unsigned int bit_off,
100 					       unsigned int num_bits);
101 
102 static int ocfs2_relink_block_group(handle_t *handle,
103 				    struct inode *alloc_inode,
104 				    struct buffer_head *fe_bh,
105 				    struct buffer_head *bg_bh,
106 				    struct buffer_head *prev_bg_bh,
107 				    u16 chain);
108 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
109 						     u32 wanted);
110 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
111 						   u64 bg_blkno,
112 						   u16 bg_bit_off);
113 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
114 						u64 data_blkno,
115 						u64 *bg_blkno,
116 						u16 *bg_bit_off);
117 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
118 					     u32 bits_wanted, u64 max_block,
119 					     struct ocfs2_alloc_context **ac);
120 
121 void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
122 {
123 	struct inode *inode = ac->ac_inode;
124 
125 	if (inode) {
126 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
127 			ocfs2_inode_unlock(inode, 1);
128 
129 		mutex_unlock(&inode->i_mutex);
130 
131 		iput(inode);
132 		ac->ac_inode = NULL;
133 	}
134 	brelse(ac->ac_bh);
135 	ac->ac_bh = NULL;
136 }
137 
138 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
139 {
140 	ocfs2_free_ac_resource(ac);
141 	kfree(ac);
142 }
143 
144 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
145 {
146 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
147 }
148 
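/*
 * Note on error reporting: the group descriptor validators below report
 * problems through do_error().  With 'clean_error' set they only log
 * (the resize-only checking path further down); otherwise ocfs2_error()
 * is called to flag on-disk corruption, which, depending on the
 * 'errors=' mount option, may take the filesystem read-only.
 */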
149 #define do_error(fmt, ...)						\
150 	do{								\
151 		if (clean_error)					\
152 			mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__);	\
153 		else							\
154 			ocfs2_error(sb, fmt, ##__VA_ARGS__);		\
155 	} while (0)
156 
157 static int ocfs2_validate_gd_self(struct super_block *sb,
158 				  struct buffer_head *bh,
159 				  int clean_error)
160 {
161 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
162 
163 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
164 		do_error("Group descriptor #%llu has bad signature %.*s",
165 			 (unsigned long long)bh->b_blocknr, 7,
166 			 gd->bg_signature);
167 		return -EINVAL;
168 	}
169 
170 	if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
171 		do_error("Group descriptor #%llu has an invalid bg_blkno "
172 			 "of %llu",
173 			 (unsigned long long)bh->b_blocknr,
174 			 (unsigned long long)le64_to_cpu(gd->bg_blkno));
175 		return -EINVAL;
176 	}
177 
178 	if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
179 		do_error("Group descriptor #%llu has an invalid "
180 			 "fs_generation of #%u",
181 			 (unsigned long long)bh->b_blocknr,
182 			 le32_to_cpu(gd->bg_generation));
183 		return -EINVAL;
184 	}
185 
186 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
187 		do_error("Group descriptor #%llu has bit count %u but "
188 			 "claims that %u are free",
189 			 (unsigned long long)bh->b_blocknr,
190 			 le16_to_cpu(gd->bg_bits),
191 			 le16_to_cpu(gd->bg_free_bits_count));
192 		return -EINVAL;
193 	}
194 
195 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
196 		do_error("Group descriptor #%llu has bit count %u but "
197 			 "max bitmap bits of %u",
198 			 (unsigned long long)bh->b_blocknr,
199 			 le16_to_cpu(gd->bg_bits),
200 			 8 * le16_to_cpu(gd->bg_size));
201 		return -EINVAL;
202 	}
203 
204 	return 0;
205 }
206 
207 static int ocfs2_validate_gd_parent(struct super_block *sb,
208 				    struct ocfs2_dinode *di,
209 				    struct buffer_head *bh,
210 				    int clean_error)
211 {
212 	unsigned int max_bits;
213 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
214 
215 	if (di->i_blkno != gd->bg_parent_dinode) {
216 		do_error("Group descriptor #%llu has bad parent "
217 			 "pointer (%llu, expected %llu)",
218 			 (unsigned long long)bh->b_blocknr,
219 			 (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
220 			 (unsigned long long)le64_to_cpu(di->i_blkno));
221 		return -EINVAL;
222 	}
223 
224 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
225 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
226 		do_error("Group descriptor #%llu has bit count of %u",
227 			 (unsigned long long)bh->b_blocknr,
228 			 le16_to_cpu(gd->bg_bits));
229 		return -EINVAL;
230 	}
231 
232 	if (le16_to_cpu(gd->bg_chain) >=
233 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
234 		do_error("Group descriptor #%llu has bad chain %u",
235 			 (unsigned long long)bh->b_blocknr,
236 			 le16_to_cpu(gd->bg_chain));
237 		return -EINVAL;
238 	}
239 
240 	return 0;
241 }
242 
243 #undef do_error
244 
245 /*
246  * This version only prints errors.  It does not fail the filesystem, and
247  * exists only for resize.
248  */
249 int ocfs2_check_group_descriptor(struct super_block *sb,
250 				 struct ocfs2_dinode *di,
251 				 struct buffer_head *bh)
252 {
253 	int rc;
254 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
255 
256 	BUG_ON(!buffer_uptodate(bh));
257 
258 	/*
259 	 * If the ecc fails, we return the error but otherwise
260 	 * leave the filesystem running.  We know any error is
261 	 * local to this block.
262 	 */
263 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
264 	if (rc) {
265 		mlog(ML_ERROR,
266 		     "Checksum failed for group descriptor %llu\n",
267 		     (unsigned long long)bh->b_blocknr);
268 	} else
269 		rc = ocfs2_validate_gd_self(sb, bh, 1);
270 	if (!rc)
271 		rc = ocfs2_validate_gd_parent(sb, di, bh, 1);
272 
273 	return rc;
274 }
275 
276 static int ocfs2_validate_group_descriptor(struct super_block *sb,
277 					   struct buffer_head *bh)
278 {
279 	int rc;
280 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
281 
282 	mlog(0, "Validating group descriptor %llu\n",
283 	     (unsigned long long)bh->b_blocknr);
284 
285 	BUG_ON(!buffer_uptodate(bh));
286 
287 	/*
288 	 * If the ecc fails, we return the error but otherwise
289 	 * leave the filesystem running.  We know any error is
290 	 * local to this block.
291 	 */
292 	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check);
293 	if (rc)
294 		return rc;
295 
296 	/*
297 	 * Errors after here are fatal.
298 	 */
299 
300 	return ocfs2_validate_gd_self(sb, bh, 0);
301 }
302 
303 int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
304 				u64 gd_blkno, struct buffer_head **bh)
305 {
306 	int rc;
307 	struct buffer_head *tmp = *bh;
308 
309 	rc = ocfs2_read_block(inode, gd_blkno, &tmp,
310 			      ocfs2_validate_group_descriptor);
311 	if (rc)
312 		goto out;
313 
314 	rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
315 	if (rc) {
316 		brelse(tmp);
317 		goto out;
318 	}
319 
320 	/* If ocfs2_read_block() got us a new bh, pass it up. */
321 	if (!*bh)
322 		*bh = tmp;
323 
324 out:
325 	return rc;
326 }
327 
328 static int ocfs2_block_group_fill(handle_t *handle,
329 				  struct inode *alloc_inode,
330 				  struct buffer_head *bg_bh,
331 				  u64 group_blkno,
332 				  u16 my_chain,
333 				  struct ocfs2_chain_list *cl)
334 {
335 	int status = 0;
336 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
337 	struct super_block * sb = alloc_inode->i_sb;
338 
339 	mlog_entry_void();
340 
341 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
342 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
343 			    "b_blocknr (%llu)",
344 			    (unsigned long long)group_blkno,
345 			    (unsigned long long) bg_bh->b_blocknr);
346 		status = -EIO;
347 		goto bail;
348 	}
349 
350 	status = ocfs2_journal_access_gd(handle,
351 					 alloc_inode,
352 					 bg_bh,
353 					 OCFS2_JOURNAL_ACCESS_CREATE);
354 	if (status < 0) {
355 		mlog_errno(status);
356 		goto bail;
357 	}
358 
359 	memset(bg, 0, sb->s_blocksize);
360 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
361 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
362 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
363 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
364 	bg->bg_chain = cpu_to_le16(my_chain);
365 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
366 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
367 	bg->bg_blkno = cpu_to_le64(group_blkno);
368 	/* set the 1st bit in the bitmap to account for the descriptor block */
369 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
370 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
371 
372 	status = ocfs2_journal_dirty(handle, bg_bh);
373 	if (status < 0)
374 		mlog_errno(status);
375 
376 	/* There is no need to zero out or otherwise initialize the
377 	 * other blocks in a group - All valid FS metadata in a block
378 	 * group stores the superblock fs_generation value at
379 	 * allocation time. */
380 
381 bail:
382 	mlog_exit(status);
383 	return status;
384 }
385 
386 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
387 {
388 	u16 curr, best;
389 
390 	best = curr = 0;
391 	while (curr < le16_to_cpu(cl->cl_count)) {
392 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
393 		    le32_to_cpu(cl->cl_recs[curr].c_total))
394 			best = curr;
395 		curr++;
396 	}
397 	return best;
398 }
399 
400 /*
401  * We expect the block group allocator to already be locked.
402  */
403 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
404 				   struct inode *alloc_inode,
405 				   struct buffer_head *bh,
406 				   u64 max_block)
407 {
408 	int status, credits;
409 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
410 	struct ocfs2_chain_list *cl;
411 	struct ocfs2_alloc_context *ac = NULL;
412 	handle_t *handle = NULL;
413 	u32 bit_off, num_bits;
414 	u16 alloc_rec;
415 	u64 bg_blkno;
416 	struct buffer_head *bg_bh = NULL;
417 	struct ocfs2_group_desc *bg;
418 
419 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
420 
421 	mlog_entry_void();
422 
423 	cl = &fe->id2.i_chain;
424 	status = ocfs2_reserve_clusters_with_limit(osb,
425 						   le16_to_cpu(cl->cl_cpg),
426 						   max_block, &ac);
427 	if (status < 0) {
428 		if (status != -ENOSPC)
429 			mlog_errno(status);
430 		goto bail;
431 	}
432 
433 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
434 						 le16_to_cpu(cl->cl_cpg));
435 	handle = ocfs2_start_trans(osb, credits);
436 	if (IS_ERR(handle)) {
437 		status = PTR_ERR(handle);
438 		handle = NULL;
439 		mlog_errno(status);
440 		goto bail;
441 	}
442 
443 	status = ocfs2_claim_clusters(osb,
444 				      handle,
445 				      ac,
446 				      le16_to_cpu(cl->cl_cpg),
447 				      &bit_off,
448 				      &num_bits);
449 	if (status < 0) {
450 		if (status != -ENOSPC)
451 			mlog_errno(status);
452 		goto bail;
453 	}
454 
455 	alloc_rec = ocfs2_find_smallest_chain(cl);
456 
457 	/* setup the group */
458 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
459 	mlog(0, "new descriptor, record %u, at block %llu\n",
460 	     alloc_rec, (unsigned long long)bg_blkno);
461 
462 	bg_bh = sb_getblk(osb->sb, bg_blkno);
463 	if (!bg_bh) {
464 		status = -EIO;
465 		mlog_errno(status);
466 		goto bail;
467 	}
468 	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
469 
470 	status = ocfs2_block_group_fill(handle,
471 					alloc_inode,
472 					bg_bh,
473 					bg_blkno,
474 					alloc_rec,
475 					cl);
476 	if (status < 0) {
477 		mlog_errno(status);
478 		goto bail;
479 	}
480 
481 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
482 
483 	status = ocfs2_journal_access_di(handle, alloc_inode,
484 					 bh, OCFS2_JOURNAL_ACCESS_WRITE);
485 	if (status < 0) {
486 		mlog_errno(status);
487 		goto bail;
488 	}
489 
490 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
491 		     le16_to_cpu(bg->bg_free_bits_count));
492 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
493 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
494 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
495 		le16_add_cpu(&cl->cl_next_free_rec, 1);
496 
497 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
498 					le16_to_cpu(bg->bg_free_bits_count));
499 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
500 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
501 
502 	status = ocfs2_journal_dirty(handle, bh);
503 	if (status < 0) {
504 		mlog_errno(status);
505 		goto bail;
506 	}
507 
508 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
509 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
510 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
511 					     le32_to_cpu(fe->i_clusters)));
512 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
513 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
514 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
515 
516 	status = 0;
517 bail:
518 	if (handle)
519 		ocfs2_commit_trans(osb, handle);
520 
521 	if (ac)
522 		ocfs2_free_alloc_context(ac);
523 
524 	brelse(bg_bh);
525 
526 	mlog_exit(status);
527 	return status;
528 }
529 
530 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
531 				       struct ocfs2_alloc_context *ac,
532 				       int type,
533 				       u32 slot,
534 				       int alloc_new_group)
535 {
536 	int status;
537 	u32 bits_wanted = ac->ac_bits_wanted;
538 	struct inode *alloc_inode;
539 	struct buffer_head *bh = NULL;
540 	struct ocfs2_dinode *fe;
541 	u32 free_bits;
542 
543 	mlog_entry_void();
544 
545 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
546 	if (!alloc_inode) {
547 		mlog_errno(-EINVAL);
548 		return -EINVAL;
549 	}
550 
551 	mutex_lock(&alloc_inode->i_mutex);
552 
553 	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
554 	if (status < 0) {
555 		mutex_unlock(&alloc_inode->i_mutex);
556 		iput(alloc_inode);
557 
558 		mlog_errno(status);
559 		return status;
560 	}
561 
562 	ac->ac_inode = alloc_inode;
563 	ac->ac_alloc_slot = slot;
564 
565 	fe = (struct ocfs2_dinode *) bh->b_data;
566 
567 	/* The bh was validated by the inode read inside
568 	 * ocfs2_inode_lock().  Any corruption is a code bug. */
569 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
570 
571 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
572 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
573 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
574 		status = -EIO;
575 		goto bail;
576 	}
577 
578 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
579 		le32_to_cpu(fe->id1.bitmap1.i_used);
580 
581 	if (bits_wanted > free_bits) {
582 		/* cluster bitmap never grows */
583 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
584 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
585 			     bits_wanted, free_bits);
586 			status = -ENOSPC;
587 			goto bail;
588 		}
589 
590 		if (alloc_new_group != ALLOC_NEW_GROUP) {
591 			mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
592 			     "and we don't alloc a new group for it.\n",
593 			     slot, bits_wanted, free_bits);
594 			status = -ENOSPC;
595 			goto bail;
596 		}
597 
598 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh,
599 						 ac->ac_max_block);
600 		if (status < 0) {
601 			if (status != -ENOSPC)
602 				mlog_errno(status);
603 			goto bail;
604 		}
605 		atomic_inc(&osb->alloc_stats.bg_extends);
606 
607 		/* You should never ask for this much metadata */
608 		BUG_ON(bits_wanted >
609 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
610 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
611 	}
612 
613 	get_bh(bh);
614 	ac->ac_bh = bh;
615 bail:
616 	brelse(bh);
617 
618 	mlog_exit(status);
619 	return status;
620 }
621 
622 int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb,
623 				      int blocks,
624 				      struct ocfs2_alloc_context **ac)
625 {
626 	int status;
627 	u32 slot;
628 
629 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
630 	if (!(*ac)) {
631 		status = -ENOMEM;
632 		mlog_errno(status);
633 		goto bail;
634 	}
635 
636 	(*ac)->ac_bits_wanted = blocks;
637 	(*ac)->ac_which = OCFS2_AC_USE_META;
638 	slot = osb->slot_num;
639 	(*ac)->ac_group_search = ocfs2_block_group_search;
640 
641 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
642 					     EXTENT_ALLOC_SYSTEM_INODE,
643 					     slot, ALLOC_NEW_GROUP);
644 	if (status < 0) {
645 		if (status != -ENOSPC)
646 			mlog_errno(status);
647 		goto bail;
648 	}
649 
650 	status = 0;
651 bail:
652 	if ((status < 0) && *ac) {
653 		ocfs2_free_alloc_context(*ac);
654 		*ac = NULL;
655 	}
656 
657 	mlog_exit(status);
658 	return status;
659 }
660 
661 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
662 			       struct ocfs2_extent_list *root_el,
663 			       struct ocfs2_alloc_context **ac)
664 {
665 	return ocfs2_reserve_new_metadata_blocks(osb,
666 					ocfs2_extend_meta_needed(root_el),
667 					ac);
668 }
669 
670 static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
671 					      struct ocfs2_alloc_context *ac)
672 {
673 	int i, status = -ENOSPC;
674 	s16 slot = ocfs2_get_inode_steal_slot(osb);
675 
676 	/* Start to steal inodes from the first slot after ours. */
677 	if (slot == OCFS2_INVALID_SLOT)
678 		slot = osb->slot_num + 1;
679 
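	/* For illustration: on a 4-slot volume where we are slot 1 and no
	 * steal slot is cached, the loop below tries the inode allocators
	 * of slots 2, 3 and 0, in that order, skipping our own slot. */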
680 	for (i = 0; i < osb->max_slots; i++, slot++) {
681 		if (slot == osb->max_slots)
682 			slot = 0;
683 
684 		if (slot == osb->slot_num)
685 			continue;
686 
687 		status = ocfs2_reserve_suballoc_bits(osb, ac,
688 						     INODE_ALLOC_SYSTEM_INODE,
689 						     slot, NOT_ALLOC_NEW_GROUP);
690 		if (status >= 0) {
691 			ocfs2_set_inode_steal_slot(osb, slot);
692 			break;
693 		}
694 
695 		ocfs2_free_ac_resource(ac);
696 	}
697 
698 	return status;
699 }
700 
701 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
702 			    struct ocfs2_alloc_context **ac)
703 {
704 	int status;
705 	s16 slot = ocfs2_get_inode_steal_slot(osb);
706 
707 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
708 	if (!(*ac)) {
709 		status = -ENOMEM;
710 		mlog_errno(status);
711 		goto bail;
712 	}
713 
714 	(*ac)->ac_bits_wanted = 1;
715 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
716 
717 	(*ac)->ac_group_search = ocfs2_block_group_search;
718 
719 	/*
720 	 * stat(2) can't handle i_ino > 32bits, so we tell the
721 	 * lower levels not to allocate us a block group past that
722 	 * limit.  The 'inode64' mount option avoids this behavior.
723 	 */
724 	if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64))
725 		(*ac)->ac_max_block = (u32)~0U;
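	/* Note: this means we never hand out blocks at or past 2^32.  An
	 * ocfs2 inode number is just the inode's disk block number, so the
	 * cap keeps i_ino representable in 32 bits. */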
726 
727 	/*
728 	 * slot is set when we successfully steal inode from other nodes.
729 	 * It is reset in 3 places:
730 	 * 1. when we flush the truncate log
731 	 * 2. when we complete local alloc recovery.
732 	 * 3. when we successfully allocate from our own slot.
733 	 * After it is set, we will go on stealing inodes until we find the
734 	 * need to check our slots to see whether there is some space for us.
735 	 */
736 	if (slot != OCFS2_INVALID_SLOT &&
737 	    atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
738 		goto inode_steal;
739 
740 	atomic_set(&osb->s_num_inodes_stolen, 0);
741 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
742 					     INODE_ALLOC_SYSTEM_INODE,
743 					     osb->slot_num, ALLOC_NEW_GROUP);
744 	if (status >= 0) {
745 		status = 0;
746 
747 		/*
748 		 * Some inodes must be freed by us, so try to allocate
749 		 * from our own next time.
750 		 */
751 		if (slot != OCFS2_INVALID_SLOT)
752 			ocfs2_init_inode_steal_slot(osb);
753 		goto bail;
754 	} else if (status < 0 && status != -ENOSPC) {
755 		mlog_errno(status);
756 		goto bail;
757 	}
758 
759 	ocfs2_free_ac_resource(*ac);
760 
761 inode_steal:
762 	status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
763 	atomic_inc(&osb->s_num_inodes_stolen);
764 	if (status < 0) {
765 		if (status != -ENOSPC)
766 			mlog_errno(status);
767 		goto bail;
768 	}
769 
770 	status = 0;
771 bail:
772 	if ((status < 0) && *ac) {
773 		ocfs2_free_alloc_context(*ac);
774 		*ac = NULL;
775 	}
776 
777 	mlog_exit(status);
778 	return status;
779 }
780 
781 /* The local alloc code has to do the same reservation against the main
782  * bitmap, so rather than duplicate it, both paths share this helper. */
783 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
784 				      struct ocfs2_alloc_context *ac)
785 {
786 	int status;
787 
788 	ac->ac_which = OCFS2_AC_USE_MAIN;
789 	ac->ac_group_search = ocfs2_cluster_group_search;
790 
791 	status = ocfs2_reserve_suballoc_bits(osb, ac,
792 					     GLOBAL_BITMAP_SYSTEM_INODE,
793 					     OCFS2_INVALID_SLOT,
794 					     ALLOC_NEW_GROUP);
795 	if (status < 0 && status != -ENOSPC) {
796 		mlog_errno(status);
797 		goto bail;
798 	}
799 
800 bail:
801 	return status;
802 }
803 
804 /* Callers don't need to care which bitmap (local alloc or main) to
805  * use so we figure it out for them, but unfortunately this clutters
806  * things a bit. */
807 static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb,
808 					     u32 bits_wanted, u64 max_block,
809 					     struct ocfs2_alloc_context **ac)
810 {
811 	int status;
812 
813 	mlog_entry_void();
814 
815 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
816 	if (!(*ac)) {
817 		status = -ENOMEM;
818 		mlog_errno(status);
819 		goto bail;
820 	}
821 
822 	(*ac)->ac_bits_wanted = bits_wanted;
823 	(*ac)->ac_max_block = max_block;
824 
825 	status = -ENOSPC;
826 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
827 		status = ocfs2_reserve_local_alloc_bits(osb,
828 							bits_wanted,
829 							*ac);
830 		if (status == -EFBIG) {
831 			/* The local alloc window is outside ac_max_block.
832 			 * use the main bitmap. */
833 			status = -ENOSPC;
834 		} else if ((status < 0) && (status != -ENOSPC)) {
835 			mlog_errno(status);
836 			goto bail;
837 		}
838 	}
839 
840 	if (status == -ENOSPC) {
841 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
842 		if (status < 0) {
843 			if (status != -ENOSPC)
844 				mlog_errno(status);
845 			goto bail;
846 		}
847 	}
848 
849 	status = 0;
850 bail:
851 	if ((status < 0) && *ac) {
852 		ocfs2_free_alloc_context(*ac);
853 		*ac = NULL;
854 	}
855 
856 	mlog_exit(status);
857 	return status;
858 }
859 
860 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
861 			   u32 bits_wanted,
862 			   struct ocfs2_alloc_context **ac)
863 {
864 	return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac);
865 }
866 
867 /*
868  * More or less lifted from ext3. I'll leave their description below:
869  *
870  * "For ext3 allocations, we must not reuse any blocks which are
871  * allocated in the bitmap buffer's "last committed data" copy.  This
872  * prevents deletes from freeing up the page for reuse until we have
873  * committed the delete transaction.
874  *
875  * If we didn't do this, then deleting something and reallocating it as
876  * data would allow the old block to be overwritten before the
877  * transaction committed (because we force data to disk before commit).
878  * This would lead to corruption if we crashed between overwriting the
879  * data and committing the delete.
880  *
881  * @@@ We may want to make this allocation behaviour conditional on
882  * data-writes at some point, and disable it for metadata allocations or
883  * sync-data inodes."
884  *
885  * Note: OCFS2 already does this differently for metadata vs data
886  * allocations, as those bitmaps are separate and undo access is never
887  * called on a metadata group descriptor.
888  */
889 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
890 					 int nr)
891 {
892 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
893 
894 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
895 		return 0;
896 	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
897 		return 1;
898 
899 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
900 	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
901 }
902 
903 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
904 					     struct buffer_head *bg_bh,
905 					     unsigned int bits_wanted,
906 					     unsigned int total_bits,
907 					     u16 *bit_off,
908 					     u16 *bits_found)
909 {
910 	void *bitmap;
911 	u16 best_offset, best_size;
912 	int offset, start, found, status = 0;
913 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
914 
915 	/* Callers got this descriptor from
916 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
917 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
918 
919 	found = start = best_offset = best_size = 0;
920 	bitmap = bg->bg_bitmap;
921 
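	/*
	 * Scan sketch: 'found' is the length of the current run of usable
	 * (zero and committed-free) bits ending just before 'start';
	 * best_offset/best_size remember the longest run seen so far.
	 * E.g. with bits 1 0 0 1 0 0 0 and bits_wanted == 3, the runs of
	 * zeros have lengths 2 and 3, so we return offset 4, length 3.
	 */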
922 	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
923 		if (offset == total_bits)
924 			break;
925 
926 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
927 			/* We found a zero, but we can't use it as it
928 			 * hasn't been put to disk yet! */
929 			found = 0;
930 			start = offset + 1;
931 		} else if (offset == start) {
932 			/* we found a zero */
933 			found++;
934 			/* move start to the next bit to test */
935 			start++;
936 		} else {
937 			/* got a zero after some ones */
938 			found = 1;
939 			start = offset + 1;
940 		}
941 		if (found > best_size) {
942 			best_size = found;
943 			best_offset = start - found;
944 		}
945 		/* we got everything we needed */
946 		if (found == bits_wanted) {
947 			/* mlog(0, "Found it all!\n"); */
948 			break;
949 		}
950 	}
951 
952 	/* XXX: I think the first clause is equivalent to the second
953 	 * 	- jlbec */
954 	if (found == bits_wanted) {
955 		*bit_off = start - found;
956 		*bits_found = found;
957 	} else if (best_size) {
958 		*bit_off = best_offset;
959 		*bits_found = best_size;
960 	} else {
961 		status = -ENOSPC;
962 		/* No error log here -- see the comment above
963 		 * ocfs2_test_bg_bit_allocatable */
964 	}
965 
966 	return status;
967 }
968 
969 static inline int ocfs2_block_group_set_bits(handle_t *handle,
970 					     struct inode *alloc_inode,
971 					     struct ocfs2_group_desc *bg,
972 					     struct buffer_head *group_bh,
973 					     unsigned int bit_off,
974 					     unsigned int num_bits)
975 {
976 	int status;
977 	void *bitmap = bg->bg_bitmap;
978 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
979 
980 	mlog_entry_void();
981 
982 	/* All callers get the descriptor via
983 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
984 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
985 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
986 
987 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
988 	     num_bits);
989 
990 	if (ocfs2_is_cluster_bitmap(alloc_inode))
991 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
992 
993 	status = ocfs2_journal_access_gd(handle,
994 					 alloc_inode,
995 					 group_bh,
996 					 journal_type);
997 	if (status < 0) {
998 		mlog_errno(status);
999 		goto bail;
1000 	}
1001 
1002 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
1003 
1004 	while(num_bits--)
1005 		ocfs2_set_bit(bit_off++, bitmap);
1006 
1007 	status = ocfs2_journal_dirty(handle,
1008 				     group_bh);
1009 	if (status < 0) {
1010 		mlog_errno(status);
1011 		goto bail;
1012 	}
1013 
1014 bail:
1015 	mlog_exit(status);
1016 	return status;
1017 }
1018 
1019 /* find the one with the most empty bits */
1020 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
1021 {
1022 	u16 curr, best;
1023 
1024 	BUG_ON(!cl->cl_next_free_rec);
1025 
1026 	best = curr = 0;
1027 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
1028 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
1029 		    le32_to_cpu(cl->cl_recs[best].c_free))
1030 			best = curr;
1031 		curr++;
1032 	}
1033 
1034 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
1035 	return best;
1036 }
1037 
1038 static int ocfs2_relink_block_group(handle_t *handle,
1039 				    struct inode *alloc_inode,
1040 				    struct buffer_head *fe_bh,
1041 				    struct buffer_head *bg_bh,
1042 				    struct buffer_head *prev_bg_bh,
1043 				    u16 chain)
1044 {
1045 	int status;
1046 	/* there is a really tiny chance the journal calls could fail,
1047 	 * but we wouldn't want inconsistent blocks in *any* case. */
1048 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
1049 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
1050 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
1051 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
1052 
1053 	/* The caller got these descriptors from
1054 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1055 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1056 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
1057 
1058 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
1059 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
1060 	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
1061 	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
1062 
1063 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
1064 	bg_ptr = le64_to_cpu(bg->bg_next_group);
1065 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
1066 
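	/*
	 * Relink sketch: unlink bg from its current position
	 * (prev_bg->bg_next_group = bg->bg_next_group), point bg at the
	 * old chain head (bg->bg_next_group = cl_recs[chain].c_blkno),
	 * then make the chain record point at bg.  The *_ptr values saved
	 * above allow all three fields to be restored if any journal call
	 * fails.
	 */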
1067 	status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
1068 					 OCFS2_JOURNAL_ACCESS_WRITE);
1069 	if (status < 0) {
1070 		mlog_errno(status);
1071 		goto out_rollback;
1072 	}
1073 
1074 	prev_bg->bg_next_group = bg->bg_next_group;
1075 
1076 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
1077 	if (status < 0) {
1078 		mlog_errno(status);
1079 		goto out_rollback;
1080 	}
1081 
1082 	status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
1083 					 OCFS2_JOURNAL_ACCESS_WRITE);
1084 	if (status < 0) {
1085 		mlog_errno(status);
1086 		goto out_rollback;
1087 	}
1088 
1089 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
1090 
1091 	status = ocfs2_journal_dirty(handle, bg_bh);
1092 	if (status < 0) {
1093 		mlog_errno(status);
1094 		goto out_rollback;
1095 	}
1096 
1097 	status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
1098 					 OCFS2_JOURNAL_ACCESS_WRITE);
1099 	if (status < 0) {
1100 		mlog_errno(status);
1101 		goto out_rollback;
1102 	}
1103 
1104 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
1105 
1106 	status = ocfs2_journal_dirty(handle, fe_bh);
1107 	if (status < 0) {
1108 		mlog_errno(status);
1109 		goto out_rollback;
1110 	}
1111 
1112 	status = 0;
1113 out_rollback:
1114 	if (status < 0) {
1115 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
1116 		bg->bg_next_group = cpu_to_le64(bg_ptr);
1117 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
1118 	}
1119 
1120 	mlog_exit(status);
1121 	return status;
1122 }
1123 
1124 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
1125 						     u32 wanted)
1126 {
1127 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
1128 }
1129 
1130 /* return 0 on success, -ENOSPC to keep searching and any other < 0
1131  * value on error. */
1132 static int ocfs2_cluster_group_search(struct inode *inode,
1133 				      struct buffer_head *group_bh,
1134 				      u32 bits_wanted, u32 min_bits,
1135 				      u64 max_block,
1136 				      u16 *bit_off, u16 *bits_found)
1137 {
1138 	int search = -ENOSPC;
1139 	int ret;
1140 	u64 blkoff;
1141 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
1142 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1143 	u16 tmp_off, tmp_found;
1144 	unsigned int max_bits, gd_cluster_off;
1145 
1146 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1147 
1148 	if (gd->bg_free_bits_count) {
1149 		max_bits = le16_to_cpu(gd->bg_bits);
1150 
1151 		/* Tail groups in cluster bitmaps which aren't cpg
1152 		 * aligned are prone to partial extension by a failed
1153 		 * fs resize. If the file system resize never got to
1154 		 * update the dinode cluster count, then we don't want
1155 		 * to trust any clusters past it, regardless of what
1156 		 * the group descriptor says. */
1157 		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
1158 							  le64_to_cpu(gd->bg_blkno));
1159 		if ((gd_cluster_off + max_bits) >
1160 		    OCFS2_I(inode)->ip_clusters) {
1161 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
1162 			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
1163 			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
1164 			     le16_to_cpu(gd->bg_bits),
1165 			     OCFS2_I(inode)->ip_clusters, max_bits);
1166 		}
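		/* E.g. if this tail group starts at cluster 1000 and
		 * claims 256 bits, but the dinode says the volume only
		 * has 1100 clusters, we trust only the first 100 bits. */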
1167 
1168 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1169 							group_bh, bits_wanted,
1170 							max_bits,
1171 							&tmp_off, &tmp_found);
1172 		if (ret)
1173 			return ret;
1174 
1175 		if (max_block) {
1176 			blkoff = ocfs2_clusters_to_blocks(inode->i_sb,
1177 							  gd_cluster_off +
1178 							  tmp_off + tmp_found);
1179 			mlog(0, "Checking %llu against %llu\n",
1180 			     (unsigned long long)blkoff,
1181 			     (unsigned long long)max_block);
1182 			if (blkoff > max_block)
1183 				return -ENOSPC;
1184 		}
1185 
1186 		/* ocfs2_block_group_find_clear_bits() might
1187 		 * return success, but we still want to return
1188 		 * -ENOSPC unless it found the minimum number
1189 		 * of bits. */
1190 		if (min_bits <= tmp_found) {
1191 			*bit_off = tmp_off;
1192 			*bits_found = tmp_found;
1193 			search = 0; /* success */
1194 		} else if (tmp_found) {
1195 			/*
1196 			 * Don't show bits which we'll be returning
1197 			 * for allocation to the local alloc bitmap.
1198 			 */
1199 			ocfs2_local_alloc_seen_free_bits(osb, tmp_found);
1200 		}
1201 	}
1202 
1203 	return search;
1204 }
1205 
1206 static int ocfs2_block_group_search(struct inode *inode,
1207 				    struct buffer_head *group_bh,
1208 				    u32 bits_wanted, u32 min_bits,
1209 				    u64 max_block,
1210 				    u16 *bit_off, u16 *bits_found)
1211 {
1212 	int ret = -ENOSPC;
1213 	u64 blkoff;
1214 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
1215 
1216 	BUG_ON(min_bits != 1);
1217 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
1218 
1219 	if (bg->bg_free_bits_count) {
1220 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
1221 							group_bh, bits_wanted,
1222 							le16_to_cpu(bg->bg_bits),
1223 							bit_off, bits_found);
1224 		if (!ret && max_block) {
1225 			blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off +
1226 				*bits_found;
1227 			mlog(0, "Checking %llu against %llu\n",
1228 			     (unsigned long long)blkoff,
1229 			     (unsigned long long)max_block);
1230 			if (blkoff > max_block)
1231 				ret = -ENOSPC;
1232 		}
1233 	}
1234 
1235 	return ret;
1236 }
1237 
1238 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
1239 				       handle_t *handle,
1240 				       struct buffer_head *di_bh,
1241 				       u32 num_bits,
1242 				       u16 chain)
1243 {
1244 	int ret;
1245 	u32 tmp_used;
1246 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
1247 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
1248 
1249 	ret = ocfs2_journal_access_di(handle, inode, di_bh,
1250 				      OCFS2_JOURNAL_ACCESS_WRITE);
1251 	if (ret < 0) {
1252 		mlog_errno(ret);
1253 		goto out;
1254 	}
1255 
1256 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1257 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1258 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1259 
1260 	ret = ocfs2_journal_dirty(handle, di_bh);
1261 	if (ret < 0)
1262 		mlog_errno(ret);
1263 
1264 out:
1265 	return ret;
1266 }
1267 
1268 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1269 				  handle_t *handle,
1270 				  u32 bits_wanted,
1271 				  u32 min_bits,
1272 				  u16 *bit_off,
1273 				  unsigned int *num_bits,
1274 				  u64 gd_blkno,
1275 				  u16 *bits_left)
1276 {
1277 	int ret;
1278 	u16 found;
1279 	struct buffer_head *group_bh = NULL;
1280 	struct ocfs2_group_desc *gd;
1281 	struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
1282 	struct inode *alloc_inode = ac->ac_inode;
1283 
1284 	ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
1285 					  &group_bh);
1286 	if (ret < 0) {
1287 		mlog_errno(ret);
1288 		return ret;
1289 	}
1290 
1291 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
1292 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1293 				  ac->ac_max_block, bit_off, &found);
1294 	if (ret < 0) {
1295 		if (ret != -ENOSPC)
1296 			mlog_errno(ret);
1297 		goto out;
1298 	}
1299 
1300 	*num_bits = found;
1301 
1302 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1303 					       *num_bits,
1304 					       le16_to_cpu(gd->bg_chain));
1305 	if (ret < 0) {
1306 		mlog_errno(ret);
1307 		goto out;
1308 	}
1309 
1310 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1311 					 *bit_off, *num_bits);
1312 	if (ret < 0)
1313 		mlog_errno(ret);
1314 
1315 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
1316 
1317 out:
1318 	brelse(group_bh);
1319 
1320 	return ret;
1321 }
1322 
1323 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1324 			      handle_t *handle,
1325 			      u32 bits_wanted,
1326 			      u32 min_bits,
1327 			      u16 *bit_off,
1328 			      unsigned int *num_bits,
1329 			      u64 *bg_blkno,
1330 			      u16 *bits_left)
1331 {
1332 	int status;
1333 	u16 chain, tmp_bits;
1334 	u32 tmp_used;
1335 	u64 next_group;
1336 	struct inode *alloc_inode = ac->ac_inode;
1337 	struct buffer_head *group_bh = NULL;
1338 	struct buffer_head *prev_group_bh = NULL;
1339 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1340 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1341 	struct ocfs2_group_desc *bg;
1342 
1343 	chain = ac->ac_chain;
1344 	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1345 	     bits_wanted, chain,
1346 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1347 
1348 	status = ocfs2_read_group_descriptor(alloc_inode, fe,
1349 					     le64_to_cpu(cl->cl_recs[chain].c_blkno),
1350 					     &group_bh);
1351 	if (status < 0) {
1352 		mlog_errno(status);
1353 		goto bail;
1354 	}
1355 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
1356 
1357 	status = -ENOSPC;
1358 	/* for now, the chain search is a bit simplistic. We just use
1359 	 * the 1st group with any empty bits. */
1360 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1361 					     bits_wanted, min_bits,
1362 					     ac->ac_max_block, bit_off,
1363 					     &tmp_bits)) == -ENOSPC) {
1364 		if (!bg->bg_next_group)
1365 			break;
1366 
1367 		brelse(prev_group_bh);
1368 		prev_group_bh = NULL;
1369 
1370 		next_group = le64_to_cpu(bg->bg_next_group);
1371 		prev_group_bh = group_bh;
1372 		group_bh = NULL;
1373 		status = ocfs2_read_group_descriptor(alloc_inode, fe,
1374 						     next_group, &group_bh);
1375 		if (status < 0) {
1376 			mlog_errno(status);
1377 			goto bail;
1378 		}
1379 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1380 	}
1381 	if (status < 0) {
1382 		if (status != -ENOSPC)
1383 			mlog_errno(status);
1384 		goto bail;
1385 	}
1386 
1387 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1388 	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1389 
1390 	*num_bits = tmp_bits;
1391 
1392 	BUG_ON(*num_bits == 0);
1393 
1394 	/*
1395 	 * Keep track of previous block descriptor read. When
1396 	 * we find a target, if we have read more than X
1397 	 * number of descriptors, and the target is reasonably
1398 	 * empty, relink it to the top of its chain.
1399 	 *
1400 	 * We've read 0 extra blocks and only send one more to
1401 	 * the transaction, yet the next guy to search has a
1402 	 * much easier time.
1403 	 *
1404 	 * Do this *after* figuring out how many bits we're taking out
1405 	 * of our target group.
1406 	 */
1407 	if (ac->ac_allow_chain_relink &&
1408 	    (prev_group_bh) &&
1409 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1410 		status = ocfs2_relink_block_group(handle, alloc_inode,
1411 						  ac->ac_bh, group_bh,
1412 						  prev_group_bh, chain);
1413 		if (status < 0) {
1414 			mlog_errno(status);
1415 			goto bail;
1416 		}
1417 	}
1418 
1419 	/* Ok, claim our bits now: set the info on dinode, chainlist
1420 	 * and then the group */
1421 	status = ocfs2_journal_access_di(handle,
1422 					 alloc_inode,
1423 					 ac->ac_bh,
1424 					 OCFS2_JOURNAL_ACCESS_WRITE);
1425 	if (status < 0) {
1426 		mlog_errno(status);
1427 		goto bail;
1428 	}
1429 
1430 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1431 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1432 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1433 
1434 	status = ocfs2_journal_dirty(handle,
1435 				     ac->ac_bh);
1436 	if (status < 0) {
1437 		mlog_errno(status);
1438 		goto bail;
1439 	}
1440 
1441 	status = ocfs2_block_group_set_bits(handle,
1442 					    alloc_inode,
1443 					    bg,
1444 					    group_bh,
1445 					    *bit_off,
1446 					    *num_bits);
1447 	if (status < 0) {
1448 		mlog_errno(status);
1449 		goto bail;
1450 	}
1451 
1452 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1453 	     (unsigned long long)le64_to_cpu(fe->i_blkno));
1454 
1455 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1456 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
1457 bail:
1458 	brelse(group_bh);
1459 	brelse(prev_group_bh);
1460 
1461 	mlog_exit(status);
1462 	return status;
1463 }
1464 
1465 /* will give out up to bits_wanted contiguous bits. */
1466 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1467 				     struct ocfs2_alloc_context *ac,
1468 				     handle_t *handle,
1469 				     u32 bits_wanted,
1470 				     u32 min_bits,
1471 				     u16 *bit_off,
1472 				     unsigned int *num_bits,
1473 				     u64 *bg_blkno)
1474 {
1475 	int status;
1476 	u16 victim, i;
1477 	u16 bits_left = 0;
1478 	u64 hint_blkno = ac->ac_last_group;
1479 	struct ocfs2_chain_list *cl;
1480 	struct ocfs2_dinode *fe;
1481 
1482 	mlog_entry_void();
1483 
1484 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1485 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1486 	BUG_ON(!ac->ac_bh);
1487 
1488 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1489 
1490 	/* The bh was validated by the inode read during
1491 	 * ocfs2_reserve_suballoc_bits().  Any corruption is a code bug. */
1492 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1493 
1494 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1495 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1496 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1497 			    "bits but only %u total.",
1498 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1499 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1500 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1501 		status = -EIO;
1502 		goto bail;
1503 	}
1504 
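	/*
	 * Search order, in summary: (1) the group hinted by the previous
	 * allocation, if any, (2) the chain with the most free bits, and
	 * (3) every remaining chain in turn.
	 */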
1505 	if (hint_blkno) {
1506 		/* Attempt to short-circuit the usual search mechanism
1507 		 * by jumping straight to the most recently used
1508 		 * allocation group. This helps us maintain some
1509 		 * contiguousness across allocations. */
1510 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
1511 						min_bits, bit_off, num_bits,
1512 						hint_blkno, &bits_left);
1513 		if (!status) {
1514 			/* Be careful to update *bg_blkno here as the
1515 			 * caller is expecting it to be filled in, and
1516 			 * ocfs2_search_one_group() won't do that for
1517 			 * us. */
1518 			*bg_blkno = hint_blkno;
1519 			goto set_hint;
1520 		}
1521 		if (status < 0 && status != -ENOSPC) {
1522 			mlog_errno(status);
1523 			goto bail;
1524 		}
1525 	}
1526 
1527 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1528 
1529 	victim = ocfs2_find_victim_chain(cl);
1530 	ac->ac_chain = victim;
1531 	ac->ac_allow_chain_relink = 1;
1532 
1533 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1534 				    num_bits, bg_blkno, &bits_left);
1535 	if (!status)
1536 		goto set_hint;
1537 	if (status < 0 && status != -ENOSPC) {
1538 		mlog_errno(status);
1539 		goto bail;
1540 	}
1541 
1542 	mlog(0, "Search of victim chain %u came up with nothing, "
1543 	     "trying all chains now.\n", victim);
1544 
1545 	/* If we didn't pick a good victim, then just default to
1546 	 * searching each chain in order. Don't allow chain relinking
1547 	 * because we only calculate enough journal credits for one
1548 	 * relink per alloc. */
1549 	ac->ac_allow_chain_relink = 0;
1550 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1551 		if (i == victim)
1552 			continue;
1553 		if (!cl->cl_recs[i].c_free)
1554 			continue;
1555 
1556 		ac->ac_chain = i;
1557 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1558 					    bit_off, num_bits, bg_blkno,
1559 					    &bits_left);
1560 		if (!status)
1561 			break;
1562 		if (status < 0 && status != -ENOSPC) {
1563 			mlog_errno(status);
1564 			goto bail;
1565 		}
1566 	}
1567 
1568 set_hint:
1569 	if (status != -ENOSPC) {
1570 		/* If the next search of this group is not likely to
1571 		 * yield a suitable extent, then we reset the last
1572 		 * group hint so as to not waste a disk read */
1573 		if (bits_left < min_bits)
1574 			ac->ac_last_group = 0;
1575 		else
1576 			ac->ac_last_group = *bg_blkno;
1577 	}
1578 
1579 bail:
1580 	mlog_exit(status);
1581 	return status;
1582 }
1583 
1584 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1585 			 handle_t *handle,
1586 			 struct ocfs2_alloc_context *ac,
1587 			 u32 bits_wanted,
1588 			 u16 *suballoc_bit_start,
1589 			 unsigned int *num_bits,
1590 			 u64 *blkno_start)
1591 {
1592 	int status;
1593 	u64 bg_blkno;
1594 
1595 	BUG_ON(!ac);
1596 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1597 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1598 
1599 	status = ocfs2_claim_suballoc_bits(osb,
1600 					   ac,
1601 					   handle,
1602 					   bits_wanted,
1603 					   1,
1604 					   suballoc_bit_start,
1605 					   num_bits,
1606 					   &bg_blkno);
1607 	if (status < 0) {
1608 		mlog_errno(status);
1609 		goto bail;
1610 	}
1611 	atomic_inc(&osb->alloc_stats.bg_allocs);
1612 
1613 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1614 	ac->ac_bits_given += (*num_bits);
1615 	status = 0;
1616 bail:
1617 	mlog_exit(status);
1618 	return status;
1619 }
1620 
1621 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1622 			  handle_t *handle,
1623 			  struct ocfs2_alloc_context *ac,
1624 			  u16 *suballoc_bit,
1625 			  u64 *fe_blkno)
1626 {
1627 	int status;
1628 	unsigned int num_bits;
1629 	u64 bg_blkno;
1630 
1631 	mlog_entry_void();
1632 
1633 	BUG_ON(!ac);
1634 	BUG_ON(ac->ac_bits_given != 0);
1635 	BUG_ON(ac->ac_bits_wanted != 1);
1636 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1637 
1638 	status = ocfs2_claim_suballoc_bits(osb,
1639 					   ac,
1640 					   handle,
1641 					   1,
1642 					   1,
1643 					   suballoc_bit,
1644 					   &num_bits,
1645 					   &bg_blkno);
1646 	if (status < 0) {
1647 		mlog_errno(status);
1648 		goto bail;
1649 	}
1650 	atomic_inc(&osb->alloc_stats.bg_allocs);
1651 
1652 	BUG_ON(num_bits != 1);
1653 
1654 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1655 	ac->ac_bits_given++;
1656 	status = 0;
1657 bail:
1658 	mlog_exit(status);
1659 	return status;
1660 }
1661 
1662 /* translate a group desc. blkno and its bitmap offset into
1663  * disk cluster offset. */
1664 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1665 						   u64 bg_blkno,
1666 						   u16 bg_bit_off)
1667 {
1668 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1669 	u32 cluster = 0;
1670 
1671 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1672 
1673 	if (bg_blkno != osb->first_cluster_group_blkno)
1674 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1675 	cluster += (u32) bg_bit_off;
1676 	return cluster;
1677 }
1678 
1679 /* given a cluster offset, calculate which block group it belongs to
1680  * and return that block offset. */
1681 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1682 {
1683 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1684 	u32 group_no;
1685 
1686 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1687 
1688 	group_no = cluster / osb->bitmap_cpg;
1689 	if (!group_no)
1690 		return osb->first_cluster_group_blkno;
1691 	return ocfs2_clusters_to_blocks(inode->i_sb,
1692 					group_no * osb->bitmap_cpg);
1693 }
1694 
1695 /* given the block number of a cluster start, calculate which cluster
1696  * group and descriptor bitmap offset that corresponds to. */
1697 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1698 						u64 data_blkno,
1699 						u64 *bg_blkno,
1700 						u16 *bg_bit_off)
1701 {
1702 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1703 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1704 
1705 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1706 
1707 	*bg_blkno = ocfs2_which_cluster_group(inode,
1708 					      data_cluster);
1709 
1710 	if (*bg_blkno == osb->first_cluster_group_blkno)
1711 		*bg_bit_off = (u16) data_cluster;
1712 	else
1713 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1714 							     data_blkno - *bg_blkno);
1715 }
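/*
 * Worked example for the three mapping helpers above (illustrative
 * numbers): suppose osb->bitmap_cpg is 32000 clusters per group.  A
 * data extent starting at cluster 70000 lives in cluster group 2
 * (70000 / 32000), whose descriptor sits at the block corresponding to
 * cluster 64000; the bit offset inside that group's bitmap is
 * 70000 - 64000 = 6000.  Group 0 is special: its descriptor is at
 * osb->first_cluster_group_blkno and bit offsets are the raw cluster
 * numbers.
 */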
1716 
1717 /*
1718  * min_bits - minimum contiguous chunk from this total allocation we
1719  * can handle. set to what we asked for originally for a full
1720  * contig. allocation, set to '1' to indicate we can deal with extents
1721  * of any size.
1722  */
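/*
 * E.g. a caller that wants 100 clusters but can cope with fragmentation
 * passes min_clusters == 1; it may get back any contiguous run of
 * 1..100 clusters and is expected to ask again for the remainder.
 */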
1723 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1724 			   handle_t *handle,
1725 			   struct ocfs2_alloc_context *ac,
1726 			   u32 min_clusters,
1727 			   u32 max_clusters,
1728 			   u32 *cluster_start,
1729 			   u32 *num_clusters)
1730 {
1731 	int status;
1732 	unsigned int bits_wanted = max_clusters;
1733 	u64 bg_blkno = 0;
1734 	u16 bg_bit_off;
1735 
1736 	mlog_entry_void();
1737 
1738 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1739 
1740 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1741 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1742 
1743 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1744 		status = ocfs2_claim_local_alloc_bits(osb,
1745 						      handle,
1746 						      ac,
1747 						      bits_wanted,
1748 						      cluster_start,
1749 						      num_clusters);
1750 		if (!status)
1751 			atomic_inc(&osb->alloc_stats.local_data);
1752 	} else {
1753 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1754 			/* The only paths asking for contiguousness
1755 			 * should know about this already. */
1756 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1757 			     "group bitmap size %u!\n", min_clusters,
1758 			     osb->bitmap_cpg);
1759 			status = -ENOSPC;
1760 			goto bail;
1761 		}
1762 		/* clamp the current request down to a realistic size. */
1763 		if (bits_wanted > (osb->bitmap_cpg - 1))
1764 			bits_wanted = osb->bitmap_cpg - 1;
1765 
1766 		status = ocfs2_claim_suballoc_bits(osb,
1767 						   ac,
1768 						   handle,
1769 						   bits_wanted,
1770 						   min_clusters,
1771 						   &bg_bit_off,
1772 						   num_clusters,
1773 						   &bg_blkno);
1774 		if (!status) {
1775 			*cluster_start =
1776 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1777 								 bg_blkno,
1778 								 bg_bit_off);
1779 			atomic_inc(&osb->alloc_stats.bitmap_data);
1780 		}
1781 	}
1782 	if (status < 0) {
1783 		if (status != -ENOSPC)
1784 			mlog_errno(status);
1785 		goto bail;
1786 	}
1787 
1788 	ac->ac_bits_given += *num_clusters;
1789 
1790 bail:
1791 	mlog_exit(status);
1792 	return status;
1793 }
1794 
1795 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1796 			 handle_t *handle,
1797 			 struct ocfs2_alloc_context *ac,
1798 			 u32 min_clusters,
1799 			 u32 *cluster_start,
1800 			 u32 *num_clusters)
1801 {
1802 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1803 
1804 	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1805 				      bits_wanted, cluster_start, num_clusters);
1806 }
1807 
1808 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1809 					       struct inode *alloc_inode,
1810 					       struct ocfs2_group_desc *bg,
1811 					       struct buffer_head *group_bh,
1812 					       unsigned int bit_off,
1813 					       unsigned int num_bits)
1814 {
1815 	int status;
1816 	unsigned int tmp;
1817 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1818 	struct ocfs2_group_desc *undo_bg = NULL;
1819 
1820 	mlog_entry_void();
1821 
1822 	/* The caller got this descriptor from
1823 	 * ocfs2_read_group_descriptor().  Any corruption is a code bug. */
1824 	BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
1825 
1826 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1827 
1828 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1829 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1830 
1831 	status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
1832 					 journal_type);
1833 	if (status < 0) {
1834 		mlog_errno(status);
1835 		goto bail;
1836 	}
1837 
1838 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1839 		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1840 
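	/*
	 * For the cluster bitmap we also set the freed bits in the
	 * jbd-committed copy, so ocfs2_test_bg_bit_allocatable() keeps
	 * refusing them until the freeing transaction commits.
	 */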
1841 	tmp = num_bits;
1842 	while(tmp--) {
1843 		ocfs2_clear_bit((bit_off + tmp),
1844 				(unsigned long *) bg->bg_bitmap);
1845 		if (ocfs2_is_cluster_bitmap(alloc_inode))
1846 			ocfs2_set_bit(bit_off + tmp,
1847 				      (unsigned long *) undo_bg->bg_bitmap);
1848 	}
1849 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1850 
1851 	status = ocfs2_journal_dirty(handle, group_bh);
1852 	if (status < 0)
1853 		mlog_errno(status);
1854 bail:
1855 	return status;
1856 }
1857 
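/*
 * Illustrative sketch (hypothetical helper, not real ocfs2 code): the
 * interesting part of ocfs2_block_group_clear_bits() is the undo bitmap.
 * For the global cluster bitmap, bits are cleared in the live copy but
 * re-set in the journal's committed-data copy, so a freed cluster cannot be
 * reallocated until the transaction that freed it reaches disk.  A
 * simplified, self-contained model using plain C bit operations:
 */
#if 0
static void sketch_clear_bits_with_undo(unsigned long *live,
					unsigned long *undo,
					unsigned int bit_off,
					unsigned int num_bits)
{
	const unsigned int bpl = 8 * sizeof(unsigned long);

	while (num_bits--) {
		unsigned int bit = bit_off + num_bits;

		/* Free the bit in the working bitmap... */
		live[bit / bpl] &= ~(1UL << (bit % bpl));
		/* ...but keep it "allocated" in the committed copy until
		 * the freeing transaction has committed. */
		if (undo)
			undo[bit / bpl] |= 1UL << (bit % bpl);
	}
}
#endif
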
1858 /*
1859  * expects the suballoc inode to already be locked.
1860  */
1861 int ocfs2_free_suballoc_bits(handle_t *handle,
1862 			     struct inode *alloc_inode,
1863 			     struct buffer_head *alloc_bh,
1864 			     unsigned int start_bit,
1865 			     u64 bg_blkno,
1866 			     unsigned int count)
1867 {
1868 	int status = 0;
1869 	u32 tmp_used;
1870 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1871 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1872 	struct buffer_head *group_bh = NULL;
1873 	struct ocfs2_group_desc *group;
1874 
1875 	mlog_entry_void();
1876 
1877 	/* The alloc_bh comes from ocfs2_free_dinode() or
1878 	 * ocfs2_free_clusters().  The callers have all locked the
1879 	 * allocator and gotten alloc_bh from the lock call.  This
1880 	 * validates the dinode buffer.  Any corruption that has happended
1881 	 * validates the dinode buffer.  Any corruption that has happened
1882 	BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
1883 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1884 
1885 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1886 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1887 	     (unsigned long long)bg_blkno, start_bit);
1888 
1889 	status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
1890 					     &group_bh);
1891 	if (status < 0) {
1892 		mlog_errno(status);
1893 		goto bail;
1894 	}
1895 	group = (struct ocfs2_group_desc *) group_bh->b_data;
1896 
1897 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1898 
1899 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1900 					      group, group_bh,
1901 					      start_bit, count);
1902 	if (status < 0) {
1903 		mlog_errno(status);
1904 		goto bail;
1905 	}
1906 
1907 	status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
1908 					 OCFS2_JOURNAL_ACCESS_WRITE);
1909 	if (status < 0) {
1910 		mlog_errno(status);
1911 		goto bail;
1912 	}
1913 
1914 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1915 		     count);
1916 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1917 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1918 
1919 	status = ocfs2_journal_dirty(handle, alloc_bh);
1920 	if (status < 0) {
1921 		mlog_errno(status);
1922 		goto bail;
1923 	}
1924 
1925 bail:
1926 	brelse(group_bh);
1927 
1928 	mlog_exit(status);
1929 	return status;
1930 }
1931 
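/*
 * Illustrative sketch (hypothetical types, not real ocfs2 code): after the
 * group bitmap is cleared, ocfs2_free_suballoc_bits() fixes up two summary
 * counters under the same handle -- the per-chain free count and the
 * allocator inode's i_used -- so the counts stay consistent with the bitmap.
 * A simplified model of that bookkeeping:
 */
#if 0
struct sketch_chain_rec { unsigned int c_free, c_total; };
struct sketch_allocator {
	unsigned int i_used, i_total;
	struct sketch_chain_rec chains[16];
};

static void sketch_account_freed_bits(struct sketch_allocator *alloc,
				      unsigned int chain,
				      unsigned int count)
{
	alloc->chains[chain].c_free += count;	/* chain record */
	alloc->i_used -= count;			/* allocator inode */
}
#endif
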
1932 int ocfs2_free_dinode(handle_t *handle,
1933 		      struct inode *inode_alloc_inode,
1934 		      struct buffer_head *inode_alloc_bh,
1935 		      struct ocfs2_dinode *di)
1936 {
1937 	u64 blk = le64_to_cpu(di->i_blkno);
1938 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
1939 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1940 
1941 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1942 					inode_alloc_bh, bit, bg_blkno, 1);
1943 }
1944 
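/*
 * Illustrative sketch (hypothetical helper, not real ocfs2 code): in the
 * inode and metadata suballocators every bit of a group maps to one block,
 * counting from the group descriptor block itself, so -- assuming the usual
 * ocfs2 layout -- the descriptor's block number can be recovered by
 * subtracting the suballoc bit, which is what ocfs2_which_suballoc_group()
 * amounts to:
 */
#if 0
static unsigned long long sketch_which_suballoc_group(unsigned long long blkno,
						      unsigned int suballoc_bit)
{
	return blkno - suballoc_bit;
}
#endif
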
1945 int ocfs2_free_clusters(handle_t *handle,
1946 		       struct inode *bitmap_inode,
1947 		       struct buffer_head *bitmap_bh,
1948 		       u64 start_blk,
1949 		       unsigned int num_clusters)
1950 {
1951 	int status;
1952 	u16 bg_start_bit;
1953 	u64 bg_blkno;
1954 	struct ocfs2_dinode *fe;
1955 
1956 	/* You can't ever have a contiguous set of clusters
1957 	 * bigger than a block group bitmap, so we never have to worry
1958 	 * about looping on them. */
1959 
1960 	mlog_entry_void();
1961 
1962 	/* This is expensive. We can safely remove it once this code
1963 	 * has been well tested. */
1964 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1965 
1966 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1967 
1968 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1969 				     &bg_start_bit);
1970 
1971 	mlog(0, "want to free %u clusters starting at block %llu\n",
1972 	     num_clusters, (unsigned long long)start_blk);
1973 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1974 	     (unsigned long long)bg_blkno, bg_start_bit);
1975 
1976 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1977 					  bg_start_bit, bg_blkno,
1978 					  num_clusters);
1979 	if (status < 0) {
1980 		mlog_errno(status);
1981 		goto out;
1982 	}
1983 
1984 	ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb),
1985 					 num_clusters);
1986 
1987 out:
1988 	mlog_exit(status);
1989 	return status;
1990 }
1991 
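/*
 * Illustrative sketch (hypothetical parameters, not real ocfs2 code): a
 * rough model of what ocfs2_block_to_cluster_group() has to compute -- turn
 * a block number into a cluster number, then split that into (owning group,
 * bit within the group) using the clusters-per-group geometry.  The real
 * code also special-cases the first cluster group; this only shows the
 * general idea:
 */
#if 0
static void sketch_block_to_cluster_group(unsigned long long blkno,
					  unsigned int blocks_per_cluster,
					  unsigned int clusters_per_group,
					  unsigned long long *group_blkno,
					  unsigned int *bit_off)
{
	unsigned long long cluster = blkno / blocks_per_cluster;
	unsigned long long group_first_cluster =
		(cluster / clusters_per_group) * clusters_per_group;

	*group_blkno = group_first_cluster * blocks_per_cluster;
	*bit_off = (unsigned int)(cluster - group_first_cluster);
}
#endif
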
1992 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1993 {
1994 	printk("Block Group:\n");
1995 	printk("bg_signature:       %s\n", bg->bg_signature);
1996 	printk("bg_size:            %u\n", le16_to_cpu(bg->bg_size));
1997 	printk("bg_bits:            %u\n", le16_to_cpu(bg->bg_bits));
1998 	printk("bg_free_bits_count: %u\n", le16_to_cpu(bg->bg_free_bits_count));
1999 	printk("bg_chain:           %u\n", le16_to_cpu(bg->bg_chain));
2000 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
2001 	printk("bg_next_group:      %llu\n",
2002 	       (unsigned long long)le64_to_cpu(bg->bg_next_group));
2003 	printk("bg_parent_dinode:   %llu\n",
2004 	       (unsigned long long)le64_to_cpu(bg->bg_parent_dinode));
2005 	printk("bg_blkno:           %llu\n",
2006 	       (unsigned long long)le64_to_cpu(bg->bg_blkno));
2007 }
2008 
2009 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
2010 {
2011 	int i;
2012 
2013 	printk("Suballoc Inode %llu:\n", (unsigned long long)le64_to_cpu(fe->i_blkno));
2014 	printk("i_signature:                  %s\n", fe->i_signature);
2015 	printk("i_size:                       %llu\n",
2016 	       (unsigned long long)le64_to_cpu(fe->i_size));
2017 	printk("i_clusters:                   %u\n", le32_to_cpu(fe->i_clusters));
2018 	printk("i_generation:                 %u\n",
2019 	       le32_to_cpu(fe->i_generation));
2020 	printk("id1.bitmap1.i_used:           %u\n",
2021 	       le32_to_cpu(fe->id1.bitmap1.i_used));
2022 	printk("id1.bitmap1.i_total:          %u\n",
2023 	       le32_to_cpu(fe->id1.bitmap1.i_total));
2024 	printk("id2.i_chain.cl_cpg:           %u\n", le16_to_cpu(fe->id2.i_chain.cl_cpg));
2025 	printk("id2.i_chain.cl_bpc:           %u\n", le16_to_cpu(fe->id2.i_chain.cl_bpc));
2026 	printk("id2.i_chain.cl_count:         %u\n", le16_to_cpu(fe->id2.i_chain.cl_count));
2027 	printk("id2.i_chain.cl_next_free_rec: %u\n",
2028 	       le16_to_cpu(fe->id2.i_chain.cl_next_free_rec));
2029 	for (i = 0; i < le16_to_cpu(fe->id2.i_chain.cl_next_free_rec); i++) {
2030 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
2031 		       le32_to_cpu(fe->id2.i_chain.cl_recs[i].c_free));
2032 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
2033 		       le32_to_cpu(fe->id2.i_chain.cl_recs[i].c_total));
2034 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
2035 		       (unsigned long long)le64_to_cpu(fe->id2.i_chain.cl_recs[i].c_blkno));
2036 	}
2037 }
2038 
2039 /*
2040  * For a given allocation, determine which allocators will need to be
2041  * accessed, and lock them, reserving the appropriate number of bits.
2042  *
2043  * Sparse file systems call this from ocfs2_write_begin_nolock()
2044  * and ocfs2_allocate_unwritten_extents().
2045  *
2046  * File systems which don't support holes call this from
2047  * ocfs2_extend_allocation().
2048  */
2049 int ocfs2_lock_allocators(struct inode *inode,
2050 			  struct ocfs2_extent_tree *et,
2051 			  u32 clusters_to_add, u32 extents_to_split,
2052 			  struct ocfs2_alloc_context **data_ac,
2053 			  struct ocfs2_alloc_context **meta_ac)
2054 {
2055 	int ret = 0, num_free_extents;
2056 	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
2057 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2058 
2059 	*meta_ac = NULL;
2060 	if (data_ac)
2061 		*data_ac = NULL;
2062 
2063 	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
2064 
2065 	num_free_extents = ocfs2_num_free_extents(osb, inode, et);
2066 	if (num_free_extents < 0) {
2067 		ret = num_free_extents;
2068 		mlog_errno(ret);
2069 		goto out;
2070 	}
2071 
2072 	/*
2073 	 * Sparse allocation file systems need to be more conservative
2074 	 * with reserving room for expansion - the actual allocation
2075 	 * happens while we've got a journal handle open so re-taking
2076 	 * happens while we've got a journal handle open, so re-taking
2077 	 * extent) will violate ordering rules.
2078 	 *
2079 	 * Most of the time we'll only be seeing this 1 cluster at a time
2080 	 * anyway.
2081 	 *
2082 	 * Always lock for any unwritten extents - we might want to
2083 	 * add blocks during a split.
2084 	 */
2085 	if (!num_free_extents ||
2086 	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
2087 		ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac);
2088 		if (ret < 0) {
2089 			if (ret != -ENOSPC)
2090 				mlog_errno(ret);
2091 			goto out;
2092 		}
2093 	}
2094 
2095 	if (clusters_to_add == 0)
2096 		goto out;
2097 
2098 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
2099 	if (ret < 0) {
2100 		if (ret != -ENOSPC)
2101 			mlog_errno(ret);
2102 		goto out;
2103 	}
2104 
2105 out:
2106 	if (ret) {
2107 		if (*meta_ac) {
2108 			ocfs2_free_alloc_context(*meta_ac);
2109 			*meta_ac = NULL;
2110 		}
2111 
2112 		/*
2113 		 * We cannot have an error and a non-NULL *data_ac.
2114 		 */
2115 	}
2116 
2117 	return ret;
2118 }
2119
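
/*
 * Illustrative sketch (hypothetical helper, not real ocfs2 code): the heart
 * of ocfs2_lock_allocators() is deciding whether to reserve metadata up
 * front.  In the worst case every added cluster becomes its own extent
 * record and every split can turn one record into up to three (two extra),
 * hence max_recs_needed = clusters_to_add + 2 * extents_to_split.  Sparse
 * file systems must reserve whenever the tree might run out mid-transaction,
 * since the cluster lock cannot be retaken with a handle open.  A condensed
 * model of that decision:
 */
#if 0
static int sketch_need_meta_reservation(unsigned int free_extent_recs,
					unsigned int clusters_to_add,
					unsigned int extents_to_split,
					int sparse_fs)
{
	unsigned int worst_case = clusters_to_add + 2 * extents_to_split;

	if (!free_extent_recs)
		return 1;	/* the extent tree is full; it must grow */
	if (sparse_fs && free_extent_recs < worst_case)
		return 1;	/* be conservative; we cannot re-lock later */
	return 0;
}
#endif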