xref: /openbmc/linux/fs/ocfs2/suballoc.c (revision 643d1f7f)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 021110-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 
47 #include "buffer_head_io.h"
48 
49 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
52 static int ocfs2_block_group_fill(handle_t *handle,
53 				  struct inode *alloc_inode,
54 				  struct buffer_head *bg_bh,
55 				  u64 group_blkno,
56 				  u16 my_chain,
57 				  struct ocfs2_chain_list *cl);
58 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
59 				   struct inode *alloc_inode,
60 				   struct buffer_head *bh);
61 
62 static int ocfs2_cluster_group_search(struct inode *inode,
63 				      struct buffer_head *group_bh,
64 				      u32 bits_wanted, u32 min_bits,
65 				      u16 *bit_off, u16 *bits_found);
66 static int ocfs2_block_group_search(struct inode *inode,
67 				    struct buffer_head *group_bh,
68 				    u32 bits_wanted, u32 min_bits,
69 				    u16 *bit_off, u16 *bits_found);
70 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
71 				     struct ocfs2_alloc_context *ac,
72 				     handle_t *handle,
73 				     u32 bits_wanted,
74 				     u32 min_bits,
75 				     u16 *bit_off,
76 				     unsigned int *num_bits,
77 				     u64 *bg_blkno);
78 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
79 					 int nr);
80 static inline int ocfs2_block_group_set_bits(handle_t *handle,
81 					     struct inode *alloc_inode,
82 					     struct ocfs2_group_desc *bg,
83 					     struct buffer_head *group_bh,
84 					     unsigned int bit_off,
85 					     unsigned int num_bits);
86 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
87 					       struct inode *alloc_inode,
88 					       struct ocfs2_group_desc *bg,
89 					       struct buffer_head *group_bh,
90 					       unsigned int bit_off,
91 					       unsigned int num_bits);
92 
93 static int ocfs2_relink_block_group(handle_t *handle,
94 				    struct inode *alloc_inode,
95 				    struct buffer_head *fe_bh,
96 				    struct buffer_head *bg_bh,
97 				    struct buffer_head *prev_bg_bh,
98 				    u16 chain);
99 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
100 						     u32 wanted);
101 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
102 						   u64 bg_blkno,
103 						   u16 bg_bit_off);
104 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
105 						u64 data_blkno,
106 						u64 *bg_blkno,
107 						u16 *bg_bit_off);
108 
109 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
110 {
111 	struct inode *inode = ac->ac_inode;
112 
113 	if (inode) {
114 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
115 			ocfs2_inode_unlock(inode, 1);
116 
117 		mutex_unlock(&inode->i_mutex);
118 
119 		iput(inode);
120 	}
121 	if (ac->ac_bh)
122 		brelse(ac->ac_bh);
123 	kfree(ac);
124 }
125 
126 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
127 {
128 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
129 }
130 
131 /* somewhat more expensive than our other checks, so use sparingly. */
132 int ocfs2_check_group_descriptor(struct super_block *sb,
133 				 struct ocfs2_dinode *di,
134 				 struct ocfs2_group_desc *gd)
135 {
136 	unsigned int max_bits;
137 
138 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
139 		OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
140 		return -EIO;
141 	}
142 
143 	if (di->i_blkno != gd->bg_parent_dinode) {
144 		ocfs2_error(sb, "Group descriptor # %llu has bad parent "
145 			    "pointer (%llu, expected %llu)",
146 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
147 			    (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
148 			    (unsigned long long)le64_to_cpu(di->i_blkno));
149 		return -EIO;
150 	}
151 
152 	max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
153 	if (le16_to_cpu(gd->bg_bits) > max_bits) {
154 		ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
155 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
156 			    le16_to_cpu(gd->bg_bits));
157 		return -EIO;
158 	}
159 
160 	if (le16_to_cpu(gd->bg_chain) >=
161 	    le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
162 		ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
163 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
164 			    le16_to_cpu(gd->bg_chain));
165 		return -EIO;
166 	}
167 
168 	if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
169 		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
170 			    "claims that %u are free",
171 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
172 			    le16_to_cpu(gd->bg_bits),
173 			    le16_to_cpu(gd->bg_free_bits_count));
174 		return -EIO;
175 	}
176 
177 	if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
178 		ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
179 			    "max bitmap bits of %u",
180 			    (unsigned long long)le64_to_cpu(gd->bg_blkno),
181 			    le16_to_cpu(gd->bg_bits),
182 			    8 * le16_to_cpu(gd->bg_size));
183 		return -EIO;
184 	}
185 
186 	return 0;
187 }
188 
189 static int ocfs2_block_group_fill(handle_t *handle,
190 				  struct inode *alloc_inode,
191 				  struct buffer_head *bg_bh,
192 				  u64 group_blkno,
193 				  u16 my_chain,
194 				  struct ocfs2_chain_list *cl)
195 {
196 	int status = 0;
197 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
198 	struct super_block * sb = alloc_inode->i_sb;
199 
200 	mlog_entry_void();
201 
202 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
203 		ocfs2_error(alloc_inode->i_sb, "group block (%llu) != "
204 			    "b_blocknr (%llu)",
205 			    (unsigned long long)group_blkno,
206 			    (unsigned long long) bg_bh->b_blocknr);
207 		status = -EIO;
208 		goto bail;
209 	}
210 
211 	status = ocfs2_journal_access(handle,
212 				      alloc_inode,
213 				      bg_bh,
214 				      OCFS2_JOURNAL_ACCESS_CREATE);
215 	if (status < 0) {
216 		mlog_errno(status);
217 		goto bail;
218 	}
219 
220 	memset(bg, 0, sb->s_blocksize);
221 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
222 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
223 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
224 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
225 	bg->bg_chain = cpu_to_le16(my_chain);
226 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
227 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
228 	bg->bg_blkno = cpu_to_le64(group_blkno);
229 	/* set the 1st bit in the bitmap to account for the descriptor block */
230 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
231 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
232 
233 	status = ocfs2_journal_dirty(handle, bg_bh);
234 	if (status < 0)
235 		mlog_errno(status);
236 
237 	/* There is no need to zero out or otherwise initialize the
238 	 * other blocks in a group - All valid FS metadata in a block
239 	 * group stores the superblock fs_generation value at
240 	 * allocation time. */
241 
242 bail:
243 	mlog_exit(status);
244 	return status;
245 }
246 
247 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
248 {
249 	u16 curr, best;
250 
251 	best = curr = 0;
252 	while (curr < le16_to_cpu(cl->cl_count)) {
253 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
254 		    le32_to_cpu(cl->cl_recs[curr].c_total))
255 			best = curr;
256 		curr++;
257 	}
258 	return best;
259 }
260 
261 /*
262  * We expect the block group allocator to already be locked.
263  */
264 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
265 				   struct inode *alloc_inode,
266 				   struct buffer_head *bh)
267 {
268 	int status, credits;
269 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
270 	struct ocfs2_chain_list *cl;
271 	struct ocfs2_alloc_context *ac = NULL;
272 	handle_t *handle = NULL;
273 	u32 bit_off, num_bits;
274 	u16 alloc_rec;
275 	u64 bg_blkno;
276 	struct buffer_head *bg_bh = NULL;
277 	struct ocfs2_group_desc *bg;
278 
279 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
280 
281 	mlog_entry_void();
282 
283 	cl = &fe->id2.i_chain;
284 	status = ocfs2_reserve_clusters(osb,
285 					le16_to_cpu(cl->cl_cpg),
286 					&ac);
287 	if (status < 0) {
288 		if (status != -ENOSPC)
289 			mlog_errno(status);
290 		goto bail;
291 	}
292 
293 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
294 						 le16_to_cpu(cl->cl_cpg));
295 	handle = ocfs2_start_trans(osb, credits);
296 	if (IS_ERR(handle)) {
297 		status = PTR_ERR(handle);
298 		handle = NULL;
299 		mlog_errno(status);
300 		goto bail;
301 	}
302 
303 	status = ocfs2_claim_clusters(osb,
304 				      handle,
305 				      ac,
306 				      le16_to_cpu(cl->cl_cpg),
307 				      &bit_off,
308 				      &num_bits);
309 	if (status < 0) {
310 		if (status != -ENOSPC)
311 			mlog_errno(status);
312 		goto bail;
313 	}
314 
315 	alloc_rec = ocfs2_find_smallest_chain(cl);
316 
317 	/* setup the group */
318 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
319 	mlog(0, "new descriptor, record %u, at block %llu\n",
320 	     alloc_rec, (unsigned long long)bg_blkno);
321 
322 	bg_bh = sb_getblk(osb->sb, bg_blkno);
323 	if (!bg_bh) {
324 		status = -EIO;
325 		mlog_errno(status);
326 		goto bail;
327 	}
328 	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
329 
330 	status = ocfs2_block_group_fill(handle,
331 					alloc_inode,
332 					bg_bh,
333 					bg_blkno,
334 					alloc_rec,
335 					cl);
336 	if (status < 0) {
337 		mlog_errno(status);
338 		goto bail;
339 	}
340 
341 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
342 
343 	status = ocfs2_journal_access(handle, alloc_inode,
344 				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
345 	if (status < 0) {
346 		mlog_errno(status);
347 		goto bail;
348 	}
349 
350 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
351 		     le16_to_cpu(bg->bg_free_bits_count));
352 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
353 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
354 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
355 		le16_add_cpu(&cl->cl_next_free_rec, 1);
356 
357 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
358 					le16_to_cpu(bg->bg_free_bits_count));
359 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
360 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
361 
362 	status = ocfs2_journal_dirty(handle, bh);
363 	if (status < 0) {
364 		mlog_errno(status);
365 		goto bail;
366 	}
367 
368 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
369 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
370 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
371 					     le32_to_cpu(fe->i_clusters)));
372 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
373 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
374 	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
375 
376 	status = 0;
377 bail:
378 	if (handle)
379 		ocfs2_commit_trans(osb, handle);
380 
381 	if (ac)
382 		ocfs2_free_alloc_context(ac);
383 
384 	if (bg_bh)
385 		brelse(bg_bh);
386 
387 	mlog_exit(status);
388 	return status;
389 }
390 
391 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
392 				       struct ocfs2_alloc_context *ac,
393 				       int type,
394 				       u32 slot)
395 {
396 	int status;
397 	u32 bits_wanted = ac->ac_bits_wanted;
398 	struct inode *alloc_inode;
399 	struct buffer_head *bh = NULL;
400 	struct ocfs2_dinode *fe;
401 	u32 free_bits;
402 
403 	mlog_entry_void();
404 
405 	alloc_inode = ocfs2_get_system_file_inode(osb, type, slot);
406 	if (!alloc_inode) {
407 		mlog_errno(-EINVAL);
408 		return -EINVAL;
409 	}
410 
411 	mutex_lock(&alloc_inode->i_mutex);
412 
413 	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
414 	if (status < 0) {
415 		mutex_unlock(&alloc_inode->i_mutex);
416 		iput(alloc_inode);
417 
418 		mlog_errno(status);
419 		return status;
420 	}
421 
422 	ac->ac_inode = alloc_inode;
423 
424 	fe = (struct ocfs2_dinode *) bh->b_data;
425 	if (!OCFS2_IS_VALID_DINODE(fe)) {
426 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
427 		status = -EIO;
428 		goto bail;
429 	}
430 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
431 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
432 			    (unsigned long long)le64_to_cpu(fe->i_blkno));
433 		status = -EIO;
434 		goto bail;
435 	}
436 
437 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
438 		le32_to_cpu(fe->id1.bitmap1.i_used);
439 
440 	if (bits_wanted > free_bits) {
441 		/* cluster bitmap never grows */
442 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
443 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
444 			     bits_wanted, free_bits);
445 			status = -ENOSPC;
446 			goto bail;
447 		}
448 
449 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
450 		if (status < 0) {
451 			if (status != -ENOSPC)
452 				mlog_errno(status);
453 			goto bail;
454 		}
455 		atomic_inc(&osb->alloc_stats.bg_extends);
456 
457 		/* You should never ask for this much metadata */
458 		BUG_ON(bits_wanted >
459 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
460 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
461 	}
462 
463 	get_bh(bh);
464 	ac->ac_bh = bh;
465 bail:
466 	if (bh)
467 		brelse(bh);
468 
469 	mlog_exit(status);
470 	return status;
471 }
472 
473 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
474 			       struct ocfs2_dinode *fe,
475 			       struct ocfs2_alloc_context **ac)
476 {
477 	int status;
478 	u32 slot;
479 
480 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
481 	if (!(*ac)) {
482 		status = -ENOMEM;
483 		mlog_errno(status);
484 		goto bail;
485 	}
486 
487 	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
488 	(*ac)->ac_which = OCFS2_AC_USE_META;
489 	slot = osb->slot_num;
490 	(*ac)->ac_group_search = ocfs2_block_group_search;
491 
492 	status = ocfs2_reserve_suballoc_bits(osb, (*ac),
493 					     EXTENT_ALLOC_SYSTEM_INODE, slot);
494 	if (status < 0) {
495 		if (status != -ENOSPC)
496 			mlog_errno(status);
497 		goto bail;
498 	}
499 
500 	status = 0;
501 bail:
502 	if ((status < 0) && *ac) {
503 		ocfs2_free_alloc_context(*ac);
504 		*ac = NULL;
505 	}
506 
507 	mlog_exit(status);
508 	return status;
509 }
510 
511 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
512 			    struct ocfs2_alloc_context **ac)
513 {
514 	int status;
515 
516 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
517 	if (!(*ac)) {
518 		status = -ENOMEM;
519 		mlog_errno(status);
520 		goto bail;
521 	}
522 
523 	(*ac)->ac_bits_wanted = 1;
524 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
525 
526 	(*ac)->ac_group_search = ocfs2_block_group_search;
527 
528 	status = ocfs2_reserve_suballoc_bits(osb, *ac,
529 					     INODE_ALLOC_SYSTEM_INODE,
530 					     osb->slot_num);
531 	if (status < 0) {
532 		if (status != -ENOSPC)
533 			mlog_errno(status);
534 		goto bail;
535 	}
536 
537 	status = 0;
538 bail:
539 	if ((status < 0) && *ac) {
540 		ocfs2_free_alloc_context(*ac);
541 		*ac = NULL;
542 	}
543 
544 	mlog_exit(status);
545 	return status;
546 }
547 
548 /* local alloc code has to do the same thing, so rather than do this
549  * twice.. */
550 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
551 				      struct ocfs2_alloc_context *ac)
552 {
553 	int status;
554 
555 	ac->ac_which = OCFS2_AC_USE_MAIN;
556 	ac->ac_group_search = ocfs2_cluster_group_search;
557 
558 	status = ocfs2_reserve_suballoc_bits(osb, ac,
559 					     GLOBAL_BITMAP_SYSTEM_INODE,
560 					     OCFS2_INVALID_SLOT);
561 	if (status < 0 && status != -ENOSPC) {
562 		mlog_errno(status);
563 		goto bail;
564 	}
565 
566 bail:
567 	return status;
568 }
569 
570 /* Callers don't need to care which bitmap (local alloc or main) to
571  * use so we figure it out for them, but unfortunately this clutters
572  * things a bit. */
573 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
574 			   u32 bits_wanted,
575 			   struct ocfs2_alloc_context **ac)
576 {
577 	int status;
578 
579 	mlog_entry_void();
580 
581 	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
582 	if (!(*ac)) {
583 		status = -ENOMEM;
584 		mlog_errno(status);
585 		goto bail;
586 	}
587 
588 	(*ac)->ac_bits_wanted = bits_wanted;
589 
590 	status = -ENOSPC;
591 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
592 		status = ocfs2_reserve_local_alloc_bits(osb,
593 							bits_wanted,
594 							*ac);
595 		if ((status < 0) && (status != -ENOSPC)) {
596 			mlog_errno(status);
597 			goto bail;
598 		} else if (status == -ENOSPC) {
599 			/* reserve_local_bits will return enospc with
600 			 * the local alloc inode still locked, so we
601 			 * can change this safely here. */
602 			mlog(0, "Disabling local alloc\n");
603 			/* We set to OCFS2_LA_DISABLED so that umount
604 			 * can clean up what's left of the local
605 			 * allocation */
606 			osb->local_alloc_state = OCFS2_LA_DISABLED;
607 		}
608 	}
609 
610 	if (status == -ENOSPC) {
611 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
612 		if (status < 0) {
613 			if (status != -ENOSPC)
614 				mlog_errno(status);
615 			goto bail;
616 		}
617 	}
618 
619 	status = 0;
620 bail:
621 	if ((status < 0) && *ac) {
622 		ocfs2_free_alloc_context(*ac);
623 		*ac = NULL;
624 	}
625 
626 	mlog_exit(status);
627 	return status;
628 }
629 
630 /*
631  * More or less lifted from ext3. I'll leave their description below:
632  *
633  * "For ext3 allocations, we must not reuse any blocks which are
634  * allocated in the bitmap buffer's "last committed data" copy.  This
635  * prevents deletes from freeing up the page for reuse until we have
636  * committed the delete transaction.
637  *
638  * If we didn't do this, then deleting something and reallocating it as
639  * data would allow the old block to be overwritten before the
640  * transaction committed (because we force data to disk before commit).
641  * This would lead to corruption if we crashed between overwriting the
642  * data and committing the delete.
643  *
644  * @@@ We may want to make this allocation behaviour conditional on
645  * data-writes at some point, and disable it for metadata allocations or
646  * sync-data inodes."
647  *
648  * Note: OCFS2 already does this differently for metadata vs data
649  * allocations, as those bitmaps are separate and undo access is never
650  * called on a metadata group descriptor.
651  */
652 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
653 					 int nr)
654 {
655 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
656 
657 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
658 		return 0;
659 	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
660 		return 1;
661 
662 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
663 	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
664 }
665 
666 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
667 					     struct buffer_head *bg_bh,
668 					     unsigned int bits_wanted,
669 					     unsigned int total_bits,
670 					     u16 *bit_off,
671 					     u16 *bits_found)
672 {
673 	void *bitmap;
674 	u16 best_offset, best_size;
675 	int offset, start, found, status = 0;
676 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
677 
678 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
679 		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
680 		return -EIO;
681 	}
682 
683 	found = start = best_offset = best_size = 0;
684 	bitmap = bg->bg_bitmap;
685 
686 	while((offset = ocfs2_find_next_zero_bit(bitmap, total_bits, start)) != -1) {
687 		if (offset == total_bits)
688 			break;
689 
690 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
691 			/* We found a zero, but we can't use it as it
692 			 * hasn't been put to disk yet! */
693 			found = 0;
694 			start = offset + 1;
695 		} else if (offset == start) {
696 			/* we found a zero */
697 			found++;
698 			/* move start to the next bit to test */
699 			start++;
700 		} else {
701 			/* got a zero after some ones */
702 			found = 1;
703 			start = offset + 1;
704 		}
705 		if (found > best_size) {
706 			best_size = found;
707 			best_offset = start - found;
708 		}
709 		/* we got everything we needed */
710 		if (found == bits_wanted) {
711 			/* mlog(0, "Found it all!\n"); */
712 			break;
713 		}
714 	}
715 
716 	/* XXX: I think the first clause is equivalent to the second
717 	 * 	- jlbec */
718 	if (found == bits_wanted) {
719 		*bit_off = start - found;
720 		*bits_found = found;
721 	} else if (best_size) {
722 		*bit_off = best_offset;
723 		*bits_found = best_size;
724 	} else {
725 		status = -ENOSPC;
726 		/* No error log here -- see the comment above
727 		 * ocfs2_test_bg_bit_allocatable */
728 	}
729 
730 	return status;
731 }
732 
733 static inline int ocfs2_block_group_set_bits(handle_t *handle,
734 					     struct inode *alloc_inode,
735 					     struct ocfs2_group_desc *bg,
736 					     struct buffer_head *group_bh,
737 					     unsigned int bit_off,
738 					     unsigned int num_bits)
739 {
740 	int status;
741 	void *bitmap = bg->bg_bitmap;
742 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
743 
744 	mlog_entry_void();
745 
746 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
747 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
748 		status = -EIO;
749 		goto bail;
750 	}
751 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
752 
753 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
754 	     num_bits);
755 
756 	if (ocfs2_is_cluster_bitmap(alloc_inode))
757 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
758 
759 	status = ocfs2_journal_access(handle,
760 				      alloc_inode,
761 				      group_bh,
762 				      journal_type);
763 	if (status < 0) {
764 		mlog_errno(status);
765 		goto bail;
766 	}
767 
768 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
769 
770 	while(num_bits--)
771 		ocfs2_set_bit(bit_off++, bitmap);
772 
773 	status = ocfs2_journal_dirty(handle,
774 				     group_bh);
775 	if (status < 0) {
776 		mlog_errno(status);
777 		goto bail;
778 	}
779 
780 bail:
781 	mlog_exit(status);
782 	return status;
783 }
784 
785 /* find the one with the most empty bits */
786 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
787 {
788 	u16 curr, best;
789 
790 	BUG_ON(!cl->cl_next_free_rec);
791 
792 	best = curr = 0;
793 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
794 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
795 		    le32_to_cpu(cl->cl_recs[best].c_free))
796 			best = curr;
797 		curr++;
798 	}
799 
800 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
801 	return best;
802 }
803 
804 static int ocfs2_relink_block_group(handle_t *handle,
805 				    struct inode *alloc_inode,
806 				    struct buffer_head *fe_bh,
807 				    struct buffer_head *bg_bh,
808 				    struct buffer_head *prev_bg_bh,
809 				    u16 chain)
810 {
811 	int status;
812 	/* there is a really tiny chance the journal calls could fail,
813 	 * but we wouldn't want inconsistent blocks in *any* case. */
814 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
815 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
816 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
817 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
818 
819 	if (!OCFS2_IS_VALID_DINODE(fe)) {
820 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
821 		status = -EIO;
822 		goto out;
823 	}
824 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
825 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
826 		status = -EIO;
827 		goto out;
828 	}
829 	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
830 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
831 		status = -EIO;
832 		goto out;
833 	}
834 
835 	mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
836 	     (unsigned long long)le64_to_cpu(fe->i_blkno), chain,
837 	     (unsigned long long)le64_to_cpu(bg->bg_blkno),
838 	     (unsigned long long)le64_to_cpu(prev_bg->bg_blkno));
839 
840 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
841 	bg_ptr = le64_to_cpu(bg->bg_next_group);
842 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
843 
844 	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
845 				      OCFS2_JOURNAL_ACCESS_WRITE);
846 	if (status < 0) {
847 		mlog_errno(status);
848 		goto out_rollback;
849 	}
850 
851 	prev_bg->bg_next_group = bg->bg_next_group;
852 
853 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
854 	if (status < 0) {
855 		mlog_errno(status);
856 		goto out_rollback;
857 	}
858 
859 	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
860 				      OCFS2_JOURNAL_ACCESS_WRITE);
861 	if (status < 0) {
862 		mlog_errno(status);
863 		goto out_rollback;
864 	}
865 
866 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
867 
868 	status = ocfs2_journal_dirty(handle, bg_bh);
869 	if (status < 0) {
870 		mlog_errno(status);
871 		goto out_rollback;
872 	}
873 
874 	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
875 				      OCFS2_JOURNAL_ACCESS_WRITE);
876 	if (status < 0) {
877 		mlog_errno(status);
878 		goto out_rollback;
879 	}
880 
881 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
882 
883 	status = ocfs2_journal_dirty(handle, fe_bh);
884 	if (status < 0) {
885 		mlog_errno(status);
886 		goto out_rollback;
887 	}
888 
889 	status = 0;
890 out_rollback:
891 	if (status < 0) {
892 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
893 		bg->bg_next_group = cpu_to_le64(bg_ptr);
894 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
895 	}
896 out:
897 	mlog_exit(status);
898 	return status;
899 }
900 
901 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
902 						     u32 wanted)
903 {
904 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
905 }
906 
907 /* return 0 on success, -ENOSPC to keep searching and any other < 0
908  * value on error. */
909 static int ocfs2_cluster_group_search(struct inode *inode,
910 				      struct buffer_head *group_bh,
911 				      u32 bits_wanted, u32 min_bits,
912 				      u16 *bit_off, u16 *bits_found)
913 {
914 	int search = -ENOSPC;
915 	int ret;
916 	struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data;
917 	u16 tmp_off, tmp_found;
918 	unsigned int max_bits, gd_cluster_off;
919 
920 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
921 
922 	if (gd->bg_free_bits_count) {
923 		max_bits = le16_to_cpu(gd->bg_bits);
924 
925 		/* Tail groups in cluster bitmaps which aren't cpg
926 		 * aligned are prone to partial extention by a failed
927 		 * fs resize. If the file system resize never got to
928 		 * update the dinode cluster count, then we don't want
929 		 * to trust any clusters past it, regardless of what
930 		 * the group descriptor says. */
931 		gd_cluster_off = ocfs2_blocks_to_clusters(inode->i_sb,
932 							  le64_to_cpu(gd->bg_blkno));
933 		if ((gd_cluster_off + max_bits) >
934 		    OCFS2_I(inode)->ip_clusters) {
935 			max_bits = OCFS2_I(inode)->ip_clusters - gd_cluster_off;
936 			mlog(0, "Desc %llu, bg_bits %u, clusters %u, use %u\n",
937 			     (unsigned long long)le64_to_cpu(gd->bg_blkno),
938 			     le16_to_cpu(gd->bg_bits),
939 			     OCFS2_I(inode)->ip_clusters, max_bits);
940 		}
941 
942 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
943 							group_bh, bits_wanted,
944 							max_bits,
945 							&tmp_off, &tmp_found);
946 		if (ret)
947 			return ret;
948 
949 		/* ocfs2_block_group_find_clear_bits() might
950 		 * return success, but we still want to return
951 		 * -ENOSPC unless it found the minimum number
952 		 * of bits. */
953 		if (min_bits <= tmp_found) {
954 			*bit_off = tmp_off;
955 			*bits_found = tmp_found;
956 			search = 0; /* success */
957 		}
958 	}
959 
960 	return search;
961 }
962 
963 static int ocfs2_block_group_search(struct inode *inode,
964 				    struct buffer_head *group_bh,
965 				    u32 bits_wanted, u32 min_bits,
966 				    u16 *bit_off, u16 *bits_found)
967 {
968 	int ret = -ENOSPC;
969 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
970 
971 	BUG_ON(min_bits != 1);
972 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
973 
974 	if (bg->bg_free_bits_count)
975 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
976 							group_bh, bits_wanted,
977 							le16_to_cpu(bg->bg_bits),
978 							bit_off, bits_found);
979 
980 	return ret;
981 }
982 
983 static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
984 				       handle_t *handle,
985 				       struct buffer_head *di_bh,
986 				       u32 num_bits,
987 				       u16 chain)
988 {
989 	int ret;
990 	u32 tmp_used;
991 	struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
992 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
993 
994 	ret = ocfs2_journal_access(handle, inode, di_bh,
995 				   OCFS2_JOURNAL_ACCESS_WRITE);
996 	if (ret < 0) {
997 		mlog_errno(ret);
998 		goto out;
999 	}
1000 
1001 	tmp_used = le32_to_cpu(di->id1.bitmap1.i_used);
1002 	di->id1.bitmap1.i_used = cpu_to_le32(num_bits + tmp_used);
1003 	le32_add_cpu(&cl->cl_recs[chain].c_free, -num_bits);
1004 
1005 	ret = ocfs2_journal_dirty(handle, di_bh);
1006 	if (ret < 0)
1007 		mlog_errno(ret);
1008 
1009 out:
1010 	return ret;
1011 }
1012 
1013 static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
1014 				  handle_t *handle,
1015 				  u32 bits_wanted,
1016 				  u32 min_bits,
1017 				  u16 *bit_off,
1018 				  unsigned int *num_bits,
1019 				  u64 gd_blkno,
1020 				  u16 *bits_left)
1021 {
1022 	int ret;
1023 	u16 found;
1024 	struct buffer_head *group_bh = NULL;
1025 	struct ocfs2_group_desc *gd;
1026 	struct inode *alloc_inode = ac->ac_inode;
1027 
1028 	ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno,
1029 			       &group_bh, OCFS2_BH_CACHED, alloc_inode);
1030 	if (ret < 0) {
1031 		mlog_errno(ret);
1032 		return ret;
1033 	}
1034 
1035 	gd = (struct ocfs2_group_desc *) group_bh->b_data;
1036 	if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
1037 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
1038 		ret = -EIO;
1039 		goto out;
1040 	}
1041 
1042 	ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
1043 				  bit_off, &found);
1044 	if (ret < 0) {
1045 		if (ret != -ENOSPC)
1046 			mlog_errno(ret);
1047 		goto out;
1048 	}
1049 
1050 	*num_bits = found;
1051 
1052 	ret = ocfs2_alloc_dinode_update_counts(alloc_inode, handle, ac->ac_bh,
1053 					       *num_bits,
1054 					       le16_to_cpu(gd->bg_chain));
1055 	if (ret < 0) {
1056 		mlog_errno(ret);
1057 		goto out;
1058 	}
1059 
1060 	ret = ocfs2_block_group_set_bits(handle, alloc_inode, gd, group_bh,
1061 					 *bit_off, *num_bits);
1062 	if (ret < 0)
1063 		mlog_errno(ret);
1064 
1065 	*bits_left = le16_to_cpu(gd->bg_free_bits_count);
1066 
1067 out:
1068 	brelse(group_bh);
1069 
1070 	return ret;
1071 }
1072 
1073 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
1074 			      handle_t *handle,
1075 			      u32 bits_wanted,
1076 			      u32 min_bits,
1077 			      u16 *bit_off,
1078 			      unsigned int *num_bits,
1079 			      u64 *bg_blkno,
1080 			      u16 *bits_left)
1081 {
1082 	int status;
1083 	u16 chain, tmp_bits;
1084 	u32 tmp_used;
1085 	u64 next_group;
1086 	struct inode *alloc_inode = ac->ac_inode;
1087 	struct buffer_head *group_bh = NULL;
1088 	struct buffer_head *prev_group_bh = NULL;
1089 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1090 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1091 	struct ocfs2_group_desc *bg;
1092 
1093 	chain = ac->ac_chain;
1094 	mlog(0, "trying to alloc %u bits from chain %u, inode %llu\n",
1095 	     bits_wanted, chain,
1096 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
1097 
1098 	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1099 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
1100 				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
1101 	if (status < 0) {
1102 		mlog_errno(status);
1103 		goto bail;
1104 	}
1105 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
1106 	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1107 	if (status) {
1108 		mlog_errno(status);
1109 		goto bail;
1110 	}
1111 
1112 	status = -ENOSPC;
1113 	/* for now, the chain search is a bit simplistic. We just use
1114 	 * the 1st group with any empty bits. */
1115 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
1116 					     bits_wanted, min_bits, bit_off,
1117 					     &tmp_bits)) == -ENOSPC) {
1118 		if (!bg->bg_next_group)
1119 			break;
1120 
1121 		if (prev_group_bh) {
1122 			brelse(prev_group_bh);
1123 			prev_group_bh = NULL;
1124 		}
1125 		next_group = le64_to_cpu(bg->bg_next_group);
1126 		prev_group_bh = group_bh;
1127 		group_bh = NULL;
1128 		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1129 					  next_group, &group_bh,
1130 					  OCFS2_BH_CACHED, alloc_inode);
1131 		if (status < 0) {
1132 			mlog_errno(status);
1133 			goto bail;
1134 		}
1135 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1136 		status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
1137 		if (status) {
1138 			mlog_errno(status);
1139 			goto bail;
1140 		}
1141 	}
1142 	if (status < 0) {
1143 		if (status != -ENOSPC)
1144 			mlog_errno(status);
1145 		goto bail;
1146 	}
1147 
1148 	mlog(0, "alloc succeeds: we give %u bits from block group %llu\n",
1149 	     tmp_bits, (unsigned long long)le64_to_cpu(bg->bg_blkno));
1150 
1151 	*num_bits = tmp_bits;
1152 
1153 	BUG_ON(*num_bits == 0);
1154 
1155 	/*
1156 	 * Keep track of previous block descriptor read. When
1157 	 * we find a target, if we have read more than X
1158 	 * number of descriptors, and the target is reasonably
1159 	 * empty, relink him to top of his chain.
1160 	 *
1161 	 * We've read 0 extra blocks and only send one more to
1162 	 * the transaction, yet the next guy to search has a
1163 	 * much easier time.
1164 	 *
1165 	 * Do this *after* figuring out how many bits we're taking out
1166 	 * of our target group.
1167 	 */
1168 	if (ac->ac_allow_chain_relink &&
1169 	    (prev_group_bh) &&
1170 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1171 		status = ocfs2_relink_block_group(handle, alloc_inode,
1172 						  ac->ac_bh, group_bh,
1173 						  prev_group_bh, chain);
1174 		if (status < 0) {
1175 			mlog_errno(status);
1176 			goto bail;
1177 		}
1178 	}
1179 
1180 	/* Ok, claim our bits now: set the info on dinode, chainlist
1181 	 * and then the group */
1182 	status = ocfs2_journal_access(handle,
1183 				      alloc_inode,
1184 				      ac->ac_bh,
1185 				      OCFS2_JOURNAL_ACCESS_WRITE);
1186 	if (status < 0) {
1187 		mlog_errno(status);
1188 		goto bail;
1189 	}
1190 
1191 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1192 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1193 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1194 
1195 	status = ocfs2_journal_dirty(handle,
1196 				     ac->ac_bh);
1197 	if (status < 0) {
1198 		mlog_errno(status);
1199 		goto bail;
1200 	}
1201 
1202 	status = ocfs2_block_group_set_bits(handle,
1203 					    alloc_inode,
1204 					    bg,
1205 					    group_bh,
1206 					    *bit_off,
1207 					    *num_bits);
1208 	if (status < 0) {
1209 		mlog_errno(status);
1210 		goto bail;
1211 	}
1212 
1213 	mlog(0, "Allocated %u bits from suballocator %llu\n", *num_bits,
1214 	     (unsigned long long)le64_to_cpu(fe->i_blkno));
1215 
1216 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1217 	*bits_left = le16_to_cpu(bg->bg_free_bits_count);
1218 bail:
1219 	if (group_bh)
1220 		brelse(group_bh);
1221 	if (prev_group_bh)
1222 		brelse(prev_group_bh);
1223 
1224 	mlog_exit(status);
1225 	return status;
1226 }
1227 
1228 /* will give out up to bits_wanted contiguous bits. */
1229 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1230 				     struct ocfs2_alloc_context *ac,
1231 				     handle_t *handle,
1232 				     u32 bits_wanted,
1233 				     u32 min_bits,
1234 				     u16 *bit_off,
1235 				     unsigned int *num_bits,
1236 				     u64 *bg_blkno)
1237 {
1238 	int status;
1239 	u16 victim, i;
1240 	u16 bits_left = 0;
1241 	u64 hint_blkno = ac->ac_last_group;
1242 	struct ocfs2_chain_list *cl;
1243 	struct ocfs2_dinode *fe;
1244 
1245 	mlog_entry_void();
1246 
1247 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1248 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1249 	BUG_ON(!ac->ac_bh);
1250 
1251 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1252 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1253 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1254 		status = -EIO;
1255 		goto bail;
1256 	}
1257 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1258 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1259 		ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
1260 			    "bits but only %u total.",
1261 			    (unsigned long long)le64_to_cpu(fe->i_blkno),
1262 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1263 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1264 		status = -EIO;
1265 		goto bail;
1266 	}
1267 
1268 	if (hint_blkno) {
1269 		/* Attempt to short-circuit the usual search mechanism
1270 		 * by jumping straight to the most recently used
1271 		 * allocation group. This helps us mantain some
1272 		 * contiguousness across allocations. */
1273 		status = ocfs2_search_one_group(ac, handle, bits_wanted,
1274 						min_bits, bit_off, num_bits,
1275 						hint_blkno, &bits_left);
1276 		if (!status) {
1277 			/* Be careful to update *bg_blkno here as the
1278 			 * caller is expecting it to be filled in, and
1279 			 * ocfs2_search_one_group() won't do that for
1280 			 * us. */
1281 			*bg_blkno = hint_blkno;
1282 			goto set_hint;
1283 		}
1284 		if (status < 0 && status != -ENOSPC) {
1285 			mlog_errno(status);
1286 			goto bail;
1287 		}
1288 	}
1289 
1290 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1291 
1292 	victim = ocfs2_find_victim_chain(cl);
1293 	ac->ac_chain = victim;
1294 	ac->ac_allow_chain_relink = 1;
1295 
1296 	status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, bit_off,
1297 				    num_bits, bg_blkno, &bits_left);
1298 	if (!status)
1299 		goto set_hint;
1300 	if (status < 0 && status != -ENOSPC) {
1301 		mlog_errno(status);
1302 		goto bail;
1303 	}
1304 
1305 	mlog(0, "Search of victim chain %u came up with nothing, "
1306 	     "trying all chains now.\n", victim);
1307 
1308 	/* If we didn't pick a good victim, then just default to
1309 	 * searching each chain in order. Don't allow chain relinking
1310 	 * because we only calculate enough journal credits for one
1311 	 * relink per alloc. */
1312 	ac->ac_allow_chain_relink = 0;
1313 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1314 		if (i == victim)
1315 			continue;
1316 		if (!cl->cl_recs[i].c_free)
1317 			continue;
1318 
1319 		ac->ac_chain = i;
1320 		status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits,
1321 					    bit_off, num_bits, bg_blkno,
1322 					    &bits_left);
1323 		if (!status)
1324 			break;
1325 		if (status < 0 && status != -ENOSPC) {
1326 			mlog_errno(status);
1327 			goto bail;
1328 		}
1329 	}
1330 
1331 set_hint:
1332 	if (status != -ENOSPC) {
1333 		/* If the next search of this group is not likely to
1334 		 * yield a suitable extent, then we reset the last
1335 		 * group hint so as to not waste a disk read */
1336 		if (bits_left < min_bits)
1337 			ac->ac_last_group = 0;
1338 		else
1339 			ac->ac_last_group = *bg_blkno;
1340 	}
1341 
1342 bail:
1343 	mlog_exit(status);
1344 	return status;
1345 }
1346 
1347 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1348 			 handle_t *handle,
1349 			 struct ocfs2_alloc_context *ac,
1350 			 u32 bits_wanted,
1351 			 u16 *suballoc_bit_start,
1352 			 unsigned int *num_bits,
1353 			 u64 *blkno_start)
1354 {
1355 	int status;
1356 	u64 bg_blkno;
1357 
1358 	BUG_ON(!ac);
1359 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1360 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1361 
1362 	status = ocfs2_claim_suballoc_bits(osb,
1363 					   ac,
1364 					   handle,
1365 					   bits_wanted,
1366 					   1,
1367 					   suballoc_bit_start,
1368 					   num_bits,
1369 					   &bg_blkno);
1370 	if (status < 0) {
1371 		mlog_errno(status);
1372 		goto bail;
1373 	}
1374 	atomic_inc(&osb->alloc_stats.bg_allocs);
1375 
1376 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1377 	ac->ac_bits_given += (*num_bits);
1378 	status = 0;
1379 bail:
1380 	mlog_exit(status);
1381 	return status;
1382 }
1383 
1384 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1385 			  handle_t *handle,
1386 			  struct ocfs2_alloc_context *ac,
1387 			  u16 *suballoc_bit,
1388 			  u64 *fe_blkno)
1389 {
1390 	int status;
1391 	unsigned int num_bits;
1392 	u64 bg_blkno;
1393 
1394 	mlog_entry_void();
1395 
1396 	BUG_ON(!ac);
1397 	BUG_ON(ac->ac_bits_given != 0);
1398 	BUG_ON(ac->ac_bits_wanted != 1);
1399 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1400 
1401 	status = ocfs2_claim_suballoc_bits(osb,
1402 					   ac,
1403 					   handle,
1404 					   1,
1405 					   1,
1406 					   suballoc_bit,
1407 					   &num_bits,
1408 					   &bg_blkno);
1409 	if (status < 0) {
1410 		mlog_errno(status);
1411 		goto bail;
1412 	}
1413 	atomic_inc(&osb->alloc_stats.bg_allocs);
1414 
1415 	BUG_ON(num_bits != 1);
1416 
1417 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1418 	ac->ac_bits_given++;
1419 	status = 0;
1420 bail:
1421 	mlog_exit(status);
1422 	return status;
1423 }
1424 
1425 /* translate a group desc. blkno and it's bitmap offset into
1426  * disk cluster offset. */
1427 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1428 						   u64 bg_blkno,
1429 						   u16 bg_bit_off)
1430 {
1431 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1432 	u32 cluster = 0;
1433 
1434 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1435 
1436 	if (bg_blkno != osb->first_cluster_group_blkno)
1437 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1438 	cluster += (u32) bg_bit_off;
1439 	return cluster;
1440 }
1441 
1442 /* given a cluster offset, calculate which block group it belongs to
1443  * and return that block offset. */
1444 u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster)
1445 {
1446 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1447 	u32 group_no;
1448 
1449 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1450 
1451 	group_no = cluster / osb->bitmap_cpg;
1452 	if (!group_no)
1453 		return osb->first_cluster_group_blkno;
1454 	return ocfs2_clusters_to_blocks(inode->i_sb,
1455 					group_no * osb->bitmap_cpg);
1456 }
1457 
1458 /* given the block number of a cluster start, calculate which cluster
1459  * group and descriptor bitmap offset that corresponds to. */
1460 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1461 						u64 data_blkno,
1462 						u64 *bg_blkno,
1463 						u16 *bg_bit_off)
1464 {
1465 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1466 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1467 
1468 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1469 
1470 	*bg_blkno = ocfs2_which_cluster_group(inode,
1471 					      data_cluster);
1472 
1473 	if (*bg_blkno == osb->first_cluster_group_blkno)
1474 		*bg_bit_off = (u16) data_cluster;
1475 	else
1476 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1477 							     data_blkno - *bg_blkno);
1478 }
1479 
1480 /*
1481  * min_bits - minimum contiguous chunk from this total allocation we
1482  * can handle. set to what we asked for originally for a full
1483  * contig. allocation, set to '1' to indicate we can deal with extents
1484  * of any size.
1485  */
1486 int __ocfs2_claim_clusters(struct ocfs2_super *osb,
1487 			   handle_t *handle,
1488 			   struct ocfs2_alloc_context *ac,
1489 			   u32 min_clusters,
1490 			   u32 max_clusters,
1491 			   u32 *cluster_start,
1492 			   u32 *num_clusters)
1493 {
1494 	int status;
1495 	unsigned int bits_wanted = max_clusters;
1496 	u64 bg_blkno = 0;
1497 	u16 bg_bit_off;
1498 
1499 	mlog_entry_void();
1500 
1501 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1502 
1503 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1504 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1505 
1506 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1507 		status = ocfs2_claim_local_alloc_bits(osb,
1508 						      handle,
1509 						      ac,
1510 						      bits_wanted,
1511 						      cluster_start,
1512 						      num_clusters);
1513 		if (!status)
1514 			atomic_inc(&osb->alloc_stats.local_data);
1515 	} else {
1516 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1517 			/* The only paths asking for contiguousness
1518 			 * should know about this already. */
1519 			mlog(ML_ERROR, "minimum allocation requested %u exceeds "
1520 			     "group bitmap size %u!\n", min_clusters,
1521 			     osb->bitmap_cpg);
1522 			status = -ENOSPC;
1523 			goto bail;
1524 		}
1525 		/* clamp the current request down to a realistic size. */
1526 		if (bits_wanted > (osb->bitmap_cpg - 1))
1527 			bits_wanted = osb->bitmap_cpg - 1;
1528 
1529 		status = ocfs2_claim_suballoc_bits(osb,
1530 						   ac,
1531 						   handle,
1532 						   bits_wanted,
1533 						   min_clusters,
1534 						   &bg_bit_off,
1535 						   num_clusters,
1536 						   &bg_blkno);
1537 		if (!status) {
1538 			*cluster_start =
1539 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1540 								 bg_blkno,
1541 								 bg_bit_off);
1542 			atomic_inc(&osb->alloc_stats.bitmap_data);
1543 		}
1544 	}
1545 	if (status < 0) {
1546 		if (status != -ENOSPC)
1547 			mlog_errno(status);
1548 		goto bail;
1549 	}
1550 
1551 	ac->ac_bits_given += *num_clusters;
1552 
1553 bail:
1554 	mlog_exit(status);
1555 	return status;
1556 }
1557 
1558 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1559 			 handle_t *handle,
1560 			 struct ocfs2_alloc_context *ac,
1561 			 u32 min_clusters,
1562 			 u32 *cluster_start,
1563 			 u32 *num_clusters)
1564 {
1565 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1566 
1567 	return __ocfs2_claim_clusters(osb, handle, ac, min_clusters,
1568 				      bits_wanted, cluster_start, num_clusters);
1569 }
1570 
1571 static inline int ocfs2_block_group_clear_bits(handle_t *handle,
1572 					       struct inode *alloc_inode,
1573 					       struct ocfs2_group_desc *bg,
1574 					       struct buffer_head *group_bh,
1575 					       unsigned int bit_off,
1576 					       unsigned int num_bits)
1577 {
1578 	int status;
1579 	unsigned int tmp;
1580 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1581 	struct ocfs2_group_desc *undo_bg = NULL;
1582 
1583 	mlog_entry_void();
1584 
1585 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1586 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1587 		status = -EIO;
1588 		goto bail;
1589 	}
1590 
1591 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1592 
1593 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1594 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1595 
1596 	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1597 				      journal_type);
1598 	if (status < 0) {
1599 		mlog_errno(status);
1600 		goto bail;
1601 	}
1602 
1603 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1604 		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1605 
1606 	tmp = num_bits;
1607 	while(tmp--) {
1608 		ocfs2_clear_bit((bit_off + tmp),
1609 				(unsigned long *) bg->bg_bitmap);
1610 		if (ocfs2_is_cluster_bitmap(alloc_inode))
1611 			ocfs2_set_bit(bit_off + tmp,
1612 				      (unsigned long *) undo_bg->bg_bitmap);
1613 	}
1614 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1615 
1616 	status = ocfs2_journal_dirty(handle, group_bh);
1617 	if (status < 0)
1618 		mlog_errno(status);
1619 bail:
1620 	return status;
1621 }
1622 
1623 /*
1624  * expects the suballoc inode to already be locked.
1625  */
1626 int ocfs2_free_suballoc_bits(handle_t *handle,
1627 			     struct inode *alloc_inode,
1628 			     struct buffer_head *alloc_bh,
1629 			     unsigned int start_bit,
1630 			     u64 bg_blkno,
1631 			     unsigned int count)
1632 {
1633 	int status = 0;
1634 	u32 tmp_used;
1635 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1636 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1637 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1638 	struct buffer_head *group_bh = NULL;
1639 	struct ocfs2_group_desc *group;
1640 
1641 	mlog_entry_void();
1642 
1643 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1644 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1645 		status = -EIO;
1646 		goto bail;
1647 	}
1648 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1649 
1650 	mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
1651 	     (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
1652 	     (unsigned long long)bg_blkno, start_bit);
1653 
1654 	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1655 				  alloc_inode);
1656 	if (status < 0) {
1657 		mlog_errno(status);
1658 		goto bail;
1659 	}
1660 
1661 	group = (struct ocfs2_group_desc *) group_bh->b_data;
1662 	status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
1663 	if (status) {
1664 		mlog_errno(status);
1665 		goto bail;
1666 	}
1667 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1668 
1669 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1670 					      group, group_bh,
1671 					      start_bit, count);
1672 	if (status < 0) {
1673 		mlog_errno(status);
1674 		goto bail;
1675 	}
1676 
1677 	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1678 				      OCFS2_JOURNAL_ACCESS_WRITE);
1679 	if (status < 0) {
1680 		mlog_errno(status);
1681 		goto bail;
1682 	}
1683 
1684 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1685 		     count);
1686 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1687 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1688 
1689 	status = ocfs2_journal_dirty(handle, alloc_bh);
1690 	if (status < 0) {
1691 		mlog_errno(status);
1692 		goto bail;
1693 	}
1694 
1695 bail:
1696 	if (group_bh)
1697 		brelse(group_bh);
1698 
1699 	mlog_exit(status);
1700 	return status;
1701 }
1702 
1703 int ocfs2_free_dinode(handle_t *handle,
1704 		      struct inode *inode_alloc_inode,
1705 		      struct buffer_head *inode_alloc_bh,
1706 		      struct ocfs2_dinode *di)
1707 {
1708 	u64 blk = le64_to_cpu(di->i_blkno);
1709 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
1710 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1711 
1712 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1713 					inode_alloc_bh, bit, bg_blkno, 1);
1714 }
1715 
1716 int ocfs2_free_clusters(handle_t *handle,
1717 		       struct inode *bitmap_inode,
1718 		       struct buffer_head *bitmap_bh,
1719 		       u64 start_blk,
1720 		       unsigned int num_clusters)
1721 {
1722 	int status;
1723 	u16 bg_start_bit;
1724 	u64 bg_blkno;
1725 	struct ocfs2_dinode *fe;
1726 
1727 	/* You can't ever have a contiguous set of clusters
1728 	 * bigger than a block group bitmap so we never have to worry
1729 	 * about looping on them. */
1730 
1731 	mlog_entry_void();
1732 
1733 	/* This is expensive. We can safely remove once this stuff has
1734 	 * gotten tested really well. */
1735 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1736 
1737 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1738 
1739 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1740 				     &bg_start_bit);
1741 
1742 	mlog(0, "want to free %u clusters starting at block %llu\n",
1743 	     num_clusters, (unsigned long long)start_blk);
1744 	mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n",
1745 	     (unsigned long long)bg_blkno, bg_start_bit);
1746 
1747 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1748 					  bg_start_bit, bg_blkno,
1749 					  num_clusters);
1750 	if (status < 0)
1751 		mlog_errno(status);
1752 
1753 	mlog_exit(status);
1754 	return status;
1755 }
1756 
1757 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1758 {
1759 	printk("Block Group:\n");
1760 	printk("bg_signature:       %s\n", bg->bg_signature);
1761 	printk("bg_size:            %u\n", bg->bg_size);
1762 	printk("bg_bits:            %u\n", bg->bg_bits);
1763 	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1764 	printk("bg_chain:           %u\n", bg->bg_chain);
1765 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1766 	printk("bg_next_group:      %llu\n",
1767 	       (unsigned long long)bg->bg_next_group);
1768 	printk("bg_parent_dinode:   %llu\n",
1769 	       (unsigned long long)bg->bg_parent_dinode);
1770 	printk("bg_blkno:           %llu\n",
1771 	       (unsigned long long)bg->bg_blkno);
1772 }
1773 
1774 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1775 {
1776 	int i;
1777 
1778 	printk("Suballoc Inode %llu:\n", (unsigned long long)fe->i_blkno);
1779 	printk("i_signature:                  %s\n", fe->i_signature);
1780 	printk("i_size:                       %llu\n",
1781 	       (unsigned long long)fe->i_size);
1782 	printk("i_clusters:                   %u\n", fe->i_clusters);
1783 	printk("i_generation:                 %u\n",
1784 	       le32_to_cpu(fe->i_generation));
1785 	printk("id1.bitmap1.i_used:           %u\n",
1786 	       le32_to_cpu(fe->id1.bitmap1.i_used));
1787 	printk("id1.bitmap1.i_total:          %u\n",
1788 	       le32_to_cpu(fe->id1.bitmap1.i_total));
1789 	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1790 	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1791 	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1792 	printk("id2.i_chain.cl_next_free_rec: %u\n",
1793 	       fe->id2.i_chain.cl_next_free_rec);
1794 	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1795 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1796 		       fe->id2.i_chain.cl_recs[i].c_free);
1797 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1798 		       fe->id2.i_chain.cl_recs[i].c_total);
1799 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %llu\n", i,
1800 		       (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno);
1801 	}
1802 }
1803