xref: /openbmc/linux/fs/ocfs2/suballoc.c (revision 87c2ce3b)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * suballoc.c
5  *
6  * metadata alloc and free
7  * Inspired by ext3 block groups.
8  *
9  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
10  *
11  * This program is free software; you can redistribute it and/or
12  * modify it under the terms of the GNU General Public
13  * License as published by the Free Software Foundation; either
14  * version 2 of the License, or (at your option) any later version.
15  *
16  * This program is distributed in the hope that it will be useful,
17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19  * General Public License for more details.
20  *
21  * You should have received a copy of the GNU General Public
22  * License along with this program; if not, write to the
23  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24  * Boston, MA 02111-1307, USA.
25  */
26 
27 #include <linux/fs.h>
28 #include <linux/types.h>
29 #include <linux/slab.h>
30 #include <linux/highmem.h>
31 
32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
33 #include <cluster/masklog.h>
34 
35 #include "ocfs2.h"
36 
37 #include "alloc.h"
38 #include "dlmglue.h"
39 #include "inode.h"
40 #include "journal.h"
41 #include "localalloc.h"
42 #include "suballoc.h"
43 #include "super.h"
44 #include "sysfile.h"
45 #include "uptodate.h"
46 
47 #include "buffer_head_io.h"
48 
/*
 * Forward declarations for the file-local helpers.
 */

/* Debug helpers and chain selection. */
static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);

/* Creating and initializing new block groups. */
static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
				  struct inode *alloc_inode,
				  struct buffer_head *bg_bh,
				  u64 group_blkno,
				  u16 my_chain,
				  struct ocfs2_chain_list *cl);
static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
				   struct inode *alloc_inode,
				   struct buffer_head *bh);

/* Reservation against a chain allocator. */
static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
				       struct ocfs2_alloc_context *ac);

/*
 * Per-group search callbacks (installed in ac->ac_group_search) and the
 * chain walker that drives them.
 */
static int ocfs2_cluster_group_search(struct inode *inode,
				      struct buffer_head *group_bh,
				      u32 bits_wanted, u32 min_bits,
				      u16 *bit_off, u16 *bits_found);
static int ocfs2_block_group_search(struct inode *inode,
				    struct buffer_head *group_bh,
				    u32 bits_wanted, u32 min_bits,
				    u16 *bit_off, u16 *bits_found);
static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
			      u32 bits_wanted,
			      u32 min_bits,
			      u16 *bit_off,
			      unsigned int *num_bits,
			      u64 *bg_blkno);
static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
				     struct ocfs2_alloc_context *ac,
				     u32 bits_wanted,
				     u32 min_bits,
				     u16 *bit_off,
				     unsigned int *num_bits,
				     u64 *bg_blkno);

/* Bitmap inspection and manipulation inside a single group descriptor. */
static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
					 int nr);
static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
					     struct buffer_head *bg_bh,
					     unsigned int bits_wanted,
					     u16 *bit_off,
					     u16 *bits_found);
static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
					     struct inode *alloc_inode,
					     struct ocfs2_group_desc *bg,
					     struct buffer_head *group_bh,
					     unsigned int bit_off,
					     unsigned int num_bits);
static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
					       struct inode *alloc_inode,
					       struct ocfs2_group_desc *bg,
					       struct buffer_head *group_bh,
					       unsigned int bit_off,
					       unsigned int num_bits);

/* Chain maintenance and freeing. */
static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *fe_bh,
				    struct buffer_head *bg_bh,
				    struct buffer_head *prev_bg_bh,
				    u16 chain);
static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
						     u32 wanted);
static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
				    struct inode *alloc_inode,
				    struct buffer_head *alloc_bh,
				    unsigned int start_bit,
				    u64 bg_blkno,
				    unsigned int count);

/* Translation between block/cluster numbers and (group, bit) positions. */
static inline u64 ocfs2_which_suballoc_group(u64 block,
					     unsigned int bit);
static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
						   u64 bg_blkno,
						   u16 bg_bit_off);
static inline u64 ocfs2_which_cluster_group(struct inode *inode,
					    u32 cluster);
static inline void ocfs2_block_to_cluster_group(struct inode *inode,
						u64 data_blkno,
						u64 *bg_blkno,
						u16 *bg_bit_off);
131 
132 void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
133 {
134 	if (ac->ac_inode)
135 		iput(ac->ac_inode);
136 	if (ac->ac_bh)
137 		brelse(ac->ac_bh);
138 	kfree(ac);
139 }
140 
141 static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
142 {
143 	return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
144 }
145 
146 static int ocfs2_block_group_fill(struct ocfs2_journal_handle *handle,
147 				  struct inode *alloc_inode,
148 				  struct buffer_head *bg_bh,
149 				  u64 group_blkno,
150 				  u16 my_chain,
151 				  struct ocfs2_chain_list *cl)
152 {
153 	int status = 0;
154 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
155 	struct super_block * sb = alloc_inode->i_sb;
156 
157 	mlog_entry_void();
158 
159 	if (((unsigned long long) bg_bh->b_blocknr) != group_blkno) {
160 		ocfs2_error(alloc_inode->i_sb, "group block (%"MLFu64") "
161 			    "!= b_blocknr (%llu)", group_blkno,
162 			    (unsigned long long) bg_bh->b_blocknr);
163 		status = -EIO;
164 		goto bail;
165 	}
166 
167 	status = ocfs2_journal_access(handle,
168 				      alloc_inode,
169 				      bg_bh,
170 				      OCFS2_JOURNAL_ACCESS_CREATE);
171 	if (status < 0) {
172 		mlog_errno(status);
173 		goto bail;
174 	}
175 
176 	memset(bg, 0, sb->s_blocksize);
177 	strcpy(bg->bg_signature, OCFS2_GROUP_DESC_SIGNATURE);
178 	bg->bg_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
179 	bg->bg_size = cpu_to_le16(ocfs2_group_bitmap_size(sb));
180 	bg->bg_bits = cpu_to_le16(ocfs2_bits_per_group(cl));
181 	bg->bg_chain = cpu_to_le16(my_chain);
182 	bg->bg_next_group = cl->cl_recs[my_chain].c_blkno;
183 	bg->bg_parent_dinode = cpu_to_le64(OCFS2_I(alloc_inode)->ip_blkno);
184 	bg->bg_blkno = cpu_to_le64(group_blkno);
185 	/* set the 1st bit in the bitmap to account for the descriptor block */
186 	ocfs2_set_bit(0, (unsigned long *)bg->bg_bitmap);
187 	bg->bg_free_bits_count = cpu_to_le16(le16_to_cpu(bg->bg_bits) - 1);
188 
189 	status = ocfs2_journal_dirty(handle, bg_bh);
190 	if (status < 0)
191 		mlog_errno(status);
192 
193 	/* There is no need to zero out or otherwise initialize the
194 	 * other blocks in a group - All valid FS metadata in a block
195 	 * group stores the superblock fs_generation value at
196 	 * allocation time. */
197 
198 bail:
199 	mlog_exit(status);
200 	return status;
201 }
202 
203 static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl)
204 {
205 	u16 curr, best;
206 
207 	best = curr = 0;
208 	while (curr < le16_to_cpu(cl->cl_count)) {
209 		if (le32_to_cpu(cl->cl_recs[best].c_total) >
210 		    le32_to_cpu(cl->cl_recs[curr].c_total))
211 			best = curr;
212 		curr++;
213 	}
214 	return best;
215 }
216 
217 /*
218  * We expect the block group allocator to already be locked.
219  */
220 static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
221 				   struct inode *alloc_inode,
222 				   struct buffer_head *bh)
223 {
224 	int status, credits;
225 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data;
226 	struct ocfs2_chain_list *cl;
227 	struct ocfs2_alloc_context *ac = NULL;
228 	struct ocfs2_journal_handle *handle = NULL;
229 	u32 bit_off, num_bits;
230 	u16 alloc_rec;
231 	u64 bg_blkno;
232 	struct buffer_head *bg_bh = NULL;
233 	struct ocfs2_group_desc *bg;
234 
235 	BUG_ON(ocfs2_is_cluster_bitmap(alloc_inode));
236 
237 	mlog_entry_void();
238 
239 	handle = ocfs2_alloc_handle(osb);
240 	if (!handle) {
241 		status = -ENOMEM;
242 		mlog_errno(status);
243 		goto bail;
244 	}
245 
246 	cl = &fe->id2.i_chain;
247 	status = ocfs2_reserve_clusters(osb,
248 					handle,
249 					le16_to_cpu(cl->cl_cpg),
250 					&ac);
251 	if (status < 0) {
252 		if (status != -ENOSPC)
253 			mlog_errno(status);
254 		goto bail;
255 	}
256 
257 	credits = ocfs2_calc_group_alloc_credits(osb->sb,
258 						 le16_to_cpu(cl->cl_cpg));
259 	handle = ocfs2_start_trans(osb, handle, credits);
260 	if (IS_ERR(handle)) {
261 		status = PTR_ERR(handle);
262 		handle = NULL;
263 		mlog_errno(status);
264 		goto bail;
265 	}
266 
267 	status = ocfs2_claim_clusters(osb,
268 				      handle,
269 				      ac,
270 				      le16_to_cpu(cl->cl_cpg),
271 				      &bit_off,
272 				      &num_bits);
273 	if (status < 0) {
274 		if (status != -ENOSPC)
275 			mlog_errno(status);
276 		goto bail;
277 	}
278 
279 	alloc_rec = ocfs2_find_smallest_chain(cl);
280 
281 	/* setup the group */
282 	bg_blkno = ocfs2_clusters_to_blocks(osb->sb, bit_off);
283 	mlog(0, "new descriptor, record %u, at block %"MLFu64"\n",
284 	     alloc_rec, bg_blkno);
285 
286 	bg_bh = sb_getblk(osb->sb, bg_blkno);
287 	if (!bg_bh) {
288 		status = -EIO;
289 		mlog_errno(status);
290 		goto bail;
291 	}
292 	ocfs2_set_new_buffer_uptodate(alloc_inode, bg_bh);
293 
294 	status = ocfs2_block_group_fill(handle,
295 					alloc_inode,
296 					bg_bh,
297 					bg_blkno,
298 					alloc_rec,
299 					cl);
300 	if (status < 0) {
301 		mlog_errno(status);
302 		goto bail;
303 	}
304 
305 	bg = (struct ocfs2_group_desc *) bg_bh->b_data;
306 
307 	status = ocfs2_journal_access(handle, alloc_inode,
308 				      bh, OCFS2_JOURNAL_ACCESS_WRITE);
309 	if (status < 0) {
310 		mlog_errno(status);
311 		goto bail;
312 	}
313 
314 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_free,
315 		     le16_to_cpu(bg->bg_free_bits_count));
316 	le32_add_cpu(&cl->cl_recs[alloc_rec].c_total, le16_to_cpu(bg->bg_bits));
317 	cl->cl_recs[alloc_rec].c_blkno  = cpu_to_le64(bg_blkno);
318 	if (le16_to_cpu(cl->cl_next_free_rec) < le16_to_cpu(cl->cl_count))
319 		le16_add_cpu(&cl->cl_next_free_rec, 1);
320 
321 	le32_add_cpu(&fe->id1.bitmap1.i_used, le16_to_cpu(bg->bg_bits) -
322 					le16_to_cpu(bg->bg_free_bits_count));
323 	le32_add_cpu(&fe->id1.bitmap1.i_total, le16_to_cpu(bg->bg_bits));
324 	le32_add_cpu(&fe->i_clusters, le16_to_cpu(cl->cl_cpg));
325 
326 	status = ocfs2_journal_dirty(handle, bh);
327 	if (status < 0) {
328 		mlog_errno(status);
329 		goto bail;
330 	}
331 
332 	spin_lock(&OCFS2_I(alloc_inode)->ip_lock);
333 	OCFS2_I(alloc_inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
334 	fe->i_size = cpu_to_le64(ocfs2_clusters_to_bytes(alloc_inode->i_sb,
335 					     le32_to_cpu(fe->i_clusters)));
336 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
337 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
338 	alloc_inode->i_blocks =
339 		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
340 
341 	status = 0;
342 bail:
343 	if (handle)
344 		ocfs2_commit_trans(handle);
345 
346 	if (ac)
347 		ocfs2_free_alloc_context(ac);
348 
349 	if (bg_bh)
350 		brelse(bg_bh);
351 
352 	mlog_exit(status);
353 	return status;
354 }
355 
356 static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
357 				       struct ocfs2_alloc_context *ac)
358 {
359 	int status;
360 	u32 bits_wanted = ac->ac_bits_wanted;
361 	struct inode *alloc_inode = ac->ac_inode;
362 	struct buffer_head *bh = NULL;
363 	struct ocfs2_journal_handle *handle = ac->ac_handle;
364 	struct ocfs2_dinode *fe;
365 	u32 free_bits;
366 
367 	mlog_entry_void();
368 
369 	BUG_ON(handle->flags & OCFS2_HANDLE_STARTED);
370 
371 	ocfs2_handle_add_inode(handle, alloc_inode);
372 	status = ocfs2_meta_lock(alloc_inode, handle, &bh, 1);
373 	if (status < 0) {
374 		mlog_errno(status);
375 		goto bail;
376 	}
377 
378 	fe = (struct ocfs2_dinode *) bh->b_data;
379 	if (!OCFS2_IS_VALID_DINODE(fe)) {
380 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
381 		status = -EIO;
382 		goto bail;
383 	}
384 	if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
385 		ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator "
386 			    "# %"MLFu64, le64_to_cpu(fe->i_blkno));
387 		status = -EIO;
388 		goto bail;
389 	}
390 
391 	free_bits = le32_to_cpu(fe->id1.bitmap1.i_total) -
392 		le32_to_cpu(fe->id1.bitmap1.i_used);
393 
394 	if (bits_wanted > free_bits) {
395 		/* cluster bitmap never grows */
396 		if (ocfs2_is_cluster_bitmap(alloc_inode)) {
397 			mlog(0, "Disk Full: wanted=%u, free_bits=%u\n",
398 			     bits_wanted, free_bits);
399 			status = -ENOSPC;
400 			goto bail;
401 		}
402 
403 		status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
404 		if (status < 0) {
405 			if (status != -ENOSPC)
406 				mlog_errno(status);
407 			goto bail;
408 		}
409 		atomic_inc(&osb->alloc_stats.bg_extends);
410 
411 		/* You should never ask for this much metadata */
412 		BUG_ON(bits_wanted >
413 		       (le32_to_cpu(fe->id1.bitmap1.i_total)
414 			- le32_to_cpu(fe->id1.bitmap1.i_used)));
415 	}
416 
417 	get_bh(bh);
418 	ac->ac_bh = bh;
419 bail:
420 	if (bh)
421 		brelse(bh);
422 
423 	mlog_exit(status);
424 	return status;
425 }
426 
427 int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
428 			       struct ocfs2_journal_handle *handle,
429 			       struct ocfs2_dinode *fe,
430 			       struct ocfs2_alloc_context **ac)
431 {
432 	int status;
433 	struct inode *alloc_inode = NULL;
434 
435 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
436 	if (!(*ac)) {
437 		status = -ENOMEM;
438 		mlog_errno(status);
439 		goto bail;
440 	}
441 
442 	(*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe);
443 	(*ac)->ac_handle = handle;
444 	(*ac)->ac_which = OCFS2_AC_USE_META;
445 
446 #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS
447 	alloc_inode = ocfs2_get_system_file_inode(osb,
448 						  EXTENT_ALLOC_SYSTEM_INODE,
449 						  0);
450 #else
451 	alloc_inode = ocfs2_get_system_file_inode(osb,
452 						  EXTENT_ALLOC_SYSTEM_INODE,
453 						  osb->slot_num);
454 #endif
455 	if (!alloc_inode) {
456 		status = -ENOMEM;
457 		mlog_errno(status);
458 		goto bail;
459 	}
460 
461 	(*ac)->ac_inode = igrab(alloc_inode);
462 	(*ac)->ac_group_search = ocfs2_block_group_search;
463 
464 	status = ocfs2_reserve_suballoc_bits(osb, (*ac));
465 	if (status < 0) {
466 		if (status != -ENOSPC)
467 			mlog_errno(status);
468 		goto bail;
469 	}
470 
471 	status = 0;
472 bail:
473 	if ((status < 0) && *ac) {
474 		ocfs2_free_alloc_context(*ac);
475 		*ac = NULL;
476 	}
477 
478 	if (alloc_inode)
479 		iput(alloc_inode);
480 
481 	mlog_exit(status);
482 	return status;
483 }
484 
485 int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
486 			    struct ocfs2_journal_handle *handle,
487 			    struct ocfs2_alloc_context **ac)
488 {
489 	int status;
490 	struct inode *alloc_inode = NULL;
491 
492 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
493 	if (!(*ac)) {
494 		status = -ENOMEM;
495 		mlog_errno(status);
496 		goto bail;
497 	}
498 
499 	(*ac)->ac_bits_wanted = 1;
500 	(*ac)->ac_handle = handle;
501 	(*ac)->ac_which = OCFS2_AC_USE_INODE;
502 
503 	alloc_inode = ocfs2_get_system_file_inode(osb,
504 						  INODE_ALLOC_SYSTEM_INODE,
505 						  osb->slot_num);
506 	if (!alloc_inode) {
507 		status = -ENOMEM;
508 		mlog_errno(status);
509 		goto bail;
510 	}
511 
512 	(*ac)->ac_inode = igrab(alloc_inode);
513 	(*ac)->ac_group_search = ocfs2_block_group_search;
514 
515 	status = ocfs2_reserve_suballoc_bits(osb, *ac);
516 	if (status < 0) {
517 		if (status != -ENOSPC)
518 			mlog_errno(status);
519 		goto bail;
520 	}
521 
522 	status = 0;
523 bail:
524 	if ((status < 0) && *ac) {
525 		ocfs2_free_alloc_context(*ac);
526 		*ac = NULL;
527 	}
528 
529 	if (alloc_inode)
530 		iput(alloc_inode);
531 
532 	mlog_exit(status);
533 	return status;
534 }
535 
536 /* local alloc code has to do the same thing, so rather than do this
537  * twice.. */
538 int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
539 				      struct ocfs2_alloc_context *ac)
540 {
541 	int status;
542 
543 	ac->ac_inode = ocfs2_get_system_file_inode(osb,
544 						   GLOBAL_BITMAP_SYSTEM_INODE,
545 						   OCFS2_INVALID_SLOT);
546 	if (!ac->ac_inode) {
547 		status = -EINVAL;
548 		mlog(ML_ERROR, "Could not get bitmap inode!\n");
549 		goto bail;
550 	}
551 	ac->ac_which = OCFS2_AC_USE_MAIN;
552 	ac->ac_group_search = ocfs2_cluster_group_search;
553 
554 	status = ocfs2_reserve_suballoc_bits(osb, ac);
555 	if (status < 0 && status != -ENOSPC)
556 		mlog_errno(status);
557 bail:
558 	return status;
559 }
560 
561 /* Callers don't need to care which bitmap (local alloc or main) to
562  * use so we figure it out for them, but unfortunately this clutters
563  * things a bit. */
564 int ocfs2_reserve_clusters(struct ocfs2_super *osb,
565 			   struct ocfs2_journal_handle *handle,
566 			   u32 bits_wanted,
567 			   struct ocfs2_alloc_context **ac)
568 {
569 	int status;
570 
571 	mlog_entry_void();
572 
573 	BUG_ON(!handle);
574 
575 	*ac = kcalloc(1, sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
576 	if (!(*ac)) {
577 		status = -ENOMEM;
578 		mlog_errno(status);
579 		goto bail;
580 	}
581 
582 	(*ac)->ac_bits_wanted = bits_wanted;
583 	(*ac)->ac_handle = handle;
584 
585 	status = -ENOSPC;
586 	if (ocfs2_alloc_should_use_local(osb, bits_wanted)) {
587 		status = ocfs2_reserve_local_alloc_bits(osb,
588 							handle,
589 							bits_wanted,
590 							*ac);
591 		if ((status < 0) && (status != -ENOSPC)) {
592 			mlog_errno(status);
593 			goto bail;
594 		} else if (status == -ENOSPC) {
595 			/* reserve_local_bits will return enospc with
596 			 * the local alloc inode still locked, so we
597 			 * can change this safely here. */
598 			mlog(0, "Disabling local alloc\n");
599 			/* We set to OCFS2_LA_DISABLED so that umount
600 			 * can clean up what's left of the local
601 			 * allocation */
602 			osb->local_alloc_state = OCFS2_LA_DISABLED;
603 		}
604 	}
605 
606 	if (status == -ENOSPC) {
607 		status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
608 		if (status < 0) {
609 			if (status != -ENOSPC)
610 				mlog_errno(status);
611 			goto bail;
612 		}
613 	}
614 
615 	status = 0;
616 bail:
617 	if ((status < 0) && *ac) {
618 		ocfs2_free_alloc_context(*ac);
619 		*ac = NULL;
620 	}
621 
622 	mlog_exit(status);
623 	return status;
624 }
625 
626 /*
627  * More or less lifted from ext3. I'll leave their description below:
628  *
629  * "For ext3 allocations, we must not reuse any blocks which are
630  * allocated in the bitmap buffer's "last committed data" copy.  This
631  * prevents deletes from freeing up the page for reuse until we have
632  * committed the delete transaction.
633  *
634  * If we didn't do this, then deleting something and reallocating it as
635  * data would allow the old block to be overwritten before the
636  * transaction committed (because we force data to disk before commit).
637  * This would lead to corruption if we crashed between overwriting the
638  * data and committing the delete.
639  *
640  * @@@ We may want to make this allocation behaviour conditional on
641  * data-writes at some point, and disable it for metadata allocations or
642  * sync-data inodes."
643  *
644  * Note: OCFS2 already does this differently for metadata vs data
645  * allocations, as those bitmaps are seperate and undo access is never
646  * called on a metadata group descriptor.
647  */
648 static int ocfs2_test_bg_bit_allocatable(struct buffer_head *bg_bh,
649 					 int nr)
650 {
651 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
652 
653 	if (ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap))
654 		return 0;
655 	if (!buffer_jbd(bg_bh) || !bh2jh(bg_bh)->b_committed_data)
656 		return 1;
657 
658 	bg = (struct ocfs2_group_desc *) bh2jh(bg_bh)->b_committed_data;
659 	return !ocfs2_test_bit(nr, (unsigned long *)bg->bg_bitmap);
660 }
661 
662 static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
663 					     struct buffer_head *bg_bh,
664 					     unsigned int bits_wanted,
665 					     u16 *bit_off,
666 					     u16 *bits_found)
667 {
668 	void *bitmap;
669 	u16 best_offset, best_size;
670 	int offset, start, found, status = 0;
671 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
672 
673 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
674 		OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
675 		return -EIO;
676 	}
677 
678 	found = start = best_offset = best_size = 0;
679 	bitmap = bg->bg_bitmap;
680 
681 	while((offset = ocfs2_find_next_zero_bit(bitmap,
682 						 le16_to_cpu(bg->bg_bits),
683 						 start)) != -1) {
684 		if (offset == le16_to_cpu(bg->bg_bits))
685 			break;
686 
687 		if (!ocfs2_test_bg_bit_allocatable(bg_bh, offset)) {
688 			/* We found a zero, but we can't use it as it
689 			 * hasn't been put to disk yet! */
690 			found = 0;
691 			start = offset + 1;
692 		} else if (offset == start) {
693 			/* we found a zero */
694 			found++;
695 			/* move start to the next bit to test */
696 			start++;
697 		} else {
698 			/* got a zero after some ones */
699 			found = 1;
700 			start = offset + 1;
701 		}
702 		if (found > best_size) {
703 			best_size = found;
704 			best_offset = start - found;
705 		}
706 		/* we got everything we needed */
707 		if (found == bits_wanted) {
708 			/* mlog(0, "Found it all!\n"); */
709 			break;
710 		}
711 	}
712 
713 	/* XXX: I think the first clause is equivalent to the second
714 	 * 	- jlbec */
715 	if (found == bits_wanted) {
716 		*bit_off = start - found;
717 		*bits_found = found;
718 	} else if (best_size) {
719 		*bit_off = best_offset;
720 		*bits_found = best_size;
721 	} else {
722 		status = -ENOSPC;
723 		/* No error log here -- see the comment above
724 		 * ocfs2_test_bg_bit_allocatable */
725 	}
726 
727 	return status;
728 }
729 
730 static inline int ocfs2_block_group_set_bits(struct ocfs2_journal_handle *handle,
731 					     struct inode *alloc_inode,
732 					     struct ocfs2_group_desc *bg,
733 					     struct buffer_head *group_bh,
734 					     unsigned int bit_off,
735 					     unsigned int num_bits)
736 {
737 	int status;
738 	void *bitmap = bg->bg_bitmap;
739 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
740 
741 	mlog_entry_void();
742 
743 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
744 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
745 		status = -EIO;
746 		goto bail;
747 	}
748 	BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
749 
750 	mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
751 	     num_bits);
752 
753 	if (ocfs2_is_cluster_bitmap(alloc_inode))
754 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
755 
756 	status = ocfs2_journal_access(handle,
757 				      alloc_inode,
758 				      group_bh,
759 				      journal_type);
760 	if (status < 0) {
761 		mlog_errno(status);
762 		goto bail;
763 	}
764 
765 	le16_add_cpu(&bg->bg_free_bits_count, -num_bits);
766 
767 	while(num_bits--)
768 		ocfs2_set_bit(bit_off++, bitmap);
769 
770 	status = ocfs2_journal_dirty(handle,
771 				     group_bh);
772 	if (status < 0) {
773 		mlog_errno(status);
774 		goto bail;
775 	}
776 
777 bail:
778 	mlog_exit(status);
779 	return status;
780 }
781 
782 /* find the one with the most empty bits */
783 static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl)
784 {
785 	u16 curr, best;
786 
787 	BUG_ON(!cl->cl_next_free_rec);
788 
789 	best = curr = 0;
790 	while (curr < le16_to_cpu(cl->cl_next_free_rec)) {
791 		if (le32_to_cpu(cl->cl_recs[curr].c_free) >
792 		    le32_to_cpu(cl->cl_recs[best].c_free))
793 			best = curr;
794 		curr++;
795 	}
796 
797 	BUG_ON(best >= le16_to_cpu(cl->cl_next_free_rec));
798 	return best;
799 }
800 
801 static int ocfs2_relink_block_group(struct ocfs2_journal_handle *handle,
802 				    struct inode *alloc_inode,
803 				    struct buffer_head *fe_bh,
804 				    struct buffer_head *bg_bh,
805 				    struct buffer_head *prev_bg_bh,
806 				    u16 chain)
807 {
808 	int status;
809 	/* there is a really tiny chance the journal calls could fail,
810 	 * but we wouldn't want inconsistent blocks in *any* case. */
811 	u64 fe_ptr, bg_ptr, prev_bg_ptr;
812 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
813 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
814 	struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
815 
816 	if (!OCFS2_IS_VALID_DINODE(fe)) {
817 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
818 		status = -EIO;
819 		goto out;
820 	}
821 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
822 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
823 		status = -EIO;
824 		goto out;
825 	}
826 	if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
827 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
828 		status = -EIO;
829 		goto out;
830 	}
831 
832 	mlog(0, "In suballoc %"MLFu64", chain %u, move group %"MLFu64" to "
833 	     "top, prev = %"MLFu64"\n",
834 	     fe->i_blkno, chain, bg->bg_blkno, prev_bg->bg_blkno);
835 
836 	fe_ptr = le64_to_cpu(fe->id2.i_chain.cl_recs[chain].c_blkno);
837 	bg_ptr = le64_to_cpu(bg->bg_next_group);
838 	prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
839 
840 	status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
841 				      OCFS2_JOURNAL_ACCESS_WRITE);
842 	if (status < 0) {
843 		mlog_errno(status);
844 		goto out_rollback;
845 	}
846 
847 	prev_bg->bg_next_group = bg->bg_next_group;
848 
849 	status = ocfs2_journal_dirty(handle, prev_bg_bh);
850 	if (status < 0) {
851 		mlog_errno(status);
852 		goto out_rollback;
853 	}
854 
855 	status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
856 				      OCFS2_JOURNAL_ACCESS_WRITE);
857 	if (status < 0) {
858 		mlog_errno(status);
859 		goto out_rollback;
860 	}
861 
862 	bg->bg_next_group = fe->id2.i_chain.cl_recs[chain].c_blkno;
863 
864 	status = ocfs2_journal_dirty(handle, bg_bh);
865 	if (status < 0) {
866 		mlog_errno(status);
867 		goto out_rollback;
868 	}
869 
870 	status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
871 				      OCFS2_JOURNAL_ACCESS_WRITE);
872 	if (status < 0) {
873 		mlog_errno(status);
874 		goto out_rollback;
875 	}
876 
877 	fe->id2.i_chain.cl_recs[chain].c_blkno = bg->bg_blkno;
878 
879 	status = ocfs2_journal_dirty(handle, fe_bh);
880 	if (status < 0) {
881 		mlog_errno(status);
882 		goto out_rollback;
883 	}
884 
885 	status = 0;
886 out_rollback:
887 	if (status < 0) {
888 		fe->id2.i_chain.cl_recs[chain].c_blkno = cpu_to_le64(fe_ptr);
889 		bg->bg_next_group = cpu_to_le64(bg_ptr);
890 		prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
891 	}
892 out:
893 	mlog_exit(status);
894 	return status;
895 }
896 
897 static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg,
898 						     u32 wanted)
899 {
900 	return le16_to_cpu(bg->bg_free_bits_count) > wanted;
901 }
902 
903 /* return 0 on success, -ENOSPC to keep searching and any other < 0
904  * value on error. */
905 static int ocfs2_cluster_group_search(struct inode *inode,
906 				      struct buffer_head *group_bh,
907 				      u32 bits_wanted, u32 min_bits,
908 				      u16 *bit_off, u16 *bits_found)
909 {
910 	int search = -ENOSPC;
911 	int ret;
912 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
913 	u16 tmp_off, tmp_found;
914 
915 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
916 
917 	if (bg->bg_free_bits_count) {
918 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
919 							group_bh, bits_wanted,
920 							&tmp_off, &tmp_found);
921 		if (ret)
922 			return ret;
923 
924 		/* ocfs2_block_group_find_clear_bits() might
925 		 * return success, but we still want to return
926 		 * -ENOSPC unless it found the minimum number
927 		 * of bits. */
928 		if (min_bits <= tmp_found) {
929 			*bit_off = tmp_off;
930 			*bits_found = tmp_found;
931 			search = 0; /* success */
932 		}
933 	}
934 
935 	return search;
936 }
937 
938 static int ocfs2_block_group_search(struct inode *inode,
939 				    struct buffer_head *group_bh,
940 				    u32 bits_wanted, u32 min_bits,
941 				    u16 *bit_off, u16 *bits_found)
942 {
943 	int ret = -ENOSPC;
944 	struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data;
945 
946 	BUG_ON(min_bits != 1);
947 	BUG_ON(ocfs2_is_cluster_bitmap(inode));
948 
949 	if (bg->bg_free_bits_count)
950 		ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb),
951 							group_bh, bits_wanted,
952 							bit_off, bits_found);
953 
954 	return ret;
955 }
956 
957 static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
958 			      u32 bits_wanted,
959 			      u32 min_bits,
960 			      u16 *bit_off,
961 			      unsigned int *num_bits,
962 			      u64 *bg_blkno)
963 {
964 	int status;
965 	u16 chain, tmp_bits;
966 	u32 tmp_used;
967 	u64 next_group;
968 	struct ocfs2_journal_handle *handle = ac->ac_handle;
969 	struct inode *alloc_inode = ac->ac_inode;
970 	struct buffer_head *group_bh = NULL;
971 	struct buffer_head *prev_group_bh = NULL;
972 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
973 	struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
974 	struct ocfs2_group_desc *bg;
975 
976 	chain = ac->ac_chain;
977 	mlog(0, "trying to alloc %u bits from chain %u, inode %"MLFu64"\n",
978 	     bits_wanted, chain, OCFS2_I(alloc_inode)->ip_blkno);
979 
980 	status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
981 				  le64_to_cpu(cl->cl_recs[chain].c_blkno),
982 				  &group_bh, OCFS2_BH_CACHED, alloc_inode);
983 	if (status < 0) {
984 		mlog_errno(status);
985 		goto bail;
986 	}
987 	bg = (struct ocfs2_group_desc *) group_bh->b_data;
988 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
989 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
990 		status = -EIO;
991 		goto bail;
992 	}
993 
994 	status = -ENOSPC;
995 	/* for now, the chain search is a bit simplistic. We just use
996 	 * the 1st group with any empty bits. */
997 	while ((status = ac->ac_group_search(alloc_inode, group_bh,
998 					     bits_wanted, min_bits, bit_off,
999 					     &tmp_bits)) == -ENOSPC) {
1000 		if (!bg->bg_next_group)
1001 			break;
1002 
1003 		if (prev_group_bh) {
1004 			brelse(prev_group_bh);
1005 			prev_group_bh = NULL;
1006 		}
1007 		next_group = le64_to_cpu(bg->bg_next_group);
1008 		prev_group_bh = group_bh;
1009 		group_bh = NULL;
1010 		status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb),
1011 					  next_group, &group_bh,
1012 					  OCFS2_BH_CACHED, alloc_inode);
1013 		if (status < 0) {
1014 			mlog_errno(status);
1015 			goto bail;
1016 		}
1017 		bg = (struct ocfs2_group_desc *) group_bh->b_data;
1018 		if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1019 			OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1020 			status = -EIO;
1021 			goto bail;
1022 		}
1023 	}
1024 	if (status < 0) {
1025 		if (status != -ENOSPC)
1026 			mlog_errno(status);
1027 		goto bail;
1028 	}
1029 
1030 	mlog(0, "alloc succeeds: we give %u bits from block group %"MLFu64"\n",
1031 	     tmp_bits, bg->bg_blkno);
1032 
1033 	*num_bits = tmp_bits;
1034 
1035 	BUG_ON(*num_bits == 0);
1036 
1037 	/*
1038 	 * Keep track of previous block descriptor read. When
1039 	 * we find a target, if we have read more than X
1040 	 * number of descriptors, and the target is reasonably
1041 	 * empty, relink him to top of his chain.
1042 	 *
1043 	 * We've read 0 extra blocks and only send one more to
1044 	 * the transaction, yet the next guy to search has a
1045 	 * much easier time.
1046 	 *
1047 	 * Do this *after* figuring out how many bits we're taking out
1048 	 * of our target group.
1049 	 */
1050 	if (ac->ac_allow_chain_relink &&
1051 	    (prev_group_bh) &&
1052 	    (ocfs2_block_group_reasonably_empty(bg, *num_bits))) {
1053 		status = ocfs2_relink_block_group(handle, alloc_inode,
1054 						  ac->ac_bh, group_bh,
1055 						  prev_group_bh, chain);
1056 		if (status < 0) {
1057 			mlog_errno(status);
1058 			goto bail;
1059 		}
1060 	}
1061 
1062 	/* Ok, claim our bits now: set the info on dinode, chainlist
1063 	 * and then the group */
1064 	status = ocfs2_journal_access(handle,
1065 				      alloc_inode,
1066 				      ac->ac_bh,
1067 				      OCFS2_JOURNAL_ACCESS_WRITE);
1068 	if (status < 0) {
1069 		mlog_errno(status);
1070 		goto bail;
1071 	}
1072 
1073 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1074 	fe->id1.bitmap1.i_used = cpu_to_le32(*num_bits + tmp_used);
1075 	le32_add_cpu(&cl->cl_recs[chain].c_free, -(*num_bits));
1076 
1077 	status = ocfs2_journal_dirty(handle,
1078 				     ac->ac_bh);
1079 	if (status < 0) {
1080 		mlog_errno(status);
1081 		goto bail;
1082 	}
1083 
1084 	status = ocfs2_block_group_set_bits(handle,
1085 					    alloc_inode,
1086 					    bg,
1087 					    group_bh,
1088 					    *bit_off,
1089 					    *num_bits);
1090 	if (status < 0) {
1091 		mlog_errno(status);
1092 		goto bail;
1093 	}
1094 
1095 	mlog(0, "Allocated %u bits from suballocator %"MLFu64"\n",
1096 	     *num_bits, fe->i_blkno);
1097 
1098 	*bg_blkno = le64_to_cpu(bg->bg_blkno);
1099 bail:
1100 	if (group_bh)
1101 		brelse(group_bh);
1102 	if (prev_group_bh)
1103 		brelse(prev_group_bh);
1104 
1105 	mlog_exit(status);
1106 	return status;
1107 }
1108 
1109 /* will give out up to bits_wanted contiguous bits. */
1110 static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
1111 				     struct ocfs2_alloc_context *ac,
1112 				     u32 bits_wanted,
1113 				     u32 min_bits,
1114 				     u16 *bit_off,
1115 				     unsigned int *num_bits,
1116 				     u64 *bg_blkno)
1117 {
1118 	int status;
1119 	u16 victim, i;
1120 	struct ocfs2_chain_list *cl;
1121 	struct ocfs2_dinode *fe;
1122 
1123 	mlog_entry_void();
1124 
1125 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1126 	BUG_ON(bits_wanted > (ac->ac_bits_wanted - ac->ac_bits_given));
1127 	BUG_ON(!ac->ac_bh);
1128 
1129 	fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
1130 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1131 		OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
1132 		status = -EIO;
1133 		goto bail;
1134 	}
1135 	if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
1136 	    le32_to_cpu(fe->id1.bitmap1.i_total)) {
1137 		ocfs2_error(osb->sb, "Chain allocator dinode %"MLFu64" has %u"
1138 			    "used bits but only %u total.",
1139 			    le64_to_cpu(fe->i_blkno),
1140 			    le32_to_cpu(fe->id1.bitmap1.i_used),
1141 			    le32_to_cpu(fe->id1.bitmap1.i_total));
1142 		status = -EIO;
1143 		goto bail;
1144 	}
1145 
1146 	cl = (struct ocfs2_chain_list *) &fe->id2.i_chain;
1147 
1148 	victim = ocfs2_find_victim_chain(cl);
1149 	ac->ac_chain = victim;
1150 	ac->ac_allow_chain_relink = 1;
1151 
1152 	status = ocfs2_search_chain(ac, bits_wanted, min_bits, bit_off,
1153 				    num_bits, bg_blkno);
1154 	if (!status)
1155 		goto bail;
1156 	if (status < 0 && status != -ENOSPC) {
1157 		mlog_errno(status);
1158 		goto bail;
1159 	}
1160 
1161 	mlog(0, "Search of victim chain %u came up with nothing, "
1162 	     "trying all chains now.\n", victim);
1163 
1164 	/* If we didn't pick a good victim, then just default to
1165 	 * searching each chain in order. Don't allow chain relinking
1166 	 * because we only calculate enough journal credits for one
1167 	 * relink per alloc. */
1168 	ac->ac_allow_chain_relink = 0;
1169 	for (i = 0; i < le16_to_cpu(cl->cl_next_free_rec); i ++) {
1170 		if (i == victim)
1171 			continue;
1172 		if (!cl->cl_recs[i].c_free)
1173 			continue;
1174 
1175 		ac->ac_chain = i;
1176 		status = ocfs2_search_chain(ac, bits_wanted, min_bits,
1177 					    bit_off, num_bits,
1178 					    bg_blkno);
1179 		if (!status)
1180 			break;
1181 		if (status < 0 && status != -ENOSPC) {
1182 			mlog_errno(status);
1183 			goto bail;
1184 		}
1185 	}
1186 bail:
1187 
1188 	mlog_exit(status);
1189 	return status;
1190 }
1191 
1192 int ocfs2_claim_metadata(struct ocfs2_super *osb,
1193 			 struct ocfs2_journal_handle *handle,
1194 			 struct ocfs2_alloc_context *ac,
1195 			 u32 bits_wanted,
1196 			 u16 *suballoc_bit_start,
1197 			 unsigned int *num_bits,
1198 			 u64 *blkno_start)
1199 {
1200 	int status;
1201 	u64 bg_blkno;
1202 
1203 	BUG_ON(!ac);
1204 	BUG_ON(ac->ac_bits_wanted < (ac->ac_bits_given + bits_wanted));
1205 	BUG_ON(ac->ac_which != OCFS2_AC_USE_META);
1206 	BUG_ON(ac->ac_handle != handle);
1207 
1208 	status = ocfs2_claim_suballoc_bits(osb,
1209 					   ac,
1210 					   bits_wanted,
1211 					   1,
1212 					   suballoc_bit_start,
1213 					   num_bits,
1214 					   &bg_blkno);
1215 	if (status < 0) {
1216 		mlog_errno(status);
1217 		goto bail;
1218 	}
1219 	atomic_inc(&osb->alloc_stats.bg_allocs);
1220 
1221 	*blkno_start = bg_blkno + (u64) *suballoc_bit_start;
1222 	ac->ac_bits_given += (*num_bits);
1223 	status = 0;
1224 bail:
1225 	mlog_exit(status);
1226 	return status;
1227 }
1228 
1229 int ocfs2_claim_new_inode(struct ocfs2_super *osb,
1230 			  struct ocfs2_journal_handle *handle,
1231 			  struct ocfs2_alloc_context *ac,
1232 			  u16 *suballoc_bit,
1233 			  u64 *fe_blkno)
1234 {
1235 	int status;
1236 	unsigned int num_bits;
1237 	u64 bg_blkno;
1238 
1239 	mlog_entry_void();
1240 
1241 	BUG_ON(!ac);
1242 	BUG_ON(ac->ac_bits_given != 0);
1243 	BUG_ON(ac->ac_bits_wanted != 1);
1244 	BUG_ON(ac->ac_which != OCFS2_AC_USE_INODE);
1245 	BUG_ON(ac->ac_handle != handle);
1246 
1247 	status = ocfs2_claim_suballoc_bits(osb,
1248 					   ac,
1249 					   1,
1250 					   1,
1251 					   suballoc_bit,
1252 					   &num_bits,
1253 					   &bg_blkno);
1254 	if (status < 0) {
1255 		mlog_errno(status);
1256 		goto bail;
1257 	}
1258 	atomic_inc(&osb->alloc_stats.bg_allocs);
1259 
1260 	BUG_ON(num_bits != 1);
1261 
1262 	*fe_blkno = bg_blkno + (u64) (*suballoc_bit);
1263 	ac->ac_bits_given++;
1264 	status = 0;
1265 bail:
1266 	mlog_exit(status);
1267 	return status;
1268 }
1269 
1270 /* translate a group desc. blkno and it's bitmap offset into
1271  * disk cluster offset. */
1272 static inline u32 ocfs2_desc_bitmap_to_cluster_off(struct inode *inode,
1273 						   u64 bg_blkno,
1274 						   u16 bg_bit_off)
1275 {
1276 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1277 	u32 cluster = 0;
1278 
1279 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1280 
1281 	if (bg_blkno != osb->first_cluster_group_blkno)
1282 		cluster = ocfs2_blocks_to_clusters(inode->i_sb, bg_blkno);
1283 	cluster += (u32) bg_bit_off;
1284 	return cluster;
1285 }
1286 
1287 /* given a cluster offset, calculate which block group it belongs to
1288  * and return that block offset. */
1289 static inline u64 ocfs2_which_cluster_group(struct inode *inode,
1290 					    u32 cluster)
1291 {
1292 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1293 	u32 group_no;
1294 
1295 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1296 
1297 	group_no = cluster / osb->bitmap_cpg;
1298 	if (!group_no)
1299 		return osb->first_cluster_group_blkno;
1300 	return ocfs2_clusters_to_blocks(inode->i_sb,
1301 					group_no * osb->bitmap_cpg);
1302 }
1303 
1304 /* given the block number of a cluster start, calculate which cluster
1305  * group and descriptor bitmap offset that corresponds to. */
1306 static inline void ocfs2_block_to_cluster_group(struct inode *inode,
1307 						u64 data_blkno,
1308 						u64 *bg_blkno,
1309 						u16 *bg_bit_off)
1310 {
1311 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1312 	u32 data_cluster = ocfs2_blocks_to_clusters(osb->sb, data_blkno);
1313 
1314 	BUG_ON(!ocfs2_is_cluster_bitmap(inode));
1315 
1316 	*bg_blkno = ocfs2_which_cluster_group(inode,
1317 					      data_cluster);
1318 
1319 	if (*bg_blkno == osb->first_cluster_group_blkno)
1320 		*bg_bit_off = (u16) data_cluster;
1321 	else
1322 		*bg_bit_off = (u16) ocfs2_blocks_to_clusters(osb->sb,
1323 							     data_blkno - *bg_blkno);
1324 }
1325 
1326 /*
1327  * min_bits - minimum contiguous chunk from this total allocation we
1328  * can handle. set to what we asked for originally for a full
1329  * contig. allocation, set to '1' to indicate we can deal with extents
1330  * of any size.
1331  */
1332 int ocfs2_claim_clusters(struct ocfs2_super *osb,
1333 			 struct ocfs2_journal_handle *handle,
1334 			 struct ocfs2_alloc_context *ac,
1335 			 u32 min_clusters,
1336 			 u32 *cluster_start,
1337 			 u32 *num_clusters)
1338 {
1339 	int status;
1340 	unsigned int bits_wanted = ac->ac_bits_wanted - ac->ac_bits_given;
1341 	u64 bg_blkno;
1342 	u16 bg_bit_off;
1343 
1344 	mlog_entry_void();
1345 
1346 	BUG_ON(!ac);
1347 	BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted);
1348 
1349 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL
1350 	       && ac->ac_which != OCFS2_AC_USE_MAIN);
1351 	BUG_ON(ac->ac_handle != handle);
1352 
1353 	if (ac->ac_which == OCFS2_AC_USE_LOCAL) {
1354 		status = ocfs2_claim_local_alloc_bits(osb,
1355 						      handle,
1356 						      ac,
1357 						      bits_wanted,
1358 						      cluster_start,
1359 						      num_clusters);
1360 		if (!status)
1361 			atomic_inc(&osb->alloc_stats.local_data);
1362 	} else {
1363 		if (min_clusters > (osb->bitmap_cpg - 1)) {
1364 			/* The only paths asking for contiguousness
1365 			 * should know about this already. */
1366 			mlog(ML_ERROR, "minimum allocation requested exceeds "
1367 				       "group bitmap size!");
1368 			status = -ENOSPC;
1369 			goto bail;
1370 		}
1371 		/* clamp the current request down to a realistic size. */
1372 		if (bits_wanted > (osb->bitmap_cpg - 1))
1373 			bits_wanted = osb->bitmap_cpg - 1;
1374 
1375 		status = ocfs2_claim_suballoc_bits(osb,
1376 						   ac,
1377 						   bits_wanted,
1378 						   min_clusters,
1379 						   &bg_bit_off,
1380 						   num_clusters,
1381 						   &bg_blkno);
1382 		if (!status) {
1383 			*cluster_start =
1384 				ocfs2_desc_bitmap_to_cluster_off(ac->ac_inode,
1385 								 bg_blkno,
1386 								 bg_bit_off);
1387 			atomic_inc(&osb->alloc_stats.bitmap_data);
1388 		}
1389 	}
1390 	if (status < 0) {
1391 		if (status != -ENOSPC)
1392 			mlog_errno(status);
1393 		goto bail;
1394 	}
1395 
1396 	ac->ac_bits_given += *num_clusters;
1397 
1398 bail:
1399 	mlog_exit(status);
1400 	return status;
1401 }
1402 
1403 static inline int ocfs2_block_group_clear_bits(struct ocfs2_journal_handle *handle,
1404 					       struct inode *alloc_inode,
1405 					       struct ocfs2_group_desc *bg,
1406 					       struct buffer_head *group_bh,
1407 					       unsigned int bit_off,
1408 					       unsigned int num_bits)
1409 {
1410 	int status;
1411 	unsigned int tmp;
1412 	int journal_type = OCFS2_JOURNAL_ACCESS_WRITE;
1413 	struct ocfs2_group_desc *undo_bg = NULL;
1414 
1415 	mlog_entry_void();
1416 
1417 	if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
1418 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
1419 		status = -EIO;
1420 		goto bail;
1421 	}
1422 
1423 	mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
1424 
1425 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1426 		journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
1427 
1428 	status = ocfs2_journal_access(handle, alloc_inode, group_bh,
1429 				      journal_type);
1430 	if (status < 0) {
1431 		mlog_errno(status);
1432 		goto bail;
1433 	}
1434 
1435 	if (ocfs2_is_cluster_bitmap(alloc_inode))
1436 		undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data;
1437 
1438 	tmp = num_bits;
1439 	while(tmp--) {
1440 		ocfs2_clear_bit((bit_off + tmp),
1441 				(unsigned long *) bg->bg_bitmap);
1442 		if (ocfs2_is_cluster_bitmap(alloc_inode))
1443 			ocfs2_set_bit(bit_off + tmp,
1444 				      (unsigned long *) undo_bg->bg_bitmap);
1445 	}
1446 	le16_add_cpu(&bg->bg_free_bits_count, num_bits);
1447 
1448 	status = ocfs2_journal_dirty(handle, group_bh);
1449 	if (status < 0)
1450 		mlog_errno(status);
1451 bail:
1452 	return status;
1453 }
1454 
1455 /*
1456  * expects the suballoc inode to already be locked.
1457  */
1458 static int ocfs2_free_suballoc_bits(struct ocfs2_journal_handle *handle,
1459 				    struct inode *alloc_inode,
1460 				    struct buffer_head *alloc_bh,
1461 				    unsigned int start_bit,
1462 				    u64 bg_blkno,
1463 				    unsigned int count)
1464 {
1465 	int status = 0;
1466 	u32 tmp_used;
1467 	struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb);
1468 	struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data;
1469 	struct ocfs2_chain_list *cl = &fe->id2.i_chain;
1470 	struct buffer_head *group_bh = NULL;
1471 	struct ocfs2_group_desc *group;
1472 
1473 	mlog_entry_void();
1474 
1475 	if (!OCFS2_IS_VALID_DINODE(fe)) {
1476 		OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
1477 		status = -EIO;
1478 		goto bail;
1479 	}
1480 	BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
1481 
1482 	mlog(0, "suballocator %"MLFu64": freeing %u bits from group %"MLFu64
1483 	     ", starting at %u\n",
1484 	     OCFS2_I(alloc_inode)->ip_blkno, count, bg_blkno,
1485 	     start_bit);
1486 
1487 	status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED,
1488 				  alloc_inode);
1489 	if (status < 0) {
1490 		mlog_errno(status);
1491 		goto bail;
1492 	}
1493 
1494 	group = (struct ocfs2_group_desc *) group_bh->b_data;
1495 	if (!OCFS2_IS_VALID_GROUP_DESC(group)) {
1496 		OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, group);
1497 		status = -EIO;
1498 		goto bail;
1499 	}
1500 	BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
1501 
1502 	status = ocfs2_block_group_clear_bits(handle, alloc_inode,
1503 					      group, group_bh,
1504 					      start_bit, count);
1505 	if (status < 0) {
1506 		mlog_errno(status);
1507 		goto bail;
1508 	}
1509 
1510 	status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
1511 				      OCFS2_JOURNAL_ACCESS_WRITE);
1512 	if (status < 0) {
1513 		mlog_errno(status);
1514 		goto bail;
1515 	}
1516 
1517 	le32_add_cpu(&cl->cl_recs[le16_to_cpu(group->bg_chain)].c_free,
1518 		     count);
1519 	tmp_used = le32_to_cpu(fe->id1.bitmap1.i_used);
1520 	fe->id1.bitmap1.i_used = cpu_to_le32(tmp_used - count);
1521 
1522 	status = ocfs2_journal_dirty(handle, alloc_bh);
1523 	if (status < 0) {
1524 		mlog_errno(status);
1525 		goto bail;
1526 	}
1527 
1528 bail:
1529 	if (group_bh)
1530 		brelse(group_bh);
1531 
1532 	mlog_exit(status);
1533 	return status;
1534 }
1535 
1536 static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit)
1537 {
1538 	u64 group = block - (u64) bit;
1539 
1540 	return group;
1541 }
1542 
1543 int ocfs2_free_dinode(struct ocfs2_journal_handle *handle,
1544 		      struct inode *inode_alloc_inode,
1545 		      struct buffer_head *inode_alloc_bh,
1546 		      struct ocfs2_dinode *di)
1547 {
1548 	u64 blk = le64_to_cpu(di->i_blkno);
1549 	u16 bit = le16_to_cpu(di->i_suballoc_bit);
1550 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1551 
1552 	return ocfs2_free_suballoc_bits(handle, inode_alloc_inode,
1553 					inode_alloc_bh, bit, bg_blkno, 1);
1554 }
1555 
1556 int ocfs2_free_extent_block(struct ocfs2_journal_handle *handle,
1557 			    struct inode *eb_alloc_inode,
1558 			    struct buffer_head *eb_alloc_bh,
1559 			    struct ocfs2_extent_block *eb)
1560 {
1561 	u64 blk = le64_to_cpu(eb->h_blkno);
1562 	u16 bit = le16_to_cpu(eb->h_suballoc_bit);
1563 	u64 bg_blkno = ocfs2_which_suballoc_group(blk, bit);
1564 
1565 	return ocfs2_free_suballoc_bits(handle, eb_alloc_inode, eb_alloc_bh,
1566 					bit, bg_blkno, 1);
1567 }
1568 
1569 int ocfs2_free_clusters(struct ocfs2_journal_handle *handle,
1570 		       struct inode *bitmap_inode,
1571 		       struct buffer_head *bitmap_bh,
1572 		       u64 start_blk,
1573 		       unsigned int num_clusters)
1574 {
1575 	int status;
1576 	u16 bg_start_bit;
1577 	u64 bg_blkno;
1578 	struct ocfs2_dinode *fe;
1579 
1580 	/* You can't ever have a contiguous set of clusters
1581 	 * bigger than a block group bitmap so we never have to worry
1582 	 * about looping on them. */
1583 
1584 	mlog_entry_void();
1585 
1586 	/* This is expensive. We can safely remove once this stuff has
1587 	 * gotten tested really well. */
1588 	BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk)));
1589 
1590 	fe = (struct ocfs2_dinode *) bitmap_bh->b_data;
1591 
1592 	ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno,
1593 				     &bg_start_bit);
1594 
1595 	mlog(0, "want to free %u clusters starting at block %"MLFu64"\n",
1596 	     num_clusters, start_blk);
1597 	mlog(0, "bg_blkno = %"MLFu64", bg_start_bit = %u\n",
1598 	     bg_blkno, bg_start_bit);
1599 
1600 	status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh,
1601 					  bg_start_bit, bg_blkno,
1602 					  num_clusters);
1603 	if (status < 0)
1604 		mlog_errno(status);
1605 
1606 	mlog_exit(status);
1607 	return status;
1608 }
1609 
1610 static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg)
1611 {
1612 	printk("Block Group:\n");
1613 	printk("bg_signature:       %s\n", bg->bg_signature);
1614 	printk("bg_size:            %u\n", bg->bg_size);
1615 	printk("bg_bits:            %u\n", bg->bg_bits);
1616 	printk("bg_free_bits_count: %u\n", bg->bg_free_bits_count);
1617 	printk("bg_chain:           %u\n", bg->bg_chain);
1618 	printk("bg_generation:      %u\n", le32_to_cpu(bg->bg_generation));
1619 	printk("bg_next_group:      %"MLFu64"\n", bg->bg_next_group);
1620 	printk("bg_parent_dinode:   %"MLFu64"\n", bg->bg_parent_dinode);
1621 	printk("bg_blkno:           %"MLFu64"\n", bg->bg_blkno);
1622 }
1623 
1624 static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe)
1625 {
1626 	int i;
1627 
1628 	printk("Suballoc Inode %"MLFu64":\n", fe->i_blkno);
1629 	printk("i_signature:                  %s\n", fe->i_signature);
1630 	printk("i_size:                       %"MLFu64"\n", fe->i_size);
1631 	printk("i_clusters:                   %u\n", fe->i_clusters);
1632 	printk("i_generation:                 %u\n",
1633 	       le32_to_cpu(fe->i_generation));
1634 	printk("id1.bitmap1.i_used:           %u\n",
1635 	       le32_to_cpu(fe->id1.bitmap1.i_used));
1636 	printk("id1.bitmap1.i_total:          %u\n",
1637 	       le32_to_cpu(fe->id1.bitmap1.i_total));
1638 	printk("id2.i_chain.cl_cpg:           %u\n", fe->id2.i_chain.cl_cpg);
1639 	printk("id2.i_chain.cl_bpc:           %u\n", fe->id2.i_chain.cl_bpc);
1640 	printk("id2.i_chain.cl_count:         %u\n", fe->id2.i_chain.cl_count);
1641 	printk("id2.i_chain.cl_next_free_rec: %u\n",
1642 	       fe->id2.i_chain.cl_next_free_rec);
1643 	for(i = 0; i < fe->id2.i_chain.cl_next_free_rec; i++) {
1644 		printk("fe->id2.i_chain.cl_recs[%d].c_free:  %u\n", i,
1645 		       fe->id2.i_chain.cl_recs[i].c_free);
1646 		printk("fe->id2.i_chain.cl_recs[%d].c_total: %u\n", i,
1647 		       fe->id2.i_chain.cl_recs[i].c_total);
1648 		printk("fe->id2.i_chain.cl_recs[%d].c_blkno: %"MLFu64"\n", i,
1649 		       fe->id2.i_chain.cl_recs[i].c_blkno);
1650 	}
1651 }
1652