xref: /openbmc/linux/fs/ocfs2/localalloc.c (revision 9ac8d3fb)
1 /* -*- mode: c; c-basic-offset: 8; -*-
2  * vim: noexpandtab sw=8 ts=8 sts=0:
3  *
4  * localalloc.c
5  *
6  * Node local data allocation
7  *
8  * Copyright (C) 2002, 2004 Oracle.  All rights reserved.
9  *
10  * This program is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU General Public
12  * License as published by the Free Software Foundation; either
13  * version 2 of the License, or (at your option) any later version.
14  *
15  * This program is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * General Public License for more details.
19  *
20  * You should have received a copy of the GNU General Public
21  * License along with this program; if not, write to the
22  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23  * Boston, MA 021110-1307, USA.
24  */
25 
26 #include <linux/fs.h>
27 #include <linux/types.h>
28 #include <linux/slab.h>
29 #include <linux/highmem.h>
30 #include <linux/bitops.h>
31 #include <linux/debugfs.h>
32 
33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
34 #include <cluster/masklog.h>
35 
36 #include "ocfs2.h"
37 
38 #include "alloc.h"
39 #include "dlmglue.h"
40 #include "inode.h"
41 #include "journal.h"
42 #include "localalloc.h"
43 #include "suballoc.h"
44 #include "super.h"
45 #include "sysfile.h"
46 
47 #include "buffer_head_io.h"
48 
/* Shorthand for the local alloc area (id2.i_lab) embedded in a dinode. */
#define OCFS2_LOCAL_ALLOC(dinode)	(&((dinode)->id2.i_lab))

/* Number of bits currently set in the local alloc bitmap. */
static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc);

/* Start of a run of 'numbits' clear bits in the window, or -1 if none. */
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
					     struct ocfs2_dinode *alloc,
					     u32 numbits);

/* Zero the window totals, the window offset and the bitmap itself. */
static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc);

/* Free the clear (unclaimed) bits of a window back to the main bitmap. */
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
				    handle_t *handle,
				    struct ocfs2_dinode *alloc,
				    struct inode *main_bm_inode,
				    struct buffer_head *main_bm_bh);

/* Allocate an alloc context and reserve main bitmap bits for a window. */
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
						struct ocfs2_alloc_context **ac,
						struct inode **bitmap_inode,
						struct buffer_head **bitmap_bh);

/* Claim clusters for a new local alloc window. */
static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
					handle_t *handle,
					struct ocfs2_alloc_context *ac);

/* Called when the current window cannot satisfy a request. */
static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
					  struct inode *local_alloc_inode);
76 
77 #ifdef CONFIG_OCFS2_FS_STATS
78 
79 static int ocfs2_la_debug_open(struct inode *inode, struct file *file)
80 {
81 	file->private_data = inode->i_private;
82 	return 0;
83 }
84 
85 #define LA_DEBUG_BUF_SZ	PAGE_CACHE_SIZE
86 #define LA_DEBUG_VER	1
87 static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf,
88 				   size_t count, loff_t *ppos)
89 {
90 	static DEFINE_MUTEX(la_debug_mutex);
91 	struct ocfs2_super *osb = file->private_data;
92 	int written, ret;
93 	char *buf = osb->local_alloc_debug_buf;
94 
95 	mutex_lock(&la_debug_mutex);
96 	memset(buf, 0, LA_DEBUG_BUF_SZ);
97 
98 	written = snprintf(buf, LA_DEBUG_BUF_SZ,
99 			   "0x%x\t0x%llx\t%u\t%u\t0x%x\n",
100 			   LA_DEBUG_VER,
101 			   (unsigned long long)osb->la_last_gd,
102 			   osb->local_alloc_default_bits,
103 			   osb->local_alloc_bits, osb->local_alloc_state);
104 
105 	ret = simple_read_from_buffer(userbuf, count, ppos, buf, written);
106 
107 	mutex_unlock(&la_debug_mutex);
108 	return ret;
109 }
110 
/* File operations for the read-only "local_alloc_stats" debugfs file. */
static const struct file_operations ocfs2_la_debug_fops = {
	.open =		ocfs2_la_debug_open,
	.read =		ocfs2_la_debug_read,
};
115 
116 static void ocfs2_init_la_debug(struct ocfs2_super *osb)
117 {
118 	osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS);
119 	if (!osb->local_alloc_debug_buf)
120 		return;
121 
122 	osb->local_alloc_debug = debugfs_create_file("local_alloc_stats",
123 						     S_IFREG|S_IRUSR,
124 						     osb->osb_debug_root,
125 						     osb,
126 						     &ocfs2_la_debug_fops);
127 	if (!osb->local_alloc_debug) {
128 		kfree(osb->local_alloc_debug_buf);
129 		osb->local_alloc_debug_buf = NULL;
130 	}
131 }
132 
133 static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
134 {
135 	if (osb->local_alloc_debug)
136 		debugfs_remove(osb->local_alloc_debug);
137 
138 	if (osb->local_alloc_debug_buf)
139 		kfree(osb->local_alloc_debug_buf);
140 
141 	osb->local_alloc_debug_buf = NULL;
142 	osb->local_alloc_debug = NULL;
143 }
144 #else	/* CONFIG_OCFS2_FS_STATS */
/* Stats support is compiled out: nothing to set up. */
static void ocfs2_init_la_debug(struct ocfs2_super *osb)
{
}
/* Stats support is compiled out: nothing to tear down. */
static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb)
{
}
153 #endif
154 
155 static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb)
156 {
157 	return (osb->local_alloc_state == OCFS2_LA_THROTTLED ||
158 		osb->local_alloc_state == OCFS2_LA_ENABLED);
159 }
160 
161 void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
162 				      unsigned int num_clusters)
163 {
164 	spin_lock(&osb->osb_lock);
165 	if (osb->local_alloc_state == OCFS2_LA_DISABLED ||
166 	    osb->local_alloc_state == OCFS2_LA_THROTTLED)
167 		if (num_clusters >= osb->local_alloc_default_bits) {
168 			cancel_delayed_work(&osb->la_enable_wq);
169 			osb->local_alloc_state = OCFS2_LA_ENABLED;
170 		}
171 	spin_unlock(&osb->osb_lock);
172 }
173 
174 void ocfs2_la_enable_worker(struct work_struct *work)
175 {
176 	struct ocfs2_super *osb =
177 		container_of(work, struct ocfs2_super,
178 			     la_enable_wq.work);
179 	spin_lock(&osb->osb_lock);
180 	osb->local_alloc_state = OCFS2_LA_ENABLED;
181 	spin_unlock(&osb->osb_lock);
182 }
183 
184 /*
185  * Tell us whether a given allocation should use the local alloc
186  * file. Otherwise, it has to go to the main bitmap.
187  *
188  * This function does semi-dirty reads of local alloc size and state!
189  * This is ok however, as the values are re-checked once under mutex.
190  */
191 int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits)
192 {
193 	int ret = 0;
194 	int la_bits;
195 
196 	spin_lock(&osb->osb_lock);
197 	la_bits = osb->local_alloc_bits;
198 
199 	if (!ocfs2_la_state_enabled(osb))
200 		goto bail;
201 
202 	/* la_bits should be at least twice the size (in clusters) of
203 	 * a new block group. We want to be sure block group
204 	 * allocations go through the local alloc, so allow an
205 	 * allocation to take up to half the bitmap. */
206 	if (bits > (la_bits / 2))
207 		goto bail;
208 
209 	ret = 1;
210 bail:
211 	mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n",
212 	     osb->local_alloc_state, (unsigned long long)bits, la_bits, ret);
213 	spin_unlock(&osb->osb_lock);
214 	return ret;
215 }
216 
217 int ocfs2_load_local_alloc(struct ocfs2_super *osb)
218 {
219 	int status = 0;
220 	struct ocfs2_dinode *alloc = NULL;
221 	struct buffer_head *alloc_bh = NULL;
222 	u32 num_used;
223 	struct inode *inode = NULL;
224 	struct ocfs2_local_alloc *la;
225 
226 	mlog_entry_void();
227 
228 	ocfs2_init_la_debug(osb);
229 
230 	if (osb->local_alloc_bits == 0)
231 		goto bail;
232 
233 	if (osb->local_alloc_bits >= osb->bitmap_cpg) {
234 		mlog(ML_NOTICE, "Requested local alloc window %d is larger "
235 		     "than max possible %u. Using defaults.\n",
236 		     osb->local_alloc_bits, (osb->bitmap_cpg - 1));
237 		osb->local_alloc_bits =
238 			ocfs2_megabytes_to_clusters(osb->sb,
239 						    OCFS2_DEFAULT_LOCAL_ALLOC_SIZE);
240 	}
241 
242 	/* read the alloc off disk */
243 	inode = ocfs2_get_system_file_inode(osb, LOCAL_ALLOC_SYSTEM_INODE,
244 					    osb->slot_num);
245 	if (!inode) {
246 		status = -EINVAL;
247 		mlog_errno(status);
248 		goto bail;
249 	}
250 
251 	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
252 				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
253 	if (status < 0) {
254 		mlog_errno(status);
255 		goto bail;
256 	}
257 
258 	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
259 	la = OCFS2_LOCAL_ALLOC(alloc);
260 
261 	if (!(le32_to_cpu(alloc->i_flags) &
262 	    (OCFS2_LOCAL_ALLOC_FL|OCFS2_BITMAP_FL))) {
263 		mlog(ML_ERROR, "Invalid local alloc inode, %llu\n",
264 		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
265 		status = -EINVAL;
266 		goto bail;
267 	}
268 
269 	if ((la->la_size == 0) ||
270 	    (le16_to_cpu(la->la_size) > ocfs2_local_alloc_size(inode->i_sb))) {
271 		mlog(ML_ERROR, "Local alloc size is invalid (la_size = %u)\n",
272 		     le16_to_cpu(la->la_size));
273 		status = -EINVAL;
274 		goto bail;
275 	}
276 
277 	/* do a little verification. */
278 	num_used = ocfs2_local_alloc_count_bits(alloc);
279 
280 	/* hopefully the local alloc has always been recovered before
281 	 * we load it. */
282 	if (num_used
283 	    || alloc->id1.bitmap1.i_used
284 	    || alloc->id1.bitmap1.i_total
285 	    || la->la_bm_off)
286 		mlog(ML_ERROR, "Local alloc hasn't been recovered!\n"
287 		     "found = %u, set = %u, taken = %u, off = %u\n",
288 		     num_used, le32_to_cpu(alloc->id1.bitmap1.i_used),
289 		     le32_to_cpu(alloc->id1.bitmap1.i_total),
290 		     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
291 
292 	osb->local_alloc_bh = alloc_bh;
293 	osb->local_alloc_state = OCFS2_LA_ENABLED;
294 
295 bail:
296 	if (status < 0)
297 		brelse(alloc_bh);
298 	if (inode)
299 		iput(inode);
300 
301 	if (status < 0)
302 		ocfs2_shutdown_la_debug(osb);
303 
304 	mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits);
305 
306 	mlog_exit(status);
307 	return status;
308 }
309 
/*
 * return any unused bits to the bitmap and write out a clean
 * local_alloc.
 *
 * The buffer head cached on the osb (osb->local_alloc_bh) is used.
 * Once the cleared inode has been journaled, that buffer is brelse'd,
 * osb->local_alloc_bh is set to NULL and the state becomes
 * OCFS2_LA_UNUSED.
 *
 * Errors are only logged; nothing is returned to the caller.
 */
void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
{
	int status;
	handle_t *handle;
	struct inode *local_alloc_inode = NULL;
	struct buffer_head *bh = NULL;
	struct buffer_head *main_bm_bh = NULL;
	struct inode *main_bm_inode = NULL;
	struct ocfs2_dinode *alloc_copy = NULL;
	struct ocfs2_dinode *alloc = NULL;

	mlog_entry_void();

	/* Make sure the delayed re-enable work cannot run behind us. */
	cancel_delayed_work(&osb->la_enable_wq);
	flush_workqueue(ocfs2_wq);

	ocfs2_shutdown_la_debug(osb);

	/* Local alloc was never loaded (or is already shut down). */
	if (osb->local_alloc_state == OCFS2_LA_UNUSED)
		goto out;

	local_alloc_inode =
		ocfs2_get_system_file_inode(osb,
					    LOCAL_ALLOC_SYSTEM_INODE,
					    osb->slot_num);
	if (!local_alloc_inode) {
		status = -ENOENT;
		mlog_errno(status);
		goto out;
	}

	/* From here on nobody should start new local allocations. */
	osb->local_alloc_state = OCFS2_LA_DISABLED;

	main_bm_inode = ocfs2_get_system_file_inode(osb,
						    GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
	if (!main_bm_inode) {
		status = -EINVAL;
		mlog_errno(status);
		goto out;
	}

	mutex_lock(&main_bm_inode->i_mutex);

	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	/* WINDOW_MOVE_CREDITS is a bit heavy... */
	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
	if (IS_ERR(handle)) {
		mlog_errno(PTR_ERR(handle));
		handle = NULL;
		goto out_unlock;
	}

	bh = osb->local_alloc_bh;
	alloc = (struct ocfs2_dinode *) bh->b_data;

	/* Keep a private copy of the window: the on-disk inode is
	 * cleared first, then the copy drives the sync to the main
	 * bitmap. */
	alloc_copy = kmalloc(bh->b_size, GFP_NOFS);
	if (!alloc_copy) {
		status = -ENOMEM;
		goto out_commit;
	}
	memcpy(alloc_copy, alloc, bh->b_size);

	status = ocfs2_journal_access(handle, local_alloc_inode, bh,
				      OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	ocfs2_clear_local_alloc(alloc);

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto out_commit;
	}

	/* The osb's reference on the local alloc buffer ends here. */
	brelse(bh);
	osb->local_alloc_bh = NULL;
	osb->local_alloc_state = OCFS2_LA_UNUSED;

	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
					  main_bm_inode, main_bm_bh);
	if (status < 0)
		mlog_errno(status);

out_commit:
	ocfs2_commit_trans(osb, handle);

out_unlock:
	brelse(main_bm_bh);

	ocfs2_inode_unlock(main_bm_inode, 1);

out_mutex:
	mutex_unlock(&main_bm_inode->i_mutex);
	iput(main_bm_inode);

out:
	if (local_alloc_inode)
		iput(local_alloc_inode);

	if (alloc_copy)
		kfree(alloc_copy);

	mlog_exit_void();
}
430 
/*
 * We want to free the bitmap bits outside of any recovery context as
 * we'll need a cluster lock to do so, but we must clear the local
 * alloc before giving up the recovered node's journal. To solve this,
 * we kmalloc a copy of the local alloc before it's changed for the
 * caller to process with ocfs2_complete_local_alloc_recovery
 *
 * On success *alloc_copy points to the kmalloc'd copy of the inode
 * block as it was before clearing. On failure *alloc_copy is NULL and
 * a negative errno is returned.
 */
int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
				     int slot_num,
				     struct ocfs2_dinode **alloc_copy)
{
	int status = 0;
	struct buffer_head *alloc_bh = NULL;
	struct inode *inode = NULL;
	struct ocfs2_dinode *alloc;

	mlog_entry("(slot_num = %d)\n", slot_num);

	*alloc_copy = NULL;

	inode = ocfs2_get_system_file_inode(osb,
					    LOCAL_ALLOC_SYSTEM_INODE,
					    slot_num);
	if (!inode) {
		status = -EINVAL;
		mlog_errno(status);
		goto bail;
	}

	/* Taken right after the lookup, so every later bail path
	 * reaches the unlock below with the mutex held. */
	mutex_lock(&inode->i_mutex);

	/* Bypass the cache: we want what is really on disk. */
	status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
				   &alloc_bh, OCFS2_BH_IGNORE_CACHE);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*alloc_copy = kmalloc(alloc_bh->b_size, GFP_KERNEL);
	if (!(*alloc_copy)) {
		status = -ENOMEM;
		goto bail;
	}
	memcpy((*alloc_copy), alloc_bh->b_data, alloc_bh->b_size);

	/* Copy taken; now stamp a clean local alloc on disk. */
	alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
	ocfs2_clear_local_alloc(alloc);

	status = ocfs2_write_block(osb, alloc_bh, inode);
	if (status < 0)
		mlog_errno(status);

bail:
	/* The copy is only handed out when everything succeeded. */
	if ((status < 0) && (*alloc_copy)) {
		kfree(*alloc_copy);
		*alloc_copy = NULL;
	}

	brelse(alloc_bh);

	if (inode) {
		mutex_unlock(&inode->i_mutex);
		iput(inode);
	}

	mlog_exit(status);
	return status;
}
499 
/*
 * Step 2: By now, we've completed the journal recovery, we've stamped
 * a clean local alloc on disk and dropped the node out of the
 * recovery map. Dlm locks will no longer stall, so lets clear out the
 * main bitmap.
 *
 * 'alloc' is the pre-clear copy of the local alloc inode; its unused
 * bits are freed back to the global bitmap in one synchronous
 * transaction. The copy is not freed here.
 *
 * Returns 0 on success or a negative errno.
 */
int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
					struct ocfs2_dinode *alloc)
{
	int status;
	handle_t *handle;
	struct buffer_head *main_bm_bh = NULL;
	struct inode *main_bm_inode;

	mlog_entry_void();

	main_bm_inode = ocfs2_get_system_file_inode(osb,
						    GLOBAL_BITMAP_SYSTEM_INODE,
						    OCFS2_INVALID_SLOT);
	if (!main_bm_inode) {
		status = -EINVAL;
		mlog_errno(status);
		goto out;
	}

	mutex_lock(&main_bm_inode->i_mutex);

	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
	if (status < 0) {
		mlog_errno(status);
		goto out_mutex;
	}

	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto out_unlock;
	}

	/* we want the bitmap change to be recorded on disk asap */
	handle->h_sync = 1;

	status = ocfs2_sync_local_to_main(osb, handle, alloc,
					  main_bm_inode, main_bm_bh);
	if (status < 0)
		mlog_errno(status);

	/* Commit even when the sync failed; status keeps the error. */
	ocfs2_commit_trans(osb, handle);

out_unlock:
	ocfs2_inode_unlock(main_bm_inode, 1);

out_mutex:
	mutex_unlock(&main_bm_inode->i_mutex);

	brelse(main_bm_bh);

	iput(main_bm_inode);

out:
	/* Only reset the inode steal slot after a fully successful run. */
	if (!status)
		ocfs2_init_inode_steal_slot(osb);
	mlog_exit(status);
	return status;
}
567 
568 /* Check to see if the local alloc window is within ac->ac_max_block */
569 static int ocfs2_local_alloc_in_range(struct inode *inode,
570 				      struct ocfs2_alloc_context *ac,
571 				      u32 bits_wanted)
572 {
573 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
574 	struct ocfs2_dinode *alloc;
575 	struct ocfs2_local_alloc *la;
576 	int start;
577 	u64 block_off;
578 
579 	if (!ac->ac_max_block)
580 		return 1;
581 
582 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
583 	la = OCFS2_LOCAL_ALLOC(alloc);
584 
585 	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
586 	if (start == -1) {
587 		mlog_errno(-ENOSPC);
588 		return 0;
589 	}
590 
591 	/*
592 	 * Converting (bm_off + start + bits_wanted) to blocks gives us
593 	 * the blkno just past our actual allocation.  This is perfect
594 	 * to compare with ac_max_block.
595 	 */
596 	block_off = ocfs2_clusters_to_blocks(inode->i_sb,
597 					     le32_to_cpu(la->la_bm_off) +
598 					     start + bits_wanted);
599 	mlog(0, "Checking %llu against %llu\n",
600 	     (unsigned long long)block_off,
601 	     (unsigned long long)ac->ac_max_block);
602 	if (block_off > ac->ac_max_block)
603 		return 0;
604 
605 	return 1;
606 }
607 
/*
 * make sure we've got at least bits_wanted contiguous bits in the
 * local alloc. You lose them when you drop i_mutex.
 *
 * We will add ourselves to the transaction passed in, but may start
 * our own in order to shift windows.
 *
 * On success (return 0) the local alloc inode's i_mutex is still held
 * and the inode reference is handed over in ac->ac_inode, together
 * with an extra reference on the local alloc buffer in ac->ac_bh.
 * On failure both the mutex and the inode reference are dropped here.
 *
 * Returns -ENOSPC when the local alloc cannot serve the request,
 * -EFBIG when the window lies beyond ac->ac_max_block, or another
 * negative errno.
 */
int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
				   u32 bits_wanted,
				   struct ocfs2_alloc_context *ac)
{
	int status;
	struct ocfs2_dinode *alloc;
	struct inode *local_alloc_inode;
	unsigned int free_bits;

	mlog_entry_void();

	BUG_ON(!ac);

	local_alloc_inode =
		ocfs2_get_system_file_inode(osb,
					    LOCAL_ALLOC_SYSTEM_INODE,
					    osb->slot_num);
	if (!local_alloc_inode) {
		status = -ENOENT;
		mlog_errno(status);
		goto bail;
	}

	mutex_lock(&local_alloc_inode->i_mutex);

	/*
	 * We must double check state and allocator bits because
	 * another process may have changed them while holding i_mutex.
	 */
	spin_lock(&osb->osb_lock);
	if (!ocfs2_la_state_enabled(osb) ||
	    (bits_wanted > osb->local_alloc_bits)) {
		spin_unlock(&osb->osb_lock);
		status = -ENOSPC;
		goto bail;
	}
	spin_unlock(&osb->osb_lock);

	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;

#ifdef CONFIG_OCFS2_DEBUG_FS
	/* NOTE(review): the message below talks about "free bits" but
	 * the value compared and printed is i_used. */
	if (le32_to_cpu(alloc->id1.bitmap1.i_used) !=
	    ocfs2_local_alloc_count_bits(alloc)) {
		ocfs2_error(osb->sb, "local alloc inode %llu says it has "
			    "%u free bits, but a count shows %u",
			    (unsigned long long)le64_to_cpu(alloc->i_blkno),
			    le32_to_cpu(alloc->id1.bitmap1.i_used),
			    ocfs2_local_alloc_count_bits(alloc));
		status = -EIO;
		goto bail;
	}
#endif

	free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
		le32_to_cpu(alloc->id1.bitmap1.i_used);
	if (bits_wanted > free_bits) {
		/* uhoh, window change time. */
		status =
			ocfs2_local_alloc_slide_window(osb, local_alloc_inode);
		if (status < 0) {
			if (status != -ENOSPC)
				mlog_errno(status);
			goto bail;
		}

		/*
		 * Under certain conditions, the window slide code
		 * might have reduced the number of bits available or
		 * disabled the local alloc entirely. Re-check
		 * here and return -ENOSPC if necessary.
		 */
		status = -ENOSPC;
		if (!ocfs2_la_state_enabled(osb))
			goto bail;

		free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) -
			le32_to_cpu(alloc->id1.bitmap1.i_used);
		if (bits_wanted > free_bits)
			goto bail;
	}

	if (ac->ac_max_block)
		mlog(0, "Calling in_range for max block %llu\n",
		     (unsigned long long)ac->ac_max_block);

	if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac,
					bits_wanted)) {
		/*
		 * The window is outside ac->ac_max_block.
		 * This errno tells the caller to keep localalloc enabled
		 * but to get the allocation from the main bitmap.
		 */
		status = -EFBIG;
		goto bail;
	}

	ac->ac_inode = local_alloc_inode;
	/* We should never use localalloc from another slot */
	ac->ac_alloc_slot = osb->slot_num;
	ac->ac_which = OCFS2_AC_USE_LOCAL;
	get_bh(osb->local_alloc_bh);
	ac->ac_bh = osb->local_alloc_bh;
	status = 0;
bail:
	/* Only undo the lock and the reference on failure; on success
	 * both travel with the alloc context. */
	if (status < 0 && local_alloc_inode) {
		mutex_unlock(&local_alloc_inode->i_mutex);
		iput(local_alloc_inode);
	}

	mlog(0, "bits=%d, slot=%d, ret=%d\n", bits_wanted, osb->slot_num,
	     status);

	mlog_exit(status);
	return status;
}
730 
731 int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
732 				 handle_t *handle,
733 				 struct ocfs2_alloc_context *ac,
734 				 u32 bits_wanted,
735 				 u32 *bit_off,
736 				 u32 *num_bits)
737 {
738 	int status, start;
739 	struct inode *local_alloc_inode;
740 	void *bitmap;
741 	struct ocfs2_dinode *alloc;
742 	struct ocfs2_local_alloc *la;
743 
744 	mlog_entry_void();
745 	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
746 
747 	local_alloc_inode = ac->ac_inode;
748 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
749 	la = OCFS2_LOCAL_ALLOC(alloc);
750 
751 	start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted);
752 	if (start == -1) {
753 		/* TODO: Shouldn't we just BUG here? */
754 		status = -ENOSPC;
755 		mlog_errno(status);
756 		goto bail;
757 	}
758 
759 	bitmap = la->la_bitmap;
760 	*bit_off = le32_to_cpu(la->la_bm_off) + start;
761 	/* local alloc is always contiguous by nature -- we never
762 	 * delete bits from it! */
763 	*num_bits = bits_wanted;
764 
765 	status = ocfs2_journal_access(handle, local_alloc_inode,
766 				      osb->local_alloc_bh,
767 				      OCFS2_JOURNAL_ACCESS_WRITE);
768 	if (status < 0) {
769 		mlog_errno(status);
770 		goto bail;
771 	}
772 
773 	while(bits_wanted--)
774 		ocfs2_set_bit(start++, bitmap);
775 
776 	le32_add_cpu(&alloc->id1.bitmap1.i_used, *num_bits);
777 
778 	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
779 	if (status < 0) {
780 		mlog_errno(status);
781 		goto bail;
782 	}
783 
784 	status = 0;
785 bail:
786 	mlog_exit(status);
787 	return status;
788 }
789 
790 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
791 {
792 	int i;
793 	u8 *buffer;
794 	u32 count = 0;
795 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
796 
797 	mlog_entry_void();
798 
799 	buffer = la->la_bitmap;
800 	for (i = 0; i < le16_to_cpu(la->la_size); i++)
801 		count += hweight8(buffer[i]);
802 
803 	mlog_exit(count);
804 	return count;
805 }
806 
/*
 * Look for the first run of 'numbits' contiguous clear bits within the
 * first i_total bits of the local alloc bitmap.
 *
 * Returns the bit offset of the start of the run, or -1 when the
 * window is empty (i_total == 0) or holds no such run.
 * 'osb' is not used by the search itself.
 */
static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb,
					     struct ocfs2_dinode *alloc,
					     u32 numbits)
{
	int numfound, bitoff, left, startoff, lastzero;
	void *bitmap = NULL;

	mlog_entry("(numbits wanted = %u)\n", numbits);

	if (!alloc->id1.bitmap1.i_total) {
		mlog(0, "No bits in my window!\n");
		bitoff = -1;
		goto bail;
	}

	bitmap = OCFS2_LOCAL_ALLOC(alloc)->la_bitmap;

	/*
	 * numfound: length of the run of zeros currently being tracked.
	 * startoff: where the next search starts, i.e. one past the
	 *           last zero accepted into the run.
	 * left:     number of valid bits in the window.
	 * lastzero is set but never read.
	 */
	numfound = bitoff = startoff = 0;
	lastzero = -1;
	left = le32_to_cpu(alloc->id1.bitmap1.i_total);
	while ((bitoff = ocfs2_find_next_zero_bit(bitmap, left, startoff)) != -1) {
		/* No zero bit left before the end of the window. */
		if (bitoff == left) {
			/* mlog(0, "bitoff (%d) == left", bitoff); */
			break;
		}
		/* mlog(0, "Found a zero: bitoff = %d, startoff = %d, "
		   "numfound = %d\n", bitoff, startoff, numfound);*/

		/* Ok, we found a zero bit... is it contig. or do we
		 * start over?*/
		if (bitoff == startoff) {
			/* we found a zero */
			numfound++;
			startoff++;
		} else {
			/* got a zero after some ones */
			numfound = 1;
			startoff = bitoff+1;
		}
		/* we got everything we needed */
		if (numfound == numbits) {
			/* mlog(0, "Found it all!\n"); */
			break;
		}
	}

	mlog(0, "Exiting loop, bitoff = %d, numfound = %d\n", bitoff,
	     numfound);

	/* startoff is one past the end of the run, so step back over
	 * the run to get its first bit. */
	if (numfound == numbits)
		bitoff = startoff - numfound;
	else
		bitoff = -1;

bail:
	mlog_exit(bitoff);
	return bitoff;
}
865 
866 static void ocfs2_clear_local_alloc(struct ocfs2_dinode *alloc)
867 {
868 	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);
869 	int i;
870 	mlog_entry_void();
871 
872 	alloc->id1.bitmap1.i_total = 0;
873 	alloc->id1.bitmap1.i_used = 0;
874 	la->la_bm_off = 0;
875 	for(i = 0; i < le16_to_cpu(la->la_size); i++)
876 		la->la_bitmap[i] = 0;
877 
878 	mlog_exit_void();
879 }
880 
#if 0
/* turn this on and uncomment below to aid debugging window shifts. */
/*
 * Debug helper (compiled out): BUG() if any of the 'count' bits
 * starting at bit 'start' of 'bitmap' is set, after printing which
 * one.
 */
static void ocfs2_verify_zero_bits(unsigned long *bitmap,
				   unsigned int start,
				   unsigned int count)
{
	unsigned int tmp = count;
	/* Walks the range from its last bit down to its first. */
	while(tmp--) {
		if (ocfs2_test_bit(start + tmp, bitmap)) {
			printk("ocfs2_verify_zero_bits: start = %u, count = "
			       "%u\n", start, count);
			printk("ocfs2_verify_zero_bits: bit %u is set!",
			       start + tmp);
			BUG();
		}
	}
}
#endif
899 
/*
 * sync the local alloc to main bitmap.
 *
 * assumes you've already locked the main bitmap -- the bitmap inode
 * passed is used for caching.
 *
 * Every run of clear bits in the window (clusters reserved for the
 * window but never handed out) is freed back to the main bitmap with
 * ocfs2_free_clusters() under 'handle'. Nothing is done when the
 * window is empty or completely used.
 *
 * Returns 0 on success or the first error from ocfs2_free_clusters().
 */
static int ocfs2_sync_local_to_main(struct ocfs2_super *osb,
				    handle_t *handle,
				    struct ocfs2_dinode *alloc,
				    struct inode *main_bm_inode,
				    struct buffer_head *main_bm_bh)
{
	int status = 0;
	int bit_off, left, count, start;
	u64 la_start_blk;
	u64 blkno;
	void *bitmap;
	struct ocfs2_local_alloc *la = OCFS2_LOCAL_ALLOC(alloc);

	mlog_entry("total = %u, used = %u\n",
		   le32_to_cpu(alloc->id1.bitmap1.i_total),
		   le32_to_cpu(alloc->id1.bitmap1.i_used));

	if (!alloc->id1.bitmap1.i_total) {
		mlog(0, "nothing to sync!\n");
		goto bail;
	}

	if (le32_to_cpu(alloc->id1.bitmap1.i_used) ==
	    le32_to_cpu(alloc->id1.bitmap1.i_total)) {
		mlog(0, "all bits were taken!\n");
		goto bail;
	}

	/* Block number at which the window starts on disk. */
	la_start_blk = ocfs2_clusters_to_blocks(osb->sb,
						le32_to_cpu(la->la_bm_off));
	bitmap = la->la_bitmap;
	/*
	 * count: length of the run of clear bits collected so far.
	 * start: one past the last clear bit of that run, so the run
	 *        covers [start - count, start).
	 * left:  number of valid bits in the window.
	 */
	start = count = bit_off = 0;
	left = le32_to_cpu(alloc->id1.bitmap1.i_total);

	while ((bit_off = ocfs2_find_next_zero_bit(bitmap, left, start))
	       != -1) {
		/* The next clear bit directly extends the current run. */
		if ((bit_off < left) && (bit_off == start)) {
			count++;
			start++;
			continue;
		}
		/* The run was interrupted (or the window ended): give
		 * back whatever has been collected. */
		if (count) {
			blkno = la_start_blk +
				ocfs2_clusters_to_blocks(osb->sb,
							 start - count);

			mlog(0, "freeing %u bits starting at local alloc bit "
			     "%u (la_start_blk = %llu, blkno = %llu)\n",
			     count, start - count,
			     (unsigned long long)la_start_blk,
			     (unsigned long long)blkno);

			status = ocfs2_free_clusters(handle, main_bm_inode,
						     main_bm_bh, blkno, count);
			if (status < 0) {
				mlog_errno(status);
				goto bail;
			}
		}
		/* No clear bit left inside the window: done. */
		if (bit_off >= left)
			break;
		/* Start a new run at the clear bit just found. */
		count = 1;
		start = bit_off + 1;
	}

bail:
	mlog_exit(status);
	return status;
}
975 
/* Events that drive the sizing of the next local alloc window. */
enum ocfs2_la_event {
	/* Normal window slide. */
	OCFS2_LA_EVENT_SLIDE,

	/* The global bitmap has enough bits theoretically free, but a
	 * contiguous allocation could not be found. */
	OCFS2_LA_EVENT_FRAGMENTED,

	/* Global bitmap doesn't have enough bits free to satisfy our
	 * request. */
	OCFS2_LA_EVENT_ENOSPC,
};

/* Delay used when queueing the work that re-enables the local alloc. */
#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ)
988 /*
989  * Given an event, calculate the size of our next local alloc window.
990  *
991  * This should always be called under i_mutex of the local alloc inode
992  * so that local alloc disabling doesn't race with processes trying to
993  * use the allocator.
994  *
995  * Returns the state which the local alloc was left in. This value can
996  * be ignored by some paths.
997  */
998 static int ocfs2_recalc_la_window(struct ocfs2_super *osb,
999 				  enum ocfs2_la_event event)
1000 {
1001 	unsigned int bits;
1002 	int state;
1003 
1004 	spin_lock(&osb->osb_lock);
1005 	if (osb->local_alloc_state == OCFS2_LA_DISABLED) {
1006 		WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED);
1007 		goto out_unlock;
1008 	}
1009 
1010 	/*
1011 	 * ENOSPC and fragmentation are treated similarly for now.
1012 	 */
1013 	if (event == OCFS2_LA_EVENT_ENOSPC ||
1014 	    event == OCFS2_LA_EVENT_FRAGMENTED) {
1015 		/*
1016 		 * We ran out of contiguous space in the primary
1017 		 * bitmap. Drastically reduce the number of bits used
1018 		 * by local alloc until we have to disable it.
1019 		 */
1020 		bits = osb->local_alloc_bits >> 1;
1021 		if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) {
1022 			/*
1023 			 * By setting state to THROTTLED, we'll keep
1024 			 * the number of local alloc bits used down
1025 			 * until an event occurs which would give us
1026 			 * reason to assume the bitmap situation might
1027 			 * have changed.
1028 			 */
1029 			osb->local_alloc_state = OCFS2_LA_THROTTLED;
1030 			osb->local_alloc_bits = bits;
1031 		} else {
1032 			osb->local_alloc_state = OCFS2_LA_DISABLED;
1033 		}
1034 		queue_delayed_work(ocfs2_wq, &osb->la_enable_wq,
1035 				   OCFS2_LA_ENABLE_INTERVAL);
1036 		goto out_unlock;
1037 	}
1038 
1039 	/*
1040 	 * Don't increase the size of the local alloc window until we
1041 	 * know we might be able to fulfill the request. Otherwise, we
1042 	 * risk bouncing around the global bitmap during periods of
1043 	 * low space.
1044 	 */
1045 	if (osb->local_alloc_state != OCFS2_LA_THROTTLED)
1046 		osb->local_alloc_bits = osb->local_alloc_default_bits;
1047 
1048 out_unlock:
1049 	state = osb->local_alloc_state;
1050 	spin_unlock(&osb->osb_lock);
1051 
1052 	return state;
1053 }
1054 
/*
 * Allocate an alloc context and reserve osb->local_alloc_bits bits in
 * the cluster bitmap for a new local alloc window.
 *
 * When the reservation fails with -ENOSPC the window is shrunk through
 * ocfs2_recalc_la_window() and the reservation retried, until it
 * either succeeds or the local alloc ends up disabled.
 *
 * On success *ac holds the context, and *bitmap_inode / *bitmap_bh
 * receive the context's inode and buffer head, each with an extra
 * reference taken here. On failure the context is freed and *ac is
 * NULL.
 *
 * Returns 0 on success or a negative errno.
 */
static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb,
						struct ocfs2_alloc_context **ac,
						struct inode **bitmap_inode,
						struct buffer_head **bitmap_bh)
{
	int status;

	*ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
	if (!(*ac)) {
		status = -ENOMEM;
		mlog_errno(status);
		goto bail;
	}

retry_enospc:
	/* Re-read each time: a retry has shrunk local_alloc_bits. */
	(*ac)->ac_bits_wanted = osb->local_alloc_bits;

	status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac);
	if (status == -ENOSPC) {
		/* Local alloc got disabled: give up with -ENOSPC. */
		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) ==
		    OCFS2_LA_DISABLED)
			goto bail;

		/* Drop what the failed attempt left in the context and
		 * start over with a zeroed one. */
		ocfs2_free_ac_resource(*ac);
		memset(*ac, 0, sizeof(struct ocfs2_alloc_context));
		goto retry_enospc;
	}
	if (status < 0) {
		mlog_errno(status);
		goto bail;
	}

	*bitmap_inode = (*ac)->ac_inode;
	igrab(*bitmap_inode);
	*bitmap_bh = (*ac)->ac_bh;
	get_bh(*bitmap_bh);
	status = 0;
bail:
	if ((status < 0) && *ac) {
		ocfs2_free_alloc_context(*ac);
		*ac = NULL;
	}

	mlog_exit(status);
	return status;
}
1101 
1102 /*
1103  * pass it the bitmap lock in lock_bh if you have it.
1104  */
1105 static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb,
1106 					handle_t *handle,
1107 					struct ocfs2_alloc_context *ac)
1108 {
1109 	int status = 0;
1110 	u32 cluster_off, cluster_count;
1111 	struct ocfs2_dinode *alloc = NULL;
1112 	struct ocfs2_local_alloc *la;
1113 
1114 	mlog_entry_void();
1115 
1116 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
1117 	la = OCFS2_LOCAL_ALLOC(alloc);
1118 
1119 	if (alloc->id1.bitmap1.i_total)
1120 		mlog(0, "asking me to alloc a new window over a non-empty "
1121 		     "one\n");
1122 
1123 	mlog(0, "Allocating %u clusters for a new window.\n",
1124 	     osb->local_alloc_bits);
1125 
1126 	/* Instruct the allocation code to try the most recently used
1127 	 * cluster group. We'll re-record the group used this pass
1128 	 * below. */
1129 	ac->ac_last_group = osb->la_last_gd;
1130 
1131 	/* we used the generic suballoc reserve function, but we set
1132 	 * everything up nicely, so there's no reason why we can't use
1133 	 * the more specific cluster api to claim bits. */
1134 	status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits,
1135 				      &cluster_off, &cluster_count);
1136 	if (status == -ENOSPC) {
1137 retry_enospc:
1138 		/*
1139 		 * Note: We could also try syncing the journal here to
1140 		 * allow use of any free bits which the current
1141 		 * transaction can't give us access to. --Mark
1142 		 */
1143 		if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) ==
1144 		    OCFS2_LA_DISABLED)
1145 			goto bail;
1146 
1147 		status = ocfs2_claim_clusters(osb, handle, ac,
1148 					      osb->local_alloc_bits,
1149 					      &cluster_off,
1150 					      &cluster_count);
1151 		if (status == -ENOSPC)
1152 			goto retry_enospc;
1153 		/*
1154 		 * We only shrunk the *minimum* number of in our
1155 		 * request - it's entirely possible that the allocator
1156 		 * might give us more than we asked for.
1157 		 */
1158 		if (status == 0) {
1159 			spin_lock(&osb->osb_lock);
1160 			osb->local_alloc_bits = cluster_count;
1161 			spin_unlock(&osb->osb_lock);
1162 		}
1163 	}
1164 	if (status < 0) {
1165 		if (status != -ENOSPC)
1166 			mlog_errno(status);
1167 		goto bail;
1168 	}
1169 
1170 	osb->la_last_gd = ac->ac_last_group;
1171 
1172 	la->la_bm_off = cpu_to_le32(cluster_off);
1173 	alloc->id1.bitmap1.i_total = cpu_to_le32(cluster_count);
1174 	/* just in case... In the future when we find space ourselves,
1175 	 * we don't have to get all contiguous -- but we'll have to
1176 	 * set all previously used bits in bitmap and update
1177 	 * la_bits_set before setting the bits in the main bitmap. */
1178 	alloc->id1.bitmap1.i_used = 0;
1179 	memset(OCFS2_LOCAL_ALLOC(alloc)->la_bitmap, 0,
1180 	       le16_to_cpu(la->la_size));
1181 
1182 	mlog(0, "New window allocated:\n");
1183 	mlog(0, "window la_bm_off = %u\n",
1184 	     OCFS2_LOCAL_ALLOC(alloc)->la_bm_off);
1185 	mlog(0, "window bits = %u\n", le32_to_cpu(alloc->id1.bitmap1.i_total));
1186 
1187 bail:
1188 	mlog_exit(status);
1189 	return status;
1190 }
1191 
1192 /* Note that we do *NOT* lock the local alloc inode here as
1193  * it's been locked already for us. */
1194 static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
1195 					  struct inode *local_alloc_inode)
1196 {
1197 	int status = 0;
1198 	struct buffer_head *main_bm_bh = NULL;
1199 	struct inode *main_bm_inode = NULL;
1200 	handle_t *handle = NULL;
1201 	struct ocfs2_dinode *alloc;
1202 	struct ocfs2_dinode *alloc_copy = NULL;
1203 	struct ocfs2_alloc_context *ac = NULL;
1204 
1205 	mlog_entry_void();
1206 
1207 	ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE);
1208 
1209 	/* This will lock the main bitmap for us. */
1210 	status = ocfs2_local_alloc_reserve_for_window(osb,
1211 						      &ac,
1212 						      &main_bm_inode,
1213 						      &main_bm_bh);
1214 	if (status < 0) {
1215 		if (status != -ENOSPC)
1216 			mlog_errno(status);
1217 		goto bail;
1218 	}
1219 
1220 	handle = ocfs2_start_trans(osb, OCFS2_WINDOW_MOVE_CREDITS);
1221 	if (IS_ERR(handle)) {
1222 		status = PTR_ERR(handle);
1223 		handle = NULL;
1224 		mlog_errno(status);
1225 		goto bail;
1226 	}
1227 
1228 	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
1229 
1230 	/* We want to clear the local alloc before doing anything
1231 	 * else, so that if we error later during this operation,
1232 	 * local alloc shutdown won't try to double free main bitmap
1233 	 * bits. Make a copy so the sync function knows which bits to
1234 	 * free. */
1235 	alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS);
1236 	if (!alloc_copy) {
1237 		status = -ENOMEM;
1238 		mlog_errno(status);
1239 		goto bail;
1240 	}
1241 	memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
1242 
1243 	status = ocfs2_journal_access(handle, local_alloc_inode,
1244 				      osb->local_alloc_bh,
1245 				      OCFS2_JOURNAL_ACCESS_WRITE);
1246 	if (status < 0) {
1247 		mlog_errno(status);
1248 		goto bail;
1249 	}
1250 
1251 	ocfs2_clear_local_alloc(alloc);
1252 
1253 	status = ocfs2_journal_dirty(handle, osb->local_alloc_bh);
1254 	if (status < 0) {
1255 		mlog_errno(status);
1256 		goto bail;
1257 	}
1258 
1259 	status = ocfs2_sync_local_to_main(osb, handle, alloc_copy,
1260 					  main_bm_inode, main_bm_bh);
1261 	if (status < 0) {
1262 		mlog_errno(status);
1263 		goto bail;
1264 	}
1265 
1266 	status = ocfs2_local_alloc_new_window(osb, handle, ac);
1267 	if (status < 0) {
1268 		if (status != -ENOSPC)
1269 			mlog_errno(status);
1270 		goto bail;
1271 	}
1272 
1273 	atomic_inc(&osb->alloc_stats.moves);
1274 
1275 	status = 0;
1276 bail:
1277 	if (handle)
1278 		ocfs2_commit_trans(osb, handle);
1279 
1280 	brelse(main_bm_bh);
1281 
1282 	if (main_bm_inode)
1283 		iput(main_bm_inode);
1284 
1285 	if (alloc_copy)
1286 		kfree(alloc_copy);
1287 
1288 	if (ac)
1289 		ocfs2_free_alloc_context(ac);
1290 
1291 	mlog_exit(status);
1292 	return status;
1293 }
1294 
1295