xref: /openbmc/linux/fs/btrfs/free-space-cache.c (revision e8e0929d)
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/math64.h>
22 #include "ctree.h"
23 #include "free-space-cache.h"
24 #include "transaction.h"
25 
26 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
27 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
28 
29 static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize,
30 					  u64 offset)
31 {
32 	BUG_ON(offset < bitmap_start);
33 	offset -= bitmap_start;
34 	return (unsigned long)(div64_u64(offset, sectorsize));
35 }
36 
37 static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize)
38 {
39 	return (unsigned long)(div64_u64(bytes, sectorsize));
40 }
41 
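/*
 * given a logical offset inside the block group, return the start offset
 * of the bitmap region that would cover it (bitmap regions are aligned
 * runs of BITS_PER_BITMAP * sectorsize bytes from the block group start).
 */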
42 static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
43 				   u64 offset)
44 {
45 	u64 bitmap_start;
46 	u64 bytes_per_bitmap;
47 
48 	bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize;
49 	bitmap_start = offset - block_group->key.objectid;
50 	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
51 	bitmap_start *= bytes_per_bitmap;
52 	bitmap_start += block_group->key.objectid;
53 
54 	return bitmap_start;
55 }
56 
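/*
 * insert a free space entry into the rbtree indexed by offset.  A bitmap
 * entry and an extent entry may share an offset; when they do, the extent
 * entry is kept to the left so a linear walk finds it first.
 */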
57 static int tree_insert_offset(struct rb_root *root, u64 offset,
58 			      struct rb_node *node, int bitmap)
59 {
60 	struct rb_node **p = &root->rb_node;
61 	struct rb_node *parent = NULL;
62 	struct btrfs_free_space *info;
63 
64 	while (*p) {
65 		parent = *p;
66 		info = rb_entry(parent, struct btrfs_free_space, offset_index);
67 
68 		if (offset < info->offset) {
69 			p = &(*p)->rb_left;
70 		} else if (offset > info->offset) {
71 			p = &(*p)->rb_right;
72 		} else {
73 			/*
74 			 * we could have a bitmap entry and an extent entry
75 			 * share the same offset.  If this is the case, we want
76 			 * the extent entry to always be found first if we do a
77 			 * linear search through the tree, since we want to have
78 			 * the quickest allocation time, and allocating from an
79 			 * extent is faster than allocating from a bitmap.  So
80 			 * if we're inserting a bitmap and we find an entry at
81 			 * this offset, we want to go right, or after this entry
82 			 * logically.  If we are inserting an extent and we've
83 			 * found a bitmap, we want to go left, or before
84 			 * logically.
85 			 */
86 			if (bitmap) {
87 				WARN_ON(info->bitmap);
88 				p = &(*p)->rb_right;
89 			} else {
90 				WARN_ON(!info->bitmap);
91 				p = &(*p)->rb_left;
92 			}
93 		}
94 	}
95 
96 	rb_link_node(node, parent, p);
97 	rb_insert_color(node, root);
98 
99 	return 0;
100 }
101 
102 /*
103  * searches the tree for the given offset.
104  *
105  * bitmap_only - only return a bitmap entry found at exactly this offset.
106  * fuzzy - if this is set we are trying to make an allocation, so just return
107  * an entry that covers the offset or, failing that, the next entry after it.
108  */
109 static struct btrfs_free_space *
110 tree_search_offset(struct btrfs_block_group_cache *block_group,
111 		   u64 offset, int bitmap_only, int fuzzy)
112 {
113 	struct rb_node *n = block_group->free_space_offset.rb_node;
114 	struct btrfs_free_space *entry, *prev = NULL;
115 
116 	/* find entry that is closest to the 'offset' */
117 	while (1) {
118 		if (!n) {
119 			entry = NULL;
120 			break;
121 		}
122 
123 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
124 		prev = entry;
125 
126 		if (offset < entry->offset)
127 			n = n->rb_left;
128 		else if (offset > entry->offset)
129 			n = n->rb_right;
130 		else
131 			break;
132 	}
133 
134 	if (bitmap_only) {
135 		if (!entry)
136 			return NULL;
137 		if (entry->bitmap)
138 			return entry;
139 
140 		/*
141 		 * a bitmap entry and an extent entry may share the same offset;
142 		 * in that case, the bitmap entry comes after the extent entry.
143 		 */
144 		n = rb_next(n);
145 		if (!n)
146 			return NULL;
147 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
148 		if (entry->offset != offset)
149 			return NULL;
150 
151 		WARN_ON(!entry->bitmap);
152 		return entry;
153 	} else if (entry) {
154 		if (entry->bitmap) {
155 			/*
156 			 * if the previous extent entry covers the offset,
157 			 * we should return it instead of the bitmap entry
158 			 */
159 			n = &entry->offset_index;
160 			while (1) {
161 				n = rb_prev(n);
162 				if (!n)
163 					break;
164 				prev = rb_entry(n, struct btrfs_free_space,
165 						offset_index);
166 				if (!prev->bitmap) {
167 					if (prev->offset + prev->bytes > offset)
168 						entry = prev;
169 					break;
170 				}
171 			}
172 		}
173 		return entry;
174 	}
175 
176 	if (!prev)
177 		return NULL;
178 
179 	/* find last entry before the 'offset' */
180 	entry = prev;
181 	if (entry->offset > offset) {
182 		n = rb_prev(&entry->offset_index);
183 		if (n) {
184 			entry = rb_entry(n, struct btrfs_free_space,
185 					offset_index);
186 			BUG_ON(entry->offset > offset);
187 		} else {
188 			if (fuzzy)
189 				return entry;
190 			else
191 				return NULL;
192 		}
193 	}
194 
195 	if (entry->bitmap) {
196 		n = &entry->offset_index;
197 		while (1) {
198 			n = rb_prev(n);
199 			if (!n)
200 				break;
201 			prev = rb_entry(n, struct btrfs_free_space,
202 					offset_index);
203 			if (!prev->bitmap) {
204 				if (prev->offset + prev->bytes > offset)
205 					return prev;
206 				break;
207 			}
208 		}
209 		if (entry->offset + BITS_PER_BITMAP *
210 		    block_group->sectorsize > offset)
211 			return entry;
212 	} else if (entry->offset + entry->bytes > offset)
213 		return entry;
214 
215 	if (!fuzzy)
216 		return NULL;
217 
218 	while (1) {
219 		if (entry->bitmap) {
220 			if (entry->offset + BITS_PER_BITMAP *
221 			    block_group->sectorsize > offset)
222 				break;
223 		} else {
224 			if (entry->offset + entry->bytes > offset)
225 				break;
226 		}
227 
228 		n = rb_next(&entry->offset_index);
229 		if (!n)
230 			return NULL;
231 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
232 	}
233 	return entry;
234 }
235 
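/*
 * unlink_free_space/link_free_space take an entry out of, or put it into,
 * the offset rbtree and keep the block group's free_space and free_extents
 * counters in sync.
 */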
236 static void unlink_free_space(struct btrfs_block_group_cache *block_group,
237 			      struct btrfs_free_space *info)
238 {
239 	rb_erase(&info->offset_index, &block_group->free_space_offset);
240 	block_group->free_extents--;
241 	block_group->free_space -= info->bytes;
242 }
243 
244 static int link_free_space(struct btrfs_block_group_cache *block_group,
245 			   struct btrfs_free_space *info)
246 {
247 	int ret = 0;
248 
249 	BUG_ON(!info->bitmap && !info->bytes);
250 	ret = tree_insert_offset(&block_group->free_space_offset, info->offset,
251 				 &info->offset_index, (info->bitmap != NULL));
252 	if (ret)
253 		return ret;
254 
255 	block_group->free_space += info->bytes;
256 	block_group->free_extents++;
257 	return ret;
258 }
259 
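/*
 * recompute extents_thresh, the number of extent entries we allow before
 * pushing new free space into bitmaps, so that the cache stays within the
 * MAX_CACHE_BYTES_PER_GIG memory budget.
 */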
260 static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
261 {
262 	u64 max_bytes;
263 	u64 bitmap_bytes;
264 	u64 extent_bytes;
265 
266 	/*
267 	 * The goal is to keep the total amount of memory used per 1GB of space
268 	 * at or below 32KB, so we need to adjust how much memory we allow to be
269 	 * used by extent based free space tracking.
270 	 */
271 	max_bytes = MAX_CACHE_BYTES_PER_GIG *
272 		(div64_u64(block_group->key.offset, 1024 * 1024 * 1024));
273 
274 	/*
275 	 * we want to account for 1 more bitmap than what we have so we can make
276 	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
277 	 * we add more bitmaps.
278 	 */
279 	bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE;
280 
281 	if (bitmap_bytes >= max_bytes) {
282 		block_group->extents_thresh = 0;
283 		return;
284 	}
285 
286 	/*
287 	 * we want the extent entry threshold to always be at most 1/2 the max
288 	 * bytes we can have, or whatever is left after the bitmaps if that is less.
289 	 */
290 	extent_bytes = max_bytes - bitmap_bytes;
291 	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
292 
293 	block_group->extents_thresh =
294 		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
295 }
296 
297 static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
298 			      struct btrfs_free_space *info, u64 offset,
299 			      u64 bytes)
300 {
301 	unsigned long start, end;
302 	unsigned long i;
303 
304 	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
305 	end = start + bytes_to_bits(bytes, block_group->sectorsize);
306 	BUG_ON(end > BITS_PER_BITMAP);
307 
308 	for (i = start; i < end; i++)
309 		clear_bit(i, info->bitmap);
310 
311 	info->bytes -= bytes;
312 	block_group->free_space -= bytes;
313 }
314 
315 static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
316 			    struct btrfs_free_space *info, u64 offset,
317 			    u64 bytes)
318 {
319 	unsigned long start, end;
320 	unsigned long i;
321 
322 	start = offset_to_bit(info->offset, block_group->sectorsize, offset);
323 	end = start + bytes_to_bits(bytes, block_group->sectorsize);
324 	BUG_ON(end > BITS_PER_BITMAP);
325 
326 	for (i = start; i < end; i++)
327 		set_bit(i, info->bitmap);
328 
329 	info->bytes += bytes;
330 	block_group->free_space += bytes;
331 }
332 
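/*
 * look for a run of set bits in a bitmap entry that is at least *bytes
 * long, starting at or after *offset.  On success, update *offset and
 * *bytes to describe the run found and return 0; otherwise return -1.
 */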
333 static int search_bitmap(struct btrfs_block_group_cache *block_group,
334 			 struct btrfs_free_space *bitmap_info, u64 *offset,
335 			 u64 *bytes)
336 {
337 	unsigned long found_bits = 0;
338 	unsigned long bits, i;
339 	unsigned long next_zero;
340 
341 	i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
342 			  max_t(u64, *offset, bitmap_info->offset));
343 	bits = bytes_to_bits(*bytes, block_group->sectorsize);
344 
345 	for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
346 	     i < BITS_PER_BITMAP;
347 	     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
348 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
349 					       BITS_PER_BITMAP, i);
350 		if ((next_zero - i) >= bits) {
351 			found_bits = next_zero - i;
352 			break;
353 		}
354 		i = next_zero;
355 	}
356 
357 	if (found_bits) {
358 		*offset = (u64)(i * block_group->sectorsize) +
359 			bitmap_info->offset;
360 		*bytes = (u64)(found_bits) * block_group->sectorsize;
361 		return 0;
362 	}
363 
364 	return -1;
365 }
366 
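/*
 * find the first entry, extent or bitmap, that can satisfy an allocation
 * of *bytes at or after *offset, updating *offset and *bytes to describe
 * the space found.
 */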
367 static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
368 						*block_group, u64 *offset,
369 						u64 *bytes, int debug)
370 {
371 	struct btrfs_free_space *entry;
372 	struct rb_node *node;
373 	int ret;
374 
375 	if (!block_group->free_space_offset.rb_node)
376 		return NULL;
377 
378 	entry = tree_search_offset(block_group,
379 				   offset_to_bitmap(block_group, *offset),
380 				   0, 1);
381 	if (!entry)
382 		return NULL;
383 
384 	for (node = &entry->offset_index; node; node = rb_next(node)) {
385 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
386 		if (entry->bytes < *bytes)
387 			continue;
388 
389 		if (entry->bitmap) {
390 			ret = search_bitmap(block_group, entry, offset, bytes);
391 			if (!ret)
392 				return entry;
393 			continue;
394 		}
395 
396 		*offset = entry->offset;
397 		*bytes = entry->bytes;
398 		return entry;
399 	}
400 
401 	return NULL;
402 }
403 
404 static void add_new_bitmap(struct btrfs_block_group_cache *block_group,
405 			   struct btrfs_free_space *info, u64 offset)
406 {
407 	u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
408 	int max_bitmaps = (int)div64_u64(block_group->key.offset +
409 					 bytes_per_bg - 1, bytes_per_bg);
410 	BUG_ON(block_group->total_bitmaps >= max_bitmaps);
411 
412 	info->offset = offset_to_bitmap(block_group, offset);
413 	info->bytes = 0;
414 	link_free_space(block_group, info);
415 	block_group->total_bitmaps++;
416 
417 	recalculate_thresholds(block_group);
418 }
419 
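/*
 * clear *bytes of space starting at *offset out of a bitmap entry,
 * continuing into following bitmap entries when the range spans more than
 * one.  Returns -EAGAIN if the range runs into an extent entry so the
 * caller can retry with the regular extent removal path.
 */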
420 static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group,
421 			      struct btrfs_free_space *bitmap_info,
422 			      u64 *offset, u64 *bytes)
423 {
424 	u64 end;
425 	u64 search_start, search_bytes;
426 	int ret;
427 
428 again:
429 	end = bitmap_info->offset +
430 		(u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
431 
432 	/*
433 	 * XXX - this can go away after a few releases.
434 	 *
435 	 * since the only user of btrfs_remove_free_space is the tree logging
436 	 * stuff, and the only way to test that is under crash conditions, we
437 	 * want to have this debug stuff here just in case something's not
438 	 * working.  Search the bitmap for the space we are trying to use to
439 	 * make sure it's actually there.  If it's not there then we need to stop
440 	 * because something has gone wrong.
441 	 */
442 	search_start = *offset;
443 	search_bytes = *bytes;
444 	ret = search_bitmap(block_group, bitmap_info, &search_start,
445 			    &search_bytes);
446 	BUG_ON(ret < 0 || search_start != *offset);
447 
448 	if (*offset > bitmap_info->offset && *offset + *bytes > end) {
449 		bitmap_clear_bits(block_group, bitmap_info, *offset,
450 				  end - *offset + 1);
451 		*bytes -= end - *offset + 1;
452 		*offset = end + 1;
453 	} else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
454 		bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes);
455 		*bytes = 0;
456 	}
457 
458 	if (*bytes) {
459 		struct rb_node *next = rb_next(&bitmap_info->offset_index);
460 		if (!bitmap_info->bytes) {
461 			unlink_free_space(block_group, bitmap_info);
462 			kfree(bitmap_info->bitmap);
463 			kfree(bitmap_info);
464 			block_group->total_bitmaps--;
465 			recalculate_thresholds(block_group);
466 		}
467 
468 		/*
469 		 * no entry after this bitmap, but we still have bytes to
470 		 * remove, so something has gone wrong.
471 		 */
472 		if (!next)
473 			return -EINVAL;
474 
475 		bitmap_info = rb_entry(next, struct btrfs_free_space,
476 				       offset_index);
477 
478 		/*
479 		 * if the next entry isn't a bitmap we need to return to let the
480 		 * extent stuff do its work.
481 		 */
482 		if (!bitmap_info->bitmap)
483 			return -EAGAIN;
484 
485 		/*
486 		 * Ok the next item is a bitmap, but it may not actually hold
487 		 * the information for the rest of this free space stuff, so
488 		 * look for it, and if we don't find it return so we can try
489 		 * everything over again.
490 		 */
491 		search_start = *offset;
492 		search_bytes = *bytes;
493 		ret = search_bitmap(block_group, bitmap_info, &search_start,
494 				    &search_bytes);
495 		if (ret < 0 || search_start != *offset)
496 			return -EAGAIN;
497 
498 		goto again;
499 	} else if (!bitmap_info->bytes) {
500 		unlink_free_space(block_group, bitmap_info);
501 		kfree(bitmap_info->bitmap);
502 		kfree(bitmap_info);
503 		block_group->total_bitmaps--;
504 		recalculate_thresholds(block_group);
505 	}
506 
507 	return 0;
508 }
509 
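/*
 * try to store the free space described by 'info' in one or more bitmaps.
 * Returns 0 if the caller should add it as an extent entry instead, a
 * positive value if the space was absorbed into bitmaps, or a negative
 * errno on allocation failure.  In the latter two cases 'info' is either
 * linked into the tree or freed here.
 */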
510 static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
511 			      struct btrfs_free_space *info)
512 {
513 	struct btrfs_free_space *bitmap_info;
514 	int added = 0;
515 	u64 bytes, offset, end;
516 	int ret;
517 
518 	/*
519 	 * If we are below the extents threshold and this chunk of free space is
520 	 * larger than 4 sectors, add it as an extent and don't bother with a bitmap.
521 	 */
522 	if (block_group->free_extents < block_group->extents_thresh &&
523 	    info->bytes > block_group->sectorsize * 4)
524 		return 0;
525 
526 	/*
527 	 * some block groups are smaller than even a single bitmap would cover,
528 	 * so don't bother creating a bitmap for them
529 	 */
530 	if (BITS_PER_BITMAP * block_group->sectorsize >
531 	    block_group->key.offset)
532 		return 0;
533 
534 	bytes = info->bytes;
535 	offset = info->offset;
536 
537 again:
538 	bitmap_info = tree_search_offset(block_group,
539 					 offset_to_bitmap(block_group, offset),
540 					 1, 0);
541 	if (!bitmap_info) {
542 		BUG_ON(added);
543 		goto new_bitmap;
544 	}
545 
546 	end = bitmap_info->offset +
547 		(u64)(BITS_PER_BITMAP * block_group->sectorsize);
548 
549 	if (offset >= bitmap_info->offset && offset + bytes > end) {
550 		bitmap_set_bits(block_group, bitmap_info, offset,
551 				end - offset);
552 		bytes -= end - offset;
553 		offset = end;
554 		added = 0;
555 	} else if (offset >= bitmap_info->offset && offset + bytes <= end) {
556 		bitmap_set_bits(block_group, bitmap_info, offset, bytes);
557 		bytes = 0;
558 	} else {
559 		BUG();
560 	}
561 
562 	if (!bytes) {
563 		ret = 1;
564 		goto out;
565 	} else
566 		goto again;
567 
568 new_bitmap:
569 	if (info && info->bitmap) {
570 		add_new_bitmap(block_group, info, offset);
571 		added = 1;
572 		info = NULL;
573 		goto again;
574 	} else {
575 		spin_unlock(&block_group->tree_lock);
576 
577 		/* no pre-allocated info, allocate a new one */
578 		if (!info) {
579 			info = kzalloc(sizeof(struct btrfs_free_space),
580 				       GFP_NOFS);
581 			if (!info) {
582 				spin_lock(&block_group->tree_lock);
583 				ret = -ENOMEM;
584 				goto out;
585 			}
586 		}
587 
588 		/* allocate the bitmap */
589 		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
590 		spin_lock(&block_group->tree_lock);
591 		if (!info->bitmap) {
592 			ret = -ENOMEM;
593 			goto out;
594 		}
595 		goto again;
596 	}
597 
598 out:
599 	if (info) {
600 		if (info->bitmap)
601 			kfree(info->bitmap);
602 		kfree(info);
603 	}
604 
605 	return ret;
606 }
607 
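/*
 * add a range of free space to the block group's cache, merging it with
 * adjacent extent entries when possible and falling back to bitmap
 * storage once we track too many small extents.
 */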
608 int btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
609 			 u64 offset, u64 bytes)
610 {
611 	struct btrfs_free_space *right_info = NULL;
612 	struct btrfs_free_space *left_info = NULL;
613 	struct btrfs_free_space *info = NULL;
614 	int ret = 0;
615 
616 	info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
617 	if (!info)
618 		return -ENOMEM;
619 
620 	info->offset = offset;
621 	info->bytes = bytes;
622 
623 	spin_lock(&block_group->tree_lock);
624 
625 	/*
626 	 * first we want to see if there is free space adjacent to the range we
627 	 * are adding; if there is, remove that struct and add a new one to
628 	 * cover the entire range.
629 	 */
630 	right_info = tree_search_offset(block_group, offset + bytes, 0, 0);
631 	if (right_info && rb_prev(&right_info->offset_index))
632 		left_info = rb_entry(rb_prev(&right_info->offset_index),
633 				     struct btrfs_free_space, offset_index);
634 	else
635 		left_info = tree_search_offset(block_group, offset - 1, 0, 0);
636 
637 	/*
638 	 * If there was no extent directly to the left or right of this new
639 	 * extent then we know we're going to have to allocate a new extent, so
640 	 * before we do that see if we need to drop this into a bitmap
641 	 */
642 	if ((!left_info || left_info->bitmap) &&
643 	    (!right_info || right_info->bitmap)) {
644 		ret = insert_into_bitmap(block_group, info);
645 
646 		if (ret < 0) {
647 			goto out;
648 		} else if (ret) {
649 			ret = 0;
650 			goto out;
651 		}
652 	}
653 
654 	if (right_info && !right_info->bitmap) {
655 		unlink_free_space(block_group, right_info);
656 		info->bytes += right_info->bytes;
657 		kfree(right_info);
658 	}
659 
660 	if (left_info && !left_info->bitmap &&
661 	    left_info->offset + left_info->bytes == offset) {
662 		unlink_free_space(block_group, left_info);
663 		info->offset = left_info->offset;
664 		info->bytes += left_info->bytes;
665 		kfree(left_info);
666 	}
667 
668 	ret = link_free_space(block_group, info);
669 	if (ret)
670 		kfree(info);
671 out:
672 	spin_unlock(&block_group->tree_lock);
673 
674 	if (ret) {
675 		printk(KERN_CRIT "btrfs: unable to add free space: %d\n", ret);
676 		BUG_ON(ret == -EEXIST);
677 	}
678 
679 	return ret;
680 }
681 
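/*
 * remove a range of space from the free space cache.  The range may live
 * in an extent entry, in one or more bitmaps, or span both; carving space
 * out of the middle of an entry can happen during tree log replay.
 */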
682 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
683 			    u64 offset, u64 bytes)
684 {
685 	struct btrfs_free_space *info;
686 	struct btrfs_free_space *next_info = NULL;
687 	int ret = 0;
688 
689 	spin_lock(&block_group->tree_lock);
690 
691 again:
692 	info = tree_search_offset(block_group, offset, 0, 0);
693 	if (!info) {
694 		/*
695 		 * oops didn't find an extent that matched the space we wanted
696 		 * to remove, look for a bitmap instead
697 		 */
698 		info = tree_search_offset(block_group,
699 					  offset_to_bitmap(block_group, offset),
700 					  1, 0);
701 		if (!info) {
702 			WARN_ON(1);
703 			goto out_lock;
704 		}
705 	}
706 
707 	if (info->bytes < bytes && rb_next(&info->offset_index)) {
708 		u64 end;
709 		next_info = rb_entry(rb_next(&info->offset_index),
710 					     struct btrfs_free_space,
711 					     offset_index);
712 
713 		if (next_info->bitmap)
714 			end = next_info->offset + BITS_PER_BITMAP *
715 				block_group->sectorsize - 1;
716 		else
717 			end = next_info->offset + next_info->bytes;
718 
719 		if (next_info->bytes < bytes ||
720 		    next_info->offset > offset || offset > end) {
721 			printk(KERN_CRIT "Found free space at %llu, size %llu,"
722 			      " trying to use %llu\n",
723 			      (unsigned long long)info->offset,
724 			      (unsigned long long)info->bytes,
725 			      (unsigned long long)bytes);
726 			WARN_ON(1);
727 			ret = -EINVAL;
728 			goto out_lock;
729 		}
730 
731 		info = next_info;
732 	}
733 
734 	if (info->bytes == bytes) {
735 		unlink_free_space(block_group, info);
736 		if (info->bitmap) {
737 			kfree(info->bitmap);
738 			block_group->total_bitmaps--;
739 		}
740 		kfree(info);
741 		goto out_lock;
742 	}
743 
744 	if (!info->bitmap && info->offset == offset) {
745 		unlink_free_space(block_group, info);
746 		info->offset += bytes;
747 		info->bytes -= bytes;
748 		link_free_space(block_group, info);
749 		goto out_lock;
750 	}
751 
752 	if (!info->bitmap && info->offset <= offset &&
753 	    info->offset + info->bytes >= offset + bytes) {
754 		u64 old_start = info->offset;
755 		/*
756 		 * we're freeing space in the middle of the info,
757 		 * this can happen during tree log replay
758 		 *
759 		 * first unlink the old info and then
760 		 * insert it again after the hole we're creating
761 		 */
762 		unlink_free_space(block_group, info);
763 		if (offset + bytes < info->offset + info->bytes) {
764 			u64 old_end = info->offset + info->bytes;
765 
766 			info->offset = offset + bytes;
767 			info->bytes = old_end - info->offset;
768 			ret = link_free_space(block_group, info);
769 			WARN_ON(ret);
770 			if (ret)
771 				goto out_lock;
772 		} else {
773 			/* the hole we're creating ends at the end
774 			 * of the info struct, just free the info
775 			 */
776 			kfree(info);
777 		}
778 		spin_unlock(&block_group->tree_lock);
779 
780 		/* step two, insert a new info struct to cover
781 		 * anything before the hole
782 		 */
783 		ret = btrfs_add_free_space(block_group, old_start,
784 					   offset - old_start);
785 		WARN_ON(ret);
786 		goto out;
787 	}
788 
789 	ret = remove_from_bitmap(block_group, info, &offset, &bytes);
790 	if (ret == -EAGAIN)
791 		goto again;
792 	BUG_ON(ret);
793 out_lock:
794 	spin_unlock(&block_group->tree_lock);
795 out:
796 	return ret;
797 }
798 
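/*
 * debugging helper: print every free space entry in the block group and
 * count how many of them are at least 'bytes' big.
 */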
799 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
800 			   u64 bytes)
801 {
802 	struct btrfs_free_space *info;
803 	struct rb_node *n;
804 	int count = 0;
805 
806 	for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) {
807 		info = rb_entry(n, struct btrfs_free_space, offset_index);
808 		if (info->bytes >= bytes)
809 			count++;
810 		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
811 		       (unsigned long long)info->offset,
812 		       (unsigned long long)info->bytes,
813 		       (info->bitmap) ? "yes" : "no");
814 	}
815 	printk(KERN_INFO "block group has cluster?: %s\n",
816 	       list_empty(&block_group->cluster_list) ? "no" : "yes");
817 	printk(KERN_INFO "%d blocks of free space at or bigger than bytes\n",
818 	       count);
819 }
820 
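/* sum up all of the free space tracked for the block group */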
821 u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group)
822 {
823 	struct btrfs_free_space *info;
824 	struct rb_node *n;
825 	u64 ret = 0;
826 
827 	for (n = rb_first(&block_group->free_space_offset); n;
828 	     n = rb_next(n)) {
829 		info = rb_entry(n, struct btrfs_free_space, offset_index);
830 		ret += info->bytes;
831 	}
832 
833 	return ret;
834 }
835 
836 /*
837  * for a given cluster, put all of its extents back into the free
838  * space cache.  If the block group passed doesn't match the block group
839  * pointed to by the cluster, someone else raced in and freed the
840  * cluster already.  In that case, we just return without changing anything
841  */
842 static int
843 __btrfs_return_cluster_to_free_space(
844 			     struct btrfs_block_group_cache *block_group,
845 			     struct btrfs_free_cluster *cluster)
846 {
847 	struct btrfs_free_space *entry;
848 	struct rb_node *node;
849 	bool bitmap;
850 
851 	spin_lock(&cluster->lock);
852 	if (cluster->block_group != block_group)
853 		goto out;
854 
855 	bitmap = cluster->points_to_bitmap;
856 	cluster->block_group = NULL;
857 	cluster->window_start = 0;
858 	list_del_init(&cluster->block_group_list);
859 	cluster->points_to_bitmap = false;
860 
861 	if (bitmap)
862 		goto out;
863 
864 	node = rb_first(&cluster->root);
865 	while (node) {
866 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
867 		node = rb_next(&entry->offset_index);
868 		rb_erase(&entry->offset_index, &cluster->root);
869 		BUG_ON(entry->bitmap);
870 		tree_insert_offset(&block_group->free_space_offset,
871 				   entry->offset, &entry->offset_index, 0);
872 	}
873 	cluster->root.rb_node = NULL;
874 
875 out:
876 	spin_unlock(&cluster->lock);
877 	btrfs_put_block_group(block_group);
878 	return 0;
879 }
880 
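/*
 * tear down the free space cache for a block group: give back any clusters
 * that still point at it, then free every remaining entry, dropping the
 * tree_lock periodically so we don't hog the CPU.
 */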
881 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
882 {
883 	struct btrfs_free_space *info;
884 	struct rb_node *node;
885 	struct btrfs_free_cluster *cluster;
886 	struct list_head *head;
887 
888 	spin_lock(&block_group->tree_lock);
889 	while ((head = block_group->cluster_list.next) !=
890 	       &block_group->cluster_list) {
891 		cluster = list_entry(head, struct btrfs_free_cluster,
892 				     block_group_list);
893 
894 		WARN_ON(cluster->block_group != block_group);
895 		__btrfs_return_cluster_to_free_space(block_group, cluster);
896 		if (need_resched()) {
897 			spin_unlock(&block_group->tree_lock);
898 			cond_resched();
899 			spin_lock(&block_group->tree_lock);
900 		}
901 	}
902 
903 	while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
904 		info = rb_entry(node, struct btrfs_free_space, offset_index);
905 		unlink_free_space(block_group, info);
906 		if (info->bitmap)
907 			kfree(info->bitmap);
908 		kfree(info);
909 		if (need_resched()) {
910 			spin_unlock(&block_group->tree_lock);
911 			cond_resched();
912 			spin_lock(&block_group->tree_lock);
913 		}
914 	}
915 
916 	spin_unlock(&block_group->tree_lock);
917 }
918 
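/*
 * allocate 'bytes' directly from the free space cache, searching for a
 * region of at least bytes + empty_size.  Returns the logical start of
 * the allocation, or 0 if nothing suitable was found.
 */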
919 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
920 			       u64 offset, u64 bytes, u64 empty_size)
921 {
922 	struct btrfs_free_space *entry = NULL;
923 	u64 bytes_search = bytes + empty_size;
924 	u64 ret = 0;
925 
926 	spin_lock(&block_group->tree_lock);
927 	entry = find_free_space(block_group, &offset, &bytes_search, 0);
928 	if (!entry)
929 		goto out;
930 
931 	ret = offset;
932 	if (entry->bitmap) {
933 		bitmap_clear_bits(block_group, entry, offset, bytes);
934 		if (!entry->bytes) {
935 			unlink_free_space(block_group, entry);
936 			kfree(entry->bitmap);
937 			kfree(entry);
938 			block_group->total_bitmaps--;
939 			recalculate_thresholds(block_group);
940 		}
941 	} else {
942 		unlink_free_space(block_group, entry);
943 		entry->offset += bytes;
944 		entry->bytes -= bytes;
945 		if (!entry->bytes)
946 			kfree(entry);
947 		else
948 			link_free_space(block_group, entry);
949 	}
950 
951 out:
952 	spin_unlock(&block_group->tree_lock);
953 
954 	return ret;
955 }
956 
957 /*
958  * given a cluster, put all of its extents back into the free space
959  * cache.  If a block group is passed, this function will only free
960  * a cluster that belongs to the passed block group.
961  *
962  * Otherwise, it'll get a reference on the block group pointed to by the
963  * cluster and remove the cluster from it.
964  */
965 int btrfs_return_cluster_to_free_space(
966 			       struct btrfs_block_group_cache *block_group,
967 			       struct btrfs_free_cluster *cluster)
968 {
969 	int ret;
970 
971 	/* first, get a safe pointer to the block group */
972 	spin_lock(&cluster->lock);
973 	if (!block_group) {
974 		block_group = cluster->block_group;
975 		if (!block_group) {
976 			spin_unlock(&cluster->lock);
977 			return 0;
978 		}
979 	} else if (cluster->block_group != block_group) {
980 		/* someone else has already freed it, don't redo their work */
981 		spin_unlock(&cluster->lock);
982 		return 0;
983 	}
984 	atomic_inc(&block_group->count);
985 	spin_unlock(&cluster->lock);
986 
987 	/* now return any extents the cluster had on it */
988 	spin_lock(&block_group->tree_lock);
989 	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
990 	spin_unlock(&block_group->tree_lock);
991 
992 	/* finally drop our ref */
993 	btrfs_put_block_group(block_group);
994 	return ret;
995 }
996 
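/*
 * allocate from a cluster that points to a bitmap: look up the bitmap
 * entry backing the cluster window and clear 'bytes' out of it.
 */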
997 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
998 				   struct btrfs_free_cluster *cluster,
999 				   u64 bytes, u64 min_start)
1000 {
1001 	struct btrfs_free_space *entry;
1002 	int err;
1003 	u64 search_start = cluster->window_start;
1004 	u64 search_bytes = bytes;
1005 	u64 ret = 0;
1006 
1007 	spin_lock(&block_group->tree_lock);
1008 	spin_lock(&cluster->lock);
1009 
1010 	if (!cluster->points_to_bitmap)
1011 		goto out;
1012 
1013 	if (cluster->block_group != block_group)
1014 		goto out;
1015 
1016 	/*
1017 	 * search_start is the beginning of the bitmap, but at some point it may
1018 	 * be a good idea to point to the actual start of the free area in the
1019 	 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1020 	 * to 1 to make sure we get the bitmap entry
1021 	 */
1022 	entry = tree_search_offset(block_group,
1023 				   offset_to_bitmap(block_group, search_start),
1024 				   1, 0);
1025 	if (!entry || !entry->bitmap)
1026 		goto out;
1027 
1028 	search_start = min_start;
1029 	search_bytes = bytes;
1030 
1031 	err = search_bitmap(block_group, entry, &search_start,
1032 			    &search_bytes);
1033 	if (err)
1034 		goto out;
1035 
1036 	ret = search_start;
1037 	bitmap_clear_bits(block_group, entry, ret, bytes);
1038 out:
1039 	spin_unlock(&cluster->lock);
1040 	spin_unlock(&block_group->tree_lock);
1041 
1042 	return ret;
1043 }
1044 
1045 /*
1046  * given a cluster, try to allocate 'bytes' from it, returns 0
1047  * if it couldn't find anything suitably large, or a logical disk offset
1048  * if things worked out
1049  */
1050 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1051 			     struct btrfs_free_cluster *cluster, u64 bytes,
1052 			     u64 min_start)
1053 {
1054 	struct btrfs_free_space *entry = NULL;
1055 	struct rb_node *node;
1056 	u64 ret = 0;
1057 
1058 	if (cluster->points_to_bitmap)
1059 		return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1060 					       min_start);
1061 
1062 	spin_lock(&cluster->lock);
1063 	if (bytes > cluster->max_size)
1064 		goto out;
1065 
1066 	if (cluster->block_group != block_group)
1067 		goto out;
1068 
1069 	node = rb_first(&cluster->root);
1070 	if (!node)
1071 		goto out;
1072 
1073 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
1074 
1075 	while (1) {
1076 		if (entry->bytes < bytes || entry->offset < min_start) {
1077 			struct rb_node *node;
1078 
1079 			node = rb_next(&entry->offset_index);
1080 			if (!node)
1081 				break;
1082 			entry = rb_entry(node, struct btrfs_free_space,
1083 					 offset_index);
1084 			continue;
1085 		}
1086 		ret = entry->offset;
1087 
1088 		entry->offset += bytes;
1089 		entry->bytes -= bytes;
1090 
1091 		if (entry->bytes == 0) {
1092 			rb_erase(&entry->offset_index, &cluster->root);
1093 			kfree(entry);
1094 		}
1095 		break;
1096 	}
1097 out:
1098 	spin_unlock(&cluster->lock);
1099 
1100 	return ret;
1101 }
1102 
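/*
 * try to build a cluster out of a single bitmap entry: hunt for runs of
 * free bits of at least min_bytes until 'bytes' worth has been found in a
 * reasonably small window, then point the cluster at the bitmap.
 */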
1103 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1104 				struct btrfs_free_space *entry,
1105 				struct btrfs_free_cluster *cluster,
1106 				u64 offset, u64 bytes, u64 min_bytes)
1107 {
1108 	unsigned long next_zero;
1109 	unsigned long i;
1110 	unsigned long search_bits;
1111 	unsigned long total_bits;
1112 	unsigned long found_bits;
1113 	unsigned long start = 0;
1114 	unsigned long total_found = 0;
1115 	bool found = false;
1116 
1117 	i = offset_to_bit(entry->offset, block_group->sectorsize,
1118 			  max_t(u64, offset, entry->offset));
1119 	search_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1120 	total_bits = bytes_to_bits(bytes, block_group->sectorsize);
1121 
1122 again:
1123 	found_bits = 0;
1124 	for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i);
1125 	     i < BITS_PER_BITMAP;
1126 	     i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) {
1127 		next_zero = find_next_zero_bit(entry->bitmap,
1128 					       BITS_PER_BITMAP, i);
1129 		if (next_zero - i >= search_bits) {
1130 			found_bits = next_zero - i;
1131 			break;
1132 		}
1133 		i = next_zero;
1134 	}
1135 
1136 	if (!found_bits)
1137 		return -1;
1138 
1139 	if (!found) {
1140 		start = i;
1141 		found = true;
1142 	}
1143 
1144 	total_found += found_bits;
1145 
1146 	if (cluster->max_size < found_bits * block_group->sectorsize)
1147 		cluster->max_size = found_bits * block_group->sectorsize;
1148 
1149 	if (total_found < total_bits) {
1150 		i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero);
1151 		if (i - start > total_bits * 2) {
1152 			total_found = 0;
1153 			cluster->max_size = 0;
1154 			found = false;
1155 		}
1156 		goto again;
1157 	}
1158 
1159 	cluster->window_start = start * block_group->sectorsize +
1160 		entry->offset;
1161 	cluster->points_to_bitmap = true;
1162 
1163 	return 0;
1164 }
1165 
1166 /*
1167  * here we try to find a cluster of blocks in a block group.  The goal
1168  * is to find at least bytes free and up to empty_size + bytes free.
1169  * We might not find them all in one contiguous area.
1170  *
1171  * returns zero and sets up the cluster if things worked out, otherwise
1172  * it returns -ENOSPC
1173  */
1174 int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
1175 			     struct btrfs_root *root,
1176 			     struct btrfs_block_group_cache *block_group,
1177 			     struct btrfs_free_cluster *cluster,
1178 			     u64 offset, u64 bytes, u64 empty_size)
1179 {
1180 	struct btrfs_free_space *entry = NULL;
1181 	struct rb_node *node;
1182 	struct btrfs_free_space *next;
1183 	struct btrfs_free_space *last = NULL;
1184 	u64 min_bytes;
1185 	u64 window_start;
1186 	u64 window_free;
1187 	u64 max_extent = 0;
1188 	bool found_bitmap = false;
1189 	int ret;
1190 
1191 	/* for metadata, allow allocations with more holes */
1192 	if (btrfs_test_opt(root, SSD_SPREAD)) {
1193 		min_bytes = bytes + empty_size;
1194 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1195 		/*
1196 		 * we want to do larger allocations when we are
1197 		 * flushing out the delayed refs; it helps prevent
1198 		 * making more work as we go along.
1199 		 */
1200 		if (trans->transaction->delayed_refs.flushing)
1201 			min_bytes = max(bytes, (bytes + empty_size) >> 1);
1202 		else
1203 			min_bytes = max(bytes, (bytes + empty_size) >> 4);
1204 	} else
1205 		min_bytes = max(bytes, (bytes + empty_size) >> 2);
1206 
1207 	spin_lock(&block_group->tree_lock);
1208 	spin_lock(&cluster->lock);
1209 
1210 	/* someone already found a cluster, hooray */
1211 	if (cluster->block_group) {
1212 		ret = 0;
1213 		goto out;
1214 	}
1215 again:
1216 	entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1217 	if (!entry) {
1218 		ret = -ENOSPC;
1219 		goto out;
1220 	}
1221 
1222 	/*
1223 	 * If found_bitmap is true, we exhausted our search for extent entries,
1224 	 * and we just want to search all of the bitmaps that we can find, and
1225 	 * ignore any extent entries we find.
1226 	 */
1227 	while (entry->bitmap || found_bitmap ||
1228 	       (!entry->bitmap && entry->bytes < min_bytes)) {
1229 		struct rb_node *node = rb_next(&entry->offset_index);
1230 
1231 		if (entry->bitmap && entry->bytes > bytes + empty_size) {
1232 			ret = btrfs_bitmap_cluster(block_group, entry, cluster,
1233 						   offset, bytes + empty_size,
1234 						   min_bytes);
1235 			if (!ret)
1236 				goto got_it;
1237 		}
1238 
1239 		if (!node) {
1240 			ret = -ENOSPC;
1241 			goto out;
1242 		}
1243 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
1244 	}
1245 
1246 	/*
1247 	 * We already searched all the extent entries from the passed in offset
1248 	 * to the end and didn't find enough space for the cluster, and we also
1249 	 * didn't find any bitmaps that met our criteria, so just go ahead and exit.
1250 	 */
1251 	if (found_bitmap) {
1252 		ret = -ENOSPC;
1253 		goto out;
1254 	}
1255 
1256 	cluster->points_to_bitmap = false;
1257 	window_start = entry->offset;
1258 	window_free = entry->bytes;
1259 	last = entry;
1260 	max_extent = entry->bytes;
1261 
1262 	while (1) {
1263 		/* our window is just right, let's fill it */
1264 		if (window_free >= bytes + empty_size)
1265 			break;
1266 
1267 		node = rb_next(&last->offset_index);
1268 		if (!node) {
1269 			if (found_bitmap)
1270 				goto again;
1271 			ret = -ENOSPC;
1272 			goto out;
1273 		}
1274 		next = rb_entry(node, struct btrfs_free_space, offset_index);
1275 
1276 		/*
1277 		 * we found a bitmap, so if this search doesn't result in a
1278 		 * cluster, we know to go and search again for the bitmaps and
1279 		 * start looking for space there
1280 		 */
1281 		if (next->bitmap) {
1282 			if (!found_bitmap)
1283 				offset = next->offset;
1284 			found_bitmap = true;
1285 			last = next;
1286 			continue;
1287 		}
1288 
1289 		/*
1290 		 * we haven't filled the empty size and the window is
1291 		 * very large.  reset and try again
1292 		 */
1293 		if (next->offset - (last->offset + last->bytes) > 128 * 1024 ||
1294 		    next->offset - window_start > (bytes + empty_size) * 2) {
1295 			entry = next;
1296 			window_start = entry->offset;
1297 			window_free = entry->bytes;
1298 			last = entry;
1299 			max_extent = 0;
1300 		} else {
1301 			last = next;
1302 			window_free += next->bytes;
1303 			if (entry->bytes > max_extent)
1304 				max_extent = entry->bytes;
1305 		}
1306 	}
1307 
1308 	cluster->window_start = entry->offset;
1309 
1310 	/*
1311 	 * now we've found our entries, pull them out of the free space
1312 	 * cache and put them into the cluster rbtree
1313 	 *
1314 	 * The cluster includes an rbtree, but only uses the offset index
1315 	 * of each free space cache entry.
1316 	 */
1317 	while (1) {
1318 		node = rb_next(&entry->offset_index);
1319 		if (entry->bitmap && node) {
1320 			entry = rb_entry(node, struct btrfs_free_space,
1321 					 offset_index);
1322 			continue;
1323 		} else if (entry->bitmap && !node) {
1324 			break;
1325 		}
1326 
1327 		rb_erase(&entry->offset_index, &block_group->free_space_offset);
1328 		ret = tree_insert_offset(&cluster->root, entry->offset,
1329 					 &entry->offset_index, 0);
1330 		BUG_ON(ret);
1331 
1332 		if (!node || entry == last)
1333 			break;
1334 
1335 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
1336 	}
1337 
1338 	cluster->max_size = max_extent;
1339 got_it:
1340 	ret = 0;
1341 	atomic_inc(&block_group->count);
1342 	list_add_tail(&cluster->block_group_list, &block_group->cluster_list);
1343 	cluster->block_group = block_group;
1344 out:
1345 	spin_unlock(&cluster->lock);
1346 	spin_unlock(&block_group->tree_lock);
1347 
1348 	return ret;
1349 }
1350 
1351 /*
1352  * simple code to zero out a cluster
1353  */
1354 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1355 {
1356 	spin_lock_init(&cluster->lock);
1357 	spin_lock_init(&cluster->refill_lock);
1358 	cluster->root.rb_node = NULL;
1359 	cluster->max_size = 0;
1360 	cluster->points_to_bitmap = false;
1361 	INIT_LIST_HEAD(&cluster->block_group_list);
1362 	cluster->block_group = NULL;
1363 }
1364 
1365