xref: /openbmc/linux/fs/btrfs/free-space-cache.c (revision d2999e1b)
1 /*
2  * Copyright (C) 2008 Red Hat.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/pagemap.h>
20 #include <linux/sched.h>
21 #include <linux/slab.h>
22 #include <linux/math64.h>
23 #include <linux/ratelimit.h>
24 #include "ctree.h"
25 #include "free-space-cache.h"
26 #include "transaction.h"
27 #include "disk-io.h"
28 #include "extent_io.h"
29 #include "inode-map.h"
30 
31 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
32 #define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
33 
34 static int link_free_space(struct btrfs_free_space_ctl *ctl,
35 			   struct btrfs_free_space *info);
36 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
37 			      struct btrfs_free_space *info);
38 
39 static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
40 					       struct btrfs_path *path,
41 					       u64 offset)
42 {
43 	struct btrfs_key key;
44 	struct btrfs_key location;
45 	struct btrfs_disk_key disk_key;
46 	struct btrfs_free_space_header *header;
47 	struct extent_buffer *leaf;
48 	struct inode *inode = NULL;
49 	int ret;
50 
51 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
52 	key.offset = offset;
53 	key.type = 0;
54 
55 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
56 	if (ret < 0)
57 		return ERR_PTR(ret);
58 	if (ret > 0) {
59 		btrfs_release_path(path);
60 		return ERR_PTR(-ENOENT);
61 	}
62 
63 	leaf = path->nodes[0];
64 	header = btrfs_item_ptr(leaf, path->slots[0],
65 				struct btrfs_free_space_header);
66 	btrfs_free_space_key(leaf, header, &disk_key);
67 	btrfs_disk_key_to_cpu(&location, &disk_key);
68 	btrfs_release_path(path);
69 
70 	inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
71 	if (!inode)
72 		return ERR_PTR(-ENOENT);
73 	if (IS_ERR(inode))
74 		return inode;
75 	if (is_bad_inode(inode)) {
76 		iput(inode);
77 		return ERR_PTR(-ENOENT);
78 	}
79 
80 	mapping_set_gfp_mask(inode->i_mapping,
81 			mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
82 
83 	return inode;
84 }
85 
86 struct inode *lookup_free_space_inode(struct btrfs_root *root,
87 				      struct btrfs_block_group_cache
88 				      *block_group, struct btrfs_path *path)
89 {
90 	struct inode *inode = NULL;
91 	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
92 
93 	spin_lock(&block_group->lock);
94 	if (block_group->inode)
95 		inode = igrab(block_group->inode);
96 	spin_unlock(&block_group->lock);
97 	if (inode)
98 		return inode;
99 
100 	inode = __lookup_free_space_inode(root, path,
101 					  block_group->key.objectid);
102 	if (IS_ERR(inode))
103 		return inode;
104 
105 	spin_lock(&block_group->lock);
106 	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
107 		btrfs_info(root->fs_info,
108 			"Old style space inode found, converting.");
109 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
110 			BTRFS_INODE_NODATACOW;
111 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
112 	}
113 
114 	if (!block_group->iref) {
115 		block_group->inode = igrab(inode);
116 		block_group->iref = 1;
117 	}
118 	spin_unlock(&block_group->lock);
119 
120 	return inode;
121 }
122 
123 static int __create_free_space_inode(struct btrfs_root *root,
124 				     struct btrfs_trans_handle *trans,
125 				     struct btrfs_path *path,
126 				     u64 ino, u64 offset)
127 {
128 	struct btrfs_key key;
129 	struct btrfs_disk_key disk_key;
130 	struct btrfs_free_space_header *header;
131 	struct btrfs_inode_item *inode_item;
132 	struct extent_buffer *leaf;
133 	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
134 	int ret;
135 
136 	ret = btrfs_insert_empty_inode(trans, root, path, ino);
137 	if (ret)
138 		return ret;
139 
140 	/* We inline crc's for the free disk space cache */
141 	if (ino != BTRFS_FREE_INO_OBJECTID)
142 		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
143 
144 	leaf = path->nodes[0];
145 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
146 				    struct btrfs_inode_item);
147 	btrfs_item_key(leaf, &disk_key, path->slots[0]);
148 	memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
149 			     sizeof(*inode_item));
150 	btrfs_set_inode_generation(leaf, inode_item, trans->transid);
151 	btrfs_set_inode_size(leaf, inode_item, 0);
152 	btrfs_set_inode_nbytes(leaf, inode_item, 0);
153 	btrfs_set_inode_uid(leaf, inode_item, 0);
154 	btrfs_set_inode_gid(leaf, inode_item, 0);
155 	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
156 	btrfs_set_inode_flags(leaf, inode_item, flags);
157 	btrfs_set_inode_nlink(leaf, inode_item, 1);
158 	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
159 	btrfs_set_inode_block_group(leaf, inode_item, offset);
160 	btrfs_mark_buffer_dirty(leaf);
161 	btrfs_release_path(path);
162 
163 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
164 	key.offset = offset;
165 	key.type = 0;
166 
167 	ret = btrfs_insert_empty_item(trans, root, path, &key,
168 				      sizeof(struct btrfs_free_space_header));
169 	if (ret < 0) {
170 		btrfs_release_path(path);
171 		return ret;
172 	}
173 	leaf = path->nodes[0];
174 	header = btrfs_item_ptr(leaf, path->slots[0],
175 				struct btrfs_free_space_header);
176 	memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
177 	btrfs_set_free_space_key(leaf, header, &disk_key);
178 	btrfs_mark_buffer_dirty(leaf);
179 	btrfs_release_path(path);
180 
181 	return 0;
182 }
183 
184 int create_free_space_inode(struct btrfs_root *root,
185 			    struct btrfs_trans_handle *trans,
186 			    struct btrfs_block_group_cache *block_group,
187 			    struct btrfs_path *path)
188 {
189 	int ret;
190 	u64 ino;
191 
192 	ret = btrfs_find_free_objectid(root, &ino);
193 	if (ret < 0)
194 		return ret;
195 
196 	return __create_free_space_inode(root, trans, path, ino,
197 					 block_group->key.objectid);
198 }
199 
200 int btrfs_check_trunc_cache_free_space(struct btrfs_root *root,
201 				       struct btrfs_block_rsv *rsv)
202 {
203 	u64 needed_bytes;
204 	int ret;
205 
206 	/* 1 for slack space, 1 for updating the inode */
207 	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
208 		btrfs_calc_trans_metadata_size(root, 1);
209 
210 	spin_lock(&rsv->lock);
211 	if (rsv->reserved < needed_bytes)
212 		ret = -ENOSPC;
213 	else
214 		ret = 0;
215 	spin_unlock(&rsv->lock);
216 	return ret;
217 }
218 
219 int btrfs_truncate_free_space_cache(struct btrfs_root *root,
220 				    struct btrfs_trans_handle *trans,
221 				    struct inode *inode)
222 {
223 	int ret = 0;
224 
225 	btrfs_i_size_write(inode, 0);
226 	truncate_pagecache(inode, 0);
227 
228 	/*
229 	 * We don't need an orphan item because truncating the free space cache
230 	 * will never be split across transactions.
231 	 */
232 	ret = btrfs_truncate_inode_items(trans, root, inode,
233 					 0, BTRFS_EXTENT_DATA_KEY);
234 	if (ret) {
235 		btrfs_abort_transaction(trans, root, ret);
236 		return ret;
237 	}
238 
239 	ret = btrfs_update_inode(trans, root, inode);
240 	if (ret)
241 		btrfs_abort_transaction(trans, root, ret);
242 
243 	return ret;
244 }
245 
246 static int readahead_cache(struct inode *inode)
247 {
248 	struct file_ra_state *ra;
249 	unsigned long last_index;
250 
251 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
252 	if (!ra)
253 		return -ENOMEM;
254 
255 	file_ra_state_init(ra, inode->i_mapping);
256 	last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
257 
258 	page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
259 
260 	kfree(ra);
261 
262 	return 0;
263 }
264 
265 struct io_ctl {
266 	void *cur, *orig;
267 	struct page *page;
268 	struct page **pages;
269 	struct btrfs_root *root;
270 	unsigned long size;
271 	int index;
272 	int num_pages;
273 	unsigned check_crcs:1;
274 };
275 
276 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
277 		       struct btrfs_root *root, int write)
278 {
279 	int num_pages;
280 	int check_crcs = 0;
281 
282 	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
283 		    PAGE_CACHE_SHIFT;
284 
285 	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
286 		check_crcs = 1;
287 
288 	/* Make sure we can fit our crcs into the first page */
289 	if (write && check_crcs &&
290 	    (num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE)
291 		return -ENOSPC;
292 
293 	memset(io_ctl, 0, sizeof(struct io_ctl));
294 
295 	io_ctl->pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
296 	if (!io_ctl->pages)
297 		return -ENOMEM;
298 
299 	io_ctl->num_pages = num_pages;
300 	io_ctl->root = root;
301 	io_ctl->check_crcs = check_crcs;
302 
303 	return 0;
304 }
305 
306 static void io_ctl_free(struct io_ctl *io_ctl)
307 {
308 	kfree(io_ctl->pages);
309 }
310 
311 static void io_ctl_unmap_page(struct io_ctl *io_ctl)
312 {
313 	if (io_ctl->cur) {
314 		kunmap(io_ctl->page);
315 		io_ctl->cur = NULL;
316 		io_ctl->orig = NULL;
317 	}
318 }
319 
320 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
321 {
322 	ASSERT(io_ctl->index < io_ctl->num_pages);
323 	io_ctl->page = io_ctl->pages[io_ctl->index++];
324 	io_ctl->cur = kmap(io_ctl->page);
325 	io_ctl->orig = io_ctl->cur;
326 	io_ctl->size = PAGE_CACHE_SIZE;
327 	if (clear)
328 		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
329 }
330 
331 static void io_ctl_drop_pages(struct io_ctl *io_ctl)
332 {
333 	int i;
334 
335 	io_ctl_unmap_page(io_ctl);
336 
337 	for (i = 0; i < io_ctl->num_pages; i++) {
338 		if (io_ctl->pages[i]) {
339 			ClearPageChecked(io_ctl->pages[i]);
340 			unlock_page(io_ctl->pages[i]);
341 			page_cache_release(io_ctl->pages[i]);
342 		}
343 	}
344 }
345 
346 static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
347 				int uptodate)
348 {
349 	struct page *page;
350 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
351 	int i;
352 
353 	for (i = 0; i < io_ctl->num_pages; i++) {
354 		page = find_or_create_page(inode->i_mapping, i, mask);
355 		if (!page) {
356 			io_ctl_drop_pages(io_ctl);
357 			return -ENOMEM;
358 		}
359 		io_ctl->pages[i] = page;
360 		if (uptodate && !PageUptodate(page)) {
361 			btrfs_readpage(NULL, page);
362 			lock_page(page);
363 			if (!PageUptodate(page)) {
364 				btrfs_err(BTRFS_I(inode)->root->fs_info,
365 					   "error reading free space cache");
366 				io_ctl_drop_pages(io_ctl);
367 				return -EIO;
368 			}
369 		}
370 	}
371 
372 	for (i = 0; i < io_ctl->num_pages; i++) {
373 		clear_page_dirty_for_io(io_ctl->pages[i]);
374 		set_page_extent_mapped(io_ctl->pages[i]);
375 	}
376 
377 	return 0;
378 }
379 
380 static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
381 {
382 	__le64 *val;
383 
384 	io_ctl_map_page(io_ctl, 1);
385 
386 	/*
387 	 * Skip the csum areas.  If we don't check crcs then we just have a
388 	 * 64bit chunk at the front of the first page.
389 	 */
390 	if (io_ctl->check_crcs) {
391 		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
392 		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
393 	} else {
394 		io_ctl->cur += sizeof(u64);
395 		io_ctl->size -= sizeof(u64) * 2;
396 	}
397 
398 	val = io_ctl->cur;
399 	*val = cpu_to_le64(generation);
400 	io_ctl->cur += sizeof(u64);
401 }
402 
403 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
404 {
405 	__le64 *gen;
406 
407 	/*
408 	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
409 	 * chunk at the front of the first page.
410 	 */
411 	if (io_ctl->check_crcs) {
412 		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
413 		io_ctl->size -= sizeof(u64) +
414 			(sizeof(u32) * io_ctl->num_pages);
415 	} else {
416 		io_ctl->cur += sizeof(u64);
417 		io_ctl->size -= sizeof(u64) * 2;
418 	}
419 
420 	gen = io_ctl->cur;
421 	if (le64_to_cpu(*gen) != generation) {
422 		printk_ratelimited(KERN_ERR "BTRFS: space cache generation "
423 				   "(%Lu) does not match inode (%Lu)\n", *gen,
424 				   generation);
425 		io_ctl_unmap_page(io_ctl);
426 		return -EIO;
427 	}
428 	io_ctl->cur += sizeof(u64);
429 	return 0;
430 }
431 
432 static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
433 {
434 	u32 *tmp;
435 	u32 crc = ~(u32)0;
436 	unsigned offset = 0;
437 
438 	if (!io_ctl->check_crcs) {
439 		io_ctl_unmap_page(io_ctl);
440 		return;
441 	}
442 
443 	if (index == 0)
444 		offset = sizeof(u32) * io_ctl->num_pages;
445 
446 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
447 			      PAGE_CACHE_SIZE - offset);
448 	btrfs_csum_final(crc, (char *)&crc);
449 	io_ctl_unmap_page(io_ctl);
450 	tmp = kmap(io_ctl->pages[0]);
451 	tmp += index;
452 	*tmp = crc;
453 	kunmap(io_ctl->pages[0]);
454 }
455 
456 static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
457 {
458 	u32 *tmp, val;
459 	u32 crc = ~(u32)0;
460 	unsigned offset = 0;
461 
462 	if (!io_ctl->check_crcs) {
463 		io_ctl_map_page(io_ctl, 0);
464 		return 0;
465 	}
466 
467 	if (index == 0)
468 		offset = sizeof(u32) * io_ctl->num_pages;
469 
470 	tmp = kmap(io_ctl->pages[0]);
471 	tmp += index;
472 	val = *tmp;
473 	kunmap(io_ctl->pages[0]);
474 
475 	io_ctl_map_page(io_ctl, 0);
476 	crc = btrfs_csum_data(io_ctl->orig + offset, crc,
477 			      PAGE_CACHE_SIZE - offset);
478 	btrfs_csum_final(crc, (char *)&crc);
479 	if (val != crc) {
480 		printk_ratelimited(KERN_ERR "BTRFS: csum mismatch on free "
481 				   "space cache\n");
482 		io_ctl_unmap_page(io_ctl);
483 		return -EIO;
484 	}
485 
486 	return 0;
487 }
488 
489 static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
490 			    void *bitmap)
491 {
492 	struct btrfs_free_space_entry *entry;
493 
494 	if (!io_ctl->cur)
495 		return -ENOSPC;
496 
497 	entry = io_ctl->cur;
498 	entry->offset = cpu_to_le64(offset);
499 	entry->bytes = cpu_to_le64(bytes);
500 	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
501 		BTRFS_FREE_SPACE_EXTENT;
502 	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
503 	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
504 
505 	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
506 		return 0;
507 
508 	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
509 
510 	/* No more pages to map */
511 	if (io_ctl->index >= io_ctl->num_pages)
512 		return 0;
513 
514 	/* map the next page */
515 	io_ctl_map_page(io_ctl, 1);
516 	return 0;
517 }
518 
519 static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
520 {
521 	if (!io_ctl->cur)
522 		return -ENOSPC;
523 
524 	/*
525 	 * If we aren't at the start of the current page, unmap this one and
526 	 * map the next one if there is any left.
527 	 */
528 	if (io_ctl->cur != io_ctl->orig) {
529 		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
530 		if (io_ctl->index >= io_ctl->num_pages)
531 			return -ENOSPC;
532 		io_ctl_map_page(io_ctl, 0);
533 	}
534 
535 	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
536 	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
537 	if (io_ctl->index < io_ctl->num_pages)
538 		io_ctl_map_page(io_ctl, 0);
539 	return 0;
540 }
541 
542 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
543 {
544 	/*
545 	 * If we're not on the boundary we know we've modified the page and we
546 	 * need to crc the page.
547 	 */
548 	if (io_ctl->cur != io_ctl->orig)
549 		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
550 	else
551 		io_ctl_unmap_page(io_ctl);
552 
553 	while (io_ctl->index < io_ctl->num_pages) {
554 		io_ctl_map_page(io_ctl, 1);
555 		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
556 	}
557 }
558 
559 static int io_ctl_read_entry(struct io_ctl *io_ctl,
560 			    struct btrfs_free_space *entry, u8 *type)
561 {
562 	struct btrfs_free_space_entry *e;
563 	int ret;
564 
565 	if (!io_ctl->cur) {
566 		ret = io_ctl_check_crc(io_ctl, io_ctl->index);
567 		if (ret)
568 			return ret;
569 	}
570 
571 	e = io_ctl->cur;
572 	entry->offset = le64_to_cpu(e->offset);
573 	entry->bytes = le64_to_cpu(e->bytes);
574 	*type = e->type;
575 	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
576 	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
577 
578 	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
579 		return 0;
580 
581 	io_ctl_unmap_page(io_ctl);
582 
583 	return 0;
584 }
585 
586 static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
587 			      struct btrfs_free_space *entry)
588 {
589 	int ret;
590 
591 	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
592 	if (ret)
593 		return ret;
594 
595 	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
596 	io_ctl_unmap_page(io_ctl);
597 
598 	return 0;
599 }
600 
601 /*
602  * Since we attach pinned extents after the fact we can have contiguous sections
603  * of free space that are split up in entries.  This poses a problem with the
604  * tree logging stuff since it could have allocated across what appears to be 2
605  * entries since we would have merged the entries when adding the pinned extents
606  * back to the free space cache.  So run through the space cache that we just
607  * loaded and merge contiguous entries.  This will make the log replay stuff not
608  * blow up and it will make for nicer allocator behavior.
609  */
610 static void merge_space_tree(struct btrfs_free_space_ctl *ctl)
611 {
612 	struct btrfs_free_space *e, *prev = NULL;
613 	struct rb_node *n;
614 
615 again:
616 	spin_lock(&ctl->tree_lock);
617 	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
618 		e = rb_entry(n, struct btrfs_free_space, offset_index);
619 		if (!prev)
620 			goto next;
621 		if (e->bitmap || prev->bitmap)
622 			goto next;
623 		if (prev->offset + prev->bytes == e->offset) {
624 			unlink_free_space(ctl, prev);
625 			unlink_free_space(ctl, e);
626 			prev->bytes += e->bytes;
627 			kmem_cache_free(btrfs_free_space_cachep, e);
628 			link_free_space(ctl, prev);
629 			prev = NULL;
630 			spin_unlock(&ctl->tree_lock);
631 			goto again;
632 		}
633 next:
634 		prev = e;
635 	}
636 	spin_unlock(&ctl->tree_lock);
637 }
638 
639 static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
640 				   struct btrfs_free_space_ctl *ctl,
641 				   struct btrfs_path *path, u64 offset)
642 {
643 	struct btrfs_free_space_header *header;
644 	struct extent_buffer *leaf;
645 	struct io_ctl io_ctl;
646 	struct btrfs_key key;
647 	struct btrfs_free_space *e, *n;
648 	struct list_head bitmaps;
649 	u64 num_entries;
650 	u64 num_bitmaps;
651 	u64 generation;
652 	u8 type;
653 	int ret = 0;
654 
655 	INIT_LIST_HEAD(&bitmaps);
656 
657 	/* Nothing in the space cache, goodbye */
658 	if (!i_size_read(inode))
659 		return 0;
660 
661 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
662 	key.offset = offset;
663 	key.type = 0;
664 
665 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
666 	if (ret < 0)
667 		return 0;
668 	else if (ret > 0) {
669 		btrfs_release_path(path);
670 		return 0;
671 	}
672 
673 	ret = -1;
674 
675 	leaf = path->nodes[0];
676 	header = btrfs_item_ptr(leaf, path->slots[0],
677 				struct btrfs_free_space_header);
678 	num_entries = btrfs_free_space_entries(leaf, header);
679 	num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
680 	generation = btrfs_free_space_generation(leaf, header);
681 	btrfs_release_path(path);
682 
683 	if (!BTRFS_I(inode)->generation) {
684 		btrfs_info(root->fs_info,
685 			   "The free space cache file (%llu) is invalid. skip it\n",
686 			   offset);
687 		return 0;
688 	}
689 
690 	if (BTRFS_I(inode)->generation != generation) {
691 		btrfs_err(root->fs_info,
692 			"free space inode generation (%llu) "
693 			"did not match free space cache generation (%llu)",
694 			BTRFS_I(inode)->generation, generation);
695 		return 0;
696 	}
697 
698 	if (!num_entries)
699 		return 0;
700 
701 	ret = io_ctl_init(&io_ctl, inode, root, 0);
702 	if (ret)
703 		return ret;
704 
705 	ret = readahead_cache(inode);
706 	if (ret)
707 		goto out;
708 
709 	ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
710 	if (ret)
711 		goto out;
712 
713 	ret = io_ctl_check_crc(&io_ctl, 0);
714 	if (ret)
715 		goto free_cache;
716 
717 	ret = io_ctl_check_generation(&io_ctl, generation);
718 	if (ret)
719 		goto free_cache;
720 
721 	while (num_entries) {
722 		e = kmem_cache_zalloc(btrfs_free_space_cachep,
723 				      GFP_NOFS);
724 		if (!e)
725 			goto free_cache;
726 
727 		ret = io_ctl_read_entry(&io_ctl, e, &type);
728 		if (ret) {
729 			kmem_cache_free(btrfs_free_space_cachep, e);
730 			goto free_cache;
731 		}
732 
733 		if (!e->bytes) {
734 			kmem_cache_free(btrfs_free_space_cachep, e);
735 			goto free_cache;
736 		}
737 
738 		if (type == BTRFS_FREE_SPACE_EXTENT) {
739 			spin_lock(&ctl->tree_lock);
740 			ret = link_free_space(ctl, e);
741 			spin_unlock(&ctl->tree_lock);
742 			if (ret) {
743 				btrfs_err(root->fs_info,
744 					"Duplicate entries in free space cache, dumping");
745 				kmem_cache_free(btrfs_free_space_cachep, e);
746 				goto free_cache;
747 			}
748 		} else {
749 			ASSERT(num_bitmaps);
750 			num_bitmaps--;
751 			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
752 			if (!e->bitmap) {
753 				kmem_cache_free(
754 					btrfs_free_space_cachep, e);
755 				goto free_cache;
756 			}
757 			spin_lock(&ctl->tree_lock);
758 			ret = link_free_space(ctl, e);
759 			ctl->total_bitmaps++;
760 			ctl->op->recalc_thresholds(ctl);
761 			spin_unlock(&ctl->tree_lock);
762 			if (ret) {
763 				btrfs_err(root->fs_info,
764 					"Duplicate entries in free space cache, dumping");
765 				kmem_cache_free(btrfs_free_space_cachep, e);
766 				goto free_cache;
767 			}
768 			list_add_tail(&e->list, &bitmaps);
769 		}
770 
771 		num_entries--;
772 	}
773 
774 	io_ctl_unmap_page(&io_ctl);
775 
776 	/*
777 	 * We add the bitmaps at the end of the entries in order that
778 	 * the bitmap entries are added to the cache.
779 	 */
780 	list_for_each_entry_safe(e, n, &bitmaps, list) {
781 		list_del_init(&e->list);
782 		ret = io_ctl_read_bitmap(&io_ctl, e);
783 		if (ret)
784 			goto free_cache;
785 	}
786 
787 	io_ctl_drop_pages(&io_ctl);
788 	merge_space_tree(ctl);
789 	ret = 1;
790 out:
791 	io_ctl_free(&io_ctl);
792 	return ret;
793 free_cache:
794 	io_ctl_drop_pages(&io_ctl);
795 	__btrfs_remove_free_space_cache(ctl);
796 	goto out;
797 }
798 
799 int load_free_space_cache(struct btrfs_fs_info *fs_info,
800 			  struct btrfs_block_group_cache *block_group)
801 {
802 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
803 	struct btrfs_root *root = fs_info->tree_root;
804 	struct inode *inode;
805 	struct btrfs_path *path;
806 	int ret = 0;
807 	bool matched;
808 	u64 used = btrfs_block_group_used(&block_group->item);
809 
810 	/*
811 	 * If this block group has been marked to be cleared for one reason or
812 	 * another then we can't trust the on disk cache, so just return.
813 	 */
814 	spin_lock(&block_group->lock);
815 	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
816 		spin_unlock(&block_group->lock);
817 		return 0;
818 	}
819 	spin_unlock(&block_group->lock);
820 
821 	path = btrfs_alloc_path();
822 	if (!path)
823 		return 0;
824 	path->search_commit_root = 1;
825 	path->skip_locking = 1;
826 
827 	inode = lookup_free_space_inode(root, block_group, path);
828 	if (IS_ERR(inode)) {
829 		btrfs_free_path(path);
830 		return 0;
831 	}
832 
833 	/* We may have converted the inode and made the cache invalid. */
834 	spin_lock(&block_group->lock);
835 	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
836 		spin_unlock(&block_group->lock);
837 		btrfs_free_path(path);
838 		goto out;
839 	}
840 	spin_unlock(&block_group->lock);
841 
842 	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
843 				      path, block_group->key.objectid);
844 	btrfs_free_path(path);
845 	if (ret <= 0)
846 		goto out;
847 
848 	spin_lock(&ctl->tree_lock);
849 	matched = (ctl->free_space == (block_group->key.offset - used -
850 				       block_group->bytes_super));
851 	spin_unlock(&ctl->tree_lock);
852 
853 	if (!matched) {
854 		__btrfs_remove_free_space_cache(ctl);
855 		btrfs_warn(fs_info, "block group %llu has wrong amount of free space",
856 			block_group->key.objectid);
857 		ret = -1;
858 	}
859 out:
860 	if (ret < 0) {
861 		/* This cache is bogus, make sure it gets cleared */
862 		spin_lock(&block_group->lock);
863 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
864 		spin_unlock(&block_group->lock);
865 		ret = 0;
866 
867 		btrfs_warn(fs_info, "failed to load free space cache for block group %llu, rebuild it now",
868 			block_group->key.objectid);
869 	}
870 
871 	iput(inode);
872 	return ret;
873 }
874 
875 static noinline_for_stack
876 int write_cache_extent_entries(struct io_ctl *io_ctl,
877 			      struct btrfs_free_space_ctl *ctl,
878 			      struct btrfs_block_group_cache *block_group,
879 			      int *entries, int *bitmaps,
880 			      struct list_head *bitmap_list)
881 {
882 	int ret;
883 	struct btrfs_free_cluster *cluster = NULL;
884 	struct rb_node *node = rb_first(&ctl->free_space_offset);
885 
886 	/* Get the cluster for this block_group if it exists */
887 	if (block_group && !list_empty(&block_group->cluster_list)) {
888 		cluster = list_entry(block_group->cluster_list.next,
889 				     struct btrfs_free_cluster,
890 				     block_group_list);
891 	}
892 
893 	if (!node && cluster) {
894 		node = rb_first(&cluster->root);
895 		cluster = NULL;
896 	}
897 
898 	/* Write out the extent entries */
899 	while (node) {
900 		struct btrfs_free_space *e;
901 
902 		e = rb_entry(node, struct btrfs_free_space, offset_index);
903 		*entries += 1;
904 
905 		ret = io_ctl_add_entry(io_ctl, e->offset, e->bytes,
906 				       e->bitmap);
907 		if (ret)
908 			goto fail;
909 
910 		if (e->bitmap) {
911 			list_add_tail(&e->list, bitmap_list);
912 			*bitmaps += 1;
913 		}
914 		node = rb_next(node);
915 		if (!node && cluster) {
916 			node = rb_first(&cluster->root);
917 			cluster = NULL;
918 		}
919 	}
920 	return 0;
921 fail:
922 	return -ENOSPC;
923 }
924 
925 static noinline_for_stack int
926 update_cache_item(struct btrfs_trans_handle *trans,
927 		  struct btrfs_root *root,
928 		  struct inode *inode,
929 		  struct btrfs_path *path, u64 offset,
930 		  int entries, int bitmaps)
931 {
932 	struct btrfs_key key;
933 	struct btrfs_free_space_header *header;
934 	struct extent_buffer *leaf;
935 	int ret;
936 
937 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
938 	key.offset = offset;
939 	key.type = 0;
940 
941 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
942 	if (ret < 0) {
943 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
944 				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
945 				 GFP_NOFS);
946 		goto fail;
947 	}
948 	leaf = path->nodes[0];
949 	if (ret > 0) {
950 		struct btrfs_key found_key;
951 		ASSERT(path->slots[0]);
952 		path->slots[0]--;
953 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
954 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
955 		    found_key.offset != offset) {
956 			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
957 					 inode->i_size - 1,
958 					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
959 					 NULL, GFP_NOFS);
960 			btrfs_release_path(path);
961 			goto fail;
962 		}
963 	}
964 
965 	BTRFS_I(inode)->generation = trans->transid;
966 	header = btrfs_item_ptr(leaf, path->slots[0],
967 				struct btrfs_free_space_header);
968 	btrfs_set_free_space_entries(leaf, header, entries);
969 	btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
970 	btrfs_set_free_space_generation(leaf, header, trans->transid);
971 	btrfs_mark_buffer_dirty(leaf);
972 	btrfs_release_path(path);
973 
974 	return 0;
975 
976 fail:
977 	return -1;
978 }
979 
980 static noinline_for_stack int
981 write_pinned_extent_entries(struct btrfs_root *root,
982 			    struct btrfs_block_group_cache *block_group,
983 			    struct io_ctl *io_ctl,
984 			    int *entries)
985 {
986 	u64 start, extent_start, extent_end, len;
987 	struct extent_io_tree *unpin = NULL;
988 	int ret;
989 
990 	if (!block_group)
991 		return 0;
992 
993 	/*
994 	 * We want to add any pinned extents to our free space cache
995 	 * so we don't leak the space
996 	 *
997 	 * We shouldn't have switched the pinned extents yet so this is the
998 	 * right one
999 	 */
1000 	unpin = root->fs_info->pinned_extents;
1001 
1002 	start = block_group->key.objectid;
1003 
1004 	while (start < block_group->key.objectid + block_group->key.offset) {
1005 		ret = find_first_extent_bit(unpin, start,
1006 					    &extent_start, &extent_end,
1007 					    EXTENT_DIRTY, NULL);
1008 		if (ret)
1009 			return 0;
1010 
1011 		/* This pinned extent is out of our range */
1012 		if (extent_start >= block_group->key.objectid +
1013 		    block_group->key.offset)
1014 			return 0;
1015 
1016 		extent_start = max(extent_start, start);
1017 		extent_end = min(block_group->key.objectid +
1018 				 block_group->key.offset, extent_end + 1);
1019 		len = extent_end - extent_start;
1020 
1021 		*entries += 1;
1022 		ret = io_ctl_add_entry(io_ctl, extent_start, len, NULL);
1023 		if (ret)
1024 			return -ENOSPC;
1025 
1026 		start = extent_end;
1027 	}
1028 
1029 	return 0;
1030 }
1031 
1032 static noinline_for_stack int
1033 write_bitmap_entries(struct io_ctl *io_ctl, struct list_head *bitmap_list)
1034 {
1035 	struct list_head *pos, *n;
1036 	int ret;
1037 
1038 	/* Write out the bitmaps */
1039 	list_for_each_safe(pos, n, bitmap_list) {
1040 		struct btrfs_free_space *entry =
1041 			list_entry(pos, struct btrfs_free_space, list);
1042 
1043 		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
1044 		if (ret)
1045 			return -ENOSPC;
1046 		list_del_init(&entry->list);
1047 	}
1048 
1049 	return 0;
1050 }
1051 
1052 static int flush_dirty_cache(struct inode *inode)
1053 {
1054 	int ret;
1055 
1056 	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
1057 	if (ret)
1058 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
1059 				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
1060 				 GFP_NOFS);
1061 
1062 	return ret;
1063 }
1064 
1065 static void noinline_for_stack
1066 cleanup_write_cache_enospc(struct inode *inode,
1067 			   struct io_ctl *io_ctl,
1068 			   struct extent_state **cached_state,
1069 			   struct list_head *bitmap_list)
1070 {
1071 	struct list_head *pos, *n;
1072 
1073 	list_for_each_safe(pos, n, bitmap_list) {
1074 		struct btrfs_free_space *entry =
1075 			list_entry(pos, struct btrfs_free_space, list);
1076 		list_del_init(&entry->list);
1077 	}
1078 	io_ctl_drop_pages(io_ctl);
1079 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1080 			     i_size_read(inode) - 1, cached_state,
1081 			     GFP_NOFS);
1082 }
1083 
1084 /**
1085  * __btrfs_write_out_cache - write out cached info to an inode
1086  * @root - the root the inode belongs to
1087  * @ctl - the free space cache we are going to write out
1088  * @block_group - the block_group for this cache if it belongs to a block_group
1089  * @trans - the trans handle
1090  * @path - the path to use
1091  * @offset - the offset for the key we'll insert
1092  *
1093  * This function writes out a free space cache struct to disk for quick recovery
1094  * on mount.  This will return 0 if it was successfull in writing the cache out,
1095  * and -1 if it was not.
1096  */
1097 static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
1098 				   struct btrfs_free_space_ctl *ctl,
1099 				   struct btrfs_block_group_cache *block_group,
1100 				   struct btrfs_trans_handle *trans,
1101 				   struct btrfs_path *path, u64 offset)
1102 {
1103 	struct extent_state *cached_state = NULL;
1104 	struct io_ctl io_ctl;
1105 	LIST_HEAD(bitmap_list);
1106 	int entries = 0;
1107 	int bitmaps = 0;
1108 	int ret;
1109 
1110 	if (!i_size_read(inode))
1111 		return -1;
1112 
1113 	ret = io_ctl_init(&io_ctl, inode, root, 1);
1114 	if (ret)
1115 		return -1;
1116 
1117 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)) {
1118 		down_write(&block_group->data_rwsem);
1119 		spin_lock(&block_group->lock);
1120 		if (block_group->delalloc_bytes) {
1121 			block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1122 			spin_unlock(&block_group->lock);
1123 			up_write(&block_group->data_rwsem);
1124 			BTRFS_I(inode)->generation = 0;
1125 			ret = 0;
1126 			goto out;
1127 		}
1128 		spin_unlock(&block_group->lock);
1129 	}
1130 
1131 	/* Lock all pages first so we can lock the extent safely. */
1132 	io_ctl_prepare_pages(&io_ctl, inode, 0);
1133 
1134 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
1135 			 0, &cached_state);
1136 
1137 	io_ctl_set_generation(&io_ctl, trans->transid);
1138 
1139 	/* Write out the extent entries in the free space cache */
1140 	ret = write_cache_extent_entries(&io_ctl, ctl,
1141 					 block_group, &entries, &bitmaps,
1142 					 &bitmap_list);
1143 	if (ret)
1144 		goto out_nospc;
1145 
1146 	/*
1147 	 * Some spaces that are freed in the current transaction are pinned,
1148 	 * they will be added into free space cache after the transaction is
1149 	 * committed, we shouldn't lose them.
1150 	 */
1151 	ret = write_pinned_extent_entries(root, block_group, &io_ctl, &entries);
1152 	if (ret)
1153 		goto out_nospc;
1154 
1155 	/* At last, we write out all the bitmaps. */
1156 	ret = write_bitmap_entries(&io_ctl, &bitmap_list);
1157 	if (ret)
1158 		goto out_nospc;
1159 
1160 	/* Zero out the rest of the pages just to make sure */
1161 	io_ctl_zero_remaining_pages(&io_ctl);
1162 
1163 	/* Everything is written out, now we dirty the pages in the file. */
1164 	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
1165 				0, i_size_read(inode), &cached_state);
1166 	if (ret)
1167 		goto out_nospc;
1168 
1169 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1170 		up_write(&block_group->data_rwsem);
1171 	/*
1172 	 * Release the pages and unlock the extent, we will flush
1173 	 * them out later
1174 	 */
1175 	io_ctl_drop_pages(&io_ctl);
1176 
1177 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1178 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1179 
1180 	/* Flush the dirty pages in the cache file. */
1181 	ret = flush_dirty_cache(inode);
1182 	if (ret)
1183 		goto out;
1184 
1185 	/* Update the cache item to tell everyone this cache file is valid. */
1186 	ret = update_cache_item(trans, root, inode, path, offset,
1187 				entries, bitmaps);
1188 out:
1189 	io_ctl_free(&io_ctl);
1190 	if (ret) {
1191 		invalidate_inode_pages2(inode->i_mapping);
1192 		BTRFS_I(inode)->generation = 0;
1193 	}
1194 	btrfs_update_inode(trans, root, inode);
1195 	return ret;
1196 
1197 out_nospc:
1198 	cleanup_write_cache_enospc(inode, &io_ctl, &cached_state, &bitmap_list);
1199 
1200 	if (block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA))
1201 		up_write(&block_group->data_rwsem);
1202 
1203 	goto out;
1204 }
1205 
1206 int btrfs_write_out_cache(struct btrfs_root *root,
1207 			  struct btrfs_trans_handle *trans,
1208 			  struct btrfs_block_group_cache *block_group,
1209 			  struct btrfs_path *path)
1210 {
1211 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1212 	struct inode *inode;
1213 	int ret = 0;
1214 
1215 	root = root->fs_info->tree_root;
1216 
1217 	spin_lock(&block_group->lock);
1218 	if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
1219 		spin_unlock(&block_group->lock);
1220 		return 0;
1221 	}
1222 
1223 	if (block_group->delalloc_bytes) {
1224 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
1225 		spin_unlock(&block_group->lock);
1226 		return 0;
1227 	}
1228 	spin_unlock(&block_group->lock);
1229 
1230 	inode = lookup_free_space_inode(root, block_group, path);
1231 	if (IS_ERR(inode))
1232 		return 0;
1233 
1234 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
1235 				      path, block_group->key.objectid);
1236 	if (ret) {
1237 		spin_lock(&block_group->lock);
1238 		block_group->disk_cache_state = BTRFS_DC_ERROR;
1239 		spin_unlock(&block_group->lock);
1240 		ret = 0;
1241 #ifdef DEBUG
1242 		btrfs_err(root->fs_info,
1243 			"failed to write free space cache for block group %llu",
1244 			block_group->key.objectid);
1245 #endif
1246 	}
1247 
1248 	iput(inode);
1249 	return ret;
1250 }
1251 
1252 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
1253 					  u64 offset)
1254 {
1255 	ASSERT(offset >= bitmap_start);
1256 	offset -= bitmap_start;
1257 	return (unsigned long)(div_u64(offset, unit));
1258 }
1259 
1260 static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
1261 {
1262 	return (unsigned long)(div_u64(bytes, unit));
1263 }
1264 
1265 static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
1266 				   u64 offset)
1267 {
1268 	u64 bitmap_start;
1269 	u64 bytes_per_bitmap;
1270 
1271 	bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
1272 	bitmap_start = offset - ctl->start;
1273 	bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
1274 	bitmap_start *= bytes_per_bitmap;
1275 	bitmap_start += ctl->start;
1276 
1277 	return bitmap_start;
1278 }
1279 
1280 static int tree_insert_offset(struct rb_root *root, u64 offset,
1281 			      struct rb_node *node, int bitmap)
1282 {
1283 	struct rb_node **p = &root->rb_node;
1284 	struct rb_node *parent = NULL;
1285 	struct btrfs_free_space *info;
1286 
1287 	while (*p) {
1288 		parent = *p;
1289 		info = rb_entry(parent, struct btrfs_free_space, offset_index);
1290 
1291 		if (offset < info->offset) {
1292 			p = &(*p)->rb_left;
1293 		} else if (offset > info->offset) {
1294 			p = &(*p)->rb_right;
1295 		} else {
1296 			/*
1297 			 * we could have a bitmap entry and an extent entry
1298 			 * share the same offset.  If this is the case, we want
1299 			 * the extent entry to always be found first if we do a
1300 			 * linear search through the tree, since we want to have
1301 			 * the quickest allocation time, and allocating from an
1302 			 * extent is faster than allocating from a bitmap.  So
1303 			 * if we're inserting a bitmap and we find an entry at
1304 			 * this offset, we want to go right, or after this entry
1305 			 * logically.  If we are inserting an extent and we've
1306 			 * found a bitmap, we want to go left, or before
1307 			 * logically.
1308 			 */
1309 			if (bitmap) {
1310 				if (info->bitmap) {
1311 					WARN_ON_ONCE(1);
1312 					return -EEXIST;
1313 				}
1314 				p = &(*p)->rb_right;
1315 			} else {
1316 				if (!info->bitmap) {
1317 					WARN_ON_ONCE(1);
1318 					return -EEXIST;
1319 				}
1320 				p = &(*p)->rb_left;
1321 			}
1322 		}
1323 	}
1324 
1325 	rb_link_node(node, parent, p);
1326 	rb_insert_color(node, root);
1327 
1328 	return 0;
1329 }
1330 
1331 /*
1332  * searches the tree for the given offset.
1333  *
1334  * fuzzy - If this is set, then we are trying to make an allocation, and we just
1335  * want a section that has at least bytes size and comes at or after the given
1336  * offset.
1337  */
1338 static struct btrfs_free_space *
1339 tree_search_offset(struct btrfs_free_space_ctl *ctl,
1340 		   u64 offset, int bitmap_only, int fuzzy)
1341 {
1342 	struct rb_node *n = ctl->free_space_offset.rb_node;
1343 	struct btrfs_free_space *entry, *prev = NULL;
1344 
1345 	/* find entry that is closest to the 'offset' */
1346 	while (1) {
1347 		if (!n) {
1348 			entry = NULL;
1349 			break;
1350 		}
1351 
1352 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1353 		prev = entry;
1354 
1355 		if (offset < entry->offset)
1356 			n = n->rb_left;
1357 		else if (offset > entry->offset)
1358 			n = n->rb_right;
1359 		else
1360 			break;
1361 	}
1362 
1363 	if (bitmap_only) {
1364 		if (!entry)
1365 			return NULL;
1366 		if (entry->bitmap)
1367 			return entry;
1368 
1369 		/*
1370 		 * bitmap entry and extent entry may share same offset,
1371 		 * in that case, bitmap entry comes after extent entry.
1372 		 */
1373 		n = rb_next(n);
1374 		if (!n)
1375 			return NULL;
1376 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1377 		if (entry->offset != offset)
1378 			return NULL;
1379 
1380 		WARN_ON(!entry->bitmap);
1381 		return entry;
1382 	} else if (entry) {
1383 		if (entry->bitmap) {
1384 			/*
1385 			 * if previous extent entry covers the offset,
1386 			 * we should return it instead of the bitmap entry
1387 			 */
1388 			n = rb_prev(&entry->offset_index);
1389 			if (n) {
1390 				prev = rb_entry(n, struct btrfs_free_space,
1391 						offset_index);
1392 				if (!prev->bitmap &&
1393 				    prev->offset + prev->bytes > offset)
1394 					entry = prev;
1395 			}
1396 		}
1397 		return entry;
1398 	}
1399 
1400 	if (!prev)
1401 		return NULL;
1402 
1403 	/* find last entry before the 'offset' */
1404 	entry = prev;
1405 	if (entry->offset > offset) {
1406 		n = rb_prev(&entry->offset_index);
1407 		if (n) {
1408 			entry = rb_entry(n, struct btrfs_free_space,
1409 					offset_index);
1410 			ASSERT(entry->offset <= offset);
1411 		} else {
1412 			if (fuzzy)
1413 				return entry;
1414 			else
1415 				return NULL;
1416 		}
1417 	}
1418 
1419 	if (entry->bitmap) {
1420 		n = rb_prev(&entry->offset_index);
1421 		if (n) {
1422 			prev = rb_entry(n, struct btrfs_free_space,
1423 					offset_index);
1424 			if (!prev->bitmap &&
1425 			    prev->offset + prev->bytes > offset)
1426 				return prev;
1427 		}
1428 		if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
1429 			return entry;
1430 	} else if (entry->offset + entry->bytes > offset)
1431 		return entry;
1432 
1433 	if (!fuzzy)
1434 		return NULL;
1435 
1436 	while (1) {
1437 		if (entry->bitmap) {
1438 			if (entry->offset + BITS_PER_BITMAP *
1439 			    ctl->unit > offset)
1440 				break;
1441 		} else {
1442 			if (entry->offset + entry->bytes > offset)
1443 				break;
1444 		}
1445 
1446 		n = rb_next(&entry->offset_index);
1447 		if (!n)
1448 			return NULL;
1449 		entry = rb_entry(n, struct btrfs_free_space, offset_index);
1450 	}
1451 	return entry;
1452 }
1453 
1454 static inline void
1455 __unlink_free_space(struct btrfs_free_space_ctl *ctl,
1456 		    struct btrfs_free_space *info)
1457 {
1458 	rb_erase(&info->offset_index, &ctl->free_space_offset);
1459 	ctl->free_extents--;
1460 }
1461 
1462 static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
1463 			      struct btrfs_free_space *info)
1464 {
1465 	__unlink_free_space(ctl, info);
1466 	ctl->free_space -= info->bytes;
1467 }
1468 
1469 static int link_free_space(struct btrfs_free_space_ctl *ctl,
1470 			   struct btrfs_free_space *info)
1471 {
1472 	int ret = 0;
1473 
1474 	ASSERT(info->bytes || info->bitmap);
1475 	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
1476 				 &info->offset_index, (info->bitmap != NULL));
1477 	if (ret)
1478 		return ret;
1479 
1480 	ctl->free_space += info->bytes;
1481 	ctl->free_extents++;
1482 	return ret;
1483 }
1484 
1485 static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
1486 {
1487 	struct btrfs_block_group_cache *block_group = ctl->private;
1488 	u64 max_bytes;
1489 	u64 bitmap_bytes;
1490 	u64 extent_bytes;
1491 	u64 size = block_group->key.offset;
1492 	u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit;
1493 	int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1494 
1495 	max_bitmaps = max(max_bitmaps, 1);
1496 
1497 	ASSERT(ctl->total_bitmaps <= max_bitmaps);
1498 
1499 	/*
1500 	 * The goal is to keep the total amount of memory used per 1gb of space
1501 	 * at or below 32k, so we need to adjust how much memory we allow to be
1502 	 * used by extent based free space tracking
1503 	 */
1504 	if (size < 1024 * 1024 * 1024)
1505 		max_bytes = MAX_CACHE_BYTES_PER_GIG;
1506 	else
1507 		max_bytes = MAX_CACHE_BYTES_PER_GIG *
1508 			div64_u64(size, 1024 * 1024 * 1024);
1509 
1510 	/*
1511 	 * we want to account for 1 more bitmap than what we have so we can make
1512 	 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
1513 	 * we add more bitmaps.
1514 	 */
1515 	bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
1516 
1517 	if (bitmap_bytes >= max_bytes) {
1518 		ctl->extents_thresh = 0;
1519 		return;
1520 	}
1521 
1522 	/*
1523 	 * we want the extent entry threshold to always be at most 1/2 the maxw
1524 	 * bytes we can have, or whatever is less than that.
1525 	 */
1526 	extent_bytes = max_bytes - bitmap_bytes;
1527 	extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
1528 
1529 	ctl->extents_thresh =
1530 		div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
1531 }
1532 
1533 static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1534 				       struct btrfs_free_space *info,
1535 				       u64 offset, u64 bytes)
1536 {
1537 	unsigned long start, count;
1538 
1539 	start = offset_to_bit(info->offset, ctl->unit, offset);
1540 	count = bytes_to_bits(bytes, ctl->unit);
1541 	ASSERT(start + count <= BITS_PER_BITMAP);
1542 
1543 	bitmap_clear(info->bitmap, start, count);
1544 
1545 	info->bytes -= bytes;
1546 }
1547 
1548 static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
1549 			      struct btrfs_free_space *info, u64 offset,
1550 			      u64 bytes)
1551 {
1552 	__bitmap_clear_bits(ctl, info, offset, bytes);
1553 	ctl->free_space -= bytes;
1554 }
1555 
1556 static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
1557 			    struct btrfs_free_space *info, u64 offset,
1558 			    u64 bytes)
1559 {
1560 	unsigned long start, count;
1561 
1562 	start = offset_to_bit(info->offset, ctl->unit, offset);
1563 	count = bytes_to_bits(bytes, ctl->unit);
1564 	ASSERT(start + count <= BITS_PER_BITMAP);
1565 
1566 	bitmap_set(info->bitmap, start, count);
1567 
1568 	info->bytes += bytes;
1569 	ctl->free_space += bytes;
1570 }
1571 
1572 /*
1573  * If we can not find suitable extent, we will use bytes to record
1574  * the size of the max extent.
1575  */
1576 static int search_bitmap(struct btrfs_free_space_ctl *ctl,
1577 			 struct btrfs_free_space *bitmap_info, u64 *offset,
1578 			 u64 *bytes)
1579 {
1580 	unsigned long found_bits = 0;
1581 	unsigned long max_bits = 0;
1582 	unsigned long bits, i;
1583 	unsigned long next_zero;
1584 	unsigned long extent_bits;
1585 
1586 	i = offset_to_bit(bitmap_info->offset, ctl->unit,
1587 			  max_t(u64, *offset, bitmap_info->offset));
1588 	bits = bytes_to_bits(*bytes, ctl->unit);
1589 
1590 	for_each_set_bit_from(i, bitmap_info->bitmap, BITS_PER_BITMAP) {
1591 		next_zero = find_next_zero_bit(bitmap_info->bitmap,
1592 					       BITS_PER_BITMAP, i);
1593 		extent_bits = next_zero - i;
1594 		if (extent_bits >= bits) {
1595 			found_bits = extent_bits;
1596 			break;
1597 		} else if (extent_bits > max_bits) {
1598 			max_bits = extent_bits;
1599 		}
1600 		i = next_zero;
1601 	}
1602 
1603 	if (found_bits) {
1604 		*offset = (u64)(i * ctl->unit) + bitmap_info->offset;
1605 		*bytes = (u64)(found_bits) * ctl->unit;
1606 		return 0;
1607 	}
1608 
1609 	*bytes = (u64)(max_bits) * ctl->unit;
1610 	return -1;
1611 }
1612 
1613 /* Cache the size of the max extent in bytes */
1614 static struct btrfs_free_space *
1615 find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes,
1616 		unsigned long align, u64 *max_extent_size)
1617 {
1618 	struct btrfs_free_space *entry;
1619 	struct rb_node *node;
1620 	u64 tmp;
1621 	u64 align_off;
1622 	int ret;
1623 
1624 	if (!ctl->free_space_offset.rb_node)
1625 		goto out;
1626 
1627 	entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
1628 	if (!entry)
1629 		goto out;
1630 
1631 	for (node = &entry->offset_index; node; node = rb_next(node)) {
1632 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
1633 		if (entry->bytes < *bytes) {
1634 			if (entry->bytes > *max_extent_size)
1635 				*max_extent_size = entry->bytes;
1636 			continue;
1637 		}
1638 
1639 		/* make sure the space returned is big enough
1640 		 * to match our requested alignment
1641 		 */
1642 		if (*bytes >= align) {
1643 			tmp = entry->offset - ctl->start + align - 1;
1644 			do_div(tmp, align);
1645 			tmp = tmp * align + ctl->start;
1646 			align_off = tmp - entry->offset;
1647 		} else {
1648 			align_off = 0;
1649 			tmp = entry->offset;
1650 		}
1651 
1652 		if (entry->bytes < *bytes + align_off) {
1653 			if (entry->bytes > *max_extent_size)
1654 				*max_extent_size = entry->bytes;
1655 			continue;
1656 		}
1657 
1658 		if (entry->bitmap) {
1659 			u64 size = *bytes;
1660 
1661 			ret = search_bitmap(ctl, entry, &tmp, &size);
1662 			if (!ret) {
1663 				*offset = tmp;
1664 				*bytes = size;
1665 				return entry;
1666 			} else if (size > *max_extent_size) {
1667 				*max_extent_size = size;
1668 			}
1669 			continue;
1670 		}
1671 
1672 		*offset = tmp;
1673 		*bytes = entry->bytes - align_off;
1674 		return entry;
1675 	}
1676 out:
1677 	return NULL;
1678 }
1679 
1680 static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
1681 			   struct btrfs_free_space *info, u64 offset)
1682 {
1683 	info->offset = offset_to_bitmap(ctl, offset);
1684 	info->bytes = 0;
1685 	INIT_LIST_HEAD(&info->list);
1686 	link_free_space(ctl, info);
1687 	ctl->total_bitmaps++;
1688 
1689 	ctl->op->recalc_thresholds(ctl);
1690 }
1691 
1692 static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1693 			struct btrfs_free_space *bitmap_info)
1694 {
1695 	unlink_free_space(ctl, bitmap_info);
1696 	kfree(bitmap_info->bitmap);
1697 	kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1698 	ctl->total_bitmaps--;
1699 	ctl->op->recalc_thresholds(ctl);
1700 }
1701 
1702 static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
1703 			      struct btrfs_free_space *bitmap_info,
1704 			      u64 *offset, u64 *bytes)
1705 {
1706 	u64 end;
1707 	u64 search_start, search_bytes;
1708 	int ret;
1709 
1710 again:
1711 	end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
1712 
1713 	/*
1714 	 * We need to search for bits in this bitmap.  We could only cover some
1715 	 * of the extent in this bitmap thanks to how we add space, so we need
1716 	 * to search for as much as it as we can and clear that amount, and then
1717 	 * go searching for the next bit.
1718 	 */
1719 	search_start = *offset;
1720 	search_bytes = ctl->unit;
1721 	search_bytes = min(search_bytes, end - search_start + 1);
1722 	ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
1723 	if (ret < 0 || search_start != *offset)
1724 		return -EINVAL;
1725 
1726 	/* We may have found more bits than what we need */
1727 	search_bytes = min(search_bytes, *bytes);
1728 
1729 	/* Cannot clear past the end of the bitmap */
1730 	search_bytes = min(search_bytes, end - search_start + 1);
1731 
1732 	bitmap_clear_bits(ctl, bitmap_info, search_start, search_bytes);
1733 	*offset += search_bytes;
1734 	*bytes -= search_bytes;
1735 
1736 	if (*bytes) {
1737 		struct rb_node *next = rb_next(&bitmap_info->offset_index);
1738 		if (!bitmap_info->bytes)
1739 			free_bitmap(ctl, bitmap_info);
1740 
1741 		/*
1742 		 * no entry after this bitmap, but we still have bytes to
1743 		 * remove, so something has gone wrong.
1744 		 */
1745 		if (!next)
1746 			return -EINVAL;
1747 
1748 		bitmap_info = rb_entry(next, struct btrfs_free_space,
1749 				       offset_index);
1750 
1751 		/*
1752 		 * if the next entry isn't a bitmap we need to return to let the
1753 		 * extent stuff do its work.
1754 		 */
1755 		if (!bitmap_info->bitmap)
1756 			return -EAGAIN;
1757 
1758 		/*
1759 		 * Ok the next item is a bitmap, but it may not actually hold
1760 		 * the information for the rest of this free space stuff, so
1761 		 * look for it, and if we don't find it return so we can try
1762 		 * everything over again.
1763 		 */
1764 		search_start = *offset;
1765 		search_bytes = ctl->unit;
1766 		ret = search_bitmap(ctl, bitmap_info, &search_start,
1767 				    &search_bytes);
1768 		if (ret < 0 || search_start != *offset)
1769 			return -EAGAIN;
1770 
1771 		goto again;
1772 	} else if (!bitmap_info->bytes)
1773 		free_bitmap(ctl, bitmap_info);
1774 
1775 	return 0;
1776 }
1777 
1778 static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
1779 			       struct btrfs_free_space *info, u64 offset,
1780 			       u64 bytes)
1781 {
1782 	u64 bytes_to_set = 0;
1783 	u64 end;
1784 
1785 	end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1786 
1787 	bytes_to_set = min(end - offset, bytes);
1788 
1789 	bitmap_set_bits(ctl, info, offset, bytes_to_set);
1790 
1791 	return bytes_to_set;
1792 
1793 }
1794 
1795 static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1796 		      struct btrfs_free_space *info)
1797 {
1798 	struct btrfs_block_group_cache *block_group = ctl->private;
1799 
1800 	/*
1801 	 * If we are below the extents threshold then we can add this as an
1802 	 * extent, and don't have to deal with the bitmap
1803 	 */
1804 	if (ctl->free_extents < ctl->extents_thresh) {
1805 		/*
1806 		 * If this block group has some small extents we don't want to
1807 		 * use up all of our free slots in the cache with them, we want
1808 		 * to reserve them to larger extents, however if we have plent
1809 		 * of cache left then go ahead an dadd them, no sense in adding
1810 		 * the overhead of a bitmap if we don't have to.
1811 		 */
1812 		if (info->bytes <= block_group->sectorsize * 4) {
1813 			if (ctl->free_extents * 2 <= ctl->extents_thresh)
1814 				return false;
1815 		} else {
1816 			return false;
1817 		}
1818 	}
1819 
1820 	/*
1821 	 * The original block groups from mkfs can be really small, like 8
1822 	 * megabytes, so don't bother with a bitmap for those entries.  However
1823 	 * some block groups can be smaller than what a bitmap would cover but
1824 	 * are still large enough that they could overflow the 32k memory limit,
1825 	 * so allow those block groups to still be allowed to have a bitmap
1826 	 * entry.
1827 	 */
1828 	if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset)
1829 		return false;
1830 
1831 	return true;
1832 }
1833 
1834 static struct btrfs_free_space_op free_space_op = {
1835 	.recalc_thresholds	= recalculate_thresholds,
1836 	.use_bitmap		= use_bitmap,
1837 };
1838 
1839 static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1840 			      struct btrfs_free_space *info)
1841 {
1842 	struct btrfs_free_space *bitmap_info;
1843 	struct btrfs_block_group_cache *block_group = NULL;
1844 	int added = 0;
1845 	u64 bytes, offset, bytes_added;
1846 	int ret;
1847 
1848 	bytes = info->bytes;
1849 	offset = info->offset;
1850 
1851 	if (!ctl->op->use_bitmap(ctl, info))
1852 		return 0;
1853 
1854 	if (ctl->op == &free_space_op)
1855 		block_group = ctl->private;
1856 again:
1857 	/*
1858 	 * Since we link bitmaps right into the cluster we need to see if we
1859 	 * have a cluster here, and if so and it has our bitmap we need to add
1860 	 * the free space to that bitmap.
1861 	 */
1862 	if (block_group && !list_empty(&block_group->cluster_list)) {
1863 		struct btrfs_free_cluster *cluster;
1864 		struct rb_node *node;
1865 		struct btrfs_free_space *entry;
1866 
1867 		cluster = list_entry(block_group->cluster_list.next,
1868 				     struct btrfs_free_cluster,
1869 				     block_group_list);
1870 		spin_lock(&cluster->lock);
1871 		node = rb_first(&cluster->root);
1872 		if (!node) {
1873 			spin_unlock(&cluster->lock);
1874 			goto no_cluster_bitmap;
1875 		}
1876 
1877 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
1878 		if (!entry->bitmap) {
1879 			spin_unlock(&cluster->lock);
1880 			goto no_cluster_bitmap;
1881 		}
1882 
1883 		if (entry->offset == offset_to_bitmap(ctl, offset)) {
1884 			bytes_added = add_bytes_to_bitmap(ctl, entry,
1885 							  offset, bytes);
1886 			bytes -= bytes_added;
1887 			offset += bytes_added;
1888 		}
1889 		spin_unlock(&cluster->lock);
1890 		if (!bytes) {
1891 			ret = 1;
1892 			goto out;
1893 		}
1894 	}
1895 
1896 no_cluster_bitmap:
1897 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
1898 					 1, 0);
1899 	if (!bitmap_info) {
1900 		ASSERT(added == 0);
1901 		goto new_bitmap;
1902 	}
1903 
1904 	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
1905 	bytes -= bytes_added;
1906 	offset += bytes_added;
1907 	added = 0;
1908 
1909 	if (!bytes) {
1910 		ret = 1;
1911 		goto out;
1912 	} else
1913 		goto again;
1914 
1915 new_bitmap:
1916 	if (info && info->bitmap) {
1917 		add_new_bitmap(ctl, info, offset);
1918 		added = 1;
1919 		info = NULL;
1920 		goto again;
1921 	} else {
1922 		spin_unlock(&ctl->tree_lock);
1923 
1924 		/* no pre-allocated info, allocate a new one */
1925 		if (!info) {
1926 			info = kmem_cache_zalloc(btrfs_free_space_cachep,
1927 						 GFP_NOFS);
1928 			if (!info) {
1929 				spin_lock(&ctl->tree_lock);
1930 				ret = -ENOMEM;
1931 				goto out;
1932 			}
1933 		}
1934 
1935 		/* allocate the bitmap */
1936 		info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
1937 		spin_lock(&ctl->tree_lock);
1938 		if (!info->bitmap) {
1939 			ret = -ENOMEM;
1940 			goto out;
1941 		}
1942 		goto again;
1943 	}
1944 
1945 out:
1946 	if (info) {
1947 		if (info->bitmap)
1948 			kfree(info->bitmap);
1949 		kmem_cache_free(btrfs_free_space_cachep, info);
1950 	}
1951 
1952 	return ret;
1953 }
1954 
1955 static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
1956 			  struct btrfs_free_space *info, bool update_stat)
1957 {
1958 	struct btrfs_free_space *left_info;
1959 	struct btrfs_free_space *right_info;
1960 	bool merged = false;
1961 	u64 offset = info->offset;
1962 	u64 bytes = info->bytes;
1963 
1964 	/*
1965 	 * first we want to see if there is free space adjacent to the range we
1966 	 * are adding, if there is remove that struct and add a new one to
1967 	 * cover the entire range
1968 	 */
1969 	right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
1970 	if (right_info && rb_prev(&right_info->offset_index))
1971 		left_info = rb_entry(rb_prev(&right_info->offset_index),
1972 				     struct btrfs_free_space, offset_index);
1973 	else
1974 		left_info = tree_search_offset(ctl, offset - 1, 0, 0);
1975 
1976 	if (right_info && !right_info->bitmap) {
1977 		if (update_stat)
1978 			unlink_free_space(ctl, right_info);
1979 		else
1980 			__unlink_free_space(ctl, right_info);
1981 		info->bytes += right_info->bytes;
1982 		kmem_cache_free(btrfs_free_space_cachep, right_info);
1983 		merged = true;
1984 	}
1985 
1986 	if (left_info && !left_info->bitmap &&
1987 	    left_info->offset + left_info->bytes == offset) {
1988 		if (update_stat)
1989 			unlink_free_space(ctl, left_info);
1990 		else
1991 			__unlink_free_space(ctl, left_info);
1992 		info->offset = left_info->offset;
1993 		info->bytes += left_info->bytes;
1994 		kmem_cache_free(btrfs_free_space_cachep, left_info);
1995 		merged = true;
1996 	}
1997 
1998 	return merged;
1999 }
2000 
2001 int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
2002 			   u64 offset, u64 bytes)
2003 {
2004 	struct btrfs_free_space *info;
2005 	int ret = 0;
2006 
2007 	info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
2008 	if (!info)
2009 		return -ENOMEM;
2010 
2011 	info->offset = offset;
2012 	info->bytes = bytes;
2013 
2014 	spin_lock(&ctl->tree_lock);
2015 
2016 	if (try_merge_free_space(ctl, info, true))
2017 		goto link;
2018 
2019 	/*
2020 	 * There was no extent directly to the left or right of this new
2021 	 * extent then we know we're going to have to allocate a new extent, so
2022 	 * before we do that see if we need to drop this into a bitmap
2023 	 */
2024 	ret = insert_into_bitmap(ctl, info);
2025 	if (ret < 0) {
2026 		goto out;
2027 	} else if (ret) {
2028 		ret = 0;
2029 		goto out;
2030 	}
2031 link:
2032 	ret = link_free_space(ctl, info);
2033 	if (ret)
2034 		kmem_cache_free(btrfs_free_space_cachep, info);
2035 out:
2036 	spin_unlock(&ctl->tree_lock);
2037 
2038 	if (ret) {
2039 		printk(KERN_CRIT "BTRFS: unable to add free space :%d\n", ret);
2040 		ASSERT(ret != -EEXIST);
2041 	}
2042 
2043 	return ret;
2044 }
2045 
2046 int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
2047 			    u64 offset, u64 bytes)
2048 {
2049 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2050 	struct btrfs_free_space *info;
2051 	int ret;
2052 	bool re_search = false;
2053 
2054 	spin_lock(&ctl->tree_lock);
2055 
2056 again:
2057 	ret = 0;
2058 	if (!bytes)
2059 		goto out_lock;
2060 
2061 	info = tree_search_offset(ctl, offset, 0, 0);
2062 	if (!info) {
2063 		/*
2064 		 * oops didn't find an extent that matched the space we wanted
2065 		 * to remove, look for a bitmap instead
2066 		 */
2067 		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
2068 					  1, 0);
2069 		if (!info) {
2070 			/*
2071 			 * If we found a partial bit of our free space in a
2072 			 * bitmap but then couldn't find the other part this may
2073 			 * be a problem, so WARN about it.
2074 			 */
2075 			WARN_ON(re_search);
2076 			goto out_lock;
2077 		}
2078 	}
2079 
2080 	re_search = false;
2081 	if (!info->bitmap) {
2082 		unlink_free_space(ctl, info);
2083 		if (offset == info->offset) {
2084 			u64 to_free = min(bytes, info->bytes);
2085 
2086 			info->bytes -= to_free;
2087 			info->offset += to_free;
2088 			if (info->bytes) {
2089 				ret = link_free_space(ctl, info);
2090 				WARN_ON(ret);
2091 			} else {
2092 				kmem_cache_free(btrfs_free_space_cachep, info);
2093 			}
2094 
2095 			offset += to_free;
2096 			bytes -= to_free;
2097 			goto again;
2098 		} else {
2099 			u64 old_end = info->bytes + info->offset;
2100 
2101 			info->bytes = offset - info->offset;
2102 			ret = link_free_space(ctl, info);
2103 			WARN_ON(ret);
2104 			if (ret)
2105 				goto out_lock;
2106 
2107 			/* Not enough bytes in this entry to satisfy us */
2108 			if (old_end < offset + bytes) {
2109 				bytes -= old_end - offset;
2110 				offset = old_end;
2111 				goto again;
2112 			} else if (old_end == offset + bytes) {
2113 				/* all done */
2114 				goto out_lock;
2115 			}
2116 			spin_unlock(&ctl->tree_lock);
2117 
2118 			ret = btrfs_add_free_space(block_group, offset + bytes,
2119 						   old_end - (offset + bytes));
2120 			WARN_ON(ret);
2121 			goto out;
2122 		}
2123 	}
2124 
2125 	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
2126 	if (ret == -EAGAIN) {
2127 		re_search = true;
2128 		goto again;
2129 	}
2130 out_lock:
2131 	spin_unlock(&ctl->tree_lock);
2132 out:
2133 	return ret;
2134 }
2135 
2136 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
2137 			   u64 bytes)
2138 {
2139 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2140 	struct btrfs_free_space *info;
2141 	struct rb_node *n;
2142 	int count = 0;
2143 
2144 	for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
2145 		info = rb_entry(n, struct btrfs_free_space, offset_index);
2146 		if (info->bytes >= bytes && !block_group->ro)
2147 			count++;
2148 		btrfs_crit(block_group->fs_info,
2149 			   "entry offset %llu, bytes %llu, bitmap %s",
2150 			   info->offset, info->bytes,
2151 		       (info->bitmap) ? "yes" : "no");
2152 	}
2153 	btrfs_info(block_group->fs_info, "block group has cluster?: %s",
2154 	       list_empty(&block_group->cluster_list) ? "no" : "yes");
2155 	btrfs_info(block_group->fs_info,
2156 		   "%d blocks of free space at or bigger than bytes is", count);
2157 }
2158 
2159 void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
2160 {
2161 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2162 
2163 	spin_lock_init(&ctl->tree_lock);
2164 	ctl->unit = block_group->sectorsize;
2165 	ctl->start = block_group->key.objectid;
2166 	ctl->private = block_group;
2167 	ctl->op = &free_space_op;
2168 
2169 	/*
2170 	 * we only want to have 32k of ram per block group for keeping
2171 	 * track of free space, and if we pass 1/2 of that we want to
2172 	 * start converting things over to using bitmaps
2173 	 */
2174 	ctl->extents_thresh = ((1024 * 32) / 2) /
2175 				sizeof(struct btrfs_free_space);
2176 }
2177 
2178 /*
2179  * for a given cluster, put all of its extents back into the free
2180  * space cache.  If the block group passed doesn't match the block group
2181  * pointed to by the cluster, someone else raced in and freed the
2182  * cluster already.  In that case, we just return without changing anything
2183  */
2184 static int
2185 __btrfs_return_cluster_to_free_space(
2186 			     struct btrfs_block_group_cache *block_group,
2187 			     struct btrfs_free_cluster *cluster)
2188 {
2189 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2190 	struct btrfs_free_space *entry;
2191 	struct rb_node *node;
2192 
2193 	spin_lock(&cluster->lock);
2194 	if (cluster->block_group != block_group)
2195 		goto out;
2196 
2197 	cluster->block_group = NULL;
2198 	cluster->window_start = 0;
2199 	list_del_init(&cluster->block_group_list);
2200 
2201 	node = rb_first(&cluster->root);
2202 	while (node) {
2203 		bool bitmap;
2204 
2205 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2206 		node = rb_next(&entry->offset_index);
2207 		rb_erase(&entry->offset_index, &cluster->root);
2208 
2209 		bitmap = (entry->bitmap != NULL);
2210 		if (!bitmap)
2211 			try_merge_free_space(ctl, entry, false);
2212 		tree_insert_offset(&ctl->free_space_offset,
2213 				   entry->offset, &entry->offset_index, bitmap);
2214 	}
2215 	cluster->root = RB_ROOT;
2216 
2217 out:
2218 	spin_unlock(&cluster->lock);
2219 	btrfs_put_block_group(block_group);
2220 	return 0;
2221 }
2222 
2223 static void __btrfs_remove_free_space_cache_locked(
2224 				struct btrfs_free_space_ctl *ctl)
2225 {
2226 	struct btrfs_free_space *info;
2227 	struct rb_node *node;
2228 
2229 	while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
2230 		info = rb_entry(node, struct btrfs_free_space, offset_index);
2231 		if (!info->bitmap) {
2232 			unlink_free_space(ctl, info);
2233 			kmem_cache_free(btrfs_free_space_cachep, info);
2234 		} else {
2235 			free_bitmap(ctl, info);
2236 		}
2237 		if (need_resched()) {
2238 			spin_unlock(&ctl->tree_lock);
2239 			cond_resched();
2240 			spin_lock(&ctl->tree_lock);
2241 		}
2242 	}
2243 }
2244 
2245 void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
2246 {
2247 	spin_lock(&ctl->tree_lock);
2248 	__btrfs_remove_free_space_cache_locked(ctl);
2249 	spin_unlock(&ctl->tree_lock);
2250 }
2251 
2252 void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
2253 {
2254 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2255 	struct btrfs_free_cluster *cluster;
2256 	struct list_head *head;
2257 
2258 	spin_lock(&ctl->tree_lock);
2259 	while ((head = block_group->cluster_list.next) !=
2260 	       &block_group->cluster_list) {
2261 		cluster = list_entry(head, struct btrfs_free_cluster,
2262 				     block_group_list);
2263 
2264 		WARN_ON(cluster->block_group != block_group);
2265 		__btrfs_return_cluster_to_free_space(block_group, cluster);
2266 		if (need_resched()) {
2267 			spin_unlock(&ctl->tree_lock);
2268 			cond_resched();
2269 			spin_lock(&ctl->tree_lock);
2270 		}
2271 	}
2272 	__btrfs_remove_free_space_cache_locked(ctl);
2273 	spin_unlock(&ctl->tree_lock);
2274 
2275 }
2276 
2277 u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
2278 			       u64 offset, u64 bytes, u64 empty_size,
2279 			       u64 *max_extent_size)
2280 {
2281 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2282 	struct btrfs_free_space *entry = NULL;
2283 	u64 bytes_search = bytes + empty_size;
2284 	u64 ret = 0;
2285 	u64 align_gap = 0;
2286 	u64 align_gap_len = 0;
2287 
2288 	spin_lock(&ctl->tree_lock);
2289 	entry = find_free_space(ctl, &offset, &bytes_search,
2290 				block_group->full_stripe_len, max_extent_size);
2291 	if (!entry)
2292 		goto out;
2293 
2294 	ret = offset;
2295 	if (entry->bitmap) {
2296 		bitmap_clear_bits(ctl, entry, offset, bytes);
2297 		if (!entry->bytes)
2298 			free_bitmap(ctl, entry);
2299 	} else {
2300 		unlink_free_space(ctl, entry);
2301 		align_gap_len = offset - entry->offset;
2302 		align_gap = entry->offset;
2303 
2304 		entry->offset = offset + bytes;
2305 		WARN_ON(entry->bytes < bytes + align_gap_len);
2306 
2307 		entry->bytes -= bytes + align_gap_len;
2308 		if (!entry->bytes)
2309 			kmem_cache_free(btrfs_free_space_cachep, entry);
2310 		else
2311 			link_free_space(ctl, entry);
2312 	}
2313 out:
2314 	spin_unlock(&ctl->tree_lock);
2315 
2316 	if (align_gap_len)
2317 		__btrfs_add_free_space(ctl, align_gap, align_gap_len);
2318 	return ret;
2319 }
2320 
2321 /*
2322  * given a cluster, put all of its extents back into the free space
2323  * cache.  If a block group is passed, this function will only free
2324  * a cluster that belongs to the passed block group.
2325  *
2326  * Otherwise, it'll get a reference on the block group pointed to by the
2327  * cluster and remove the cluster from it.
2328  */
2329 int btrfs_return_cluster_to_free_space(
2330 			       struct btrfs_block_group_cache *block_group,
2331 			       struct btrfs_free_cluster *cluster)
2332 {
2333 	struct btrfs_free_space_ctl *ctl;
2334 	int ret;
2335 
2336 	/* first, get a safe pointer to the block group */
2337 	spin_lock(&cluster->lock);
2338 	if (!block_group) {
2339 		block_group = cluster->block_group;
2340 		if (!block_group) {
2341 			spin_unlock(&cluster->lock);
2342 			return 0;
2343 		}
2344 	} else if (cluster->block_group != block_group) {
2345 		/* someone else has already freed it don't redo their work */
2346 		spin_unlock(&cluster->lock);
2347 		return 0;
2348 	}
2349 	atomic_inc(&block_group->count);
2350 	spin_unlock(&cluster->lock);
2351 
2352 	ctl = block_group->free_space_ctl;
2353 
2354 	/* now return any extents the cluster had on it */
2355 	spin_lock(&ctl->tree_lock);
2356 	ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
2357 	spin_unlock(&ctl->tree_lock);
2358 
2359 	/* finally drop our ref */
2360 	btrfs_put_block_group(block_group);
2361 	return ret;
2362 }
2363 
2364 static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
2365 				   struct btrfs_free_cluster *cluster,
2366 				   struct btrfs_free_space *entry,
2367 				   u64 bytes, u64 min_start,
2368 				   u64 *max_extent_size)
2369 {
2370 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2371 	int err;
2372 	u64 search_start = cluster->window_start;
2373 	u64 search_bytes = bytes;
2374 	u64 ret = 0;
2375 
2376 	search_start = min_start;
2377 	search_bytes = bytes;
2378 
2379 	err = search_bitmap(ctl, entry, &search_start, &search_bytes);
2380 	if (err) {
2381 		if (search_bytes > *max_extent_size)
2382 			*max_extent_size = search_bytes;
2383 		return 0;
2384 	}
2385 
2386 	ret = search_start;
2387 	__bitmap_clear_bits(ctl, entry, ret, bytes);
2388 
2389 	return ret;
2390 }
2391 
2392 /*
2393  * given a cluster, try to allocate 'bytes' from it, returns 0
2394  * if it couldn't find anything suitably large, or a logical disk offset
2395  * if things worked out
2396  */
2397 u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
2398 			     struct btrfs_free_cluster *cluster, u64 bytes,
2399 			     u64 min_start, u64 *max_extent_size)
2400 {
2401 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2402 	struct btrfs_free_space *entry = NULL;
2403 	struct rb_node *node;
2404 	u64 ret = 0;
2405 
2406 	spin_lock(&cluster->lock);
2407 	if (bytes > cluster->max_size)
2408 		goto out;
2409 
2410 	if (cluster->block_group != block_group)
2411 		goto out;
2412 
2413 	node = rb_first(&cluster->root);
2414 	if (!node)
2415 		goto out;
2416 
2417 	entry = rb_entry(node, struct btrfs_free_space, offset_index);
2418 	while (1) {
2419 		if (entry->bytes < bytes && entry->bytes > *max_extent_size)
2420 			*max_extent_size = entry->bytes;
2421 
2422 		if (entry->bytes < bytes ||
2423 		    (!entry->bitmap && entry->offset < min_start)) {
2424 			node = rb_next(&entry->offset_index);
2425 			if (!node)
2426 				break;
2427 			entry = rb_entry(node, struct btrfs_free_space,
2428 					 offset_index);
2429 			continue;
2430 		}
2431 
2432 		if (entry->bitmap) {
2433 			ret = btrfs_alloc_from_bitmap(block_group,
2434 						      cluster, entry, bytes,
2435 						      cluster->window_start,
2436 						      max_extent_size);
2437 			if (ret == 0) {
2438 				node = rb_next(&entry->offset_index);
2439 				if (!node)
2440 					break;
2441 				entry = rb_entry(node, struct btrfs_free_space,
2442 						 offset_index);
2443 				continue;
2444 			}
2445 			cluster->window_start += bytes;
2446 		} else {
2447 			ret = entry->offset;
2448 
2449 			entry->offset += bytes;
2450 			entry->bytes -= bytes;
2451 		}
2452 
2453 		if (entry->bytes == 0)
2454 			rb_erase(&entry->offset_index, &cluster->root);
2455 		break;
2456 	}
2457 out:
2458 	spin_unlock(&cluster->lock);
2459 
2460 	if (!ret)
2461 		return 0;
2462 
2463 	spin_lock(&ctl->tree_lock);
2464 
2465 	ctl->free_space -= bytes;
2466 	if (entry->bytes == 0) {
2467 		ctl->free_extents--;
2468 		if (entry->bitmap) {
2469 			kfree(entry->bitmap);
2470 			ctl->total_bitmaps--;
2471 			ctl->op->recalc_thresholds(ctl);
2472 		}
2473 		kmem_cache_free(btrfs_free_space_cachep, entry);
2474 	}
2475 
2476 	spin_unlock(&ctl->tree_lock);
2477 
2478 	return ret;
2479 }
2480 
2481 static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
2482 				struct btrfs_free_space *entry,
2483 				struct btrfs_free_cluster *cluster,
2484 				u64 offset, u64 bytes,
2485 				u64 cont1_bytes, u64 min_bytes)
2486 {
2487 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2488 	unsigned long next_zero;
2489 	unsigned long i;
2490 	unsigned long want_bits;
2491 	unsigned long min_bits;
2492 	unsigned long found_bits;
2493 	unsigned long start = 0;
2494 	unsigned long total_found = 0;
2495 	int ret;
2496 
2497 	i = offset_to_bit(entry->offset, ctl->unit,
2498 			  max_t(u64, offset, entry->offset));
2499 	want_bits = bytes_to_bits(bytes, ctl->unit);
2500 	min_bits = bytes_to_bits(min_bytes, ctl->unit);
2501 
2502 again:
2503 	found_bits = 0;
2504 	for_each_set_bit_from(i, entry->bitmap, BITS_PER_BITMAP) {
2505 		next_zero = find_next_zero_bit(entry->bitmap,
2506 					       BITS_PER_BITMAP, i);
2507 		if (next_zero - i >= min_bits) {
2508 			found_bits = next_zero - i;
2509 			break;
2510 		}
2511 		i = next_zero;
2512 	}
2513 
2514 	if (!found_bits)
2515 		return -ENOSPC;
2516 
2517 	if (!total_found) {
2518 		start = i;
2519 		cluster->max_size = 0;
2520 	}
2521 
2522 	total_found += found_bits;
2523 
2524 	if (cluster->max_size < found_bits * ctl->unit)
2525 		cluster->max_size = found_bits * ctl->unit;
2526 
2527 	if (total_found < want_bits || cluster->max_size < cont1_bytes) {
2528 		i = next_zero + 1;
2529 		goto again;
2530 	}
2531 
2532 	cluster->window_start = start * ctl->unit + entry->offset;
2533 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
2534 	ret = tree_insert_offset(&cluster->root, entry->offset,
2535 				 &entry->offset_index, 1);
2536 	ASSERT(!ret); /* -EEXIST; Logic error */
2537 
2538 	trace_btrfs_setup_cluster(block_group, cluster,
2539 				  total_found * ctl->unit, 1);
2540 	return 0;
2541 }
2542 
2543 /*
2544  * This searches the block group for just extents to fill the cluster with.
2545  * Try to find a cluster with at least bytes total bytes, at least one
2546  * extent of cont1_bytes, and other clusters of at least min_bytes.
2547  */
2548 static noinline int
2549 setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
2550 			struct btrfs_free_cluster *cluster,
2551 			struct list_head *bitmaps, u64 offset, u64 bytes,
2552 			u64 cont1_bytes, u64 min_bytes)
2553 {
2554 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2555 	struct btrfs_free_space *first = NULL;
2556 	struct btrfs_free_space *entry = NULL;
2557 	struct btrfs_free_space *last;
2558 	struct rb_node *node;
2559 	u64 window_free;
2560 	u64 max_extent;
2561 	u64 total_size = 0;
2562 
2563 	entry = tree_search_offset(ctl, offset, 0, 1);
2564 	if (!entry)
2565 		return -ENOSPC;
2566 
2567 	/*
2568 	 * We don't want bitmaps, so just move along until we find a normal
2569 	 * extent entry.
2570 	 */
2571 	while (entry->bitmap || entry->bytes < min_bytes) {
2572 		if (entry->bitmap && list_empty(&entry->list))
2573 			list_add_tail(&entry->list, bitmaps);
2574 		node = rb_next(&entry->offset_index);
2575 		if (!node)
2576 			return -ENOSPC;
2577 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2578 	}
2579 
2580 	window_free = entry->bytes;
2581 	max_extent = entry->bytes;
2582 	first = entry;
2583 	last = entry;
2584 
2585 	for (node = rb_next(&entry->offset_index); node;
2586 	     node = rb_next(&entry->offset_index)) {
2587 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2588 
2589 		if (entry->bitmap) {
2590 			if (list_empty(&entry->list))
2591 				list_add_tail(&entry->list, bitmaps);
2592 			continue;
2593 		}
2594 
2595 		if (entry->bytes < min_bytes)
2596 			continue;
2597 
2598 		last = entry;
2599 		window_free += entry->bytes;
2600 		if (entry->bytes > max_extent)
2601 			max_extent = entry->bytes;
2602 	}
2603 
2604 	if (window_free < bytes || max_extent < cont1_bytes)
2605 		return -ENOSPC;
2606 
2607 	cluster->window_start = first->offset;
2608 
2609 	node = &first->offset_index;
2610 
2611 	/*
2612 	 * now we've found our entries, pull them out of the free space
2613 	 * cache and put them into the cluster rbtree
2614 	 */
2615 	do {
2616 		int ret;
2617 
2618 		entry = rb_entry(node, struct btrfs_free_space, offset_index);
2619 		node = rb_next(&entry->offset_index);
2620 		if (entry->bitmap || entry->bytes < min_bytes)
2621 			continue;
2622 
2623 		rb_erase(&entry->offset_index, &ctl->free_space_offset);
2624 		ret = tree_insert_offset(&cluster->root, entry->offset,
2625 					 &entry->offset_index, 0);
2626 		total_size += entry->bytes;
2627 		ASSERT(!ret); /* -EEXIST; Logic error */
2628 	} while (node && entry != last);
2629 
2630 	cluster->max_size = max_extent;
2631 	trace_btrfs_setup_cluster(block_group, cluster, total_size, 0);
2632 	return 0;
2633 }
2634 
2635 /*
2636  * This specifically looks for bitmaps that may work in the cluster, we assume
2637  * that we have already failed to find extents that will work.
2638  */
2639 static noinline int
2640 setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2641 		     struct btrfs_free_cluster *cluster,
2642 		     struct list_head *bitmaps, u64 offset, u64 bytes,
2643 		     u64 cont1_bytes, u64 min_bytes)
2644 {
2645 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2646 	struct btrfs_free_space *entry;
2647 	int ret = -ENOSPC;
2648 	u64 bitmap_offset = offset_to_bitmap(ctl, offset);
2649 
2650 	if (ctl->total_bitmaps == 0)
2651 		return -ENOSPC;
2652 
2653 	/*
2654 	 * The bitmap that covers offset won't be in the list unless offset
2655 	 * is just its start offset.
2656 	 */
2657 	entry = list_first_entry(bitmaps, struct btrfs_free_space, list);
2658 	if (entry->offset != bitmap_offset) {
2659 		entry = tree_search_offset(ctl, bitmap_offset, 1, 0);
2660 		if (entry && list_empty(&entry->list))
2661 			list_add(&entry->list, bitmaps);
2662 	}
2663 
2664 	list_for_each_entry(entry, bitmaps, list) {
2665 		if (entry->bytes < bytes)
2666 			continue;
2667 		ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2668 					   bytes, cont1_bytes, min_bytes);
2669 		if (!ret)
2670 			return 0;
2671 	}
2672 
2673 	/*
2674 	 * The bitmaps list has all the bitmaps that record free space
2675 	 * starting after offset, so no more search is required.
2676 	 */
2677 	return -ENOSPC;
2678 }
2679 
2680 /*
2681  * here we try to find a cluster of blocks in a block group.  The goal
2682  * is to find at least bytes+empty_size.
2683  * We might not find them all in one contiguous area.
2684  *
2685  * returns zero and sets up cluster if things worked out, otherwise
2686  * it returns -enospc
2687  */
2688 int btrfs_find_space_cluster(struct btrfs_root *root,
2689 			     struct btrfs_block_group_cache *block_group,
2690 			     struct btrfs_free_cluster *cluster,
2691 			     u64 offset, u64 bytes, u64 empty_size)
2692 {
2693 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2694 	struct btrfs_free_space *entry, *tmp;
2695 	LIST_HEAD(bitmaps);
2696 	u64 min_bytes;
2697 	u64 cont1_bytes;
2698 	int ret;
2699 
2700 	/*
2701 	 * Choose the minimum extent size we'll require for this
2702 	 * cluster.  For SSD_SPREAD, don't allow any fragmentation.
2703 	 * For metadata, allow allocates with smaller extents.  For
2704 	 * data, keep it dense.
2705 	 */
2706 	if (btrfs_test_opt(root, SSD_SPREAD)) {
2707 		cont1_bytes = min_bytes = bytes + empty_size;
2708 	} else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2709 		cont1_bytes = bytes;
2710 		min_bytes = block_group->sectorsize;
2711 	} else {
2712 		cont1_bytes = max(bytes, (bytes + empty_size) >> 2);
2713 		min_bytes = block_group->sectorsize;
2714 	}
2715 
2716 	spin_lock(&ctl->tree_lock);
2717 
2718 	/*
2719 	 * If we know we don't have enough space to make a cluster don't even
2720 	 * bother doing all the work to try and find one.
2721 	 */
2722 	if (ctl->free_space < bytes) {
2723 		spin_unlock(&ctl->tree_lock);
2724 		return -ENOSPC;
2725 	}
2726 
2727 	spin_lock(&cluster->lock);
2728 
2729 	/* someone already found a cluster, hooray */
2730 	if (cluster->block_group) {
2731 		ret = 0;
2732 		goto out;
2733 	}
2734 
2735 	trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2736 				 min_bytes);
2737 
2738 	INIT_LIST_HEAD(&bitmaps);
2739 	ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2740 				      bytes + empty_size,
2741 				      cont1_bytes, min_bytes);
2742 	if (ret)
2743 		ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2744 					   offset, bytes + empty_size,
2745 					   cont1_bytes, min_bytes);
2746 
2747 	/* Clear our temporary list */
2748 	list_for_each_entry_safe(entry, tmp, &bitmaps, list)
2749 		list_del_init(&entry->list);
2750 
2751 	if (!ret) {
2752 		atomic_inc(&block_group->count);
2753 		list_add_tail(&cluster->block_group_list,
2754 			      &block_group->cluster_list);
2755 		cluster->block_group = block_group;
2756 	} else {
2757 		trace_btrfs_failed_cluster_setup(block_group);
2758 	}
2759 out:
2760 	spin_unlock(&cluster->lock);
2761 	spin_unlock(&ctl->tree_lock);
2762 
2763 	return ret;
2764 }
2765 
2766 /*
2767  * simple code to zero out a cluster
2768  */
2769 void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
2770 {
2771 	spin_lock_init(&cluster->lock);
2772 	spin_lock_init(&cluster->refill_lock);
2773 	cluster->root = RB_ROOT;
2774 	cluster->max_size = 0;
2775 	INIT_LIST_HEAD(&cluster->block_group_list);
2776 	cluster->block_group = NULL;
2777 }
2778 
2779 static int do_trimming(struct btrfs_block_group_cache *block_group,
2780 		       u64 *total_trimmed, u64 start, u64 bytes,
2781 		       u64 reserved_start, u64 reserved_bytes)
2782 {
2783 	struct btrfs_space_info *space_info = block_group->space_info;
2784 	struct btrfs_fs_info *fs_info = block_group->fs_info;
2785 	int ret;
2786 	int update = 0;
2787 	u64 trimmed = 0;
2788 
2789 	spin_lock(&space_info->lock);
2790 	spin_lock(&block_group->lock);
2791 	if (!block_group->ro) {
2792 		block_group->reserved += reserved_bytes;
2793 		space_info->bytes_reserved += reserved_bytes;
2794 		update = 1;
2795 	}
2796 	spin_unlock(&block_group->lock);
2797 	spin_unlock(&space_info->lock);
2798 
2799 	ret = btrfs_error_discard_extent(fs_info->extent_root,
2800 					 start, bytes, &trimmed);
2801 	if (!ret)
2802 		*total_trimmed += trimmed;
2803 
2804 	btrfs_add_free_space(block_group, reserved_start, reserved_bytes);
2805 
2806 	if (update) {
2807 		spin_lock(&space_info->lock);
2808 		spin_lock(&block_group->lock);
2809 		if (block_group->ro)
2810 			space_info->bytes_readonly += reserved_bytes;
2811 		block_group->reserved -= reserved_bytes;
2812 		space_info->bytes_reserved -= reserved_bytes;
2813 		spin_unlock(&space_info->lock);
2814 		spin_unlock(&block_group->lock);
2815 	}
2816 
2817 	return ret;
2818 }
2819 
2820 static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
2821 			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2822 {
2823 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2824 	struct btrfs_free_space *entry;
2825 	struct rb_node *node;
2826 	int ret = 0;
2827 	u64 extent_start;
2828 	u64 extent_bytes;
2829 	u64 bytes;
2830 
2831 	while (start < end) {
2832 		spin_lock(&ctl->tree_lock);
2833 
2834 		if (ctl->free_space < minlen) {
2835 			spin_unlock(&ctl->tree_lock);
2836 			break;
2837 		}
2838 
2839 		entry = tree_search_offset(ctl, start, 0, 1);
2840 		if (!entry) {
2841 			spin_unlock(&ctl->tree_lock);
2842 			break;
2843 		}
2844 
2845 		/* skip bitmaps */
2846 		while (entry->bitmap) {
2847 			node = rb_next(&entry->offset_index);
2848 			if (!node) {
2849 				spin_unlock(&ctl->tree_lock);
2850 				goto out;
2851 			}
2852 			entry = rb_entry(node, struct btrfs_free_space,
2853 					 offset_index);
2854 		}
2855 
2856 		if (entry->offset >= end) {
2857 			spin_unlock(&ctl->tree_lock);
2858 			break;
2859 		}
2860 
2861 		extent_start = entry->offset;
2862 		extent_bytes = entry->bytes;
2863 		start = max(start, extent_start);
2864 		bytes = min(extent_start + extent_bytes, end) - start;
2865 		if (bytes < minlen) {
2866 			spin_unlock(&ctl->tree_lock);
2867 			goto next;
2868 		}
2869 
2870 		unlink_free_space(ctl, entry);
2871 		kmem_cache_free(btrfs_free_space_cachep, entry);
2872 
2873 		spin_unlock(&ctl->tree_lock);
2874 
2875 		ret = do_trimming(block_group, total_trimmed, start, bytes,
2876 				  extent_start, extent_bytes);
2877 		if (ret)
2878 			break;
2879 next:
2880 		start += bytes;
2881 
2882 		if (fatal_signal_pending(current)) {
2883 			ret = -ERESTARTSYS;
2884 			break;
2885 		}
2886 
2887 		cond_resched();
2888 	}
2889 out:
2890 	return ret;
2891 }
2892 
2893 static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
2894 			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
2895 {
2896 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2897 	struct btrfs_free_space *entry;
2898 	int ret = 0;
2899 	int ret2;
2900 	u64 bytes;
2901 	u64 offset = offset_to_bitmap(ctl, start);
2902 
2903 	while (offset < end) {
2904 		bool next_bitmap = false;
2905 
2906 		spin_lock(&ctl->tree_lock);
2907 
2908 		if (ctl->free_space < minlen) {
2909 			spin_unlock(&ctl->tree_lock);
2910 			break;
2911 		}
2912 
2913 		entry = tree_search_offset(ctl, offset, 1, 0);
2914 		if (!entry) {
2915 			spin_unlock(&ctl->tree_lock);
2916 			next_bitmap = true;
2917 			goto next;
2918 		}
2919 
2920 		bytes = minlen;
2921 		ret2 = search_bitmap(ctl, entry, &start, &bytes);
2922 		if (ret2 || start >= end) {
2923 			spin_unlock(&ctl->tree_lock);
2924 			next_bitmap = true;
2925 			goto next;
2926 		}
2927 
2928 		bytes = min(bytes, end - start);
2929 		if (bytes < minlen) {
2930 			spin_unlock(&ctl->tree_lock);
2931 			goto next;
2932 		}
2933 
2934 		bitmap_clear_bits(ctl, entry, start, bytes);
2935 		if (entry->bytes == 0)
2936 			free_bitmap(ctl, entry);
2937 
2938 		spin_unlock(&ctl->tree_lock);
2939 
2940 		ret = do_trimming(block_group, total_trimmed, start, bytes,
2941 				  start, bytes);
2942 		if (ret)
2943 			break;
2944 next:
2945 		if (next_bitmap) {
2946 			offset += BITS_PER_BITMAP * ctl->unit;
2947 		} else {
2948 			start += bytes;
2949 			if (start >= offset + BITS_PER_BITMAP * ctl->unit)
2950 				offset += BITS_PER_BITMAP * ctl->unit;
2951 		}
2952 
2953 		if (fatal_signal_pending(current)) {
2954 			ret = -ERESTARTSYS;
2955 			break;
2956 		}
2957 
2958 		cond_resched();
2959 	}
2960 
2961 	return ret;
2962 }
2963 
2964 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2965 			   u64 *trimmed, u64 start, u64 end, u64 minlen)
2966 {
2967 	int ret;
2968 
2969 	*trimmed = 0;
2970 
2971 	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
2972 	if (ret)
2973 		return ret;
2974 
2975 	ret = trim_bitmaps(block_group, trimmed, start, end, minlen);
2976 
2977 	return ret;
2978 }
2979 
2980 /*
2981  * Find the left-most item in the cache tree, and then return the
2982  * smallest inode number in the item.
2983  *
2984  * Note: the returned inode number may not be the smallest one in
2985  * the tree, if the left-most item is a bitmap.
2986  */
2987 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
2988 {
2989 	struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
2990 	struct btrfs_free_space *entry = NULL;
2991 	u64 ino = 0;
2992 
2993 	spin_lock(&ctl->tree_lock);
2994 
2995 	if (RB_EMPTY_ROOT(&ctl->free_space_offset))
2996 		goto out;
2997 
2998 	entry = rb_entry(rb_first(&ctl->free_space_offset),
2999 			 struct btrfs_free_space, offset_index);
3000 
3001 	if (!entry->bitmap) {
3002 		ino = entry->offset;
3003 
3004 		unlink_free_space(ctl, entry);
3005 		entry->offset++;
3006 		entry->bytes--;
3007 		if (!entry->bytes)
3008 			kmem_cache_free(btrfs_free_space_cachep, entry);
3009 		else
3010 			link_free_space(ctl, entry);
3011 	} else {
3012 		u64 offset = 0;
3013 		u64 count = 1;
3014 		int ret;
3015 
3016 		ret = search_bitmap(ctl, entry, &offset, &count);
3017 		/* Logic error; Should be empty if it can't find anything */
3018 		ASSERT(!ret);
3019 
3020 		ino = offset;
3021 		bitmap_clear_bits(ctl, entry, offset, 1);
3022 		if (entry->bytes == 0)
3023 			free_bitmap(ctl, entry);
3024 	}
3025 out:
3026 	spin_unlock(&ctl->tree_lock);
3027 
3028 	return ino;
3029 }
3030 
3031 struct inode *lookup_free_ino_inode(struct btrfs_root *root,
3032 				    struct btrfs_path *path)
3033 {
3034 	struct inode *inode = NULL;
3035 
3036 	spin_lock(&root->cache_lock);
3037 	if (root->cache_inode)
3038 		inode = igrab(root->cache_inode);
3039 	spin_unlock(&root->cache_lock);
3040 	if (inode)
3041 		return inode;
3042 
3043 	inode = __lookup_free_space_inode(root, path, 0);
3044 	if (IS_ERR(inode))
3045 		return inode;
3046 
3047 	spin_lock(&root->cache_lock);
3048 	if (!btrfs_fs_closing(root->fs_info))
3049 		root->cache_inode = igrab(inode);
3050 	spin_unlock(&root->cache_lock);
3051 
3052 	return inode;
3053 }
3054 
3055 int create_free_ino_inode(struct btrfs_root *root,
3056 			  struct btrfs_trans_handle *trans,
3057 			  struct btrfs_path *path)
3058 {
3059 	return __create_free_space_inode(root, trans, path,
3060 					 BTRFS_FREE_INO_OBJECTID, 0);
3061 }
3062 
3063 int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
3064 {
3065 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3066 	struct btrfs_path *path;
3067 	struct inode *inode;
3068 	int ret = 0;
3069 	u64 root_gen = btrfs_root_generation(&root->root_item);
3070 
3071 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3072 		return 0;
3073 
3074 	/*
3075 	 * If we're unmounting then just return, since this does a search on the
3076 	 * normal root and not the commit root and we could deadlock.
3077 	 */
3078 	if (btrfs_fs_closing(fs_info))
3079 		return 0;
3080 
3081 	path = btrfs_alloc_path();
3082 	if (!path)
3083 		return 0;
3084 
3085 	inode = lookup_free_ino_inode(root, path);
3086 	if (IS_ERR(inode))
3087 		goto out;
3088 
3089 	if (root_gen != BTRFS_I(inode)->generation)
3090 		goto out_put;
3091 
3092 	ret = __load_free_space_cache(root, inode, ctl, path, 0);
3093 
3094 	if (ret < 0)
3095 		btrfs_err(fs_info,
3096 			"failed to load free ino cache for root %llu",
3097 			root->root_key.objectid);
3098 out_put:
3099 	iput(inode);
3100 out:
3101 	btrfs_free_path(path);
3102 	return ret;
3103 }
3104 
3105 int btrfs_write_out_ino_cache(struct btrfs_root *root,
3106 			      struct btrfs_trans_handle *trans,
3107 			      struct btrfs_path *path,
3108 			      struct inode *inode)
3109 {
3110 	struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
3111 	int ret;
3112 
3113 	if (!btrfs_test_opt(root, INODE_MAP_CACHE))
3114 		return 0;
3115 
3116 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
3117 	if (ret) {
3118 		btrfs_delalloc_release_metadata(inode, inode->i_size);
3119 #ifdef DEBUG
3120 		btrfs_err(root->fs_info,
3121 			"failed to write free ino cache for root %llu",
3122 			root->root_key.objectid);
3123 #endif
3124 	}
3125 
3126 	return ret;
3127 }
3128 
3129 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
3130 /*
3131  * Use this if you need to make a bitmap or extent entry specifically, it
3132  * doesn't do any of the merging that add_free_space does, this acts a lot like
3133  * how the free space cache loading stuff works, so you can get really weird
3134  * configurations.
3135  */
3136 int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
3137 			      u64 offset, u64 bytes, bool bitmap)
3138 {
3139 	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3140 	struct btrfs_free_space *info = NULL, *bitmap_info;
3141 	void *map = NULL;
3142 	u64 bytes_added;
3143 	int ret;
3144 
3145 again:
3146 	if (!info) {
3147 		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
3148 		if (!info)
3149 			return -ENOMEM;
3150 	}
3151 
3152 	if (!bitmap) {
3153 		spin_lock(&ctl->tree_lock);
3154 		info->offset = offset;
3155 		info->bytes = bytes;
3156 		ret = link_free_space(ctl, info);
3157 		spin_unlock(&ctl->tree_lock);
3158 		if (ret)
3159 			kmem_cache_free(btrfs_free_space_cachep, info);
3160 		return ret;
3161 	}
3162 
3163 	if (!map) {
3164 		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
3165 		if (!map) {
3166 			kmem_cache_free(btrfs_free_space_cachep, info);
3167 			return -ENOMEM;
3168 		}
3169 	}
3170 
3171 	spin_lock(&ctl->tree_lock);
3172 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3173 					 1, 0);
3174 	if (!bitmap_info) {
3175 		info->bitmap = map;
3176 		map = NULL;
3177 		add_new_bitmap(ctl, info, offset);
3178 		bitmap_info = info;
3179 	}
3180 
3181 	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
3182 	bytes -= bytes_added;
3183 	offset += bytes_added;
3184 	spin_unlock(&ctl->tree_lock);
3185 
3186 	if (bytes)
3187 		goto again;
3188 
3189 	if (map)
3190 		kfree(map);
3191 	return 0;
3192 }
3193 
3194 /*
3195  * Checks to see if the given range is in the free space cache.  This is really
3196  * just used to check the absence of space, so if there is free space in the
3197  * range at all we will return 1.
3198  */
3199 int test_check_exists(struct btrfs_block_group_cache *cache,
3200 		      u64 offset, u64 bytes)
3201 {
3202 	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
3203 	struct btrfs_free_space *info;
3204 	int ret = 0;
3205 
3206 	spin_lock(&ctl->tree_lock);
3207 	info = tree_search_offset(ctl, offset, 0, 0);
3208 	if (!info) {
3209 		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
3210 					  1, 0);
3211 		if (!info)
3212 			goto out;
3213 	}
3214 
3215 have_info:
3216 	if (info->bitmap) {
3217 		u64 bit_off, bit_bytes;
3218 		struct rb_node *n;
3219 		struct btrfs_free_space *tmp;
3220 
3221 		bit_off = offset;
3222 		bit_bytes = ctl->unit;
3223 		ret = search_bitmap(ctl, info, &bit_off, &bit_bytes);
3224 		if (!ret) {
3225 			if (bit_off == offset) {
3226 				ret = 1;
3227 				goto out;
3228 			} else if (bit_off > offset &&
3229 				   offset + bytes > bit_off) {
3230 				ret = 1;
3231 				goto out;
3232 			}
3233 		}
3234 
3235 		n = rb_prev(&info->offset_index);
3236 		while (n) {
3237 			tmp = rb_entry(n, struct btrfs_free_space,
3238 				       offset_index);
3239 			if (tmp->offset + tmp->bytes < offset)
3240 				break;
3241 			if (offset + bytes < tmp->offset) {
3242 				n = rb_prev(&info->offset_index);
3243 				continue;
3244 			}
3245 			info = tmp;
3246 			goto have_info;
3247 		}
3248 
3249 		n = rb_next(&info->offset_index);
3250 		while (n) {
3251 			tmp = rb_entry(n, struct btrfs_free_space,
3252 				       offset_index);
3253 			if (offset + bytes < tmp->offset)
3254 				break;
3255 			if (tmp->offset + tmp->bytes < offset) {
3256 				n = rb_next(&info->offset_index);
3257 				continue;
3258 			}
3259 			info = tmp;
3260 			goto have_info;
3261 		}
3262 
3263 		goto out;
3264 	}
3265 
3266 	if (info->offset == offset) {
3267 		ret = 1;
3268 		goto out;
3269 	}
3270 
3271 	if (offset > info->offset && offset < info->offset + info->bytes)
3272 		ret = 1;
3273 out:
3274 	spin_unlock(&ctl->tree_lock);
3275 	return ret;
3276 }
3277 #endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
3278