xref: /openbmc/linux/fs/btrfs/extent-tree.c (revision df388556)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include "compat.h"
25 #include "hash.h"
26 #include "crc32c.h"
27 #include "ctree.h"
28 #include "disk-io.h"
29 #include "print-tree.h"
30 #include "transaction.h"
31 #include "volumes.h"
32 #include "locking.h"
33 #include "ref-cache.h"
34 
35 #define PENDING_EXTENT_INSERT 0
36 #define PENDING_EXTENT_DELETE 1
37 #define PENDING_BACKREF_UPDATE 2
38 
39 struct pending_extent_op {
40 	int type;
41 	u64 bytenr;
42 	u64 num_bytes;
43 	u64 parent;
44 	u64 orig_parent;
45 	u64 generation;
46 	u64 orig_generation;
47 	int level;
48 	struct list_head list;
49 	int del;
50 };
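
/*
 * Illustrative sketch (not used by the code in this file): this is
 * roughly how a PENDING_BACKREF_UPDATE op is filled in before being
 * queued, mirroring __btrfs_update_extent_ref() below.  The bytenr/
 * num_bytes pair describes the extent, parent/orig_parent and
 * generation/orig_generation record the block's position before and
 * after the cow, and level is the tree level of the block:
 *
 *	struct pending_extent_op *op = kmalloc(sizeof(*op), GFP_NOFS);
 *
 *	op->type = PENDING_BACKREF_UPDATE;
 *	op->bytenr = bytenr;
 *	op->num_bytes = num_bytes;
 *	op->parent = parent;
 *	op->orig_parent = orig_parent;
 *	op->generation = ref_generation;
 *	op->orig_generation = orig_generation;
 *	op->level = (int)owner_objectid;
 *	INIT_LIST_HEAD(&op->list);
 *	op->del = 0;
 */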
51 
52 static int finish_current_insert(struct btrfs_trans_handle *trans,
53 				 struct btrfs_root *extent_root, int all);
54 static int del_pending_extents(struct btrfs_trans_handle *trans,
55 			       struct btrfs_root *extent_root, int all);
56 static int pin_down_bytes(struct btrfs_trans_handle *trans,
57 			  struct btrfs_root *root,
58 			  u64 bytenr, u64 num_bytes, int is_data);
59 static int update_block_group(struct btrfs_trans_handle *trans,
60 			      struct btrfs_root *root,
61 			      u64 bytenr, u64 num_bytes, int alloc,
62 			      int mark_free);
63 
64 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
65 			  struct btrfs_root *extent_root, u64 alloc_bytes,
66 			  u64 flags, int force);
67 
68 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
69 {
70 	return (cache->flags & bits) == bits;
71 }
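
/*
 * Example: block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA) is non-zero
 * only when every bit of the mask is set in cache->flags, so a combined
 * mask such as (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_RAID1)
 * matches metadata groups that are also RAID1, not either one alone.
 */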
72 
73 /*
74  * this adds the block group to the fs_info rb tree for the block group
75  * cache
76  */
77 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
78 				struct btrfs_block_group_cache *block_group)
79 {
80 	struct rb_node **p;
81 	struct rb_node *parent = NULL;
82 	struct btrfs_block_group_cache *cache;
83 
84 	spin_lock(&info->block_group_cache_lock);
85 	p = &info->block_group_cache_tree.rb_node;
86 
87 	while (*p) {
88 		parent = *p;
89 		cache = rb_entry(parent, struct btrfs_block_group_cache,
90 				 cache_node);
91 		if (block_group->key.objectid < cache->key.objectid) {
92 			p = &(*p)->rb_left;
93 		} else if (block_group->key.objectid > cache->key.objectid) {
94 			p = &(*p)->rb_right;
95 		} else {
96 			spin_unlock(&info->block_group_cache_lock);
97 			return -EEXIST;
98 		}
99 	}
100 
101 	rb_link_node(&block_group->cache_node, parent, p);
102 	rb_insert_color(&block_group->cache_node,
103 			&info->block_group_cache_tree);
104 	spin_unlock(&info->block_group_cache_lock);
105 
106 	return 0;
107 }
108 
109 /*
110  * This will return the block group at or after bytenr if contains is 0, else
111  * it will return the block group that contains the bytenr
112  */
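/*
 * Worked example: with two 1GB block groups whose key.objectids are 0 and
 * 1GB, searching for bytenr 512MB returns the group at 1GB when contains
 * is 0 (the first group starting at or after the bytenr) and the group at
 * 0 when contains is 1 (the group whose range covers the bytenr).  Either
 * way the caller takes a reference and must drop it via put_block_group().
 */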
113 static struct btrfs_block_group_cache *
114 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
115 			      int contains)
116 {
117 	struct btrfs_block_group_cache *cache, *ret = NULL;
118 	struct rb_node *n;
119 	u64 end, start;
120 
121 	spin_lock(&info->block_group_cache_lock);
122 	n = info->block_group_cache_tree.rb_node;
123 
124 	while (n) {
125 		cache = rb_entry(n, struct btrfs_block_group_cache,
126 				 cache_node);
127 		end = cache->key.objectid + cache->key.offset - 1;
128 		start = cache->key.objectid;
129 
130 		if (bytenr < start) {
131 			if (!contains && (!ret || start < ret->key.objectid))
132 				ret = cache;
133 			n = n->rb_left;
134 		} else if (bytenr > start) {
135 			if (contains && bytenr <= end) {
136 				ret = cache;
137 				break;
138 			}
139 			n = n->rb_right;
140 		} else {
141 			ret = cache;
142 			break;
143 		}
144 	}
145 	if (ret)
146 		atomic_inc(&ret->count);
147 	spin_unlock(&info->block_group_cache_lock);
148 
149 	return ret;
150 }
151 
152 /*
153  * this is only called by cache_block_group; since we could have freed
154  * extents, we need to check the pinned_extents for any extents that can't be
155  * used yet, as their free space is released only when the transaction commits.
156  */
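/*
 * Worked example: for a block group spanning [0, 100) with a pinned
 * (EXTENT_DIRTY) range at [30, 40], the loop below adds [0, 30) as free
 * space, skips past the pinned range, and the tail check then adds
 * [41, 100); the pinned bytes stay unusable until the commit.
 */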
157 static int add_new_free_space(struct btrfs_block_group_cache *block_group,
158 			      struct btrfs_fs_info *info, u64 start, u64 end)
159 {
160 	u64 extent_start, extent_end, size;
161 	int ret;
162 
163 	mutex_lock(&info->pinned_mutex);
164 	while (start < end) {
165 		ret = find_first_extent_bit(&info->pinned_extents, start,
166 					    &extent_start, &extent_end,
167 					    EXTENT_DIRTY);
168 		if (ret)
169 			break;
170 
171 		if (extent_start == start) {
172 			start = extent_end + 1;
173 		} else if (extent_start > start && extent_start < end) {
174 			size = extent_start - start;
175 			ret = btrfs_add_free_space(block_group, start,
176 						   size);
177 			BUG_ON(ret);
178 			start = extent_end + 1;
179 		} else {
180 			break;
181 		}
182 	}
183 
184 	if (start < end) {
185 		size = end - start;
186 		ret = btrfs_add_free_space(block_group, start, size);
187 		BUG_ON(ret);
188 	}
189 	mutex_unlock(&info->pinned_mutex);
190 
191 	return 0;
192 }
193 
194 static int remove_sb_from_cache(struct btrfs_root *root,
195 				struct btrfs_block_group_cache *cache)
196 {
197 	u64 bytenr;
198 	u64 *logical;
199 	int stripe_len;
200 	int i, nr, ret;
201 
202 	for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
203 		bytenr = btrfs_sb_offset(i);
204 		ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
205 				       cache->key.objectid, bytenr, 0,
206 				       &logical, &nr, &stripe_len);
207 		BUG_ON(ret);
208 		while (nr--) {
209 			btrfs_remove_free_space(cache, logical[nr],
210 						stripe_len);
211 		}
212 		kfree(logical);
213 	}
214 	return 0;
215 }
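
/*
 * For reference, assuming the usual superblock mirror layout:
 * btrfs_sb_offset(i) for i = 0, 1, 2 is 64KiB, 64MiB and 256GiB, so the
 * loop above removes every superblock stripe that happens to fall inside
 * this block group from its free space cache.
 */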
216 
217 static int cache_block_group(struct btrfs_root *root,
218 			     struct btrfs_block_group_cache *block_group)
219 {
220 	struct btrfs_path *path;
221 	int ret = 0;
222 	struct btrfs_key key;
223 	struct extent_buffer *leaf;
224 	int slot;
225 	u64 last;
226 
227 	if (!block_group)
228 		return 0;
229 
230 	root = root->fs_info->extent_root;
231 
232 	if (block_group->cached)
233 		return 0;
234 
235 	path = btrfs_alloc_path();
236 	if (!path)
237 		return -ENOMEM;
238 
239 	path->reada = 2;
240 	/*
241 	 * we get into deadlocks with paths held by callers of this function.
242 	 * since the alloc_mutex is protecting things right now, just
243 	 * skip the locking here
244 	 */
245 	path->skip_locking = 1;
246 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
247 	key.objectid = last;
248 	key.offset = 0;
249 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
250 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
251 	if (ret < 0)
252 		goto err;
253 
254 	while (1) {
255 		leaf = path->nodes[0];
256 		slot = path->slots[0];
257 		if (slot >= btrfs_header_nritems(leaf)) {
258 			ret = btrfs_next_leaf(root, path);
259 			if (ret < 0)
260 				goto err;
261 			if (ret == 0)
262 				continue;
263 			else
264 				break;
265 		}
266 		btrfs_item_key_to_cpu(leaf, &key, slot);
267 		if (key.objectid < block_group->key.objectid)
268 			goto next;
269 
270 		if (key.objectid >= block_group->key.objectid +
271 		    block_group->key.offset)
272 			break;
273 
274 		if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) {
275 			add_new_free_space(block_group, root->fs_info, last,
276 					   key.objectid);
277 
278 			last = key.objectid + key.offset;
279 		}
280 next:
281 		path->slots[0]++;
282 	}
283 
284 	add_new_free_space(block_group, root->fs_info, last,
285 			   block_group->key.objectid +
286 			   block_group->key.offset);
287 
288 	remove_sb_from_cache(root, block_group);
289 	block_group->cached = 1;
290 	ret = 0;
291 err:
292 	btrfs_free_path(path);
293 	return ret;
294 }
295 
296 /*
297  * return the block group that starts at or after bytenr
298  */
299 static struct btrfs_block_group_cache *
300 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
301 {
302 	struct btrfs_block_group_cache *cache;
303 
304 	cache = block_group_cache_tree_search(info, bytenr, 0);
305 
306 	return cache;
307 }
308 
309 /*
310  * return the block group that contains the given bytenr
311  */
312 struct btrfs_block_group_cache *btrfs_lookup_block_group(
313 						 struct btrfs_fs_info *info,
314 						 u64 bytenr)
315 {
316 	struct btrfs_block_group_cache *cache;
317 
318 	cache = block_group_cache_tree_search(info, bytenr, 1);
319 
320 	return cache;
321 }
322 
323 static inline void put_block_group(struct btrfs_block_group_cache *cache)
324 {
325 	if (atomic_dec_and_test(&cache->count))
326 		kfree(cache);
327 }
328 
329 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
330 						  u64 flags)
331 {
332 	struct list_head *head = &info->space_info;
333 	struct btrfs_space_info *found;
334 
335 	rcu_read_lock();
336 	list_for_each_entry_rcu(found, head, list) {
337 		if (found->flags == flags) {
338 			rcu_read_unlock();
339 			return found;
340 		}
341 	}
342 	rcu_read_unlock();
343 	return NULL;
344 }
345 
346 /*
347  * after adding space to the filesystem, we need to clear the full flags
348  * on all the space infos.
349  */
350 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
351 {
352 	struct list_head *head = &info->space_info;
353 	struct btrfs_space_info *found;
354 
355 	rcu_read_lock();
356 	list_for_each_entry_rcu(found, head, list)
357 		found->full = 0;
358 	rcu_read_unlock();
359 }
360 
361 static u64 div_factor(u64 num, int factor)
362 {
363 	if (factor == 10)
364 		return num;
365 	num *= factor;
366 	do_div(num, 10);
367 	return num;
368 }
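
/*
 * div_factor() computes a percentage in tenths: div_factor(1000, 9) == 900,
 * so the comparison against div_factor(cache->key.offset, 9) in
 * btrfs_find_block_group() below asks "is this group less than 90% full?".
 */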
369 
370 u64 btrfs_find_block_group(struct btrfs_root *root,
371 			   u64 search_start, u64 search_hint, int owner)
372 {
373 	struct btrfs_block_group_cache *cache;
374 	u64 used;
375 	u64 last = max(search_hint, search_start);
376 	u64 group_start = 0;
377 	int full_search = 0;
378 	int factor = 9;
379 	int wrapped = 0;
380 again:
381 	while (1) {
382 		cache = btrfs_lookup_first_block_group(root->fs_info, last);
383 		if (!cache)
384 			break;
385 
386 		spin_lock(&cache->lock);
387 		last = cache->key.objectid + cache->key.offset;
388 		used = btrfs_block_group_used(&cache->item);
389 
390 		if ((full_search || !cache->ro) &&
391 		    block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
392 			if (used + cache->pinned + cache->reserved <
393 			    div_factor(cache->key.offset, factor)) {
394 				group_start = cache->key.objectid;
395 				spin_unlock(&cache->lock);
396 				put_block_group(cache);
397 				goto found;
398 			}
399 		}
400 		spin_unlock(&cache->lock);
401 		put_block_group(cache);
402 		cond_resched();
403 	}
404 	if (!wrapped) {
405 		last = search_start;
406 		wrapped = 1;
407 		goto again;
408 	}
409 	if (!full_search && factor < 10) {
410 		last = search_start;
411 		full_search = 1;
412 		factor = 10;
413 		goto again;
414 	}
415 found:
416 	return group_start;
417 }
418 
419 /* simple helper to search for an existing extent at a given offset */
420 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
421 {
422 	int ret;
423 	struct btrfs_key key;
424 	struct btrfs_path *path;
425 
426 	path = btrfs_alloc_path();
427 	BUG_ON(!path);
428 	key.objectid = start;
429 	key.offset = len;
430 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
431 	ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
432 				0, 0);
433 	btrfs_free_path(path);
434 	return ret;
435 }
436 
437 /*
438  * Back reference rules.  Back refs have three main goals:
439  *
440  * 1) differentiate between all holders of references to an extent so that
441  *    when a reference is dropped we can make sure it was a valid reference
442  *    before freeing the extent.
443  *
444  * 2) Provide enough information to quickly find the holders of an extent
445  *    if we notice a given block is corrupted or bad.
446  *
447  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
448  *    maintenance.  This is actually the same as #2, but with a slightly
449  *    different use case.
450  *
451  * File extents can be referenced by:
452  *
453  * - multiple snapshots, subvolumes, or different generations in one subvol
454  * - different files inside a single subvolume
455  * - different offsets inside a file (bookend extents in file.c)
456  *
457  * The extent ref structure has fields for:
458  *
459  * - objectid of the subvolume root
460  * - generation number of the tree holding the reference
461  * - objectid of the file holding the reference
462  * - number of references held by the parent node (always 1 for tree blocks)
463  *
464  * A btree leaf may hold multiple references to a file extent. In most cases,
465  * these references are from the same file and the corresponding offsets inside
466  * the file are close together.
467  *
468  * When a file extent is allocated the fields are filled in:
469  *     (root_key.objectid, trans->transid, inode objectid, 1)
470  *
471  * When a leaf is cow'd, new references are added for every file extent found
472  * in the leaf.  It looks similar to the create case, but trans->transid will
473  * be different when the block is cow'd.
474  *
475  *     (root_key.objectid, trans->transid, inode objectid,
476  *      number of references in the leaf)
477  *
478  * When a file extent is removed either during snapshot deletion or
479  * file truncation, we find the corresponding back reference and check
480  * the following fields:
481  *
482  *     (btrfs_header_owner(leaf), btrfs_header_generation(leaf),
483  *      inode objectid)
484  *
485  * Btree extents can be referenced by:
486  *
487  * - Different subvolumes
488  * - Different generations of the same subvolume
489  *
490  * When a tree block is created, back references are inserted:
491  *
492  * (root->root_key.objectid, trans->transid, level, 1)
493  *
494  * When a tree block is cow'd, new back references are added for all the
495  * blocks it points to. If the tree block isn't in a reference counted root,
496  * the old back references are removed. These new back references are of
497  * the form (trans->transid will have increased since creation):
498  *
499  * (root->root_key.objectid, trans->transid, level, 1)
500  *
501  * When a backref is being deleted, the following fields are checked:
502  *
503  * if backref was for a tree root:
504  *     (btrfs_header_owner(itself), btrfs_header_generation(itself), level)
505  * else
506  *     (btrfs_header_owner(parent), btrfs_header_generation(parent), level)
507  *
508  * Back reference key composition:
509  *
510  * The key objectid corresponds to the first byte in the extent, the key
511  * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first
512  * byte of the parent extent. If an extent is a tree root, the key offset
513  * is set to the key objectid.
514  */
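
/*
 * A concrete (hypothetical) example of the composition above: a file
 * extent starting at byte 8192, referenced by a leaf at byte 4096 that
 * belongs to subvolume 5, was cow'd in generation 10 and holds one
 * reference from inode 257, gets a backref keyed
 * (8192, BTRFS_EXTENT_REF_KEY, 4096) whose body would be written as:
 *
 *	btrfs_set_ref_root(leaf, ref, 5);
 *	btrfs_set_ref_generation(leaf, ref, 10);
 *	btrfs_set_ref_objectid(leaf, ref, 257);
 *	btrfs_set_ref_num_refs(leaf, ref, 1);
 *
 * which is exactly what insert_extent_backref() below does.
 */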
515 
516 static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans,
517 					  struct btrfs_root *root,
518 					  struct btrfs_path *path,
519 					  u64 bytenr, u64 parent,
520 					  u64 ref_root, u64 ref_generation,
521 					  u64 owner_objectid, int del)
522 {
523 	struct btrfs_key key;
524 	struct btrfs_extent_ref *ref;
525 	struct extent_buffer *leaf;
526 	u64 ref_objectid;
527 	int ret;
528 
529 	key.objectid = bytenr;
530 	key.type = BTRFS_EXTENT_REF_KEY;
531 	key.offset = parent;
532 
533 	ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1);
534 	if (ret < 0)
535 		goto out;
536 	if (ret > 0) {
537 		ret = -ENOENT;
538 		goto out;
539 	}
540 
541 	leaf = path->nodes[0];
542 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
543 	ref_objectid = btrfs_ref_objectid(leaf, ref);
544 	if (btrfs_ref_root(leaf, ref) != ref_root ||
545 	    btrfs_ref_generation(leaf, ref) != ref_generation ||
546 	    (ref_objectid != owner_objectid &&
547 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
548 		ret = -EIO;
549 		WARN_ON(1);
550 		goto out;
551 	}
552 	ret = 0;
553 out:
554 	return ret;
555 }
556 
557 /*
558  * updates all the backrefs that are pending on update_list for the
559  * extent_root
560  */
561 static noinline int update_backrefs(struct btrfs_trans_handle *trans,
562 				    struct btrfs_root *extent_root,
563 				    struct btrfs_path *path,
564 				    struct list_head *update_list)
565 {
566 	struct btrfs_key key;
567 	struct btrfs_extent_ref *ref;
568 	struct btrfs_fs_info *info = extent_root->fs_info;
569 	struct pending_extent_op *op;
570 	struct extent_buffer *leaf;
571 	int ret = 0;
572 	struct list_head *cur = update_list->next;
573 	u64 ref_objectid;
574 	u64 ref_root = extent_root->root_key.objectid;
575 
576 	op = list_entry(cur, struct pending_extent_op, list);
577 
578 search:
579 	key.objectid = op->bytenr;
580 	key.type = BTRFS_EXTENT_REF_KEY;
581 	key.offset = op->orig_parent;
582 
583 	ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
584 	BUG_ON(ret);
585 
586 	leaf = path->nodes[0];
587 
588 loop:
589 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
590 
591 	ref_objectid = btrfs_ref_objectid(leaf, ref);
592 
593 	if (btrfs_ref_root(leaf, ref) != ref_root ||
594 	    btrfs_ref_generation(leaf, ref) != op->orig_generation ||
595 	    (ref_objectid != op->level &&
596 	     ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
597 		printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
598 		       "root %llu, owner %u\n",
599 		       (unsigned long long)op->bytenr,
600 		       (unsigned long long)op->orig_parent,
601 		       (unsigned long long)ref_root, op->level);
602 		btrfs_print_leaf(extent_root, leaf);
603 		BUG();
604 	}
605 
606 	key.objectid = op->bytenr;
607 	key.offset = op->parent;
608 	key.type = BTRFS_EXTENT_REF_KEY;
609 	ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
610 	BUG_ON(ret);
611 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
612 	btrfs_set_ref_generation(leaf, ref, op->generation);
613 
614 	cur = cur->next;
615 
616 	list_del_init(&op->list);
617 	unlock_extent(&info->extent_ins, op->bytenr,
618 		      op->bytenr + op->num_bytes - 1, GFP_NOFS);
619 	kfree(op);
620 
621 	if (cur == update_list) {
622 		btrfs_mark_buffer_dirty(path->nodes[0]);
623 		btrfs_release_path(extent_root, path);
624 		goto out;
625 	}
626 
627 	op = list_entry(cur, struct pending_extent_op, list);
628 
629 	path->slots[0]++;
630 	while (path->slots[0] < btrfs_header_nritems(leaf)) {
631 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
632 		if (key.objectid == op->bytenr &&
633 		    key.type == BTRFS_EXTENT_REF_KEY)
634 			goto loop;
635 		path->slots[0]++;
636 	}
637 
638 	btrfs_mark_buffer_dirty(path->nodes[0]);
639 	btrfs_release_path(extent_root, path);
640 	goto search;
641 
642 out:
643 	return 0;
644 }
645 
646 static noinline int insert_extents(struct btrfs_trans_handle *trans,
647 				   struct btrfs_root *extent_root,
648 				   struct btrfs_path *path,
649 				   struct list_head *insert_list, int nr)
650 {
651 	struct btrfs_key *keys;
652 	u32 *data_size;
653 	struct pending_extent_op *op;
654 	struct extent_buffer *leaf;
655 	struct list_head *cur = insert_list->next;
656 	struct btrfs_fs_info *info = extent_root->fs_info;
657 	u64 ref_root = extent_root->root_key.objectid;
658 	int i = 0, last = 0, ret;
659 	int total = nr * 2;
660 
661 	if (!nr)
662 		return 0;
663 
664 	keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
665 	if (!keys)
666 		return -ENOMEM;
667 
668 	data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
669 	if (!data_size) {
670 		kfree(keys);
671 		return -ENOMEM;
672 	}
673 
674 	list_for_each_entry(op, insert_list, list) {
675 		keys[i].objectid = op->bytenr;
676 		keys[i].offset = op->num_bytes;
677 		keys[i].type = BTRFS_EXTENT_ITEM_KEY;
678 		data_size[i] = sizeof(struct btrfs_extent_item);
679 		i++;
680 
681 		keys[i].objectid = op->bytenr;
682 		keys[i].offset = op->parent;
683 		keys[i].type = BTRFS_EXTENT_REF_KEY;
684 		data_size[i] = sizeof(struct btrfs_extent_ref);
685 		i++;
686 	}
687 
688 	op = list_entry(cur, struct pending_extent_op, list);
689 	i = 0;
690 	while (i < total) {
691 		int c;
692 		ret = btrfs_insert_some_items(trans, extent_root, path,
693 					      keys+i, data_size+i, total-i);
694 		BUG_ON(ret < 0);
695 
696 		if (last && ret > 1)
697 			BUG();
698 
699 		leaf = path->nodes[0];
700 		for (c = 0; c < ret; c++) {
701 			int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
702 
703 			/*
704 			 * if the first item we inserted was a backref, then
705 			 * the EXTENT_ITEM will be the odd c's, else it will
706 			 * be the even c's
707 			 */
708 			if ((ref_first && (c % 2)) ||
709 			    (!ref_first && !(c % 2))) {
710 				struct btrfs_extent_item *itm;
711 
712 				itm = btrfs_item_ptr(leaf, path->slots[0] + c,
713 						     struct btrfs_extent_item);
714 				btrfs_set_extent_refs(path->nodes[0], itm, 1);
715 				op->del++;
716 			} else {
717 				struct btrfs_extent_ref *ref;
718 
719 				ref = btrfs_item_ptr(leaf, path->slots[0] + c,
720 						     struct btrfs_extent_ref);
721 				btrfs_set_ref_root(leaf, ref, ref_root);
722 				btrfs_set_ref_generation(leaf, ref,
723 							 op->generation);
724 				btrfs_set_ref_objectid(leaf, ref, op->level);
725 				btrfs_set_ref_num_refs(leaf, ref, 1);
726 				op->del++;
727 			}
728 
729 			/*
730 			 * use del to track when it's OK to free up the
731 			 * pending_extent_op.  In the case where we insert the
732 			 * last item on the list out of order to help batching,
733 			 * we must not free the extent op until we actually
734 			 * insert the extent_item
735 			 */
736 			if (op->del == 2) {
737 				unlock_extent(&info->extent_ins, op->bytenr,
738 					      op->bytenr + op->num_bytes - 1,
739 					      GFP_NOFS);
740 				cur = cur->next;
741 				list_del_init(&op->list);
742 				kfree(op);
743 				if (cur != insert_list)
744 					op = list_entry(cur,
745 						struct pending_extent_op,
746 						list);
747 			}
748 		}
749 		btrfs_mark_buffer_dirty(leaf);
750 		btrfs_release_path(extent_root, path);
751 
752 		/*
753 		 * OK, backrefs and extent items usually go right next to each
754 		 * other, but if we could only insert 1 item, that means we
755 		 * inserted at the end of a leaf and we have no idea what may
756 		 * be on the next leaf, so we just play it safe.  In order to
757 		 * try and help this case we insert the last thing on our
758 		 * insert list, so hopefully it will end up being the last
759 		 * thing on the leaf and everything else will be before it,
760 		 * which will let us insert a whole bunch of items at the
761 		 * same time.
762 		 */
763 		if (ret == 1 && !last && (i + ret < total)) {
764 			/*
765 			 * last: where we will pick up the next time around
766 			 * i: our current key to insert, will be total - 1
767 			 * cur: the current op we are adjusting
768 			 * op: the pending_extent_op for cur
769 			 */
770 			last = i + ret;
771 			i = total - 1;
772 			cur = insert_list->prev;
773 			op = list_entry(cur, struct pending_extent_op, list);
774 		} else if (last) {
775 			/*
776 			 * ok, we successfully inserted the last item on the
777 			 * list, so let's reset everything
778 			 *
779 			 * i: our current key to insert, so where we left off
780 			 *    last time
781 			 * last: done with this
782 			 * cur: the op we are working on
783 			 * op: the pending_extent_op for cur
784 			 * total: since we inserted the last key, we need to
785 			 *        decrement total so we don't overflow
786 			 */
787 			i = last;
788 			last = 0;
789 			total--;
790 			if (i < total) {
791 				cur = insert_list->next;
792 				op = list_entry(cur, struct pending_extent_op,
793 						list);
794 			}
795 		} else {
796 			i += ret;
797 		}
798 
799 		cond_resched();
800 	}
801 	ret = 0;
802 	kfree(keys);
803 	kfree(data_size);
804 	return ret;
805 }
806 
807 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
808 					  struct btrfs_root *root,
809 					  struct btrfs_path *path,
810 					  u64 bytenr, u64 parent,
811 					  u64 ref_root, u64 ref_generation,
812 					  u64 owner_objectid)
813 {
814 	struct btrfs_key key;
815 	struct extent_buffer *leaf;
816 	struct btrfs_extent_ref *ref;
817 	u32 num_refs;
818 	int ret;
819 
820 	key.objectid = bytenr;
821 	key.type = BTRFS_EXTENT_REF_KEY;
822 	key.offset = parent;
823 
824 	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref));
825 	if (ret == 0) {
826 		leaf = path->nodes[0];
827 		ref = btrfs_item_ptr(leaf, path->slots[0],
828 				     struct btrfs_extent_ref);
829 		btrfs_set_ref_root(leaf, ref, ref_root);
830 		btrfs_set_ref_generation(leaf, ref, ref_generation);
831 		btrfs_set_ref_objectid(leaf, ref, owner_objectid);
832 		btrfs_set_ref_num_refs(leaf, ref, 1);
833 	} else if (ret == -EEXIST) {
834 		u64 existing_owner;
835 		BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
836 		leaf = path->nodes[0];
837 		ref = btrfs_item_ptr(leaf, path->slots[0],
838 				     struct btrfs_extent_ref);
839 		if (btrfs_ref_root(leaf, ref) != ref_root ||
840 		    btrfs_ref_generation(leaf, ref) != ref_generation) {
841 			ret = -EIO;
842 			WARN_ON(1);
843 			goto out;
844 		}
845 
846 		num_refs = btrfs_ref_num_refs(leaf, ref);
847 		BUG_ON(num_refs == 0);
848 		btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
849 
850 		existing_owner = btrfs_ref_objectid(leaf, ref);
851 		if (existing_owner != owner_objectid &&
852 		    existing_owner != BTRFS_MULTIPLE_OBJECTIDS) {
853 			btrfs_set_ref_objectid(leaf, ref,
854 					BTRFS_MULTIPLE_OBJECTIDS);
855 		}
856 		ret = 0;
857 	} else {
858 		goto out;
859 	}
860 	btrfs_mark_buffer_dirty(path->nodes[0]);
861 out:
862 	btrfs_release_path(root, path);
863 	return ret;
864 }
865 
866 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
867 					  struct btrfs_root *root,
868 					  struct btrfs_path *path)
869 {
870 	struct extent_buffer *leaf;
871 	struct btrfs_extent_ref *ref;
872 	u32 num_refs;
873 	int ret = 0;
874 
875 	leaf = path->nodes[0];
876 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
877 	num_refs = btrfs_ref_num_refs(leaf, ref);
878 	BUG_ON(num_refs == 0);
879 	num_refs -= 1;
880 	if (num_refs == 0) {
881 		ret = btrfs_del_item(trans, root, path);
882 	} else {
883 		btrfs_set_ref_num_refs(leaf, ref, num_refs);
884 		btrfs_mark_buffer_dirty(leaf);
885 	}
886 	btrfs_release_path(root, path);
887 	return ret;
888 }
889 
890 #ifdef BIO_RW_DISCARD
891 static void btrfs_issue_discard(struct block_device *bdev,
892 				u64 start, u64 len)
893 {
894 	blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
895 }
896 #endif
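
/*
 * The >> 9 above converts byte offsets to 512-byte sectors, the unit
 * blkdev_issue_discard() expects: a 16KiB extent at byte offset 65536
 * becomes sector 128 with a length of 32 sectors.
 */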
897 
898 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
899 				u64 num_bytes)
900 {
901 #ifdef BIO_RW_DISCARD
902 	int ret;
903 	u64 map_length = num_bytes;
904 	struct btrfs_multi_bio *multi = NULL;
905 
906 	/* Tell the block device(s) that the sectors can be discarded */
907 	ret = btrfs_map_block(&root->fs_info->mapping_tree, READ,
908 			      bytenr, &map_length, &multi, 0);
909 	if (!ret) {
910 		struct btrfs_bio_stripe *stripe = multi->stripes;
911 		int i;
912 
913 		if (map_length > num_bytes)
914 			map_length = num_bytes;
915 
916 		for (i = 0; i < multi->num_stripes; i++, stripe++) {
917 			btrfs_issue_discard(stripe->dev->bdev,
918 					    stripe->physical,
919 					    map_length);
920 		}
921 		kfree(multi);
922 	}
923 
924 	return ret;
925 #else
926 	return 0;
927 #endif
928 }
929 
930 static noinline int free_extents(struct btrfs_trans_handle *trans,
931 				 struct btrfs_root *extent_root,
932 				 struct list_head *del_list)
933 {
934 	struct btrfs_fs_info *info = extent_root->fs_info;
935 	struct btrfs_path *path;
936 	struct btrfs_key key, found_key;
937 	struct extent_buffer *leaf;
938 	struct list_head *cur;
939 	struct pending_extent_op *op;
940 	struct btrfs_extent_item *ei;
941 	int ret, num_to_del, extent_slot = 0, found_extent = 0;
942 	u32 refs;
943 	u64 bytes_freed = 0;
944 
945 	path = btrfs_alloc_path();
946 	if (!path)
947 		return -ENOMEM;
948 	path->reada = 1;
949 
950 search:
951 	/* search for the backref for the current ref we want to delete */
952 	cur = del_list->next;
953 	op = list_entry(cur, struct pending_extent_op, list);
954 	ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
955 				    op->orig_parent,
956 				    extent_root->root_key.objectid,
957 				    op->orig_generation, op->level, 1);
958 	if (ret) {
959 		printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
960 		       "root %llu gen %llu owner %u\n",
961 		       (unsigned long long)op->bytenr,
962 		       (unsigned long long)extent_root->root_key.objectid,
963 		       (unsigned long long)op->orig_generation, op->level);
964 		btrfs_print_leaf(extent_root, path->nodes[0]);
965 		WARN_ON(1);
966 		goto out;
967 	}
968 
969 	extent_slot = path->slots[0];
970 	num_to_del = 1;
971 	found_extent = 0;
972 
973 	/*
974 	 * if we aren't the first item on the leaf we can move back one and see
975 	 * if our ref is right next to our extent item
976 	 */
977 	if (likely(extent_slot)) {
978 		extent_slot--;
979 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
980 				      extent_slot);
981 		if (found_key.objectid == op->bytenr &&
982 		    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
983 		    found_key.offset == op->num_bytes) {
984 			num_to_del++;
985 			found_extent = 1;
986 		}
987 	}
988 
989 	/*
990 	 * if we didn't find the extent we need to delete the backref and then
991 	 * search for the extent item key so we can update its ref count
992 	 */
993 	if (!found_extent) {
994 		key.objectid = op->bytenr;
995 		key.type = BTRFS_EXTENT_ITEM_KEY;
996 		key.offset = op->num_bytes;
997 
998 		ret = remove_extent_backref(trans, extent_root, path);
999 		BUG_ON(ret);
1000 		btrfs_release_path(extent_root, path);
1001 		ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
1002 		BUG_ON(ret);
1003 		extent_slot = path->slots[0];
1004 	}
1005 
1006 	/* this is where we update the ref count for the extent */
1007 	leaf = path->nodes[0];
1008 	ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
1009 	refs = btrfs_extent_refs(leaf, ei);
1010 	BUG_ON(refs == 0);
1011 	refs--;
1012 	btrfs_set_extent_refs(leaf, ei, refs);
1013 
1014 	btrfs_mark_buffer_dirty(leaf);
1015 
1016 	/*
1017 	 * This extent needs deleting.  The reason cur_slot is extent_slot +
1018 	 * num_to_del is because extent_slot points to the slot where the extent
1019 	 * is, and if the backref was not right next to the extent we will be
1020 	 * deleting at least 1 item, and will want to start searching at the
1021 	 * slot directly next to extent_slot.  However if we did find the
1022 	 * backref next to the extent item then we will be deleting at least 2
1023 	 * items and will want to start searching directly after the ref slot
1024 	 */
1025 	if (!refs) {
1026 		struct list_head *pos, *n, *end;
1027 		int cur_slot = extent_slot+num_to_del;
1028 		u64 super_used;
1029 		u64 root_used;
1030 
1031 		path->slots[0] = extent_slot;
1032 		bytes_freed = op->num_bytes;
1033 
1034 		mutex_lock(&info->pinned_mutex);
1035 		ret = pin_down_bytes(trans, extent_root, op->bytenr,
1036 				     op->num_bytes, op->level >=
1037 				     BTRFS_FIRST_FREE_OBJECTID);
1038 		mutex_unlock(&info->pinned_mutex);
1039 		BUG_ON(ret < 0);
1040 		op->del = ret;
1041 
1042 		/*
1043 		 * we need to see if we can delete multiple things at once, so
1044 		 * start looping through the list of extents we are wanting to
1045 		 * delete and see if their extent/backrefs are right next to
1046 		 * each other and the extents only have 1 ref
1047 		 */
1048 		for (pos = cur->next; pos != del_list; pos = pos->next) {
1049 			struct pending_extent_op *tmp;
1050 
1051 			tmp = list_entry(pos, struct pending_extent_op, list);
1052 
1053 			/* we only want to delete extent+ref at this stage */
1054 			if (cur_slot >= btrfs_header_nritems(leaf) - 1)
1055 				break;
1056 
1057 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
1058 			if (found_key.objectid != tmp->bytenr ||
1059 			    found_key.type != BTRFS_EXTENT_ITEM_KEY ||
1060 			    found_key.offset != tmp->num_bytes)
1061 				break;
1062 
1063 			/* check to make sure this extent only has one ref */
1064 			ei = btrfs_item_ptr(leaf, cur_slot,
1065 					    struct btrfs_extent_item);
1066 			if (btrfs_extent_refs(leaf, ei) != 1)
1067 				break;
1068 
1069 			btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
1070 			if (found_key.objectid != tmp->bytenr ||
1071 			    found_key.type != BTRFS_EXTENT_REF_KEY ||
1072 			    found_key.offset != tmp->orig_parent)
1073 				break;
1074 
1075 			/*
1076 			 * the ref is right next to the extent, we can set the
1077 			 * ref count to 0 since we will delete them both now
1078 			 */
1079 			btrfs_set_extent_refs(leaf, ei, 0);
1080 
1081 			/* pin down the bytes for this extent */
1082 			mutex_lock(&info->pinned_mutex);
1083 			ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
1084 					     tmp->num_bytes, tmp->level >=
1085 					     BTRFS_FIRST_FREE_OBJECTID);
1086 			mutex_unlock(&info->pinned_mutex);
1087 			BUG_ON(ret < 0);
1088 
1089 			/*
1090 			 * use the del field to tell if we need to go ahead and
1091 			 * free up the extent when we delete the item or not.
1092 			 */
1093 			tmp->del = ret;
1094 			bytes_freed += tmp->num_bytes;
1095 
1096 			num_to_del += 2;
1097 			cur_slot += 2;
1098 		}
1099 		end = pos;
1100 
1101 		/* update the free space counters */
1102 		spin_lock(&info->delalloc_lock);
1103 		super_used = btrfs_super_bytes_used(&info->super_copy);
1104 		btrfs_set_super_bytes_used(&info->super_copy,
1105 					   super_used - bytes_freed);
1106 
1107 		root_used = btrfs_root_used(&extent_root->root_item);
1108 		btrfs_set_root_used(&extent_root->root_item,
1109 				    root_used - bytes_freed);
1110 		spin_unlock(&info->delalloc_lock);
1111 
1112 		/* delete the items */
1113 		ret = btrfs_del_items(trans, extent_root, path,
1114 				      path->slots[0], num_to_del);
1115 		BUG_ON(ret);
1116 
1117 		/*
1118 		 * loop through the extents we deleted and do the cleanup work
1119 		 * on them
1120 		 */
1121 		for (pos = cur, n = pos->next; pos != end;
1122 		     pos = n, n = pos->next) {
1123 			struct pending_extent_op *tmp;
1124 			tmp = list_entry(pos, struct pending_extent_op, list);
1125 
1126 			/*
1127 			 * remember tmp->del tells us whether or not we pinned
1128 			 * down the extent
1129 			 */
1130 			ret = update_block_group(trans, extent_root,
1131 						 tmp->bytenr, tmp->num_bytes, 0,
1132 						 tmp->del);
1133 			BUG_ON(ret);
1134 
1135 			list_del_init(&tmp->list);
1136 			unlock_extent(&info->extent_ins, tmp->bytenr,
1137 				      tmp->bytenr + tmp->num_bytes - 1,
1138 				      GFP_NOFS);
1139 			kfree(tmp);
1140 		}
1141 	} else if (refs && found_extent) {
1142 		/*
1143 		 * the ref and extent were right next to each other, but the
1144 		 * extent still has a ref, so just free the backref and keep
1145 		 * going
1146 		 */
1147 		ret = remove_extent_backref(trans, extent_root, path);
1148 		BUG_ON(ret);
1149 
1150 		list_del_init(&op->list);
1151 		unlock_extent(&info->extent_ins, op->bytenr,
1152 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1153 		kfree(op);
1154 	} else {
1155 		/*
1156 		 * the extent has multiple refs and the backref we were looking
1157 		 * for was not right next to it, so just unlock and go next,
1158 		 * we're good to go
1159 		 */
1160 		list_del_init(&op->list);
1161 		unlock_extent(&info->extent_ins, op->bytenr,
1162 			      op->bytenr + op->num_bytes - 1, GFP_NOFS);
1163 		kfree(op);
1164 	}
1165 
1166 	btrfs_release_path(extent_root, path);
1167 	if (!list_empty(del_list))
1168 		goto search;
1169 
1170 out:
1171 	btrfs_free_path(path);
1172 	return ret;
1173 }
1174 
1175 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1176 				     struct btrfs_root *root, u64 bytenr,
1177 				     u64 orig_parent, u64 parent,
1178 				     u64 orig_root, u64 ref_root,
1179 				     u64 orig_generation, u64 ref_generation,
1180 				     u64 owner_objectid)
1181 {
1182 	int ret;
1183 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1184 	struct btrfs_path *path;
1185 
1186 	if (root == root->fs_info->extent_root) {
1187 		struct pending_extent_op *extent_op;
1188 		u64 num_bytes;
1189 
1190 		BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
1191 		num_bytes = btrfs_level_size(root, (int)owner_objectid);
1192 		mutex_lock(&root->fs_info->extent_ins_mutex);
1193 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
1194 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
1195 			u64 priv;
1196 			ret = get_state_private(&root->fs_info->extent_ins,
1197 						bytenr, &priv);
1198 			BUG_ON(ret);
1199 			extent_op = (struct pending_extent_op *)
1200 							(unsigned long)priv;
1201 			BUG_ON(extent_op->parent != orig_parent);
1202 			BUG_ON(extent_op->generation != orig_generation);
1203 
1204 			extent_op->parent = parent;
1205 			extent_op->generation = ref_generation;
1206 		} else {
1207 			extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
1208 			BUG_ON(!extent_op);
1209 
1210 			extent_op->type = PENDING_BACKREF_UPDATE;
1211 			extent_op->bytenr = bytenr;
1212 			extent_op->num_bytes = num_bytes;
1213 			extent_op->parent = parent;
1214 			extent_op->orig_parent = orig_parent;
1215 			extent_op->generation = ref_generation;
1216 			extent_op->orig_generation = orig_generation;
1217 			extent_op->level = (int)owner_objectid;
1218 			INIT_LIST_HEAD(&extent_op->list);
1219 			extent_op->del = 0;
1220 
1221 			set_extent_bits(&root->fs_info->extent_ins,
1222 					bytenr, bytenr + num_bytes - 1,
1223 					EXTENT_WRITEBACK, GFP_NOFS);
1224 			set_state_private(&root->fs_info->extent_ins,
1225 					  bytenr, (unsigned long)extent_op);
1226 		}
1227 		mutex_unlock(&root->fs_info->extent_ins_mutex);
1228 		return 0;
1229 	}
1230 
1231 	path = btrfs_alloc_path();
1232 	if (!path)
1233 		return -ENOMEM;
1234 	ret = lookup_extent_backref(trans, extent_root, path,
1235 				    bytenr, orig_parent, orig_root,
1236 				    orig_generation, owner_objectid, 1);
1237 	if (ret)
1238 		goto out;
1239 	ret = remove_extent_backref(trans, extent_root, path);
1240 	if (ret)
1241 		goto out;
1242 	ret = insert_extent_backref(trans, extent_root, path, bytenr,
1243 				    parent, ref_root, ref_generation,
1244 				    owner_objectid);
1245 	BUG_ON(ret);
1246 	finish_current_insert(trans, extent_root, 0);
1247 	del_pending_extents(trans, extent_root, 0);
1248 out:
1249 	btrfs_free_path(path);
1250 	return ret;
1251 }
1252 
1253 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
1254 			    struct btrfs_root *root, u64 bytenr,
1255 			    u64 orig_parent, u64 parent,
1256 			    u64 ref_root, u64 ref_generation,
1257 			    u64 owner_objectid)
1258 {
1259 	int ret;
1260 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1261 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1262 		return 0;
1263 	ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
1264 					parent, ref_root, ref_root,
1265 					ref_generation, ref_generation,
1266 					owner_objectid);
1267 	return ret;
1268 }
1269 
1270 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1271 				  struct btrfs_root *root, u64 bytenr,
1272 				  u64 orig_parent, u64 parent,
1273 				  u64 orig_root, u64 ref_root,
1274 				  u64 orig_generation, u64 ref_generation,
1275 				  u64 owner_objectid)
1276 {
1277 	struct btrfs_path *path;
1278 	int ret;
1279 	struct btrfs_key key;
1280 	struct extent_buffer *l;
1281 	struct btrfs_extent_item *item;
1282 	u32 refs;
1283 
1284 	path = btrfs_alloc_path();
1285 	if (!path)
1286 		return -ENOMEM;
1287 
1288 	path->reada = 1;
1289 	key.objectid = bytenr;
1290 	key.type = BTRFS_EXTENT_ITEM_KEY;
1291 	key.offset = (u64)-1;
1292 
1293 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1294 				0, 1);
1295 	if (ret < 0)
1296 		return ret;
1297 	BUG_ON(ret == 0 || path->slots[0] == 0);
1298 
1299 	path->slots[0]--;
1300 	l = path->nodes[0];
1301 
1302 	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
1303 	if (key.objectid != bytenr) {
1304 		btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]);
1305 		printk(KERN_ERR "btrfs wanted %llu found %llu\n",
1306 		       (unsigned long long)bytenr,
1307 		       (unsigned long long)key.objectid);
1308 		BUG();
1309 	}
1310 	BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
1311 
1312 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1313 	refs = btrfs_extent_refs(l, item);
1314 	btrfs_set_extent_refs(l, item, refs + 1);
1315 	btrfs_mark_buffer_dirty(path->nodes[0]);
1316 
1317 	btrfs_release_path(root->fs_info->extent_root, path);
1318 
1319 	path->reada = 1;
1320 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
1321 				    path, bytenr, parent,
1322 				    ref_root, ref_generation,
1323 				    owner_objectid);
1324 	BUG_ON(ret);
1325 	finish_current_insert(trans, root->fs_info->extent_root, 0);
1326 	del_pending_extents(trans, root->fs_info->extent_root, 0);
1327 
1328 	btrfs_free_path(path);
1329 	return 0;
1330 }
1331 
1332 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1333 			 struct btrfs_root *root,
1334 			 u64 bytenr, u64 num_bytes, u64 parent,
1335 			 u64 ref_root, u64 ref_generation,
1336 			 u64 owner_objectid)
1337 {
1338 	int ret;
1339 	if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
1340 	    owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
1341 		return 0;
1342 	ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
1343 				     0, ref_root, 0, ref_generation,
1344 				     owner_objectid);
1345 	return ret;
1346 }
1347 
1348 int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
1349 			 struct btrfs_root *root)
1350 {
1351 	u64 start;
1352 	u64 end;
1353 	int ret;
1354 
1355 	while (1) {
1356 		finish_current_insert(trans, root->fs_info->extent_root, 1);
1357 		del_pending_extents(trans, root->fs_info->extent_root, 1);
1358 
1359 		/* is there more work to do? */
1360 		ret = find_first_extent_bit(&root->fs_info->pending_del,
1361 					    0, &start, &end, EXTENT_WRITEBACK);
1362 		if (!ret)
1363 			continue;
1364 		ret = find_first_extent_bit(&root->fs_info->extent_ins,
1365 					    0, &start, &end, EXTENT_WRITEBACK);
1366 		if (!ret)
1367 			continue;
1368 		break;
1369 	}
1370 	return 0;
1371 }
1372 
1373 int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
1374 			    struct btrfs_root *root, u64 bytenr,
1375 			    u64 num_bytes, u32 *refs)
1376 {
1377 	struct btrfs_path *path;
1378 	int ret;
1379 	struct btrfs_key key;
1380 	struct extent_buffer *l;
1381 	struct btrfs_extent_item *item;
1382 
1383 	WARN_ON(num_bytes < root->sectorsize);
1384 	path = btrfs_alloc_path();
1385 	path->reada = 1;
1386 	key.objectid = bytenr;
1387 	key.offset = num_bytes;
1388 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
1389 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
1390 				0, 0);
1391 	if (ret < 0)
1392 		goto out;
1393 	if (ret != 0) {
1394 		btrfs_print_leaf(root, path->nodes[0]);
1395 		printk(KERN_INFO "btrfs failed to find block number %llu\n",
1396 		       (unsigned long long)bytenr);
1397 		BUG();
1398 	}
1399 	l = path->nodes[0];
1400 	item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
1401 	*refs = btrfs_extent_refs(l, item);
1402 out:
1403 	btrfs_free_path(path);
1404 	return 0;
1405 }
1406 
1407 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
1408 			  struct btrfs_root *root, u64 objectid, u64 bytenr)
1409 {
1410 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1411 	struct btrfs_path *path;
1412 	struct extent_buffer *leaf;
1413 	struct btrfs_extent_ref *ref_item;
1414 	struct btrfs_key key;
1415 	struct btrfs_key found_key;
1416 	u64 ref_root;
1417 	u64 last_snapshot;
1418 	u32 nritems;
1419 	int ret;
1420 
1421 	key.objectid = bytenr;
1422 	key.offset = (u64)-1;
1423 	key.type = BTRFS_EXTENT_ITEM_KEY;
1424 
1425 	path = btrfs_alloc_path();
1426 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
1427 	if (ret < 0)
1428 		goto out;
1429 	BUG_ON(ret == 0);
1430 
1431 	ret = -ENOENT;
1432 	if (path->slots[0] == 0)
1433 		goto out;
1434 
1435 	path->slots[0]--;
1436 	leaf = path->nodes[0];
1437 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1438 
1439 	if (found_key.objectid != bytenr ||
1440 	    found_key.type != BTRFS_EXTENT_ITEM_KEY)
1441 		goto out;
1442 
1443 	last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1444 	while (1) {
1445 		leaf = path->nodes[0];
1446 		nritems = btrfs_header_nritems(leaf);
1447 		if (path->slots[0] >= nritems) {
1448 			ret = btrfs_next_leaf(extent_root, path);
1449 			if (ret < 0)
1450 				goto out;
1451 			if (ret == 0)
1452 				continue;
1453 			break;
1454 		}
1455 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1456 		if (found_key.objectid != bytenr)
1457 			break;
1458 
1459 		if (found_key.type != BTRFS_EXTENT_REF_KEY) {
1460 			path->slots[0]++;
1461 			continue;
1462 		}
1463 
1464 		ref_item = btrfs_item_ptr(leaf, path->slots[0],
1465 					  struct btrfs_extent_ref);
1466 		ref_root = btrfs_ref_root(leaf, ref_item);
1467 		if ((ref_root != root->root_key.objectid &&
1468 		     ref_root != BTRFS_TREE_LOG_OBJECTID) ||
1469 		     objectid != btrfs_ref_objectid(leaf, ref_item)) {
1470 			ret = 1;
1471 			goto out;
1472 		}
1473 		if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) {
1474 			ret = 1;
1475 			goto out;
1476 		}
1477 
1478 		path->slots[0]++;
1479 	}
1480 	ret = 0;
1481 out:
1482 	btrfs_free_path(path);
1483 	return ret;
1484 }
1485 
1486 int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1487 		    struct extent_buffer *buf, u32 nr_extents)
1488 {
1489 	struct btrfs_key key;
1490 	struct btrfs_file_extent_item *fi;
1491 	u64 root_gen;
1492 	u32 nritems;
1493 	int i;
1494 	int level;
1495 	int ret = 0;
1496 	int shared = 0;
1497 
1498 	if (!root->ref_cows)
1499 		return 0;
1500 
1501 	if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1502 		shared = 0;
1503 		root_gen = root->root_key.offset;
1504 	} else {
1505 		shared = 1;
1506 		root_gen = trans->transid - 1;
1507 	}
1508 
1509 	level = btrfs_header_level(buf);
1510 	nritems = btrfs_header_nritems(buf);
1511 
1512 	if (level == 0) {
1513 		struct btrfs_leaf_ref *ref;
1514 		struct btrfs_extent_info *info;
1515 
1516 		ref = btrfs_alloc_leaf_ref(root, nr_extents);
1517 		if (!ref) {
1518 			ret = -ENOMEM;
1519 			goto out;
1520 		}
1521 
1522 		ref->root_gen = root_gen;
1523 		ref->bytenr = buf->start;
1524 		ref->owner = btrfs_header_owner(buf);
1525 		ref->generation = btrfs_header_generation(buf);
1526 		ref->nritems = nr_extents;
1527 		info = ref->extents;
1528 
1529 		for (i = 0; nr_extents > 0 && i < nritems; i++) {
1530 			u64 disk_bytenr;
1531 			btrfs_item_key_to_cpu(buf, &key, i);
1532 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1533 				continue;
1534 			fi = btrfs_item_ptr(buf, i,
1535 					    struct btrfs_file_extent_item);
1536 			if (btrfs_file_extent_type(buf, fi) ==
1537 			    BTRFS_FILE_EXTENT_INLINE)
1538 				continue;
1539 			disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1540 			if (disk_bytenr == 0)
1541 				continue;
1542 
1543 			info->bytenr = disk_bytenr;
1544 			info->num_bytes =
1545 				btrfs_file_extent_disk_num_bytes(buf, fi);
1546 			info->objectid = key.objectid;
1547 			info->offset = key.offset;
1548 			info++;
1549 		}
1550 
1551 		ret = btrfs_add_leaf_ref(root, ref, shared);
1552 		if (ret == -EEXIST && shared) {
1553 			struct btrfs_leaf_ref *old;
1554 			old = btrfs_lookup_leaf_ref(root, ref->bytenr);
1555 			BUG_ON(!old);
1556 			btrfs_remove_leaf_ref(root, old);
1557 			btrfs_free_leaf_ref(root, old);
1558 			ret = btrfs_add_leaf_ref(root, ref, shared);
1559 		}
1560 		WARN_ON(ret);
1561 		btrfs_free_leaf_ref(root, ref);
1562 	}
1563 out:
1564 	return ret;
1565 }
1566 
1567 /* when a block goes through cow, we update the reference counts of
1568  * everything that block points to.  The internal pointers of the block
1569  * can be in just about any order, and it is likely to have clusters of
1570  * things that are close together and clusters of things that are not.
1571  *
1572  * To help reduce the seeks that come with updating all of these reference
1573  * counts, sort them by byte number before actual updates are done.
1574  *
1575  * struct refsort is used to match byte number to slot in the btree block.
1576  * we sort based on the byte number and then use the slot to actually
1577  * find the item.
1578  *
1579  * struct refsort is smaller than struct btrfs_item and smaller than
1580  * struct btrfs_key_ptr.  Since we're currently limited to the page size
1581  * for a btree block, there's no way for a kmalloc of refsorts for a
1582  * single node to be bigger than a page.
1583  */
1584 struct refsort {
1585 	u64 bytenr;
1586 	u32 slot;
1587 };
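
/*
 * Sanity check of the claim above: on 64-bit kernels struct refsort is
 * 16 bytes (a u64 and a u32, padded), so a single 4KB page holds 256
 * entries, more than the number of items or key pointers that fit in a
 * 4KB tree block.
 */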
1588 
1589 /*
1590  * for passing into sort()
1591  */
1592 static int refsort_cmp(const void *a_void, const void *b_void)
1593 {
1594 	const struct refsort *a = a_void;
1595 	const struct refsort *b = b_void;
1596 
1597 	if (a->bytenr < b->bytenr)
1598 		return -1;
1599 	if (a->bytenr > b->bytenr)
1600 		return 1;
1601 	return 0;
1602 }
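
/*
 * Usage sketch, matching btrfs_inc_ref() below: fill sorted[] with
 * (bytenr, slot) pairs, then
 *
 *	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
 *
 * and walk sorted[] in order, so the reference updates hit the extent
 * tree in mostly-sequential byte order.
 */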
1603 
1604 
1605 noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
1606 			   struct btrfs_root *root,
1607 			   struct extent_buffer *orig_buf,
1608 			   struct extent_buffer *buf, u32 *nr_extents)
1609 {
1610 	u64 bytenr;
1611 	u64 ref_root;
1612 	u64 orig_root;
1613 	u64 ref_generation;
1614 	u64 orig_generation;
1615 	struct refsort *sorted;
1616 	u32 nritems;
1617 	u32 nr_file_extents = 0;
1618 	struct btrfs_key key;
1619 	struct btrfs_file_extent_item *fi;
1620 	int i;
1621 	int level;
1622 	int ret = 0;
1623 	int faili = 0;
1624 	int refi = 0;
1625 	int slot;
1626 	int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
1627 			    u64, u64, u64, u64, u64, u64, u64, u64);
1628 
1629 	ref_root = btrfs_header_owner(buf);
1630 	ref_generation = btrfs_header_generation(buf);
1631 	orig_root = btrfs_header_owner(orig_buf);
1632 	orig_generation = btrfs_header_generation(orig_buf);
1633 
1634 	nritems = btrfs_header_nritems(buf);
1635 	level = btrfs_header_level(buf);
1636 
1637 	sorted = kmalloc(sizeof(struct refsort) * nritems, GFP_NOFS);
1638 	BUG_ON(!sorted);
1639 
1640 	if (root->ref_cows) {
1641 		process_func = __btrfs_inc_extent_ref;
1642 	} else {
1643 		if (level == 0 &&
1644 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1645 			goto out;
1646 		if (level != 0 &&
1647 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1648 			goto out;
1649 		process_func = __btrfs_update_extent_ref;
1650 	}
1651 
1652 	/*
1653 	 * we make two passes through the items.  In the first pass we
1654 	 * only record the byte number and slot.  Then we sort based on
1655 	 * byte number and do the actual work based on the sorted results
1656 	 */
1657 	for (i = 0; i < nritems; i++) {
1658 		cond_resched();
1659 		if (level == 0) {
1660 			btrfs_item_key_to_cpu(buf, &key, i);
1661 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1662 				continue;
1663 			fi = btrfs_item_ptr(buf, i,
1664 					    struct btrfs_file_extent_item);
1665 			if (btrfs_file_extent_type(buf, fi) ==
1666 			    BTRFS_FILE_EXTENT_INLINE)
1667 				continue;
1668 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1669 			if (bytenr == 0)
1670 				continue;
1671 
1672 			nr_file_extents++;
1673 			sorted[refi].bytenr = bytenr;
1674 			sorted[refi].slot = i;
1675 			refi++;
1676 		} else {
1677 			bytenr = btrfs_node_blockptr(buf, i);
1678 			sorted[refi].bytenr = bytenr;
1679 			sorted[refi].slot = i;
1680 			refi++;
1681 		}
1682 	}
1683 	/*
1684 	 * if refi == 0, we didn't actually put anything into the sorted
1685 	 * array and we're done
1686 	 */
1687 	if (refi == 0)
1688 		goto out;
1689 
1690 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
1691 
1692 	for (i = 0; i < refi; i++) {
1693 		cond_resched();
1694 		slot = sorted[i].slot;
1695 		bytenr = sorted[i].bytenr;
1696 
1697 		if (level == 0) {
1698 			btrfs_item_key_to_cpu(buf, &key, slot);
1699 
1700 			ret = process_func(trans, root, bytenr,
1701 					   orig_buf->start, buf->start,
1702 					   orig_root, ref_root,
1703 					   orig_generation, ref_generation,
1704 					   key.objectid);
1705 
1706 			if (ret) {
1707 				faili = slot;
1708 				WARN_ON(1);
1709 				goto fail;
1710 			}
1711 		} else {
1712 			ret = process_func(trans, root, bytenr,
1713 					   orig_buf->start, buf->start,
1714 					   orig_root, ref_root,
1715 					   orig_generation, ref_generation,
1716 					   level - 1);
1717 			if (ret) {
1718 				faili = slot;
1719 				WARN_ON(1);
1720 				goto fail;
1721 			}
1722 		}
1723 	}
1724 out:
1725 	kfree(sorted);
1726 	if (nr_extents) {
1727 		if (level == 0)
1728 			*nr_extents = nr_file_extents;
1729 		else
1730 			*nr_extents = nritems;
1731 	}
1732 	return 0;
1733 fail:
1734 	kfree(sorted);
1735 	WARN_ON(1);
1736 	return ret;
1737 }
1738 
1739 int btrfs_update_ref(struct btrfs_trans_handle *trans,
1740 		     struct btrfs_root *root, struct extent_buffer *orig_buf,
1741 		     struct extent_buffer *buf, int start_slot, int nr)
1742 
1743 {
1744 	u64 bytenr;
1745 	u64 ref_root;
1746 	u64 orig_root;
1747 	u64 ref_generation;
1748 	u64 orig_generation;
1749 	struct btrfs_key key;
1750 	struct btrfs_file_extent_item *fi;
1751 	int i;
1752 	int ret;
1753 	int slot;
1754 	int level;
1755 
1756 	BUG_ON(start_slot < 0);
1757 	BUG_ON(start_slot + nr > btrfs_header_nritems(buf));
1758 
1759 	ref_root = btrfs_header_owner(buf);
1760 	ref_generation = btrfs_header_generation(buf);
1761 	orig_root = btrfs_header_owner(orig_buf);
1762 	orig_generation = btrfs_header_generation(orig_buf);
1763 	level = btrfs_header_level(buf);
1764 
1765 	if (!root->ref_cows) {
1766 		if (level == 0 &&
1767 		    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
1768 			return 0;
1769 		if (level != 0 &&
1770 		    root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID)
1771 			return 0;
1772 	}
1773 
1774 	for (i = 0, slot = start_slot; i < nr; i++, slot++) {
1775 		cond_resched();
1776 		if (level == 0) {
1777 			btrfs_item_key_to_cpu(buf, &key, slot);
1778 			if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
1779 				continue;
1780 			fi = btrfs_item_ptr(buf, slot,
1781 					    struct btrfs_file_extent_item);
1782 			if (btrfs_file_extent_type(buf, fi) ==
1783 			    BTRFS_FILE_EXTENT_INLINE)
1784 				continue;
1785 			bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
1786 			if (bytenr == 0)
1787 				continue;
1788 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1789 					    orig_buf->start, buf->start,
1790 					    orig_root, ref_root,
1791 					    orig_generation, ref_generation,
1792 					    key.objectid);
1793 			if (ret)
1794 				goto fail;
1795 		} else {
1796 			bytenr = btrfs_node_blockptr(buf, slot);
1797 			ret = __btrfs_update_extent_ref(trans, root, bytenr,
1798 					    orig_buf->start, buf->start,
1799 					    orig_root, ref_root,
1800 					    orig_generation, ref_generation,
1801 					    level - 1);
1802 			if (ret)
1803 				goto fail;
1804 		}
1805 	}
1806 	return 0;
1807 fail:
1808 	WARN_ON(1);
1809 	return -1;
1810 }
1811 
1812 static int write_one_cache_group(struct btrfs_trans_handle *trans,
1813 				 struct btrfs_root *root,
1814 				 struct btrfs_path *path,
1815 				 struct btrfs_block_group_cache *cache)
1816 {
1817 	int ret;
1818 	int pending_ret;
1819 	struct btrfs_root *extent_root = root->fs_info->extent_root;
1820 	unsigned long bi;
1821 	struct extent_buffer *leaf;
1822 
1823 	ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
1824 	if (ret < 0)
1825 		goto fail;
1826 	BUG_ON(ret);
1827 
1828 	leaf = path->nodes[0];
1829 	bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
1830 	write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
1831 	btrfs_mark_buffer_dirty(leaf);
1832 	btrfs_release_path(extent_root, path);
1833 fail:
1834 	finish_current_insert(trans, extent_root, 0);
1835 	pending_ret = del_pending_extents(trans, extent_root, 0);
1836 	if (ret)
1837 		return ret;
1838 	if (pending_ret)
1839 		return pending_ret;
1840 	return 0;
1841 
1842 }
1843 
1844 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
1845 				   struct btrfs_root *root)
1846 {
1847 	struct btrfs_block_group_cache *cache, *entry;
1848 	struct rb_node *n;
1849 	int err = 0;
1850 	int werr = 0;
1851 	struct btrfs_path *path;
1852 	u64 last = 0;
1853 
1854 	path = btrfs_alloc_path();
1855 	if (!path)
1856 		return -ENOMEM;
1857 
1858 	while (1) {
1859 		cache = NULL;
1860 		spin_lock(&root->fs_info->block_group_cache_lock);
1861 		for (n = rb_first(&root->fs_info->block_group_cache_tree);
1862 		     n; n = rb_next(n)) {
1863 			entry = rb_entry(n, struct btrfs_block_group_cache,
1864 					 cache_node);
1865 			if (entry->dirty) {
1866 				cache = entry;
1867 				break;
1868 			}
1869 		}
1870 		spin_unlock(&root->fs_info->block_group_cache_lock);
1871 
1872 		if (!cache)
1873 			break;
1874 
1875 		cache->dirty = 0;
1876 		last += cache->key.offset;
1877 
1878 		err = write_one_cache_group(trans, root,
1879 					    path, cache);
1880 		/*
1881 		 * if we fail to write this cache group, remember
1882 		 * the error but keep going so the remaining dirty
1883 		 * groups still get written out
1884 		 */
1885 		if (err) {
1886 			werr = err;
1887 			continue;
1888 		}
1889 	}
1890 	btrfs_free_path(path);
1891 	return werr;
1892 }
1893 
1894 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
1895 {
1896 	struct btrfs_block_group_cache *block_group;
1897 	int readonly = 0;
1898 
1899 	block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
1900 	if (!block_group || block_group->ro)
1901 		readonly = 1;
1902 	if (block_group)
1903 		put_block_group(block_group);
1904 	return readonly;
1905 }
1906 
1907 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
1908 			     u64 total_bytes, u64 bytes_used,
1909 			     struct btrfs_space_info **space_info)
1910 {
1911 	struct btrfs_space_info *found;
1912 
1913 	found = __find_space_info(info, flags);
1914 	if (found) {
1915 		spin_lock(&found->lock);
1916 		found->total_bytes += total_bytes;
1917 		found->bytes_used += bytes_used;
1918 		found->full = 0;
1919 		spin_unlock(&found->lock);
1920 		*space_info = found;
1921 		return 0;
1922 	}
1923 	found = kzalloc(sizeof(*found), GFP_NOFS);
1924 	if (!found)
1925 		return -ENOMEM;
1926 
1927 	INIT_LIST_HEAD(&found->block_groups);
1928 	init_rwsem(&found->groups_sem);
1929 	spin_lock_init(&found->lock);
1930 	found->flags = flags;
1931 	found->total_bytes = total_bytes;
1932 	found->bytes_used = bytes_used;
1933 	found->bytes_pinned = 0;
1934 	found->bytes_reserved = 0;
1935 	found->bytes_readonly = 0;
1936 	found->bytes_delalloc = 0;
1937 	found->full = 0;
1938 	found->force_alloc = 0;
1939 	*space_info = found;
1940 	list_add_rcu(&found->list, &info->space_info);
1941 	return 0;
1942 }
1943 
1944 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
1945 {
1946 	u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 |
1947 				   BTRFS_BLOCK_GROUP_RAID1 |
1948 				   BTRFS_BLOCK_GROUP_RAID10 |
1949 				   BTRFS_BLOCK_GROUP_DUP);
1950 	if (extra_flags) {
1951 		if (flags & BTRFS_BLOCK_GROUP_DATA)
1952 			fs_info->avail_data_alloc_bits |= extra_flags;
1953 		if (flags & BTRFS_BLOCK_GROUP_METADATA)
1954 			fs_info->avail_metadata_alloc_bits |= extra_flags;
1955 		if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
1956 			fs_info->avail_system_alloc_bits |= extra_flags;
1957 	}
1958 }
1959 
1960 static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
1961 {
1962 	spin_lock(&cache->space_info->lock);
1963 	spin_lock(&cache->lock);
1964 	if (!cache->ro) {
1965 		cache->space_info->bytes_readonly += cache->key.offset -
1966 					btrfs_block_group_used(&cache->item);
1967 		cache->ro = 1;
1968 	}
1969 	spin_unlock(&cache->lock);
1970 	spin_unlock(&cache->space_info->lock);
1971 }
1972 
1973 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
1974 {
1975 	u64 num_devices = root->fs_info->fs_devices->rw_devices;
1976 
1977 	if (num_devices == 1)
1978 		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
1979 	if (num_devices < 4)
1980 		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
1981 
1982 	if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
1983 	    (flags & (BTRFS_BLOCK_GROUP_RAID1 |
1984 		      BTRFS_BLOCK_GROUP_RAID10))) {
1985 		flags &= ~BTRFS_BLOCK_GROUP_DUP;
1986 	}
1987 
1988 	if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
1989 	    (flags & BTRFS_BLOCK_GROUP_RAID10)) {
1990 		flags &= ~BTRFS_BLOCK_GROUP_RAID1;
1991 	}
1992 
1993 	if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
1994 	    ((flags & BTRFS_BLOCK_GROUP_RAID1) |
1995 	     (flags & BTRFS_BLOCK_GROUP_RAID10) |
1996 	     (flags & BTRFS_BLOCK_GROUP_DUP)))
1997 		flags &= ~BTRFS_BLOCK_GROUP_RAID0;
1998 	return flags;
1999 }
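
/*
 * Worked example for the reductions above (illustrative): on a filesystem
 * with a single rw device, DATA | RAID1 loses the RAID1 bit (there is
 * nothing to mirror to) and degrades to plain DATA; with two devices,
 * DUP | RAID1 drops DUP because the mirrored profile wins; RAID10
 * additionally needs at least four devices to survive the first checks.
 */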
2000 
2001 static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
2002 {
2003 	struct btrfs_fs_info *info = root->fs_info;
2004 	u64 alloc_profile;
2005 
2006 	if (data) {
2007 		alloc_profile = info->avail_data_alloc_bits &
2008 			info->data_alloc_profile;
2009 		data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2010 	} else if (root == root->fs_info->chunk_root) {
2011 		alloc_profile = info->avail_system_alloc_bits &
2012 			info->system_alloc_profile;
2013 		data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2014 	} else {
2015 		alloc_profile = info->avail_metadata_alloc_bits &
2016 			info->metadata_alloc_profile;
2017 		data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2018 	}
2019 
2020 	return btrfs_reduce_alloc_profile(root, data);
2021 }
2022 
2023 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2024 {
2025 	u64 alloc_target;
2026 
2027 	alloc_target = btrfs_get_alloc_profile(root, 1);
2028 	BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2029 						       alloc_target);
2030 }
2031 
2032 /*
2033  * for now this just makes sure we have enough metadata space free: at
2034  * least 20% while the space info can still grow, and 5% once it is full.
2035  */
2036 int btrfs_check_metadata_free_space(struct btrfs_root *root)
2037 {
2038 	struct btrfs_fs_info *info = root->fs_info;
2039 	struct btrfs_space_info *meta_sinfo;
2040 	u64 alloc_target, thresh;
2041 	int committed = 0, ret;
2042 
2043 	/* get the space info for where the metadata will live */
2044 	alloc_target = btrfs_get_alloc_profile(root, 0);
2045 	meta_sinfo = __find_space_info(info, alloc_target);
2046 
2047 again:
2048 	spin_lock(&meta_sinfo->lock);
2049 	if (!meta_sinfo->full)
2050 		thresh = meta_sinfo->total_bytes * 80;
2051 	else
2052 		thresh = meta_sinfo->total_bytes * 95;
2053 
2054 	do_div(thresh, 100);
2055 
2056 	if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
2057 	    meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) {
2058 		struct btrfs_trans_handle *trans;
2059 		if (!meta_sinfo->full) {
2060 			meta_sinfo->force_alloc = 1;
2061 			spin_unlock(&meta_sinfo->lock);
2062 
2063 			trans = btrfs_start_transaction(root, 1);
2064 			if (!trans)
2065 				return -ENOMEM;
2066 
2067 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2068 					     2 * 1024 * 1024, alloc_target, 0);
2069 			btrfs_end_transaction(trans, root);
2070 			goto again;
2071 		}
2072 		spin_unlock(&meta_sinfo->lock);
2073 
2074 		if (!committed) {
2075 			committed = 1;
2076 			trans = btrfs_join_transaction(root, 1);
2077 			if (!trans)
2078 				return -ENOMEM;
2079 			ret = btrfs_commit_transaction(trans, root);
2080 			if (ret)
2081 				return ret;
2082 			goto again;
2083 		}
2084 		return -ENOSPC;
2085 	}
2086 	spin_unlock(&meta_sinfo->lock);
2087 
2088 	return 0;
2089 }
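
/*
 * Worked numbers for the threshold above (illustrative): with 10GB of
 * metadata space that is not yet full, thresh = 10GB * 80 / 100 = 8GB.
 * Once bytes_used + bytes_reserved + bytes_pinned + bytes_readonly passes
 * 8GB we first try to allocate a new chunk, then fall back to committing
 * the transaction, and only after both have failed do we return -ENOSPC.
 */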
2090 
2091 /*
2092  * This will check the space info that the inode allocates from, to make
2093  * sure we have enough room for 'bytes' more bytes of data.
2094  */
2095 int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
2096 				u64 bytes)
2097 {
2098 	struct btrfs_space_info *data_sinfo;
2099 	int ret = 0, committed = 0;
2100 
2101 	/* make sure bytes are sectorsize aligned */
2102 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2103 
2104 	data_sinfo = BTRFS_I(inode)->space_info;
2105 again:
2106 	/* make sure we have enough space to handle the data first */
2107 	spin_lock(&data_sinfo->lock);
2108 	if (data_sinfo->total_bytes - data_sinfo->bytes_used -
2109 	    data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved -
2110 	    data_sinfo->bytes_pinned - data_sinfo->bytes_readonly -
2111 	    data_sinfo->bytes_may_use < bytes) {
2112 		struct btrfs_trans_handle *trans;
2113 
2114 		/*
2115 		 * if we don't have enough free bytes in this space then we need
2116 		 * to alloc a new chunk.
2117 		 */
2118 		if (!data_sinfo->full) {
2119 			u64 alloc_target;
2120 
2121 			data_sinfo->force_alloc = 1;
2122 			spin_unlock(&data_sinfo->lock);
2123 
2124 			alloc_target = btrfs_get_alloc_profile(root, 1);
2125 			trans = btrfs_start_transaction(root, 1);
2126 			if (!trans)
2127 				return -ENOMEM;
2128 
2129 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2130 					     bytes + 2 * 1024 * 1024,
2131 					     alloc_target, 0);
2132 			btrfs_end_transaction(trans, root);
2133 			if (ret)
2134 				return ret;
2135 			goto again;
2136 		}
2137 		spin_unlock(&data_sinfo->lock);
2138 
2139 		/* commit the current transaction and try again */
2140 		if (!committed) {
2141 			committed = 1;
2142 			trans = btrfs_join_transaction(root, 1);
2143 			if (!trans)
2144 				return -ENOMEM;
2145 			ret = btrfs_commit_transaction(trans, root);
2146 			if (ret)
2147 				return ret;
2148 			goto again;
2149 		}
2150 
2151 		printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
2152 		       ", %llu bytes_used, %llu bytes_reserved, "
2153 		       "%llu bytes_pinned, %llu bytes_readonly, %llu may use, "
2154 		       "%llu total\n", bytes, data_sinfo->bytes_delalloc,
2155 		       data_sinfo->bytes_used, data_sinfo->bytes_reserved,
2156 		       data_sinfo->bytes_pinned, data_sinfo->bytes_readonly,
2157 		       data_sinfo->bytes_may_use, data_sinfo->total_bytes);
2158 		return -ENOSPC;
2159 	}
2160 	data_sinfo->bytes_may_use += bytes;
2161 	BTRFS_I(inode)->reserved_bytes += bytes;
2162 	spin_unlock(&data_sinfo->lock);
2163 
2164 	return btrfs_check_metadata_free_space(root);
2165 }
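
/*
 * Usage sketch (illustrative, not from this file): a write path would pair
 * the reservation with the error cleanup helper below.  do_write() here is
 * a hypothetical stand-in for the actual copy/ordered-extent work:
 *
 *	ret = btrfs_check_data_free_space(root, inode, write_bytes);
 *	if (ret)
 *		return ret;
 *	ret = do_write(inode, buf, write_bytes);
 *	if (ret)
 *		btrfs_free_reserved_data_space(root, inode, write_bytes);
 */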
2166 
2167 /*
2168  * if there was an error for whatever reason after calling
2169  * btrfs_check_data_free_space, call this so we can cleanup the counters.
2170  */
2171 void btrfs_free_reserved_data_space(struct btrfs_root *root,
2172 				    struct inode *inode, u64 bytes)
2173 {
2174 	struct btrfs_space_info *data_sinfo;
2175 
2176 	/* make sure bytes are sectorsize aligned */
2177 	bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2178 
2179 	data_sinfo = BTRFS_I(inode)->space_info;
2180 	spin_lock(&data_sinfo->lock);
2181 	data_sinfo->bytes_may_use -= bytes;
2182 	BTRFS_I(inode)->reserved_bytes -= bytes;
2183 	spin_unlock(&data_sinfo->lock);
2184 }
2185 
2186 /* called when we are adding a delalloc extent to the inode's io_tree */
2187 void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
2188 				  u64 bytes)
2189 {
2190 	struct btrfs_space_info *data_sinfo;
2191 
2192 	/* get the space info for where this inode will be storing its data */
2193 	data_sinfo = BTRFS_I(inode)->space_info;
2194 
2195 	/* make sure we have enough space to handle the data first */
2196 	spin_lock(&data_sinfo->lock);
2197 	data_sinfo->bytes_delalloc += bytes;
2198 
2199 	/*
2200 	 * we are adding a delalloc extent without calling
2201 	 * btrfs_check_data_free_space first.  This happens on a weird
2202 	 * writepage condition, but shouldn't hurt our accounting
2203 	 */
2204 	if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
2205 		data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
2206 		BTRFS_I(inode)->reserved_bytes = 0;
2207 	} else {
2208 		data_sinfo->bytes_may_use -= bytes;
2209 		BTRFS_I(inode)->reserved_bytes -= bytes;
2210 	}
2211 
2212 	spin_unlock(&data_sinfo->lock);
2213 }
2214 
2215 /* called when we are clearing a delalloc extent from the inode's io_tree */
2216 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
2217 			      u64 bytes)
2218 {
2219 	struct btrfs_space_info *info;
2220 
2221 	info = BTRFS_I(inode)->space_info;
2222 
2223 	spin_lock(&info->lock);
2224 	info->bytes_delalloc -= bytes;
2225 	spin_unlock(&info->lock);
2226 }
2227 
2228 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
2229 			  struct btrfs_root *extent_root, u64 alloc_bytes,
2230 			  u64 flags, int force)
2231 {
2232 	struct btrfs_space_info *space_info;
2233 	u64 thresh;
2234 	int ret = 0;
2235 
2236 	mutex_lock(&extent_root->fs_info->chunk_mutex);
2237 
2238 	flags = btrfs_reduce_alloc_profile(extent_root, flags);
2239 
2240 	space_info = __find_space_info(extent_root->fs_info, flags);
2241 	if (!space_info) {
2242 		ret = update_space_info(extent_root->fs_info, flags,
2243 					0, 0, &space_info);
2244 		BUG_ON(ret);
2245 	}
2246 	BUG_ON(!space_info);
2247 
2248 	spin_lock(&space_info->lock);
2249 	if (space_info->force_alloc) {
2250 		force = 1;
2251 		space_info->force_alloc = 0;
2252 	}
2253 	if (space_info->full) {
2254 		spin_unlock(&space_info->lock);
2255 		goto out;
2256 	}
2257 
2258 	thresh = space_info->total_bytes - space_info->bytes_readonly;
2259 	thresh = div_factor(thresh, 6);
2260 	if (!force &&
2261 	   (space_info->bytes_used + space_info->bytes_pinned +
2262 	    space_info->bytes_reserved + alloc_bytes) < thresh) {
2263 		spin_unlock(&space_info->lock);
2264 		goto out;
2265 	}
2266 	spin_unlock(&space_info->lock);
2267 
2268 	ret = btrfs_alloc_chunk(trans, extent_root, flags);
2269 	if (ret)
2270 		space_info->full = 1;
2271 out:
2272 	mutex_unlock(&extent_root->fs_info->chunk_mutex);
2273 	return ret;
2274 }
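
/*
 * Illustrative numbers for the heuristic above (assuming the usual btrfs
 * div_factor() helper, which computes x * factor / 10): with 100GB of
 * writable space in this space_info a non-forced caller only triggers a
 * new chunk allocation once bytes_used + bytes_pinned + bytes_reserved +
 * alloc_bytes reaches 60GB.
 */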
2275 
2276 static int update_block_group(struct btrfs_trans_handle *trans,
2277 			      struct btrfs_root *root,
2278 			      u64 bytenr, u64 num_bytes, int alloc,
2279 			      int mark_free)
2280 {
2281 	struct btrfs_block_group_cache *cache;
2282 	struct btrfs_fs_info *info = root->fs_info;
2283 	u64 total = num_bytes;
2284 	u64 old_val;
2285 	u64 byte_in_group;
2286 
2287 	while (total) {
2288 		cache = btrfs_lookup_block_group(info, bytenr);
2289 		if (!cache)
2290 			return -1;
2291 		byte_in_group = bytenr - cache->key.objectid;
2292 		WARN_ON(byte_in_group > cache->key.offset);
2293 
2294 		spin_lock(&cache->space_info->lock);
2295 		spin_lock(&cache->lock);
2296 		cache->dirty = 1;
2297 		old_val = btrfs_block_group_used(&cache->item);
2298 		num_bytes = min(total, cache->key.offset - byte_in_group);
2299 		if (alloc) {
2300 			old_val += num_bytes;
2301 			cache->space_info->bytes_used += num_bytes;
2302 			if (cache->ro)
2303 				cache->space_info->bytes_readonly -= num_bytes;
2304 			btrfs_set_block_group_used(&cache->item, old_val);
2305 			spin_unlock(&cache->lock);
2306 			spin_unlock(&cache->space_info->lock);
2307 		} else {
2308 			old_val -= num_bytes;
2309 			cache->space_info->bytes_used -= num_bytes;
2310 			if (cache->ro)
2311 				cache->space_info->bytes_readonly += num_bytes;
2312 			btrfs_set_block_group_used(&cache->item, old_val);
2313 			spin_unlock(&cache->lock);
2314 			spin_unlock(&cache->space_info->lock);
2315 			if (mark_free) {
2316 				int ret;
2317 
2318 				ret = btrfs_discard_extent(root, bytenr,
2319 							   num_bytes);
2320 				WARN_ON(ret);
2321 
2322 				ret = btrfs_add_free_space(cache, bytenr,
2323 							   num_bytes);
2324 				WARN_ON(ret);
2325 			}
2326 		}
2327 		put_block_group(cache);
2328 		total -= num_bytes;
2329 		bytenr += num_bytes;
2330 	}
2331 	return 0;
2332 }
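
/*
 * Worked example of the loop above (illustrative): if a 12MB range starts
 * 4MB before the end of one block group, the first iteration accounts 4MB
 * against that group (num_bytes is clamped by the min()), then bytenr
 * advances and the remaining 8MB is accounted against the next group.
 */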
2333 
2334 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
2335 {
2336 	struct btrfs_block_group_cache *cache;
2337 	u64 bytenr;
2338 
2339 	cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
2340 	if (!cache)
2341 		return 0;
2342 
2343 	bytenr = cache->key.objectid;
2344 	put_block_group(cache);
2345 
2346 	return bytenr;
2347 }
2348 
2349 int btrfs_update_pinned_extents(struct btrfs_root *root,
2350 				u64 bytenr, u64 num, int pin)
2351 {
2352 	u64 len;
2353 	struct btrfs_block_group_cache *cache;
2354 	struct btrfs_fs_info *fs_info = root->fs_info;
2355 
2356 	WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex));
2357 	if (pin) {
2358 		set_extent_dirty(&fs_info->pinned_extents,
2359 				bytenr, bytenr + num - 1, GFP_NOFS);
2360 	} else {
2361 		clear_extent_dirty(&fs_info->pinned_extents,
2362 				bytenr, bytenr + num - 1, GFP_NOFS);
2363 	}
2364 	while (num > 0) {
2365 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2366 		BUG_ON(!cache);
2367 		len = min(num, cache->key.offset -
2368 			  (bytenr - cache->key.objectid));
2369 		if (pin) {
2370 			spin_lock(&cache->space_info->lock);
2371 			spin_lock(&cache->lock);
2372 			cache->pinned += len;
2373 			cache->space_info->bytes_pinned += len;
2374 			spin_unlock(&cache->lock);
2375 			spin_unlock(&cache->space_info->lock);
2376 			fs_info->total_pinned += len;
2377 		} else {
2378 			spin_lock(&cache->space_info->lock);
2379 			spin_lock(&cache->lock);
2380 			cache->pinned -= len;
2381 			cache->space_info->bytes_pinned -= len;
2382 			spin_unlock(&cache->lock);
2383 			spin_unlock(&cache->space_info->lock);
2384 			fs_info->total_pinned -= len;
2385 			if (cache->cached)
2386 				btrfs_add_free_space(cache, bytenr, len);
2387 		}
2388 		put_block_group(cache);
2389 		bytenr += len;
2390 		num -= len;
2391 	}
2392 	return 0;
2393 }
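
/*
 * Lifecycle sketch (illustrative): freed blocks are pinned here (pin == 1)
 * via pin_down_bytes(), the pinned ranges are copied into a caller-supplied
 * tree by btrfs_copy_pinned() at commit time, and btrfs_finish_extent_commit()
 * below unpins them (pin == 0), at which point the bytes flow back into the
 * block group's free space cache through btrfs_add_free_space().
 */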
2394 
2395 static int update_reserved_extents(struct btrfs_root *root,
2396 				   u64 bytenr, u64 num, int reserve)
2397 {
2398 	u64 len;
2399 	struct btrfs_block_group_cache *cache;
2400 	struct btrfs_fs_info *fs_info = root->fs_info;
2401 
2402 	while (num > 0) {
2403 		cache = btrfs_lookup_block_group(fs_info, bytenr);
2404 		BUG_ON(!cache);
2405 		len = min(num, cache->key.offset -
2406 			  (bytenr - cache->key.objectid));
2407 
2408 		spin_lock(&cache->space_info->lock);
2409 		spin_lock(&cache->lock);
2410 		if (reserve) {
2411 			cache->reserved += len;
2412 			cache->space_info->bytes_reserved += len;
2413 		} else {
2414 			cache->reserved -= len;
2415 			cache->space_info->bytes_reserved -= len;
2416 		}
2417 		spin_unlock(&cache->lock);
2418 		spin_unlock(&cache->space_info->lock);
2419 		put_block_group(cache);
2420 		bytenr += len;
2421 		num -= len;
2422 	}
2423 	return 0;
2424 }
2425 
2426 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy)
2427 {
2428 	u64 last = 0;
2429 	u64 start;
2430 	u64 end;
2431 	struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents;
2432 	int ret;
2433 
2434 	mutex_lock(&root->fs_info->pinned_mutex);
2435 	while (1) {
2436 		ret = find_first_extent_bit(pinned_extents, last,
2437 					    &start, &end, EXTENT_DIRTY);
2438 		if (ret)
2439 			break;
2440 		set_extent_dirty(copy, start, end, GFP_NOFS);
2441 		last = end + 1;
2442 	}
2443 	mutex_unlock(&root->fs_info->pinned_mutex);
2444 	return 0;
2445 }
2446 
2447 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
2448 			       struct btrfs_root *root,
2449 			       struct extent_io_tree *unpin)
2450 {
2451 	u64 start;
2452 	u64 end;
2453 	int ret;
2454 
2455 	mutex_lock(&root->fs_info->pinned_mutex);
2456 	while (1) {
2457 		ret = find_first_extent_bit(unpin, 0, &start, &end,
2458 					    EXTENT_DIRTY);
2459 		if (ret)
2460 			break;
2461 
2462 		ret = btrfs_discard_extent(root, start, end + 1 - start);
2463 
2464 		btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
2465 		clear_extent_dirty(unpin, start, end, GFP_NOFS);
2466 
2467 		if (need_resched()) {
2468 			mutex_unlock(&root->fs_info->pinned_mutex);
2469 			cond_resched();
2470 			mutex_lock(&root->fs_info->pinned_mutex);
2471 		}
2472 	}
2473 	mutex_unlock(&root->fs_info->pinned_mutex);
2474 	return ret;
2475 }
2476 
2477 static int finish_current_insert(struct btrfs_trans_handle *trans,
2478 				 struct btrfs_root *extent_root, int all)
2479 {
2480 	u64 start;
2481 	u64 end;
2482 	u64 priv;
2483 	u64 search = 0;
2484 	struct btrfs_fs_info *info = extent_root->fs_info;
2485 	struct btrfs_path *path;
2486 	struct pending_extent_op *extent_op, *tmp;
2487 	struct list_head insert_list, update_list;
2488 	int ret;
2489 	int num_inserts = 0, max_inserts, restart = 0;
2490 
2491 	path = btrfs_alloc_path();
2492 	INIT_LIST_HEAD(&insert_list);
2493 	INIT_LIST_HEAD(&update_list);
2494 
2495 	max_inserts = extent_root->leafsize /
2496 		(2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
2497 		 sizeof(struct btrfs_extent_ref) +
2498 		 sizeof(struct btrfs_extent_item));
2499 again:
2500 	mutex_lock(&info->extent_ins_mutex);
2501 	while (1) {
2502 		ret = find_first_extent_bit(&info->extent_ins, search, &start,
2503 					    &end, EXTENT_WRITEBACK);
2504 		if (ret) {
2505 			if (restart && !num_inserts &&
2506 			    list_empty(&update_list)) {
2507 				restart = 0;
2508 				search = 0;
2509 				continue;
2510 			}
2511 			break;
2512 		}
2513 
2514 		ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
2515 		if (!ret) {
2516 			if (all)
2517 				restart = 1;
2518 			search = end + 1;
2519 			if (need_resched()) {
2520 				mutex_unlock(&info->extent_ins_mutex);
2521 				cond_resched();
2522 				mutex_lock(&info->extent_ins_mutex);
2523 			}
2524 			continue;
2525 		}
2526 
2527 		ret = get_state_private(&info->extent_ins, start, &priv);
2528 		BUG_ON(ret);
2529 		extent_op = (struct pending_extent_op *)(unsigned long) priv;
2530 
2531 		if (extent_op->type == PENDING_EXTENT_INSERT) {
2532 			num_inserts++;
2533 			list_add_tail(&extent_op->list, &insert_list);
2534 			search = end + 1;
2535 			if (num_inserts == max_inserts) {
2536 				restart = 1;
2537 				break;
2538 			}
2539 		} else if (extent_op->type == PENDING_BACKREF_UPDATE) {
2540 			list_add_tail(&extent_op->list, &update_list);
2541 			search = end + 1;
2542 		} else {
2543 			BUG();
2544 		}
2545 	}
2546 
2547 	/*
2548 	 * process the update list, clearing the writeback bit for each entry;
2549 	 * if somebody marked this thing for deletion then just unlock it and
2550 	 * be done with it, the free_extents will handle it
2551 	 */
2552 	list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
2553 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2554 				  extent_op->bytenr + extent_op->num_bytes - 1,
2555 				  EXTENT_WRITEBACK, GFP_NOFS);
2556 		if (extent_op->del) {
2557 			list_del_init(&extent_op->list);
2558 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2559 				      extent_op->bytenr + extent_op->num_bytes
2560 				      - 1, GFP_NOFS);
2561 			kfree(extent_op);
2562 		}
2563 	}
2564 	mutex_unlock(&info->extent_ins_mutex);
2565 
2566 	/*
2567 	 * still have things left on the update list, go ahead and update
2568 	 * everything
2569 	 */
2570 	if (!list_empty(&update_list)) {
2571 		ret = update_backrefs(trans, extent_root, path, &update_list);
2572 		BUG_ON(ret);
2573 
2574 		/* we may have COW'ed new blocks, so lets start over */
2575 		if (all)
2576 			restart = 1;
2577 	}
2578 
2579 	/*
2580 	 * if no inserts need to be done, but we skipped some extents and we
2581 	 * need to make sure everything is cleaned up, then reset everything
2582 	 * and go back to the beginning
2583 	 */
2584 	if (!num_inserts && restart) {
2585 		search = 0;
2586 		restart = 0;
2587 		INIT_LIST_HEAD(&update_list);
2588 		INIT_LIST_HEAD(&insert_list);
2589 		goto again;
2590 	} else if (!num_inserts) {
2591 		goto out;
2592 	}
2593 
2594 	/*
2595 	 * process the insert extents list.  Again if we are deleting this
2596 	 * extent, then just unlock it, pin down the bytes if need be, and be
2597 	 * done with it.  Saves us from having to actually insert the extent
2598 	 * into the tree and then subsequently come along and delete it
2599 	 */
2600 	mutex_lock(&info->extent_ins_mutex);
2601 	list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
2602 		clear_extent_bits(&info->extent_ins, extent_op->bytenr,
2603 				  extent_op->bytenr + extent_op->num_bytes - 1,
2604 				  EXTENT_WRITEBACK, GFP_NOFS);
2605 		if (extent_op->del) {
2606 			u64 used;
2607 			list_del_init(&extent_op->list);
2608 			unlock_extent(&info->extent_ins, extent_op->bytenr,
2609 				      extent_op->bytenr + extent_op->num_bytes
2610 				      - 1, GFP_NOFS);
2611 
2612 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2613 			ret = pin_down_bytes(trans, extent_root,
2614 					     extent_op->bytenr,
2615 					     extent_op->num_bytes, 0);
2616 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2617 
2618 			spin_lock(&info->delalloc_lock);
2619 			used = btrfs_super_bytes_used(&info->super_copy);
2620 			btrfs_set_super_bytes_used(&info->super_copy,
2621 					used - extent_op->num_bytes);
2622 			used = btrfs_root_used(&extent_root->root_item);
2623 			btrfs_set_root_used(&extent_root->root_item,
2624 					used - extent_op->num_bytes);
2625 			spin_unlock(&info->delalloc_lock);
2626 
2627 			ret = update_block_group(trans, extent_root,
2628 						 extent_op->bytenr,
2629 						 extent_op->num_bytes,
2630 						 0, ret > 0);
2631 			BUG_ON(ret);
2632 			kfree(extent_op);
2633 			num_inserts--;
2634 		}
2635 	}
2636 	mutex_unlock(&info->extent_ins_mutex);
2637 
2638 	ret = insert_extents(trans, extent_root, path, &insert_list,
2639 			     num_inserts);
2640 	BUG_ON(ret);
2641 
2642 	/*
2643 	 * if restart is set for whatever reason we need to go back and start
2644 	 * searching through the pending list again.
2645 	 *
2646 	 * We just inserted some extents, which could have resulted in new
2647 	 * blocks being allocated, which would result in new blocks needing
2648 	 * updates, so if all is set we _must_ restart to get the updated
2649 	 * blocks.
2650 	 */
2651 	if (restart || all) {
2652 		INIT_LIST_HEAD(&insert_list);
2653 		INIT_LIST_HEAD(&update_list);
2654 		search = 0;
2655 		restart = 0;
2656 		num_inserts = 0;
2657 		goto again;
2658 	}
2659 out:
2660 	btrfs_free_path(path);
2661 	return 0;
2662 }
2663 
2664 static int pin_down_bytes(struct btrfs_trans_handle *trans,
2665 			  struct btrfs_root *root,
2666 			  u64 bytenr, u64 num_bytes, int is_data)
2667 {
2668 	int err = 0;
2669 	struct extent_buffer *buf;
2670 
2671 	if (is_data)
2672 		goto pinit;
2673 
2674 	buf = btrfs_find_tree_block(root, bytenr, num_bytes);
2675 	if (!buf)
2676 		goto pinit;
2677 
2678 	/* we can reuse a block if it hasn't been written
2679 	 * and it is from this transaction.  We can't
2680 	 * reuse anything from the tree log root because
2681 	 * it has tiny sub-transactions.
2682 	 */
2683 	if (btrfs_buffer_uptodate(buf, 0) &&
2684 	    btrfs_try_tree_lock(buf)) {
2685 		u64 header_owner = btrfs_header_owner(buf);
2686 		u64 header_transid = btrfs_header_generation(buf);
2687 		if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
2688 		    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
2689 		    header_transid == trans->transid &&
2690 		    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
2691 			clean_tree_block(NULL, root, buf);
2692 			btrfs_tree_unlock(buf);
2693 			free_extent_buffer(buf);
2694 			return 1;
2695 		}
2696 		btrfs_tree_unlock(buf);
2697 	}
2698 	free_extent_buffer(buf);
2699 pinit:
2700 	btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
2701 
2702 	BUG_ON(err < 0);
2703 	return 0;
2704 }
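
/*
 * Note on the return value (illustrative): returning 1 above means the
 * block was never written in this transaction and was reclaimed directly;
 * callers such as __free_extent() below turn that into mark_free = 1 for
 * update_block_group() instead of leaving the bytes pinned until commit.
 */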
2705 
2706 /*
2707  * remove an extent from the root, returns 0 on success
2708  */
2709 static int __free_extent(struct btrfs_trans_handle *trans,
2710 			 struct btrfs_root *root,
2711 			 u64 bytenr, u64 num_bytes, u64 parent,
2712 			 u64 root_objectid, u64 ref_generation,
2713 			 u64 owner_objectid, int pin, int mark_free)
2714 {
2715 	struct btrfs_path *path;
2716 	struct btrfs_key key;
2717 	struct btrfs_fs_info *info = root->fs_info;
2718 	struct btrfs_root *extent_root = info->extent_root;
2719 	struct extent_buffer *leaf;
2720 	int ret;
2721 	int extent_slot = 0;
2722 	int found_extent = 0;
2723 	int num_to_del = 1;
2724 	struct btrfs_extent_item *ei;
2725 	u32 refs;
2726 
2727 	key.objectid = bytenr;
2728 	btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
2729 	key.offset = num_bytes;
2730 	path = btrfs_alloc_path();
2731 	if (!path)
2732 		return -ENOMEM;
2733 
2734 	path->reada = 1;
2735 	ret = lookup_extent_backref(trans, extent_root, path,
2736 				    bytenr, parent, root_objectid,
2737 				    ref_generation, owner_objectid, 1);
2738 	if (ret == 0) {
2739 		struct btrfs_key found_key;
2740 		extent_slot = path->slots[0];
2741 		while (extent_slot > 0) {
2742 			extent_slot--;
2743 			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2744 					      extent_slot);
2745 			if (found_key.objectid != bytenr)
2746 				break;
2747 			if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
2748 			    found_key.offset == num_bytes) {
2749 				found_extent = 1;
2750 				break;
2751 			}
2752 			if (path->slots[0] - extent_slot > 5)
2753 				break;
2754 		}
2755 		if (!found_extent) {
2756 			ret = remove_extent_backref(trans, extent_root, path);
2757 			BUG_ON(ret);
2758 			btrfs_release_path(extent_root, path);
2759 			ret = btrfs_search_slot(trans, extent_root,
2760 						&key, path, -1, 1);
2761 			if (ret) {
2762 				printk(KERN_ERR "umm, got %d back from search"
2763 				       ", was looking for %llu\n", ret,
2764 				       (unsigned long long)bytenr);
2765 				btrfs_print_leaf(extent_root, path->nodes[0]);
2766 			}
2767 			BUG_ON(ret);
2768 			extent_slot = path->slots[0];
2769 		}
2770 	} else {
2771 		btrfs_print_leaf(extent_root, path->nodes[0]);
2772 		WARN_ON(1);
2773 		printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
2774 		       "root %llu gen %llu owner %llu\n",
2775 		       (unsigned long long)bytenr,
2776 		       (unsigned long long)root_objectid,
2777 		       (unsigned long long)ref_generation,
2778 		       (unsigned long long)owner_objectid);
2779 	}
2780 
2781 	leaf = path->nodes[0];
2782 	ei = btrfs_item_ptr(leaf, extent_slot,
2783 			    struct btrfs_extent_item);
2784 	refs = btrfs_extent_refs(leaf, ei);
2785 	BUG_ON(refs == 0);
2786 	refs -= 1;
2787 	btrfs_set_extent_refs(leaf, ei, refs);
2788 
2789 	btrfs_mark_buffer_dirty(leaf);
2790 
2791 	if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
2792 		struct btrfs_extent_ref *ref;
2793 		ref = btrfs_item_ptr(leaf, path->slots[0],
2794 				     struct btrfs_extent_ref);
2795 		BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
2796 		/* if the back ref and the extent are next to each other
2797 		 * they get deleted below in one shot
2798 		 */
2799 		path->slots[0] = extent_slot;
2800 		num_to_del = 2;
2801 	} else if (found_extent) {
2802 		/* otherwise delete the extent back ref */
2803 		ret = remove_extent_backref(trans, extent_root, path);
2804 		BUG_ON(ret);
2805 		/* if refs are 0, we need to setup the path for deletion */
2806 		if (refs == 0) {
2807 			btrfs_release_path(extent_root, path);
2808 			ret = btrfs_search_slot(trans, extent_root, &key, path,
2809 						-1, 1);
2810 			BUG_ON(ret);
2811 		}
2812 	}
2813 
2814 	if (refs == 0) {
2815 		u64 super_used;
2816 		u64 root_used;
2817 
2818 		if (pin) {
2819 			mutex_lock(&root->fs_info->pinned_mutex);
2820 			ret = pin_down_bytes(trans, root, bytenr, num_bytes,
2821 				owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
2822 			mutex_unlock(&root->fs_info->pinned_mutex);
2823 			if (ret > 0)
2824 				mark_free = 1;
2825 			BUG_ON(ret < 0);
2826 		}
2827 		/* block accounting for super block */
2828 		spin_lock(&info->delalloc_lock);
2829 		super_used = btrfs_super_bytes_used(&info->super_copy);
2830 		btrfs_set_super_bytes_used(&info->super_copy,
2831 					   super_used - num_bytes);
2832 
2833 		/* block accounting for root item */
2834 		root_used = btrfs_root_used(&root->root_item);
2835 		btrfs_set_root_used(&root->root_item,
2836 					   root_used - num_bytes);
2837 		spin_unlock(&info->delalloc_lock);
2838 		ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
2839 				      num_to_del);
2840 		BUG_ON(ret);
2841 		btrfs_release_path(extent_root, path);
2842 
2843 		if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
2844 			ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
2845 			BUG_ON(ret);
2846 		}
2847 
2848 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
2849 					 mark_free);
2850 		BUG_ON(ret);
2851 	}
2852 	btrfs_free_path(path);
2853 	finish_current_insert(trans, extent_root, 0);
2854 	return ret;
2855 }
2856 
2857 /*
2858  * find all the blocks marked as pending in the pending_del io tree and
2859  * remove them from the extent allocation tree
2860  */
2861 static int del_pending_extents(struct btrfs_trans_handle *trans,
2862 			       struct btrfs_root *extent_root, int all)
2863 {
2864 	int ret;
2865 	int err = 0;
2866 	u64 start;
2867 	u64 end;
2868 	u64 priv;
2869 	u64 search = 0;
2870 	int nr = 0, skipped = 0;
2871 	struct extent_io_tree *pending_del;
2872 	struct extent_io_tree *extent_ins;
2873 	struct pending_extent_op *extent_op;
2874 	struct btrfs_fs_info *info = extent_root->fs_info;
2875 	struct list_head delete_list;
2876 
2877 	INIT_LIST_HEAD(&delete_list);
2878 	extent_ins = &extent_root->fs_info->extent_ins;
2879 	pending_del = &extent_root->fs_info->pending_del;
2880 
2881 again:
2882 	mutex_lock(&info->extent_ins_mutex);
2883 	while (1) {
2884 		ret = find_first_extent_bit(pending_del, search, &start, &end,
2885 					    EXTENT_WRITEBACK);
2886 		if (ret) {
2887 			if (all && skipped && !nr) {
2888 				search = 0;
2889 				skipped = 0;
2890 				continue;
2891 			}
2892 			mutex_unlock(&info->extent_ins_mutex);
2893 			break;
2894 		}
2895 
2896 		ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
2897 		if (!ret) {
2898 			search = end + 1;
2899 			skipped = 1;
2900 
2901 			if (need_resched()) {
2902 				mutex_unlock(&info->extent_ins_mutex);
2903 				cond_resched();
2904 				mutex_lock(&info->extent_ins_mutex);
2905 			}
2906 
2907 			continue;
2908 		}
2909 		BUG_ON(ret < 0);
2910 
2911 		ret = get_state_private(pending_del, start, &priv);
2912 		BUG_ON(ret);
2913 		extent_op = (struct pending_extent_op *)(unsigned long)priv;
2914 
2915 		clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
2916 				  GFP_NOFS);
2917 		if (!test_range_bit(extent_ins, start, end,
2918 				    EXTENT_WRITEBACK, 0)) {
2919 			list_add_tail(&extent_op->list, &delete_list);
2920 			nr++;
2921 		} else {
2922 			kfree(extent_op);
2923 
2924 			ret = get_state_private(&info->extent_ins, start,
2925 						&priv);
2926 			BUG_ON(ret);
2927 			extent_op = (struct pending_extent_op *)
2928 						(unsigned long)priv;
2929 
2930 			clear_extent_bits(&info->extent_ins, start, end,
2931 					  EXTENT_WRITEBACK, GFP_NOFS);
2932 
2933 			if (extent_op->type == PENDING_BACKREF_UPDATE) {
2934 				list_add_tail(&extent_op->list, &delete_list);
2935 				search = end + 1;
2936 				nr++;
2937 				continue;
2938 			}
2939 
2940 			mutex_lock(&extent_root->fs_info->pinned_mutex);
2941 			ret = pin_down_bytes(trans, extent_root, start,
2942 					     end + 1 - start, 0);
2943 			mutex_unlock(&extent_root->fs_info->pinned_mutex);
2944 
2945 			ret = update_block_group(trans, extent_root, start,
2946 						end + 1 - start, 0, ret > 0);
2947 
2948 			unlock_extent(extent_ins, start, end, GFP_NOFS);
2949 			BUG_ON(ret);
2950 			kfree(extent_op);
2951 		}
2952 		if (ret)
2953 			err = ret;
2954 
2955 		search = end + 1;
2956 
2957 		if (need_resched()) {
2958 			mutex_unlock(&info->extent_ins_mutex);
2959 			cond_resched();
2960 			mutex_lock(&info->extent_ins_mutex);
2961 		}
2962 	}
2963 
2964 	if (nr) {
2965 		ret = free_extents(trans, extent_root, &delete_list);
2966 		BUG_ON(ret);
2967 	}
2968 
2969 	if (all && skipped) {
2970 		INIT_LIST_HEAD(&delete_list);
2971 		search = 0;
2972 		nr = 0;
2973 		goto again;
2974 	}
2975 
2976 	if (!err)
2977 		finish_current_insert(trans, extent_root, 0);
2978 	return err;
2979 }
2980 
2981 /*
2982  * remove an extent from the root, returns 0 on success
2983  */
2984 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2985 			       struct btrfs_root *root,
2986 			       u64 bytenr, u64 num_bytes, u64 parent,
2987 			       u64 root_objectid, u64 ref_generation,
2988 			       u64 owner_objectid, int pin)
2989 {
2990 	struct btrfs_root *extent_root = root->fs_info->extent_root;
2991 	int pending_ret;
2992 	int ret;
2993 
2994 	WARN_ON(num_bytes < root->sectorsize);
2995 	if (root == extent_root) {
2996 		struct pending_extent_op *extent_op = NULL;
2997 
2998 		mutex_lock(&root->fs_info->extent_ins_mutex);
2999 		if (test_range_bit(&root->fs_info->extent_ins, bytenr,
3000 				bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
3001 			u64 priv;
3002 			ret = get_state_private(&root->fs_info->extent_ins,
3003 						bytenr, &priv);
3004 			BUG_ON(ret);
3005 			extent_op = (struct pending_extent_op *)
3006 						(unsigned long)priv;
3007 
3008 			extent_op->del = 1;
3009 			if (extent_op->type == PENDING_EXTENT_INSERT) {
3010 				mutex_unlock(&root->fs_info->extent_ins_mutex);
3011 				return 0;
3012 			}
3013 		}
3014 
3015 		if (extent_op) {
3016 			ref_generation = extent_op->orig_generation;
3017 			parent = extent_op->orig_parent;
3018 		}
3019 
3020 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3021 		BUG_ON(!extent_op);
3022 
3023 		extent_op->type = PENDING_EXTENT_DELETE;
3024 		extent_op->bytenr = bytenr;
3025 		extent_op->num_bytes = num_bytes;
3026 		extent_op->parent = parent;
3027 		extent_op->orig_parent = parent;
3028 		extent_op->generation = ref_generation;
3029 		extent_op->orig_generation = ref_generation;
3030 		extent_op->level = (int)owner_objectid;
3031 		INIT_LIST_HEAD(&extent_op->list);
3032 		extent_op->del = 0;
3033 
3034 		set_extent_bits(&root->fs_info->pending_del,
3035 				bytenr, bytenr + num_bytes - 1,
3036 				EXTENT_WRITEBACK, GFP_NOFS);
3037 		set_state_private(&root->fs_info->pending_del,
3038 				  bytenr, (unsigned long)extent_op);
3039 		mutex_unlock(&root->fs_info->extent_ins_mutex);
3040 		return 0;
3041 	}
3042 	/* if this is metadata, always pin it */
3043 	if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
3044 		if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3045 			mutex_lock(&root->fs_info->pinned_mutex);
3046 			btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
3047 			mutex_unlock(&root->fs_info->pinned_mutex);
3048 			update_reserved_extents(root, bytenr, num_bytes, 0);
3049 			return 0;
3050 		}
3051 		pin = 1;
3052 	}
3053 
3054 	/* for data, pin when an earlier transaction may have committed this ref */
3055 	if (ref_generation != trans->transid)
3056 		pin = 1;
3057 
3058 	ret = __free_extent(trans, root, bytenr, num_bytes, parent,
3059 			    root_objectid, ref_generation,
3060 			    owner_objectid, pin, pin == 0);
3061 
3062 	finish_current_insert(trans, root->fs_info->extent_root, 0);
3063 	pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
3064 	return ret ? ret : pending_ret;
3065 }
3066 
3067 int btrfs_free_extent(struct btrfs_trans_handle *trans,
3068 		      struct btrfs_root *root,
3069 		      u64 bytenr, u64 num_bytes, u64 parent,
3070 		      u64 root_objectid, u64 ref_generation,
3071 		      u64 owner_objectid, int pin)
3072 {
3073 	int ret;
3074 
3075 	ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
3076 				  root_objectid, ref_generation,
3077 				  owner_objectid, pin);
3078 	return ret;
3079 }
3080 
3081 static u64 stripe_align(struct btrfs_root *root, u64 val)
3082 {
3083 	u64 mask = ((u64)root->stripesize - 1);
3084 	u64 ret = (val + mask) & ~mask;
3085 	return ret;
3086 }
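
/*
 * Worked example (illustrative): with a 64K stripesize, mask is 0xffff, so
 * val = 0x12345 rounds up to 0x20000, while an already aligned value such
 * as 0x30000 is returned unchanged.
 */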
3087 
3088 /*
3089  * walks the btree of allocated extents and finds a hole of a given size.
3090  * The key ins is changed to record the hole:
3091  * ins->objectid == block start
3092  * ins->flags == BTRFS_EXTENT_ITEM_KEY
3093  * ins->offset == number of blocks
3094  * Any available blocks before search_start are skipped.
3095  */
3096 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
3097 				     struct btrfs_root *orig_root,
3098 				     u64 num_bytes, u64 empty_size,
3099 				     u64 search_start, u64 search_end,
3100 				     u64 hint_byte, struct btrfs_key *ins,
3101 				     u64 exclude_start, u64 exclude_nr,
3102 				     int data)
3103 {
3104 	int ret = 0;
3105 	struct btrfs_root *root = orig_root->fs_info->extent_root;
3106 	u64 total_needed = num_bytes;
3107 	u64 *last_ptr = NULL;
3108 	u64 last_wanted = 0;
3109 	struct btrfs_block_group_cache *block_group = NULL;
3110 	int chunk_alloc_done = 0;
3111 	int empty_cluster = 2 * 1024 * 1024;
3112 	int allowed_chunk_alloc = 0;
3113 	struct list_head *head = NULL, *cur = NULL;
3114 	int loop = 0;
3115 	int extra_loop = 0;
3116 	struct btrfs_space_info *space_info;
3117 
3118 	WARN_ON(num_bytes < root->sectorsize);
3119 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
3120 	ins->objectid = 0;
3121 	ins->offset = 0;
3122 
3123 	if (orig_root->ref_cows || empty_size)
3124 		allowed_chunk_alloc = 1;
3125 
3126 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
3127 		last_ptr = &root->fs_info->last_alloc;
3128 		if (!btrfs_test_opt(root, SSD))
3129 			empty_cluster = 64 * 1024;
3130 	}
3131 
3132 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
3133 		last_ptr = &root->fs_info->last_data_alloc;
3134 
3135 	if (last_ptr) {
3136 		if (*last_ptr) {
3137 			hint_byte = *last_ptr;
3138 			last_wanted = *last_ptr;
3139 		} else
3140 			empty_size += empty_cluster;
3141 	} else {
3142 		empty_cluster = 0;
3143 	}
3144 	search_start = max(search_start, first_logical_byte(root, 0));
3145 	search_start = max(search_start, hint_byte);
3146 
3147 	if (last_wanted && search_start != last_wanted) {
3148 		last_wanted = 0;
3149 		empty_size += empty_cluster;
3150 	}
3151 
3152 	total_needed += empty_size;
3153 	block_group = btrfs_lookup_block_group(root->fs_info, search_start);
3154 	if (!block_group)
3155 		block_group = btrfs_lookup_first_block_group(root->fs_info,
3156 							     search_start);
3157 	space_info = __find_space_info(root->fs_info, data);
3158 
3159 	down_read(&space_info->groups_sem);
3160 	while (1) {
3161 		struct btrfs_free_space *free_space;
3162 		/*
3163 		 * the only way this happens is if our hint points to a block
3164 		 * group that's not of the proper type; while looping this
3165 		 * should never happen
3166 		 */
3167 		if (empty_size)
3168 			extra_loop = 1;
3169 
3170 		if (!block_group)
3171 			goto new_group_no_lock;
3172 
3173 		if (unlikely(!block_group->cached)) {
3174 			mutex_lock(&block_group->cache_mutex);
3175 			ret = cache_block_group(root, block_group);
3176 			mutex_unlock(&block_group->cache_mutex);
3177 			if (ret)
3178 				break;
3179 		}
3180 
3181 		mutex_lock(&block_group->alloc_mutex);
3182 		if (unlikely(!block_group_bits(block_group, data)))
3183 			goto new_group;
3184 
3185 		if (unlikely(block_group->ro))
3186 			goto new_group;
3187 
3188 		free_space = btrfs_find_free_space(block_group, search_start,
3189 						   total_needed);
3190 		if (free_space) {
3191 			u64 start = block_group->key.objectid;
3192 			u64 end = block_group->key.objectid +
3193 				block_group->key.offset;
3194 
3195 			search_start = stripe_align(root, free_space->offset);
3196 
3197 			/* move on to the next group */
3198 			if (search_start + num_bytes >= search_end)
3199 				goto new_group;
3200 
3201 			/* move on to the next group */
3202 			if (search_start + num_bytes > end)
3203 				goto new_group;
3204 
3205 			if (last_wanted && search_start != last_wanted) {
3206 				total_needed += empty_cluster;
3207 				empty_size += empty_cluster;
3208 				last_wanted = 0;
3209 				/*
3210 				 * if search_start is still in this block group
3211 				 * then we just re-search this block group
3212 				 */
3213 				if (search_start >= start &&
3214 				    search_start < end) {
3215 					mutex_unlock(&block_group->alloc_mutex);
3216 					continue;
3217 				}
3218 
3219 				/* else we go to the next block group */
3220 				goto new_group;
3221 			}
3222 
3223 			if (exclude_nr > 0 &&
3224 			    (search_start + num_bytes > exclude_start &&
3225 			     search_start < exclude_start + exclude_nr)) {
3226 				search_start = exclude_start + exclude_nr;
3227 				/*
3228 				 * if search_start is still in this block group
3229 				 * then we just re-search this block group
3230 				 */
3231 				if (search_start >= start &&
3232 				    search_start < end) {
3233 					mutex_unlock(&block_group->alloc_mutex);
3234 					last_wanted = 0;
3235 					continue;
3236 				}
3237 
3238 				/* else we go to the next block group */
3239 				goto new_group;
3240 			}
3241 
3242 			ins->objectid = search_start;
3243 			ins->offset = num_bytes;
3244 
3245 			btrfs_remove_free_space_lock(block_group, search_start,
3246 						     num_bytes);
3247 			/* we are all good, let's return */
3248 			mutex_unlock(&block_group->alloc_mutex);
3249 			break;
3250 		}
3251 new_group:
3252 		mutex_unlock(&block_group->alloc_mutex);
3253 		put_block_group(block_group);
3254 		block_group = NULL;
3255 new_group_no_lock:
3256 		/* don't try to compare new allocations against the
3257 		 * last allocation any more
3258 		 */
3259 		last_wanted = 0;
3260 
3261 		/*
3262 		 * Here's how this works.
3263 		 * loop == 0: we were searching a block group via a hint
3264 		 *		and didn't find anything, so we start at
3265 		 *		the head of the block groups and keep searching
3266 		 * loop == 1: we're searching through all of the block groups
3267 		 *		if we hit the head again we have searched
3268 		 *		all of the block groups for this space and we
3269 		 *		need to try to allocate; if we can't, error out.
3270 		 * loop == 2: we allocated more space and are looping through
3271 		 *		all of the block groups again.
3272 		 */
3273 		if (loop == 0) {
3274 			head = &space_info->block_groups;
3275 			cur = head->next;
3276 			loop++;
3277 		} else if (loop == 1 && cur == head) {
3278 			int keep_going;
3279 
3280 			/* at this point we give up on the empty_size
3281 			 * allocations and just try to allocate the min
3282 			 * space.
3283 			 *
3284 			 * The extra_loop field was set if an empty_size
3285 			 * allocation was attempted above, and if it was
3286 			 * set we need to try the loop again without
3287 			 * the additional empty_size.
3288 			 */
3289 			total_needed -= empty_size;
3290 			empty_size = 0;
3291 			keep_going = extra_loop;
3292 			loop++;
3293 
3294 			if (allowed_chunk_alloc && !chunk_alloc_done) {
3295 				up_read(&space_info->groups_sem);
3296 				ret = do_chunk_alloc(trans, root, num_bytes +
3297 						     2 * 1024 * 1024, data, 1);
3298 				down_read(&space_info->groups_sem);
3299 				if (ret < 0)
3300 					goto loop_check;
3301 				head = &space_info->block_groups;
3302 				/*
3303 				 * we've allocated a new chunk, keep
3304 				 * trying
3305 				 */
3306 				keep_going = 1;
3307 				chunk_alloc_done = 1;
3308 			} else if (!allowed_chunk_alloc) {
3309 				space_info->force_alloc = 1;
3310 			}
3311 loop_check:
3312 			if (keep_going) {
3313 				cur = head->next;
3314 				extra_loop = 0;
3315 			} else {
3316 				break;
3317 			}
3318 		} else if (cur == head) {
3319 			break;
3320 		}
3321 
3322 		block_group = list_entry(cur, struct btrfs_block_group_cache,
3323 					 list);
3324 		atomic_inc(&block_group->count);
3325 
3326 		search_start = block_group->key.objectid;
3327 		cur = cur->next;
3328 	}
3329 
3330 	/* we found what we needed */
3331 	if (ins->objectid) {
3332 		if (!(data & BTRFS_BLOCK_GROUP_DATA))
3333 			trans->block_group = block_group->key.objectid;
3334 
3335 		if (last_ptr)
3336 			*last_ptr = ins->objectid + ins->offset;
3337 		ret = 0;
3338 	} else if (!ret) {
3339 		printk(KERN_ERR "btrfs searching for %llu bytes, "
3340 		       "num_bytes %llu, loop %d, allowed_alloc %d\n",
3341 		       (unsigned long long)total_needed,
3342 		       (unsigned long long)num_bytes,
3343 		       loop, allowed_chunk_alloc);
3344 		ret = -ENOSPC;
3345 	}
3346 	if (block_group)
3347 		put_block_group(block_group);
3348 
3349 	up_read(&space_info->groups_sem);
3350 	return ret;
3351 }
3352 
3353 static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
3354 {
3355 	struct btrfs_block_group_cache *cache;
3356 
3357 	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
3358 	       (unsigned long long)(info->total_bytes - info->bytes_used -
3359 				    info->bytes_pinned - info->bytes_reserved),
3360 	       (info->full) ? "" : "not ");
3361 	printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
3362 	       " may_use=%llu, used=%llu\n", info->total_bytes,
3363 	       info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use,
3364 	       info->bytes_used);
3365 
3366 	down_read(&info->groups_sem);
3367 	list_for_each_entry(cache, &info->block_groups, list) {
3368 		spin_lock(&cache->lock);
3369 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
3370 		       "%llu pinned %llu reserved\n",
3371 		       (unsigned long long)cache->key.objectid,
3372 		       (unsigned long long)cache->key.offset,
3373 		       (unsigned long long)btrfs_block_group_used(&cache->item),
3374 		       (unsigned long long)cache->pinned,
3375 		       (unsigned long long)cache->reserved);
3376 		btrfs_dump_free_space(cache, bytes);
3377 		spin_unlock(&cache->lock);
3378 	}
3379 	up_read(&info->groups_sem);
3380 }
3381 
3382 static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3383 				  struct btrfs_root *root,
3384 				  u64 num_bytes, u64 min_alloc_size,
3385 				  u64 empty_size, u64 hint_byte,
3386 				  u64 search_end, struct btrfs_key *ins,
3387 				  u64 data)
3388 {
3389 	int ret;
3390 	u64 search_start = 0;
3391 	struct btrfs_fs_info *info = root->fs_info;
3392 
3393 	data = btrfs_get_alloc_profile(root, data);
3394 again:
3395 	/*
3396 	 * the only place that sets empty_size is btrfs_realloc_node, which
3397 	 * is not called recursively on allocations
3398 	 */
3399 	if (empty_size || root->ref_cows) {
3400 		if (!(data & BTRFS_BLOCK_GROUP_METADATA)) {
3401 			ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3402 				     2 * 1024 * 1024,
3403 				     BTRFS_BLOCK_GROUP_METADATA |
3404 				     (info->metadata_alloc_profile &
3405 				      info->avail_metadata_alloc_bits), 0);
3406 		}
3407 		ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3408 				     num_bytes + 2 * 1024 * 1024, data, 0);
3409 	}
3410 
3411 	WARN_ON(num_bytes < root->sectorsize);
3412 	ret = find_free_extent(trans, root, num_bytes, empty_size,
3413 			       search_start, search_end, hint_byte, ins,
3414 			       trans->alloc_exclude_start,
3415 			       trans->alloc_exclude_nr, data);
3416 
3417 	if (ret == -ENOSPC && num_bytes > min_alloc_size) {
3418 		num_bytes = num_bytes >> 1;
3419 		num_bytes = num_bytes & ~(root->sectorsize - 1);
3420 		num_bytes = max(num_bytes, min_alloc_size);
3421 		do_chunk_alloc(trans, root->fs_info->extent_root,
3422 			       num_bytes, data, 1);
3423 		goto again;
3424 	}
3425 	if (ret) {
3426 		struct btrfs_space_info *sinfo;
3427 
3428 		sinfo = __find_space_info(root->fs_info, data);
3429 		printk(KERN_ERR "btrfs allocation failed flags %llu, "
3430 		       "wanted %llu\n", (unsigned long long)data,
3431 		       (unsigned long long)num_bytes);
3432 		dump_space_info(sinfo, num_bytes);
3433 		BUG();
3434 	}
3435 
3436 	return ret;
3437 }
3438 
3439 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
3440 {
3441 	struct btrfs_block_group_cache *cache;
3442 	int ret = 0;
3443 
3444 	cache = btrfs_lookup_block_group(root->fs_info, start);
3445 	if (!cache) {
3446 		printk(KERN_ERR "Unable to find block group for %llu\n",
3447 		       (unsigned long long)start);
3448 		return -ENOSPC;
3449 	}
3450 
3451 	ret = btrfs_discard_extent(root, start, len);
3452 
3453 	btrfs_add_free_space(cache, start, len);
3454 	put_block_group(cache);
3455 	update_reserved_extents(root, start, len, 0);
3456 
3457 	return ret;
3458 }
3459 
3460 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
3461 				  struct btrfs_root *root,
3462 				  u64 num_bytes, u64 min_alloc_size,
3463 				  u64 empty_size, u64 hint_byte,
3464 				  u64 search_end, struct btrfs_key *ins,
3465 				  u64 data)
3466 {
3467 	int ret;
3468 	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
3469 				     empty_size, hint_byte, search_end, ins,
3470 				     data);
3471 	update_reserved_extents(root, ins->objectid, ins->offset, 1);
3472 	return ret;
3473 }
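
/*
 * Usage sketch (illustrative, loosely based on the data writeback path):
 * a caller reserves the extent first, inserts its file extent item, then
 * records the backref; leaf->start and inode_objectid are hypothetical
 * stand-ins for the real parent and owner values:
 *
 *	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize,
 *				   0, hint_byte, (u64)-1, &ins, 1);
 *	...insert the file extent item for 'ins'...
 *	ret = btrfs_alloc_reserved_extent(trans, root, leaf->start,
 *					  root->root_key.objectid,
 *					  trans->transid, inode_objectid, &ins);
 */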
3474 
3475 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3476 					 struct btrfs_root *root, u64 parent,
3477 					 u64 root_objectid, u64 ref_generation,
3478 					 u64 owner, struct btrfs_key *ins)
3479 {
3480 	int ret;
3481 	int pending_ret;
3482 	u64 super_used;
3483 	u64 root_used;
3484 	u64 num_bytes = ins->offset;
3485 	u32 sizes[2];
3486 	struct btrfs_fs_info *info = root->fs_info;
3487 	struct btrfs_root *extent_root = info->extent_root;
3488 	struct btrfs_extent_item *extent_item;
3489 	struct btrfs_extent_ref *ref;
3490 	struct btrfs_path *path;
3491 	struct btrfs_key keys[2];
3492 
3493 	if (parent == 0)
3494 		parent = ins->objectid;
3495 
3496 	/* block accounting for super block */
3497 	spin_lock(&info->delalloc_lock);
3498 	super_used = btrfs_super_bytes_used(&info->super_copy);
3499 	btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes);
3500 
3501 	/* block accounting for root item */
3502 	root_used = btrfs_root_used(&root->root_item);
3503 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
3504 	spin_unlock(&info->delalloc_lock);
3505 
3506 	if (root == extent_root) {
3507 		struct pending_extent_op *extent_op;
3508 
3509 		extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
3510 		BUG_ON(!extent_op);
3511 
3512 		extent_op->type = PENDING_EXTENT_INSERT;
3513 		extent_op->bytenr = ins->objectid;
3514 		extent_op->num_bytes = ins->offset;
3515 		extent_op->parent = parent;
3516 		extent_op->orig_parent = 0;
3517 		extent_op->generation = ref_generation;
3518 		extent_op->orig_generation = 0;
3519 		extent_op->level = (int)owner;
3520 		INIT_LIST_HEAD(&extent_op->list);
3521 		extent_op->del = 0;
3522 
3523 		mutex_lock(&root->fs_info->extent_ins_mutex);
3524 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
3525 				ins->objectid + ins->offset - 1,
3526 				EXTENT_WRITEBACK, GFP_NOFS);
3527 		set_state_private(&root->fs_info->extent_ins,
3528 				  ins->objectid, (unsigned long)extent_op);
3529 		mutex_unlock(&root->fs_info->extent_ins_mutex);
3530 		goto update_block;
3531 	}
3532 
3533 	memcpy(&keys[0], ins, sizeof(*ins));
3534 	keys[1].objectid = ins->objectid;
3535 	keys[1].type = BTRFS_EXTENT_REF_KEY;
3536 	keys[1].offset = parent;
3537 	sizes[0] = sizeof(*extent_item);
3538 	sizes[1] = sizeof(*ref);
3539 
3540 	path = btrfs_alloc_path();
3541 	BUG_ON(!path);
3542 
3543 	ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
3544 				       sizes, 2);
3545 	BUG_ON(ret);
3546 
3547 	extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3548 				     struct btrfs_extent_item);
3549 	btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
3550 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
3551 			     struct btrfs_extent_ref);
3552 
3553 	btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
3554 	btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
3555 	btrfs_set_ref_objectid(path->nodes[0], ref, owner);
3556 	btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
3557 
3558 	btrfs_mark_buffer_dirty(path->nodes[0]);
3559 
3560 	trans->alloc_exclude_start = 0;
3561 	trans->alloc_exclude_nr = 0;
3562 	btrfs_free_path(path);
3563 	finish_current_insert(trans, extent_root, 0);
3564 	pending_ret = del_pending_extents(trans, extent_root, 0);
3565 
3566 	if (ret)
3567 		goto out;
3568 	if (pending_ret) {
3569 		ret = pending_ret;
3570 		goto out;
3571 	}
3572 
3573 update_block:
3574 	ret = update_block_group(trans, root, ins->objectid,
3575 				 ins->offset, 1, 0);
3576 	if (ret) {
3577 		printk(KERN_ERR "btrfs update block group failed for %llu "
3578 		       "%llu\n", (unsigned long long)ins->objectid,
3579 		       (unsigned long long)ins->offset);
3580 		BUG();
3581 	}
3582 out:
3583 	return ret;
3584 }
3585 
3586 int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
3587 				struct btrfs_root *root, u64 parent,
3588 				u64 root_objectid, u64 ref_generation,
3589 				u64 owner, struct btrfs_key *ins)
3590 {
3591 	int ret;
3592 
3593 	if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
3594 		return 0;
3595 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3596 					    ref_generation, owner, ins);
3597 	update_reserved_extents(root, ins->objectid, ins->offset, 0);
3598 	return ret;
3599 }
3600 
3601 /*
3602  * this is used by the tree logging recovery code.  It records that
3603  * an extent has been allocated and makes sure to clear the free
3604  * space cache bits as well
3605  */
3606 int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
3607 				struct btrfs_root *root, u64 parent,
3608 				u64 root_objectid, u64 ref_generation,
3609 				u64 owner, struct btrfs_key *ins)
3610 {
3611 	int ret;
3612 	struct btrfs_block_group_cache *block_group;
3613 
3614 	block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
3615 	mutex_lock(&block_group->cache_mutex);
3616 	cache_block_group(root, block_group);
3617 	mutex_unlock(&block_group->cache_mutex);
3618 
3619 	ret = btrfs_remove_free_space(block_group, ins->objectid,
3620 				      ins->offset);
3621 	BUG_ON(ret);
3622 	put_block_group(block_group);
3623 	ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
3624 					    ref_generation, owner, ins);
3625 	return ret;
3626 }
3627 
3628 /*
3629  * finds a free extent and does all the dirty work required for allocation
3630  * returns the key for the extent through ins; callers that need a tree
3631  * buffer pass the result on to btrfs_init_new_buffer() below.
3632  *
3633  * returns 0 if everything worked, non-zero otherwise.
3634  */
3635 int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
3636 		       struct btrfs_root *root,
3637 		       u64 num_bytes, u64 parent, u64 min_alloc_size,
3638 		       u64 root_objectid, u64 ref_generation,
3639 		       u64 owner_objectid, u64 empty_size, u64 hint_byte,
3640 		       u64 search_end, struct btrfs_key *ins, u64 data)
3641 {
3642 	int ret;
3643 
3644 	ret = __btrfs_reserve_extent(trans, root, num_bytes,
3645 				     min_alloc_size, empty_size, hint_byte,
3646 				     search_end, ins, data);
3647 	BUG_ON(ret);
3648 	if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
3649 		ret = __btrfs_alloc_reserved_extent(trans, root, parent,
3650 					root_objectid, ref_generation,
3651 					owner_objectid, ins);
3652 		BUG_ON(ret);
3653 
3654 	} else {
3655 		update_reserved_extents(root, ins->objectid, ins->offset, 1);
3656 	}
3657 	return ret;
3658 }
3659 
3660 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
3661 					    struct btrfs_root *root,
3662 					    u64 bytenr, u32 blocksize,
3663 					    int level)
3664 {
3665 	struct extent_buffer *buf;
3666 
3667 	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
3668 	if (!buf)
3669 		return ERR_PTR(-ENOMEM);
3670 	btrfs_set_header_generation(buf, trans->transid);
3671 	btrfs_set_buffer_lockdep_class(buf, level);
3672 	btrfs_tree_lock(buf);
3673 	clean_tree_block(trans, root, buf);
3674 
3675 	btrfs_set_lock_blocking(buf);
3676 	btrfs_set_buffer_uptodate(buf);
3677 
3678 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
3679 		set_extent_dirty(&root->dirty_log_pages, buf->start,
3680 			 buf->start + buf->len - 1, GFP_NOFS);
3681 	} else {
3682 		set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
3683 			 buf->start + buf->len - 1, GFP_NOFS);
3684 	}
3685 	trans->blocks_used++;
3686 	/* this returns a buffer locked for blocking */
3687 	return buf;
3688 }
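
/*
 * sketch of the caller-side contract (illustrative only, assuming the
 * surrounding declarations): the buffer comes back locked, so whoever
 * consumes it eventually unlocks and releases it
 */
#if 0
	buf = btrfs_init_new_buffer(trans, root, bytenr, blocksize, level);
	if (IS_ERR(buf))
		return PTR_ERR(buf);
	/* ... fill in the new block ... */
	btrfs_tree_unlock(buf);
	free_extent_buffer(buf);
#endif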
3689 
3690 /*
3691  * helper function to allocate a block for a given tree
3692  * returns the tree buffer or an ERR_PTR on failure (never NULL).
3693  */
3694 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
3695 					     struct btrfs_root *root,
3696 					     u32 blocksize, u64 parent,
3697 					     u64 root_objectid,
3698 					     u64 ref_generation,
3699 					     int level,
3700 					     u64 hint,
3701 					     u64 empty_size)
3702 {
3703 	struct btrfs_key ins;
3704 	int ret;
3705 	struct extent_buffer *buf;
3706 
3707 	ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize,
3708 				 root_objectid, ref_generation, level,
3709 				 empty_size, hint, (u64)-1, &ins, 0);
3710 	if (ret) {
3711 		BUG_ON(ret > 0);
3712 		return ERR_PTR(ret);
3713 	}
3714 
3715 	buf = btrfs_init_new_buffer(trans, root, ins.objectid,
3716 				    blocksize, level);
3717 	return buf;
3718 }
3719 
3720 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
3721 			struct btrfs_root *root, struct extent_buffer *leaf)
3722 {
3723 	u64 leaf_owner;
3724 	u64 leaf_generation;
3725 	struct refsort *sorted;
3726 	struct btrfs_key key;
3727 	struct btrfs_file_extent_item *fi;
3728 	int i;
3729 	int nritems;
3730 	int ret;
3731 	int refi = 0;
3732 	int slot;
3733 
3734 	BUG_ON(!btrfs_is_leaf(leaf));
3735 	nritems = btrfs_header_nritems(leaf);
3736 	leaf_owner = btrfs_header_owner(leaf);
3737 	leaf_generation = btrfs_header_generation(leaf);
3738 
3739 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
	BUG_ON(!sorted);
3740 	/* we do this loop twice.  The first time we build a list
3741 	 * of the extents we have a reference on, then we sort the list
3742 	 * by bytenr.  The second time around we actually do the
3743 	 * extent freeing.
3744 	 */
3745 	for (i = 0; i < nritems; i++) {
3746 		u64 disk_bytenr;
3747 		cond_resched();
3748 
3749 		btrfs_item_key_to_cpu(leaf, &key, i);
3750 
3751 		/* only extents have references, skip everything else */
3752 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3753 			continue;
3754 
3755 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3756 
3757 		/* inline extents live in the btree, they don't have refs */
3758 		if (btrfs_file_extent_type(leaf, fi) ==
3759 		    BTRFS_FILE_EXTENT_INLINE)
3760 			continue;
3761 
3762 		disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
3763 
3764 		/* holes don't have refs */
3765 		if (disk_bytenr == 0)
3766 			continue;
3767 
3768 		sorted[refi].bytenr = disk_bytenr;
3769 		sorted[refi].slot = i;
3770 		refi++;
3771 	}
3772 
3773 	if (refi == 0)
3774 		goto out;
3775 
3776 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3777 
3778 	for (i = 0; i < refi; i++) {
3779 		u64 disk_bytenr;
3780 
3781 		disk_bytenr = sorted[i].bytenr;
3782 		slot = sorted[i].slot;
3783 
3784 		cond_resched();
3785 
3786 		btrfs_item_key_to_cpu(leaf, &key, slot);
3787 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3788 			continue;
3789 
3790 		fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
3791 
3792 		ret = __btrfs_free_extent(trans, root, disk_bytenr,
3793 				btrfs_file_extent_disk_num_bytes(leaf, fi),
3794 				leaf->start, leaf_owner, leaf_generation,
3795 				key.objectid, 0);
3796 		BUG_ON(ret);
3797 
3798 		atomic_inc(&root->fs_info->throttle_gen);
3799 		wake_up(&root->fs_info->transaction_throttle);
3800 		cond_resched();
3801 	}
3802 out:
3803 	kfree(sorted);
3804 	return 0;
3805 }
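
/*
 * note: struct refsort and refsort_cmp() are defined earlier in this
 * file; the comparator orders entries by ->bytenr, which is what turns
 * the leaf-order scan above into disk-order extent tree updates.  a
 * minimal sketch of the assumed shape:
 *
 *	static int refsort_cmp(const void *a_void, const void *b_void)
 *	{
 *		const struct refsort *a = a_void;
 *		const struct refsort *b = b_void;
 *
 *		if (a->bytenr < b->bytenr)
 *			return -1;
 *		if (a->bytenr > b->bytenr)
 *			return 1;
 *		return 0;
 *	}
 */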
3806 
3807 static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
3808 					struct btrfs_root *root,
3809 					struct btrfs_leaf_ref *ref)
3810 {
3811 	int i;
3812 	int ret;
3813 	struct btrfs_extent_info *info;
3814 	struct refsort *sorted;
3815 
3816 	if (ref->nritems == 0)
3817 		return 0;
3818 
3819 	sorted = kmalloc(sizeof(*sorted) * ref->nritems, GFP_NOFS);
	BUG_ON(!sorted);
3820 	for (i = 0; i < ref->nritems; i++) {
3821 		sorted[i].bytenr = ref->extents[i].bytenr;
3822 		sorted[i].slot = i;
3823 	}
3824 	sort(sorted, ref->nritems, sizeof(struct refsort), refsort_cmp, NULL);
3825 
3826 	/*
3827 	 * the items in the ref were sorted when the ref was inserted
3828 	 * into the ref cache, so this is already in order
3829 	 */
3830 	for (i = 0; i < ref->nritems; i++) {
3831 		info = ref->extents + sorted[i].slot;
3832 		ret = __btrfs_free_extent(trans, root, info->bytenr,
3833 					  info->num_bytes, ref->bytenr,
3834 					  ref->owner, ref->generation,
3835 					  info->objectid, 0);
3836 
3837 		atomic_inc(&root->fs_info->throttle_gen);
3838 		wake_up(&root->fs_info->transaction_throttle);
3839 		cond_resched();
3840 
3841 		BUG_ON(ret);
3843 	}
3844 
3845 	kfree(sorted);
3846 	return 0;
3847 }
3848 
3849 static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
3850 				     u64 len, u32 *refs)
3851 {
3852 	int ret;
3853 
3854 	ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
3855 	BUG_ON(ret);
3856 
3857 #if 0 /* some debugging code in case we see problems here */
3858 	/* if the refs count is one, it won't get increased again.  But
3859 	 * if the ref count is > 1, someone may be decreasing it at
3860 	 * the same time we are.
3861 	 */
3862 	if (*refs != 1) {
3863 		struct extent_buffer *eb = NULL;
3864 		eb = btrfs_find_create_tree_block(root, start, len);
3865 		if (eb)
3866 			btrfs_tree_lock(eb);
3867 
3868 		mutex_lock(&root->fs_info->alloc_mutex);
3869 		ret = lookup_extent_ref(NULL, root, start, len, refs);
3870 		BUG_ON(ret);
3871 		mutex_unlock(&root->fs_info->alloc_mutex);
3872 
3873 		if (eb) {
3874 			btrfs_tree_unlock(eb);
3875 			free_extent_buffer(eb);
3876 		}
3877 		if (*refs == 1) {
3878 			printk(KERN_ERR "btrfs block %llu went down to one "
3879 			       "during drop_snap\n", (unsigned long long)start);
3880 		}
3881 
3882 	}
3883 #endif
3884 
3885 	cond_resched();
3886 	return ret;
3887 }
3888 
3889 /*
3890  * this is used while deleting old snapshots, and it drops the refs
3891  * on a whole subtree starting from a level 1 node.
3892  *
3893  * The idea is to sort all the leaf pointers, and then drop the
3894  * ref on all the leaves in order.  Most of the time the leaves
3895  * will have ref cache entries, so no leaf IOs will be required to
3896  * find the extents they have references on.
3897  *
3898  * For each leaf, any references it has are also dropped in order.
3899  *
3900  * This ends up dropping the references in something close to optimal
3901  * order for reading and modifying the extent allocation tree.
3902  */
3903 static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
3904 					struct btrfs_root *root,
3905 					struct btrfs_path *path)
3906 {
3907 	u64 bytenr;
3908 	u64 root_owner;
3909 	u64 root_gen;
3910 	struct extent_buffer *eb = path->nodes[1];
3911 	struct extent_buffer *leaf;
3912 	struct btrfs_leaf_ref *ref;
3913 	struct refsort *sorted = NULL;
3914 	int nritems = btrfs_header_nritems(eb);
3915 	int ret;
3916 	int i;
3917 	int refi = 0;
3918 	int slot = path->slots[1];
3919 	u32 blocksize = btrfs_level_size(root, 0);
3920 	u32 refs;
3921 
3922 	if (nritems == 0)
3923 		goto out;
3924 
3925 	root_owner = btrfs_header_owner(eb);
3926 	root_gen = btrfs_header_generation(eb);
3927 	sorted = kmalloc(sizeof(*sorted) * nritems, GFP_NOFS);
	BUG_ON(!sorted);
3928 
3929 	/*
3930 	 * step one, sort all the leaf pointers so we don't scribble
3931 	 * randomly into the extent allocation tree
3932 	 */
3933 	for (i = slot; i < nritems; i++) {
3934 		sorted[refi].bytenr = btrfs_node_blockptr(eb, i);
3935 		sorted[refi].slot = i;
3936 		refi++;
3937 	}
3938 
3939 	/*
3940 	 * nritems won't be zero, but if we're picking up drop_snapshot
3941 	 * after a crash, slot might be > 0, so double check things
3942 	 * just in case.
3943 	 */
3944 	if (refi == 0)
3945 		goto out;
3946 
3947 	sort(sorted, refi, sizeof(struct refsort), refsort_cmp, NULL);
3948 
3949 	/*
3950 	 * the first loop frees everything the leaves point to
3951 	 */
3952 	for (i = 0; i < refi; i++) {
3953 		u64 ptr_gen;
3954 
3955 		bytenr = sorted[i].bytenr;
3956 
3957 		/*
3958 		 * check the reference count on this leaf.  If it is > 1
3959 		 * we just decrement it below and don't update any
3960 		 * of the refs the leaf points to.
3961 		 */
3962 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
3963 		BUG_ON(ret);
3964 		if (refs != 1)
3965 			continue;
3966 
3967 		ptr_gen = btrfs_node_ptr_generation(eb, sorted[i].slot);
3968 
3969 		/*
3970 		 * the leaf only had one reference, which means the
3971 		 * only thing pointing to this leaf is the snapshot
3972 		 * we're deleting.  It isn't possible for the reference
3973 		 * count to increase again later
3974 		 *
3975 		 * The reference cache is checked for the leaf,
3976 		 * and if found we'll be able to drop any refs held by
3977 		 * the leaf without needing to read it in.
3978 		 */
3979 		ref = btrfs_lookup_leaf_ref(root, bytenr);
3980 		if (ref && ref->generation != ptr_gen) {
3981 			btrfs_free_leaf_ref(root, ref);
3982 			ref = NULL;
3983 		}
3984 		if (ref) {
3985 			ret = cache_drop_leaf_ref(trans, root, ref);
3986 			BUG_ON(ret);
3987 			btrfs_remove_leaf_ref(root, ref);
3988 			btrfs_free_leaf_ref(root, ref);
3989 		} else {
3990 			/*
3991 			 * the leaf wasn't in the reference cache, so
3992 			 * we have to read it.
3993 			 */
3994 			leaf = read_tree_block(root, bytenr, blocksize,
3995 					       ptr_gen);
3996 			ret = btrfs_drop_leaf_ref(trans, root, leaf);
3997 			BUG_ON(ret);
3998 			free_extent_buffer(leaf);
3999 		}
4000 		atomic_inc(&root->fs_info->throttle_gen);
4001 		wake_up(&root->fs_info->transaction_throttle);
4002 		cond_resched();
4003 	}
4004 
4005 	/*
4006 	 * run through the loop again to free the refs on the leaves.
4007 	 * This is faster than doing it in the loop above because
4008 	 * the leaves are likely to be clustered together.  We end up
4009 	 * working in nice chunks on the extent allocation tree.
4010 	 */
4011 	for (i = 0; i < refi; i++) {
4012 		bytenr = sorted[i].bytenr;
4013 		ret = __btrfs_free_extent(trans, root, bytenr,
4014 					blocksize, eb->start,
4015 					root_owner, root_gen, 0, 1);
4016 		BUG_ON(ret);
4017 
4018 		atomic_inc(&root->fs_info->throttle_gen);
4019 		wake_up(&root->fs_info->transaction_throttle);
4020 		cond_resched();
4021 	}
4022 out:
4023 	kfree(sorted);
4024 
4025 	/*
4026 	 * update the path to show we've processed the entire level 1
4027 	 * node.  This will get saved into the root's drop_snapshot_progress
4028 	 * field so these drops are not repeated again if this transaction
4029 	 * commits.
4030 	 */
4031 	path->slots[1] = nritems;
4032 	return 0;
4033 }
4034 
4035 /*
4036  * helper function for drop_snapshot, this walks down the tree dropping ref
4037  * counts as it goes.
4038  */
4039 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
4040 				   struct btrfs_root *root,
4041 				   struct btrfs_path *path, int *level)
4042 {
4043 	u64 root_owner;
4044 	u64 root_gen;
4045 	u64 bytenr;
4046 	u64 ptr_gen;
4047 	struct extent_buffer *next;
4048 	struct extent_buffer *cur;
4049 	struct extent_buffer *parent;
4050 	u32 blocksize;
4051 	int ret;
4052 	u32 refs;
4053 
4054 	WARN_ON(*level < 0);
4055 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4056 	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
4057 				path->nodes[*level]->len, &refs);
4058 	BUG_ON(ret);
4059 	if (refs > 1)
4060 		goto out;
4061 
4062 	/*
4063 	 * walk down to the last node level and free all the leaves
4064 	 */
4065 	while (*level >= 0) {
4066 		WARN_ON(*level < 0);
4067 		WARN_ON(*level >= BTRFS_MAX_LEVEL);
4068 		cur = path->nodes[*level];
4069 
4070 		WARN_ON(btrfs_header_level(cur) != *level);
4072 
4073 		if (path->slots[*level] >=
4074 		    btrfs_header_nritems(cur))
4075 			break;
4076 
4077 		/* the new code goes down to level 1 and does all the
4078 		 * leaves pointed to that node in bulk.  So, this check
4079 		 * for level 0 will always be false.
4080 		 *
4081 		 * But, the disk format allows the drop_snapshot_progress
4082 		 * field in the root to leave things in a state where
4083 		 * a leaf will need cleaning up here.  If someone crashes
4084 		 * with the old code and then boots with the new code,
4085 		 * we might find a leaf here.
4086 		 */
4087 		if (*level == 0) {
4088 			ret = btrfs_drop_leaf_ref(trans, root, cur);
4089 			BUG_ON(ret);
4090 			break;
4091 		}
4092 
4093 		/*
4094 		 * once we get to level one, process the whole node
4095 		 * at once, including everything below it.
4096 		 */
4097 		if (*level == 1) {
4098 			ret = drop_level_one_refs(trans, root, path);
4099 			BUG_ON(ret);
4100 			break;
4101 		}
4102 
4103 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4104 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4105 		blocksize = btrfs_level_size(root, *level - 1);
4106 
4107 		ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
4108 		BUG_ON(ret);
4109 
4110 		/*
4111 		 * if there is more than one reference, we don't need
4112 		 * to read that node to drop any references it has.  We
4113 		 * just drop the ref we hold on that node and move on to the
4114 		 * next slot in this level.
4115 		 */
4116 		if (refs != 1) {
4117 			parent = path->nodes[*level];
4118 			root_owner = btrfs_header_owner(parent);
4119 			root_gen = btrfs_header_generation(parent);
4120 			path->slots[*level]++;
4121 
4122 			ret = __btrfs_free_extent(trans, root, bytenr,
4123 						blocksize, parent->start,
4124 						root_owner, root_gen,
4125 						*level - 1, 1);
4126 			BUG_ON(ret);
4127 
4128 			atomic_inc(&root->fs_info->throttle_gen);
4129 			wake_up(&root->fs_info->transaction_throttle);
4130 			cond_resched();
4131 
4132 			continue;
4133 		}
4134 
4135 		/*
4136 		 * we need to keep freeing things in the next level down.
4137 		 * read the block and loop around to process it
4138 		 */
4139 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4140 		WARN_ON(*level <= 0);
4141 		if (path->nodes[*level-1])
4142 			free_extent_buffer(path->nodes[*level-1]);
4143 		path->nodes[*level-1] = next;
4144 		*level = btrfs_header_level(next);
4145 		path->slots[*level] = 0;
4146 		cond_resched();
4147 	}
4148 out:
4149 	WARN_ON(*level < 0);
4150 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
4151 
4152 	if (path->nodes[*level] == root->node) {
4153 		parent = path->nodes[*level];
4154 		bytenr = path->nodes[*level]->start;
4155 	} else {
4156 		parent = path->nodes[*level + 1];
4157 		bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]);
4158 	}
4159 
4160 	blocksize = btrfs_level_size(root, *level);
4161 	root_owner = btrfs_header_owner(parent);
4162 	root_gen = btrfs_header_generation(parent);
4163 
4164 	/*
4165 	 * cleanup and free the reference on the last node
4166 	 * we processed
4167 	 */
4168 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
4169 				  parent->start, root_owner, root_gen,
4170 				  *level, 1);
4171 	free_extent_buffer(path->nodes[*level]);
4172 	path->nodes[*level] = NULL;
4173 
4174 	*level += 1;
4175 	BUG_ON(ret);
4176 
4177 	cond_resched();
4178 	return 0;
4179 }
4180 
4181 /*
4182  * helper function for drop_subtree, this function is similar to
4183  * walk_down_tree. The main difference is that it checks reference
4184  * counts while tree blocks are locked.
4185  */
4186 static noinline int walk_down_subtree(struct btrfs_trans_handle *trans,
4187 				      struct btrfs_root *root,
4188 				      struct btrfs_path *path, int *level)
4189 {
4190 	struct extent_buffer *next;
4191 	struct extent_buffer *cur;
4192 	struct extent_buffer *parent;
4193 	u64 bytenr;
4194 	u64 ptr_gen;
4195 	u32 blocksize;
4196 	u32 refs;
4197 	int ret;
4198 
4199 	cur = path->nodes[*level];
4200 	ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len,
4201 				      &refs);
4202 	BUG_ON(ret);
4203 	if (refs > 1)
4204 		goto out;
4205 
4206 	while (*level >= 0) {
4207 		cur = path->nodes[*level];
4208 		if (*level == 0) {
4209 			ret = btrfs_drop_leaf_ref(trans, root, cur);
4210 			BUG_ON(ret);
4211 			clean_tree_block(trans, root, cur);
4212 			break;
4213 		}
4214 		if (path->slots[*level] >= btrfs_header_nritems(cur)) {
4215 			clean_tree_block(trans, root, cur);
4216 			break;
4217 		}
4218 
4219 		bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
4220 		blocksize = btrfs_level_size(root, *level - 1);
4221 		ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
4222 
4223 		next = read_tree_block(root, bytenr, blocksize, ptr_gen);
4224 		btrfs_tree_lock(next);
4225 		btrfs_set_lock_blocking(next);
4226 
4227 		ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize,
4228 					      &refs);
4229 		BUG_ON(ret);
4230 		if (refs > 1) {
4231 			parent = path->nodes[*level];
4232 			ret = btrfs_free_extent(trans, root, bytenr,
4233 					blocksize, parent->start,
4234 					btrfs_header_owner(parent),
4235 					btrfs_header_generation(parent),
4236 					*level - 1, 1);
4237 			BUG_ON(ret);
4238 			path->slots[*level]++;
4239 			btrfs_tree_unlock(next);
4240 			free_extent_buffer(next);
4241 			continue;
4242 		}
4243 
4244 		*level = btrfs_header_level(next);
4245 		path->nodes[*level] = next;
4246 		path->slots[*level] = 0;
4247 		path->locks[*level] = 1;
4248 		cond_resched();
4249 	}
4250 out:
4251 	parent = path->nodes[*level + 1];
4252 	bytenr = path->nodes[*level]->start;
4253 	blocksize = path->nodes[*level]->len;
4254 
4255 	ret = btrfs_free_extent(trans, root, bytenr, blocksize,
4256 			parent->start, btrfs_header_owner(parent),
4257 			btrfs_header_generation(parent), *level, 1);
4258 	BUG_ON(ret);
4259 
4260 	if (path->locks[*level]) {
4261 		btrfs_tree_unlock(path->nodes[*level]);
4262 		path->locks[*level] = 0;
4263 	}
4264 	free_extent_buffer(path->nodes[*level]);
4265 	path->nodes[*level] = NULL;
4266 	*level += 1;
4267 	cond_resched();
4268 	return 0;
4269 }
4270 
4271 /*
4272  * helper for dropping snapshots.  This walks back up the tree in the path
4273  * to find the first node higher up where we haven't yet gone through
4274  * all the slots
4275  */
4276 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
4277 				 struct btrfs_root *root,
4278 				 struct btrfs_path *path,
4279 				 int *level, int max_level)
4280 {
4281 	u64 root_owner;
4282 	u64 root_gen;
4283 	struct btrfs_root_item *root_item = &root->root_item;
4284 	int i;
4285 	int slot;
4286 	int ret;
4287 
4288 	for (i = *level; i < max_level && path->nodes[i]; i++) {
4289 		slot = path->slots[i];
4290 		if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
4291 			struct extent_buffer *node;
4292 			struct btrfs_disk_key disk_key;
4293 
4294 			/*
4295 			 * there is more work to do in this level.
4296 			 * Update the drop_progress marker to reflect
4297 			 * the work we've done so far, and then bump
4298 			 * the slot number
4299 			 */
4300 			node = path->nodes[i];
4301 			path->slots[i]++;
4302 			*level = i;
4303 			WARN_ON(*level == 0);
4304 			btrfs_node_key(node, &disk_key, path->slots[i]);
4305 			memcpy(&root_item->drop_progress,
4306 			       &disk_key, sizeof(disk_key));
4307 			root_item->drop_level = i;
4308 			return 0;
4309 		} else {
4310 			struct extent_buffer *parent;
4311 
4312 			/*
4313 			 * this whole node is done, free our reference
4314 			 * on it and go up one level
4315 			 */
4316 			if (path->nodes[*level] == root->node)
4317 				parent = path->nodes[*level];
4318 			else
4319 				parent = path->nodes[*level + 1];
4320 
4321 			root_owner = btrfs_header_owner(parent);
4322 			root_gen = btrfs_header_generation(parent);
4323 
4324 			clean_tree_block(trans, root, path->nodes[*level]);
4325 			ret = btrfs_free_extent(trans, root,
4326 						path->nodes[*level]->start,
4327 						path->nodes[*level]->len,
4328 						parent->start, root_owner,
4329 						root_gen, *level, 1);
4330 			BUG_ON(ret);
4331 			if (path->locks[*level]) {
4332 				btrfs_tree_unlock(path->nodes[*level]);
4333 				path->locks[*level] = 0;
4334 			}
4335 			free_extent_buffer(path->nodes[*level]);
4336 			path->nodes[*level] = NULL;
4337 			*level = i + 1;
4338 		}
4339 	}
4340 	return 1;
4341 }
4342 
4343 /*
4344  * drop the reference count on the tree rooted at 'snap'.  This traverses
4345  * the tree freeing any blocks that have a ref count of zero after being
4346  * decremented.
4347  */
4348 int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
4349 			*root)
4350 {
4351 	int ret = 0;
4352 	int wret;
4353 	int level;
4354 	struct btrfs_path *path;
4355 	int i;
4356 	int orig_level;
4357 	struct btrfs_root_item *root_item = &root->root_item;
4358 
4359 	WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
4360 	path = btrfs_alloc_path();
4361 	BUG_ON(!path);
4362 
4363 	level = btrfs_header_level(root->node);
4364 	orig_level = level;
4365 	if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
4366 		path->nodes[level] = root->node;
4367 		extent_buffer_get(root->node);
4368 		path->slots[level] = 0;
4369 	} else {
4370 		struct btrfs_key key;
4371 		struct btrfs_disk_key found_key;
4372 		struct extent_buffer *node;
4373 
4374 		btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
4375 		level = root_item->drop_level;
4376 		path->lowest_level = level;
4377 		wret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4378 		if (wret < 0) {
4379 			ret = wret;
4380 			goto out;
4381 		}
4382 		node = path->nodes[level];
4383 		btrfs_node_key(node, &found_key, path->slots[level]);
4384 		WARN_ON(memcmp(&found_key, &root_item->drop_progress,
4385 			       sizeof(found_key)));
4386 		/*
4387 		 * unlock our path, this is safe because only this
4388 		 * function is allowed to delete this snapshot
4389 		 */
4390 		for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
4391 			if (path->nodes[i] && path->locks[i]) {
4392 				path->locks[i] = 0;
4393 				btrfs_tree_unlock(path->nodes[i]);
4394 			}
4395 		}
4396 	}
4397 	while (1) {
4398 		wret = walk_down_tree(trans, root, path, &level);
4399 		if (wret > 0)
4400 			break;
4401 		if (wret < 0)
4402 			ret = wret;
4403 
4404 		wret = walk_up_tree(trans, root, path, &level,
4405 				    BTRFS_MAX_LEVEL);
4406 		if (wret > 0)
4407 			break;
4408 		if (wret < 0)
4409 			ret = wret;
4410 		if (trans->transaction->in_commit) {
4411 			ret = -EAGAIN;
4412 			break;
4413 		}
4414 		atomic_inc(&root->fs_info->throttle_gen);
4415 		wake_up(&root->fs_info->transaction_throttle);
4416 	}
4417 	for (i = 0; i <= orig_level; i++) {
4418 		if (path->nodes[i]) {
4419 			free_extent_buffer(path->nodes[i]);
4420 			path->nodes[i] = NULL;
4421 		}
4422 	}
4423 out:
4424 	btrfs_free_path(path);
4425 	return ret;
4426 }
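
/*
 * note for callers: -EAGAIN means the running transaction was
 * committing; end the transaction and call btrfs_drop_snapshot again.
 * see the retry loop in btrfs_drop_dead_reloc_roots() below.
 */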
4427 
4428 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
4429 			struct btrfs_root *root,
4430 			struct extent_buffer *node,
4431 			struct extent_buffer *parent)
4432 {
4433 	struct btrfs_path *path;
4434 	int level;
4435 	int parent_level;
4436 	int ret = 0;
4437 	int wret;
4438 
4439 	path = btrfs_alloc_path();
4440 	BUG_ON(!path);
4441 
4442 	btrfs_assert_tree_locked(parent);
4443 	parent_level = btrfs_header_level(parent);
4444 	extent_buffer_get(parent);
4445 	path->nodes[parent_level] = parent;
4446 	path->slots[parent_level] = btrfs_header_nritems(parent);
4447 
4448 	btrfs_assert_tree_locked(node);
4449 	level = btrfs_header_level(node);
4450 	extent_buffer_get(node);
4451 	path->nodes[level] = node;
4452 	path->slots[level] = 0;
4453 
4454 	while (1) {
4455 		wret = walk_down_subtree(trans, root, path, &level);
4456 		if (wret < 0)
4457 			ret = wret;
4458 		if (wret != 0)
4459 			break;
4460 
4461 		wret = walk_up_tree(trans, root, path, &level, parent_level);
4462 		if (wret < 0)
4463 			ret = wret;
4464 		if (wret != 0)
4465 			break;
4466 	}
4467 
4468 	btrfs_free_path(path);
4469 	return ret;
4470 }
4471 
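/*
 * clamp a readahead window of nr pages starting at start so it never
 * runs past last.  e.g. start = 64, nr = 32, last = 80: the window
 * would end at index 95, but is clamped to 80
 */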
4472 static unsigned long calc_ra(unsigned long start, unsigned long last,
4473 			     unsigned long nr)
4474 {
4475 	return min(last, start + nr - 1);
4476 }
4477 
4478 static noinline int relocate_inode_pages(struct inode *inode, u64 start,
4479 					 u64 len)
4480 {
4481 	u64 page_start;
4482 	u64 page_end;
4483 	unsigned long first_index;
4484 	unsigned long last_index;
4485 	unsigned long i;
4486 	struct page *page;
4487 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4488 	struct file_ra_state *ra;
4489 	struct btrfs_ordered_extent *ordered;
4490 	unsigned int total_read = 0;
4491 	unsigned int total_dirty = 0;
4492 	int ret = 0;
4493 
4494 	ra = kzalloc(sizeof(*ra), GFP_NOFS);
	if (!ra)
		return -ENOMEM;

4496 	mutex_lock(&inode->i_mutex);
4497 	first_index = start >> PAGE_CACHE_SHIFT;
4498 	last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
4499 
4500 	/* make sure the dirty trick played by the caller works */
4501 	ret = invalidate_inode_pages2_range(inode->i_mapping,
4502 					    first_index, last_index);
4503 	if (ret)
4504 		goto out_unlock;
4505 
4506 	file_ra_state_init(ra, inode->i_mapping);
4507 
4508 	for (i = first_index; i <= last_index; i++) {
4509 		if (total_read % ra->ra_pages == 0) {
4510 			btrfs_force_ra(inode->i_mapping, ra, NULL, i,
4511 				       calc_ra(i, last_index, ra->ra_pages));
4512 		}
4513 		total_read++;
4514 again:
4515 		BUG_ON(((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode));
4517 		page = grab_cache_page(inode->i_mapping, i);
4518 		if (!page) {
4519 			ret = -ENOMEM;
4520 			goto out_unlock;
4521 		}
4522 		if (!PageUptodate(page)) {
4523 			btrfs_readpage(NULL, page);
4524 			lock_page(page);
4525 			if (!PageUptodate(page)) {
4526 				unlock_page(page);
4527 				page_cache_release(page);
4528 				ret = -EIO;
4529 				goto out_unlock;
4530 			}
4531 		}
4532 		wait_on_page_writeback(page);
4533 
4534 		page_start = (u64)page->index << PAGE_CACHE_SHIFT;
4535 		page_end = page_start + PAGE_CACHE_SIZE - 1;
4536 		lock_extent(io_tree, page_start, page_end, GFP_NOFS);
4537 
4538 		ordered = btrfs_lookup_ordered_extent(inode, page_start);
4539 		if (ordered) {
4540 			unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4541 			unlock_page(page);
4542 			page_cache_release(page);
4543 			btrfs_start_ordered_extent(inode, ordered, 1);
4544 			btrfs_put_ordered_extent(ordered);
4545 			goto again;
4546 		}
4547 		set_page_extent_mapped(page);
4548 
4549 		if (i == first_index)
4550 			set_extent_bits(io_tree, page_start, page_end,
4551 					EXTENT_BOUNDARY, GFP_NOFS);
4552 		btrfs_set_extent_delalloc(inode, page_start, page_end);
4553 
4554 		set_page_dirty(page);
4555 		total_dirty++;
4556 
4557 		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
4558 		unlock_page(page);
4559 		page_cache_release(page);
4560 	}
4561 
4562 out_unlock:
4563 	kfree(ra);
4564 	mutex_unlock(&inode->i_mutex);
4565 	balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
4566 	return ret;
4567 }
4568 
4569 static noinline int relocate_data_extent(struct inode *reloc_inode,
4570 					 struct btrfs_key *extent_key,
4571 					 u64 offset)
4572 {
4573 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4574 	struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
4575 	struct extent_map *em;
4576 	u64 start = extent_key->objectid - offset;
4577 	u64 end = start + extent_key->offset - 1;
4578 
4579 	em = alloc_extent_map(GFP_NOFS);
4580 	BUG_ON(!em || IS_ERR(em));
4581 
4582 	em->start = start;
4583 	em->len = extent_key->offset;
4584 	em->block_len = extent_key->offset;
4585 	em->block_start = extent_key->objectid;
4586 	em->bdev = root->fs_info->fs_devices->latest_bdev;
4587 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
4588 
4589 	/*
	 * set up a pinned extent mapping so btrfs_readpage can pull the
	 * existing extent data into the relocation inode's page cache
	 */
4590 	lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4591 	while (1) {
4592 		int ret;
4593 		spin_lock(&em_tree->lock);
4594 		ret = add_extent_mapping(em_tree, em);
4595 		spin_unlock(&em_tree->lock);
4596 		if (ret != -EEXIST) {
4597 			free_extent_map(em);
4598 			break;
4599 		}
4600 		btrfs_drop_extent_cache(reloc_inode, start, end, 0);
4601 	}
4602 	unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
4603 
4604 	return relocate_inode_pages(reloc_inode, start, extent_key->offset);
4605 }
4606 
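/*
 * a ref_path records one chain of back references from a data extent
 * (extent_start) up to a tree root.  nodes[] holds the bytenr of the
 * referencing tree block at each level; lowest_level is where the walk
 * starts and current_level is how far up it has climbed.  node_keys[]
 * and new_nodes[] are used later by the relocation code.
 */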
4607 struct btrfs_ref_path {
4608 	u64 extent_start;
4609 	u64 nodes[BTRFS_MAX_LEVEL];
4610 	u64 root_objectid;
4611 	u64 root_generation;
4612 	u64 owner_objectid;
4613 	u32 num_refs;
4614 	int lowest_level;
4615 	int current_level;
4616 	int shared_level;
4617 
4618 	struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
4619 	u64 new_nodes[BTRFS_MAX_LEVEL];
4620 };
4621 
4622 struct disk_extent {
4623 	u64 ram_bytes;
4624 	u64 disk_bytenr;
4625 	u64 disk_num_bytes;
4626 	u64 offset;
4627 	u64 num_bytes;
4628 	u8 compression;
4629 	u8 encryption;
4630 	u16 other_encoding;
4631 };
4632 
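/*
 * blocks in these trees are only ever cowed, never reference counted,
 * so a back reference walk can stop as soon as it reaches one of them
 */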
4633 static int is_cowonly_root(u64 root_objectid)
4634 {
4635 	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
4636 	    root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
4637 	    root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
4638 	    root_objectid == BTRFS_DEV_TREE_OBJECTID ||
4639 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
4640 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID)
4641 		return 1;
4642 	return 0;
4643 }
4644 
4645 static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
4646 				    struct btrfs_root *extent_root,
4647 				    struct btrfs_ref_path *ref_path,
4648 				    int first_time)
4649 {
4650 	struct extent_buffer *leaf;
4651 	struct btrfs_path *path;
4652 	struct btrfs_extent_ref *ref;
4653 	struct btrfs_key key;
4654 	struct btrfs_key found_key;
4655 	u64 bytenr;
4656 	u32 nritems;
4657 	int level;
4658 	int ret = 1;
4659 
4660 	path = btrfs_alloc_path();
4661 	if (!path)
4662 		return -ENOMEM;
4663 
4664 	if (first_time) {
4665 		ref_path->lowest_level = -1;
4666 		ref_path->current_level = -1;
4667 		ref_path->shared_level = -1;
4668 		goto walk_up;
4669 	}
4670 walk_down:
4671 	level = ref_path->current_level - 1;
4672 	while (level >= -1) {
4673 		u64 parent;
4674 		if (level < ref_path->lowest_level)
4675 			break;
4676 
4677 		if (level >= 0)
4678 			bytenr = ref_path->nodes[level];
4679 		else
4680 			bytenr = ref_path->extent_start;
4681 		BUG_ON(bytenr == 0);
4682 
4683 		parent = ref_path->nodes[level + 1];
4684 		ref_path->nodes[level + 1] = 0;
4685 		ref_path->current_level = level;
4686 		BUG_ON(parent == 0);
4687 
4688 		key.objectid = bytenr;
4689 		key.offset = parent + 1;
4690 		key.type = BTRFS_EXTENT_REF_KEY;
4691 
4692 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4693 		if (ret < 0)
4694 			goto out;
4695 		BUG_ON(ret == 0);
4696 
4697 		leaf = path->nodes[0];
4698 		nritems = btrfs_header_nritems(leaf);
4699 		if (path->slots[0] >= nritems) {
4700 			ret = btrfs_next_leaf(extent_root, path);
4701 			if (ret < 0)
4702 				goto out;
4703 			if (ret > 0)
4704 				goto next;
4705 			leaf = path->nodes[0];
4706 		}
4707 
4708 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4709 		if (found_key.objectid == bytenr &&
4710 		    found_key.type == BTRFS_EXTENT_REF_KEY) {
4711 			if (level < ref_path->shared_level)
4712 				ref_path->shared_level = level;
4713 			goto found;
4714 		}
4715 next:
4716 		level--;
4717 		btrfs_release_path(extent_root, path);
4718 		cond_resched();
4719 	}
4720 	/* reached lowest level */
4721 	ret = 1;
4722 	goto out;
4723 walk_up:
4724 	level = ref_path->current_level;
4725 	while (level < BTRFS_MAX_LEVEL - 1) {
4726 		u64 ref_objectid;
4727 
4728 		if (level >= 0)
4729 			bytenr = ref_path->nodes[level];
4730 		else
4731 			bytenr = ref_path->extent_start;
4732 
4733 		BUG_ON(bytenr == 0);
4734 
4735 		key.objectid = bytenr;
4736 		key.offset = 0;
4737 		key.type = BTRFS_EXTENT_REF_KEY;
4738 
4739 		ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
4740 		if (ret < 0)
4741 			goto out;
4742 
4743 		leaf = path->nodes[0];
4744 		nritems = btrfs_header_nritems(leaf);
4745 		if (path->slots[0] >= nritems) {
4746 			ret = btrfs_next_leaf(extent_root, path);
4747 			if (ret < 0)
4748 				goto out;
4749 			if (ret > 0) {
4750 				/* the extent was freed by someone */
4751 				if (ref_path->lowest_level == level)
4752 					goto out;
4753 				btrfs_release_path(extent_root, path);
4754 				goto walk_down;
4755 			}
4756 			leaf = path->nodes[0];
4757 		}
4758 
4759 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4760 		if (found_key.objectid != bytenr ||
4761 				found_key.type != BTRFS_EXTENT_REF_KEY) {
4762 			/* the extent was freed by someone */
4763 			if (ref_path->lowest_level == level) {
4764 				ret = 1;
4765 				goto out;
4766 			}
4767 			btrfs_release_path(extent_root, path);
4768 			goto walk_down;
4769 		}
4770 found:
4771 		ref = btrfs_item_ptr(leaf, path->slots[0],
4772 				struct btrfs_extent_ref);
4773 		ref_objectid = btrfs_ref_objectid(leaf, ref);
4774 		if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
4775 			if (first_time) {
4776 				level = (int)ref_objectid;
4777 				BUG_ON(level >= BTRFS_MAX_LEVEL);
4778 				ref_path->lowest_level = level;
4779 				ref_path->current_level = level;
4780 				ref_path->nodes[level] = bytenr;
4781 			} else {
4782 				WARN_ON(ref_objectid != level);
4783 			}
4784 		} else {
4785 			WARN_ON(level != -1);
4786 		}
4787 		first_time = 0;
4788 
4789 		if (ref_path->lowest_level == level) {
4790 			ref_path->owner_objectid = ref_objectid;
4791 			ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
4792 		}
4793 
4794 		/*
4795 		 * the block is tree root or the block isn't in reference
4796 		 * counted tree.
4797 		 */
4798 		if (found_key.objectid == found_key.offset ||
4799 		    is_cowonly_root(btrfs_ref_root(leaf, ref))) {
4800 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4801 			ref_path->root_generation =
4802 				btrfs_ref_generation(leaf, ref);
4803 			if (level < 0) {
4804 				/* special reference from the tree log */
4805 				ref_path->nodes[0] = found_key.offset;
4806 				ref_path->current_level = 0;
4807 			}
4808 			ret = 0;
4809 			goto out;
4810 		}
4811 
4812 		level++;
4813 		BUG_ON(ref_path->nodes[level] != 0);
4814 		ref_path->nodes[level] = found_key.offset;
4815 		ref_path->current_level = level;
4816 
4817 		/*
4818 		 * the reference was created in the running transaction,
4819 		 * no need to continue walking up.
4820 		 */
4821 		if (btrfs_ref_generation(leaf, ref) == trans->transid) {
4822 			ref_path->root_objectid = btrfs_ref_root(leaf, ref);
4823 			ref_path->root_generation =
4824 				btrfs_ref_generation(leaf, ref);
4825 			ret = 0;
4826 			goto out;
4827 		}
4828 
4829 		btrfs_release_path(extent_root, path);
4830 		cond_resched();
4831 	}
4832 	/* reached max tree level, but no tree root found. */
4833 	BUG();
4834 out:
4835 	btrfs_free_path(path);
4836 	return ret;
4837 }
4838 
4839 static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
4840 				struct btrfs_root *extent_root,
4841 				struct btrfs_ref_path *ref_path,
4842 				u64 extent_start)
4843 {
4844 	memset(ref_path, 0, sizeof(*ref_path));
4845 	ref_path->extent_start = extent_start;
4846 
4847 	return __next_ref_path(trans, extent_root, ref_path, 1);
4848 }
4849 
4850 static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
4851 			       struct btrfs_root *extent_root,
4852 			       struct btrfs_ref_path *ref_path)
4853 {
4854 	return __next_ref_path(trans, extent_root, ref_path, 0);
4855 }
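
/*
 * illustrative sketch only (not part of this file): the two helpers
 * above form an iterator.  extent_key below stands for the extent
 * being walked; a caller visits every path from that extent back up
 * to a tree root roughly like this:
 */
#if 0
	struct btrfs_ref_path *ref_path;
	int ret;

	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
	BUG_ON(!ref_path);

	ret = btrfs_first_ref_path(trans, extent_root, ref_path,
				   extent_key->objectid);
	while (ret == 0) {
		/*
		 * ref_path->root_objectid and ->root_generation now
		 * identify one tree that references the extent
		 */
		ret = btrfs_next_ref_path(trans, extent_root, ref_path);
	}
	/* ret > 0: all paths visited, ret < 0: error */
	kfree(ref_path);
#endif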
4856 
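/*
 * on success *extents points to an array of *nr_extents disk_extents
 * covering the relocated file range.  a return of 1 means no_fragment
 * was set but the data now lives in more than one extent; -ENOENT
 * means part of the range is no longer covered by regular extents
 */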
4857 static noinline int get_new_locations(struct inode *reloc_inode,
4858 				      struct btrfs_key *extent_key,
4859 				      u64 offset, int no_fragment,
4860 				      struct disk_extent **extents,
4861 				      int *nr_extents)
4862 {
4863 	struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
4864 	struct btrfs_path *path;
4865 	struct btrfs_file_extent_item *fi;
4866 	struct extent_buffer *leaf;
4867 	struct disk_extent *exts = *extents;
4868 	struct btrfs_key found_key;
4869 	u64 cur_pos;
4870 	u64 last_byte;
4871 	u32 nritems;
4872 	int nr = 0;
4873 	int max = *nr_extents;
4874 	int ret;
4875 
4876 	WARN_ON(!no_fragment && *extents);
4877 	if (!exts) {
4878 		max = 1;
4879 		exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
4880 		if (!exts)
4881 			return -ENOMEM;
4882 	}
4883 
4884 	path = btrfs_alloc_path();
4885 	BUG_ON(!path);
4886 
4887 	cur_pos = extent_key->objectid - offset;
4888 	last_byte = extent_key->objectid + extent_key->offset;
4889 	ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
4890 				       cur_pos, 0);
4891 	if (ret < 0)
4892 		goto out;
4893 	if (ret > 0) {
4894 		ret = -ENOENT;
4895 		goto out;
4896 	}
4897 
4898 	while (1) {
4899 		leaf = path->nodes[0];
4900 		nritems = btrfs_header_nritems(leaf);
4901 		if (path->slots[0] >= nritems) {
4902 			ret = btrfs_next_leaf(root, path);
4903 			if (ret < 0)
4904 				goto out;
4905 			if (ret > 0)
4906 				break;
4907 			leaf = path->nodes[0];
4908 		}
4909 
4910 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4911 		if (found_key.offset != cur_pos ||
4912 		    found_key.type != BTRFS_EXTENT_DATA_KEY ||
4913 		    found_key.objectid != reloc_inode->i_ino)
4914 			break;
4915 
4916 		fi = btrfs_item_ptr(leaf, path->slots[0],
4917 				    struct btrfs_file_extent_item);
4918 		if (btrfs_file_extent_type(leaf, fi) !=
4919 		    BTRFS_FILE_EXTENT_REG ||
4920 		    btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
4921 			break;
4922 
4923 		if (nr == max) {
4924 			struct disk_extent *old = exts;
4925 			max *= 2;
4926 			exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
			if (!exts) {
				/* leave the old array in place so the
				 * error path below frees the right one */
				exts = old;
				ret = -ENOMEM;
				goto out;
			}
4927 			memcpy(exts, old, sizeof(*exts) * nr);
4928 			if (old != *extents)
4929 				kfree(old);
4930 		}
4931 
4932 		exts[nr].disk_bytenr =
4933 			btrfs_file_extent_disk_bytenr(leaf, fi);
4934 		exts[nr].disk_num_bytes =
4935 			btrfs_file_extent_disk_num_bytes(leaf, fi);
4936 		exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
4937 		exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
4938 		exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
4939 		exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
4940 		exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
4941 		exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
4942 									   fi);
4943 		BUG_ON(exts[nr].offset > 0);
4944 		BUG_ON(exts[nr].compression || exts[nr].encryption);
4945 		BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
4946 
4947 		cur_pos += exts[nr].num_bytes;
4948 		nr++;
4949 
4950 		if (cur_pos + offset >= last_byte)
4951 			break;
4952 
4953 		if (no_fragment) {
4954 			ret = 1;
4955 			goto out;
4956 		}
4957 		path->slots[0]++;
4958 	}
4959 
4960 	BUG_ON(cur_pos + offset > last_byte);
4961 	if (cur_pos + offset < last_byte) {
4962 		ret = -ENOENT;
4963 		goto out;
4964 	}
4965 	ret = 0;
4966 out:
4967 	btrfs_free_path(path);
4968 	if (ret) {
4969 		if (exts != *extents)
4970 			kfree(exts);
4971 	} else {
4972 		*extents = exts;
4973 		*nr_extents = nr;
4974 	}
4975 	return ret;
4976 }
4977 
4978 static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
4979 					struct btrfs_root *root,
4980 					struct btrfs_path *path,
4981 					struct btrfs_key *extent_key,
4982 					struct btrfs_key *leaf_key,
4983 					struct btrfs_ref_path *ref_path,
4984 					struct disk_extent *new_extents,
4985 					int nr_extents)
4986 {
4987 	struct extent_buffer *leaf;
4988 	struct btrfs_file_extent_item *fi;
4989 	struct inode *inode = NULL;
4990 	struct btrfs_key key;
4991 	u64 lock_start = 0;
4992 	u64 lock_end = 0;
4993 	u64 num_bytes;
4994 	u64 ext_offset;
4995 	u64 search_end = (u64)-1;
4996 	u32 nritems;
4997 	int nr_scanned = 0;
4998 	int extent_locked = 0;
4999 	int extent_type;
5000 	int ret;
5001 
5002 	memcpy(&key, leaf_key, sizeof(key));
5003 	if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5004 		if (key.objectid < ref_path->owner_objectid ||
5005 		    (key.objectid == ref_path->owner_objectid &&
5006 		     key.type < BTRFS_EXTENT_DATA_KEY)) {
5007 			key.objectid = ref_path->owner_objectid;
5008 			key.type = BTRFS_EXTENT_DATA_KEY;
5009 			key.offset = 0;
5010 		}
5011 	}
5012 
5013 	while (1) {
5014 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
5015 		if (ret < 0)
5016 			goto out;
5017 
5018 		leaf = path->nodes[0];
5019 		nritems = btrfs_header_nritems(leaf);
5020 next:
5021 		if (extent_locked && ret > 0) {
5022 			/*
5023 			 * the file extent item was modified by someone
5024 			 * before the extent got locked.
5025 			 */
5026 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5027 				      lock_end, GFP_NOFS);
5028 			extent_locked = 0;
5029 		}
5030 
5031 		if (path->slots[0] >= nritems) {
5032 			if (++nr_scanned > 2)
5033 				break;
5034 
5035 			BUG_ON(extent_locked);
5036 			ret = btrfs_next_leaf(root, path);
5037 			if (ret < 0)
5038 				goto out;
5039 			if (ret > 0)
5040 				break;
5041 			leaf = path->nodes[0];
5042 			nritems = btrfs_header_nritems(leaf);
5043 		}
5044 
5045 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
5046 
5047 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
5048 			if ((key.objectid > ref_path->owner_objectid) ||
5049 			    (key.objectid == ref_path->owner_objectid &&
5050 			     key.type > BTRFS_EXTENT_DATA_KEY) ||
5051 			    key.offset >= search_end)
5052 				break;
5053 		}
5054 
5055 		if (inode && key.objectid != inode->i_ino) {
5056 			BUG_ON(extent_locked);
5057 			btrfs_release_path(root, path);
5058 			mutex_unlock(&inode->i_mutex);
5059 			iput(inode);
5060 			inode = NULL;
5061 			continue;
5062 		}
5063 
5064 		if (key.type != BTRFS_EXTENT_DATA_KEY) {
5065 			path->slots[0]++;
5066 			ret = 1;
5067 			goto next;
5068 		}
5069 		fi = btrfs_item_ptr(leaf, path->slots[0],
5070 				    struct btrfs_file_extent_item);
5071 		extent_type = btrfs_file_extent_type(leaf, fi);
5072 		if ((extent_type != BTRFS_FILE_EXTENT_REG &&
5073 		     extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
5074 		    (btrfs_file_extent_disk_bytenr(leaf, fi) !=
5075 		     extent_key->objectid)) {
5076 			path->slots[0]++;
5077 			ret = 1;
5078 			goto next;
5079 		}
5080 
5081 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5082 		ext_offset = btrfs_file_extent_offset(leaf, fi);
5083 
5084 		if (search_end == (u64)-1) {
5085 			search_end = key.offset - ext_offset +
5086 				btrfs_file_extent_ram_bytes(leaf, fi);
5087 		}
5088 
5089 		if (!extent_locked) {
5090 			lock_start = key.offset;
5091 			lock_end = lock_start + num_bytes - 1;
5092 		} else {
5093 			if (lock_start > key.offset ||
5094 			    lock_end + 1 < key.offset + num_bytes) {
5095 				unlock_extent(&BTRFS_I(inode)->io_tree,
5096 					      lock_start, lock_end, GFP_NOFS);
5097 				extent_locked = 0;
5098 			}
5099 		}
5100 
5101 		if (!inode) {
5102 			btrfs_release_path(root, path);
5103 
5104 			inode = btrfs_iget_locked(root->fs_info->sb,
5105 						  key.objectid, root);
			if (!inode) {
				key.offset = (u64)-1;
				goto skip;
			}
5106 			if (inode->i_state & I_NEW) {
5107 				BTRFS_I(inode)->root = root;
5108 				BTRFS_I(inode)->location.objectid =
5109 					key.objectid;
5110 				BTRFS_I(inode)->location.type =
5111 					BTRFS_INODE_ITEM_KEY;
5112 				BTRFS_I(inode)->location.offset = 0;
5113 				btrfs_read_locked_inode(inode);
5114 				unlock_new_inode(inode);
5115 			}
5116 			/*
5117 			 * some code calls btrfs_commit_transaction while
5118 			 * holding the i_mutex, so we can't use mutex_lock
5119 			 * here.
5120 			 */
5121 			if (is_bad_inode(inode) ||
5122 			    !mutex_trylock(&inode->i_mutex)) {
5123 				iput(inode);
5124 				inode = NULL;
5125 				key.offset = (u64)-1;
5126 				goto skip;
5127 			}
5128 		}
5129 
5130 		if (!extent_locked) {
5131 			struct btrfs_ordered_extent *ordered;
5132 
5133 			btrfs_release_path(root, path);
5134 
5135 			lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5136 				    lock_end, GFP_NOFS);
5137 			ordered = btrfs_lookup_first_ordered_extent(inode,
5138 								    lock_end);
5139 			if (ordered &&
5140 			    ordered->file_offset <= lock_end &&
5141 			    ordered->file_offset + ordered->len > lock_start) {
5142 				unlock_extent(&BTRFS_I(inode)->io_tree,
5143 					      lock_start, lock_end, GFP_NOFS);
5144 				btrfs_start_ordered_extent(inode, ordered, 1);
5145 				btrfs_put_ordered_extent(ordered);
5146 				key.offset += num_bytes;
5147 				goto skip;
5148 			}
5149 			if (ordered)
5150 				btrfs_put_ordered_extent(ordered);
5151 
5152 			extent_locked = 1;
5153 			continue;
5154 		}
5155 
5156 		if (nr_extents == 1) {
5157 			/* update extent pointer in place */
5158 			btrfs_set_file_extent_disk_bytenr(leaf, fi,
5159 						new_extents[0].disk_bytenr);
5160 			btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5161 						new_extents[0].disk_num_bytes);
5162 			btrfs_mark_buffer_dirty(leaf);
5163 
5164 			btrfs_drop_extent_cache(inode, key.offset,
5165 						key.offset + num_bytes - 1, 0);
5166 
5167 			ret = btrfs_inc_extent_ref(trans, root,
5168 						new_extents[0].disk_bytenr,
5169 						new_extents[0].disk_num_bytes,
5170 						leaf->start,
5171 						root->root_key.objectid,
5172 						trans->transid,
5173 						key.objectid);
5174 			BUG_ON(ret);
5175 
5176 			ret = btrfs_free_extent(trans, root,
5177 						extent_key->objectid,
5178 						extent_key->offset,
5179 						leaf->start,
5180 						btrfs_header_owner(leaf),
5181 						btrfs_header_generation(leaf),
5182 						key.objectid, 0);
5183 			BUG_ON(ret);
5184 
5185 			btrfs_release_path(root, path);
5186 			key.offset += num_bytes;
5187 		} else {
5188 			BUG_ON(1);
5189 #if 0
5190 			u64 alloc_hint;
5191 			u64 extent_len;
5192 			int i;
5193 			/*
5194 			 * drop the old extent pointer first, then insert the
5195 			 * new pointers one by one
5196 			 */
5197 			btrfs_release_path(root, path);
5198 			ret = btrfs_drop_extents(trans, root, inode, key.offset,
5199 						 key.offset + num_bytes,
5200 						 key.offset, &alloc_hint);
5201 			BUG_ON(ret);
5202 
5203 			for (i = 0; i < nr_extents; i++) {
5204 				if (ext_offset >= new_extents[i].num_bytes) {
5205 					ext_offset -= new_extents[i].num_bytes;
5206 					continue;
5207 				}
5208 				extent_len = min(new_extents[i].num_bytes -
5209 						 ext_offset, num_bytes);
5210 
5211 				ret = btrfs_insert_empty_item(trans, root,
5212 							      path, &key,
5213 							      sizeof(*fi));
5214 				BUG_ON(ret);
5215 
5216 				leaf = path->nodes[0];
5217 				fi = btrfs_item_ptr(leaf, path->slots[0],
5218 						struct btrfs_file_extent_item);
5219 				btrfs_set_file_extent_generation(leaf, fi,
5220 							trans->transid);
5221 				btrfs_set_file_extent_type(leaf, fi,
5222 							BTRFS_FILE_EXTENT_REG);
5223 				btrfs_set_file_extent_disk_bytenr(leaf, fi,
5224 						new_extents[i].disk_bytenr);
5225 				btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5226 						new_extents[i].disk_num_bytes);
5227 				btrfs_set_file_extent_ram_bytes(leaf, fi,
5228 						new_extents[i].ram_bytes);
5229 
5230 				btrfs_set_file_extent_compression(leaf, fi,
5231 						new_extents[i].compression);
5232 				btrfs_set_file_extent_encryption(leaf, fi,
5233 						new_extents[i].encryption);
5234 				btrfs_set_file_extent_other_encoding(leaf, fi,
5235 						new_extents[i].other_encoding);
5236 
5237 				btrfs_set_file_extent_num_bytes(leaf, fi,
5238 							extent_len);
5239 				ext_offset += new_extents[i].offset;
5240 				btrfs_set_file_extent_offset(leaf, fi,
5241 							ext_offset);
5242 				btrfs_mark_buffer_dirty(leaf);
5243 
5244 				btrfs_drop_extent_cache(inode, key.offset,
5245 						key.offset + extent_len - 1, 0);
5246 
5247 				ret = btrfs_inc_extent_ref(trans, root,
5248 						new_extents[i].disk_bytenr,
5249 						new_extents[i].disk_num_bytes,
5250 						leaf->start,
5251 						root->root_key.objectid,
5252 						trans->transid, key.objectid);
5253 				BUG_ON(ret);
5254 				btrfs_release_path(root, path);
5255 
5256 				inode_add_bytes(inode, extent_len);
5257 
5258 				ext_offset = 0;
5259 				num_bytes -= extent_len;
5260 				key.offset += extent_len;
5261 
5262 				if (num_bytes == 0)
5263 					break;
5264 			}
5265 			BUG_ON(i >= nr_extents);
5266 #endif
5267 		}
5268 
5269 		if (extent_locked) {
5270 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5271 				      lock_end, GFP_NOFS);
5272 			extent_locked = 0;
5273 		}
5274 skip:
5275 		if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
5276 		    key.offset >= search_end)
5277 			break;
5278 
5279 		cond_resched();
5280 	}
5281 	ret = 0;
5282 out:
5283 	btrfs_release_path(root, path);
5284 	if (inode) {
5285 		mutex_unlock(&inode->i_mutex);
5286 		if (extent_locked) {
5287 			unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
5288 				      lock_end, GFP_NOFS);
5289 		}
5290 		iput(inode);
5291 	}
5292 	return ret;
5293 }
5294 
5295 int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
5296 			       struct btrfs_root *root,
5297 			       struct extent_buffer *buf, u64 orig_start)
5298 {
5299 	int level;
5300 	int ret;
5301 
5302 	BUG_ON(btrfs_header_generation(buf) != trans->transid);
5303 	BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5304 
5305 	level = btrfs_header_level(buf);
5306 	if (level == 0) {
5307 		struct btrfs_leaf_ref *ref;
5308 		struct btrfs_leaf_ref *orig_ref;
5309 
5310 		orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
5311 		if (!orig_ref)
5312 			return -ENOENT;
5313 
5314 		ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
5315 		if (!ref) {
5316 			btrfs_free_leaf_ref(root, orig_ref);
5317 			return -ENOMEM;
5318 		}
5319 
5320 		ref->nritems = orig_ref->nritems;
5321 		memcpy(ref->extents, orig_ref->extents,
5322 			sizeof(ref->extents[0]) * ref->nritems);
5323 
5324 		btrfs_free_leaf_ref(root, orig_ref);
5325 
5326 		ref->root_gen = trans->transid;
5327 		ref->bytenr = buf->start;
5328 		ref->owner = btrfs_header_owner(buf);
5329 		ref->generation = btrfs_header_generation(buf);
5330 
5331 		ret = btrfs_add_leaf_ref(root, ref, 0);
5332 		WARN_ON(ret);
5333 		btrfs_free_leaf_ref(root, ref);
5334 	}
5335 	return 0;
5336 }
5337 
5338 static noinline int invalidate_extent_cache(struct btrfs_root *root,
5339 					struct extent_buffer *leaf,
5340 					struct btrfs_block_group_cache *group,
5341 					struct btrfs_root *target_root)
5342 {
5343 	struct btrfs_key key;
5344 	struct inode *inode = NULL;
5345 	struct btrfs_file_extent_item *fi;
5346 	u64 num_bytes;
5347 	u64 skip_objectid = 0;
5348 	u32 nritems;
5349 	u32 i;
5350 
5351 	nritems = btrfs_header_nritems(leaf);
5352 	for (i = 0; i < nritems; i++) {
5353 		btrfs_item_key_to_cpu(leaf, &key, i);
5354 		if (key.objectid == skip_objectid ||
5355 		    key.type != BTRFS_EXTENT_DATA_KEY)
5356 			continue;
5357 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
5358 		if (btrfs_file_extent_type(leaf, fi) ==
5359 		    BTRFS_FILE_EXTENT_INLINE)
5360 			continue;
5361 		if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
5362 			continue;
5363 		if (!inode || inode->i_ino != key.objectid) {
5364 			iput(inode);
5365 			inode = btrfs_ilookup(target_root->fs_info->sb,
5366 					      key.objectid, target_root, 1);
5367 		}
5368 		if (!inode) {
5369 			skip_objectid = key.objectid;
5370 			continue;
5371 		}
5372 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
5373 
5374 		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5375 			    key.offset + num_bytes - 1, GFP_NOFS);
5376 		btrfs_drop_extent_cache(inode, key.offset,
5377 					key.offset + num_bytes - 1, 1);
5378 		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
5379 			      key.offset + num_bytes - 1, GFP_NOFS);
5380 		cond_resched();
5381 	}
5382 	iput(inode);
5383 	return 0;
5384 }
5385 
5386 static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
5387 					struct btrfs_root *root,
5388 					struct extent_buffer *leaf,
5389 					struct btrfs_block_group_cache *group,
5390 					struct inode *reloc_inode)
5391 {
5392 	struct btrfs_key key;
5393 	struct btrfs_key extent_key;
5394 	struct btrfs_file_extent_item *fi;
5395 	struct btrfs_leaf_ref *ref;
5396 	struct disk_extent *new_extent;
5397 	u64 bytenr;
5398 	u64 num_bytes;
5399 	u32 nritems;
5400 	u32 i;
5401 	int ext_index;
5402 	int nr_extent;
5403 	int ret;
5404 
5405 	new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
5406 	BUG_ON(!new_extent);
5407 
5408 	ref = btrfs_lookup_leaf_ref(root, leaf->start);
5409 	BUG_ON(!ref);
5410 
5411 	ext_index = -1;
5412 	nritems = btrfs_header_nritems(leaf);
5413 	for (i = 0; i < nritems; i++) {
5414 		btrfs_item_key_to_cpu(leaf, &key, i);
5415 		if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
5416 			continue;
5417 		fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
5418 		if (btrfs_file_extent_type(leaf, fi) ==
5419 		    BTRFS_FILE_EXTENT_INLINE)
5420 			continue;
5421 		bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5422 		num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
5423 		if (bytenr == 0)
5424 			continue;
5425 
5426 		ext_index++;
5427 		if (bytenr >= group->key.objectid + group->key.offset ||
5428 		    bytenr + num_bytes <= group->key.objectid)
5429 			continue;
5430 
5431 		extent_key.objectid = bytenr;
5432 		extent_key.offset = num_bytes;
5433 		extent_key.type = BTRFS_EXTENT_ITEM_KEY;
5434 		nr_extent = 1;
5435 		ret = get_new_locations(reloc_inode, &extent_key,
5436 					group->key.objectid, 1,
5437 					&new_extent, &nr_extent);
5438 		if (ret > 0)
5439 			continue;
5440 		BUG_ON(ret < 0);
5441 
5442 		BUG_ON(ref->extents[ext_index].bytenr != bytenr);
5443 		BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
5444 		ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
5445 		ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
5446 
5447 		btrfs_set_file_extent_disk_bytenr(leaf, fi,
5448 						new_extent->disk_bytenr);
5449 		btrfs_set_file_extent_disk_num_bytes(leaf, fi,
5450 						new_extent->disk_num_bytes);
5451 		btrfs_mark_buffer_dirty(leaf);
5452 
5453 		ret = btrfs_inc_extent_ref(trans, root,
5454 					new_extent->disk_bytenr,
5455 					new_extent->disk_num_bytes,
5456 					leaf->start,
5457 					root->root_key.objectid,
5458 					trans->transid, key.objectid);
5459 		BUG_ON(ret);
5460 		ret = btrfs_free_extent(trans, root,
5461 					bytenr, num_bytes, leaf->start,
5462 					btrfs_header_owner(leaf),
5463 					btrfs_header_generation(leaf),
5464 					key.objectid, 0);
5465 		BUG_ON(ret);
5466 		cond_resched();
5467 	}
5468 	kfree(new_extent);
5469 	BUG_ON(ext_index + 1 != ref->nritems);
5470 	btrfs_free_leaf_ref(root, ref);
5471 	return 0;
5472 }
5473 
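/*
 * Detach the reloc tree from 'root', queue it on the dead reloc
 * roots list for later deletion and write back its final root item.
 */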
5474 int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
5475 			  struct btrfs_root *root)
5476 {
5477 	struct btrfs_root *reloc_root;
5478 	int ret;
5479 
5480 	if (root->reloc_root) {
5481 		reloc_root = root->reloc_root;
5482 		root->reloc_root = NULL;
5483 		list_add(&reloc_root->dead_list,
5484 			 &root->fs_info->dead_reloc_roots);
5485 
5486 		btrfs_set_root_bytenr(&reloc_root->root_item,
5487 				      reloc_root->node->start);
5488 		btrfs_set_root_level(&reloc_root->root_item,
5489 				     btrfs_header_level(reloc_root->node));
5490 		memset(&reloc_root->root_item.drop_progress, 0,
5491 			sizeof(struct btrfs_disk_key));
5492 		reloc_root->root_item.drop_level = 0;
5493 
5494 		ret = btrfs_update_root(trans, root->fs_info->tree_root,
5495 					&reloc_root->root_key,
5496 					&reloc_root->root_item);
5497 		BUG_ON(ret);
5498 	}
5499 	return 0;
5500 }
5501 
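/*
 * Walk the dead reloc roots list and fully delete each tree:
 * btrfs_drop_snapshot() returns -EAGAIN until all blocks are
 * dropped, so it is called repeatedly in separate transactions
 * before the root item itself is removed.
 */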
5502 int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
5503 {
5504 	struct btrfs_trans_handle *trans;
5505 	struct btrfs_root *reloc_root;
5506 	struct btrfs_root *prev_root = NULL;
5507 	struct list_head dead_roots;
5508 	int ret;
5509 	unsigned long nr;
5510 
5511 	INIT_LIST_HEAD(&dead_roots);
5512 	list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
5513 
5514 	while (!list_empty(&dead_roots)) {
5515 		reloc_root = list_entry(dead_roots.prev,
5516 					struct btrfs_root, dead_list);
5517 		list_del_init(&reloc_root->dead_list);
5518 
5519 		BUG_ON(reloc_root->commit_root != NULL);
5520 		while (1) {
5521 			trans = btrfs_join_transaction(root, 1);
5522 			BUG_ON(!trans);
5523 
5524 			mutex_lock(&root->fs_info->drop_mutex);
5525 			ret = btrfs_drop_snapshot(trans, reloc_root);
5526 			if (ret != -EAGAIN)
5527 				break;
5528 			mutex_unlock(&root->fs_info->drop_mutex);
5529 
5530 			nr = trans->blocks_used;
5531 			ret = btrfs_end_transaction(trans, root);
5532 			BUG_ON(ret);
5533 			btrfs_btree_balance_dirty(root, nr);
5534 		}
5535 
5536 		free_extent_buffer(reloc_root->node);
5537 
5538 		ret = btrfs_del_root(trans, root->fs_info->tree_root,
5539 				     &reloc_root->root_key);
5540 		BUG_ON(ret);
5541 		mutex_unlock(&root->fs_info->drop_mutex);
5542 
5543 		nr = trans->blocks_used;
5544 		ret = btrfs_end_transaction(trans, root);
5545 		BUG_ON(ret);
5546 		btrfs_btree_balance_dirty(root, nr);
5547 
5548 		kfree(prev_root);
5549 		prev_root = reloc_root;
5550 	}
5551 	if (prev_root) {
5552 		btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
5553 		kfree(prev_root);
5554 	}
5555 	return 0;
5556 }
5557 
5558 int btrfs_add_dead_reloc_root(struct btrfs_root *root)
5559 {
5560 	list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
5561 	return 0;
5562 }
5563 
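/*
 * Called during mount to finish off reloc trees left over from an
 * interrupted balance, then run orphan cleanup on the data reloc
 * tree.
 */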
5564 int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
5565 {
5566 	struct btrfs_root *reloc_root;
5567 	struct btrfs_trans_handle *trans;
5568 	struct btrfs_key location;
5569 	int found;
5570 	int ret;
5571 
5572 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5573 	ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
5574 	BUG_ON(ret);
5575 	found = !list_empty(&root->fs_info->dead_reloc_roots);
5576 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5577 
5578 	if (found) {
5579 		trans = btrfs_start_transaction(root, 1);
5580 		BUG_ON(!trans);
5581 		ret = btrfs_commit_transaction(trans, root);
5582 		BUG_ON(ret);
5583 	}
5584 
5585 	location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
5586 	location.offset = (u64)-1;
5587 	location.type = BTRFS_ROOT_ITEM_KEY;
5588 
5589 	reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
5590 	BUG_ON(IS_ERR(reloc_root));
5591 	btrfs_orphan_cleanup(reloc_root);
5592 	return 0;
5593 }
5594 
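/*
 * Create the reloc tree for 'root' if it does not exist yet:
 * snapshot root->commit_root under the shared
 * BTRFS_TREE_RELOC_OBJECTID key, insert the new root item and read
 * the resulting root back in.
 */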
5595 static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
5596 				    struct btrfs_root *root)
5597 {
5598 	struct btrfs_root *reloc_root;
5599 	struct extent_buffer *eb;
5600 	struct btrfs_root_item *root_item;
5601 	struct btrfs_key root_key;
5602 	int ret;
5603 
5604 	BUG_ON(!root->ref_cows);
5605 	if (root->reloc_root)
5606 		return 0;
5607 
5608 	root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
5609 	BUG_ON(!root_item);
5610 
5611 	ret = btrfs_copy_root(trans, root, root->commit_root,
5612 			      &eb, BTRFS_TREE_RELOC_OBJECTID);
5613 	BUG_ON(ret);
5614 
5615 	root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
5616 	root_key.offset = root->root_key.objectid;
5617 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5618 
5619 	memcpy(root_item, &root->root_item, sizeof(*root_item));
5620 	btrfs_set_root_refs(root_item, 0);
5621 	btrfs_set_root_bytenr(root_item, eb->start);
5622 	btrfs_set_root_level(root_item, btrfs_header_level(eb));
5623 	btrfs_set_root_generation(root_item, trans->transid);
5624 
5625 	btrfs_tree_unlock(eb);
5626 	free_extent_buffer(eb);
5627 
5628 	ret = btrfs_insert_root(trans, root->fs_info->tree_root,
5629 				&root_key, root_item);
5630 	BUG_ON(ret);
5631 	kfree(root_item);
5632 
5633 	reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
5634 						 &root_key);
5635 	BUG_ON(IS_ERR(reloc_root));
5636 	reloc_root->last_trans = trans->transid;
5637 	reloc_root->commit_root = NULL;
5638 	reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
5639 
5640 	root->reloc_root = reloc_root;
5641 	return 0;
5642 }
5643 
5644 /*
5645  * Core function of space balance.
5646  *
5647  * The idea is to use reloc trees to relocate tree blocks in reference
5648  * counted roots. There is one reloc tree for each subvol, and all
5649  * reloc trees share the same root key objectid. Reloc trees are
5650  * snapshots of the latest committed roots of subvols (root->commit_root).
5651  *
5652  * Relocating a tree block referenced by a subvol takes two steps:
5653  * COW the block through the subvol's reloc tree, then update the block
5654  * pointer in the subvol to point to the new block. Since all reloc
5655  * trees share the same root key objectid, special handling of tree
5656  * blocks owned by them is easy. Once a tree block has been COWed in
5657  * one reloc tree, the resulting new block can be used directly when
5658  * the same block needs to be COWed again through another reloc tree.
5659  * This way relocated tree blocks are shared between reloc trees, and
5660  * therefore also between subvols.
5661  */
5662 static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
5663 				      struct btrfs_root *root,
5664 				      struct btrfs_path *path,
5665 				      struct btrfs_key *first_key,
5666 				      struct btrfs_ref_path *ref_path,
5667 				      struct btrfs_block_group_cache *group,
5668 				      struct inode *reloc_inode)
5669 {
5670 	struct btrfs_root *reloc_root;
5671 	struct extent_buffer *eb = NULL;
5672 	struct btrfs_key *keys;
5673 	u64 *nodes;
5674 	int level;
5675 	int shared_level;
5676 	int lowest_level = 0;
5677 	int ret;
5678 
5679 	if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
5680 		lowest_level = ref_path->owner_objectid;
5681 
5682 	if (!root->ref_cows) {
5683 		path->lowest_level = lowest_level;
5684 		ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
5685 		BUG_ON(ret < 0);
5686 		path->lowest_level = 0;
5687 		btrfs_release_path(root, path);
5688 		return 0;
5689 	}
5690 
5691 	mutex_lock(&root->fs_info->tree_reloc_mutex);
5692 	ret = init_reloc_tree(trans, root);
5693 	BUG_ON(ret);
5694 	reloc_root = root->reloc_root;
5695 
5696 	shared_level = ref_path->shared_level;
5697 	ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
5698 
5699 	keys = ref_path->node_keys;
5700 	nodes = ref_path->new_nodes;
5701 	memset(&keys[shared_level + 1], 0,
5702 	       sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
5703 	memset(&nodes[shared_level + 1], 0,
5704 	       sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
5705 
5706 	if (nodes[lowest_level] == 0) {
5707 		path->lowest_level = lowest_level;
5708 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5709 					0, 1);
5710 		BUG_ON(ret);
5711 		for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
5712 			eb = path->nodes[level];
5713 			if (!eb || eb == reloc_root->node)
5714 				break;
5715 			nodes[level] = eb->start;
5716 			if (level == 0)
5717 				btrfs_item_key_to_cpu(eb, &keys[level], 0);
5718 			else
5719 				btrfs_node_key_to_cpu(eb, &keys[level], 0);
5720 		}
5721 		if (nodes[0] &&
5722 		    ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5723 			eb = path->nodes[0];
5724 			ret = replace_extents_in_leaf(trans, reloc_root, eb,
5725 						      group, reloc_inode);
5726 			BUG_ON(ret);
5727 		}
5728 		btrfs_release_path(reloc_root, path);
5729 	} else {
5730 		ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
5731 				       lowest_level);
5732 		BUG_ON(ret);
5733 	}
5734 
5735 	/*
5736 	 * replace tree blocks in the fs tree with tree blocks in
5737 	 * the reloc tree.
5738 	 */
5739 	ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
5740 	BUG_ON(ret < 0);
5741 
5742 	if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5743 		ret = btrfs_search_slot(trans, reloc_root, first_key, path,
5744 					0, 0);
5745 		BUG_ON(ret);
5746 		extent_buffer_get(path->nodes[0]);
5747 		eb = path->nodes[0];
5748 		btrfs_release_path(reloc_root, path);
5749 		ret = invalidate_extent_cache(reloc_root, eb, group, root);
5750 		BUG_ON(ret);
5751 		free_extent_buffer(eb);
5752 	}
5753 
5754 	mutex_unlock(&root->fs_info->tree_reloc_mutex);
5755 	path->lowest_level = 0;
5756 	return 0;
5757 }
5758 
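/*
 * Relocate a single tree block: a thin wrapper around
 * relocate_one_path() for metadata references that also flushes
 * pending extent tree operations when the extent root itself is
 * being modified.
 */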
5759 static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
5760 					struct btrfs_root *root,
5761 					struct btrfs_path *path,
5762 					struct btrfs_key *first_key,
5763 					struct btrfs_ref_path *ref_path)
5764 {
5765 	int ret;
5766 
5767 	ret = relocate_one_path(trans, root, path, first_key,
5768 				ref_path, NULL, NULL);
5769 	BUG_ON(ret);
5770 
5771 	if (root == root->fs_info->extent_root)
5772 		btrfs_extent_post_op(trans, root);
5773 
5774 	return 0;
5775 }
5776 
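/*
 * An extent item keyed at bytenr 0 cannot describe a real extent;
 * simply delete it from the extent tree instead of relocating it.
 */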
5777 static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
5778 				    struct btrfs_root *extent_root,
5779 				    struct btrfs_path *path,
5780 				    struct btrfs_key *extent_key)
5781 {
5782 	int ret;
5783 
5784 	ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
5785 	if (ret)
5786 		goto out;
5787 	ret = btrfs_del_item(trans, extent_root, path);
5788 out:
5789 	btrfs_release_path(extent_root, path);
5790 	return ret;
5791 }
5792 
5793 static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
5794 						struct btrfs_ref_path *ref_path)
5795 {
5796 	struct btrfs_key root_key;
5797 
5798 	root_key.objectid = ref_path->root_objectid;
5799 	root_key.type = BTRFS_ROOT_ITEM_KEY;
5800 	if (is_cowonly_root(ref_path->root_objectid))
5801 		root_key.offset = 0;
5802 	else
5803 		root_key.offset = (u64)-1;
5804 
5805 	return btrfs_read_fs_root_no_name(fs_info, &root_key);
5806 }
5807 
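/*
 * Relocate all references to the extent described by 'extent_key'.
 * Data extent references are copied to new locations on pass 0,
 * updated in place through the reloc trees on pass 1, and handled by
 * the replace_one_extent() fallback on later passes; metadata
 * references always go through relocate_tree_block().
 */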
5808 static noinline int relocate_one_extent(struct btrfs_root *extent_root,
5809 					struct btrfs_path *path,
5810 					struct btrfs_key *extent_key,
5811 					struct btrfs_block_group_cache *group,
5812 					struct inode *reloc_inode, int pass)
5813 {
5814 	struct btrfs_trans_handle *trans;
5815 	struct btrfs_root *found_root;
5816 	struct btrfs_ref_path *ref_path = NULL;
5817 	struct disk_extent *new_extents = NULL;
5818 	int nr_extents = 0;
5819 	int loops;
5820 	int ret;
5821 	int level;
5822 	struct btrfs_key first_key;
5823 	u64 prev_block = 0;
5824 
5826 	trans = btrfs_start_transaction(extent_root, 1);
5827 	BUG_ON(!trans);
5828 
5829 	if (extent_key->objectid == 0) {
5830 		ret = del_extent_zero(trans, extent_root, path, extent_key);
5831 		goto out;
5832 	}
5833 
5834 	ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
5835 	if (!ref_path) {
5836 		ret = -ENOMEM;
5837 		goto out;
5838 	}
5839 
5840 	for (loops = 0; ; loops++) {
5841 		if (loops == 0) {
5842 			ret = btrfs_first_ref_path(trans, extent_root, ref_path,
5843 						   extent_key->objectid);
5844 		} else {
5845 			ret = btrfs_next_ref_path(trans, extent_root, ref_path);
5846 		}
5847 		if (ret < 0)
5848 			goto out;
5849 		if (ret > 0)
5850 			break;
5851 
5852 		if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
5853 		    ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
5854 			continue;
5855 
5856 		found_root = read_ref_root(extent_root->fs_info, ref_path);
5857 		BUG_ON(IS_ERR(found_root));
5858 		/*
5859 		 * for reference counted trees, only process reference paths
5860 		 * rooted at the latest committed root.
5861 		 */
5862 		if (found_root->ref_cows &&
5863 		    ref_path->root_generation != found_root->root_key.offset)
5864 			continue;
5865 
5866 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5867 			if (pass == 0) {
5868 				/*
5869 				 * copy data extents to new locations
5870 				 */
5871 				u64 group_start = group->key.objectid;
5872 				ret = relocate_data_extent(reloc_inode,
5873 							   extent_key,
5874 							   group_start);
5875 				if (ret < 0)
5876 					goto out;
5877 				break;
5878 			}
5879 			level = 0;
5880 		} else {
5881 			level = ref_path->owner_objectid;
5882 		}
5883 
5884 		if (prev_block != ref_path->nodes[level]) {
5885 			struct extent_buffer *eb;
5886 			u64 block_start = ref_path->nodes[level];
5887 			u64 block_size = btrfs_level_size(found_root, level);
5888 
5889 			eb = read_tree_block(found_root, block_start,
5890 					     block_size, 0);
			BUG_ON(!eb);
5891 			btrfs_tree_lock(eb);
5892 			BUG_ON(level != btrfs_header_level(eb));
5893 
5894 			if (level == 0)
5895 				btrfs_item_key_to_cpu(eb, &first_key, 0);
5896 			else
5897 				btrfs_node_key_to_cpu(eb, &first_key, 0);
5898 
5899 			btrfs_tree_unlock(eb);
5900 			free_extent_buffer(eb);
5901 			prev_block = block_start;
5902 		}
5903 
5904 		mutex_lock(&extent_root->fs_info->trans_mutex);
5905 		btrfs_record_root_in_trans(found_root);
5906 		mutex_unlock(&extent_root->fs_info->trans_mutex);
5907 		if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
5908 			/*
5909 			 * try to update data extent references while
5910 			 * keeping metadata shared between snapshots.
5911 			 */
5912 			if (pass == 1) {
5913 				ret = relocate_one_path(trans, found_root,
5914 						path, &first_key, ref_path,
5915 						group, reloc_inode);
5916 				if (ret < 0)
5917 					goto out;
5918 				continue;
5919 			}
5920 			/*
5921 			 * use fallback method to process the remaining
5922 			 * references.
5923 			 */
5924 			if (!new_extents) {
5925 				u64 group_start = group->key.objectid;
5926 				new_extents = kmalloc(sizeof(*new_extents),
5927 						      GFP_NOFS);
				if (!new_extents) {
					ret = -ENOMEM;
					goto out;
				}
5928 				nr_extents = 1;
5929 				ret = get_new_locations(reloc_inode,
5930 							extent_key,
5931 							group_start, 1,
5932 							&new_extents,
5933 							&nr_extents);
5934 				if (ret)
5935 					goto out;
5936 			}
5937 			ret = replace_one_extent(trans, found_root,
5938 						path, extent_key,
5939 						&first_key, ref_path,
5940 						new_extents, nr_extents);
5941 		} else {
5942 			ret = relocate_tree_block(trans, found_root, path,
5943 						  &first_key, ref_path);
5944 		}
5945 		if (ret < 0)
5946 			goto out;
5947 	}
5948 	ret = 0;
5949 out:
5950 	btrfs_end_transaction(trans, extent_root);
5951 	kfree(new_extents);
5952 	kfree(ref_path);
5953 	return ret;
5954 }
5955 
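/*
 * Pick the allocation flags a relocated chunk should be rewritten
 * with, based on the number of writeable devices: on a single device
 * raid0 becomes single and raid1/raid10 become dup; with multiple
 * devices dup becomes raid1 and single becomes raid0.
 */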
5956 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
5957 {
5958 	u64 num_devices;
5959 	u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
5960 		BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
5961 
5962 	num_devices = root->fs_info->fs_devices->rw_devices;
5963 	if (num_devices == 1) {
5964 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5965 		stripped = flags & ~stripped;
5966 
5967 		/* turn raid0 into single device chunks */
5968 		if (flags & BTRFS_BLOCK_GROUP_RAID0)
5969 			return stripped;
5970 
5971 		/* turn mirroring into duplication */
5972 		if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
5973 			     BTRFS_BLOCK_GROUP_RAID10))
5974 			return stripped | BTRFS_BLOCK_GROUP_DUP;
5975 		return flags;
5976 	} else {
5977 		/* they already had raid on here, just return */
5978 		if (flags & stripped)
5979 			return flags;
5980 
5981 		stripped |= BTRFS_BLOCK_GROUP_DUP;
5982 		stripped = flags & ~stripped;
5983 
5984 		/* switch duplicated blocks with raid1 */
5985 		/* switch duplicated blocks to raid1 */
5986 			return stripped | BTRFS_BLOCK_GROUP_RAID1;
5987 
5988 		/* turn single device chunks into raid0 */
5989 		return stripped | BTRFS_BLOCK_GROUP_RAID0;
5990 	}
5991 	return flags;
5992 }
5993 
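/*
 * Make sure there is space to relocate into: if the shrinking block
 * group still holds data, pre-allocate a chunk large enough to take
 * it, using the (possibly updated) allocation flags.
 */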
5994 static int __alloc_chunk_for_shrink(struct btrfs_root *root,
5995 		     struct btrfs_block_group_cache *shrink_block_group,
5996 		     int force)
5997 {
5998 	struct btrfs_trans_handle *trans;
5999 	u64 new_alloc_flags;
6000 	u64 calc;
6001 
6002 	spin_lock(&shrink_block_group->lock);
6003 	if (btrfs_block_group_used(&shrink_block_group->item) > 0) {
6004 		spin_unlock(&shrink_block_group->lock);
6005 
6006 		trans = btrfs_start_transaction(root, 1);
6007 		spin_lock(&shrink_block_group->lock);
6008 
6009 		new_alloc_flags = update_block_group_flags(root,
6010 						   shrink_block_group->flags);
6011 		if (new_alloc_flags != shrink_block_group->flags) {
6012 			calc =
6013 			     btrfs_block_group_used(&shrink_block_group->item);
6014 		} else {
6015 			calc = shrink_block_group->key.offset;
6016 		}
6017 		spin_unlock(&shrink_block_group->lock);
6018 
6019 		do_chunk_alloc(trans, root->fs_info->extent_root,
6020 			       calc + 2 * 1024 * 1024, new_alloc_flags, force);
6021 
6022 		btrfs_end_transaction(trans, root);
6023 	} else
6024 		spin_unlock(&shrink_block_group->lock);
6025 	return 0;
6026 }
6027 
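/*
 * Insert a bare inode item for the relocation inode: a nocompress
 * regular file, mode 0600, sized to cover the whole block group.
 */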
6028 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
6029 				 struct btrfs_root *root,
6030 				 u64 objectid, u64 size)
6031 {
6032 	struct btrfs_path *path;
6033 	struct btrfs_inode_item *item;
6034 	struct extent_buffer *leaf;
6035 	int ret;
6036 
6037 	path = btrfs_alloc_path();
6038 	if (!path)
6039 		return -ENOMEM;
6040 
6041 	ret = btrfs_insert_empty_inode(trans, root, path, objectid);
6042 	if (ret)
6043 		goto out;
6044 
6045 	leaf = path->nodes[0];
6046 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
6047 	memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item));
6048 	btrfs_set_inode_generation(leaf, item, 1);
6049 	btrfs_set_inode_size(leaf, item, size);
6050 	btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
6051 	btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
6052 	btrfs_mark_buffer_dirty(leaf);
6053 	btrfs_release_path(root, path);
6054 out:
6055 	btrfs_free_path(path);
6056 	return ret;
6057 }
6058 
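/*
 * Create the temporary inode in the data reloc tree that data
 * extents are copied into.  index_cnt is reused to remember the
 * start of the block group being relocated, and the inode is put on
 * the orphan list so it is deleted on the next mount if we crash.
 */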
6059 static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
6060 					struct btrfs_block_group_cache *group)
6061 {
6062 	struct inode *inode = NULL;
6063 	struct btrfs_trans_handle *trans;
6064 	struct btrfs_root *root;
6065 	struct btrfs_key root_key;
6066 	u64 objectid = BTRFS_FIRST_FREE_OBJECTID;
6067 	int err = 0;
6068 
6069 	root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
6070 	root_key.type = BTRFS_ROOT_ITEM_KEY;
6071 	root_key.offset = (u64)-1;
6072 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
6073 	if (IS_ERR(root))
6074 		return ERR_CAST(root);
6075 
6076 	trans = btrfs_start_transaction(root, 1);
6077 	BUG_ON(!trans);
6078 
6079 	err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
6080 	if (err)
6081 		goto out;
6082 
6083 	err = __insert_orphan_inode(trans, root, objectid, group->key.offset);
6084 	BUG_ON(err);
6085 
6086 	err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0,
6087 				       group->key.offset, 0, group->key.offset,
6088 				       0, 0, 0);
6089 	BUG_ON(err);
6090 
6091 	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
	BUG_ON(!inode);
6092 	if (inode->i_state & I_NEW) {
6093 		BTRFS_I(inode)->root = root;
6094 		BTRFS_I(inode)->location.objectid = objectid;
6095 		BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
6096 		BTRFS_I(inode)->location.offset = 0;
6097 		btrfs_read_locked_inode(inode);
6098 		unlock_new_inode(inode);
6099 		BUG_ON(is_bad_inode(inode));
6100 	} else {
6101 		BUG();
6102 	}
6103 	BTRFS_I(inode)->index_cnt = group->key.objectid;
6104 
6105 	err = btrfs_orphan_add(trans, inode);
6106 out:
6107 	btrfs_end_transaction(trans, root);
6108 	if (err) {
6109 		if (inode)
6110 			iput(inode);
6111 		inode = ERR_PTR(err);
6112 	}
6113 	return inode;
6114 }
6115 
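/*
 * Clone the checksums of a relocated data extent to its new
 * location: look the sums up at the old disk bytenr (recovered via
 * index_cnt) and re-add them against the ordered extent covering the
 * new bytenr.
 */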
6116 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
6117 {
6119 	struct btrfs_ordered_sum *sums;
6120 	struct btrfs_sector_sum *sector_sum;
6121 	struct btrfs_ordered_extent *ordered;
6122 	struct btrfs_root *root = BTRFS_I(inode)->root;
6123 	struct list_head list;
6124 	size_t offset;
6125 	int ret;
6126 	u64 disk_bytenr;
6127 
6128 	INIT_LIST_HEAD(&list);
6129 
6130 	ordered = btrfs_lookup_ordered_extent(inode, file_pos);
6131 	BUG_ON(ordered->file_offset != file_pos || ordered->len != len);
6132 
6133 	disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
6134 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
6135 				       disk_bytenr + len - 1, &list);
	BUG_ON(ret);
6136 
6137 	while (!list_empty(&list)) {
6138 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
6139 		list_del_init(&sums->list);
6140 
6141 		sector_sum = sums->sums;
6142 		sums->bytenr = ordered->start;
6143 
6144 		offset = 0;
6145 		while (offset < sums->len) {
6146 			sector_sum->bytenr += ordered->start - disk_bytenr;
6147 			sector_sum++;
6148 			offset += root->sectorsize;
6149 		}
6150 
6151 		btrfs_add_ordered_sum(inode, ordered, sums);
6152 	}
6153 	btrfs_put_ordered_extent(ordered);
6154 	return 0;
6155 }
6156 
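/*
 * Relocate every extent in the block group starting at 'group_start'.
 * The group is marked read-only, then the extent tree is scanned
 * repeatedly for extents inside it, making multiple passes until
 * nothing is left; see relocate_one_extent() for what each pass does.
 */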
6157 int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start)
6158 {
6159 	struct btrfs_trans_handle *trans;
6160 	struct btrfs_path *path;
6161 	struct btrfs_fs_info *info = root->fs_info;
6162 	struct extent_buffer *leaf;
6163 	struct inode *reloc_inode;
6164 	struct btrfs_block_group_cache *block_group;
6165 	struct btrfs_key key;
6166 	u64 skipped;
6167 	u64 cur_byte;
6168 	u64 total_found;
6169 	u32 nritems;
6170 	int ret;
6171 	int progress;
6172 	int pass = 0;
6173 
6174 	root = root->fs_info->extent_root;
6175 
6176 	block_group = btrfs_lookup_block_group(info, group_start);
6177 	BUG_ON(!block_group);
6178 
6179 	printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n",
6180 	       (unsigned long long)block_group->key.objectid,
6181 	       (unsigned long long)block_group->flags);
6182 
6183 	path = btrfs_alloc_path();
6184 	BUG_ON(!path);
6185 
6186 	reloc_inode = create_reloc_inode(info, block_group);
6187 	BUG_ON(IS_ERR(reloc_inode));
6188 
6189 	__alloc_chunk_for_shrink(root, block_group, 1);
6190 	set_block_group_readonly(block_group);
6191 
6192 	btrfs_start_delalloc_inodes(info->tree_root);
6193 	btrfs_wait_ordered_extents(info->tree_root, 0);
6194 again:
6195 	skipped = 0;
6196 	total_found = 0;
6197 	progress = 0;
6198 	key.objectid = block_group->key.objectid;
6199 	key.offset = 0;
6200 	key.type = 0;
6201 	cur_byte = key.objectid;
6202 
6203 	trans = btrfs_start_transaction(info->tree_root, 1);
6204 	btrfs_commit_transaction(trans, info->tree_root);
6205 
6206 	mutex_lock(&root->fs_info->cleaner_mutex);
6207 	btrfs_clean_old_snapshots(info->tree_root);
6208 	btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
6209 	mutex_unlock(&root->fs_info->cleaner_mutex);
6210 
6211 	while (1) {
6212 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6213 		if (ret < 0)
6214 			goto out;
6215 next:
6216 		leaf = path->nodes[0];
6217 		nritems = btrfs_header_nritems(leaf);
6218 		if (path->slots[0] >= nritems) {
6219 			ret = btrfs_next_leaf(root, path);
6220 			if (ret < 0)
6221 				goto out;
6222 			if (ret == 1) {
6223 				ret = 0;
6224 				break;
6225 			}
6226 			leaf = path->nodes[0];
6227 			nritems = btrfs_header_nritems(leaf);
6228 		}
6229 
6230 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6231 
6232 		if (key.objectid >= block_group->key.objectid +
6233 		    block_group->key.offset)
6234 			break;
6235 
6236 		if (progress && need_resched()) {
6237 			btrfs_release_path(root, path);
6238 			cond_resched();
6239 			progress = 0;
6240 			continue;
6241 		}
6242 		progress = 1;
6243 
6244 		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY ||
6245 		    key.objectid + key.offset <= cur_byte) {
6246 			path->slots[0]++;
6247 			goto next;
6248 		}
6249 
6250 		total_found++;
6251 		cur_byte = key.objectid + key.offset;
6252 		btrfs_release_path(root, path);
6253 
6254 		__alloc_chunk_for_shrink(root, block_group, 0);
6255 		ret = relocate_one_extent(root, path, &key, block_group,
6256 					  reloc_inode, pass);
6257 		BUG_ON(ret < 0);
6258 		if (ret > 0)
6259 			skipped++;
6260 
6261 		key.objectid = cur_byte;
6262 		key.type = 0;
6263 		key.offset = 0;
6264 	}
6265 
6266 	btrfs_release_path(root, path);
6267 
6268 	if (pass == 0) {
6269 		btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1);
6270 		invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1);
6271 	}
6272 
6273 	if (total_found > 0) {
6274 		printk(KERN_INFO "btrfs found %llu extents in pass %d\n",
6275 		       (unsigned long long)total_found, pass);
6276 		pass++;
6277 		if (total_found == skipped && pass > 2) {
6278 			iput(reloc_inode);
6279 			reloc_inode = create_reloc_inode(info, block_group);
			BUG_ON(IS_ERR(reloc_inode));
6280 			pass = 0;
6281 		}
6282 		goto again;
6283 	}
6284 
6285 	/* delete reloc_inode */
6286 	iput(reloc_inode);
6287 
6288 	/* unpin extents in this range */
6289 	trans = btrfs_start_transaction(info->tree_root, 1);
6290 	btrfs_commit_transaction(trans, info->tree_root);
6291 
6292 	spin_lock(&block_group->lock);
6293 	WARN_ON(block_group->pinned > 0);
6294 	WARN_ON(block_group->reserved > 0);
6295 	WARN_ON(btrfs_block_group_used(&block_group->item) > 0);
6296 	spin_unlock(&block_group->lock);
6297 	put_block_group(block_group);
6298 	ret = 0;
6299 out:
6300 	btrfs_free_path(path);
6301 	return ret;
6302 }
6303 
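/*
 * Find the first block group item at or after 'key' in the extent
 * tree.  Returns 0 with the path pointing at the item, -ENOENT if
 * there is none, or a negative error.
 */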
6304 static int find_first_block_group(struct btrfs_root *root,
6305 		struct btrfs_path *path, struct btrfs_key *key)
6306 {
6307 	int ret = 0;
6308 	struct btrfs_key found_key;
6309 	struct extent_buffer *leaf;
6310 	int slot;
6311 
6312 	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
6313 	if (ret < 0)
6314 		goto out;
6315 
6316 	while (1) {
6317 		slot = path->slots[0];
6318 		leaf = path->nodes[0];
6319 		if (slot >= btrfs_header_nritems(leaf)) {
6320 			ret = btrfs_next_leaf(root, path);
6321 			if (ret == 0)
6322 				continue;
6323 			if (ret < 0)
6324 				goto out;
6325 			break;
6326 		}
6327 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
6328 
6329 		if (found_key.objectid >= key->objectid &&
6330 		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
6331 			ret = 0;
6332 			goto out;
6333 		}
6334 		path->slots[0]++;
6335 	}
6336 	ret = -ENOENT;
6337 out:
6338 	return ret;
6339 }
6340 
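/*
 * Tear down all in-memory block group and space_info structures.
 * Only called during the final stages of unmount.
 */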
6341 int btrfs_free_block_groups(struct btrfs_fs_info *info)
6342 {
6343 	struct btrfs_block_group_cache *block_group;
6344 	struct btrfs_space_info *space_info;
6345 	struct rb_node *n;
6346 
6347 	spin_lock(&info->block_group_cache_lock);
6348 	while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
6349 		block_group = rb_entry(n, struct btrfs_block_group_cache,
6350 				       cache_node);
6351 		rb_erase(&block_group->cache_node,
6352 			 &info->block_group_cache_tree);
6353 		spin_unlock(&info->block_group_cache_lock);
6354 
6355 		btrfs_remove_free_space_cache(block_group);
6356 		down_write(&block_group->space_info->groups_sem);
6357 		list_del(&block_group->list);
6358 		up_write(&block_group->space_info->groups_sem);
6359 
6360 		WARN_ON(atomic_read(&block_group->count) != 1);
6361 		kfree(block_group);
6362 
6363 		spin_lock(&info->block_group_cache_lock);
6364 	}
6365 	spin_unlock(&info->block_group_cache_lock);
6366 
6367 	/* now that all the block groups are freed, go through and
6368 	 * free all the space_info structs.  This is only called during
6369 	 * the final stages of unmount, and so we know nobody is
6370 	 * using them.  We call synchronize_rcu() once before we start,
6371 	 * just to be on the safe side.
6372 	 */
6373 	synchronize_rcu();
6374 
6375 	while (!list_empty(&info->space_info)) {
6376 		space_info = list_entry(info->space_info.next,
6377 					struct btrfs_space_info,
6378 					list);
6379 
6380 		list_del(&space_info->list);
6381 		kfree(space_info);
6382 	}
6383 	return 0;
6384 }
6385 
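/*
 * Read all block group items from the extent tree and build up the
 * in-memory block group cache and space_info accounting.
 */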
6386 int btrfs_read_block_groups(struct btrfs_root *root)
6387 {
6388 	struct btrfs_path *path;
6389 	int ret;
6390 	struct btrfs_block_group_cache *cache;
6391 	struct btrfs_fs_info *info = root->fs_info;
6392 	struct btrfs_space_info *space_info;
6393 	struct btrfs_key key;
6394 	struct btrfs_key found_key;
6395 	struct extent_buffer *leaf;
6396 
6397 	root = info->extent_root;
6398 	key.objectid = 0;
6399 	key.offset = 0;
6400 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
6401 	path = btrfs_alloc_path();
6402 	if (!path)
6403 		return -ENOMEM;
6404 
6405 	while (1) {
6406 		ret = find_first_block_group(root, path, &key);
6407 		if (ret > 0) {
6408 			ret = 0;
6409 			goto error;
6410 		}
6411 		if (ret != 0)
6412 			goto error;
6413 
6414 		leaf = path->nodes[0];
6415 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6416 		cache = kzalloc(sizeof(*cache), GFP_NOFS);
6417 		if (!cache) {
6418 			ret = -ENOMEM;
6419 			goto error;
6420 		}
6421 
6422 		atomic_set(&cache->count, 1);
6423 		spin_lock_init(&cache->lock);
6424 		mutex_init(&cache->alloc_mutex);
6425 		mutex_init(&cache->cache_mutex);
6426 		INIT_LIST_HEAD(&cache->list);
6427 		read_extent_buffer(leaf, &cache->item,
6428 				   btrfs_item_ptr_offset(leaf, path->slots[0]),
6429 				   sizeof(cache->item));
6430 		memcpy(&cache->key, &found_key, sizeof(found_key));
6431 
6432 		key.objectid = found_key.objectid + found_key.offset;
6433 		btrfs_release_path(root, path);
6434 		cache->flags = btrfs_block_group_flags(&cache->item);
6435 
6436 		ret = update_space_info(info, cache->flags, found_key.offset,
6437 					btrfs_block_group_used(&cache->item),
6438 					&space_info);
6439 		BUG_ON(ret);
6440 		cache->space_info = space_info;
6441 		down_write(&space_info->groups_sem);
6442 		list_add_tail(&cache->list, &space_info->block_groups);
6443 		up_write(&space_info->groups_sem);
6444 
6445 		ret = btrfs_add_block_group_cache(root->fs_info, cache);
6446 		BUG_ON(ret);
6447 
6448 		set_avail_alloc_bits(root->fs_info, cache->flags);
6449 		if (btrfs_chunk_readonly(root, cache->key.objectid))
6450 			set_block_group_readonly(cache);
6451 	}
6452 	ret = 0;
6453 error:
6454 	btrfs_free_path(path);
6455 	return ret;
6456 }
6457 
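/*
 * Create the in-memory cache entry and the on-disk block group item
 * for a freshly allocated chunk.
 */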
6458 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
6459 			   struct btrfs_root *root, u64 bytes_used,
6460 			   u64 type, u64 chunk_objectid, u64 chunk_offset,
6461 			   u64 size)
6462 {
6463 	int ret;
6464 	struct btrfs_root *extent_root;
6465 	struct btrfs_block_group_cache *cache;
6466 
6467 	extent_root = root->fs_info->extent_root;
6468 
6469 	root->fs_info->last_trans_new_blockgroup = trans->transid;
6470 
6471 	cache = kzalloc(sizeof(*cache), GFP_NOFS);
6472 	if (!cache)
6473 		return -ENOMEM;
6474 
6475 	cache->key.objectid = chunk_offset;
6476 	cache->key.offset = size;
6477 	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
6478 	atomic_set(&cache->count, 1);
6479 	spin_lock_init(&cache->lock);
6480 	mutex_init(&cache->alloc_mutex);
6481 	mutex_init(&cache->cache_mutex);
6482 	INIT_LIST_HEAD(&cache->list);
6483 
6484 	btrfs_set_block_group_used(&cache->item, bytes_used);
6485 	btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
6486 	cache->flags = type;
6487 	btrfs_set_block_group_flags(&cache->item, type);
6488 
6489 	ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
6490 				&cache->space_info);
6491 	BUG_ON(ret);
6492 	down_write(&cache->space_info->groups_sem);
6493 	list_add_tail(&cache->list, &cache->space_info->block_groups);
6494 	up_write(&cache->space_info->groups_sem);
6495 
6496 	ret = btrfs_add_block_group_cache(root->fs_info, cache);
6497 	BUG_ON(ret);
6498 
6499 	ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item,
6500 				sizeof(cache->item));
6501 	BUG_ON(ret);
6502 
6503 	finish_current_insert(trans, extent_root, 0);
6504 	ret = del_pending_extents(trans, extent_root, 0);
6505 	BUG_ON(ret);
6506 	set_avail_alloc_bits(extent_root->fs_info, type);
6507 
6508 	return 0;
6509 }
6510 
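/*
 * Remove an empty, read-only block group: unlink it from the cache
 * rbtree, its space_info and the free space cache, fix up the
 * space_info counters and delete the block group item from the
 * extent tree.
 */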
6511 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
6512 			     struct btrfs_root *root, u64 group_start)
6513 {
6514 	struct btrfs_path *path;
6515 	struct btrfs_block_group_cache *block_group;
6516 	struct btrfs_key key;
6517 	int ret;
6518 
6519 	root = root->fs_info->extent_root;
6520 
6521 	block_group = btrfs_lookup_block_group(root->fs_info, group_start);
6522 	BUG_ON(!block_group);
6523 	BUG_ON(!block_group->ro);
6524 
6525 	memcpy(&key, &block_group->key, sizeof(key));
6526 
6527 	path = btrfs_alloc_path();
6528 	BUG_ON(!path);
6529 
6530 	spin_lock(&root->fs_info->block_group_cache_lock);
6531 	rb_erase(&block_group->cache_node,
6532 		 &root->fs_info->block_group_cache_tree);
6533 	spin_unlock(&root->fs_info->block_group_cache_lock);
6534 	btrfs_remove_free_space_cache(block_group);
6535 	down_write(&block_group->space_info->groups_sem);
6536 	list_del(&block_group->list);
6537 	up_write(&block_group->space_info->groups_sem);
6538 
6539 	spin_lock(&block_group->space_info->lock);
6540 	block_group->space_info->total_bytes -= block_group->key.offset;
6541 	block_group->space_info->bytes_readonly -= block_group->key.offset;
6542 	spin_unlock(&block_group->space_info->lock);
6543 	block_group->space_info->full = 0;
6544 
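	/*
	 * drop the reference taken by btrfs_lookup_block_group() and the
	 * reference held for the block group cache rbtree/list.
	 */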
6545 	put_block_group(block_group);
6546 	put_block_group(block_group);
6547 
6548 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
6549 	if (ret > 0)
6550 		ret = -EIO;
6551 	if (ret < 0)
6552 		goto out;
6553 
6554 	ret = btrfs_del_item(trans, root, path);
6555 out:
6556 	btrfs_free_path(path);
6557 	return ret;
6558 }
6559