xref: /openbmc/linux/fs/btrfs/extent_io.c (revision f35e839a)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/spinlock.h>
8 #include <linux/blkdev.h>
9 #include <linux/swap.h>
10 #include <linux/writeback.h>
11 #include <linux/pagevec.h>
12 #include <linux/prefetch.h>
13 #include <linux/cleancache.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 #include "volumes.h"
20 #include "check-integrity.h"
21 #include "locking.h"
22 #include "rcu-string.h"
23 
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26 
27 #ifdef CONFIG_BTRFS_DEBUG
28 static LIST_HEAD(buffers);
29 static LIST_HEAD(states);
30 
31 static DEFINE_SPINLOCK(leak_lock);
32 
33 static inline
34 void btrfs_leak_debug_add(struct list_head *new, struct list_head *head)
35 {
36 	unsigned long flags;
37 
38 	spin_lock_irqsave(&leak_lock, flags);
39 	list_add(new, head);
40 	spin_unlock_irqrestore(&leak_lock, flags);
41 }
42 
43 static inline
44 void btrfs_leak_debug_del(struct list_head *entry)
45 {
46 	unsigned long flags;
47 
48 	spin_lock_irqsave(&leak_lock, flags);
49 	list_del(entry);
50 	spin_unlock_irqrestore(&leak_lock, flags);
51 }
52 
53 static inline
54 void btrfs_leak_debug_check(void)
55 {
56 	struct extent_state *state;
57 	struct extent_buffer *eb;
58 
59 	while (!list_empty(&states)) {
60 		state = list_entry(states.next, struct extent_state, leak_list);
61 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
62 		       "state %lu in tree %p refs %d\n",
63 		       (unsigned long long)state->start,
64 		       (unsigned long long)state->end,
65 		       state->state, state->tree, atomic_read(&state->refs));
66 		list_del(&state->leak_list);
67 		kmem_cache_free(extent_state_cache, state);
68 	}
69 
70 	while (!list_empty(&buffers)) {
71 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
72 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
73 		       "refs %d\n", (unsigned long long)eb->start,
74 		       eb->len, atomic_read(&eb->refs));
75 		list_del(&eb->leak_list);
76 		kmem_cache_free(extent_buffer_cache, eb);
77 	}
78 }
79 #else
80 #define btrfs_leak_debug_add(new, head)	do {} while (0)
81 #define btrfs_leak_debug_del(entry)	do {} while (0)
82 #define btrfs_leak_debug_check()	do {} while (0)
83 #endif
84 
85 #define BUFFER_LRU_MAX 64
86 
87 struct tree_entry {
88 	u64 start;
89 	u64 end;
90 	struct rb_node rb_node;
91 };
92 
93 struct extent_page_data {
94 	struct bio *bio;
95 	struct extent_io_tree *tree;
96 	get_extent_t *get_extent;
97 	unsigned long bio_flags;
98 
99 	/* tells writepage not to lock the state bits for this range;
100 	 * it still does the unlocking
101 	 */
102 	unsigned int extent_locked:1;
103 
104 	/* tells the submit_bio code to use a WRITE_SYNC */
105 	unsigned int sync_io:1;
106 };
107 
108 static noinline void flush_write_bio(void *data);
109 static inline struct btrfs_fs_info *
110 tree_fs_info(struct extent_io_tree *tree)
111 {
112 	return btrfs_sb(tree->mapping->host->i_sb);
113 }
114 
115 int __init extent_io_init(void)
116 {
117 	extent_state_cache = kmem_cache_create("btrfs_extent_state",
118 			sizeof(struct extent_state), 0,
119 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
120 	if (!extent_state_cache)
121 		return -ENOMEM;
122 
123 	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
124 			sizeof(struct extent_buffer), 0,
125 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
126 	if (!extent_buffer_cache)
127 		goto free_state_cache;
128 	return 0;
129 
130 free_state_cache:
131 	kmem_cache_destroy(extent_state_cache);
132 	return -ENOMEM;
133 }
134 
135 void extent_io_exit(void)
136 {
137 	btrfs_leak_debug_check();
138 
139 	/*
140 	 * Make sure all delayed rcu free are flushed before we
141 	 * destroy caches.
142 	 */
143 	rcu_barrier();
144 	if (extent_state_cache)
145 		kmem_cache_destroy(extent_state_cache);
146 	if (extent_buffer_cache)
147 		kmem_cache_destroy(extent_buffer_cache);
148 }
149 
150 void extent_io_tree_init(struct extent_io_tree *tree,
151 			 struct address_space *mapping)
152 {
153 	tree->state = RB_ROOT;
154 	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
155 	tree->ops = NULL;
156 	tree->dirty_bytes = 0;
157 	spin_lock_init(&tree->lock);
158 	spin_lock_init(&tree->buffer_lock);
159 	tree->mapping = mapping;
160 }
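/*
 * For example, a caller typically embeds an extent_io_tree in a larger
 * structure and initializes it against the inode's page cache mapping,
 * roughly ('io_tree' and 'inode' stand for caller-provided objects):
 *
 *	extent_io_tree_init(&io_tree, inode->i_mapping);
 */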
161 
162 static struct extent_state *alloc_extent_state(gfp_t mask)
163 {
164 	struct extent_state *state;
165 
166 	state = kmem_cache_alloc(extent_state_cache, mask);
167 	if (!state)
168 		return state;
169 	state->state = 0;
170 	state->private = 0;
171 	state->tree = NULL;
172 	btrfs_leak_debug_add(&state->leak_list, &states);
173 	atomic_set(&state->refs, 1);
174 	init_waitqueue_head(&state->wq);
175 	trace_alloc_extent_state(state, mask, _RET_IP_);
176 	return state;
177 }
178 
179 void free_extent_state(struct extent_state *state)
180 {
181 	if (!state)
182 		return;
183 	if (atomic_dec_and_test(&state->refs)) {
184 		WARN_ON(state->tree);
185 		btrfs_leak_debug_del(&state->leak_list);
186 		trace_free_extent_state(state, _RET_IP_);
187 		kmem_cache_free(extent_state_cache, state);
188 	}
189 }
190 
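/*
 * Insert 'node', keyed by 'offset' (the end of the extent it represents),
 * into the rbtree.  Returns NULL on success, or the existing node whose
 * range already covers 'offset', in which case nothing is inserted.
 */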
191 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
192 				   struct rb_node *node)
193 {
194 	struct rb_node **p = &root->rb_node;
195 	struct rb_node *parent = NULL;
196 	struct tree_entry *entry;
197 
198 	while (*p) {
199 		parent = *p;
200 		entry = rb_entry(parent, struct tree_entry, rb_node);
201 
202 		if (offset < entry->start)
203 			p = &(*p)->rb_left;
204 		else if (offset > entry->end)
205 			p = &(*p)->rb_right;
206 		else
207 			return parent;
208 	}
209 
210 	rb_link_node(node, parent, p);
211 	rb_insert_color(node, root);
212 	return NULL;
213 }
214 
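/*
 * Search the state tree for an extent containing 'offset'.  On an exact
 * hit the matching node is returned.  Otherwise NULL is returned and,
 * when the caller asked for them, *prev_ret is set to the first node
 * that ends at or after 'offset' and *next_ret to the last node that
 * starts at or before it.
 */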
215 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
216 				     struct rb_node **prev_ret,
217 				     struct rb_node **next_ret)
218 {
219 	struct rb_root *root = &tree->state;
220 	struct rb_node *n = root->rb_node;
221 	struct rb_node *prev = NULL;
222 	struct rb_node *orig_prev = NULL;
223 	struct tree_entry *entry;
224 	struct tree_entry *prev_entry = NULL;
225 
226 	while (n) {
227 		entry = rb_entry(n, struct tree_entry, rb_node);
228 		prev = n;
229 		prev_entry = entry;
230 
231 		if (offset < entry->start)
232 			n = n->rb_left;
233 		else if (offset > entry->end)
234 			n = n->rb_right;
235 		else
236 			return n;
237 	}
238 
239 	if (prev_ret) {
240 		orig_prev = prev;
241 		while (prev && offset > prev_entry->end) {
242 			prev = rb_next(prev);
243 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
244 		}
245 		*prev_ret = prev;
246 		prev = orig_prev;
247 	}
248 
249 	if (next_ret) {
250 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
251 		while (prev && offset < prev_entry->start) {
252 			prev = rb_prev(prev);
253 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
254 		}
255 		*next_ret = prev;
256 	}
257 	return NULL;
258 }
259 
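/*
 * Return the extent that contains 'offset', or, failing that, the first
 * extent that begins after it.  NULL means the tree is empty or 'offset'
 * is past the end of the last extent.
 */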
260 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
261 					  u64 offset)
262 {
263 	struct rb_node *prev = NULL;
264 	struct rb_node *ret;
265 
266 	ret = __etree_search(tree, offset, &prev, NULL);
267 	if (!ret)
268 		return prev;
269 	return ret;
270 }
271 
272 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
273 		     struct extent_state *other)
274 {
275 	if (tree->ops && tree->ops->merge_extent_hook)
276 		tree->ops->merge_extent_hook(tree->mapping->host, new,
277 					     other);
278 }
279 
280 /*
281  * utility function to look for merge candidates inside a given range.
282  * Any extents with matching state are merged together into a single
283  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
284  * are not merged because the end_io handlers need to be able to do
285  * operations on them without sleeping (or doing allocations/splits).
286  *
287  * This should be called with the tree lock held.
288  */
289 static void merge_state(struct extent_io_tree *tree,
290 		        struct extent_state *state)
291 {
292 	struct extent_state *other;
293 	struct rb_node *other_node;
294 
295 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
296 		return;
297 
298 	other_node = rb_prev(&state->rb_node);
299 	if (other_node) {
300 		other = rb_entry(other_node, struct extent_state, rb_node);
301 		if (other->end == state->start - 1 &&
302 		    other->state == state->state) {
303 			merge_cb(tree, state, other);
304 			state->start = other->start;
305 			other->tree = NULL;
306 			rb_erase(&other->rb_node, &tree->state);
307 			free_extent_state(other);
308 		}
309 	}
310 	other_node = rb_next(&state->rb_node);
311 	if (other_node) {
312 		other = rb_entry(other_node, struct extent_state, rb_node);
313 		if (other->start == state->end + 1 &&
314 		    other->state == state->state) {
315 			merge_cb(tree, state, other);
316 			state->end = other->end;
317 			other->tree = NULL;
318 			rb_erase(&other->rb_node, &tree->state);
319 			free_extent_state(other);
320 		}
321 	}
322 }
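/*
 * For example, if [0, 4095] and [4096, 8191] both carry exactly
 * EXTENT_DELALLOC, merge_state() collapses them into a single [0, 8191]
 * state and frees the absorbed neighbour, keeping the tree compact.
 */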
323 
324 static void set_state_cb(struct extent_io_tree *tree,
325 			 struct extent_state *state, unsigned long *bits)
326 {
327 	if (tree->ops && tree->ops->set_bit_hook)
328 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
329 }
330 
331 static void clear_state_cb(struct extent_io_tree *tree,
332 			   struct extent_state *state, unsigned long *bits)
333 {
334 	if (tree->ops && tree->ops->clear_bit_hook)
335 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
336 }
337 
338 static void set_state_bits(struct extent_io_tree *tree,
339 			   struct extent_state *state, unsigned long *bits);
340 
341 /*
342  * insert an extent_state struct into the tree.  'bits' are set on the
343  * struct before it is inserted.
344  *
345  * This may return -EEXIST if the extent is already there, in which case the
346  * state struct is freed.
347  *
348  * The tree lock is not taken internally.  This is a utility function and
349  * probably isn't what you want to call (see set/clear_extent_bit).
350  */
351 static int insert_state(struct extent_io_tree *tree,
352 			struct extent_state *state, u64 start, u64 end,
353 			unsigned long *bits)
354 {
355 	struct rb_node *node;
356 
357 	if (end < start)
358 		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
359 		       (unsigned long long)end,
360 		       (unsigned long long)start);
361 	state->start = start;
362 	state->end = end;
363 
364 	set_state_bits(tree, state, bits);
365 
366 	node = tree_insert(&tree->state, end, &state->rb_node);
367 	if (node) {
368 		struct extent_state *found;
369 		found = rb_entry(node, struct extent_state, rb_node);
370 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
371 		       "%llu %llu\n", (unsigned long long)found->start,
372 		       (unsigned long long)found->end,
373 		       (unsigned long long)start, (unsigned long long)end);
374 		return -EEXIST;
375 	}
376 	state->tree = tree;
377 	merge_state(tree, state);
378 	return 0;
379 }
380 
381 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
382 		     u64 split)
383 {
384 	if (tree->ops && tree->ops->split_extent_hook)
385 		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
386 }
387 
388 /*
389  * split a given extent state struct in two, inserting the preallocated
390  * struct 'prealloc' as the newly created second half.  'split' indicates an
391  * offset inside 'orig' where it should be split.
392  *
393  * Before calling,
394  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
395  * are two extent state structs in the tree:
396  * prealloc: [orig->start, split - 1]
397  * orig: [ split, orig->end ]
398  *
399  * The tree locks are not taken by this function. They need to be held
400  * by the caller.
401  */
402 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
403 		       struct extent_state *prealloc, u64 split)
404 {
405 	struct rb_node *node;
406 
407 	split_cb(tree, orig, split);
408 
409 	prealloc->start = orig->start;
410 	prealloc->end = split - 1;
411 	prealloc->state = orig->state;
412 	orig->start = split;
413 
414 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
415 	if (node) {
416 		free_extent_state(prealloc);
417 		return -EEXIST;
418 	}
419 	prealloc->tree = tree;
420 	return 0;
421 }
422 
423 static struct extent_state *next_state(struct extent_state *state)
424 {
425 	struct rb_node *next = rb_next(&state->rb_node);
426 	if (next)
427 		return rb_entry(next, struct extent_state, rb_node);
428 	else
429 		return NULL;
430 }
431 
432 /*
433  * utility function to clear some bits in an extent state struct.
434  * it will optionally wake up anyone waiting on this state (wake == 1).
435  *
436  * If no bits are set on the state struct after clearing things, the
437  * struct is freed and removed from the tree
438  */
439 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
440 					    struct extent_state *state,
441 					    unsigned long *bits, int wake)
442 {
443 	struct extent_state *next;
444 	unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS;
445 
446 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
447 		u64 range = state->end - state->start + 1;
448 		WARN_ON(range > tree->dirty_bytes);
449 		tree->dirty_bytes -= range;
450 	}
451 	clear_state_cb(tree, state, bits);
452 	state->state &= ~bits_to_clear;
453 	if (wake)
454 		wake_up(&state->wq);
455 	if (state->state == 0) {
456 		next = next_state(state);
457 		if (state->tree) {
458 			rb_erase(&state->rb_node, &tree->state);
459 			state->tree = NULL;
460 			free_extent_state(state);
461 		} else {
462 			WARN_ON(1);
463 		}
464 	} else {
465 		merge_state(tree, state);
466 		next = next_state(state);
467 	}
468 	return next;
469 }
470 
471 static struct extent_state *
472 alloc_extent_state_atomic(struct extent_state *prealloc)
473 {
474 	if (!prealloc)
475 		prealloc = alloc_extent_state(GFP_ATOMIC);
476 
477 	return prealloc;
478 }
479 
480 static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
481 {
482 	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
483 		    "Extent tree was modified by another "
484 		    "thread while locked.");
485 }
486 
487 /*
488  * clear some bits on a range in the tree.  This may require splitting
489  * or inserting elements in the tree, so the gfp mask is used to
490  * indicate which allocations or sleeping are allowed.
491  *
492  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
493  * the given range from the tree regardless of state (ie for truncate).
494  *
495  * the range [start, end] is inclusive.
496  *
497  * This takes the tree lock, and returns 0 on success and < 0 on error.
498  */
499 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
500 		     unsigned long bits, int wake, int delete,
501 		     struct extent_state **cached_state,
502 		     gfp_t mask)
503 {
504 	struct extent_state *state;
505 	struct extent_state *cached;
506 	struct extent_state *prealloc = NULL;
507 	struct rb_node *node;
508 	u64 last_end;
509 	int err;
510 	int clear = 0;
511 
512 	if (delete)
513 		bits |= ~EXTENT_CTLBITS;
514 	bits |= EXTENT_FIRST_DELALLOC;
515 
516 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
517 		clear = 1;
518 again:
519 	if (!prealloc && (mask & __GFP_WAIT)) {
520 		prealloc = alloc_extent_state(mask);
521 		if (!prealloc)
522 			return -ENOMEM;
523 	}
524 
525 	spin_lock(&tree->lock);
526 	if (cached_state) {
527 		cached = *cached_state;
528 
529 		if (clear) {
530 			*cached_state = NULL;
531 			cached_state = NULL;
532 		}
533 
534 		if (cached && cached->tree && cached->start <= start &&
535 		    cached->end > start) {
536 			if (clear)
537 				atomic_dec(&cached->refs);
538 			state = cached;
539 			goto hit_next;
540 		}
541 		if (clear)
542 			free_extent_state(cached);
543 	}
544 	/*
545 	 * this search will find the extents that end after
546 	 * our range starts
547 	 */
548 	node = tree_search(tree, start);
549 	if (!node)
550 		goto out;
551 	state = rb_entry(node, struct extent_state, rb_node);
552 hit_next:
553 	if (state->start > end)
554 		goto out;
555 	WARN_ON(state->end < start);
556 	last_end = state->end;
557 
558 	/* the state doesn't have the wanted bits, go ahead */
559 	if (!(state->state & bits)) {
560 		state = next_state(state);
561 		goto next;
562 	}
563 
564 	/*
565 	 *     | ---- desired range ---- |
566 	 *  | state | or
567 	 *  | ------------- state -------------- |
568 	 *
569 	 * We need to split the extent we found, and may flip
570 	 * bits on second half.
571 	 *
572 	 * If the extent we found extends past our range, we
573 	 * just split and search again.  It'll get split again
574 	 * the next time though.
575 	 *
576 	 * If the extent we found is inside our range, we clear
577 	 * the desired bit on it.
578 	 */
579 
580 	if (state->start < start) {
581 		prealloc = alloc_extent_state_atomic(prealloc);
582 		BUG_ON(!prealloc);
583 		err = split_state(tree, state, prealloc, start);
584 		if (err)
585 			extent_io_tree_panic(tree, err);
586 
587 		prealloc = NULL;
588 		if (err)
589 			goto out;
590 		if (state->end <= end) {
591 			state = clear_state_bit(tree, state, &bits, wake);
592 			goto next;
593 		}
594 		goto search_again;
595 	}
596 	/*
597 	 * | ---- desired range ---- |
598 	 *                        | state |
599 	 * We need to split the extent, and clear the bit
600 	 * on the first half
601 	 */
602 	if (state->start <= end && state->end > end) {
603 		prealloc = alloc_extent_state_atomic(prealloc);
604 		BUG_ON(!prealloc);
605 		err = split_state(tree, state, prealloc, end + 1);
606 		if (err)
607 			extent_io_tree_panic(tree, err);
608 
609 		if (wake)
610 			wake_up(&state->wq);
611 
612 		clear_state_bit(tree, prealloc, &bits, wake);
613 
614 		prealloc = NULL;
615 		goto out;
616 	}
617 
618 	state = clear_state_bit(tree, state, &bits, wake);
619 next:
620 	if (last_end == (u64)-1)
621 		goto out;
622 	start = last_end + 1;
623 	if (start <= end && state && !need_resched())
624 		goto hit_next;
625 	goto search_again;
626 
627 out:
628 	spin_unlock(&tree->lock);
629 	if (prealloc)
630 		free_extent_state(prealloc);
631 
632 	return 0;
633 
634 search_again:
635 	if (start > end)
636 		goto out;
637 	spin_unlock(&tree->lock);
638 	if (mask & __GFP_WAIT)
639 		cond_resched();
640 	goto again;
641 }
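/*
 * For example, clear_extent_bit() can drop the dirty and delalloc bits
 * over one page worth of a tree, without waking sleepers or using a
 * cached state, roughly like this ('page_start' stands for a
 * caller-provided offset):
 *
 *	clear_extent_bit(tree, page_start, page_start + PAGE_CACHE_SIZE - 1,
 *			 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL, GFP_NOFS);
 */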
642 
643 static void wait_on_state(struct extent_io_tree *tree,
644 			  struct extent_state *state)
645 		__releases(tree->lock)
646 		__acquires(tree->lock)
647 {
648 	DEFINE_WAIT(wait);
649 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
650 	spin_unlock(&tree->lock);
651 	schedule();
652 	spin_lock(&tree->lock);
653 	finish_wait(&state->wq, &wait);
654 }
655 
656 /*
657  * waits for one or more bits to clear on a range in the state tree.
658  * The range [start, end] is inclusive.
659  * The tree lock is taken by this function
660  */
661 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
662 			    unsigned long bits)
663 {
664 	struct extent_state *state;
665 	struct rb_node *node;
666 
667 	spin_lock(&tree->lock);
668 again:
669 	while (1) {
670 		/*
671 		 * this search will find all the extents that end after
672 		 * our range starts
673 		 */
674 		node = tree_search(tree, start);
675 		if (!node)
676 			break;
677 
678 		state = rb_entry(node, struct extent_state, rb_node);
679 
680 		if (state->start > end)
681 			goto out;
682 
683 		if (state->state & bits) {
684 			start = state->start;
685 			atomic_inc(&state->refs);
686 			wait_on_state(tree, state);
687 			free_extent_state(state);
688 			goto again;
689 		}
690 		start = state->end + 1;
691 
692 		if (start > end)
693 			break;
694 
695 		cond_resched_lock(&tree->lock);
696 	}
697 out:
698 	spin_unlock(&tree->lock);
699 }
700 
701 static void set_state_bits(struct extent_io_tree *tree,
702 			   struct extent_state *state,
703 			   unsigned long *bits)
704 {
705 	unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS;
706 
707 	set_state_cb(tree, state, bits);
708 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
709 		u64 range = state->end - state->start + 1;
710 		tree->dirty_bytes += range;
711 	}
712 	state->state |= bits_to_set;
713 }
714 
715 static void cache_state(struct extent_state *state,
716 			struct extent_state **cached_ptr)
717 {
718 	if (cached_ptr && !(*cached_ptr)) {
719 		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
720 			*cached_ptr = state;
721 			atomic_inc(&state->refs);
722 		}
723 	}
724 }
725 
726 static void uncache_state(struct extent_state **cached_ptr)
727 {
728 	if (cached_ptr && (*cached_ptr)) {
729 		struct extent_state *state = *cached_ptr;
730 		*cached_ptr = NULL;
731 		free_extent_state(state);
732 	}
733 }
734 
735 /*
736  * set some bits on a range in the tree.  This may require allocations or
737  * sleeping, so the gfp mask is used to indicate what is allowed.
738  *
739  * If any of the exclusive bits are set, this will fail with -EEXIST if some
740  * part of the range already has the desired bits set.  The start of the
741  * existing range is returned in failed_start in this case.
742  *
743  * [start, end] is inclusive.  This takes the tree lock.
744  */
745 
746 static int __must_check
747 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
748 		 unsigned long bits, unsigned long exclusive_bits,
749 		 u64 *failed_start, struct extent_state **cached_state,
750 		 gfp_t mask)
751 {
752 	struct extent_state *state;
753 	struct extent_state *prealloc = NULL;
754 	struct rb_node *node;
755 	int err = 0;
756 	u64 last_start;
757 	u64 last_end;
758 
759 	bits |= EXTENT_FIRST_DELALLOC;
760 again:
761 	if (!prealloc && (mask & __GFP_WAIT)) {
762 		prealloc = alloc_extent_state(mask);
763 		BUG_ON(!prealloc);
764 	}
765 
766 	spin_lock(&tree->lock);
767 	if (cached_state && *cached_state) {
768 		state = *cached_state;
769 		if (state->start <= start && state->end > start &&
770 		    state->tree) {
771 			node = &state->rb_node;
772 			goto hit_next;
773 		}
774 	}
775 	/*
776 	 * this search will find all the extents that end after
777 	 * our range starts.
778 	 */
779 	node = tree_search(tree, start);
780 	if (!node) {
781 		prealloc = alloc_extent_state_atomic(prealloc);
782 		BUG_ON(!prealloc);
783 		err = insert_state(tree, prealloc, start, end, &bits);
784 		if (err)
785 			extent_io_tree_panic(tree, err);
786 
787 		prealloc = NULL;
788 		goto out;
789 	}
790 	state = rb_entry(node, struct extent_state, rb_node);
791 hit_next:
792 	last_start = state->start;
793 	last_end = state->end;
794 
795 	/*
796 	 * | ---- desired range ---- |
797 	 * | state |
798 	 *
799 	 * Just lock what we found and keep going
800 	 */
801 	if (state->start == start && state->end <= end) {
802 		if (state->state & exclusive_bits) {
803 			*failed_start = state->start;
804 			err = -EEXIST;
805 			goto out;
806 		}
807 
808 		set_state_bits(tree, state, &bits);
809 		cache_state(state, cached_state);
810 		merge_state(tree, state);
811 		if (last_end == (u64)-1)
812 			goto out;
813 		start = last_end + 1;
814 		state = next_state(state);
815 		if (start < end && state && state->start == start &&
816 		    !need_resched())
817 			goto hit_next;
818 		goto search_again;
819 	}
820 
821 	/*
822 	 *     | ---- desired range ---- |
823 	 * | state |
824 	 *   or
825 	 * | ------------- state -------------- |
826 	 *
827 	 * We need to split the extent we found, and may flip bits on
828 	 * second half.
829 	 *
830 	 * If the extent we found extends past our
831 	 * range, we just split and search again.  It'll get split
832 	 * again the next time though.
833 	 *
834 	 * If the extent we found is inside our range, we set the
835 	 * desired bit on it.
836 	 */
837 	if (state->start < start) {
838 		if (state->state & exclusive_bits) {
839 			*failed_start = start;
840 			err = -EEXIST;
841 			goto out;
842 		}
843 
844 		prealloc = alloc_extent_state_atomic(prealloc);
845 		BUG_ON(!prealloc);
846 		err = split_state(tree, state, prealloc, start);
847 		if (err)
848 			extent_io_tree_panic(tree, err);
849 
850 		prealloc = NULL;
851 		if (err)
852 			goto out;
853 		if (state->end <= end) {
854 			set_state_bits(tree, state, &bits);
855 			cache_state(state, cached_state);
856 			merge_state(tree, state);
857 			if (last_end == (u64)-1)
858 				goto out;
859 			start = last_end + 1;
860 			state = next_state(state);
861 			if (start < end && state && state->start == start &&
862 			    !need_resched())
863 				goto hit_next;
864 		}
865 		goto search_again;
866 	}
867 	/*
868 	 * | ---- desired range ---- |
869 	 *     | state | or               | state |
870 	 *
871 	 * There's a hole, we need to insert something in it and
872 	 * ignore the extent we found.
873 	 */
874 	if (state->start > start) {
875 		u64 this_end;
876 		if (end < last_start)
877 			this_end = end;
878 		else
879 			this_end = last_start - 1;
880 
881 		prealloc = alloc_extent_state_atomic(prealloc);
882 		BUG_ON(!prealloc);
883 
884 		/*
885 		 * Avoid to free 'prealloc' if it can be merged with
886 		 * the later extent.
887 		 */
888 		err = insert_state(tree, prealloc, start, this_end,
889 				   &bits);
890 		if (err)
891 			extent_io_tree_panic(tree, err);
892 
893 		cache_state(prealloc, cached_state);
894 		prealloc = NULL;
895 		start = this_end + 1;
896 		goto search_again;
897 	}
898 	/*
899 	 * | ---- desired range ---- |
900 	 *                        | state |
901 	 * We need to split the extent, and set the bit
902 	 * on the first half
903 	 */
904 	if (state->start <= end && state->end > end) {
905 		if (state->state & exclusive_bits) {
906 			*failed_start = start;
907 			err = -EEXIST;
908 			goto out;
909 		}
910 
911 		prealloc = alloc_extent_state_atomic(prealloc);
912 		BUG_ON(!prealloc);
913 		err = split_state(tree, state, prealloc, end + 1);
914 		if (err)
915 			extent_io_tree_panic(tree, err);
916 
917 		set_state_bits(tree, prealloc, &bits);
918 		cache_state(prealloc, cached_state);
919 		merge_state(tree, prealloc);
920 		prealloc = NULL;
921 		goto out;
922 	}
923 
924 	goto search_again;
925 
926 out:
927 	spin_unlock(&tree->lock);
928 	if (prealloc)
929 		free_extent_state(prealloc);
930 
931 	return err;
932 
933 search_again:
934 	if (start > end)
935 		goto out;
936 	spin_unlock(&tree->lock);
937 	if (mask & __GFP_WAIT)
938 		cond_resched();
939 	goto again;
940 }
941 
942 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
943 		   unsigned long bits, u64 * failed_start,
944 		   struct extent_state **cached_state, gfp_t mask)
945 {
946 	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
947 				cached_state, mask);
948 }
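/*
 * For example, set_extent_bit() can mark a range dirty while caching the
 * resulting state for a later fast lookup, roughly:
 *
 *	struct extent_state *cached = NULL;
 *
 *	set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, &cached, GFP_NOFS);
 *	...
 *	free_extent_state(cached);
 */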
949 
950 
951 /**
952  * convert_extent_bit - convert all bits in a given range from one bit to
953  * 			another
954  * @tree:	the io tree to search
955  * @start:	the start offset in bytes
956  * @end:	the end offset in bytes (inclusive)
957  * @bits:	the bits to set in this range
958  * @clear_bits:	the bits to clear in this range
959  * @cached_state:	state that we're going to cache
960  * @mask:	the allocation mask
961  *
962  * This will go through and set bits for the given range.  If any states exist
963  * already in this range they are set with the given bit and cleared of the
964  * clear_bits.  This is only meant to be used by things that are mergeable, ie
965  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
966  * boundary bits like LOCK.
967  */
968 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
969 		       unsigned long bits, unsigned long clear_bits,
970 		       struct extent_state **cached_state, gfp_t mask)
971 {
972 	struct extent_state *state;
973 	struct extent_state *prealloc = NULL;
974 	struct rb_node *node;
975 	int err = 0;
976 	u64 last_start;
977 	u64 last_end;
978 
979 again:
980 	if (!prealloc && (mask & __GFP_WAIT)) {
981 		prealloc = alloc_extent_state(mask);
982 		if (!prealloc)
983 			return -ENOMEM;
984 	}
985 
986 	spin_lock(&tree->lock);
987 	if (cached_state && *cached_state) {
988 		state = *cached_state;
989 		if (state->start <= start && state->end > start &&
990 		    state->tree) {
991 			node = &state->rb_node;
992 			goto hit_next;
993 		}
994 	}
995 
996 	/*
997 	 * this search will find all the extents that end after
998 	 * our range starts.
999 	 */
1000 	node = tree_search(tree, start);
1001 	if (!node) {
1002 		prealloc = alloc_extent_state_atomic(prealloc);
1003 		if (!prealloc) {
1004 			err = -ENOMEM;
1005 			goto out;
1006 		}
1007 		err = insert_state(tree, prealloc, start, end, &bits);
1008 		prealloc = NULL;
1009 		if (err)
1010 			extent_io_tree_panic(tree, err);
1011 		goto out;
1012 	}
1013 	state = rb_entry(node, struct extent_state, rb_node);
1014 hit_next:
1015 	last_start = state->start;
1016 	last_end = state->end;
1017 
1018 	/*
1019 	 * | ---- desired range ---- |
1020 	 * | state |
1021 	 *
1022 	 * Just lock what we found and keep going
1023 	 */
1024 	if (state->start == start && state->end <= end) {
1025 		set_state_bits(tree, state, &bits);
1026 		cache_state(state, cached_state);
1027 		state = clear_state_bit(tree, state, &clear_bits, 0);
1028 		if (last_end == (u64)-1)
1029 			goto out;
1030 		start = last_end + 1;
1031 		if (start < end && state && state->start == start &&
1032 		    !need_resched())
1033 			goto hit_next;
1034 		goto search_again;
1035 	}
1036 
1037 	/*
1038 	 *     | ---- desired range ---- |
1039 	 * | state |
1040 	 *   or
1041 	 * | ------------- state -------------- |
1042 	 *
1043 	 * We need to split the extent we found, and may flip bits on
1044 	 * second half.
1045 	 *
1046 	 * If the extent we found extends past our
1047 	 * range, we just split and search again.  It'll get split
1048 	 * again the next time though.
1049 	 *
1050 	 * If the extent we found is inside our range, we set the
1051 	 * desired bit on it.
1052 	 */
1053 	if (state->start < start) {
1054 		prealloc = alloc_extent_state_atomic(prealloc);
1055 		if (!prealloc) {
1056 			err = -ENOMEM;
1057 			goto out;
1058 		}
1059 		err = split_state(tree, state, prealloc, start);
1060 		if (err)
1061 			extent_io_tree_panic(tree, err);
1062 		prealloc = NULL;
1063 		if (err)
1064 			goto out;
1065 		if (state->end <= end) {
1066 			set_state_bits(tree, state, &bits);
1067 			cache_state(state, cached_state);
1068 			state = clear_state_bit(tree, state, &clear_bits, 0);
1069 			if (last_end == (u64)-1)
1070 				goto out;
1071 			start = last_end + 1;
1072 			if (start < end && state && state->start == start &&
1073 			    !need_resched())
1074 				goto hit_next;
1075 		}
1076 		goto search_again;
1077 	}
1078 	/*
1079 	 * | ---- desired range ---- |
1080 	 *     | state | or               | state |
1081 	 *
1082 	 * There's a hole, we need to insert something in it and
1083 	 * ignore the extent we found.
1084 	 */
1085 	if (state->start > start) {
1086 		u64 this_end;
1087 		if (end < last_start)
1088 			this_end = end;
1089 		else
1090 			this_end = last_start - 1;
1091 
1092 		prealloc = alloc_extent_state_atomic(prealloc);
1093 		if (!prealloc) {
1094 			err = -ENOMEM;
1095 			goto out;
1096 		}
1097 
1098 		/*
1099 		 * Avoid to free 'prealloc' if it can be merged with
1100 		 * the later extent.
1101 		 */
1102 		err = insert_state(tree, prealloc, start, this_end,
1103 				   &bits);
1104 		if (err)
1105 			extent_io_tree_panic(tree, err);
1106 		cache_state(prealloc, cached_state);
1107 		prealloc = NULL;
1108 		start = this_end + 1;
1109 		goto search_again;
1110 	}
1111 	/*
1112 	 * | ---- desired range ---- |
1113 	 *                        | state |
1114 	 * We need to split the extent, and set the bit
1115 	 * on the first half
1116 	 */
1117 	if (state->start <= end && state->end > end) {
1118 		prealloc = alloc_extent_state_atomic(prealloc);
1119 		if (!prealloc) {
1120 			err = -ENOMEM;
1121 			goto out;
1122 		}
1123 
1124 		err = split_state(tree, state, prealloc, end + 1);
1125 		if (err)
1126 			extent_io_tree_panic(tree, err);
1127 
1128 		set_state_bits(tree, prealloc, &bits);
1129 		cache_state(prealloc, cached_state);
1130 		clear_state_bit(tree, prealloc, &clear_bits, 0);
1131 		prealloc = NULL;
1132 		goto out;
1133 	}
1134 
1135 	goto search_again;
1136 
1137 out:
1138 	spin_unlock(&tree->lock);
1139 	if (prealloc)
1140 		free_extent_state(prealloc);
1141 
1142 	return err;
1143 
1144 search_again:
1145 	if (start > end)
1146 		goto out;
1147 	spin_unlock(&tree->lock);
1148 	if (mask & __GFP_WAIT)
1149 		cond_resched();
1150 	goto again;
1151 }
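/*
 * For example, converting a mergeable range from DELALLOC to DIRTY, as
 * described above, looks roughly like:
 *
 *	convert_extent_bit(tree, start, end, EXTENT_DIRTY, EXTENT_DELALLOC,
 *			   &cached_state, GFP_NOFS);
 */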
1152 
1153 /* wrappers around set/clear extent bit */
1154 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1155 		     gfp_t mask)
1156 {
1157 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1158 			      NULL, mask);
1159 }
1160 
1161 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1162 		    unsigned long bits, gfp_t mask)
1163 {
1164 	return set_extent_bit(tree, start, end, bits, NULL,
1165 			      NULL, mask);
1166 }
1167 
1168 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1169 		      unsigned long bits, gfp_t mask)
1170 {
1171 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1172 }
1173 
1174 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1175 			struct extent_state **cached_state, gfp_t mask)
1176 {
1177 	return set_extent_bit(tree, start, end,
1178 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1179 			      NULL, cached_state, mask);
1180 }
1181 
1182 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1183 		      struct extent_state **cached_state, gfp_t mask)
1184 {
1185 	return set_extent_bit(tree, start, end,
1186 			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1187 			      NULL, cached_state, mask);
1188 }
1189 
1190 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1191 		       gfp_t mask)
1192 {
1193 	return clear_extent_bit(tree, start, end,
1194 				EXTENT_DIRTY | EXTENT_DELALLOC |
1195 				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1196 }
1197 
1198 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1199 		     gfp_t mask)
1200 {
1201 	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1202 			      NULL, mask);
1203 }
1204 
1205 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1206 			struct extent_state **cached_state, gfp_t mask)
1207 {
1208 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
1209 			      cached_state, mask);
1210 }
1211 
1212 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1213 			  struct extent_state **cached_state, gfp_t mask)
1214 {
1215 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1216 				cached_state, mask);
1217 }
1218 
1219 /*
1220  * either insert or lock the state struct between start and end.  This
1221  * always uses GFP_NOFS and waits until the whole range is locked.
1222  */
1223 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1224 		     unsigned long bits, struct extent_state **cached_state)
1225 {
1226 	int err;
1227 	u64 failed_start;
1228 	while (1) {
1229 		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1230 				       EXTENT_LOCKED, &failed_start,
1231 				       cached_state, GFP_NOFS);
1232 		if (err == -EEXIST) {
1233 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1234 			start = failed_start;
1235 		} else
1236 			break;
1237 		WARN_ON(start > end);
1238 	}
1239 	return err;
1240 }
1241 
1242 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1243 {
1244 	return lock_extent_bits(tree, start, end, 0, NULL);
1245 }
1246 
1247 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1248 {
1249 	int err;
1250 	u64 failed_start;
1251 
1252 	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1253 			       &failed_start, NULL, GFP_NOFS);
1254 	if (err == -EEXIST) {
1255 		if (failed_start > start)
1256 			clear_extent_bit(tree, start, failed_start - 1,
1257 					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1258 		return 0;
1259 	}
1260 	return 1;
1261 }
1262 
1263 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1264 			 struct extent_state **cached, gfp_t mask)
1265 {
1266 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1267 				mask);
1268 }
1269 
1270 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1271 {
1272 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1273 				GFP_NOFS);
1274 }
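/*
 * The common pattern is to lock a byte range in an inode's io tree around
 * an operation and unlock it afterwards, roughly:
 *
 *	lock_extent(&BTRFS_I(inode)->io_tree, start, end);
 *	... work on [start, end] ...
 *	unlock_extent(&BTRFS_I(inode)->io_tree, start, end);
 */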
1275 
1276 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1277 {
1278 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1279 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1280 	struct page *page;
1281 
1282 	while (index <= end_index) {
1283 		page = find_get_page(inode->i_mapping, index);
1284 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1285 		clear_page_dirty_for_io(page);
1286 		page_cache_release(page);
1287 		index++;
1288 	}
1289 	return 0;
1290 }
1291 
1292 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1293 {
1294 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1295 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1296 	struct page *page;
1297 
1298 	while (index <= end_index) {
1299 		page = find_get_page(inode->i_mapping, index);
1300 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1301 		account_page_redirty(page);
1302 		__set_page_dirty_nobuffers(page);
1303 		page_cache_release(page);
1304 		index++;
1305 	}
1306 	return 0;
1307 }
1308 
1309 /*
1310  * helper function to set both pages and extents in the tree writeback
1311  */
1312 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1313 {
1314 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1315 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1316 	struct page *page;
1317 
1318 	while (index <= end_index) {
1319 		page = find_get_page(tree->mapping, index);
1320 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1321 		set_page_writeback(page);
1322 		page_cache_release(page);
1323 		index++;
1324 	}
1325 	return 0;
1326 }
1327 
1328 /* find the first state struct with 'bits' set after 'start', and
1329  * return it.  tree->lock must be held.  NULL will be returned if
1330  * nothing was found after 'start'.
1331  */
1332 static struct extent_state *
1333 find_first_extent_bit_state(struct extent_io_tree *tree,
1334 			    u64 start, unsigned long bits)
1335 {
1336 	struct rb_node *node;
1337 	struct extent_state *state;
1338 
1339 	/*
1340 	 * this search will find all the extents that end after
1341 	 * our range starts.
1342 	 */
1343 	node = tree_search(tree, start);
1344 	if (!node)
1345 		goto out;
1346 
1347 	while (1) {
1348 		state = rb_entry(node, struct extent_state, rb_node);
1349 		if (state->end >= start && (state->state & bits))
1350 			return state;
1351 
1352 		node = rb_next(node);
1353 		if (!node)
1354 			break;
1355 	}
1356 out:
1357 	return NULL;
1358 }
1359 
1360 /*
1361  * find the first offset in the io tree with 'bits' set. zero is
1362  * returned if we find something, and *start_ret and *end_ret are
1363  * set to reflect the state struct that was found.
1364  *
1365  * If nothing was found, 1 is returned; if something was found, 0 is returned.
1366  */
1367 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1368 			  u64 *start_ret, u64 *end_ret, unsigned long bits,
1369 			  struct extent_state **cached_state)
1370 {
1371 	struct extent_state *state;
1372 	struct rb_node *n;
1373 	int ret = 1;
1374 
1375 	spin_lock(&tree->lock);
1376 	if (cached_state && *cached_state) {
1377 		state = *cached_state;
1378 		if (state->end == start - 1 && state->tree) {
1379 			n = rb_next(&state->rb_node);
1380 			while (n) {
1381 				state = rb_entry(n, struct extent_state,
1382 						 rb_node);
1383 				if (state->state & bits)
1384 					goto got_it;
1385 				n = rb_next(n);
1386 			}
1387 			free_extent_state(*cached_state);
1388 			*cached_state = NULL;
1389 			goto out;
1390 		}
1391 		free_extent_state(*cached_state);
1392 		*cached_state = NULL;
1393 	}
1394 
1395 	state = find_first_extent_bit_state(tree, start, bits);
1396 got_it:
1397 	if (state) {
1398 		cache_state(state, cached_state);
1399 		*start_ret = state->start;
1400 		*end_ret = state->end;
1401 		ret = 0;
1402 	}
1403 out:
1404 	spin_unlock(&tree->lock);
1405 	return ret;
1406 }
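/*
 * For example, scanning for the next dirty range at or after 'cur'
 * ('found_start' and 'found_end' are caller-provided u64s):
 *
 *	if (!find_first_extent_bit(tree, cur, &found_start, &found_end,
 *				   EXTENT_DIRTY, NULL))
 *		... [found_start, found_end] has EXTENT_DIRTY set ...
 */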
1407 
1408 /*
1409  * find a contiguous range of bytes in the file marked as delalloc, not
1410  * more than 'max_bytes'.  start and end are used to return the range,
1411  *
1412  * 1 is returned if we find something, 0 if nothing was in the tree
1413  */
1414 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1415 					u64 *start, u64 *end, u64 max_bytes,
1416 					struct extent_state **cached_state)
1417 {
1418 	struct rb_node *node;
1419 	struct extent_state *state;
1420 	u64 cur_start = *start;
1421 	u64 found = 0;
1422 	u64 total_bytes = 0;
1423 
1424 	spin_lock(&tree->lock);
1425 
1426 	/*
1427 	 * this search will find all the extents that end after
1428 	 * our range starts.
1429 	 */
1430 	node = tree_search(tree, cur_start);
1431 	if (!node) {
1432 		if (!found)
1433 			*end = (u64)-1;
1434 		goto out;
1435 	}
1436 
1437 	while (1) {
1438 		state = rb_entry(node, struct extent_state, rb_node);
1439 		if (found && (state->start != cur_start ||
1440 			      (state->state & EXTENT_BOUNDARY))) {
1441 			goto out;
1442 		}
1443 		if (!(state->state & EXTENT_DELALLOC)) {
1444 			if (!found)
1445 				*end = state->end;
1446 			goto out;
1447 		}
1448 		if (!found) {
1449 			*start = state->start;
1450 			*cached_state = state;
1451 			atomic_inc(&state->refs);
1452 		}
1453 		found++;
1454 		*end = state->end;
1455 		cur_start = state->end + 1;
1456 		node = rb_next(node);
1457 		if (!node)
1458 			break;
1459 		total_bytes += state->end - state->start + 1;
1460 		if (total_bytes >= max_bytes)
1461 			break;
1462 	}
1463 out:
1464 	spin_unlock(&tree->lock);
1465 	return found;
1466 }
1467 
1468 static noinline void __unlock_for_delalloc(struct inode *inode,
1469 					   struct page *locked_page,
1470 					   u64 start, u64 end)
1471 {
1472 	int ret;
1473 	struct page *pages[16];
1474 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1475 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1476 	unsigned long nr_pages = end_index - index + 1;
1477 	int i;
1478 
1479 	if (index == locked_page->index && end_index == index)
1480 		return;
1481 
1482 	while (nr_pages > 0) {
1483 		ret = find_get_pages_contig(inode->i_mapping, index,
1484 				     min_t(unsigned long, nr_pages,
1485 				     ARRAY_SIZE(pages)), pages);
1486 		for (i = 0; i < ret; i++) {
1487 			if (pages[i] != locked_page)
1488 				unlock_page(pages[i]);
1489 			page_cache_release(pages[i]);
1490 		}
1491 		nr_pages -= ret;
1492 		index += ret;
1493 		cond_resched();
1494 	}
1495 }
1496 
1497 static noinline int lock_delalloc_pages(struct inode *inode,
1498 					struct page *locked_page,
1499 					u64 delalloc_start,
1500 					u64 delalloc_end)
1501 {
1502 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1503 	unsigned long start_index = index;
1504 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1505 	unsigned long pages_locked = 0;
1506 	struct page *pages[16];
1507 	unsigned long nrpages;
1508 	int ret;
1509 	int i;
1510 
1511 	/* the caller is responsible for locking the start index */
1512 	if (index == locked_page->index && index == end_index)
1513 		return 0;
1514 
1515 	/* skip the page at the start index */
1516 	nrpages = end_index - index + 1;
1517 	while (nrpages > 0) {
1518 		ret = find_get_pages_contig(inode->i_mapping, index,
1519 				     min_t(unsigned long,
1520 				     nrpages, ARRAY_SIZE(pages)), pages);
1521 		if (ret == 0) {
1522 			ret = -EAGAIN;
1523 			goto done;
1524 		}
1525 		/* now we have an array of pages, lock them all */
1526 		for (i = 0; i < ret; i++) {
1527 			/*
1528 			 * the caller is taking responsibility for
1529 			 * locked_page
1530 			 */
1531 			if (pages[i] != locked_page) {
1532 				lock_page(pages[i]);
1533 				if (!PageDirty(pages[i]) ||
1534 				    pages[i]->mapping != inode->i_mapping) {
1535 					ret = -EAGAIN;
1536 					unlock_page(pages[i]);
1537 					page_cache_release(pages[i]);
1538 					goto done;
1539 				}
1540 			}
1541 			page_cache_release(pages[i]);
1542 			pages_locked++;
1543 		}
1544 		nrpages -= ret;
1545 		index += ret;
1546 		cond_resched();
1547 	}
1548 	ret = 0;
1549 done:
1550 	if (ret && pages_locked) {
1551 		__unlock_for_delalloc(inode, locked_page,
1552 			      delalloc_start,
1553 			      ((u64)(start_index + pages_locked - 1)) <<
1554 			      PAGE_CACHE_SHIFT);
1555 	}
1556 	return ret;
1557 }
1558 
1559 /*
1560  * find a contiguous range of bytes in the file marked as delalloc, not
1561  * more than 'max_bytes'.  'start' and 'end' are used to return the range.
1562  *
1563  * Non-zero is returned if we find something, 0 if nothing was in the tree
1564  */
1565 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1566 					     struct extent_io_tree *tree,
1567 					     struct page *locked_page,
1568 					     u64 *start, u64 *end,
1569 					     u64 max_bytes)
1570 {
1571 	u64 delalloc_start;
1572 	u64 delalloc_end;
1573 	u64 found;
1574 	struct extent_state *cached_state = NULL;
1575 	int ret;
1576 	int loops = 0;
1577 
1578 again:
1579 	/* step one, find a bunch of delalloc bytes starting at start */
1580 	delalloc_start = *start;
1581 	delalloc_end = 0;
1582 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1583 				    max_bytes, &cached_state);
1584 	if (!found || delalloc_end <= *start) {
1585 		*start = delalloc_start;
1586 		*end = delalloc_end;
1587 		free_extent_state(cached_state);
1588 		return found;
1589 	}
1590 
1591 	/*
1592 	 * start comes from the offset of locked_page.  We have to lock
1593 	 * pages in order, so we can't process delalloc bytes before
1594 	 * locked_page
1595 	 */
1596 	if (delalloc_start < *start)
1597 		delalloc_start = *start;
1598 
1599 	/*
1600 	 * make sure to limit the number of pages we try to lock down
1601 	 * if we're looping.
1602 	 */
1603 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1604 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1605 
1606 	/* step two, lock all the pages after the page that has start */
1607 	ret = lock_delalloc_pages(inode, locked_page,
1608 				  delalloc_start, delalloc_end);
1609 	if (ret == -EAGAIN) {
1610 		/* some of the pages are gone, lets avoid looping by
1611 		 * shortening the size of the delalloc range we're searching
1612 		 */
1613 		free_extent_state(cached_state);
1614 		if (!loops) {
1615 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1616 			max_bytes = PAGE_CACHE_SIZE - offset;
1617 			loops = 1;
1618 			goto again;
1619 		} else {
1620 			found = 0;
1621 			goto out_failed;
1622 		}
1623 	}
1624 	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1625 
1626 	/* step three, lock the state bits for the whole range */
1627 	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1628 
1629 	/* then test to make sure it is all still delalloc */
1630 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1631 			     EXTENT_DELALLOC, 1, cached_state);
1632 	if (!ret) {
1633 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1634 				     &cached_state, GFP_NOFS);
1635 		__unlock_for_delalloc(inode, locked_page,
1636 			      delalloc_start, delalloc_end);
1637 		cond_resched();
1638 		goto again;
1639 	}
1640 	free_extent_state(cached_state);
1641 	*start = delalloc_start;
1642 	*end = delalloc_end;
1643 out_failed:
1644 	return found;
1645 }
1646 
1647 int extent_clear_unlock_delalloc(struct inode *inode,
1648 				struct extent_io_tree *tree,
1649 				u64 start, u64 end, struct page *locked_page,
1650 				unsigned long op)
1651 {
1652 	int ret;
1653 	struct page *pages[16];
1654 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1655 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1656 	unsigned long nr_pages = end_index - index + 1;
1657 	int i;
1658 	unsigned long clear_bits = 0;
1659 
1660 	if (op & EXTENT_CLEAR_UNLOCK)
1661 		clear_bits |= EXTENT_LOCKED;
1662 	if (op & EXTENT_CLEAR_DIRTY)
1663 		clear_bits |= EXTENT_DIRTY;
1664 
1665 	if (op & EXTENT_CLEAR_DELALLOC)
1666 		clear_bits |= EXTENT_DELALLOC;
1667 
1668 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1669 	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1670 		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1671 		    EXTENT_SET_PRIVATE2)))
1672 		return 0;
1673 
1674 	while (nr_pages > 0) {
1675 		ret = find_get_pages_contig(inode->i_mapping, index,
1676 				     min_t(unsigned long,
1677 				     nr_pages, ARRAY_SIZE(pages)), pages);
1678 		for (i = 0; i < ret; i++) {
1679 
1680 			if (op & EXTENT_SET_PRIVATE2)
1681 				SetPagePrivate2(pages[i]);
1682 
1683 			if (pages[i] == locked_page) {
1684 				page_cache_release(pages[i]);
1685 				continue;
1686 			}
1687 			if (op & EXTENT_CLEAR_DIRTY)
1688 				clear_page_dirty_for_io(pages[i]);
1689 			if (op & EXTENT_SET_WRITEBACK)
1690 				set_page_writeback(pages[i]);
1691 			if (op & EXTENT_END_WRITEBACK)
1692 				end_page_writeback(pages[i]);
1693 			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1694 				unlock_page(pages[i]);
1695 			page_cache_release(pages[i]);
1696 		}
1697 		nr_pages -= ret;
1698 		index += ret;
1699 		cond_resched();
1700 	}
1701 	return 0;
1702 }
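/*
 * For example, tearing down a delalloc range typically clears the
 * lock/dirty/delalloc bits and unlocks every page in one call, roughly:
 *
 *	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 *				     start, end, locked_page,
 *				     EXTENT_CLEAR_UNLOCK_PAGE |
 *				     EXTENT_CLEAR_UNLOCK |
 *				     EXTENT_CLEAR_DELALLOC |
 *				     EXTENT_CLEAR_DIRTY);
 */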
1703 
1704 /*
1705  * count the number of bytes in the tree that have a given bit(s)
1706  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1707  * cached.  The total number found is returned.
1708  */
1709 u64 count_range_bits(struct extent_io_tree *tree,
1710 		     u64 *start, u64 search_end, u64 max_bytes,
1711 		     unsigned long bits, int contig)
1712 {
1713 	struct rb_node *node;
1714 	struct extent_state *state;
1715 	u64 cur_start = *start;
1716 	u64 total_bytes = 0;
1717 	u64 last = 0;
1718 	int found = 0;
1719 
1720 	if (search_end <= cur_start) {
1721 		WARN_ON(1);
1722 		return 0;
1723 	}
1724 
1725 	spin_lock(&tree->lock);
1726 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1727 		total_bytes = tree->dirty_bytes;
1728 		goto out;
1729 	}
1730 	/*
1731 	 * this search will find all the extents that end after
1732 	 * our range starts.
1733 	 */
1734 	node = tree_search(tree, cur_start);
1735 	if (!node)
1736 		goto out;
1737 
1738 	while (1) {
1739 		state = rb_entry(node, struct extent_state, rb_node);
1740 		if (state->start > search_end)
1741 			break;
1742 		if (contig && found && state->start > last + 1)
1743 			break;
1744 		if (state->end >= cur_start && (state->state & bits) == bits) {
1745 			total_bytes += min(search_end, state->end) + 1 -
1746 				       max(cur_start, state->start);
1747 			if (total_bytes >= max_bytes)
1748 				break;
1749 			if (!found) {
1750 				*start = max(cur_start, state->start);
1751 				found = 1;
1752 			}
1753 			last = state->end;
1754 		} else if (contig && found) {
1755 			break;
1756 		}
1757 		node = rb_next(node);
1758 		if (!node)
1759 			break;
1760 	}
1761 out:
1762 	spin_unlock(&tree->lock);
1763 	return total_bytes;
1764 }
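/*
 * For example, counting how many bytes of [start, search_end] are marked
 * delalloc, stopping at the first gap, looks roughly like:
 *
 *	u64 from = start;
 *	u64 bytes = count_range_bits(tree, &from, search_end, (u64)-1,
 *				     EXTENT_DELALLOC, 1);
 */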
1765 
1766 /*
1767  * set the private field for the extent_state that starts at the given byte
1768  * offset.  If there isn't one starting exactly there, -ENOENT is returned.
1769  */
1770 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1771 {
1772 	struct rb_node *node;
1773 	struct extent_state *state;
1774 	int ret = 0;
1775 
1776 	spin_lock(&tree->lock);
1777 	/*
1778 	 * this search will find all the extents that end after
1779 	 * our range starts.
1780 	 */
1781 	node = tree_search(tree, start);
1782 	if (!node) {
1783 		ret = -ENOENT;
1784 		goto out;
1785 	}
1786 	state = rb_entry(node, struct extent_state, rb_node);
1787 	if (state->start != start) {
1788 		ret = -ENOENT;
1789 		goto out;
1790 	}
1791 	state->private = private;
1792 out:
1793 	spin_unlock(&tree->lock);
1794 	return ret;
1795 }
1796 
1797 void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
1798 			    int count)
1799 {
1800 	struct rb_node *node;
1801 	struct extent_state *state;
1802 
1803 	spin_lock(&tree->lock);
1804 	/*
1805 	 * this search will find all the extents that end after
1806 	 * our range starts.
1807 	 */
1808 	node = tree_search(tree, start);
1809 	BUG_ON(!node);
1810 
1811 	state = rb_entry(node, struct extent_state, rb_node);
1812 	BUG_ON(state->start != start);
1813 
1814 	while (count) {
1815 		state->private = *csums++;
1816 		count--;
1817 		state = next_state(state);
1818 	}
1819 	spin_unlock(&tree->lock);
1820 }
1821 
1822 static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index)
1823 {
1824 	struct bio_vec *bvec = bio->bi_io_vec + bio_index;
1825 
1826 	return page_offset(bvec->bv_page) + bvec->bv_offset;
1827 }
1828 
1829 void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index,
1830 			u32 csums[], int count)
1831 {
1832 	struct rb_node *node;
1833 	struct extent_state *state = NULL;
1834 	u64 start;
1835 
1836 	spin_lock(&tree->lock);
1837 	do {
1838 		start = __btrfs_get_bio_offset(bio, bio_index);
1839 		if (state == NULL || state->start != start) {
1840 			node = tree_search(tree, start);
1841 			BUG_ON(!node);
1842 
1843 			state = rb_entry(node, struct extent_state, rb_node);
1844 			BUG_ON(state->start != start);
1845 		}
1846 		state->private = *csums++;
1847 		count--;
1848 		bio_index++;
1849 
1850 		state = next_state(state);
1851 	} while (count);
1852 	spin_unlock(&tree->lock);
1853 }
1854 
1855 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1856 {
1857 	struct rb_node *node;
1858 	struct extent_state *state;
1859 	int ret = 0;
1860 
1861 	spin_lock(&tree->lock);
1862 	/*
1863 	 * this search will find all the extents that end after
1864 	 * our range starts.
1865 	 */
1866 	node = tree_search(tree, start);
1867 	if (!node) {
1868 		ret = -ENOENT;
1869 		goto out;
1870 	}
1871 	state = rb_entry(node, struct extent_state, rb_node);
1872 	if (state->start != start) {
1873 		ret = -ENOENT;
1874 		goto out;
1875 	}
1876 	*private = state->private;
1877 out:
1878 	spin_unlock(&tree->lock);
1879 	return ret;
1880 }
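/*
 * The private field is per-state scratch space (this file uses it to
 * cache checksums).  A round trip looks roughly like ('value' stands for
 * a caller-provided u64):
 *
 *	u64 stored;
 *
 *	set_state_private(tree, start, value);
 *	...
 *	if (!get_state_private(tree, start, &stored))
 *		... 'stored' now equals 'value' ...
 */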
1881 
1882 /*
1883  * searches a range in the state tree for a given mask.
1884  * If 'filled' == 1, this returns 1 only if every extent in the tree
1885  * has the bits set.  Otherwise, 1 is returned if any bit in the
1886  * range is found set.
1887  */
1888 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1889 		   unsigned long bits, int filled, struct extent_state *cached)
1890 {
1891 	struct extent_state *state = NULL;
1892 	struct rb_node *node;
1893 	int bitset = 0;
1894 
1895 	spin_lock(&tree->lock);
1896 	if (cached && cached->tree && cached->start <= start &&
1897 	    cached->end > start)
1898 		node = &cached->rb_node;
1899 	else
1900 		node = tree_search(tree, start);
1901 	while (node && start <= end) {
1902 		state = rb_entry(node, struct extent_state, rb_node);
1903 
1904 		if (filled && state->start > start) {
1905 			bitset = 0;
1906 			break;
1907 		}
1908 
1909 		if (state->start > end)
1910 			break;
1911 
1912 		if (state->state & bits) {
1913 			bitset = 1;
1914 			if (!filled)
1915 				break;
1916 		} else if (filled) {
1917 			bitset = 0;
1918 			break;
1919 		}
1920 
1921 		if (state->end == (u64)-1)
1922 			break;
1923 
1924 		start = state->end + 1;
1925 		if (start > end)
1926 			break;
1927 		node = rb_next(node);
1928 		if (!node) {
1929 			if (filled)
1930 				bitset = 0;
1931 			break;
1932 		}
1933 	}
1934 	spin_unlock(&tree->lock);
1935 	return bitset;
1936 }
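/*
 * For example, checking that an entire page range is uptodate (the same
 * pattern check_page_uptodate() below uses):
 *
 *	if (test_range_bit(tree, start, start + PAGE_CACHE_SIZE - 1,
 *			   EXTENT_UPTODATE, 1, NULL))
 *		SetPageUptodate(page);
 */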
1937 
1938 /*
1939  * helper function to set a given page up to date if all the
1940  * extents in the tree for that page are up to date
1941  */
1942 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1943 {
1944 	u64 start = page_offset(page);
1945 	u64 end = start + PAGE_CACHE_SIZE - 1;
1946 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1947 		SetPageUptodate(page);
1948 }
1949 
1950 /*
1951  * helper function to unlock a page if all the extents in the tree
1952  * for that page are unlocked
1953  */
1954 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1955 {
1956 	u64 start = page_offset(page);
1957 	u64 end = start + PAGE_CACHE_SIZE - 1;
1958 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1959 		unlock_page(page);
1960 }
1961 
1962 /*
1963  * helper function used when part of a page finishes writeback; writeback
1964  * is simply ended on the whole page here
1965  */
1966 static void check_page_writeback(struct extent_io_tree *tree,
1967 				 struct page *page)
1968 {
1969 	end_page_writeback(page);
1970 }
1971 
1972 /*
1973  * When IO fails, either with EIO or csum verification fails, we
1974  * try other mirrors that might have a good copy of the data.  This
1975  * io_failure_record is used to record state as we go through all the
1976  * mirrors.  If another mirror has good data, the page is set up to date
1977  * and things continue.  If a good mirror can't be found, the original
1978  * bio end_io callback is called to indicate things have failed.
1979  */
1980 struct io_failure_record {
1981 	struct page *page;
1982 	u64 start;
1983 	u64 len;
1984 	u64 logical;
1985 	unsigned long bio_flags;
1986 	int this_mirror;
1987 	int failed_mirror;
1988 	int in_validation;
1989 };
1990 
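/*
 * drop an io_failure_record: clear its bits in the failure tree and the
 * EXTENT_DAMAGED bit in the inode io tree, then free the record itself
 */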
1991 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1992 				int did_repair)
1993 {
1994 	int ret;
1995 	int err = 0;
1996 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1997 
1998 	set_state_private(failure_tree, rec->start, 0);
1999 	ret = clear_extent_bits(failure_tree, rec->start,
2000 				rec->start + rec->len - 1,
2001 				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2002 	if (ret)
2003 		err = ret;
2004 
2005 	ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
2006 				rec->start + rec->len - 1,
2007 				EXTENT_DAMAGED, GFP_NOFS);
2008 	if (ret && !err)
2009 		err = ret;
2010 
2011 	kfree(rec);
2012 	return err;
2013 }
2014 
2015 static void repair_io_failure_callback(struct bio *bio, int err)
2016 {
2017 	complete(bio->bi_private);
2018 }
2019 
2020 /*
2021  * this bypasses the standard btrfs submit functions deliberately, as
2022  * the standard behavior is to write all copies in a raid setup. here we only
2023  * want to write the one bad copy. so we do the mapping for ourselves and issue
2024  * submit_bio directly.
2025  * to avoid any synchronization issues, wait for the data after writing, which
2026  * actually prevents the read that triggered the error from finishing.
2027  * currently, there can be no more than two copies of every data bit. thus,
2028  * exactly one rewrite is required.
2029  */
2030 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
2031 			u64 length, u64 logical, struct page *page,
2032 			int mirror_num)
2033 {
2034 	struct bio *bio;
2035 	struct btrfs_device *dev;
2036 	DECLARE_COMPLETION_ONSTACK(compl);
2037 	u64 map_length = 0;
2038 	u64 sector;
2039 	struct btrfs_bio *bbio = NULL;
2040 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
2041 	int ret;
2042 
2043 	BUG_ON(!mirror_num);
2044 
2045 	/* we can't repair anything in raid56 yet */
2046 	if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num))
2047 		return 0;
2048 
2049 	bio = bio_alloc(GFP_NOFS, 1);
2050 	if (!bio)
2051 		return -EIO;
2052 	bio->bi_private = &compl;
2053 	bio->bi_end_io = repair_io_failure_callback;
2054 	bio->bi_size = 0;
2055 	map_length = length;
2056 
2057 	ret = btrfs_map_block(fs_info, WRITE, logical,
2058 			      &map_length, &bbio, mirror_num);
2059 	if (ret) {
2060 		bio_put(bio);
2061 		return -EIO;
2062 	}
2063 	BUG_ON(mirror_num != bbio->mirror_num);
2064 	sector = bbio->stripes[mirror_num-1].physical >> 9;
2065 	bio->bi_sector = sector;
2066 	dev = bbio->stripes[mirror_num-1].dev;
2067 	kfree(bbio);
2068 	if (!dev || !dev->bdev || !dev->writeable) {
2069 		bio_put(bio);
2070 		return -EIO;
2071 	}
2072 	bio->bi_bdev = dev->bdev;
2073 	bio_add_page(bio, page, length, start - page_offset(page));
2074 	btrfsic_submit_bio(WRITE_SYNC, bio);
2075 	wait_for_completion(&compl);
2076 
2077 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2078 		/* try to remap that extent elsewhere? */
2079 		bio_put(bio);
2080 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2081 		return -EIO;
2082 	}
2083 
2084 	printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
2085 		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
2086 		      start, rcu_str_deref(dev->name), sector);
2087 
2088 	bio_put(bio);
2089 	return 0;
2090 }
2091 
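/*
 * rewrite each page of an extent buffer to the given mirror, one page at a
 * time, via repair_io_failure()
 */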
2092 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
2093 			 int mirror_num)
2094 {
2095 	u64 start = eb->start;
2096 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
2097 	int ret = 0;
2098 
2099 	for (i = 0; i < num_pages; i++) {
2100 		struct page *p = extent_buffer_page(eb, i);
2101 		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
2102 					start, p, mirror_num);
2103 		if (ret)
2104 			break;
2105 		start += PAGE_CACHE_SIZE;
2106 	}
2107 
2108 	return ret;
2109 }
2110 
2111 /*
2112  * each time an IO finishes, we do a fast check in the IO failure tree
2113  * to see if we need to process or clean up an io_failure_record
2114  */
2115 static int clean_io_failure(u64 start, struct page *page)
2116 {
2117 	u64 private;
2118 	u64 private_failure;
2119 	struct io_failure_record *failrec;
2120 	struct btrfs_fs_info *fs_info;
2121 	struct extent_state *state;
2122 	int num_copies;
2123 	int did_repair = 0;
2124 	int ret;
2125 	struct inode *inode = page->mapping->host;
2126 
2127 	private = 0;
2128 	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2129 				(u64)-1, 1, EXTENT_DIRTY, 0);
2130 	if (!ret)
2131 		return 0;
2132 
2133 	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2134 				&private_failure);
2135 	if (ret)
2136 		return 0;
2137 
2138 	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2139 	BUG_ON(!failrec->this_mirror);
2140 
2141 	if (failrec->in_validation) {
2142 		/* there was no real error, just free the record */
2143 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2144 			 failrec->start);
2145 		did_repair = 1;
2146 		goto out;
2147 	}
2148 
2149 	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2150 	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2151 					    failrec->start,
2152 					    EXTENT_LOCKED);
2153 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2154 
2155 	if (state && state->start == failrec->start) {
2156 		fs_info = BTRFS_I(inode)->root->fs_info;
2157 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
2158 					      failrec->len);
2159 		if (num_copies > 1)  {
2160 			ret = repair_io_failure(fs_info, start, failrec->len,
2161 						failrec->logical, page,
2162 						failrec->failed_mirror);
2163 			did_repair = !ret;
2164 		}
2165 		ret = 0;
2166 	}
2167 
2168 out:
2169 	if (!ret)
2170 		ret = free_io_failure(inode, failrec, did_repair);
2171 
2172 	return ret;
2173 }
2174 
2175 /*
2176  * this is a generic handler for readpage errors (default
2177  * readpage_io_failed_hook). if other copies exist, read those and write back
2178  * good data to the failed position. it does not attempt to remap the
2179  * failed extent elsewhere, hoping the device will be smart enough to do
2180  * this as needed.
2181  */
2182 
2183 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2184 				u64 start, u64 end, int failed_mirror,
2185 				struct extent_state *state)
2186 {
2187 	struct io_failure_record *failrec = NULL;
2188 	u64 private;
2189 	struct extent_map *em;
2190 	struct inode *inode = page->mapping->host;
2191 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2192 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2193 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2194 	struct bio *bio;
2195 	int num_copies;
2196 	int ret;
2197 	int read_mode;
2198 	u64 logical;
2199 
2200 	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2201 
2202 	ret = get_state_private(failure_tree, start, &private);
2203 	if (ret) {
2204 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2205 		if (!failrec)
2206 			return -ENOMEM;
2207 		failrec->start = start;
2208 		failrec->len = end - start + 1;
2209 		failrec->this_mirror = 0;
2210 		failrec->bio_flags = 0;
2211 		failrec->in_validation = 0;
2212 
2213 		read_lock(&em_tree->lock);
2214 		em = lookup_extent_mapping(em_tree, start, failrec->len);
2215 		if (!em) {
2216 			read_unlock(&em_tree->lock);
2217 			kfree(failrec);
2218 			return -EIO;
2219 		}
2220 
2221 		if (em->start > start || em->start + em->len < start) {
2222 			free_extent_map(em);
2223 			em = NULL;
2224 		}
2225 		read_unlock(&em_tree->lock);
2226 
2227 		if (!em) {
2228 			kfree(failrec);
2229 			return -EIO;
2230 		}
2231 		logical = start - em->start;
2232 		logical = em->block_start + logical;
2233 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2234 			logical = em->block_start;
2235 			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2236 			extent_set_compress_type(&failrec->bio_flags,
2237 						 em->compress_type);
2238 		}
2239 		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2240 			 "len=%llu\n", logical, start, failrec->len);
2241 		failrec->logical = logical;
2242 		free_extent_map(em);
2243 
2244 		/* set the bits in the private failure tree */
2245 		ret = set_extent_bits(failure_tree, start, end,
2246 					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2247 		if (ret >= 0)
2248 			ret = set_state_private(failure_tree, start,
2249 						(u64)(unsigned long)failrec);
2250 		/* set the bits in the inode's tree */
2251 		if (ret >= 0)
2252 			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2253 						GFP_NOFS);
2254 		if (ret < 0) {
2255 			kfree(failrec);
2256 			return ret;
2257 		}
2258 	} else {
2259 		failrec = (struct io_failure_record *)(unsigned long)private;
2260 		pr_debug("bio_readpage_error: (found) logical=%llu, "
2261 			 "start=%llu, len=%llu, validation=%d\n",
2262 			 failrec->logical, failrec->start, failrec->len,
2263 			 failrec->in_validation);
2264 		/*
2265 		 * when there can be more than two copies of the data on disk, add
2266 		 * to failrec here (e.g. with a list for failed_mirror) to make
2267 		 * clean_io_failure() clean all those errors at once.
2268 		 */
2269 	}
2270 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2271 				      failrec->logical, failrec->len);
2272 	if (num_copies == 1) {
2273 		/*
2274 		 * we only have a single copy of the data, so don't bother with
2275 		 * all the retry and error correction code that follows. no
2276 		 * matter what the error is, it is very likely to persist.
2277 		 */
2278 		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2279 			 "state=%p, num_copies=%d, next_mirror %d, "
2280 			 "failed_mirror %d\n", state, num_copies,
2281 			 failrec->this_mirror, failed_mirror);
2282 		free_io_failure(inode, failrec, 0);
2283 		return -EIO;
2284 	}
2285 
2286 	if (!state) {
2287 		spin_lock(&tree->lock);
2288 		state = find_first_extent_bit_state(tree, failrec->start,
2289 						    EXTENT_LOCKED);
2290 		if (state && state->start != failrec->start)
2291 			state = NULL;
2292 		spin_unlock(&tree->lock);
2293 	}
2294 
2295 	/*
2296 	 * there are two goals here:
2297 	 *	a) deliver good data to the caller
2298 	 *	b) correct the bad sectors on disk
2299 	 */
2300 	if (failed_bio->bi_vcnt > 1) {
2301 		/*
2302 		 * to fulfill b), we need to know the exact failing sectors, as
2303 		 * we don't want to rewrite any more than the failed ones. thus,
2304 		 * we need separate read requests for the failed bio
2305 		 *
2306 		 * if the following BUG_ON triggers, our validation request got
2307 		 * merged. we need separate requests for our algorithm to work.
2308 		 */
2309 		BUG_ON(failrec->in_validation);
2310 		failrec->in_validation = 1;
2311 		failrec->this_mirror = failed_mirror;
2312 		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2313 	} else {
2314 		/*
2315 		 * we're ready to fulfill a) and b) together. get a good copy
2316 		 * of the failed sector and if we succeed, we have set up
2317 		 * everything for repair_io_failure to do the rest for us.
2318 		 */
2319 		if (failrec->in_validation) {
2320 			BUG_ON(failrec->this_mirror != failed_mirror);
2321 			failrec->in_validation = 0;
2322 			failrec->this_mirror = 0;
2323 		}
2324 		failrec->failed_mirror = failed_mirror;
2325 		failrec->this_mirror++;
2326 		if (failrec->this_mirror == failed_mirror)
2327 			failrec->this_mirror++;
2328 		read_mode = READ_SYNC;
2329 	}
2330 
2331 	if (!state || failrec->this_mirror > num_copies) {
2332 		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2333 			 "next_mirror %d, failed_mirror %d\n", state,
2334 			 num_copies, failrec->this_mirror, failed_mirror);
2335 		free_io_failure(inode, failrec, 0);
2336 		return -EIO;
2337 	}
2338 
2339 	bio = bio_alloc(GFP_NOFS, 1);
2340 	if (!bio) {
2341 		free_io_failure(inode, failrec, 0);
2342 		return -EIO;
2343 	}
2344 	bio->bi_private = state;
2345 	bio->bi_end_io = failed_bio->bi_end_io;
2346 	bio->bi_sector = failrec->logical >> 9;
2347 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2348 	bio->bi_size = 0;
2349 
2350 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2351 
2352 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2353 		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2354 		 failrec->this_mirror, num_copies, failrec->in_validation);
2355 
2356 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2357 					 failrec->this_mirror,
2358 					 failrec->bio_flags, 0);
2359 	return ret;
2360 }
2361 
2362 /* lots and lots of room for performance fixes in the end_bio funcs */
2363 
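/*
 * run the writepage_end_io_hook for a finished range and flag the page as
 * errored if either the IO or the hook reports a failure
 */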
2364 int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2365 {
2366 	int uptodate = (err == 0);
2367 	struct extent_io_tree *tree;
2368 	int ret;
2369 
2370 	tree = &BTRFS_I(page->mapping->host)->io_tree;
2371 
2372 	if (tree->ops && tree->ops->writepage_end_io_hook) {
2373 		ret = tree->ops->writepage_end_io_hook(page, start,
2374 					       end, NULL, uptodate);
2375 		if (ret)
2376 			uptodate = 0;
2377 	}
2378 
2379 	if (!uptodate) {
2380 		ClearPageUptodate(page);
2381 		SetPageError(page);
2382 	}
2383 	return 0;
2384 }
2385 
2386 /*
2387  * after a writepage IO is done, we need to:
2388  * clear the uptodate bits on error
2389  * clear the writeback bits in the extent tree for this IO
2390  * end_page_writeback if the page has no more pending IO
2391  *
2392  * Scheduling is not allowed, so the extent state tree is expected
2393  * to have one and only one object corresponding to this IO.
2394  */
2395 static void end_bio_extent_writepage(struct bio *bio, int err)
2396 {
2397 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2398 	struct extent_io_tree *tree;
2399 	u64 start;
2400 	u64 end;
2401 	int whole_page;
2402 
2403 	do {
2404 		struct page *page = bvec->bv_page;
2405 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2406 
2407 		start = page_offset(page) + bvec->bv_offset;
2408 		end = start + bvec->bv_len - 1;
2409 
2410 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2411 			whole_page = 1;
2412 		else
2413 			whole_page = 0;
2414 
2415 		if (--bvec >= bio->bi_io_vec)
2416 			prefetchw(&bvec->bv_page->flags);
2417 
2418 		if (end_extent_writepage(page, err, start, end))
2419 			continue;
2420 
2421 		if (whole_page)
2422 			end_page_writeback(page);
2423 		else
2424 			check_page_writeback(tree, page);
2425 	} while (bvec >= bio->bi_io_vec);
2426 
2427 	bio_put(bio);
2428 }
2429 
2430 /*
2431  * after a readpage IO is done, we need to:
2432  * clear the uptodate bits on error
2433  * set the uptodate bits if things worked
2434  * set the page up to date if all extents in the tree are uptodate
2435  * clear the lock bit in the extent tree
2436  * unlock the page if there are no other extents locked for it
2437  *
2438  * Scheduling is not allowed, so the extent state tree is expected
2439  * to have one and only one object corresponding to this IO.
2440  */
2441 static void end_bio_extent_readpage(struct bio *bio, int err)
2442 {
2443 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2444 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2445 	struct bio_vec *bvec = bio->bi_io_vec;
2446 	struct extent_io_tree *tree;
2447 	u64 start;
2448 	u64 end;
2449 	int whole_page;
2450 	int mirror;
2451 	int ret;
2452 
2453 	if (err)
2454 		uptodate = 0;
2455 
2456 	do {
2457 		struct page *page = bvec->bv_page;
2458 		struct extent_state *cached = NULL;
2459 		struct extent_state *state;
2460 
2461 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2462 			 "mirror=%ld\n", (u64)bio->bi_sector, err,
2463 			 (long int)bio->bi_bdev);
2464 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2465 
2466 		start = page_offset(page) + bvec->bv_offset;
2467 		end = start + bvec->bv_len - 1;
2468 
2469 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2470 			whole_page = 1;
2471 		else
2472 			whole_page = 0;
2473 
2474 		if (++bvec <= bvec_end)
2475 			prefetchw(&bvec->bv_page->flags);
2476 
2477 		spin_lock(&tree->lock);
2478 		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2479 		if (state && state->start == start) {
2480 			/*
2481 			 * take a reference on the state, unlock will drop
2482 			 * the ref
2483 			 */
2484 			cache_state(state, &cached);
2485 		}
2486 		spin_unlock(&tree->lock);
2487 
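		/*
		 * bi_bdev is repurposed on the read completion path to carry
		 * the mirror number; recover it here
		 */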
2488 		mirror = (int)(unsigned long)bio->bi_bdev;
2489 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2490 			ret = tree->ops->readpage_end_io_hook(page, start, end,
2491 							      state, mirror);
2492 			if (ret)
2493 				uptodate = 0;
2494 			else
2495 				clean_io_failure(start, page);
2496 		}
2497 
2498 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2499 			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2500 			if (!ret && !err &&
2501 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2502 				uptodate = 1;
2503 		} else if (!uptodate) {
2504 			/*
2505 			 * The generic bio_readpage_error handles errors the
2506 			 * following way: If possible, new read requests are
2507 			 * created and submitted and will end up in
2508 			 * end_bio_extent_readpage as well (if we're lucky, not
2509 			 * in the !uptodate case). In that case it returns 0 and
2510 			 * we just go on with the next page in our bio. If it
2511 			 * can't handle the error it will return -EIO and we
2512 			 * remain responsible for that page.
2513 			 */
2514 			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2515 			if (ret == 0) {
2516 				uptodate =
2517 					test_bit(BIO_UPTODATE, &bio->bi_flags);
2518 				if (err)
2519 					uptodate = 0;
2520 				uncache_state(&cached);
2521 				continue;
2522 			}
2523 		}
2524 
2525 		if (uptodate && tree->track_uptodate) {
2526 			set_extent_uptodate(tree, start, end, &cached,
2527 					    GFP_ATOMIC);
2528 		}
2529 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2530 
2531 		if (whole_page) {
2532 			if (uptodate) {
2533 				SetPageUptodate(page);
2534 			} else {
2535 				ClearPageUptodate(page);
2536 				SetPageError(page);
2537 			}
2538 			unlock_page(page);
2539 		} else {
2540 			if (uptodate) {
2541 				check_page_uptodate(tree, page);
2542 			} else {
2543 				ClearPageUptodate(page);
2544 				SetPageError(page);
2545 			}
2546 			check_page_locked(tree, page);
2547 		}
2548 	} while (bvec <= bvec_end);
2549 
2550 	bio_put(bio);
2551 }
2552 
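/*
 * bio allocation helper.  If the first attempt fails while we're in a
 * memalloc context, keep retrying with half as many vecs until it succeeds
 * or we run out of vecs to ask for.
 */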
2553 struct bio *
2554 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2555 		gfp_t gfp_flags)
2556 {
2557 	struct bio *bio;
2558 
2559 	bio = bio_alloc(gfp_flags, nr_vecs);
2560 
2561 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2562 		while (!bio && (nr_vecs /= 2))
2563 			bio = bio_alloc(gfp_flags, nr_vecs);
2564 	}
2565 
2566 	if (bio) {
2567 		bio->bi_size = 0;
2568 		bio->bi_bdev = bdev;
2569 		bio->bi_sector = first_sector;
2570 	}
2571 	return bio;
2572 }
2573 
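/*
 * hand a filled bio to the fs submit_bio_hook (or straight to the block
 * layer if there is none) and drop our reference to it
 */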
2574 static int __must_check submit_one_bio(int rw, struct bio *bio,
2575 				       int mirror_num, unsigned long bio_flags)
2576 {
2577 	int ret = 0;
2578 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2579 	struct page *page = bvec->bv_page;
2580 	struct extent_io_tree *tree = bio->bi_private;
2581 	u64 start;
2582 
2583 	start = page_offset(page) + bvec->bv_offset;
2584 
2585 	bio->bi_private = NULL;
2586 
2587 	bio_get(bio);
2588 
2589 	if (tree->ops && tree->ops->submit_bio_hook)
2590 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2591 					   mirror_num, bio_flags, start);
2592 	else
2593 		btrfsic_submit_bio(rw, bio);
2594 
2595 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2596 		ret = -EOPNOTSUPP;
2597 	bio_put(bio);
2598 	return ret;
2599 }
2600 
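/*
 * ask the fs whether this page may be merged into the current bio; a
 * non-zero return tells the caller to submit the bio and start a new one
 */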
2601 static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page,
2602 		     unsigned long offset, size_t size, struct bio *bio,
2603 		     unsigned long bio_flags)
2604 {
2605 	int ret = 0;
2606 	if (tree->ops && tree->ops->merge_bio_hook)
2607 		ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio,
2608 						bio_flags);
2609 	BUG_ON(ret < 0);
2610 	return ret;
2611 
2612 }
2613 
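/*
 * try to add the page to the bio in *bio_ret.  If the page isn't contiguous,
 * the flags differ or the bio is full, the old bio is submitted and a new
 * one is started.
 */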
2614 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2615 			      struct page *page, sector_t sector,
2616 			      size_t size, unsigned long offset,
2617 			      struct block_device *bdev,
2618 			      struct bio **bio_ret,
2619 			      unsigned long max_pages,
2620 			      bio_end_io_t end_io_func,
2621 			      int mirror_num,
2622 			      unsigned long prev_bio_flags,
2623 			      unsigned long bio_flags)
2624 {
2625 	int ret = 0;
2626 	struct bio *bio;
2627 	int nr;
2628 	int contig = 0;
2629 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2630 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2631 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2632 
2633 	if (bio_ret && *bio_ret) {
2634 		bio = *bio_ret;
2635 		if (old_compressed)
2636 			contig = bio->bi_sector == sector;
2637 		else
2638 			contig = bio_end_sector(bio) == sector;
2639 
2640 		if (prev_bio_flags != bio_flags || !contig ||
2641 		    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
2642 		    bio_add_page(bio, page, page_size, offset) < page_size) {
2643 			ret = submit_one_bio(rw, bio, mirror_num,
2644 					     prev_bio_flags);
2645 			if (ret < 0)
2646 				return ret;
2647 			bio = NULL;
2648 		} else {
2649 			return 0;
2650 		}
2651 	}
2652 	if (this_compressed)
2653 		nr = BIO_MAX_PAGES;
2654 	else
2655 		nr = bio_get_nr_vecs(bdev);
2656 
2657 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2658 	if (!bio)
2659 		return -ENOMEM;
2660 
2661 	bio_add_page(bio, page, page_size, offset);
2662 	bio->bi_end_io = end_io_func;
2663 	bio->bi_private = tree;
2664 
2665 	if (bio_ret)
2666 		*bio_ret = bio;
2667 	else
2668 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2669 
2670 	return ret;
2671 }
2672 
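/*
 * point page->private at the extent buffer, taking an extra page reference
 * the first time around
 */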
2673 static void attach_extent_buffer_page(struct extent_buffer *eb,
2674 				      struct page *page)
2675 {
2676 	if (!PagePrivate(page)) {
2677 		SetPagePrivate(page);
2678 		page_cache_get(page);
2679 		set_page_private(page, (unsigned long)eb);
2680 	} else {
2681 		WARN_ON(page->private != (unsigned long)eb);
2682 	}
2683 }
2684 
2685 void set_page_extent_mapped(struct page *page)
2686 {
2687 	if (!PagePrivate(page)) {
2688 		SetPagePrivate(page);
2689 		page_cache_get(page);
2690 		set_page_private(page, EXTENT_PAGE_PRIVATE);
2691 	}
2692 }
2693 
2694 /*
2695  * basic readpage implementation.  Locked extent state structs are inserted
2696  * into the tree that are removed when the IO is done (by the end_io
2697  * handlers)
2698  * XXX JDM: This needs looking at to ensure proper page locking
2699  */
2700 static int __extent_read_full_page(struct extent_io_tree *tree,
2701 				   struct page *page,
2702 				   get_extent_t *get_extent,
2703 				   struct bio **bio, int mirror_num,
2704 				   unsigned long *bio_flags, int rw)
2705 {
2706 	struct inode *inode = page->mapping->host;
2707 	u64 start = page_offset(page);
2708 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2709 	u64 end;
2710 	u64 cur = start;
2711 	u64 extent_offset;
2712 	u64 last_byte = i_size_read(inode);
2713 	u64 block_start;
2714 	u64 cur_end;
2715 	sector_t sector;
2716 	struct extent_map *em;
2717 	struct block_device *bdev;
2718 	struct btrfs_ordered_extent *ordered;
2719 	int ret;
2720 	int nr = 0;
2721 	size_t pg_offset = 0;
2722 	size_t iosize;
2723 	size_t disk_io_size;
2724 	size_t blocksize = inode->i_sb->s_blocksize;
2725 	unsigned long this_bio_flag = 0;
2726 
2727 	set_page_extent_mapped(page);
2728 
2729 	if (!PageUptodate(page)) {
2730 		if (cleancache_get_page(page) == 0) {
2731 			BUG_ON(blocksize != PAGE_SIZE);
2732 			goto out;
2733 		}
2734 	}
2735 
2736 	end = page_end;
2737 	while (1) {
2738 		lock_extent(tree, start, end);
2739 		ordered = btrfs_lookup_ordered_extent(inode, start);
2740 		if (!ordered)
2741 			break;
2742 		unlock_extent(tree, start, end);
2743 		btrfs_start_ordered_extent(inode, ordered, 1);
2744 		btrfs_put_ordered_extent(ordered);
2745 	}
2746 
2747 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2748 		char *userpage;
2749 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2750 
2751 		if (zero_offset) {
2752 			iosize = PAGE_CACHE_SIZE - zero_offset;
2753 			userpage = kmap_atomic(page);
2754 			memset(userpage + zero_offset, 0, iosize);
2755 			flush_dcache_page(page);
2756 			kunmap_atomic(userpage);
2757 		}
2758 	}
2759 	while (cur <= end) {
2760 		unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2761 
2762 		if (cur >= last_byte) {
2763 			char *userpage;
2764 			struct extent_state *cached = NULL;
2765 
2766 			iosize = PAGE_CACHE_SIZE - pg_offset;
2767 			userpage = kmap_atomic(page);
2768 			memset(userpage + pg_offset, 0, iosize);
2769 			flush_dcache_page(page);
2770 			kunmap_atomic(userpage);
2771 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2772 					    &cached, GFP_NOFS);
2773 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2774 					     &cached, GFP_NOFS);
2775 			break;
2776 		}
2777 		em = get_extent(inode, page, pg_offset, cur,
2778 				end - cur + 1, 0);
2779 		if (IS_ERR_OR_NULL(em)) {
2780 			SetPageError(page);
2781 			unlock_extent(tree, cur, end);
2782 			break;
2783 		}
2784 		extent_offset = cur - em->start;
2785 		BUG_ON(extent_map_end(em) <= cur);
2786 		BUG_ON(end < cur);
2787 
2788 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2789 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2790 			extent_set_compress_type(&this_bio_flag,
2791 						 em->compress_type);
2792 		}
2793 
2794 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2795 		cur_end = min(extent_map_end(em) - 1, end);
2796 		iosize = ALIGN(iosize, blocksize);
2797 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2798 			disk_io_size = em->block_len;
2799 			sector = em->block_start >> 9;
2800 		} else {
2801 			sector = (em->block_start + extent_offset) >> 9;
2802 			disk_io_size = iosize;
2803 		}
2804 		bdev = em->bdev;
2805 		block_start = em->block_start;
2806 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2807 			block_start = EXTENT_MAP_HOLE;
2808 		free_extent_map(em);
2809 		em = NULL;
2810 
2811 		/* we've found a hole, just zero and go on */
2812 		if (block_start == EXTENT_MAP_HOLE) {
2813 			char *userpage;
2814 			struct extent_state *cached = NULL;
2815 
2816 			userpage = kmap_atomic(page);
2817 			memset(userpage + pg_offset, 0, iosize);
2818 			flush_dcache_page(page);
2819 			kunmap_atomic(userpage);
2820 
2821 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2822 					    &cached, GFP_NOFS);
2823 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2824 			                     &cached, GFP_NOFS);
2825 			cur = cur + iosize;
2826 			pg_offset += iosize;
2827 			continue;
2828 		}
2829 		/* the get_extent function already copied into the page */
2830 		if (test_range_bit(tree, cur, cur_end,
2831 				   EXTENT_UPTODATE, 1, NULL)) {
2832 			check_page_uptodate(tree, page);
2833 			unlock_extent(tree, cur, cur + iosize - 1);
2834 			cur = cur + iosize;
2835 			pg_offset += iosize;
2836 			continue;
2837 		}
2838 		/* we have an inline extent but it didn't get marked up
2839 		 * to date.  Error out
2840 		 */
2841 		if (block_start == EXTENT_MAP_INLINE) {
2842 			SetPageError(page);
2843 			unlock_extent(tree, cur, cur + iosize - 1);
2844 			cur = cur + iosize;
2845 			pg_offset += iosize;
2846 			continue;
2847 		}
2848 
2849 		pnr -= page->index;
2850 		ret = submit_extent_page(rw, tree, page,
2851 					 sector, disk_io_size, pg_offset,
2852 					 bdev, bio, pnr,
2853 					 end_bio_extent_readpage, mirror_num,
2854 					 *bio_flags,
2855 					 this_bio_flag);
2856 		if (!ret) {
2857 			nr++;
2858 			*bio_flags = this_bio_flag;
2859 		} else {
2860 			SetPageError(page);
2861 			unlock_extent(tree, cur, cur + iosize - 1);
2862 		}
2863 		cur = cur + iosize;
2864 		pg_offset += iosize;
2865 	}
2866 out:
2867 	if (!nr) {
2868 		if (!PageError(page))
2869 			SetPageUptodate(page);
2870 		unlock_page(page);
2871 	}
2872 	return 0;
2873 }
2874 
2875 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2876 			    get_extent_t *get_extent, int mirror_num)
2877 {
2878 	struct bio *bio = NULL;
2879 	unsigned long bio_flags = 0;
2880 	int ret;
2881 
2882 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2883 				      &bio_flags, READ);
2884 	if (bio)
2885 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2886 	return ret;
2887 }
2888 
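/*
 * account nr_written pages against wbc->nr_to_write and, for range_cyclic or
 * whole-file writeback, advance the mapping's writeback_index past them
 */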
2889 static noinline void update_nr_written(struct page *page,
2890 				      struct writeback_control *wbc,
2891 				      unsigned long nr_written)
2892 {
2893 	wbc->nr_to_write -= nr_written;
2894 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2895 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2896 		page->mapping->writeback_index = page->index + nr_written;
2897 }
2898 
2899 /*
2900  * the writepage semantics are similar to regular writepage.  extent
2901  * records are inserted to lock ranges in the tree, and as dirty areas
2902  * are found, they are marked writeback.  Then the lock bits are removed
2903  * and the end_io handler clears the writeback ranges
2904  */
2905 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2906 			      void *data)
2907 {
2908 	struct inode *inode = page->mapping->host;
2909 	struct extent_page_data *epd = data;
2910 	struct extent_io_tree *tree = epd->tree;
2911 	u64 start = page_offset(page);
2912 	u64 delalloc_start;
2913 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2914 	u64 end;
2915 	u64 cur = start;
2916 	u64 extent_offset;
2917 	u64 last_byte = i_size_read(inode);
2918 	u64 block_start;
2919 	u64 iosize;
2920 	sector_t sector;
2921 	struct extent_state *cached_state = NULL;
2922 	struct extent_map *em;
2923 	struct block_device *bdev;
2924 	int ret;
2925 	int nr = 0;
2926 	size_t pg_offset = 0;
2927 	size_t blocksize;
2928 	loff_t i_size = i_size_read(inode);
2929 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2930 	u64 nr_delalloc;
2931 	u64 delalloc_end;
2932 	int page_started;
2933 	int compressed;
2934 	int write_flags;
2935 	unsigned long nr_written = 0;
2936 	bool fill_delalloc = true;
2937 
2938 	if (wbc->sync_mode == WB_SYNC_ALL)
2939 		write_flags = WRITE_SYNC;
2940 	else
2941 		write_flags = WRITE;
2942 
2943 	trace___extent_writepage(page, inode, wbc);
2944 
2945 	WARN_ON(!PageLocked(page));
2946 
2947 	ClearPageError(page);
2948 
2949 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2950 	if (page->index > end_index ||
2951 	   (page->index == end_index && !pg_offset)) {
2952 		page->mapping->a_ops->invalidatepage(page, 0);
2953 		unlock_page(page);
2954 		return 0;
2955 	}
2956 
2957 	if (page->index == end_index) {
2958 		char *userpage;
2959 
2960 		userpage = kmap_atomic(page);
2961 		memset(userpage + pg_offset, 0,
2962 		       PAGE_CACHE_SIZE - pg_offset);
2963 		kunmap_atomic(userpage);
2964 		flush_dcache_page(page);
2965 	}
2966 	pg_offset = 0;
2967 
2968 	set_page_extent_mapped(page);
2969 
2970 	if (!tree->ops || !tree->ops->fill_delalloc)
2971 		fill_delalloc = false;
2972 
2973 	delalloc_start = start;
2974 	delalloc_end = 0;
2975 	page_started = 0;
2976 	if (!epd->extent_locked && fill_delalloc) {
2977 		u64 delalloc_to_write = 0;
2978 		/*
2979 		 * make sure the wbc mapping index is at least updated
2980 		 * to this page.
2981 		 */
2982 		update_nr_written(page, wbc, 0);
2983 
2984 		while (delalloc_end < page_end) {
2985 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2986 						       page,
2987 						       &delalloc_start,
2988 						       &delalloc_end,
2989 						       128 * 1024 * 1024);
2990 			if (nr_delalloc == 0) {
2991 				delalloc_start = delalloc_end + 1;
2992 				continue;
2993 			}
2994 			ret = tree->ops->fill_delalloc(inode, page,
2995 						       delalloc_start,
2996 						       delalloc_end,
2997 						       &page_started,
2998 						       &nr_written);
2999 			/* File system has been set read-only */
3000 			if (ret) {
3001 				SetPageError(page);
3002 				goto done;
3003 			}
3004 			/*
3005 			 * delalloc_end is already one less than the total
3006 			 * length, so we don't subtract one from
3007 			 * PAGE_CACHE_SIZE
3008 			 */
3009 			delalloc_to_write += (delalloc_end - delalloc_start +
3010 					      PAGE_CACHE_SIZE) >>
3011 					      PAGE_CACHE_SHIFT;
3012 			delalloc_start = delalloc_end + 1;
3013 		}
3014 		if (wbc->nr_to_write < delalloc_to_write) {
3015 			int thresh = 8192;
3016 
3017 			if (delalloc_to_write < thresh * 2)
3018 				thresh = delalloc_to_write;
3019 			wbc->nr_to_write = min_t(u64, delalloc_to_write,
3020 						 thresh);
3021 		}
3022 
3023 		/* did the fill delalloc function already unlock and start
3024 		 * the IO?
3025 		 */
3026 		if (page_started) {
3027 			ret = 0;
3028 			/*
3029 			 * we've unlocked the page, so we can't update
3030 			 * the mapping's writeback index, just update
3031 			 * nr_to_write.
3032 			 */
3033 			wbc->nr_to_write -= nr_written;
3034 			goto done_unlocked;
3035 		}
3036 	}
3037 	if (tree->ops && tree->ops->writepage_start_hook) {
3038 		ret = tree->ops->writepage_start_hook(page, start,
3039 						      page_end);
3040 		if (ret) {
3041 			/* Fixup worker will requeue */
3042 			if (ret == -EBUSY)
3043 				wbc->pages_skipped++;
3044 			else
3045 				redirty_page_for_writepage(wbc, page);
3046 			update_nr_written(page, wbc, nr_written);
3047 			unlock_page(page);
3048 			ret = 0;
3049 			goto done_unlocked;
3050 		}
3051 	}
3052 
3053 	/*
3054 	 * we don't want to touch the inode after unlocking the page,
3055 	 * so we update the mapping writeback index now
3056 	 */
3057 	update_nr_written(page, wbc, nr_written + 1);
3058 
3059 	end = page_end;
3060 	if (last_byte <= start) {
3061 		if (tree->ops && tree->ops->writepage_end_io_hook)
3062 			tree->ops->writepage_end_io_hook(page, start,
3063 							 page_end, NULL, 1);
3064 		goto done;
3065 	}
3066 
3067 	blocksize = inode->i_sb->s_blocksize;
3068 
3069 	while (cur <= end) {
3070 		if (cur >= last_byte) {
3071 			if (tree->ops && tree->ops->writepage_end_io_hook)
3072 				tree->ops->writepage_end_io_hook(page, cur,
3073 							 page_end, NULL, 1);
3074 			break;
3075 		}
3076 		em = epd->get_extent(inode, page, pg_offset, cur,
3077 				     end - cur + 1, 1);
3078 		if (IS_ERR_OR_NULL(em)) {
3079 			SetPageError(page);
3080 			break;
3081 		}
3082 
3083 		extent_offset = cur - em->start;
3084 		BUG_ON(extent_map_end(em) <= cur);
3085 		BUG_ON(end < cur);
3086 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
3087 		iosize = ALIGN(iosize, blocksize);
3088 		sector = (em->block_start + extent_offset) >> 9;
3089 		bdev = em->bdev;
3090 		block_start = em->block_start;
3091 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
3092 		free_extent_map(em);
3093 		em = NULL;
3094 
3095 		/*
3096 		 * compressed and inline extents are written through other
3097 		 * paths in the FS
3098 		 */
3099 		if (compressed || block_start == EXTENT_MAP_HOLE ||
3100 		    block_start == EXTENT_MAP_INLINE) {
3101 			/*
3102 			 * end_io notification does not happen here for
3103 			 * compressed extents
3104 			 */
3105 			if (!compressed && tree->ops &&
3106 			    tree->ops->writepage_end_io_hook)
3107 				tree->ops->writepage_end_io_hook(page, cur,
3108 							 cur + iosize - 1,
3109 							 NULL, 1);
3110 			else if (compressed) {
3111 				/* we don't want to end_page_writeback on
3112 				 * a compressed extent.  this happens
3113 				 * elsewhere
3114 				 */
3115 				nr++;
3116 			}
3117 
3118 			cur += iosize;
3119 			pg_offset += iosize;
3120 			continue;
3121 		}
3122 		/* leave this out until we have a page_mkwrite call */
3123 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3124 				   EXTENT_DIRTY, 0, NULL)) {
3125 			cur = cur + iosize;
3126 			pg_offset += iosize;
3127 			continue;
3128 		}
3129 
3130 		if (tree->ops && tree->ops->writepage_io_hook) {
3131 			ret = tree->ops->writepage_io_hook(page, cur,
3132 						cur + iosize - 1);
3133 		} else {
3134 			ret = 0;
3135 		}
3136 		if (ret) {
3137 			SetPageError(page);
3138 		} else {
3139 			unsigned long max_nr = end_index + 1;
3140 
3141 			set_range_writeback(tree, cur, cur + iosize - 1);
3142 			if (!PageWriteback(page)) {
3143 				printk(KERN_ERR "btrfs warning page %lu not "
3144 				       "writeback, cur %llu end %llu\n",
3145 				       page->index, (unsigned long long)cur,
3146 				       (unsigned long long)end);
3147 			}
3148 
3149 			ret = submit_extent_page(write_flags, tree, page,
3150 						 sector, iosize, pg_offset,
3151 						 bdev, &epd->bio, max_nr,
3152 						 end_bio_extent_writepage,
3153 						 0, 0, 0);
3154 			if (ret)
3155 				SetPageError(page);
3156 		}
3157 		cur = cur + iosize;
3158 		pg_offset += iosize;
3159 		nr++;
3160 	}
3161 done:
3162 	if (nr == 0) {
3163 		/* make sure the mapping tag for page dirty gets cleared */
3164 		set_page_writeback(page);
3165 		end_page_writeback(page);
3166 	}
3167 	unlock_page(page);
3168 
3169 done_unlocked:
3170 
3171 	/* drop our reference on any cached states */
3172 	free_extent_state(cached_state);
3173 	return 0;
3174 }
3175 
3176 static int eb_wait(void *word)
3177 {
3178 	io_schedule();
3179 	return 0;
3180 }
3181 
3182 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3183 {
3184 	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3185 		    TASK_UNINTERRUPTIBLE);
3186 }
3187 
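/*
 * lock an extent buffer for writeout, flushing any pending write bio before
 * blocking.  If the buffer was dirty it is moved to writeback, its pages are
 * locked and 1 is returned so the caller writes it out; otherwise 0.
 */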
3188 static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3189 				     struct btrfs_fs_info *fs_info,
3190 				     struct extent_page_data *epd)
3191 {
3192 	unsigned long i, num_pages;
3193 	int flush = 0;
3194 	int ret = 0;
3195 
3196 	if (!btrfs_try_tree_write_lock(eb)) {
3197 		flush = 1;
3198 		flush_write_bio(epd);
3199 		btrfs_tree_lock(eb);
3200 	}
3201 
3202 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3203 		btrfs_tree_unlock(eb);
3204 		if (!epd->sync_io)
3205 			return 0;
3206 		if (!flush) {
3207 			flush_write_bio(epd);
3208 			flush = 1;
3209 		}
3210 		while (1) {
3211 			wait_on_extent_buffer_writeback(eb);
3212 			btrfs_tree_lock(eb);
3213 			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3214 				break;
3215 			btrfs_tree_unlock(eb);
3216 		}
3217 	}
3218 
3219 	/*
3220 	 * We need to do this to prevent races with anyone checking whether the
3221 	 * eb is under IO, since we can end up having no IO bits set for a
3222 	 * short period of time.
3223 	 */
3224 	spin_lock(&eb->refs_lock);
3225 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3226 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3227 		spin_unlock(&eb->refs_lock);
3228 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3229 		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
3230 				     -eb->len,
3231 				     fs_info->dirty_metadata_batch);
3232 		ret = 1;
3233 	} else {
3234 		spin_unlock(&eb->refs_lock);
3235 	}
3236 
3237 	btrfs_tree_unlock(eb);
3238 
3239 	if (!ret)
3240 		return ret;
3241 
3242 	num_pages = num_extent_pages(eb->start, eb->len);
3243 	for (i = 0; i < num_pages; i++) {
3244 		struct page *p = extent_buffer_page(eb, i);
3245 
3246 		if (!trylock_page(p)) {
3247 			if (!flush) {
3248 				flush_write_bio(epd);
3249 				flush = 1;
3250 			}
3251 			lock_page(p);
3252 		}
3253 	}
3254 
3255 	return ret;
3256 }
3257 
3258 static void end_extent_buffer_writeback(struct extent_buffer *eb)
3259 {
3260 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3261 	smp_mb__after_clear_bit();
3262 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3263 }
3264 
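/*
 * end_io handler for extent buffer writeback: flag IO errors on the buffer,
 * end writeback on each page and, once the last page of a buffer completes,
 * clear the buffer's writeback bit and wake any waiters
 */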
3265 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3266 {
3267 	int uptodate = err == 0;
3268 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3269 	struct extent_buffer *eb;
3270 	int done;
3271 
3272 	do {
3273 		struct page *page = bvec->bv_page;
3274 
3275 		bvec--;
3276 		eb = (struct extent_buffer *)page->private;
3277 		BUG_ON(!eb);
3278 		done = atomic_dec_and_test(&eb->io_pages);
3279 
3280 		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3281 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3282 			ClearPageUptodate(page);
3283 			SetPageError(page);
3284 		}
3285 
3286 		end_page_writeback(page);
3287 
3288 		if (!done)
3289 			continue;
3290 
3291 		end_extent_buffer_writeback(eb);
3292 	} while (bvec >= bio->bi_io_vec);
3293 
3294 	bio_put(bio);
3295 
3296 }
3297 
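/*
 * submit all pages of a single extent buffer for writeback.  On a submission
 * failure the remaining io_pages count is dropped, the buffer is flagged
 * with an IO error and the untouched pages are unlocked.
 */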
3298 static int write_one_eb(struct extent_buffer *eb,
3299 			struct btrfs_fs_info *fs_info,
3300 			struct writeback_control *wbc,
3301 			struct extent_page_data *epd)
3302 {
3303 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3304 	u64 offset = eb->start;
3305 	unsigned long i, num_pages;
3306 	unsigned long bio_flags = 0;
3307 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META;
3308 	int ret = 0;
3309 
3310 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3311 	num_pages = num_extent_pages(eb->start, eb->len);
3312 	atomic_set(&eb->io_pages, num_pages);
3313 	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3314 		bio_flags = EXTENT_BIO_TREE_LOG;
3315 
3316 	for (i = 0; i < num_pages; i++) {
3317 		struct page *p = extent_buffer_page(eb, i);
3318 
3319 		clear_page_dirty_for_io(p);
3320 		set_page_writeback(p);
3321 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3322 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3323 					 -1, end_bio_extent_buffer_writepage,
3324 					 0, epd->bio_flags, bio_flags);
3325 		epd->bio_flags = bio_flags;
3326 		if (ret) {
3327 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3328 			SetPageError(p);
3329 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3330 				end_extent_buffer_writeback(eb);
3331 			ret = -EIO;
3332 			break;
3333 		}
3334 		offset += PAGE_CACHE_SIZE;
3335 		update_nr_written(p, wbc, 1);
3336 		unlock_page(p);
3337 	}
3338 
3339 	if (unlikely(ret)) {
3340 		for (; i < num_pages; i++) {
3341 			struct page *p = extent_buffer_page(eb, i);
3342 			unlock_page(p);
3343 		}
3344 	}
3345 
3346 	return ret;
3347 }
3348 
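/*
 * walk the dirty pages of the btree inode and write out the extent buffers
 * backing them, one buffer at a time
 */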
3349 int btree_write_cache_pages(struct address_space *mapping,
3350 				   struct writeback_control *wbc)
3351 {
3352 	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3353 	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3354 	struct extent_buffer *eb, *prev_eb = NULL;
3355 	struct extent_page_data epd = {
3356 		.bio = NULL,
3357 		.tree = tree,
3358 		.extent_locked = 0,
3359 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3360 		.bio_flags = 0,
3361 	};
3362 	int ret = 0;
3363 	int done = 0;
3364 	int nr_to_write_done = 0;
3365 	struct pagevec pvec;
3366 	int nr_pages;
3367 	pgoff_t index;
3368 	pgoff_t end;		/* Inclusive */
3369 	int scanned = 0;
3370 	int tag;
3371 
3372 	pagevec_init(&pvec, 0);
3373 	if (wbc->range_cyclic) {
3374 		index = mapping->writeback_index; /* Start from prev offset */
3375 		end = -1;
3376 	} else {
3377 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3378 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3379 		scanned = 1;
3380 	}
3381 	if (wbc->sync_mode == WB_SYNC_ALL)
3382 		tag = PAGECACHE_TAG_TOWRITE;
3383 	else
3384 		tag = PAGECACHE_TAG_DIRTY;
3385 retry:
3386 	if (wbc->sync_mode == WB_SYNC_ALL)
3387 		tag_pages_for_writeback(mapping, index, end);
3388 	while (!done && !nr_to_write_done && (index <= end) &&
3389 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3390 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3391 		unsigned i;
3392 
3393 		scanned = 1;
3394 		for (i = 0; i < nr_pages; i++) {
3395 			struct page *page = pvec.pages[i];
3396 
3397 			if (!PagePrivate(page))
3398 				continue;
3399 
3400 			if (!wbc->range_cyclic && page->index > end) {
3401 				done = 1;
3402 				break;
3403 			}
3404 
3405 			spin_lock(&mapping->private_lock);
3406 			if (!PagePrivate(page)) {
3407 				spin_unlock(&mapping->private_lock);
3408 				continue;
3409 			}
3410 
3411 			eb = (struct extent_buffer *)page->private;
3412 
3413 			/*
3414 			 * Shouldn't happen and normally this would be a BUG_ON
3415 			 * but no sense in crashing the user's box for something
3416 			 * we can survive anyway.
3417 			 */
3418 			if (!eb) {
3419 				spin_unlock(&mapping->private_lock);
3420 				WARN_ON(1);
3421 				continue;
3422 			}
3423 
3424 			if (eb == prev_eb) {
3425 				spin_unlock(&mapping->private_lock);
3426 				continue;
3427 			}
3428 
3429 			ret = atomic_inc_not_zero(&eb->refs);
3430 			spin_unlock(&mapping->private_lock);
3431 			if (!ret)
3432 				continue;
3433 
3434 			prev_eb = eb;
3435 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3436 			if (!ret) {
3437 				free_extent_buffer(eb);
3438 				continue;
3439 			}
3440 
3441 			ret = write_one_eb(eb, fs_info, wbc, &epd);
3442 			if (ret) {
3443 				done = 1;
3444 				free_extent_buffer(eb);
3445 				break;
3446 			}
3447 			free_extent_buffer(eb);
3448 
3449 			/*
3450 			 * the filesystem may choose to bump up nr_to_write.
3451 			 * We have to make sure to honor the new nr_to_write
3452 			 * at any time
3453 			 */
3454 			nr_to_write_done = wbc->nr_to_write <= 0;
3455 		}
3456 		pagevec_release(&pvec);
3457 		cond_resched();
3458 	}
3459 	if (!scanned && !done) {
3460 		/*
3461 		 * We hit the last page and there is more work to be done: wrap
3462 		 * back to the start of the file
3463 		 */
3464 		scanned = 1;
3465 		index = 0;
3466 		goto retry;
3467 	}
3468 	flush_write_bio(&epd);
3469 	return ret;
3470 }
3471 
3472 /**
3473  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3474  * @mapping: address space structure to write
3475  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3476  * @writepage: function called for each page
3477  * @data: data passed to writepage function
3478  *
3479  * If a page is already under I/O, write_cache_pages() skips it, even
3480  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3481  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3482  * and msync() need to guarantee that all the data which was dirty at the time
3483  * the call was made get new I/O started against them.  If wbc->sync_mode is
3484  * WB_SYNC_ALL then we were called for data integrity and we must wait for
3485  * existing IO to complete.
3486  */
3487 static int extent_write_cache_pages(struct extent_io_tree *tree,
3488 			     struct address_space *mapping,
3489 			     struct writeback_control *wbc,
3490 			     writepage_t writepage, void *data,
3491 			     void (*flush_fn)(void *))
3492 {
3493 	struct inode *inode = mapping->host;
3494 	int ret = 0;
3495 	int done = 0;
3496 	int nr_to_write_done = 0;
3497 	struct pagevec pvec;
3498 	int nr_pages;
3499 	pgoff_t index;
3500 	pgoff_t end;		/* Inclusive */
3501 	int scanned = 0;
3502 	int tag;
3503 
3504 	/*
3505 	 * We have to hold onto the inode so that ordered extents can do their
3506 	 * work when the IO finishes.  The alternative to this is failing to add
3507 	 * an ordered extent if the igrab() fails there and that is a huge pain
3508 	 * to deal with, so instead just hold onto the inode throughout the
3509 	 * writepages operation.  If it fails here we are freeing up the inode
3510 	 * anyway and we'd rather not waste our time writing out stuff that is
3511 	 * going to be truncated anyway.
3512 	 */
3513 	if (!igrab(inode))
3514 		return 0;
3515 
3516 	pagevec_init(&pvec, 0);
3517 	if (wbc->range_cyclic) {
3518 		index = mapping->writeback_index; /* Start from prev offset */
3519 		end = -1;
3520 	} else {
3521 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3522 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3523 		scanned = 1;
3524 	}
3525 	if (wbc->sync_mode == WB_SYNC_ALL)
3526 		tag = PAGECACHE_TAG_TOWRITE;
3527 	else
3528 		tag = PAGECACHE_TAG_DIRTY;
3529 retry:
3530 	if (wbc->sync_mode == WB_SYNC_ALL)
3531 		tag_pages_for_writeback(mapping, index, end);
3532 	while (!done && !nr_to_write_done && (index <= end) &&
3533 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3534 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3535 		unsigned i;
3536 
3537 		scanned = 1;
3538 		for (i = 0; i < nr_pages; i++) {
3539 			struct page *page = pvec.pages[i];
3540 
3541 			/*
3542 			 * At this point we hold neither mapping->tree_lock nor
3543 			 * lock on the page itself: the page may be truncated or
3544 			 * invalidated (changing page->mapping to NULL), or even
3545 			 * swizzled back from swapper_space to tmpfs file
3546 			 * mapping
3547 			 */
3548 			if (!trylock_page(page)) {
3549 				flush_fn(data);
3550 				lock_page(page);
3551 			}
3552 
3553 			if (unlikely(page->mapping != mapping)) {
3554 				unlock_page(page);
3555 				continue;
3556 			}
3557 
3558 			if (!wbc->range_cyclic && page->index > end) {
3559 				done = 1;
3560 				unlock_page(page);
3561 				continue;
3562 			}
3563 
3564 			if (wbc->sync_mode != WB_SYNC_NONE) {
3565 				if (PageWriteback(page))
3566 					flush_fn(data);
3567 				wait_on_page_writeback(page);
3568 			}
3569 
3570 			if (PageWriteback(page) ||
3571 			    !clear_page_dirty_for_io(page)) {
3572 				unlock_page(page);
3573 				continue;
3574 			}
3575 
3576 			ret = (*writepage)(page, wbc, data);
3577 
3578 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3579 				unlock_page(page);
3580 				ret = 0;
3581 			}
3582 			if (ret)
3583 				done = 1;
3584 
3585 			/*
3586 			 * the filesystem may choose to bump up nr_to_write.
3587 			 * We have to make sure to honor the new nr_to_write
3588 			 * at any time
3589 			 */
3590 			nr_to_write_done = wbc->nr_to_write <= 0;
3591 		}
3592 		pagevec_release(&pvec);
3593 		cond_resched();
3594 	}
3595 	if (!scanned && !done) {
3596 		/*
3597 		 * We hit the last page and there is more work to be done: wrap
3598 		 * back to the start of the file
3599 		 */
3600 		scanned = 1;
3601 		index = 0;
3602 		goto retry;
3603 	}
3604 	btrfs_add_delayed_iput(inode);
3605 	return ret;
3606 }
3607 
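/*
 * submit whatever bio has been built up in the extent_page_data, using
 * WRITE_SYNC when sync_io is set
 */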
3608 static void flush_epd_write_bio(struct extent_page_data *epd)
3609 {
3610 	if (epd->bio) {
3611 		int rw = WRITE;
3612 		int ret;
3613 
3614 		if (epd->sync_io)
3615 			rw = WRITE_SYNC;
3616 
3617 		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3618 		BUG_ON(ret < 0); /* -ENOMEM */
3619 		epd->bio = NULL;
3620 	}
3621 }
3622 
3623 static noinline void flush_write_bio(void *data)
3624 {
3625 	struct extent_page_data *epd = data;
3626 	flush_epd_write_bio(epd);
3627 }
3628 
3629 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3630 			  get_extent_t *get_extent,
3631 			  struct writeback_control *wbc)
3632 {
3633 	int ret;
3634 	struct extent_page_data epd = {
3635 		.bio = NULL,
3636 		.tree = tree,
3637 		.get_extent = get_extent,
3638 		.extent_locked = 0,
3639 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3640 		.bio_flags = 0,
3641 	};
3642 
3643 	ret = __extent_writepage(page, wbc, &epd);
3644 
3645 	flush_epd_write_bio(&epd);
3646 	return ret;
3647 }
3648 
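/*
 * write out a range of pages whose extent range the caller has already
 * locked.  Pages that are no longer dirty are reported complete via the
 * writepage_end_io_hook instead of being written.
 */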
3649 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3650 			      u64 start, u64 end, get_extent_t *get_extent,
3651 			      int mode)
3652 {
3653 	int ret = 0;
3654 	struct address_space *mapping = inode->i_mapping;
3655 	struct page *page;
3656 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3657 		PAGE_CACHE_SHIFT;
3658 
3659 	struct extent_page_data epd = {
3660 		.bio = NULL,
3661 		.tree = tree,
3662 		.get_extent = get_extent,
3663 		.extent_locked = 1,
3664 		.sync_io = mode == WB_SYNC_ALL,
3665 		.bio_flags = 0,
3666 	};
3667 	struct writeback_control wbc_writepages = {
3668 		.sync_mode	= mode,
3669 		.nr_to_write	= nr_pages * 2,
3670 		.range_start	= start,
3671 		.range_end	= end + 1,
3672 	};
3673 
3674 	while (start <= end) {
3675 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3676 		if (clear_page_dirty_for_io(page))
3677 			ret = __extent_writepage(page, &wbc_writepages, &epd);
3678 		else {
3679 			if (tree->ops && tree->ops->writepage_end_io_hook)
3680 				tree->ops->writepage_end_io_hook(page, start,
3681 						 start + PAGE_CACHE_SIZE - 1,
3682 						 NULL, 1);
3683 			unlock_page(page);
3684 		}
3685 		page_cache_release(page);
3686 		start += PAGE_CACHE_SIZE;
3687 	}
3688 
3689 	flush_epd_write_bio(&epd);
3690 	return ret;
3691 }
3692 
3693 int extent_writepages(struct extent_io_tree *tree,
3694 		      struct address_space *mapping,
3695 		      get_extent_t *get_extent,
3696 		      struct writeback_control *wbc)
3697 {
3698 	int ret = 0;
3699 	struct extent_page_data epd = {
3700 		.bio = NULL,
3701 		.tree = tree,
3702 		.get_extent = get_extent,
3703 		.extent_locked = 0,
3704 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3705 		.bio_flags = 0,
3706 	};
3707 
3708 	ret = extent_write_cache_pages(tree, mapping, wbc,
3709 				       __extent_writepage, &epd,
3710 				       flush_write_bio);
3711 	flush_epd_write_bio(&epd);
3712 	return ret;
3713 }
3714 
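/*
 * readpages implementation: add the pages to the page cache in small batches
 * and read each batch through __extent_read_full_page(), sharing one bio
 * across pages where possible
 */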
3715 int extent_readpages(struct extent_io_tree *tree,
3716 		     struct address_space *mapping,
3717 		     struct list_head *pages, unsigned nr_pages,
3718 		     get_extent_t get_extent)
3719 {
3720 	struct bio *bio = NULL;
3721 	unsigned page_idx;
3722 	unsigned long bio_flags = 0;
3723 	struct page *pagepool[16];
3724 	struct page *page;
3725 	int i = 0;
3726 	int nr = 0;
3727 
3728 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3729 		page = list_entry(pages->prev, struct page, lru);
3730 
3731 		prefetchw(&page->flags);
3732 		list_del(&page->lru);
3733 		if (add_to_page_cache_lru(page, mapping,
3734 					page->index, GFP_NOFS)) {
3735 			page_cache_release(page);
3736 			continue;
3737 		}
3738 
3739 		pagepool[nr++] = page;
3740 		if (nr < ARRAY_SIZE(pagepool))
3741 			continue;
3742 		for (i = 0; i < nr; i++) {
3743 			__extent_read_full_page(tree, pagepool[i], get_extent,
3744 					&bio, 0, &bio_flags, READ);
3745 			page_cache_release(pagepool[i]);
3746 		}
3747 		nr = 0;
3748 	}
3749 	for (i = 0; i < nr; i++) {
3750 		__extent_read_full_page(tree, pagepool[i], get_extent,
3751 					&bio, 0, &bio_flags, READ);
3752 		page_cache_release(pagepool[i]);
3753 	}
3754 
3755 	BUG_ON(!list_empty(pages));
3756 	if (bio)
3757 		return submit_one_bio(READ, bio, 0, bio_flags);
3758 	return 0;
3759 }
3760 
3761 /*
3762  * basic invalidatepage code, this waits on any locked or writeback
3763  * ranges corresponding to the page, and then deletes any extent state
3764  * records from the tree
3765  */
3766 int extent_invalidatepage(struct extent_io_tree *tree,
3767 			  struct page *page, unsigned long offset)
3768 {
3769 	struct extent_state *cached_state = NULL;
3770 	u64 start = page_offset(page);
3771 	u64 end = start + PAGE_CACHE_SIZE - 1;
3772 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3773 
3774 	start += ALIGN(offset, blocksize);
3775 	if (start > end)
3776 		return 0;
3777 
3778 	lock_extent_bits(tree, start, end, 0, &cached_state);
3779 	wait_on_page_writeback(page);
3780 	clear_extent_bit(tree, start, end,
3781 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3782 			 EXTENT_DO_ACCOUNTING,
3783 			 1, 1, &cached_state, GFP_NOFS);
3784 	return 0;
3785 }
3786 
3787 /*
3788  * a helper for releasepage, this tests for areas of the page that
3789  * are locked or under IO and drops the related state bits if it is safe
3790  * to drop the page.
3791  */
3792 static int try_release_extent_state(struct extent_map_tree *map,
3793 				    struct extent_io_tree *tree,
3794 				    struct page *page, gfp_t mask)
3795 {
3796 	u64 start = page_offset(page);
3797 	u64 end = start + PAGE_CACHE_SIZE - 1;
3798 	int ret = 1;
3799 
3800 	if (test_range_bit(tree, start, end,
3801 			   EXTENT_IOBITS, 0, NULL))
3802 		ret = 0;
3803 	else {
3804 		if ((mask & GFP_NOFS) == GFP_NOFS)
3805 			mask = GFP_NOFS;
3806 		/*
3807 		 * at this point we can safely clear everything except the
3808 		 * locked bit and the nodatasum bit
3809 		 */
3810 		ret = clear_extent_bit(tree, start, end,
3811 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3812 				 0, 0, NULL, mask);
3813 
3814 		/* if clear_extent_bit failed for enomem reasons,
3815 		 * we can't allow the release to continue.
3816 		 */
3817 		if (ret < 0)
3818 			ret = 0;
3819 		else
3820 			ret = 1;
3821 	}
3822 	return ret;
3823 }
3824 
3825 /*
3826  * a helper for releasepage.  As long as there are no locked extents
3827  * in the range corresponding to the page, both state records and extent
3828  * map records are removed
3829  */
3830 int try_release_extent_mapping(struct extent_map_tree *map,
3831 			       struct extent_io_tree *tree, struct page *page,
3832 			       gfp_t mask)
3833 {
3834 	struct extent_map *em;
3835 	u64 start = page_offset(page);
3836 	u64 end = start + PAGE_CACHE_SIZE - 1;
3837 
3838 	if ((mask & __GFP_WAIT) &&
3839 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3840 		u64 len;
3841 		while (start <= end) {
3842 			len = end - start + 1;
3843 			write_lock(&map->lock);
3844 			em = lookup_extent_mapping(map, start, len);
3845 			if (!em) {
3846 				write_unlock(&map->lock);
3847 				break;
3848 			}
3849 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3850 			    em->start != start) {
3851 				write_unlock(&map->lock);
3852 				free_extent_map(em);
3853 				break;
3854 			}
3855 			if (!test_range_bit(tree, em->start,
3856 					    extent_map_end(em) - 1,
3857 					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3858 					    0, NULL)) {
3859 				remove_extent_mapping(map, em);
3860 				/* once for the rb tree */
3861 				free_extent_map(em);
3862 			}
3863 			start = extent_map_end(em);
3864 			write_unlock(&map->lock);
3865 
3866 			/* once for us */
3867 			free_extent_map(em);
3868 		}
3869 	}
3870 	return try_release_extent_state(map, tree, page, mask);
3871 }
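
/*
 * Example (illustrative sketch, not the real btrfs callback): a
 * releasepage-style hook passes both the inode's extent map tree and its
 * io tree; the helper returns 1 when the page can be released and 0 when
 * pinned extent maps or state bits still reference it.  The hook name and
 * the btrfs_inode field names used here are assumptions of the sketch:
 *
 *	static int example_releasepage(struct page *page, gfp_t gfp_flags)
 *	{
 *		struct inode *inode = page->mapping->host;
 *		struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 *		struct extent_map_tree *map = &BTRFS_I(inode)->extent_tree;
 *
 *		return try_release_extent_mapping(map, tree, page, gfp_flags);
 *	}
 */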
3872 
3873 /*
3874  * helper function for fiemap, which doesn't want to see any holes.
3875  * This maps until we find something past 'last'
3876  */
3877 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3878 						u64 offset,
3879 						u64 last,
3880 						get_extent_t *get_extent)
3881 {
3882 	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3883 	struct extent_map *em;
3884 	u64 len;
3885 
3886 	if (offset >= last)
3887 		return NULL;
3888 
3889 	while (1) {
3890 		len = last - offset;
3891 		if (len == 0)
3892 			break;
3893 		len = ALIGN(len, sectorsize);
3894 		em = get_extent(inode, NULL, 0, offset, len, 0);
3895 		if (IS_ERR_OR_NULL(em))
3896 			return em;
3897 
3898 		/* if this isn't a hole return it */
3899 		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3900 		    em->block_start != EXTENT_MAP_HOLE) {
3901 			return em;
3902 		}
3903 
3904 		/* this is a hole, advance to the next extent */
3905 		offset = extent_map_end(em);
3906 		free_extent_map(em);
3907 		if (offset >= last)
3908 			break;
3909 	}
3910 	return NULL;
3911 }
3912 
3913 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3914 		__u64 start, __u64 len, get_extent_t *get_extent)
3915 {
3916 	int ret = 0;
3917 	u64 off = start;
3918 	u64 max = start + len;
3919 	u32 flags = 0;
3920 	u32 found_type;
3921 	u64 last;
3922 	u64 last_for_get_extent = 0;
3923 	u64 disko = 0;
3924 	u64 isize = i_size_read(inode);
3925 	struct btrfs_key found_key;
3926 	struct extent_map *em = NULL;
3927 	struct extent_state *cached_state = NULL;
3928 	struct btrfs_path *path;
3929 	struct btrfs_file_extent_item *item;
3930 	int end = 0;
3931 	u64 em_start = 0;
3932 	u64 em_len = 0;
3933 	u64 em_end = 0;
3934 	unsigned long emflags;
3935 
3936 	if (len == 0)
3937 		return -EINVAL;
3938 
3939 	path = btrfs_alloc_path();
3940 	if (!path)
3941 		return -ENOMEM;
3942 	path->leave_spinning = 1;
3943 
3944 	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3945 	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3946 
3947 	/*
3948 	 * look up the last file extent.  We're not using i_size here
3949 	 * because there might be preallocation past i_size
3950 	 */
3951 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3952 				       path, btrfs_ino(inode), -1, 0);
3953 	if (ret < 0) {
3954 		btrfs_free_path(path);
3955 		return ret;
3956 	}
3957 	WARN_ON(!ret);
3958 	path->slots[0]--;
3959 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3960 			      struct btrfs_file_extent_item);
3961 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3962 	found_type = btrfs_key_type(&found_key);
3963 
3964 	/* No extents, but there might be delalloc bits */
3965 	if (found_key.objectid != btrfs_ino(inode) ||
3966 	    found_type != BTRFS_EXTENT_DATA_KEY) {
3967 		/* have to trust i_size as the end */
3968 		last = (u64)-1;
3969 		last_for_get_extent = isize;
3970 	} else {
3971 		/*
3972 		 * remember the start of the last extent.  There are a
3973 		 * bunch of different factors that go into the length of the
3974 		 * extent, so it's much less complex to remember where it started
3975 		 */
3976 		last = found_key.offset;
3977 		last_for_get_extent = last + 1;
3978 	}
3979 	btrfs_free_path(path);
3980 
3981 	/*
3982 	 * we might have some extents allocated but more delalloc past those
3983 	 * extents.  so, we trust isize unless the start of the last extent is
3984 	 * beyond isize
3985 	 */
3986 	if (last < isize) {
3987 		last = (u64)-1;
3988 		last_for_get_extent = isize;
3989 	}
3990 
3991 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
3992 			 &cached_state);
3993 
3994 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
3995 				   get_extent);
3996 	if (!em)
3997 		goto out;
3998 	if (IS_ERR(em)) {
3999 		ret = PTR_ERR(em);
4000 		goto out;
4001 	}
4002 
4003 	while (!end) {
4004 		u64 offset_in_extent;
4005 
4006 		/* break if the extent we found is outside the range */
4007 		if (em->start >= max || extent_map_end(em) < off)
4008 			break;
4009 
4010 		/*
4011 		 * get_extent may return an extent that starts before our
4012 		 * requested range.  We have to make sure the ranges
4013 		 * we return to fiemap always move forward and don't
4014 		 * overlap, so adjust the offsets here
4015 		 */
4016 		em_start = max(em->start, off);
4017 
4018 		/*
4019 		 * record the offset from the start of the extent
4020 		 * for adjusting the disk offset below
4021 		 */
4022 		offset_in_extent = em_start - em->start;
4023 		em_end = extent_map_end(em);
4024 		em_len = em_end - em_start;
4025 		emflags = em->flags;
4026 		disko = 0;
4027 		flags = 0;
4028 
4029 		/*
4030 		 * bump off for our next call to get_extent
4031 		 */
4032 		off = extent_map_end(em);
4033 		if (off >= max)
4034 			end = 1;
4035 
4036 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
4037 			end = 1;
4038 			flags |= FIEMAP_EXTENT_LAST;
4039 		} else if (em->block_start == EXTENT_MAP_INLINE) {
4040 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
4041 				  FIEMAP_EXTENT_NOT_ALIGNED);
4042 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
4043 			flags |= (FIEMAP_EXTENT_DELALLOC |
4044 				  FIEMAP_EXTENT_UNKNOWN);
4045 		} else {
4046 			disko = em->block_start + offset_in_extent;
4047 		}
4048 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
4049 			flags |= FIEMAP_EXTENT_ENCODED;
4050 
4051 		free_extent_map(em);
4052 		em = NULL;
4053 		if ((em_start >= last) || em_len == (u64)-1 ||
4054 		   (last == (u64)-1 && isize <= em_end)) {
4055 			flags |= FIEMAP_EXTENT_LAST;
4056 			end = 1;
4057 		}
4058 
4059 		/* now scan forward to see if this is really the last extent. */
4060 		em = get_extent_skip_holes(inode, off, last_for_get_extent,
4061 					   get_extent);
4062 		if (IS_ERR(em)) {
4063 			ret = PTR_ERR(em);
4064 			goto out;
4065 		}
4066 		if (!em) {
4067 			flags |= FIEMAP_EXTENT_LAST;
4068 			end = 1;
4069 		}
4070 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
4071 					      em_len, flags);
4072 		if (ret)
4073 			goto out_free;
4074 	}
4075 out_free:
4076 	free_extent_map(em);
4077 out:
4078 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
4079 			     &cached_state, GFP_NOFS);
4080 	return ret;
4081 }
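
/*
 * Example (sketch only): extent_fiemap() is meant to be driven from an
 * inode ->fiemap handler that supplies the filesystem's get_extent_t
 * callback.  example_get_extent stands in for that callback here; it is
 * an assumption of the sketch, not a function defined in this file:
 *
 *	static int example_fiemap(struct inode *inode,
 *				  struct fiemap_extent_info *fieinfo,
 *				  u64 start, u64 len)
 *	{
 *		return extent_fiemap(inode, fieinfo, start, len,
 *				     example_get_extent);
 *	}
 */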
4082 
4083 static void __free_extent_buffer(struct extent_buffer *eb)
4084 {
4085 	btrfs_leak_debug_del(&eb->leak_list);
4086 	kmem_cache_free(extent_buffer_cache, eb);
4087 }
4088 
4089 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4090 						   u64 start,
4091 						   unsigned long len,
4092 						   gfp_t mask)
4093 {
4094 	struct extent_buffer *eb = NULL;
4095 
4096 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
4097 	if (eb == NULL)
4098 		return NULL;
4099 	eb->start = start;
4100 	eb->len = len;
4101 	eb->tree = tree;
4102 	eb->bflags = 0;
4103 	rwlock_init(&eb->lock);
4104 	atomic_set(&eb->write_locks, 0);
4105 	atomic_set(&eb->read_locks, 0);
4106 	atomic_set(&eb->blocking_readers, 0);
4107 	atomic_set(&eb->blocking_writers, 0);
4108 	atomic_set(&eb->spinning_readers, 0);
4109 	atomic_set(&eb->spinning_writers, 0);
4110 	eb->lock_nested = 0;
4111 	init_waitqueue_head(&eb->write_lock_wq);
4112 	init_waitqueue_head(&eb->read_lock_wq);
4113 
4114 	btrfs_leak_debug_add(&eb->leak_list, &buffers);
4115 
4116 	spin_lock_init(&eb->refs_lock);
4117 	atomic_set(&eb->refs, 1);
4118 	atomic_set(&eb->io_pages, 0);
4119 
4120 	/*
4121 	 * Sanity checks, currently the maximum is 64k covered by 16x 4k pages
4122 	 */
4123 	BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE
4124 		> MAX_INLINE_EXTENT_BUFFER_SIZE);
4125 	BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE);
4126 
4127 	return eb;
4128 }
4129 
4130 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4131 {
4132 	unsigned long i;
4133 	struct page *p;
4134 	struct extent_buffer *new;
4135 	unsigned long num_pages = num_extent_pages(src->start, src->len);
4136 
4137 	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
4138 	if (new == NULL)
4139 		return NULL;
4140 
4141 	for (i = 0; i < num_pages; i++) {
4142 		p = alloc_page(GFP_ATOMIC);
4143 		BUG_ON(!p);
4144 		attach_extent_buffer_page(new, p);
4145 		WARN_ON(PageDirty(p));
4146 		SetPageUptodate(p);
4147 		new->pages[i] = p;
4148 	}
4149 
4150 	copy_extent_buffer(new, src, 0, 0, src->len);
4151 	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4152 	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4153 
4154 	return new;
4155 }
4156 
4157 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4158 {
4159 	struct extent_buffer *eb;
4160 	unsigned long num_pages = num_extent_pages(0, len);
4161 	unsigned long i;
4162 
4163 	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4164 	if (!eb)
4165 		return NULL;
4166 
4167 	for (i = 0; i < num_pages; i++) {
4168 		eb->pages[i] = alloc_page(GFP_ATOMIC);
4169 		if (!eb->pages[i])
4170 			goto err;
4171 	}
4172 	set_extent_buffer_uptodate(eb);
4173 	btrfs_set_header_nritems(eb, 0);
4174 	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4175 
4176 	return eb;
4177 err:
4178 	for (; i > 0; i--)
4179 		__free_page(eb->pages[i - 1]);
4180 	__free_extent_buffer(eb);
4181 	return NULL;
4182 }
4183 
4184 static int extent_buffer_under_io(struct extent_buffer *eb)
4185 {
4186 	return (atomic_read(&eb->io_pages) ||
4187 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4188 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4189 }
4190 
4191 /*
4192  * Helper for releasing the pages attached to an extent buffer.
4193  */
4194 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4195 						unsigned long start_idx)
4196 {
4197 	unsigned long index;
4198 	unsigned long num_pages;
4199 	struct page *page;
4200 	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4201 
4202 	BUG_ON(extent_buffer_under_io(eb));
4203 
4204 	num_pages = num_extent_pages(eb->start, eb->len);
4205 	index = start_idx + num_pages;
4206 	if (start_idx >= index)
4207 		return;
4208 
4209 	do {
4210 		index--;
4211 		page = extent_buffer_page(eb, index);
4212 		if (page && mapped) {
4213 			spin_lock(&page->mapping->private_lock);
4214 			/*
4215 			 * We do this since we'll remove the pages after we've
4216 			 * removed the eb from the radix tree, so we could race
4217 			 * and have this page now attached to a new eb.  So
4218 			 * only clear page_private if it's still connected to
4219 			 * this eb.
4220 			 */
4221 			if (PagePrivate(page) &&
4222 			    page->private == (unsigned long)eb) {
4223 				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4224 				BUG_ON(PageDirty(page));
4225 				BUG_ON(PageWriteback(page));
4226 				/*
4227 				 * We need to make sure we haven't been attached
4228 				 * to a new eb.
4229 				 */
4230 				ClearPagePrivate(page);
4231 				set_page_private(page, 0);
4232 				/* One for the page private */
4233 				page_cache_release(page);
4234 			}
4235 			spin_unlock(&page->mapping->private_lock);
4236 
4237 		}
4238 		if (page) {
4239 			/* One for when we allocated the page */
4240 			page_cache_release(page);
4241 		}
4242 	} while (index != start_idx);
4243 }
4244 
4245 /*
4246  * Helper for releasing the extent buffer.
4247  */
4248 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4249 {
4250 	btrfs_release_extent_buffer_page(eb, 0);
4251 	__free_extent_buffer(eb);
4252 }
4253 
4254 static void check_buffer_tree_ref(struct extent_buffer *eb)
4255 {
4256 	int refs;
4257 	/* the ref bit is tricky.  We have to make sure it is set
4258 	 * if we have the buffer dirty.  Otherwise the
4259 	 * code to free a buffer can end up dropping a dirty
4260 	 * page
4261 	 *
4262 	 * Once the ref bit is set, it won't go away while the
4263 	 * buffer is dirty or in writeback, and it also won't
4264 	 * go away while we have the reference count on the
4265 	 * eb bumped.
4266 	 *
4267 	 * We can't just set the ref bit without bumping the
4268 	 * ref on the eb because free_extent_buffer might
4269 	 * see the ref bit and try to clear it.  If this happens
4270 	 * free_extent_buffer might end up dropping our original
4271 	 * ref by mistake and freeing the page before we are able
4272 	 * to add one more ref.
4273 	 *
4274 	 * So bump the ref count first, then set the bit.  If someone
4275 	 * beat us to it, drop the ref we added.
4276 	 */
4277 	refs = atomic_read(&eb->refs);
4278 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4279 		return;
4280 
4281 	spin_lock(&eb->refs_lock);
4282 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4283 		atomic_inc(&eb->refs);
4284 	spin_unlock(&eb->refs_lock);
4285 }
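
/*
 * Usage sketch for the rule described in check_buffer_tree_ref() above:
 * take your own reference before relying on the tree ref.  A typical
 * reader therefore pairs the lookup (which bumps eb->refs) with
 * free_extent_buffer().  The locking in the middle is elided:
 *
 *	struct extent_buffer *eb;
 *
 *	eb = find_extent_buffer(tree, start, len);
 *	if (eb) {
 *		... read or lock the buffer ...
 *		free_extent_buffer(eb);
 *	}
 */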
4286 
4287 static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4288 {
4289 	unsigned long num_pages, i;
4290 
4291 	check_buffer_tree_ref(eb);
4292 
4293 	num_pages = num_extent_pages(eb->start, eb->len);
4294 	for (i = 0; i < num_pages; i++) {
4295 		struct page *p = extent_buffer_page(eb, i);
4296 		mark_page_accessed(p);
4297 	}
4298 }
4299 
4300 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4301 					  u64 start, unsigned long len)
4302 {
4303 	unsigned long num_pages = num_extent_pages(start, len);
4304 	unsigned long i;
4305 	unsigned long index = start >> PAGE_CACHE_SHIFT;
4306 	struct extent_buffer *eb;
4307 	struct extent_buffer *exists = NULL;
4308 	struct page *p;
4309 	struct address_space *mapping = tree->mapping;
4310 	int uptodate = 1;
4311 	int ret;
4312 
4313 	rcu_read_lock();
4314 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4315 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4316 		rcu_read_unlock();
4317 		mark_extent_buffer_accessed(eb);
4318 		return eb;
4319 	}
4320 	rcu_read_unlock();
4321 
4322 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4323 	if (!eb)
4324 		return NULL;
4325 
4326 	for (i = 0; i < num_pages; i++, index++) {
4327 		p = find_or_create_page(mapping, index, GFP_NOFS);
4328 		if (!p)
4329 			goto free_eb;
4330 
4331 		spin_lock(&mapping->private_lock);
4332 		if (PagePrivate(p)) {
4333 			/*
4334 			 * We could have already allocated an eb for this page
4335 			 * and attached one, so let's see if we can get a ref on
4336 			 * the existing eb.  If we can, we know it's good and we
4337 			 * can just return that one; otherwise we know we can
4338 			 * safely overwrite page->private.
4339 			 */
4340 			exists = (struct extent_buffer *)p->private;
4341 			if (atomic_inc_not_zero(&exists->refs)) {
4342 				spin_unlock(&mapping->private_lock);
4343 				unlock_page(p);
4344 				page_cache_release(p);
4345 				mark_extent_buffer_accessed(exists);
4346 				goto free_eb;
4347 			}
4348 
4349 			/*
4350 			 * Do this so attach doesn't complain, and drop the
4351 			 * ref the old eb had on this page.
4352 			 */
4353 			ClearPagePrivate(p);
4354 			WARN_ON(PageDirty(p));
4355 			page_cache_release(p);
4356 		}
4357 		attach_extent_buffer_page(eb, p);
4358 		spin_unlock(&mapping->private_lock);
4359 		WARN_ON(PageDirty(p));
4360 		mark_page_accessed(p);
4361 		eb->pages[i] = p;
4362 		if (!PageUptodate(p))
4363 			uptodate = 0;
4364 
4365 		/*
4366 		 * see below about how we avoid a nasty race with releasepage
4367 		 * and why we unlock later
4368 		 */
4369 	}
4370 	if (uptodate)
4371 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4372 again:
4373 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4374 	if (ret)
4375 		goto free_eb;
4376 
4377 	spin_lock(&tree->buffer_lock);
4378 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4379 	if (ret == -EEXIST) {
4380 		exists = radix_tree_lookup(&tree->buffer,
4381 						start >> PAGE_CACHE_SHIFT);
4382 		if (!atomic_inc_not_zero(&exists->refs)) {
4383 			spin_unlock(&tree->buffer_lock);
4384 			radix_tree_preload_end();
4385 			exists = NULL;
4386 			goto again;
4387 		}
4388 		spin_unlock(&tree->buffer_lock);
4389 		radix_tree_preload_end();
4390 		mark_extent_buffer_accessed(exists);
4391 		goto free_eb;
4392 	}
4393 	/* add one reference for the tree */
4394 	check_buffer_tree_ref(eb);
4395 	spin_unlock(&tree->buffer_lock);
4396 	radix_tree_preload_end();
4397 
4398 	/*
4399 	 * there is a race where releasepage may have
4400 	 * tried to find this extent buffer in the radix
4401 	 * tree but failed.  It will tell the VM it is safe to
4402 	 * reclaim the page, and it will clear the page private bit.
4403 	 * We must make sure to set the page private bit properly
4404 	 * after the extent buffer is in the radix tree so
4405 	 * it doesn't get lost
4406 	 */
4407 	SetPageChecked(eb->pages[0]);
4408 	for (i = 1; i < num_pages; i++) {
4409 		p = extent_buffer_page(eb, i);
4410 		ClearPageChecked(p);
4411 		unlock_page(p);
4412 	}
4413 	unlock_page(eb->pages[0]);
4414 	return eb;
4415 
4416 free_eb:
4417 	for (i = 0; i < num_pages; i++) {
4418 		if (eb->pages[i])
4419 			unlock_page(eb->pages[i]);
4420 	}
4421 
4422 	WARN_ON(!atomic_dec_and_test(&eb->refs));
4423 	btrfs_release_extent_buffer(eb);
4424 	return exists;
4425 }
4426 
4427 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4428 					 u64 start, unsigned long len)
4429 {
4430 	struct extent_buffer *eb;
4431 
4432 	rcu_read_lock();
4433 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4434 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4435 		rcu_read_unlock();
4436 		mark_extent_buffer_accessed(eb);
4437 		return eb;
4438 	}
4439 	rcu_read_unlock();
4440 
4441 	return NULL;
4442 }
4443 
4444 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4445 {
4446 	struct extent_buffer *eb =
4447 			container_of(head, struct extent_buffer, rcu_head);
4448 
4449 	__free_extent_buffer(eb);
4450 }
4451 
4452 /* Expects to have eb->refs_lock already held */
4453 static int release_extent_buffer(struct extent_buffer *eb)
4454 {
4455 	WARN_ON(atomic_read(&eb->refs) == 0);
4456 	if (atomic_dec_and_test(&eb->refs)) {
4457 		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4458 			spin_unlock(&eb->refs_lock);
4459 		} else {
4460 			struct extent_io_tree *tree = eb->tree;
4461 
4462 			spin_unlock(&eb->refs_lock);
4463 
4464 			spin_lock(&tree->buffer_lock);
4465 			radix_tree_delete(&tree->buffer,
4466 					  eb->start >> PAGE_CACHE_SHIFT);
4467 			spin_unlock(&tree->buffer_lock);
4468 		}
4469 
4470 		/* Should be safe to release our pages at this point */
4471 		btrfs_release_extent_buffer_page(eb, 0);
4472 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4473 		return 1;
4474 	}
4475 	spin_unlock(&eb->refs_lock);
4476 
4477 	return 0;
4478 }
4479 
4480 void free_extent_buffer(struct extent_buffer *eb)
4481 {
4482 	int refs;
4483 	int old;
4484 	if (!eb)
4485 		return;
4486 
4487 	while (1) {
4488 		refs = atomic_read(&eb->refs);
4489 		if (refs <= 3)
4490 			break;
4491 		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
4492 		if (old == refs)
4493 			return;
4494 	}
4495 
4496 	spin_lock(&eb->refs_lock);
4497 	if (atomic_read(&eb->refs) == 2 &&
4498 	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4499 		atomic_dec(&eb->refs);
4500 
4501 	if (atomic_read(&eb->refs) == 2 &&
4502 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4503 	    !extent_buffer_under_io(eb) &&
4504 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4505 		atomic_dec(&eb->refs);
4506 
4507 	/*
4508 	 * I know this is terrible, but it's temporary until we stop tracking
4509 	 * the uptodate bits and such for the extent buffers.
4510 	 */
4511 	release_extent_buffer(eb);
4512 }
4513 
4514 void free_extent_buffer_stale(struct extent_buffer *eb)
4515 {
4516 	if (!eb)
4517 		return;
4518 
4519 	spin_lock(&eb->refs_lock);
4520 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4521 
4522 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4523 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4524 		atomic_dec(&eb->refs);
4525 	release_extent_buffer(eb);
4526 }
4527 
4528 void clear_extent_buffer_dirty(struct extent_buffer *eb)
4529 {
4530 	unsigned long i;
4531 	unsigned long num_pages;
4532 	struct page *page;
4533 
4534 	num_pages = num_extent_pages(eb->start, eb->len);
4535 
4536 	for (i = 0; i < num_pages; i++) {
4537 		page = extent_buffer_page(eb, i);
4538 		if (!PageDirty(page))
4539 			continue;
4540 
4541 		lock_page(page);
4542 		WARN_ON(!PagePrivate(page));
4543 
4544 		clear_page_dirty_for_io(page);
4545 		spin_lock_irq(&page->mapping->tree_lock);
4546 		if (!PageDirty(page)) {
4547 			radix_tree_tag_clear(&page->mapping->page_tree,
4548 						page_index(page),
4549 						PAGECACHE_TAG_DIRTY);
4550 		}
4551 		spin_unlock_irq(&page->mapping->tree_lock);
4552 		ClearPageError(page);
4553 		unlock_page(page);
4554 	}
4555 	WARN_ON(atomic_read(&eb->refs) == 0);
4556 }
4557 
4558 int set_extent_buffer_dirty(struct extent_buffer *eb)
4559 {
4560 	unsigned long i;
4561 	unsigned long num_pages;
4562 	int was_dirty = 0;
4563 
4564 	check_buffer_tree_ref(eb);
4565 
4566 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4567 
4568 	num_pages = num_extent_pages(eb->start, eb->len);
4569 	WARN_ON(atomic_read(&eb->refs) == 0);
4570 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4571 
4572 	for (i = 0; i < num_pages; i++)
4573 		set_page_dirty(extent_buffer_page(eb, i));
4574 	return was_dirty;
4575 }
4576 
4577 int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4578 {
4579 	unsigned long i;
4580 	struct page *page;
4581 	unsigned long num_pages;
4582 
4583 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4584 	num_pages = num_extent_pages(eb->start, eb->len);
4585 	for (i = 0; i < num_pages; i++) {
4586 		page = extent_buffer_page(eb, i);
4587 		if (page)
4588 			ClearPageUptodate(page);
4589 	}
4590 	return 0;
4591 }
4592 
4593 int set_extent_buffer_uptodate(struct extent_buffer *eb)
4594 {
4595 	unsigned long i;
4596 	struct page *page;
4597 	unsigned long num_pages;
4598 
4599 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4600 	num_pages = num_extent_pages(eb->start, eb->len);
4601 	for (i = 0; i < num_pages; i++) {
4602 		page = extent_buffer_page(eb, i);
4603 		SetPageUptodate(page);
4604 	}
4605 	return 0;
4606 }
4607 
4608 int extent_buffer_uptodate(struct extent_buffer *eb)
4609 {
4610 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4611 }
4612 
4613 int read_extent_buffer_pages(struct extent_io_tree *tree,
4614 			     struct extent_buffer *eb, u64 start, int wait,
4615 			     get_extent_t *get_extent, int mirror_num)
4616 {
4617 	unsigned long i;
4618 	unsigned long start_i;
4619 	struct page *page;
4620 	int err;
4621 	int ret = 0;
4622 	int locked_pages = 0;
4623 	int all_uptodate = 1;
4624 	unsigned long num_pages;
4625 	unsigned long num_reads = 0;
4626 	struct bio *bio = NULL;
4627 	unsigned long bio_flags = 0;
4628 
4629 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4630 		return 0;
4631 
4632 	if (start) {
4633 		WARN_ON(start < eb->start);
4634 		start_i = (start >> PAGE_CACHE_SHIFT) -
4635 			(eb->start >> PAGE_CACHE_SHIFT);
4636 	} else {
4637 		start_i = 0;
4638 	}
4639 
4640 	num_pages = num_extent_pages(eb->start, eb->len);
4641 	for (i = start_i; i < num_pages; i++) {
4642 		page = extent_buffer_page(eb, i);
4643 		if (wait == WAIT_NONE) {
4644 			if (!trylock_page(page))
4645 				goto unlock_exit;
4646 		} else {
4647 			lock_page(page);
4648 		}
4649 		locked_pages++;
4650 		if (!PageUptodate(page)) {
4651 			num_reads++;
4652 			all_uptodate = 0;
4653 		}
4654 	}
4655 	if (all_uptodate) {
4656 		if (start_i == 0)
4657 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4658 		goto unlock_exit;
4659 	}
4660 
4661 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4662 	eb->read_mirror = 0;
4663 	atomic_set(&eb->io_pages, num_reads);
4664 	for (i = start_i; i < num_pages; i++) {
4665 		page = extent_buffer_page(eb, i);
4666 		if (!PageUptodate(page)) {
4667 			ClearPageError(page);
4668 			err = __extent_read_full_page(tree, page,
4669 						      get_extent, &bio,
4670 						      mirror_num, &bio_flags,
4671 						      READ | REQ_META);
4672 			if (err)
4673 				ret = err;
4674 		} else {
4675 			unlock_page(page);
4676 		}
4677 	}
4678 
4679 	if (bio) {
4680 		err = submit_one_bio(READ | REQ_META, bio, mirror_num,
4681 				     bio_flags);
4682 		if (err)
4683 			return err;
4684 	}
4685 
4686 	if (ret || wait != WAIT_COMPLETE)
4687 		return ret;
4688 
4689 	for (i = start_i; i < num_pages; i++) {
4690 		page = extent_buffer_page(eb, i);
4691 		wait_on_page_locked(page);
4692 		if (!PageUptodate(page))
4693 			ret = -EIO;
4694 	}
4695 
4696 	return ret;
4697 
4698 unlock_exit:
4699 	i = start_i;
4700 	while (locked_pages > 0) {
4701 		page = extent_buffer_page(eb, i);
4702 		i++;
4703 		unlock_page(page);
4704 		locked_pages--;
4705 	}
4706 	return ret;
4707 }
4708 
4709 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4710 			unsigned long start,
4711 			unsigned long len)
4712 {
4713 	size_t cur;
4714 	size_t offset;
4715 	struct page *page;
4716 	char *kaddr;
4717 	char *dst = (char *)dstv;
4718 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4719 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4720 
4721 	WARN_ON(start > eb->len);
4722 	WARN_ON(start + len > eb->start + eb->len);
4723 
4724 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4725 
4726 	while (len > 0) {
4727 		page = extent_buffer_page(eb, i);
4728 
4729 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4730 		kaddr = page_address(page);
4731 		memcpy(dst, kaddr + offset, cur);
4732 
4733 		dst += cur;
4734 		len -= cur;
4735 		offset = 0;
4736 		i++;
4737 	}
4738 }
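
/*
 * Example (sketch): callers treat the extent buffer as a flat byte range
 * of length eb->len and copy pieces of it onto the stack.  The offset used
 * below is a placeholder, not a value taken from this file:
 *
 *	struct btrfs_disk_key disk_key;
 *
 *	read_extent_buffer(eb, &disk_key, key_offset, sizeof(disk_key));
 */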
4739 
4740 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4741 			       unsigned long min_len, char **map,
4742 			       unsigned long *map_start,
4743 			       unsigned long *map_len)
4744 {
4745 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4746 	char *kaddr;
4747 	struct page *p;
4748 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4749 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4750 	unsigned long end_i = (start_offset + start + min_len - 1) >>
4751 		PAGE_CACHE_SHIFT;
4752 
4753 	if (i != end_i)
4754 		return -EINVAL;
4755 
4756 	if (i == 0) {
4757 		offset = start_offset;
4758 		*map_start = 0;
4759 	} else {
4760 		offset = 0;
4761 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4762 	}
4763 
4764 	if (start + min_len > eb->len) {
4765 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4766 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
4767 		       eb->len, start, min_len);
4768 		return -EINVAL;
4769 	}
4770 
4771 	p = extent_buffer_page(eb, i);
4772 	kaddr = page_address(p);
4773 	*map = kaddr + offset;
4774 	*map_len = PAGE_CACHE_SIZE - offset;
4775 	return 0;
4776 }
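
/*
 * Example (sketch): a caller that wants a direct pointer into the buffer
 * tries the single-page mapping first and falls back to a copy when the
 * range straddles a page boundary.  dst, start and len are placeholders:
 *
 *	char *kaddr;
 *	unsigned long map_start, map_len;
 *
 *	if (!map_private_extent_buffer(eb, start, len, &kaddr,
 *				       &map_start, &map_len))
 *		memcpy(dst, kaddr + (start - map_start), len);
 *	else
 *		read_extent_buffer(eb, dst, start, len);
 */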
4777 
4778 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4779 			  unsigned long start,
4780 			  unsigned long len)
4781 {
4782 	size_t cur;
4783 	size_t offset;
4784 	struct page *page;
4785 	char *kaddr;
4786 	char *ptr = (char *)ptrv;
4787 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4788 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4789 	int ret = 0;
4790 
4791 	WARN_ON(start > eb->len);
4792 	WARN_ON(start + len > eb->start + eb->len);
4793 
4794 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4795 
4796 	while (len > 0) {
4797 		page = extent_buffer_page(eb, i);
4798 
4799 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4800 
4801 		kaddr = page_address(page);
4802 		ret = memcmp(ptr, kaddr + offset, cur);
4803 		if (ret)
4804 			break;
4805 
4806 		ptr += cur;
4807 		len -= cur;
4808 		offset = 0;
4809 		i++;
4810 	}
4811 	return ret;
4812 }
4813 
4814 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4815 			 unsigned long start, unsigned long len)
4816 {
4817 	size_t cur;
4818 	size_t offset;
4819 	struct page *page;
4820 	char *kaddr;
4821 	char *src = (char *)srcv;
4822 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4823 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4824 
4825 	WARN_ON(start > eb->len);
4826 	WARN_ON(start + len > eb->start + eb->len);
4827 
4828 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4829 
4830 	while (len > 0) {
4831 		page = extent_buffer_page(eb, i);
4832 		WARN_ON(!PageUptodate(page));
4833 
4834 		cur = min(len, PAGE_CACHE_SIZE - offset);
4835 		kaddr = page_address(page);
4836 		memcpy(kaddr + offset, src, cur);
4837 
4838 		src += cur;
4839 		len -= cur;
4840 		offset = 0;
4841 		i++;
4842 	}
4843 }
4844 
4845 void memset_extent_buffer(struct extent_buffer *eb, char c,
4846 			  unsigned long start, unsigned long len)
4847 {
4848 	size_t cur;
4849 	size_t offset;
4850 	struct page *page;
4851 	char *kaddr;
4852 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4853 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4854 
4855 	WARN_ON(start > eb->len);
4856 	WARN_ON(start + len > eb->start + eb->len);
4857 
4858 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4859 
4860 	while (len > 0) {
4861 		page = extent_buffer_page(eb, i);
4862 		WARN_ON(!PageUptodate(page));
4863 
4864 		cur = min(len, PAGE_CACHE_SIZE - offset);
4865 		kaddr = page_address(page);
4866 		memset(kaddr + offset, c, cur);
4867 
4868 		len -= cur;
4869 		offset = 0;
4870 		i++;
4871 	}
4872 }
4873 
4874 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4875 			unsigned long dst_offset, unsigned long src_offset,
4876 			unsigned long len)
4877 {
4878 	u64 dst_len = dst->len;
4879 	size_t cur;
4880 	size_t offset;
4881 	struct page *page;
4882 	char *kaddr;
4883 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4884 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4885 
4886 	WARN_ON(src->len != dst_len);
4887 
4888 	offset = (start_offset + dst_offset) &
4889 		((unsigned long)PAGE_CACHE_SIZE - 1);
4890 
4891 	while (len > 0) {
4892 		page = extent_buffer_page(dst, i);
4893 		WARN_ON(!PageUptodate(page));
4894 
4895 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4896 
4897 		kaddr = page_address(page);
4898 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
4899 
4900 		src_offset += cur;
4901 		len -= cur;
4902 		offset = 0;
4903 		i++;
4904 	}
4905 }
4906 
4907 static void move_pages(struct page *dst_page, struct page *src_page,
4908 		       unsigned long dst_off, unsigned long src_off,
4909 		       unsigned long len)
4910 {
4911 	char *dst_kaddr = page_address(dst_page);
4912 	if (dst_page == src_page) {
4913 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4914 	} else {
4915 		char *src_kaddr = page_address(src_page);
4916 		char *p = dst_kaddr + dst_off + len;
4917 		char *s = src_kaddr + src_off + len;
4918 
4919 		while (len--)
4920 			*--p = *--s;
4921 	}
4922 }
4923 
4924 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4925 {
4926 	unsigned long distance = (src > dst) ? src - dst : dst - src;
4927 	return distance < len;
4928 }
4929 
4930 static void copy_pages(struct page *dst_page, struct page *src_page,
4931 		       unsigned long dst_off, unsigned long src_off,
4932 		       unsigned long len)
4933 {
4934 	char *dst_kaddr = page_address(dst_page);
4935 	char *src_kaddr;
4936 	int must_memmove = 0;
4937 
4938 	if (dst_page != src_page) {
4939 		src_kaddr = page_address(src_page);
4940 	} else {
4941 		src_kaddr = dst_kaddr;
4942 		if (areas_overlap(src_off, dst_off, len))
4943 			must_memmove = 1;
4944 	}
4945 
4946 	if (must_memmove)
4947 		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4948 	else
4949 		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4950 }
4951 
4952 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4953 			   unsigned long src_offset, unsigned long len)
4954 {
4955 	size_t cur;
4956 	size_t dst_off_in_page;
4957 	size_t src_off_in_page;
4958 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4959 	unsigned long dst_i;
4960 	unsigned long src_i;
4961 
4962 	if (src_offset + len > dst->len) {
4963 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4964 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4965 		BUG_ON(1);
4966 	}
4967 	if (dst_offset + len > dst->len) {
4968 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4969 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4970 		BUG_ON(1);
4971 	}
4972 
4973 	while (len > 0) {
4974 		dst_off_in_page = (start_offset + dst_offset) &
4975 			((unsigned long)PAGE_CACHE_SIZE - 1);
4976 		src_off_in_page = (start_offset + src_offset) &
4977 			((unsigned long)PAGE_CACHE_SIZE - 1);
4978 
4979 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4980 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4981 
4982 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4983 					       src_off_in_page));
4984 		cur = min_t(unsigned long, cur,
4985 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4986 
4987 		copy_pages(extent_buffer_page(dst, dst_i),
4988 			   extent_buffer_page(dst, src_i),
4989 			   dst_off_in_page, src_off_in_page, cur);
4990 
4991 		src_offset += cur;
4992 		dst_offset += cur;
4993 		len -= cur;
4994 	}
4995 }
4996 
4997 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4998 			   unsigned long src_offset, unsigned long len)
4999 {
5000 	size_t cur;
5001 	size_t dst_off_in_page;
5002 	size_t src_off_in_page;
5003 	unsigned long dst_end = dst_offset + len - 1;
5004 	unsigned long src_end = src_offset + len - 1;
5005 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
5006 	unsigned long dst_i;
5007 	unsigned long src_i;
5008 
5009 	if (src_offset + len > dst->len) {
5010 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
5011 		       "len %lu len %lu\n", src_offset, len, dst->len);
5012 		BUG_ON(1);
5013 	}
5014 	if (dst_offset + len > dst->len) {
5015 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
5016 		       "len %lu len %lu\n", dst_offset, len, dst->len);
5017 		BUG_ON(1);
5018 	}
5019 	if (dst_offset < src_offset) {
5020 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
5021 		return;
5022 	}
5023 	while (len > 0) {
5024 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
5025 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
5026 
5027 		dst_off_in_page = (start_offset + dst_end) &
5028 			((unsigned long)PAGE_CACHE_SIZE - 1);
5029 		src_off_in_page = (start_offset + src_end) &
5030 			((unsigned long)PAGE_CACHE_SIZE - 1);
5031 
5032 		cur = min_t(unsigned long, len, src_off_in_page + 1);
5033 		cur = min(cur, dst_off_in_page + 1);
5034 		move_pages(extent_buffer_page(dst, dst_i),
5035 			   extent_buffer_page(dst, src_i),
5036 			   dst_off_in_page - cur + 1,
5037 			   src_off_in_page - cur + 1, cur);
5038 
5039 		dst_end -= cur;
5040 		src_end -= cur;
5041 		len -= cur;
5042 	}
5043 }
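
/*
 * Example (sketch): the typical caller shifts a run of bytes inside one
 * buffer, e.g. opening a gap before an insert.  Source and destination may
 * overlap, which is why the loop above copies backwards when the
 * destination is past the source.  The names below are placeholders:
 *
 *	memmove_extent_buffer(leaf, item_offset + size, item_offset,
 *			      bytes_to_shift);
 */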
5044 
5045 int try_release_extent_buffer(struct page *page)
5046 {
5047 	struct extent_buffer *eb;
5048 
5049 	/*
5050 	 * We need to make sure nobody is attaching this page to an eb right
5051 	 * now.
5052 	 */
5053 	spin_lock(&page->mapping->private_lock);
5054 	if (!PagePrivate(page)) {
5055 		spin_unlock(&page->mapping->private_lock);
5056 		return 1;
5057 	}
5058 
5059 	eb = (struct extent_buffer *)page->private;
5060 	BUG_ON(!eb);
5061 
5062 	/*
5063 	 * This is a little awful but should be ok; we need to make sure that
5064 	 * the eb doesn't disappear out from under us while we're looking at
5065 	 * this page.
5066 	 */
5067 	spin_lock(&eb->refs_lock);
5068 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5069 		spin_unlock(&eb->refs_lock);
5070 		spin_unlock(&page->mapping->private_lock);
5071 		return 0;
5072 	}
5073 	spin_unlock(&page->mapping->private_lock);
5074 
5075 	/*
5076 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
5077 	 * so just return; this page will likely be freed soon anyway.
5078 	 */
5079 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5080 		spin_unlock(&eb->refs_lock);
5081 		return 0;
5082 	}
5083 
5084 	return release_extent_buffer(eb);
5085 }
5086