xref: /openbmc/linux/fs/btrfs/extent_io.c (revision 9d749629)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/pagemap.h>
6 #include <linux/page-flags.h>
7 #include <linux/module.h>
8 #include <linux/spinlock.h>
9 #include <linux/blkdev.h>
10 #include <linux/swap.h>
11 #include <linux/writeback.h>
12 #include <linux/pagevec.h>
13 #include <linux/prefetch.h>
14 #include <linux/cleancache.h>
15 #include "extent_io.h"
16 #include "extent_map.h"
17 #include "compat.h"
18 #include "ctree.h"
19 #include "btrfs_inode.h"
20 #include "volumes.h"
21 #include "check-integrity.h"
22 #include "locking.h"
23 #include "rcu-string.h"
24 
25 static struct kmem_cache *extent_state_cache;
26 static struct kmem_cache *extent_buffer_cache;
27 
28 static LIST_HEAD(buffers);
29 static LIST_HEAD(states);
30 
31 #define LEAK_DEBUG 0
32 #if LEAK_DEBUG
33 static DEFINE_SPINLOCK(leak_lock);
34 #endif
35 
36 #define BUFFER_LRU_MAX 64
37 
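/*
 * tree_entry describes just the fields the rb-tree helpers below care
 * about: the byte range [start, end] and the rb_node linkage used to
 * walk tree->state.
 */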
38 struct tree_entry {
39 	u64 start;
40 	u64 end;
41 	struct rb_node rb_node;
42 };
43 
44 struct extent_page_data {
45 	struct bio *bio;
46 	struct extent_io_tree *tree;
47 	get_extent_t *get_extent;
48 	unsigned long bio_flags;
49 
50 	/* tells writepage not to lock the state bits for this range
51 	 * it still does the unlocking
52 	 */
53 	unsigned int extent_locked:1;
54 
55 	/* tells the submit_bio code to use a WRITE_SYNC */
56 	unsigned int sync_io:1;
57 };
58 
59 static noinline void flush_write_bio(void *data);
60 static inline struct btrfs_fs_info *
61 tree_fs_info(struct extent_io_tree *tree)
62 {
63 	return btrfs_sb(tree->mapping->host->i_sb);
64 }
65 
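/*
 * set up the slab caches used for extent_state and extent_buffer
 * allocations; returns -ENOMEM if either cache cannot be created.
 */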
66 int __init extent_io_init(void)
67 {
68 	extent_state_cache = kmem_cache_create("btrfs_extent_state",
69 			sizeof(struct extent_state), 0,
70 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
71 	if (!extent_state_cache)
72 		return -ENOMEM;
73 
74 	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
75 			sizeof(struct extent_buffer), 0,
76 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
77 	if (!extent_buffer_cache)
78 		goto free_state_cache;
79 	return 0;
80 
81 free_state_cache:
82 	kmem_cache_destroy(extent_state_cache);
83 	return -ENOMEM;
84 }
85 
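/*
 * tear-down counterpart of extent_io_init: report any extent_state or
 * extent_buffer objects still on the leak lists, flush delayed RCU frees
 * and destroy both caches.
 */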
86 void extent_io_exit(void)
87 {
88 	struct extent_state *state;
89 	struct extent_buffer *eb;
90 
91 	while (!list_empty(&states)) {
92 		state = list_entry(states.next, struct extent_state, leak_list);
93 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
94 		       "state %lu in tree %p refs %d\n",
95 		       (unsigned long long)state->start,
96 		       (unsigned long long)state->end,
97 		       state->state, state->tree, atomic_read(&state->refs));
98 		list_del(&state->leak_list);
99 		kmem_cache_free(extent_state_cache, state);
100 
101 	}
102 
103 	while (!list_empty(&buffers)) {
104 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
105 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
106 		       "refs %d\n", (unsigned long long)eb->start,
107 		       eb->len, atomic_read(&eb->refs));
108 		list_del(&eb->leak_list);
109 		kmem_cache_free(extent_buffer_cache, eb);
110 	}
111 
112 	/*
113 	 * Make sure all delayed rcu free are flushed before we
114 	 * destroy caches.
115 	 */
116 	rcu_barrier();
117 	if (extent_state_cache)
118 		kmem_cache_destroy(extent_state_cache);
119 	if (extent_buffer_cache)
120 		kmem_cache_destroy(extent_buffer_cache);
121 }
122 
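/*
 * initialize an extent_io_tree: empty rb-tree of states, empty buffer
 * radix tree, no ops, tied to the given address_space.
 */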
123 void extent_io_tree_init(struct extent_io_tree *tree,
124 			 struct address_space *mapping)
125 {
126 	tree->state = RB_ROOT;
127 	INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
128 	tree->ops = NULL;
129 	tree->dirty_bytes = 0;
130 	spin_lock_init(&tree->lock);
131 	spin_lock_init(&tree->buffer_lock);
132 	tree->mapping = mapping;
133 }
134 
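/*
 * allocate a fresh extent_state: cleared bits/private, not linked into
 * any tree, refcount of one, wait queue initialized.
 */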
135 static struct extent_state *alloc_extent_state(gfp_t mask)
136 {
137 	struct extent_state *state;
138 #if LEAK_DEBUG
139 	unsigned long flags;
140 #endif
141 
142 	state = kmem_cache_alloc(extent_state_cache, mask);
143 	if (!state)
144 		return state;
145 	state->state = 0;
146 	state->private = 0;
147 	state->tree = NULL;
148 #if LEAK_DEBUG
149 	spin_lock_irqsave(&leak_lock, flags);
150 	list_add(&state->leak_list, &states);
151 	spin_unlock_irqrestore(&leak_lock, flags);
152 #endif
153 	atomic_set(&state->refs, 1);
154 	init_waitqueue_head(&state->wq);
155 	trace_alloc_extent_state(state, mask, _RET_IP_);
156 	return state;
157 }
158 
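/* drop one reference; the state is freed once the last reference goes away */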
159 void free_extent_state(struct extent_state *state)
160 {
161 	if (!state)
162 		return;
163 	if (atomic_dec_and_test(&state->refs)) {
164 #if LEAK_DEBUG
165 		unsigned long flags;
166 #endif
167 		WARN_ON(state->tree);
168 #if LEAK_DEBUG
169 		spin_lock_irqsave(&leak_lock, flags);
170 		list_del(&state->leak_list);
171 		spin_unlock_irqrestore(&leak_lock, flags);
172 #endif
173 		trace_free_extent_state(state, _RET_IP_);
174 		kmem_cache_free(extent_state_cache, state);
175 	}
176 }
177 
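/*
 * link 'node' into 'root', keyed by 'offset' (callers pass the range's end).
 * If an existing entry already covers 'offset', nothing is inserted and that
 * entry's node is returned; otherwise the node is inserted and NULL returned.
 */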
178 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
179 				   struct rb_node *node)
180 {
181 	struct rb_node **p = &root->rb_node;
182 	struct rb_node *parent = NULL;
183 	struct tree_entry *entry;
184 
185 	while (*p) {
186 		parent = *p;
187 		entry = rb_entry(parent, struct tree_entry, rb_node);
188 
189 		if (offset < entry->start)
190 			p = &(*p)->rb_left;
191 		else if (offset > entry->end)
192 			p = &(*p)->rb_right;
193 		else
194 			return parent;
195 	}
196 
197 	rb_link_node(node, parent, p);
198 	rb_insert_color(node, root);
199 	return NULL;
200 }
201 
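/*
 * search tree->state for an entry covering 'offset'.  On an exact hit the
 * matching node is returned.  Otherwise NULL is returned and, when the
 * pointers are supplied, *prev_ret/*next_ret are set to the entries
 * neighbouring the hole.
 */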
202 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
203 				     struct rb_node **prev_ret,
204 				     struct rb_node **next_ret)
205 {
206 	struct rb_root *root = &tree->state;
207 	struct rb_node *n = root->rb_node;
208 	struct rb_node *prev = NULL;
209 	struct rb_node *orig_prev = NULL;
210 	struct tree_entry *entry;
211 	struct tree_entry *prev_entry = NULL;
212 
213 	while (n) {
214 		entry = rb_entry(n, struct tree_entry, rb_node);
215 		prev = n;
216 		prev_entry = entry;
217 
218 		if (offset < entry->start)
219 			n = n->rb_left;
220 		else if (offset > entry->end)
221 			n = n->rb_right;
222 		else
223 			return n;
224 	}
225 
226 	if (prev_ret) {
227 		orig_prev = prev;
228 		while (prev && offset > prev_entry->end) {
229 			prev = rb_next(prev);
230 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
231 		}
232 		*prev_ret = prev;
233 		prev = orig_prev;
234 	}
235 
236 	if (next_ret) {
237 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
238 		while (prev && offset < prev_entry->start) {
239 			prev = rb_prev(prev);
240 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
241 		}
242 		*next_ret = prev;
243 	}
244 	return NULL;
245 }
246 
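/*
 * like __etree_search, but when there is no exact match return the first
 * entry that ends at or after 'offset' (or NULL if we ran off the tree).
 */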
247 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
248 					  u64 offset)
249 {
250 	struct rb_node *prev = NULL;
251 	struct rb_node *ret;
252 
253 	ret = __etree_search(tree, offset, &prev, NULL);
254 	if (!ret)
255 		return prev;
256 	return ret;
257 }
258 
259 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
260 		     struct extent_state *other)
261 {
262 	if (tree->ops && tree->ops->merge_extent_hook)
263 		tree->ops->merge_extent_hook(tree->mapping->host, new,
264 					     other);
265 }
266 
267 /*
268  * utility function to look for merge candidates inside a given range.
269  * Any extents with matching state are merged together into a single
270  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set in
271  * their state field are not merged because the end_io handlers need to be
272  * able to do operations on them without sleeping (or doing allocations/splits).
273  *
274  * This should be called with the tree lock held.
275  */
276 static void merge_state(struct extent_io_tree *tree,
277 		        struct extent_state *state)
278 {
279 	struct extent_state *other;
280 	struct rb_node *other_node;
281 
282 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
283 		return;
284 
285 	other_node = rb_prev(&state->rb_node);
286 	if (other_node) {
287 		other = rb_entry(other_node, struct extent_state, rb_node);
288 		if (other->end == state->start - 1 &&
289 		    other->state == state->state) {
290 			merge_cb(tree, state, other);
291 			state->start = other->start;
292 			other->tree = NULL;
293 			rb_erase(&other->rb_node, &tree->state);
294 			free_extent_state(other);
295 		}
296 	}
297 	other_node = rb_next(&state->rb_node);
298 	if (other_node) {
299 		other = rb_entry(other_node, struct extent_state, rb_node);
300 		if (other->start == state->end + 1 &&
301 		    other->state == state->state) {
302 			merge_cb(tree, state, other);
303 			state->end = other->end;
304 			other->tree = NULL;
305 			rb_erase(&other->rb_node, &tree->state);
306 			free_extent_state(other);
307 		}
308 	}
309 }
310 
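/*
 * forward state-bit changes to the tree owner through the optional
 * set_bit/clear_bit hooks in tree->ops.
 */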
311 static void set_state_cb(struct extent_io_tree *tree,
312 			 struct extent_state *state, int *bits)
313 {
314 	if (tree->ops && tree->ops->set_bit_hook)
315 		tree->ops->set_bit_hook(tree->mapping->host, state, bits);
316 }
317 
318 static void clear_state_cb(struct extent_io_tree *tree,
319 			   struct extent_state *state, int *bits)
320 {
321 	if (tree->ops && tree->ops->clear_bit_hook)
322 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
323 }
324 
325 static void set_state_bits(struct extent_io_tree *tree,
326 			   struct extent_state *state, int *bits);
327 
328 /*
329  * insert an extent_state struct into the tree.  'bits' are set on the
330  * struct before it is inserted.
331  *
332  * This may return -EEXIST if the extent is already there, in which case the
333  * state struct is freed.
334  *
335  * The tree lock is not taken internally.  This is a utility function and
336  * probably isn't what you want to call (see set/clear_extent_bit).
337  */
338 static int insert_state(struct extent_io_tree *tree,
339 			struct extent_state *state, u64 start, u64 end,
340 			int *bits)
341 {
342 	struct rb_node *node;
343 
344 	if (end < start)
345 		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
346 		       (unsigned long long)end,
347 		       (unsigned long long)start);
348 	state->start = start;
349 	state->end = end;
350 
351 	set_state_bits(tree, state, bits);
352 
353 	node = tree_insert(&tree->state, end, &state->rb_node);
354 	if (node) {
355 		struct extent_state *found;
356 		found = rb_entry(node, struct extent_state, rb_node);
357 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
358 		       "%llu %llu\n", (unsigned long long)found->start,
359 		       (unsigned long long)found->end,
360 		       (unsigned long long)start, (unsigned long long)end);
361 		return -EEXIST;
362 	}
363 	state->tree = tree;
364 	merge_state(tree, state);
365 	return 0;
366 }
367 
368 static void split_cb(struct extent_io_tree *tree, struct extent_state *orig,
369 		     u64 split)
370 {
371 	if (tree->ops && tree->ops->split_extent_hook)
372 		tree->ops->split_extent_hook(tree->mapping->host, orig, split);
373 }
374 
375 /*
376  * split a given extent state struct in two, inserting the preallocated
377  * struct 'prealloc' as the newly created second half.  'split' indicates an
378  * offset inside 'orig' where it should be split.
379  *
380  * Before calling,
381  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
382  * are two extent state structs in the tree:
383  * prealloc: [orig->start, split - 1]
384  * orig: [ split, orig->end ]
385  *
386  * The tree locks are not taken by this function. They need to be held
387  * by the caller.
388  */
389 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
390 		       struct extent_state *prealloc, u64 split)
391 {
392 	struct rb_node *node;
393 
394 	split_cb(tree, orig, split);
395 
396 	prealloc->start = orig->start;
397 	prealloc->end = split - 1;
398 	prealloc->state = orig->state;
399 	orig->start = split;
400 
401 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
402 	if (node) {
403 		free_extent_state(prealloc);
404 		return -EEXIST;
405 	}
406 	prealloc->tree = tree;
407 	return 0;
408 }
409 
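/* in-tree successor of 'state', or NULL if it is the last entry */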
410 static struct extent_state *next_state(struct extent_state *state)
411 {
412 	struct rb_node *next = rb_next(&state->rb_node);
413 	if (next)
414 		return rb_entry(next, struct extent_state, rb_node);
415 	else
416 		return NULL;
417 }
418 
419 /*
420  * utility function to clear some bits in an extent state struct.
421  * it will optionally wake up anyone waiting on this state (wake == 1).
422  *
423  * If no bits are set on the state struct after clearing things, the
424  * struct is freed and removed from the tree
425  */
426 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
427 					    struct extent_state *state,
428 					    int *bits, int wake)
429 {
430 	struct extent_state *next;
431 	int bits_to_clear = *bits & ~EXTENT_CTLBITS;
432 
433 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
434 		u64 range = state->end - state->start + 1;
435 		WARN_ON(range > tree->dirty_bytes);
436 		tree->dirty_bytes -= range;
437 	}
438 	clear_state_cb(tree, state, bits);
439 	state->state &= ~bits_to_clear;
440 	if (wake)
441 		wake_up(&state->wq);
442 	if (state->state == 0) {
443 		next = next_state(state);
444 		if (state->tree) {
445 			rb_erase(&state->rb_node, &tree->state);
446 			state->tree = NULL;
447 			free_extent_state(state);
448 		} else {
449 			WARN_ON(1);
450 		}
451 	} else {
452 		merge_state(tree, state);
453 		next = next_state(state);
454 	}
455 	return next;
456 }
457 
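/*
 * make sure we have a preallocated extent_state on hand: reuse the one
 * passed in, or fall back to a GFP_ATOMIC allocation.
 */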
458 static struct extent_state *
459 alloc_extent_state_atomic(struct extent_state *prealloc)
460 {
461 	if (!prealloc)
462 		prealloc = alloc_extent_state(GFP_ATOMIC);
463 
464 	return prealloc;
465 }
466 
467 void extent_io_tree_panic(struct extent_io_tree *tree, int err)
468 {
469 	btrfs_panic(tree_fs_info(tree), err, "Locking error: "
470 		    "Extent tree was modified by another "
471 		    "thread while locked.");
472 }
473 
474 /*
475  * clear some bits on a range in the tree.  This may require splitting
476  * or inserting elements in the tree, so the gfp mask is used to
477  * indicate which allocations or sleeping are allowed.
478  *
479  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
480  * the given range from the tree regardless of state (ie for truncate).
481  *
482  * the range [start, end] is inclusive.
483  *
484  * This takes the tree lock, and returns 0 on success and < 0 on error.
485  */
486 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
487 		     int bits, int wake, int delete,
488 		     struct extent_state **cached_state,
489 		     gfp_t mask)
490 {
491 	struct extent_state *state;
492 	struct extent_state *cached;
493 	struct extent_state *prealloc = NULL;
494 	struct rb_node *node;
495 	u64 last_end;
496 	int err;
497 	int clear = 0;
498 
499 	if (delete)
500 		bits |= ~EXTENT_CTLBITS;
501 	bits |= EXTENT_FIRST_DELALLOC;
502 
503 	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
504 		clear = 1;
505 again:
506 	if (!prealloc && (mask & __GFP_WAIT)) {
507 		prealloc = alloc_extent_state(mask);
508 		if (!prealloc)
509 			return -ENOMEM;
510 	}
511 
512 	spin_lock(&tree->lock);
513 	if (cached_state) {
514 		cached = *cached_state;
515 
516 		if (clear) {
517 			*cached_state = NULL;
518 			cached_state = NULL;
519 		}
520 
521 		if (cached && cached->tree && cached->start <= start &&
522 		    cached->end > start) {
523 			if (clear)
524 				atomic_dec(&cached->refs);
525 			state = cached;
526 			goto hit_next;
527 		}
528 		if (clear)
529 			free_extent_state(cached);
530 	}
531 	/*
532 	 * this search will find the extents that end after
533 	 * our range starts
534 	 */
535 	node = tree_search(tree, start);
536 	if (!node)
537 		goto out;
538 	state = rb_entry(node, struct extent_state, rb_node);
539 hit_next:
540 	if (state->start > end)
541 		goto out;
542 	WARN_ON(state->end < start);
543 	last_end = state->end;
544 
545 	/* the state doesn't have the wanted bits, go ahead */
546 	if (!(state->state & bits)) {
547 		state = next_state(state);
548 		goto next;
549 	}
550 
551 	/*
552 	 *     | ---- desired range ---- |
553 	 *  | state | or
554 	 *  | ------------- state -------------- |
555 	 *
556 	 * We need to split the extent we found, and may flip
557 	 * bits on second half.
558 	 *
559 	 * If the extent we found extends past our range, we
560 	 * just split and search again.  It'll get split again
561 	 * the next time though.
562 	 *
563 	 * If the extent we found is inside our range, we clear
564 	 * the desired bit on it.
565 	 */
566 
567 	if (state->start < start) {
568 		prealloc = alloc_extent_state_atomic(prealloc);
569 		BUG_ON(!prealloc);
570 		err = split_state(tree, state, prealloc, start);
571 		if (err)
572 			extent_io_tree_panic(tree, err);
573 
574 		prealloc = NULL;
575 		if (err)
576 			goto out;
577 		if (state->end <= end) {
578 			state = clear_state_bit(tree, state, &bits, wake);
579 			goto next;
580 		}
581 		goto search_again;
582 	}
583 	/*
584 	 * | ---- desired range ---- |
585 	 *                        | state |
586 	 * We need to split the extent, and clear the bit
587 	 * on the first half
588 	 */
589 	if (state->start <= end && state->end > end) {
590 		prealloc = alloc_extent_state_atomic(prealloc);
591 		BUG_ON(!prealloc);
592 		err = split_state(tree, state, prealloc, end + 1);
593 		if (err)
594 			extent_io_tree_panic(tree, err);
595 
596 		if (wake)
597 			wake_up(&state->wq);
598 
599 		clear_state_bit(tree, prealloc, &bits, wake);
600 
601 		prealloc = NULL;
602 		goto out;
603 	}
604 
605 	state = clear_state_bit(tree, state, &bits, wake);
606 next:
607 	if (last_end == (u64)-1)
608 		goto out;
609 	start = last_end + 1;
610 	if (start <= end && state && !need_resched())
611 		goto hit_next;
612 	goto search_again;
613 
614 out:
615 	spin_unlock(&tree->lock);
616 	if (prealloc)
617 		free_extent_state(prealloc);
618 
619 	return 0;
620 
621 search_again:
622 	if (start > end)
623 		goto out;
624 	spin_unlock(&tree->lock);
625 	if (mask & __GFP_WAIT)
626 		cond_resched();
627 	goto again;
628 }
629 
630 static void wait_on_state(struct extent_io_tree *tree,
631 			  struct extent_state *state)
632 		__releases(tree->lock)
633 		__acquires(tree->lock)
634 {
635 	DEFINE_WAIT(wait);
636 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
637 	spin_unlock(&tree->lock);
638 	schedule();
639 	spin_lock(&tree->lock);
640 	finish_wait(&state->wq, &wait);
641 }
642 
643 /*
644  * waits for one or more bits to clear on a range in the state tree.
645  * The range [start, end] is inclusive.
646  * The tree lock is taken by this function
647  */
648 void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
649 {
650 	struct extent_state *state;
651 	struct rb_node *node;
652 
653 	spin_lock(&tree->lock);
654 again:
655 	while (1) {
656 		/*
657 		 * this search will find all the extents that end after
658 		 * our range starts
659 		 */
660 		node = tree_search(tree, start);
661 		if (!node)
662 			break;
663 
664 		state = rb_entry(node, struct extent_state, rb_node);
665 
666 		if (state->start > end)
667 			goto out;
668 
669 		if (state->state & bits) {
670 			start = state->start;
671 			atomic_inc(&state->refs);
672 			wait_on_state(tree, state);
673 			free_extent_state(state);
674 			goto again;
675 		}
676 		start = state->end + 1;
677 
678 		if (start > end)
679 			break;
680 
681 		cond_resched_lock(&tree->lock);
682 	}
683 out:
684 	spin_unlock(&tree->lock);
685 }
686 
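/*
 * set bits on a single extent_state: run the set_bit hook, account newly
 * dirtied ranges in tree->dirty_bytes, then OR in everything except the
 * control bits.
 */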
687 static void set_state_bits(struct extent_io_tree *tree,
688 			   struct extent_state *state,
689 			   int *bits)
690 {
691 	int bits_to_set = *bits & ~EXTENT_CTLBITS;
692 
693 	set_state_cb(tree, state, bits);
694 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
695 		u64 range = state->end - state->start + 1;
696 		tree->dirty_bytes += range;
697 	}
698 	state->state |= bits_to_set;
699 }
700 
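/*
 * take an extra reference on 'state' and stash it in *cached_ptr so the
 * next operation on this range can skip the rb-tree search; only states
 * carrying EXTENT_IOBITS or EXTENT_BOUNDARY are cached, and only if
 * nothing is cached yet.
 */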
701 static void cache_state(struct extent_state *state,
702 			struct extent_state **cached_ptr)
703 {
704 	if (cached_ptr && !(*cached_ptr)) {
705 		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
706 			*cached_ptr = state;
707 			atomic_inc(&state->refs);
708 		}
709 	}
710 }
711 
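/* drop the reference held by *cached_ptr, if any, and clear the pointer */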
712 static void uncache_state(struct extent_state **cached_ptr)
713 {
714 	if (cached_ptr && (*cached_ptr)) {
715 		struct extent_state *state = *cached_ptr;
716 		*cached_ptr = NULL;
717 		free_extent_state(state);
718 	}
719 }
720 
721 /*
722  * set some bits on a range in the tree.  This may require allocations or
723  * sleeping, so the gfp mask is used to indicate what is allowed.
724  *
725  * If any of the exclusive bits are set, this will fail with -EEXIST if some
726  * part of the range already has the desired bits set.  The start of the
727  * existing range is returned in failed_start in this case.
728  *
729  * [start, end] is inclusive.  This takes the tree lock.
730  */
731 
732 static int __must_check
733 __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
734 		 int bits, int exclusive_bits, u64 *failed_start,
735 		 struct extent_state **cached_state, gfp_t mask)
736 {
737 	struct extent_state *state;
738 	struct extent_state *prealloc = NULL;
739 	struct rb_node *node;
740 	int err = 0;
741 	u64 last_start;
742 	u64 last_end;
743 
744 	bits |= EXTENT_FIRST_DELALLOC;
745 again:
746 	if (!prealloc && (mask & __GFP_WAIT)) {
747 		prealloc = alloc_extent_state(mask);
748 		BUG_ON(!prealloc);
749 	}
750 
751 	spin_lock(&tree->lock);
752 	if (cached_state && *cached_state) {
753 		state = *cached_state;
754 		if (state->start <= start && state->end > start &&
755 		    state->tree) {
756 			node = &state->rb_node;
757 			goto hit_next;
758 		}
759 	}
760 	/*
761 	 * this search will find all the extents that end after
762 	 * our range starts.
763 	 */
764 	node = tree_search(tree, start);
765 	if (!node) {
766 		prealloc = alloc_extent_state_atomic(prealloc);
767 		BUG_ON(!prealloc);
768 		err = insert_state(tree, prealloc, start, end, &bits);
769 		if (err)
770 			extent_io_tree_panic(tree, err);
771 
772 		prealloc = NULL;
773 		goto out;
774 	}
775 	state = rb_entry(node, struct extent_state, rb_node);
776 hit_next:
777 	last_start = state->start;
778 	last_end = state->end;
779 
780 	/*
781 	 * | ---- desired range ---- |
782 	 * | state |
783 	 *
784 	 * Just lock what we found and keep going
785 	 */
786 	if (state->start == start && state->end <= end) {
787 		if (state->state & exclusive_bits) {
788 			*failed_start = state->start;
789 			err = -EEXIST;
790 			goto out;
791 		}
792 
793 		set_state_bits(tree, state, &bits);
794 		cache_state(state, cached_state);
795 		merge_state(tree, state);
796 		if (last_end == (u64)-1)
797 			goto out;
798 		start = last_end + 1;
799 		state = next_state(state);
800 		if (start < end && state && state->start == start &&
801 		    !need_resched())
802 			goto hit_next;
803 		goto search_again;
804 	}
805 
806 	/*
807 	 *     | ---- desired range ---- |
808 	 * | state |
809 	 *   or
810 	 * | ------------- state -------------- |
811 	 *
812 	 * We need to split the extent we found, and may flip bits on
813 	 * second half.
814 	 *
815 	 * If the extent we found extends past our
816 	 * range, we just split and search again.  It'll get split
817 	 * again the next time though.
818 	 *
819 	 * If the extent we found is inside our range, we set the
820 	 * desired bit on it.
821 	 */
822 	if (state->start < start) {
823 		if (state->state & exclusive_bits) {
824 			*failed_start = start;
825 			err = -EEXIST;
826 			goto out;
827 		}
828 
829 		prealloc = alloc_extent_state_atomic(prealloc);
830 		BUG_ON(!prealloc);
831 		err = split_state(tree, state, prealloc, start);
832 		if (err)
833 			extent_io_tree_panic(tree, err);
834 
835 		prealloc = NULL;
836 		if (err)
837 			goto out;
838 		if (state->end <= end) {
839 			set_state_bits(tree, state, &bits);
840 			cache_state(state, cached_state);
841 			merge_state(tree, state);
842 			if (last_end == (u64)-1)
843 				goto out;
844 			start = last_end + 1;
845 			state = next_state(state);
846 			if (start < end && state && state->start == start &&
847 			    !need_resched())
848 				goto hit_next;
849 		}
850 		goto search_again;
851 	}
852 	/*
853 	 * | ---- desired range ---- |
854 	 *     | state | or               | state |
855 	 *
856 	 * There's a hole, we need to insert something in it and
857 	 * ignore the extent we found.
858 	 */
859 	if (state->start > start) {
860 		u64 this_end;
861 		if (end < last_start)
862 			this_end = end;
863 		else
864 			this_end = last_start - 1;
865 
866 		prealloc = alloc_extent_state_atomic(prealloc);
867 		BUG_ON(!prealloc);
868 
869 		/*
870 		 * Avoid freeing 'prealloc' if it can be merged with
871 		 * the later extent.
872 		 */
873 		err = insert_state(tree, prealloc, start, this_end,
874 				   &bits);
875 		if (err)
876 			extent_io_tree_panic(tree, err);
877 
878 		cache_state(prealloc, cached_state);
879 		prealloc = NULL;
880 		start = this_end + 1;
881 		goto search_again;
882 	}
883 	/*
884 	 * | ---- desired range ---- |
885 	 *                        | state |
886 	 * We need to split the extent, and set the bit
887 	 * on the first half
888 	 */
889 	if (state->start <= end && state->end > end) {
890 		if (state->state & exclusive_bits) {
891 			*failed_start = start;
892 			err = -EEXIST;
893 			goto out;
894 		}
895 
896 		prealloc = alloc_extent_state_atomic(prealloc);
897 		BUG_ON(!prealloc);
898 		err = split_state(tree, state, prealloc, end + 1);
899 		if (err)
900 			extent_io_tree_panic(tree, err);
901 
902 		set_state_bits(tree, prealloc, &bits);
903 		cache_state(prealloc, cached_state);
904 		merge_state(tree, prealloc);
905 		prealloc = NULL;
906 		goto out;
907 	}
908 
909 	goto search_again;
910 
911 out:
912 	spin_unlock(&tree->lock);
913 	if (prealloc)
914 		free_extent_state(prealloc);
915 
916 	return err;
917 
918 search_again:
919 	if (start > end)
920 		goto out;
921 	spin_unlock(&tree->lock);
922 	if (mask & __GFP_WAIT)
923 		cond_resched();
924 	goto again;
925 }
926 
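/* non-exclusive wrapper around __set_extent_bit */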
927 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits,
928 		   u64 *failed_start, struct extent_state **cached_state,
929 		   gfp_t mask)
930 {
931 	return __set_extent_bit(tree, start, end, bits, 0, failed_start,
932 				cached_state, mask);
933 }
934 
935 
936 /**
937  * convert_extent_bit - convert all bits in a given range from one bit to
938  * 			another
939  * @tree:	the io tree to search
940  * @start:	the start offset in bytes
941  * @end:	the end offset in bytes (inclusive)
942  * @bits:	the bits to set in this range
943  * @clear_bits:	the bits to clear in this range
944  * @cached_state:	state that we're going to cache
945  * @mask:	the allocation mask
946  *
947  * This will go through and set bits for the given range.  If any states exist
948  * already in this range they are set with the given bit and cleared of the
949  * clear_bits.  This is only meant to be used by things that are mergeable, ie
950  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
951  * boundary bits like LOCK.
952  */
953 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
954 		       int bits, int clear_bits,
955 		       struct extent_state **cached_state, gfp_t mask)
956 {
957 	struct extent_state *state;
958 	struct extent_state *prealloc = NULL;
959 	struct rb_node *node;
960 	int err = 0;
961 	u64 last_start;
962 	u64 last_end;
963 
964 again:
965 	if (!prealloc && (mask & __GFP_WAIT)) {
966 		prealloc = alloc_extent_state(mask);
967 		if (!prealloc)
968 			return -ENOMEM;
969 	}
970 
971 	spin_lock(&tree->lock);
972 	if (cached_state && *cached_state) {
973 		state = *cached_state;
974 		if (state->start <= start && state->end > start &&
975 		    state->tree) {
976 			node = &state->rb_node;
977 			goto hit_next;
978 		}
979 	}
980 
981 	/*
982 	 * this search will find all the extents that end after
983 	 * our range starts.
984 	 */
985 	node = tree_search(tree, start);
986 	if (!node) {
987 		prealloc = alloc_extent_state_atomic(prealloc);
988 		if (!prealloc) {
989 			err = -ENOMEM;
990 			goto out;
991 		}
992 		err = insert_state(tree, prealloc, start, end, &bits);
993 		prealloc = NULL;
994 		if (err)
995 			extent_io_tree_panic(tree, err);
996 		goto out;
997 	}
998 	state = rb_entry(node, struct extent_state, rb_node);
999 hit_next:
1000 	last_start = state->start;
1001 	last_end = state->end;
1002 
1003 	/*
1004 	 * | ---- desired range ---- |
1005 	 * | state |
1006 	 *
1007 	 * Just lock what we found and keep going
1008 	 */
1009 	if (state->start == start && state->end <= end) {
1010 		set_state_bits(tree, state, &bits);
1011 		cache_state(state, cached_state);
1012 		state = clear_state_bit(tree, state, &clear_bits, 0);
1013 		if (last_end == (u64)-1)
1014 			goto out;
1015 		start = last_end + 1;
1016 		if (start < end && state && state->start == start &&
1017 		    !need_resched())
1018 			goto hit_next;
1019 		goto search_again;
1020 	}
1021 
1022 	/*
1023 	 *     | ---- desired range ---- |
1024 	 * | state |
1025 	 *   or
1026 	 * | ------------- state -------------- |
1027 	 *
1028 	 * We need to split the extent we found, and may flip bits on
1029 	 * second half.
1030 	 *
1031 	 * If the extent we found extends past our
1032 	 * range, we just split and search again.  It'll get split
1033 	 * again the next time though.
1034 	 *
1035 	 * If the extent we found is inside our range, we set the
1036 	 * desired bit on it.
1037 	 */
1038 	if (state->start < start) {
1039 		prealloc = alloc_extent_state_atomic(prealloc);
1040 		if (!prealloc) {
1041 			err = -ENOMEM;
1042 			goto out;
1043 		}
1044 		err = split_state(tree, state, prealloc, start);
1045 		if (err)
1046 			extent_io_tree_panic(tree, err);
1047 		prealloc = NULL;
1048 		if (err)
1049 			goto out;
1050 		if (state->end <= end) {
1051 			set_state_bits(tree, state, &bits);
1052 			cache_state(state, cached_state);
1053 			state = clear_state_bit(tree, state, &clear_bits, 0);
1054 			if (last_end == (u64)-1)
1055 				goto out;
1056 			start = last_end + 1;
1057 			if (start < end && state && state->start == start &&
1058 			    !need_resched())
1059 				goto hit_next;
1060 		}
1061 		goto search_again;
1062 	}
1063 	/*
1064 	 * | ---- desired range ---- |
1065 	 *     | state | or               | state |
1066 	 *
1067 	 * There's a hole, we need to insert something in it and
1068 	 * ignore the extent we found.
1069 	 */
1070 	if (state->start > start) {
1071 		u64 this_end;
1072 		if (end < last_start)
1073 			this_end = end;
1074 		else
1075 			this_end = last_start - 1;
1076 
1077 		prealloc = alloc_extent_state_atomic(prealloc);
1078 		if (!prealloc) {
1079 			err = -ENOMEM;
1080 			goto out;
1081 		}
1082 
1083 		/*
1084 		 * Avoid freeing 'prealloc' if it can be merged with
1085 		 * the later extent.
1086 		 */
1087 		err = insert_state(tree, prealloc, start, this_end,
1088 				   &bits);
1089 		if (err)
1090 			extent_io_tree_panic(tree, err);
1091 		cache_state(prealloc, cached_state);
1092 		prealloc = NULL;
1093 		start = this_end + 1;
1094 		goto search_again;
1095 	}
1096 	/*
1097 	 * | ---- desired range ---- |
1098 	 *                        | state |
1099 	 * We need to split the extent, and set the bit
1100 	 * on the first half
1101 	 */
1102 	if (state->start <= end && state->end > end) {
1103 		prealloc = alloc_extent_state_atomic(prealloc);
1104 		if (!prealloc) {
1105 			err = -ENOMEM;
1106 			goto out;
1107 		}
1108 
1109 		err = split_state(tree, state, prealloc, end + 1);
1110 		if (err)
1111 			extent_io_tree_panic(tree, err);
1112 
1113 		set_state_bits(tree, prealloc, &bits);
1114 		cache_state(prealloc, cached_state);
1115 		clear_state_bit(tree, prealloc, &clear_bits, 0);
1116 		prealloc = NULL;
1117 		goto out;
1118 	}
1119 
1120 	goto search_again;
1121 
1122 out:
1123 	spin_unlock(&tree->lock);
1124 	if (prealloc)
1125 		free_extent_state(prealloc);
1126 
1127 	return err;
1128 
1129 search_again:
1130 	if (start > end)
1131 		goto out;
1132 	spin_unlock(&tree->lock);
1133 	if (mask & __GFP_WAIT)
1134 		cond_resched();
1135 	goto again;
1136 }
1137 
1138 /* wrappers around set/clear extent bit */
1139 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1140 		     gfp_t mask)
1141 {
1142 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
1143 			      NULL, mask);
1144 }
1145 
1146 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1147 		    int bits, gfp_t mask)
1148 {
1149 	return set_extent_bit(tree, start, end, bits, NULL,
1150 			      NULL, mask);
1151 }
1152 
1153 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1154 		      int bits, gfp_t mask)
1155 {
1156 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1157 }
1158 
1159 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
1160 			struct extent_state **cached_state, gfp_t mask)
1161 {
1162 	return set_extent_bit(tree, start, end,
1163 			      EXTENT_DELALLOC | EXTENT_UPTODATE,
1164 			      NULL, cached_state, mask);
1165 }
1166 
1167 int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
1168 		      struct extent_state **cached_state, gfp_t mask)
1169 {
1170 	return set_extent_bit(tree, start, end,
1171 			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
1172 			      NULL, cached_state, mask);
1173 }
1174 
1175 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1176 		       gfp_t mask)
1177 {
1178 	return clear_extent_bit(tree, start, end,
1179 				EXTENT_DIRTY | EXTENT_DELALLOC |
1180 				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
1181 }
1182 
1183 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
1184 		     gfp_t mask)
1185 {
1186 	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
1187 			      NULL, mask);
1188 }
1189 
1190 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1191 			struct extent_state **cached_state, gfp_t mask)
1192 {
1193 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
1194 			      cached_state, mask);
1195 }
1196 
1197 int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1198 			  struct extent_state **cached_state, gfp_t mask)
1199 {
1200 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
1201 				cached_state, mask);
1202 }
1203 
1204 /*
1205  * either insert or lock the state struct between start and end; use the mask
1206  * to tell us if waiting is desired.
1207  */
1208 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1209 		     int bits, struct extent_state **cached_state)
1210 {
1211 	int err;
1212 	u64 failed_start;
1213 	while (1) {
1214 		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1215 				       EXTENT_LOCKED, &failed_start,
1216 				       cached_state, GFP_NOFS);
1217 		if (err == -EEXIST) {
1218 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1219 			start = failed_start;
1220 		} else
1221 			break;
1222 		WARN_ON(start > end);
1223 	}
1224 	return err;
1225 }
1226 
1227 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1228 {
1229 	return lock_extent_bits(tree, start, end, 0, NULL);
1230 }
1231 
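/*
 * try to take EXTENT_LOCKED on [start, end] without waiting: returns 1 on
 * success, or 0 (after unwinding any partial lock) if part of the range
 * was already locked.
 */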
1232 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1233 {
1234 	int err;
1235 	u64 failed_start;
1236 
1237 	err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1238 			       &failed_start, NULL, GFP_NOFS);
1239 	if (err == -EEXIST) {
1240 		if (failed_start > start)
1241 			clear_extent_bit(tree, start, failed_start - 1,
1242 					 EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS);
1243 		return 0;
1244 	}
1245 	return 1;
1246 }
1247 
1248 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1249 			 struct extent_state **cached, gfp_t mask)
1250 {
1251 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1252 				mask);
1253 }
1254 
1255 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1256 {
1257 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1258 				GFP_NOFS);
1259 }
1260 
1261 /*
1262  * helper function to set all the pages in the given range writeback
1263  */
1264 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1265 {
1266 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1267 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1268 	struct page *page;
1269 
1270 	while (index <= end_index) {
1271 		page = find_get_page(tree->mapping, index);
1272 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1273 		set_page_writeback(page);
1274 		page_cache_release(page);
1275 		index++;
1276 	}
1277 	return 0;
1278 }
1279 
1280 /* find the first state struct with 'bits' set after 'start', and
1281  * return it.  tree->lock must be held.  NULL will be returned if
1282  * nothing was found after 'start'
1283  */
1284 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1285 						 u64 start, int bits)
1286 {
1287 	struct rb_node *node;
1288 	struct extent_state *state;
1289 
1290 	/*
1291 	 * this search will find all the extents that end after
1292 	 * our range starts.
1293 	 */
1294 	node = tree_search(tree, start);
1295 	if (!node)
1296 		goto out;
1297 
1298 	while (1) {
1299 		state = rb_entry(node, struct extent_state, rb_node);
1300 		if (state->end >= start && (state->state & bits))
1301 			return state;
1302 
1303 		node = rb_next(node);
1304 		if (!node)
1305 			break;
1306 	}
1307 out:
1308 	return NULL;
1309 }
1310 
1311 /*
1312  * find the first offset in the io tree with 'bits' set. zero is
1313  * returned if we find something, and *start_ret and *end_ret are
1314  * set to reflect the state struct that was found.
1315  *
1316  * If nothing was found, 1 is returned; if something was found, 0 is returned.
1317  */
1318 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1319 			  u64 *start_ret, u64 *end_ret, int bits,
1320 			  struct extent_state **cached_state)
1321 {
1322 	struct extent_state *state;
1323 	struct rb_node *n;
1324 	int ret = 1;
1325 
1326 	spin_lock(&tree->lock);
1327 	if (cached_state && *cached_state) {
1328 		state = *cached_state;
1329 		if (state->end == start - 1 && state->tree) {
1330 			n = rb_next(&state->rb_node);
1331 			while (n) {
1332 				state = rb_entry(n, struct extent_state,
1333 						 rb_node);
1334 				if (state->state & bits)
1335 					goto got_it;
1336 				n = rb_next(n);
1337 			}
1338 			free_extent_state(*cached_state);
1339 			*cached_state = NULL;
1340 			goto out;
1341 		}
1342 		free_extent_state(*cached_state);
1343 		*cached_state = NULL;
1344 	}
1345 
1346 	state = find_first_extent_bit_state(tree, start, bits);
1347 got_it:
1348 	if (state) {
1349 		cache_state(state, cached_state);
1350 		*start_ret = state->start;
1351 		*end_ret = state->end;
1352 		ret = 0;
1353 	}
1354 out:
1355 	spin_unlock(&tree->lock);
1356 	return ret;
1357 }
1358 
1359 /*
1360  * find a contiguous range of bytes in the file marked as delalloc, not
1361  * more than 'max_bytes'.  start and end are used to return the range.
1362  *
1363  * 1 is returned if we find something, 0 if nothing was in the tree
1364  */
1365 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1366 					u64 *start, u64 *end, u64 max_bytes,
1367 					struct extent_state **cached_state)
1368 {
1369 	struct rb_node *node;
1370 	struct extent_state *state;
1371 	u64 cur_start = *start;
1372 	u64 found = 0;
1373 	u64 total_bytes = 0;
1374 
1375 	spin_lock(&tree->lock);
1376 
1377 	/*
1378 	 * this search will find all the extents that end after
1379 	 * our range starts.
1380 	 */
1381 	node = tree_search(tree, cur_start);
1382 	if (!node) {
1383 		if (!found)
1384 			*end = (u64)-1;
1385 		goto out;
1386 	}
1387 
1388 	while (1) {
1389 		state = rb_entry(node, struct extent_state, rb_node);
1390 		if (found && (state->start != cur_start ||
1391 			      (state->state & EXTENT_BOUNDARY))) {
1392 			goto out;
1393 		}
1394 		if (!(state->state & EXTENT_DELALLOC)) {
1395 			if (!found)
1396 				*end = state->end;
1397 			goto out;
1398 		}
1399 		if (!found) {
1400 			*start = state->start;
1401 			*cached_state = state;
1402 			atomic_inc(&state->refs);
1403 		}
1404 		found++;
1405 		*end = state->end;
1406 		cur_start = state->end + 1;
1407 		node = rb_next(node);
1408 		if (!node)
1409 			break;
1410 		total_bytes += state->end - state->start + 1;
1411 		if (total_bytes >= max_bytes)
1412 			break;
1413 	}
1414 out:
1415 	spin_unlock(&tree->lock);
1416 	return found;
1417 }
1418 
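/*
 * unlock every page in [start, end] except locked_page, dropping the page
 * cache references taken by find_get_pages_contig as we go.
 */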
1419 static noinline void __unlock_for_delalloc(struct inode *inode,
1420 					   struct page *locked_page,
1421 					   u64 start, u64 end)
1422 {
1423 	int ret;
1424 	struct page *pages[16];
1425 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1426 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1427 	unsigned long nr_pages = end_index - index + 1;
1428 	int i;
1429 
1430 	if (index == locked_page->index && end_index == index)
1431 		return;
1432 
1433 	while (nr_pages > 0) {
1434 		ret = find_get_pages_contig(inode->i_mapping, index,
1435 				     min_t(unsigned long, nr_pages,
1436 				     ARRAY_SIZE(pages)), pages);
1437 		for (i = 0; i < ret; i++) {
1438 			if (pages[i] != locked_page)
1439 				unlock_page(pages[i]);
1440 			page_cache_release(pages[i]);
1441 		}
1442 		nr_pages -= ret;
1443 		index += ret;
1444 		cond_resched();
1445 	}
1446 }
1447 
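/*
 * lock every page in the delalloc range except locked_page.  Returns 0 on
 * success, or -EAGAIN (after unlocking whatever we managed to lock) if a
 * page was no longer dirty or had been removed from the mapping.
 */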
1448 static noinline int lock_delalloc_pages(struct inode *inode,
1449 					struct page *locked_page,
1450 					u64 delalloc_start,
1451 					u64 delalloc_end)
1452 {
1453 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1454 	unsigned long start_index = index;
1455 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1456 	unsigned long pages_locked = 0;
1457 	struct page *pages[16];
1458 	unsigned long nrpages;
1459 	int ret;
1460 	int i;
1461 
1462 	/* the caller is responsible for locking the start index */
1463 	if (index == locked_page->index && index == end_index)
1464 		return 0;
1465 
1466 	/* skip the page at the start index */
1467 	nrpages = end_index - index + 1;
1468 	while (nrpages > 0) {
1469 		ret = find_get_pages_contig(inode->i_mapping, index,
1470 				     min_t(unsigned long,
1471 				     nrpages, ARRAY_SIZE(pages)), pages);
1472 		if (ret == 0) {
1473 			ret = -EAGAIN;
1474 			goto done;
1475 		}
1476 		/* now we have an array of pages, lock them all */
1477 		for (i = 0; i < ret; i++) {
1478 			/*
1479 			 * the caller is taking responsibility for
1480 			 * locked_page
1481 			 */
1482 			if (pages[i] != locked_page) {
1483 				lock_page(pages[i]);
1484 				if (!PageDirty(pages[i]) ||
1485 				    pages[i]->mapping != inode->i_mapping) {
1486 					ret = -EAGAIN;
1487 					unlock_page(pages[i]);
1488 					page_cache_release(pages[i]);
1489 					goto done;
1490 				}
1491 			}
1492 			page_cache_release(pages[i]);
1493 			pages_locked++;
1494 		}
1495 		nrpages -= ret;
1496 		index += ret;
1497 		cond_resched();
1498 	}
1499 	ret = 0;
1500 done:
1501 	if (ret && pages_locked) {
1502 		__unlock_for_delalloc(inode, locked_page,
1503 			      delalloc_start,
1504 			      ((u64)(start_index + pages_locked - 1)) <<
1505 			      PAGE_CACHE_SHIFT);
1506 	}
1507 	return ret;
1508 }
1509 
1510 /*
1511  * find a contiguous range of bytes in the file marked as delalloc, not
1512  * more than 'max_bytes'.  start and end are used to return the range.
1513  *
1514  * 1 is returned if we find something, 0 if nothing was in the tree
1515  */
1516 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1517 					     struct extent_io_tree *tree,
1518 					     struct page *locked_page,
1519 					     u64 *start, u64 *end,
1520 					     u64 max_bytes)
1521 {
1522 	u64 delalloc_start;
1523 	u64 delalloc_end;
1524 	u64 found;
1525 	struct extent_state *cached_state = NULL;
1526 	int ret;
1527 	int loops = 0;
1528 
1529 again:
1530 	/* step one, find a bunch of delalloc bytes starting at start */
1531 	delalloc_start = *start;
1532 	delalloc_end = 0;
1533 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1534 				    max_bytes, &cached_state);
1535 	if (!found || delalloc_end <= *start) {
1536 		*start = delalloc_start;
1537 		*end = delalloc_end;
1538 		free_extent_state(cached_state);
1539 		return found;
1540 	}
1541 
1542 	/*
1543 	 * start comes from the offset of locked_page.  We have to lock
1544 	 * pages in order, so we can't process delalloc bytes before
1545 	 * locked_page
1546 	 */
1547 	if (delalloc_start < *start)
1548 		delalloc_start = *start;
1549 
1550 	/*
1551 	 * make sure to limit the number of pages we try to lock down
1552 	 * if we're looping.
1553 	 */
1554 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1555 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1556 
1557 	/* step two, lock all the pages after the page that has start */
1558 	ret = lock_delalloc_pages(inode, locked_page,
1559 				  delalloc_start, delalloc_end);
1560 	if (ret == -EAGAIN) {
1561 		/* some of the pages are gone, lets avoid looping by
1562 		 * shortening the size of the delalloc range we're searching
1563 		 */
1564 		free_extent_state(cached_state);
1565 		if (!loops) {
1566 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1567 			max_bytes = PAGE_CACHE_SIZE - offset;
1568 			loops = 1;
1569 			goto again;
1570 		} else {
1571 			found = 0;
1572 			goto out_failed;
1573 		}
1574 	}
1575 	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
1576 
1577 	/* step three, lock the state bits for the whole range */
1578 	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
1579 
1580 	/* then test to make sure it is all still delalloc */
1581 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1582 			     EXTENT_DELALLOC, 1, cached_state);
1583 	if (!ret) {
1584 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1585 				     &cached_state, GFP_NOFS);
1586 		__unlock_for_delalloc(inode, locked_page,
1587 			      delalloc_start, delalloc_end);
1588 		cond_resched();
1589 		goto again;
1590 	}
1591 	free_extent_state(cached_state);
1592 	*start = delalloc_start;
1593 	*end = delalloc_end;
1594 out_failed:
1595 	return found;
1596 }
1597 
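/*
 * clear the requested extent bits on [start, end] and then apply the
 * page-level parts of 'op' (dirty/writeback/unlock/private2) to every page
 * in the range, skipping locked_page for everything except
 * EXTENT_SET_PRIVATE2.
 */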
1598 int extent_clear_unlock_delalloc(struct inode *inode,
1599 				struct extent_io_tree *tree,
1600 				u64 start, u64 end, struct page *locked_page,
1601 				unsigned long op)
1602 {
1603 	int ret;
1604 	struct page *pages[16];
1605 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1606 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1607 	unsigned long nr_pages = end_index - index + 1;
1608 	int i;
1609 	int clear_bits = 0;
1610 
1611 	if (op & EXTENT_CLEAR_UNLOCK)
1612 		clear_bits |= EXTENT_LOCKED;
1613 	if (op & EXTENT_CLEAR_DIRTY)
1614 		clear_bits |= EXTENT_DIRTY;
1615 
1616 	if (op & EXTENT_CLEAR_DELALLOC)
1617 		clear_bits |= EXTENT_DELALLOC;
1618 
1619 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1620 	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1621 		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
1622 		    EXTENT_SET_PRIVATE2)))
1623 		return 0;
1624 
1625 	while (nr_pages > 0) {
1626 		ret = find_get_pages_contig(inode->i_mapping, index,
1627 				     min_t(unsigned long,
1628 				     nr_pages, ARRAY_SIZE(pages)), pages);
1629 		for (i = 0; i < ret; i++) {
1630 
1631 			if (op & EXTENT_SET_PRIVATE2)
1632 				SetPagePrivate2(pages[i]);
1633 
1634 			if (pages[i] == locked_page) {
1635 				page_cache_release(pages[i]);
1636 				continue;
1637 			}
1638 			if (op & EXTENT_CLEAR_DIRTY)
1639 				clear_page_dirty_for_io(pages[i]);
1640 			if (op & EXTENT_SET_WRITEBACK)
1641 				set_page_writeback(pages[i]);
1642 			if (op & EXTENT_END_WRITEBACK)
1643 				end_page_writeback(pages[i]);
1644 			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
1645 				unlock_page(pages[i]);
1646 			page_cache_release(pages[i]);
1647 		}
1648 		nr_pages -= ret;
1649 		index += ret;
1650 		cond_resched();
1651 	}
1652 	return 0;
1653 }
1654 
1655 /*
1656  * count the number of bytes in the tree that have the given bit(s)
1657  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1658  * cached.  The total number found is returned.
1659  */
1660 u64 count_range_bits(struct extent_io_tree *tree,
1661 		     u64 *start, u64 search_end, u64 max_bytes,
1662 		     unsigned long bits, int contig)
1663 {
1664 	struct rb_node *node;
1665 	struct extent_state *state;
1666 	u64 cur_start = *start;
1667 	u64 total_bytes = 0;
1668 	u64 last = 0;
1669 	int found = 0;
1670 
1671 	if (search_end <= cur_start) {
1672 		WARN_ON(1);
1673 		return 0;
1674 	}
1675 
1676 	spin_lock(&tree->lock);
1677 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1678 		total_bytes = tree->dirty_bytes;
1679 		goto out;
1680 	}
1681 	/*
1682 	 * this search will find all the extents that end after
1683 	 * our range starts.
1684 	 */
1685 	node = tree_search(tree, cur_start);
1686 	if (!node)
1687 		goto out;
1688 
1689 	while (1) {
1690 		state = rb_entry(node, struct extent_state, rb_node);
1691 		if (state->start > search_end)
1692 			break;
1693 		if (contig && found && state->start > last + 1)
1694 			break;
1695 		if (state->end >= cur_start && (state->state & bits) == bits) {
1696 			total_bytes += min(search_end, state->end) + 1 -
1697 				       max(cur_start, state->start);
1698 			if (total_bytes >= max_bytes)
1699 				break;
1700 			if (!found) {
1701 				*start = max(cur_start, state->start);
1702 				found = 1;
1703 			}
1704 			last = state->end;
1705 		} else if (contig && found) {
1706 			break;
1707 		}
1708 		node = rb_next(node);
1709 		if (!node)
1710 			break;
1711 	}
1712 out:
1713 	spin_unlock(&tree->lock);
1714 	return total_bytes;
1715 }
1716 
1717 /*
1718  * set the private field for a given byte offset in the tree.  If there isn't
1719  * an extent_state starting exactly at that offset, -ENOENT is returned.
1720  */
1721 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1722 {
1723 	struct rb_node *node;
1724 	struct extent_state *state;
1725 	int ret = 0;
1726 
1727 	spin_lock(&tree->lock);
1728 	/*
1729 	 * this search will find all the extents that end after
1730 	 * our range starts.
1731 	 */
1732 	node = tree_search(tree, start);
1733 	if (!node) {
1734 		ret = -ENOENT;
1735 		goto out;
1736 	}
1737 	state = rb_entry(node, struct extent_state, rb_node);
1738 	if (state->start != start) {
1739 		ret = -ENOENT;
1740 		goto out;
1741 	}
1742 	state->private = private;
1743 out:
1744 	spin_unlock(&tree->lock);
1745 	return ret;
1746 }
1747 
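/*
 * read back the private value stored at 'start'; returns -ENOENT if no
 * extent_state starts exactly there.
 */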
1748 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1749 {
1750 	struct rb_node *node;
1751 	struct extent_state *state;
1752 	int ret = 0;
1753 
1754 	spin_lock(&tree->lock);
1755 	/*
1756 	 * this search will find all the extents that end after
1757 	 * our range starts.
1758 	 */
1759 	node = tree_search(tree, start);
1760 	if (!node) {
1761 		ret = -ENOENT;
1762 		goto out;
1763 	}
1764 	state = rb_entry(node, struct extent_state, rb_node);
1765 	if (state->start != start) {
1766 		ret = -ENOENT;
1767 		goto out;
1768 	}
1769 	*private = state->private;
1770 out:
1771 	spin_unlock(&tree->lock);
1772 	return ret;
1773 }
1774 
1775 /*
1776  * searches a range in the state tree for a given mask.
1777  * If 'filled' == 1, this returns 1 only if every extent in the range
1778  * has the bits set.  Otherwise, 1 is returned if any bit in the
1779  * range is found set.
1780  */
1781 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1782 		   int bits, int filled, struct extent_state *cached)
1783 {
1784 	struct extent_state *state = NULL;
1785 	struct rb_node *node;
1786 	int bitset = 0;
1787 
1788 	spin_lock(&tree->lock);
1789 	if (cached && cached->tree && cached->start <= start &&
1790 	    cached->end > start)
1791 		node = &cached->rb_node;
1792 	else
1793 		node = tree_search(tree, start);
1794 	while (node && start <= end) {
1795 		state = rb_entry(node, struct extent_state, rb_node);
1796 
1797 		if (filled && state->start > start) {
1798 			bitset = 0;
1799 			break;
1800 		}
1801 
1802 		if (state->start > end)
1803 			break;
1804 
1805 		if (state->state & bits) {
1806 			bitset = 1;
1807 			if (!filled)
1808 				break;
1809 		} else if (filled) {
1810 			bitset = 0;
1811 			break;
1812 		}
1813 
1814 		if (state->end == (u64)-1)
1815 			break;
1816 
1817 		start = state->end + 1;
1818 		if (start > end)
1819 			break;
1820 		node = rb_next(node);
1821 		if (!node) {
1822 			if (filled)
1823 				bitset = 0;
1824 			break;
1825 		}
1826 	}
1827 	spin_unlock(&tree->lock);
1828 	return bitset;
1829 }
1830 
1831 /*
1832  * helper function to set a given page up to date if all the
1833  * extents in the tree for that page are up to date
1834  */
1835 static void check_page_uptodate(struct extent_io_tree *tree, struct page *page)
1836 {
1837 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1838 	u64 end = start + PAGE_CACHE_SIZE - 1;
1839 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1840 		SetPageUptodate(page);
1841 }
1842 
1843 /*
1844  * helper function to unlock a page if all the extents in the tree
1845  * for that page are unlocked
1846  */
1847 static void check_page_locked(struct extent_io_tree *tree, struct page *page)
1848 {
1849 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1850 	u64 end = start + PAGE_CACHE_SIZE - 1;
1851 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1852 		unlock_page(page);
1853 }
1854 
1855 /*
1856  * helper function to end page writeback if all the extents
1857  * in the tree for that page are done with writeback
1858  */
1859 static void check_page_writeback(struct extent_io_tree *tree,
1860 				 struct page *page)
1861 {
1862 	end_page_writeback(page);
1863 }
1864 
1865 /*
1866  * When IO fails, either with EIO or csum verification fails, we
1867  * try other mirrors that might have a good copy of the data.  This
1868  * io_failure_record is used to record state as we go through all the
1869  * mirrors.  If another mirror has good data, the page is set up to date
1870  * and things continue.  If a good mirror can't be found, the original
1871  * bio end_io callback is called to indicate things have failed.
1872  */
1873 struct io_failure_record {
1874 	struct page *page;
1875 	u64 start;
1876 	u64 len;
1877 	u64 logical;
1878 	unsigned long bio_flags;
1879 	int this_mirror;
1880 	int failed_mirror;
1881 	int in_validation;
1882 };
1883 
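/*
 * tear down an io_failure_record: clear its private pointer and tracking
 * bits in the failure tree (and EXTENT_DAMAGED in the io tree if the
 * repair succeeded), then free the record.
 */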
1884 static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1885 				int did_repair)
1886 {
1887 	int ret;
1888 	int err = 0;
1889 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1890 
1891 	set_state_private(failure_tree, rec->start, 0);
1892 	ret = clear_extent_bits(failure_tree, rec->start,
1893 				rec->start + rec->len - 1,
1894 				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1895 	if (ret)
1896 		err = ret;
1897 
1898 	if (did_repair) {
1899 		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1900 					rec->start + rec->len - 1,
1901 					EXTENT_DAMAGED, GFP_NOFS);
1902 		if (ret && !err)
1903 			err = ret;
1904 	}
1905 
1906 	kfree(rec);
1907 	return err;
1908 }
1909 
1910 static void repair_io_failure_callback(struct bio *bio, int err)
1911 {
1912 	complete(bio->bi_private);
1913 }
1914 
1915 /*
1916  * this bypasses the standard btrfs submit functions deliberately, as
1917  * the standard behavior is to write all copies in a raid setup. here we only
1918  * want to write the one bad copy. so we do the mapping for ourselves and issue
1919  * submit_bio directly.
1920  * to avoid any synchronization issues, wait for the data after writing, which
1921  * actually prevents the read that triggered the error from finishing.
1922  * currently, there can be no more than two copies of every data bit. thus,
1923  * exactly one rewrite is required.
1924  */
1925 int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start,
1926 			u64 length, u64 logical, struct page *page,
1927 			int mirror_num)
1928 {
1929 	struct bio *bio;
1930 	struct btrfs_device *dev;
1931 	DECLARE_COMPLETION_ONSTACK(compl);
1932 	u64 map_length = 0;
1933 	u64 sector;
1934 	struct btrfs_bio *bbio = NULL;
1935 	int ret;
1936 
1937 	BUG_ON(!mirror_num);
1938 
1939 	bio = bio_alloc(GFP_NOFS, 1);
1940 	if (!bio)
1941 		return -EIO;
1942 	bio->bi_private = &compl;
1943 	bio->bi_end_io = repair_io_failure_callback;
1944 	bio->bi_size = 0;
1945 	map_length = length;
1946 
1947 	ret = btrfs_map_block(fs_info, WRITE, logical,
1948 			      &map_length, &bbio, mirror_num);
1949 	if (ret) {
1950 		bio_put(bio);
1951 		return -EIO;
1952 	}
1953 	BUG_ON(mirror_num != bbio->mirror_num);
1954 	sector = bbio->stripes[mirror_num-1].physical >> 9;
1955 	bio->bi_sector = sector;
1956 	dev = bbio->stripes[mirror_num-1].dev;
1957 	kfree(bbio);
1958 	if (!dev || !dev->bdev || !dev->writeable) {
1959 		bio_put(bio);
1960 		return -EIO;
1961 	}
1962 	bio->bi_bdev = dev->bdev;
1963 	bio_add_page(bio, page, length, start-page_offset(page));
1964 	btrfsic_submit_bio(WRITE_SYNC, bio);
1965 	wait_for_completion(&compl);
1966 
1967 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1968 		/* try to remap that extent elsewhere? */
1969 		bio_put(bio);
1970 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
1971 		return -EIO;
1972 	}
1973 
1974 	printk_ratelimited_in_rcu(KERN_INFO "btrfs read error corrected: ino %lu off %llu "
1975 		      "(dev %s sector %llu)\n", page->mapping->host->i_ino,
1976 		      start, rcu_str_deref(dev->name), sector);
1977 
1978 	bio_put(bio);
1979 	return 0;
1980 }
1981 
1982 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
1983 			 int mirror_num)
1984 {
1985 	u64 start = eb->start;
1986 	unsigned long i, num_pages = num_extent_pages(eb->start, eb->len);
1987 	int ret = 0;
1988 
1989 	for (i = 0; i < num_pages; i++) {
1990 		struct page *p = extent_buffer_page(eb, i);
1991 		ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE,
1992 					start, p, mirror_num);
1993 		if (ret)
1994 			break;
1995 		start += PAGE_CACHE_SIZE;
1996 	}
1997 
1998 	return ret;
1999 }
2000 
2001 /*
2002  * each time an IO finishes, we do a fast check in the IO failure tree
2003  * to see if we need to process or clean up an io_failure_record
2004  */
2005 static int clean_io_failure(u64 start, struct page *page)
2006 {
2007 	u64 private;
2008 	u64 private_failure;
2009 	struct io_failure_record *failrec;
2010 	struct btrfs_fs_info *fs_info;
2011 	struct extent_state *state;
2012 	int num_copies;
2013 	int did_repair = 0;
2014 	int ret;
2015 	struct inode *inode = page->mapping->host;
2016 
2017 	private = 0;
2018 	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
2019 				(u64)-1, 1, EXTENT_DIRTY, 0);
2020 	if (!ret)
2021 		return 0;
2022 
2023 	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
2024 				&private_failure);
2025 	if (ret)
2026 		return 0;
2027 
2028 	failrec = (struct io_failure_record *)(unsigned long) private_failure;
2029 	BUG_ON(!failrec->this_mirror);
2030 
2031 	if (failrec->in_validation) {
2032 		/* there was no real error, just free the record */
2033 		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
2034 			 failrec->start);
2035 		did_repair = 1;
2036 		goto out;
2037 	}
2038 
2039 	spin_lock(&BTRFS_I(inode)->io_tree.lock);
2040 	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
2041 					    failrec->start,
2042 					    EXTENT_LOCKED);
2043 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
2044 
2045 	if (state && state->start == failrec->start) {
2046 		fs_info = BTRFS_I(inode)->root->fs_info;
2047 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
2048 					      failrec->len);
2049 		if (num_copies > 1)  {
2050 			ret = repair_io_failure(fs_info, start, failrec->len,
2051 						failrec->logical, page,
2052 						failrec->failed_mirror);
2053 			did_repair = !ret;
2054 		}
2055 	}
2056 
2057 out:
2058 	if (!ret)
2059 		ret = free_io_failure(inode, failrec, did_repair);
2060 
2061 	return ret;
2062 }
2063 
2064 /*
2065  * this is a generic handler for readpage errors (default
2066  * readpage_io_failed_hook). if other copies exist, read those and write back
2067  * good data to the failed position. it does not try to remap the failed
2068  * extent elsewhere, hoping the device will be smart enough to do this as
2069  * needed
2070  */
2071 
2072 static int bio_readpage_error(struct bio *failed_bio, struct page *page,
2073 				u64 start, u64 end, int failed_mirror,
2074 				struct extent_state *state)
2075 {
2076 	struct io_failure_record *failrec = NULL;
2077 	u64 private;
2078 	struct extent_map *em;
2079 	struct inode *inode = page->mapping->host;
2080 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2081 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2082 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
2083 	struct bio *bio;
2084 	int num_copies;
2085 	int ret;
2086 	int read_mode;
2087 	u64 logical;
2088 
2089 	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
2090 
2091 	ret = get_state_private(failure_tree, start, &private);
2092 	if (ret) {
2093 		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2094 		if (!failrec)
2095 			return -ENOMEM;
2096 		failrec->start = start;
2097 		failrec->len = end - start + 1;
2098 		failrec->this_mirror = 0;
2099 		failrec->bio_flags = 0;
2100 		failrec->in_validation = 0;
2101 
2102 		read_lock(&em_tree->lock);
2103 		em = lookup_extent_mapping(em_tree, start, failrec->len);
2104 		if (!em) {
2105 			read_unlock(&em_tree->lock);
2106 			kfree(failrec);
2107 			return -EIO;
2108 		}
2109 
2110 		if (em->start > start || em->start + em->len < start) {
2111 			free_extent_map(em);
2112 			em = NULL;
2113 		}
2114 		read_unlock(&em_tree->lock);
2115 
2116 		if (!em) {
2117 			kfree(failrec);
2118 			return -EIO;
2119 		}
2120 		logical = start - em->start;
2121 		logical = em->block_start + logical;
2122 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2123 			logical = em->block_start;
2124 			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2125 			extent_set_compress_type(&failrec->bio_flags,
2126 						 em->compress_type);
2127 		}
2128 		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2129 			 "len=%llu\n", logical, start, failrec->len);
2130 		failrec->logical = logical;
2131 		free_extent_map(em);
2132 
2133 		/* set the bits in the private failure tree */
2134 		ret = set_extent_bits(failure_tree, start, end,
2135 					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2136 		if (ret >= 0)
2137 			ret = set_state_private(failure_tree, start,
2138 						(u64)(unsigned long)failrec);
2139 		/* set the bits in the inode's tree */
2140 		if (ret >= 0)
2141 			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2142 						GFP_NOFS);
2143 		if (ret < 0) {
2144 			kfree(failrec);
2145 			return ret;
2146 		}
2147 	} else {
2148 		failrec = (struct io_failure_record *)(unsigned long)private;
2149 		pr_debug("bio_readpage_error: (found) logical=%llu, "
2150 			 "start=%llu, len=%llu, validation=%d\n",
2151 			 failrec->logical, failrec->start, failrec->len,
2152 			 failrec->in_validation);
2153 		/*
2154 		 * when data can be on disk more than twice, add to failrec here
2155 		 * (e.g. with a list for failed_mirror) to make
2156 		 * clean_io_failure() clean all those errors at once.
2157 		 */
2158 	}
2159 	num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info,
2160 				      failrec->logical, failrec->len);
2161 	if (num_copies == 1) {
2162 		/*
2163 		 * we only have a single copy of the data, so don't bother with
2164 		 * all the retry and error correction code that follows. no
2165 		 * matter what the error is, it is very likely to persist.
2166 		 */
2167 		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2168 			 "state=%p, num_copies=%d, next_mirror %d, "
2169 			 "failed_mirror %d\n", state, num_copies,
2170 			 failrec->this_mirror, failed_mirror);
2171 		free_io_failure(inode, failrec, 0);
2172 		return -EIO;
2173 	}
2174 
2175 	if (!state) {
2176 		spin_lock(&tree->lock);
2177 		state = find_first_extent_bit_state(tree, failrec->start,
2178 						    EXTENT_LOCKED);
2179 		if (state && state->start != failrec->start)
2180 			state = NULL;
2181 		spin_unlock(&tree->lock);
2182 	}
2183 
2184 	/*
2185 	 * there are two premises:
2186 	 *	a) deliver good data to the caller
2187 	 *	b) correct the bad sectors on disk
2188 	 */
2189 	if (failed_bio->bi_vcnt > 1) {
2190 		/*
2191 		 * to fulfill b), we need to know the exact failing sectors, as
2192 		 * we don't want to rewrite any more than the failed ones. thus,
2193 		 * we need separate read requests for the failed bio
2194 		 *
2195 		 * if the following BUG_ON triggers, our validation request got
2196 		 * merged. we need separate requests for our algorithm to work.
2197 		 */
2198 		BUG_ON(failrec->in_validation);
2199 		failrec->in_validation = 1;
2200 		failrec->this_mirror = failed_mirror;
2201 		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2202 	} else {
2203 		/*
2204 		 * we're ready to fulfill a) and b) at the same time. get a good
2205 		 * copy of the failed sector and if we succeed, we have set up
2206 		 * everything for repair_io_failure to do the rest for us.
2207 		 */
2208 		if (failrec->in_validation) {
2209 			BUG_ON(failrec->this_mirror != failed_mirror);
2210 			failrec->in_validation = 0;
2211 			failrec->this_mirror = 0;
2212 		}
2213 		failrec->failed_mirror = failed_mirror;
2214 		failrec->this_mirror++;
2215 		if (failrec->this_mirror == failed_mirror)
2216 			failrec->this_mirror++;
2217 		read_mode = READ_SYNC;
2218 	}
2219 
2220 	if (!state || failrec->this_mirror > num_copies) {
2221 		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2222 			 "next_mirror %d, failed_mirror %d\n", state,
2223 			 num_copies, failrec->this_mirror, failed_mirror);
2224 		free_io_failure(inode, failrec, 0);
2225 		return -EIO;
2226 	}
2227 
2228 	bio = bio_alloc(GFP_NOFS, 1);
2229 	if (!bio) {
2230 		free_io_failure(inode, failrec, 0);
2231 		return -EIO;
2232 	}
2233 	bio->bi_private = state;
2234 	bio->bi_end_io = failed_bio->bi_end_io;
2235 	bio->bi_sector = failrec->logical >> 9;
2236 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2237 	bio->bi_size = 0;
2238 
2239 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
2240 
2241 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
2242 		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2243 		 failrec->this_mirror, num_copies, failrec->in_validation);
2244 
2245 	ret = tree->ops->submit_bio_hook(inode, read_mode, bio,
2246 					 failrec->this_mirror,
2247 					 failrec->bio_flags, 0);
2248 	return ret;
2249 }
2250 
2251 /* lots and lots of room for performance fixes in the end_bio funcs */
2252 
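/*
 * finish a write on a page range: run the writepage_end_io_hook if one
 * is registered and, on error, clear the uptodate flag and mark the
 * page as errored
 */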
2253 int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2254 {
2255 	int uptodate = (err == 0);
2256 	struct extent_io_tree *tree;
2257 	int ret;
2258 
2259 	tree = &BTRFS_I(page->mapping->host)->io_tree;
2260 
2261 	if (tree->ops && tree->ops->writepage_end_io_hook) {
2262 		ret = tree->ops->writepage_end_io_hook(page, start,
2263 					       end, NULL, uptodate);
2264 		if (ret)
2265 			uptodate = 0;
2266 	}
2267 
2268 	if (!uptodate) {
2269 		ClearPageUptodate(page);
2270 		SetPageError(page);
2271 	}
2272 	return 0;
2273 }
2274 
2275 /*
2276  * after a writepage IO is done, we need to:
2277  * clear the uptodate bits on error
2278  * clear the writeback bits in the extent tree for this IO
2279  * end_page_writeback if the page has no more pending IO
2280  *
2281  * Scheduling is not allowed, so the extent state tree is expected
2282  * to have one and only one object corresponding to this IO.
2283  */
2284 static void end_bio_extent_writepage(struct bio *bio, int err)
2285 {
2286 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2287 	struct extent_io_tree *tree;
2288 	u64 start;
2289 	u64 end;
2290 	int whole_page;
2291 
2292 	do {
2293 		struct page *page = bvec->bv_page;
2294 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2295 
2296 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2297 			 bvec->bv_offset;
2298 		end = start + bvec->bv_len - 1;
2299 
2300 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2301 			whole_page = 1;
2302 		else
2303 			whole_page = 0;
2304 
2305 		if (--bvec >= bio->bi_io_vec)
2306 			prefetchw(&bvec->bv_page->flags);
2307 
2308 		if (end_extent_writepage(page, err, start, end))
2309 			continue;
2310 
2311 		if (whole_page)
2312 			end_page_writeback(page);
2313 		else
2314 			check_page_writeback(tree, page);
2315 	} while (bvec >= bio->bi_io_vec);
2316 
2317 	bio_put(bio);
2318 }
2319 
2320 /*
2321  * after a readpage IO is done, we need to:
2322  * clear the uptodate bits on error
2323  * set the uptodate bits if things worked
2324  * set the page up to date if all extents in the tree are uptodate
2325  * clear the lock bit in the extent tree
2326  * unlock the page if there are no other extents locked for it
2327  *
2328  * Scheduling is not allowed, so the extent state tree is expected
2329  * to have one and only one object corresponding to this IO.
2330  */
2331 static void end_bio_extent_readpage(struct bio *bio, int err)
2332 {
2333 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
2334 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
2335 	struct bio_vec *bvec = bio->bi_io_vec;
2336 	struct extent_io_tree *tree;
2337 	u64 start;
2338 	u64 end;
2339 	int whole_page;
2340 	int mirror;
2341 	int ret;
2342 
2343 	if (err)
2344 		uptodate = 0;
2345 
2346 	do {
2347 		struct page *page = bvec->bv_page;
2348 		struct extent_state *cached = NULL;
2349 		struct extent_state *state;
2350 
2351 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
2352 			 "mirror=%ld\n", (u64)bio->bi_sector, err,
2353 			 (long int)bio->bi_bdev);
2354 		tree = &BTRFS_I(page->mapping->host)->io_tree;
2355 
2356 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
2357 			bvec->bv_offset;
2358 		end = start + bvec->bv_len - 1;
2359 
2360 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
2361 			whole_page = 1;
2362 		else
2363 			whole_page = 0;
2364 
2365 		if (++bvec <= bvec_end)
2366 			prefetchw(&bvec->bv_page->flags);
2367 
2368 		spin_lock(&tree->lock);
2369 		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
2370 		if (state && state->start == start) {
2371 			/*
2372 			 * take a reference on the state, unlock will drop
2373 			 * the ref
2374 			 */
2375 			cache_state(state, &cached);
2376 		}
2377 		spin_unlock(&tree->lock);
2378 
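		/*
		 * the lower btrfs bio completion code stores the mirror number
		 * that serviced this read in bi_bdev, which is also why the
		 * debug output above prints it as "mirror"
		 */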
2379 		mirror = (int)(unsigned long)bio->bi_bdev;
2380 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
2381 			ret = tree->ops->readpage_end_io_hook(page, start, end,
2382 							      state, mirror);
2383 			if (ret)
2384 				uptodate = 0;
2385 			else
2386 				clean_io_failure(start, page);
2387 		}
2388 
2389 		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
2390 			ret = tree->ops->readpage_io_failed_hook(page, mirror);
2391 			if (!ret && !err &&
2392 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
2393 				uptodate = 1;
2394 		} else if (!uptodate) {
2395 			/*
2396 			 * The generic bio_readpage_error handles errors the
2397 			 * following way: If possible, new read requests are
2398 			 * created and submitted and will end up in
2399 			 * end_bio_extent_readpage as well (if we're lucky, not
2400 			 * in the !uptodate case). In that case it returns 0 and
2401 			 * we just go on with the next page in our bio. If it
2402 			 * can't handle the error it will return -EIO and we
2403 			 * remain responsible for that page.
2404 			 */
2405 			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
2406 			if (ret == 0) {
2407 				uptodate =
2408 					test_bit(BIO_UPTODATE, &bio->bi_flags);
2409 				if (err)
2410 					uptodate = 0;
2411 				uncache_state(&cached);
2412 				continue;
2413 			}
2414 		}
2415 
2416 		if (uptodate && tree->track_uptodate) {
2417 			set_extent_uptodate(tree, start, end, &cached,
2418 					    GFP_ATOMIC);
2419 		}
2420 		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
2421 
2422 		if (whole_page) {
2423 			if (uptodate) {
2424 				SetPageUptodate(page);
2425 			} else {
2426 				ClearPageUptodate(page);
2427 				SetPageError(page);
2428 			}
2429 			unlock_page(page);
2430 		} else {
2431 			if (uptodate) {
2432 				check_page_uptodate(tree, page);
2433 			} else {
2434 				ClearPageUptodate(page);
2435 				SetPageError(page);
2436 			}
2437 			check_page_locked(tree, page);
2438 		}
2439 	} while (bvec <= bvec_end);
2440 
2441 	bio_put(bio);
2442 }
2443 
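/*
 * allocate a bio for IO; when called from memory reclaim (PF_MEMALLOC)
 * and the allocation fails, retry with progressively fewer vecs rather
 * than failing outright
 */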
2444 struct bio *
2445 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
2446 		gfp_t gfp_flags)
2447 {
2448 	struct bio *bio;
2449 
2450 	bio = bio_alloc(gfp_flags, nr_vecs);
2451 
2452 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
2453 		while (!bio && (nr_vecs /= 2))
2454 			bio = bio_alloc(gfp_flags, nr_vecs);
2455 	}
2456 
2457 	if (bio) {
2458 		bio->bi_size = 0;
2459 		bio->bi_bdev = bdev;
2460 		bio->bi_sector = first_sector;
2461 	}
2462 	return bio;
2463 }
2464 
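/*
 * send a completed bio down to the filesystem: route it through the
 * tree's submit_bio_hook when one is registered, otherwise submit it
 * directly via btrfsic_submit_bio
 */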
2465 static int __must_check submit_one_bio(int rw, struct bio *bio,
2466 				       int mirror_num, unsigned long bio_flags)
2467 {
2468 	int ret = 0;
2469 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
2470 	struct page *page = bvec->bv_page;
2471 	struct extent_io_tree *tree = bio->bi_private;
2472 	u64 start;
2473 
2474 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
2475 
2476 	bio->bi_private = NULL;
2477 
2478 	bio_get(bio);
2479 
2480 	if (tree->ops && tree->ops->submit_bio_hook)
2481 		ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
2482 					   mirror_num, bio_flags, start);
2483 	else
2484 		btrfsic_submit_bio(rw, bio);
2485 
2486 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
2487 		ret = -EOPNOTSUPP;
2488 	bio_put(bio);
2489 	return ret;
2490 }
2491 
2492 static int merge_bio(struct extent_io_tree *tree, struct page *page,
2493 		     unsigned long offset, size_t size, struct bio *bio,
2494 		     unsigned long bio_flags)
2495 {
2496 	int ret = 0;
2497 	if (tree->ops && tree->ops->merge_bio_hook)
2498 		ret = tree->ops->merge_bio_hook(page, offset, size, bio,
2499 						bio_flags);
2500 	BUG_ON(ret < 0);
2501 	return ret;
2502 
2503 }
2504 
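/*
 * add a page to the bio in *bio_ret if possible.  If the existing bio
 * is not contiguous with this page, the bio flags changed, the merge
 * hook refuses or the page does not fit, submit the old bio first and
 * start a new one for this page
 */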
2505 static int submit_extent_page(int rw, struct extent_io_tree *tree,
2506 			      struct page *page, sector_t sector,
2507 			      size_t size, unsigned long offset,
2508 			      struct block_device *bdev,
2509 			      struct bio **bio_ret,
2510 			      unsigned long max_pages,
2511 			      bio_end_io_t end_io_func,
2512 			      int mirror_num,
2513 			      unsigned long prev_bio_flags,
2514 			      unsigned long bio_flags)
2515 {
2516 	int ret = 0;
2517 	struct bio *bio;
2518 	int nr;
2519 	int contig = 0;
2520 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
2521 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
2522 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
2523 
2524 	if (bio_ret && *bio_ret) {
2525 		bio = *bio_ret;
2526 		if (old_compressed)
2527 			contig = bio->bi_sector == sector;
2528 		else
2529 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
2530 				sector;
2531 
2532 		if (prev_bio_flags != bio_flags || !contig ||
2533 		    merge_bio(tree, page, offset, page_size, bio, bio_flags) ||
2534 		    bio_add_page(bio, page, page_size, offset) < page_size) {
2535 			ret = submit_one_bio(rw, bio, mirror_num,
2536 					     prev_bio_flags);
2537 			if (ret < 0)
2538 				return ret;
2539 			bio = NULL;
2540 		} else {
2541 			return 0;
2542 		}
2543 	}
2544 	if (this_compressed)
2545 		nr = BIO_MAX_PAGES;
2546 	else
2547 		nr = bio_get_nr_vecs(bdev);
2548 
2549 	bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
2550 	if (!bio)
2551 		return -ENOMEM;
2552 
2553 	bio_add_page(bio, page, page_size, offset);
2554 	bio->bi_end_io = end_io_func;
2555 	bio->bi_private = tree;
2556 
2557 	if (bio_ret)
2558 		*bio_ret = bio;
2559 	else
2560 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
2561 
2562 	return ret;
2563 }
2564 
2565 void attach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
2566 {
2567 	if (!PagePrivate(page)) {
2568 		SetPagePrivate(page);
2569 		page_cache_get(page);
2570 		set_page_private(page, (unsigned long)eb);
2571 	} else {
2572 		WARN_ON(page->private != (unsigned long)eb);
2573 	}
2574 }
2575 
2576 void set_page_extent_mapped(struct page *page)
2577 {
2578 	if (!PagePrivate(page)) {
2579 		SetPagePrivate(page);
2580 		page_cache_get(page);
2581 		set_page_private(page, EXTENT_PAGE_PRIVATE);
2582 	}
2583 }
2584 
2585 /*
2586  * basic readpage implementation.  Locked extent state structs are inserted
2587  * into the tree and are removed when the IO is done (by the end_io
2588  * handlers)
2589  * XXX JDM: This needs looking at to ensure proper page locking
2590  */
2591 static int __extent_read_full_page(struct extent_io_tree *tree,
2592 				   struct page *page,
2593 				   get_extent_t *get_extent,
2594 				   struct bio **bio, int mirror_num,
2595 				   unsigned long *bio_flags)
2596 {
2597 	struct inode *inode = page->mapping->host;
2598 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2599 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2600 	u64 end;
2601 	u64 cur = start;
2602 	u64 extent_offset;
2603 	u64 last_byte = i_size_read(inode);
2604 	u64 block_start;
2605 	u64 cur_end;
2606 	sector_t sector;
2607 	struct extent_map *em;
2608 	struct block_device *bdev;
2609 	struct btrfs_ordered_extent *ordered;
2610 	int ret;
2611 	int nr = 0;
2612 	size_t pg_offset = 0;
2613 	size_t iosize;
2614 	size_t disk_io_size;
2615 	size_t blocksize = inode->i_sb->s_blocksize;
2616 	unsigned long this_bio_flag = 0;
2617 
2618 	set_page_extent_mapped(page);
2619 
2620 	if (!PageUptodate(page)) {
2621 		if (cleancache_get_page(page) == 0) {
2622 			BUG_ON(blocksize != PAGE_SIZE);
2623 			goto out;
2624 		}
2625 	}
2626 
2627 	end = page_end;
2628 	while (1) {
2629 		lock_extent(tree, start, end);
2630 		ordered = btrfs_lookup_ordered_extent(inode, start);
2631 		if (!ordered)
2632 			break;
2633 		unlock_extent(tree, start, end);
2634 		btrfs_start_ordered_extent(inode, ordered, 1);
2635 		btrfs_put_ordered_extent(ordered);
2636 	}
2637 
2638 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2639 		char *userpage;
2640 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2641 
2642 		if (zero_offset) {
2643 			iosize = PAGE_CACHE_SIZE - zero_offset;
2644 			userpage = kmap_atomic(page);
2645 			memset(userpage + zero_offset, 0, iosize);
2646 			flush_dcache_page(page);
2647 			kunmap_atomic(userpage);
2648 		}
2649 	}
2650 	while (cur <= end) {
2651 		if (cur >= last_byte) {
2652 			char *userpage;
2653 			struct extent_state *cached = NULL;
2654 
2655 			iosize = PAGE_CACHE_SIZE - pg_offset;
2656 			userpage = kmap_atomic(page);
2657 			memset(userpage + pg_offset, 0, iosize);
2658 			flush_dcache_page(page);
2659 			kunmap_atomic(userpage);
2660 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2661 					    &cached, GFP_NOFS);
2662 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2663 					     &cached, GFP_NOFS);
2664 			break;
2665 		}
2666 		em = get_extent(inode, page, pg_offset, cur,
2667 				end - cur + 1, 0);
2668 		if (IS_ERR_OR_NULL(em)) {
2669 			SetPageError(page);
2670 			unlock_extent(tree, cur, end);
2671 			break;
2672 		}
2673 		extent_offset = cur - em->start;
2674 		BUG_ON(extent_map_end(em) <= cur);
2675 		BUG_ON(end < cur);
2676 
2677 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2678 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2679 			extent_set_compress_type(&this_bio_flag,
2680 						 em->compress_type);
2681 		}
2682 
2683 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2684 		cur_end = min(extent_map_end(em) - 1, end);
2685 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2686 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2687 			disk_io_size = em->block_len;
2688 			sector = em->block_start >> 9;
2689 		} else {
2690 			sector = (em->block_start + extent_offset) >> 9;
2691 			disk_io_size = iosize;
2692 		}
2693 		bdev = em->bdev;
2694 		block_start = em->block_start;
2695 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2696 			block_start = EXTENT_MAP_HOLE;
2697 		free_extent_map(em);
2698 		em = NULL;
2699 
2700 		/* we've found a hole, just zero and go on */
2701 		if (block_start == EXTENT_MAP_HOLE) {
2702 			char *userpage;
2703 			struct extent_state *cached = NULL;
2704 
2705 			userpage = kmap_atomic(page);
2706 			memset(userpage + pg_offset, 0, iosize);
2707 			flush_dcache_page(page);
2708 			kunmap_atomic(userpage);
2709 
2710 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2711 					    &cached, GFP_NOFS);
2712 			unlock_extent_cached(tree, cur, cur + iosize - 1,
2713 			                     &cached, GFP_NOFS);
2714 			cur = cur + iosize;
2715 			pg_offset += iosize;
2716 			continue;
2717 		}
2718 		/* the get_extent function already copied into the page */
2719 		if (test_range_bit(tree, cur, cur_end,
2720 				   EXTENT_UPTODATE, 1, NULL)) {
2721 			check_page_uptodate(tree, page);
2722 			unlock_extent(tree, cur, cur + iosize - 1);
2723 			cur = cur + iosize;
2724 			pg_offset += iosize;
2725 			continue;
2726 		}
2727 		/* we have an inline extent but it didn't get marked up
2728 		 * to date.  Error out
2729 		 */
2730 		if (block_start == EXTENT_MAP_INLINE) {
2731 			SetPageError(page);
2732 			unlock_extent(tree, cur, cur + iosize - 1);
2733 			cur = cur + iosize;
2734 			pg_offset += iosize;
2735 			continue;
2736 		}
2737 
2738 		ret = 0;
2739 		if (tree->ops && tree->ops->readpage_io_hook) {
2740 			ret = tree->ops->readpage_io_hook(page, cur,
2741 							  cur + iosize - 1);
2742 		}
2743 		if (!ret) {
2744 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2745 			pnr -= page->index;
2746 			ret = submit_extent_page(READ, tree, page,
2747 					 sector, disk_io_size, pg_offset,
2748 					 bdev, bio, pnr,
2749 					 end_bio_extent_readpage, mirror_num,
2750 					 *bio_flags,
2751 					 this_bio_flag);
2752 			if (!ret) {
2753 				nr++;
2754 				*bio_flags = this_bio_flag;
2755 			}
2756 		}
2757 		if (ret) {
2758 			SetPageError(page);
2759 			unlock_extent(tree, cur, cur + iosize - 1);
2760 		}
2761 		cur = cur + iosize;
2762 		pg_offset += iosize;
2763 	}
2764 out:
2765 	if (!nr) {
2766 		if (!PageError(page))
2767 			SetPageUptodate(page);
2768 		unlock_page(page);
2769 	}
2770 	return 0;
2771 }
2772 
2773 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2774 			    get_extent_t *get_extent, int mirror_num)
2775 {
2776 	struct bio *bio = NULL;
2777 	unsigned long bio_flags = 0;
2778 	int ret;
2779 
2780 	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2781 				      &bio_flags);
2782 	if (bio)
2783 		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2784 	return ret;
2785 }
2786 
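/*
 * account for the pages we just wrote and, for cyclic or whole-file
 * writeback, advance the mapping's writeback_index so the next pass
 * resumes after this page
 */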
2787 static noinline void update_nr_written(struct page *page,
2788 				      struct writeback_control *wbc,
2789 				      unsigned long nr_written)
2790 {
2791 	wbc->nr_to_write -= nr_written;
2792 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2793 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2794 		page->mapping->writeback_index = page->index + nr_written;
2795 }
2796 
2797 /*
2798  * the writepage semantics are similar to regular writepage.  extent
2799  * records are inserted to lock ranges in the tree, and as dirty areas
2800  * are found, they are marked writeback.  Then the lock bits are removed
2801  * and the end_io handler clears the writeback ranges
2802  */
2803 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2804 			      void *data)
2805 {
2806 	struct inode *inode = page->mapping->host;
2807 	struct extent_page_data *epd = data;
2808 	struct extent_io_tree *tree = epd->tree;
2809 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2810 	u64 delalloc_start;
2811 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2812 	u64 end;
2813 	u64 cur = start;
2814 	u64 extent_offset;
2815 	u64 last_byte = i_size_read(inode);
2816 	u64 block_start;
2817 	u64 iosize;
2818 	sector_t sector;
2819 	struct extent_state *cached_state = NULL;
2820 	struct extent_map *em;
2821 	struct block_device *bdev;
2822 	int ret;
2823 	int nr = 0;
2824 	size_t pg_offset = 0;
2825 	size_t blocksize;
2826 	loff_t i_size = i_size_read(inode);
2827 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2828 	u64 nr_delalloc;
2829 	u64 delalloc_end;
2830 	int page_started;
2831 	int compressed;
2832 	int write_flags;
2833 	unsigned long nr_written = 0;
2834 	bool fill_delalloc = true;
2835 
2836 	if (wbc->sync_mode == WB_SYNC_ALL)
2837 		write_flags = WRITE_SYNC;
2838 	else
2839 		write_flags = WRITE;
2840 
2841 	trace___extent_writepage(page, inode, wbc);
2842 
2843 	WARN_ON(!PageLocked(page));
2844 
2845 	ClearPageError(page);
2846 
2847 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2848 	if (page->index > end_index ||
2849 	   (page->index == end_index && !pg_offset)) {
2850 		page->mapping->a_ops->invalidatepage(page, 0);
2851 		unlock_page(page);
2852 		return 0;
2853 	}
2854 
2855 	if (page->index == end_index) {
2856 		char *userpage;
2857 
2858 		userpage = kmap_atomic(page);
2859 		memset(userpage + pg_offset, 0,
2860 		       PAGE_CACHE_SIZE - pg_offset);
2861 		kunmap_atomic(userpage);
2862 		flush_dcache_page(page);
2863 	}
2864 	pg_offset = 0;
2865 
2866 	set_page_extent_mapped(page);
2867 
2868 	if (!tree->ops || !tree->ops->fill_delalloc)
2869 		fill_delalloc = false;
2870 
2871 	delalloc_start = start;
2872 	delalloc_end = 0;
2873 	page_started = 0;
2874 	if (!epd->extent_locked && fill_delalloc) {
2875 		u64 delalloc_to_write = 0;
2876 		/*
2877 		 * make sure the wbc mapping index is at least updated
2878 		 * to this page.
2879 		 */
2880 		update_nr_written(page, wbc, 0);
2881 
2882 		while (delalloc_end < page_end) {
2883 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2884 						       page,
2885 						       &delalloc_start,
2886 						       &delalloc_end,
2887 						       128 * 1024 * 1024);
2888 			if (nr_delalloc == 0) {
2889 				delalloc_start = delalloc_end + 1;
2890 				continue;
2891 			}
2892 			ret = tree->ops->fill_delalloc(inode, page,
2893 						       delalloc_start,
2894 						       delalloc_end,
2895 						       &page_started,
2896 						       &nr_written);
2897 			/* File system has been set read-only */
2898 			if (ret) {
2899 				SetPageError(page);
2900 				goto done;
2901 			}
2902 			/*
2903 			 * delalloc_end is already one less than the total
2904 			 * length, so we don't subtract one from
2905 			 * PAGE_CACHE_SIZE
2906 			 */
2907 			delalloc_to_write += (delalloc_end - delalloc_start +
2908 					      PAGE_CACHE_SIZE) >>
2909 					      PAGE_CACHE_SHIFT;
2910 			delalloc_start = delalloc_end + 1;
2911 		}
2912 		if (wbc->nr_to_write < delalloc_to_write) {
2913 			int thresh = 8192;
2914 
2915 			if (delalloc_to_write < thresh * 2)
2916 				thresh = delalloc_to_write;
2917 			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2918 						 thresh);
2919 		}
2920 
2921 		/* did the fill delalloc function already unlock and start
2922 		 * the IO?
2923 		 */
2924 		if (page_started) {
2925 			ret = 0;
2926 			/*
2927 			 * we've unlocked the page, so we can't update
2928 			 * the mapping's writeback index, just update
2929 			 * nr_to_write.
2930 			 */
2931 			wbc->nr_to_write -= nr_written;
2932 			goto done_unlocked;
2933 		}
2934 	}
2935 	if (tree->ops && tree->ops->writepage_start_hook) {
2936 		ret = tree->ops->writepage_start_hook(page, start,
2937 						      page_end);
2938 		if (ret) {
2939 			/* Fixup worker will requeue */
2940 			if (ret == -EBUSY)
2941 				wbc->pages_skipped++;
2942 			else
2943 				redirty_page_for_writepage(wbc, page);
2944 			update_nr_written(page, wbc, nr_written);
2945 			unlock_page(page);
2946 			ret = 0;
2947 			goto done_unlocked;
2948 		}
2949 	}
2950 
2951 	/*
2952 	 * we don't want to touch the inode after unlocking the page,
2953 	 * so we update the mapping writeback index now
2954 	 */
2955 	update_nr_written(page, wbc, nr_written + 1);
2956 
2957 	end = page_end;
2958 	if (last_byte <= start) {
2959 		if (tree->ops && tree->ops->writepage_end_io_hook)
2960 			tree->ops->writepage_end_io_hook(page, start,
2961 							 page_end, NULL, 1);
2962 		goto done;
2963 	}
2964 
2965 	blocksize = inode->i_sb->s_blocksize;
2966 
2967 	while (cur <= end) {
2968 		if (cur >= last_byte) {
2969 			if (tree->ops && tree->ops->writepage_end_io_hook)
2970 				tree->ops->writepage_end_io_hook(page, cur,
2971 							 page_end, NULL, 1);
2972 			break;
2973 		}
2974 		em = epd->get_extent(inode, page, pg_offset, cur,
2975 				     end - cur + 1, 1);
2976 		if (IS_ERR_OR_NULL(em)) {
2977 			SetPageError(page);
2978 			break;
2979 		}
2980 
2981 		extent_offset = cur - em->start;
2982 		BUG_ON(extent_map_end(em) <= cur);
2983 		BUG_ON(end < cur);
2984 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2985 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2986 		sector = (em->block_start + extent_offset) >> 9;
2987 		bdev = em->bdev;
2988 		block_start = em->block_start;
2989 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2990 		free_extent_map(em);
2991 		em = NULL;
2992 
2993 		/*
2994 		 * compressed and inline extents are written through other
2995 		 * paths in the FS
2996 		 */
2997 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2998 		    block_start == EXTENT_MAP_INLINE) {
2999 			/*
3000 			 * end_io notification does not happen here for
3001 			 * compressed extents
3002 			 */
3003 			if (!compressed && tree->ops &&
3004 			    tree->ops->writepage_end_io_hook)
3005 				tree->ops->writepage_end_io_hook(page, cur,
3006 							 cur + iosize - 1,
3007 							 NULL, 1);
3008 			else if (compressed) {
3009 				/* we don't want to end_page_writeback on
3010 				 * a compressed extent.  this happens
3011 				 * elsewhere
3012 				 */
3013 				nr++;
3014 			}
3015 
3016 			cur += iosize;
3017 			pg_offset += iosize;
3018 			continue;
3019 		}
3020 		/* leave this out until we have a page_mkwrite call */
3021 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
3022 				   EXTENT_DIRTY, 0, NULL)) {
3023 			cur = cur + iosize;
3024 			pg_offset += iosize;
3025 			continue;
3026 		}
3027 
3028 		if (tree->ops && tree->ops->writepage_io_hook) {
3029 			ret = tree->ops->writepage_io_hook(page, cur,
3030 						cur + iosize - 1);
3031 		} else {
3032 			ret = 0;
3033 		}
3034 		if (ret) {
3035 			SetPageError(page);
3036 		} else {
3037 			unsigned long max_nr = end_index + 1;
3038 
3039 			set_range_writeback(tree, cur, cur + iosize - 1);
3040 			if (!PageWriteback(page)) {
3041 				printk(KERN_ERR "btrfs warning page %lu not "
3042 				       "writeback, cur %llu end %llu\n",
3043 				       page->index, (unsigned long long)cur,
3044 				       (unsigned long long)end);
3045 			}
3046 
3047 			ret = submit_extent_page(write_flags, tree, page,
3048 						 sector, iosize, pg_offset,
3049 						 bdev, &epd->bio, max_nr,
3050 						 end_bio_extent_writepage,
3051 						 0, 0, 0);
3052 			if (ret)
3053 				SetPageError(page);
3054 		}
3055 		cur = cur + iosize;
3056 		pg_offset += iosize;
3057 		nr++;
3058 	}
3059 done:
3060 	if (nr == 0) {
3061 		/* make sure the mapping tag for page dirty gets cleared */
3062 		set_page_writeback(page);
3063 		end_page_writeback(page);
3064 	}
3065 	unlock_page(page);
3066 
3067 done_unlocked:
3068 
3069 	/* drop our reference on any cached states */
3070 	free_extent_state(cached_state);
3071 	return 0;
3072 }
3073 
3074 static int eb_wait(void *word)
3075 {
3076 	io_schedule();
3077 	return 0;
3078 }
3079 
3080 static void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
3081 {
3082 	wait_on_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK, eb_wait,
3083 		    TASK_UNINTERRUPTIBLE);
3084 }
3085 
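/*
 * prepare an extent buffer for writeback: if it is dirty, transfer the
 * dirty bit to EXTENT_BUFFER_WRITEBACK, adjust dirty_metadata_bytes and
 * lock all of its pages.  Returns 1 if the buffer should be written and
 * 0 if there is nothing to do, flushing or waiting on any previous
 * writeback as required by epd->sync_io
 */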
3086 static int lock_extent_buffer_for_io(struct extent_buffer *eb,
3087 				     struct btrfs_fs_info *fs_info,
3088 				     struct extent_page_data *epd)
3089 {
3090 	unsigned long i, num_pages;
3091 	int flush = 0;
3092 	int ret = 0;
3093 
3094 	if (!btrfs_try_tree_write_lock(eb)) {
3095 		flush = 1;
3096 		flush_write_bio(epd);
3097 		btrfs_tree_lock(eb);
3098 	}
3099 
3100 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
3101 		btrfs_tree_unlock(eb);
3102 		if (!epd->sync_io)
3103 			return 0;
3104 		if (!flush) {
3105 			flush_write_bio(epd);
3106 			flush = 1;
3107 		}
3108 		while (1) {
3109 			wait_on_extent_buffer_writeback(eb);
3110 			btrfs_tree_lock(eb);
3111 			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
3112 				break;
3113 			btrfs_tree_unlock(eb);
3114 		}
3115 	}
3116 
3117 	/*
3118 	 * We need to do this to prevent races with anyone who checks if the eb is
3119 	 * under IO since we can end up having no IO bits set for a short period
3120 	 * of time.
3121 	 */
3122 	spin_lock(&eb->refs_lock);
3123 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3124 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3125 		spin_unlock(&eb->refs_lock);
3126 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
3127 		spin_lock(&fs_info->delalloc_lock);
3128 		if (fs_info->dirty_metadata_bytes >= eb->len)
3129 			fs_info->dirty_metadata_bytes -= eb->len;
3130 		else
3131 			WARN_ON(1);
3132 		spin_unlock(&fs_info->delalloc_lock);
3133 		ret = 1;
3134 	} else {
3135 		spin_unlock(&eb->refs_lock);
3136 	}
3137 
3138 	btrfs_tree_unlock(eb);
3139 
3140 	if (!ret)
3141 		return ret;
3142 
3143 	num_pages = num_extent_pages(eb->start, eb->len);
3144 	for (i = 0; i < num_pages; i++) {
3145 		struct page *p = extent_buffer_page(eb, i);
3146 
3147 		if (!trylock_page(p)) {
3148 			if (!flush) {
3149 				flush_write_bio(epd);
3150 				flush = 1;
3151 			}
3152 			lock_page(p);
3153 		}
3154 	}
3155 
3156 	return ret;
3157 }
3158 
3159 static void end_extent_buffer_writeback(struct extent_buffer *eb)
3160 {
3161 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
3162 	smp_mb__after_clear_bit();
3163 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
3164 }
3165 
3166 static void end_bio_extent_buffer_writepage(struct bio *bio, int err)
3167 {
3168 	int uptodate = err == 0;
3169 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
3170 	struct extent_buffer *eb;
3171 	int done;
3172 
3173 	do {
3174 		struct page *page = bvec->bv_page;
3175 
3176 		bvec--;
3177 		eb = (struct extent_buffer *)page->private;
3178 		BUG_ON(!eb);
3179 		done = atomic_dec_and_test(&eb->io_pages);
3180 
3181 		if (!uptodate || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) {
3182 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3183 			ClearPageUptodate(page);
3184 			SetPageError(page);
3185 		}
3186 
3187 		end_page_writeback(page);
3188 
3189 		if (!done)
3190 			continue;
3191 
3192 		end_extent_buffer_writeback(eb);
3193 	} while (bvec >= bio->bi_io_vec);
3194 
3195 	bio_put(bio);
3196 
3197 }
3198 
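/*
 * write all pages of a single extent buffer, chaining them onto
 * epd->bio.  On a submission failure the buffer is flagged with
 * EXTENT_BUFFER_IOERR and the pages not yet submitted are unlocked
 */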
3199 static int write_one_eb(struct extent_buffer *eb,
3200 			struct btrfs_fs_info *fs_info,
3201 			struct writeback_control *wbc,
3202 			struct extent_page_data *epd)
3203 {
3204 	struct block_device *bdev = fs_info->fs_devices->latest_bdev;
3205 	u64 offset = eb->start;
3206 	unsigned long i, num_pages;
3207 	unsigned long bio_flags = 0;
3208 	int rw = (epd->sync_io ? WRITE_SYNC : WRITE);
3209 	int ret = 0;
3210 
3211 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3212 	num_pages = num_extent_pages(eb->start, eb->len);
3213 	atomic_set(&eb->io_pages, num_pages);
3214 	if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID)
3215 		bio_flags = EXTENT_BIO_TREE_LOG;
3216 
3217 	for (i = 0; i < num_pages; i++) {
3218 		struct page *p = extent_buffer_page(eb, i);
3219 
3220 		clear_page_dirty_for_io(p);
3221 		set_page_writeback(p);
3222 		ret = submit_extent_page(rw, eb->tree, p, offset >> 9,
3223 					 PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
3224 					 -1, end_bio_extent_buffer_writepage,
3225 					 0, epd->bio_flags, bio_flags);
3226 		epd->bio_flags = bio_flags;
3227 		if (ret) {
3228 			set_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
3229 			SetPageError(p);
3230 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
3231 				end_extent_buffer_writeback(eb);
3232 			ret = -EIO;
3233 			break;
3234 		}
3235 		offset += PAGE_CACHE_SIZE;
3236 		update_nr_written(p, wbc, 1);
3237 		unlock_page(p);
3238 	}
3239 
3240 	if (unlikely(ret)) {
3241 		for (; i < num_pages; i++) {
3242 			struct page *p = extent_buffer_page(eb, i);
3243 			unlock_page(p);
3244 		}
3245 	}
3246 
3247 	return ret;
3248 }
3249 
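/*
 * write back dirty extent buffers for the btree inode: walk the page
 * cache with the dirty/towrite tag, take a reference on each extent
 * buffer found and send it to write_one_eb
 */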
3250 int btree_write_cache_pages(struct address_space *mapping,
3251 				   struct writeback_control *wbc)
3252 {
3253 	struct extent_io_tree *tree = &BTRFS_I(mapping->host)->io_tree;
3254 	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
3255 	struct extent_buffer *eb, *prev_eb = NULL;
3256 	struct extent_page_data epd = {
3257 		.bio = NULL,
3258 		.tree = tree,
3259 		.extent_locked = 0,
3260 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3261 		.bio_flags = 0,
3262 	};
3263 	int ret = 0;
3264 	int done = 0;
3265 	int nr_to_write_done = 0;
3266 	struct pagevec pvec;
3267 	int nr_pages;
3268 	pgoff_t index;
3269 	pgoff_t end;		/* Inclusive */
3270 	int scanned = 0;
3271 	int tag;
3272 
3273 	pagevec_init(&pvec, 0);
3274 	if (wbc->range_cyclic) {
3275 		index = mapping->writeback_index; /* Start from prev offset */
3276 		end = -1;
3277 	} else {
3278 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3279 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3280 		scanned = 1;
3281 	}
3282 	if (wbc->sync_mode == WB_SYNC_ALL)
3283 		tag = PAGECACHE_TAG_TOWRITE;
3284 	else
3285 		tag = PAGECACHE_TAG_DIRTY;
3286 retry:
3287 	if (wbc->sync_mode == WB_SYNC_ALL)
3288 		tag_pages_for_writeback(mapping, index, end);
3289 	while (!done && !nr_to_write_done && (index <= end) &&
3290 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3291 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3292 		unsigned i;
3293 
3294 		scanned = 1;
3295 		for (i = 0; i < nr_pages; i++) {
3296 			struct page *page = pvec.pages[i];
3297 
3298 			if (!PagePrivate(page))
3299 				continue;
3300 
3301 			if (!wbc->range_cyclic && page->index > end) {
3302 				done = 1;
3303 				break;
3304 			}
3305 
3306 			spin_lock(&mapping->private_lock);
3307 			if (!PagePrivate(page)) {
3308 				spin_unlock(&mapping->private_lock);
3309 				continue;
3310 			}
3311 
3312 			eb = (struct extent_buffer *)page->private;
3313 
3314 			/*
3315 			 * Shouldn't happen and normally this would be a BUG_ON
3316 			 * but no sense in crashing the user's box for something
3317 			 * we can survive anyway.
3318 			 */
3319 			if (!eb) {
3320 				spin_unlock(&mapping->private_lock);
3321 				WARN_ON(1);
3322 				continue;
3323 			}
3324 
3325 			if (eb == prev_eb) {
3326 				spin_unlock(&mapping->private_lock);
3327 				continue;
3328 			}
3329 
3330 			ret = atomic_inc_not_zero(&eb->refs);
3331 			spin_unlock(&mapping->private_lock);
3332 			if (!ret)
3333 				continue;
3334 
3335 			prev_eb = eb;
3336 			ret = lock_extent_buffer_for_io(eb, fs_info, &epd);
3337 			if (!ret) {
3338 				free_extent_buffer(eb);
3339 				continue;
3340 			}
3341 
3342 			ret = write_one_eb(eb, fs_info, wbc, &epd);
3343 			if (ret) {
3344 				done = 1;
3345 				free_extent_buffer(eb);
3346 				break;
3347 			}
3348 			free_extent_buffer(eb);
3349 
3350 			/*
3351 			 * the filesystem may choose to bump up nr_to_write.
3352 			 * We have to make sure to honor the new nr_to_write
3353 			 * at any time
3354 			 */
3355 			nr_to_write_done = wbc->nr_to_write <= 0;
3356 		}
3357 		pagevec_release(&pvec);
3358 		cond_resched();
3359 	}
3360 	if (!scanned && !done) {
3361 		/*
3362 		 * We hit the last page and there is more work to be done: wrap
3363 		 * back to the start of the file
3364 		 */
3365 		scanned = 1;
3366 		index = 0;
3367 		goto retry;
3368 	}
3369 	flush_write_bio(&epd);
3370 	return ret;
3371 }
3372 
3373 /**
3374  * extent_write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
3375  * @mapping: address space structure to write
3376  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
3377  * @writepage: function called for each page
3378  * @data: data passed to writepage function
3379  *
3380  * If a page is already under I/O, write_cache_pages() skips it, even
3381  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
3382  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
3383  * and msync() need to guarantee that all the data which was dirty at the time
3384  * the call was made get new I/O started against them.  If wbc->sync_mode is
3385  * WB_SYNC_ALL then we were called for data integrity and we must wait for
3386  * existing IO to complete.
3387  */
3388 static int extent_write_cache_pages(struct extent_io_tree *tree,
3389 			     struct address_space *mapping,
3390 			     struct writeback_control *wbc,
3391 			     writepage_t writepage, void *data,
3392 			     void (*flush_fn)(void *))
3393 {
3394 	struct inode *inode = mapping->host;
3395 	int ret = 0;
3396 	int done = 0;
3397 	int nr_to_write_done = 0;
3398 	struct pagevec pvec;
3399 	int nr_pages;
3400 	pgoff_t index;
3401 	pgoff_t end;		/* Inclusive */
3402 	int scanned = 0;
3403 	int tag;
3404 
3405 	/*
3406 	 * We have to hold onto the inode so that ordered extents can do their
3407 	 * work when the IO finishes.  The alternative to this is failing to add
3408 	 * an ordered extent if the igrab() fails there and that is a huge pain
3409 	 * to deal with, so instead just hold onto the inode throughout the
3410 	 * writepages operation.  If it fails here we are freeing up the inode
3411 	 * anyway and we'd rather not waste our time writing out stuff that is
3412 	 * going to be truncated anyway.
3413 	 */
3414 	if (!igrab(inode))
3415 		return 0;
3416 
3417 	pagevec_init(&pvec, 0);
3418 	if (wbc->range_cyclic) {
3419 		index = mapping->writeback_index; /* Start from prev offset */
3420 		end = -1;
3421 	} else {
3422 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
3423 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
3424 		scanned = 1;
3425 	}
3426 	if (wbc->sync_mode == WB_SYNC_ALL)
3427 		tag = PAGECACHE_TAG_TOWRITE;
3428 	else
3429 		tag = PAGECACHE_TAG_DIRTY;
3430 retry:
3431 	if (wbc->sync_mode == WB_SYNC_ALL)
3432 		tag_pages_for_writeback(mapping, index, end);
3433 	while (!done && !nr_to_write_done && (index <= end) &&
3434 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
3435 			min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
3436 		unsigned i;
3437 
3438 		scanned = 1;
3439 		for (i = 0; i < nr_pages; i++) {
3440 			struct page *page = pvec.pages[i];
3441 
3442 			/*
3443 			 * At this point we hold neither mapping->tree_lock nor
3444 			 * lock on the page itself: the page may be truncated or
3445 			 * invalidated (changing page->mapping to NULL), or even
3446 			 * swizzled back from swapper_space to tmpfs file
3447 			 * mapping
3448 			 */
3449 			if (tree->ops &&
3450 			    tree->ops->write_cache_pages_lock_hook) {
3451 				tree->ops->write_cache_pages_lock_hook(page,
3452 							       data, flush_fn);
3453 			} else {
3454 				if (!trylock_page(page)) {
3455 					flush_fn(data);
3456 					lock_page(page);
3457 				}
3458 			}
3459 
3460 			if (unlikely(page->mapping != mapping)) {
3461 				unlock_page(page);
3462 				continue;
3463 			}
3464 
3465 			if (!wbc->range_cyclic && page->index > end) {
3466 				done = 1;
3467 				unlock_page(page);
3468 				continue;
3469 			}
3470 
3471 			if (wbc->sync_mode != WB_SYNC_NONE) {
3472 				if (PageWriteback(page))
3473 					flush_fn(data);
3474 				wait_on_page_writeback(page);
3475 			}
3476 
3477 			if (PageWriteback(page) ||
3478 			    !clear_page_dirty_for_io(page)) {
3479 				unlock_page(page);
3480 				continue;
3481 			}
3482 
3483 			ret = (*writepage)(page, wbc, data);
3484 
3485 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
3486 				unlock_page(page);
3487 				ret = 0;
3488 			}
3489 			if (ret)
3490 				done = 1;
3491 
3492 			/*
3493 			 * the filesystem may choose to bump up nr_to_write.
3494 			 * We have to make sure to honor the new nr_to_write
3495 			 * at any time
3496 			 */
3497 			nr_to_write_done = wbc->nr_to_write <= 0;
3498 		}
3499 		pagevec_release(&pvec);
3500 		cond_resched();
3501 	}
3502 	if (!scanned && !done) {
3503 		/*
3504 		 * We hit the last page and there is more work to be done: wrap
3505 		 * back to the start of the file
3506 		 */
3507 		scanned = 1;
3508 		index = 0;
3509 		goto retry;
3510 	}
3511 	btrfs_add_delayed_iput(inode);
3512 	return ret;
3513 }
3514 
3515 static void flush_epd_write_bio(struct extent_page_data *epd)
3516 {
3517 	if (epd->bio) {
3518 		int rw = WRITE;
3519 		int ret;
3520 
3521 		if (epd->sync_io)
3522 			rw = WRITE_SYNC;
3523 
3524 		ret = submit_one_bio(rw, epd->bio, 0, epd->bio_flags);
3525 		BUG_ON(ret < 0); /* -ENOMEM */
3526 		epd->bio = NULL;
3527 	}
3528 }
3529 
3530 static noinline void flush_write_bio(void *data)
3531 {
3532 	struct extent_page_data *epd = data;
3533 	flush_epd_write_bio(epd);
3534 }
3535 
3536 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
3537 			  get_extent_t *get_extent,
3538 			  struct writeback_control *wbc)
3539 {
3540 	int ret;
3541 	struct extent_page_data epd = {
3542 		.bio = NULL,
3543 		.tree = tree,
3544 		.get_extent = get_extent,
3545 		.extent_locked = 0,
3546 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3547 		.bio_flags = 0,
3548 	};
3549 
3550 	ret = __extent_writepage(page, wbc, &epd);
3551 
3552 	flush_epd_write_bio(&epd);
3553 	return ret;
3554 }
3555 
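/*
 * write out a range whose extent state the caller already holds locked;
 * with extent_locked set in the epd, __extent_writepage skips the
 * delalloc fill step for this range
 */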
3556 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
3557 			      u64 start, u64 end, get_extent_t *get_extent,
3558 			      int mode)
3559 {
3560 	int ret = 0;
3561 	struct address_space *mapping = inode->i_mapping;
3562 	struct page *page;
3563 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
3564 		PAGE_CACHE_SHIFT;
3565 
3566 	struct extent_page_data epd = {
3567 		.bio = NULL,
3568 		.tree = tree,
3569 		.get_extent = get_extent,
3570 		.extent_locked = 1,
3571 		.sync_io = mode == WB_SYNC_ALL,
3572 		.bio_flags = 0,
3573 	};
3574 	struct writeback_control wbc_writepages = {
3575 		.sync_mode	= mode,
3576 		.nr_to_write	= nr_pages * 2,
3577 		.range_start	= start,
3578 		.range_end	= end + 1,
3579 	};
3580 
3581 	while (start <= end) {
3582 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
3583 		if (clear_page_dirty_for_io(page))
3584 			ret = __extent_writepage(page, &wbc_writepages, &epd);
3585 		else {
3586 			if (tree->ops && tree->ops->writepage_end_io_hook)
3587 				tree->ops->writepage_end_io_hook(page, start,
3588 						 start + PAGE_CACHE_SIZE - 1,
3589 						 NULL, 1);
3590 			unlock_page(page);
3591 		}
3592 		page_cache_release(page);
3593 		start += PAGE_CACHE_SIZE;
3594 	}
3595 
3596 	flush_epd_write_bio(&epd);
3597 	return ret;
3598 }
3599 
3600 int extent_writepages(struct extent_io_tree *tree,
3601 		      struct address_space *mapping,
3602 		      get_extent_t *get_extent,
3603 		      struct writeback_control *wbc)
3604 {
3605 	int ret = 0;
3606 	struct extent_page_data epd = {
3607 		.bio = NULL,
3608 		.tree = tree,
3609 		.get_extent = get_extent,
3610 		.extent_locked = 0,
3611 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
3612 		.bio_flags = 0,
3613 	};
3614 
3615 	ret = extent_write_cache_pages(tree, mapping, wbc,
3616 				       __extent_writepage, &epd,
3617 				       flush_write_bio);
3618 	flush_epd_write_bio(&epd);
3619 	return ret;
3620 }
3621 
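/*
 * readahead entry point: pull pages off the readahead list, add them to
 * the page cache and issue the reads in small batches so bios can be
 * built across neighbouring pages
 */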
3622 int extent_readpages(struct extent_io_tree *tree,
3623 		     struct address_space *mapping,
3624 		     struct list_head *pages, unsigned nr_pages,
3625 		     get_extent_t get_extent)
3626 {
3627 	struct bio *bio = NULL;
3628 	unsigned page_idx;
3629 	unsigned long bio_flags = 0;
3630 	struct page *pagepool[16];
3631 	struct page *page;
3632 	int i = 0;
3633 	int nr = 0;
3634 
3635 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
3636 		page = list_entry(pages->prev, struct page, lru);
3637 
3638 		prefetchw(&page->flags);
3639 		list_del(&page->lru);
3640 		if (add_to_page_cache_lru(page, mapping,
3641 					page->index, GFP_NOFS)) {
3642 			page_cache_release(page);
3643 			continue;
3644 		}
3645 
3646 		pagepool[nr++] = page;
3647 		if (nr < ARRAY_SIZE(pagepool))
3648 			continue;
3649 		for (i = 0; i < nr; i++) {
3650 			__extent_read_full_page(tree, pagepool[i], get_extent,
3651 					&bio, 0, &bio_flags);
3652 			page_cache_release(pagepool[i]);
3653 		}
3654 		nr = 0;
3655 	}
3656 	for (i = 0; i < nr; i++) {
3657 		__extent_read_full_page(tree, pagepool[i], get_extent,
3658 					&bio, 0, &bio_flags);
3659 		page_cache_release(pagepool[i]);
3660 	}
3661 
3662 	BUG_ON(!list_empty(pages));
3663 	if (bio)
3664 		return submit_one_bio(READ, bio, 0, bio_flags);
3665 	return 0;
3666 }
3667 
3668 /*
3669  * basic invalidatepage code; this waits on any locked or writeback
3670  * ranges corresponding to the page, and then deletes any extent state
3671  * records from the tree
3672  */
3673 int extent_invalidatepage(struct extent_io_tree *tree,
3674 			  struct page *page, unsigned long offset)
3675 {
3676 	struct extent_state *cached_state = NULL;
3677 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
3678 	u64 end = start + PAGE_CACHE_SIZE - 1;
3679 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
3680 
3681 	start += (offset + blocksize - 1) & ~(blocksize - 1);
3682 	if (start > end)
3683 		return 0;
3684 
3685 	lock_extent_bits(tree, start, end, 0, &cached_state);
3686 	wait_on_page_writeback(page);
3687 	clear_extent_bit(tree, start, end,
3688 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
3689 			 EXTENT_DO_ACCOUNTING,
3690 			 1, 1, &cached_state, GFP_NOFS);
3691 	return 0;
3692 }
3693 
3694 /*
3695  * a helper for releasepage; this tests for areas of the page that
3696  * are locked or under IO and drops the related state bits if it is safe
3697  * to drop the page.
3698  */
3699 int try_release_extent_state(struct extent_map_tree *map,
3700 			     struct extent_io_tree *tree, struct page *page,
3701 			     gfp_t mask)
3702 {
3703 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3704 	u64 end = start + PAGE_CACHE_SIZE - 1;
3705 	int ret = 1;
3706 
3707 	if (test_range_bit(tree, start, end,
3708 			   EXTENT_IOBITS, 0, NULL))
3709 		ret = 0;
3710 	else {
3711 		if ((mask & GFP_NOFS) == GFP_NOFS)
3712 			mask = GFP_NOFS;
3713 		/*
3714 		 * at this point we can safely clear everything except the
3715 		 * locked bit and the nodatasum bit
3716 		 */
3717 		ret = clear_extent_bit(tree, start, end,
3718 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
3719 				 0, 0, NULL, mask);
3720 
3721 		/* if clear_extent_bit failed for enomem reasons,
3722 		 * we can't allow the release to continue.
3723 		 */
3724 		if (ret < 0)
3725 			ret = 0;
3726 		else
3727 			ret = 1;
3728 	}
3729 	return ret;
3730 }
3731 
3732 /*
3733  * a helper for releasepage.  As long as there are no locked extents
3734  * in the range corresponding to the page, both state records and extent
3735  * map records are removed
3736  */
3737 int try_release_extent_mapping(struct extent_map_tree *map,
3738 			       struct extent_io_tree *tree, struct page *page,
3739 			       gfp_t mask)
3740 {
3741 	struct extent_map *em;
3742 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
3743 	u64 end = start + PAGE_CACHE_SIZE - 1;
3744 
3745 	if ((mask & __GFP_WAIT) &&
3746 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
3747 		u64 len;
3748 		while (start <= end) {
3749 			len = end - start + 1;
3750 			write_lock(&map->lock);
3751 			em = lookup_extent_mapping(map, start, len);
3752 			if (!em) {
3753 				write_unlock(&map->lock);
3754 				break;
3755 			}
3756 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
3757 			    em->start != start) {
3758 				write_unlock(&map->lock);
3759 				free_extent_map(em);
3760 				break;
3761 			}
3762 			if (!test_range_bit(tree, em->start,
3763 					    extent_map_end(em) - 1,
3764 					    EXTENT_LOCKED | EXTENT_WRITEBACK,
3765 					    0, NULL)) {
3766 				remove_extent_mapping(map, em);
3767 				/* once for the rb tree */
3768 				free_extent_map(em);
3769 			}
3770 			start = extent_map_end(em);
3771 			write_unlock(&map->lock);
3772 
3773 			/* once for us */
3774 			free_extent_map(em);
3775 		}
3776 	}
3777 	return try_release_extent_state(map, tree, page, mask);
3778 }
3779 
3780 /*
3781  * helper function for fiemap, which doesn't want to see any holes.
3782  * This maps until we find something past 'last'
3783  */
3784 static struct extent_map *get_extent_skip_holes(struct inode *inode,
3785 						u64 offset,
3786 						u64 last,
3787 						get_extent_t *get_extent)
3788 {
3789 	u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
3790 	struct extent_map *em;
3791 	u64 len;
3792 
3793 	if (offset >= last)
3794 		return NULL;
3795 
3796 	while (1) {
3797 		len = last - offset;
3798 		if (len == 0)
3799 			break;
3800 		len = (len + sectorsize - 1) & ~(sectorsize - 1);
3801 		em = get_extent(inode, NULL, 0, offset, len, 0);
3802 		if (IS_ERR_OR_NULL(em))
3803 			return em;
3804 
3805 		/* if this isn't a hole return it */
3806 		if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
3807 		    em->block_start != EXTENT_MAP_HOLE) {
3808 			return em;
3809 		}
3810 
3811 		/* this is a hole, advance to the next extent */
3812 		offset = extent_map_end(em);
3813 		free_extent_map(em);
3814 		if (offset >= last)
3815 			break;
3816 	}
3817 	return NULL;
3818 }
3819 
3820 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3821 		__u64 start, __u64 len, get_extent_t *get_extent)
3822 {
3823 	int ret = 0;
3824 	u64 off = start;
3825 	u64 max = start + len;
3826 	u32 flags = 0;
3827 	u32 found_type;
3828 	u64 last;
3829 	u64 last_for_get_extent = 0;
3830 	u64 disko = 0;
3831 	u64 isize = i_size_read(inode);
3832 	struct btrfs_key found_key;
3833 	struct extent_map *em = NULL;
3834 	struct extent_state *cached_state = NULL;
3835 	struct btrfs_path *path;
3836 	struct btrfs_file_extent_item *item;
3837 	int end = 0;
3838 	u64 em_start = 0;
3839 	u64 em_len = 0;
3840 	u64 em_end = 0;
3841 	unsigned long emflags;
3842 
3843 	if (len == 0)
3844 		return -EINVAL;
3845 
3846 	path = btrfs_alloc_path();
3847 	if (!path)
3848 		return -ENOMEM;
3849 	path->leave_spinning = 1;
3850 
3851 	start = ALIGN(start, BTRFS_I(inode)->root->sectorsize);
3852 	len = ALIGN(len, BTRFS_I(inode)->root->sectorsize);
3853 
3854 	/*
3855 	 * lookup the last file extent.  We're not using i_size here
3856 	 * because there might be preallocation past i_size
3857 	 */
3858 	ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
3859 				       path, btrfs_ino(inode), -1, 0);
3860 	if (ret < 0) {
3861 		btrfs_free_path(path);
3862 		return ret;
3863 	}
3864 	WARN_ON(!ret);
3865 	path->slots[0]--;
3866 	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3867 			      struct btrfs_file_extent_item);
3868 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
3869 	found_type = btrfs_key_type(&found_key);
3870 
3871 	/* No extents, but there might be delalloc bits */
3872 	if (found_key.objectid != btrfs_ino(inode) ||
3873 	    found_type != BTRFS_EXTENT_DATA_KEY) {
3874 		/* have to trust i_size as the end */
3875 		last = (u64)-1;
3876 		last_for_get_extent = isize;
3877 	} else {
3878 		/*
3879 		 * remember the start of the last extent.  There are a
3880 		 * bunch of different factors that go into the length of the
3881 		 * extent, so it's much less complex to remember where it started
3882 		 */
3883 		last = found_key.offset;
3884 		last_for_get_extent = last + 1;
3885 	}
3886 	btrfs_free_path(path);
3887 
3888 	/*
3889 	 * we might have some extents allocated but more delalloc past those
3890 	 * extents.  so, we trust isize unless the start of the last extent is
3891 	 * beyond isize
3892 	 */
3893 	if (last < isize) {
3894 		last = (u64)-1;
3895 		last_for_get_extent = isize;
3896 	}
3897 
3898 	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
3899 			 &cached_state);
3900 
3901 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
3902 				   get_extent);
3903 	if (!em)
3904 		goto out;
3905 	if (IS_ERR(em)) {
3906 		ret = PTR_ERR(em);
3907 		goto out;
3908 	}
3909 
3910 	while (!end) {
3911 		u64 offset_in_extent;
3912 
3913 		/* break if the extent we found is outside the range */
3914 		if (em->start >= max || extent_map_end(em) < off)
3915 			break;
3916 
3917 		/*
3918 		 * get_extent may return an extent that starts before our
3919 		 * requested range.  We have to make sure the ranges
3920 		 * we return to fiemap always move forward and don't
3921 		 * overlap, so adjust the offsets here
3922 		 */
3923 		em_start = max(em->start, off);
3924 
3925 		/*
3926 		 * record the offset from the start of the extent
3927 		 * for adjusting the disk offset below
3928 		 */
3929 		offset_in_extent = em_start - em->start;
3930 		em_end = extent_map_end(em);
3931 		em_len = em_end - em_start;
3932 		emflags = em->flags;
3933 		disko = 0;
3934 		flags = 0;
3935 
3936 		/*
3937 		 * bump off for our next call to get_extent
3938 		 */
3939 		off = extent_map_end(em);
3940 		if (off >= max)
3941 			end = 1;
3942 
3943 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
3944 			end = 1;
3945 			flags |= FIEMAP_EXTENT_LAST;
3946 		} else if (em->block_start == EXTENT_MAP_INLINE) {
3947 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
3948 				  FIEMAP_EXTENT_NOT_ALIGNED);
3949 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
3950 			flags |= (FIEMAP_EXTENT_DELALLOC |
3951 				  FIEMAP_EXTENT_UNKNOWN);
3952 		} else {
3953 			disko = em->block_start + offset_in_extent;
3954 		}
3955 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3956 			flags |= FIEMAP_EXTENT_ENCODED;
3957 
3958 		free_extent_map(em);
3959 		em = NULL;
3960 		if ((em_start >= last) || em_len == (u64)-1 ||
3961 		    (last == (u64)-1 && isize <= em_end)) {
3962 			flags |= FIEMAP_EXTENT_LAST;
3963 			end = 1;
3964 		}
3965 
3966 		/* now scan forward to see if this is really the last extent. */
3967 		em = get_extent_skip_holes(inode, off, last_for_get_extent,
3968 					   get_extent);
3969 		if (IS_ERR(em)) {
3970 			ret = PTR_ERR(em);
3971 			goto out;
3972 		}
3973 		if (!em) {
3974 			flags |= FIEMAP_EXTENT_LAST;
3975 			end = 1;
3976 		}
3977 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3978 					      em_len, flags);
3979 		if (ret)
3980 			goto out_free;
3981 	}
3982 out_free:
3983 	free_extent_map(em);
3984 out:
3985 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1,
3986 			     &cached_state, GFP_NOFS);
3987 	return ret;
3988 }
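
/*
 * Sketch of the expected entry point for the function above: a ->fiemap
 * inode operation validates the caller's flags and then hands the range
 * to extent_fiemap() together with a get_extent callback (btrfs wires
 * this up via btrfs_fiemap() and btrfs_get_extent_fiemap() in
 * fs/btrfs/inode.c).  The example_ name and the flag set below are
 * illustrative only.
 */
static int example_fiemap(struct inode *inode,
			  struct fiemap_extent_info *fieinfo,
			  __u64 start, __u64 len, get_extent_t *get_extent)
{
	int ret;

	/* refuse fiemap flags this implementation does not understand */
	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
	if (ret)
		return ret;

	return extent_fiemap(inode, fieinfo, start, len, get_extent);
}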
3989 
3990 static void __free_extent_buffer(struct extent_buffer *eb)
3991 {
3992 #if LEAK_DEBUG
3993 	unsigned long flags;
3994 	spin_lock_irqsave(&leak_lock, flags);
3995 	list_del(&eb->leak_list);
3996 	spin_unlock_irqrestore(&leak_lock, flags);
3997 #endif
3998 	if (eb->pages && eb->pages != eb->inline_pages)
3999 		kfree(eb->pages);
4000 	kmem_cache_free(extent_buffer_cache, eb);
4001 }
4002 
4003 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
4004 						   u64 start,
4005 						   unsigned long len,
4006 						   gfp_t mask)
4007 {
4008 	struct extent_buffer *eb = NULL;
4009 #if LEAK_DEBUG
4010 	unsigned long flags;
4011 #endif
4012 
4013 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
4014 	if (eb == NULL)
4015 		return NULL;
4016 	eb->start = start;
4017 	eb->len = len;
4018 	eb->tree = tree;
4019 	eb->bflags = 0;
4020 	rwlock_init(&eb->lock);
4021 	atomic_set(&eb->write_locks, 0);
4022 	atomic_set(&eb->read_locks, 0);
4023 	atomic_set(&eb->blocking_readers, 0);
4024 	atomic_set(&eb->blocking_writers, 0);
4025 	atomic_set(&eb->spinning_readers, 0);
4026 	atomic_set(&eb->spinning_writers, 0);
4027 	eb->lock_nested = 0;
4028 	init_waitqueue_head(&eb->write_lock_wq);
4029 	init_waitqueue_head(&eb->read_lock_wq);
4030 
4031 #if LEAK_DEBUG
4032 	spin_lock_irqsave(&leak_lock, flags);
4033 	list_add(&eb->leak_list, &buffers);
4034 	spin_unlock_irqrestore(&leak_lock, flags);
4035 #endif
4036 	spin_lock_init(&eb->refs_lock);
4037 	atomic_set(&eb->refs, 1);
4038 	atomic_set(&eb->io_pages, 0);
4039 
4040 	if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) {
4041 		struct page **pages;
4042 		int num_pages = (len + PAGE_CACHE_SIZE - 1) >>
4043 			PAGE_CACHE_SHIFT;
4044 		pages = kzalloc(num_pages * sizeof(struct page *), mask);
4045 		if (!pages) {
4046 			__free_extent_buffer(eb);
4047 			return NULL;
4048 		}
4049 		eb->pages = pages;
4050 	} else {
4051 		eb->pages = eb->inline_pages;
4052 	}
4053 
4054 	return eb;
4055 }
4056 
4057 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4058 {
4059 	unsigned long i;
4060 	struct page *p;
4061 	struct extent_buffer *new;
4062 	unsigned long num_pages = num_extent_pages(src->start, src->len);
4063 
4064 	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
4065 	if (new == NULL)
4066 		return NULL;
4067 
4068 	for (i = 0; i < num_pages; i++) {
4069 		p = alloc_page(GFP_ATOMIC);
4070 		BUG_ON(!p);
4071 		attach_extent_buffer_page(new, p);
4072 		WARN_ON(PageDirty(p));
4073 		SetPageUptodate(p);
4074 		new->pages[i] = p;
4075 	}
4076 
4077 	copy_extent_buffer(new, src, 0, 0, src->len);
4078 	set_bit(EXTENT_BUFFER_UPTODATE, &new->bflags);
4079 	set_bit(EXTENT_BUFFER_DUMMY, &new->bflags);
4080 
4081 	return new;
4082 }
4083 
4084 struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
4085 {
4086 	struct extent_buffer *eb;
4087 	unsigned long num_pages = num_extent_pages(0, len);
4088 	unsigned long i;
4089 
4090 	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
4091 	if (!eb)
4092 		return NULL;
4093 
4094 	for (i = 0; i < num_pages; i++) {
4095 		eb->pages[i] = alloc_page(GFP_ATOMIC);
4096 		if (!eb->pages[i])
4097 			goto err;
4098 	}
4099 	set_extent_buffer_uptodate(eb);
4100 	btrfs_set_header_nritems(eb, 0);
4101 	set_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4102 
4103 	return eb;
4104 err:
4105 	for (; i > 0; i--)
4106 		__free_page(eb->pages[i - 1]);
4107 	__free_extent_buffer(eb);
4108 	return NULL;
4109 }
4110 
4111 static int extent_buffer_under_io(struct extent_buffer *eb)
4112 {
4113 	return (atomic_read(&eb->io_pages) ||
4114 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
4115 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4116 }
4117 
4118 /*
4119  * Helper for releasing extent buffer page.
4120  */
4121 static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
4122 						unsigned long start_idx)
4123 {
4124 	unsigned long index;
4125 	unsigned long num_pages;
4126 	struct page *page;
4127 	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
4128 
4129 	BUG_ON(extent_buffer_under_io(eb));
4130 
4131 	num_pages = num_extent_pages(eb->start, eb->len);
4132 	index = start_idx + num_pages;
4133 	if (start_idx >= index)
4134 		return;
4135 
4136 	do {
4137 		index--;
4138 		page = extent_buffer_page(eb, index);
4139 		if (page && mapped) {
4140 			spin_lock(&page->mapping->private_lock);
4141 			/*
4142 			 * We do this since we'll remove the pages after we've
4143 			 * removed the eb from the radix tree, so we could race
4144 			 * and have this page now attached to the new eb.  So
4145 			 * only clear page_private if it's still connected to
4146 			 * this eb.
4147 			 */
4148 			if (PagePrivate(page) &&
4149 			    page->private == (unsigned long)eb) {
4150 				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
4151 				BUG_ON(PageDirty(page));
4152 				BUG_ON(PageWriteback(page));
4153 				/*
4154 				 * We need to make sure we haven't been attached
4155 				 * to a new eb.
4156 				 */
4157 				ClearPagePrivate(page);
4158 				set_page_private(page, 0);
4159 				/* One for the page private */
4160 				page_cache_release(page);
4161 			}
4162 			spin_unlock(&page->mapping->private_lock);
4163 
4164 		}
4165 		if (page) {
4166 			/* One for when we alloced the page */
4167 			page_cache_release(page);
4168 		}
4169 	} while (index != start_idx);
4170 }
4171 
4172 /*
4173  * Helper for releasing the extent buffer.
4174  */
4175 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4176 {
4177 	btrfs_release_extent_buffer_page(eb, 0);
4178 	__free_extent_buffer(eb);
4179 }
4180 
4181 static void check_buffer_tree_ref(struct extent_buffer *eb)
4182 {
4183 	/* the ref bit is tricky.  We have to make sure it is set
4184 	 * if we have the buffer dirty.   Otherwise the
4185 	 * code to free a buffer can end up dropping a dirty
4186 	 * page
4187 	 *
4188 	 * Once the ref bit is set, it won't go away while the
4189 	 * buffer is dirty or in writeback, and it also won't
4190 	 * go away while we have the reference count on the
4191 	 * eb bumped.
4192 	 *
4193 	 * We can't just set the ref bit without bumping the
4194 	 * ref on the eb because free_extent_buffer might
4195 	 * see the ref bit and try to clear it.  If this happens
4196 	 * free_extent_buffer might end up dropping our original
4197 	 * ref by mistake and freeing the page before we are able
4198 	 * to add one more ref.
4199 	 *
4200 	 * So, with eb->refs_lock held, set the bit and take the extra
4201 	 * ref only if we were the ones who actually set the bit.
4202 	 */
4203 	spin_lock(&eb->refs_lock);
4204 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4205 		atomic_inc(&eb->refs);
4206 	spin_unlock(&eb->refs_lock);
4207 }
4208 
4209 static void mark_extent_buffer_accessed(struct extent_buffer *eb)
4210 {
4211 	unsigned long num_pages, i;
4212 
4213 	check_buffer_tree_ref(eb);
4214 
4215 	num_pages = num_extent_pages(eb->start, eb->len);
4216 	for (i = 0; i < num_pages; i++) {
4217 		struct page *p = extent_buffer_page(eb, i);
4218 		mark_page_accessed(p);
4219 	}
4220 }
4221 
4222 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
4223 					  u64 start, unsigned long len)
4224 {
4225 	unsigned long num_pages = num_extent_pages(start, len);
4226 	unsigned long i;
4227 	unsigned long index = start >> PAGE_CACHE_SHIFT;
4228 	struct extent_buffer *eb;
4229 	struct extent_buffer *exists = NULL;
4230 	struct page *p;
4231 	struct address_space *mapping = tree->mapping;
4232 	int uptodate = 1;
4233 	int ret;
4234 
4235 	rcu_read_lock();
4236 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4237 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4238 		rcu_read_unlock();
4239 		mark_extent_buffer_accessed(eb);
4240 		return eb;
4241 	}
4242 	rcu_read_unlock();
4243 
4244 	eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
4245 	if (!eb)
4246 		return NULL;
4247 
4248 	for (i = 0; i < num_pages; i++, index++) {
4249 		p = find_or_create_page(mapping, index, GFP_NOFS);
4250 		if (!p)
4251 			goto free_eb;
4252 
4253 		spin_lock(&mapping->private_lock);
4254 		if (PagePrivate(p)) {
4255 			/*
4256 			 * We could have already allocated an eb for this page
4257 			 * and attached one, so let's see if we can get a ref on
4258 			 * the existing eb.  If we can, we know it's good and
4259 			 * we can just return that one; else we know we can just
4260 			 * overwrite page->private.
4261 			 */
4262 			exists = (struct extent_buffer *)p->private;
4263 			if (atomic_inc_not_zero(&exists->refs)) {
4264 				spin_unlock(&mapping->private_lock);
4265 				unlock_page(p);
4266 				page_cache_release(p);
4267 				mark_extent_buffer_accessed(exists);
4268 				goto free_eb;
4269 			}
4270 
4271 			/*
4272 			 * Do this so attach doesn't complain; we also need to
4273 			 * drop the ref the old eb held on this page.
4274 			 */
4275 			ClearPagePrivate(p);
4276 			WARN_ON(PageDirty(p));
4277 			page_cache_release(p);
4278 		}
4279 		attach_extent_buffer_page(eb, p);
4280 		spin_unlock(&mapping->private_lock);
4281 		WARN_ON(PageDirty(p));
4282 		mark_page_accessed(p);
4283 		eb->pages[i] = p;
4284 		if (!PageUptodate(p))
4285 			uptodate = 0;
4286 
4287 		/*
4288 		 * see below about how we avoid a nasty race with release page
4289 		 * and why we unlock later
4290 		 */
4291 	}
4292 	if (uptodate)
4293 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4294 again:
4295 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
4296 	if (ret)
4297 		goto free_eb;
4298 
4299 	spin_lock(&tree->buffer_lock);
4300 	ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
4301 	if (ret == -EEXIST) {
4302 		exists = radix_tree_lookup(&tree->buffer,
4303 						start >> PAGE_CACHE_SHIFT);
4304 		if (!atomic_inc_not_zero(&exists->refs)) {
4305 			spin_unlock(&tree->buffer_lock);
4306 			radix_tree_preload_end();
4307 			exists = NULL;
4308 			goto again;
4309 		}
4310 		spin_unlock(&tree->buffer_lock);
4311 		radix_tree_preload_end();
4312 		mark_extent_buffer_accessed(exists);
4313 		goto free_eb;
4314 	}
4315 	/* add one reference for the tree */
4316 	check_buffer_tree_ref(eb);
4317 	spin_unlock(&tree->buffer_lock);
4318 	radix_tree_preload_end();
4319 
4320 	/*
4321 	 * there is a race where release page may have
4322 	 * tried to find this extent buffer in the radix tree
4323 	 * but failed.  It will tell the VM it is safe to
4324 	 * reclaim the page, and it will clear the page private bit.
4325 	 * We must make sure to set the page private bit properly
4326 	 * after the extent buffer is in the radix tree so
4327 	 * it doesn't get lost
4328 	 */
4329 	SetPageChecked(eb->pages[0]);
4330 	for (i = 1; i < num_pages; i++) {
4331 		p = extent_buffer_page(eb, i);
4332 		ClearPageChecked(p);
4333 		unlock_page(p);
4334 	}
4335 	unlock_page(eb->pages[0]);
4336 	return eb;
4337 
4338 free_eb:
4339 	for (i = 0; i < num_pages; i++) {
4340 		if (eb->pages[i])
4341 			unlock_page(eb->pages[i]);
4342 	}
4343 
4344 	WARN_ON(!atomic_dec_and_test(&eb->refs));
4345 	btrfs_release_extent_buffer(eb);
4346 	return exists;
4347 }
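
/*
 * Sketch of how the btree code typically obtains an extent buffer: look
 * it up or create it against the btree inode's io tree, keyed by the
 * block's logical address, much like btrfs_find_create_tree_block() in
 * fs/btrfs/disk-io.c.  The example_ name is illustrative only.
 */
static struct extent_buffer *
example_find_create_tree_block(struct btrfs_root *root, u64 bytenr,
			       u32 blocksize)
{
	struct inode *btree_inode = root->fs_info->btree_inode;

	/* the returned eb carries a reference the caller must drop */
	return alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
				   bytenr, blocksize);
}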
4348 
4349 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
4350 					 u64 start, unsigned long len)
4351 {
4352 	struct extent_buffer *eb;
4353 
4354 	rcu_read_lock();
4355 	eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
4356 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4357 		rcu_read_unlock();
4358 		mark_extent_buffer_accessed(eb);
4359 		return eb;
4360 	}
4361 	rcu_read_unlock();
4362 
4363 	return NULL;
4364 }
4365 
4366 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
4367 {
4368 	struct extent_buffer *eb =
4369 			container_of(head, struct extent_buffer, rcu_head);
4370 
4371 	__free_extent_buffer(eb);
4372 }
4373 
4374 /* Expects to have eb->eb_lock already held */
4375 static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask)
4376 {
4377 	WARN_ON(atomic_read(&eb->refs) == 0);
4378 	if (atomic_dec_and_test(&eb->refs)) {
4379 		if (test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) {
4380 			spin_unlock(&eb->refs_lock);
4381 		} else {
4382 			struct extent_io_tree *tree = eb->tree;
4383 
4384 			spin_unlock(&eb->refs_lock);
4385 
4386 			spin_lock(&tree->buffer_lock);
4387 			radix_tree_delete(&tree->buffer,
4388 					  eb->start >> PAGE_CACHE_SHIFT);
4389 			spin_unlock(&tree->buffer_lock);
4390 		}
4391 
4392 		/* Should be safe to release our pages at this point */
4393 		btrfs_release_extent_buffer_page(eb, 0);
4394 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
4395 		return 1;
4396 	}
4397 	spin_unlock(&eb->refs_lock);
4398 
4399 	return 0;
4400 }
4401 
4402 void free_extent_buffer(struct extent_buffer *eb)
4403 {
4404 	if (!eb)
4405 		return;
4406 
4407 	spin_lock(&eb->refs_lock);
4408 	if (atomic_read(&eb->refs) == 2 &&
4409 	    test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags))
4410 		atomic_dec(&eb->refs);
4411 
4412 	if (atomic_read(&eb->refs) == 2 &&
4413 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
4414 	    !extent_buffer_under_io(eb) &&
4415 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4416 		atomic_dec(&eb->refs);
4417 
4418 	/*
4419 	 * I know this is terrible, but it's temporary until we stop tracking
4420 	 * the uptodate bits and such for the extent buffers.
4421 	 */
4422 	release_extent_buffer(eb, GFP_ATOMIC);
4423 }
4424 
4425 void free_extent_buffer_stale(struct extent_buffer *eb)
4426 {
4427 	if (!eb)
4428 		return;
4429 
4430 	spin_lock(&eb->refs_lock);
4431 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
4432 
4433 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
4434 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
4435 		atomic_dec(&eb->refs);
4436 	release_extent_buffer(eb, GFP_NOFS);
4437 }
4438 
4439 void clear_extent_buffer_dirty(struct extent_buffer *eb)
4440 {
4441 	unsigned long i;
4442 	unsigned long num_pages;
4443 	struct page *page;
4444 
4445 	num_pages = num_extent_pages(eb->start, eb->len);
4446 
4447 	for (i = 0; i < num_pages; i++) {
4448 		page = extent_buffer_page(eb, i);
4449 		if (!PageDirty(page))
4450 			continue;
4451 
4452 		lock_page(page);
4453 		WARN_ON(!PagePrivate(page));
4454 
4455 		clear_page_dirty_for_io(page);
4456 		spin_lock_irq(&page->mapping->tree_lock);
4457 		if (!PageDirty(page)) {
4458 			radix_tree_tag_clear(&page->mapping->page_tree,
4459 						page_index(page),
4460 						PAGECACHE_TAG_DIRTY);
4461 		}
4462 		spin_unlock_irq(&page->mapping->tree_lock);
4463 		ClearPageError(page);
4464 		unlock_page(page);
4465 	}
4466 	WARN_ON(atomic_read(&eb->refs) == 0);
4467 }
4468 
4469 int set_extent_buffer_dirty(struct extent_buffer *eb)
4470 {
4471 	unsigned long i;
4472 	unsigned long num_pages;
4473 	int was_dirty = 0;
4474 
4475 	check_buffer_tree_ref(eb);
4476 
4477 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
4478 
4479 	num_pages = num_extent_pages(eb->start, eb->len);
4480 	WARN_ON(atomic_read(&eb->refs) == 0);
4481 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
4482 
4483 	for (i = 0; i < num_pages; i++)
4484 		set_page_dirty(extent_buffer_page(eb, i));
4485 	return was_dirty;
4486 }
4487 
4488 static int range_straddles_pages(u64 start, u64 len)
4489 {
4490 	if (len < PAGE_CACHE_SIZE)
4491 		return 1;
4492 	if (start & (PAGE_CACHE_SIZE - 1))
4493 		return 1;
4494 	if ((start + len) & (PAGE_CACHE_SIZE - 1))
4495 		return 1;
4496 	return 0;
4497 }
4498 
4499 int clear_extent_buffer_uptodate(struct extent_buffer *eb)
4500 {
4501 	unsigned long i;
4502 	struct page *page;
4503 	unsigned long num_pages;
4504 
4505 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4506 	num_pages = num_extent_pages(eb->start, eb->len);
4507 	for (i = 0; i < num_pages; i++) {
4508 		page = extent_buffer_page(eb, i);
4509 		if (page)
4510 			ClearPageUptodate(page);
4511 	}
4512 	return 0;
4513 }
4514 
4515 int set_extent_buffer_uptodate(struct extent_buffer *eb)
4516 {
4517 	unsigned long i;
4518 	struct page *page;
4519 	unsigned long num_pages;
4520 
4521 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4522 	num_pages = num_extent_pages(eb->start, eb->len);
4523 	for (i = 0; i < num_pages; i++) {
4524 		page = extent_buffer_page(eb, i);
4525 		SetPageUptodate(page);
4526 	}
4527 	return 0;
4528 }
4529 
4530 int extent_range_uptodate(struct extent_io_tree *tree,
4531 			  u64 start, u64 end)
4532 {
4533 	struct page *page;
4534 	int ret;
4535 	int pg_uptodate = 1;
4536 	int uptodate;
4537 	unsigned long index;
4538 
4539 	if (range_straddles_pages(start, end - start + 1)) {
4540 		ret = test_range_bit(tree, start, end,
4541 				     EXTENT_UPTODATE, 1, NULL);
4542 		if (ret)
4543 			return 1;
4544 	}
4545 	while (start <= end) {
4546 		index = start >> PAGE_CACHE_SHIFT;
4547 		page = find_get_page(tree->mapping, index);
4548 		if (!page)
4549 			return 1;
4550 		uptodate = PageUptodate(page);
4551 		page_cache_release(page);
4552 		if (!uptodate) {
4553 			pg_uptodate = 0;
4554 			break;
4555 		}
4556 		start += PAGE_CACHE_SIZE;
4557 	}
4558 	return pg_uptodate;
4559 }
4560 
4561 int extent_buffer_uptodate(struct extent_buffer *eb)
4562 {
4563 	return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4564 }
4565 
4566 int read_extent_buffer_pages(struct extent_io_tree *tree,
4567 			     struct extent_buffer *eb, u64 start, int wait,
4568 			     get_extent_t *get_extent, int mirror_num)
4569 {
4570 	unsigned long i;
4571 	unsigned long start_i;
4572 	struct page *page;
4573 	int err;
4574 	int ret = 0;
4575 	int locked_pages = 0;
4576 	int all_uptodate = 1;
4577 	unsigned long num_pages;
4578 	unsigned long num_reads = 0;
4579 	struct bio *bio = NULL;
4580 	unsigned long bio_flags = 0;
4581 
4582 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
4583 		return 0;
4584 
4585 	if (start) {
4586 		WARN_ON(start < eb->start);
4587 		start_i = (start >> PAGE_CACHE_SHIFT) -
4588 			(eb->start >> PAGE_CACHE_SHIFT);
4589 	} else {
4590 		start_i = 0;
4591 	}
4592 
4593 	num_pages = num_extent_pages(eb->start, eb->len);
4594 	for (i = start_i; i < num_pages; i++) {
4595 		page = extent_buffer_page(eb, i);
4596 		if (wait == WAIT_NONE) {
4597 			if (!trylock_page(page))
4598 				goto unlock_exit;
4599 		} else {
4600 			lock_page(page);
4601 		}
4602 		locked_pages++;
4603 		if (!PageUptodate(page)) {
4604 			num_reads++;
4605 			all_uptodate = 0;
4606 		}
4607 	}
4608 	if (all_uptodate) {
4609 		if (start_i == 0)
4610 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4611 		goto unlock_exit;
4612 	}
4613 
4614 	clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags);
4615 	eb->read_mirror = 0;
4616 	atomic_set(&eb->io_pages, num_reads);
4617 	for (i = start_i; i < num_pages; i++) {
4618 		page = extent_buffer_page(eb, i);
4619 		if (!PageUptodate(page)) {
4620 			ClearPageError(page);
4621 			err = __extent_read_full_page(tree, page,
4622 						      get_extent, &bio,
4623 						      mirror_num, &bio_flags);
4624 			if (err)
4625 				ret = err;
4626 		} else {
4627 			unlock_page(page);
4628 		}
4629 	}
4630 
4631 	if (bio) {
4632 		err = submit_one_bio(READ, bio, mirror_num, bio_flags);
4633 		if (err)
4634 			return err;
4635 	}
4636 
4637 	if (ret || wait != WAIT_COMPLETE)
4638 		return ret;
4639 
4640 	for (i = start_i; i < num_pages; i++) {
4641 		page = extent_buffer_page(eb, i);
4642 		wait_on_page_locked(page);
4643 		if (!PageUptodate(page))
4644 			ret = -EIO;
4645 	}
4646 
4647 	return ret;
4648 
4649 unlock_exit:
4650 	i = start_i;
4651 	while (locked_pages > 0) {
4652 		page = extent_buffer_page(eb, i);
4653 		i++;
4654 		unlock_page(page);
4655 		locked_pages--;
4656 	}
4657 	return ret;
4658 }
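
/*
 * Sketch of a caller that needs fully up to date contents: pass
 * WAIT_COMPLETE so the submitted pages are waited for, then check the
 * buffer-wide uptodate bit.  The real metadata read path layers retry
 * and mirror selection on top of this (btree_read_extent_buffer_pages()
 * in fs/btrfs/disk-io.c); the example_ helper below only illustrates the
 * basic calling pattern.
 */
static int example_read_eb(struct extent_io_tree *tree,
			   struct extent_buffer *eb,
			   get_extent_t *get_extent)
{
	int ret;

	/* mirror_num 0 lets the lower layers pick which copy to read */
	ret = read_extent_buffer_pages(tree, eb, 0, WAIT_COMPLETE,
				       get_extent, 0);
	if (ret)
		return ret;
	return extent_buffer_uptodate(eb) ? 0 : -EIO;
}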
4659 
4660 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
4661 			unsigned long start,
4662 			unsigned long len)
4663 {
4664 	size_t cur;
4665 	size_t offset;
4666 	struct page *page;
4667 	char *kaddr;
4668 	char *dst = (char *)dstv;
4669 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4670 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4671 
4672 	WARN_ON(start > eb->len);
4673 	WARN_ON(start + len > eb->start + eb->len);
4674 
4675 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4676 
4677 	while (len > 0) {
4678 		page = extent_buffer_page(eb, i);
4679 
4680 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4681 		kaddr = page_address(page);
4682 		memcpy(dst, kaddr + offset, cur);
4683 
4684 		dst += cur;
4685 		len -= cur;
4686 		offset = 0;
4687 		i++;
4688 	}
4689 }
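
/*
 * Sketch of the common pattern for pulling a fixed-size on-disk structure
 * out of an extent buffer; read_extent_buffer() copies piecewise, so the
 * range may freely straddle a page boundary.  The offset and structure
 * used here are purely illustrative.
 */
static void example_read_disk_key(struct extent_buffer *eb,
				  unsigned long offset,
				  struct btrfs_disk_key *key)
{
	read_extent_buffer(eb, key, offset, sizeof(*key));
}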
4690 
4691 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
4692 			       unsigned long min_len, char **map,
4693 			       unsigned long *map_start,
4694 			       unsigned long *map_len)
4695 {
4696 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
4697 	char *kaddr;
4698 	struct page *p;
4699 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4700 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4701 	unsigned long end_i = (start_offset + start + min_len - 1) >>
4702 		PAGE_CACHE_SHIFT;
4703 
4704 	if (i != end_i)
4705 		return -EINVAL;
4706 
4707 	if (i == 0) {
4708 		offset = start_offset;
4709 		*map_start = 0;
4710 	} else {
4711 		offset = 0;
4712 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
4713 	}
4714 
4715 	if (start + min_len > eb->len) {
4716 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
4717 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
4718 		       eb->len, start, min_len);
4719 		return -EINVAL;
4720 	}
4721 
4722 	p = extent_buffer_page(eb, i);
4723 	kaddr = page_address(p);
4724 	*map = kaddr + offset;
4725 	*map_len = PAGE_CACHE_SIZE - offset;
4726 	return 0;
4727 }
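
/*
 * Sketch of the intended use of map_private_extent_buffer(): take the
 * direct-pointer fast path when the requested range fits inside one page
 * and fall back to the copying helper when it crosses a page boundary
 * (the -EINVAL case above).  Reading a raw little-endian u64 here is
 * illustrative; the real users are the generated accessors in
 * fs/btrfs/struct-funcs.c.
 */
static u64 example_read_u64(struct extent_buffer *eb, unsigned long offset)
{
	unsigned long map_start;
	unsigned long map_len;
	char *kaddr;
	__le64 leval;

	if (map_private_extent_buffer(eb, offset, sizeof(leval), &kaddr,
				      &map_start, &map_len) == 0)
		/* the whole value lives in one page, read it in place */
		return le64_to_cpu(*(__le64 *)(kaddr + offset - map_start));

	/* the value straddles a page boundary, copy it out instead */
	read_extent_buffer(eb, &leval, offset, sizeof(leval));
	return le64_to_cpu(leval);
}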
4728 
4729 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
4730 			  unsigned long start,
4731 			  unsigned long len)
4732 {
4733 	size_t cur;
4734 	size_t offset;
4735 	struct page *page;
4736 	char *kaddr;
4737 	char *ptr = (char *)ptrv;
4738 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4739 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4740 	int ret = 0;
4741 
4742 	WARN_ON(start > eb->len);
4743 	WARN_ON(start + len > eb->start + eb->len);
4744 
4745 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4746 
4747 	while (len > 0) {
4748 		page = extent_buffer_page(eb, i);
4749 
4750 		cur = min(len, (PAGE_CACHE_SIZE - offset));
4751 
4752 		kaddr = page_address(page);
4753 		ret = memcmp(ptr, kaddr + offset, cur);
4754 		if (ret)
4755 			break;
4756 
4757 		ptr += cur;
4758 		len -= cur;
4759 		offset = 0;
4760 		i++;
4761 	}
4762 	return ret;
4763 }
4764 
4765 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
4766 			 unsigned long start, unsigned long len)
4767 {
4768 	size_t cur;
4769 	size_t offset;
4770 	struct page *page;
4771 	char *kaddr;
4772 	char *src = (char *)srcv;
4773 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4774 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4775 
4776 	WARN_ON(start > eb->len);
4777 	WARN_ON(start + len > eb->start + eb->len);
4778 
4779 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4780 
4781 	while (len > 0) {
4782 		page = extent_buffer_page(eb, i);
4783 		WARN_ON(!PageUptodate(page));
4784 
4785 		cur = min(len, PAGE_CACHE_SIZE - offset);
4786 		kaddr = page_address(page);
4787 		memcpy(kaddr + offset, src, cur);
4788 
4789 		src += cur;
4790 		len -= cur;
4791 		offset = 0;
4792 		i++;
4793 	}
4794 }
4795 
4796 void memset_extent_buffer(struct extent_buffer *eb, char c,
4797 			  unsigned long start, unsigned long len)
4798 {
4799 	size_t cur;
4800 	size_t offset;
4801 	struct page *page;
4802 	char *kaddr;
4803 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
4804 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
4805 
4806 	WARN_ON(start > eb->len);
4807 	WARN_ON(start + len > eb->start + eb->len);
4808 
4809 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
4810 
4811 	while (len > 0) {
4812 		page = extent_buffer_page(eb, i);
4813 		WARN_ON(!PageUptodate(page));
4814 
4815 		cur = min(len, PAGE_CACHE_SIZE - offset);
4816 		kaddr = page_address(page);
4817 		memset(kaddr + offset, c, cur);
4818 
4819 		len -= cur;
4820 		offset = 0;
4821 		i++;
4822 	}
4823 }
4824 
4825 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
4826 			unsigned long dst_offset, unsigned long src_offset,
4827 			unsigned long len)
4828 {
4829 	u64 dst_len = dst->len;
4830 	size_t cur;
4831 	size_t offset;
4832 	struct page *page;
4833 	char *kaddr;
4834 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4835 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4836 
4837 	WARN_ON(src->len != dst_len);
4838 
4839 	offset = (start_offset + dst_offset) &
4840 		((unsigned long)PAGE_CACHE_SIZE - 1);
4841 
4842 	while (len > 0) {
4843 		page = extent_buffer_page(dst, i);
4844 		WARN_ON(!PageUptodate(page));
4845 
4846 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
4847 
4848 		kaddr = page_address(page);
4849 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
4850 
4851 		src_offset += cur;
4852 		len -= cur;
4853 		offset = 0;
4854 		i++;
4855 	}
4856 }
4857 
4858 static void move_pages(struct page *dst_page, struct page *src_page,
4859 		       unsigned long dst_off, unsigned long src_off,
4860 		       unsigned long len)
4861 {
4862 	char *dst_kaddr = page_address(dst_page);
4863 	if (dst_page == src_page) {
4864 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
4865 	} else {
4866 		char *src_kaddr = page_address(src_page);
4867 		char *p = dst_kaddr + dst_off + len;
4868 		char *s = src_kaddr + src_off + len;
4869 
4870 		while (len--)
4871 			*--p = *--s;
4872 	}
4873 }
4874 
4875 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
4876 {
4877 	unsigned long distance = (src > dst) ? src - dst : dst - src;
4878 	return distance < len;
4879 }
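
/*
 * A concrete case of the test above: with src = 100, dst = 160 and
 * len = 80 the distance is 60, which is less than len, so the byte
 * ranges [100, 180) and [160, 240) share [160, 180) and copy_pages()
 * below has to fall back to memmove().
 */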
4880 
4881 static void copy_pages(struct page *dst_page, struct page *src_page,
4882 		       unsigned long dst_off, unsigned long src_off,
4883 		       unsigned long len)
4884 {
4885 	char *dst_kaddr = page_address(dst_page);
4886 	char *src_kaddr;
4887 	int must_memmove = 0;
4888 
4889 	if (dst_page != src_page) {
4890 		src_kaddr = page_address(src_page);
4891 	} else {
4892 		src_kaddr = dst_kaddr;
4893 		if (areas_overlap(src_off, dst_off, len))
4894 			must_memmove = 1;
4895 	}
4896 
4897 	if (must_memmove)
4898 		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
4899 	else
4900 		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
4901 }
4902 
4903 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4904 			   unsigned long src_offset, unsigned long len)
4905 {
4906 	size_t cur;
4907 	size_t dst_off_in_page;
4908 	size_t src_off_in_page;
4909 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4910 	unsigned long dst_i;
4911 	unsigned long src_i;
4912 
4913 	if (src_offset + len > dst->len) {
4914 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4915 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4916 		BUG_ON(1);
4917 	}
4918 	if (dst_offset + len > dst->len) {
4919 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4920 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4921 		BUG_ON(1);
4922 	}
4923 
4924 	while (len > 0) {
4925 		dst_off_in_page = (start_offset + dst_offset) &
4926 			((unsigned long)PAGE_CACHE_SIZE - 1);
4927 		src_off_in_page = (start_offset + src_offset) &
4928 			((unsigned long)PAGE_CACHE_SIZE - 1);
4929 
4930 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
4931 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
4932 
4933 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
4934 					       src_off_in_page));
4935 		cur = min_t(unsigned long, cur,
4936 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
4937 
4938 		copy_pages(extent_buffer_page(dst, dst_i),
4939 			   extent_buffer_page(dst, src_i),
4940 			   dst_off_in_page, src_off_in_page, cur);
4941 
4942 		src_offset += cur;
4943 		dst_offset += cur;
4944 		len -= cur;
4945 	}
4946 }
4947 
4948 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
4949 			   unsigned long src_offset, unsigned long len)
4950 {
4951 	size_t cur;
4952 	size_t dst_off_in_page;
4953 	size_t src_off_in_page;
4954 	unsigned long dst_end = dst_offset + len - 1;
4955 	unsigned long src_end = src_offset + len - 1;
4956 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
4957 	unsigned long dst_i;
4958 	unsigned long src_i;
4959 
4960 	if (src_offset + len > dst->len) {
4961 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
4962 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
4963 		BUG_ON(1);
4964 	}
4965 	if (dst_offset + len > dst->len) {
4966 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
4967 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
4968 		BUG_ON(1);
4969 	}
4970 	if (dst_offset < src_offset) {
4971 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
4972 		return;
4973 	}
4974 	while (len > 0) {
4975 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
4976 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
4977 
4978 		dst_off_in_page = (start_offset + dst_end) &
4979 			((unsigned long)PAGE_CACHE_SIZE - 1);
4980 		src_off_in_page = (start_offset + src_end) &
4981 			((unsigned long)PAGE_CACHE_SIZE - 1);
4982 
4983 		cur = min_t(unsigned long, len, src_off_in_page + 1);
4984 		cur = min(cur, dst_off_in_page + 1);
4985 		move_pages(extent_buffer_page(dst, dst_i),
4986 			   extent_buffer_page(dst, src_i),
4987 			   dst_off_in_page - cur + 1,
4988 			   src_off_in_page - cur + 1, cur);
4989 
4990 		dst_end -= cur;
4991 		src_end -= cur;
4992 		len -= cur;
4993 	}
4994 }
4995 
4996 int try_release_extent_buffer(struct page *page, gfp_t mask)
4997 {
4998 	struct extent_buffer *eb;
4999 
5000 	/*
5001 	 * We need to make sure nobody is attaching this page to an eb right
5002 	 * now.
5003 	 */
5004 	spin_lock(&page->mapping->private_lock);
5005 	if (!PagePrivate(page)) {
5006 		spin_unlock(&page->mapping->private_lock);
5007 		return 1;
5008 	}
5009 
5010 	eb = (struct extent_buffer *)page->private;
5011 	BUG_ON(!eb);
5012 
5013 	/*
5014 	 * This is a little awful but should be ok; we need to make sure that
5015 	 * the eb doesn't disappear out from under us while we're looking at
5016 	 * this page.
5017 	 */
5018 	spin_lock(&eb->refs_lock);
5019 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
5020 		spin_unlock(&eb->refs_lock);
5021 		spin_unlock(&page->mapping->private_lock);
5022 		return 0;
5023 	}
5024 	spin_unlock(&page->mapping->private_lock);
5025 
5026 	if ((mask & GFP_NOFS) == GFP_NOFS)
5027 		mask = GFP_NOFS;
5028 
5029 	/*
5030 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
5031 	 * so just return, this page will likely be freed soon anyway.
5032 	 */
5033 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
5034 		spin_unlock(&eb->refs_lock);
5035 		return 0;
5036 	}
5037 
5038 	return release_extent_buffer(eb, mask);
5039 }
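
/*
 * Sketch of the metadata ->releasepage hook expected to call the helper
 * above, along the lines of btree_releasepage() in fs/btrfs/disk-io.c:
 * dirty or in-flight pages are never released, everything else is handed
 * to try_release_extent_buffer().  The example_ name is illustrative only.
 */
static int example_btree_releasepage(struct page *page, gfp_t gfp_flags)
{
	if (PageWriteback(page) || PageDirty(page))
		return 0;

	return try_release_extent_buffer(page, gfp_flags);
}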
5040