xref: /openbmc/linux/fs/btrfs/extent_io.c (revision e8e0929d)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 
20 static struct kmem_cache *extent_state_cache;
21 static struct kmem_cache *extent_buffer_cache;
22 
23 static LIST_HEAD(buffers);
24 static LIST_HEAD(states);
25 
26 #define LEAK_DEBUG 0
27 #if LEAK_DEBUG
28 static DEFINE_SPINLOCK(leak_lock);
29 #endif
30 
31 #define BUFFER_LRU_MAX 64
32 
33 struct tree_entry {
34 	u64 start;
35 	u64 end;
36 	struct rb_node rb_node;
37 };
38 
39 struct extent_page_data {
40 	struct bio *bio;
41 	struct extent_io_tree *tree;
42 	get_extent_t *get_extent;
43 
44 	/* tells writepage not to lock the state bits for this range
45 	 * it still does the unlocking
46 	 */
47 	unsigned int extent_locked:1;
48 
49 	/* tells the submit_bio code to use a WRITE_SYNC */
50 	unsigned int sync_io:1;
51 };
52 
53 int __init extent_io_init(void)
54 {
55 	extent_state_cache = kmem_cache_create("extent_state",
56 			sizeof(struct extent_state), 0,
57 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
58 	if (!extent_state_cache)
59 		return -ENOMEM;
60 
61 	extent_buffer_cache = kmem_cache_create("extent_buffers",
62 			sizeof(struct extent_buffer), 0,
63 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
64 	if (!extent_buffer_cache)
65 		goto free_state_cache;
66 	return 0;
67 
68 free_state_cache:
69 	kmem_cache_destroy(extent_state_cache);
70 	return -ENOMEM;
71 }
72 
73 void extent_io_exit(void)
74 {
75 	struct extent_state *state;
76 	struct extent_buffer *eb;
77 
78 	while (!list_empty(&states)) {
79 		state = list_entry(states.next, struct extent_state, leak_list);
80 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
81 		       "state %lu in tree %p refs %d\n",
82 		       (unsigned long long)state->start,
83 		       (unsigned long long)state->end,
84 		       state->state, state->tree, atomic_read(&state->refs));
85 		list_del(&state->leak_list);
86 		kmem_cache_free(extent_state_cache, state);
87 
88 	}
89 
90 	while (!list_empty(&buffers)) {
91 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
92 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
93 		       "refs %d\n", (unsigned long long)eb->start,
94 		       eb->len, atomic_read(&eb->refs));
95 		list_del(&eb->leak_list);
96 		kmem_cache_free(extent_buffer_cache, eb);
97 	}
98 	if (extent_state_cache)
99 		kmem_cache_destroy(extent_state_cache);
100 	if (extent_buffer_cache)
101 		kmem_cache_destroy(extent_buffer_cache);
102 }
103 
104 void extent_io_tree_init(struct extent_io_tree *tree,
105 			  struct address_space *mapping, gfp_t mask)
106 {
107 	tree->state.rb_node = NULL;
108 	tree->buffer.rb_node = NULL;
109 	tree->ops = NULL;
110 	tree->dirty_bytes = 0;
111 	spin_lock_init(&tree->lock);
112 	spin_lock_init(&tree->buffer_lock);
113 	tree->mapping = mapping;
114 }
115 
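/*
 * Editor's illustrative sketch, not part of the original source: a typical
 * caller wires an io tree to an inode's mapping once at inode-setup time.
 * BTRFS_I() comes from btrfs_inode.h (included above); the io_tree field
 * name is an assumption based on struct btrfs_inode of this era.
 */
static void example_setup_io_tree(struct inode *inode)
{
	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
			    inode->i_mapping, GFP_NOFS);
}
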
116 static struct extent_state *alloc_extent_state(gfp_t mask)
117 {
118 	struct extent_state *state;
119 #if LEAK_DEBUG
120 	unsigned long flags;
121 #endif
122 
123 	state = kmem_cache_alloc(extent_state_cache, mask);
124 	if (!state)
125 		return state;
126 	state->state = 0;
127 	state->private = 0;
128 	state->tree = NULL;
129 #if LEAK_DEBUG
130 	spin_lock_irqsave(&leak_lock, flags);
131 	list_add(&state->leak_list, &states);
132 	spin_unlock_irqrestore(&leak_lock, flags);
133 #endif
134 	atomic_set(&state->refs, 1);
135 	init_waitqueue_head(&state->wq);
136 	return state;
137 }
138 
139 static void free_extent_state(struct extent_state *state)
140 {
141 	if (!state)
142 		return;
143 	if (atomic_dec_and_test(&state->refs)) {
144 #if LEAK_DEBUG
145 		unsigned long flags;
146 #endif
147 		WARN_ON(state->tree);
148 #if LEAK_DEBUG
149 		spin_lock_irqsave(&leak_lock, flags);
150 		list_del(&state->leak_list);
151 		spin_unlock_irqrestore(&leak_lock, flags);
152 #endif
153 		kmem_cache_free(extent_state_cache, state);
154 	}
155 }
156 
157 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
158 				   struct rb_node *node)
159 {
160 	struct rb_node **p = &root->rb_node;
161 	struct rb_node *parent = NULL;
162 	struct tree_entry *entry;
163 
164 	while (*p) {
165 		parent = *p;
166 		entry = rb_entry(parent, struct tree_entry, rb_node);
167 
168 		if (offset < entry->start)
169 			p = &(*p)->rb_left;
170 		else if (offset > entry->end)
171 			p = &(*p)->rb_right;
172 		else
173 			return parent;
174 	}
175 
176 	entry = rb_entry(node, struct tree_entry, rb_node);
177 	rb_link_node(node, parent, p);
178 	rb_insert_color(node, root);
179 	return NULL;
180 }
181 
182 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
183 				     struct rb_node **prev_ret,
184 				     struct rb_node **next_ret)
185 {
186 	struct rb_root *root = &tree->state;
187 	struct rb_node *n = root->rb_node;
188 	struct rb_node *prev = NULL;
189 	struct rb_node *orig_prev = NULL;
190 	struct tree_entry *entry;
191 	struct tree_entry *prev_entry = NULL;
192 
193 	while (n) {
194 		entry = rb_entry(n, struct tree_entry, rb_node);
195 		prev = n;
196 		prev_entry = entry;
197 
198 		if (offset < entry->start)
199 			n = n->rb_left;
200 		else if (offset > entry->end)
201 			n = n->rb_right;
202 		else
203 			return n;
204 	}
205 
206 	if (prev_ret) {
207 		orig_prev = prev;
208 		while (prev && offset > prev_entry->end) {
209 			prev = rb_next(prev);
210 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
211 		}
212 		*prev_ret = prev;
213 		prev = orig_prev;
214 	}
215 
216 	if (next_ret) {
217 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
218 		while (prev && offset < prev_entry->start) {
219 			prev = rb_prev(prev);
220 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		}
222 		*next_ret = prev;
223 	}
224 	return NULL;
225 }
226 
227 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
228 					  u64 offset)
229 {
230 	struct rb_node *prev = NULL;
231 	struct rb_node *ret;
232 
233 	ret = __etree_search(tree, offset, &prev, NULL);
234 	if (!ret)
235 		return prev;
236 	return ret;
237 }
238 
239 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
240 					  u64 offset, struct rb_node *node)
241 {
242 	struct rb_root *root = &tree->buffer;
243 	struct rb_node **p = &root->rb_node;
244 	struct rb_node *parent = NULL;
245 	struct extent_buffer *eb;
246 
247 	while (*p) {
248 		parent = *p;
249 		eb = rb_entry(parent, struct extent_buffer, rb_node);
250 
251 		if (offset < eb->start)
252 			p = &(*p)->rb_left;
253 		else if (offset > eb->start)
254 			p = &(*p)->rb_right;
255 		else
256 			return eb;
257 	}
258 
259 	rb_link_node(node, parent, p);
260 	rb_insert_color(node, root);
261 	return NULL;
262 }
263 
264 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
265 					   u64 offset)
266 {
267 	struct rb_root *root = &tree->buffer;
268 	struct rb_node *n = root->rb_node;
269 	struct extent_buffer *eb;
270 
271 	while (n) {
272 		eb = rb_entry(n, struct extent_buffer, rb_node);
273 		if (offset < eb->start)
274 			n = n->rb_left;
275 		else if (offset > eb->start)
276 			n = n->rb_right;
277 		else
278 			return eb;
279 	}
280 	return NULL;
281 }
282 
283 static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
284 		     struct extent_state *other)
285 {
286 	if (tree->ops && tree->ops->merge_extent_hook)
287 		tree->ops->merge_extent_hook(tree->mapping->host, new,
288 					     other);
289 }
290 
291 /*
292  * utility function to look for merge candidates inside a given range.
293  * Any extents with matching state are merged together into a single
294  * extent in the tree.  Extents with EXTENT_IOBITS or EXTENT_BOUNDARY set
295  * are not merged because the end_io handlers need to be able to do
296  * operations on them without sleeping (or doing allocations/splits).
297  *
298  * This should be called with the tree lock held.
299  */
300 static int merge_state(struct extent_io_tree *tree,
301 		       struct extent_state *state)
302 {
303 	struct extent_state *other;
304 	struct rb_node *other_node;
305 
306 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
307 		return 0;
308 
309 	other_node = rb_prev(&state->rb_node);
310 	if (other_node) {
311 		other = rb_entry(other_node, struct extent_state, rb_node);
312 		if (other->end == state->start - 1 &&
313 		    other->state == state->state) {
314 			merge_cb(tree, state, other);
315 			state->start = other->start;
316 			other->tree = NULL;
317 			rb_erase(&other->rb_node, &tree->state);
318 			free_extent_state(other);
319 		}
320 	}
321 	other_node = rb_next(&state->rb_node);
322 	if (other_node) {
323 		other = rb_entry(other_node, struct extent_state, rb_node);
324 		if (other->start == state->end + 1 &&
325 		    other->state == state->state) {
326 			merge_cb(tree, state, other);
327 			other->start = state->start;
328 			state->tree = NULL;
329 			rb_erase(&state->rb_node, &tree->state);
330 			free_extent_state(state);
331 			state = NULL;
332 		}
333 	}
334 
335 	return 0;
336 }
337 
338 static int set_state_cb(struct extent_io_tree *tree,
339 			 struct extent_state *state,
340 			 unsigned long bits)
341 {
342 	if (tree->ops && tree->ops->set_bit_hook) {
343 		return tree->ops->set_bit_hook(tree->mapping->host,
344 					       state->start, state->end,
345 					       state->state, bits);
346 	}
347 
348 	return 0;
349 }
350 
351 static void clear_state_cb(struct extent_io_tree *tree,
352 			   struct extent_state *state,
353 			   unsigned long bits)
354 {
355 	if (tree->ops && tree->ops->clear_bit_hook)
356 		tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
357 }
358 
359 /*
360  * insert an extent_state struct into the tree.  'bits' are set on the
361  * struct before it is inserted.
362  *
363  * This may return -EEXIST if the extent is already there, in which case the
364  * state struct is freed.
365  *
366  * The tree lock is not taken internally.  This is a utility function and
367  * probably isn't what you want to call (see set/clear_extent_bit).
368  */
369 static int insert_state(struct extent_io_tree *tree,
370 			struct extent_state *state, u64 start, u64 end,
371 			int bits)
372 {
373 	struct rb_node *node;
374 	int ret;
375 
376 	if (end < start) {
377 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
378 		       (unsigned long long)end,
379 		       (unsigned long long)start);
380 		WARN_ON(1);
381 	}
382 	state->start = start;
383 	state->end = end;
384 	ret = set_state_cb(tree, state, bits);
385 	if (ret)
386 		return ret;
387 
388 	if (bits & EXTENT_DIRTY)
389 		tree->dirty_bytes += end - start + 1;
390 	state->state |= bits;
391 	node = tree_insert(&tree->state, end, &state->rb_node);
392 	if (node) {
393 		struct extent_state *found;
394 		found = rb_entry(node, struct extent_state, rb_node);
395 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
396 		       "%llu %llu\n", (unsigned long long)found->start,
397 		       (unsigned long long)found->end,
398 		       (unsigned long long)start, (unsigned long long)end);
399 		free_extent_state(state);
400 		return -EEXIST;
401 	}
402 	state->tree = tree;
403 	merge_state(tree, state);
404 	return 0;
405 }
406 
407 static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
408 		     u64 split)
409 {
410 	if (tree->ops && tree->ops->split_extent_hook)
411 		return tree->ops->split_extent_hook(tree->mapping->host,
412 						    orig, split);
413 	return 0;
414 }
415 
416 /*
417  * split a given extent state struct in two, inserting the preallocated
418  * struct 'prealloc' as the newly created second half.  'split' indicates an
419  * offset inside 'orig' where it should be split.
420  *
421  * Before calling, the tree has 'orig' at [orig->start, orig->end].
422  * After calling, there are two extent state structs in the tree:
423  *
424  * prealloc: [orig->start, split - 1]
425  * orig:     [split, orig->end]
426  *
427  * The tree locks are not taken by this function. They need to be held
428  * by the caller.
429  */
430 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
431 		       struct extent_state *prealloc, u64 split)
432 {
433 	struct rb_node *node;
434 
435 	split_cb(tree, orig, split);
436 
437 	prealloc->start = orig->start;
438 	prealloc->end = split - 1;
439 	prealloc->state = orig->state;
440 	orig->start = split;
441 
442 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
443 	if (node) {
444 		free_extent_state(prealloc);
445 		return -EEXIST;
446 	}
447 	prealloc->tree = tree;
448 	return 0;
449 }
450 
451 /*
452  * utility function to clear some bits in an extent state struct.
453  * it will optionally wake up anyone waiting on this state (wake == 1), or
454  * forcibly remove the state from the tree (delete == 1).
455  *
456  * If no bits are set on the state struct after clearing things, the
457  * struct is freed and removed from the tree
458  */
459 static int clear_state_bit(struct extent_io_tree *tree,
460 			    struct extent_state *state, int bits, int wake,
461 			    int delete)
462 {
463 	int ret = state->state & bits;
464 
465 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 		u64 range = state->end - state->start + 1;
467 		WARN_ON(range > tree->dirty_bytes);
468 		tree->dirty_bytes -= range;
469 	}
470 	clear_state_cb(tree, state, bits);
471 	state->state &= ~bits;
472 	if (wake)
473 		wake_up(&state->wq);
474 	if (delete || state->state == 0) {
475 		if (state->tree) {
476 			clear_state_cb(tree, state, state->state);
477 			rb_erase(&state->rb_node, &tree->state);
478 			state->tree = NULL;
479 			free_extent_state(state);
480 		} else {
481 			WARN_ON(1);
482 		}
483 	} else {
484 		merge_state(tree, state);
485 	}
486 	return ret;
487 }
488 
489 /*
490  * clear some bits on a range in the tree.  This may require splitting
491  * or inserting elements in the tree, so the gfp mask is used to
492  * indicate which allocations or sleeping are allowed.
493  *
494  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
495  * the given range from the tree regardless of state (ie for truncate).
496  *
497  * the range [start, end] is inclusive.
498  *
499  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
500  * bits were already set, or zero if none of the bits were already set.
501  */
502 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
503 		     int bits, int wake, int delete,
504 		     struct extent_state **cached_state,
505 		     gfp_t mask)
506 {
507 	struct extent_state *state;
508 	struct extent_state *cached;
509 	struct extent_state *prealloc = NULL;
510 	struct rb_node *next_node;
511 	struct rb_node *node;
512 	u64 last_end;
513 	int err;
514 	int set = 0;
515 
516 again:
517 	if (!prealloc && (mask & __GFP_WAIT)) {
518 		prealloc = alloc_extent_state(mask);
519 		if (!prealloc)
520 			return -ENOMEM;
521 	}
522 
523 	spin_lock(&tree->lock);
524 	if (cached_state) {
525 		cached = *cached_state;
526 		*cached_state = NULL;
527 		cached_state = NULL;
528 		if (cached && cached->tree && cached->start == start) {
529 			atomic_dec(&cached->refs);
530 			state = cached;
531 			goto hit_next;
532 		}
533 		free_extent_state(cached);
534 	}
535 	/*
536 	 * this search will find the extents that end after
537 	 * our range starts
538 	 */
539 	node = tree_search(tree, start);
540 	if (!node)
541 		goto out;
542 	state = rb_entry(node, struct extent_state, rb_node);
543 hit_next:
544 	if (state->start > end)
545 		goto out;
546 	WARN_ON(state->end < start);
547 	last_end = state->end;
548 
549 	/*
550 	 *     | ---- desired range ---- |
551 	 *  | state | or
552 	 *  | ------------- state -------------- |
553 	 *
554 	 * We need to split the extent we found, and may flip
555 	 * bits on second half.
556 	 *
557 	 * If the extent we found extends past our range, we
558 	 * just split and search again.  It'll get split again
559 	 * the next time though.
560 	 *
561 	 * If the extent we found is inside our range, we clear
562 	 * the desired bit on it.
563 	 */
564 
565 	if (state->start < start) {
566 		if (!prealloc)
567 			prealloc = alloc_extent_state(GFP_ATOMIC);
568 		err = split_state(tree, state, prealloc, start);
569 		BUG_ON(err == -EEXIST);
570 		prealloc = NULL;
571 		if (err)
572 			goto out;
573 		if (state->end <= end) {
574 			set |= clear_state_bit(tree, state, bits, wake,
575 					       delete);
576 			if (last_end == (u64)-1)
577 				goto out;
578 			start = last_end + 1;
579 		}
580 		goto search_again;
581 	}
582 	/*
583 	 * | ---- desired range ---- |
584 	 *                        | state |
585 	 * We need to split the extent, and clear the bit
586 	 * on the first half
587 	 */
588 	if (state->start <= end && state->end > end) {
589 		if (!prealloc)
590 			prealloc = alloc_extent_state(GFP_ATOMIC);
591 		err = split_state(tree, state, prealloc, end + 1);
592 		BUG_ON(err == -EEXIST);
593 		if (wake)
594 			wake_up(&state->wq);
595 
596 		set |= clear_state_bit(tree, prealloc, bits, wake, delete);
597 
598 		prealloc = NULL;
599 		goto out;
600 	}
601 
602 	if (state->end < end && prealloc && !need_resched())
603 		next_node = rb_next(&state->rb_node);
604 	else
605 		next_node = NULL;
606 
607 	set |= clear_state_bit(tree, state, bits, wake, delete);
608 	if (last_end == (u64)-1)
609 		goto out;
610 	start = last_end + 1;
611 	if (start <= end && next_node) {
612 		state = rb_entry(next_node, struct extent_state,
613 				 rb_node);
614 		if (state->start == start)
615 			goto hit_next;
616 	}
617 	goto search_again;
618 
619 out:
620 	spin_unlock(&tree->lock);
621 	if (prealloc)
622 		free_extent_state(prealloc);
623 
624 	return set;
625 
626 search_again:
627 	if (start > end)
628 		goto out;
629 	spin_unlock(&tree->lock);
630 	if (mask & __GFP_WAIT)
631 		cond_resched();
632 	goto again;
633 }
634 
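/*
 * Editor's illustrative sketch, not part of the original source: a
 * truncate-style caller drops every bit it cares about in one call,
 * waking waiters (wake == 1) and removing the states outright
 * (delete == 1).  The bit names come from extent_io.h.
 */
static int example_drop_range(struct extent_io_tree *tree, u64 start, u64 end)
{
	return clear_extent_bit(tree, start, end,
				EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_LOCKED,
				1, 1, NULL, GFP_NOFS);
}
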
635 static int wait_on_state(struct extent_io_tree *tree,
636 			 struct extent_state *state)
637 		__releases(tree->lock)
638 		__acquires(tree->lock)
639 {
640 	DEFINE_WAIT(wait);
641 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
642 	spin_unlock(&tree->lock);
643 	schedule();
644 	spin_lock(&tree->lock);
645 	finish_wait(&state->wq, &wait);
646 	return 0;
647 }
648 
649 /*
650  * waits for one or more bits to clear on a range in the state tree.
651  * The range [start, end] is inclusive.
652  * The tree lock is taken by this function
653  */
654 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
655 {
656 	struct extent_state *state;
657 	struct rb_node *node;
658 
659 	spin_lock(&tree->lock);
660 again:
661 	while (1) {
662 		/*
663 		 * this search will find all the extents that end after
664 		 * our range starts
665 		 */
666 		node = tree_search(tree, start);
667 		if (!node)
668 			break;
669 
670 		state = rb_entry(node, struct extent_state, rb_node);
671 
672 		if (state->start > end)
673 			goto out;
674 
675 		if (state->state & bits) {
676 			start = state->start;
677 			atomic_inc(&state->refs);
678 			wait_on_state(tree, state);
679 			free_extent_state(state);
680 			goto again;
681 		}
682 		start = state->end + 1;
683 
684 		if (start > end)
685 			break;
686 
687 		if (need_resched()) {
688 			spin_unlock(&tree->lock);
689 			cond_resched();
690 			spin_lock(&tree->lock);
691 		}
692 	}
693 out:
694 	spin_unlock(&tree->lock);
695 	return 0;
696 }
697 
698 static int set_state_bits(struct extent_io_tree *tree,
699 			   struct extent_state *state,
700 			   int bits)
701 {
702 	int ret;
703 
704 	ret = set_state_cb(tree, state, bits);
705 	if (ret)
706 		return ret;
707 
708 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
709 		u64 range = state->end - state->start + 1;
710 		tree->dirty_bytes += range;
711 	}
712 	state->state |= bits;
713 
714 	return 0;
715 }
716 
717 static void cache_state(struct extent_state *state,
718 			struct extent_state **cached_ptr)
719 {
720 	if (cached_ptr && !(*cached_ptr)) {
721 		if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) {
722 			*cached_ptr = state;
723 			atomic_inc(&state->refs);
724 		}
725 	}
726 }
727 
728 /*
729  * set some bits on a range in the tree.  This may require allocations or
730  * sleeping, so the gfp mask is used to indicate what is allowed.
731  *
732  * If any of the exclusive bits are set, this will fail with -EEXIST if some
733  * part of the range already has the desired bits set.  The start of the
734  * existing range is returned in failed_start in this case.
735  *
736  * [start, end] is inclusive.  This takes the tree lock.
737  */
738 
739 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
740 			  int bits, int exclusive_bits, u64 *failed_start,
741 			  struct extent_state **cached_state,
742 			  gfp_t mask)
743 {
744 	struct extent_state *state;
745 	struct extent_state *prealloc = NULL;
746 	struct rb_node *node;
747 	int err = 0;
748 	u64 last_start;
749 	u64 last_end;
750 
751 again:
752 	if (!prealloc && (mask & __GFP_WAIT)) {
753 		prealloc = alloc_extent_state(mask);
754 		if (!prealloc)
755 			return -ENOMEM;
756 	}
757 
758 	spin_lock(&tree->lock);
759 	if (cached_state && *cached_state) {
760 		state = *cached_state;
761 		if (state->start == start && state->tree) {
762 			node = &state->rb_node;
763 			goto hit_next;
764 		}
765 	}
766 	/*
767 	 * this search will find all the extents that end after
768 	 * our range starts.
769 	 */
770 	node = tree_search(tree, start);
771 	if (!node) {
772 		err = insert_state(tree, prealloc, start, end, bits);
773 		prealloc = NULL;
774 		BUG_ON(err == -EEXIST);
775 		goto out;
776 	}
777 	state = rb_entry(node, struct extent_state, rb_node);
778 hit_next:
779 	last_start = state->start;
780 	last_end = state->end;
781 
782 	/*
783 	 * | ---- desired range ---- |
784 	 * | state |
785 	 *
786 	 * Just lock what we found and keep going
787 	 */
788 	if (state->start == start && state->end <= end) {
789 		struct rb_node *next_node;
790 		if (state->state & exclusive_bits) {
791 			*failed_start = state->start;
792 			err = -EEXIST;
793 			goto out;
794 		}
795 
796 		err = set_state_bits(tree, state, bits);
797 		if (err)
798 			goto out;
799 
800 		cache_state(state, cached_state);
801 		merge_state(tree, state);
802 		if (last_end == (u64)-1)
803 			goto out;
804 
805 		start = last_end + 1;
806 		if (start < end && prealloc && !need_resched()) {
807 			next_node = rb_next(node);
808 			if (next_node) {
809 				state = rb_entry(next_node, struct extent_state,
810 						 rb_node);
811 				if (state->start == start)
812 					goto hit_next;
813 			}
814 		}
815 		goto search_again;
816 	}
817 
818 	/*
819 	 *     | ---- desired range ---- |
820 	 * | state |
821 	 *   or
822 	 * | ------------- state -------------- |
823 	 *
824 	 * We need to split the extent we found, and may flip bits on
825 	 * second half.
826 	 *
827 	 * If the extent we found extends past our
828 	 * range, we just split and search again.  It'll get split
829 	 * again the next time though.
830 	 *
831 	 * If the extent we found is inside our range, we set the
832 	 * desired bit on it.
833 	 */
834 	if (state->start < start) {
835 		if (state->state & exclusive_bits) {
836 			*failed_start = start;
837 			err = -EEXIST;
838 			goto out;
839 		}
840 		err = split_state(tree, state, prealloc, start);
841 		BUG_ON(err == -EEXIST);
842 		prealloc = NULL;
843 		if (err)
844 			goto out;
845 		if (state->end <= end) {
846 			err = set_state_bits(tree, state, bits);
847 			if (err)
848 				goto out;
849 			cache_state(state, cached_state);
850 			merge_state(tree, state);
851 			if (last_end == (u64)-1)
852 				goto out;
853 			start = last_end + 1;
854 		}
855 		goto search_again;
856 	}
857 	/*
858 	 * | ---- desired range ---- |
859 	 *     | state | or               | state |
860 	 *
861 	 * There's a hole, we need to insert something in it and
862 	 * ignore the extent we found.
863 	 */
864 	if (state->start > start) {
865 		u64 this_end;
866 		if (end < last_start)
867 			this_end = end;
868 		else
869 			this_end = last_start - 1;
870 		err = insert_state(tree, prealloc, start, this_end,
871 				   bits);
872 		BUG_ON(err == -EEXIST);
873 		if (err) {
874 			prealloc = NULL;
875 			goto out;
876 		}
877 		cache_state(prealloc, cached_state);
878 		prealloc = NULL;
879 		start = this_end + 1;
880 		goto search_again;
881 	}
882 	/*
883 	 * | ---- desired range ---- |
884 	 *                        | state |
885 	 * We need to split the extent, and set the bit
886 	 * on the first half
887 	 */
888 	if (state->start <= end && state->end > end) {
889 		if (state->state & exclusive_bits) {
890 			*failed_start = start;
891 			err = -EEXIST;
892 			goto out;
893 		}
894 		err = split_state(tree, state, prealloc, end + 1);
895 		BUG_ON(err == -EEXIST);
896 
897 		err = set_state_bits(tree, prealloc, bits);
898 		if (err) {
899 			prealloc = NULL;
900 			goto out;
901 		}
902 		cache_state(prealloc, cached_state);
903 		merge_state(tree, prealloc);
904 		prealloc = NULL;
905 		goto out;
906 	}
907 
908 	goto search_again;
909 
910 out:
911 	spin_unlock(&tree->lock);
912 	if (prealloc)
913 		free_extent_state(prealloc);
914 
915 	return err;
916 
917 search_again:
918 	if (start > end)
919 		goto out;
920 	spin_unlock(&tree->lock);
921 	if (mask & __GFP_WAIT)
922 		cond_resched();
923 	goto again;
924 }
925 
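/*
 * Editor's illustrative sketch, not part of the original source: the
 * exclusive_bits/failed_start contract is what the locking wrappers further
 * down (try_lock_extent, lock_extent_bits) rely on.  Passing EXTENT_LOCKED
 * as both the bit to set and the exclusive bit makes the call fail with
 * -EEXIST and report where the conflict begins instead of stacking locks.
 */
static int example_try_lock_range(struct extent_io_tree *tree, u64 start,
				  u64 end, u64 *conflict_start)
{
	return set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
			      conflict_start, NULL, GFP_NOFS);
}
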
926 /* wrappers around set/clear extent bit */
927 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
928 		     gfp_t mask)
929 {
930 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
931 			      NULL, mask);
932 }
933 
934 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
935 		    int bits, gfp_t mask)
936 {
937 	return set_extent_bit(tree, start, end, bits, 0, NULL,
938 			      NULL, mask);
939 }
940 
941 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
942 		      int bits, gfp_t mask)
943 {
944 	return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
945 }
946 
947 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
948 		     gfp_t mask)
949 {
950 	return set_extent_bit(tree, start, end,
951 			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
952 			      0, NULL, NULL, mask);
953 }
954 
955 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
956 		       gfp_t mask)
957 {
958 	return clear_extent_bit(tree, start, end,
959 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
960 				NULL, mask);
961 }
962 
963 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
964 		     gfp_t mask)
965 {
966 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
967 			      NULL, mask);
968 }
969 
970 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
971 		       gfp_t mask)
972 {
973 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
974 				NULL, mask);
975 }
976 
977 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
978 			gfp_t mask)
979 {
980 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
981 			      NULL, mask);
982 }
983 
984 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
985 				 u64 end, gfp_t mask)
986 {
987 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
988 				NULL, mask);
989 }
990 
991 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
992 {
993 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
994 }
995 
996 /*
997  * either insert or lock the state struct between start and end.  Use the gfp
998  * mask to tell us if waiting is desired.
999  */
1000 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1001 		     int bits, struct extent_state **cached_state, gfp_t mask)
1002 {
1003 	int err;
1004 	u64 failed_start;
1005 	while (1) {
1006 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1007 				     EXTENT_LOCKED, &failed_start,
1008 				     cached_state, mask);
1009 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
1010 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1011 			start = failed_start;
1012 		} else {
1013 			break;
1014 		}
1015 		WARN_ON(start > end);
1016 	}
1017 	return err;
1018 }
1019 
1020 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1021 {
1022 	return lock_extent_bits(tree, start, end, 0, NULL, mask);
1023 }
1024 
1025 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1026 		    gfp_t mask)
1027 {
1028 	int err;
1029 	u64 failed_start;
1030 
1031 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1032 			     &failed_start, NULL, mask);
1033 	if (err == -EEXIST) {
1034 		if (failed_start > start)
1035 			clear_extent_bit(tree, start, failed_start - 1,
1036 					 EXTENT_LOCKED, 1, 0, NULL, mask);
1037 		return 0;
1038 	}
1039 	return 1;
1040 }
1041 
1042 int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1043 			 struct extent_state **cached, gfp_t mask)
1044 {
1045 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
1046 				mask);
1047 }
1048 
1049 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
1050 		  gfp_t mask)
1051 {
1052 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1053 				mask);
1054 }
1055 
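/*
 * Editor's illustrative sketch, not part of the original source: the cached
 * state returned by lock_extent_bits() can be handed back on unlock to skip
 * the second tree search.
 */
static int example_with_range_locked(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	struct extent_state *cached = NULL;
	int err;

	err = lock_extent_bits(tree, start, end, 0, &cached, GFP_NOFS);
	if (err)
		return err;

	/* ... work on [start, end] while it is locked ... */

	return unlock_extent_cached(tree, start, end, &cached, GFP_NOFS);
}
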
1056 /*
1057  * helper function to set pages and extents in the tree dirty
1058  */
1059 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
1060 {
1061 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1062 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1063 	struct page *page;
1064 
1065 	while (index <= end_index) {
1066 		page = find_get_page(tree->mapping, index);
1067 		BUG_ON(!page);
1068 		__set_page_dirty_nobuffers(page);
1069 		page_cache_release(page);
1070 		index++;
1071 	}
1072 	return 0;
1073 }
1074 
1075 /*
1076  * helper function to set both pages and extents in the tree writeback
1077  */
1078 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1079 {
1080 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1081 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1082 	struct page *page;
1083 
1084 	while (index <= end_index) {
1085 		page = find_get_page(tree->mapping, index);
1086 		BUG_ON(!page);
1087 		set_page_writeback(page);
1088 		page_cache_release(page);
1089 		index++;
1090 	}
1091 	return 0;
1092 }
1093 
1094 /*
1095  * find the first offset in the io tree with 'bits' set. zero is
1096  * returned if we find something, and *start_ret and *end_ret are
1097  * set to reflect the state struct that was found.
1098  *
1099  * If nothing was found, 1 is returned, < 0 on error
1100  */
1101 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1102 			  u64 *start_ret, u64 *end_ret, int bits)
1103 {
1104 	struct rb_node *node;
1105 	struct extent_state *state;
1106 	int ret = 1;
1107 
1108 	spin_lock(&tree->lock);
1109 	/*
1110 	 * this search will find all the extents that end after
1111 	 * our range starts.
1112 	 */
1113 	node = tree_search(tree, start);
1114 	if (!node)
1115 		goto out;
1116 
1117 	while (1) {
1118 		state = rb_entry(node, struct extent_state, rb_node);
1119 		if (state->end >= start && (state->state & bits)) {
1120 			*start_ret = state->start;
1121 			*end_ret = state->end;
1122 			ret = 0;
1123 			break;
1124 		}
1125 		node = rb_next(node);
1126 		if (!node)
1127 			break;
1128 	}
1129 out:
1130 	spin_unlock(&tree->lock);
1131 	return ret;
1132 }
1133 
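/*
 * Editor's illustrative sketch, not part of the original source: walking
 * every dirty range in a tree.  A return of 0 means a range was found and
 * *start_ret/*end_ret describe it; 1 means nothing is left.
 */
static void example_walk_dirty_ranges(struct extent_io_tree *tree)
{
	u64 start = 0;
	u64 end;

	while (!find_first_extent_bit(tree, start, &start, &end,
				      EXTENT_DIRTY)) {
		/* [start, end] is a dirty extent; process it here */
		if (end == (u64)-1)
			break;
		start = end + 1;
	}
}
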
1134 /* find the first state struct with 'bits' set after 'start', and
1135  * return it.  tree->lock must be held.  NULL will be returned if
1136  * nothing was found after 'start'
1137  */
1138 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1139 						 u64 start, int bits)
1140 {
1141 	struct rb_node *node;
1142 	struct extent_state *state;
1143 
1144 	/*
1145 	 * this search will find all the extents that end after
1146 	 * our range starts.
1147 	 */
1148 	node = tree_search(tree, start);
1149 	if (!node)
1150 		goto out;
1151 
1152 	while (1) {
1153 		state = rb_entry(node, struct extent_state, rb_node);
1154 		if (state->end >= start && (state->state & bits))
1155 			return state;
1156 
1157 		node = rb_next(node);
1158 		if (!node)
1159 			break;
1160 	}
1161 out:
1162 	return NULL;
1163 }
1164 
1165 /*
1166  * find a contiguous range of bytes in the file marked as delalloc, not
1167  * more than 'max_bytes'.  start and end are used to return the range.
1168  *
1169  * 1 is returned if we find something, 0 if nothing was in the tree
1170  */
1171 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1172 					u64 *start, u64 *end, u64 max_bytes)
1173 {
1174 	struct rb_node *node;
1175 	struct extent_state *state;
1176 	u64 cur_start = *start;
1177 	u64 found = 0;
1178 	u64 total_bytes = 0;
1179 
1180 	spin_lock(&tree->lock);
1181 
1182 	/*
1183 	 * this search will find all the extents that end after
1184 	 * our range starts.
1185 	 */
1186 	node = tree_search(tree, cur_start);
1187 	if (!node) {
1188 		if (!found)
1189 			*end = (u64)-1;
1190 		goto out;
1191 	}
1192 
1193 	while (1) {
1194 		state = rb_entry(node, struct extent_state, rb_node);
1195 		if (found && (state->start != cur_start ||
1196 			      (state->state & EXTENT_BOUNDARY))) {
1197 			goto out;
1198 		}
1199 		if (!(state->state & EXTENT_DELALLOC)) {
1200 			if (!found)
1201 				*end = state->end;
1202 			goto out;
1203 		}
1204 		if (!found)
1205 			*start = state->start;
1206 		found++;
1207 		*end = state->end;
1208 		cur_start = state->end + 1;
1209 		node = rb_next(node);
1210 		if (!node)
1211 			break;
1212 		total_bytes += state->end - state->start + 1;
1213 		if (total_bytes >= max_bytes)
1214 			break;
1215 	}
1216 out:
1217 	spin_unlock(&tree->lock);
1218 	return found;
1219 }
1220 
1221 static noinline int __unlock_for_delalloc(struct inode *inode,
1222 					  struct page *locked_page,
1223 					  u64 start, u64 end)
1224 {
1225 	int ret;
1226 	struct page *pages[16];
1227 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1228 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1229 	unsigned long nr_pages = end_index - index + 1;
1230 	int i;
1231 
1232 	if (index == locked_page->index && end_index == index)
1233 		return 0;
1234 
1235 	while (nr_pages > 0) {
1236 		ret = find_get_pages_contig(inode->i_mapping, index,
1237 				     min_t(unsigned long, nr_pages,
1238 				     ARRAY_SIZE(pages)), pages);
1239 		for (i = 0; i < ret; i++) {
1240 			if (pages[i] != locked_page)
1241 				unlock_page(pages[i]);
1242 			page_cache_release(pages[i]);
1243 		}
1244 		nr_pages -= ret;
1245 		index += ret;
1246 		cond_resched();
1247 	}
1248 	return 0;
1249 }
1250 
1251 static noinline int lock_delalloc_pages(struct inode *inode,
1252 					struct page *locked_page,
1253 					u64 delalloc_start,
1254 					u64 delalloc_end)
1255 {
1256 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1257 	unsigned long start_index = index;
1258 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1259 	unsigned long pages_locked = 0;
1260 	struct page *pages[16];
1261 	unsigned long nrpages;
1262 	int ret;
1263 	int i;
1264 
1265 	/* the caller is responsible for locking the start index */
1266 	if (index == locked_page->index && index == end_index)
1267 		return 0;
1268 
1269 	/* skip the page at the start index */
1270 	nrpages = end_index - index + 1;
1271 	while (nrpages > 0) {
1272 		ret = find_get_pages_contig(inode->i_mapping, index,
1273 				     min_t(unsigned long,
1274 				     nrpages, ARRAY_SIZE(pages)), pages);
1275 		if (ret == 0) {
1276 			ret = -EAGAIN;
1277 			goto done;
1278 		}
1279 		/* now we have an array of pages, lock them all */
1280 		for (i = 0; i < ret; i++) {
1281 			/*
1282 			 * the caller is taking responsibility for
1283 			 * locked_page
1284 			 */
1285 			if (pages[i] != locked_page) {
1286 				lock_page(pages[i]);
1287 				if (!PageDirty(pages[i]) ||
1288 				    pages[i]->mapping != inode->i_mapping) {
1289 					ret = -EAGAIN;
1290 					unlock_page(pages[i]);
1291 					page_cache_release(pages[i]);
1292 					goto done;
1293 				}
1294 			}
1295 			page_cache_release(pages[i]);
1296 			pages_locked++;
1297 		}
1298 		nrpages -= ret;
1299 		index += ret;
1300 		cond_resched();
1301 	}
1302 	ret = 0;
1303 done:
1304 	if (ret && pages_locked) {
1305 		__unlock_for_delalloc(inode, locked_page,
1306 			      delalloc_start,
1307 			      ((u64)(start_index + pages_locked - 1)) <<
1308 			      PAGE_CACHE_SHIFT);
1309 	}
1310 	return ret;
1311 }
1312 
1313 /*
1314  * find and lock a contiguous range of bytes in the file marked as delalloc,
1315  * not more than 'max_bytes'.  start and end are used to return the range.
1316  *
1317  * 1 is returned if we find something, 0 if nothing was in the tree
1318  */
1319 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1320 					     struct extent_io_tree *tree,
1321 					     struct page *locked_page,
1322 					     u64 *start, u64 *end,
1323 					     u64 max_bytes)
1324 {
1325 	u64 delalloc_start;
1326 	u64 delalloc_end;
1327 	u64 found;
1328 	struct extent_state *cached_state = NULL;
1329 	int ret;
1330 	int loops = 0;
1331 
1332 again:
1333 	/* step one, find a bunch of delalloc bytes starting at start */
1334 	delalloc_start = *start;
1335 	delalloc_end = 0;
1336 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1337 				    max_bytes);
1338 	if (!found || delalloc_end <= *start) {
1339 		*start = delalloc_start;
1340 		*end = delalloc_end;
1341 		return found;
1342 	}
1343 
1344 	/*
1345 	 * start comes from the offset of locked_page.  We have to lock
1346 	 * pages in order, so we can't process delalloc bytes before
1347 	 * locked_page
1348 	 */
1349 	if (delalloc_start < *start)
1350 		delalloc_start = *start;
1351 
1352 	/*
1353 	 * make sure to limit the number of pages we try to lock down
1354 	 * if we're looping.
1355 	 */
1356 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1357 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1358 
1359 	/* step two, lock all the pages after the page that has start */
1360 	ret = lock_delalloc_pages(inode, locked_page,
1361 				  delalloc_start, delalloc_end);
1362 	if (ret == -EAGAIN) {
1363 		/* some of the pages are gone, let's avoid looping by
1364 		 * shortening the size of the delalloc range we're searching
1365 		 */
1366 		free_extent_state(cached_state);
1367 		if (!loops) {
1368 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1369 			max_bytes = PAGE_CACHE_SIZE - offset;
1370 			loops = 1;
1371 			goto again;
1372 		} else {
1373 			found = 0;
1374 			goto out_failed;
1375 		}
1376 	}
1377 	BUG_ON(ret);
1378 
1379 	/* step three, lock the state bits for the whole range */
1380 	lock_extent_bits(tree, delalloc_start, delalloc_end,
1381 			 0, &cached_state, GFP_NOFS);
1382 
1383 	/* then test to make sure it is all still delalloc */
1384 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1385 			     EXTENT_DELALLOC, 1, cached_state);
1386 	if (!ret) {
1387 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
1388 				     &cached_state, GFP_NOFS);
1389 		__unlock_for_delalloc(inode, locked_page,
1390 			      delalloc_start, delalloc_end);
1391 		cond_resched();
1392 		goto again;
1393 	}
1394 	free_extent_state(cached_state);
1395 	*start = delalloc_start;
1396 	*end = delalloc_end;
1397 out_failed:
1398 	return found;
1399 }
1400 
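/*
 * Editor's illustrative sketch, not part of the original source: roughly how
 * the writepage path consumes this helper -- keep asking for delalloc ranges
 * covering the page until nothing more is found, handling each range while
 * its pages and state bits are locked.  The 128MB cap and the "run the
 * fill_delalloc hook here" step mirror the real caller but are assumptions
 * in this sketch.
 */
static void example_writepage_delalloc(struct inode *inode,
				       struct extent_io_tree *tree,
				       struct page *locked_page,
				       u64 start, u64 page_end)
{
	u64 delalloc_start = start;
	u64 delalloc_end = 0;

	while (delalloc_end < page_end) {
		u64 found = find_lock_delalloc_range(inode, tree, locked_page,
						     &delalloc_start,
						     &delalloc_end,
						     128 * 1024 * 1024);
		if (!found) {
			delalloc_start = delalloc_end + 1;
			continue;
		}
		/* the range is locked here; the real caller runs its
		 * fill_delalloc hook and starts writeback for it */
		delalloc_start = delalloc_end + 1;
	}
}
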
1401 int extent_clear_unlock_delalloc(struct inode *inode,
1402 				struct extent_io_tree *tree,
1403 				u64 start, u64 end, struct page *locked_page,
1404 				int unlock_pages,
1405 				int clear_unlock,
1406 				int clear_delalloc, int clear_dirty,
1407 				int set_writeback,
1408 				int end_writeback,
1409 				int set_private2)
1410 {
1411 	int ret;
1412 	struct page *pages[16];
1413 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1414 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1415 	unsigned long nr_pages = end_index - index + 1;
1416 	int i;
1417 	int clear_bits = 0;
1418 
1419 	if (clear_unlock)
1420 		clear_bits |= EXTENT_LOCKED;
1421 	if (clear_dirty)
1422 		clear_bits |= EXTENT_DIRTY;
1423 
1424 	if (clear_delalloc)
1425 		clear_bits |= EXTENT_DELALLOC;
1426 
1427 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1428 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback ||
1429 	      set_private2))
1430 		return 0;
1431 
1432 	while (nr_pages > 0) {
1433 		ret = find_get_pages_contig(inode->i_mapping, index,
1434 				     min_t(unsigned long,
1435 				     nr_pages, ARRAY_SIZE(pages)), pages);
1436 		for (i = 0; i < ret; i++) {
1437 
1438 			if (set_private2)
1439 				SetPagePrivate2(pages[i]);
1440 
1441 			if (pages[i] == locked_page) {
1442 				page_cache_release(pages[i]);
1443 				continue;
1444 			}
1445 			if (clear_dirty)
1446 				clear_page_dirty_for_io(pages[i]);
1447 			if (set_writeback)
1448 				set_page_writeback(pages[i]);
1449 			if (end_writeback)
1450 				end_page_writeback(pages[i]);
1451 			if (unlock_pages)
1452 				unlock_page(pages[i]);
1453 			page_cache_release(pages[i]);
1454 		}
1455 		nr_pages -= ret;
1456 		index += ret;
1457 		cond_resched();
1458 	}
1459 	return 0;
1460 }
1461 
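/*
 * Editor's illustrative sketch, not part of the original source: the long
 * run of int flags above is easy to misread, so spell one common combination
 * out -- after setting up a delalloc writeout, unlock the pages and state,
 * clear delalloc/dirty, and mark the pages writeback.
 */
static void example_finish_delalloc(struct inode *inode,
				    struct extent_io_tree *tree,
				    u64 start, u64 end,
				    struct page *locked_page)
{
	extent_clear_unlock_delalloc(inode, tree, start, end, locked_page,
				     1,  /* unlock_pages */
				     1,  /* clear_unlock (EXTENT_LOCKED) */
				     1,  /* clear_delalloc */
				     1,  /* clear_dirty */
				     1,  /* set_writeback */
				     0,  /* end_writeback */
				     0); /* set_private2 */
}
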
1462 /*
1463  * count the number of bytes in the tree that have a given bit(s)
1464  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1465  * cached.  The total number of bytes found is returned.
1466  */
1467 u64 count_range_bits(struct extent_io_tree *tree,
1468 		     u64 *start, u64 search_end, u64 max_bytes,
1469 		     unsigned long bits)
1470 {
1471 	struct rb_node *node;
1472 	struct extent_state *state;
1473 	u64 cur_start = *start;
1474 	u64 total_bytes = 0;
1475 	int found = 0;
1476 
1477 	if (search_end <= cur_start) {
1478 		WARN_ON(1);
1479 		return 0;
1480 	}
1481 
1482 	spin_lock(&tree->lock);
1483 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1484 		total_bytes = tree->dirty_bytes;
1485 		goto out;
1486 	}
1487 	/*
1488 	 * this search will find all the extents that end after
1489 	 * our range starts.
1490 	 */
1491 	node = tree_search(tree, cur_start);
1492 	if (!node)
1493 		goto out;
1494 
1495 	while (1) {
1496 		state = rb_entry(node, struct extent_state, rb_node);
1497 		if (state->start > search_end)
1498 			break;
1499 		if (state->end >= cur_start && (state->state & bits)) {
1500 			total_bytes += min(search_end, state->end) + 1 -
1501 				       max(cur_start, state->start);
1502 			if (total_bytes >= max_bytes)
1503 				break;
1504 			if (!found) {
1505 				*start = state->start;
1506 				found = 1;
1507 			}
1508 		}
1509 		node = rb_next(node);
1510 		if (!node)
1511 			break;
1512 	}
1513 out:
1514 	spin_unlock(&tree->lock);
1515 	return total_bytes;
1516 }
1517 
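/*
 * Editor's illustrative sketch, not part of the original source: counting
 * the delalloc bytes at or after 'offset'.  Note that *start is an in/out
 * parameter; it is advanced to the start of the first matching extent.
 */
static u64 example_count_delalloc(struct extent_io_tree *tree, u64 offset)
{
	u64 start = offset;

	return count_range_bits(tree, &start, (u64)-1, (u64)-1,
				EXTENT_DELALLOC);
}
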
1518 /*
1519  * set the private field for a given byte offset in the tree.  If there isn't
1520  * an extent_state starting there already, nothing is set and -ENOENT is returned.
1521  */
1522 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1523 {
1524 	struct rb_node *node;
1525 	struct extent_state *state;
1526 	int ret = 0;
1527 
1528 	spin_lock(&tree->lock);
1529 	/*
1530 	 * this search will find all the extents that end after
1531 	 * our range starts.
1532 	 */
1533 	node = tree_search(tree, start);
1534 	if (!node) {
1535 		ret = -ENOENT;
1536 		goto out;
1537 	}
1538 	state = rb_entry(node, struct extent_state, rb_node);
1539 	if (state->start != start) {
1540 		ret = -ENOENT;
1541 		goto out;
1542 	}
1543 	state->private = private;
1544 out:
1545 	spin_unlock(&tree->lock);
1546 	return ret;
1547 }
1548 
1549 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1550 {
1551 	struct rb_node *node;
1552 	struct extent_state *state;
1553 	int ret = 0;
1554 
1555 	spin_lock(&tree->lock);
1556 	/*
1557 	 * this search will find all the extents that end after
1558 	 * our range starts.
1559 	 */
1560 	node = tree_search(tree, start);
1561 	if (!node) {
1562 		ret = -ENOENT;
1563 		goto out;
1564 	}
1565 	state = rb_entry(node, struct extent_state, rb_node);
1566 	if (state->start != start) {
1567 		ret = -ENOENT;
1568 		goto out;
1569 	}
1570 	*private = state->private;
1571 out:
1572 	spin_unlock(&tree->lock);
1573 	return ret;
1574 }
1575 
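/*
 * Editor's illustrative sketch, not part of the original source: the private
 * field gives callers one u64 per extent_state, keyed by the state's start
 * offset (btrfs uses it, for example, to remember the expected checksum of a
 * block while a read is in flight).  Both calls return -ENOENT if no state
 * starts exactly at 'start'.
 */
static int example_stash_value(struct extent_io_tree *tree, u64 start,
			       u64 value, u64 *readback)
{
	int ret;

	ret = set_state_private(tree, start, value);
	if (ret)
		return ret;
	return get_state_private(tree, start, readback);
}
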
1576 /*
1577  * searches a range in the state tree for a given mask.
1578  * If 'filled' == 1, this returns 1 only if every extent in the tree
1579  * has the bits set.  Otherwise, 1 is returned if any bit in the
1580  * range is found set.
1581  */
1582 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1583 		   int bits, int filled, struct extent_state *cached)
1584 {
1585 	struct extent_state *state = NULL;
1586 	struct rb_node *node;
1587 	int bitset = 0;
1588 
1589 	spin_lock(&tree->lock);
1590 	if (cached && cached->tree && cached->start == start)
1591 		node = &cached->rb_node;
1592 	else
1593 		node = tree_search(tree, start);
1594 	while (node && start <= end) {
1595 		state = rb_entry(node, struct extent_state, rb_node);
1596 
1597 		if (filled && state->start > start) {
1598 			bitset = 0;
1599 			break;
1600 		}
1601 
1602 		if (state->start > end)
1603 			break;
1604 
1605 		if (state->state & bits) {
1606 			bitset = 1;
1607 			if (!filled)
1608 				break;
1609 		} else if (filled) {
1610 			bitset = 0;
1611 			break;
1612 		}
1613 
1614 		if (state->end == (u64)-1)
1615 			break;
1616 
1617 		start = state->end + 1;
1618 		if (start > end)
1619 			break;
1620 		node = rb_next(node);
1621 		if (!node) {
1622 			if (filled)
1623 				bitset = 0;
1624 			break;
1625 		}
1626 	}
1627 	spin_unlock(&tree->lock);
1628 	return bitset;
1629 }
1630 
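/*
 * Editor's illustrative sketch, not part of the original source: the
 * 'filled' argument is the easy thing to get backwards.  filled == 1 asks
 * "does every byte of the range have the bit?", filled == 0 asks "does any
 * byte of the range have the bit?".  The page helpers just below use
 * exactly these two forms.
 */
static int example_range_is_uptodate(struct extent_io_tree *tree,
				     u64 start, u64 end)
{
	/* filled == 1: every extent in [start, end] must have the bit */
	return test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
}

static int example_range_has_lock(struct extent_io_tree *tree,
				  u64 start, u64 end)
{
	/* filled == 0: any extent in [start, end] with the bit is enough */
	return test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL);
}
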
1631 /*
1632  * helper function to set a given page up to date if all the
1633  * extents in the tree for that page are up to date
1634  */
1635 static int check_page_uptodate(struct extent_io_tree *tree,
1636 			       struct page *page)
1637 {
1638 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1639 	u64 end = start + PAGE_CACHE_SIZE - 1;
1640 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL))
1641 		SetPageUptodate(page);
1642 	return 0;
1643 }
1644 
1645 /*
1646  * helper function to unlock a page if all the extents in the tree
1647  * for that page are unlocked
1648  */
1649 static int check_page_locked(struct extent_io_tree *tree,
1650 			     struct page *page)
1651 {
1652 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1653 	u64 end = start + PAGE_CACHE_SIZE - 1;
1654 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL))
1655 		unlock_page(page);
1656 	return 0;
1657 }
1658 
1659 /*
1660  * helper function to end page writeback if all the extents
1661  * in the tree for that page are done with writeback
1662  */
1663 static int check_page_writeback(struct extent_io_tree *tree,
1664 			     struct page *page)
1665 {
1666 	end_page_writeback(page);
1667 	return 0;
1668 }
1669 
1670 /* lots and lots of room for performance fixes in the end_bio funcs */
1671 
1672 /*
1673  * after a writepage IO is done, we need to:
1674  * clear the uptodate bits on error
1675  * clear the writeback bits in the extent tree for this IO
1676  * end_page_writeback if the page has no more pending IO
1677  *
1678  * Scheduling is not allowed, so the extent state tree is expected
1679  * to have one and only one object corresponding to this IO.
1680  */
1681 static void end_bio_extent_writepage(struct bio *bio, int err)
1682 {
1683 	int uptodate = err == 0;
1684 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1685 	struct extent_io_tree *tree;
1686 	u64 start;
1687 	u64 end;
1688 	int whole_page;
1689 	int ret;
1690 
1691 	do {
1692 		struct page *page = bvec->bv_page;
1693 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1694 
1695 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1696 			 bvec->bv_offset;
1697 		end = start + bvec->bv_len - 1;
1698 
1699 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1700 			whole_page = 1;
1701 		else
1702 			whole_page = 0;
1703 
1704 		if (--bvec >= bio->bi_io_vec)
1705 			prefetchw(&bvec->bv_page->flags);
1706 		if (tree->ops && tree->ops->writepage_end_io_hook) {
1707 			ret = tree->ops->writepage_end_io_hook(page, start,
1708 						       end, NULL, uptodate);
1709 			if (ret)
1710 				uptodate = 0;
1711 		}
1712 
1713 		if (!uptodate && tree->ops &&
1714 		    tree->ops->writepage_io_failed_hook) {
1715 			ret = tree->ops->writepage_io_failed_hook(bio, page,
1716 							 start, end, NULL);
1717 			if (ret == 0) {
1718 				uptodate = (err == 0);
1719 				continue;
1720 			}
1721 		}
1722 
1723 		if (!uptodate) {
1724 			clear_extent_uptodate(tree, start, end, GFP_NOFS);
1725 			ClearPageUptodate(page);
1726 			SetPageError(page);
1727 		}
1728 
1729 		if (whole_page)
1730 			end_page_writeback(page);
1731 		else
1732 			check_page_writeback(tree, page);
1733 	} while (bvec >= bio->bi_io_vec);
1734 
1735 	bio_put(bio);
1736 }
1737 
1738 /*
1739  * after a readpage IO is done, we need to:
1740  * clear the uptodate bits on error
1741  * set the uptodate bits if things worked
1742  * set the page up to date if all extents in the tree are uptodate
1743  * clear the lock bit in the extent tree
1744  * unlock the page if there are no other extents locked for it
1745  *
1746  * Scheduling is not allowed, so the extent state tree is expected
1747  * to have one and only one object corresponding to this IO.
1748  */
1749 static void end_bio_extent_readpage(struct bio *bio, int err)
1750 {
1751 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1752 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1753 	struct extent_io_tree *tree;
1754 	u64 start;
1755 	u64 end;
1756 	int whole_page;
1757 	int ret;
1758 
1759 	if (err)
1760 		uptodate = 0;
1761 
1762 	do {
1763 		struct page *page = bvec->bv_page;
1764 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1765 
1766 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1767 			bvec->bv_offset;
1768 		end = start + bvec->bv_len - 1;
1769 
1770 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1771 			whole_page = 1;
1772 		else
1773 			whole_page = 0;
1774 
1775 		if (--bvec >= bio->bi_io_vec)
1776 			prefetchw(&bvec->bv_page->flags);
1777 
1778 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1779 			ret = tree->ops->readpage_end_io_hook(page, start, end,
1780 							      NULL);
1781 			if (ret)
1782 				uptodate = 0;
1783 		}
1784 		if (!uptodate && tree->ops &&
1785 		    tree->ops->readpage_io_failed_hook) {
1786 			ret = tree->ops->readpage_io_failed_hook(bio, page,
1787 							 start, end, NULL);
1788 			if (ret == 0) {
1789 				uptodate =
1790 					test_bit(BIO_UPTODATE, &bio->bi_flags);
1791 				if (err)
1792 					uptodate = 0;
1793 				continue;
1794 			}
1795 		}
1796 
1797 		if (uptodate) {
1798 			set_extent_uptodate(tree, start, end,
1799 					    GFP_ATOMIC);
1800 		}
1801 		unlock_extent(tree, start, end, GFP_ATOMIC);
1802 
1803 		if (whole_page) {
1804 			if (uptodate) {
1805 				SetPageUptodate(page);
1806 			} else {
1807 				ClearPageUptodate(page);
1808 				SetPageError(page);
1809 			}
1810 			unlock_page(page);
1811 		} else {
1812 			if (uptodate) {
1813 				check_page_uptodate(tree, page);
1814 			} else {
1815 				ClearPageUptodate(page);
1816 				SetPageError(page);
1817 			}
1818 			check_page_locked(tree, page);
1819 		}
1820 	} while (bvec >= bio->bi_io_vec);
1821 
1822 	bio_put(bio);
1823 }
1824 
1825 /*
1826  * IO done from prepare_write is pretty simple, we just unlock
1827  * the structs in the extent tree when done, and set the uptodate bits
1828  * as appropriate.
1829  */
1830 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1831 {
1832 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1833 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1834 	struct extent_io_tree *tree;
1835 	u64 start;
1836 	u64 end;
1837 
1838 	do {
1839 		struct page *page = bvec->bv_page;
1840 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1841 
1842 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1843 			bvec->bv_offset;
1844 		end = start + bvec->bv_len - 1;
1845 
1846 		if (--bvec >= bio->bi_io_vec)
1847 			prefetchw(&bvec->bv_page->flags);
1848 
1849 		if (uptodate) {
1850 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1851 		} else {
1852 			ClearPageUptodate(page);
1853 			SetPageError(page);
1854 		}
1855 
1856 		unlock_extent(tree, start, end, GFP_ATOMIC);
1857 
1858 	} while (bvec >= bio->bi_io_vec);
1859 
1860 	bio_put(bio);
1861 }
1862 
1863 static struct bio *
1864 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1865 		 gfp_t gfp_flags)
1866 {
1867 	struct bio *bio;
1868 
1869 	bio = bio_alloc(gfp_flags, nr_vecs);
1870 
1871 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1872 		while (!bio && (nr_vecs /= 2))
1873 			bio = bio_alloc(gfp_flags, nr_vecs);
1874 	}
1875 
1876 	if (bio) {
1877 		bio->bi_size = 0;
1878 		bio->bi_bdev = bdev;
1879 		bio->bi_sector = first_sector;
1880 	}
1881 	return bio;
1882 }
1883 
1884 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1885 			  unsigned long bio_flags)
1886 {
1887 	int ret = 0;
1888 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1889 	struct page *page = bvec->bv_page;
1890 	struct extent_io_tree *tree = bio->bi_private;
1891 	u64 start;
1892 	u64 end;
1893 
1894 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1895 	end = start + bvec->bv_len - 1;
1896 
1897 	bio->bi_private = NULL;
1898 
1899 	bio_get(bio);
1900 
1901 	if (tree->ops && tree->ops->submit_bio_hook)
1902 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1903 					   mirror_num, bio_flags);
1904 	else
1905 		submit_bio(rw, bio);
1906 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1907 		ret = -EOPNOTSUPP;
1908 	bio_put(bio);
1909 	return ret;
1910 }
1911 
1912 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1913 			      struct page *page, sector_t sector,
1914 			      size_t size, unsigned long offset,
1915 			      struct block_device *bdev,
1916 			      struct bio **bio_ret,
1917 			      unsigned long max_pages,
1918 			      bio_end_io_t end_io_func,
1919 			      int mirror_num,
1920 			      unsigned long prev_bio_flags,
1921 			      unsigned long bio_flags)
1922 {
1923 	int ret = 0;
1924 	struct bio *bio;
1925 	int nr;
1926 	int contig = 0;
1927 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1928 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1929 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1930 
1931 	if (bio_ret && *bio_ret) {
1932 		bio = *bio_ret;
1933 		if (old_compressed)
1934 			contig = bio->bi_sector == sector;
1935 		else
1936 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1937 				sector;
1938 
1939 		if (prev_bio_flags != bio_flags || !contig ||
1940 		    (tree->ops && tree->ops->merge_bio_hook &&
1941 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1942 					       bio_flags)) ||
1943 		    bio_add_page(bio, page, page_size, offset) < page_size) {
1944 			ret = submit_one_bio(rw, bio, mirror_num,
1945 					     prev_bio_flags);
1946 			bio = NULL;
1947 		} else {
1948 			return 0;
1949 		}
1950 	}
1951 	if (this_compressed)
1952 		nr = BIO_MAX_PAGES;
1953 	else
1954 		nr = bio_get_nr_vecs(bdev);
1955 
1956 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1957 
1958 	bio_add_page(bio, page, page_size, offset);
1959 	bio->bi_end_io = end_io_func;
1960 	bio->bi_private = tree;
1961 
1962 	if (bio_ret)
1963 		*bio_ret = bio;
1964 	else
1965 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1966 
1967 	return ret;
1968 }
1969 
1970 void set_page_extent_mapped(struct page *page)
1971 {
1972 	if (!PagePrivate(page)) {
1973 		SetPagePrivate(page);
1974 		page_cache_get(page);
1975 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1976 	}
1977 }
1978 
1979 static void set_page_extent_head(struct page *page, unsigned long len)
1980 {
1981 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1982 }
1983 
1984 /*
1985  * basic readpage implementation.  Locked extent state structs are inserted
1986  * into the tree and removed when the IO is done (by the end_io
1987  * handlers)
1988  */
1989 static int __extent_read_full_page(struct extent_io_tree *tree,
1990 				   struct page *page,
1991 				   get_extent_t *get_extent,
1992 				   struct bio **bio, int mirror_num,
1993 				   unsigned long *bio_flags)
1994 {
1995 	struct inode *inode = page->mapping->host;
1996 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1997 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1998 	u64 end;
1999 	u64 cur = start;
2000 	u64 extent_offset;
2001 	u64 last_byte = i_size_read(inode);
2002 	u64 block_start;
2003 	u64 cur_end;
2004 	sector_t sector;
2005 	struct extent_map *em;
2006 	struct block_device *bdev;
2007 	int ret;
2008 	int nr = 0;
2009 	size_t page_offset = 0;
2010 	size_t iosize;
2011 	size_t disk_io_size;
2012 	size_t blocksize = inode->i_sb->s_blocksize;
2013 	unsigned long this_bio_flag = 0;
2014 
2015 	set_page_extent_mapped(page);
2016 
2017 	end = page_end;
2018 	lock_extent(tree, start, end, GFP_NOFS);
2019 
2020 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2021 		char *userpage;
2022 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
2023 
2024 		if (zero_offset) {
2025 			iosize = PAGE_CACHE_SIZE - zero_offset;
2026 			userpage = kmap_atomic(page, KM_USER0);
2027 			memset(userpage + zero_offset, 0, iosize);
2028 			flush_dcache_page(page);
2029 			kunmap_atomic(userpage, KM_USER0);
2030 		}
2031 	}
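	/*
	 * walk the page one extent mapping at a time: holes are zeroed,
	 * ranges already uptodate in the tree are skipped, and everything
	 * else is submitted for read IO in block sized chunks
	 */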
2032 	while (cur <= end) {
2033 		if (cur >= last_byte) {
2034 			char *userpage;
2035 			iosize = PAGE_CACHE_SIZE - page_offset;
2036 			userpage = kmap_atomic(page, KM_USER0);
2037 			memset(userpage + page_offset, 0, iosize);
2038 			flush_dcache_page(page);
2039 			kunmap_atomic(userpage, KM_USER0);
2040 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2041 					    GFP_NOFS);
2042 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2043 			break;
2044 		}
2045 		em = get_extent(inode, page, page_offset, cur,
2046 				end - cur + 1, 0);
2047 		if (IS_ERR(em) || !em) {
2048 			SetPageError(page);
2049 			unlock_extent(tree, cur, end, GFP_NOFS);
2050 			break;
2051 		}
2052 		extent_offset = cur - em->start;
2053 		BUG_ON(extent_map_end(em) <= cur);
2054 		BUG_ON(end < cur);
2055 
2056 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2057 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2058 
2059 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2060 		cur_end = min(extent_map_end(em) - 1, end);
2061 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2062 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2063 			disk_io_size = em->block_len;
2064 			sector = em->block_start >> 9;
2065 		} else {
2066 			sector = (em->block_start + extent_offset) >> 9;
2067 			disk_io_size = iosize;
2068 		}
2069 		bdev = em->bdev;
2070 		block_start = em->block_start;
2071 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2072 			block_start = EXTENT_MAP_HOLE;
2073 		free_extent_map(em);
2074 		em = NULL;
2075 
2076 		/* we've found a hole, just zero and go on */
2077 		if (block_start == EXTENT_MAP_HOLE) {
2078 			char *userpage;
2079 			userpage = kmap_atomic(page, KM_USER0);
2080 			memset(userpage + page_offset, 0, iosize);
2081 			flush_dcache_page(page);
2082 			kunmap_atomic(userpage, KM_USER0);
2083 
2084 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2085 					    GFP_NOFS);
2086 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2087 			cur = cur + iosize;
2088 			page_offset += iosize;
2089 			continue;
2090 		}
2091 		/* the get_extent function already copied into the page */
2092 		if (test_range_bit(tree, cur, cur_end,
2093 				   EXTENT_UPTODATE, 1, NULL)) {
2094 			check_page_uptodate(tree, page);
2095 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2096 			cur = cur + iosize;
2097 			page_offset += iosize;
2098 			continue;
2099 		}
2100 		/* we have an inline extent but it didn't get marked
2101 		 * uptodate.  Error out.
2102 		 */
2103 		if (block_start == EXTENT_MAP_INLINE) {
2104 			SetPageError(page);
2105 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2106 			cur = cur + iosize;
2107 			page_offset += iosize;
2108 			continue;
2109 		}
2110 
2111 		ret = 0;
2112 		if (tree->ops && tree->ops->readpage_io_hook) {
2113 			ret = tree->ops->readpage_io_hook(page, cur,
2114 							  cur + iosize - 1);
2115 		}
2116 		if (!ret) {
2117 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2118 			pnr -= page->index;
2119 			ret = submit_extent_page(READ, tree, page,
2120 					 sector, disk_io_size, page_offset,
2121 					 bdev, bio, pnr,
2122 					 end_bio_extent_readpage, mirror_num,
2123 					 *bio_flags,
2124 					 this_bio_flag);
2125 			nr++;
2126 			*bio_flags = this_bio_flag;
2127 		}
2128 		if (ret)
2129 			SetPageError(page);
2130 		cur = cur + iosize;
2131 		page_offset += iosize;
2132 	}
2133 	if (!nr) {
2134 		if (!PageError(page))
2135 			SetPageUptodate(page);
2136 		unlock_page(page);
2137 	}
2138 	return 0;
2139 }
2140 
2141 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2142 			    get_extent_t *get_extent)
2143 {
2144 	struct bio *bio = NULL;
2145 	unsigned long bio_flags = 0;
2146 	int ret;
2147 
2148 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2149 				      &bio_flags);
2150 	if (bio)
2151 		submit_one_bio(READ, bio, 0, bio_flags);
2152 	return ret;
2153 }
2154 
2155 static noinline void update_nr_written(struct page *page,
2156 				      struct writeback_control *wbc,
2157 				      unsigned long nr_written)
2158 {
2159 	wbc->nr_to_write -= nr_written;
2160 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2161 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2162 		page->mapping->writeback_index = page->index + nr_written;
2163 }
2164 
2165 /*
2166  * the writepage semantics are similar to regular writepage.  Extent
2167  * records are inserted to lock ranges in the tree, and as dirty areas
2168  * are found, they are marked writeback.  Then the lock bits are removed
2169  * and the end_io handler clears the writeback ranges.
2170  */
2171 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2172 			      void *data)
2173 {
2174 	struct inode *inode = page->mapping->host;
2175 	struct extent_page_data *epd = data;
2176 	struct extent_io_tree *tree = epd->tree;
2177 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2178 	u64 delalloc_start;
2179 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2180 	u64 end;
2181 	u64 cur = start;
2182 	u64 extent_offset;
2183 	u64 last_byte = i_size_read(inode);
2184 	u64 block_start;
2185 	u64 iosize;
2186 	u64 unlock_start;
2187 	sector_t sector;
2188 	struct extent_state *cached_state = NULL;
2189 	struct extent_map *em;
2190 	struct block_device *bdev;
2191 	int ret;
2192 	int nr = 0;
2193 	size_t pg_offset = 0;
2194 	size_t blocksize;
2195 	loff_t i_size = i_size_read(inode);
2196 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2197 	u64 nr_delalloc;
2198 	u64 delalloc_end;
2199 	int page_started;
2200 	int compressed;
2201 	int write_flags;
2202 	unsigned long nr_written = 0;
2203 
2204 	if (wbc->sync_mode == WB_SYNC_ALL)
2205 		write_flags = WRITE_SYNC_PLUG;
2206 	else
2207 		write_flags = WRITE;
2208 
2209 	WARN_ON(!PageLocked(page));
2210 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2211 	if (page->index > end_index ||
2212 	   (page->index == end_index && !pg_offset)) {
2213 		page->mapping->a_ops->invalidatepage(page, 0);
2214 		unlock_page(page);
2215 		return 0;
2216 	}
2217 
2218 	if (page->index == end_index) {
2219 		char *userpage;
2220 
2221 		userpage = kmap_atomic(page, KM_USER0);
2222 		memset(userpage + pg_offset, 0,
2223 		       PAGE_CACHE_SIZE - pg_offset);
2224 		kunmap_atomic(userpage, KM_USER0);
2225 		flush_dcache_page(page);
2226 	}
2227 	pg_offset = 0;
2228 
2229 	set_page_extent_mapped(page);
2230 
2231 	delalloc_start = start;
2232 	delalloc_end = 0;
2233 	page_started = 0;
2234 	if (!epd->extent_locked) {
2235 		u64 delalloc_to_write = 0;
2236 		/*
2237 		 * make sure the wbc mapping index is at least updated
2238 		 * to this page.
2239 		 */
2240 		update_nr_written(page, wbc, 0);
2241 
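		/*
		 * find each delalloc range that overlaps this page, lock it
		 * and hand it to the fill_delalloc hook, which may start the
		 * IO and unlock the page on its own (signalled through
		 * page_started)
		 */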
2242 		while (delalloc_end < page_end) {
2243 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2244 						       page,
2245 						       &delalloc_start,
2246 						       &delalloc_end,
2247 						       128 * 1024 * 1024);
2248 			if (nr_delalloc == 0) {
2249 				delalloc_start = delalloc_end + 1;
2250 				continue;
2251 			}
2252 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2253 						 delalloc_end, &page_started,
2254 						 &nr_written);
2255 			/*
2256 			 * delalloc_end is already one less than the total
2257 			 * length, so we don't subtract one from
2258 			 * PAGE_CACHE_SIZE
2259 			 */
2260 			delalloc_to_write += (delalloc_end - delalloc_start +
2261 					      PAGE_CACHE_SIZE) >>
2262 					      PAGE_CACHE_SHIFT;
2263 			delalloc_start = delalloc_end + 1;
2264 		}
2265 		if (wbc->nr_to_write < delalloc_to_write) {
2266 			int thresh = 8192;
2267 
2268 			if (delalloc_to_write < thresh * 2)
2269 				thresh = delalloc_to_write;
2270 			wbc->nr_to_write = min_t(u64, delalloc_to_write,
2271 						 thresh);
2272 		}
2273 
2274 		/* did the fill delalloc function already unlock and start
2275 		 * the IO?
2276 		 */
2277 		if (page_started) {
2278 			ret = 0;
2279 			/*
2280 			 * we've unlocked the page, so we can't update
2281 			 * the mapping's writeback index, just update
2282 			 * nr_to_write.
2283 			 */
2284 			wbc->nr_to_write -= nr_written;
2285 			goto done_unlocked;
2286 		}
2287 	}
2288 	if (tree->ops && tree->ops->writepage_start_hook) {
2289 		ret = tree->ops->writepage_start_hook(page, start,
2290 						      page_end);
2291 		if (ret == -EAGAIN) {
2292 			redirty_page_for_writepage(wbc, page);
2293 			update_nr_written(page, wbc, nr_written);
2294 			unlock_page(page);
2295 			ret = 0;
2296 			goto done_unlocked;
2297 		}
2298 	}
2299 
2300 	/*
2301 	 * we don't want to touch the inode after unlocking the page,
2302 	 * so we update the mapping writeback index now
2303 	 */
2304 	update_nr_written(page, wbc, nr_written + 1);
2305 
2306 	end = page_end;
2307 	if (last_byte <= start) {
2308 		if (tree->ops && tree->ops->writepage_end_io_hook)
2309 			tree->ops->writepage_end_io_hook(page, start,
2310 							 page_end, NULL, 1);
2311 		unlock_start = page_end + 1;
2312 		goto done;
2313 	}
2314 
2315 	blocksize = inode->i_sb->s_blocksize;
2316 
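	/*
	 * walk the page block by block: look up the extent map for each
	 * range, skip holes, inline and compressed extents (those are
	 * written through other paths), and submit write IO for the rest
	 */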
2317 	while (cur <= end) {
2318 		if (cur >= last_byte) {
2319 			if (tree->ops && tree->ops->writepage_end_io_hook)
2320 				tree->ops->writepage_end_io_hook(page, cur,
2321 							 page_end, NULL, 1);
2322 			unlock_start = page_end + 1;
2323 			break;
2324 		}
2325 		em = epd->get_extent(inode, page, pg_offset, cur,
2326 				     end - cur + 1, 1);
2327 		if (IS_ERR(em) || !em) {
2328 			SetPageError(page);
2329 			break;
2330 		}
2331 
2332 		extent_offset = cur - em->start;
2333 		BUG_ON(extent_map_end(em) <= cur);
2334 		BUG_ON(end < cur);
2335 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2336 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2337 		sector = (em->block_start + extent_offset) >> 9;
2338 		bdev = em->bdev;
2339 		block_start = em->block_start;
2340 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2341 		free_extent_map(em);
2342 		em = NULL;
2343 
2344 		/*
2345 		 * compressed and inline extents are written through other
2346 		 * paths in the FS
2347 		 */
2348 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2349 		    block_start == EXTENT_MAP_INLINE) {
2350 			/*
2351 			 * end_io notification does not happen here for
2352 			 * compressed extents
2353 			 */
2354 			if (!compressed && tree->ops &&
2355 			    tree->ops->writepage_end_io_hook)
2356 				tree->ops->writepage_end_io_hook(page, cur,
2357 							 cur + iosize - 1,
2358 							 NULL, 1);
2359 			else if (compressed) {
2360 				/* we don't want to end_page_writeback on
2361 				 * a compressed extent.  this happens
2362 				 * elsewhere
2363 				 */
2364 				nr++;
2365 			}
2366 
2367 			cur += iosize;
2368 			pg_offset += iosize;
2369 			unlock_start = cur;
2370 			continue;
2371 		}
2372 		/* leave this out until we have a page_mkwrite call */
2373 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2374 				   EXTENT_DIRTY, 0, NULL)) {
2375 			cur = cur + iosize;
2376 			pg_offset += iosize;
2377 			continue;
2378 		}
2379 
2380 		if (tree->ops && tree->ops->writepage_io_hook) {
2381 			ret = tree->ops->writepage_io_hook(page, cur,
2382 						cur + iosize - 1);
2383 		} else {
2384 			ret = 0;
2385 		}
2386 		if (ret) {
2387 			SetPageError(page);
2388 		} else {
2389 			unsigned long max_nr = end_index + 1;
2390 
2391 			set_range_writeback(tree, cur, cur + iosize - 1);
2392 			if (!PageWriteback(page)) {
2393 				printk(KERN_ERR "btrfs warning page %lu not "
2394 				       "writeback, cur %llu end %llu\n",
2395 				       page->index, (unsigned long long)cur,
2396 				       (unsigned long long)end);
2397 			}
2398 
2399 			ret = submit_extent_page(write_flags, tree, page,
2400 						 sector, iosize, pg_offset,
2401 						 bdev, &epd->bio, max_nr,
2402 						 end_bio_extent_writepage,
2403 						 0, 0, 0);
2404 			if (ret)
2405 				SetPageError(page);
2406 		}
2407 		cur = cur + iosize;
2408 		pg_offset += iosize;
2409 		nr++;
2410 	}
2411 done:
2412 	if (nr == 0) {
2413 		/* make sure the mapping tag for page dirty gets cleared */
2414 		set_page_writeback(page);
2415 		end_page_writeback(page);
2416 	}
2417 	unlock_page(page);
2418 
2419 done_unlocked:
2420 
2421 	/* drop our reference on any cached states */
2422 	free_extent_state(cached_state);
2423 	return 0;
2424 }
2425 
2426 /**
2427  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2428  * @mapping: address space structure to write
2429  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2430  * @writepage: function called for each page
2431  * @data: data passed to writepage function
2432  *
2433  * If a page is already under I/O, write_cache_pages() skips it, even
2434  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2435  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2436  * and msync() need to guarantee that all the data which was dirty at the time
2437  * the call was made get new I/O started against them.  If wbc->sync_mode is
2438  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2439  * existing IO to complete.
2440  */
2441 static int extent_write_cache_pages(struct extent_io_tree *tree,
2442 			     struct address_space *mapping,
2443 			     struct writeback_control *wbc,
2444 			     writepage_t writepage, void *data,
2445 			     void (*flush_fn)(void *))
2446 {
2447 	int ret = 0;
2448 	int done = 0;
2449 	int nr_to_write_done = 0;
2450 	struct pagevec pvec;
2451 	int nr_pages;
2452 	pgoff_t index;
2453 	pgoff_t end;		/* Inclusive */
2454 	int scanned = 0;
2455 	int range_whole = 0;
2456 
2457 	pagevec_init(&pvec, 0);
2458 	if (wbc->range_cyclic) {
2459 		index = mapping->writeback_index; /* Start from prev offset */
2460 		end = -1;
2461 	} else {
2462 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2463 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2464 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2465 			range_whole = 1;
2466 		scanned = 1;
2467 	}
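	/*
	 * scan the dirty tag in pagevec sized batches.  Each page is locked
	 * (through the lock hook when one is provided), revalidated against
	 * the mapping and the range, and then handed to writepage.  The scan
	 * stops once wbc->nr_to_write has been used up.
	 */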
2468 retry:
2469 	while (!done && !nr_to_write_done && (index <= end) &&
2470 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2471 			      PAGECACHE_TAG_DIRTY, min(end - index,
2472 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2473 		unsigned i;
2474 
2475 		scanned = 1;
2476 		for (i = 0; i < nr_pages; i++) {
2477 			struct page *page = pvec.pages[i];
2478 
2479 			/*
2480 			 * At this point we hold neither mapping->tree_lock nor
2481 			 * lock on the page itself: the page may be truncated or
2482 			 * invalidated (changing page->mapping to NULL), or even
2483 			 * swizzled back from swapper_space to tmpfs file
2484 			 * mapping
2485 			 */
2486 			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2487 				tree->ops->write_cache_pages_lock_hook(page);
2488 			else
2489 				lock_page(page);
2490 
2491 			if (unlikely(page->mapping != mapping)) {
2492 				unlock_page(page);
2493 				continue;
2494 			}
2495 
2496 			if (!wbc->range_cyclic && page->index > end) {
2497 				done = 1;
2498 				unlock_page(page);
2499 				continue;
2500 			}
2501 
2502 			if (wbc->sync_mode != WB_SYNC_NONE) {
2503 				if (PageWriteback(page))
2504 					flush_fn(data);
2505 				wait_on_page_writeback(page);
2506 			}
2507 
2508 			if (PageWriteback(page) ||
2509 			    !clear_page_dirty_for_io(page)) {
2510 				unlock_page(page);
2511 				continue;
2512 			}
2513 
2514 			ret = (*writepage)(page, wbc, data);
2515 
2516 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2517 				unlock_page(page);
2518 				ret = 0;
2519 			}
2520 			if (ret)
2521 				done = 1;
2522 
2523 			/*
2524 			 * the filesystem may choose to bump up nr_to_write.
2525 			 * We have to make sure to honor the new nr_to_write
2526 			 * every time through this loop
2527 			 */
2528 			nr_to_write_done = wbc->nr_to_write <= 0;
2529 		}
2530 		pagevec_release(&pvec);
2531 		cond_resched();
2532 	}
2533 	if (!scanned && !done) {
2534 		/*
2535 		 * We hit the last page and there is more work to be done: wrap
2536 		 * back to the start of the file
2537 		 */
2538 		scanned = 1;
2539 		index = 0;
2540 		goto retry;
2541 	}
2542 	return ret;
2543 }
2544 
2545 static void flush_epd_write_bio(struct extent_page_data *epd)
2546 {
2547 	if (epd->bio) {
2548 		if (epd->sync_io)
2549 			submit_one_bio(WRITE_SYNC, epd->bio, 0, 0);
2550 		else
2551 			submit_one_bio(WRITE, epd->bio, 0, 0);
2552 		epd->bio = NULL;
2553 	}
2554 }
2555 
2556 static noinline void flush_write_bio(void *data)
2557 {
2558 	struct extent_page_data *epd = data;
2559 	flush_epd_write_bio(epd);
2560 }
2561 
2562 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2563 			  get_extent_t *get_extent,
2564 			  struct writeback_control *wbc)
2565 {
2566 	int ret;
2567 	struct address_space *mapping = page->mapping;
2568 	struct extent_page_data epd = {
2569 		.bio = NULL,
2570 		.tree = tree,
2571 		.get_extent = get_extent,
2572 		.extent_locked = 0,
2573 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2574 	};
2575 	struct writeback_control wbc_writepages = {
2576 		.bdi		= wbc->bdi,
2577 		.sync_mode	= wbc->sync_mode,
2578 		.older_than_this = NULL,
2579 		.nr_to_write	= 64,
2580 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2581 		.range_end	= (loff_t)-1,
2582 	};
2583 
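	/*
	 * write the page we were asked about first, then opportunistically
	 * write up to 64 more of the dirty pages that follow it in the
	 * file, sharing the same bio where possible
	 */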
2584 	ret = __extent_writepage(page, wbc, &epd);
2585 
2586 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2587 				 __extent_writepage, &epd, flush_write_bio);
2588 	flush_epd_write_bio(&epd);
2589 	return ret;
2590 }
2591 
2592 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2593 			      u64 start, u64 end, get_extent_t *get_extent,
2594 			      int mode)
2595 {
2596 	int ret = 0;
2597 	struct address_space *mapping = inode->i_mapping;
2598 	struct page *page;
2599 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2600 		PAGE_CACHE_SHIFT;
2601 
2602 	struct extent_page_data epd = {
2603 		.bio = NULL,
2604 		.tree = tree,
2605 		.get_extent = get_extent,
2606 		.extent_locked = 1,
2607 		.sync_io = mode == WB_SYNC_ALL,
2608 	};
2609 	struct writeback_control wbc_writepages = {
2610 		.bdi		= inode->i_mapping->backing_dev_info,
2611 		.sync_mode	= mode,
2612 		.older_than_this = NULL,
2613 		.nr_to_write	= nr_pages * 2,
2614 		.range_start	= start,
2615 		.range_end	= end + 1,
2616 	};
2617 
2618 	while (start <= end) {
2619 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2620 		if (clear_page_dirty_for_io(page))
2621 			ret = __extent_writepage(page, &wbc_writepages, &epd);
2622 		else {
2623 			if (tree->ops && tree->ops->writepage_end_io_hook)
2624 				tree->ops->writepage_end_io_hook(page, start,
2625 						 start + PAGE_CACHE_SIZE - 1,
2626 						 NULL, 1);
2627 			unlock_page(page);
2628 		}
2629 		page_cache_release(page);
2630 		start += PAGE_CACHE_SIZE;
2631 	}
2632 
2633 	flush_epd_write_bio(&epd);
2634 	return ret;
2635 }
2636 
2637 int extent_writepages(struct extent_io_tree *tree,
2638 		      struct address_space *mapping,
2639 		      get_extent_t *get_extent,
2640 		      struct writeback_control *wbc)
2641 {
2642 	int ret = 0;
2643 	struct extent_page_data epd = {
2644 		.bio = NULL,
2645 		.tree = tree,
2646 		.get_extent = get_extent,
2647 		.extent_locked = 0,
2648 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
2649 	};
2650 
2651 	ret = extent_write_cache_pages(tree, mapping, wbc,
2652 				       __extent_writepage, &epd,
2653 				       flush_write_bio);
2654 	flush_epd_write_bio(&epd);
2655 	return ret;
2656 }
2657 
2658 int extent_readpages(struct extent_io_tree *tree,
2659 		     struct address_space *mapping,
2660 		     struct list_head *pages, unsigned nr_pages,
2661 		     get_extent_t get_extent)
2662 {
2663 	struct bio *bio = NULL;
2664 	unsigned page_idx;
2665 	struct pagevec pvec;
2666 	unsigned long bio_flags = 0;
2667 
2668 	pagevec_init(&pvec, 0);
2669 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2670 		struct page *page = list_entry(pages->prev, struct page, lru);
2671 
2672 		prefetchw(&page->flags);
2673 		list_del(&page->lru);
2674 		/*
2675 		 * what we want to do here is call add_to_page_cache_lru,
2676 		 * but that isn't exported, so we reproduce it here
2677 		 */
2678 		if (!add_to_page_cache(page, mapping,
2679 					page->index, GFP_KERNEL)) {
2680 
2681 			/* open coding of lru_cache_add, also not exported */
2682 			page_cache_get(page);
2683 			if (!pagevec_add(&pvec, page))
2684 				__pagevec_lru_add_file(&pvec);
2685 			__extent_read_full_page(tree, page, get_extent,
2686 						&bio, 0, &bio_flags);
2687 		}
2688 		page_cache_release(page);
2689 	}
2690 	if (pagevec_count(&pvec))
2691 		__pagevec_lru_add_file(&pvec);
2692 	BUG_ON(!list_empty(pages));
2693 	if (bio)
2694 		submit_one_bio(READ, bio, 0, bio_flags);
2695 	return 0;
2696 }
2697 
2698 /*
2699  * basic invalidatepage code; this waits on any locked or writeback
2700  * ranges corresponding to the page, and then deletes any extent state
2701  * records from the tree.
2702  */
2703 int extent_invalidatepage(struct extent_io_tree *tree,
2704 			  struct page *page, unsigned long offset)
2705 {
2706 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2707 	u64 end = start + PAGE_CACHE_SIZE - 1;
2708 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2709 
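	/* only drop state from the first block boundary at or after offset */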
2710 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2711 	if (start > end)
2712 		return 0;
2713 
2714 	lock_extent(tree, start, end, GFP_NOFS);
2715 	wait_on_page_writeback(page);
2716 	clear_extent_bit(tree, start, end,
2717 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2718 			 1, 1, NULL, GFP_NOFS);
2719 	return 0;
2720 }
2721 
2722 /*
2723  * simple commit_write call; the page is marked dirty and the inode
2724  * size is updated when the write extends past the old i_size
2725  */
2726 int extent_commit_write(struct extent_io_tree *tree,
2727 			struct inode *inode, struct page *page,
2728 			unsigned from, unsigned to)
2729 {
2730 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2731 
2732 	set_page_extent_mapped(page);
2733 	set_page_dirty(page);
2734 
2735 	if (pos > inode->i_size) {
2736 		i_size_write(inode, pos);
2737 		mark_inode_dirty(inode);
2738 	}
2739 	return 0;
2740 }
2741 
2742 int extent_prepare_write(struct extent_io_tree *tree,
2743 			 struct inode *inode, struct page *page,
2744 			 unsigned from, unsigned to, get_extent_t *get_extent)
2745 {
2746 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2747 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2748 	u64 block_start;
2749 	u64 orig_block_start;
2750 	u64 block_end;
2751 	u64 cur_end;
2752 	struct extent_map *em;
2753 	unsigned blocksize = 1 << inode->i_blkbits;
2754 	size_t page_offset = 0;
2755 	size_t block_off_start;
2756 	size_t block_off_end;
2757 	int err = 0;
2758 	int iocount = 0;
2759 	int ret = 0;
2760 	int isnew;
2761 
2762 	set_page_extent_mapped(page);
2763 
2764 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2765 	block_end = (page_start + to - 1) | (blocksize - 1);
2766 	orig_block_start = block_start;
2767 
2768 	lock_extent(tree, page_start, page_end, GFP_NOFS);
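	/*
	 * walk the blocks covered by the write: newly allocated blocks that
	 * reach outside the write range are zeroed in place, blocks that are
	 * on disk but not uptodate are read in, and everything else is
	 * simply marked uptodate and unlocked
	 */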
2769 	while (block_start <= block_end) {
2770 		em = get_extent(inode, page, page_offset, block_start,
2771 				block_end - block_start + 1, 1);
2772 		if (IS_ERR(em) || !em)
2773 			goto err;
2774 
2775 		cur_end = min(block_end, extent_map_end(em) - 1);
2776 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2777 		block_off_end = block_off_start + blocksize;
2778 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2779 
2780 		if (!PageUptodate(page) && isnew &&
2781 		    (block_off_end > to || block_off_start < from)) {
2782 			void *kaddr;
2783 
2784 			kaddr = kmap_atomic(page, KM_USER0);
2785 			if (block_off_end > to)
2786 				memset(kaddr + to, 0, block_off_end - to);
2787 			if (block_off_start < from)
2788 				memset(kaddr + block_off_start, 0,
2789 				       from - block_off_start);
2790 			flush_dcache_page(page);
2791 			kunmap_atomic(kaddr, KM_USER0);
2792 		}
2793 		if ((em->block_start != EXTENT_MAP_HOLE &&
2794 		     em->block_start != EXTENT_MAP_INLINE) &&
2795 		    !isnew && !PageUptodate(page) &&
2796 		    (block_off_end > to || block_off_start < from) &&
2797 		    !test_range_bit(tree, block_start, cur_end,
2798 				    EXTENT_UPTODATE, 1, NULL)) {
2799 			u64 sector;
2800 			u64 extent_offset = block_start - em->start;
2801 			size_t iosize;
2802 			sector = (em->block_start + extent_offset) >> 9;
2803 			iosize = (cur_end - block_start + blocksize) &
2804 				~((u64)blocksize - 1);
2805 			/*
2806 			 * we've already got the extent locked, but we
2807 			 * need to split the state such that our end_bio
2808 			 * handler can clear the lock.
2809 			 */
2810 			set_extent_bit(tree, block_start,
2811 				       block_start + iosize - 1,
2812 				       EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2813 			ret = submit_extent_page(READ, tree, page,
2814 					 sector, iosize, page_offset, em->bdev,
2815 					 NULL, 1,
2816 					 end_bio_extent_preparewrite, 0,
2817 					 0, 0);
2818 			iocount++;
2819 			block_start = block_start + iosize;
2820 		} else {
2821 			set_extent_uptodate(tree, block_start, cur_end,
2822 					    GFP_NOFS);
2823 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2824 			block_start = cur_end + 1;
2825 		}
2826 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2827 		free_extent_map(em);
2828 	}
2829 	if (iocount) {
2830 		wait_extent_bit(tree, orig_block_start,
2831 				block_end, EXTENT_LOCKED);
2832 	}
2833 	check_page_uptodate(tree, page);
2834 err:
2835 	/* FIXME, zero out newly allocated blocks on error */
2836 	return err;
2837 }
2838 
2839 /*
2840  * a helper for releasepage; this tests for areas of the page that
2841  * are locked or under IO and drops the related state bits if it is safe
2842  * to drop the page.
2843  */
2844 int try_release_extent_state(struct extent_map_tree *map,
2845 			     struct extent_io_tree *tree, struct page *page,
2846 			     gfp_t mask)
2847 {
2848 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2849 	u64 end = start + PAGE_CACHE_SIZE - 1;
2850 	int ret = 1;
2851 
2852 	if (test_range_bit(tree, start, end,
2853 			   EXTENT_IOBITS, 0, NULL))
2854 		ret = 0;
2855 	else {
2856 		if ((mask & GFP_NOFS) == GFP_NOFS)
2857 			mask = GFP_NOFS;
2858 		/*
2859 		 * at this point we can safely clear everything except the
2860 		 * locked bit and the nodatasum bit
2861 		 */
2862 		clear_extent_bit(tree, start, end,
2863 				 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2864 				 0, 0, NULL, mask);
2865 	}
2866 	return ret;
2867 }
2868 
2869 /*
2870  * a helper for releasepage.  As long as there are no locked extents
2871  * in the range corresponding to the page, both state records and extent
2872  * map records are removed
2873  */
2874 int try_release_extent_mapping(struct extent_map_tree *map,
2875 			       struct extent_io_tree *tree, struct page *page,
2876 			       gfp_t mask)
2877 {
2878 	struct extent_map *em;
2879 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2880 	u64 end = start + PAGE_CACHE_SIZE - 1;
2881 
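	/*
	 * for larger files, and only when the mask allows us to sleep, try
	 * to drop any extent maps over this page that are not pinned and
	 * whose range is not locked or under writeback before deciding
	 * whether the state can be released
	 */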
2882 	if ((mask & __GFP_WAIT) &&
2883 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2884 		u64 len;
2885 		while (start <= end) {
2886 			len = end - start + 1;
2887 			write_lock(&map->lock);
2888 			em = lookup_extent_mapping(map, start, len);
2889 			if (!em || IS_ERR(em)) {
2890 				write_unlock(&map->lock);
2891 				break;
2892 			}
2893 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2894 			    em->start != start) {
2895 				write_unlock(&map->lock);
2896 				free_extent_map(em);
2897 				break;
2898 			}
2899 			if (!test_range_bit(tree, em->start,
2900 					    extent_map_end(em) - 1,
2901 					    EXTENT_LOCKED | EXTENT_WRITEBACK,
2902 					    0, NULL)) {
2903 				remove_extent_mapping(map, em);
2904 				/* once for the rb tree */
2905 				free_extent_map(em);
2906 			}
2907 			start = extent_map_end(em);
2908 			write_unlock(&map->lock);
2909 
2910 			/* once for us */
2911 			free_extent_map(em);
2912 		}
2913 	}
2914 	return try_release_extent_state(map, tree, page, mask);
2915 }
2916 
2917 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2918 		get_extent_t *get_extent)
2919 {
2920 	struct inode *inode = mapping->host;
2921 	u64 start = iblock << inode->i_blkbits;
2922 	sector_t sector = 0;
2923 	size_t blksize = (1 << inode->i_blkbits);
2924 	struct extent_map *em;
2925 
2926 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2927 		    GFP_NOFS);
2928 	em = get_extent(inode, NULL, 0, start, blksize, 0);
2929 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2930 		      GFP_NOFS);
2931 	if (!em || IS_ERR(em))
2932 		return 0;
2933 
2934 	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2935 		goto out;
2936 
2937 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2938 out:
2939 	free_extent_map(em);
2940 	return sector;
2941 }
2942 
2943 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2944 		__u64 start, __u64 len, get_extent_t *get_extent)
2945 {
2946 	int ret;
2947 	u64 off = start;
2948 	u64 max = start + len;
2949 	u32 flags = 0;
2950 	u64 disko = 0;
2951 	struct extent_map *em = NULL;
2952 	int end = 0;
2953 	u64 em_start = 0, em_len = 0;
2954 	unsigned long emflags;
2955 	ret = 0;
2956 
2957 	if (len == 0)
2958 		return -EINVAL;
2959 
2960 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2961 		GFP_NOFS);
2962 	em = get_extent(inode, NULL, 0, off, max - off, 0);
2963 	if (!em)
2964 		goto out;
2965 	if (IS_ERR(em)) {
2966 		ret = PTR_ERR(em);
2967 		goto out;
2968 	}
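	/*
	 * walk the extent maps from off to max, translating each one into a
	 * fiemap extent; hole, inline, delalloc and compressed mappings get
	 * the matching FIEMAP flags, and the walk stops at the last extent
	 */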
2969 	while (!end) {
2970 		off = em->start + em->len;
2971 		if (off >= max)
2972 			end = 1;
2973 
2974 		em_start = em->start;
2975 		em_len = em->len;
2976 
2977 		disko = 0;
2978 		flags = 0;
2979 
2980 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2981 			end = 1;
2982 			flags |= FIEMAP_EXTENT_LAST;
2983 		} else if (em->block_start == EXTENT_MAP_HOLE) {
2984 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2985 		} else if (em->block_start == EXTENT_MAP_INLINE) {
2986 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2987 				  FIEMAP_EXTENT_NOT_ALIGNED);
2988 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
2989 			flags |= (FIEMAP_EXTENT_DELALLOC |
2990 				  FIEMAP_EXTENT_UNKNOWN);
2991 		} else {
2992 			disko = em->block_start;
2993 		}
2994 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2995 			flags |= FIEMAP_EXTENT_ENCODED;
2996 
2997 		emflags = em->flags;
2998 		free_extent_map(em);
2999 		em = NULL;
3000 
3001 		if (!end) {
3002 			em = get_extent(inode, NULL, 0, off, max - off, 0);
3003 			if (!em)
3004 				goto out;
3005 			if (IS_ERR(em)) {
3006 				ret = PTR_ERR(em);
3007 				goto out;
3008 			}
3009 			emflags = em->flags;
3010 		}
3011 		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
3012 			flags |= FIEMAP_EXTENT_LAST;
3013 			end = 1;
3014 		}
3015 
3016 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3017 					em_len, flags);
3018 		if (ret)
3019 			goto out_free;
3020 	}
3021 out_free:
3022 	free_extent_map(em);
3023 out:
3024 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
3025 			GFP_NOFS);
3026 	return ret;
3027 }
3028 
3029 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3030 					      unsigned long i)
3031 {
3032 	struct page *p;
3033 	struct address_space *mapping;
3034 
3035 	if (i == 0)
3036 		return eb->first_page;
3037 	i += eb->start >> PAGE_CACHE_SHIFT;
3038 	mapping = eb->first_page->mapping;
3039 	if (!mapping)
3040 		return NULL;
3041 
3042 	/*
3043 	 * extent_buffer_page is only called after pinning the page
3044 	 * by increasing the reference count.  So we know the page must
3045 	 * be in the radix tree.
3046 	 */
3047 	rcu_read_lock();
3048 	p = radix_tree_lookup(&mapping->page_tree, i);
3049 	rcu_read_unlock();
3050 
3051 	return p;
3052 }
3053 
3054 static inline unsigned long num_extent_pages(u64 start, u64 len)
3055 {
3056 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3057 		(start >> PAGE_CACHE_SHIFT);
3058 }
3059 
3060 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3061 						   u64 start,
3062 						   unsigned long len,
3063 						   gfp_t mask)
3064 {
3065 	struct extent_buffer *eb = NULL;
3066 #if LEAK_DEBUG
3067 	unsigned long flags;
3068 #endif
3069 
3070 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3071 	eb->start = start;
3072 	eb->len = len;
3073 	spin_lock_init(&eb->lock);
3074 	init_waitqueue_head(&eb->lock_wq);
3075 
3076 #if LEAK_DEBUG
3077 	spin_lock_irqsave(&leak_lock, flags);
3078 	list_add(&eb->leak_list, &buffers);
3079 	spin_unlock_irqrestore(&leak_lock, flags);
3080 #endif
3081 	atomic_set(&eb->refs, 1);
3082 
3083 	return eb;
3084 }
3085 
3086 static void __free_extent_buffer(struct extent_buffer *eb)
3087 {
3088 #if LEAK_DEBUG
3089 	unsigned long flags;
3090 	spin_lock_irqsave(&leak_lock, flags);
3091 	list_del(&eb->leak_list);
3092 	spin_unlock_irqrestore(&leak_lock, flags);
3093 #endif
3094 	kmem_cache_free(extent_buffer_cache, eb);
3095 }
3096 
3097 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3098 					  u64 start, unsigned long len,
3099 					  struct page *page0,
3100 					  gfp_t mask)
3101 {
3102 	unsigned long num_pages = num_extent_pages(start, len);
3103 	unsigned long i;
3104 	unsigned long index = start >> PAGE_CACHE_SHIFT;
3105 	struct extent_buffer *eb;
3106 	struct extent_buffer *exists = NULL;
3107 	struct page *p;
3108 	struct address_space *mapping = tree->mapping;
3109 	int uptodate = 1;
3110 
3111 	spin_lock(&tree->buffer_lock);
3112 	eb = buffer_search(tree, start);
3113 	if (eb) {
3114 		atomic_inc(&eb->refs);
3115 		spin_unlock(&tree->buffer_lock);
3116 		mark_page_accessed(eb->first_page);
3117 		return eb;
3118 	}
3119 	spin_unlock(&tree->buffer_lock);
3120 
3121 	eb = __alloc_extent_buffer(tree, start, len, mask);
3122 	if (!eb)
3123 		return NULL;
3124 
3125 	if (page0) {
3126 		eb->first_page = page0;
3127 		i = 1;
3128 		index++;
3129 		page_cache_get(page0);
3130 		mark_page_accessed(page0);
3131 		set_page_extent_mapped(page0);
3132 		set_page_extent_head(page0, len);
3133 		uptodate = PageUptodate(page0);
3134 	} else {
3135 		i = 0;
3136 	}
3137 	for (; i < num_pages; i++, index++) {
3138 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3139 		if (!p) {
3140 			WARN_ON(1);
3141 			goto free_eb;
3142 		}
3143 		set_page_extent_mapped(p);
3144 		mark_page_accessed(p);
3145 		if (i == 0) {
3146 			eb->first_page = p;
3147 			set_page_extent_head(p, len);
3148 		} else {
3149 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3150 		}
3151 		if (!PageUptodate(p))
3152 			uptodate = 0;
3153 		unlock_page(p);
3154 	}
3155 	if (uptodate)
3156 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3157 
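	/*
	 * someone may have raced with us and inserted a buffer for this
	 * range while we were setting up the pages; if so, drop ours and
	 * return the existing one with an extra reference for the caller
	 */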
3158 	spin_lock(&tree->buffer_lock);
3159 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3160 	if (exists) {
3161 		/* add one reference for the caller */
3162 		atomic_inc(&exists->refs);
3163 		spin_unlock(&tree->buffer_lock);
3164 		goto free_eb;
3165 	}
3166 	spin_unlock(&tree->buffer_lock);
3167 
3168 	/* add one reference for the tree */
3169 	atomic_inc(&eb->refs);
3170 	return eb;
3171 
3172 free_eb:
3173 	if (!atomic_dec_and_test(&eb->refs))
3174 		return exists;
3175 	for (index = 1; index < i; index++)
3176 		page_cache_release(extent_buffer_page(eb, index));
3177 	page_cache_release(extent_buffer_page(eb, 0));
3178 	__free_extent_buffer(eb);
3179 	return exists;
3180 }
3181 
3182 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3183 					 u64 start, unsigned long len,
3184 					  gfp_t mask)
3185 {
3186 	struct extent_buffer *eb;
3187 
3188 	spin_lock(&tree->buffer_lock);
3189 	eb = buffer_search(tree, start);
3190 	if (eb)
3191 		atomic_inc(&eb->refs);
3192 	spin_unlock(&tree->buffer_lock);
3193 
3194 	if (eb)
3195 		mark_page_accessed(eb->first_page);
3196 
3197 	return eb;
3198 }
3199 
3200 void free_extent_buffer(struct extent_buffer *eb)
3201 {
3202 	if (!eb)
3203 		return;
3204 
3205 	if (!atomic_dec_and_test(&eb->refs))
3206 		return;
3207 
3208 	WARN_ON(1);
3209 }
3210 
3211 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3212 			      struct extent_buffer *eb)
3213 {
3214 	unsigned long i;
3215 	unsigned long num_pages;
3216 	struct page *page;
3217 
3218 	num_pages = num_extent_pages(eb->start, eb->len);
3219 
3220 	for (i = 0; i < num_pages; i++) {
3221 		page = extent_buffer_page(eb, i);
3222 		if (!PageDirty(page))
3223 			continue;
3224 
3225 		lock_page(page);
3226 		if (i == 0)
3227 			set_page_extent_head(page, eb->len);
3228 		else
3229 			set_page_private(page, EXTENT_PAGE_PRIVATE);
3230 
3231 		clear_page_dirty_for_io(page);
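		/*
		 * we are not going to set the page writeback, so clear the
		 * radix tree dirty tag by hand while holding the mapping's
		 * tree_lock
		 */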
3232 		spin_lock_irq(&page->mapping->tree_lock);
3233 		if (!PageDirty(page)) {
3234 			radix_tree_tag_clear(&page->mapping->page_tree,
3235 						page_index(page),
3236 						PAGECACHE_TAG_DIRTY);
3237 		}
3238 		spin_unlock_irq(&page->mapping->tree_lock);
3239 		unlock_page(page);
3240 	}
3241 	return 0;
3242 }
3243 
3244 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3245 				    struct extent_buffer *eb)
3246 {
3247 	return wait_on_extent_writeback(tree, eb->start,
3248 					eb->start + eb->len - 1);
3249 }
3250 
3251 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3252 			     struct extent_buffer *eb)
3253 {
3254 	unsigned long i;
3255 	unsigned long num_pages;
3256 	int was_dirty = 0;
3257 
3258 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3259 	num_pages = num_extent_pages(eb->start, eb->len);
3260 	for (i = 0; i < num_pages; i++)
3261 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3262 	return was_dirty;
3263 }
3264 
3265 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3266 				struct extent_buffer *eb)
3267 {
3268 	unsigned long i;
3269 	struct page *page;
3270 	unsigned long num_pages;
3271 
3272 	num_pages = num_extent_pages(eb->start, eb->len);
3273 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3274 
3275 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3276 			      GFP_NOFS);
3277 	for (i = 0; i < num_pages; i++) {
3278 		page = extent_buffer_page(eb, i);
3279 		if (page)
3280 			ClearPageUptodate(page);
3281 	}
3282 	return 0;
3283 }
3284 
3285 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3286 				struct extent_buffer *eb)
3287 {
3288 	unsigned long i;
3289 	struct page *page;
3290 	unsigned long num_pages;
3291 
3292 	num_pages = num_extent_pages(eb->start, eb->len);
3293 
3294 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3295 			    GFP_NOFS);
3296 	for (i = 0; i < num_pages; i++) {
3297 		page = extent_buffer_page(eb, i);
3298 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3299 		    ((i == num_pages - 1) &&
3300 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3301 			check_page_uptodate(tree, page);
3302 			continue;
3303 		}
3304 		SetPageUptodate(page);
3305 	}
3306 	return 0;
3307 }
3308 
3309 int extent_range_uptodate(struct extent_io_tree *tree,
3310 			  u64 start, u64 end)
3311 {
3312 	struct page *page;
3313 	int ret;
3314 	int pg_uptodate = 1;
3315 	int uptodate;
3316 	unsigned long index;
3317 
3318 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL);
3319 	if (ret)
3320 		return 1;
3321 	while (start <= end) {
3322 		index = start >> PAGE_CACHE_SHIFT;
3323 		page = find_get_page(tree->mapping, index);
3324 		uptodate = PageUptodate(page);
3325 		page_cache_release(page);
3326 		if (!uptodate) {
3327 			pg_uptodate = 0;
3328 			break;
3329 		}
3330 		start += PAGE_CACHE_SIZE;
3331 	}
3332 	return pg_uptodate;
3333 }
3334 
3335 int extent_buffer_uptodate(struct extent_io_tree *tree,
3336 			   struct extent_buffer *eb)
3337 {
3338 	int ret = 0;
3339 	unsigned long num_pages;
3340 	unsigned long i;
3341 	struct page *page;
3342 	int pg_uptodate = 1;
3343 
3344 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3345 		return 1;
3346 
3347 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3348 			   EXTENT_UPTODATE, 1, NULL);
3349 	if (ret)
3350 		return ret;
3351 
3352 	num_pages = num_extent_pages(eb->start, eb->len);
3353 	for (i = 0; i < num_pages; i++) {
3354 		page = extent_buffer_page(eb, i);
3355 		if (!PageUptodate(page)) {
3356 			pg_uptodate = 0;
3357 			break;
3358 		}
3359 	}
3360 	return pg_uptodate;
3361 }
3362 
3363 int read_extent_buffer_pages(struct extent_io_tree *tree,
3364 			     struct extent_buffer *eb,
3365 			     u64 start, int wait,
3366 			     get_extent_t *get_extent, int mirror_num)
3367 {
3368 	unsigned long i;
3369 	unsigned long start_i;
3370 	struct page *page;
3371 	int err;
3372 	int ret = 0;
3373 	int locked_pages = 0;
3374 	int all_uptodate = 1;
3375 	int inc_all_pages = 0;
3376 	unsigned long num_pages;
3377 	struct bio *bio = NULL;
3378 	unsigned long bio_flags = 0;
3379 
3380 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3381 		return 0;
3382 
3383 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3384 			   EXTENT_UPTODATE, 1, NULL)) {
3385 		return 0;
3386 	}
3387 
3388 	if (start) {
3389 		WARN_ON(start < eb->start);
3390 		start_i = (start >> PAGE_CACHE_SHIFT) -
3391 			(eb->start >> PAGE_CACHE_SHIFT);
3392 	} else {
3393 		start_i = 0;
3394 	}
3395 
3396 	num_pages = num_extent_pages(eb->start, eb->len);
3397 	for (i = start_i; i < num_pages; i++) {
3398 		page = extent_buffer_page(eb, i);
3399 		if (!wait) {
3400 			if (!trylock_page(page))
3401 				goto unlock_exit;
3402 		} else {
3403 			lock_page(page);
3404 		}
3405 		locked_pages++;
3406 		if (!PageUptodate(page))
3407 			all_uptodate = 0;
3408 	}
3409 	if (all_uptodate) {
3410 		if (start_i == 0)
3411 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3412 		goto unlock_exit;
3413 	}
3414 
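	/*
	 * kick off reads for the pages that are not uptodate; pages that
	 * are already uptodate are simply unlocked again
	 */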
3415 	for (i = start_i; i < num_pages; i++) {
3416 		page = extent_buffer_page(eb, i);
3417 		if (inc_all_pages)
3418 			page_cache_get(page);
3419 		if (!PageUptodate(page)) {
3420 			if (start_i == 0)
3421 				inc_all_pages = 1;
3422 			ClearPageError(page);
3423 			err = __extent_read_full_page(tree, page,
3424 						      get_extent, &bio,
3425 						      mirror_num, &bio_flags);
3426 			if (err)
3427 				ret = err;
3428 		} else {
3429 			unlock_page(page);
3430 		}
3431 	}
3432 
3433 	if (bio)
3434 		submit_one_bio(READ, bio, mirror_num, bio_flags);
3435 
3436 	if (ret || !wait)
3437 		return ret;
3438 
3439 	for (i = start_i; i < num_pages; i++) {
3440 		page = extent_buffer_page(eb, i);
3441 		wait_on_page_locked(page);
3442 		if (!PageUptodate(page))
3443 			ret = -EIO;
3444 	}
3445 
3446 	if (!ret)
3447 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3448 	return ret;
3449 
3450 unlock_exit:
3451 	i = start_i;
3452 	while (locked_pages > 0) {
3453 		page = extent_buffer_page(eb, i);
3454 		i++;
3455 		unlock_page(page);
3456 		locked_pages--;
3457 	}
3458 	return ret;
3459 }
3460 
3461 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3462 			unsigned long start,
3463 			unsigned long len)
3464 {
3465 	size_t cur;
3466 	size_t offset;
3467 	struct page *page;
3468 	char *kaddr;
3469 	char *dst = (char *)dstv;
3470 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3471 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3472 
3473 	WARN_ON(start > eb->len);
3474 	WARN_ON(start + len > eb->start + eb->len);
3475 
3476 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3477 
3478 	while (len > 0) {
3479 		page = extent_buffer_page(eb, i);
3480 
3481 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3482 		kaddr = kmap_atomic(page, KM_USER1);
3483 		memcpy(dst, kaddr + offset, cur);
3484 		kunmap_atomic(kaddr, KM_USER1);
3485 
3486 		dst += cur;
3487 		len -= cur;
3488 		offset = 0;
3489 		i++;
3490 	}
3491 }
3492 
3493 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3494 			       unsigned long min_len, char **token, char **map,
3495 			       unsigned long *map_start,
3496 			       unsigned long *map_len, int km)
3497 {
3498 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3499 	char *kaddr;
3500 	struct page *p;
3501 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3502 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3503 	unsigned long end_i = (start_offset + start + min_len - 1) >>
3504 		PAGE_CACHE_SHIFT;
3505 
3506 	if (i != end_i)
3507 		return -EINVAL;
3508 
3509 	if (i == 0) {
3510 		offset = start_offset;
3511 		*map_start = 0;
3512 	} else {
3513 		offset = 0;
3514 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3515 	}
3516 
3517 	if (start + min_len > eb->len) {
3518 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3519 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3520 		       eb->len, start, min_len);
3521 		WARN_ON(1);
3522 	}
3523 
3524 	p = extent_buffer_page(eb, i);
3525 	kaddr = kmap_atomic(p, km);
3526 	*token = kaddr;
3527 	*map = kaddr + offset;
3528 	*map_len = PAGE_CACHE_SIZE - offset;
3529 	return 0;
3530 }
3531 
3532 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3533 		      unsigned long min_len,
3534 		      char **token, char **map,
3535 		      unsigned long *map_start,
3536 		      unsigned long *map_len, int km)
3537 {
3538 	int err;
3539 	int save = 0;
3540 	if (eb->map_token) {
3541 		unmap_extent_buffer(eb, eb->map_token, km);
3542 		eb->map_token = NULL;
3543 		save = 1;
3544 	}
3545 	err = map_private_extent_buffer(eb, start, min_len, token, map,
3546 				       map_start, map_len, km);
3547 	if (!err && save) {
3548 		eb->map_token = *token;
3549 		eb->kaddr = *map;
3550 		eb->map_start = *map_start;
3551 		eb->map_len = *map_len;
3552 	}
3553 	return err;
3554 }
3555 
3556 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3557 {
3558 	kunmap_atomic(token, km);
3559 }
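
/*
 * Illustrative use of the mapping helpers above (a sketch only, not taken
 * from a specific caller; the "start" offset and the KM_USER0 slot are
 * assumptions for the example):
 *
 *	char *token;
 *	char *mapped;
 *	unsigned long map_start;
 *	unsigned long map_len;
 *	u64 val;
 *
 *	if (!map_private_extent_buffer(eb, start, sizeof(val), &token,
 *				       &mapped, &map_start, &map_len,
 *				       KM_USER0)) {
 *		memcpy(&val, mapped + (start - map_start), sizeof(val));
 *		unmap_extent_buffer(eb, token, KM_USER0);
 *	}
 *
 * When the requested range straddles a page boundary the map call fails
 * with -EINVAL and a caller can fall back to read_extent_buffer().
 */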
3560 
3561 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3562 			  unsigned long start,
3563 			  unsigned long len)
3564 {
3565 	size_t cur;
3566 	size_t offset;
3567 	struct page *page;
3568 	char *kaddr;
3569 	char *ptr = (char *)ptrv;
3570 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3571 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3572 	int ret = 0;
3573 
3574 	WARN_ON(start > eb->len);
3575 	WARN_ON(start + len > eb->start + eb->len);
3576 
3577 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3578 
3579 	while (len > 0) {
3580 		page = extent_buffer_page(eb, i);
3581 
3582 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3583 
3584 		kaddr = kmap_atomic(page, KM_USER0);
3585 		ret = memcmp(ptr, kaddr + offset, cur);
3586 		kunmap_atomic(kaddr, KM_USER0);
3587 		if (ret)
3588 			break;
3589 
3590 		ptr += cur;
3591 		len -= cur;
3592 		offset = 0;
3593 		i++;
3594 	}
3595 	return ret;
3596 }
3597 
3598 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3599 			 unsigned long start, unsigned long len)
3600 {
3601 	size_t cur;
3602 	size_t offset;
3603 	struct page *page;
3604 	char *kaddr;
3605 	char *src = (char *)srcv;
3606 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3607 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3608 
3609 	WARN_ON(start > eb->len);
3610 	WARN_ON(start + len > eb->start + eb->len);
3611 
3612 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3613 
3614 	while (len > 0) {
3615 		page = extent_buffer_page(eb, i);
3616 		WARN_ON(!PageUptodate(page));
3617 
3618 		cur = min(len, PAGE_CACHE_SIZE - offset);
3619 		kaddr = kmap_atomic(page, KM_USER1);
3620 		memcpy(kaddr + offset, src, cur);
3621 		kunmap_atomic(kaddr, KM_USER1);
3622 
3623 		src += cur;
3624 		len -= cur;
3625 		offset = 0;
3626 		i++;
3627 	}
3628 }
3629 
3630 void memset_extent_buffer(struct extent_buffer *eb, char c,
3631 			  unsigned long start, unsigned long len)
3632 {
3633 	size_t cur;
3634 	size_t offset;
3635 	struct page *page;
3636 	char *kaddr;
3637 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3638 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3639 
3640 	WARN_ON(start > eb->len);
3641 	WARN_ON(start + len > eb->start + eb->len);
3642 
3643 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3644 
3645 	while (len > 0) {
3646 		page = extent_buffer_page(eb, i);
3647 		WARN_ON(!PageUptodate(page));
3648 
3649 		cur = min(len, PAGE_CACHE_SIZE - offset);
3650 		kaddr = kmap_atomic(page, KM_USER0);
3651 		memset(kaddr + offset, c, cur);
3652 		kunmap_atomic(kaddr, KM_USER0);
3653 
3654 		len -= cur;
3655 		offset = 0;
3656 		i++;
3657 	}
3658 }
3659 
3660 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3661 			unsigned long dst_offset, unsigned long src_offset,
3662 			unsigned long len)
3663 {
3664 	u64 dst_len = dst->len;
3665 	size_t cur;
3666 	size_t offset;
3667 	struct page *page;
3668 	char *kaddr;
3669 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3670 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3671 
3672 	WARN_ON(src->len != dst_len);
3673 
3674 	offset = (start_offset + dst_offset) &
3675 		((unsigned long)PAGE_CACHE_SIZE - 1);
3676 
3677 	while (len > 0) {
3678 		page = extent_buffer_page(dst, i);
3679 		WARN_ON(!PageUptodate(page));
3680 
3681 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3682 
3683 		kaddr = kmap_atomic(page, KM_USER0);
3684 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3685 		kunmap_atomic(kaddr, KM_USER0);
3686 
3687 		src_offset += cur;
3688 		len -= cur;
3689 		offset = 0;
3690 		i++;
3691 	}
3692 }
3693 
3694 static void move_pages(struct page *dst_page, struct page *src_page,
3695 		       unsigned long dst_off, unsigned long src_off,
3696 		       unsigned long len)
3697 {
3698 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3699 	if (dst_page == src_page) {
3700 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3701 	} else {
3702 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3703 		char *p = dst_kaddr + dst_off + len;
3704 		char *s = src_kaddr + src_off + len;
3705 
3706 		while (len--)
3707 			*--p = *--s;
3708 
3709 		kunmap_atomic(src_kaddr, KM_USER1);
3710 	}
3711 	kunmap_atomic(dst_kaddr, KM_USER0);
3712 }
3713 
3714 static void copy_pages(struct page *dst_page, struct page *src_page,
3715 		       unsigned long dst_off, unsigned long src_off,
3716 		       unsigned long len)
3717 {
3718 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3719 	char *src_kaddr;
3720 
3721 	if (dst_page != src_page)
3722 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3723 	else
3724 		src_kaddr = dst_kaddr;
3725 
3726 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3727 	kunmap_atomic(dst_kaddr, KM_USER0);
3728 	if (dst_page != src_page)
3729 		kunmap_atomic(src_kaddr, KM_USER1);
3730 }
3731 
3732 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3733 			   unsigned long src_offset, unsigned long len)
3734 {
3735 	size_t cur;
3736 	size_t dst_off_in_page;
3737 	size_t src_off_in_page;
3738 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3739 	unsigned long dst_i;
3740 	unsigned long src_i;
3741 
3742 	if (src_offset + len > dst->len) {
3743 		printk(KERN_ERR "btrfs memcpy bogus src_offset %lu move "
3744 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3745 		BUG_ON(1);
3746 	}
3747 	if (dst_offset + len > dst->len) {
3748 		printk(KERN_ERR "btrfs memcpy bogus dst_offset %lu move "
3749 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3750 		BUG_ON(1);
3751 	}
3752 
3753 	while (len > 0) {
3754 		dst_off_in_page = (start_offset + dst_offset) &
3755 			((unsigned long)PAGE_CACHE_SIZE - 1);
3756 		src_off_in_page = (start_offset + src_offset) &
3757 			((unsigned long)PAGE_CACHE_SIZE - 1);
3758 
3759 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3760 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3761 
3762 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3763 					       src_off_in_page));
3764 		cur = min_t(unsigned long, cur,
3765 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3766 
3767 		copy_pages(extent_buffer_page(dst, dst_i),
3768 			   extent_buffer_page(dst, src_i),
3769 			   dst_off_in_page, src_off_in_page, cur);
3770 
3771 		src_offset += cur;
3772 		dst_offset += cur;
3773 		len -= cur;
3774 	}
3775 }
3776 
3777 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3778 			   unsigned long src_offset, unsigned long len)
3779 {
3780 	size_t cur;
3781 	size_t dst_off_in_page;
3782 	size_t src_off_in_page;
3783 	unsigned long dst_end = dst_offset + len - 1;
3784 	unsigned long src_end = src_offset + len - 1;
3785 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3786 	unsigned long dst_i;
3787 	unsigned long src_i;
3788 
3789 	if (src_offset + len > dst->len) {
3790 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3791 		       "len %lu len %lu\n", src_offset, len, dst->len);
3792 		BUG_ON(1);
3793 	}
3794 	if (dst_offset + len > dst->len) {
3795 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3796 		       "len %lu len %lu\n", dst_offset, len, dst->len);
3797 		BUG_ON(1);
3798 	}
3799 	if (dst_offset < src_offset) {
3800 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3801 		return;
3802 	}
3803 	while (len > 0) {
3804 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3805 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3806 
3807 		dst_off_in_page = (start_offset + dst_end) &
3808 			((unsigned long)PAGE_CACHE_SIZE - 1);
3809 		src_off_in_page = (start_offset + src_end) &
3810 			((unsigned long)PAGE_CACHE_SIZE - 1);
3811 
3812 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3813 		cur = min(cur, dst_off_in_page + 1);
3814 		move_pages(extent_buffer_page(dst, dst_i),
3815 			   extent_buffer_page(dst, src_i),
3816 			   dst_off_in_page - cur + 1,
3817 			   src_off_in_page - cur + 1, cur);
3818 
3819 		dst_end -= cur;
3820 		src_end -= cur;
3821 		len -= cur;
3822 	}
3823 }
3824 
3825 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3826 {
3827 	u64 start = page_offset(page);
3828 	struct extent_buffer *eb;
3829 	int ret = 1;
3830 	unsigned long i;
3831 	unsigned long num_pages;
3832 
3833 	spin_lock(&tree->buffer_lock);
3834 	eb = buffer_search(tree, start);
3835 	if (!eb)
3836 		goto out;
3837 
3838 	if (atomic_read(&eb->refs) > 1) {
3839 		ret = 0;
3840 		goto out;
3841 	}
3842 	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3843 		ret = 0;
3844 		goto out;
3845 	}
3846 	/* at this point we can safely release the extent buffer */
3847 	num_pages = num_extent_pages(eb->start, eb->len);
3848 	for (i = 0; i < num_pages; i++)
3849 		page_cache_release(extent_buffer_page(eb, i));
3850 	rb_erase(&eb->rb_node, &tree->buffer);
3851 	__free_extent_buffer(eb);
3852 out:
3853 	spin_unlock(&tree->buffer_lock);
3854 	return ret;
3855 }
3856