xref: /openbmc/linux/fs/btrfs/extent_io.c (revision b8bb76713ec50df2f11efee386e16f93d51e1076)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/writeback.h>
13 #include <linux/pagevec.h>
14 #include "extent_io.h"
15 #include "extent_map.h"
16 #include "compat.h"
17 #include "ctree.h"
18 #include "btrfs_inode.h"
19 
20 /* temporary define until extent_map moves out of btrfs */
21 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
22 				       unsigned long extra_flags,
23 				       void (*ctor)(void *, struct kmem_cache *,
24 						    unsigned long));
25 
26 static struct kmem_cache *extent_state_cache;
27 static struct kmem_cache *extent_buffer_cache;
28 
29 static LIST_HEAD(buffers);
30 static LIST_HEAD(states);
31 
32 #define LEAK_DEBUG 0
33 #if LEAK_DEBUG
34 static DEFINE_SPINLOCK(leak_lock);
35 #endif
36 
37 #define BUFFER_LRU_MAX 64
38 
39 struct tree_entry {
40 	u64 start;
41 	u64 end;
42 	struct rb_node rb_node;
43 };
44 
45 struct extent_page_data {
46 	struct bio *bio;
47 	struct extent_io_tree *tree;
48 	get_extent_t *get_extent;
49 
50 	/* tells writepage not to lock the state bits for this range
51 	 * it still does the unlocking
52 	 */
53 	int extent_locked;
54 };
55 
56 int __init extent_io_init(void)
57 {
58 	extent_state_cache = btrfs_cache_create("extent_state",
59 					    sizeof(struct extent_state), 0,
60 					    NULL);
61 	if (!extent_state_cache)
62 		return -ENOMEM;
63 
64 	extent_buffer_cache = btrfs_cache_create("extent_buffers",
65 					    sizeof(struct extent_buffer), 0,
66 					    NULL);
67 	if (!extent_buffer_cache)
68 		goto free_state_cache;
69 	return 0;
70 
71 free_state_cache:
72 	kmem_cache_destroy(extent_state_cache);
73 	return -ENOMEM;
74 }
75 
76 void extent_io_exit(void)
77 {
78 	struct extent_state *state;
79 	struct extent_buffer *eb;
80 
81 	while (!list_empty(&states)) {
82 		state = list_entry(states.next, struct extent_state, leak_list);
83 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
84 		       "state %lu in tree %p refs %d\n",
85 		       (unsigned long long)state->start,
86 		       (unsigned long long)state->end,
87 		       state->state, state->tree, atomic_read(&state->refs));
88 		list_del(&state->leak_list);
89 		kmem_cache_free(extent_state_cache, state);
90 
91 	}
92 
93 	while (!list_empty(&buffers)) {
94 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
95 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
96 		       "refs %d\n", (unsigned long long)eb->start,
97 		       eb->len, atomic_read(&eb->refs));
98 		list_del(&eb->leak_list);
99 		kmem_cache_free(extent_buffer_cache, eb);
100 	}
101 	if (extent_state_cache)
102 		kmem_cache_destroy(extent_state_cache);
103 	if (extent_buffer_cache)
104 		kmem_cache_destroy(extent_buffer_cache);
105 }
106 
107 void extent_io_tree_init(struct extent_io_tree *tree,
108 			  struct address_space *mapping, gfp_t mask)
109 {
110 	tree->state.rb_node = NULL;
111 	tree->buffer.rb_node = NULL;
112 	tree->ops = NULL;
113 	tree->dirty_bytes = 0;
114 	spin_lock_init(&tree->lock);
115 	spin_lock_init(&tree->buffer_lock);
116 	tree->mapping = mapping;
117 }
118 
119 static struct extent_state *alloc_extent_state(gfp_t mask)
120 {
121 	struct extent_state *state;
122 #if LEAK_DEBUG
123 	unsigned long flags;
124 #endif
125 
126 	state = kmem_cache_alloc(extent_state_cache, mask);
127 	if (!state)
128 		return state;
129 	state->state = 0;
130 	state->private = 0;
131 	state->tree = NULL;
132 #if LEAK_DEBUG
133 	spin_lock_irqsave(&leak_lock, flags);
134 	list_add(&state->leak_list, &states);
135 	spin_unlock_irqrestore(&leak_lock, flags);
136 #endif
137 	atomic_set(&state->refs, 1);
138 	init_waitqueue_head(&state->wq);
139 	return state;
140 }
141 
142 static void free_extent_state(struct extent_state *state)
143 {
144 	if (!state)
145 		return;
146 	if (atomic_dec_and_test(&state->refs)) {
147 #if LEAK_DEBUG
148 		unsigned long flags;
149 #endif
150 		WARN_ON(state->tree);
151 #if LEAK_DEBUG
152 		spin_lock_irqsave(&leak_lock, flags);
153 		list_del(&state->leak_list);
154 		spin_unlock_irqrestore(&leak_lock, flags);
155 #endif
156 		kmem_cache_free(extent_state_cache, state);
157 	}
158 }
159 
160 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
161 				   struct rb_node *node)
162 {
163 	struct rb_node **p = &root->rb_node;
164 	struct rb_node *parent = NULL;
165 	struct tree_entry *entry;
166 
167 	while (*p) {
168 		parent = *p;
169 		entry = rb_entry(parent, struct tree_entry, rb_node);
170 
171 		if (offset < entry->start)
172 			p = &(*p)->rb_left;
173 		else if (offset > entry->end)
174 			p = &(*p)->rb_right;
175 		else
176 			return parent;
177 	}
178 
179 	entry = rb_entry(node, struct tree_entry, rb_node);
180 	rb_link_node(node, parent, p);
181 	rb_insert_color(node, root);
182 	return NULL;
183 }
184 
185 static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset,
186 				     struct rb_node **prev_ret,
187 				     struct rb_node **next_ret)
188 {
189 	struct rb_root *root = &tree->state;
190 	struct rb_node *n = root->rb_node;
191 	struct rb_node *prev = NULL;
192 	struct rb_node *orig_prev = NULL;
193 	struct tree_entry *entry;
194 	struct tree_entry *prev_entry = NULL;
195 
196 	while (n) {
197 		entry = rb_entry(n, struct tree_entry, rb_node);
198 		prev = n;
199 		prev_entry = entry;
200 
201 		if (offset < entry->start)
202 			n = n->rb_left;
203 		else if (offset > entry->end)
204 			n = n->rb_right;
205 		else
206 			return n;
207 	}
208 
209 	if (prev_ret) {
210 		orig_prev = prev;
211 		while (prev && offset > prev_entry->end) {
212 			prev = rb_next(prev);
213 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
214 		}
215 		*prev_ret = prev;
216 		prev = orig_prev;
217 	}
218 
219 	if (next_ret) {
220 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
221 		while (prev && offset < prev_entry->start) {
222 			prev = rb_prev(prev);
223 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
224 		}
225 		*next_ret = prev;
226 	}
227 	return NULL;
228 }
229 
230 static inline struct rb_node *tree_search(struct extent_io_tree *tree,
231 					  u64 offset)
232 {
233 	struct rb_node *prev = NULL;
234 	struct rb_node *ret;
235 
236 	ret = __etree_search(tree, offset, &prev, NULL);
237 	if (!ret)
238 		return prev;
239 	return ret;
240 }
241 
242 static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
243 					  u64 offset, struct rb_node *node)
244 {
245 	struct rb_root *root = &tree->buffer;
246 	struct rb_node **p = &root->rb_node;
247 	struct rb_node *parent = NULL;
248 	struct extent_buffer *eb;
249 
250 	while (*p) {
251 		parent = *p;
252 		eb = rb_entry(parent, struct extent_buffer, rb_node);
253 
254 		if (offset < eb->start)
255 			p = &(*p)->rb_left;
256 		else if (offset > eb->start)
257 			p = &(*p)->rb_right;
258 		else
259 			return eb;
260 	}
261 
262 	rb_link_node(node, parent, p);
263 	rb_insert_color(node, root);
264 	return NULL;
265 }
266 
267 static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
268 					   u64 offset)
269 {
270 	struct rb_root *root = &tree->buffer;
271 	struct rb_node *n = root->rb_node;
272 	struct extent_buffer *eb;
273 
274 	while (n) {
275 		eb = rb_entry(n, struct extent_buffer, rb_node);
276 		if (offset < eb->start)
277 			n = n->rb_left;
278 		else if (offset > eb->start)
279 			n = n->rb_right;
280 		else
281 			return eb;
282 	}
283 	return NULL;
284 }
285 
286 /*
287  * utility function to look for merge candidates inside a given range.
288  * Any extents with matching state are merged together into a single
289  * extent in the tree.  Extents with EXTENT_IO in their state field
290  * are not merged because the end_io handlers need to be able to do
291  * operations on them without sleeping (or doing allocations/splits).
292  *
293  * This should be called with the tree lock held.
294  */
295 static int merge_state(struct extent_io_tree *tree,
296 		       struct extent_state *state)
297 {
298 	struct extent_state *other;
299 	struct rb_node *other_node;
300 
301 	if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY))
302 		return 0;
303 
304 	other_node = rb_prev(&state->rb_node);
305 	if (other_node) {
306 		other = rb_entry(other_node, struct extent_state, rb_node);
307 		if (other->end == state->start - 1 &&
308 		    other->state == state->state) {
309 			state->start = other->start;
310 			other->tree = NULL;
311 			rb_erase(&other->rb_node, &tree->state);
312 			free_extent_state(other);
313 		}
314 	}
315 	other_node = rb_next(&state->rb_node);
316 	if (other_node) {
317 		other = rb_entry(other_node, struct extent_state, rb_node);
318 		if (other->start == state->end + 1 &&
319 		    other->state == state->state) {
320 			other->start = state->start;
321 			state->tree = NULL;
322 			rb_erase(&state->rb_node, &tree->state);
323 			free_extent_state(state);
324 		}
325 	}
326 	return 0;
327 }
328 
329 static void set_state_cb(struct extent_io_tree *tree,
330 			 struct extent_state *state,
331 			 unsigned long bits)
332 {
333 	if (tree->ops && tree->ops->set_bit_hook) {
334 		tree->ops->set_bit_hook(tree->mapping->host, state->start,
335 					state->end, state->state, bits);
336 	}
337 }
338 
339 static void clear_state_cb(struct extent_io_tree *tree,
340 			   struct extent_state *state,
341 			   unsigned long bits)
342 {
343 	if (tree->ops && tree->ops->clear_bit_hook) {
344 		tree->ops->clear_bit_hook(tree->mapping->host, state->start,
345 					  state->end, state->state, bits);
346 	}
347 }
348 
349 /*
350  * insert an extent_state struct into the tree.  'bits' are set on the
351  * struct before it is inserted.
352  *
353  * This may return -EEXIST if the extent is already there, in which case the
354  * state struct is freed.
355  *
356  * The tree lock is not taken internally.  This is a utility function and
357  * probably isn't what you want to call (see set/clear_extent_bit).
358  */
359 static int insert_state(struct extent_io_tree *tree,
360 			struct extent_state *state, u64 start, u64 end,
361 			int bits)
362 {
363 	struct rb_node *node;
364 
365 	if (end < start) {
366 		printk(KERN_ERR "btrfs end < start %llu %llu\n",
367 		       (unsigned long long)end,
368 		       (unsigned long long)start);
369 		WARN_ON(1);
370 	}
371 	if (bits & EXTENT_DIRTY)
372 		tree->dirty_bytes += end - start + 1;
373 	set_state_cb(tree, state, bits);
374 	state->state |= bits;
375 	state->start = start;
376 	state->end = end;
377 	node = tree_insert(&tree->state, end, &state->rb_node);
378 	if (node) {
379 		struct extent_state *found;
380 		found = rb_entry(node, struct extent_state, rb_node);
381 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
382 		       "%llu %llu\n", (unsigned long long)found->start,
383 		       (unsigned long long)found->end,
384 		       (unsigned long long)start, (unsigned long long)end);
385 		free_extent_state(state);
386 		return -EEXIST;
387 	}
388 	state->tree = tree;
389 	merge_state(tree, state);
390 	return 0;
391 }
392 
393 /*
394  * split a given extent state struct in two, inserting the preallocated
395  * struct 'prealloc' as the newly created second half.  'split' indicates an
396  * offset inside 'orig' where it should be split.
397  *
398  * Before calling,
399  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
400  * are two extent state structs in the tree:
401  * prealloc: [orig->start, split - 1]
402  * orig: [ split, orig->end ]
403  *
404  * The tree locks are not taken by this function. They need to be held
405  * by the caller.
406  */
407 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
408 		       struct extent_state *prealloc, u64 split)
409 {
410 	struct rb_node *node;
411 	prealloc->start = orig->start;
412 	prealloc->end = split - 1;
413 	prealloc->state = orig->state;
414 	orig->start = split;
415 
416 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
417 	if (node) {
418 		free_extent_state(prealloc);
419 		return -EEXIST;
420 	}
421 	prealloc->tree = tree;
422 	return 0;
423 }
424 
425 /*
426  * utility function to clear some bits in an extent state struct.
427  * it will optionally wake up any one waiting on this state (wake == 1), or
428  * forcibly remove the state from the tree (delete == 1).
429  *
430  * If no bits are set on the state struct after clearing things, the
431  * struct is freed and removed from the tree
432  */
433 static int clear_state_bit(struct extent_io_tree *tree,
434 			    struct extent_state *state, int bits, int wake,
435 			    int delete)
436 {
437 	int ret = state->state & bits;
438 
439 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
440 		u64 range = state->end - state->start + 1;
441 		WARN_ON(range > tree->dirty_bytes);
442 		tree->dirty_bytes -= range;
443 	}
444 	clear_state_cb(tree, state, bits);
445 	state->state &= ~bits;
446 	if (wake)
447 		wake_up(&state->wq);
448 	if (delete || state->state == 0) {
449 		if (state->tree) {
450 			clear_state_cb(tree, state, state->state);
451 			rb_erase(&state->rb_node, &tree->state);
452 			state->tree = NULL;
453 			free_extent_state(state);
454 		} else {
455 			WARN_ON(1);
456 		}
457 	} else {
458 		merge_state(tree, state);
459 	}
460 	return ret;
461 }
462 
463 /*
464  * clear some bits on a range in the tree.  This may require splitting
465  * or inserting elements in the tree, so the gfp mask is used to
466  * indicate which allocations or sleeping are allowed.
467  *
468  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
469  * the given range from the tree regardless of state (ie for truncate).
470  *
471  * the range [start, end] is inclusive.
472  *
473  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
474  * bits were already set, or zero if none of the bits were already set.
475  */
476 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
477 		     int bits, int wake, int delete, gfp_t mask)
478 {
479 	struct extent_state *state;
480 	struct extent_state *prealloc = NULL;
481 	struct rb_node *node;
482 	int err;
483 	int set = 0;
484 
485 again:
486 	if (!prealloc && (mask & __GFP_WAIT)) {
487 		prealloc = alloc_extent_state(mask);
488 		if (!prealloc)
489 			return -ENOMEM;
490 	}
491 
492 	spin_lock(&tree->lock);
493 	/*
494 	 * this search will find the extents that end after
495 	 * our range starts
496 	 */
497 	node = tree_search(tree, start);
498 	if (!node)
499 		goto out;
500 	state = rb_entry(node, struct extent_state, rb_node);
501 	if (state->start > end)
502 		goto out;
503 	WARN_ON(state->end < start);
504 
505 	/*
506 	 *     | ---- desired range ---- |
507 	 *  | state | or
508 	 *  | ------------- state -------------- |
509 	 *
510 	 * We need to split the extent we found, and may flip
511 	 * bits on second half.
512 	 *
513 	 * If the extent we found extends past our range, we
514 	 * just split and search again.  It'll get split again
515 	 * the next time though.
516 	 *
517 	 * If the extent we found is inside our range, we clear
518 	 * the desired bit on it.
519 	 */
520 
521 	if (state->start < start) {
522 		if (!prealloc)
523 			prealloc = alloc_extent_state(GFP_ATOMIC);
524 		err = split_state(tree, state, prealloc, start);
525 		BUG_ON(err == -EEXIST);
526 		prealloc = NULL;
527 		if (err)
528 			goto out;
529 		if (state->end <= end) {
530 			start = state->end + 1;
531 			set |= clear_state_bit(tree, state, bits,
532 					wake, delete);
533 		} else {
534 			start = state->start;
535 		}
536 		goto search_again;
537 	}
538 	/*
539 	 * | ---- desired range ---- |
540 	 *                        | state |
541 	 * We need to split the extent, and clear the bit
542 	 * on the first half
543 	 */
544 	if (state->start <= end && state->end > end) {
545 		if (!prealloc)
546 			prealloc = alloc_extent_state(GFP_ATOMIC);
547 		err = split_state(tree, state, prealloc, end + 1);
548 		BUG_ON(err == -EEXIST);
549 
550 		if (wake)
551 			wake_up(&state->wq);
552 		set |= clear_state_bit(tree, prealloc, bits,
553 				       wake, delete);
554 		prealloc = NULL;
555 		goto out;
556 	}
557 
558 	start = state->end + 1;
559 	set |= clear_state_bit(tree, state, bits, wake, delete);
560 	goto search_again;
561 
562 out:
563 	spin_unlock(&tree->lock);
564 	if (prealloc)
565 		free_extent_state(prealloc);
566 
567 	return set;
568 
569 search_again:
570 	if (start > end)
571 		goto out;
572 	spin_unlock(&tree->lock);
573 	if (mask & __GFP_WAIT)
574 		cond_resched();
575 	goto again;
576 }
577 
578 static int wait_on_state(struct extent_io_tree *tree,
579 			 struct extent_state *state)
580 		__releases(tree->lock)
581 		__acquires(tree->lock)
582 {
583 	DEFINE_WAIT(wait);
584 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
585 	spin_unlock(&tree->lock);
586 	schedule();
587 	spin_lock(&tree->lock);
588 	finish_wait(&state->wq, &wait);
589 	return 0;
590 }
591 
592 /*
593  * waits for one or more bits to clear on a range in the state tree.
594  * The range [start, end] is inclusive.
595  * The tree lock is taken by this function
596  */
597 int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
598 {
599 	struct extent_state *state;
600 	struct rb_node *node;
601 
602 	spin_lock(&tree->lock);
603 again:
604 	while (1) {
605 		/*
606 		 * this search will find all the extents that end after
607 		 * our range starts
608 		 */
609 		node = tree_search(tree, start);
610 		if (!node)
611 			break;
612 
613 		state = rb_entry(node, struct extent_state, rb_node);
614 
615 		if (state->start > end)
616 			goto out;
617 
618 		if (state->state & bits) {
619 			start = state->start;
620 			atomic_inc(&state->refs);
621 			wait_on_state(tree, state);
622 			free_extent_state(state);
623 			goto again;
624 		}
625 		start = state->end + 1;
626 
627 		if (start > end)
628 			break;
629 
630 		if (need_resched()) {
631 			spin_unlock(&tree->lock);
632 			cond_resched();
633 			spin_lock(&tree->lock);
634 		}
635 	}
636 out:
637 	spin_unlock(&tree->lock);
638 	return 0;
639 }
640 
641 static void set_state_bits(struct extent_io_tree *tree,
642 			   struct extent_state *state,
643 			   int bits)
644 {
645 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
646 		u64 range = state->end - state->start + 1;
647 		tree->dirty_bytes += range;
648 	}
649 	set_state_cb(tree, state, bits);
650 	state->state |= bits;
651 }
652 
653 /*
654  * set some bits on a range in the tree.  This may require allocations
655  * or sleeping, so the gfp mask is used to indicate what is allowed.
656  *
657  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
658  * range already has the desired bits set.  The start of the existing
659  * range is returned in failed_start in this case.
660  *
661  * [start, end] is inclusive
662  * This takes the tree lock.
663  */
664 static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
665 			  int bits, int exclusive, u64 *failed_start,
666 			  gfp_t mask)
667 {
668 	struct extent_state *state;
669 	struct extent_state *prealloc = NULL;
670 	struct rb_node *node;
671 	int err = 0;
672 	int set;
673 	u64 last_start;
674 	u64 last_end;
675 again:
676 	if (!prealloc && (mask & __GFP_WAIT)) {
677 		prealloc = alloc_extent_state(mask);
678 		if (!prealloc)
679 			return -ENOMEM;
680 	}
681 
682 	spin_lock(&tree->lock);
683 	/*
684 	 * this search will find all the extents that end after
685 	 * our range starts.
686 	 */
687 	node = tree_search(tree, start);
688 	if (!node) {
689 		err = insert_state(tree, prealloc, start, end, bits);
690 		prealloc = NULL;
691 		BUG_ON(err == -EEXIST);
692 		goto out;
693 	}
694 
695 	state = rb_entry(node, struct extent_state, rb_node);
696 	last_start = state->start;
697 	last_end = state->end;
698 
699 	/*
700 	 * | ---- desired range ---- |
701 	 * | state |
702 	 *
703 	 * Just lock what we found and keep going
704 	 */
705 	if (state->start == start && state->end <= end) {
706 		set = state->state & bits;
707 		if (set && exclusive) {
708 			*failed_start = state->start;
709 			err = -EEXIST;
710 			goto out;
711 		}
712 		set_state_bits(tree, state, bits);
713 		start = state->end + 1;
714 		merge_state(tree, state);
715 		goto search_again;
716 	}
717 
718 	/*
719 	 *     | ---- desired range ---- |
720 	 * | state |
721 	 *   or
722 	 * | ------------- state -------------- |
723 	 *
724 	 * We need to split the extent we found, and may flip bits on
725 	 * second half.
726 	 *
727 	 * If the extent we found extends past our
728 	 * range, we just split and search again.  It'll get split
729 	 * again the next time though.
730 	 *
731 	 * If the extent we found is inside our range, we set the
732 	 * desired bit on it.
733 	 */
734 	if (state->start < start) {
735 		set = state->state & bits;
736 		if (exclusive && set) {
737 			*failed_start = start;
738 			err = -EEXIST;
739 			goto out;
740 		}
741 		err = split_state(tree, state, prealloc, start);
742 		BUG_ON(err == -EEXIST);
743 		prealloc = NULL;
744 		if (err)
745 			goto out;
746 		if (state->end <= end) {
747 			set_state_bits(tree, state, bits);
748 			start = state->end + 1;
749 			merge_state(tree, state);
750 		} else {
751 			start = state->start;
752 		}
753 		goto search_again;
754 	}
755 	/*
756 	 * | ---- desired range ---- |
757 	 *     | state | or               | state |
758 	 *
759 	 * There's a hole, we need to insert something in it and
760 	 * ignore the extent we found.
761 	 */
762 	if (state->start > start) {
763 		u64 this_end;
764 		if (end < last_start)
765 			this_end = end;
766 		else
767 			this_end = last_start - 1;
768 		err = insert_state(tree, prealloc, start, this_end,
769 				   bits);
770 		prealloc = NULL;
771 		BUG_ON(err == -EEXIST);
772 		if (err)
773 			goto out;
774 		start = this_end + 1;
775 		goto search_again;
776 	}
777 	/*
778 	 * | ---- desired range ---- |
779 	 *                        | state |
780 	 * We need to split the extent, and set the bit
781 	 * on the first half
782 	 */
783 	if (state->start <= end && state->end > end) {
784 		set = state->state & bits;
785 		if (exclusive && set) {
786 			*failed_start = start;
787 			err = -EEXIST;
788 			goto out;
789 		}
790 		err = split_state(tree, state, prealloc, end + 1);
791 		BUG_ON(err == -EEXIST);
792 
793 		set_state_bits(tree, prealloc, bits);
794 		merge_state(tree, prealloc);
795 		prealloc = NULL;
796 		goto out;
797 	}
798 
799 	goto search_again;
800 
801 out:
802 	spin_unlock(&tree->lock);
803 	if (prealloc)
804 		free_extent_state(prealloc);
805 
806 	return err;
807 
808 search_again:
809 	if (start > end)
810 		goto out;
811 	spin_unlock(&tree->lock);
812 	if (mask & __GFP_WAIT)
813 		cond_resched();
814 	goto again;
815 }
816 
817 /* wrappers around set/clear extent bit */
818 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
819 		     gfp_t mask)
820 {
821 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
822 			      mask);
823 }
824 
825 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
826 		       gfp_t mask)
827 {
828 	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
829 }
830 
831 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
832 		    int bits, gfp_t mask)
833 {
834 	return set_extent_bit(tree, start, end, bits, 0, NULL,
835 			      mask);
836 }
837 
838 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
839 		      int bits, gfp_t mask)
840 {
841 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
842 }
843 
844 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
845 		     gfp_t mask)
846 {
847 	return set_extent_bit(tree, start, end,
848 			      EXTENT_DELALLOC | EXTENT_DIRTY,
849 			      0, NULL, mask);
850 }
851 
852 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
853 		       gfp_t mask)
854 {
855 	return clear_extent_bit(tree, start, end,
856 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
857 }
858 
859 int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
860 			 gfp_t mask)
861 {
862 	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
863 }
864 
865 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
866 		     gfp_t mask)
867 {
868 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
869 			      mask);
870 }
871 
872 static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
873 		       gfp_t mask)
874 {
875 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
876 }
877 
878 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
879 			gfp_t mask)
880 {
881 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
882 			      mask);
883 }
884 
885 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
886 				 u64 end, gfp_t mask)
887 {
888 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
889 }
890 
891 static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end,
892 			 gfp_t mask)
893 {
894 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
895 			      0, NULL, mask);
896 }
897 
898 static int clear_extent_writeback(struct extent_io_tree *tree, u64 start,
899 				  u64 end, gfp_t mask)
900 {
901 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
902 }
903 
904 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
905 {
906 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
907 }
908 
909 /*
910  * either insert or lock state struct between start and end use mask to tell
911  * us if waiting is desired.
912  */
913 int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
914 {
915 	int err;
916 	u64 failed_start;
917 	while (1) {
918 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
919 				     &failed_start, mask);
920 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
921 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
922 			start = failed_start;
923 		} else {
924 			break;
925 		}
926 		WARN_ON(start > end);
927 	}
928 	return err;
929 }
930 
931 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
932 		    gfp_t mask)
933 {
934 	int err;
935 	u64 failed_start;
936 
937 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
938 			     &failed_start, mask);
939 	if (err == -EEXIST) {
940 		if (failed_start > start)
941 			clear_extent_bit(tree, start, failed_start - 1,
942 					 EXTENT_LOCKED, 1, 0, mask);
943 		return 0;
944 	}
945 	return 1;
946 }
947 
948 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
949 		  gfp_t mask)
950 {
951 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
952 }
953 
954 /*
955  * helper function to set pages and extents in the tree dirty
956  */
957 int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
958 {
959 	unsigned long index = start >> PAGE_CACHE_SHIFT;
960 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
961 	struct page *page;
962 
963 	while (index <= end_index) {
964 		page = find_get_page(tree->mapping, index);
965 		BUG_ON(!page);
966 		__set_page_dirty_nobuffers(page);
967 		page_cache_release(page);
968 		index++;
969 	}
970 	set_extent_dirty(tree, start, end, GFP_NOFS);
971 	return 0;
972 }
973 
974 /*
975  * helper function to set both pages and extents in the tree writeback
976  */
977 static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
978 {
979 	unsigned long index = start >> PAGE_CACHE_SHIFT;
980 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
981 	struct page *page;
982 
983 	while (index <= end_index) {
984 		page = find_get_page(tree->mapping, index);
985 		BUG_ON(!page);
986 		set_page_writeback(page);
987 		page_cache_release(page);
988 		index++;
989 	}
990 	set_extent_writeback(tree, start, end, GFP_NOFS);
991 	return 0;
992 }
993 
994 /*
995  * find the first offset in the io tree with 'bits' set. zero is
996  * returned if we find something, and *start_ret and *end_ret are
997  * set to reflect the state struct that was found.
998  *
999  * If nothing was found, 1 is returned, < 0 on error
1000  */
1001 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1002 			  u64 *start_ret, u64 *end_ret, int bits)
1003 {
1004 	struct rb_node *node;
1005 	struct extent_state *state;
1006 	int ret = 1;
1007 
1008 	spin_lock(&tree->lock);
1009 	/*
1010 	 * this search will find all the extents that end after
1011 	 * our range starts.
1012 	 */
1013 	node = tree_search(tree, start);
1014 	if (!node)
1015 		goto out;
1016 
1017 	while (1) {
1018 		state = rb_entry(node, struct extent_state, rb_node);
1019 		if (state->end >= start && (state->state & bits)) {
1020 			*start_ret = state->start;
1021 			*end_ret = state->end;
1022 			ret = 0;
1023 			break;
1024 		}
1025 		node = rb_next(node);
1026 		if (!node)
1027 			break;
1028 	}
1029 out:
1030 	spin_unlock(&tree->lock);
1031 	return ret;
1032 }
1033 
1034 /* find the first state struct with 'bits' set after 'start', and
1035  * return it.  tree->lock must be held.  NULL will returned if
1036  * nothing was found after 'start'
1037  */
1038 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
1039 						 u64 start, int bits)
1040 {
1041 	struct rb_node *node;
1042 	struct extent_state *state;
1043 
1044 	/*
1045 	 * this search will find all the extents that end after
1046 	 * our range starts.
1047 	 */
1048 	node = tree_search(tree, start);
1049 	if (!node)
1050 		goto out;
1051 
1052 	while (1) {
1053 		state = rb_entry(node, struct extent_state, rb_node);
1054 		if (state->end >= start && (state->state & bits))
1055 			return state;
1056 
1057 		node = rb_next(node);
1058 		if (!node)
1059 			break;
1060 	}
1061 out:
1062 	return NULL;
1063 }
1064 
1065 /*
1066  * find a contiguous range of bytes in the file marked as delalloc, not
1067  * more than 'max_bytes'.  start and end are used to return the range,
1068  *
1069  * 1 is returned if we find something, 0 if nothing was in the tree
1070  */
1071 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
1072 					u64 *start, u64 *end, u64 max_bytes)
1073 {
1074 	struct rb_node *node;
1075 	struct extent_state *state;
1076 	u64 cur_start = *start;
1077 	u64 found = 0;
1078 	u64 total_bytes = 0;
1079 
1080 	spin_lock(&tree->lock);
1081 
1082 	/*
1083 	 * this search will find all the extents that end after
1084 	 * our range starts.
1085 	 */
1086 	node = tree_search(tree, cur_start);
1087 	if (!node) {
1088 		if (!found)
1089 			*end = (u64)-1;
1090 		goto out;
1091 	}
1092 
1093 	while (1) {
1094 		state = rb_entry(node, struct extent_state, rb_node);
1095 		if (found && (state->start != cur_start ||
1096 			      (state->state & EXTENT_BOUNDARY))) {
1097 			goto out;
1098 		}
1099 		if (!(state->state & EXTENT_DELALLOC)) {
1100 			if (!found)
1101 				*end = state->end;
1102 			goto out;
1103 		}
1104 		if (!found)
1105 			*start = state->start;
1106 		found++;
1107 		*end = state->end;
1108 		cur_start = state->end + 1;
1109 		node = rb_next(node);
1110 		if (!node)
1111 			break;
1112 		total_bytes += state->end - state->start + 1;
1113 		if (total_bytes >= max_bytes)
1114 			break;
1115 	}
1116 out:
1117 	spin_unlock(&tree->lock);
1118 	return found;
1119 }
1120 
1121 static noinline int __unlock_for_delalloc(struct inode *inode,
1122 					  struct page *locked_page,
1123 					  u64 start, u64 end)
1124 {
1125 	int ret;
1126 	struct page *pages[16];
1127 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1128 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1129 	unsigned long nr_pages = end_index - index + 1;
1130 	int i;
1131 
1132 	if (index == locked_page->index && end_index == index)
1133 		return 0;
1134 
1135 	while (nr_pages > 0) {
1136 		ret = find_get_pages_contig(inode->i_mapping, index,
1137 				     min_t(unsigned long, nr_pages,
1138 				     ARRAY_SIZE(pages)), pages);
1139 		for (i = 0; i < ret; i++) {
1140 			if (pages[i] != locked_page)
1141 				unlock_page(pages[i]);
1142 			page_cache_release(pages[i]);
1143 		}
1144 		nr_pages -= ret;
1145 		index += ret;
1146 		cond_resched();
1147 	}
1148 	return 0;
1149 }
1150 
1151 static noinline int lock_delalloc_pages(struct inode *inode,
1152 					struct page *locked_page,
1153 					u64 delalloc_start,
1154 					u64 delalloc_end)
1155 {
1156 	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
1157 	unsigned long start_index = index;
1158 	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
1159 	unsigned long pages_locked = 0;
1160 	struct page *pages[16];
1161 	unsigned long nrpages;
1162 	int ret;
1163 	int i;
1164 
1165 	/* the caller is responsible for locking the start index */
1166 	if (index == locked_page->index && index == end_index)
1167 		return 0;
1168 
1169 	/* skip the page at the start index */
1170 	nrpages = end_index - index + 1;
1171 	while (nrpages > 0) {
1172 		ret = find_get_pages_contig(inode->i_mapping, index,
1173 				     min_t(unsigned long,
1174 				     nrpages, ARRAY_SIZE(pages)), pages);
1175 		if (ret == 0) {
1176 			ret = -EAGAIN;
1177 			goto done;
1178 		}
1179 		/* now we have an array of pages, lock them all */
1180 		for (i = 0; i < ret; i++) {
1181 			/*
1182 			 * the caller is taking responsibility for
1183 			 * locked_page
1184 			 */
1185 			if (pages[i] != locked_page) {
1186 				lock_page(pages[i]);
1187 				if (!PageDirty(pages[i]) ||
1188 				    pages[i]->mapping != inode->i_mapping) {
1189 					ret = -EAGAIN;
1190 					unlock_page(pages[i]);
1191 					page_cache_release(pages[i]);
1192 					goto done;
1193 				}
1194 			}
1195 			page_cache_release(pages[i]);
1196 			pages_locked++;
1197 		}
1198 		nrpages -= ret;
1199 		index += ret;
1200 		cond_resched();
1201 	}
1202 	ret = 0;
1203 done:
1204 	if (ret && pages_locked) {
1205 		__unlock_for_delalloc(inode, locked_page,
1206 			      delalloc_start,
1207 			      ((u64)(start_index + pages_locked - 1)) <<
1208 			      PAGE_CACHE_SHIFT);
1209 	}
1210 	return ret;
1211 }
1212 
1213 /*
1214  * find a contiguous range of bytes in the file marked as delalloc, not
1215  * more than 'max_bytes'.  start and end are used to return the range,
1216  *
1217  * 1 is returned if we find something, 0 if nothing was in the tree
1218  */
1219 static noinline u64 find_lock_delalloc_range(struct inode *inode,
1220 					     struct extent_io_tree *tree,
1221 					     struct page *locked_page,
1222 					     u64 *start, u64 *end,
1223 					     u64 max_bytes)
1224 {
1225 	u64 delalloc_start;
1226 	u64 delalloc_end;
1227 	u64 found;
1228 	int ret;
1229 	int loops = 0;
1230 
1231 again:
1232 	/* step one, find a bunch of delalloc bytes starting at start */
1233 	delalloc_start = *start;
1234 	delalloc_end = 0;
1235 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
1236 				    max_bytes);
1237 	if (!found || delalloc_end <= *start) {
1238 		*start = delalloc_start;
1239 		*end = delalloc_end;
1240 		return found;
1241 	}
1242 
1243 	/*
1244 	 * start comes from the offset of locked_page.  We have to lock
1245 	 * pages in order, so we can't process delalloc bytes before
1246 	 * locked_page
1247 	 */
1248 	if (delalloc_start < *start)
1249 		delalloc_start = *start;
1250 
1251 	/*
1252 	 * make sure to limit the number of pages we try to lock down
1253 	 * if we're looping.
1254 	 */
1255 	if (delalloc_end + 1 - delalloc_start > max_bytes && loops)
1256 		delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1;
1257 
1258 	/* step two, lock all the pages after the page that has start */
1259 	ret = lock_delalloc_pages(inode, locked_page,
1260 				  delalloc_start, delalloc_end);
1261 	if (ret == -EAGAIN) {
1262 		/* some of the pages are gone, lets avoid looping by
1263 		 * shortening the size of the delalloc range we're searching
1264 		 */
1265 		if (!loops) {
1266 			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
1267 			max_bytes = PAGE_CACHE_SIZE - offset;
1268 			loops = 1;
1269 			goto again;
1270 		} else {
1271 			found = 0;
1272 			goto out_failed;
1273 		}
1274 	}
1275 	BUG_ON(ret);
1276 
1277 	/* step three, lock the state bits for the whole range */
1278 	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1279 
1280 	/* then test to make sure it is all still delalloc */
1281 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
1282 			     EXTENT_DELALLOC, 1);
1283 	if (!ret) {
1284 		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
1285 		__unlock_for_delalloc(inode, locked_page,
1286 			      delalloc_start, delalloc_end);
1287 		cond_resched();
1288 		goto again;
1289 	}
1290 	*start = delalloc_start;
1291 	*end = delalloc_end;
1292 out_failed:
1293 	return found;
1294 }
1295 
1296 int extent_clear_unlock_delalloc(struct inode *inode,
1297 				struct extent_io_tree *tree,
1298 				u64 start, u64 end, struct page *locked_page,
1299 				int unlock_pages,
1300 				int clear_unlock,
1301 				int clear_delalloc, int clear_dirty,
1302 				int set_writeback,
1303 				int end_writeback)
1304 {
1305 	int ret;
1306 	struct page *pages[16];
1307 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1308 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1309 	unsigned long nr_pages = end_index - index + 1;
1310 	int i;
1311 	int clear_bits = 0;
1312 
1313 	if (clear_unlock)
1314 		clear_bits |= EXTENT_LOCKED;
1315 	if (clear_dirty)
1316 		clear_bits |= EXTENT_DIRTY;
1317 
1318 	if (clear_delalloc)
1319 		clear_bits |= EXTENT_DELALLOC;
1320 
1321 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
1322 	if (!(unlock_pages || clear_dirty || set_writeback || end_writeback))
1323 		return 0;
1324 
1325 	while (nr_pages > 0) {
1326 		ret = find_get_pages_contig(inode->i_mapping, index,
1327 				     min_t(unsigned long,
1328 				     nr_pages, ARRAY_SIZE(pages)), pages);
1329 		for (i = 0; i < ret; i++) {
1330 			if (pages[i] == locked_page) {
1331 				page_cache_release(pages[i]);
1332 				continue;
1333 			}
1334 			if (clear_dirty)
1335 				clear_page_dirty_for_io(pages[i]);
1336 			if (set_writeback)
1337 				set_page_writeback(pages[i]);
1338 			if (end_writeback)
1339 				end_page_writeback(pages[i]);
1340 			if (unlock_pages)
1341 				unlock_page(pages[i]);
1342 			page_cache_release(pages[i]);
1343 		}
1344 		nr_pages -= ret;
1345 		index += ret;
1346 		cond_resched();
1347 	}
1348 	return 0;
1349 }
1350 
1351 /*
1352  * count the number of bytes in the tree that have a given bit(s)
1353  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
1354  * cached.  The total number found is returned.
1355  */
1356 u64 count_range_bits(struct extent_io_tree *tree,
1357 		     u64 *start, u64 search_end, u64 max_bytes,
1358 		     unsigned long bits)
1359 {
1360 	struct rb_node *node;
1361 	struct extent_state *state;
1362 	u64 cur_start = *start;
1363 	u64 total_bytes = 0;
1364 	int found = 0;
1365 
1366 	if (search_end <= cur_start) {
1367 		WARN_ON(1);
1368 		return 0;
1369 	}
1370 
1371 	spin_lock(&tree->lock);
1372 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1373 		total_bytes = tree->dirty_bytes;
1374 		goto out;
1375 	}
1376 	/*
1377 	 * this search will find all the extents that end after
1378 	 * our range starts.
1379 	 */
1380 	node = tree_search(tree, cur_start);
1381 	if (!node)
1382 		goto out;
1383 
1384 	while (1) {
1385 		state = rb_entry(node, struct extent_state, rb_node);
1386 		if (state->start > search_end)
1387 			break;
1388 		if (state->end >= cur_start && (state->state & bits)) {
1389 			total_bytes += min(search_end, state->end) + 1 -
1390 				       max(cur_start, state->start);
1391 			if (total_bytes >= max_bytes)
1392 				break;
1393 			if (!found) {
1394 				*start = state->start;
1395 				found = 1;
1396 			}
1397 		}
1398 		node = rb_next(node);
1399 		if (!node)
1400 			break;
1401 	}
1402 out:
1403 	spin_unlock(&tree->lock);
1404 	return total_bytes;
1405 }
1406 
1407 #if 0
1408 /*
1409  * helper function to lock both pages and extents in the tree.
1410  * pages must be locked first.
1411  */
1412 static int lock_range(struct extent_io_tree *tree, u64 start, u64 end)
1413 {
1414 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1415 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1416 	struct page *page;
1417 	int err;
1418 
1419 	while (index <= end_index) {
1420 		page = grab_cache_page(tree->mapping, index);
1421 		if (!page) {
1422 			err = -ENOMEM;
1423 			goto failed;
1424 		}
1425 		if (IS_ERR(page)) {
1426 			err = PTR_ERR(page);
1427 			goto failed;
1428 		}
1429 		index++;
1430 	}
1431 	lock_extent(tree, start, end, GFP_NOFS);
1432 	return 0;
1433 
1434 failed:
1435 	/*
1436 	 * we failed above in getting the page at 'index', so we undo here
1437 	 * up to but not including the page at 'index'
1438 	 */
1439 	end_index = index;
1440 	index = start >> PAGE_CACHE_SHIFT;
1441 	while (index < end_index) {
1442 		page = find_get_page(tree->mapping, index);
1443 		unlock_page(page);
1444 		page_cache_release(page);
1445 		index++;
1446 	}
1447 	return err;
1448 }
1449 
1450 /*
1451  * helper function to unlock both pages and extents in the tree.
1452  */
1453 static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end)
1454 {
1455 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1456 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1457 	struct page *page;
1458 
1459 	while (index <= end_index) {
1460 		page = find_get_page(tree->mapping, index);
1461 		unlock_page(page);
1462 		page_cache_release(page);
1463 		index++;
1464 	}
1465 	unlock_extent(tree, start, end, GFP_NOFS);
1466 	return 0;
1467 }
1468 #endif
1469 
1470 /*
1471  * set the private field for a given byte offset in the tree.  If there isn't
1472  * an extent_state there already, this does nothing.
1473  */
1474 int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
1475 {
1476 	struct rb_node *node;
1477 	struct extent_state *state;
1478 	int ret = 0;
1479 
1480 	spin_lock(&tree->lock);
1481 	/*
1482 	 * this search will find all the extents that end after
1483 	 * our range starts.
1484 	 */
1485 	node = tree_search(tree, start);
1486 	if (!node) {
1487 		ret = -ENOENT;
1488 		goto out;
1489 	}
1490 	state = rb_entry(node, struct extent_state, rb_node);
1491 	if (state->start != start) {
1492 		ret = -ENOENT;
1493 		goto out;
1494 	}
1495 	state->private = private;
1496 out:
1497 	spin_unlock(&tree->lock);
1498 	return ret;
1499 }
1500 
1501 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
1502 {
1503 	struct rb_node *node;
1504 	struct extent_state *state;
1505 	int ret = 0;
1506 
1507 	spin_lock(&tree->lock);
1508 	/*
1509 	 * this search will find all the extents that end after
1510 	 * our range starts.
1511 	 */
1512 	node = tree_search(tree, start);
1513 	if (!node) {
1514 		ret = -ENOENT;
1515 		goto out;
1516 	}
1517 	state = rb_entry(node, struct extent_state, rb_node);
1518 	if (state->start != start) {
1519 		ret = -ENOENT;
1520 		goto out;
1521 	}
1522 	*private = state->private;
1523 out:
1524 	spin_unlock(&tree->lock);
1525 	return ret;
1526 }
1527 
1528 /*
1529  * searches a range in the state tree for a given mask.
1530  * If 'filled' == 1, this returns 1 only if every extent in the tree
1531  * has the bits set.  Otherwise, 1 is returned if any bit in the
1532  * range is found set.
1533  */
1534 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1535 		   int bits, int filled)
1536 {
1537 	struct extent_state *state = NULL;
1538 	struct rb_node *node;
1539 	int bitset = 0;
1540 
1541 	spin_lock(&tree->lock);
1542 	node = tree_search(tree, start);
1543 	while (node && start <= end) {
1544 		state = rb_entry(node, struct extent_state, rb_node);
1545 
1546 		if (filled && state->start > start) {
1547 			bitset = 0;
1548 			break;
1549 		}
1550 
1551 		if (state->start > end)
1552 			break;
1553 
1554 		if (state->state & bits) {
1555 			bitset = 1;
1556 			if (!filled)
1557 				break;
1558 		} else if (filled) {
1559 			bitset = 0;
1560 			break;
1561 		}
1562 		start = state->end + 1;
1563 		if (start > end)
1564 			break;
1565 		node = rb_next(node);
1566 		if (!node) {
1567 			if (filled)
1568 				bitset = 0;
1569 			break;
1570 		}
1571 	}
1572 	spin_unlock(&tree->lock);
1573 	return bitset;
1574 }
1575 
1576 /*
1577  * helper function to set a given page up to date if all the
1578  * extents in the tree for that page are up to date
1579  */
1580 static int check_page_uptodate(struct extent_io_tree *tree,
1581 			       struct page *page)
1582 {
1583 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1584 	u64 end = start + PAGE_CACHE_SIZE - 1;
1585 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1586 		SetPageUptodate(page);
1587 	return 0;
1588 }
1589 
1590 /*
1591  * helper function to unlock a page if all the extents in the tree
1592  * for that page are unlocked
1593  */
1594 static int check_page_locked(struct extent_io_tree *tree,
1595 			     struct page *page)
1596 {
1597 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1598 	u64 end = start + PAGE_CACHE_SIZE - 1;
1599 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1600 		unlock_page(page);
1601 	return 0;
1602 }
1603 
1604 /*
1605  * helper function to end page writeback if all the extents
1606  * in the tree for that page are done with writeback
1607  */
1608 static int check_page_writeback(struct extent_io_tree *tree,
1609 			     struct page *page)
1610 {
1611 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1612 	u64 end = start + PAGE_CACHE_SIZE - 1;
1613 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1614 		end_page_writeback(page);
1615 	return 0;
1616 }
1617 
1618 /* lots and lots of room for performance fixes in the end_bio funcs */
1619 
1620 /*
1621  * after a writepage IO is done, we need to:
1622  * clear the uptodate bits on error
1623  * clear the writeback bits in the extent tree for this IO
1624  * end_page_writeback if the page has no more pending IO
1625  *
1626  * Scheduling is not allowed, so the extent state tree is expected
1627  * to have one and only one object corresponding to this IO.
1628  */
1629 static void end_bio_extent_writepage(struct bio *bio, int err)
1630 {
1631 	int uptodate = err == 0;
1632 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1633 	struct extent_io_tree *tree;
1634 	u64 start;
1635 	u64 end;
1636 	int whole_page;
1637 	int ret;
1638 
1639 	do {
1640 		struct page *page = bvec->bv_page;
1641 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1642 
1643 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1644 			 bvec->bv_offset;
1645 		end = start + bvec->bv_len - 1;
1646 
1647 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1648 			whole_page = 1;
1649 		else
1650 			whole_page = 0;
1651 
1652 		if (--bvec >= bio->bi_io_vec)
1653 			prefetchw(&bvec->bv_page->flags);
1654 		if (tree->ops && tree->ops->writepage_end_io_hook) {
1655 			ret = tree->ops->writepage_end_io_hook(page, start,
1656 						       end, NULL, uptodate);
1657 			if (ret)
1658 				uptodate = 0;
1659 		}
1660 
1661 		if (!uptodate && tree->ops &&
1662 		    tree->ops->writepage_io_failed_hook) {
1663 			ret = tree->ops->writepage_io_failed_hook(bio, page,
1664 							 start, end, NULL);
1665 			if (ret == 0) {
1666 				uptodate = (err == 0);
1667 				continue;
1668 			}
1669 		}
1670 
1671 		if (!uptodate) {
1672 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1673 			ClearPageUptodate(page);
1674 			SetPageError(page);
1675 		}
1676 
1677 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1678 
1679 		if (whole_page)
1680 			end_page_writeback(page);
1681 		else
1682 			check_page_writeback(tree, page);
1683 	} while (bvec >= bio->bi_io_vec);
1684 
1685 	bio_put(bio);
1686 }
1687 
1688 /*
1689  * after a readpage IO is done, we need to:
1690  * clear the uptodate bits on error
1691  * set the uptodate bits if things worked
1692  * set the page up to date if all extents in the tree are uptodate
1693  * clear the lock bit in the extent tree
1694  * unlock the page if there are no other extents locked for it
1695  *
1696  * Scheduling is not allowed, so the extent state tree is expected
1697  * to have one and only one object corresponding to this IO.
1698  */
1699 static void end_bio_extent_readpage(struct bio *bio, int err)
1700 {
1701 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1702 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1703 	struct extent_io_tree *tree;
1704 	u64 start;
1705 	u64 end;
1706 	int whole_page;
1707 	int ret;
1708 
1709 	if (err)
1710 		uptodate = 0;
1711 
1712 	do {
1713 		struct page *page = bvec->bv_page;
1714 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1715 
1716 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1717 			bvec->bv_offset;
1718 		end = start + bvec->bv_len - 1;
1719 
1720 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1721 			whole_page = 1;
1722 		else
1723 			whole_page = 0;
1724 
1725 		if (--bvec >= bio->bi_io_vec)
1726 			prefetchw(&bvec->bv_page->flags);
1727 
1728 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1729 			ret = tree->ops->readpage_end_io_hook(page, start, end,
1730 							      NULL);
1731 			if (ret)
1732 				uptodate = 0;
1733 		}
1734 		if (!uptodate && tree->ops &&
1735 		    tree->ops->readpage_io_failed_hook) {
1736 			ret = tree->ops->readpage_io_failed_hook(bio, page,
1737 							 start, end, NULL);
1738 			if (ret == 0) {
1739 				uptodate =
1740 					test_bit(BIO_UPTODATE, &bio->bi_flags);
1741 				if (err)
1742 					uptodate = 0;
1743 				continue;
1744 			}
1745 		}
1746 
1747 		if (uptodate) {
1748 			set_extent_uptodate(tree, start, end,
1749 					    GFP_ATOMIC);
1750 		}
1751 		unlock_extent(tree, start, end, GFP_ATOMIC);
1752 
1753 		if (whole_page) {
1754 			if (uptodate) {
1755 				SetPageUptodate(page);
1756 			} else {
1757 				ClearPageUptodate(page);
1758 				SetPageError(page);
1759 			}
1760 			unlock_page(page);
1761 		} else {
1762 			if (uptodate) {
1763 				check_page_uptodate(tree, page);
1764 			} else {
1765 				ClearPageUptodate(page);
1766 				SetPageError(page);
1767 			}
1768 			check_page_locked(tree, page);
1769 		}
1770 	} while (bvec >= bio->bi_io_vec);
1771 
1772 	bio_put(bio);
1773 }
1774 
1775 /*
1776  * IO done from prepare_write is pretty simple, we just unlock
1777  * the structs in the extent tree when done, and set the uptodate bits
1778  * as appropriate.
1779  */
1780 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1781 {
1782 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1783 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1784 	struct extent_io_tree *tree;
1785 	u64 start;
1786 	u64 end;
1787 
1788 	do {
1789 		struct page *page = bvec->bv_page;
1790 		tree = &BTRFS_I(page->mapping->host)->io_tree;
1791 
1792 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1793 			bvec->bv_offset;
1794 		end = start + bvec->bv_len - 1;
1795 
1796 		if (--bvec >= bio->bi_io_vec)
1797 			prefetchw(&bvec->bv_page->flags);
1798 
1799 		if (uptodate) {
1800 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1801 		} else {
1802 			ClearPageUptodate(page);
1803 			SetPageError(page);
1804 		}
1805 
1806 		unlock_extent(tree, start, end, GFP_ATOMIC);
1807 
1808 	} while (bvec >= bio->bi_io_vec);
1809 
1810 	bio_put(bio);
1811 }
1812 
1813 static struct bio *
1814 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1815 		 gfp_t gfp_flags)
1816 {
1817 	struct bio *bio;
1818 
1819 	bio = bio_alloc(gfp_flags, nr_vecs);
1820 
1821 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1822 		while (!bio && (nr_vecs /= 2))
1823 			bio = bio_alloc(gfp_flags, nr_vecs);
1824 	}
1825 
1826 	if (bio) {
1827 		bio->bi_size = 0;
1828 		bio->bi_bdev = bdev;
1829 		bio->bi_sector = first_sector;
1830 	}
1831 	return bio;
1832 }
1833 
1834 static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1835 			  unsigned long bio_flags)
1836 {
1837 	int ret = 0;
1838 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1839 	struct page *page = bvec->bv_page;
1840 	struct extent_io_tree *tree = bio->bi_private;
1841 	u64 start;
1842 	u64 end;
1843 
1844 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1845 	end = start + bvec->bv_len - 1;
1846 
1847 	bio->bi_private = NULL;
1848 
1849 	bio_get(bio);
1850 
1851 	if (tree->ops && tree->ops->submit_bio_hook)
1852 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1853 					   mirror_num, bio_flags);
1854 	else
1855 		submit_bio(rw, bio);
1856 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1857 		ret = -EOPNOTSUPP;
1858 	bio_put(bio);
1859 	return ret;
1860 }
1861 
1862 static int submit_extent_page(int rw, struct extent_io_tree *tree,
1863 			      struct page *page, sector_t sector,
1864 			      size_t size, unsigned long offset,
1865 			      struct block_device *bdev,
1866 			      struct bio **bio_ret,
1867 			      unsigned long max_pages,
1868 			      bio_end_io_t end_io_func,
1869 			      int mirror_num,
1870 			      unsigned long prev_bio_flags,
1871 			      unsigned long bio_flags)
1872 {
1873 	int ret = 0;
1874 	struct bio *bio;
1875 	int nr;
1876 	int contig = 0;
1877 	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
1878 	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
1879 	size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE);
1880 
1881 	if (bio_ret && *bio_ret) {
1882 		bio = *bio_ret;
1883 		if (old_compressed)
1884 			contig = bio->bi_sector == sector;
1885 		else
1886 			contig = bio->bi_sector + (bio->bi_size >> 9) ==
1887 				sector;
1888 
1889 		if (prev_bio_flags != bio_flags || !contig ||
1890 		    (tree->ops && tree->ops->merge_bio_hook &&
1891 		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
1892 					       bio_flags)) ||
1893 		    bio_add_page(bio, page, page_size, offset) < page_size) {
1894 			ret = submit_one_bio(rw, bio, mirror_num,
1895 					     prev_bio_flags);
1896 			bio = NULL;
1897 		} else {
1898 			return 0;
1899 		}
1900 	}
1901 	if (this_compressed)
1902 		nr = BIO_MAX_PAGES;
1903 	else
1904 		nr = bio_get_nr_vecs(bdev);
1905 
1906 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1907 
1908 	bio_add_page(bio, page, page_size, offset);
1909 	bio->bi_end_io = end_io_func;
1910 	bio->bi_private = tree;
1911 
1912 	if (bio_ret)
1913 		*bio_ret = bio;
1914 	else
1915 		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
1916 
1917 	return ret;
1918 }
1919 
1920 void set_page_extent_mapped(struct page *page)
1921 {
1922 	if (!PagePrivate(page)) {
1923 		SetPagePrivate(page);
1924 		page_cache_get(page);
1925 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1926 	}
1927 }
1928 
1929 static void set_page_extent_head(struct page *page, unsigned long len)
1930 {
1931 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1932 }
1933 
1934 /*
1935  * basic readpage implementation.  Locked extent state structs are inserted
1936  * into the tree that are removed when the IO is done (by the end_io
1937  * handlers)
1938  */
1939 static int __extent_read_full_page(struct extent_io_tree *tree,
1940 				   struct page *page,
1941 				   get_extent_t *get_extent,
1942 				   struct bio **bio, int mirror_num,
1943 				   unsigned long *bio_flags)
1944 {
1945 	struct inode *inode = page->mapping->host;
1946 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1947 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1948 	u64 end;
1949 	u64 cur = start;
1950 	u64 extent_offset;
1951 	u64 last_byte = i_size_read(inode);
1952 	u64 block_start;
1953 	u64 cur_end;
1954 	sector_t sector;
1955 	struct extent_map *em;
1956 	struct block_device *bdev;
1957 	int ret;
1958 	int nr = 0;
1959 	size_t page_offset = 0;
1960 	size_t iosize;
1961 	size_t disk_io_size;
1962 	size_t blocksize = inode->i_sb->s_blocksize;
1963 	unsigned long this_bio_flag = 0;
1964 
1965 	set_page_extent_mapped(page);
1966 
1967 	end = page_end;
1968 	lock_extent(tree, start, end, GFP_NOFS);
1969 
1970 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
1971 		char *userpage;
1972 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
1973 
1974 		if (zero_offset) {
1975 			iosize = PAGE_CACHE_SIZE - zero_offset;
1976 			userpage = kmap_atomic(page, KM_USER0);
1977 			memset(userpage + zero_offset, 0, iosize);
1978 			flush_dcache_page(page);
1979 			kunmap_atomic(userpage, KM_USER0);
1980 		}
1981 	}
1982 	while (cur <= end) {
1983 		if (cur >= last_byte) {
1984 			char *userpage;
1985 			iosize = PAGE_CACHE_SIZE - page_offset;
1986 			userpage = kmap_atomic(page, KM_USER0);
1987 			memset(userpage + page_offset, 0, iosize);
1988 			flush_dcache_page(page);
1989 			kunmap_atomic(userpage, KM_USER0);
1990 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1991 					    GFP_NOFS);
1992 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1993 			break;
1994 		}
1995 		em = get_extent(inode, page, page_offset, cur,
1996 				end - cur + 1, 0);
1997 		if (IS_ERR(em) || !em) {
1998 			SetPageError(page);
1999 			unlock_extent(tree, cur, end, GFP_NOFS);
2000 			break;
2001 		}
2002 		extent_offset = cur - em->start;
2003 		BUG_ON(extent_map_end(em) <= cur);
2004 		BUG_ON(end < cur);
2005 
2006 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2007 			this_bio_flag = EXTENT_BIO_COMPRESSED;
2008 
2009 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2010 		cur_end = min(extent_map_end(em) - 1, end);
2011 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2012 		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
2013 			disk_io_size = em->block_len;
2014 			sector = em->block_start >> 9;
2015 		} else {
2016 			sector = (em->block_start + extent_offset) >> 9;
2017 			disk_io_size = iosize;
2018 		}
2019 		bdev = em->bdev;
2020 		block_start = em->block_start;
2021 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2022 			block_start = EXTENT_MAP_HOLE;
2023 		free_extent_map(em);
2024 		em = NULL;
2025 
2026 		/* we've found a hole, just zero and go on */
2027 		if (block_start == EXTENT_MAP_HOLE) {
2028 			char *userpage;
2029 			userpage = kmap_atomic(page, KM_USER0);
2030 			memset(userpage + page_offset, 0, iosize);
2031 			flush_dcache_page(page);
2032 			kunmap_atomic(userpage, KM_USER0);
2033 
2034 			set_extent_uptodate(tree, cur, cur + iosize - 1,
2035 					    GFP_NOFS);
2036 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2037 			cur = cur + iosize;
2038 			page_offset += iosize;
2039 			continue;
2040 		}
2041 		/* the get_extent function already copied into the page */
2042 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
2043 			check_page_uptodate(tree, page);
2044 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2045 			cur = cur + iosize;
2046 			page_offset += iosize;
2047 			continue;
2048 		}
2049 		/* we have an inline extent but it didn't get marked up
2050 		 * to date.  Error out
2051 		 */
2052 		if (block_start == EXTENT_MAP_INLINE) {
2053 			SetPageError(page);
2054 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2055 			cur = cur + iosize;
2056 			page_offset += iosize;
2057 			continue;
2058 		}
2059 
2060 		ret = 0;
2061 		if (tree->ops && tree->ops->readpage_io_hook) {
2062 			ret = tree->ops->readpage_io_hook(page, cur,
2063 							  cur + iosize - 1);
2064 		}
2065 		if (!ret) {
2066 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2067 			pnr -= page->index;
2068 			ret = submit_extent_page(READ, tree, page,
2069 					 sector, disk_io_size, page_offset,
2070 					 bdev, bio, pnr,
2071 					 end_bio_extent_readpage, mirror_num,
2072 					 *bio_flags,
2073 					 this_bio_flag);
2074 			nr++;
2075 			*bio_flags = this_bio_flag;
2076 		}
2077 		if (ret)
2078 			SetPageError(page);
2079 		cur = cur + iosize;
2080 		page_offset += iosize;
2081 	}
2082 	if (!nr) {
2083 		if (!PageError(page))
2084 			SetPageUptodate(page);
2085 		unlock_page(page);
2086 	}
2087 	return 0;
2088 }
2089 
2090 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2091 			    get_extent_t *get_extent)
2092 {
2093 	struct bio *bio = NULL;
2094 	unsigned long bio_flags = 0;
2095 	int ret;
2096 
2097 	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2098 				      &bio_flags);
2099 	if (bio)
2100 		submit_one_bio(READ, bio, 0, bio_flags);
2101 	return ret;
2102 }
2103 
2104 /*
2105  * the writepage semantics are similar to regular writepage.  extent
2106  * records are inserted to lock ranges in the tree, and as dirty areas
2107  * are found, they are marked writeback.  Then the lock bits are removed
2108  * and the end_io handler clears the writeback ranges
2109  */
2110 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2111 			      void *data)
2112 {
2113 	struct inode *inode = page->mapping->host;
2114 	struct extent_page_data *epd = data;
2115 	struct extent_io_tree *tree = epd->tree;
2116 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2117 	u64 delalloc_start;
2118 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
2119 	u64 end;
2120 	u64 cur = start;
2121 	u64 extent_offset;
2122 	u64 last_byte = i_size_read(inode);
2123 	u64 block_start;
2124 	u64 iosize;
2125 	u64 unlock_start;
2126 	sector_t sector;
2127 	struct extent_map *em;
2128 	struct block_device *bdev;
2129 	int ret;
2130 	int nr = 0;
2131 	size_t pg_offset = 0;
2132 	size_t blocksize;
2133 	loff_t i_size = i_size_read(inode);
2134 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
2135 	u64 nr_delalloc;
2136 	u64 delalloc_end;
2137 	int page_started;
2138 	int compressed;
2139 	unsigned long nr_written = 0;
2140 
2141 	WARN_ON(!PageLocked(page));
2142 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2143 	if (page->index > end_index ||
2144 	   (page->index == end_index && !pg_offset)) {
2145 		page->mapping->a_ops->invalidatepage(page, 0);
2146 		unlock_page(page);
2147 		return 0;
2148 	}
2149 
2150 	if (page->index == end_index) {
2151 		char *userpage;
2152 
2153 		userpage = kmap_atomic(page, KM_USER0);
2154 		memset(userpage + pg_offset, 0,
2155 		       PAGE_CACHE_SIZE - pg_offset);
2156 		kunmap_atomic(userpage, KM_USER0);
2157 		flush_dcache_page(page);
2158 	}
2159 	pg_offset = 0;
2160 
2161 	set_page_extent_mapped(page);
2162 
2163 	delalloc_start = start;
2164 	delalloc_end = 0;
2165 	page_started = 0;
2166 	if (!epd->extent_locked) {
2167 		while (delalloc_end < page_end) {
2168 			nr_delalloc = find_lock_delalloc_range(inode, tree,
2169 						       page,
2170 						       &delalloc_start,
2171 						       &delalloc_end,
2172 						       128 * 1024 * 1024);
2173 			if (nr_delalloc == 0) {
2174 				delalloc_start = delalloc_end + 1;
2175 				continue;
2176 			}
2177 			tree->ops->fill_delalloc(inode, page, delalloc_start,
2178 						 delalloc_end, &page_started,
2179 						 &nr_written);
2180 			delalloc_start = delalloc_end + 1;
2181 		}
2182 
2183 		/* did the fill delalloc function already unlock and start
2184 		 * the IO?
2185 		 */
2186 		if (page_started) {
2187 			ret = 0;
2188 			goto update_nr_written;
2189 		}
2190 	}
2191 	lock_extent(tree, start, page_end, GFP_NOFS);
2192 
2193 	unlock_start = start;
2194 
2195 	if (tree->ops && tree->ops->writepage_start_hook) {
2196 		ret = tree->ops->writepage_start_hook(page, start,
2197 						      page_end);
2198 		if (ret == -EAGAIN) {
2199 			unlock_extent(tree, start, page_end, GFP_NOFS);
2200 			redirty_page_for_writepage(wbc, page);
2201 			unlock_page(page);
2202 			ret = 0;
2203 			goto update_nr_written;
2204 		}
2205 	}
2206 
2207 	nr_written++;
2208 
2209 	end = page_end;
2210 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0))
2211 		printk(KERN_ERR "btrfs delalloc bits after lock_extent\n");
2212 
2213 	if (last_byte <= start) {
2214 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
2215 		unlock_extent(tree, start, page_end, GFP_NOFS);
2216 		if (tree->ops && tree->ops->writepage_end_io_hook)
2217 			tree->ops->writepage_end_io_hook(page, start,
2218 							 page_end, NULL, 1);
2219 		unlock_start = page_end + 1;
2220 		goto done;
2221 	}
2222 
2223 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
2224 	blocksize = inode->i_sb->s_blocksize;
2225 
2226 	while (cur <= end) {
2227 		if (cur >= last_byte) {
2228 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
2229 			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2230 			if (tree->ops && tree->ops->writepage_end_io_hook)
2231 				tree->ops->writepage_end_io_hook(page, cur,
2232 							 page_end, NULL, 1);
2233 			unlock_start = page_end + 1;
2234 			break;
2235 		}
2236 		em = epd->get_extent(inode, page, pg_offset, cur,
2237 				     end - cur + 1, 1);
2238 		if (IS_ERR(em) || !em) {
2239 			SetPageError(page);
2240 			break;
2241 		}
2242 
2243 		extent_offset = cur - em->start;
2244 		BUG_ON(extent_map_end(em) <= cur);
2245 		BUG_ON(end < cur);
2246 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
2247 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
2248 		sector = (em->block_start + extent_offset) >> 9;
2249 		bdev = em->bdev;
2250 		block_start = em->block_start;
2251 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
2252 		free_extent_map(em);
2253 		em = NULL;
2254 
2255 		/*
2256 		 * compressed and inline extents are written through other
2257 		 * paths in the FS
2258 		 */
2259 		if (compressed || block_start == EXTENT_MAP_HOLE ||
2260 		    block_start == EXTENT_MAP_INLINE) {
2261 			clear_extent_dirty(tree, cur,
2262 					   cur + iosize - 1, GFP_NOFS);
2263 
2264 			unlock_extent(tree, unlock_start, cur + iosize - 1,
2265 				      GFP_NOFS);
2266 
2267 			/*
2268 			 * end_io notification does not happen here for
2269 			 * compressed extents
2270 			 */
2271 			if (!compressed && tree->ops &&
2272 			    tree->ops->writepage_end_io_hook)
2273 				tree->ops->writepage_end_io_hook(page, cur,
2274 							 cur + iosize - 1,
2275 							 NULL, 1);
2276 			else if (compressed) {
2277 				/* we don't want to end_page_writeback on
2278 				 * a compressed extent.  this happens
2279 				 * elsewhere
2280 				 */
2281 				nr++;
2282 			}
2283 
2284 			cur += iosize;
2285 			pg_offset += iosize;
2286 			unlock_start = cur;
2287 			continue;
2288 		}
2289 		/* leave this out until we have a page_mkwrite call */
2290 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
2291 				   EXTENT_DIRTY, 0)) {
2292 			cur = cur + iosize;
2293 			pg_offset += iosize;
2294 			continue;
2295 		}
2296 
2297 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
2298 		if (tree->ops && tree->ops->writepage_io_hook) {
2299 			ret = tree->ops->writepage_io_hook(page, cur,
2300 						cur + iosize - 1);
2301 		} else {
2302 			ret = 0;
2303 		}
2304 		if (ret) {
2305 			SetPageError(page);
2306 		} else {
2307 			unsigned long max_nr = end_index + 1;
2308 
2309 			set_range_writeback(tree, cur, cur + iosize - 1);
2310 			if (!PageWriteback(page)) {
2311 				printk(KERN_ERR "btrfs warning page %lu not "
2312 				       "writeback, cur %llu end %llu\n",
2313 				       page->index, (unsigned long long)cur,
2314 				       (unsigned long long)end);
2315 			}
2316 
2317 			ret = submit_extent_page(WRITE, tree, page, sector,
2318 						 iosize, pg_offset, bdev,
2319 						 &epd->bio, max_nr,
2320 						 end_bio_extent_writepage,
2321 						 0, 0, 0);
2322 			if (ret)
2323 				SetPageError(page);
2324 		}
2325 		cur = cur + iosize;
2326 		pg_offset += iosize;
2327 		nr++;
2328 	}
2329 done:
2330 	if (nr == 0) {
2331 		/* make sure the mapping tag for page dirty gets cleared */
2332 		set_page_writeback(page);
2333 		end_page_writeback(page);
2334 	}
2335 	if (unlock_start <= page_end)
2336 		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
2337 	unlock_page(page);
2338 
2339 update_nr_written:
2340 	wbc->nr_to_write -= nr_written;
2341 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 &&
2342 	    wbc->range_start == 0 && wbc->range_end == LLONG_MAX))
2343 		page->mapping->writeback_index = page->index + nr_written;
2344 	return 0;
2345 }
2346 
2347 /**
2348  * write_cache_pages - walk the list of dirty pages of the given address space and write all of them.
2349  * @mapping: address space structure to write
2350  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2351  * @writepage: function called for each page
2352  * @data: data passed to writepage function
2353  *
2354  * If a page is already under I/O, write_cache_pages() skips it, even
2355  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2356  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2357  * and msync() need to guarantee that all the data which was dirty at the time
2358  * the call was made get new I/O started against them.  If wbc->sync_mode is
2359  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2360  * existing IO to complete.
2361  */
2362 static int extent_write_cache_pages(struct extent_io_tree *tree,
2363 			     struct address_space *mapping,
2364 			     struct writeback_control *wbc,
2365 			     writepage_t writepage, void *data,
2366 			     void (*flush_fn)(void *))
2367 {
2368 	struct backing_dev_info *bdi = mapping->backing_dev_info;
2369 	int ret = 0;
2370 	int done = 0;
2371 	struct pagevec pvec;
2372 	int nr_pages;
2373 	pgoff_t index;
2374 	pgoff_t end;		/* Inclusive */
2375 	int scanned = 0;
2376 	int range_whole = 0;
2377 
2378 	pagevec_init(&pvec, 0);
2379 	if (wbc->range_cyclic) {
2380 		index = mapping->writeback_index; /* Start from prev offset */
2381 		end = -1;
2382 	} else {
2383 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2384 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2385 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2386 			range_whole = 1;
2387 		scanned = 1;
2388 	}
2389 retry:
2390 	while (!done && (index <= end) &&
2391 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2392 			      PAGECACHE_TAG_DIRTY, min(end - index,
2393 				  (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2394 		unsigned i;
2395 
2396 		scanned = 1;
2397 		for (i = 0; i < nr_pages; i++) {
2398 			struct page *page = pvec.pages[i];
2399 
2400 			/*
2401 			 * At this point we hold neither mapping->tree_lock nor
2402 			 * lock on the page itself: the page may be truncated or
2403 			 * invalidated (changing page->mapping to NULL), or even
2404 			 * swizzled back from swapper_space to tmpfs file
2405 			 * mapping
2406 			 */
2407 			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
2408 				tree->ops->write_cache_pages_lock_hook(page);
2409 			else
2410 				lock_page(page);
2411 
2412 			if (unlikely(page->mapping != mapping)) {
2413 				unlock_page(page);
2414 				continue;
2415 			}
2416 
2417 			if (!wbc->range_cyclic && page->index > end) {
2418 				done = 1;
2419 				unlock_page(page);
2420 				continue;
2421 			}
2422 
2423 			if (wbc->sync_mode != WB_SYNC_NONE) {
2424 				if (PageWriteback(page))
2425 					flush_fn(data);
2426 				wait_on_page_writeback(page);
2427 			}
2428 
2429 			if (PageWriteback(page) ||
2430 			    !clear_page_dirty_for_io(page)) {
2431 				unlock_page(page);
2432 				continue;
2433 			}
2434 
2435 			ret = (*writepage)(page, wbc, data);
2436 
2437 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2438 				unlock_page(page);
2439 				ret = 0;
2440 			}
2441 			if (ret || wbc->nr_to_write <= 0)
2442 				done = 1;
2443 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
2444 				wbc->encountered_congestion = 1;
2445 				done = 1;
2446 			}
2447 		}
2448 		pagevec_release(&pvec);
2449 		cond_resched();
2450 	}
2451 	if (!scanned && !done) {
2452 		/*
2453 		 * We hit the last page and there is more work to be done: wrap
2454 		 * back to the start of the file
2455 		 */
2456 		scanned = 1;
2457 		index = 0;
2458 		goto retry;
2459 	}
2460 	return ret;
2461 }
2462 
2463 static noinline void flush_write_bio(void *data)
2464 {
2465 	struct extent_page_data *epd = data;
2466 	if (epd->bio) {
2467 		submit_one_bio(WRITE, epd->bio, 0, 0);
2468 		epd->bio = NULL;
2469 	}
2470 }
2471 
2472 int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
2473 			  get_extent_t *get_extent,
2474 			  struct writeback_control *wbc)
2475 {
2476 	int ret;
2477 	struct address_space *mapping = page->mapping;
2478 	struct extent_page_data epd = {
2479 		.bio = NULL,
2480 		.tree = tree,
2481 		.get_extent = get_extent,
2482 		.extent_locked = 0,
2483 	};
2484 	struct writeback_control wbc_writepages = {
2485 		.bdi		= wbc->bdi,
2486 		.sync_mode	= WB_SYNC_NONE,
2487 		.older_than_this = NULL,
2488 		.nr_to_write	= 64,
2489 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2490 		.range_end	= (loff_t)-1,
2491 	};
2492 
2493 
2494 	ret = __extent_writepage(page, wbc, &epd);
2495 
2496 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
2497 				 __extent_writepage, &epd, flush_write_bio);
2498 	if (epd.bio)
2499 		submit_one_bio(WRITE, epd.bio, 0, 0);
2500 	return ret;
2501 }
2502 
2503 int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
2504 			      u64 start, u64 end, get_extent_t *get_extent,
2505 			      int mode)
2506 {
2507 	int ret = 0;
2508 	struct address_space *mapping = inode->i_mapping;
2509 	struct page *page;
2510 	unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >>
2511 		PAGE_CACHE_SHIFT;
2512 
2513 	struct extent_page_data epd = {
2514 		.bio = NULL,
2515 		.tree = tree,
2516 		.get_extent = get_extent,
2517 		.extent_locked = 1,
2518 	};
2519 	struct writeback_control wbc_writepages = {
2520 		.bdi		= inode->i_mapping->backing_dev_info,
2521 		.sync_mode	= mode,
2522 		.older_than_this = NULL,
2523 		.nr_to_write	= nr_pages * 2,
2524 		.range_start	= start,
2525 		.range_end	= end + 1,
2526 	};
2527 
2528 	while (start <= end) {
2529 		page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
2530 		if (clear_page_dirty_for_io(page))
2531 			ret = __extent_writepage(page, &wbc_writepages, &epd);
2532 		else {
2533 			if (tree->ops && tree->ops->writepage_end_io_hook)
2534 				tree->ops->writepage_end_io_hook(page, start,
2535 						 start + PAGE_CACHE_SIZE - 1,
2536 						 NULL, 1);
2537 			unlock_page(page);
2538 		}
2539 		page_cache_release(page);
2540 		start += PAGE_CACHE_SIZE;
2541 	}
2542 
2543 	if (epd.bio)
2544 		submit_one_bio(WRITE, epd.bio, 0, 0);
2545 	return ret;
2546 }
2547 
2548 int extent_writepages(struct extent_io_tree *tree,
2549 		      struct address_space *mapping,
2550 		      get_extent_t *get_extent,
2551 		      struct writeback_control *wbc)
2552 {
2553 	int ret = 0;
2554 	struct extent_page_data epd = {
2555 		.bio = NULL,
2556 		.tree = tree,
2557 		.get_extent = get_extent,
2558 		.extent_locked = 0,
2559 	};
2560 
2561 	ret = extent_write_cache_pages(tree, mapping, wbc,
2562 				       __extent_writepage, &epd,
2563 				       flush_write_bio);
2564 	if (epd.bio)
2565 		submit_one_bio(WRITE, epd.bio, 0, 0);
2566 	return ret;
2567 }
2568 
2569 int extent_readpages(struct extent_io_tree *tree,
2570 		     struct address_space *mapping,
2571 		     struct list_head *pages, unsigned nr_pages,
2572 		     get_extent_t get_extent)
2573 {
2574 	struct bio *bio = NULL;
2575 	unsigned page_idx;
2576 	struct pagevec pvec;
2577 	unsigned long bio_flags = 0;
2578 
2579 	pagevec_init(&pvec, 0);
2580 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2581 		struct page *page = list_entry(pages->prev, struct page, lru);
2582 
2583 		prefetchw(&page->flags);
2584 		list_del(&page->lru);
2585 		/*
2586 		 * what we want to do here is call add_to_page_cache_lru,
2587 		 * but that isn't exported, so we reproduce it here
2588 		 */
2589 		if (!add_to_page_cache(page, mapping,
2590 					page->index, GFP_KERNEL)) {
2591 
2592 			/* open coding of lru_cache_add, also not exported */
2593 			page_cache_get(page);
2594 			if (!pagevec_add(&pvec, page))
2595 				__pagevec_lru_add_file(&pvec);
2596 			__extent_read_full_page(tree, page, get_extent,
2597 						&bio, 0, &bio_flags);
2598 		}
2599 		page_cache_release(page);
2600 	}
2601 	if (pagevec_count(&pvec))
2602 		__pagevec_lru_add_file(&pvec);
2603 	BUG_ON(!list_empty(pages));
2604 	if (bio)
2605 		submit_one_bio(READ, bio, 0, bio_flags);
2606 	return 0;
2607 }
2608 
2609 /*
2610  * basic invalidatepage code, this waits on any locked or writeback
2611  * ranges corresponding to the page, and then deletes any extent state
2612  * records from the tree
2613  */
2614 int extent_invalidatepage(struct extent_io_tree *tree,
2615 			  struct page *page, unsigned long offset)
2616 {
2617 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2618 	u64 end = start + PAGE_CACHE_SIZE - 1;
2619 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2620 
2621 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2622 	if (start > end)
2623 		return 0;
2624 
2625 	lock_extent(tree, start, end, GFP_NOFS);
2626 	wait_on_extent_writeback(tree, start, end);
2627 	clear_extent_bit(tree, start, end,
2628 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2629 			 1, 1, GFP_NOFS);
2630 	return 0;
2631 }
2632 
2633 /*
2634  * simple commit_write call, set_range_dirty is used to mark both
2635  * the pages and the extent records as dirty
2636  */
2637 int extent_commit_write(struct extent_io_tree *tree,
2638 			struct inode *inode, struct page *page,
2639 			unsigned from, unsigned to)
2640 {
2641 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2642 
2643 	set_page_extent_mapped(page);
2644 	set_page_dirty(page);
2645 
2646 	if (pos > inode->i_size) {
2647 		i_size_write(inode, pos);
2648 		mark_inode_dirty(inode);
2649 	}
2650 	return 0;
2651 }
2652 
2653 int extent_prepare_write(struct extent_io_tree *tree,
2654 			 struct inode *inode, struct page *page,
2655 			 unsigned from, unsigned to, get_extent_t *get_extent)
2656 {
2657 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2658 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2659 	u64 block_start;
2660 	u64 orig_block_start;
2661 	u64 block_end;
2662 	u64 cur_end;
2663 	struct extent_map *em;
2664 	unsigned blocksize = 1 << inode->i_blkbits;
2665 	size_t page_offset = 0;
2666 	size_t block_off_start;
2667 	size_t block_off_end;
2668 	int err = 0;
2669 	int iocount = 0;
2670 	int ret = 0;
2671 	int isnew;
2672 
2673 	set_page_extent_mapped(page);
2674 
2675 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2676 	block_end = (page_start + to - 1) | (blocksize - 1);
2677 	orig_block_start = block_start;
2678 
2679 	lock_extent(tree, page_start, page_end, GFP_NOFS);
2680 	while (block_start <= block_end) {
2681 		em = get_extent(inode, page, page_offset, block_start,
2682 				block_end - block_start + 1, 1);
2683 		if (IS_ERR(em) || !em)
2684 			goto err;
2685 
2686 		cur_end = min(block_end, extent_map_end(em) - 1);
2687 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2688 		block_off_end = block_off_start + blocksize;
2689 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2690 
2691 		if (!PageUptodate(page) && isnew &&
2692 		    (block_off_end > to || block_off_start < from)) {
2693 			void *kaddr;
2694 
2695 			kaddr = kmap_atomic(page, KM_USER0);
2696 			if (block_off_end > to)
2697 				memset(kaddr + to, 0, block_off_end - to);
2698 			if (block_off_start < from)
2699 				memset(kaddr + block_off_start, 0,
2700 				       from - block_off_start);
2701 			flush_dcache_page(page);
2702 			kunmap_atomic(kaddr, KM_USER0);
2703 		}
2704 		if ((em->block_start != EXTENT_MAP_HOLE &&
2705 		     em->block_start != EXTENT_MAP_INLINE) &&
2706 		    !isnew && !PageUptodate(page) &&
2707 		    (block_off_end > to || block_off_start < from) &&
2708 		    !test_range_bit(tree, block_start, cur_end,
2709 				    EXTENT_UPTODATE, 1)) {
2710 			u64 sector;
2711 			u64 extent_offset = block_start - em->start;
2712 			size_t iosize;
2713 			sector = (em->block_start + extent_offset) >> 9;
2714 			iosize = (cur_end - block_start + blocksize) &
2715 				~((u64)blocksize - 1);
2716 			/*
2717 			 * we've already got the extent locked, but we
2718 			 * need to split the state such that our end_bio
2719 			 * handler can clear the lock.
2720 			 */
2721 			set_extent_bit(tree, block_start,
2722 				       block_start + iosize - 1,
2723 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2724 			ret = submit_extent_page(READ, tree, page,
2725 					 sector, iosize, page_offset, em->bdev,
2726 					 NULL, 1,
2727 					 end_bio_extent_preparewrite, 0,
2728 					 0, 0);
2729 			iocount++;
2730 			block_start = block_start + iosize;
2731 		} else {
2732 			set_extent_uptodate(tree, block_start, cur_end,
2733 					    GFP_NOFS);
2734 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2735 			block_start = cur_end + 1;
2736 		}
2737 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2738 		free_extent_map(em);
2739 	}
2740 	if (iocount) {
2741 		wait_extent_bit(tree, orig_block_start,
2742 				block_end, EXTENT_LOCKED);
2743 	}
2744 	check_page_uptodate(tree, page);
2745 err:
2746 	/* FIXME, zero out newly allocated blocks on error */
2747 	return err;
2748 }
2749 
2750 /*
2751  * a helper for releasepage, this tests for areas of the page that
2752  * are locked or under IO and drops the related state bits if it is safe
2753  * to drop the page.
2754  */
2755 int try_release_extent_state(struct extent_map_tree *map,
2756 			     struct extent_io_tree *tree, struct page *page,
2757 			     gfp_t mask)
2758 {
2759 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2760 	u64 end = start + PAGE_CACHE_SIZE - 1;
2761 	int ret = 1;
2762 
2763 	if (test_range_bit(tree, start, end,
2764 			   EXTENT_IOBITS | EXTENT_ORDERED, 0))
2765 		ret = 0;
2766 	else {
2767 		if ((mask & GFP_NOFS) == GFP_NOFS)
2768 			mask = GFP_NOFS;
2769 		clear_extent_bit(tree, start, end, EXTENT_UPTODATE,
2770 				 1, 1, mask);
2771 	}
2772 	return ret;
2773 }
2774 
2775 /*
2776  * a helper for releasepage.  As long as there are no locked extents
2777  * in the range corresponding to the page, both state records and extent
2778  * map records are removed
2779  */
2780 int try_release_extent_mapping(struct extent_map_tree *map,
2781 			       struct extent_io_tree *tree, struct page *page,
2782 			       gfp_t mask)
2783 {
2784 	struct extent_map *em;
2785 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2786 	u64 end = start + PAGE_CACHE_SIZE - 1;
2787 
2788 	if ((mask & __GFP_WAIT) &&
2789 	    page->mapping->host->i_size > 16 * 1024 * 1024) {
2790 		u64 len;
2791 		while (start <= end) {
2792 			len = end - start + 1;
2793 			spin_lock(&map->lock);
2794 			em = lookup_extent_mapping(map, start, len);
2795 			if (!em || IS_ERR(em)) {
2796 				spin_unlock(&map->lock);
2797 				break;
2798 			}
2799 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
2800 			    em->start != start) {
2801 				spin_unlock(&map->lock);
2802 				free_extent_map(em);
2803 				break;
2804 			}
2805 			if (!test_range_bit(tree, em->start,
2806 					    extent_map_end(em) - 1,
2807 					    EXTENT_LOCKED | EXTENT_WRITEBACK |
2808 					    EXTENT_ORDERED,
2809 					    0)) {
2810 				remove_extent_mapping(map, em);
2811 				/* once for the rb tree */
2812 				free_extent_map(em);
2813 			}
2814 			start = extent_map_end(em);
2815 			spin_unlock(&map->lock);
2816 
2817 			/* once for us */
2818 			free_extent_map(em);
2819 		}
2820 	}
2821 	return try_release_extent_state(map, tree, page, mask);
2822 }
2823 
2824 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2825 		get_extent_t *get_extent)
2826 {
2827 	struct inode *inode = mapping->host;
2828 	u64 start = iblock << inode->i_blkbits;
2829 	sector_t sector = 0;
2830 	size_t blksize = (1 << inode->i_blkbits);
2831 	struct extent_map *em;
2832 
2833 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2834 		    GFP_NOFS);
2835 	em = get_extent(inode, NULL, 0, start, blksize, 0);
2836 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
2837 		      GFP_NOFS);
2838 	if (!em || IS_ERR(em))
2839 		return 0;
2840 
2841 	if (em->block_start > EXTENT_MAP_LAST_BYTE)
2842 		goto out;
2843 
2844 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2845 out:
2846 	free_extent_map(em);
2847 	return sector;
2848 }
2849 
2850 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2851 		__u64 start, __u64 len, get_extent_t *get_extent)
2852 {
2853 	int ret;
2854 	u64 off = start;
2855 	u64 max = start + len;
2856 	u32 flags = 0;
2857 	u64 disko = 0;
2858 	struct extent_map *em = NULL;
2859 	int end = 0;
2860 	u64 em_start = 0, em_len = 0;
2861 	unsigned long emflags;
2862 	ret = 0;
2863 
2864 	if (len == 0)
2865 		return -EINVAL;
2866 
2867 	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2868 		GFP_NOFS);
2869 	em = get_extent(inode, NULL, 0, off, max - off, 0);
2870 	if (!em)
2871 		goto out;
2872 	if (IS_ERR(em)) {
2873 		ret = PTR_ERR(em);
2874 		goto out;
2875 	}
2876 	while (!end) {
2877 		off = em->start + em->len;
2878 		if (off >= max)
2879 			end = 1;
2880 
2881 		em_start = em->start;
2882 		em_len = em->len;
2883 
2884 		disko = 0;
2885 		flags = 0;
2886 
2887 		switch (em->block_start) {
2888 		case EXTENT_MAP_LAST_BYTE:
2889 			end = 1;
2890 			flags |= FIEMAP_EXTENT_LAST;
2891 			break;
2892 		case EXTENT_MAP_HOLE:
2893 			flags |= FIEMAP_EXTENT_UNWRITTEN;
2894 			break;
2895 		case EXTENT_MAP_INLINE:
2896 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
2897 				  FIEMAP_EXTENT_NOT_ALIGNED);
2898 			break;
2899 		case EXTENT_MAP_DELALLOC:
2900 			flags |= (FIEMAP_EXTENT_DELALLOC |
2901 				  FIEMAP_EXTENT_UNKNOWN);
2902 			break;
2903 		default:
2904 			disko = em->block_start;
2905 			break;
2906 		}
2907 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
2908 			flags |= FIEMAP_EXTENT_ENCODED;
2909 
2910 		emflags = em->flags;
2911 		free_extent_map(em);
2912 		em = NULL;
2913 
2914 		if (!end) {
2915 			em = get_extent(inode, NULL, 0, off, max - off, 0);
2916 			if (!em)
2917 				goto out;
2918 			if (IS_ERR(em)) {
2919 				ret = PTR_ERR(em);
2920 				goto out;
2921 			}
2922 			emflags = em->flags;
2923 		}
2924 		if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) {
2925 			flags |= FIEMAP_EXTENT_LAST;
2926 			end = 1;
2927 		}
2928 
2929 		ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
2930 					em_len, flags);
2931 		if (ret)
2932 			goto out_free;
2933 	}
2934 out_free:
2935 	free_extent_map(em);
2936 out:
2937 	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
2938 			GFP_NOFS);
2939 	return ret;
2940 }
2941 
2942 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2943 					      unsigned long i)
2944 {
2945 	struct page *p;
2946 	struct address_space *mapping;
2947 
2948 	if (i == 0)
2949 		return eb->first_page;
2950 	i += eb->start >> PAGE_CACHE_SHIFT;
2951 	mapping = eb->first_page->mapping;
2952 	if (!mapping)
2953 		return NULL;
2954 
2955 	/*
2956 	 * extent_buffer_page is only called after pinning the page
2957 	 * by increasing the reference count.  So we know the page must
2958 	 * be in the radix tree.
2959 	 */
2960 	rcu_read_lock();
2961 	p = radix_tree_lookup(&mapping->page_tree, i);
2962 	rcu_read_unlock();
2963 
2964 	return p;
2965 }
2966 
2967 static inline unsigned long num_extent_pages(u64 start, u64 len)
2968 {
2969 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2970 		(start >> PAGE_CACHE_SHIFT);
2971 }
2972 
2973 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
2974 						   u64 start,
2975 						   unsigned long len,
2976 						   gfp_t mask)
2977 {
2978 	struct extent_buffer *eb = NULL;
2979 #if LEAK_DEBUG
2980 	unsigned long flags;
2981 #endif
2982 
2983 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
2984 	eb->start = start;
2985 	eb->len = len;
2986 	spin_lock_init(&eb->lock);
2987 	init_waitqueue_head(&eb->lock_wq);
2988 
2989 #if LEAK_DEBUG
2990 	spin_lock_irqsave(&leak_lock, flags);
2991 	list_add(&eb->leak_list, &buffers);
2992 	spin_unlock_irqrestore(&leak_lock, flags);
2993 #endif
2994 	atomic_set(&eb->refs, 1);
2995 
2996 	return eb;
2997 }
2998 
2999 static void __free_extent_buffer(struct extent_buffer *eb)
3000 {
3001 #if LEAK_DEBUG
3002 	unsigned long flags;
3003 	spin_lock_irqsave(&leak_lock, flags);
3004 	list_del(&eb->leak_list);
3005 	spin_unlock_irqrestore(&leak_lock, flags);
3006 #endif
3007 	kmem_cache_free(extent_buffer_cache, eb);
3008 }
3009 
3010 struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3011 					  u64 start, unsigned long len,
3012 					  struct page *page0,
3013 					  gfp_t mask)
3014 {
3015 	unsigned long num_pages = num_extent_pages(start, len);
3016 	unsigned long i;
3017 	unsigned long index = start >> PAGE_CACHE_SHIFT;
3018 	struct extent_buffer *eb;
3019 	struct extent_buffer *exists = NULL;
3020 	struct page *p;
3021 	struct address_space *mapping = tree->mapping;
3022 	int uptodate = 1;
3023 
3024 	spin_lock(&tree->buffer_lock);
3025 	eb = buffer_search(tree, start);
3026 	if (eb) {
3027 		atomic_inc(&eb->refs);
3028 		spin_unlock(&tree->buffer_lock);
3029 		mark_page_accessed(eb->first_page);
3030 		return eb;
3031 	}
3032 	spin_unlock(&tree->buffer_lock);
3033 
3034 	eb = __alloc_extent_buffer(tree, start, len, mask);
3035 	if (!eb)
3036 		return NULL;
3037 
3038 	if (page0) {
3039 		eb->first_page = page0;
3040 		i = 1;
3041 		index++;
3042 		page_cache_get(page0);
3043 		mark_page_accessed(page0);
3044 		set_page_extent_mapped(page0);
3045 		set_page_extent_head(page0, len);
3046 		uptodate = PageUptodate(page0);
3047 	} else {
3048 		i = 0;
3049 	}
3050 	for (; i < num_pages; i++, index++) {
3051 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
3052 		if (!p) {
3053 			WARN_ON(1);
3054 			goto free_eb;
3055 		}
3056 		set_page_extent_mapped(p);
3057 		mark_page_accessed(p);
3058 		if (i == 0) {
3059 			eb->first_page = p;
3060 			set_page_extent_head(p, len);
3061 		} else {
3062 			set_page_private(p, EXTENT_PAGE_PRIVATE);
3063 		}
3064 		if (!PageUptodate(p))
3065 			uptodate = 0;
3066 		unlock_page(p);
3067 	}
3068 	if (uptodate)
3069 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3070 
3071 	spin_lock(&tree->buffer_lock);
3072 	exists = buffer_tree_insert(tree, start, &eb->rb_node);
3073 	if (exists) {
3074 		/* add one reference for the caller */
3075 		atomic_inc(&exists->refs);
3076 		spin_unlock(&tree->buffer_lock);
3077 		goto free_eb;
3078 	}
3079 	spin_unlock(&tree->buffer_lock);
3080 
3081 	/* add one reference for the tree */
3082 	atomic_inc(&eb->refs);
3083 	return eb;
3084 
3085 free_eb:
3086 	if (!atomic_dec_and_test(&eb->refs))
3087 		return exists;
3088 	for (index = 1; index < i; index++)
3089 		page_cache_release(extent_buffer_page(eb, index));
3090 	page_cache_release(extent_buffer_page(eb, 0));
3091 	__free_extent_buffer(eb);
3092 	return exists;
3093 }
3094 
3095 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3096 					 u64 start, unsigned long len,
3097 					  gfp_t mask)
3098 {
3099 	struct extent_buffer *eb;
3100 
3101 	spin_lock(&tree->buffer_lock);
3102 	eb = buffer_search(tree, start);
3103 	if (eb)
3104 		atomic_inc(&eb->refs);
3105 	spin_unlock(&tree->buffer_lock);
3106 
3107 	if (eb)
3108 		mark_page_accessed(eb->first_page);
3109 
3110 	return eb;
3111 }
3112 
3113 void free_extent_buffer(struct extent_buffer *eb)
3114 {
3115 	if (!eb)
3116 		return;
3117 
3118 	if (!atomic_dec_and_test(&eb->refs))
3119 		return;
3120 
3121 	WARN_ON(1);
3122 }
3123 
3124 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3125 			      struct extent_buffer *eb)
3126 {
3127 	unsigned long i;
3128 	unsigned long num_pages;
3129 	struct page *page;
3130 
3131 	num_pages = num_extent_pages(eb->start, eb->len);
3132 
3133 	for (i = 0; i < num_pages; i++) {
3134 		page = extent_buffer_page(eb, i);
3135 		if (!PageDirty(page))
3136 			continue;
3137 
3138 		lock_page(page);
3139 		if (i == 0)
3140 			set_page_extent_head(page, eb->len);
3141 		else
3142 			set_page_private(page, EXTENT_PAGE_PRIVATE);
3143 
3144 		clear_page_dirty_for_io(page);
3145 		spin_lock_irq(&page->mapping->tree_lock);
3146 		if (!PageDirty(page)) {
3147 			radix_tree_tag_clear(&page->mapping->page_tree,
3148 						page_index(page),
3149 						PAGECACHE_TAG_DIRTY);
3150 		}
3151 		spin_unlock_irq(&page->mapping->tree_lock);
3152 		unlock_page(page);
3153 	}
3154 	return 0;
3155 }
3156 
3157 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3158 				    struct extent_buffer *eb)
3159 {
3160 	return wait_on_extent_writeback(tree, eb->start,
3161 					eb->start + eb->len - 1);
3162 }
3163 
3164 int set_extent_buffer_dirty(struct extent_io_tree *tree,
3165 			     struct extent_buffer *eb)
3166 {
3167 	unsigned long i;
3168 	unsigned long num_pages;
3169 	int was_dirty = 0;
3170 
3171 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
3172 	num_pages = num_extent_pages(eb->start, eb->len);
3173 	for (i = 0; i < num_pages; i++)
3174 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
3175 	return was_dirty;
3176 }
3177 
3178 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
3179 				struct extent_buffer *eb)
3180 {
3181 	unsigned long i;
3182 	struct page *page;
3183 	unsigned long num_pages;
3184 
3185 	num_pages = num_extent_pages(eb->start, eb->len);
3186 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3187 
3188 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3189 			      GFP_NOFS);
3190 	for (i = 0; i < num_pages; i++) {
3191 		page = extent_buffer_page(eb, i);
3192 		if (page)
3193 			ClearPageUptodate(page);
3194 	}
3195 	return 0;
3196 }
3197 
3198 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3199 				struct extent_buffer *eb)
3200 {
3201 	unsigned long i;
3202 	struct page *page;
3203 	unsigned long num_pages;
3204 
3205 	num_pages = num_extent_pages(eb->start, eb->len);
3206 
3207 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3208 			    GFP_NOFS);
3209 	for (i = 0; i < num_pages; i++) {
3210 		page = extent_buffer_page(eb, i);
3211 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
3212 		    ((i == num_pages - 1) &&
3213 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
3214 			check_page_uptodate(tree, page);
3215 			continue;
3216 		}
3217 		SetPageUptodate(page);
3218 	}
3219 	return 0;
3220 }
3221 
3222 int extent_range_uptodate(struct extent_io_tree *tree,
3223 			  u64 start, u64 end)
3224 {
3225 	struct page *page;
3226 	int ret;
3227 	int pg_uptodate = 1;
3228 	int uptodate;
3229 	unsigned long index;
3230 
3231 	ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1);
3232 	if (ret)
3233 		return 1;
3234 	while (start <= end) {
3235 		index = start >> PAGE_CACHE_SHIFT;
3236 		page = find_get_page(tree->mapping, index);
3237 		uptodate = PageUptodate(page);
3238 		page_cache_release(page);
3239 		if (!uptodate) {
3240 			pg_uptodate = 0;
3241 			break;
3242 		}
3243 		start += PAGE_CACHE_SIZE;
3244 	}
3245 	return pg_uptodate;
3246 }
3247 
3248 int extent_buffer_uptodate(struct extent_io_tree *tree,
3249 			   struct extent_buffer *eb)
3250 {
3251 	int ret = 0;
3252 	unsigned long num_pages;
3253 	unsigned long i;
3254 	struct page *page;
3255 	int pg_uptodate = 1;
3256 
3257 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3258 		return 1;
3259 
3260 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3261 			   EXTENT_UPTODATE, 1);
3262 	if (ret)
3263 		return ret;
3264 
3265 	num_pages = num_extent_pages(eb->start, eb->len);
3266 	for (i = 0; i < num_pages; i++) {
3267 		page = extent_buffer_page(eb, i);
3268 		if (!PageUptodate(page)) {
3269 			pg_uptodate = 0;
3270 			break;
3271 		}
3272 	}
3273 	return pg_uptodate;
3274 }
3275 
3276 int read_extent_buffer_pages(struct extent_io_tree *tree,
3277 			     struct extent_buffer *eb,
3278 			     u64 start, int wait,
3279 			     get_extent_t *get_extent, int mirror_num)
3280 {
3281 	unsigned long i;
3282 	unsigned long start_i;
3283 	struct page *page;
3284 	int err;
3285 	int ret = 0;
3286 	int locked_pages = 0;
3287 	int all_uptodate = 1;
3288 	int inc_all_pages = 0;
3289 	unsigned long num_pages;
3290 	struct bio *bio = NULL;
3291 	unsigned long bio_flags = 0;
3292 
3293 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
3294 		return 0;
3295 
3296 	if (test_range_bit(tree, eb->start, eb->start + eb->len - 1,
3297 			   EXTENT_UPTODATE, 1)) {
3298 		return 0;
3299 	}
3300 
3301 	if (start) {
3302 		WARN_ON(start < eb->start);
3303 		start_i = (start >> PAGE_CACHE_SHIFT) -
3304 			(eb->start >> PAGE_CACHE_SHIFT);
3305 	} else {
3306 		start_i = 0;
3307 	}
3308 
3309 	num_pages = num_extent_pages(eb->start, eb->len);
3310 	for (i = start_i; i < num_pages; i++) {
3311 		page = extent_buffer_page(eb, i);
3312 		if (!wait) {
3313 			if (!trylock_page(page))
3314 				goto unlock_exit;
3315 		} else {
3316 			lock_page(page);
3317 		}
3318 		locked_pages++;
3319 		if (!PageUptodate(page))
3320 			all_uptodate = 0;
3321 	}
3322 	if (all_uptodate) {
3323 		if (start_i == 0)
3324 			set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3325 		goto unlock_exit;
3326 	}
3327 
3328 	for (i = start_i; i < num_pages; i++) {
3329 		page = extent_buffer_page(eb, i);
3330 		if (inc_all_pages)
3331 			page_cache_get(page);
3332 		if (!PageUptodate(page)) {
3333 			if (start_i == 0)
3334 				inc_all_pages = 1;
3335 			ClearPageError(page);
3336 			err = __extent_read_full_page(tree, page,
3337 						      get_extent, &bio,
3338 						      mirror_num, &bio_flags);
3339 			if (err)
3340 				ret = err;
3341 		} else {
3342 			unlock_page(page);
3343 		}
3344 	}
3345 
3346 	if (bio)
3347 		submit_one_bio(READ, bio, mirror_num, bio_flags);
3348 
3349 	if (ret || !wait)
3350 		return ret;
3351 
3352 	for (i = start_i; i < num_pages; i++) {
3353 		page = extent_buffer_page(eb, i);
3354 		wait_on_page_locked(page);
3355 		if (!PageUptodate(page))
3356 			ret = -EIO;
3357 	}
3358 
3359 	if (!ret)
3360 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3361 	return ret;
3362 
3363 unlock_exit:
3364 	i = start_i;
3365 	while (locked_pages > 0) {
3366 		page = extent_buffer_page(eb, i);
3367 		i++;
3368 		unlock_page(page);
3369 		locked_pages--;
3370 	}
3371 	return ret;
3372 }
3373 
3374 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
3375 			unsigned long start,
3376 			unsigned long len)
3377 {
3378 	size_t cur;
3379 	size_t offset;
3380 	struct page *page;
3381 	char *kaddr;
3382 	char *dst = (char *)dstv;
3383 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3384 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3385 
3386 	WARN_ON(start > eb->len);
3387 	WARN_ON(start + len > eb->start + eb->len);
3388 
3389 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3390 
3391 	while (len > 0) {
3392 		page = extent_buffer_page(eb, i);
3393 
3394 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3395 		kaddr = kmap_atomic(page, KM_USER1);
3396 		memcpy(dst, kaddr + offset, cur);
3397 		kunmap_atomic(kaddr, KM_USER1);
3398 
3399 		dst += cur;
3400 		len -= cur;
3401 		offset = 0;
3402 		i++;
3403 	}
3404 }
3405 
3406 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3407 			       unsigned long min_len, char **token, char **map,
3408 			       unsigned long *map_start,
3409 			       unsigned long *map_len, int km)
3410 {
3411 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
3412 	char *kaddr;
3413 	struct page *p;
3414 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3415 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3416 	unsigned long end_i = (start_offset + start + min_len - 1) >>
3417 		PAGE_CACHE_SHIFT;
3418 
3419 	if (i != end_i)
3420 		return -EINVAL;
3421 
3422 	if (i == 0) {
3423 		offset = start_offset;
3424 		*map_start = 0;
3425 	} else {
3426 		offset = 0;
3427 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
3428 	}
3429 
3430 	if (start + min_len > eb->len) {
3431 		printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
3432 		       "wanted %lu %lu\n", (unsigned long long)eb->start,
3433 		       eb->len, start, min_len);
3434 		WARN_ON(1);
3435 	}
3436 
3437 	p = extent_buffer_page(eb, i);
3438 	kaddr = kmap_atomic(p, km);
3439 	*token = kaddr;
3440 	*map = kaddr + offset;
3441 	*map_len = PAGE_CACHE_SIZE - offset;
3442 	return 0;
3443 }
3444 
3445 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
3446 		      unsigned long min_len,
3447 		      char **token, char **map,
3448 		      unsigned long *map_start,
3449 		      unsigned long *map_len, int km)
3450 {
3451 	int err;
3452 	int save = 0;
3453 	if (eb->map_token) {
3454 		unmap_extent_buffer(eb, eb->map_token, km);
3455 		eb->map_token = NULL;
3456 		save = 1;
3457 	}
3458 	err = map_private_extent_buffer(eb, start, min_len, token, map,
3459 				       map_start, map_len, km);
3460 	if (!err && save) {
3461 		eb->map_token = *token;
3462 		eb->kaddr = *map;
3463 		eb->map_start = *map_start;
3464 		eb->map_len = *map_len;
3465 	}
3466 	return err;
3467 }
3468 
3469 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
3470 {
3471 	kunmap_atomic(token, km);
3472 }
3473 
3474 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
3475 			  unsigned long start,
3476 			  unsigned long len)
3477 {
3478 	size_t cur;
3479 	size_t offset;
3480 	struct page *page;
3481 	char *kaddr;
3482 	char *ptr = (char *)ptrv;
3483 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3484 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3485 	int ret = 0;
3486 
3487 	WARN_ON(start > eb->len);
3488 	WARN_ON(start + len > eb->start + eb->len);
3489 
3490 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3491 
3492 	while (len > 0) {
3493 		page = extent_buffer_page(eb, i);
3494 
3495 		cur = min(len, (PAGE_CACHE_SIZE - offset));
3496 
3497 		kaddr = kmap_atomic(page, KM_USER0);
3498 		ret = memcmp(ptr, kaddr + offset, cur);
3499 		kunmap_atomic(kaddr, KM_USER0);
3500 		if (ret)
3501 			break;
3502 
3503 		ptr += cur;
3504 		len -= cur;
3505 		offset = 0;
3506 		i++;
3507 	}
3508 	return ret;
3509 }
3510 
3511 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3512 			 unsigned long start, unsigned long len)
3513 {
3514 	size_t cur;
3515 	size_t offset;
3516 	struct page *page;
3517 	char *kaddr;
3518 	char *src = (char *)srcv;
3519 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3520 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3521 
3522 	WARN_ON(start > eb->len);
3523 	WARN_ON(start + len > eb->start + eb->len);
3524 
3525 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3526 
3527 	while (len > 0) {
3528 		page = extent_buffer_page(eb, i);
3529 		WARN_ON(!PageUptodate(page));
3530 
3531 		cur = min(len, PAGE_CACHE_SIZE - offset);
3532 		kaddr = kmap_atomic(page, KM_USER1);
3533 		memcpy(kaddr + offset, src, cur);
3534 		kunmap_atomic(kaddr, KM_USER1);
3535 
3536 		src += cur;
3537 		len -= cur;
3538 		offset = 0;
3539 		i++;
3540 	}
3541 }
3542 
3543 void memset_extent_buffer(struct extent_buffer *eb, char c,
3544 			  unsigned long start, unsigned long len)
3545 {
3546 	size_t cur;
3547 	size_t offset;
3548 	struct page *page;
3549 	char *kaddr;
3550 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3551 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3552 
3553 	WARN_ON(start > eb->len);
3554 	WARN_ON(start + len > eb->start + eb->len);
3555 
3556 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3557 
3558 	while (len > 0) {
3559 		page = extent_buffer_page(eb, i);
3560 		WARN_ON(!PageUptodate(page));
3561 
3562 		cur = min(len, PAGE_CACHE_SIZE - offset);
3563 		kaddr = kmap_atomic(page, KM_USER0);
3564 		memset(kaddr + offset, c, cur);
3565 		kunmap_atomic(kaddr, KM_USER0);
3566 
3567 		len -= cur;
3568 		offset = 0;
3569 		i++;
3570 	}
3571 }
3572 
3573 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3574 			unsigned long dst_offset, unsigned long src_offset,
3575 			unsigned long len)
3576 {
3577 	u64 dst_len = dst->len;
3578 	size_t cur;
3579 	size_t offset;
3580 	struct page *page;
3581 	char *kaddr;
3582 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3583 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3584 
3585 	WARN_ON(src->len != dst_len);
3586 
3587 	offset = (start_offset + dst_offset) &
3588 		((unsigned long)PAGE_CACHE_SIZE - 1);
3589 
3590 	while (len > 0) {
3591 		page = extent_buffer_page(dst, i);
3592 		WARN_ON(!PageUptodate(page));
3593 
3594 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3595 
3596 		kaddr = kmap_atomic(page, KM_USER0);
3597 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3598 		kunmap_atomic(kaddr, KM_USER0);
3599 
3600 		src_offset += cur;
3601 		len -= cur;
3602 		offset = 0;
3603 		i++;
3604 	}
3605 }
3606 
3607 static void move_pages(struct page *dst_page, struct page *src_page,
3608 		       unsigned long dst_off, unsigned long src_off,
3609 		       unsigned long len)
3610 {
3611 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3612 	if (dst_page == src_page) {
3613 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3614 	} else {
3615 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3616 		char *p = dst_kaddr + dst_off + len;
3617 		char *s = src_kaddr + src_off + len;
3618 
3619 		while (len--)
3620 			*--p = *--s;
3621 
3622 		kunmap_atomic(src_kaddr, KM_USER1);
3623 	}
3624 	kunmap_atomic(dst_kaddr, KM_USER0);
3625 }
3626 
3627 static void copy_pages(struct page *dst_page, struct page *src_page,
3628 		       unsigned long dst_off, unsigned long src_off,
3629 		       unsigned long len)
3630 {
3631 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3632 	char *src_kaddr;
3633 
3634 	if (dst_page != src_page)
3635 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3636 	else
3637 		src_kaddr = dst_kaddr;
3638 
3639 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3640 	kunmap_atomic(dst_kaddr, KM_USER0);
3641 	if (dst_page != src_page)
3642 		kunmap_atomic(src_kaddr, KM_USER1);
3643 }
3644 
3645 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3646 			   unsigned long src_offset, unsigned long len)
3647 {
3648 	size_t cur;
3649 	size_t dst_off_in_page;
3650 	size_t src_off_in_page;
3651 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3652 	unsigned long dst_i;
3653 	unsigned long src_i;
3654 
3655 	if (src_offset + len > dst->len) {
3656 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3657 		       "len %lu dst len %lu\n", src_offset, len, dst->len);
3658 		BUG_ON(1);
3659 	}
3660 	if (dst_offset + len > dst->len) {
3661 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3662 		       "len %lu dst len %lu\n", dst_offset, len, dst->len);
3663 		BUG_ON(1);
3664 	}
3665 
3666 	while (len > 0) {
3667 		dst_off_in_page = (start_offset + dst_offset) &
3668 			((unsigned long)PAGE_CACHE_SIZE - 1);
3669 		src_off_in_page = (start_offset + src_offset) &
3670 			((unsigned long)PAGE_CACHE_SIZE - 1);
3671 
3672 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3673 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3674 
3675 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3676 					       src_off_in_page));
3677 		cur = min_t(unsigned long, cur,
3678 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3679 
3680 		copy_pages(extent_buffer_page(dst, dst_i),
3681 			   extent_buffer_page(dst, src_i),
3682 			   dst_off_in_page, src_off_in_page, cur);
3683 
3684 		src_offset += cur;
3685 		dst_offset += cur;
3686 		len -= cur;
3687 	}
3688 }
3689 
3690 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3691 			   unsigned long src_offset, unsigned long len)
3692 {
3693 	size_t cur;
3694 	size_t dst_off_in_page;
3695 	size_t src_off_in_page;
3696 	unsigned long dst_end = dst_offset + len - 1;
3697 	unsigned long src_end = src_offset + len - 1;
3698 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3699 	unsigned long dst_i;
3700 	unsigned long src_i;
3701 
3702 	if (src_offset + len > dst->len) {
3703 		printk(KERN_ERR "btrfs memmove bogus src_offset %lu move "
3704 		       "len %lu len %lu\n", src_offset, len, dst->len);
3705 		BUG_ON(1);
3706 	}
3707 	if (dst_offset + len > dst->len) {
3708 		printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move "
3709 		       "len %lu len %lu\n", dst_offset, len, dst->len);
3710 		BUG_ON(1);
3711 	}
3712 	if (dst_offset < src_offset) {
3713 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3714 		return;
3715 	}
3716 	while (len > 0) {
3717 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3718 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3719 
3720 		dst_off_in_page = (start_offset + dst_end) &
3721 			((unsigned long)PAGE_CACHE_SIZE - 1);
3722 		src_off_in_page = (start_offset + src_end) &
3723 			((unsigned long)PAGE_CACHE_SIZE - 1);
3724 
3725 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3726 		cur = min(cur, dst_off_in_page + 1);
3727 		move_pages(extent_buffer_page(dst, dst_i),
3728 			   extent_buffer_page(dst, src_i),
3729 			   dst_off_in_page - cur + 1,
3730 			   src_off_in_page - cur + 1, cur);
3731 
3732 		dst_end -= cur;
3733 		src_end -= cur;
3734 		len -= cur;
3735 	}
3736 }
3737 
3738 int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3739 {
3740 	u64 start = page_offset(page);
3741 	struct extent_buffer *eb;
3742 	int ret = 1;
3743 	unsigned long i;
3744 	unsigned long num_pages;
3745 
3746 	spin_lock(&tree->buffer_lock);
3747 	eb = buffer_search(tree, start);
3748 	if (!eb)
3749 		goto out;
3750 
3751 	if (atomic_read(&eb->refs) > 1) {
3752 		ret = 0;
3753 		goto out;
3754 	}
3755 	if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3756 		ret = 0;
3757 		goto out;
3758 	}
3759 	/* at this point we can safely release the extent buffer */
3760 	num_pages = num_extent_pages(eb->start, eb->len);
3761 	for (i = 0; i < num_pages; i++)
3762 		page_cache_release(extent_buffer_page(eb, i));
3763 	rb_erase(&eb->rb_node, &tree->buffer);
3764 	__free_extent_buffer(eb);
3765 out:
3766 	spin_unlock(&tree->buffer_lock);
3767 	return ret;
3768 }
3769