xref: /openbmc/linux/fs/btrfs/extent_map.c (revision 07157aac)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include "extent_map.h"
12 
13 static struct kmem_cache *extent_map_cache;
14 static struct kmem_cache *extent_state_cache;
15 
16 struct tree_entry {
17 	u64 start;
18 	u64 end;
19 	int in_tree;
20 	struct rb_node rb_node;
21 };
22 
23 /* bits for the extent state */
24 #define EXTENT_DIRTY 1
25 #define EXTENT_WRITEBACK (1 << 1)
26 #define EXTENT_UPTODATE (1 << 2)
27 #define EXTENT_LOCKED (1 << 3)
28 #define EXTENT_NEW (1 << 4)
29 #define EXTENT_DELALLOC (1 << 5)
30 
31 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
32 
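/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: these are single-bit flags, so one extent_state can carry several
 * of them at once, and masks like EXTENT_IOBITS are just the OR of the
 * bits the end_io handlers own.  A minimal helper to test such a mask
 * might look like this (the name is invented for the example):
 */
static inline int example_extent_is_io_state(int state_bits)
{
	/* non-zero while locked or writeback IO still owns this state */
	return state_bits & EXTENT_IOBITS;
}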
33 void __init extent_map_init(void)
34 {
35 	extent_map_cache = kmem_cache_create("extent_map",
36 					    sizeof(struct extent_map), 0,
37 					    SLAB_RECLAIM_ACCOUNT |
38 					    SLAB_DESTROY_BY_RCU,
39 					    NULL);
40 	extent_state_cache = kmem_cache_create("extent_state",
41 					    sizeof(struct extent_state), 0,
42 					    SLAB_RECLAIM_ACCOUNT |
43 					    SLAB_DESTROY_BY_RCU,
44 					    NULL);
45 }
46 
47 void __exit extent_map_exit(void)
48 {
49 	if (extent_map_cache)
50 		kmem_cache_destroy(extent_map_cache);
51 	if (extent_state_cache)
52 		kmem_cache_destroy(extent_state_cache);
53 }
54 
55 void extent_map_tree_init(struct extent_map_tree *tree,
56 			  struct address_space *mapping, gfp_t mask)
57 {
58 	tree->map.rb_node = NULL;
59 	tree->state.rb_node = NULL;
60 	tree->ops = NULL;
61 	rwlock_init(&tree->lock);
62 	tree->mapping = mapping;
63 }
64 EXPORT_SYMBOL(extent_map_tree_init);
65 
66 struct extent_map *alloc_extent_map(gfp_t mask)
67 {
68 	struct extent_map *em;
69 	em = kmem_cache_alloc(extent_map_cache, mask);
70 	if (!em || IS_ERR(em))
71 		return em;
72 	em->in_tree = 0;
73 	atomic_set(&em->refs, 1);
74 	return em;
75 }
76 EXPORT_SYMBOL(alloc_extent_map);
77 
78 void free_extent_map(struct extent_map *em)
79 {
80 	if (atomic_dec_and_test(&em->refs)) {
81 		WARN_ON(em->in_tree);
82 		kmem_cache_free(extent_map_cache, em);
83 	}
84 }
85 EXPORT_SYMBOL(free_extent_map);
86 
87 
88 struct extent_state *alloc_extent_state(gfp_t mask)
89 {
90 	struct extent_state *state;
91 	state = kmem_cache_alloc(extent_state_cache, mask);
92 	if (!state || IS_ERR(state))
93 		return state;
94 	state->state = 0;
95 	state->in_tree = 0;
96 	state->private = 0;
97 	atomic_set(&state->refs, 1);
98 	init_waitqueue_head(&state->wq);
99 	return state;
100 }
101 EXPORT_SYMBOL(alloc_extent_state);
102 
103 void free_extent_state(struct extent_state *state)
104 {
105 	if (atomic_dec_and_test(&state->refs)) {
106 		WARN_ON(state->in_tree);
107 		kmem_cache_free(extent_state_cache, state);
108 	}
109 }
110 EXPORT_SYMBOL(free_extent_state);
111 
112 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
113 				   struct rb_node *node)
114 {
115 	struct rb_node ** p = &root->rb_node;
116 	struct rb_node * parent = NULL;
117 	struct tree_entry *entry;
118 
119 	while(*p) {
120 		parent = *p;
121 		entry = rb_entry(parent, struct tree_entry, rb_node);
122 
123 		if (offset < entry->start)
124 			p = &(*p)->rb_left;
125 		else if (offset > entry->end)
126 			p = &(*p)->rb_right;
127 		else
128 			return parent;
129 	}
130 
131 	entry = rb_entry(node, struct tree_entry, rb_node);
132 	entry->in_tree = 1;
133 	rb_link_node(node, parent, p);
134 	rb_insert_color(node, root);
135 	return NULL;
136 }
137 
138 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
139 				   struct rb_node **prev_ret)
140 {
141 	struct rb_node * n = root->rb_node;
142 	struct rb_node *prev = NULL;
143 	struct tree_entry *entry;
144 	struct tree_entry *prev_entry = NULL;
145 
146 	while(n) {
147 		entry = rb_entry(n, struct tree_entry, rb_node);
148 		prev = n;
149 		prev_entry = entry;
150 
151 		if (offset < entry->start)
152 			n = n->rb_left;
153 		else if (offset > entry->end)
154 			n = n->rb_right;
155 		else
156 			return n;
157 	}
158 	if (!prev_ret)
159 		return NULL;
160 	while(prev && offset > prev_entry->end) {
161 		prev = rb_next(prev);
162 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
163 	}
164 	*prev_ret = prev;
165 	return NULL;
166 }
167 
168 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
169 {
170 	struct rb_node *prev;
171 	struct rb_node *ret;
172 	ret = __tree_search(root, offset, &prev);
173 	if (!ret)
174 		return prev;
175 	return ret;
176 }
177 
178 static int tree_delete(struct rb_root *root, u64 offset)
179 {
180 	struct rb_node *node;
181 	struct tree_entry *entry;
182 
183 	node = __tree_search(root, offset, NULL);
184 	if (!node)
185 		return -ENOENT;
186 	entry = rb_entry(node, struct tree_entry, rb_node);
187 	entry->in_tree = 0;
188 	rb_erase(node, root);
189 	return 0;
190 }
191 
192 /*
193  * add_extent_mapping tries a simple backward merge with existing
194  * mappings.  The extent_map struct passed in will be inserted into
195  * the tree directly (no copies made, just a reference taken).
196  */
197 int add_extent_mapping(struct extent_map_tree *tree,
198 		       struct extent_map *em)
199 {
200 	int ret = 0;
201 	struct extent_map *prev = NULL;
202 	struct rb_node *rb;
203 
204 	write_lock_irq(&tree->lock);
205 	rb = tree_insert(&tree->map, em->end, &em->rb_node);
206 	if (rb) {
207 		prev = rb_entry(rb, struct extent_map, rb_node);
208 		printk("found extent map %Lu %Lu on insert of %Lu %Lu\n", prev->start, prev->end, em->start, em->end);
209 		ret = -EEXIST;
210 		goto out;
211 	}
212 	atomic_inc(&em->refs);
213 	if (em->start != 0) {
214 		rb = rb_prev(&em->rb_node);
215 		if (rb)
216 			prev = rb_entry(rb, struct extent_map, rb_node);
217 		if (prev && prev->end + 1 == em->start &&
218 		    ((em->block_start == 0 && prev->block_start == 0) ||
219 			     (em->block_start == prev->block_end + 1))) {
220 			em->start = prev->start;
221 			em->block_start = prev->block_start;
222 			rb_erase(&prev->rb_node, &tree->map);
223 			prev->in_tree = 0;
224 			free_extent_map(prev);
225 		}
226 	 }
227 out:
228 	write_unlock_irq(&tree->lock);
229 	return ret;
230 }
231 EXPORT_SYMBOL(add_extent_mapping);
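/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: a typical extent_map lifecycle.  The caller owns the reference
 * returned by alloc_extent_map(); add_extent_mapping() takes its own
 * reference for the tree, so the caller still drops its reference with
 * free_extent_map().  The byte ranges below are made up for the example.
 */
static void example_add_mapping(struct extent_map_tree *tree)
{
	struct extent_map *em;

	em = alloc_extent_map(GFP_NOFS);
	if (!em || IS_ERR(em))
		return;
	em->start = 0;
	em->end = 4095;
	em->block_start = 8192;
	em->block_end = 12287;
	if (add_extent_mapping(tree, em) == -EEXIST)
		printk("example: [%Lu, %Lu] is already mapped\n",
		       em->start, em->end);
	/* drop the allocation reference; the tree keeps its own */
	free_extent_map(em);
}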
232 
233 /*
234  * lookup_extent_mapping returns the first extent_map struct in the
235  * tree that intersects the [start, end] (inclusive) range.  There may
236  * be additional objects in the tree that intersect, so check the object
237  * returned carefully to make sure you don't need additional lookups.
238  */
239 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
240 					 u64 start, u64 end)
241 {
242 	struct extent_map *em;
243 	struct rb_node *rb_node;
244 
245 	read_lock_irq(&tree->lock);
246 	rb_node = tree_search(&tree->map, start);
247 	if (!rb_node) {
248 		em = NULL;
249 		goto out;
250 	}
251 	if (IS_ERR(rb_node)) {
252 		em = ERR_PTR(PTR_ERR(rb_node));
253 		goto out;
254 	}
255 	em = rb_entry(rb_node, struct extent_map, rb_node);
256 	if (em->end < start || em->start > end) {
257 		em = NULL;
258 		goto out;
259 	}
260 	atomic_inc(&em->refs);
261 out:
262 	read_unlock_irq(&tree->lock);
263 	return em;
264 }
265 EXPORT_SYMBOL(lookup_extent_mapping);
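/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: walking every mapping that intersects [start, end], following the
 * advice in the comment above.  Each successful lookup returns a
 * referenced extent_map, so the reference is dropped once the fields have
 * been read.
 */
static void example_walk_mappings(struct extent_map_tree *tree,
				  u64 start, u64 end)
{
	struct extent_map *em;

	while (start <= end) {
		em = lookup_extent_mapping(tree, start, end);
		if (!em || IS_ERR(em))
			break;
		printk("example: mapping [%Lu, %Lu] -> block %Lu\n",
		       em->start, em->end, em->block_start);
		start = em->end + 1;
		free_extent_map(em);
	}
}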
266 
267 /*
268  * removes an extent_map struct from the tree.  No reference counts are
269  * dropped, and no checks are done to see if the range is in use
270  */
271 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
272 {
273 	int ret;
274 
275 	write_lock_irq(&tree->lock);
276 	ret = tree_delete(&tree->map, em->end);
277 	write_unlock_irq(&tree->lock);
278 	return ret;
279 }
280 EXPORT_SYMBOL(remove_extent_mapping);
281 
282 /*
283  * utility function to look for merge candidates inside a given range.
284  * Any extents with matching state are merged together into a single
285  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
286  * are not merged because the end_io handlers need to be able to do
287  * operations on them without sleeping (or doing allocations/splits).
288  *
289  * This should be called with the tree lock held.
290  */
291 static int merge_state(struct extent_map_tree *tree,
292 		       struct extent_state *state)
293 {
294 	struct extent_state *other;
295 	struct rb_node *other_node;
296 
297 	if (state->state & EXTENT_IOBITS)
298 		return 0;
299 
300 	other_node = rb_prev(&state->rb_node);
301 	if (other_node) {
302 		other = rb_entry(other_node, struct extent_state, rb_node);
303 		if (other->end == state->start - 1 &&
304 		    other->state == state->state) {
305 			state->start = other->start;
306 			other->in_tree = 0;
307 			rb_erase(&other->rb_node, &tree->state);
308 			free_extent_state(other);
309 		}
310 	}
311 	other_node = rb_next(&state->rb_node);
312 	if (other_node) {
313 		other = rb_entry(other_node, struct extent_state, rb_node);
314 		if (other->start == state->end + 1 &&
315 		    other->state == state->state) {
316 			other->start = state->start;
317 			state->in_tree = 0;
318 			rb_erase(&state->rb_node, &tree->state);
319 			free_extent_state(state);
320 		}
321 	}
322 	return 0;
323 }
324 
325 /*
326  * insert an extent_state struct into the tree.  'bits' are set on the
327  * struct before it is inserted.
328  *
329  * This may return -EEXIST if the extent is already there, in which case the
330  * state struct is freed.
331  *
332  * The tree lock is not taken internally.  This is a utility function and
333  * probably isn't what you want to call (see set/clear_extent_bit).
334  */
335 static int insert_state(struct extent_map_tree *tree,
336 			struct extent_state *state, u64 start, u64 end,
337 			int bits)
338 {
339 	struct rb_node *node;
340 
341 	if (end < start) {
342 		printk("end < start %Lu %Lu\n", end, start);
343 		WARN_ON(1);
344 	}
345 	state->state |= bits;
346 	state->start = start;
347 	state->end = end;
348 	if ((end & 4095) == 0) {
349 		printk("insert state %Lu %Lu strange end\n", start, end);
350 		WARN_ON(1);
351 	}
352 	node = tree_insert(&tree->state, end, &state->rb_node);
353 	if (node) {
354 		struct extent_state *found;
355 		found = rb_entry(node, struct extent_state, rb_node);
356 		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, start, end);
357 		free_extent_state(state);
358 		return -EEXIST;
359 	}
360 	merge_state(tree, state);
361 	return 0;
362 }
363 
364 /*
365  * split a given extent state struct in two, inserting the preallocated
366  * struct 'prealloc' as the newly created second half.  'split' indicates an
367  * offset inside 'orig' where it should be split.
368  *
369  * Before calling,
370  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
371  * are two extent state structs in the tree:
372  * prealloc: [orig->start, split - 1]
373  * orig: [ split, orig->end ]
374  *
375  * The tree locks are not taken by this function. They need to be held
376  * by the caller.
377  */
378 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
379 		       struct extent_state *prealloc, u64 split)
380 {
381 	struct rb_node *node;
382 	prealloc->start = orig->start;
383 	prealloc->end = split - 1;
384 	prealloc->state = orig->state;
385 	orig->start = split;
386 	if ((prealloc->end & 4095) == 0) {
387 		printk("insert state %Lu %Lu strange end\n", prealloc->start,
388 		       prealloc->end);
389 		WARN_ON(1);
390 	}
391 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
392 	if (node) {
393 		struct extent_state *found;
394 		found = rb_entry(node, struct extent_state, rb_node);
395 		printk("found node %Lu %Lu on insert of %Lu %Lu\n", found->start, found->end, prealloc->start, prealloc->end);
396 		free_extent_state(prealloc);
397 		return -EEXIST;
398 	}
399 	return 0;
400 }
401 
402 /*
403  * utility function to clear some bits in an extent state struct.
404  * it will optionally wake up any one waiting on this state (wake == 1), or
405  * forcibly remove the state from the tree (delete == 1).
406  *
407  * If no bits are set on the state struct after clearing things, the
408  * struct is freed and removed from the tree
409  */
410 static int clear_state_bit(struct extent_map_tree *tree,
411 			    struct extent_state *state, int bits, int wake,
412 			    int delete)
413 {
414 	int ret = state->state & bits;
415 	state->state &= ~bits;
416 	if (wake)
417 		wake_up(&state->wq);
418 	if (delete || state->state == 0) {
419 		if (state->in_tree) {
420 			rb_erase(&state->rb_node, &tree->state);
421 			state->in_tree = 0;
422 			free_extent_state(state);
423 		} else {
424 			WARN_ON(1);
425 		}
426 	} else {
427 		merge_state(tree, state);
428 	}
429 	return ret;
430 }
431 
432 /*
433  * clear some bits on a range in the tree.  This may require splitting
434  * or inserting elements in the tree, so the gfp mask is used to
435  * indicate which allocations or sleeping are allowed.
436  *
437  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
438  * the given range from the tree regardless of state (ie for truncate).
439  *
440  * the range [start, end] is inclusive.
441  *
442  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
443  * bits were already set, or zero if none of the bits were already set.
444  */
445 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
446 		     int bits, int wake, int delete, gfp_t mask)
447 {
448 	struct extent_state *state;
449 	struct extent_state *prealloc = NULL;
450 	struct rb_node *node;
451 	int err;
452 	int set = 0;
453 
454 again:
455 	if (!prealloc && (mask & __GFP_WAIT)) {
456 		prealloc = alloc_extent_state(mask);
457 		if (!prealloc)
458 			return -ENOMEM;
459 	}
460 
461 	write_lock_irq(&tree->lock);
462 	/*
463 	 * this search will find the extents that end after
464 	 * our range starts
465 	 */
466 	node = tree_search(&tree->state, start);
467 	if (!node)
468 		goto out;
469 	state = rb_entry(node, struct extent_state, rb_node);
470 	if (state->start > end)
471 		goto out;
472 	WARN_ON(state->end < start);
473 
474 	/*
475 	 *     | ---- desired range ---- |
476 	 *  | state | or
477 	 *  | ------------- state -------------- |
478 	 *
479 	 * We need to split the extent we found, and may flip
480 	 * bits on second half.
481 	 *
482 	 * If the extent we found extends past our range, we
483 	 * just split and search again.  It'll get split again
484 	 * the next time though.
485 	 *
486 	 * If the extent we found is inside our range, we clear
487 	 * the desired bit on it.
488 	 */
489 
490 	if (state->start < start) {
491 		err = split_state(tree, state, prealloc, start);
492 		BUG_ON(err == -EEXIST);
493 		prealloc = NULL;
494 		if (err)
495 			goto out;
496 		if (state->end <= end) {
497 			start = state->end + 1;
498 			set |= clear_state_bit(tree, state, bits,
499 					wake, delete);
500 		} else {
501 			start = state->start;
502 		}
503 		goto search_again;
504 	}
505 	/*
506 	 * | ---- desired range ---- |
507 	 *                        | state |
508 	 * We need to split the extent, and clear the bit
509 	 * on the first half
510 	 */
511 	if (state->start <= end && state->end > end) {
512 		err = split_state(tree, state, prealloc, end + 1);
513 		BUG_ON(err == -EEXIST);
514 
515 		if (wake)
516 			wake_up(&state->wq);
517 		set |= clear_state_bit(tree, prealloc, bits,
518 				       wake, delete);
519 		prealloc = NULL;
520 		goto out;
521 	}
522 
523 	start = state->end + 1;
524 	set |= clear_state_bit(tree, state, bits, wake, delete);
525 	goto search_again;
526 
527 out:
528 	write_unlock_irq(&tree->lock);
529 	if (prealloc)
530 		free_extent_state(prealloc);
531 
532 	return set;
533 
534 search_again:
535 	if (start >= end)
536 		goto out;
537 	write_unlock_irq(&tree->lock);
538 	if (mask & __GFP_WAIT)
539 		cond_resched();
540 	goto again;
541 }
542 EXPORT_SYMBOL(clear_extent_bit);
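/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: using the return value of clear_extent_bit().  A positive return
 * means at least part of the range really had the requested bits, which a
 * caller can use to decide whether any follow-up work is needed.
 */
static void example_clear_dirty(struct extent_map_tree *tree,
				u64 start, u64 end)
{
	int was_set;

	was_set = clear_extent_bit(tree, start, end, EXTENT_DIRTY,
				   0, 0, GFP_NOFS);
	if (was_set > 0)
		printk("example: cleared dirty state in [%Lu, %Lu]\n",
		       start, end);
}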
543 
544 static int wait_on_state(struct extent_map_tree *tree,
545 			 struct extent_state *state)
546 {
547 	DEFINE_WAIT(wait);
548 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
549 	read_unlock_irq(&tree->lock);
550 	schedule();
551 	read_lock_irq(&tree->lock);
552 	finish_wait(&state->wq, &wait);
553 	return 0;
554 }
555 
556 /*
557  * waits for one or more bits to clear on a range in the state tree.
558  * The range [start, end] is inclusive.
559  * The tree lock is taken by this function
560  */
561 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
562 {
563 	struct extent_state *state;
564 	struct rb_node *node;
565 
566 	read_lock_irq(&tree->lock);
567 again:
568 	while (1) {
569 		/*
570 		 * this search will find all the extents that end after
571 		 * our range starts
572 		 */
573 		node = tree_search(&tree->state, start);
574 		if (!node)
575 			break;
576 
577 		state = rb_entry(node, struct extent_state, rb_node);
578 
579 		if (state->start > end)
580 			goto out;
581 
582 		if (state->state & bits) {
583 			start = state->start;
584 			atomic_inc(&state->refs);
585 			wait_on_state(tree, state);
586 			free_extent_state(state);
587 			goto again;
588 		}
589 		start = state->end + 1;
590 
591 		if (start > end)
592 			break;
593 
594 		if (need_resched()) {
595 			read_unlock_irq(&tree->lock);
596 			cond_resched();
597 			read_lock_irq(&tree->lock);
598 		}
599 	}
600 out:
601 	read_unlock_irq(&tree->lock);
602 	return 0;
603 }
604 EXPORT_SYMBOL(wait_extent_bit);
605 
606 /*
607  * set some bits on a range in the tree.  This may require allocations
608  * or sleeping, so the gfp mask is used to indicate what is allowed.
609  *
610  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
611  * range already has the desired bits set.  The start of the existing
612  * range is returned in failed_start in this case.
613  *
614  * [start, end] is inclusive
615  * This takes the tree lock.
616  */
617 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
618 		   int exclusive, u64 *failed_start, gfp_t mask)
619 {
620 	struct extent_state *state;
621 	struct extent_state *prealloc = NULL;
622 	struct rb_node *node;
623 	int err = 0;
624 	int set;
625 	u64 last_start;
626 	u64 last_end;
627 again:
628 	if (!prealloc && (mask & __GFP_WAIT)) {
629 		prealloc = alloc_extent_state(mask);
630 		if (!prealloc)
631 			return -ENOMEM;
632 	}
633 
634 	write_lock_irq(&tree->lock);
635 	/*
636 	 * this search will find all the extents that end after
637 	 * our range starts.
638 	 */
639 	node = tree_search(&tree->state, start);
640 	if (!node) {
641 		err = insert_state(tree, prealloc, start, end, bits);
642 		prealloc = NULL;
643 		BUG_ON(err == -EEXIST);
644 		goto out;
645 	}
646 
647 	state = rb_entry(node, struct extent_state, rb_node);
648 	last_start = state->start;
649 	last_end = state->end;
650 
651 	/*
652 	 * | ---- desired range ---- |
653 	 * | state |
654 	 *
655 	 * Just lock what we found and keep going
656 	 */
657 	if (state->start == start && state->end <= end) {
658 		set = state->state & bits;
659 		if (set && exclusive) {
660 			*failed_start = state->start;
661 			err = -EEXIST;
662 			goto out;
663 		}
664 		state->state |= bits;
665 		start = state->end + 1;
666 		merge_state(tree, state);
667 		goto search_again;
668 	}
669 
670 	/*
671 	 *     | ---- desired range ---- |
672 	 * | state |
673 	 *   or
674 	 * | ------------- state -------------- |
675 	 *
676 	 * We need to split the extent we found, and may flip bits on
677 	 * second half.
678 	 *
679 	 * If the extent we found extends past our
680 	 * range, we just split and search again.  It'll get split
681 	 * again the next time though.
682 	 *
683 	 * If the extent we found is inside our range, we set the
684 	 * desired bit on it.
685 	 */
686 	if (state->start < start) {
687 		set = state->state & bits;
688 		if (exclusive && set) {
689 			*failed_start = start;
690 			err = -EEXIST;
691 			goto out;
692 		}
693 		err = split_state(tree, state, prealloc, start);
694 		BUG_ON(err == -EEXIST);
695 		prealloc = NULL;
696 		if (err)
697 			goto out;
698 		if (state->end <= end) {
699 			state->state |= bits;
700 			start = state->end + 1;
701 			merge_state(tree, state);
702 		} else {
703 			start = state->start;
704 		}
705 		goto search_again;
706 	}
707 	/*
708 	 * | ---- desired range ---- |
709 	 *                        | state |
710 	 * We need to split the extent, and set the bit
711 	 * on the first half
712 	 */
713 	if (state->start <= end && state->end > end) {
714 		set = state->state & bits;
715 		if (exclusive && set) {
716 			*failed_start = start;
717 			err = -EEXIST;
718 			goto out;
719 		}
720 		err = split_state(tree, state, prealloc, end + 1);
721 		BUG_ON(err == -EEXIST);
722 
723 		prealloc->state |= bits;
724 		merge_state(tree, prealloc);
725 		prealloc = NULL;
726 		goto out;
727 	}
728 
729 	/*
730 	 * | ---- desired range ---- |
731 	 *     | state | or               | state |
732 	 *
733 	 * There's a hole, we need to insert something in it and
734 	 * ignore the extent we found.
735 	 */
736 	if (state->start > start) {
737 		u64 this_end;
738 		if (end < last_start)
739 			this_end = end;
740 		else
741 			this_end = last_start -1;
742 		err = insert_state(tree, prealloc, start, this_end,
743 				   bits);
744 		prealloc = NULL;
745 		BUG_ON(err == -EEXIST);
746 		if (err)
747 			goto out;
748 		start = this_end + 1;
749 		goto search_again;
750 	}
751 	goto search_again;
752 
753 out:
754 	write_unlock_irq(&tree->lock);
755 	if (prealloc)
756 		free_extent_state(prealloc);
757 
758 	return err;
759 
760 search_again:
761 	if (start > end)
762 		goto out;
763 	write_unlock_irq(&tree->lock);
764 	if (mask & __GFP_WAIT)
765 		cond_resched();
766 	goto again;
767 }
768 EXPORT_SYMBOL(set_extent_bit);
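/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: the 'exclusive' mode of set_extent_bit(), which is how
 * lock_extent() further down is built.  On -EEXIST, failed_start reports
 * where the already-set range begins, so the caller knows where to wait
 * before retrying.
 */
static int example_try_lock(struct extent_map_tree *tree, u64 start, u64 end)
{
	u64 failed_start;
	int err;

	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
			     &failed_start, GFP_NOFS);
	if (err == -EEXIST)
		printk("example: already locked starting at %Lu\n",
		       failed_start);
	return err;
}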
769 
770 /* wrappers around set/clear extent bit */
771 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
772 		     gfp_t mask)
773 {
774 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
775 			      mask);
776 }
777 EXPORT_SYMBOL(set_extent_dirty);
778 
779 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
780 		     gfp_t mask)
781 {
782 	return set_extent_bit(tree, start, end,
783 			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
784 			      mask);
785 }
786 EXPORT_SYMBOL(set_extent_delalloc);
787 
788 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
789 		       gfp_t mask)
790 {
791 	return clear_extent_bit(tree, start, end,
792 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
793 }
794 EXPORT_SYMBOL(clear_extent_dirty);
795 
796 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
797 		     gfp_t mask)
798 {
799 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
800 			      mask);
801 }
802 EXPORT_SYMBOL(set_extent_new);
803 
804 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
805 		       gfp_t mask)
806 {
807 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
808 }
809 EXPORT_SYMBOL(clear_extent_new);
810 
811 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
812 			gfp_t mask)
813 {
814 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
815 			      mask);
816 }
817 EXPORT_SYMBOL(set_extent_uptodate);
818 
819 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
820 			  gfp_t mask)
821 {
822 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
823 }
824 EXPORT_SYMBOL(clear_extent_uptodate);
825 
826 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
827 			 gfp_t mask)
828 {
829 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
830 			      0, NULL, mask);
831 }
832 EXPORT_SYMBOL(set_extent_writeback);
833 
834 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
835 			   gfp_t mask)
836 {
837 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
838 }
839 EXPORT_SYMBOL(clear_extent_writeback);
840 
841 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
842 {
843 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
844 }
845 EXPORT_SYMBOL(wait_on_extent_writeback);
846 
847 /*
848  * locks a range in ascending order, waiting for any locked regions
849  * it hits on the way.  [start,end] are inclusive, and this will sleep.
850  */
851 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
852 {
853 	int err;
854 	u64 failed_start;
855 	while (1) {
856 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
857 				     &failed_start, mask);
858 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
859 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
860 			start = failed_start;
861 		} else {
862 			break;
863 		}
864 		WARN_ON(start > end);
865 	}
866 	return err;
867 }
868 EXPORT_SYMBOL(lock_extent);
869 
870 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
871 		  gfp_t mask)
872 {
873 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
874 }
875 EXPORT_SYMBOL(unlock_extent);
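/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: the usual pairing of lock_extent() and unlock_extent() around work
 * on a byte range.  Both ends are inclusive, like everywhere else in this
 * file, and GFP_NOFS keeps the allocation from recursing into the
 * filesystem.
 */
static void example_locked_section(struct extent_map_tree *tree,
				   u64 start, u64 end)
{
	lock_extent(tree, start, end, GFP_NOFS);
	/* the range is exclusively locked here; do the real work */
	unlock_extent(tree, start, end, GFP_NOFS);
}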
876 
877 /*
878  * helper function to set pages and extents in the tree dirty
879  */
880 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
881 {
882 	unsigned long index = start >> PAGE_CACHE_SHIFT;
883 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
884 	struct page *page;
885 
886 	while (index <= end_index) {
887 		page = find_get_page(tree->mapping, index);
888 		BUG_ON(!page);
889 		__set_page_dirty_nobuffers(page);
890 		page_cache_release(page);
891 		index++;
892 	}
893 	set_extent_dirty(tree, start, end, GFP_NOFS);
894 	return 0;
895 }
896 EXPORT_SYMBOL(set_range_dirty);
897 
898 /*
899  * helper function to set both pages and extents in the tree writeback
900  */
901 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
902 {
903 	unsigned long index = start >> PAGE_CACHE_SHIFT;
904 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
905 	struct page *page;
906 
907 	while (index <= end_index) {
908 		page = find_get_page(tree->mapping, index);
909 		BUG_ON(!page);
910 		set_page_writeback(page);
911 		page_cache_release(page);
912 		index++;
913 	}
914 	set_extent_writeback(tree, start, end, GFP_NOFS);
915 	return 0;
916 }
917 EXPORT_SYMBOL(set_range_writeback);
918 
919 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
920 			     u64 start, u64 lock_start, u64 *end, u64 max_bytes)
921 {
922 	struct rb_node *node;
923 	struct extent_state *state;
924 	u64 cur_start = start;
925 	u64 found = 0;
926 	u64 total_bytes = 0;
927 
928 	write_lock_irq(&tree->lock);
929 	/*
930 	 * this search will find all the extents that end after
931 	 * our range starts.
932 	 */
933 search_again:
934 	node = tree_search(&tree->state, cur_start);
935 	if (!node || IS_ERR(node)) {
936 		goto out;
937 	}
938 
939 	while(1) {
940 		state = rb_entry(node, struct extent_state, rb_node);
941 		if (state->start != cur_start) {
942 			goto out;
943 		}
944 		if (!(state->state & EXTENT_DELALLOC)) {
945 			goto out;
946 		}
947 		if (state->start >= lock_start) {
948 			if (state->state & EXTENT_LOCKED) {
949 				DEFINE_WAIT(wait);
950 				atomic_inc(&state->refs);
951 				atomic_inc(&state->refs);
				/*
				 * queue ourselves on the wait queue before
				 * dropping the tree lock, otherwise the
				 * wake_up() done by clear_state_bit() can
				 * be missed and this schedule() never wakes
				 */
				prepare_to_wait(&state->wq, &wait,
						TASK_UNINTERRUPTIBLE);
952 				write_unlock_irq(&tree->lock);
953 				write_lock_irq(&tree->lock);
954 				finish_wait(&state->wq, &wait);
955 				free_extent_state(state);
956 				goto search_again;
957 			}
958 			state->state |= EXTENT_LOCKED;
959 		}
960 		found++;
961 		*end = state->end;
962 		cur_start = state->end + 1;
963 		node = rb_next(node);
964 		if (!node)
965 			break;
966 		total_bytes += state->end - state->start + 1;
967 		if (total_bytes >= max_bytes)
968 			break;
969 	}
970 out:
971 	write_unlock_irq(&tree->lock);
972 	return found;
973 }
974 
975 /*
976  * helper function to lock both pages and extents in the tree.
977  * pages must be locked first.
978  */
979 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
980 {
981 	unsigned long index = start >> PAGE_CACHE_SHIFT;
982 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
983 	struct page *page;
984 	int err;
985 
986 	while (index <= end_index) {
987 		page = grab_cache_page(tree->mapping, index);
988 		if (!page) {
989 			err = -ENOMEM;
990 			goto failed;
991 		}
992 		if (IS_ERR(page)) {
993 			err = PTR_ERR(page);
994 			goto failed;
995 		}
996 		index++;
997 	}
998 	lock_extent(tree, start, end, GFP_NOFS);
999 	return 0;
1000 
1001 failed:
1002 	/*
1003 	 * we failed above in getting the page at 'index', so we undo here
1004 	 * up to but not including the page at 'index'
1005 	 */
1006 	end_index = index;
1007 	index = start >> PAGE_CACHE_SHIFT;
1008 	while (index < end_index) {
1009 		page = find_get_page(tree->mapping, index);
1010 		unlock_page(page);
1011 		page_cache_release(page);
1012 		index++;
1013 	}
1014 	return err;
1015 }
1016 EXPORT_SYMBOL(lock_range);
1017 
1018 /*
1019  * helper function to unlock both pages and extents in the tree.
1020  */
1021 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1022 {
1023 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1024 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1025 	struct page *page;
1026 
1027 	while (index <= end_index) {
1028 		page = find_get_page(tree->mapping, index);
1029 		unlock_page(page);
1030 		page_cache_release(page);
1031 		index++;
1032 	}
1033 	unlock_extent(tree, start, end, GFP_NOFS);
1034 	return 0;
1035 }
1036 EXPORT_SYMBOL(unlock_range);
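/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: lock_range() and unlock_range() keep the page locks and the extent
 * lock paired.  lock_range() takes the page locks through
 * grab_cache_page(), so the pages covering [start, end] are created in the
 * page cache if they are not already present.
 */
static int example_with_pages_locked(struct extent_map_tree *tree,
				     u64 start, u64 end)
{
	int err;

	err = lock_range(tree, start, end);
	if (err)
		return err;
	/* pages and the extent range are both locked here */
	return unlock_range(tree, start, end);
}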
1037 
1038 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1039 {
1040 	struct rb_node *node;
1041 	struct extent_state *state;
1042 	int ret = 0;
1043 
1044 	write_lock_irq(&tree->lock);
1045 	/*
1046 	 * this search will find all the extents that end after
1047 	 * our range starts.
1048 	 */
1049 	node = tree_search(&tree->state, start);
1050 	if (!node || IS_ERR(node)) {
1051 		ret = -ENOENT;
1052 		goto out;
1053 	}
1054 	state = rb_entry(node, struct extent_state, rb_node);
1055 	if (state->start != start) {
1056 		ret = -ENOENT;
1057 		goto out;
1058 	}
1059 	state->private = private;
1060 out:
1061 	write_unlock_irq(&tree->lock);
1062 	return ret;
1063 
1064 }
1065 
1066 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1067 {
1068 	struct rb_node *node;
1069 	struct extent_state *state;
1070 	int ret = 0;
1071 
1072 	read_lock_irq(&tree->lock);
1073 	/*
1074 	 * this search will find all the extents that end after
1075 	 * our range starts.
1076 	 */
1077 	node = tree_search(&tree->state, start);
1078 	if (!node || IS_ERR(node)) {
1079 		ret = -ENOENT;
1080 		goto out;
1081 	}
1082 	state = rb_entry(node, struct extent_state, rb_node);
1083 	if (state->start != start) {
1084 		ret = -ENOENT;
1085 		goto out;
1086 	}
1087 	*private = state->private;
1088 out:
1089 	read_unlock_irq(&tree->lock);
1090 	return ret;
1091 }
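/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: set_state_private() and get_state_private() attach a single u64 of
 * caller data to the extent_state whose range starts exactly at 'start';
 * both return -ENOENT when no such state exists.  The value used here is
 * arbitrary.
 */
static void example_state_private(struct extent_map_tree *tree, u64 start)
{
	u64 value;

	if (set_state_private(tree, start, 42) == 0 &&
	    get_state_private(tree, start, &value) == 0)
		printk("example: private value at %Lu is %Lu\n",
		       start, value);
}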
1092 
1093 /*
1094  * searches a range in the state tree for a given mask.
1095  * If 'filled' == 1, this returns 1 only if every extent in the range
1096  * has the bits set.  Otherwise, 1 is returned if any bit in the
1097  * range is found set.
1098  */
1099 static int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1100 			  int bits, int filled)
1101 {
1102 	struct extent_state *state = NULL;
1103 	struct rb_node *node;
1104 	int bitset = 0;
1105 
1106 	read_lock_irq(&tree->lock);
1107 	node = tree_search(&tree->state, start);
1108 	while (node && start <= end) {
1109 		state = rb_entry(node, struct extent_state, rb_node);
1110 		if (state->start > end)
1111 			break;
1112 
1113 		if (filled && state->start > start) {
1114 			bitset = 0;
1115 			break;
1116 		}
1117 		if (state->state & bits) {
1118 			bitset = 1;
1119 			if (!filled)
1120 				break;
1121 		} else if (filled) {
1122 			bitset = 0;
1123 			break;
1124 		}
1125 		start = state->end + 1;
1126 		if (start > end)
1127 			break;
1128 		node = rb_next(node);
1129 	}
1130 	read_unlock_irq(&tree->lock);
1131 	return bitset;
1132 }
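/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: the two modes of test_range_bit().  With filled == 1 the whole
 * range must be covered by states carrying the bit, with filled == 0 a
 * single state carrying it anywhere in the range is enough.
 */
static void example_test_range(struct extent_map_tree *tree,
			       u64 start, u64 end)
{
	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
		printk("example: [%Lu, %Lu] is fully uptodate\n", start, end);
	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
		printk("example: something in [%Lu, %Lu] is locked\n",
		       start, end);
}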
1133 
1134 /*
1135  * helper function to set a given page up to date if all the
1136  * extents in the tree for that page are up to date
1137  */
1138 static int check_page_uptodate(struct extent_map_tree *tree,
1139 			       struct page *page)
1140 {
1141 	u64 start = page->index << PAGE_CACHE_SHIFT;
1142 	u64 end = start + PAGE_CACHE_SIZE - 1;
1143 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1144 		SetPageUptodate(page);
1145 	return 0;
1146 }
1147 
1148 /*
1149  * helper function to unlock a page if all the extents in the tree
1150  * for that page are unlocked
1151  */
1152 static int check_page_locked(struct extent_map_tree *tree,
1153 			     struct page *page)
1154 {
1155 	u64 start = page->index << PAGE_CACHE_SHIFT;
1156 	u64 end = start + PAGE_CACHE_SIZE - 1;
1157 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1158 		unlock_page(page);
1159 	return 0;
1160 }
1161 
1162 /*
1163  * helper function to end page writeback if all the extents
1164  * in the tree for that page are done with writeback
1165  */
1166 static int check_page_writeback(struct extent_map_tree *tree,
1167 			     struct page *page)
1168 {
1169 	u64 start = page->index << PAGE_CACHE_SHIFT;
1170 	u64 end = start + PAGE_CACHE_SIZE - 1;
1171 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1172 		end_page_writeback(page);
1173 	return 0;
1174 }
1175 
1176 /* lots and lots of room for performance fixes in the end_bio funcs */
1177 
1178 /*
1179  * after a writepage IO is done, we need to:
1180  * clear the uptodate bits on error
1181  * clear the writeback bits in the extent tree for this IO
1182  * end_page_writeback if the page has no more pending IO
1183  *
1184  * Scheduling is not allowed, so the extent state tree is expected
1185  * to have one and only one object corresponding to this IO.
1186  */
1187 static int end_bio_extent_writepage(struct bio *bio,
1188 				   unsigned int bytes_done, int err)
1189 {
1190 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1191 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1192 	struct extent_map_tree *tree = bio->bi_private;
1193 	u64 start;
1194 	u64 end;
1195 	int whole_page;
1196 
1197 	if (bio->bi_size)
1198 		return 1;
1199 
1200 	do {
1201 		struct page *page = bvec->bv_page;
1202 		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1203 		end = start + bvec->bv_len - 1;
1204 
1205 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1206 			whole_page = 1;
1207 		else
1208 			whole_page = 0;
1209 
1210 		if (--bvec >= bio->bi_io_vec)
1211 			prefetchw(&bvec->bv_page->flags);
1212 
1213 		if (!uptodate) {
1214 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1215 			ClearPageUptodate(page);
1216 			SetPageError(page);
1217 		}
1218 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1219 
1220 		if (whole_page)
1221 			end_page_writeback(page);
1222 		else
1223 			check_page_writeback(tree, page);
1224 	} while (bvec >= bio->bi_io_vec);
1225 
1226 	bio_put(bio);
1227 	return 0;
1228 }
1229 
1230 /*
1231  * after a readpage IO is done, we need to:
1232  * clear the uptodate bits on error
1233  * set the uptodate bits if things worked
1234  * set the page up to date if all extents in the tree are uptodate
1235  * clear the lock bit in the extent tree
1236  * unlock the page if there are no other extents locked for it
1237  *
1238  * Scheduling is not allowed, so the extent state tree is expected
1239  * to have one and only one object corresponding to this IO.
1240  */
1241 static int end_bio_extent_readpage(struct bio *bio,
1242 				   unsigned int bytes_done, int err)
1243 {
1244 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1245 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1246 	struct extent_map_tree *tree = bio->bi_private;
1247 	u64 start;
1248 	u64 end;
1249 	int whole_page;
1250 	int ret;
1251 
1252 	if (bio->bi_size)
1253 		return 1;
1254 
1255 	do {
1256 		struct page *page = bvec->bv_page;
1257 		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1258 		end = start + bvec->bv_len - 1;
1259 
1260 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1261 			whole_page = 1;
1262 		else
1263 			whole_page = 0;
1264 
1265 		if (--bvec >= bio->bi_io_vec)
1266 			prefetchw(&bvec->bv_page->flags);
1267 
1268 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1269 			ret = tree->ops->readpage_end_io_hook(page, start, end);
1270 			if (ret)
1271 				uptodate = 0;
1272 		}
1273 		if (uptodate) {
1274 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1275 			if (whole_page)
1276 				SetPageUptodate(page);
1277 			else
1278 				check_page_uptodate(tree, page);
1279 		} else {
1280 			ClearPageUptodate(page);
1281 			SetPageError(page);
1282 		}
1283 
1284 		unlock_extent(tree, start, end, GFP_ATOMIC);
1285 
1286 		if (whole_page)
1287 			unlock_page(page);
1288 		else
1289 			check_page_locked(tree, page);
1290 	} while (bvec >= bio->bi_io_vec);
1291 
1292 	bio_put(bio);
1293 	return 0;
1294 }
1295 
1296 /*
1297  * IO done from prepare_write is pretty simple, we just unlock
1298  * the structs in the extent tree when done, and set the uptodate bits
1299  * as appropriate.
1300  */
1301 static int end_bio_extent_preparewrite(struct bio *bio,
1302 				       unsigned int bytes_done, int err)
1303 {
1304 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1305 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1306 	struct extent_map_tree *tree = bio->bi_private;
1307 	u64 start;
1308 	u64 end;
1309 
1310 	if (bio->bi_size)
1311 		return 1;
1312 
1313 	do {
1314 		struct page *page = bvec->bv_page;
1315 		start = (page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1316 		end = start + bvec->bv_len - 1;
1317 
1318 		if (--bvec >= bio->bi_io_vec)
1319 			prefetchw(&bvec->bv_page->flags);
1320 
1321 		if (uptodate) {
1322 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1323 		} else {
1324 			ClearPageUptodate(page);
1325 			SetPageError(page);
1326 		}
1327 
1328 		unlock_extent(tree, start, end, GFP_ATOMIC);
1329 
1330 	} while (bvec >= bio->bi_io_vec);
1331 
1332 	bio_put(bio);
1333 	return 0;
1334 }
1335 
1336 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1337 			      struct page *page, sector_t sector,
1338 			      size_t size, unsigned long offset,
1339 			      struct block_device *bdev,
1340 			      bio_end_io_t end_io_func)
1341 {
1342 	struct bio *bio;
1343 	int ret = 0;
1344 
1345 	bio = bio_alloc(GFP_NOIO, 1);
1346 
1347 	bio->bi_sector = sector;
1348 	bio->bi_bdev = bdev;
1349 	bio->bi_io_vec[0].bv_page = page;
1350 	bio->bi_io_vec[0].bv_len = size;
1351 	bio->bi_io_vec[0].bv_offset = offset;
1352 
1353 	bio->bi_vcnt = 1;
1354 	bio->bi_idx = 0;
1355 	bio->bi_size = size;
1356 
1357 	bio->bi_end_io = end_io_func;
1358 	bio->bi_private = tree;
1359 
1360 	bio_get(bio);
1361 	submit_bio(rw, bio);
1362 
1363 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1364 		ret = -EOPNOTSUPP;
1365 
1366 	bio_put(bio);
1367 	return ret;
1368 }
1369 
1370 /*
1371  * basic readpage implementation.  Locked extent state structs are inserted
1372  * into the tree and removed again when the IO is done (by the end_io
1373  * handlers)
1374  */
1375 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1376 			  get_extent_t *get_extent)
1377 {
1378 	struct inode *inode = page->mapping->host;
1379 	u64 start = page->index << PAGE_CACHE_SHIFT;
1380 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1381 	u64 end;
1382 	u64 cur = start;
1383 	u64 extent_offset;
1384 	u64 last_byte = i_size_read(inode);
1385 	u64 block_start;
1386 	u64 cur_end;
1387 	sector_t sector;
1388 	struct extent_map *em;
1389 	struct block_device *bdev;
1390 	int ret;
1391 	int nr = 0;
1392 	size_t page_offset = 0;
1393 	size_t iosize;
1394 	size_t blocksize = inode->i_sb->s_blocksize;
1395 
1396 	if (!PagePrivate(page)) {
1397 		SetPagePrivate(page);
1398 		set_page_private(page, 1);
1399 		WARN_ON(!page->mapping->a_ops->invalidatepage);
1400 		page_cache_get(page);
1401 	}
1402 
1403 	end = page_end;
1404 	lock_extent(tree, start, end, GFP_NOFS);
1405 
1406 	while (cur <= end) {
1407 		if (cur >= last_byte) {
1408 			iosize = PAGE_CACHE_SIZE - page_offset;
1409 			zero_user_page(page, page_offset, iosize, KM_USER0);
1410 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1411 					    GFP_NOFS);
1412 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1413 			break;
1414 		}
1415 		em = get_extent(inode, page, page_offset, cur, end, 0);
1416 		if (IS_ERR(em) || !em) {
1417 			SetPageError(page);
1418 			unlock_extent(tree, cur, end, GFP_NOFS);
1419 			break;
1420 		}
1421 
1422 		extent_offset = cur - em->start;
1423 		BUG_ON(em->end < cur);
1424 		BUG_ON(end < cur);
1425 
1426 		iosize = min(em->end - cur, end - cur) + 1;
1427 		cur_end = min(em->end, end);
1428 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1429 		sector = (em->block_start + extent_offset) >> 9;
1430 		bdev = em->bdev;
1431 		block_start = em->block_start;
1432 		free_extent_map(em);
1433 		em = NULL;
1434 
1435 		/* we've found a hole, just zero and go on */
1436 		if (block_start == 0) {
1437 			zero_user_page(page, page_offset, iosize, KM_USER0);
1438 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1439 					    GFP_NOFS);
1440 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1441 			cur = cur + iosize;
1442 			page_offset += iosize;
1443 			continue;
1444 		}
1445 		/* the get_extent function already copied into the page */
1446 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1447 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1448 			cur = cur + iosize;
1449 			page_offset += iosize;
1450 			continue;
1451 		}
1452 
1453 		ret = 0;
1454 		if (tree->ops && tree->ops->readpage_io_hook) {
1455 			ret = tree->ops->readpage_io_hook(page, cur,
1456 							  cur + iosize - 1);
1457 		}
1458 		if (!ret) {
1459 			ret = submit_extent_page(READ, tree, page,
1460 						 sector, iosize, page_offset,
1461 						 bdev, end_bio_extent_readpage);
1462 		}
1463 		if (ret)
1464 			SetPageError(page);
1465 		cur = cur + iosize;
1466 		page_offset += iosize;
1467 		nr++;
1468 	}
1469 	if (!nr) {
1470 		if (!PageError(page))
1471 			SetPageUptodate(page);
1472 		unlock_page(page);
1473 	}
1474 	return 0;
1475 }
1476 EXPORT_SYMBOL(extent_read_full_page);
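/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: the PagePrivate setup repeated in extent_read_full_page(),
 * extent_write_full_page(), extent_commit_write() and
 * extent_prepare_write() could be shared through one small helper like
 * this.
 */
static void example_set_page_extent_mapped(struct page *page)
{
	if (!PagePrivate(page)) {
		SetPagePrivate(page);
		set_page_private(page, 1);
		WARN_ON(!page->mapping->a_ops->invalidatepage);
		page_cache_get(page);
	}
}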
1477 
1478 /*
1479  * the writepage semantics are similar to regular writepage.  extent
1480  * records are inserted to lock ranges in the tree, and as dirty areas
1481  * are found, they are marked writeback.  Then the lock bits are removed
1482  * and the end_io handler clears the writeback ranges
1483  */
1484 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
1485 			  get_extent_t *get_extent,
1486 			  struct writeback_control *wbc)
1487 {
1488 	struct inode *inode = page->mapping->host;
1489 	u64 start = page->index << PAGE_CACHE_SHIFT;
1490 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1491 	u64 end;
1492 	u64 cur = start;
1493 	u64 extent_offset;
1494 	u64 last_byte = i_size_read(inode);
1495 	u64 block_start;
1496 	sector_t sector;
1497 	struct extent_map *em;
1498 	struct block_device *bdev;
1499 	int ret;
1500 	int nr = 0;
1501 	size_t page_offset = 0;
1502 	size_t iosize;
1503 	size_t blocksize;
1504 	loff_t i_size = i_size_read(inode);
1505 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1506 	u64 nr_delalloc;
1507 	u64 delalloc_end;
1508 
1509 	WARN_ON(!PageLocked(page));
1510 	if (page->index > end_index) {
1511 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1512 		unlock_page(page);
1513 		return 0;
1514 	}
1515 
1516 	if (page->index == end_index) {
1517 		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1518 		zero_user_page(page, offset,
1519 			       PAGE_CACHE_SIZE - offset, KM_USER0);
1520 	}
1521 
1522 	if (!PagePrivate(page)) {
1523 		SetPagePrivate(page);
1524 		set_page_private(page, 1);
1525 		WARN_ON(!page->mapping->a_ops->invalidatepage);
1526 		page_cache_get(page);
1527 	}
1528 
1529 	lock_extent(tree, start, page_end, GFP_NOFS);
1530 	nr_delalloc = find_lock_delalloc_range(tree, start, page_end + 1,
1531 					       &delalloc_end,
1532 					       128 * 1024 * 1024);
1533 	if (nr_delalloc) {
1534 		tree->ops->fill_delalloc(inode, start, delalloc_end);
1535 		if (delalloc_end >= page_end + 1) {
1536 			clear_extent_bit(tree, page_end + 1, delalloc_end,
1537 					 EXTENT_LOCKED | EXTENT_DELALLOC,
1538 					 1, 0, GFP_NOFS);
1539 		}
1540 		clear_extent_bit(tree, start, page_end, EXTENT_DELALLOC,
1541 				 0, 0, GFP_NOFS);
1542 		if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1543 			printk("found delalloc bits after clear extent_bit\n");
1544 		}
1545 	} else if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1546 		printk("found delalloc bits after find_delalloc_range returns 0\n");
1547 	}
1548 
1549 	end = page_end;
1550 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1551 		printk("found delalloc bits after lock_extent\n");
1552 	}
1553 
1554 	if (last_byte <= start) {
1555 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1556 		goto done;
1557 	}
1558 
1559 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1560 	blocksize = inode->i_sb->s_blocksize;
1561 
1562 	while (cur <= end) {
1563 		if (cur >= last_byte) {
1564 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1565 			break;
1566 		}
1567 		em = get_extent(inode, page, page_offset, cur, end, 0);
1568 		if (IS_ERR(em) || !em) {
1569 			SetPageError(page);
1570 			break;
1571 		}
1572 
1573 		extent_offset = cur - em->start;
1574 		BUG_ON(em->end < cur);
1575 		BUG_ON(end < cur);
1576 		iosize = min(em->end - cur, end - cur) + 1;
1577 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1578 		sector = (em->block_start + extent_offset) >> 9;
1579 		bdev = em->bdev;
1580 		block_start = em->block_start;
1581 		free_extent_map(em);
1582 		em = NULL;
1583 
1584 		if (block_start == 0 || block_start == EXTENT_MAP_INLINE) {
1585 			clear_extent_dirty(tree, cur,
1586 					   cur + iosize - 1, GFP_NOFS);
1587 			cur = cur + iosize;
1588 			page_offset += iosize;
1589 			continue;
1590 		}
1591 
1592 		/* leave this out until we have a page_mkwrite call */
1593 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1594 				   EXTENT_DIRTY, 0)) {
1595 			cur = cur + iosize;
1596 			page_offset += iosize;
1597 			continue;
1598 		}
1599 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1600 		ret = tree->ops->writepage_io_hook(page, cur, cur + iosize - 1);
1601 		if (ret)
1602 			SetPageError(page);
1603 		else {
1604 			set_range_writeback(tree, cur, cur + iosize - 1);
1605 			ret = submit_extent_page(WRITE, tree, page, sector,
1606 						 iosize, page_offset, bdev,
1607 						 end_bio_extent_writepage);
1608 			if (ret)
1609 				SetPageError(page);
1610 		}
1611 		cur = cur + iosize;
1612 		page_offset += iosize;
1613 		nr++;
1614 	}
1615 done:
1616 	WARN_ON(test_range_bit(tree, start, page_end, EXTENT_DIRTY, 0));
1617 	unlock_extent(tree, start, page_end, GFP_NOFS);
1618 	unlock_page(page);
1619 	return 0;
1620 }
1621 EXPORT_SYMBOL(extent_write_full_page);
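/*
 * Illustrative sketch, added for exposition and not part of the original
 * file: the block-size rounding both the read and write paths use when
 * sizing an IO.  With a 4096 byte blocksize, a 5000 byte chunk is padded
 * out to 8192 before the bio is built.
 */
static u64 example_round_up_iosize(u64 iosize, u64 blocksize)
{
	/* round iosize up to the next multiple of blocksize */
	return (iosize + blocksize - 1) & ~(blocksize - 1);
}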
1622 
1623 /*
1624  * basic invalidatepage code, this waits on any locked or writeback
1625  * ranges corresponding to the page, and then deletes any extent state
1626  * records from the tree
1627  */
1628 int extent_invalidatepage(struct extent_map_tree *tree,
1629 			  struct page *page, unsigned long offset)
1630 {
1631 	u64 start = (page->index << PAGE_CACHE_SHIFT);
1632 	u64 end = start + PAGE_CACHE_SIZE - 1;
1633 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
1634 
1635 	start += (offset + blocksize -1) & ~(blocksize - 1);
1636 	if (start > end)
1637 		return 0;
1638 
1639 	lock_extent(tree, start, end, GFP_NOFS);
1640 	wait_on_extent_writeback(tree, start, end);
1641 	clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY,
1642 			 1, 1, GFP_NOFS);
1643 	return 0;
1644 }
1645 EXPORT_SYMBOL(extent_invalidatepage);
1646 
1647 /*
1648  * simple commit_write call, set_range_dirty is used to mark both
1649  * the pages and the extent records as dirty
1650  */
1651 int extent_commit_write(struct extent_map_tree *tree,
1652 			struct inode *inode, struct page *page,
1653 			unsigned from, unsigned to)
1654 {
1655 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1656 
1657 	if (!PagePrivate(page)) {
1658 		SetPagePrivate(page);
1659 		set_page_private(page, 1);
1660 		WARN_ON(!page->mapping->a_ops->invalidatepage);
1661 		page_cache_get(page);
1662 	}
1663 
1664 	set_page_dirty(page);
1665 
1666 	if (pos > inode->i_size) {
1667 		i_size_write(inode, pos);
1668 		mark_inode_dirty(inode);
1669 	}
1670 	return 0;
1671 }
1672 EXPORT_SYMBOL(extent_commit_write);
1673 
1674 int extent_prepare_write(struct extent_map_tree *tree,
1675 			 struct inode *inode, struct page *page,
1676 			 unsigned from, unsigned to, get_extent_t *get_extent)
1677 {
1678 	u64 page_start = page->index << PAGE_CACHE_SHIFT;
1679 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
1680 	u64 block_start;
1681 	u64 orig_block_start;
1682 	u64 block_end;
1683 	u64 cur_end;
1684 	struct extent_map *em;
1685 	unsigned blocksize = 1 << inode->i_blkbits;
1686 	size_t page_offset = 0;
1687 	size_t block_off_start;
1688 	size_t block_off_end;
1689 	int err = 0;
1690 	int iocount = 0;
1691 	int ret = 0;
1692 	int isnew;
1693 
1694 	if (!PagePrivate(page)) {
1695 		SetPagePrivate(page);
1696 		set_page_private(page, 1);
1697 		WARN_ON(!page->mapping->a_ops->invalidatepage);
1698 		page_cache_get(page);
1699 	}
1700 	block_start = (page_start + from) & ~((u64)blocksize - 1);
1701 	block_end = (page_start + to - 1) | (blocksize - 1);
1702 	orig_block_start = block_start;
1703 
1704 	lock_extent(tree, page_start, page_end, GFP_NOFS);
1705 	while(block_start <= block_end) {
1706 		em = get_extent(inode, page, page_offset, block_start,
1707 				block_end, 1);
1708 		if (IS_ERR(em) || !em) {
1709 			goto err;
1710 		}
1711 		cur_end = min(block_end, em->end);
1712 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
1713 		block_off_end = block_off_start + blocksize;
1714 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
1715 
1716 		if (!PageUptodate(page) && isnew &&
1717 		    (block_off_end > to || block_off_start < from)) {
1718 			void *kaddr;
1719 
1720 			kaddr = kmap_atomic(page, KM_USER0);
1721 			if (block_off_end > to)
1722 				memset(kaddr + to, 0, block_off_end - to);
1723 			if (block_off_start < from)
1724 				memset(kaddr + block_off_start, 0,
1725 				       from - block_off_start);
1726 			flush_dcache_page(page);
1727 			kunmap_atomic(kaddr, KM_USER0);
1728 		}
1729 		if (!isnew && !PageUptodate(page) &&
1730 		    (block_off_end > to || block_off_start < from) &&
1731 		    !test_range_bit(tree, block_start, cur_end,
1732 				    EXTENT_UPTODATE, 1)) {
1733 			u64 sector;
1734 			u64 extent_offset = block_start - em->start;
1735 			size_t iosize;
1736 			sector = (em->block_start + extent_offset) >> 9;
1737 			iosize = (cur_end - block_start + blocksize - 1) &
1738 				~((u64)blocksize - 1);
1739 			/*
1740 			 * we've already got the extent locked, but we
1741 			 * need to split the state such that our end_bio
1742 			 * handler can clear the lock.
1743 			 */
1744 			set_extent_bit(tree, block_start,
1745 				       block_start + iosize - 1,
1746 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
1747 			ret = submit_extent_page(READ, tree, page,
1748 					 sector, iosize, page_offset, em->bdev,
1749 					 end_bio_extent_preparewrite);
1750 			iocount++;
1751 			block_start = block_start + iosize;
1752 		} else {
1753 			set_extent_uptodate(tree, block_start, cur_end,
1754 					    GFP_NOFS);
1755 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
1756 			block_start = cur_end + 1;
1757 		}
1758 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
1759 		free_extent_map(em);
1760 	}
1761 	if (iocount) {
1762 		wait_extent_bit(tree, orig_block_start,
1763 				block_end, EXTENT_LOCKED);
1764 	}
1765 	check_page_uptodate(tree, page);
1766 err:
1767 	/* FIXME, zero out newly allocated blocks on error */
1768 	return err;
1769 }
1770 EXPORT_SYMBOL(extent_prepare_write);
1771 
1772 /*
1773  * a helper for releasepage.  As long as there are no locked extents
1774  * in the range corresponding to the page, both state records and extent
1775  * map records are removed
1776  */
1777 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
1778 {
1779 	struct extent_map *em;
1780 	u64 start = page->index << PAGE_CACHE_SHIFT;
1781 	u64 end = start + PAGE_CACHE_SIZE - 1;
1782 	u64 orig_start = start;
1783 	int ret = 1;
1784 
1785 	while (start <= end) {
1786 		em = lookup_extent_mapping(tree, start, end);
1787 		if (!em || IS_ERR(em))
1788 			break;
1789 		if (!test_range_bit(tree, em->start, em->end,
1790 				    EXTENT_LOCKED, 0)) {
1791 			remove_extent_mapping(tree, em);
1792 			/* once for the rb tree */
1793 			free_extent_map(em);
1794 		}
1795 		start = em->end + 1;
1796 		/* once for us */
1797 		free_extent_map(em);
1798 	}
1799 	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
1800 		ret = 0;
1801 	else
1802 		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
1803 				 1, 1, GFP_NOFS);
1804 	return ret;
1805 }
1806 EXPORT_SYMBOL(try_release_extent_mapping);
1807 
1808