xref: /openbmc/linux/fs/btrfs/extent_map.c (revision 5f56406a)
1 #include <linux/bitops.h>
2 #include <linux/slab.h>
3 #include <linux/bio.h>
4 #include <linux/mm.h>
5 #include <linux/gfp.h>
6 #include <linux/pagemap.h>
7 #include <linux/page-flags.h>
8 #include <linux/module.h>
9 #include <linux/spinlock.h>
10 #include <linux/blkdev.h>
11 #include <linux/swap.h>
12 #include <linux/version.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include "extent_map.h"
16 
17 /* temporary define until extent_map moves out of btrfs */
18 struct kmem_cache *btrfs_cache_create(const char *name, size_t size,
19 				       unsigned long extra_flags,
20 				       void (*ctor)(void *, struct kmem_cache *,
21 						    unsigned long));
22 
23 static struct kmem_cache *extent_map_cache;
24 static struct kmem_cache *extent_state_cache;
25 static struct kmem_cache *extent_buffer_cache;
26 
27 static LIST_HEAD(buffers);
28 static LIST_HEAD(states);
29 
30 static spinlock_t state_lock = SPIN_LOCK_UNLOCKED;
31 #define BUFFER_LRU_MAX 64
32 
33 struct tree_entry {
34 	u64 start;
35 	u64 end;
36 	int in_tree;
37 	struct rb_node rb_node;
38 };
39 
40 struct extent_page_data {
41 	struct bio *bio;
42 	struct extent_map_tree *tree;
43 	get_extent_t *get_extent;
44 };
45 
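/*
 * create the slab caches used by this file for extent_map, extent_state
 * and extent_buffer objects.  Returns -ENOMEM if any of them cannot be
 * created.
 */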
46 int __init extent_map_init(void)
47 {
48 	extent_map_cache = btrfs_cache_create("extent_map",
49 					    sizeof(struct extent_map), 0,
50 					    NULL);
51 	if (!extent_map_cache)
52 		return -ENOMEM;
53 	extent_state_cache = btrfs_cache_create("extent_state",
54 					    sizeof(struct extent_state), 0,
55 					    NULL);
56 	if (!extent_state_cache)
57 		goto free_map_cache;
58 	extent_buffer_cache = btrfs_cache_create("extent_buffers",
59 					    sizeof(struct extent_buffer), 0,
60 					    NULL);
61 	if (!extent_buffer_cache)
62 		goto free_state_cache;
63 	return 0;
64 
65 free_state_cache:
66 	kmem_cache_destroy(extent_state_cache);
67 free_map_cache:
68 	kmem_cache_destroy(extent_map_cache);
69 	return -ENOMEM;
70 }
71 
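/*
 * walk the global 'states' list, report any extent_state objects that
 * were leaked and free them, then destroy the three slab caches.
 */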
72 void extent_map_exit(void)
73 {
74 	struct extent_state *state;
75 
76 	while (!list_empty(&states)) {
77 		state = list_entry(states.next, struct extent_state, list);
78 		printk("state leak: start %Lu end %Lu state %lu in tree %d refs %d\n",
		       state->start, state->end, state->state, state->in_tree,
		       atomic_read(&state->refs));
79 		list_del(&state->list);
80 		kmem_cache_free(extent_state_cache, state);
81 
82 	}
83 
84 	if (extent_map_cache)
85 		kmem_cache_destroy(extent_map_cache);
86 	if (extent_state_cache)
87 		kmem_cache_destroy(extent_state_cache);
88 	if (extent_buffer_cache)
89 		kmem_cache_destroy(extent_buffer_cache);
90 }
91 
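/*
 * initialize an extent_map_tree: empty map and state rb trees, an empty
 * buffer lru, and the address_space the tree is backing.  A minimal
 * usage sketch (illustrative only, 'inode' is assumed to be at hand):
 *
 *	struct extent_map_tree tree;
 *
 *	extent_map_tree_init(&tree, inode->i_mapping, GFP_NOFS);
 */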
92 void extent_map_tree_init(struct extent_map_tree *tree,
93 			  struct address_space *mapping, gfp_t mask)
94 {
95 	tree->map.rb_node = NULL;
96 	tree->state.rb_node = NULL;
97 	tree->ops = NULL;
98 	tree->dirty_bytes = 0;
99 	rwlock_init(&tree->lock);
100 	spin_lock_init(&tree->lru_lock);
101 	tree->mapping = mapping;
102 	INIT_LIST_HEAD(&tree->buffer_lru);
103 	tree->lru_size = 0;
104 }
105 EXPORT_SYMBOL(extent_map_tree_init);
106 
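/*
 * drop every extent_buffer cached on the per-tree lru, releasing the
 * reference the lru held on each of them.
 */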
107 void extent_map_tree_empty_lru(struct extent_map_tree *tree)
108 {
109 	struct extent_buffer *eb;
110 	while(!list_empty(&tree->buffer_lru)) {
111 		eb = list_entry(tree->buffer_lru.next, struct extent_buffer,
112 				lru);
113 		list_del_init(&eb->lru);
114 		free_extent_buffer(eb);
115 	}
116 }
117 EXPORT_SYMBOL(extent_map_tree_empty_lru);
118 
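/*
 * allocate an extent_map from the slab cache with a reference count of
 * one.  The caller is responsible for filling in the range fields.
 */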
119 struct extent_map *alloc_extent_map(gfp_t mask)
120 {
121 	struct extent_map *em;
122 	em = kmem_cache_alloc(extent_map_cache, mask);
123 	if (!em || IS_ERR(em))
124 		return em;
125 	em->in_tree = 0;
126 	atomic_set(&em->refs, 1);
127 	return em;
128 }
129 EXPORT_SYMBOL(alloc_extent_map);
130 
131 void free_extent_map(struct extent_map *em)
132 {
133 	if (!em)
134 		return;
135 	if (atomic_dec_and_test(&em->refs)) {
136 		WARN_ON(em->in_tree);
137 		kmem_cache_free(extent_map_cache, em);
138 	}
139 }
140 EXPORT_SYMBOL(free_extent_map);
141 
142 
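/*
 * allocate an extent_state with a reference count of one and link it on
 * the global 'states' list so leaks can be reported at module exit.
 */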
143 struct extent_state *alloc_extent_state(gfp_t mask)
144 {
145 	struct extent_state *state;
146 	unsigned long flags;
147 
148 	state = kmem_cache_alloc(extent_state_cache, mask);
149 	if (!state || IS_ERR(state))
150 		return state;
151 	state->state = 0;
152 	state->in_tree = 0;
153 	state->private = 0;
154 
155 	spin_lock_irqsave(&state_lock, flags);
156 	list_add(&state->list, &states);
157 	spin_unlock_irqrestore(&state_lock, flags);
158 
159 	atomic_set(&state->refs, 1);
160 	init_waitqueue_head(&state->wq);
161 	return state;
162 }
163 EXPORT_SYMBOL(alloc_extent_state);
164 
165 void free_extent_state(struct extent_state *state)
166 {
167 	unsigned long flags;
168 	if (!state)
169 		return;
170 	if (atomic_dec_and_test(&state->refs)) {
171 		WARN_ON(state->in_tree);
172 		spin_lock_irqsave(&state_lock, flags);
173 		list_del(&state->list);
174 		spin_unlock_irqrestore(&state_lock, flags);
175 		kmem_cache_free(extent_state_cache, state);
176 	}
177 }
178 EXPORT_SYMBOL(free_extent_state);
179 
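/*
 * insert 'node' into an rb tree ordered by [start, end] ranges, using
 * 'offset' (the new node's end) as the key.  Returns NULL on success,
 * or the existing node whose range already covers 'offset', in which
 * case nothing is inserted.
 */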
180 static struct rb_node *tree_insert(struct rb_root *root, u64 offset,
181 				   struct rb_node *node)
182 {
183 	struct rb_node ** p = &root->rb_node;
184 	struct rb_node * parent = NULL;
185 	struct tree_entry *entry;
186 
187 	while(*p) {
188 		parent = *p;
189 		entry = rb_entry(parent, struct tree_entry, rb_node);
190 
191 		if (offset < entry->start)
192 			p = &(*p)->rb_left;
193 		else if (offset > entry->end)
194 			p = &(*p)->rb_right;
195 		else
196 			return parent;
197 	}
198 
199 	entry = rb_entry(node, struct tree_entry, rb_node);
200 	entry->in_tree = 1;
201 	rb_link_node(node, parent, p);
202 	rb_insert_color(node, root);
203 	return NULL;
204 }
205 
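/*
 * search for the node whose range contains 'offset'.  On a miss, NULL
 * is returned and *prev_ret points at the first node that ends on or
 * after 'offset', while *next_ret points at the closest node that
 * starts on or before it.
 */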
206 static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
207 				     struct rb_node **prev_ret,
208 				     struct rb_node **next_ret)
209 {
210 	struct rb_node * n = root->rb_node;
211 	struct rb_node *prev = NULL;
212 	struct rb_node *orig_prev = NULL;
213 	struct tree_entry *entry;
214 	struct tree_entry *prev_entry = NULL;
215 
216 	while(n) {
217 		entry = rb_entry(n, struct tree_entry, rb_node);
218 		prev = n;
219 		prev_entry = entry;
220 
221 		if (offset < entry->start)
222 			n = n->rb_left;
223 		else if (offset > entry->end)
224 			n = n->rb_right;
225 		else
226 			return n;
227 	}
228 
229 	if (prev_ret) {
230 		orig_prev = prev;
231 		while(prev && offset > prev_entry->end) {
232 			prev = rb_next(prev);
233 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
234 		}
235 		*prev_ret = prev;
236 		prev = orig_prev;
237 	}
238 
239 	if (next_ret) {
240 		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
241 		while(prev && offset < prev_entry->start) {
242 			prev = rb_prev(prev);
243 			prev_entry = rb_entry(prev, struct tree_entry, rb_node);
244 		}
245 		*next_ret = prev;
246 	}
247 	return NULL;
248 }
249 
250 static inline struct rb_node *tree_search(struct rb_root *root, u64 offset)
251 {
252 	struct rb_node *prev;
253 	struct rb_node *ret;
254 	ret = __tree_search(root, offset, &prev, NULL);
255 	if (!ret)
256 		return prev;
257 	return ret;
258 }
259 
260 static int tree_delete(struct rb_root *root, u64 offset)
261 {
262 	struct rb_node *node;
263 	struct tree_entry *entry;
264 
265 	node = __tree_search(root, offset, NULL, NULL);
266 	if (!node)
267 		return -ENOENT;
268 	entry = rb_entry(node, struct tree_entry, rb_node);
269 	entry->in_tree = 0;
270 	rb_erase(node, root);
271 	return 0;
272 }
273 
274 /*
275  * add_extent_mapping tries a simple backward merge with existing
276  * mappings.  The extent_map struct passed in will be inserted into
277  * the tree directly (no copies made, just a reference taken).
278  */
279 int add_extent_mapping(struct extent_map_tree *tree,
280 		       struct extent_map *em)
281 {
282 	int ret = 0;
283 	struct extent_map *prev = NULL;
284 	struct rb_node *rb;
285 
286 	write_lock_irq(&tree->lock);
287 	rb = tree_insert(&tree->map, em->end, &em->rb_node);
288 	if (rb) {
289 		prev = rb_entry(rb, struct extent_map, rb_node);
290 		ret = -EEXIST;
291 		goto out;
292 	}
293 	atomic_inc(&em->refs);
294 	if (em->start != 0) {
295 		rb = rb_prev(&em->rb_node);
296 		if (rb)
297 			prev = rb_entry(rb, struct extent_map, rb_node);
298 		if (prev && prev->end + 1 == em->start &&
299 		    ((em->block_start == EXTENT_MAP_HOLE &&
300 		      prev->block_start == EXTENT_MAP_HOLE) ||
301 		     (em->block_start == EXTENT_MAP_INLINE &&
302 		      prev->block_start == EXTENT_MAP_INLINE) ||
303 		     (em->block_start == EXTENT_MAP_DELALLOC &&
304 		      prev->block_start == EXTENT_MAP_DELALLOC) ||
305 		     (em->block_start < EXTENT_MAP_DELALLOC - 1 &&
306 		      em->block_start == prev->block_end + 1))) {
307 			em->start = prev->start;
308 			em->block_start = prev->block_start;
309 			rb_erase(&prev->rb_node, &tree->map);
310 			prev->in_tree = 0;
311 			free_extent_map(prev);
312 		}
313 	}
314 out:
315 	write_unlock_irq(&tree->lock);
316 	return ret;
317 }
318 EXPORT_SYMBOL(add_extent_mapping);
319 
320 /*
321  * lookup_extent_mapping returns the first extent_map struct in the
322  * tree that intersects the [start, end] (inclusive) range.  There may
323  * be additional objects in the tree that intersect, so check the object
324  * returned carefully to make sure you don't need additional lookups.
325  */
326 struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
327 					 u64 start, u64 end)
328 {
329 	struct extent_map *em;
330 	struct rb_node *rb_node;
331 	struct rb_node *prev = NULL;
332 	struct rb_node *next = NULL;
333 
334 	read_lock_irq(&tree->lock);
335 	rb_node = __tree_search(&tree->map, start, &prev, &next);
336 	if (!rb_node && prev) {
337 		em = rb_entry(prev, struct extent_map, rb_node);
338 		if (em->start <= end && em->end >= start)
339 			goto found;
340 	}
341 	if (!rb_node && next) {
342 		em = rb_entry(next, struct extent_map, rb_node);
343 		if (em->start <= end && em->end >= start)
344 			goto found;
345 	}
346 	if (!rb_node) {
347 		em = NULL;
348 		goto out;
349 	}
350 	if (IS_ERR(rb_node)) {
351 		em = ERR_PTR(PTR_ERR(rb_node));
352 		goto out;
353 	}
354 	em = rb_entry(rb_node, struct extent_map, rb_node);
355 	if (em->end < start || em->start > end) {
356 		em = NULL;
357 		goto out;
358 	}
359 found:
360 	atomic_inc(&em->refs);
361 out:
362 	read_unlock_irq(&tree->lock);
363 	return em;
364 }
365 EXPORT_SYMBOL(lookup_extent_mapping);
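/*
 * illustrative sketch of how the two calls above pair up (not from the
 * original file; error handling and the other extent_map fields are
 * omitted):
 *
 *	struct extent_map *em = alloc_extent_map(GFP_NOFS);
 *
 *	em->start = start;
 *	em->end = end;
 *	em->block_start = EXTENT_MAP_HOLE;
 *	if (add_extent_mapping(tree, em) == -EEXIST)
 *		;	(something already covers part of the range)
 *	free_extent_map(em);	(the tree holds its own reference)
 *
 *	em = lookup_extent_mapping(tree, start, end);
 *	if (em)
 *		free_extent_map(em);	(drop the lookup reference)
 */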
366 
367 /*
368  * removes an extent_map struct from the tree.  No reference counts are
369  * dropped, and no checks are done to see if the range is in use
370  */
371 int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
372 {
373 	int ret;
374 
375 	write_lock_irq(&tree->lock);
376 	ret = tree_delete(&tree->map, em->end);
377 	write_unlock_irq(&tree->lock);
378 	return ret;
379 }
380 EXPORT_SYMBOL(remove_extent_mapping);
381 
382 /*
383  * utility function to look for merge candidates inside a given range.
384  * Any extents with matching state are merged together into a single
385  * extent in the tree.  Extents with EXTENT_IOBITS set in their state field
386  * are not merged because the end_io handlers need to be able to do
387  * operations on them without sleeping (or doing allocations/splits).
388  *
389  * This should be called with the tree lock held.
390  */
391 static int merge_state(struct extent_map_tree *tree,
392 		       struct extent_state *state)
393 {
394 	struct extent_state *other;
395 	struct rb_node *other_node;
396 
397 	if (state->state & EXTENT_IOBITS)
398 		return 0;
399 
400 	other_node = rb_prev(&state->rb_node);
401 	if (other_node) {
402 		other = rb_entry(other_node, struct extent_state, rb_node);
403 		if (other->end == state->start - 1 &&
404 		    other->state == state->state) {
405 			state->start = other->start;
406 			other->in_tree = 0;
407 			rb_erase(&other->rb_node, &tree->state);
408 			free_extent_state(other);
409 		}
410 	}
411 	other_node = rb_next(&state->rb_node);
412 	if (other_node) {
413 		other = rb_entry(other_node, struct extent_state, rb_node);
414 		if (other->start == state->end + 1 &&
415 		    other->state == state->state) {
416 			other->start = state->start;
417 			state->in_tree = 0;
418 			rb_erase(&state->rb_node, &tree->state);
419 			free_extent_state(state);
420 		}
421 	}
422 	return 0;
423 }
424 
425 /*
426  * insert an extent_state struct into the tree.  'bits' are set on the
427  * struct before it is inserted.
428  *
429  * This may return -EEXIST if the extent is already there, in which case the
430  * state struct is freed.
431  *
432  * The tree lock is not taken internally.  This is a utility function and
433  * probably isn't what you want to call (see set/clear_extent_bit).
434  */
435 static int insert_state(struct extent_map_tree *tree,
436 			struct extent_state *state, u64 start, u64 end,
437 			int bits)
438 {
439 	struct rb_node *node;
440 
441 	if (end < start) {
442 		printk("end < start %Lu %Lu\n", end, start);
443 		WARN_ON(1);
444 	}
445 	if (bits & EXTENT_DIRTY)
446 		tree->dirty_bytes += end - start + 1;
447 	state->state |= bits;
448 	state->start = start;
449 	state->end = end;
450 	node = tree_insert(&tree->state, end, &state->rb_node);
451 	if (node) {
452 		struct extent_state *found;
453 		found = rb_entry(node, struct extent_state, rb_node);
454 		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, start, end);
455 		free_extent_state(state);
456 		return -EEXIST;
457 	}
458 	merge_state(tree, state);
459 	return 0;
460 }
461 
462 /*
463  * split a given extent state struct in two, inserting the preallocated
464  * struct 'prealloc' as the newly created second half.  'split' indicates an
465  * offset inside 'orig' where it should be split.
466  *
467  * Before calling,
468  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
469  * are two extent state structs in the tree:
470  * prealloc: [orig->start, split - 1]
471  * orig: [ split, orig->end ]
472  *
473  * The tree locks are not taken by this function. They need to be held
474  * by the caller.
475  */
476 static int split_state(struct extent_map_tree *tree, struct extent_state *orig,
477 		       struct extent_state *prealloc, u64 split)
478 {
479 	struct rb_node *node;
480 	prealloc->start = orig->start;
481 	prealloc->end = split - 1;
482 	prealloc->state = orig->state;
483 	orig->start = split;
484 
485 	node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node);
486 	if (node) {
487 		struct extent_state *found;
488 		found = rb_entry(node, struct extent_state, rb_node);
489 		printk("found node %Lu %Lu on insert of %Lu %Lu\n",
		       found->start, found->end, prealloc->start,
		       prealloc->end);
490 		free_extent_state(prealloc);
491 		return -EEXIST;
492 	}
493 	return 0;
494 }
495 
496 /*
497  * utility function to clear some bits in an extent state struct.
498  * it will optionally wake up any one waiting on this state (wake == 1), or
499  * forcibly remove the state from the tree (delete == 1).
500  *
501  * If no bits are set on the state struct after clearing things, the
502  * struct is freed and removed from the tree
503  */
504 static int clear_state_bit(struct extent_map_tree *tree,
505 			    struct extent_state *state, int bits, int wake,
506 			    int delete)
507 {
508 	int ret = state->state & bits;
509 
510 	if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
511 		u64 range = state->end - state->start + 1;
512 		WARN_ON(range > tree->dirty_bytes);
513 		tree->dirty_bytes -= range;
514 	}
515 	state->state &= ~bits;
516 	if (wake)
517 		wake_up(&state->wq);
518 	if (delete || state->state == 0) {
519 		if (state->in_tree) {
520 			rb_erase(&state->rb_node, &tree->state);
521 			state->in_tree = 0;
522 			free_extent_state(state);
523 		} else {
524 			WARN_ON(1);
525 		}
526 	} else {
527 		merge_state(tree, state);
528 	}
529 	return ret;
530 }
531 
532 /*
533  * clear some bits on a range in the tree.  This may require splitting
534  * or inserting elements in the tree, so the gfp mask is used to
535  * indicate which allocations or sleeping are allowed.
536  *
537  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
538  * the given range from the tree regardless of state (ie for truncate).
539  *
540  * the range [start, end] is inclusive.
541  *
542  * This takes the tree lock, and returns < 0 on error, > 0 if any of the
543  * bits were already set, or zero if none of the bits were already set.
544  */
545 int clear_extent_bit(struct extent_map_tree *tree, u64 start, u64 end,
546 		     int bits, int wake, int delete, gfp_t mask)
547 {
548 	struct extent_state *state;
549 	struct extent_state *prealloc = NULL;
550 	struct rb_node *node;
551 	unsigned long flags;
552 	int err;
553 	int set = 0;
554 
555 again:
556 	if (!prealloc && (mask & __GFP_WAIT)) {
557 		prealloc = alloc_extent_state(mask);
558 		if (!prealloc)
559 			return -ENOMEM;
560 	}
561 
562 	write_lock_irqsave(&tree->lock, flags);
563 	/*
564 	 * this search will find the extents that end after
565 	 * our range starts
566 	 */
567 	node = tree_search(&tree->state, start);
568 	if (!node)
569 		goto out;
570 	state = rb_entry(node, struct extent_state, rb_node);
571 	if (state->start > end)
572 		goto out;
573 	WARN_ON(state->end < start);
574 
575 	/*
576 	 *     | ---- desired range ---- |
577 	 *  | state | or
578 	 *  | ------------- state -------------- |
579 	 *
580 	 * We need to split the extent we found, and may flip
581 	 * bits on second half.
582 	 *
583 	 * If the extent we found extends past our range, we
584 	 * just split and search again.  It'll get split again
585 	 * the next time though.
586 	 *
587 	 * If the extent we found is inside our range, we clear
588 	 * the desired bit on it.
589 	 */
590 
591 	if (state->start < start) {
592 		err = split_state(tree, state, prealloc, start);
593 		BUG_ON(err == -EEXIST);
594 		prealloc = NULL;
595 		if (err)
596 			goto out;
597 		if (state->end <= end) {
598 			start = state->end + 1;
599 			set |= clear_state_bit(tree, state, bits,
600 					wake, delete);
601 		} else {
602 			start = state->start;
603 		}
604 		goto search_again;
605 	}
606 	/*
607 	 * | ---- desired range ---- |
608 	 *                        | state |
609 	 * We need to split the extent, and clear the bit
610 	 * on the first half
611 	 */
612 	if (state->start <= end && state->end > end) {
613 		err = split_state(tree, state, prealloc, end + 1);
614 		BUG_ON(err == -EEXIST);
615 
616 		if (wake)
617 			wake_up(&state->wq);
618 		set |= clear_state_bit(tree, prealloc, bits,
619 				       wake, delete);
620 		prealloc = NULL;
621 		goto out;
622 	}
623 
624 	start = state->end + 1;
625 	set |= clear_state_bit(tree, state, bits, wake, delete);
626 	goto search_again;
627 
628 out:
629 	write_unlock_irqrestore(&tree->lock, flags);
630 	if (prealloc)
631 		free_extent_state(prealloc);
632 
633 	return set;
634 
635 search_again:
636 	if (start > end)
637 		goto out;
638 	write_unlock_irqrestore(&tree->lock, flags);
639 	if (mask & __GFP_WAIT)
640 		cond_resched();
641 	goto again;
642 }
643 EXPORT_SYMBOL(clear_extent_bit);
644 
645 static int wait_on_state(struct extent_map_tree *tree,
646 			 struct extent_state *state)
647 {
648 	DEFINE_WAIT(wait);
649 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
650 	read_unlock_irq(&tree->lock);
651 	schedule();
652 	read_lock_irq(&tree->lock);
653 	finish_wait(&state->wq, &wait);
654 	return 0;
655 }
656 
657 /*
658  * waits for one or more bits to clear on a range in the state tree.
659  * The range [start, end] is inclusive.
660  * The tree lock is taken by this function
661  */
662 int wait_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits)
663 {
664 	struct extent_state *state;
665 	struct rb_node *node;
666 
667 	read_lock_irq(&tree->lock);
668 again:
669 	while (1) {
670 		/*
671 		 * this search will find all the extents that end after
672 		 * our range starts
673 		 */
674 		node = tree_search(&tree->state, start);
675 		if (!node)
676 			break;
677 
678 		state = rb_entry(node, struct extent_state, rb_node);
679 
680 		if (state->start > end)
681 			goto out;
682 
683 		if (state->state & bits) {
684 			start = state->start;
685 			atomic_inc(&state->refs);
686 			wait_on_state(tree, state);
687 			free_extent_state(state);
688 			goto again;
689 		}
690 		start = state->end + 1;
691 
692 		if (start > end)
693 			break;
694 
695 		if (need_resched()) {
696 			read_unlock_irq(&tree->lock);
697 			cond_resched();
698 			read_lock_irq(&tree->lock);
699 		}
700 	}
701 out:
702 	read_unlock_irq(&tree->lock);
703 	return 0;
704 }
705 EXPORT_SYMBOL(wait_extent_bit);
706 
707 static void set_state_bits(struct extent_map_tree *tree,
708 			   struct extent_state *state,
709 			   int bits)
710 {
711 	if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
712 		u64 range = state->end - state->start + 1;
713 		tree->dirty_bytes += range;
714 	}
715 	state->state |= bits;
716 }
717 
718 /*
719  * set some bits on a range in the tree.  This may require allocations
720  * or sleeping, so the gfp mask is used to indicate what is allowed.
721  *
722  * If 'exclusive' == 1, this will fail with -EEXIST if some part of the
723  * range already has the desired bits set.  The start of the existing
724  * range is returned in failed_start in this case.
725  *
726  * [start, end] is inclusive
727  * This takes the tree lock.
728  */
729 int set_extent_bit(struct extent_map_tree *tree, u64 start, u64 end, int bits,
730 		   int exclusive, u64 *failed_start, gfp_t mask)
731 {
732 	struct extent_state *state;
733 	struct extent_state *prealloc = NULL;
734 	struct rb_node *node;
735 	unsigned long flags;
736 	int err = 0;
737 	int set;
738 	u64 last_start;
739 	u64 last_end;
740 again:
741 	if (!prealloc && (mask & __GFP_WAIT)) {
742 		prealloc = alloc_extent_state(mask);
743 		if (!prealloc)
744 			return -ENOMEM;
745 	}
746 
747 	write_lock_irqsave(&tree->lock, flags);
748 	/*
749 	 * this search will find all the extents that end after
750 	 * our range starts.
751 	 */
752 	node = tree_search(&tree->state, start);
753 	if (!node) {
754 		err = insert_state(tree, prealloc, start, end, bits);
755 		prealloc = NULL;
756 		BUG_ON(err == -EEXIST);
757 		goto out;
758 	}
759 
760 	state = rb_entry(node, struct extent_state, rb_node);
761 	last_start = state->start;
762 	last_end = state->end;
763 
764 	/*
765 	 * | ---- desired range ---- |
766 	 * | state |
767 	 *
768 	 * Just lock what we found and keep going
769 	 */
770 	if (state->start == start && state->end <= end) {
771 		set = state->state & bits;
772 		if (set && exclusive) {
773 			*failed_start = state->start;
774 			err = -EEXIST;
775 			goto out;
776 		}
777 		set_state_bits(tree, state, bits);
778 		start = state->end + 1;
779 		merge_state(tree, state);
780 		goto search_again;
781 	}
782 
783 	/*
784 	 *     | ---- desired range ---- |
785 	 * | state |
786 	 *   or
787 	 * | ------------- state -------------- |
788 	 *
789 	 * We need to split the extent we found, and may flip bits on
790 	 * second half.
791 	 *
792 	 * If the extent we found extends past our
793 	 * range, we just split and search again.  It'll get split
794 	 * again the next time though.
795 	 *
796 	 * If the extent we found is inside our range, we set the
797 	 * desired bit on it.
798 	 */
799 	if (state->start < start) {
800 		set = state->state & bits;
801 		if (exclusive && set) {
802 			*failed_start = start;
803 			err = -EEXIST;
804 			goto out;
805 		}
806 		err = split_state(tree, state, prealloc, start);
807 		BUG_ON(err == -EEXIST);
808 		prealloc = NULL;
809 		if (err)
810 			goto out;
811 		if (state->end <= end) {
812 			set_state_bits(tree, state, bits);
813 			start = state->end + 1;
814 			merge_state(tree, state);
815 		} else {
816 			start = state->start;
817 		}
818 		goto search_again;
819 	}
820 	/*
821 	 * | ---- desired range ---- |
822 	 *     | state | or               | state |
823 	 *
824 	 * There's a hole, we need to insert something in it and
825 	 * ignore the extent we found.
826 	 */
827 	if (state->start > start) {
828 		u64 this_end;
829 		if (end < last_start)
830 			this_end = end;
831 		else
832 			this_end = last_start - 1;
833 		err = insert_state(tree, prealloc, start, this_end,
834 				   bits);
835 		prealloc = NULL;
836 		BUG_ON(err == -EEXIST);
837 		if (err)
838 			goto out;
839 		start = this_end + 1;
840 		goto search_again;
841 	}
842 	/*
843 	 * | ---- desired range ---- |
844 	 *                        | state |
845 	 * We need to split the extent, and set the bit
846 	 * on the first half
847 	 */
848 	if (state->start <= end && state->end > end) {
849 		set = state->state & bits;
850 		if (exclusive && set) {
851 			*failed_start = start;
852 			err = -EEXIST;
853 			goto out;
854 		}
855 		err = split_state(tree, state, prealloc, end + 1);
856 		BUG_ON(err == -EEXIST);
857 
858 		set_state_bits(tree, prealloc, bits);
859 		merge_state(tree, prealloc);
860 		prealloc = NULL;
861 		goto out;
862 	}
863 
864 	goto search_again;
865 
866 out:
867 	write_unlock_irqrestore(&tree->lock, flags);
868 	if (prealloc)
869 		free_extent_state(prealloc);
870 
871 	return err;
872 
873 search_again:
874 	if (start > end)
875 		goto out;
876 	write_unlock_irqrestore(&tree->lock, flags);
877 	if (mask & __GFP_WAIT)
878 		cond_resched();
879 	goto again;
880 }
881 EXPORT_SYMBOL(set_extent_bit);
882 
883 /* wrappers around set/clear extent bit */
884 int set_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
885 		     gfp_t mask)
886 {
887 	return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL,
888 			      mask);
889 }
890 EXPORT_SYMBOL(set_extent_dirty);
891 
892 int set_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
893 		    int bits, gfp_t mask)
894 {
895 	return set_extent_bit(tree, start, end, bits, 0, NULL,
896 			      mask);
897 }
898 EXPORT_SYMBOL(set_extent_bits);
899 
900 int clear_extent_bits(struct extent_map_tree *tree, u64 start, u64 end,
901 		      int bits, gfp_t mask)
902 {
903 	return clear_extent_bit(tree, start, end, bits, 0, 0, mask);
904 }
905 EXPORT_SYMBOL(clear_extent_bits);
906 
907 int set_extent_delalloc(struct extent_map_tree *tree, u64 start, u64 end,
908 		     gfp_t mask)
909 {
910 	return set_extent_bit(tree, start, end,
911 			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
912 			      mask);
913 }
914 EXPORT_SYMBOL(set_extent_delalloc);
915 
916 int clear_extent_dirty(struct extent_map_tree *tree, u64 start, u64 end,
917 		       gfp_t mask)
918 {
919 	return clear_extent_bit(tree, start, end,
920 				EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask);
921 }
922 EXPORT_SYMBOL(clear_extent_dirty);
923 
924 int set_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
925 		     gfp_t mask)
926 {
927 	return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL,
928 			      mask);
929 }
930 EXPORT_SYMBOL(set_extent_new);
931 
932 int clear_extent_new(struct extent_map_tree *tree, u64 start, u64 end,
933 		       gfp_t mask)
934 {
935 	return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask);
936 }
937 EXPORT_SYMBOL(clear_extent_new);
938 
939 int set_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
940 			gfp_t mask)
941 {
942 	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL,
943 			      mask);
944 }
945 EXPORT_SYMBOL(set_extent_uptodate);
946 
947 int clear_extent_uptodate(struct extent_map_tree *tree, u64 start, u64 end,
948 			  gfp_t mask)
949 {
950 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask);
951 }
952 EXPORT_SYMBOL(clear_extent_uptodate);
953 
954 int set_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
955 			 gfp_t mask)
956 {
957 	return set_extent_bit(tree, start, end, EXTENT_WRITEBACK,
958 			      0, NULL, mask);
959 }
960 EXPORT_SYMBOL(set_extent_writeback);
961 
962 int clear_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end,
963 			   gfp_t mask)
964 {
965 	return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask);
966 }
967 EXPORT_SYMBOL(clear_extent_writeback);
968 
969 int wait_on_extent_writeback(struct extent_map_tree *tree, u64 start, u64 end)
970 {
971 	return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
972 }
973 EXPORT_SYMBOL(wait_on_extent_writeback);
974 
975 /*
976  * locks a range in ascending order, waiting for any locked regions
977  * it hits on the way.  [start,end] are inclusive, and this will sleep.
978  */
979 int lock_extent(struct extent_map_tree *tree, u64 start, u64 end, gfp_t mask)
980 {
981 	int err;
982 	u64 failed_start;
983 	while (1) {
984 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1,
985 				     &failed_start, mask);
986 		if (err == -EEXIST && (mask & __GFP_WAIT)) {
987 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
988 			start = failed_start;
989 		} else {
990 			break;
991 		}
992 		WARN_ON(start > end);
993 	}
994 	return err;
995 }
996 EXPORT_SYMBOL(lock_extent);
997 
998 int unlock_extent(struct extent_map_tree *tree, u64 start, u64 end,
999 		  gfp_t mask)
1000 {
1001 	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask);
1002 }
1003 EXPORT_SYMBOL(unlock_extent);
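/*
 * typical pattern for the two helpers above (illustrative sketch only):
 *
 *	lock_extent(tree, start, end, GFP_NOFS);
 *	... read or write the range, insert mappings, etc ...
 *	unlock_extent(tree, start, end, GFP_NOFS);
 *
 * with a blocking gfp mask, lock_extent sleeps until no part of
 * [start, end] is still EXTENT_LOCKED before returning 0.
 */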
1004 
1005 /*
1006  * helper function to set pages and extents in the tree dirty
1007  */
1008 int set_range_dirty(struct extent_map_tree *tree, u64 start, u64 end)
1009 {
1010 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1011 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1012 	struct page *page;
1013 
1014 	while (index <= end_index) {
1015 		page = find_get_page(tree->mapping, index);
1016 		BUG_ON(!page);
1017 		__set_page_dirty_nobuffers(page);
1018 		page_cache_release(page);
1019 		index++;
1020 	}
1021 	set_extent_dirty(tree, start, end, GFP_NOFS);
1022 	return 0;
1023 }
1024 EXPORT_SYMBOL(set_range_dirty);
1025 
1026 /*
1027  * helper function to set both pages and extents in the tree writeback
1028  */
1029 int set_range_writeback(struct extent_map_tree *tree, u64 start, u64 end)
1030 {
1031 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1032 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1033 	struct page *page;
1034 
1035 	while (index <= end_index) {
1036 		page = find_get_page(tree->mapping, index);
1037 		BUG_ON(!page);
1038 		set_page_writeback(page);
1039 		page_cache_release(page);
1040 		index++;
1041 	}
1042 	set_extent_writeback(tree, start, end, GFP_NOFS);
1043 	return 0;
1044 }
1045 EXPORT_SYMBOL(set_range_writeback);
1046 
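/*
 * find the first extent at or after 'start' that has any of 'bits' set.
 * Returns 0 and fills *start_ret/*end_ret on success, or 1 if nothing
 * was found.
 */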
1047 int find_first_extent_bit(struct extent_map_tree *tree, u64 start,
1048 			  u64 *start_ret, u64 *end_ret, int bits)
1049 {
1050 	struct rb_node *node;
1051 	struct extent_state *state;
1052 	int ret = 1;
1053 
1054 	read_lock_irq(&tree->lock);
1055 	/*
1056 	 * this search will find all the extents that end after
1057 	 * our range starts.
1058 	 */
1059 	node = tree_search(&tree->state, start);
1060 	if (!node || IS_ERR(node)) {
1061 		goto out;
1062 	}
1063 
1064 	while(1) {
1065 		state = rb_entry(node, struct extent_state, rb_node);
1066 		if (state->end >= start && (state->state & bits)) {
1067 			*start_ret = state->start;
1068 			*end_ret = state->end;
1069 			ret = 0;
1070 			break;
1071 		}
1072 		node = rb_next(node);
1073 		if (!node)
1074 			break;
1075 	}
1076 out:
1077 	read_unlock_irq(&tree->lock);
1078 	return ret;
1079 }
1080 EXPORT_SYMBOL(find_first_extent_bit);
1081 
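/*
 * find the run of contiguous EXTENT_DELALLOC extents covering *start
 * (walking backward to the beginning of the run first), lock each of
 * them, and update *start/*end to the locked range.  The walk stops
 * after roughly max_bytes.  Returns the number of extents locked, or 0
 * if nothing delalloc was found.
 */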
1082 u64 find_lock_delalloc_range(struct extent_map_tree *tree,
1083 			     u64 *start, u64 *end, u64 max_bytes)
1084 {
1085 	struct rb_node *node;
1086 	struct extent_state *state;
1087 	u64 cur_start = *start;
1088 	u64 found = 0;
1089 	u64 total_bytes = 0;
1090 
1091 	write_lock_irq(&tree->lock);
1092 	/*
1093 	 * this search will find all the extents that end after
1094 	 * our range starts.
1095 	 */
1096 search_again:
1097 	node = tree_search(&tree->state, cur_start);
1098 	if (!node || IS_ERR(node)) {
1099 		*end = (u64)-1;
1100 		goto out;
1101 	}
1102 
1103 	while(1) {
1104 		state = rb_entry(node, struct extent_state, rb_node);
1105 		if (found && state->start != cur_start) {
1106 			goto out;
1107 		}
1108 		if (!(state->state & EXTENT_DELALLOC)) {
1109 			if (!found)
1110 				*end = state->end;
1111 			goto out;
1112 		}
1113 		if (!found) {
1114 			struct extent_state *prev_state;
1115 			struct rb_node *prev_node = node;
1116 			while(1) {
1117 				prev_node = rb_prev(prev_node);
1118 				if (!prev_node)
1119 					break;
1120 				prev_state = rb_entry(prev_node,
1121 						      struct extent_state,
1122 						      rb_node);
1123 				if (!(prev_state->state & EXTENT_DELALLOC))
1124 					break;
1125 				state = prev_state;
1126 				node = prev_node;
1127 			}
1128 		}
1129 		if (state->state & EXTENT_LOCKED) {
1130 			DEFINE_WAIT(wait);
1131 			atomic_inc(&state->refs);
1132 			prepare_to_wait(&state->wq, &wait,
1133 					TASK_UNINTERRUPTIBLE);
1134 			write_unlock_irq(&tree->lock);
1135 			schedule();
1136 			write_lock_irq(&tree->lock);
1137 			finish_wait(&state->wq, &wait);
1138 			free_extent_state(state);
1139 			goto search_again;
1140 		}
1141 		state->state |= EXTENT_LOCKED;
1142 		if (!found)
1143 			*start = state->start;
1144 		found++;
1145 		*end = state->end;
1146 		cur_start = state->end + 1;
1147 		node = rb_next(node);
1148 		if (!node)
1149 			break;
1150 		total_bytes += state->end - state->start + 1;
1151 		if (total_bytes >= max_bytes)
1152 			break;
1153 	}
1154 out:
1155 	write_unlock_irq(&tree->lock);
1156 	return found;
1157 }
1158 
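/*
 * count how many bytes inside [*start, search_end] are covered by
 * extents that have 'bits' set, stopping once max_bytes have been
 * counted.  *start is moved to the start of the first matching extent.
 * The (start == 0, EXTENT_DIRTY) case is answered straight from
 * tree->dirty_bytes without walking the tree.
 */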
1159 u64 count_range_bits(struct extent_map_tree *tree,
1160 		     u64 *start, u64 search_end, u64 max_bytes,
1161 		     unsigned long bits)
1162 {
1163 	struct rb_node *node;
1164 	struct extent_state *state;
1165 	u64 cur_start = *start;
1166 	u64 total_bytes = 0;
1167 	int found = 0;
1168 
1169 	if (search_end <= cur_start) {
1170 		printk("search_end %Lu start %Lu\n", search_end, cur_start);
1171 		WARN_ON(1);
1172 		return 0;
1173 	}
1174 
1175 	write_lock_irq(&tree->lock);
1176 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
1177 		total_bytes = tree->dirty_bytes;
1178 		goto out;
1179 	}
1180 	/*
1181 	 * this search will find all the extents that end after
1182 	 * our range starts.
1183 	 */
1184 	node = tree_search(&tree->state, cur_start);
1185 	if (!node || IS_ERR(node)) {
1186 		goto out;
1187 	}
1188 
1189 	while(1) {
1190 		state = rb_entry(node, struct extent_state, rb_node);
1191 		if (state->start > search_end)
1192 			break;
1193 		if (state->end >= cur_start && (state->state & bits)) {
1194 			total_bytes += min(search_end, state->end) + 1 -
1195 				       max(cur_start, state->start);
1196 			if (total_bytes >= max_bytes)
1197 				break;
1198 			if (!found) {
1199 				*start = state->start;
1200 				found = 1;
1201 			}
1202 		}
1203 		node = rb_next(node);
1204 		if (!node)
1205 			break;
1206 	}
1207 out:
1208 	write_unlock_irq(&tree->lock);
1209 	return total_bytes;
1210 }
1211 /*
1212  * helper function to lock both pages and extents in the tree.
1213  * pages must be locked first.
1214  */
1215 int lock_range(struct extent_map_tree *tree, u64 start, u64 end)
1216 {
1217 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1218 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1219 	struct page *page;
1220 	int err;
1221 
1222 	while (index <= end_index) {
1223 		page = grab_cache_page(tree->mapping, index);
1224 		if (!page) {
1225 			err = -ENOMEM;
1226 			goto failed;
1227 		}
1228 		if (IS_ERR(page)) {
1229 			err = PTR_ERR(page);
1230 			goto failed;
1231 		}
1232 		index++;
1233 	}
1234 	lock_extent(tree, start, end, GFP_NOFS);
1235 	return 0;
1236 
1237 failed:
1238 	/*
1239 	 * we failed above in getting the page at 'index', so we undo here
1240 	 * up to but not including the page at 'index'
1241 	 */
1242 	end_index = index;
1243 	index = start >> PAGE_CACHE_SHIFT;
1244 	while (index < end_index) {
1245 		page = find_get_page(tree->mapping, index);
1246 		unlock_page(page);
1247 		page_cache_release(page);
1248 		index++;
1249 	}
1250 	return err;
1251 }
1252 EXPORT_SYMBOL(lock_range);
1253 
1254 /*
1255  * helper function to unlock both pages and extents in the tree.
1256  */
1257 int unlock_range(struct extent_map_tree *tree, u64 start, u64 end)
1258 {
1259 	unsigned long index = start >> PAGE_CACHE_SHIFT;
1260 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1261 	struct page *page;
1262 
1263 	while (index <= end_index) {
1264 		page = find_get_page(tree->mapping, index);
1265 		unlock_page(page);
1266 		page_cache_release(page);
1267 		index++;
1268 	}
1269 	unlock_extent(tree, start, end, GFP_NOFS);
1270 	return 0;
1271 }
1272 EXPORT_SYMBOL(unlock_range);
1273 
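/*
 * set_state_private/get_state_private attach and retrieve a caller
 * private u64 on the extent_state that starts exactly at 'start'.
 * Both return -ENOENT if no such extent exists.
 */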
1274 int set_state_private(struct extent_map_tree *tree, u64 start, u64 private)
1275 {
1276 	struct rb_node *node;
1277 	struct extent_state *state;
1278 	int ret = 0;
1279 
1280 	write_lock_irq(&tree->lock);
1281 	/*
1282 	 * this search will find all the extents that end after
1283 	 * our range starts.
1284 	 */
1285 	node = tree_search(&tree->state, start);
1286 	if (!node || IS_ERR(node)) {
1287 		ret = -ENOENT;
1288 		goto out;
1289 	}
1290 	state = rb_entry(node, struct extent_state, rb_node);
1291 	if (state->start != start) {
1292 		ret = -ENOENT;
1293 		goto out;
1294 	}
1295 	state->private = private;
1296 out:
1297 	write_unlock_irq(&tree->lock);
1298 	return ret;
1299 }
1300 
1301 int get_state_private(struct extent_map_tree *tree, u64 start, u64 *private)
1302 {
1303 	struct rb_node *node;
1304 	struct extent_state *state;
1305 	int ret = 0;
1306 
1307 	read_lock_irq(&tree->lock);
1308 	/*
1309 	 * this search will find all the extents that end after
1310 	 * our range starts.
1311 	 */
1312 	node = tree_search(&tree->state, start);
1313 	if (!node || IS_ERR(node)) {
1314 		ret = -ENOENT;
1315 		goto out;
1316 	}
1317 	state = rb_entry(node, struct extent_state, rb_node);
1318 	if (state->start != start) {
1319 		ret = -ENOENT;
1320 		goto out;
1321 	}
1322 	*private = state->private;
1323 out:
1324 	read_unlock_irq(&tree->lock);
1325 	return ret;
1326 }
1327 
1328 /*
1329  * searches a range in the state tree for a given mask.
1330  * If 'filled' == 1, this returns 1 only if every extent in the range
1331  * has the bits set.  Otherwise, 1 is returned if any bit in the
1332  * range is found set.
1333  */
1334 int test_range_bit(struct extent_map_tree *tree, u64 start, u64 end,
1335 		   int bits, int filled)
1336 {
1337 	struct extent_state *state = NULL;
1338 	struct rb_node *node;
1339 	int bitset = 0;
1340 
1341 	read_lock_irq(&tree->lock);
1342 	node = tree_search(&tree->state, start);
1343 	while (node && start <= end) {
1344 		state = rb_entry(node, struct extent_state, rb_node);
1345 
1346 		if (filled && state->start > start) {
1347 			bitset = 0;
1348 			break;
1349 		}
1350 
1351 		if (state->start > end)
1352 			break;
1353 
1354 		if (state->state & bits) {
1355 			bitset = 1;
1356 			if (!filled)
1357 				break;
1358 		} else if (filled) {
1359 			bitset = 0;
1360 			break;
1361 		}
1362 		start = state->end + 1;
1363 		if (start > end)
1364 			break;
1365 		node = rb_next(node);
1366 		if (!node) {
1367 			if (filled)
1368 				bitset = 0;
1369 			break;
1370 		}
1371 	}
1372 	read_unlock_irq(&tree->lock);
1373 	return bitset;
1374 }
1375 EXPORT_SYMBOL(test_range_bit);
1376 
1377 /*
1378  * helper function to set a given page up to date if all the
1379  * extents in the tree for that page are up to date
1380  */
1381 static int check_page_uptodate(struct extent_map_tree *tree,
1382 			       struct page *page)
1383 {
1384 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1385 	u64 end = start + PAGE_CACHE_SIZE - 1;
1386 	if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1))
1387 		SetPageUptodate(page);
1388 	return 0;
1389 }
1390 
1391 /*
1392  * helper function to unlock a page if all the extents in the tree
1393  * for that page are unlocked
1394  */
1395 static int check_page_locked(struct extent_map_tree *tree,
1396 			     struct page *page)
1397 {
1398 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1399 	u64 end = start + PAGE_CACHE_SIZE - 1;
1400 	if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0))
1401 		unlock_page(page);
1402 	return 0;
1403 }
1404 
1405 /*
1406  * helper function to end page writeback if all the extents
1407  * in the tree for that page are done with writeback
1408  */
1409 static int check_page_writeback(struct extent_map_tree *tree,
1410 			     struct page *page)
1411 {
1412 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1413 	u64 end = start + PAGE_CACHE_SIZE - 1;
1414 	if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0))
1415 		end_page_writeback(page);
1416 	return 0;
1417 }
1418 
1419 /* lots and lots of room for performance fixes in the end_bio funcs */
1420 
1421 /*
1422  * after a writepage IO is done, we need to:
1423  * clear the uptodate bits on error
1424  * clear the writeback bits in the extent tree for this IO
1425  * end_page_writeback if the page has no more pending IO
1426  *
1427  * Scheduling is not allowed, so the extent state tree is expected
1428  * to have one and only one object corresponding to this IO.
1429  */
1430 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1431 static void end_bio_extent_writepage(struct bio *bio, int err)
1432 #else
1433 static int end_bio_extent_writepage(struct bio *bio,
1434 				   unsigned int bytes_done, int err)
1435 #endif
1436 {
1437 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1438 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1439 	struct extent_map_tree *tree = bio->bi_private;
1440 	u64 start;
1441 	u64 end;
1442 	int whole_page;
1443 
1444 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1445 	if (bio->bi_size)
1446 		return 1;
1447 #endif
1448 
1449 	do {
1450 		struct page *page = bvec->bv_page;
1451 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1452 			 bvec->bv_offset;
1453 		end = start + bvec->bv_len - 1;
1454 
1455 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1456 			whole_page = 1;
1457 		else
1458 			whole_page = 0;
1459 
1460 		if (--bvec >= bio->bi_io_vec)
1461 			prefetchw(&bvec->bv_page->flags);
1462 
1463 		if (!uptodate) {
1464 			clear_extent_uptodate(tree, start, end, GFP_ATOMIC);
1465 			ClearPageUptodate(page);
1466 			SetPageError(page);
1467 		}
1468 		clear_extent_writeback(tree, start, end, GFP_ATOMIC);
1469 
1470 		if (whole_page)
1471 			end_page_writeback(page);
1472 		else
1473 			check_page_writeback(tree, page);
1474 		if (tree->ops && tree->ops->writepage_end_io_hook)
1475 			tree->ops->writepage_end_io_hook(page, start, end);
1476 	} while (bvec >= bio->bi_io_vec);
1477 
1478 	bio_put(bio);
1479 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1480 	return 0;
1481 #endif
1482 }
1483 
1484 /*
1485  * after a readpage IO is done, we need to:
1486  * clear the uptodate bits on error
1487  * set the uptodate bits if things worked
1488  * set the page up to date if all extents in the tree are uptodate
1489  * clear the lock bit in the extent tree
1490  * unlock the page if there are no other extents locked for it
1491  *
1492  * Scheduling is not allowed, so the extent state tree is expected
1493  * to have one and only one object corresponding to this IO.
1494  */
1495 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1496 static void end_bio_extent_readpage(struct bio *bio, int err)
1497 #else
1498 static int end_bio_extent_readpage(struct bio *bio,
1499 				   unsigned int bytes_done, int err)
1500 #endif
1501 {
1502 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1503 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1504 	struct extent_map_tree *tree = bio->bi_private;
1505 	u64 start;
1506 	u64 end;
1507 	int whole_page;
1508 	int ret;
1509 
1510 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1511 	if (bio->bi_size)
1512 		return 1;
1513 #endif
1514 
1515 	do {
1516 		struct page *page = bvec->bv_page;
1517 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1518 			bvec->bv_offset;
1519 		end = start + bvec->bv_len - 1;
1520 
1521 		if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE)
1522 			whole_page = 1;
1523 		else
1524 			whole_page = 0;
1525 
1526 		if (--bvec >= bio->bi_io_vec)
1527 			prefetchw(&bvec->bv_page->flags);
1528 
1529 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1530 			ret = tree->ops->readpage_end_io_hook(page, start, end);
1531 			if (ret)
1532 				uptodate = 0;
1533 		}
1534 		if (uptodate) {
1535 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1536 			if (whole_page)
1537 				SetPageUptodate(page);
1538 			else
1539 				check_page_uptodate(tree, page);
1540 		} else {
1541 			ClearPageUptodate(page);
1542 			SetPageError(page);
1543 		}
1544 
1545 		unlock_extent(tree, start, end, GFP_ATOMIC);
1546 
1547 		if (whole_page)
1548 			unlock_page(page);
1549 		else
1550 			check_page_locked(tree, page);
1551 	} while (bvec >= bio->bi_io_vec);
1552 
1553 	bio_put(bio);
1554 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1555 	return 0;
1556 #endif
1557 }
1558 
1559 /*
1560  * IO done from prepare_write is pretty simple, we just unlock
1561  * the structs in the extent tree when done, and set the uptodate bits
1562  * as appropriate.
1563  */
1564 #if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,23)
1565 static void end_bio_extent_preparewrite(struct bio *bio, int err)
1566 #else
1567 static int end_bio_extent_preparewrite(struct bio *bio,
1568 				       unsigned int bytes_done, int err)
1569 #endif
1570 {
1571 	const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1572 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1573 	struct extent_map_tree *tree = bio->bi_private;
1574 	u64 start;
1575 	u64 end;
1576 
1577 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1578 	if (bio->bi_size)
1579 		return 1;
1580 #endif
1581 
1582 	do {
1583 		struct page *page = bvec->bv_page;
1584 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1585 			bvec->bv_offset;
1586 		end = start + bvec->bv_len - 1;
1587 
1588 		if (--bvec >= bio->bi_io_vec)
1589 			prefetchw(&bvec->bv_page->flags);
1590 
1591 		if (uptodate) {
1592 			set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1593 		} else {
1594 			ClearPageUptodate(page);
1595 			SetPageError(page);
1596 		}
1597 
1598 		unlock_extent(tree, start, end, GFP_ATOMIC);
1599 
1600 	} while (bvec >= bio->bi_io_vec);
1601 
1602 	bio_put(bio);
1603 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
1604 	return 0;
1605 #endif
1606 }
1607 
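/*
 * allocate a bio aimed at 'first_sector' on 'bdev'.  If the allocation
 * fails and the task is flagged PF_MEMALLOC, retry with progressively
 * smaller vector counts before giving up.
 */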
1608 static struct bio *
1609 extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1610 		 gfp_t gfp_flags)
1611 {
1612 	struct bio *bio;
1613 
1614 	bio = bio_alloc(gfp_flags, nr_vecs);
1615 
1616 	if (bio == NULL && (current->flags & PF_MEMALLOC)) {
1617 		while (!bio && (nr_vecs /= 2))
1618 			bio = bio_alloc(gfp_flags, nr_vecs);
1619 	}
1620 
1621 	if (bio) {
1622 		bio->bi_bdev = bdev;
1623 		bio->bi_sector = first_sector;
1624 	}
1625 	return bio;
1626 }
1627 
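/*
 * hand one bio to the block layer, warning if its starting sector lies
 * beyond the end of the device.  Returns -EOPNOTSUPP if the device
 * rejected the request, 0 otherwise.
 */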
1628 static int submit_one_bio(int rw, struct bio *bio)
1629 {
1630 	u64 maxsector;
1631 	int ret = 0;
1632 
1633 	bio_get(bio);
1634 
1635 	maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1636 	if (maxsector < bio->bi_sector) {
1637 		printk("sector too large max %Lu got %llu\n", maxsector,
1638 			(unsigned long long)bio->bi_sector);
1639 		WARN_ON(1);
1640 	}
1641 
1642 	submit_bio(rw, bio);
1643 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
1644 		ret = -EOPNOTSUPP;
1645 	bio_put(bio);
1646 	return ret;
1647 }
1648 
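/*
 * queue one page (or part of one) for IO.  When the caller passes a bio
 * via *bio_ret, the page is appended to it if it is contiguous on disk
 * and still fits; otherwise that bio is submitted and a new one is
 * started and handed back through *bio_ret for further merging.  If no
 * bio_ret is supplied, the new bio is submitted immediately.
 */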
1649 static int submit_extent_page(int rw, struct extent_map_tree *tree,
1650 			      struct page *page, sector_t sector,
1651 			      size_t size, unsigned long offset,
1652 			      struct block_device *bdev,
1653 			      struct bio **bio_ret,
1654 			      unsigned long max_pages,
1655 			      bio_end_io_t end_io_func)
1656 {
1657 	int ret = 0;
1658 	struct bio *bio;
1659 	int nr;
1660 
1661 	if (bio_ret && *bio_ret) {
1662 		bio = *bio_ret;
1663 		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
1664 		    bio_add_page(bio, page, size, offset) < size) {
1665 			ret = submit_one_bio(rw, bio);
1666 			bio = NULL;
1667 		} else {
1668 			return 0;
1669 		}
1670 	}
1671 	nr = min_t(int, max_pages, bio_get_nr_vecs(bdev));
1672 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1673 	if (!bio) {
1674 		printk("failed to allocate bio nr %d\n", nr);
		return -ENOMEM;
1675 	}
1676 	bio_add_page(bio, page, size, offset);
1677 	bio->bi_end_io = end_io_func;
1678 	bio->bi_private = tree;
1679 	if (bio_ret) {
1680 		*bio_ret = bio;
1681 	} else {
1682 		ret = submit_one_bio(rw, bio);
1683 	}
1684 
1685 	return ret;
1686 }
1687 
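/*
 * mark a page as belonging to this extent IO code: set PagePrivate,
 * stash the EXTENT_PAGE_PRIVATE magic in page->private and take an
 * extra page reference so the page is not released under us.
 */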
1688 void set_page_extent_mapped(struct page *page)
1689 {
1690 	if (!PagePrivate(page)) {
1691 		SetPagePrivate(page);
1692 		WARN_ON(!page->mapping->a_ops->invalidatepage);
1693 		set_page_private(page, EXTENT_PAGE_PRIVATE);
1694 		page_cache_get(page);
1695 	}
1696 }
1697 
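/*
 * tag 'page' as the first page of an extent_buffer; the buffer length
 * appears to be packed into page->private alongside the
 * EXTENT_PAGE_PRIVATE_FIRST_PAGE flag.
 */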
1698 void set_page_extent_head(struct page *page, unsigned long len)
1699 {
1700 	set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1701 }
1702 
1703 /*
1704  * basic readpage implementation.  Locked extent state structs are inserted
1705  * into the tree that are removed when the IO is done (by the end_io
1706  * handlers)
1707  */
1708 static int __extent_read_full_page(struct extent_map_tree *tree,
1709 				   struct page *page,
1710 				   get_extent_t *get_extent,
1711 				   struct bio **bio)
1712 {
1713 	struct inode *inode = page->mapping->host;
1714 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1715 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1716 	u64 end;
1717 	u64 cur = start;
1718 	u64 extent_offset;
1719 	u64 last_byte = i_size_read(inode);
1720 	u64 block_start;
1721 	u64 cur_end;
1722 	sector_t sector;
1723 	struct extent_map *em;
1724 	struct block_device *bdev;
1725 	int ret;
1726 	int nr = 0;
1727 	size_t page_offset = 0;
1728 	size_t iosize;
1729 	size_t blocksize = inode->i_sb->s_blocksize;
1730 
1731 	set_page_extent_mapped(page);
1732 
1733 	end = page_end;
1734 	lock_extent(tree, start, end, GFP_NOFS);
1735 
1736 	while (cur <= end) {
1737 		if (cur >= last_byte) {
1738 			char *userpage;
1739 			iosize = PAGE_CACHE_SIZE - page_offset;
1740 			userpage = kmap_atomic(page, KM_USER0);
1741 			memset(userpage + page_offset, 0, iosize);
1742 			flush_dcache_page(page);
1743 			kunmap_atomic(userpage, KM_USER0);
1744 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1745 					    GFP_NOFS);
1746 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1747 			break;
1748 		}
1749 		em = get_extent(inode, page, page_offset, cur, end, 0);
1750 		if (IS_ERR(em) || !em) {
1751 			SetPageError(page);
1752 			unlock_extent(tree, cur, end, GFP_NOFS);
1753 			break;
1754 		}
1755 
1756 		extent_offset = cur - em->start;
1757 		BUG_ON(em->end < cur);
1758 		BUG_ON(end < cur);
1759 
1760 		iosize = min(em->end - cur, end - cur) + 1;
1761 		cur_end = min(em->end, end);
1762 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1763 		sector = (em->block_start + extent_offset) >> 9;
1764 		bdev = em->bdev;
1765 		block_start = em->block_start;
1766 		free_extent_map(em);
1767 		em = NULL;
1768 
1769 		/* we've found a hole, just zero and go on */
1770 		if (block_start == EXTENT_MAP_HOLE) {
1771 			char *userpage;
1772 			userpage = kmap_atomic(page, KM_USER0);
1773 			memset(userpage + page_offset, 0, iosize);
1774 			flush_dcache_page(page);
1775 			kunmap_atomic(userpage, KM_USER0);
1776 
1777 			set_extent_uptodate(tree, cur, cur + iosize - 1,
1778 					    GFP_NOFS);
1779 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1780 			cur = cur + iosize;
1781 			page_offset += iosize;
1782 			continue;
1783 		}
1784 		/* the get_extent function already copied into the page */
1785 		if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) {
1786 			unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
1787 			cur = cur + iosize;
1788 			page_offset += iosize;
1789 			continue;
1790 		}
1791 
1792 		ret = 0;
1793 		if (tree->ops && tree->ops->readpage_io_hook) {
1794 			ret = tree->ops->readpage_io_hook(page, cur,
1795 							  cur + iosize - 1);
1796 		}
1797 		if (!ret) {
1798 			unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1799 			nr -= page->index;
1800 			ret = submit_extent_page(READ, tree, page,
1801 					 sector, iosize, page_offset,
1802 					 bdev, bio, nr,
1803 					 end_bio_extent_readpage);
1804 		}
1805 		if (ret)
1806 			SetPageError(page);
1807 		cur = cur + iosize;
1808 		page_offset += iosize;
1809 		nr++;
1810 	}
1811 	if (!nr) {
1812 		if (!PageError(page))
1813 			SetPageUptodate(page);
1814 		unlock_page(page);
1815 	}
1816 	return 0;
1817 }
1818 
1819 int extent_read_full_page(struct extent_map_tree *tree, struct page *page,
1820 			    get_extent_t *get_extent)
1821 {
1822 	struct bio *bio = NULL;
1823 	int ret;
1824 
1825 	ret = __extent_read_full_page(tree, page, get_extent, &bio);
1826 	if (bio)
1827 		submit_one_bio(READ, bio);
1828 	return ret;
1829 }
1830 EXPORT_SYMBOL(extent_read_full_page);
1831 
1832 /*
1833  * the writepage semantics are similar to regular writepage.  extent
1834  * records are inserted to lock ranges in the tree, and as dirty areas
1835  * are found, they are marked writeback.  Then the lock bits are removed
1836  * and the end_io handler clears the writeback ranges
1837  */
1838 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
1839 			      void *data)
1840 {
1841 	struct inode *inode = page->mapping->host;
1842 	struct extent_page_data *epd = data;
1843 	struct extent_map_tree *tree = epd->tree;
1844 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
1845 	u64 delalloc_start;
1846 	u64 page_end = start + PAGE_CACHE_SIZE - 1;
1847 	u64 end;
1848 	u64 cur = start;
1849 	u64 extent_offset;
1850 	u64 last_byte = i_size_read(inode);
1851 	u64 block_start;
1852 	u64 iosize;
1853 	sector_t sector;
1854 	struct extent_map *em;
1855 	struct block_device *bdev;
1856 	int ret;
1857 	int nr = 0;
1858 	size_t page_offset = 0;
1859 	size_t blocksize;
1860 	loff_t i_size = i_size_read(inode);
1861 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
1862 	u64 nr_delalloc;
1863 	u64 delalloc_end;
1864 
1865 	WARN_ON(!PageLocked(page));
1866 	if (page->index > end_index) {
1867 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1868 		unlock_page(page);
1869 		return 0;
1870 	}
1871 
1872 	if (page->index == end_index) {
1873 		char *userpage;
1874 
1875 		size_t offset = i_size & (PAGE_CACHE_SIZE - 1);
1876 
1877 		userpage = kmap_atomic(page, KM_USER0);
1878 		memset(userpage + offset, 0, PAGE_CACHE_SIZE - offset);
1879 		flush_dcache_page(page);
1880 		kunmap_atomic(userpage, KM_USER0);
1881 	}
1882 
1883 	set_page_extent_mapped(page);
1884 
1885 	delalloc_start = start;
1886 	delalloc_end = 0;
1887 	while(delalloc_end < page_end) {
1888 		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
1889 						       &delalloc_end,
1890 						       128 * 1024 * 1024);
1891 		if (nr_delalloc == 0) {
1892 			delalloc_start = delalloc_end + 1;
1893 			continue;
1894 		}
1895 		tree->ops->fill_delalloc(inode, delalloc_start,
1896 					 delalloc_end);
1897 		clear_extent_bit(tree, delalloc_start,
1898 				 delalloc_end,
1899 				 EXTENT_LOCKED | EXTENT_DELALLOC,
1900 				 1, 0, GFP_NOFS);
1901 		delalloc_start = delalloc_end + 1;
1902 	}
1903 	lock_extent(tree, start, page_end, GFP_NOFS);
1904 
1905 	end = page_end;
1906 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
1907 		printk("found delalloc bits after lock_extent\n");
1908 	}
1909 
1910 	if (last_byte <= start) {
1911 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
1912 		goto done;
1913 	}
1914 
1915 	set_extent_uptodate(tree, start, page_end, GFP_NOFS);
1916 	blocksize = inode->i_sb->s_blocksize;
1917 
1918 	while (cur <= end) {
1919 		if (cur >= last_byte) {
1920 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
1921 			break;
1922 		}
1923 		em = epd->get_extent(inode, page, page_offset, cur, end, 1);
1924 		if (IS_ERR(em) || !em) {
1925 			SetPageError(page);
1926 			break;
1927 		}
1928 
1929 		extent_offset = cur - em->start;
1930 		BUG_ON(em->end < cur);
1931 		BUG_ON(end < cur);
1932 		iosize = min(em->end - cur, end - cur) + 1;
1933 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
1934 		sector = (em->block_start + extent_offset) >> 9;
1935 		bdev = em->bdev;
1936 		block_start = em->block_start;
1937 		free_extent_map(em);
1938 		em = NULL;
1939 
1940 		if (block_start == EXTENT_MAP_HOLE ||
1941 		    block_start == EXTENT_MAP_INLINE) {
1942 			clear_extent_dirty(tree, cur,
1943 					   cur + iosize - 1, GFP_NOFS);
1944 			cur = cur + iosize;
1945 			page_offset += iosize;
1946 			continue;
1947 		}
1948 
1949 		/* leave this out until we have a page_mkwrite call */
1950 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
1951 				   EXTENT_DIRTY, 0)) {
1952 			cur = cur + iosize;
1953 			page_offset += iosize;
1954 			continue;
1955 		}
1956 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
1957 		if (tree->ops && tree->ops->writepage_io_hook) {
1958 			ret = tree->ops->writepage_io_hook(page, cur,
1959 						cur + iosize - 1);
1960 		} else {
1961 			ret = 0;
1962 		}
1963 		if (ret)
1964 			SetPageError(page);
1965 		else {
1966 			unsigned long max_nr = end_index + 1;
1967 			set_range_writeback(tree, cur, cur + iosize - 1);
1968 			if (!PageWriteback(page)) {
1969 				printk("warning page %lu not writeback, "
1970 				       "cur %llu end %llu\n", page->index,
1971 				       (unsigned long long)cur,
1972 				       (unsigned long long)end);
1973 			}
1974 
1975 			ret = submit_extent_page(WRITE, tree, page, sector,
1976 						 iosize, page_offset, bdev,
1977 						 &epd->bio, max_nr,
1978 						 end_bio_extent_writepage);
1979 			if (ret)
1980 				SetPageError(page);
1981 		}
1982 		cur = cur + iosize;
1983 		page_offset += iosize;
1984 		nr++;
1985 	}
1986 done:
1987 	if (nr == 0) {
1988 		/* make sure the mapping tag for page dirty gets cleared */
1989 		set_page_writeback(page);
1990 		end_page_writeback(page);
1991 	}
1992 	unlock_extent(tree, start, page_end, GFP_NOFS);
1993 	unlock_page(page);
1994 	return 0;
1995 }
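
/*
 * A minimal sketch of the two ops hooks __extent_writepage() relies on,
 * assuming the ops structure declared in extent_map.h is struct
 * extent_map_ops with the member signatures inferred from the call sites
 * above; the my_* names are hypothetical.
 *
 *	static int my_fill_delalloc(struct inode *inode, u64 start, u64 end)
 *	{
 *		return 0;	// allocate real extents for the delalloc range
 *	}
 *
 *	static int my_writepage_io_hook(struct page *page, u64 start, u64 end)
 *	{
 *		return 0;	// e.g. checksum the range before it hits disk
 *	}
 *
 *	static struct extent_map_ops my_extent_ops = {
 *		.fill_delalloc     = my_fill_delalloc,
 *		.writepage_io_hook = my_writepage_io_hook,
 *	};
 *	// then: tree->ops = &my_extent_ops after extent_map_tree_init()
 */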
1996 
1997 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
1998 
1999 /* Taken directly from 2.6.23 for the 2.6.18 backport */
2000 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
2001                                 void *data);
2002 
2003 /**
2004  * write_cache_pages - walk the list of dirty pages of the given address space
2005  * and write all of them.
2006  * @mapping: address space structure to write
2007  * @wbc: subtract the number of written pages from *@wbc->nr_to_write
2008  * @writepage: function called for each page
2009  * @data: data passed to writepage function
2010  *
2011  * If a page is already under I/O, write_cache_pages() skips it, even
2012  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
2013  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
2014  * and msync() need to guarantee that all the data which was dirty at the time
2015  * the call was made gets new I/O started against it.  If wbc->sync_mode is
2016  * WB_SYNC_ALL then we were called for data integrity and we must wait for
2017  * existing IO to complete.
2018  */
2019 static int write_cache_pages(struct address_space *mapping,
2020 		      struct writeback_control *wbc, writepage_t writepage,
2021 		      void *data)
2022 {
2023 	struct backing_dev_info *bdi = mapping->backing_dev_info;
2024 	int ret = 0;
2025 	int done = 0;
2026 	struct pagevec pvec;
2027 	int nr_pages;
2028 	pgoff_t index;
2029 	pgoff_t end;		/* Inclusive */
2030 	int scanned = 0;
2031 	int range_whole = 0;
2032 
2033 	if (wbc->nonblocking && bdi_write_congested(bdi)) {
2034 		wbc->encountered_congestion = 1;
2035 		return 0;
2036 	}
2037 
2038 	pagevec_init(&pvec, 0);
2039 	if (wbc->range_cyclic) {
2040 		index = mapping->writeback_index; /* Start from prev offset */
2041 		end = -1;
2042 	} else {
2043 		index = wbc->range_start >> PAGE_CACHE_SHIFT;
2044 		end = wbc->range_end >> PAGE_CACHE_SHIFT;
2045 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2046 			range_whole = 1;
2047 		scanned = 1;
2048 	}
2049 retry:
2050 	while (!done && (index <= end) &&
2051 	       (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2052 					      PAGECACHE_TAG_DIRTY,
2053 					      min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) {
2054 		unsigned i;
2055 
2056 		scanned = 1;
2057 		for (i = 0; i < nr_pages; i++) {
2058 			struct page *page = pvec.pages[i];
2059 
2060 			/*
2061 			 * At this point we hold neither mapping->tree_lock nor
2062 			 * lock on the page itself: the page may be truncated or
2063 			 * invalidated (changing page->mapping to NULL), or even
2064 			 * swizzled back from swapper_space to tmpfs file
2065 			 * mapping
2066 			 */
2067 			lock_page(page);
2068 
2069 			if (unlikely(page->mapping != mapping)) {
2070 				unlock_page(page);
2071 				continue;
2072 			}
2073 
2074 			if (!wbc->range_cyclic && page->index > end) {
2075 				done = 1;
2076 				unlock_page(page);
2077 				continue;
2078 			}
2079 
2080 			if (wbc->sync_mode != WB_SYNC_NONE)
2081 				wait_on_page_writeback(page);
2082 
2083 			if (PageWriteback(page) ||
2084 			    !clear_page_dirty_for_io(page)) {
2085 				unlock_page(page);
2086 				continue;
2087 			}
2088 
2089 			ret = (*writepage)(page, wbc, data);
2090 
2091 			if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) {
2092 				unlock_page(page);
2093 				ret = 0;
2094 			}
2095 			if (ret || (--(wbc->nr_to_write) <= 0))
2096 				done = 1;
2097 			if (wbc->nonblocking && bdi_write_congested(bdi)) {
2098 				wbc->encountered_congestion = 1;
2099 				done = 1;
2100 			}
2101 		}
2102 		pagevec_release(&pvec);
2103 		cond_resched();
2104 	}
2105 	if (!scanned && !done) {
2106 		/*
2107 		 * We hit the last page and there is more work to be done: wrap
2108 		 * back to the start of the file
2109 		 */
2110 		scanned = 1;
2111 		index = 0;
2112 		goto retry;
2113 	}
2114 	if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2115 		mapping->writeback_index = index;
2116 	return ret;
2117 }
2118 #endif
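
/*
 * A minimal sketch of the two writeback_control flavours the kernel-doc
 * above distinguishes: the difference comes down to wbc->sync_mode.  The
 * field values are examples only.
 *
 *	struct writeback_control sync_wbc = {
 *		.sync_mode   = WB_SYNC_ALL,	// wait on pages already under IO
 *		.nr_to_write = LONG_MAX,
 *		.range_start = 0,
 *		.range_end   = LLONG_MAX,	// whole file => range_whole
 *	};
 *
 *	struct writeback_control bg_wbc = {
 *		.sync_mode    = WB_SYNC_NONE,	// skip pages already under IO
 *		.nr_to_write  = 1024,
 *		.range_cyclic = 1,		// resume from writeback_index
 *	};
 */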
2119 
2120 int extent_write_full_page(struct extent_map_tree *tree, struct page *page,
2121 			  get_extent_t *get_extent,
2122 			  struct writeback_control *wbc)
2123 {
2124 	int ret;
2125 	struct address_space *mapping = page->mapping;
2126 	struct extent_page_data epd = {
2127 		.bio = NULL,
2128 		.tree = tree,
2129 		.get_extent = get_extent,
2130 	};
2131 	struct writeback_control wbc_writepages = {
2132 		.bdi		= wbc->bdi,
2133 		.sync_mode	= WB_SYNC_NONE,
2134 		.older_than_this = NULL,
2135 		.nr_to_write	= 64,
2136 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
2137 		.range_end	= (loff_t)-1,
2138 	};
2139 
2140 
2141 	ret = __extent_writepage(page, wbc, &epd);
2142 
2143 	write_cache_pages(mapping, &wbc_writepages, __extent_writepage, &epd);
2144 	if (epd.bio) {
2145 		submit_one_bio(WRITE, epd.bio);
2146 	}
2147 	return ret;
2148 }
2149 EXPORT_SYMBOL(extent_write_full_page);
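
/*
 * A minimal usage sketch: a filesystem ->writepage method can delegate to
 * extent_write_full_page() with its own get_extent callback.  my_tree()
 * and my_get_extent are hypothetical stand-ins for the filesystem's own
 * helpers.
 *
 *	static int my_writepage(struct page *page, struct writeback_control *wbc)
 *	{
 *		struct extent_map_tree *tree = my_tree(page->mapping->host);
 *
 *		return extent_write_full_page(tree, page, my_get_extent, wbc);
 *	}
 */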
2150 
2151 
2152 int extent_writepages(struct extent_map_tree *tree,
2153 		      struct address_space *mapping,
2154 		      get_extent_t *get_extent,
2155 		      struct writeback_control *wbc)
2156 {
2157 	int ret = 0;
2158 	struct extent_page_data epd = {
2159 		.bio = NULL,
2160 		.tree = tree,
2161 		.get_extent = get_extent,
2162 	};
2163 
2164 	ret = write_cache_pages(mapping, wbc, __extent_writepage, &epd);
2165 	if (epd.bio) {
2166 		submit_one_bio(WRITE, epd.bio);
2167 	}
2168 	return ret;
2169 }
2170 EXPORT_SYMBOL(extent_writepages);
2171 
2172 int extent_readpages(struct extent_map_tree *tree,
2173 		     struct address_space *mapping,
2174 		     struct list_head *pages, unsigned nr_pages,
2175 		     get_extent_t get_extent)
2176 {
2177 	struct bio *bio = NULL;
2178 	unsigned page_idx;
2179 	struct pagevec pvec;
2180 
2181 	pagevec_init(&pvec, 0);
2182 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
2183 		struct page *page = list_entry(pages->prev, struct page, lru);
2184 
2185 		prefetchw(&page->flags);
2186 		list_del(&page->lru);
2187 		/*
2188 		 * what we want to do here is call add_to_page_cache_lru,
2189 		 * but that isn't exported, so we reproduce it here
2190 		 */
2191 		if (!add_to_page_cache(page, mapping,
2192 					page->index, GFP_KERNEL)) {
2193 
2194 			/* open coding of lru_cache_add, also not exported */
2195 			page_cache_get(page);
2196 			if (!pagevec_add(&pvec, page))
2197 				__pagevec_lru_add(&pvec);
2198 			__extent_read_full_page(tree, page, get_extent, &bio);
2199 		}
2200 		page_cache_release(page);
2201 	}
2202 	if (pagevec_count(&pvec))
2203 		__pagevec_lru_add(&pvec);
2204 	BUG_ON(!list_empty(pages));
2205 	if (bio)
2206 		submit_one_bio(READ, bio);
2207 	return 0;
2208 }
2209 EXPORT_SYMBOL(extent_readpages);
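
/*
 * A minimal usage sketch: extent_readpages() is shaped to sit behind the
 * ->readpages address_space operation of this kernel generation.  my_tree()
 * and my_get_extent are hypothetical.
 *
 *	static int my_readpages(struct file *file, struct address_space *mapping,
 *				struct list_head *pages, unsigned nr_pages)
 *	{
 *		struct extent_map_tree *tree = my_tree(mapping->host);
 *
 *		return extent_readpages(tree, mapping, pages, nr_pages,
 *					my_get_extent);
 *	}
 */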
2210 
2211 /*
2212  * basic invalidatepage code, this waits on any locked or writeback
2213  * ranges corresponding to the page, and then deletes any extent state
2214  * records from the tree
2215  */
2216 int extent_invalidatepage(struct extent_map_tree *tree,
2217 			  struct page *page, unsigned long offset)
2218 {
2219 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
2220 	u64 end = start + PAGE_CACHE_SIZE - 1;
2221 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
2222 
2223 	start += (offset + blocksize - 1) & ~(blocksize - 1);
2224 	if (start > end)
2225 		return 0;
2226 
2227 	lock_extent(tree, start, end, GFP_NOFS);
2228 	wait_on_extent_writeback(tree, start, end);
2229 	clear_extent_bit(tree, start, end,
2230 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC,
2231 			 1, 1, GFP_NOFS);
2232 	return 0;
2233 }
2234 EXPORT_SYMBOL(extent_invalidatepage);
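
/*
 * A minimal usage sketch: a ->invalidatepage handler just forwards the page
 * and the offset of the first invalidated byte.  my_tree() is hypothetical;
 * ->invalidatepage returns void in this kernel generation, so the return
 * value of extent_invalidatepage() is ignored.
 *
 *	static void my_invalidatepage(struct page *page, unsigned long offset)
 *	{
 *		struct extent_map_tree *tree = my_tree(page->mapping->host);
 *
 *		extent_invalidatepage(tree, page, offset);
 *	}
 */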
2235 
2236 /*
2237  * simple commit_write call: the page is marked dirty with set_page_dirty,
2238  * and the inode size is updated if the write extended the file
2239  */
2240 int extent_commit_write(struct extent_map_tree *tree,
2241 			struct inode *inode, struct page *page,
2242 			unsigned from, unsigned to)
2243 {
2244 	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2245 
2246 	set_page_extent_mapped(page);
2247 	set_page_dirty(page);
2248 
2249 	if (pos > inode->i_size) {
2250 		i_size_write(inode, pos);
2251 		mark_inode_dirty(inode);
2252 	}
2253 	return 0;
2254 }
2255 EXPORT_SYMBOL(extent_commit_write);
2256 
2257 int extent_prepare_write(struct extent_map_tree *tree,
2258 			 struct inode *inode, struct page *page,
2259 			 unsigned from, unsigned to, get_extent_t *get_extent)
2260 {
2261 	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2262 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2263 	u64 block_start;
2264 	u64 orig_block_start;
2265 	u64 block_end;
2266 	u64 cur_end;
2267 	struct extent_map *em;
2268 	unsigned blocksize = 1 << inode->i_blkbits;
2269 	size_t page_offset = 0;
2270 	size_t block_off_start;
2271 	size_t block_off_end;
2272 	int err = 0;
2273 	int iocount = 0;
2274 	int ret = 0;
2275 	int isnew;
2276 
2277 	set_page_extent_mapped(page);
2278 
2279 	block_start = (page_start + from) & ~((u64)blocksize - 1);
2280 	block_end = (page_start + to - 1) | (blocksize - 1);
2281 	orig_block_start = block_start;
2282 
2283 	lock_extent(tree, page_start, page_end, GFP_NOFS);
2284 	while(block_start <= block_end) {
2285 		em = get_extent(inode, page, page_offset, block_start,
2286 				block_end, 1);
2287 		if (IS_ERR(em) || !em) {
			err = IS_ERR(em) ? PTR_ERR(em) : -EIO;
2288 			goto err;
2289 		}
2290 		cur_end = min(block_end, em->end);
2291 		block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2292 		block_off_end = block_off_start + blocksize;
2293 		isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2294 
2295 		if (!PageUptodate(page) && isnew &&
2296 		    (block_off_end > to || block_off_start < from)) {
2297 			void *kaddr;
2298 
2299 			kaddr = kmap_atomic(page, KM_USER0);
2300 			if (block_off_end > to)
2301 				memset(kaddr + to, 0, block_off_end - to);
2302 			if (block_off_start < from)
2303 				memset(kaddr + block_off_start, 0,
2304 				       from - block_off_start);
2305 			flush_dcache_page(page);
2306 			kunmap_atomic(kaddr, KM_USER0);
2307 		}
2308 		if ((em->block_start != EXTENT_MAP_HOLE &&
2309 		     em->block_start != EXTENT_MAP_INLINE) &&
2310 		    !isnew && !PageUptodate(page) &&
2311 		    (block_off_end > to || block_off_start < from) &&
2312 		    !test_range_bit(tree, block_start, cur_end,
2313 				    EXTENT_UPTODATE, 1)) {
2314 			u64 sector;
2315 			u64 extent_offset = block_start - em->start;
2316 			size_t iosize;
2317 			sector = (em->block_start + extent_offset) >> 9;
2318 			iosize = (cur_end - block_start + blocksize) &
2319 				~((u64)blocksize - 1);
2320 			/*
2321 			 * we've already got the extent locked, but we
2322 			 * need to split the state such that our end_bio
2323 			 * handler can clear the lock.
2324 			 */
2325 			set_extent_bit(tree, block_start,
2326 				       block_start + iosize - 1,
2327 				       EXTENT_LOCKED, 0, NULL, GFP_NOFS);
2328 			ret = submit_extent_page(READ, tree, page,
2329 					 sector, iosize, page_offset, em->bdev,
2330 					 NULL, 1,
2331 					 end_bio_extent_preparewrite);
2332 			iocount++;
2333 			block_start = block_start + iosize;
2334 		} else {
2335 			set_extent_uptodate(tree, block_start, cur_end,
2336 					    GFP_NOFS);
2337 			unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2338 			block_start = cur_end + 1;
2339 		}
2340 		page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2341 		free_extent_map(em);
2342 	}
2343 	if (iocount) {
2344 		wait_extent_bit(tree, orig_block_start,
2345 				block_end, EXTENT_LOCKED);
2346 	}
2347 	check_page_uptodate(tree, page);
2348 err:
2349 	/* FIXME, zero out newly allocated blocks on error */
2350 	return err;
2351 }
2352 EXPORT_SYMBOL(extent_prepare_write);
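
/*
 * A minimal usage sketch: extent_prepare_write() and extent_commit_write()
 * are shaped to back the ->prepare_write/->commit_write address_space pair
 * of this kernel generation.  my_tree() and my_get_extent are hypothetical.
 *
 *	static int my_prepare_write(struct file *file, struct page *page,
 *				    unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *
 *		return extent_prepare_write(my_tree(inode), inode, page,
 *					    from, to, my_get_extent);
 *	}
 *
 *	static int my_commit_write(struct file *file, struct page *page,
 *				   unsigned from, unsigned to)
 *	{
 *		struct inode *inode = page->mapping->host;
 *
 *		return extent_commit_write(my_tree(inode), inode, page, from, to);
 *	}
 */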
2353 
2354 /*
2355  * a helper for releasepage.  As long as there are no locked extents
2356  * in the range corresponding to the page, both state records and extent
2357  * map records are removed
2358  */
2359 int try_release_extent_mapping(struct extent_map_tree *tree, struct page *page)
2360 {
2361 	struct extent_map *em;
2362 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
2363 	u64 end = start + PAGE_CACHE_SIZE - 1;
2364 	u64 orig_start = start;
2365 	int ret = 1;
2366 
2367 	while (start <= end) {
2368 		em = lookup_extent_mapping(tree, start, end);
2369 		if (!em || IS_ERR(em))
2370 			break;
2371 		if (!test_range_bit(tree, em->start, em->end,
2372 				    EXTENT_LOCKED, 0)) {
2373 			remove_extent_mapping(tree, em);
2374 			/* once for the rb tree */
2375 			free_extent_map(em);
2376 		}
2377 		start = em->end + 1;
2378 		/* once for us */
2379 		free_extent_map(em);
2380 	}
2381 	if (test_range_bit(tree, orig_start, end, EXTENT_LOCKED, 0))
2382 		ret = 0;
2383 	else
2384 		clear_extent_bit(tree, orig_start, end, EXTENT_UPTODATE,
2385 				 1, 1, GFP_NOFS);
2386 	return ret;
2387 }
2388 EXPORT_SYMBOL(try_release_extent_mapping);
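
/*
 * A minimal usage sketch: try_release_extent_mapping() returns 1 only when
 * the page's range held no locked extents and its state could be dropped,
 * which matches the ->releasepage contract.  my_tree() is hypothetical.
 *
 *	static int my_releasepage(struct page *page, gfp_t gfp)
 *	{
 *		struct extent_map_tree *tree = my_tree(page->mapping->host);
 *
 *		return try_release_extent_mapping(tree, page);
 *	}
 */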
2389 
2390 sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
2391 		get_extent_t *get_extent)
2392 {
2393 	struct inode *inode = mapping->host;
2394 	u64 start = (u64)iblock << inode->i_blkbits;
2395 	u64 end = start + (1 << inode->i_blkbits) - 1;
2396 	sector_t sector = 0;
2397 	struct extent_map *em;
2398 
2399 	em = get_extent(inode, NULL, 0, start, end, 0);
2400 	if (!em || IS_ERR(em))
2401 		return 0;
2402 
2403 	if (em->block_start == EXTENT_MAP_INLINE ||
2404 	    em->block_start == EXTENT_MAP_HOLE)
2405 		goto out;
2406 
2407 	sector = (em->block_start + start - em->start) >> inode->i_blkbits;
2408 out:
2409 	free_extent_map(em);
2410 	return sector;
2411 }
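
/*
 * A minimal usage sketch: extent_bmap() slots straight into the ->bmap
 * address_space operation; my_get_extent is hypothetical.
 *
 *	static sector_t my_bmap(struct address_space *mapping, sector_t block)
 *	{
 *		return extent_bmap(mapping, block, my_get_extent);
 *	}
 */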
2412 
2413 static int add_lru(struct extent_map_tree *tree, struct extent_buffer *eb)
2414 {
2415 	if (list_empty(&eb->lru)) {
2416 		extent_buffer_get(eb);
2417 		list_add(&eb->lru, &tree->buffer_lru);
2418 		tree->lru_size++;
2419 		if (tree->lru_size >= BUFFER_LRU_MAX) {
2420 			struct extent_buffer *rm;
2421 			rm = list_entry(tree->buffer_lru.prev,
2422 					struct extent_buffer, lru);
2423 			tree->lru_size--;
2424 			list_del_init(&rm->lru);
2425 			free_extent_buffer(rm);
2426 		}
2427 	} else
2428 		list_move(&eb->lru, &tree->buffer_lru);
2429 	return 0;
2430 }

2431 static struct extent_buffer *find_lru(struct extent_map_tree *tree,
2432 				      u64 start, unsigned long len)
2433 {
2434 	struct list_head *lru = &tree->buffer_lru;
2435 	struct list_head *cur = lru->next;
2436 	struct extent_buffer *eb;
2437 
2438 	if (list_empty(lru))
2439 		return NULL;
2440 
2441 	do {
2442 		eb = list_entry(cur, struct extent_buffer, lru);
2443 		if (eb->start == start && eb->len == len) {
2444 			extent_buffer_get(eb);
2445 			return eb;
2446 		}
2447 		cur = cur->next;
2448 	} while (cur != lru);
2449 	return NULL;
2450 }
2451 
2452 static inline unsigned long num_extent_pages(u64 start, u64 len)
2453 {
2454 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2455 		(start >> PAGE_CACHE_SHIFT);
2456 }
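
/*
 * Worked example: with 4K pages, a buffer at start 6144 with len 4096
 * touches pages 1 and 2, and num_extent_pages() returns
 * ((6144 + 4096 + 4095) >> 12) - (6144 >> 12) = 3 - 1 = 2.
 */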
2457 
2458 static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2459 					      unsigned long i)
2460 {
2461 	struct page *p;
2462 	struct address_space *mapping;
2463 
2464 	if (i == 0)
2465 		return eb->first_page;
2466 	i += eb->start >> PAGE_CACHE_SHIFT;
2467 	mapping = eb->first_page->mapping;
2468 	read_lock_irq(&mapping->tree_lock);
2469 	p = radix_tree_lookup(&mapping->page_tree, i);
2470 	read_unlock_irq(&mapping->tree_lock);
2471 	return p;
2472 }
2473 
2474 static struct extent_buffer *__alloc_extent_buffer(struct extent_map_tree *tree,
2475 						   u64 start,
2476 						   unsigned long len,
2477 						   gfp_t mask)
2478 {
2479 	struct extent_buffer *eb = NULL;
2480 
2481 	spin_lock(&tree->lru_lock);
2482 	eb = find_lru(tree, start, len);
2483 	spin_unlock(&tree->lru_lock);
2484 	if (eb) {
2485 		return eb;
2486 	}
2487 
2488 	eb = kmem_cache_zalloc(extent_buffer_cache, mask);
	if (!eb)
		return NULL;
2489 	INIT_LIST_HEAD(&eb->lru);
2490 	eb->start = start;
2491 	eb->len = len;
2492 	atomic_set(&eb->refs, 1);
2493 
2494 	return eb;
2495 }
2496 
2497 static void __free_extent_buffer(struct extent_buffer *eb)
2498 {
2499 	kmem_cache_free(extent_buffer_cache, eb);
2500 }
2501 
2502 struct extent_buffer *alloc_extent_buffer(struct extent_map_tree *tree,
2503 					  u64 start, unsigned long len,
2504 					  struct page *page0,
2505 					  gfp_t mask)
2506 {
2507 	unsigned long num_pages = num_extent_pages(start, len);
2508 	unsigned long i;
2509 	unsigned long index = start >> PAGE_CACHE_SHIFT;
2510 	struct extent_buffer *eb;
2511 	struct page *p;
2512 	struct address_space *mapping = tree->mapping;
2513 	int uptodate = 1;
2514 
2515 	eb = __alloc_extent_buffer(tree, start, len, mask);
2516 	if (!eb || IS_ERR(eb))
2517 		return NULL;
2518 
2519 	if (eb->flags & EXTENT_BUFFER_FILLED)
2520 		goto lru_add;
2521 
2522 	if (page0) {
2523 		eb->first_page = page0;
2524 		i = 1;
2525 		index++;
2526 		page_cache_get(page0);
2527 		mark_page_accessed(page0);
2528 		set_page_extent_mapped(page0);
2529 		WARN_ON(!PageUptodate(page0));
2530 		set_page_extent_head(page0, len);
2531 	} else {
2532 		i = 0;
2533 	}
2534 	for (; i < num_pages; i++, index++) {
2535 		p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM);
2536 		if (!p) {
2537 			WARN_ON(1);
2538 			goto fail;
2539 		}
2540 		set_page_extent_mapped(p);
2541 		mark_page_accessed(p);
2542 		if (i == 0) {
2543 			eb->first_page = p;
2544 			set_page_extent_head(p, len);
2545 		} else {
2546 			set_page_private(p, EXTENT_PAGE_PRIVATE);
2547 		}
2548 		if (!PageUptodate(p))
2549 			uptodate = 0;
2550 		unlock_page(p);
2551 	}
2552 	if (uptodate)
2553 		eb->flags |= EXTENT_UPTODATE;
2554 	eb->flags |= EXTENT_BUFFER_FILLED;
2555 
2556 lru_add:
2557 	spin_lock(&tree->lru_lock);
2558 	add_lru(tree, eb);
2559 	spin_unlock(&tree->lru_lock);
2560 	return eb;
2561 
2562 fail:
2563 	spin_lock(&tree->lru_lock);
2564 	list_del_init(&eb->lru);
2565 	spin_unlock(&tree->lru_lock);
2566 	if (!atomic_dec_and_test(&eb->refs))
2567 		return NULL;
2568 	for (index = 1; index < i; index++) {
2569 		page_cache_release(extent_buffer_page(eb, index));
2570 	}
2571 	if (i > 0)
2572 		page_cache_release(extent_buffer_page(eb, 0));
2573 	__free_extent_buffer(eb);
2574 	return NULL;
2575 }
2576 EXPORT_SYMBOL(alloc_extent_buffer);
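
/*
 * A minimal usage sketch of the typical extent_buffer lifecycle built from
 * the helpers above.  my_tree, START and LEN are hypothetical placeholders.
 *
 *	struct extent_buffer *eb;
 *	u8 header[32];
 *
 *	eb = alloc_extent_buffer(my_tree, START, LEN, NULL, GFP_NOFS);
 *	if (!eb)
 *		return -ENOMEM;
 *	if (read_extent_buffer_pages(my_tree, eb, 0, 1))  // 1 == wait for IO
 *		goto out;
 *	read_extent_buffer(eb, header, 0, sizeof(header));
 * out:
 *	free_extent_buffer(eb);
 */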
2577 
2578 struct extent_buffer *find_extent_buffer(struct extent_map_tree *tree,
2579 					 u64 start, unsigned long len,
2580 					  gfp_t mask)
2581 {
2582 	unsigned long num_pages = num_extent_pages(start, len);
2583 	unsigned long i;
2584 	unsigned long index = start >> PAGE_CACHE_SHIFT;
2585 	struct extent_buffer *eb;
2586 	struct page *p;
2587 	struct address_space *mapping = tree->mapping;
2588 	int uptodate = 1;
2589 
2590 	eb = __alloc_extent_buffer(tree, start, len, mask);
2591 	if (!eb || IS_ERR(eb))
2592 		return NULL;
2593 
2594 	if (eb->flags & EXTENT_BUFFER_FILLED)
2595 		goto lru_add;
2596 
2597 	for (i = 0; i < num_pages; i++, index++) {
2598 		p = find_lock_page(mapping, index);
2599 		if (!p) {
2600 			goto fail;
2601 		}
2602 		set_page_extent_mapped(p);
2603 		mark_page_accessed(p);
2604 
2605 		if (i == 0) {
2606 			eb->first_page = p;
2607 			set_page_extent_head(p, len);
2608 		} else {
2609 			set_page_private(p, EXTENT_PAGE_PRIVATE);
2610 		}
2611 
2612 		if (!PageUptodate(p))
2613 			uptodate = 0;
2614 		unlock_page(p);
2615 	}
2616 	if (uptodate)
2617 		eb->flags |= EXTENT_UPTODATE;
2618 	eb->flags |= EXTENT_BUFFER_FILLED;
2619 
2620 lru_add:
2621 	spin_lock(&tree->lru_lock);
2622 	add_lru(tree, eb);
2623 	spin_unlock(&tree->lru_lock);
2624 	return eb;
2625 fail:
2626 	spin_lock(&tree->lru_lock);
2627 	list_del_init(&eb->lru);
2628 	spin_unlock(&tree->lru_lock);
2629 	if (!atomic_dec_and_test(&eb->refs))
2630 		return NULL;
2631 	for (index = 1; index < i; index++) {
2632 		page_cache_release(extent_buffer_page(eb, index));
2633 	}
2634 	if (i > 0)
2635 		page_cache_release(extent_buffer_page(eb, 0));
2636 	__free_extent_buffer(eb);
2637 	return NULL;
2638 }
2639 EXPORT_SYMBOL(find_extent_buffer);
2640 
2641 void free_extent_buffer(struct extent_buffer *eb)
2642 {
2643 	unsigned long i;
2644 	unsigned long num_pages;
2645 
2646 	if (!eb)
2647 		return;
2648 
2649 	if (!atomic_dec_and_test(&eb->refs))
2650 		return;
2651 
2652 	WARN_ON(!list_empty(&eb->lru));
2653 	num_pages = num_extent_pages(eb->start, eb->len);
2654 
2655 	for (i = 1; i < num_pages; i++) {
2656 		page_cache_release(extent_buffer_page(eb, i));
2657 	}
2658 	page_cache_release(extent_buffer_page(eb, 0));
2659 	__free_extent_buffer(eb);
2660 }
2661 EXPORT_SYMBOL(free_extent_buffer);
2662 
2663 int clear_extent_buffer_dirty(struct extent_map_tree *tree,
2664 			      struct extent_buffer *eb)
2665 {
2666 	int set;
2667 	unsigned long i;
2668 	unsigned long num_pages;
2669 	struct page *page;
2670 
2671 	u64 start = eb->start;
2672 	u64 end = start + eb->len - 1;
2673 
2674 	set = clear_extent_dirty(tree, start, end, GFP_NOFS);
2675 	num_pages = num_extent_pages(eb->start, eb->len);
2676 
2677 	for (i = 0; i < num_pages; i++) {
2678 		page = extent_buffer_page(eb, i);
2679 		lock_page(page);
2680 		if (i == 0)
2681 			set_page_extent_head(page, eb->len);
2682 		else
2683 			set_page_private(page, EXTENT_PAGE_PRIVATE);
2684 
2685 		/*
2686 		 * if we're on the last page or the first page and the
2687 		 * block isn't aligned on a page boundary, do extra checks
2688 		 * to make sure we don't clean a page that is partially dirty
2689 		 */
2690 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2691 		    ((i == num_pages - 1) &&
2692 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2693 			start = (u64)page->index << PAGE_CACHE_SHIFT;
2694 			end  = start + PAGE_CACHE_SIZE - 1;
2695 			if (test_range_bit(tree, start, end,
2696 					   EXTENT_DIRTY, 0)) {
2697 				unlock_page(page);
2698 				continue;
2699 			}
2700 		}
2701 		clear_page_dirty_for_io(page);
2702 		write_lock_irq(&page->mapping->tree_lock);
2703 		if (!PageDirty(page)) {
2704 			radix_tree_tag_clear(&page->mapping->page_tree,
2705 						page_index(page),
2706 						PAGECACHE_TAG_DIRTY);
2707 		}
2708 		write_unlock_irq(&page->mapping->tree_lock);
2709 		unlock_page(page);
2710 	}
2711 	return 0;
2712 }
2713 EXPORT_SYMBOL(clear_extent_buffer_dirty);
2714 
2715 int wait_on_extent_buffer_writeback(struct extent_map_tree *tree,
2716 				    struct extent_buffer *eb)
2717 {
2718 	return wait_on_extent_writeback(tree, eb->start,
2719 					eb->start + eb->len - 1);
2720 }
2721 EXPORT_SYMBOL(wait_on_extent_buffer_writeback);
2722 
2723 int set_extent_buffer_dirty(struct extent_map_tree *tree,
2724 			     struct extent_buffer *eb)
2725 {
2726 	unsigned long i;
2727 	unsigned long num_pages;
2728 
2729 	num_pages = num_extent_pages(eb->start, eb->len);
2730 	for (i = 0; i < num_pages; i++) {
2731 		struct page *page = extent_buffer_page(eb, i);
2732 		/* writepage may need to do something special for the
2733 		 * first page, so we have to make sure page->private is
2734 		 * properly set.  releasepage may drop page->private
2735 		 * on us if the page isn't already dirty.
2736 		 */
2737 		if (i == 0) {
2738 			lock_page(page);
2739 			set_page_extent_head(page, eb->len);
2740 		} else if (PagePrivate(page) &&
2741 			   page->private != EXTENT_PAGE_PRIVATE) {
2742 			lock_page(page);
2743 			set_page_extent_mapped(page);
2744 			unlock_page(page);
2745 		}
2746 		__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
2747 		if (i == 0)
2748 			unlock_page(page);
2749 	}
2750 	return set_extent_dirty(tree, eb->start,
2751 				eb->start + eb->len - 1, GFP_NOFS);
2752 }
2753 EXPORT_SYMBOL(set_extent_buffer_dirty);
2754 
2755 int set_extent_buffer_uptodate(struct extent_map_tree *tree,
2756 				struct extent_buffer *eb)
2757 {
2758 	unsigned long i;
2759 	struct page *page;
2760 	unsigned long num_pages;
2761 
2762 	num_pages = num_extent_pages(eb->start, eb->len);
2763 
2764 	set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
2765 			    GFP_NOFS);
2766 	for (i = 0; i < num_pages; i++) {
2767 		page = extent_buffer_page(eb, i);
2768 		if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
2769 		    ((i == num_pages - 1) &&
2770 		     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
2771 			check_page_uptodate(tree, page);
2772 			continue;
2773 		}
2774 		SetPageUptodate(page);
2775 	}
2776 	return 0;
2777 }
2778 EXPORT_SYMBOL(set_extent_buffer_uptodate);
2779 
2780 int extent_buffer_uptodate(struct extent_map_tree *tree,
2781 			     struct extent_buffer *eb)
2782 {
2783 	if (eb->flags & EXTENT_UPTODATE)
2784 		return 1;
2785 	return test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2786 			   EXTENT_UPTODATE, 1);
2787 }
2788 EXPORT_SYMBOL(extent_buffer_uptodate);
2789 
2790 int read_extent_buffer_pages(struct extent_map_tree *tree,
2791 			     struct extent_buffer *eb,
2792 			     u64 start,
2793 			     int wait)
2794 {
2795 	unsigned long i;
2796 	unsigned long start_i;
2797 	struct page *page;
2798 	int err;
2799 	int ret = 0;
2800 	unsigned long num_pages;
2801 
2802 	if (eb->flags & EXTENT_UPTODATE)
2803 		return 0;
2804 
2805 	if (0 && test_range_bit(tree, eb->start, eb->start + eb->len - 1,
2806 			   EXTENT_UPTODATE, 1)) {
2807 		return 0;
2808 	}
2809 
2810 	if (start) {
2811 		WARN_ON(start < eb->start);
2812 		start_i = (start >> PAGE_CACHE_SHIFT) -
2813 			(eb->start >> PAGE_CACHE_SHIFT);
2814 	} else {
2815 		start_i = 0;
2816 	}
2817 
2818 	num_pages = num_extent_pages(eb->start, eb->len);
2819 	for (i = start_i; i < num_pages; i++) {
2820 		page = extent_buffer_page(eb, i);
2821 		if (PageUptodate(page)) {
2822 			continue;
2823 		}
2824 		if (!wait) {
2825 			if (TestSetPageLocked(page)) {
2826 				continue;
2827 			}
2828 		} else {
2829 			lock_page(page);
2830 		}
2831 		if (!PageUptodate(page)) {
2832 			err = page->mapping->a_ops->readpage(NULL, page);
2833 			if (err) {
2834 				ret = err;
2835 			}
2836 		} else {
2837 			unlock_page(page);
2838 		}
2839 	}
2840 
2841 	if (ret || !wait) {
2842 		return ret;
2843 	}
2844 
2845 	for (i = start_i; i < num_pages; i++) {
2846 		page = extent_buffer_page(eb, i);
2847 		wait_on_page_locked(page);
2848 		if (!PageUptodate(page)) {
2849 			ret = -EIO;
2850 		}
2851 	}
2852 	if (!ret)
2853 		eb->flags |= EXTENT_UPTODATE;
2854 	return ret;
2855 }
2856 EXPORT_SYMBOL(read_extent_buffer_pages);
2857 
2858 void read_extent_buffer(struct extent_buffer *eb, void *dstv,
2859 			unsigned long start,
2860 			unsigned long len)
2861 {
2862 	size_t cur;
2863 	size_t offset;
2864 	struct page *page;
2865 	char *kaddr;
2866 	char *dst = (char *)dstv;
2867 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2868 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2869 	unsigned long num_pages = num_extent_pages(eb->start, eb->len);
2870 
2871 	WARN_ON(start > eb->len);
2872 	WARN_ON(start + len > eb->start + eb->len);
2873 
2874 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2875 
2876 	while(len > 0) {
2877 		page = extent_buffer_page(eb, i);
2878 		if (!PageUptodate(page)) {
2879 			printk("page %lu not up to date i %lu, total %lu, len %lu\n", page->index, i, num_pages, eb->len);
2880 			WARN_ON(1);
2881 		}
2882 		WARN_ON(!PageUptodate(page));
2883 
2884 		cur = min(len, (PAGE_CACHE_SIZE - offset));
2885 		kaddr = kmap_atomic(page, KM_USER1);
2886 		memcpy(dst, kaddr + offset, cur);
2887 		kunmap_atomic(kaddr, KM_USER1);
2888 
2889 		dst += cur;
2890 		len -= cur;
2891 		offset = 0;
2892 		i++;
2893 	}
2894 }
2895 EXPORT_SYMBOL(read_extent_buffer);
2896 
2897 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
2898 			       unsigned long min_len, char **token, char **map,
2899 			       unsigned long *map_start,
2900 			       unsigned long *map_len, int km)
2901 {
2902 	size_t offset = start & (PAGE_CACHE_SIZE - 1);
2903 	char *kaddr;
2904 	struct page *p;
2905 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2906 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2907 	unsigned long end_i = (start_offset + start + min_len - 1) >>
2908 		PAGE_CACHE_SHIFT;
2909 
2910 	if (i != end_i)
2911 		return -EINVAL;
2912 
2913 	if (i == 0) {
2914 		offset = start_offset;
2915 		*map_start = 0;
2916 	} else {
2917 		offset = 0;
2918 		*map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset;
2919 	}
2920 	if (start + min_len > eb->len) {
2921 		printk("bad mapping eb start %Lu len %lu, wanted %lu %lu\n",
		       eb->start, eb->len, start, min_len);
2922 		WARN_ON(1);
2923 	}
2924 
2925 	p = extent_buffer_page(eb, i);
2926 	WARN_ON(!PageUptodate(p));
2927 	kaddr = kmap_atomic(p, km);
2928 	*token = kaddr;
2929 	*map = kaddr + offset;
2930 	*map_len = PAGE_CACHE_SIZE - offset;
2931 	return 0;
2932 }
2933 EXPORT_SYMBOL(map_private_extent_buffer);
2934 
2935 int map_extent_buffer(struct extent_buffer *eb, unsigned long start,
2936 		      unsigned long min_len,
2937 		      char **token, char **map,
2938 		      unsigned long *map_start,
2939 		      unsigned long *map_len, int km)
2940 {
2941 	int err;
2942 	int save = 0;
2943 	if (eb->map_token) {
2944 		unmap_extent_buffer(eb, eb->map_token, km);
2945 		eb->map_token = NULL;
2946 		save = 1;
2947 	}
2948 	err = map_private_extent_buffer(eb, start, min_len, token, map,
2949 				       map_start, map_len, km);
2950 	if (!err && save) {
2951 		eb->map_token = *token;
2952 		eb->kaddr = *map;
2953 		eb->map_start = *map_start;
2954 		eb->map_len = *map_len;
2955 	}
2956 	return err;
2957 }
2958 EXPORT_SYMBOL(map_extent_buffer);
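
/*
 * A minimal usage sketch: map_private_extent_buffer() only succeeds when
 * the requested range fits inside one page, so callers probe with it and
 * fall back to the copying helpers otherwise.  The byte at buffer offset
 * off lives at kaddr + (off - map_start).  eb, off, size and dest are
 * hypothetical.
 *
 *	char *token, *kaddr;
 *	unsigned long map_start, map_len;
 *
 *	if (!map_private_extent_buffer(eb, off, size, &token, &kaddr,
 *				       &map_start, &map_len, KM_USER1)) {
 *		memcpy(dest, kaddr + (off - map_start), size);
 *		unmap_extent_buffer(eb, token, KM_USER1);
 *	} else {
 *		read_extent_buffer(eb, dest, off, size);  // range crosses a page
 *	}
 */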
2959 
2960 void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km)
2961 {
2962 	kunmap_atomic(token, km);
2963 }
2964 EXPORT_SYMBOL(unmap_extent_buffer);
2965 
2966 int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
2967 			  unsigned long start,
2968 			  unsigned long len)
2969 {
2970 	size_t cur;
2971 	size_t offset;
2972 	struct page *page;
2973 	char *kaddr;
2974 	char *ptr = (char *)ptrv;
2975 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
2976 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
2977 	int ret = 0;
2978 
2979 	WARN_ON(start > eb->len);
2980 	WARN_ON(start + len > eb->start + eb->len);
2981 
2982 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
2983 
2984 	while(len > 0) {
2985 		page = extent_buffer_page(eb, i);
2986 		WARN_ON(!PageUptodate(page));
2987 
2988 		cur = min(len, (PAGE_CACHE_SIZE - offset));
2989 
2990 		kaddr = kmap_atomic(page, KM_USER0);
2991 		ret = memcmp(ptr, kaddr + offset, cur);
2992 		kunmap_atomic(kaddr, KM_USER0);
2993 		if (ret)
2994 			break;
2995 
2996 		ptr += cur;
2997 		len -= cur;
2998 		offset = 0;
2999 		i++;
3000 	}
3001 	return ret;
3002 }
3003 EXPORT_SYMBOL(memcmp_extent_buffer);
3004 
3005 void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
3006 			 unsigned long start, unsigned long len)
3007 {
3008 	size_t cur;
3009 	size_t offset;
3010 	struct page *page;
3011 	char *kaddr;
3012 	char *src = (char *)srcv;
3013 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3014 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3015 
3016 	WARN_ON(start > eb->len);
3017 	WARN_ON(start + len > eb->start + eb->len);
3018 
3019 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3020 
3021 	while(len > 0) {
3022 		page = extent_buffer_page(eb, i);
3023 		WARN_ON(!PageUptodate(page));
3024 
3025 		cur = min(len, PAGE_CACHE_SIZE - offset);
3026 		kaddr = kmap_atomic(page, KM_USER1);
3027 		memcpy(kaddr + offset, src, cur);
3028 		kunmap_atomic(kaddr, KM_USER1);
3029 
3030 		src += cur;
3031 		len -= cur;
3032 		offset = 0;
3033 		i++;
3034 	}
3035 }
3036 EXPORT_SYMBOL(write_extent_buffer);
3037 
3038 void memset_extent_buffer(struct extent_buffer *eb, char c,
3039 			  unsigned long start, unsigned long len)
3040 {
3041 	size_t cur;
3042 	size_t offset;
3043 	struct page *page;
3044 	char *kaddr;
3045 	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);
3046 	unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT;
3047 
3048 	WARN_ON(start > eb->len);
3049 	WARN_ON(start + len > eb->start + eb->len);
3050 
3051 	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
3052 
3053 	while(len > 0) {
3054 		page = extent_buffer_page(eb, i);
3055 		WARN_ON(!PageUptodate(page));
3056 
3057 		cur = min(len, PAGE_CACHE_SIZE - offset);
3058 		kaddr = kmap_atomic(page, KM_USER0);
3059 		memset(kaddr + offset, c, cur);
3060 		kunmap_atomic(kaddr, KM_USER0);
3061 
3062 		len -= cur;
3063 		offset = 0;
3064 		i++;
3065 	}
3066 }
3067 EXPORT_SYMBOL(memset_extent_buffer);
3068 
3069 void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
3070 			unsigned long dst_offset, unsigned long src_offset,
3071 			unsigned long len)
3072 {
3073 	u64 dst_len = dst->len;
3074 	size_t cur;
3075 	size_t offset;
3076 	struct page *page;
3077 	char *kaddr;
3078 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3079 	unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3080 
3081 	WARN_ON(src->len != dst_len);
3082 
3083 	offset = (start_offset + dst_offset) &
3084 		((unsigned long)PAGE_CACHE_SIZE - 1);
3085 
3086 	while(len > 0) {
3087 		page = extent_buffer_page(dst, i);
3088 		WARN_ON(!PageUptodate(page));
3089 
3090 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset));
3091 
3092 		kaddr = kmap_atomic(page, KM_USER0);
3093 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
3094 		kunmap_atomic(kaddr, KM_USER0);
3095 
3096 		src_offset += cur;
3097 		len -= cur;
3098 		offset = 0;
3099 		i++;
3100 	}
3101 }
3102 EXPORT_SYMBOL(copy_extent_buffer);
3103 
3104 static void move_pages(struct page *dst_page, struct page *src_page,
3105 		       unsigned long dst_off, unsigned long src_off,
3106 		       unsigned long len)
3107 {
3108 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3109 	if (dst_page == src_page) {
3110 		memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len);
3111 	} else {
3112 		char *src_kaddr = kmap_atomic(src_page, KM_USER1);
3113 		char *p = dst_kaddr + dst_off + len;
3114 		char *s = src_kaddr + src_off + len;
3115 
3116 		while (len--)
3117 			*--p = *--s;
3118 
3119 		kunmap_atomic(src_kaddr, KM_USER1);
3120 	}
3121 	kunmap_atomic(dst_kaddr, KM_USER0);
3122 }
3123 
3124 static void copy_pages(struct page *dst_page, struct page *src_page,
3125 		       unsigned long dst_off, unsigned long src_off,
3126 		       unsigned long len)
3127 {
3128 	char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3129 	char *src_kaddr;
3130 
3131 	if (dst_page != src_page)
3132 		src_kaddr = kmap_atomic(src_page, KM_USER1);
3133 	else
3134 		src_kaddr = dst_kaddr;
3135 
3136 	memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3137 	kunmap_atomic(dst_kaddr, KM_USER0);
3138 	if (dst_page != src_page)
3139 		kunmap_atomic(src_kaddr, KM_USER1);
3140 }
3141 
3142 void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3143 			   unsigned long src_offset, unsigned long len)
3144 {
3145 	size_t cur;
3146 	size_t dst_off_in_page;
3147 	size_t src_off_in_page;
3148 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3149 	unsigned long dst_i;
3150 	unsigned long src_i;
3151 
3152 	if (src_offset + len > dst->len) {
3153 		printk("memcpy bogus src_offset %lu copy len %lu dst len %lu\n",
3154 		       src_offset, len, dst->len);
3155 		BUG_ON(1);
3156 	}
3157 	if (dst_offset + len > dst->len) {
3158 		printk("memcpy bogus dst_offset %lu copy len %lu dst len %lu\n",
3159 		       dst_offset, len, dst->len);
3160 		BUG_ON(1);
3161 	}
3162 
3163 	while(len > 0) {
3164 		dst_off_in_page = (start_offset + dst_offset) &
3165 			((unsigned long)PAGE_CACHE_SIZE - 1);
3166 		src_off_in_page = (start_offset + src_offset) &
3167 			((unsigned long)PAGE_CACHE_SIZE - 1);
3168 
3169 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
3170 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
3171 
3172 		cur = min(len, (unsigned long)(PAGE_CACHE_SIZE -
3173 					       src_off_in_page));
3174 		cur = min_t(unsigned long, cur,
3175 			(unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page));
3176 
3177 		copy_pages(extent_buffer_page(dst, dst_i),
3178 			   extent_buffer_page(dst, src_i),
3179 			   dst_off_in_page, src_off_in_page, cur);
3180 
3181 		src_offset += cur;
3182 		dst_offset += cur;
3183 		len -= cur;
3184 	}
3185 }
3186 EXPORT_SYMBOL(memcpy_extent_buffer);
3187 
3188 void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3189 			   unsigned long src_offset, unsigned long len)
3190 {
3191 	size_t cur;
3192 	size_t dst_off_in_page;
3193 	size_t src_off_in_page;
3194 	unsigned long dst_end = dst_offset + len - 1;
3195 	unsigned long src_end = src_offset + len - 1;
3196 	size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1);
3197 	unsigned long dst_i;
3198 	unsigned long src_i;
3199 
3200 	if (src_offset + len > dst->len) {
3201 		printk("memmove bogus src_offset %lu move len %lu len %lu\n",
3202 		       src_offset, len, dst->len);
3203 		BUG_ON(1);
3204 	}
3205 	if (dst_offset + len > dst->len) {
3206 		printk("memmove bogus dst_offset %lu move len %lu len %lu\n",
3207 		       dst_offset, len, dst->len);
3208 		BUG_ON(1);
3209 	}
3210 	if (dst_offset < src_offset) {
3211 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3212 		return;
3213 	}
3214 	while(len > 0) {
3215 		dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT;
3216 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
3217 
3218 		dst_off_in_page = (start_offset + dst_end) &
3219 			((unsigned long)PAGE_CACHE_SIZE - 1);
3220 		src_off_in_page = (start_offset + src_end) &
3221 			((unsigned long)PAGE_CACHE_SIZE - 1);
3222 
3223 		cur = min_t(unsigned long, len, src_off_in_page + 1);
3224 		cur = min(cur, dst_off_in_page + 1);
3225 		move_pages(extent_buffer_page(dst, dst_i),
3226 			   extent_buffer_page(dst, src_i),
3227 			   dst_off_in_page - cur + 1,
3228 			   src_off_in_page - cur + 1, cur);
3229 
3230 		dst_end -= cur;
3231 		src_end -= cur;
3232 		len -= cur;
3233 	}
3234 }
3235 EXPORT_SYMBOL(memmove_extent_buffer);
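
/*
 * A minimal usage sketch: memmove_extent_buffer() copes with overlapping
 * ranges, so it is the helper to use for opening a gap inside a metadata
 * buffer before inserting new data.  slot_offset, ITEM_SIZE, nr and
 * new_item are hypothetical.
 *
 *	// shift nr items up by one slot to make room at slot_offset
 *	memmove_extent_buffer(eb, slot_offset + ITEM_SIZE, slot_offset,
 *			      (unsigned long)ITEM_SIZE * nr);
 *	write_extent_buffer(eb, new_item, slot_offset, ITEM_SIZE);
 */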
3236