xref: /openbmc/linux/fs/btrfs/extent_io.c (revision 6726d552)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/bitops.h>
4 #include <linux/slab.h>
5 #include <linux/bio.h>
6 #include <linux/mm.h>
7 #include <linux/pagemap.h>
8 #include <linux/page-flags.h>
9 #include <linux/sched/mm.h>
10 #include <linux/spinlock.h>
11 #include <linux/blkdev.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include <linux/prefetch.h>
16 #include <linux/fsverity.h>
17 #include "misc.h"
18 #include "extent_io.h"
19 #include "extent-io-tree.h"
20 #include "extent_map.h"
21 #include "ctree.h"
22 #include "btrfs_inode.h"
23 #include "volumes.h"
24 #include "check-integrity.h"
25 #include "locking.h"
26 #include "rcu-string.h"
27 #include "backref.h"
28 #include "disk-io.h"
29 #include "subpage.h"
30 #include "zoned.h"
31 #include "block-group.h"
32 #include "compression.h"
33 
34 static struct kmem_cache *extent_state_cache;
35 static struct kmem_cache *extent_buffer_cache;
36 static struct bio_set btrfs_bioset;
37 
38 static inline bool extent_state_in_tree(const struct extent_state *state)
39 {
40 	return !RB_EMPTY_NODE(&state->rb_node);
41 }
42 
43 #ifdef CONFIG_BTRFS_DEBUG
/* Taken by alloc/free of extent states when they update the 'states' list. */
static DEFINE_SPINLOCK(leak_lock);
/* Every allocated extent_state is linked here until it is freed. */
static LIST_HEAD(states);
46 
47 static inline void btrfs_leak_debug_add(spinlock_t *lock,
48 					struct list_head *new,
49 					struct list_head *head)
50 {
51 	unsigned long flags;
52 
53 	spin_lock_irqsave(lock, flags);
54 	list_add(new, head);
55 	spin_unlock_irqrestore(lock, flags);
56 }
57 
58 static inline void btrfs_leak_debug_del(spinlock_t *lock,
59 					struct list_head *entry)
60 {
61 	unsigned long flags;
62 
63 	spin_lock_irqsave(lock, flags);
64 	list_del(entry);
65 	spin_unlock_irqrestore(lock, flags);
66 }
67 
68 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
69 {
70 	struct extent_buffer *eb;
71 	unsigned long flags;
72 
73 	/*
74 	 * If we didn't get into open_ctree our allocated_ebs will not be
75 	 * initialized, so just skip this.
76 	 */
77 	if (!fs_info->allocated_ebs.next)
78 		return;
79 
80 	WARN_ON(!list_empty(&fs_info->allocated_ebs));
81 	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
82 	while (!list_empty(&fs_info->allocated_ebs)) {
83 		eb = list_first_entry(&fs_info->allocated_ebs,
84 				      struct extent_buffer, leak_list);
85 		pr_err(
86 	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
87 		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
88 		       btrfs_header_owner(eb));
89 		list_del(&eb->leak_list);
90 		kmem_cache_free(extent_buffer_cache, eb);
91 	}
92 	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
93 }
94 
95 static inline void btrfs_extent_state_leak_debug_check(void)
96 {
97 	struct extent_state *state;
98 
99 	while (!list_empty(&states)) {
100 		state = list_entry(states.next, struct extent_state, leak_list);
101 		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
102 		       state->start, state->end, state->state,
103 		       extent_state_in_tree(state),
104 		       refcount_read(&state->refs));
105 		list_del(&state->leak_list);
106 		kmem_cache_free(extent_state_cache, state);
107 	}
108 }
109 
110 #define btrfs_debug_check_extent_io_range(tree, start, end)		\
111 	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
112 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
113 		struct extent_io_tree *tree, u64 start, u64 end)
114 {
115 	struct inode *inode = tree->private_data;
116 	u64 isize;
117 
118 	if (!inode || !is_data_inode(inode))
119 		return;
120 
121 	isize = i_size_read(inode);
122 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
123 		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
124 		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
125 			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
126 	}
127 }
128 #else
/* Without CONFIG_BTRFS_DEBUG the debugging helpers expand to empty statements. */
#define btrfs_leak_debug_add(lock, new, head)			do {} while (0)
#define btrfs_leak_debug_del(lock, entry)			do {} while (0)
#define btrfs_extent_state_leak_debug_check()			do {} while (0)
#define btrfs_debug_check_extent_io_range(tree, start, end)	do {} while (0)
133 #endif
134 
135 struct tree_entry {
136 	u64 start;
137 	u64 end;
138 	struct rb_node rb_node;
139 };
140 
141 /*
142  * Structure to record info about the bio being assembled, and other info like
143  * how many bytes are there before stripe/ordered extent boundary.
144  */
145 struct btrfs_bio_ctrl {
146 	struct bio *bio;
147 	int mirror_num;
148 	enum btrfs_compression_type compress_type;
149 	u32 len_to_stripe_boundary;
150 	u32 len_to_oe_boundary;
151 };
152 
153 struct extent_page_data {
154 	struct btrfs_bio_ctrl bio_ctrl;
155 	/* tells writepage not to lock the state bits for this range
156 	 * it still does the unlocking
157 	 */
158 	unsigned int extent_locked:1;
159 
160 	/* tells the submit_bio code to use REQ_SYNC */
161 	unsigned int sync_io:1;
162 };
163 
164 static int add_extent_changeset(struct extent_state *state, u32 bits,
165 				 struct extent_changeset *changeset,
166 				 int set)
167 {
168 	int ret;
169 
170 	if (!changeset)
171 		return 0;
172 	if (set && (state->state & bits) == bits)
173 		return 0;
174 	if (!set && (state->state & bits) == 0)
175 		return 0;
176 	changeset->bytes_changed += state->end - state->start + 1;
177 	ret = ulist_add(&changeset->range_changed, state->start, state->end,
178 			GFP_ATOMIC);
179 	return ret;
180 }
181 
182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
183 {
184 	struct bio *bio;
185 	struct bio_vec *bv;
186 	struct inode *inode;
187 	int mirror_num;
188 
189 	if (!bio_ctrl->bio)
190 		return;
191 
192 	bio = bio_ctrl->bio;
193 	bv = bio_first_bvec_all(bio);
194 	inode = bv->bv_page->mapping->host;
195 	mirror_num = bio_ctrl->mirror_num;
196 
197 	/* Caller should ensure the bio has at least some range added */
198 	ASSERT(bio->bi_iter.bi_size);
199 
200 	btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
201 
202 	if (!is_data_inode(inode))
203 		btrfs_submit_metadata_bio(inode, bio, mirror_num);
204 	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
205 		btrfs_submit_data_write_bio(inode, bio, mirror_num);
206 	else
207 		btrfs_submit_data_read_bio(inode, bio, mirror_num,
208 					   bio_ctrl->compress_type);
209 
210 	/* The bio is owned by the bi_end_io handler now */
211 	bio_ctrl->bio = NULL;
212 }
213 
214 /*
215  * Submit or fail the current bio in an extent_page_data structure.
216  */
217 static void submit_write_bio(struct extent_page_data *epd, int ret)
218 {
219 	struct bio *bio = epd->bio_ctrl.bio;
220 
221 	if (!bio)
222 		return;
223 
224 	if (ret) {
225 		ASSERT(ret < 0);
226 		bio->bi_status = errno_to_blk_status(ret);
227 		bio_endio(bio);
228 		/* The bio is owned by the bi_end_io handler now */
229 		epd->bio_ctrl.bio = NULL;
230 	} else {
231 		submit_one_bio(&epd->bio_ctrl);
232 	}
233 }
234 
235 int __init extent_state_cache_init(void)
236 {
237 	extent_state_cache = kmem_cache_create("btrfs_extent_state",
238 			sizeof(struct extent_state), 0,
239 			SLAB_MEM_SPREAD, NULL);
240 	if (!extent_state_cache)
241 		return -ENOMEM;
242 	return 0;
243 }
244 
245 int __init extent_io_init(void)
246 {
247 	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
248 			sizeof(struct extent_buffer), 0,
249 			SLAB_MEM_SPREAD, NULL);
250 	if (!extent_buffer_cache)
251 		return -ENOMEM;
252 
253 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
254 			offsetof(struct btrfs_bio, bio),
255 			BIOSET_NEED_BVECS))
256 		goto free_buffer_cache;
257 
258 	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
259 		goto free_bioset;
260 
261 	return 0;
262 
263 free_bioset:
264 	bioset_exit(&btrfs_bioset);
265 
266 free_buffer_cache:
267 	kmem_cache_destroy(extent_buffer_cache);
268 	extent_buffer_cache = NULL;
269 	return -ENOMEM;
270 }
271 
272 void __cold extent_state_cache_exit(void)
273 {
274 	btrfs_extent_state_leak_debug_check();
275 	kmem_cache_destroy(extent_state_cache);
276 }
277 
278 void __cold extent_io_exit(void)
279 {
280 	/*
281 	 * Make sure all delayed rcu free are flushed before we
282 	 * destroy caches.
283 	 */
284 	rcu_barrier();
285 	kmem_cache_destroy(extent_buffer_cache);
286 	bioset_exit(&btrfs_bioset);
287 }
288 
289 /*
290  * For the file_extent_tree, we want to hold the inode lock when we lookup and
291  * update the disk_i_size, but lockdep will complain because our io_tree we hold
292  * the tree lock and get the inode lock when setting delalloc.  These two things
293  * are unrelated, so make a class for the file_extent_tree so we don't get the
294  * two locking patterns mixed up.
295  */
296 static struct lock_class_key file_extent_tree_class;
297 
298 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
299 			 struct extent_io_tree *tree, unsigned int owner,
300 			 void *private_data)
301 {
302 	tree->fs_info = fs_info;
303 	tree->state = RB_ROOT;
304 	tree->dirty_bytes = 0;
305 	spin_lock_init(&tree->lock);
306 	tree->private_data = private_data;
307 	tree->owner = owner;
308 	if (owner == IO_TREE_INODE_FILE_EXTENT)
309 		lockdep_set_class(&tree->lock, &file_extent_tree_class);
310 }
311 
312 void extent_io_tree_release(struct extent_io_tree *tree)
313 {
314 	spin_lock(&tree->lock);
315 	/*
316 	 * Do a single barrier for the waitqueue_active check here, the state
317 	 * of the waitqueue should not change once extent_io_tree_release is
318 	 * called.
319 	 */
320 	smp_mb();
321 	while (!RB_EMPTY_ROOT(&tree->state)) {
322 		struct rb_node *node;
323 		struct extent_state *state;
324 
325 		node = rb_first(&tree->state);
326 		state = rb_entry(node, struct extent_state, rb_node);
327 		rb_erase(&state->rb_node, &tree->state);
328 		RB_CLEAR_NODE(&state->rb_node);
329 		/*
330 		 * btree io trees aren't supposed to have tasks waiting for
331 		 * changes in the flags of extent states ever.
332 		 */
333 		ASSERT(!waitqueue_active(&state->wq));
334 		free_extent_state(state);
335 
336 		cond_resched_lock(&tree->lock);
337 	}
338 	spin_unlock(&tree->lock);
339 }
340 
341 static struct extent_state *alloc_extent_state(gfp_t mask)
342 {
343 	struct extent_state *state;
344 
345 	/*
346 	 * The given mask might be not appropriate for the slab allocator,
347 	 * drop the unsupported bits
348 	 */
349 	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
350 	state = kmem_cache_alloc(extent_state_cache, mask);
351 	if (!state)
352 		return state;
353 	state->state = 0;
354 	state->failrec = NULL;
355 	RB_CLEAR_NODE(&state->rb_node);
356 	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
357 	refcount_set(&state->refs, 1);
358 	init_waitqueue_head(&state->wq);
359 	trace_alloc_extent_state(state, mask, _RET_IP_);
360 	return state;
361 }
362 
363 void free_extent_state(struct extent_state *state)
364 {
365 	if (!state)
366 		return;
367 	if (refcount_dec_and_test(&state->refs)) {
368 		WARN_ON(extent_state_in_tree(state));
369 		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
370 		trace_free_extent_state(state, _RET_IP_);
371 		kmem_cache_free(extent_state_cache, state);
372 	}
373 }
374 
375 /**
376  * Search @tree for an entry that contains @offset. Such entry would have
377  * entry->start <= offset && entry->end >= offset.
378  *
379  * @tree:       the tree to search
380  * @offset:     offset that should fall within an entry in @tree
381  * @node_ret:   pointer where new node should be anchored (used when inserting an
382  *	        entry in the tree)
383  * @parent_ret: points to entry which would have been the parent of the entry,
384  *               containing @offset
385  *
386  * Return a pointer to the entry that contains @offset byte address and don't change
387  * @node_ret and @parent_ret.
388  *
389  * If no such entry exists, return pointer to entry that ends before @offset
390  * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
391  */
392 static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
393 					             u64 offset,
394 						     struct rb_node ***node_ret,
395 						     struct rb_node **parent_ret)
396 {
397 	struct rb_root *root = &tree->state;
398 	struct rb_node **node = &root->rb_node;
399 	struct rb_node *prev = NULL;
400 	struct tree_entry *entry;
401 
402 	while (*node) {
403 		prev = *node;
404 		entry = rb_entry(prev, struct tree_entry, rb_node);
405 
406 		if (offset < entry->start)
407 			node = &(*node)->rb_left;
408 		else if (offset > entry->end)
409 			node = &(*node)->rb_right;
410 		else
411 			return *node;
412 	}
413 
414 	if (node_ret)
415 		*node_ret = node;
416 	if (parent_ret)
417 		*parent_ret = prev;
418 
419 	/* Search neighbors until we find the first one past the end */
420 	while (prev && offset > entry->end) {
421 		prev = rb_next(prev);
422 		entry = rb_entry(prev, struct tree_entry, rb_node);
423 	}
424 
425 	return prev;
426 }
427 
428 /*
429  * Inexact rb-tree search, return the next entry if @offset is not found
430  */
431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
432 {
433 	return tree_search_for_insert(tree, offset, NULL, NULL);
434 }
435 
436 /**
437  * Search offset in the tree or fill neighbor rbtree node pointers.
438  *
439  * @tree:      the tree to search
440  * @offset:    offset that should fall within an entry in @tree
441  * @next_ret:  pointer to the first entry whose range ends after @offset
442  * @prev_ret:  pointer to the first entry whose range begins before @offset
443  *
444  * Return a pointer to the entry that contains @offset byte address. If no
445  * such entry exists, then return NULL and fill @prev_ret and @next_ret.
446  * Otherwise return the found entry and other pointers are left untouched.
447  */
448 static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
449 					     u64 offset,
450 					     struct rb_node **prev_ret,
451 					     struct rb_node **next_ret)
452 {
453 	struct rb_root *root = &tree->state;
454 	struct rb_node **node = &root->rb_node;
455 	struct rb_node *prev = NULL;
456 	struct rb_node *orig_prev = NULL;
457 	struct tree_entry *entry;
458 
459 	ASSERT(prev_ret);
460 	ASSERT(next_ret);
461 
462 	while (*node) {
463 		prev = *node;
464 		entry = rb_entry(prev, struct tree_entry, rb_node);
465 
466 		if (offset < entry->start)
467 			node = &(*node)->rb_left;
468 		else if (offset > entry->end)
469 			node = &(*node)->rb_right;
470 		else
471 			return *node;
472 	}
473 
474 	orig_prev = prev;
475 	while (prev && offset > entry->end) {
476 		prev = rb_next(prev);
477 		entry = rb_entry(prev, struct tree_entry, rb_node);
478 	}
479 	*next_ret = prev;
480 	prev = orig_prev;
481 
482 	entry = rb_entry(prev, struct tree_entry, rb_node);
483 	while (prev && offset < entry->start) {
484 		prev = rb_prev(prev);
485 		entry = rb_entry(prev, struct tree_entry, rb_node);
486 	}
487 	*prev_ret = prev;
488 
489 	return NULL;
490 }
491 
492 /*
493  * utility function to look for merge candidates inside a given range.
494  * Any extents with matching state are merged together into a single
495  * extent in the tree.  Extents with EXTENT_IO in their state field
496  * are not merged because the end_io handlers need to be able to do
497  * operations on them without sleeping (or doing allocations/splits).
498  *
499  * This should be called with the tree lock held.
500  */
501 static void merge_state(struct extent_io_tree *tree,
502 		        struct extent_state *state)
503 {
504 	struct extent_state *other;
505 	struct rb_node *other_node;
506 
507 	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
508 		return;
509 
510 	other_node = rb_prev(&state->rb_node);
511 	if (other_node) {
512 		other = rb_entry(other_node, struct extent_state, rb_node);
513 		if (other->end == state->start - 1 &&
514 		    other->state == state->state) {
515 			if (tree->private_data &&
516 			    is_data_inode(tree->private_data))
517 				btrfs_merge_delalloc_extent(tree->private_data,
518 							    state, other);
519 			state->start = other->start;
520 			rb_erase(&other->rb_node, &tree->state);
521 			RB_CLEAR_NODE(&other->rb_node);
522 			free_extent_state(other);
523 		}
524 	}
525 	other_node = rb_next(&state->rb_node);
526 	if (other_node) {
527 		other = rb_entry(other_node, struct extent_state, rb_node);
528 		if (other->start == state->end + 1 &&
529 		    other->state == state->state) {
530 			if (tree->private_data &&
531 			    is_data_inode(tree->private_data))
532 				btrfs_merge_delalloc_extent(tree->private_data,
533 							    state, other);
534 			state->end = other->end;
535 			rb_erase(&other->rb_node, &tree->state);
536 			RB_CLEAR_NODE(&other->rb_node);
537 			free_extent_state(other);
538 		}
539 	}
540 }
541 
542 static void set_state_bits(struct extent_io_tree *tree,
543 			   struct extent_state *state, u32 bits,
544 			   struct extent_changeset *changeset);
545 
546 /*
547  * insert an extent_state struct into the tree.  'bits' are set on the
548  * struct before it is inserted.
549  *
550  * This may return -EEXIST if the extent is already there, in which case the
551  * state struct is freed.
552  *
553  * The tree lock is not taken internally.  This is a utility function and
554  * probably isn't what you want to call (see set/clear_extent_bit).
555  */
556 static int insert_state(struct extent_io_tree *tree,
557 			struct extent_state *state,
558 			u32 bits, struct extent_changeset *changeset)
559 {
560 	struct rb_node **node;
561 	struct rb_node *parent;
562 	const u64 end = state->end;
563 
564 	set_state_bits(tree, state, bits, changeset);
565 
566 	node = &tree->state.rb_node;
567 	while (*node) {
568 		struct tree_entry *entry;
569 
570 		parent = *node;
571 		entry = rb_entry(parent, struct tree_entry, rb_node);
572 
573 		if (end < entry->start) {
574 			node = &(*node)->rb_left;
575 		} else if (end > entry->end) {
576 			node = &(*node)->rb_right;
577 		} else {
578 			btrfs_err(tree->fs_info,
579 			       "found node %llu %llu on insert of %llu %llu",
580 			       entry->start, entry->end, state->start, end);
581 			return -EEXIST;
582 		}
583 	}
584 
585 	rb_link_node(&state->rb_node, parent, node);
586 	rb_insert_color(&state->rb_node, &tree->state);
587 
588 	merge_state(tree, state);
589 	return 0;
590 }
591 
592 /*
593  * Insert state to @tree to the location given by @node and @parent.
594  */
595 static void insert_state_fast(struct extent_io_tree *tree,
596 			      struct extent_state *state, struct rb_node **node,
597 			      struct rb_node *parent, unsigned bits,
598 			      struct extent_changeset *changeset)
599 {
600 	set_state_bits(tree, state, bits, changeset);
601 	rb_link_node(&state->rb_node, parent, node);
602 	rb_insert_color(&state->rb_node, &tree->state);
603 	merge_state(tree, state);
604 }
605 
606 /*
607  * split a given extent state struct in two, inserting the preallocated
608  * struct 'prealloc' as the newly created second half.  'split' indicates an
609  * offset inside 'orig' where it should be split.
610  *
611  * Before calling,
612  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
613  * are two extent state structs in the tree:
614  * prealloc: [orig->start, split - 1]
615  * orig: [ split, orig->end ]
616  *
617  * The tree locks are not taken by this function. They need to be held
618  * by the caller.
619  */
620 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
621 		       struct extent_state *prealloc, u64 split)
622 {
623 	struct rb_node *parent = NULL;
624 	struct rb_node **node;
625 
626 	if (tree->private_data && is_data_inode(tree->private_data))
627 		btrfs_split_delalloc_extent(tree->private_data, orig, split);
628 
629 	prealloc->start = orig->start;
630 	prealloc->end = split - 1;
631 	prealloc->state = orig->state;
632 	orig->start = split;
633 
634 	parent = &orig->rb_node;
635 	node = &parent;
636 	while (*node) {
637 		struct tree_entry *entry;
638 
639 		parent = *node;
640 		entry = rb_entry(parent, struct tree_entry, rb_node);
641 
642 		if (prealloc->end < entry->start) {
643 			node = &(*node)->rb_left;
644 		} else if (prealloc->end > entry->end) {
645 			node = &(*node)->rb_right;
646 		} else {
647 			free_extent_state(prealloc);
648 			return -EEXIST;
649 		}
650 	}
651 
652 	rb_link_node(&prealloc->rb_node, parent, node);
653 	rb_insert_color(&prealloc->rb_node, &tree->state);
654 
655 	return 0;
656 }
657 
658 static struct extent_state *next_state(struct extent_state *state)
659 {
660 	struct rb_node *next = rb_next(&state->rb_node);
661 	if (next)
662 		return rb_entry(next, struct extent_state, rb_node);
663 	else
664 		return NULL;
665 }
666 
667 /*
668  * utility function to clear some bits in an extent state struct.
669  * it will optionally wake up anyone waiting on this state (wake == 1).
670  *
671  * If no bits are set on the state struct after clearing things, the
672  * struct is freed and removed from the tree
673  */
674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
675 					    struct extent_state *state,
676 					    u32 bits, int wake,
677 					    struct extent_changeset *changeset)
678 {
679 	struct extent_state *next;
680 	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
681 	int ret;
682 
683 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
684 		u64 range = state->end - state->start + 1;
685 		WARN_ON(range > tree->dirty_bytes);
686 		tree->dirty_bytes -= range;
687 	}
688 
689 	if (tree->private_data && is_data_inode(tree->private_data))
690 		btrfs_clear_delalloc_extent(tree->private_data, state, bits);
691 
692 	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
693 	BUG_ON(ret < 0);
694 	state->state &= ~bits_to_clear;
695 	if (wake)
696 		wake_up(&state->wq);
697 	if (state->state == 0) {
698 		next = next_state(state);
699 		if (extent_state_in_tree(state)) {
700 			rb_erase(&state->rb_node, &tree->state);
701 			RB_CLEAR_NODE(&state->rb_node);
702 			free_extent_state(state);
703 		} else {
704 			WARN_ON(1);
705 		}
706 	} else {
707 		merge_state(tree, state);
708 		next = next_state(state);
709 	}
710 	return next;
711 }
712 
713 static struct extent_state *
714 alloc_extent_state_atomic(struct extent_state *prealloc)
715 {
716 	if (!prealloc)
717 		prealloc = alloc_extent_state(GFP_ATOMIC);
718 
719 	return prealloc;
720 }
721 
722 static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
723 {
724 	btrfs_panic(tree->fs_info, err,
725 	"locking error: extent tree was modified by another thread while locked");
726 }
727 
728 /*
729  * clear some bits on a range in the tree.  This may require splitting
730  * or inserting elements in the tree, so the gfp mask is used to
731  * indicate which allocations or sleeping are allowed.
732  *
733  * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
734  * the given range from the tree regardless of state (ie for truncate).
735  *
736  * the range [start, end] is inclusive.
737  *
738  * This takes the tree lock, and returns 0 on success and < 0 on error.
739  */
740 int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
741 		       u32 bits, int wake, int delete,
742 		       struct extent_state **cached_state,
743 		       gfp_t mask, struct extent_changeset *changeset)
744 {
745 	struct extent_state *state;
746 	struct extent_state *cached;
747 	struct extent_state *prealloc = NULL;
748 	struct rb_node *node;
749 	u64 last_end;
750 	int err;
751 	int clear = 0;
752 
753 	btrfs_debug_check_extent_io_range(tree, start, end);
754 	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);
755 
756 	if (bits & EXTENT_DELALLOC)
757 		bits |= EXTENT_NORESERVE;
758 
759 	if (delete)
760 		bits |= ~EXTENT_CTLBITS;
761 
762 	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
763 		clear = 1;
764 again:
765 	if (!prealloc && gfpflags_allow_blocking(mask)) {
766 		/*
767 		 * Don't care for allocation failure here because we might end
768 		 * up not needing the pre-allocated extent state at all, which
769 		 * is the case if we only have in the tree extent states that
770 		 * cover our input range and don't cover too any other range.
771 		 * If we end up needing a new extent state we allocate it later.
772 		 */
773 		prealloc = alloc_extent_state(mask);
774 	}
775 
776 	spin_lock(&tree->lock);
777 	if (cached_state) {
778 		cached = *cached_state;
779 
780 		if (clear) {
781 			*cached_state = NULL;
782 			cached_state = NULL;
783 		}
784 
785 		if (cached && extent_state_in_tree(cached) &&
786 		    cached->start <= start && cached->end > start) {
787 			if (clear)
788 				refcount_dec(&cached->refs);
789 			state = cached;
790 			goto hit_next;
791 		}
792 		if (clear)
793 			free_extent_state(cached);
794 	}
795 	/*
796 	 * this search will find the extents that end after
797 	 * our range starts
798 	 */
799 	node = tree_search(tree, start);
800 	if (!node)
801 		goto out;
802 	state = rb_entry(node, struct extent_state, rb_node);
803 hit_next:
804 	if (state->start > end)
805 		goto out;
806 	WARN_ON(state->end < start);
807 	last_end = state->end;
808 
809 	/* the state doesn't have the wanted bits, go ahead */
810 	if (!(state->state & bits)) {
811 		state = next_state(state);
812 		goto next;
813 	}
814 
815 	/*
816 	 *     | ---- desired range ---- |
817 	 *  | state | or
818 	 *  | ------------- state -------------- |
819 	 *
820 	 * We need to split the extent we found, and may flip
821 	 * bits on second half.
822 	 *
823 	 * If the extent we found extends past our range, we
824 	 * just split and search again.  It'll get split again
825 	 * the next time though.
826 	 *
827 	 * If the extent we found is inside our range, we clear
828 	 * the desired bit on it.
829 	 */
830 
831 	if (state->start < start) {
832 		prealloc = alloc_extent_state_atomic(prealloc);
833 		BUG_ON(!prealloc);
834 		err = split_state(tree, state, prealloc, start);
835 		if (err)
836 			extent_io_tree_panic(tree, err);
837 
838 		prealloc = NULL;
839 		if (err)
840 			goto out;
841 		if (state->end <= end) {
842 			state = clear_state_bit(tree, state, bits, wake, changeset);
843 			goto next;
844 		}
845 		goto search_again;
846 	}
847 	/*
848 	 * | ---- desired range ---- |
849 	 *                        | state |
850 	 * We need to split the extent, and clear the bit
851 	 * on the first half
852 	 */
853 	if (state->start <= end && state->end > end) {
854 		prealloc = alloc_extent_state_atomic(prealloc);
855 		BUG_ON(!prealloc);
856 		err = split_state(tree, state, prealloc, end + 1);
857 		if (err)
858 			extent_io_tree_panic(tree, err);
859 
860 		if (wake)
861 			wake_up(&state->wq);
862 
863 		clear_state_bit(tree, prealloc, bits, wake, changeset);
864 
865 		prealloc = NULL;
866 		goto out;
867 	}
868 
869 	state = clear_state_bit(tree, state, bits, wake, changeset);
870 next:
871 	if (last_end == (u64)-1)
872 		goto out;
873 	start = last_end + 1;
874 	if (start <= end && state && !need_resched())
875 		goto hit_next;
876 
877 search_again:
878 	if (start > end)
879 		goto out;
880 	spin_unlock(&tree->lock);
881 	if (gfpflags_allow_blocking(mask))
882 		cond_resched();
883 	goto again;
884 
885 out:
886 	spin_unlock(&tree->lock);
887 	if (prealloc)
888 		free_extent_state(prealloc);
889 
890 	return 0;
891 
892 }
893 
894 static void wait_on_state(struct extent_io_tree *tree,
895 			  struct extent_state *state)
896 		__releases(tree->lock)
897 		__acquires(tree->lock)
898 {
899 	DEFINE_WAIT(wait);
900 	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
901 	spin_unlock(&tree->lock);
902 	schedule();
903 	spin_lock(&tree->lock);
904 	finish_wait(&state->wq, &wait);
905 }
906 
907 /*
908  * waits for one or more bits to clear on a range in the state tree.
909  * The range [start, end] is inclusive.
910  * The tree lock is taken by this function
911  */
912 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
913 			    u32 bits)
914 {
915 	struct extent_state *state;
916 	struct rb_node *node;
917 
918 	btrfs_debug_check_extent_io_range(tree, start, end);
919 
920 	spin_lock(&tree->lock);
921 again:
922 	while (1) {
923 		/*
924 		 * this search will find all the extents that end after
925 		 * our range starts
926 		 */
927 		node = tree_search(tree, start);
928 process_node:
929 		if (!node)
930 			break;
931 
932 		state = rb_entry(node, struct extent_state, rb_node);
933 
934 		if (state->start > end)
935 			goto out;
936 
937 		if (state->state & bits) {
938 			start = state->start;
939 			refcount_inc(&state->refs);
940 			wait_on_state(tree, state);
941 			free_extent_state(state);
942 			goto again;
943 		}
944 		start = state->end + 1;
945 
946 		if (start > end)
947 			break;
948 
949 		if (!cond_resched_lock(&tree->lock)) {
950 			node = rb_next(node);
951 			goto process_node;
952 		}
953 	}
954 out:
955 	spin_unlock(&tree->lock);
956 }
957 
958 static void set_state_bits(struct extent_io_tree *tree,
959 			   struct extent_state *state,
960 			   u32 bits, struct extent_changeset *changeset)
961 {
962 	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
963 	int ret;
964 
965 	if (tree->private_data && is_data_inode(tree->private_data))
966 		btrfs_set_delalloc_extent(tree->private_data, state, bits);
967 
968 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
969 		u64 range = state->end - state->start + 1;
970 		tree->dirty_bytes += range;
971 	}
972 	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
973 	BUG_ON(ret < 0);
974 	state->state |= bits_to_set;
975 }
976 
977 static void cache_state_if_flags(struct extent_state *state,
978 				 struct extent_state **cached_ptr,
979 				 unsigned flags)
980 {
981 	if (cached_ptr && !(*cached_ptr)) {
982 		if (!flags || (state->state & flags)) {
983 			*cached_ptr = state;
984 			refcount_inc(&state->refs);
985 		}
986 	}
987 }
988 
989 static void cache_state(struct extent_state *state,
990 			struct extent_state **cached_ptr)
991 {
992 	return cache_state_if_flags(state, cached_ptr,
993 				    EXTENT_LOCKED | EXTENT_BOUNDARY);
994 }
995 
996 /*
997  * set some bits on a range in the tree.  This may require allocations or
998  * sleeping, so the gfp mask is used to indicate what is allowed.
999  *
1000  * If any of the exclusive bits are set, this will fail with -EEXIST if some
1001  * part of the range already has the desired bits set.  The start of the
1002  * existing range is returned in failed_start in this case.
1003  *
1004  * [start, end] is inclusive This takes the tree lock.
1005  */
1006 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
1007 		   u32 exclusive_bits, u64 *failed_start,
1008 		   struct extent_state **cached_state, gfp_t mask,
1009 		   struct extent_changeset *changeset)
1010 {
1011 	struct extent_state *state;
1012 	struct extent_state *prealloc = NULL;
1013 	struct rb_node *node;
1014 	struct rb_node **p;
1015 	struct rb_node *parent;
1016 	int err = 0;
1017 	u64 last_start;
1018 	u64 last_end;
1019 
1020 	btrfs_debug_check_extent_io_range(tree, start, end);
1021 	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
1022 
1023 	if (exclusive_bits)
1024 		ASSERT(failed_start);
1025 	else
1026 		ASSERT(failed_start == NULL);
1027 again:
1028 	if (!prealloc && gfpflags_allow_blocking(mask)) {
1029 		/*
1030 		 * Don't care for allocation failure here because we might end
1031 		 * up not needing the pre-allocated extent state at all, which
1032 		 * is the case if we only have in the tree extent states that
1033 		 * cover our input range and don't cover too any other range.
1034 		 * If we end up needing a new extent state we allocate it later.
1035 		 */
1036 		prealloc = alloc_extent_state(mask);
1037 	}
1038 
1039 	spin_lock(&tree->lock);
1040 	if (cached_state && *cached_state) {
1041 		state = *cached_state;
1042 		if (state->start <= start && state->end > start &&
1043 		    extent_state_in_tree(state)) {
1044 			node = &state->rb_node;
1045 			goto hit_next;
1046 		}
1047 	}
1048 	/*
1049 	 * this search will find all the extents that end after
1050 	 * our range starts.
1051 	 */
1052 	node = tree_search_for_insert(tree, start, &p, &parent);
1053 	if (!node) {
1054 		prealloc = alloc_extent_state_atomic(prealloc);
1055 		BUG_ON(!prealloc);
1056 		prealloc->start = start;
1057 		prealloc->end = end;
1058 		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
1059 		cache_state(prealloc, cached_state);
1060 		prealloc = NULL;
1061 		goto out;
1062 	}
1063 	state = rb_entry(node, struct extent_state, rb_node);
1064 hit_next:
1065 	last_start = state->start;
1066 	last_end = state->end;
1067 
1068 	/*
1069 	 * | ---- desired range ---- |
1070 	 * | state |
1071 	 *
1072 	 * Just lock what we found and keep going
1073 	 */
1074 	if (state->start == start && state->end <= end) {
1075 		if (state->state & exclusive_bits) {
1076 			*failed_start = state->start;
1077 			err = -EEXIST;
1078 			goto out;
1079 		}
1080 
1081 		set_state_bits(tree, state, bits, changeset);
1082 		cache_state(state, cached_state);
1083 		merge_state(tree, state);
1084 		if (last_end == (u64)-1)
1085 			goto out;
1086 		start = last_end + 1;
1087 		state = next_state(state);
1088 		if (start < end && state && state->start == start &&
1089 		    !need_resched())
1090 			goto hit_next;
1091 		goto search_again;
1092 	}
1093 
1094 	/*
1095 	 *     | ---- desired range ---- |
1096 	 * | state |
1097 	 *   or
1098 	 * | ------------- state -------------- |
1099 	 *
1100 	 * We need to split the extent we found, and may flip bits on
1101 	 * second half.
1102 	 *
1103 	 * If the extent we found extends past our
1104 	 * range, we just split and search again.  It'll get split
1105 	 * again the next time though.
1106 	 *
1107 	 * If the extent we found is inside our range, we set the
1108 	 * desired bit on it.
1109 	 */
1110 	if (state->start < start) {
1111 		if (state->state & exclusive_bits) {
1112 			*failed_start = start;
1113 			err = -EEXIST;
1114 			goto out;
1115 		}
1116 
1117 		/*
1118 		 * If this extent already has all the bits we want set, then
1119 		 * skip it, not necessary to split it or do anything with it.
1120 		 */
1121 		if ((state->state & bits) == bits) {
1122 			start = state->end + 1;
1123 			cache_state(state, cached_state);
1124 			goto search_again;
1125 		}
1126 
1127 		prealloc = alloc_extent_state_atomic(prealloc);
1128 		BUG_ON(!prealloc);
1129 		err = split_state(tree, state, prealloc, start);
1130 		if (err)
1131 			extent_io_tree_panic(tree, err);
1132 
1133 		prealloc = NULL;
1134 		if (err)
1135 			goto out;
1136 		if (state->end <= end) {
1137 			set_state_bits(tree, state, bits, changeset);
1138 			cache_state(state, cached_state);
1139 			merge_state(tree, state);
1140 			if (last_end == (u64)-1)
1141 				goto out;
1142 			start = last_end + 1;
1143 			state = next_state(state);
1144 			if (start < end && state && state->start == start &&
1145 			    !need_resched())
1146 				goto hit_next;
1147 		}
1148 		goto search_again;
1149 	}
1150 	/*
1151 	 * | ---- desired range ---- |
1152 	 *     | state | or               | state |
1153 	 *
1154 	 * There's a hole, we need to insert something in it and
1155 	 * ignore the extent we found.
1156 	 */
1157 	if (state->start > start) {
1158 		u64 this_end;
1159 		if (end < last_start)
1160 			this_end = end;
1161 		else
1162 			this_end = last_start - 1;
1163 
1164 		prealloc = alloc_extent_state_atomic(prealloc);
1165 		BUG_ON(!prealloc);
1166 
1167 		/*
1168 		 * Avoid to free 'prealloc' if it can be merged with
1169 		 * the later extent.
1170 		 */
1171 		prealloc->start = start;
1172 		prealloc->end = this_end;
1173 		err = insert_state(tree, prealloc, bits, changeset);
1174 		if (err)
1175 			extent_io_tree_panic(tree, err);
1176 
1177 		cache_state(prealloc, cached_state);
1178 		prealloc = NULL;
1179 		start = this_end + 1;
1180 		goto search_again;
1181 	}
1182 	/*
1183 	 * | ---- desired range ---- |
1184 	 *                        | state |
1185 	 * We need to split the extent, and set the bit
1186 	 * on the first half
1187 	 */
1188 	if (state->start <= end && state->end > end) {
1189 		if (state->state & exclusive_bits) {
1190 			*failed_start = start;
1191 			err = -EEXIST;
1192 			goto out;
1193 		}
1194 
1195 		prealloc = alloc_extent_state_atomic(prealloc);
1196 		BUG_ON(!prealloc);
1197 		err = split_state(tree, state, prealloc, end + 1);
1198 		if (err)
1199 			extent_io_tree_panic(tree, err);
1200 
1201 		set_state_bits(tree, prealloc, bits, changeset);
1202 		cache_state(prealloc, cached_state);
1203 		merge_state(tree, prealloc);
1204 		prealloc = NULL;
1205 		goto out;
1206 	}
1207 
1208 search_again:
1209 	if (start > end)
1210 		goto out;
1211 	spin_unlock(&tree->lock);
1212 	if (gfpflags_allow_blocking(mask))
1213 		cond_resched();
1214 	goto again;
1215 
1216 out:
1217 	spin_unlock(&tree->lock);
1218 	if (prealloc)
1219 		free_extent_state(prealloc);
1220 
1221 	return err;
1222 
1223 }
1224 
1225 /**
1226  * convert_extent_bit - convert all bits in a given range from one bit to
1227  * 			another
1228  * @tree:	the io tree to search
1229  * @start:	the start offset in bytes
1230  * @end:	the end offset in bytes (inclusive)
1231  * @bits:	the bits to set in this range
1232  * @clear_bits:	the bits to clear in this range
1233  * @cached_state:	state that we're going to cache
1234  *
1235  * This will go through and set bits for the given range.  If any states exist
1236  * already in this range they are set with the given bit and cleared of the
1237  * clear_bits.  This is only meant to be used by things that are mergeable, ie
1238  * converting from say DELALLOC to DIRTY.  This is not meant to be used with
1239  * boundary bits like LOCK.
1240  *
1241  * All allocations are done with GFP_NOFS.
1242  */
1243 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1244 		       u32 bits, u32 clear_bits,
1245 		       struct extent_state **cached_state)
1246 {
1247 	struct extent_state *state;
1248 	struct extent_state *prealloc = NULL;
1249 	struct rb_node *node;
1250 	struct rb_node **p;
1251 	struct rb_node *parent;
1252 	int err = 0;
1253 	u64 last_start;
1254 	u64 last_end;
1255 	bool first_iteration = true;
1256 
1257 	btrfs_debug_check_extent_io_range(tree, start, end);
1258 	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
1259 				       clear_bits);
1260 
1261 again:
1262 	if (!prealloc) {
1263 		/*
1264 		 * Best effort, don't worry if extent state allocation fails
1265 		 * here for the first iteration. We might have a cached state
1266 		 * that matches exactly the target range, in which case no
1267 		 * extent state allocations are needed. We'll only know this
1268 		 * after locking the tree.
1269 		 */
1270 		prealloc = alloc_extent_state(GFP_NOFS);
1271 		if (!prealloc && !first_iteration)
1272 			return -ENOMEM;
1273 	}
1274 
1275 	spin_lock(&tree->lock);
1276 	if (cached_state && *cached_state) {
1277 		state = *cached_state;
1278 		if (state->start <= start && state->end > start &&
1279 		    extent_state_in_tree(state)) {
1280 			node = &state->rb_node;
1281 			goto hit_next;
1282 		}
1283 	}
1284 
1285 	/*
1286 	 * this search will find all the extents that end after
1287 	 * our range starts.
1288 	 */
1289 	node = tree_search_for_insert(tree, start, &p, &parent);
1290 	if (!node) {
1291 		prealloc = alloc_extent_state_atomic(prealloc);
1292 		if (!prealloc) {
1293 			err = -ENOMEM;
1294 			goto out;
1295 		}
1296 		prealloc->start = start;
1297 		prealloc->end = end;
1298 		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
1299 		cache_state(prealloc, cached_state);
1300 		prealloc = NULL;
1301 		goto out;
1302 	}
1303 	state = rb_entry(node, struct extent_state, rb_node);
1304 hit_next:
1305 	last_start = state->start;
1306 	last_end = state->end;
1307 
1308 	/*
1309 	 * | ---- desired range ---- |
1310 	 * | state |
1311 	 *
1312 	 * Just lock what we found and keep going
1313 	 */
1314 	if (state->start == start && state->end <= end) {
1315 		set_state_bits(tree, state, bits, NULL);
1316 		cache_state(state, cached_state);
1317 		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1318 		if (last_end == (u64)-1)
1319 			goto out;
1320 		start = last_end + 1;
1321 		if (start < end && state && state->start == start &&
1322 		    !need_resched())
1323 			goto hit_next;
1324 		goto search_again;
1325 	}
1326 
1327 	/*
1328 	 *     | ---- desired range ---- |
1329 	 * | state |
1330 	 *   or
1331 	 * | ------------- state -------------- |
1332 	 *
1333 	 * We need to split the extent we found, and may flip bits on
1334 	 * second half.
1335 	 *
1336 	 * If the extent we found extends past our
1337 	 * range, we just split and search again.  It'll get split
1338 	 * again the next time though.
1339 	 *
1340 	 * If the extent we found is inside our range, we set the
1341 	 * desired bit on it.
1342 	 */
1343 	if (state->start < start) {
1344 		prealloc = alloc_extent_state_atomic(prealloc);
1345 		if (!prealloc) {
1346 			err = -ENOMEM;
1347 			goto out;
1348 		}
1349 		err = split_state(tree, state, prealloc, start);
1350 		if (err)
1351 			extent_io_tree_panic(tree, err);
1352 		prealloc = NULL;
1353 		if (err)
1354 			goto out;
1355 		if (state->end <= end) {
1356 			set_state_bits(tree, state, bits, NULL);
1357 			cache_state(state, cached_state);
1358 			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
1359 			if (last_end == (u64)-1)
1360 				goto out;
1361 			start = last_end + 1;
1362 			if (start < end && state && state->start == start &&
1363 			    !need_resched())
1364 				goto hit_next;
1365 		}
1366 		goto search_again;
1367 	}
1368 	/*
1369 	 * | ---- desired range ---- |
1370 	 *     | state | or               | state |
1371 	 *
1372 	 * There's a hole, we need to insert something in it and
1373 	 * ignore the extent we found.
1374 	 */
1375 	if (state->start > start) {
1376 		u64 this_end;
1377 		if (end < last_start)
1378 			this_end = end;
1379 		else
1380 			this_end = last_start - 1;
1381 
1382 		prealloc = alloc_extent_state_atomic(prealloc);
1383 		if (!prealloc) {
1384 			err = -ENOMEM;
1385 			goto out;
1386 		}
1387 
1388 		/*
1389 		 * Avoid to free 'prealloc' if it can be merged with
1390 		 * the later extent.
1391 		 */
1392 		prealloc->start = start;
1393 		prealloc->end = this_end;
1394 		err = insert_state(tree, prealloc, bits, NULL);
1395 		if (err)
1396 			extent_io_tree_panic(tree, err);
1397 		cache_state(prealloc, cached_state);
1398 		prealloc = NULL;
1399 		start = this_end + 1;
1400 		goto search_again;
1401 	}
1402 	/*
1403 	 * | ---- desired range ---- |
1404 	 *                        | state |
1405 	 * We need to split the extent, and set the bit
1406 	 * on the first half
1407 	 */
1408 	if (state->start <= end && state->end > end) {
1409 		prealloc = alloc_extent_state_atomic(prealloc);
1410 		if (!prealloc) {
1411 			err = -ENOMEM;
1412 			goto out;
1413 		}
1414 
1415 		err = split_state(tree, state, prealloc, end + 1);
1416 		if (err)
1417 			extent_io_tree_panic(tree, err);
1418 
1419 		set_state_bits(tree, prealloc, bits, NULL);
1420 		cache_state(prealloc, cached_state);
1421 		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
1422 		prealloc = NULL;
1423 		goto out;
1424 	}
1425 
1426 search_again:
1427 	if (start > end)
1428 		goto out;
1429 	spin_unlock(&tree->lock);
1430 	cond_resched();
1431 	first_iteration = false;
1432 	goto again;
1433 
1434 out:
1435 	spin_unlock(&tree->lock);
1436 	if (prealloc)
1437 		free_extent_state(prealloc);
1438 
1439 	return err;
1440 }
1441 
1442 /* wrappers around set/clear extent bit */
1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1444 			   u32 bits, struct extent_changeset *changeset)
1445 {
1446 	/*
1447 	 * We don't support EXTENT_LOCKED yet, as current changeset will
1448 	 * record any bits changed, so for EXTENT_LOCKED case, it will
1449 	 * either fail with -EEXIST or changeset will record the whole
1450 	 * range.
1451 	 */
1452 	BUG_ON(bits & EXTENT_LOCKED);
1453 
1454 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1455 			      changeset);
1456 }
1457 
1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1459 			   u32 bits)
1460 {
1461 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1462 			      GFP_NOWAIT, NULL);
1463 }
1464 
1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1466 		     u32 bits, int wake, int delete,
1467 		     struct extent_state **cached)
1468 {
1469 	return __clear_extent_bit(tree, start, end, bits, wake, delete,
1470 				  cached, GFP_NOFS, NULL);
1471 }
1472 
1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1474 		u32 bits, struct extent_changeset *changeset)
1475 {
1476 	/*
1477 	 * Don't support EXTENT_LOCKED case, same reason as
1478 	 * set_record_extent_bits().
1479 	 */
1480 	BUG_ON(bits & EXTENT_LOCKED);
1481 
1482 	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1483 				  changeset);
1484 }
1485 
1486 /*
1487  * either insert or lock state struct between start and end use mask to tell
1488  * us if waiting is desired.
1489  */
1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1491 		     struct extent_state **cached_state)
1492 {
1493 	int err;
1494 	u64 failed_start;
1495 
1496 	while (1) {
1497 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1498 				     EXTENT_LOCKED, &failed_start,
1499 				     cached_state, GFP_NOFS, NULL);
1500 		if (err == -EEXIST) {
1501 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1502 			start = failed_start;
1503 		} else
1504 			break;
1505 		WARN_ON(start > end);
1506 	}
1507 	return err;
1508 }
1509 
1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1511 {
1512 	int err;
1513 	u64 failed_start;
1514 
1515 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1516 			     &failed_start, NULL, GFP_NOFS, NULL);
1517 	if (err == -EEXIST) {
1518 		if (failed_start > start)
1519 			clear_extent_bit(tree, start, failed_start - 1,
1520 					 EXTENT_LOCKED, 1, 0, NULL);
1521 		return 0;
1522 	}
1523 	return 1;
1524 }
1525 
1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1527 {
1528 	unsigned long index = start >> PAGE_SHIFT;
1529 	unsigned long end_index = end >> PAGE_SHIFT;
1530 	struct page *page;
1531 
1532 	while (index <= end_index) {
1533 		page = find_get_page(inode->i_mapping, index);
1534 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1535 		clear_page_dirty_for_io(page);
1536 		put_page(page);
1537 		index++;
1538 	}
1539 }
1540 
1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1542 {
1543 	struct address_space *mapping = inode->i_mapping;
1544 	unsigned long index = start >> PAGE_SHIFT;
1545 	unsigned long end_index = end >> PAGE_SHIFT;
1546 	struct folio *folio;
1547 
1548 	while (index <= end_index) {
1549 		folio = filemap_get_folio(mapping, index);
1550 		filemap_dirty_folio(mapping, folio);
1551 		folio_account_redirty(folio);
1552 		index += folio_nr_pages(folio);
1553 		folio_put(folio);
1554 	}
1555 }
1556 
1557 /* find the first state struct with 'bits' set after 'start', and
1558  * return it.  tree->lock must be held.  NULL will returned if
1559  * nothing was found after 'start'
1560  */
1561 static struct extent_state *
1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1563 {
1564 	struct rb_node *node;
1565 	struct extent_state *state;
1566 
1567 	/*
1568 	 * this search will find all the extents that end after
1569 	 * our range starts.
1570 	 */
1571 	node = tree_search(tree, start);
1572 	if (!node)
1573 		goto out;
1574 
1575 	while (1) {
1576 		state = rb_entry(node, struct extent_state, rb_node);
1577 		if (state->end >= start && (state->state & bits))
1578 			return state;
1579 
1580 		node = rb_next(node);
1581 		if (!node)
1582 			break;
1583 	}
1584 out:
1585 	return NULL;
1586 }
1587 
1588 /*
1589  * Find the first offset in the io tree with one or more @bits set.
1590  *
1591  * Note: If there are multiple bits set in @bits, any of them will match.
1592  *
1593  * Return 0 if we find something, and update @start_ret and @end_ret.
1594  * Return 1 if we found nothing.
1595  */
1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1597 			  u64 *start_ret, u64 *end_ret, u32 bits,
1598 			  struct extent_state **cached_state)
1599 {
1600 	struct extent_state *state;
1601 	int ret = 1;
1602 
1603 	spin_lock(&tree->lock);
1604 	if (cached_state && *cached_state) {
1605 		state = *cached_state;
1606 		if (state->end == start - 1 && extent_state_in_tree(state)) {
1607 			while ((state = next_state(state)) != NULL) {
1608 				if (state->state & bits)
1609 					goto got_it;
1610 			}
1611 			free_extent_state(*cached_state);
1612 			*cached_state = NULL;
1613 			goto out;
1614 		}
1615 		free_extent_state(*cached_state);
1616 		*cached_state = NULL;
1617 	}
1618 
1619 	state = find_first_extent_bit_state(tree, start, bits);
1620 got_it:
1621 	if (state) {
1622 		cache_state_if_flags(state, cached_state, 0);
1623 		*start_ret = state->start;
1624 		*end_ret = state->end;
1625 		ret = 0;
1626 	}
1627 out:
1628 	spin_unlock(&tree->lock);
1629 	return ret;
1630 }
1631 
1632 /**
1633  * Find a contiguous area of bits
1634  *
1635  * @tree:      io tree to check
1636  * @start:     offset to start the search from
1637  * @start_ret: the first offset we found with the bits set
1638  * @end_ret:   the final contiguous range of the bits that were set
1639  * @bits:      bits to look for
1640  *
1641  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1642  * to set bits appropriately, and then merge them again.  During this time it
1643  * will drop the tree->lock, so use this helper if you want to find the actual
1644  * contiguous area for given bits.  We will search to the first bit we find, and
1645  * then walk down the tree until we find a non-contiguous area.  The area
1646  * returned will be the full contiguous area with the bits set.
1647  */
1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1649 			       u64 *start_ret, u64 *end_ret, u32 bits)
1650 {
1651 	struct extent_state *state;
1652 	int ret = 1;
1653 
1654 	spin_lock(&tree->lock);
1655 	state = find_first_extent_bit_state(tree, start, bits);
1656 	if (state) {
1657 		*start_ret = state->start;
1658 		*end_ret = state->end;
1659 		while ((state = next_state(state)) != NULL) {
1660 			if (state->start > (*end_ret + 1))
1661 				break;
1662 			*end_ret = state->end;
1663 		}
1664 		ret = 0;
1665 	}
1666 	spin_unlock(&tree->lock);
1667 	return ret;
1668 }
1669 
1670 /**
1671  * Find the first range that has @bits not set. This range could start before
1672  * @start.
1673  *
1674  * @tree:      the tree to search
1675  * @start:     offset at/after which the found extent should start
1676  * @start_ret: records the beginning of the range
1677  * @end_ret:   records the end of the range (inclusive)
1678  * @bits:      the set of bits which must be unset
1679  *
1680  * Since unallocated range is also considered one which doesn't have the bits
1681  * set it's possible that @end_ret contains -1, this happens in case the range
1682  * spans (last_range_end, end of device]. In this case it's up to the caller to
1683  * trim @end_ret to the appropriate size.
1684  */
1685 void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
1686 				 u64 *start_ret, u64 *end_ret, u32 bits)
1687 {
1688 	struct extent_state *state;
1689 	struct rb_node *node, *prev = NULL, *next;
1690 
1691 	spin_lock(&tree->lock);
1692 
1693 	/* Find first extent with bits cleared */
1694 	while (1) {
1695 		node = tree_search_prev_next(tree, start, &prev, &next);
1696 		if (!node && !next && !prev) {
1697 			/*
1698 			 * Tree is completely empty, send full range and let
1699 			 * caller deal with it
1700 			 */
1701 			*start_ret = 0;
1702 			*end_ret = -1;
1703 			goto out;
1704 		} else if (!node && !next) {
1705 			/*
1706 			 * We are past the last allocated chunk, set start at
1707 			 * the end of the last extent.
1708 			 */
1709 			state = rb_entry(prev, struct extent_state, rb_node);
1710 			*start_ret = state->end + 1;
1711 			*end_ret = -1;
1712 			goto out;
1713 		} else if (!node) {
1714 			node = next;
1715 		}
1716 		/*
1717 		 * At this point 'node' either contains 'start' or start is
1718 		 * before 'node'
1719 		 */
1720 		state = rb_entry(node, struct extent_state, rb_node);
1721 
1722 		if (in_range(start, state->start, state->end - state->start + 1)) {
1723 			if (state->state & bits) {
1724 				/*
1725 				 * |--range with bits sets--|
1726 				 *    |
1727 				 *    start
1728 				 */
1729 				start = state->end + 1;
1730 			} else {
1731 				/*
1732 				 * 'start' falls within a range that doesn't
1733 				 * have the bits set, so take its start as
1734 				 * the beginning of the desired range
1735 				 *
1736 				 * |--range with bits cleared----|
1737 				 *      |
1738 				 *      start
1739 				 */
1740 				*start_ret = state->start;
1741 				break;
1742 			}
1743 		} else {
1744 			/*
1745 			 * |---prev range---|---hole/unset---|---node range---|
1746 			 *                          |
1747 			 *                        start
1748 			 *
1749 			 *                        or
1750 			 *
1751 			 * |---hole/unset--||--first node--|
1752 			 * 0   |
1753 			 *    start
1754 			 */
1755 			if (prev) {
1756 				state = rb_entry(prev, struct extent_state,
1757 						 rb_node);
1758 				*start_ret = state->end + 1;
1759 			} else {
1760 				*start_ret = 0;
1761 			}
1762 			break;
1763 		}
1764 	}
1765 
1766 	/*
1767 	 * Find the longest stretch from start until an entry which has the
1768 	 * bits set
1769 	 */
1770 	while (1) {
1771 		state = rb_entry(node, struct extent_state, rb_node);
1772 		if (state->end >= start && !(state->state & bits)) {
1773 			*end_ret = state->end;
1774 		} else {
1775 			*end_ret = state->start - 1;
1776 			break;
1777 		}
1778 
1779 		node = rb_next(node);
1780 		if (!node)
1781 			break;
1782 	}
1783 out:
1784 	spin_unlock(&tree->lock);
1785 }
1786 
1787 /*
1788  * find a contiguous range of bytes in the file marked as delalloc, not
1789  * more than 'max_bytes'.  start and end are used to return the range,
1790  *
1791  * true is returned if we find something, false if nothing was in the tree
1792  */
1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1794 			       u64 *end, u64 max_bytes,
1795 			       struct extent_state **cached_state)
1796 {
1797 	struct rb_node *node;
1798 	struct extent_state *state;
1799 	u64 cur_start = *start;
1800 	bool found = false;
1801 	u64 total_bytes = 0;
1802 
1803 	spin_lock(&tree->lock);
1804 
1805 	/*
1806 	 * this search will find all the extents that end after
1807 	 * our range starts.
1808 	 */
1809 	node = tree_search(tree, cur_start);
1810 	if (!node) {
1811 		*end = (u64)-1;
1812 		goto out;
1813 	}
1814 
1815 	while (1) {
1816 		state = rb_entry(node, struct extent_state, rb_node);
1817 		if (found && (state->start != cur_start ||
1818 			      (state->state & EXTENT_BOUNDARY))) {
1819 			goto out;
1820 		}
1821 		if (!(state->state & EXTENT_DELALLOC)) {
1822 			if (!found)
1823 				*end = state->end;
1824 			goto out;
1825 		}
1826 		if (!found) {
1827 			*start = state->start;
1828 			*cached_state = state;
1829 			refcount_inc(&state->refs);
1830 		}
1831 		found = true;
1832 		*end = state->end;
1833 		cur_start = state->end + 1;
1834 		node = rb_next(node);
1835 		total_bytes += state->end - state->start + 1;
1836 		if (total_bytes >= max_bytes)
1837 			break;
1838 		if (!node)
1839 			break;
1840 	}
1841 out:
1842 	spin_unlock(&tree->lock);
1843 	return found;
1844 }
1845 
1846 /*
1847  * Process one page for __process_pages_contig().
1848  *
1849  * Return >0 if we hit @page == @locked_page.
1850  * Return 0 if we updated the page status.
1851  * Return -EGAIN if the we need to try again.
1852  * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
1853  */
1854 static int process_one_page(struct btrfs_fs_info *fs_info,
1855 			    struct address_space *mapping,
1856 			    struct page *page, struct page *locked_page,
1857 			    unsigned long page_ops, u64 start, u64 end)
1858 {
1859 	u32 len;
1860 
1861 	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1862 	len = end + 1 - start;
1863 
1864 	if (page_ops & PAGE_SET_ORDERED)
1865 		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
1866 	if (page_ops & PAGE_SET_ERROR)
1867 		btrfs_page_clamp_set_error(fs_info, page, start, len);
1868 	if (page_ops & PAGE_START_WRITEBACK) {
1869 		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1870 		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
1871 	}
1872 	if (page_ops & PAGE_END_WRITEBACK)
1873 		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
1874 
1875 	if (page == locked_page)
1876 		return 1;
1877 
1878 	if (page_ops & PAGE_LOCK) {
1879 		int ret;
1880 
1881 		ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1882 		if (ret)
1883 			return ret;
1884 		if (!PageDirty(page) || page->mapping != mapping) {
1885 			btrfs_page_end_writer_lock(fs_info, page, start, len);
1886 			return -EAGAIN;
1887 		}
1888 	}
1889 	if (page_ops & PAGE_UNLOCK)
1890 		btrfs_page_end_writer_lock(fs_info, page, start, len);
1891 	return 0;
1892 }
1893 
1894 static int __process_pages_contig(struct address_space *mapping,
1895 				  struct page *locked_page,
1896 				  u64 start, u64 end, unsigned long page_ops,
1897 				  u64 *processed_end)
1898 {
1899 	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
1900 	pgoff_t start_index = start >> PAGE_SHIFT;
1901 	pgoff_t end_index = end >> PAGE_SHIFT;
1902 	pgoff_t index = start_index;
1903 	unsigned long nr_pages = end_index - start_index + 1;
1904 	unsigned long pages_processed = 0;
1905 	struct page *pages[16];
1906 	int err = 0;
1907 	int i;
1908 
1909 	if (page_ops & PAGE_LOCK) {
1910 		ASSERT(page_ops == PAGE_LOCK);
1911 		ASSERT(processed_end && *processed_end == start);
1912 	}
1913 
1914 	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1915 		mapping_set_error(mapping, -EIO);
1916 
1917 	while (nr_pages > 0) {
1918 		int found_pages;
1919 
1920 		found_pages = find_get_pages_contig(mapping, index,
1921 				     min_t(unsigned long,
1922 				     nr_pages, ARRAY_SIZE(pages)), pages);
1923 		if (found_pages == 0) {
1924 			/*
1925 			 * Only if we're going to lock these pages, we can find
1926 			 * nothing at @index.
1927 			 */
1928 			ASSERT(page_ops & PAGE_LOCK);
1929 			err = -EAGAIN;
1930 			goto out;
1931 		}
1932 
1933 		for (i = 0; i < found_pages; i++) {
1934 			int process_ret;
1935 
1936 			process_ret = process_one_page(fs_info, mapping,
1937 					pages[i], locked_page, page_ops,
1938 					start, end);
1939 			if (process_ret < 0) {
1940 				for (; i < found_pages; i++)
1941 					put_page(pages[i]);
1942 				err = -EAGAIN;
1943 				goto out;
1944 			}
1945 			put_page(pages[i]);
1946 			pages_processed++;
1947 		}
1948 		nr_pages -= found_pages;
1949 		index += found_pages;
1950 		cond_resched();
1951 	}
1952 out:
1953 	if (err && processed_end) {
1954 		/*
1955 		 * Update @processed_end. I know this is awful since it has
1956 		 * two different return value patterns (inclusive vs exclusive).
1957 		 *
1958 		 * But the exclusive pattern is necessary if @start is 0, or we
1959 		 * underflow and check against processed_end won't work as
1960 		 * expected.
1961 		 */
1962 		if (pages_processed)
1963 			*processed_end = min(end,
1964 			((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1965 		else
1966 			*processed_end = start;
1967 	}
1968 	return err;
1969 }
1970 
1971 static noinline void __unlock_for_delalloc(struct inode *inode,
1972 					   struct page *locked_page,
1973 					   u64 start, u64 end)
1974 {
1975 	unsigned long index = start >> PAGE_SHIFT;
1976 	unsigned long end_index = end >> PAGE_SHIFT;
1977 
1978 	ASSERT(locked_page);
1979 	if (index == locked_page->index && end_index == index)
1980 		return;
1981 
1982 	__process_pages_contig(inode->i_mapping, locked_page, start, end,
1983 			       PAGE_UNLOCK, NULL);
1984 }
1985 
1986 static noinline int lock_delalloc_pages(struct inode *inode,
1987 					struct page *locked_page,
1988 					u64 delalloc_start,
1989 					u64 delalloc_end)
1990 {
1991 	unsigned long index = delalloc_start >> PAGE_SHIFT;
1992 	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1993 	u64 processed_end = delalloc_start;
1994 	int ret;
1995 
1996 	ASSERT(locked_page);
1997 	if (index == locked_page->index && index == end_index)
1998 		return 0;
1999 
2000 	ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
2001 				     delalloc_end, PAGE_LOCK, &processed_end);
2002 	if (ret == -EAGAIN && processed_end > delalloc_start)
2003 		__unlock_for_delalloc(inode, locked_page, delalloc_start,
2004 				      processed_end);
2005 	return ret;
2006 }
2007 
2008 /*
2009  * Find and lock a contiguous range of bytes in the file marked as delalloc, no
2010  * more than @max_bytes.
2011  *
2012  * @start:	The original start bytenr to search.
2013  *		Will store the extent range start bytenr.
2014  * @end:	The original end bytenr of the search range
2015  *		Will store the extent range end bytenr.
2016  *
2017  * Return true if we find a delalloc range which starts inside the original
2018  * range, and @start/@end will store the delalloc range start/end.
2019  *
2020  * Return false if we can't find any delalloc range which starts inside the
2021  * original range, and @start/@end will be the non-delalloc range start/end.
2022  */
2023 EXPORT_FOR_TESTS
2024 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
2025 				    struct page *locked_page, u64 *start,
2026 				    u64 *end)
2027 {
2028 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2029 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2030 	const u64 orig_start = *start;
2031 	const u64 orig_end = *end;
2032 	/* The sanity tests may not set a valid fs_info. */
2033 	u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
2034 	u64 delalloc_start;
2035 	u64 delalloc_end;
2036 	bool found;
2037 	struct extent_state *cached_state = NULL;
2038 	int ret;
2039 	int loops = 0;
2040 
2041 	/* Caller should pass a valid @end to indicate the search range end */
2042 	ASSERT(orig_end > orig_start);
2043 
2044 	/* The range should at least cover part of the page */
2045 	ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
2046 		 orig_end <= page_offset(locked_page)));
2047 again:
2048 	/* step one, find a bunch of delalloc bytes starting at start */
2049 	delalloc_start = *start;
2050 	delalloc_end = 0;
2051 	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2052 					  max_bytes, &cached_state);
2053 	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
2054 		*start = delalloc_start;
2055 
2056 		/* @delalloc_end can be -1, never go beyond @orig_end */
2057 		*end = min(delalloc_end, orig_end);
2058 		free_extent_state(cached_state);
2059 		return false;
2060 	}
2061 
2062 	/*
2063 	 * start comes from the offset of locked_page.  We have to lock
2064 	 * pages in order, so we can't process delalloc bytes before
2065 	 * locked_page
2066 	 */
2067 	if (delalloc_start < *start)
2068 		delalloc_start = *start;
2069 
2070 	/*
2071 	 * make sure to limit the number of pages we try to lock down
2072 	 */
2073 	if (delalloc_end + 1 - delalloc_start > max_bytes)
2074 		delalloc_end = delalloc_start + max_bytes - 1;
2075 
2076 	/* step two, lock all the pages after the page that has start */
2077 	ret = lock_delalloc_pages(inode, locked_page,
2078 				  delalloc_start, delalloc_end);
2079 	ASSERT(!ret || ret == -EAGAIN);
2080 	if (ret == -EAGAIN) {
2081 		/* some of the pages are gone, lets avoid looping by
2082 		 * shortening the size of the delalloc range we're searching
2083 		 */
2084 		free_extent_state(cached_state);
2085 		cached_state = NULL;
2086 		if (!loops) {
2087 			max_bytes = PAGE_SIZE;
2088 			loops = 1;
2089 			goto again;
2090 		} else {
2091 			found = false;
2092 			goto out_failed;
2093 		}
2094 	}
2095 
2096 	/* step three, lock the state bits for the whole range */
2097 	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
2098 
2099 	/* then test to make sure it is all still delalloc */
2100 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
2101 			     EXTENT_DELALLOC, 1, cached_state);
2102 	if (!ret) {
2103 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
2104 				     &cached_state);
2105 		__unlock_for_delalloc(inode, locked_page,
2106 			      delalloc_start, delalloc_end);
2107 		cond_resched();
2108 		goto again;
2109 	}
2110 	free_extent_state(cached_state);
2111 	*start = delalloc_start;
2112 	*end = delalloc_end;
2113 out_failed:
2114 	return found;
2115 }
2116 
2117 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2118 				  struct page *locked_page,
2119 				  u32 clear_bits, unsigned long page_ops)
2120 {
2121 	clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2122 
2123 	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2124 			       start, end, page_ops, NULL);
2125 }
2126 
2127 /*
2128  * count the number of bytes in the tree that have a given bit(s)
2129  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2130  * cached.  The total number found is returned.
2131  */
2132 u64 count_range_bits(struct extent_io_tree *tree,
2133 		     u64 *start, u64 search_end, u64 max_bytes,
2134 		     u32 bits, int contig)
2135 {
2136 	struct rb_node *node;
2137 	struct extent_state *state;
2138 	u64 cur_start = *start;
2139 	u64 total_bytes = 0;
2140 	u64 last = 0;
2141 	int found = 0;
2142 
2143 	if (WARN_ON(search_end <= cur_start))
2144 		return 0;
2145 
2146 	spin_lock(&tree->lock);
2147 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
2148 		total_bytes = tree->dirty_bytes;
2149 		goto out;
2150 	}
2151 	/*
2152 	 * this search will find all the extents that end after
2153 	 * our range starts.
2154 	 */
2155 	node = tree_search(tree, cur_start);
2156 	if (!node)
2157 		goto out;
2158 
2159 	while (1) {
2160 		state = rb_entry(node, struct extent_state, rb_node);
2161 		if (state->start > search_end)
2162 			break;
2163 		if (contig && found && state->start > last + 1)
2164 			break;
2165 		if (state->end >= cur_start && (state->state & bits) == bits) {
2166 			total_bytes += min(search_end, state->end) + 1 -
2167 				       max(cur_start, state->start);
2168 			if (total_bytes >= max_bytes)
2169 				break;
2170 			if (!found) {
2171 				*start = max(cur_start, state->start);
2172 				found = 1;
2173 			}
2174 			last = state->end;
2175 		} else if (contig && found) {
2176 			break;
2177 		}
2178 		node = rb_next(node);
2179 		if (!node)
2180 			break;
2181 	}
2182 out:
2183 	spin_unlock(&tree->lock);
2184 	return total_bytes;
2185 }
2186 
2187 /*
2188  * set the private field for a given byte offset in the tree.  If there isn't
2189  * an extent_state there already, this does nothing.
2190  */
2191 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2192 		      struct io_failure_record *failrec)
2193 {
2194 	struct rb_node *node;
2195 	struct extent_state *state;
2196 	int ret = 0;
2197 
2198 	spin_lock(&tree->lock);
2199 	/*
2200 	 * this search will find all the extents that end after
2201 	 * our range starts.
2202 	 */
2203 	node = tree_search(tree, start);
2204 	if (!node) {
2205 		ret = -ENOENT;
2206 		goto out;
2207 	}
2208 	state = rb_entry(node, struct extent_state, rb_node);
2209 	if (state->start != start) {
2210 		ret = -ENOENT;
2211 		goto out;
2212 	}
2213 	state->failrec = failrec;
2214 out:
2215 	spin_unlock(&tree->lock);
2216 	return ret;
2217 }
2218 
2219 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2220 {
2221 	struct rb_node *node;
2222 	struct extent_state *state;
2223 	struct io_failure_record *failrec;
2224 
2225 	spin_lock(&tree->lock);
2226 	/*
2227 	 * this search will find all the extents that end after
2228 	 * our range starts.
2229 	 */
2230 	node = tree_search(tree, start);
2231 	if (!node) {
2232 		failrec = ERR_PTR(-ENOENT);
2233 		goto out;
2234 	}
2235 	state = rb_entry(node, struct extent_state, rb_node);
2236 	if (state->start != start) {
2237 		failrec = ERR_PTR(-ENOENT);
2238 		goto out;
2239 	}
2240 
2241 	failrec = state->failrec;
2242 out:
2243 	spin_unlock(&tree->lock);
2244 	return failrec;
2245 }
2246 
2247 /*
2248  * searches a range in the state tree for a given mask.
2249  * If 'filled' == 1, this returns 1 only if every extent in the tree
2250  * has the bits set.  Otherwise, 1 is returned if any bit in the
2251  * range is found set.
2252  */
2253 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2254 		   u32 bits, int filled, struct extent_state *cached)
2255 {
2256 	struct extent_state *state = NULL;
2257 	struct rb_node *node;
2258 	int bitset = 0;
2259 
2260 	spin_lock(&tree->lock);
2261 	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2262 	    cached->end > start)
2263 		node = &cached->rb_node;
2264 	else
2265 		node = tree_search(tree, start);
2266 	while (node && start <= end) {
2267 		state = rb_entry(node, struct extent_state, rb_node);
2268 
2269 		if (filled && state->start > start) {
2270 			bitset = 0;
2271 			break;
2272 		}
2273 
2274 		if (state->start > end)
2275 			break;
2276 
2277 		if (state->state & bits) {
2278 			bitset = 1;
2279 			if (!filled)
2280 				break;
2281 		} else if (filled) {
2282 			bitset = 0;
2283 			break;
2284 		}
2285 
2286 		if (state->end == (u64)-1)
2287 			break;
2288 
2289 		start = state->end + 1;
2290 		if (start > end)
2291 			break;
2292 		node = rb_next(node);
2293 		if (!node) {
2294 			if (filled)
2295 				bitset = 0;
2296 			break;
2297 		}
2298 	}
2299 	spin_unlock(&tree->lock);
2300 	return bitset;
2301 }
2302 
2303 int free_io_failure(struct extent_io_tree *failure_tree,
2304 		    struct extent_io_tree *io_tree,
2305 		    struct io_failure_record *rec)
2306 {
2307 	int ret;
2308 	int err = 0;
2309 
2310 	set_state_failrec(failure_tree, rec->start, NULL);
2311 	ret = clear_extent_bits(failure_tree, rec->start,
2312 				rec->start + rec->len - 1,
2313 				EXTENT_LOCKED | EXTENT_DIRTY);
2314 	if (ret)
2315 		err = ret;
2316 
2317 	ret = clear_extent_bits(io_tree, rec->start,
2318 				rec->start + rec->len - 1,
2319 				EXTENT_DAMAGED);
2320 	if (ret && !err)
2321 		err = ret;
2322 
2323 	kfree(rec);
2324 	return err;
2325 }
2326 
2327 /*
2328  * this bypasses the standard btrfs submit functions deliberately, as
2329  * the standard behavior is to write all copies in a raid setup. here we only
2330  * want to write the one bad copy. so we do the mapping for ourselves and issue
2331  * submit_bio directly.
2332  * to avoid any synchronization issues, wait for the data after writing, which
2333  * actually prevents the read that triggered the error from finishing.
2334  * currently, there can be no more than two copies of every data bit. thus,
2335  * exactly one rewrite is required.
2336  */
2337 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2338 			     u64 length, u64 logical, struct page *page,
2339 			     unsigned int pg_offset, int mirror_num)
2340 {
2341 	struct btrfs_device *dev;
2342 	struct bio_vec bvec;
2343 	struct bio bio;
2344 	u64 map_length = 0;
2345 	u64 sector;
2346 	struct btrfs_io_context *bioc = NULL;
2347 	int ret = 0;
2348 
2349 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2350 	BUG_ON(!mirror_num);
2351 
2352 	if (btrfs_repair_one_zone(fs_info, logical))
2353 		return 0;
2354 
2355 	map_length = length;
2356 
2357 	/*
2358 	 * Avoid races with device replace and make sure our bioc has devices
2359 	 * associated to its stripes that don't go away while we are doing the
2360 	 * read repair operation.
2361 	 */
2362 	btrfs_bio_counter_inc_blocked(fs_info);
2363 	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2364 		/*
2365 		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2366 		 * to update all raid stripes, but here we just want to correct
2367 		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2368 		 * stripe's dev and sector.
2369 		 */
2370 		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2371 				      &map_length, &bioc, 0);
2372 		if (ret)
2373 			goto out_counter_dec;
2374 		ASSERT(bioc->mirror_num == 1);
2375 	} else {
2376 		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2377 				      &map_length, &bioc, mirror_num);
2378 		if (ret)
2379 			goto out_counter_dec;
2380 		BUG_ON(mirror_num != bioc->mirror_num);
2381 	}
2382 
2383 	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
2384 	dev = bioc->stripes[bioc->mirror_num - 1].dev;
2385 	btrfs_put_bioc(bioc);
2386 
2387 	if (!dev || !dev->bdev ||
2388 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2389 		ret = -EIO;
2390 		goto out_counter_dec;
2391 	}
2392 
2393 	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2394 	bio.bi_iter.bi_sector = sector;
2395 	__bio_add_page(&bio, page, length, pg_offset);
2396 
2397 	btrfsic_check_bio(&bio);
2398 	ret = submit_bio_wait(&bio);
2399 	if (ret) {
2400 		/* try to remap that extent elsewhere? */
2401 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2402 		goto out_bio_uninit;
2403 	}
2404 
2405 	btrfs_info_rl_in_rcu(fs_info,
2406 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
2407 				  ino, start,
2408 				  rcu_str_deref(dev->name), sector);
2409 	ret = 0;
2410 
2411 out_bio_uninit:
2412 	bio_uninit(&bio);
2413 out_counter_dec:
2414 	btrfs_bio_counter_dec(fs_info);
2415 	return ret;
2416 }
2417 
2418 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2419 {
2420 	struct btrfs_fs_info *fs_info = eb->fs_info;
2421 	u64 start = eb->start;
2422 	int i, num_pages = num_extent_pages(eb);
2423 	int ret = 0;
2424 
2425 	if (sb_rdonly(fs_info->sb))
2426 		return -EROFS;
2427 
2428 	for (i = 0; i < num_pages; i++) {
2429 		struct page *p = eb->pages[i];
2430 
2431 		ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2432 					start - page_offset(p), mirror_num);
2433 		if (ret)
2434 			break;
2435 		start += PAGE_SIZE;
2436 	}
2437 
2438 	return ret;
2439 }
2440 
2441 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
2442 {
2443 	if (cur_mirror == failrec->num_copies)
2444 		return cur_mirror + 1 - failrec->num_copies;
2445 	return cur_mirror + 1;
2446 }
2447 
2448 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
2449 {
2450 	if (cur_mirror == 1)
2451 		return failrec->num_copies;
2452 	return cur_mirror - 1;
2453 }
2454 
2455 /*
2456  * each time an IO finishes, we do a fast check in the IO failure tree
2457  * to see if we need to process or clean up an io_failure_record
2458  */
2459 int clean_io_failure(struct btrfs_fs_info *fs_info,
2460 		     struct extent_io_tree *failure_tree,
2461 		     struct extent_io_tree *io_tree, u64 start,
2462 		     struct page *page, u64 ino, unsigned int pg_offset)
2463 {
2464 	u64 private;
2465 	struct io_failure_record *failrec;
2466 	struct extent_state *state;
2467 	int mirror;
2468 	int ret;
2469 
2470 	private = 0;
2471 	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2472 			       EXTENT_DIRTY, 0);
2473 	if (!ret)
2474 		return 0;
2475 
2476 	failrec = get_state_failrec(failure_tree, start);
2477 	if (IS_ERR(failrec))
2478 		return 0;
2479 
2480 	BUG_ON(!failrec->this_mirror);
2481 
2482 	if (sb_rdonly(fs_info->sb))
2483 		goto out;
2484 
2485 	spin_lock(&io_tree->lock);
2486 	state = find_first_extent_bit_state(io_tree,
2487 					    failrec->start,
2488 					    EXTENT_LOCKED);
2489 	spin_unlock(&io_tree->lock);
2490 
2491 	if (!state || state->start > failrec->start ||
2492 	    state->end < failrec->start + failrec->len - 1)
2493 		goto out;
2494 
2495 	mirror = failrec->this_mirror;
2496 	do {
2497 		mirror = prev_mirror(failrec, mirror);
2498 		repair_io_failure(fs_info, ino, start, failrec->len,
2499 				  failrec->logical, page, pg_offset, mirror);
2500 	} while (mirror != failrec->failed_mirror);
2501 
2502 out:
2503 	free_io_failure(failure_tree, io_tree, failrec);
2504 	return 0;
2505 }
2506 
2507 /*
2508  * Can be called when
2509  * - hold extent lock
2510  * - under ordered extent
2511  * - the inode is freeing
2512  */
2513 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2514 {
2515 	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2516 	struct io_failure_record *failrec;
2517 	struct extent_state *state, *next;
2518 
2519 	if (RB_EMPTY_ROOT(&failure_tree->state))
2520 		return;
2521 
2522 	spin_lock(&failure_tree->lock);
2523 	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2524 	while (state) {
2525 		if (state->start > end)
2526 			break;
2527 
2528 		ASSERT(state->end <= end);
2529 
2530 		next = next_state(state);
2531 
2532 		failrec = state->failrec;
2533 		free_extent_state(state);
2534 		kfree(failrec);
2535 
2536 		state = next;
2537 	}
2538 	spin_unlock(&failure_tree->lock);
2539 }
2540 
2541 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2542 							     struct btrfs_bio *bbio,
2543 							     unsigned int bio_offset)
2544 {
2545 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2546 	u64 start = bbio->file_offset + bio_offset;
2547 	struct io_failure_record *failrec;
2548 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2549 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2550 	const u32 sectorsize = fs_info->sectorsize;
2551 	int ret;
2552 
2553 	failrec = get_state_failrec(failure_tree, start);
2554 	if (!IS_ERR(failrec)) {
2555 		btrfs_debug(fs_info,
2556 	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2557 			failrec->logical, failrec->start, failrec->len);
2558 		/*
2559 		 * when data can be on disk more than twice, add to failrec here
2560 		 * (e.g. with a list for failed_mirror) to make
2561 		 * clean_io_failure() clean all those errors at once.
2562 		 */
2563 		ASSERT(failrec->this_mirror == bbio->mirror_num);
2564 		ASSERT(failrec->len == fs_info->sectorsize);
2565 		return failrec;
2566 	}
2567 
2568 	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2569 	if (!failrec)
2570 		return ERR_PTR(-ENOMEM);
2571 
2572 	failrec->start = start;
2573 	failrec->len = sectorsize;
2574 	failrec->failed_mirror = bbio->mirror_num;
2575 	failrec->this_mirror = bbio->mirror_num;
2576 	failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
2577 
2578 	btrfs_debug(fs_info,
2579 		    "new io failure record logical %llu start %llu",
2580 		    failrec->logical, start);
2581 
2582 	failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
2583 	if (failrec->num_copies == 1) {
2584 		/*
2585 		 * We only have a single copy of the data, so don't bother with
2586 		 * all the retry and error correction code that follows. No
2587 		 * matter what the error is, it is very likely to persist.
2588 		 */
2589 		btrfs_debug(fs_info,
2590 			"cannot repair logical %llu num_copies %d",
2591 			failrec->logical, failrec->num_copies);
2592 		kfree(failrec);
2593 		return ERR_PTR(-EIO);
2594 	}
2595 
2596 	/* Set the bits in the private failure tree */
2597 	ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
2598 			      EXTENT_LOCKED | EXTENT_DIRTY);
2599 	if (ret >= 0) {
2600 		ret = set_state_failrec(failure_tree, start, failrec);
2601 		/* Set the bits in the inode's tree */
2602 		ret = set_extent_bits(tree, start, start + sectorsize - 1,
2603 				      EXTENT_DAMAGED);
2604 	} else if (ret < 0) {
2605 		kfree(failrec);
2606 		return ERR_PTR(ret);
2607 	}
2608 
2609 	return failrec;
2610 }
2611 
2612 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
2613 			    u32 bio_offset, struct page *page, unsigned int pgoff,
2614 			    submit_bio_hook_t *submit_bio_hook)
2615 {
2616 	u64 start = failed_bbio->file_offset + bio_offset;
2617 	struct io_failure_record *failrec;
2618 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2619 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2620 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2621 	struct bio *failed_bio = &failed_bbio->bio;
2622 	const int icsum = bio_offset >> fs_info->sectorsize_bits;
2623 	struct bio *repair_bio;
2624 	struct btrfs_bio *repair_bbio;
2625 
2626 	btrfs_debug(fs_info,
2627 		   "repair read error: read error at %llu", start);
2628 
2629 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2630 
2631 	failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
2632 	if (IS_ERR(failrec))
2633 		return PTR_ERR(failrec);
2634 
2635 	/*
2636 	 * There are two premises:
2637 	 * a) deliver good data to the caller
2638 	 * b) correct the bad sectors on disk
2639 	 *
2640 	 * Since we're only doing repair for one sector, we only need to get
2641 	 * a good copy of the failed sector and if we succeed, we have setup
2642 	 * everything for repair_io_failure to do the rest for us.
2643 	 */
2644 	failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
2645 	if (failrec->this_mirror == failrec->failed_mirror) {
2646 		btrfs_debug(fs_info,
2647 			"failed to repair num_copies %d this_mirror %d failed_mirror %d",
2648 			failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
2649 		free_io_failure(failure_tree, tree, failrec);
2650 		return -EIO;
2651 	}
2652 
2653 	repair_bio = btrfs_bio_alloc(1);
2654 	repair_bbio = btrfs_bio(repair_bio);
2655 	repair_bbio->file_offset = start;
2656 	repair_bio->bi_opf = REQ_OP_READ;
2657 	repair_bio->bi_end_io = failed_bio->bi_end_io;
2658 	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2659 	repair_bio->bi_private = failed_bio->bi_private;
2660 
2661 	if (failed_bbio->csum) {
2662 		const u32 csum_size = fs_info->csum_size;
2663 
2664 		repair_bbio->csum = repair_bbio->csum_inline;
2665 		memcpy(repair_bbio->csum,
2666 		       failed_bbio->csum + csum_size * icsum, csum_size);
2667 	}
2668 
2669 	bio_add_page(repair_bio, page, failrec->len, pgoff);
2670 	repair_bbio->iter = repair_bio->bi_iter;
2671 
2672 	btrfs_debug(btrfs_sb(inode->i_sb),
2673 		    "repair read error: submitting new read to mirror %d",
2674 		    failrec->this_mirror);
2675 
2676 	/*
2677 	 * At this point we have a bio, so any errors from submit_bio_hook()
2678 	 * will be handled by the endio on the repair_bio, so we can't return an
2679 	 * error here.
2680 	 */
2681 	submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
2682 	return BLK_STS_OK;
2683 }
2684 
2685 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2686 {
2687 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2688 
2689 	ASSERT(page_offset(page) <= start &&
2690 	       start + len <= page_offset(page) + PAGE_SIZE);
2691 
2692 	if (uptodate) {
2693 		if (fsverity_active(page->mapping->host) &&
2694 		    !PageError(page) &&
2695 		    !PageUptodate(page) &&
2696 		    start < i_size_read(page->mapping->host) &&
2697 		    !fsverity_verify_page(page)) {
2698 			btrfs_page_set_error(fs_info, page, start, len);
2699 		} else {
2700 			btrfs_page_set_uptodate(fs_info, page, start, len);
2701 		}
2702 	} else {
2703 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2704 		btrfs_page_set_error(fs_info, page, start, len);
2705 	}
2706 
2707 	if (!btrfs_is_subpage(fs_info, page))
2708 		unlock_page(page);
2709 	else
2710 		btrfs_subpage_end_reader(fs_info, page, start, len);
2711 }
2712 
2713 static void end_sector_io(struct page *page, u64 offset, bool uptodate)
2714 {
2715 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2716 	const u32 sectorsize = inode->root->fs_info->sectorsize;
2717 	struct extent_state *cached = NULL;
2718 
2719 	end_page_read(page, uptodate, offset, sectorsize);
2720 	if (uptodate)
2721 		set_extent_uptodate(&inode->io_tree, offset,
2722 				    offset + sectorsize - 1, &cached, GFP_ATOMIC);
2723 	unlock_extent_cached_atomic(&inode->io_tree, offset,
2724 				    offset + sectorsize - 1, &cached);
2725 }
2726 
2727 static void submit_data_read_repair(struct inode *inode,
2728 				    struct btrfs_bio *failed_bbio,
2729 				    u32 bio_offset, const struct bio_vec *bvec,
2730 				    unsigned int error_bitmap)
2731 {
2732 	const unsigned int pgoff = bvec->bv_offset;
2733 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2734 	struct page *page = bvec->bv_page;
2735 	const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
2736 	const u64 end = start + bvec->bv_len - 1;
2737 	const u32 sectorsize = fs_info->sectorsize;
2738 	const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2739 	int i;
2740 
2741 	BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
2742 
2743 	/* This repair is only for data */
2744 	ASSERT(is_data_inode(inode));
2745 
2746 	/* We're here because we had some read errors or csum mismatch */
2747 	ASSERT(error_bitmap);
2748 
2749 	/*
2750 	 * We only get called on buffered IO, thus page must be mapped and bio
2751 	 * must not be cloned.
2752 	 */
2753 	ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
2754 
2755 	/* Iterate through all the sectors in the range */
2756 	for (i = 0; i < nr_bits; i++) {
2757 		const unsigned int offset = i * sectorsize;
2758 		bool uptodate = false;
2759 		int ret;
2760 
2761 		if (!(error_bitmap & (1U << i))) {
2762 			/*
2763 			 * This sector has no error, just end the page read
2764 			 * and unlock the range.
2765 			 */
2766 			uptodate = true;
2767 			goto next;
2768 		}
2769 
2770 		ret = btrfs_repair_one_sector(inode, failed_bbio,
2771 				bio_offset + offset, page, pgoff + offset,
2772 				btrfs_submit_data_read_bio);
2773 		if (!ret) {
2774 			/*
2775 			 * We have submitted the read repair, the page release
2776 			 * will be handled by the endio function of the
2777 			 * submitted repair bio.
2778 			 * Thus we don't need to do any thing here.
2779 			 */
2780 			continue;
2781 		}
2782 		/*
2783 		 * Continue on failed repair, otherwise the remaining sectors
2784 		 * will not be properly unlocked.
2785 		 */
2786 next:
2787 		end_sector_io(page, start + offset, uptodate);
2788 	}
2789 }
2790 
2791 /* lots and lots of room for performance fixes in the end_bio funcs */
2792 
2793 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2794 {
2795 	struct btrfs_inode *inode;
2796 	const bool uptodate = (err == 0);
2797 	int ret = 0;
2798 
2799 	ASSERT(page && page->mapping);
2800 	inode = BTRFS_I(page->mapping->host);
2801 	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
2802 
2803 	if (!uptodate) {
2804 		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2805 		u32 len;
2806 
2807 		ASSERT(end + 1 - start <= U32_MAX);
2808 		len = end + 1 - start;
2809 
2810 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2811 		btrfs_page_set_error(fs_info, page, start, len);
2812 		ret = err < 0 ? err : -EIO;
2813 		mapping_set_error(page->mapping, ret);
2814 	}
2815 }
2816 
2817 /*
2818  * after a writepage IO is done, we need to:
2819  * clear the uptodate bits on error
2820  * clear the writeback bits in the extent tree for this IO
2821  * end_page_writeback if the page has no more pending IO
2822  *
2823  * Scheduling is not allowed, so the extent state tree is expected
2824  * to have one and only one object corresponding to this IO.
2825  */
2826 static void end_bio_extent_writepage(struct bio *bio)
2827 {
2828 	int error = blk_status_to_errno(bio->bi_status);
2829 	struct bio_vec *bvec;
2830 	u64 start;
2831 	u64 end;
2832 	struct bvec_iter_all iter_all;
2833 	bool first_bvec = true;
2834 
2835 	ASSERT(!bio_flagged(bio, BIO_CLONED));
2836 	bio_for_each_segment_all(bvec, bio, iter_all) {
2837 		struct page *page = bvec->bv_page;
2838 		struct inode *inode = page->mapping->host;
2839 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2840 		const u32 sectorsize = fs_info->sectorsize;
2841 
2842 		/* Our read/write should always be sector aligned. */
2843 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2844 			btrfs_err(fs_info,
2845 		"partial page write in btrfs with offset %u and length %u",
2846 				  bvec->bv_offset, bvec->bv_len);
2847 		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2848 			btrfs_info(fs_info,
2849 		"incomplete page write with offset %u and length %u",
2850 				   bvec->bv_offset, bvec->bv_len);
2851 
2852 		start = page_offset(page) + bvec->bv_offset;
2853 		end = start + bvec->bv_len - 1;
2854 
2855 		if (first_bvec) {
2856 			btrfs_record_physical_zoned(inode, start, bio);
2857 			first_bvec = false;
2858 		}
2859 
2860 		end_extent_writepage(page, error, start, end);
2861 
2862 		btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2863 	}
2864 
2865 	bio_put(bio);
2866 }
2867 
2868 /*
2869  * Record previously processed extent range
2870  *
2871  * For endio_readpage_release_extent() to handle a full extent range, reducing
2872  * the extent io operations.
2873  */
2874 struct processed_extent {
2875 	struct btrfs_inode *inode;
2876 	/* Start of the range in @inode */
2877 	u64 start;
2878 	/* End of the range in @inode */
2879 	u64 end;
2880 	bool uptodate;
2881 };
2882 
2883 /*
2884  * Try to release processed extent range
2885  *
2886  * May not release the extent range right now if the current range is
2887  * contiguous to processed extent.
2888  *
2889  * Will release processed extent when any of @inode, @uptodate, the range is
2890  * no longer contiguous to the processed range.
2891  *
2892  * Passing @inode == NULL will force processed extent to be released.
2893  */
2894 static void endio_readpage_release_extent(struct processed_extent *processed,
2895 			      struct btrfs_inode *inode, u64 start, u64 end,
2896 			      bool uptodate)
2897 {
2898 	struct extent_state *cached = NULL;
2899 	struct extent_io_tree *tree;
2900 
2901 	/* The first extent, initialize @processed */
2902 	if (!processed->inode)
2903 		goto update;
2904 
2905 	/*
2906 	 * Contiguous to processed extent, just uptodate the end.
2907 	 *
2908 	 * Several things to notice:
2909 	 *
2910 	 * - bio can be merged as long as on-disk bytenr is contiguous
2911 	 *   This means we can have page belonging to other inodes, thus need to
2912 	 *   check if the inode still matches.
2913 	 * - bvec can contain range beyond current page for multi-page bvec
2914 	 *   Thus we need to do processed->end + 1 >= start check
2915 	 */
2916 	if (processed->inode == inode && processed->uptodate == uptodate &&
2917 	    processed->end + 1 >= start && end >= processed->end) {
2918 		processed->end = end;
2919 		return;
2920 	}
2921 
2922 	tree = &processed->inode->io_tree;
2923 	/*
2924 	 * Now we don't have range contiguous to the processed range, release
2925 	 * the processed range now.
2926 	 */
2927 	if (processed->uptodate && tree->track_uptodate)
2928 		set_extent_uptodate(tree, processed->start, processed->end,
2929 				    &cached, GFP_ATOMIC);
2930 	unlock_extent_cached_atomic(tree, processed->start, processed->end,
2931 				    &cached);
2932 
2933 update:
2934 	/* Update processed to current range */
2935 	processed->inode = inode;
2936 	processed->start = start;
2937 	processed->end = end;
2938 	processed->uptodate = uptodate;
2939 }
2940 
2941 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2942 {
2943 	ASSERT(PageLocked(page));
2944 	if (!btrfs_is_subpage(fs_info, page))
2945 		return;
2946 
2947 	ASSERT(PagePrivate(page));
2948 	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2949 }
2950 
2951 /*
2952  * Find extent buffer for a givne bytenr.
2953  *
2954  * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2955  * in endio context.
2956  */
2957 static struct extent_buffer *find_extent_buffer_readpage(
2958 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2959 {
2960 	struct extent_buffer *eb;
2961 
2962 	/*
2963 	 * For regular sectorsize, we can use page->private to grab extent
2964 	 * buffer
2965 	 */
2966 	if (fs_info->nodesize >= PAGE_SIZE) {
2967 		ASSERT(PagePrivate(page) && page->private);
2968 		return (struct extent_buffer *)page->private;
2969 	}
2970 
2971 	/* For subpage case, we need to lookup buffer radix tree */
2972 	rcu_read_lock();
2973 	eb = radix_tree_lookup(&fs_info->buffer_radix,
2974 			       bytenr >> fs_info->sectorsize_bits);
2975 	rcu_read_unlock();
2976 	ASSERT(eb);
2977 	return eb;
2978 }
2979 
2980 /*
2981  * after a readpage IO is done, we need to:
2982  * clear the uptodate bits on error
2983  * set the uptodate bits if things worked
2984  * set the page up to date if all extents in the tree are uptodate
2985  * clear the lock bit in the extent tree
2986  * unlock the page if there are no other extents locked for it
2987  *
2988  * Scheduling is not allowed, so the extent state tree is expected
2989  * to have one and only one object corresponding to this IO.
2990  */
2991 static void end_bio_extent_readpage(struct bio *bio)
2992 {
2993 	struct bio_vec *bvec;
2994 	struct btrfs_bio *bbio = btrfs_bio(bio);
2995 	struct extent_io_tree *tree, *failure_tree;
2996 	struct processed_extent processed = { 0 };
2997 	/*
2998 	 * The offset to the beginning of a bio, since one bio can never be
2999 	 * larger than UINT_MAX, u32 here is enough.
3000 	 */
3001 	u32 bio_offset = 0;
3002 	int mirror;
3003 	struct bvec_iter_all iter_all;
3004 
3005 	ASSERT(!bio_flagged(bio, BIO_CLONED));
3006 	bio_for_each_segment_all(bvec, bio, iter_all) {
3007 		bool uptodate = !bio->bi_status;
3008 		struct page *page = bvec->bv_page;
3009 		struct inode *inode = page->mapping->host;
3010 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3011 		const u32 sectorsize = fs_info->sectorsize;
3012 		unsigned int error_bitmap = (unsigned int)-1;
3013 		bool repair = false;
3014 		u64 start;
3015 		u64 end;
3016 		u32 len;
3017 
3018 		btrfs_debug(fs_info,
3019 			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
3020 			bio->bi_iter.bi_sector, bio->bi_status,
3021 			bbio->mirror_num);
3022 		tree = &BTRFS_I(inode)->io_tree;
3023 		failure_tree = &BTRFS_I(inode)->io_failure_tree;
3024 
3025 		/*
3026 		 * We always issue full-sector reads, but if some block in a
3027 		 * page fails to read, blk_update_request() will advance
3028 		 * bv_offset and adjust bv_len to compensate.  Print a warning
3029 		 * for unaligned offsets, and an error if they don't add up to
3030 		 * a full sector.
3031 		 */
3032 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3033 			btrfs_err(fs_info,
3034 		"partial page read in btrfs with offset %u and length %u",
3035 				  bvec->bv_offset, bvec->bv_len);
3036 		else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3037 				     sectorsize))
3038 			btrfs_info(fs_info,
3039 		"incomplete page read with offset %u and length %u",
3040 				   bvec->bv_offset, bvec->bv_len);
3041 
3042 		start = page_offset(page) + bvec->bv_offset;
3043 		end = start + bvec->bv_len - 1;
3044 		len = bvec->bv_len;
3045 
3046 		mirror = bbio->mirror_num;
3047 		if (likely(uptodate)) {
3048 			if (is_data_inode(inode)) {
3049 				error_bitmap = btrfs_verify_data_csum(bbio,
3050 						bio_offset, page, start, end);
3051 				if (error_bitmap)
3052 					uptodate = false;
3053 			} else {
3054 				if (btrfs_validate_metadata_buffer(bbio,
3055 						page, start, end, mirror))
3056 					uptodate = false;
3057 			}
3058 		}
3059 
3060 		if (likely(uptodate)) {
3061 			loff_t i_size = i_size_read(inode);
3062 			pgoff_t end_index = i_size >> PAGE_SHIFT;
3063 
3064 			clean_io_failure(BTRFS_I(inode)->root->fs_info,
3065 					 failure_tree, tree, start, page,
3066 					 btrfs_ino(BTRFS_I(inode)), 0);
3067 
3068 			/*
3069 			 * Zero out the remaining part if this range straddles
3070 			 * i_size.
3071 			 *
3072 			 * Here we should only zero the range inside the bvec,
3073 			 * not touch anything else.
3074 			 *
3075 			 * NOTE: i_size is exclusive while end is inclusive.
3076 			 */
3077 			if (page->index == end_index && i_size <= end) {
3078 				u32 zero_start = max(offset_in_page(i_size),
3079 						     offset_in_page(start));
3080 
3081 				zero_user_segment(page, zero_start,
3082 						  offset_in_page(end) + 1);
3083 			}
3084 		} else if (is_data_inode(inode)) {
3085 			/*
3086 			 * Only try to repair bios that actually made it to a
3087 			 * device.  If the bio failed to be submitted mirror
3088 			 * is 0 and we need to fail it without retrying.
3089 			 *
3090 			 * This also includes the high level bios for compressed
3091 			 * extents - these never make it to a device and repair
3092 			 * is already handled on the lower compressed bio.
3093 			 */
3094 			if (mirror > 0)
3095 				repair = true;
3096 		} else {
3097 			struct extent_buffer *eb;
3098 
3099 			eb = find_extent_buffer_readpage(fs_info, page, start);
3100 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3101 			eb->read_mirror = mirror;
3102 			atomic_dec(&eb->io_pages);
3103 		}
3104 
3105 		if (repair) {
3106 			/*
3107 			 * submit_data_read_repair() will handle all the good
3108 			 * and bad sectors, we just continue to the next bvec.
3109 			 */
3110 			submit_data_read_repair(inode, bbio, bio_offset, bvec,
3111 						error_bitmap);
3112 		} else {
3113 			/* Update page status and unlock */
3114 			end_page_read(page, uptodate, start, len);
3115 			endio_readpage_release_extent(&processed, BTRFS_I(inode),
3116 					start, end, PageUptodate(page));
3117 		}
3118 
3119 		ASSERT(bio_offset + len > bio_offset);
3120 		bio_offset += len;
3121 
3122 	}
3123 	/* Release the last extent */
3124 	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
3125 	btrfs_bio_free_csum(bbio);
3126 	bio_put(bio);
3127 }
3128 
3129 /**
3130  * Populate every free slot in a provided array with pages.
3131  *
3132  * @nr_pages:   number of pages to allocate
3133  * @page_array: the array to fill with pages; any existing non-null entries in
3134  * 		the array will be skipped
3135  *
3136  * Return: 0        if all pages were able to be allocated;
3137  *         -ENOMEM  otherwise, and the caller is responsible for freeing all
3138  *                  non-null page pointers in the array.
3139  */
3140 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3141 {
3142 	unsigned int allocated;
3143 
3144 	for (allocated = 0; allocated < nr_pages;) {
3145 		unsigned int last = allocated;
3146 
3147 		allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3148 
3149 		if (allocated == nr_pages)
3150 			return 0;
3151 
3152 		/*
3153 		 * During this iteration, no page could be allocated, even
3154 		 * though alloc_pages_bulk_array() falls back to alloc_page()
3155 		 * if  it could not bulk-allocate. So we must be out of memory.
3156 		 */
3157 		if (allocated == last)
3158 			return -ENOMEM;
3159 
3160 		memalloc_retry_wait(GFP_NOFS);
3161 	}
3162 	return 0;
3163 }
3164 
3165 /*
3166  * Initialize the members up to but not including 'bio'. Use after allocating a
3167  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3168  * 'bio' because use of __GFP_ZERO is not supported.
3169  */
3170 static inline void btrfs_bio_init(struct btrfs_bio *bbio)
3171 {
3172 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
3173 }
3174 
3175 /*
3176  * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
3177  *
3178  * The bio allocation is backed by bioset and does not fail.
3179  */
3180 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
3181 {
3182 	struct bio *bio;
3183 
3184 	ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
3185 	bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
3186 	btrfs_bio_init(btrfs_bio(bio));
3187 	return bio;
3188 }
3189 
3190 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
3191 {
3192 	struct bio *bio;
3193 	struct btrfs_bio *bbio;
3194 
3195 	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3196 
3197 	/* this will never fail when it's backed by a bioset */
3198 	bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
3199 	ASSERT(bio);
3200 
3201 	bbio = btrfs_bio(bio);
3202 	btrfs_bio_init(bbio);
3203 
3204 	bio_trim(bio, offset >> 9, size >> 9);
3205 	bbio->iter = bio->bi_iter;
3206 	return bio;
3207 }
3208 
3209 /**
3210  * Attempt to add a page to bio
3211  *
3212  * @bio_ctrl:	record both the bio, and its bio_flags
3213  * @page:	page to add to the bio
3214  * @disk_bytenr:  offset of the new bio or to check whether we are adding
3215  *                a contiguous page to the previous one
3216  * @size:	portion of page that we want to write
3217  * @pg_offset:	starting offset in the page
3218  * @compress_type:   compression type of the current bio to see if we can merge them
3219  *
3220  * Attempt to add a page to bio considering stripe alignment etc.
3221  *
3222  * Return >= 0 for the number of bytes added to the bio.
3223  * Can return 0 if the current bio is already at stripe/zone boundary.
3224  * Return <0 for error.
3225  */
3226 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3227 			      struct page *page,
3228 			      u64 disk_bytenr, unsigned int size,
3229 			      unsigned int pg_offset,
3230 			      enum btrfs_compression_type compress_type)
3231 {
3232 	struct bio *bio = bio_ctrl->bio;
3233 	u32 bio_size = bio->bi_iter.bi_size;
3234 	u32 real_size;
3235 	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3236 	bool contig;
3237 	int ret;
3238 
3239 	ASSERT(bio);
3240 	/* The limit should be calculated when bio_ctrl->bio is allocated */
3241 	ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3242 	if (bio_ctrl->compress_type != compress_type)
3243 		return 0;
3244 
3245 	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE)
3246 		contig = bio->bi_iter.bi_sector == sector;
3247 	else
3248 		contig = bio_end_sector(bio) == sector;
3249 	if (!contig)
3250 		return 0;
3251 
3252 	real_size = min(bio_ctrl->len_to_oe_boundary,
3253 			bio_ctrl->len_to_stripe_boundary) - bio_size;
3254 	real_size = min(real_size, size);
3255 
3256 	/*
3257 	 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3258 	 * bio will still execute its endio function on the page!
3259 	 */
3260 	if (real_size == 0)
3261 		return 0;
3262 
3263 	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
3264 		ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
3265 	else
3266 		ret = bio_add_page(bio, page, real_size, pg_offset);
3267 
3268 	return ret;
3269 }
3270 
3271 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3272 			       struct btrfs_inode *inode, u64 file_offset)
3273 {
3274 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3275 	struct btrfs_io_geometry geom;
3276 	struct btrfs_ordered_extent *ordered;
3277 	struct extent_map *em;
3278 	u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3279 	int ret;
3280 
3281 	/*
3282 	 * Pages for compressed extent are never submitted to disk directly,
3283 	 * thus it has no real boundary, just set them to U32_MAX.
3284 	 *
3285 	 * The split happens for real compressed bio, which happens in
3286 	 * btrfs_submit_compressed_read/write().
3287 	 */
3288 	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
3289 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3290 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3291 		return 0;
3292 	}
3293 	em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3294 	if (IS_ERR(em))
3295 		return PTR_ERR(em);
3296 	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3297 				    logical, &geom);
3298 	free_extent_map(em);
3299 	if (ret < 0) {
3300 		return ret;
3301 	}
3302 	if (geom.len > U32_MAX)
3303 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3304 	else
3305 		bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3306 
3307 	if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3308 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3309 		return 0;
3310 	}
3311 
3312 	/* Ordered extent not yet created, so we're good */
3313 	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
3314 	if (!ordered) {
3315 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3316 		return 0;
3317 	}
3318 
3319 	bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3320 		ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3321 	btrfs_put_ordered_extent(ordered);
3322 	return 0;
3323 }
3324 
3325 static int alloc_new_bio(struct btrfs_inode *inode,
3326 			 struct btrfs_bio_ctrl *bio_ctrl,
3327 			 struct writeback_control *wbc,
3328 			 blk_opf_t opf,
3329 			 bio_end_io_t end_io_func,
3330 			 u64 disk_bytenr, u32 offset, u64 file_offset,
3331 			 enum btrfs_compression_type compress_type)
3332 {
3333 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3334 	struct bio *bio;
3335 	int ret;
3336 
3337 	bio = btrfs_bio_alloc(BIO_MAX_VECS);
3338 	/*
3339 	 * For compressed page range, its disk_bytenr is always @disk_bytenr
3340 	 * passed in, no matter if we have added any range into previous bio.
3341 	 */
3342 	if (compress_type != BTRFS_COMPRESS_NONE)
3343 		bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
3344 	else
3345 		bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
3346 	bio_ctrl->bio = bio;
3347 	bio_ctrl->compress_type = compress_type;
3348 	bio->bi_end_io = end_io_func;
3349 	bio->bi_opf = opf;
3350 	ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3351 	if (ret < 0)
3352 		goto error;
3353 
3354 	if (wbc) {
3355 		/*
3356 		 * For Zone append we need the correct block_device that we are
3357 		 * going to write to set in the bio to be able to respect the
3358 		 * hardware limitation.  Look it up here:
3359 		 */
3360 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3361 			struct btrfs_device *dev;
3362 
3363 			dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3364 						     fs_info->sectorsize);
3365 			if (IS_ERR(dev)) {
3366 				ret = PTR_ERR(dev);
3367 				goto error;
3368 			}
3369 
3370 			bio_set_dev(bio, dev->bdev);
3371 		} else {
3372 			/*
3373 			 * Otherwise pick the last added device to support
3374 			 * cgroup writeback.  For multi-device file systems this
3375 			 * means blk-cgroup policies have to always be set on the
3376 			 * last added/replaced device.  This is a bit odd but has
3377 			 * been like that for a long time.
3378 			 */
3379 			bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
3380 		}
3381 		wbc_init_bio(wbc, bio);
3382 	} else {
3383 		ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
3384 	}
3385 	return 0;
3386 error:
3387 	bio_ctrl->bio = NULL;
3388 	bio->bi_status = errno_to_blk_status(ret);
3389 	bio_endio(bio);
3390 	return ret;
3391 }
3392 
3393 /*
3394  * @opf:	bio REQ_OP_* and REQ_* flags as one value
3395  * @wbc:	optional writeback control for io accounting
3396  * @page:	page to add to the bio
3397  * @disk_bytenr: logical bytenr where the write will be
3398  * @size:	portion of page that we want to write to
3399  * @pg_offset:	offset of the new bio or to check whether we are adding
3400  *              a contiguous page to the previous one
3401  * @bio_ret:	must be valid pointer, newly allocated bio will be stored there
3402  * @end_io_func:     end_io callback for new bio
3403  * @mirror_num:	     desired mirror to read/write
3404  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3405  * @compress_type:   compress type for current bio
3406  */
3407 static int submit_extent_page(blk_opf_t opf,
3408 			      struct writeback_control *wbc,
3409 			      struct btrfs_bio_ctrl *bio_ctrl,
3410 			      struct page *page, u64 disk_bytenr,
3411 			      size_t size, unsigned long pg_offset,
3412 			      bio_end_io_t end_io_func,
3413 			      enum btrfs_compression_type compress_type,
3414 			      bool force_bio_submit)
3415 {
3416 	int ret = 0;
3417 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3418 	unsigned int cur = pg_offset;
3419 
3420 	ASSERT(bio_ctrl);
3421 
3422 	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3423 	       pg_offset + size <= PAGE_SIZE);
3424 	if (force_bio_submit)
3425 		submit_one_bio(bio_ctrl);
3426 
3427 	while (cur < pg_offset + size) {
3428 		u32 offset = cur - pg_offset;
3429 		int added;
3430 
3431 		/* Allocate new bio if needed */
3432 		if (!bio_ctrl->bio) {
3433 			ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3434 					    end_io_func, disk_bytenr, offset,
3435 					    page_offset(page) + cur,
3436 					    compress_type);
3437 			if (ret < 0)
3438 				return ret;
3439 		}
3440 		/*
3441 		 * We must go through btrfs_bio_add_page() to ensure each
3442 		 * page range won't cross various boundaries.
3443 		 */
3444 		if (compress_type != BTRFS_COMPRESS_NONE)
3445 			added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3446 					size - offset, pg_offset + offset,
3447 					compress_type);
3448 		else
3449 			added = btrfs_bio_add_page(bio_ctrl, page,
3450 					disk_bytenr + offset, size - offset,
3451 					pg_offset + offset, compress_type);
3452 
3453 		/* Metadata page range should never be split */
3454 		if (!is_data_inode(&inode->vfs_inode))
3455 			ASSERT(added == 0 || added == size - offset);
3456 
3457 		/* At least we added some page, update the account */
3458 		if (wbc && added)
3459 			wbc_account_cgroup_owner(wbc, page, added);
3460 
3461 		/* We have reached boundary, submit right now */
3462 		if (added < size - offset) {
3463 			/* The bio should contain some page(s) */
3464 			ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3465 			submit_one_bio(bio_ctrl);
3466 		}
3467 		cur += added;
3468 	}
3469 	return 0;
3470 }
3471 
3472 static int attach_extent_buffer_page(struct extent_buffer *eb,
3473 				     struct page *page,
3474 				     struct btrfs_subpage *prealloc)
3475 {
3476 	struct btrfs_fs_info *fs_info = eb->fs_info;
3477 	int ret = 0;
3478 
3479 	/*
3480 	 * If the page is mapped to btree inode, we should hold the private
3481 	 * lock to prevent race.
3482 	 * For cloned or dummy extent buffers, their pages are not mapped and
3483 	 * will not race with any other ebs.
3484 	 */
3485 	if (page->mapping)
3486 		lockdep_assert_held(&page->mapping->private_lock);
3487 
3488 	if (fs_info->nodesize >= PAGE_SIZE) {
3489 		if (!PagePrivate(page))
3490 			attach_page_private(page, eb);
3491 		else
3492 			WARN_ON(page->private != (unsigned long)eb);
3493 		return 0;
3494 	}
3495 
3496 	/* Already mapped, just free prealloc */
3497 	if (PagePrivate(page)) {
3498 		btrfs_free_subpage(prealloc);
3499 		return 0;
3500 	}
3501 
3502 	if (prealloc)
3503 		/* Has preallocated memory for subpage */
3504 		attach_page_private(page, prealloc);
3505 	else
3506 		/* Do new allocation to attach subpage */
3507 		ret = btrfs_attach_subpage(fs_info, page,
3508 					   BTRFS_SUBPAGE_METADATA);
3509 	return ret;
3510 }
3511 
3512 int set_page_extent_mapped(struct page *page)
3513 {
3514 	struct btrfs_fs_info *fs_info;
3515 
3516 	ASSERT(page->mapping);
3517 
3518 	if (PagePrivate(page))
3519 		return 0;
3520 
3521 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3522 
3523 	if (btrfs_is_subpage(fs_info, page))
3524 		return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3525 
3526 	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3527 	return 0;
3528 }
3529 
3530 void clear_page_extent_mapped(struct page *page)
3531 {
3532 	struct btrfs_fs_info *fs_info;
3533 
3534 	ASSERT(page->mapping);
3535 
3536 	if (!PagePrivate(page))
3537 		return;
3538 
3539 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3540 	if (btrfs_is_subpage(fs_info, page))
3541 		return btrfs_detach_subpage(fs_info, page);
3542 
3543 	detach_page_private(page);
3544 }
3545 
3546 static struct extent_map *
3547 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3548 		 u64 start, u64 len, struct extent_map **em_cached)
3549 {
3550 	struct extent_map *em;
3551 
3552 	if (em_cached && *em_cached) {
3553 		em = *em_cached;
3554 		if (extent_map_in_tree(em) && start >= em->start &&
3555 		    start < extent_map_end(em)) {
3556 			refcount_inc(&em->refs);
3557 			return em;
3558 		}
3559 
3560 		free_extent_map(em);
3561 		*em_cached = NULL;
3562 	}
3563 
3564 	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3565 	if (em_cached && !IS_ERR(em)) {
3566 		BUG_ON(*em_cached);
3567 		refcount_inc(&em->refs);
3568 		*em_cached = em;
3569 	}
3570 	return em;
3571 }
3572 /*
3573  * basic readpage implementation.  Locked extent state structs are inserted
3574  * into the tree that are removed when the IO is done (by the end_io
3575  * handlers)
3576  * XXX JDM: This needs looking at to ensure proper page locking
3577  * return 0 on success, otherwise return error
3578  */
3579 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3580 		      struct btrfs_bio_ctrl *bio_ctrl,
3581 		      blk_opf_t read_flags, u64 *prev_em_start)
3582 {
3583 	struct inode *inode = page->mapping->host;
3584 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3585 	u64 start = page_offset(page);
3586 	const u64 end = start + PAGE_SIZE - 1;
3587 	u64 cur = start;
3588 	u64 extent_offset;
3589 	u64 last_byte = i_size_read(inode);
3590 	u64 block_start;
3591 	u64 cur_end;
3592 	struct extent_map *em;
3593 	int ret = 0;
3594 	size_t pg_offset = 0;
3595 	size_t iosize;
3596 	size_t blocksize = inode->i_sb->s_blocksize;
3597 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3598 
3599 	ret = set_page_extent_mapped(page);
3600 	if (ret < 0) {
3601 		unlock_extent(tree, start, end);
3602 		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3603 		unlock_page(page);
3604 		goto out;
3605 	}
3606 
3607 	if (page->index == last_byte >> PAGE_SHIFT) {
3608 		size_t zero_offset = offset_in_page(last_byte);
3609 
3610 		if (zero_offset) {
3611 			iosize = PAGE_SIZE - zero_offset;
3612 			memzero_page(page, zero_offset, iosize);
3613 		}
3614 	}
3615 	begin_page_read(fs_info, page);
3616 	while (cur <= end) {
3617 		unsigned long this_bio_flag = 0;
3618 		bool force_bio_submit = false;
3619 		u64 disk_bytenr;
3620 
3621 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
3622 		if (cur >= last_byte) {
3623 			struct extent_state *cached = NULL;
3624 
3625 			iosize = PAGE_SIZE - pg_offset;
3626 			memzero_page(page, pg_offset, iosize);
3627 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3628 					    &cached, GFP_NOFS);
3629 			unlock_extent_cached(tree, cur,
3630 					     cur + iosize - 1, &cached);
3631 			end_page_read(page, true, cur, iosize);
3632 			break;
3633 		}
3634 		em = __get_extent_map(inode, page, pg_offset, cur,
3635 				      end - cur + 1, em_cached);
3636 		if (IS_ERR(em)) {
3637 			unlock_extent(tree, cur, end);
3638 			end_page_read(page, false, cur, end + 1 - cur);
3639 			ret = PTR_ERR(em);
3640 			break;
3641 		}
3642 		extent_offset = cur - em->start;
3643 		BUG_ON(extent_map_end(em) <= cur);
3644 		BUG_ON(end < cur);
3645 
3646 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3647 			this_bio_flag = em->compress_type;
3648 
3649 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
3650 		cur_end = min(extent_map_end(em) - 1, end);
3651 		iosize = ALIGN(iosize, blocksize);
3652 		if (this_bio_flag != BTRFS_COMPRESS_NONE)
3653 			disk_bytenr = em->block_start;
3654 		else
3655 			disk_bytenr = em->block_start + extent_offset;
3656 		block_start = em->block_start;
3657 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3658 			block_start = EXTENT_MAP_HOLE;
3659 
3660 		/*
3661 		 * If we have a file range that points to a compressed extent
3662 		 * and it's followed by a consecutive file range that points
3663 		 * to the same compressed extent (possibly with a different
3664 		 * offset and/or length, so it either points to the whole extent
3665 		 * or only part of it), we must make sure we do not submit a
3666 		 * single bio to populate the pages for the 2 ranges because
3667 		 * this makes the compressed extent read zero out the pages
3668 		 * belonging to the 2nd range. Imagine the following scenario:
3669 		 *
3670 		 *  File layout
3671 		 *  [0 - 8K]                     [8K - 24K]
3672 		 *    |                               |
3673 		 *    |                               |
3674 		 * points to extent X,         points to extent X,
3675 		 * offset 4K, length of 8K     offset 0, length 16K
3676 		 *
3677 		 * [extent X, compressed length = 4K uncompressed length = 16K]
3678 		 *
3679 		 * If the bio to read the compressed extent covers both ranges,
3680 		 * it will decompress extent X into the pages belonging to the
3681 		 * first range and then it will stop, zeroing out the remaining
3682 		 * pages that belong to the other range that points to extent X.
3683 		 * So here we make sure we submit 2 bios, one for the first
3684 		 * range and another one for the third range. Both will target
3685 		 * the same physical extent from disk, but we can't currently
3686 		 * make the compressed bio endio callback populate the pages
3687 		 * for both ranges because each compressed bio is tightly
3688 		 * coupled with a single extent map, and each range can have
3689 		 * an extent map with a different offset value relative to the
3690 		 * uncompressed data of our extent and different lengths. This
3691 		 * is a corner case so we prioritize correctness over
3692 		 * non-optimal behavior (submitting 2 bios for the same extent).
3693 		 */
3694 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3695 		    prev_em_start && *prev_em_start != (u64)-1 &&
3696 		    *prev_em_start != em->start)
3697 			force_bio_submit = true;
3698 
3699 		if (prev_em_start)
3700 			*prev_em_start = em->start;
3701 
3702 		free_extent_map(em);
3703 		em = NULL;
3704 
3705 		/* we've found a hole, just zero and go on */
3706 		if (block_start == EXTENT_MAP_HOLE) {
3707 			struct extent_state *cached = NULL;
3708 
3709 			memzero_page(page, pg_offset, iosize);
3710 
3711 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3712 					    &cached, GFP_NOFS);
3713 			unlock_extent_cached(tree, cur,
3714 					     cur + iosize - 1, &cached);
3715 			end_page_read(page, true, cur, iosize);
3716 			cur = cur + iosize;
3717 			pg_offset += iosize;
3718 			continue;
3719 		}
3720 		/* the get_extent function already copied into the page */
3721 		if (test_range_bit(tree, cur, cur_end,
3722 				   EXTENT_UPTODATE, 1, NULL)) {
3723 			unlock_extent(tree, cur, cur + iosize - 1);
3724 			end_page_read(page, true, cur, iosize);
3725 			cur = cur + iosize;
3726 			pg_offset += iosize;
3727 			continue;
3728 		}
3729 		/* we have an inline extent but it didn't get marked up
3730 		 * to date.  Error out
3731 		 */
3732 		if (block_start == EXTENT_MAP_INLINE) {
3733 			unlock_extent(tree, cur, cur + iosize - 1);
3734 			end_page_read(page, false, cur, iosize);
3735 			cur = cur + iosize;
3736 			pg_offset += iosize;
3737 			continue;
3738 		}
3739 
3740 		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3741 					 bio_ctrl, page, disk_bytenr, iosize,
3742 					 pg_offset, end_bio_extent_readpage,
3743 					 this_bio_flag, force_bio_submit);
3744 		if (ret) {
3745 			/*
3746 			 * We have to unlock the remaining range, or the page
3747 			 * will never be unlocked.
3748 			 */
3749 			unlock_extent(tree, cur, end);
3750 			end_page_read(page, false, cur, end + 1 - cur);
3751 			goto out;
3752 		}
3753 		cur = cur + iosize;
3754 		pg_offset += iosize;
3755 	}
3756 out:
3757 	return ret;
3758 }
3759 
3760 int btrfs_read_folio(struct file *file, struct folio *folio)
3761 {
3762 	struct page *page = &folio->page;
3763 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3764 	u64 start = page_offset(page);
3765 	u64 end = start + PAGE_SIZE - 1;
3766 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
3767 	int ret;
3768 
3769 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3770 
3771 	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3772 	/*
3773 	 * If btrfs_do_readpage() failed we will want to submit the assembled
3774 	 * bio to do the cleanup.
3775 	 */
3776 	submit_one_bio(&bio_ctrl);
3777 	return ret;
3778 }
3779 
3780 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3781 					u64 start, u64 end,
3782 					struct extent_map **em_cached,
3783 					struct btrfs_bio_ctrl *bio_ctrl,
3784 					u64 *prev_em_start)
3785 {
3786 	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3787 	int index;
3788 
3789 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3790 
3791 	for (index = 0; index < nr_pages; index++) {
3792 		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3793 				  REQ_RAHEAD, prev_em_start);
3794 		put_page(pages[index]);
3795 	}
3796 }
3797 
3798 /*
3799  * helper for __extent_writepage, doing all of the delayed allocation setup.
3800  *
3801  * This returns 1 if btrfs_run_delalloc_range function did all the work required
3802  * to write the page (copy into inline extent).  In this case the IO has
3803  * been started and the page is already unlocked.
3804  *
3805  * This returns 0 if all went well (page still locked)
3806  * This returns < 0 if there were errors (page still locked)
3807  */
3808 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3809 		struct page *page, struct writeback_control *wbc)
3810 {
3811 	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3812 	u64 delalloc_start = page_offset(page);
3813 	u64 delalloc_to_write = 0;
3814 	/* How many pages are started by btrfs_run_delalloc_range() */
3815 	unsigned long nr_written = 0;
3816 	int ret;
3817 	int page_started = 0;
3818 
3819 	while (delalloc_start < page_end) {
3820 		u64 delalloc_end = page_end;
3821 		bool found;
3822 
3823 		found = find_lock_delalloc_range(&inode->vfs_inode, page,
3824 					       &delalloc_start,
3825 					       &delalloc_end);
3826 		if (!found) {
3827 			delalloc_start = delalloc_end + 1;
3828 			continue;
3829 		}
3830 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3831 				delalloc_end, &page_started, &nr_written, wbc);
3832 		if (ret) {
3833 			btrfs_page_set_error(inode->root->fs_info, page,
3834 					     page_offset(page), PAGE_SIZE);
3835 			return ret;
3836 		}
3837 		/*
3838 		 * delalloc_end is already one less than the total length, so
3839 		 * we don't subtract one from PAGE_SIZE
3840 		 */
3841 		delalloc_to_write += (delalloc_end - delalloc_start +
3842 				      PAGE_SIZE) >> PAGE_SHIFT;
3843 		delalloc_start = delalloc_end + 1;
3844 	}
3845 	if (wbc->nr_to_write < delalloc_to_write) {
3846 		int thresh = 8192;
3847 
3848 		if (delalloc_to_write < thresh * 2)
3849 			thresh = delalloc_to_write;
3850 		wbc->nr_to_write = min_t(u64, delalloc_to_write,
3851 					 thresh);
3852 	}
3853 
3854 	/* Did btrfs_run_dealloc_range() already unlock and start the IO? */
3855 	if (page_started) {
3856 		/*
3857 		 * We've unlocked the page, so we can't update the mapping's
3858 		 * writeback index, just update nr_to_write.
3859 		 */
3860 		wbc->nr_to_write -= nr_written;
3861 		return 1;
3862 	}
3863 
3864 	return 0;
3865 }
3866 
3867 /*
3868  * Find the first byte we need to write.
3869  *
3870  * For subpage, one page can contain several sectors, and
3871  * __extent_writepage_io() will just grab all extent maps in the page
3872  * range and try to submit all non-inline/non-compressed extents.
3873  *
3874  * This is a big problem for subpage, we shouldn't re-submit already written
3875  * data at all.
3876  * This function will lookup subpage dirty bit to find which range we really
3877  * need to submit.
3878  *
3879  * Return the next dirty range in [@start, @end).
3880  * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3881  */
3882 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3883 				 struct page *page, u64 *start, u64 *end)
3884 {
3885 	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
3886 	struct btrfs_subpage_info *spi = fs_info->subpage_info;
3887 	u64 orig_start = *start;
3888 	/* Declare as unsigned long so we can use bitmap ops */
3889 	unsigned long flags;
3890 	int range_start_bit;
3891 	int range_end_bit;
3892 
3893 	/*
3894 	 * For regular sector size == page size case, since one page only
3895 	 * contains one sector, we return the page offset directly.
3896 	 */
3897 	if (!btrfs_is_subpage(fs_info, page)) {
3898 		*start = page_offset(page);
3899 		*end = page_offset(page) + PAGE_SIZE;
3900 		return;
3901 	}
3902 
3903 	range_start_bit = spi->dirty_offset +
3904 			  (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3905 
3906 	/* We should have the page locked, but just in case */
3907 	spin_lock_irqsave(&subpage->lock, flags);
3908 	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3909 			       spi->dirty_offset + spi->bitmap_nr_bits);
3910 	spin_unlock_irqrestore(&subpage->lock, flags);
3911 
3912 	range_start_bit -= spi->dirty_offset;
3913 	range_end_bit -= spi->dirty_offset;
3914 
3915 	*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3916 	*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3917 }
3918 
3919 /*
3920  * helper for __extent_writepage.  This calls the writepage start hooks,
3921  * and does the loop to map the page into extents and bios.
3922  *
3923  * We return 1 if the IO is started and the page is unlocked,
3924  * 0 if all went well (page still locked)
3925  * < 0 if there were errors (page still locked)
3926  */
3927 static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
3928 				 struct page *page,
3929 				 struct writeback_control *wbc,
3930 				 struct extent_page_data *epd,
3931 				 loff_t i_size,
3932 				 int *nr_ret)
3933 {
3934 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3935 	u64 cur = page_offset(page);
3936 	u64 end = cur + PAGE_SIZE - 1;
3937 	u64 extent_offset;
3938 	u64 block_start;
3939 	struct extent_map *em;
3940 	int saved_ret = 0;
3941 	int ret = 0;
3942 	int nr = 0;
3943 	enum req_op op = REQ_OP_WRITE;
3944 	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
3945 	bool has_error = false;
3946 	bool compressed;
3947 
3948 	ret = btrfs_writepage_cow_fixup(page);
3949 	if (ret) {
3950 		/* Fixup worker will requeue */
3951 		redirty_page_for_writepage(wbc, page);
3952 		unlock_page(page);
3953 		return 1;
3954 	}
3955 
3956 	/*
3957 	 * we don't want to touch the inode after unlocking the page,
3958 	 * so we update the mapping writeback index now
3959 	 */
3960 	wbc->nr_to_write--;
3961 
3962 	while (cur <= end) {
3963 		u64 disk_bytenr;
3964 		u64 em_end;
3965 		u64 dirty_range_start = cur;
3966 		u64 dirty_range_end;
3967 		u32 iosize;
3968 
3969 		if (cur >= i_size) {
3970 			btrfs_writepage_endio_finish_ordered(inode, page, cur,
3971 							     end, true);
3972 			/*
3973 			 * This range is beyond i_size, thus we don't need to
3974 			 * bother writing back.
3975 			 * But we still need to clear the dirty subpage bit, or
3976 			 * the next time the page gets dirtied, we will try to
3977 			 * writeback the sectors with subpage dirty bits,
3978 			 * causing writeback without ordered extent.
3979 			 */
3980 			btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
3981 			break;
3982 		}
3983 
3984 		find_next_dirty_byte(fs_info, page, &dirty_range_start,
3985 				     &dirty_range_end);
3986 		if (cur < dirty_range_start) {
3987 			cur = dirty_range_start;
3988 			continue;
3989 		}
3990 
3991 		em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
3992 		if (IS_ERR(em)) {
3993 			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
3994 			ret = PTR_ERR_OR_ZERO(em);
3995 			has_error = true;
3996 			if (!saved_ret)
3997 				saved_ret = ret;
3998 			break;
3999 		}
4000 
4001 		extent_offset = cur - em->start;
4002 		em_end = extent_map_end(em);
4003 		ASSERT(cur <= em_end);
4004 		ASSERT(cur < end);
4005 		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
4006 		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
4007 		block_start = em->block_start;
4008 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
4009 		disk_bytenr = em->block_start + extent_offset;
4010 
4011 		/*
4012 		 * Note that em_end from extent_map_end() and dirty_range_end from
4013 		 * find_next_dirty_byte() are all exclusive
4014 		 */
4015 		iosize = min(min(em_end, end + 1), dirty_range_end) - cur;
4016 
4017 		if (btrfs_use_zone_append(inode, em->block_start))
4018 			op = REQ_OP_ZONE_APPEND;
4019 
4020 		free_extent_map(em);
4021 		em = NULL;
4022 
4023 		/*
4024 		 * compressed and inline extents are written through other
4025 		 * paths in the FS
4026 		 */
4027 		if (compressed || block_start == EXTENT_MAP_HOLE ||
4028 		    block_start == EXTENT_MAP_INLINE) {
4029 			if (compressed)
4030 				nr++;
4031 			else
4032 				btrfs_writepage_endio_finish_ordered(inode,
4033 						page, cur, cur + iosize - 1, true);
4034 			btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4035 			cur += iosize;
4036 			continue;
4037 		}
4038 
4039 		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
4040 		if (!PageWriteback(page)) {
4041 			btrfs_err(inode->root->fs_info,
4042 				   "page %lu not writeback, cur %llu end %llu",
4043 			       page->index, cur, end);
4044 		}
4045 
4046 		/*
4047 		 * Although the PageDirty bit is cleared before entering this
4048 		 * function, subpage dirty bit is not cleared.
4049 		 * So clear subpage dirty bit here so next time we won't submit
4050 		 * page for range already written to disk.
4051 		 */
4052 		btrfs_page_clear_dirty(fs_info, page, cur, iosize);
4053 
4054 		ret = submit_extent_page(op | write_flags, wbc,
4055 					 &epd->bio_ctrl, page,
4056 					 disk_bytenr, iosize,
4057 					 cur - page_offset(page),
4058 					 end_bio_extent_writepage,
4059 					 0, false);
4060 		if (ret) {
4061 			has_error = true;
4062 			if (!saved_ret)
4063 				saved_ret = ret;
4064 
4065 			btrfs_page_set_error(fs_info, page, cur, iosize);
4066 			if (PageWriteback(page))
4067 				btrfs_page_clear_writeback(fs_info, page, cur,
4068 							   iosize);
4069 		}
4070 
4071 		cur += iosize;
4072 		nr++;
4073 	}
4074 	/*
4075 	 * If we finish without problem, we should not only clear page dirty,
4076 	 * but also empty subpage dirty bits
4077 	 */
4078 	if (!has_error)
4079 		btrfs_page_assert_not_dirty(fs_info, page);
4080 	else
4081 		ret = saved_ret;
4082 	*nr_ret = nr;
4083 	return ret;
4084 }
4085 
4086 /*
4087  * the writepage semantics are similar to regular writepage.  extent
4088  * records are inserted to lock ranges in the tree, and as dirty areas
4089  * are found, they are marked writeback.  Then the lock bits are removed
4090  * and the end_io handler clears the writeback ranges
4091  *
4092  * Return 0 if everything goes well.
4093  * Return <0 for error.
4094  */
4095 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
4096 			      struct extent_page_data *epd)
4097 {
4098 	struct folio *folio = page_folio(page);
4099 	struct inode *inode = page->mapping->host;
4100 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4101 	const u64 page_start = page_offset(page);
4102 	const u64 page_end = page_start + PAGE_SIZE - 1;
4103 	int ret;
4104 	int nr = 0;
4105 	size_t pg_offset;
4106 	loff_t i_size = i_size_read(inode);
4107 	unsigned long end_index = i_size >> PAGE_SHIFT;
4108 
4109 	trace___extent_writepage(page, inode, wbc);
4110 
4111 	WARN_ON(!PageLocked(page));
4112 
4113 	btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4114 			       page_offset(page), PAGE_SIZE);
4115 
4116 	pg_offset = offset_in_page(i_size);
4117 	if (page->index > end_index ||
4118 	   (page->index == end_index && !pg_offset)) {
4119 		folio_invalidate(folio, 0, folio_size(folio));
4120 		folio_unlock(folio);
4121 		return 0;
4122 	}
4123 
4124 	if (page->index == end_index)
4125 		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
4126 
4127 	ret = set_page_extent_mapped(page);
4128 	if (ret < 0) {
4129 		SetPageError(page);
4130 		goto done;
4131 	}
4132 
4133 	if (!epd->extent_locked) {
4134 		ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
4135 		if (ret == 1)
4136 			return 0;
4137 		if (ret)
4138 			goto done;
4139 	}
4140 
4141 	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
4142 				    &nr);
4143 	if (ret == 1)
4144 		return 0;
4145 
4146 done:
4147 	if (nr == 0) {
4148 		/* make sure the mapping tag for page dirty gets cleared */
4149 		set_page_writeback(page);
4150 		end_page_writeback(page);
4151 	}
4152 	/*
4153 	 * Here we used to have a check for PageError() and then set @ret and
4154 	 * call end_extent_writepage().
4155 	 *
4156 	 * But in fact setting @ret here will cause different error paths
4157 	 * between subpage and regular sectorsize.
4158 	 *
4159 	 * For regular page size, we never submit current page, but only add
4160 	 * current page to current bio.
4161 	 * The bio submission can only happen in next page.
4162 	 * Thus if we hit the PageError() branch, @ret is already set to
4163 	 * non-zero value and will not get updated for regular sectorsize.
4164 	 *
4165 	 * But for subpage case, it's possible we submit part of current page,
4166 	 * thus can get PageError() set by submitted bio of the same page,
4167 	 * while our @ret is still 0.
4168 	 *
4169 	 * So here we unify the behavior and don't set @ret.
4170 	 * Error can still be properly passed to higher layer as page will
4171 	 * be set error, here we just don't handle the IO failure.
4172 	 *
4173 	 * NOTE: This is just a hotfix for subpage.
4174 	 * The root fix will be properly ending ordered extent when we hit
4175 	 * an error during writeback.
4176 	 *
4177 	 * But that needs a bigger refactoring, as we not only need to grab the
4178 	 * submitted OE, but also need to know exactly at which bytenr we hit
4179 	 * the error.
4180 	 * Currently the full page based __extent_writepage_io() is not
4181 	 * capable of that.
4182 	 */
4183 	if (PageError(page))
4184 		end_extent_writepage(page, ret, page_start, page_end);
4185 	if (epd->extent_locked) {
4186 		/*
4187 		 * If epd->extent_locked, it's from extent_write_locked_range(),
4188 		 * the page can either be locked by lock_page() or
4189 		 * process_one_page().
4190 		 * Let btrfs_page_unlock_writer() handle both cases.
4191 		 */
4192 		ASSERT(wbc);
4193 		btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4194 					 wbc->range_end + 1 - wbc->range_start);
4195 	} else {
4196 		unlock_page(page);
4197 	}
4198 	ASSERT(ret <= 0);
4199 	return ret;
4200 }
4201 
4202 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
4203 {
4204 	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4205 		       TASK_UNINTERRUPTIBLE);
4206 }
4207 
4208 static void end_extent_buffer_writeback(struct extent_buffer *eb)
4209 {
4210 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4211 	smp_mb__after_atomic();
4212 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
4213 }
4214 
4215 /*
4216  * Lock extent buffer status and pages for writeback.
4217  *
4218  * May try to flush write bio if we can't get the lock.
4219  *
4220  * Return  0 if the extent buffer doesn't need to be submitted.
4221  *           (E.g. the extent buffer is not dirty)
4222  * Return >0 is the extent buffer is submitted to bio.
4223  * Return <0 if something went wrong, no page is locked.
4224  */
4225 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
4226 			  struct extent_page_data *epd)
4227 {
4228 	struct btrfs_fs_info *fs_info = eb->fs_info;
4229 	int i, num_pages;
4230 	int flush = 0;
4231 	int ret = 0;
4232 
4233 	if (!btrfs_try_tree_write_lock(eb)) {
4234 		submit_write_bio(epd, 0);
4235 		flush = 1;
4236 		btrfs_tree_lock(eb);
4237 	}
4238 
4239 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4240 		btrfs_tree_unlock(eb);
4241 		if (!epd->sync_io)
4242 			return 0;
4243 		if (!flush) {
4244 			submit_write_bio(epd, 0);
4245 			flush = 1;
4246 		}
4247 		while (1) {
4248 			wait_on_extent_buffer_writeback(eb);
4249 			btrfs_tree_lock(eb);
4250 			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4251 				break;
4252 			btrfs_tree_unlock(eb);
4253 		}
4254 	}
4255 
4256 	/*
4257 	 * We need to do this to prevent races in people who check if the eb is
4258 	 * under IO since we can end up having no IO bits set for a short period
4259 	 * of time.
4260 	 */
4261 	spin_lock(&eb->refs_lock);
4262 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4263 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4264 		spin_unlock(&eb->refs_lock);
4265 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4266 		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4267 					 -eb->len,
4268 					 fs_info->dirty_metadata_batch);
4269 		ret = 1;
4270 	} else {
4271 		spin_unlock(&eb->refs_lock);
4272 	}
4273 
4274 	btrfs_tree_unlock(eb);
4275 
4276 	/*
4277 	 * Either we don't need to submit any tree block, or we're submitting
4278 	 * subpage eb.
4279 	 * Subpage metadata doesn't use page locking at all, so we can skip
4280 	 * the page locking.
4281 	 */
4282 	if (!ret || fs_info->nodesize < PAGE_SIZE)
4283 		return ret;
4284 
4285 	num_pages = num_extent_pages(eb);
4286 	for (i = 0; i < num_pages; i++) {
4287 		struct page *p = eb->pages[i];
4288 
4289 		if (!trylock_page(p)) {
4290 			if (!flush) {
4291 				submit_write_bio(epd, 0);
4292 				flush = 1;
4293 			}
4294 			lock_page(p);
4295 		}
4296 	}
4297 
4298 	return ret;
4299 }
4300 
4301 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
4302 {
4303 	struct btrfs_fs_info *fs_info = eb->fs_info;
4304 
4305 	btrfs_page_set_error(fs_info, page, eb->start, eb->len);
4306 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4307 		return;
4308 
4309 	/*
4310 	 * A read may stumble upon this buffer later, make sure that it gets an
4311 	 * error and knows there was an error.
4312 	 */
4313 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4314 
4315 	/*
4316 	 * We need to set the mapping with the io error as well because a write
4317 	 * error will flip the file system readonly, and then syncfs() will
4318 	 * return a 0 because we are readonly if we don't modify the err seq for
4319 	 * the superblock.
4320 	 */
4321 	mapping_set_error(page->mapping, -EIO);
4322 
4323 	/*
4324 	 * If we error out, we should add back the dirty_metadata_bytes
4325 	 * to make it consistent.
4326 	 */
4327 	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4328 				 eb->len, fs_info->dirty_metadata_batch);
4329 
4330 	/*
4331 	 * If writeback for a btree extent that doesn't belong to a log tree
4332 	 * failed, increment the counter transaction->eb_write_errors.
4333 	 * We do this because while the transaction is running and before it's
4334 	 * committing (when we call filemap_fdata[write|wait]_range against
4335 	 * the btree inode), we might have
4336 	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4337 	 * returns an error or an error happens during writeback, when we're
4338 	 * committing the transaction we wouldn't know about it, since the pages
4339 	 * can be no longer dirty nor marked anymore for writeback (if a
4340 	 * subsequent modification to the extent buffer didn't happen before the
4341 	 * transaction commit), which makes filemap_fdata[write|wait]_range not
4342 	 * able to find the pages tagged with SetPageError at transaction
4343 	 * commit time. So if this happens we must abort the transaction,
4344 	 * otherwise we commit a super block with btree roots that point to
4345 	 * btree nodes/leafs whose content on disk is invalid - either garbage
4346 	 * or the content of some node/leaf from a past generation that got
4347 	 * cowed or deleted and is no longer valid.
4348 	 *
4349 	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4350 	 * not be enough - we need to distinguish between log tree extents vs
4351 	 * non-log tree extents, and the next filemap_fdatawait_range() call
4352 	 * will catch and clear such errors in the mapping - and that call might
4353 	 * be from a log sync and not from a transaction commit. Also, checking
4354 	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4355 	 * not done and would not be reliable - the eb might have been released
4356 	 * from memory and reading it back again means that flag would not be
4357 	 * set (since it's a runtime flag, not persisted on disk).
4358 	 *
4359 	 * Using the flags below in the btree inode also makes us achieve the
4360 	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4361 	 * writeback for all dirty pages and before filemap_fdatawait_range()
4362 	 * is called, the writeback for all dirty pages had already finished
4363 	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4364 	 * filemap_fdatawait_range() would return success, as it could not know
4365 	 * that writeback errors happened (the pages were no longer tagged for
4366 	 * writeback).
4367 	 */
4368 	switch (eb->log_index) {
4369 	case -1:
4370 		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
4371 		break;
4372 	case 0:
4373 		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
4374 		break;
4375 	case 1:
4376 		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
4377 		break;
4378 	default:
4379 		BUG(); /* unexpected, logic error */
4380 	}
4381 }
4382 
4383 /*
4384  * The endio specific version which won't touch any unsafe spinlock in endio
4385  * context.
4386  */
4387 static struct extent_buffer *find_extent_buffer_nolock(
4388 		struct btrfs_fs_info *fs_info, u64 start)
4389 {
4390 	struct extent_buffer *eb;
4391 
4392 	rcu_read_lock();
4393 	eb = radix_tree_lookup(&fs_info->buffer_radix,
4394 			       start >> fs_info->sectorsize_bits);
4395 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4396 		rcu_read_unlock();
4397 		return eb;
4398 	}
4399 	rcu_read_unlock();
4400 	return NULL;
4401 }
4402 
4403 /*
4404  * The endio function for subpage extent buffer write.
4405  *
4406  * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4407  * after all extent buffers in the page has finished their writeback.
4408  */
4409 static void end_bio_subpage_eb_writepage(struct bio *bio)
4410 {
4411 	struct btrfs_fs_info *fs_info;
4412 	struct bio_vec *bvec;
4413 	struct bvec_iter_all iter_all;
4414 
4415 	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4416 	ASSERT(fs_info->nodesize < PAGE_SIZE);
4417 
4418 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4419 	bio_for_each_segment_all(bvec, bio, iter_all) {
4420 		struct page *page = bvec->bv_page;
4421 		u64 bvec_start = page_offset(page) + bvec->bv_offset;
4422 		u64 bvec_end = bvec_start + bvec->bv_len - 1;
4423 		u64 cur_bytenr = bvec_start;
4424 
4425 		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4426 
4427 		/* Iterate through all extent buffers in the range */
4428 		while (cur_bytenr <= bvec_end) {
4429 			struct extent_buffer *eb;
4430 			int done;
4431 
4432 			/*
4433 			 * Here we can't use find_extent_buffer(), as it may
4434 			 * try to lock eb->refs_lock, which is not safe in endio
4435 			 * context.
4436 			 */
4437 			eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4438 			ASSERT(eb);
4439 
4440 			cur_bytenr = eb->start + eb->len;
4441 
4442 			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4443 			done = atomic_dec_and_test(&eb->io_pages);
4444 			ASSERT(done);
4445 
4446 			if (bio->bi_status ||
4447 			    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4448 				ClearPageUptodate(page);
4449 				set_btree_ioerr(page, eb);
4450 			}
4451 
4452 			btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4453 						      eb->len);
4454 			end_extent_buffer_writeback(eb);
4455 			/*
4456 			 * free_extent_buffer() will grab spinlock which is not
4457 			 * safe in endio context. Thus here we manually dec
4458 			 * the ref.
4459 			 */
4460 			atomic_dec(&eb->refs);
4461 		}
4462 	}
4463 	bio_put(bio);
4464 }
4465 
4466 static void end_bio_extent_buffer_writepage(struct bio *bio)
4467 {
4468 	struct bio_vec *bvec;
4469 	struct extent_buffer *eb;
4470 	int done;
4471 	struct bvec_iter_all iter_all;
4472 
4473 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4474 	bio_for_each_segment_all(bvec, bio, iter_all) {
4475 		struct page *page = bvec->bv_page;
4476 
4477 		eb = (struct extent_buffer *)page->private;
4478 		BUG_ON(!eb);
4479 		done = atomic_dec_and_test(&eb->io_pages);
4480 
4481 		if (bio->bi_status ||
4482 		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4483 			ClearPageUptodate(page);
4484 			set_btree_ioerr(page, eb);
4485 		}
4486 
4487 		end_page_writeback(page);
4488 
4489 		if (!done)
4490 			continue;
4491 
4492 		end_extent_buffer_writeback(eb);
4493 	}
4494 
4495 	bio_put(bio);
4496 }
4497 
4498 static void prepare_eb_write(struct extent_buffer *eb)
4499 {
4500 	u32 nritems;
4501 	unsigned long start;
4502 	unsigned long end;
4503 
4504 	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4505 	atomic_set(&eb->io_pages, num_extent_pages(eb));
4506 
4507 	/* Set btree blocks beyond nritems with 0 to avoid stale content */
4508 	nritems = btrfs_header_nritems(eb);
4509 	if (btrfs_header_level(eb) > 0) {
4510 		end = btrfs_node_key_ptr_offset(nritems);
4511 		memzero_extent_buffer(eb, end, eb->len - end);
4512 	} else {
4513 		/*
4514 		 * Leaf:
4515 		 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4516 		 */
4517 		start = btrfs_item_nr_offset(nritems);
4518 		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4519 		memzero_extent_buffer(eb, start, end - start);
4520 	}
4521 }
4522 
4523 /*
4524  * Unlike the work in write_one_eb(), we rely completely on extent locking.
4525  * Page locking is only utilized at minimum to keep the VMM code happy.
4526  */
4527 static int write_one_subpage_eb(struct extent_buffer *eb,
4528 				struct writeback_control *wbc,
4529 				struct extent_page_data *epd)
4530 {
4531 	struct btrfs_fs_info *fs_info = eb->fs_info;
4532 	struct page *page = eb->pages[0];
4533 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4534 	bool no_dirty_ebs = false;
4535 	int ret;
4536 
4537 	prepare_eb_write(eb);
4538 
4539 	/* clear_page_dirty_for_io() in subpage helper needs page locked */
4540 	lock_page(page);
4541 	btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4542 
4543 	/* Check if this is the last dirty bit to update nr_written */
4544 	no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4545 							  eb->start, eb->len);
4546 	if (no_dirty_ebs)
4547 		clear_page_dirty_for_io(page);
4548 
4549 	ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4550 			&epd->bio_ctrl, page, eb->start, eb->len,
4551 			eb->start - page_offset(page),
4552 			end_bio_subpage_eb_writepage, 0, false);
4553 	if (ret) {
4554 		btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4555 		set_btree_ioerr(page, eb);
4556 		unlock_page(page);
4557 
4558 		if (atomic_dec_and_test(&eb->io_pages))
4559 			end_extent_buffer_writeback(eb);
4560 		return -EIO;
4561 	}
4562 	unlock_page(page);
4563 	/*
4564 	 * Submission finished without problem, if no range of the page is
4565 	 * dirty anymore, we have submitted a page.  Update nr_written in wbc.
4566 	 */
4567 	if (no_dirty_ebs)
4568 		wbc->nr_to_write--;
4569 	return ret;
4570 }
4571 
4572 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
4573 			struct writeback_control *wbc,
4574 			struct extent_page_data *epd)
4575 {
4576 	u64 disk_bytenr = eb->start;
4577 	int i, num_pages;
4578 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4579 	int ret = 0;
4580 
4581 	prepare_eb_write(eb);
4582 
4583 	num_pages = num_extent_pages(eb);
4584 	for (i = 0; i < num_pages; i++) {
4585 		struct page *p = eb->pages[i];
4586 
4587 		clear_page_dirty_for_io(p);
4588 		set_page_writeback(p);
4589 		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4590 					 &epd->bio_ctrl, p, disk_bytenr,
4591 					 PAGE_SIZE, 0,
4592 					 end_bio_extent_buffer_writepage,
4593 					 0, false);
4594 		if (ret) {
4595 			set_btree_ioerr(p, eb);
4596 			if (PageWriteback(p))
4597 				end_page_writeback(p);
4598 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4599 				end_extent_buffer_writeback(eb);
4600 			ret = -EIO;
4601 			break;
4602 		}
4603 		disk_bytenr += PAGE_SIZE;
4604 		wbc->nr_to_write--;
4605 		unlock_page(p);
4606 	}
4607 
4608 	if (unlikely(ret)) {
4609 		for (; i < num_pages; i++) {
4610 			struct page *p = eb->pages[i];
4611 			clear_page_dirty_for_io(p);
4612 			unlock_page(p);
4613 		}
4614 	}
4615 
4616 	return ret;
4617 }
4618 
4619 /*
4620  * Submit one subpage btree page.
4621  *
4622  * The main difference to submit_eb_page() is:
4623  * - Page locking
4624  *   For subpage, we don't rely on page locking at all.
4625  *
4626  * - Flush write bio
4627  *   We only flush bio if we may be unable to fit current extent buffers into
4628  *   current bio.
4629  *
4630  * Return >=0 for the number of submitted extent buffers.
4631  * Return <0 for fatal error.
4632  */
4633 static int submit_eb_subpage(struct page *page,
4634 			     struct writeback_control *wbc,
4635 			     struct extent_page_data *epd)
4636 {
4637 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4638 	int submitted = 0;
4639 	u64 page_start = page_offset(page);
4640 	int bit_start = 0;
4641 	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4642 	int ret;
4643 
4644 	/* Lock and write each dirty extent buffers in the range */
4645 	while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
4646 		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4647 		struct extent_buffer *eb;
4648 		unsigned long flags;
4649 		u64 start;
4650 
4651 		/*
4652 		 * Take private lock to ensure the subpage won't be detached
4653 		 * in the meantime.
4654 		 */
4655 		spin_lock(&page->mapping->private_lock);
4656 		if (!PagePrivate(page)) {
4657 			spin_unlock(&page->mapping->private_lock);
4658 			break;
4659 		}
4660 		spin_lock_irqsave(&subpage->lock, flags);
4661 		if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4662 			      subpage->bitmaps)) {
4663 			spin_unlock_irqrestore(&subpage->lock, flags);
4664 			spin_unlock(&page->mapping->private_lock);
4665 			bit_start++;
4666 			continue;
4667 		}
4668 
4669 		start = page_start + bit_start * fs_info->sectorsize;
4670 		bit_start += sectors_per_node;
4671 
4672 		/*
4673 		 * Here we just want to grab the eb without touching extra
4674 		 * spin locks, so call find_extent_buffer_nolock().
4675 		 */
4676 		eb = find_extent_buffer_nolock(fs_info, start);
4677 		spin_unlock_irqrestore(&subpage->lock, flags);
4678 		spin_unlock(&page->mapping->private_lock);
4679 
4680 		/*
4681 		 * The eb has already reached 0 refs thus find_extent_buffer()
4682 		 * doesn't return it. We don't need to write back such eb
4683 		 * anyway.
4684 		 */
4685 		if (!eb)
4686 			continue;
4687 
4688 		ret = lock_extent_buffer_for_io(eb, epd);
4689 		if (ret == 0) {
4690 			free_extent_buffer(eb);
4691 			continue;
4692 		}
4693 		if (ret < 0) {
4694 			free_extent_buffer(eb);
4695 			goto cleanup;
4696 		}
4697 		ret = write_one_subpage_eb(eb, wbc, epd);
4698 		free_extent_buffer(eb);
4699 		if (ret < 0)
4700 			goto cleanup;
4701 		submitted++;
4702 	}
4703 	return submitted;
4704 
4705 cleanup:
4706 	/* We hit error, end bio for the submitted extent buffers */
4707 	submit_write_bio(epd, ret);
4708 	return ret;
4709 }
4710 
4711 /*
4712  * Submit all page(s) of one extent buffer.
4713  *
4714  * @page:	the page of one extent buffer
4715  * @eb_context:	to determine if we need to submit this page, if current page
4716  *		belongs to this eb, we don't need to submit
4717  *
4718  * The caller should pass each page in their bytenr order, and here we use
4719  * @eb_context to determine if we have submitted pages of one extent buffer.
4720  *
4721  * If we have, we just skip until we hit a new page that doesn't belong to
4722  * current @eb_context.
4723  *
4724  * If not, we submit all the page(s) of the extent buffer.
4725  *
4726  * Return >0 if we have submitted the extent buffer successfully.
4727  * Return 0 if we don't need to submit the page, as it's already submitted by
4728  * previous call.
4729  * Return <0 for fatal error.
4730  */
4731 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4732 			  struct extent_page_data *epd,
4733 			  struct extent_buffer **eb_context)
4734 {
4735 	struct address_space *mapping = page->mapping;
4736 	struct btrfs_block_group *cache = NULL;
4737 	struct extent_buffer *eb;
4738 	int ret;
4739 
4740 	if (!PagePrivate(page))
4741 		return 0;
4742 
4743 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
4744 		return submit_eb_subpage(page, wbc, epd);
4745 
4746 	spin_lock(&mapping->private_lock);
4747 	if (!PagePrivate(page)) {
4748 		spin_unlock(&mapping->private_lock);
4749 		return 0;
4750 	}
4751 
4752 	eb = (struct extent_buffer *)page->private;
4753 
4754 	/*
4755 	 * Shouldn't happen and normally this would be a BUG_ON but no point
4756 	 * crashing the machine for something we can survive anyway.
4757 	 */
4758 	if (WARN_ON(!eb)) {
4759 		spin_unlock(&mapping->private_lock);
4760 		return 0;
4761 	}
4762 
4763 	if (eb == *eb_context) {
4764 		spin_unlock(&mapping->private_lock);
4765 		return 0;
4766 	}
4767 	ret = atomic_inc_not_zero(&eb->refs);
4768 	spin_unlock(&mapping->private_lock);
4769 	if (!ret)
4770 		return 0;
4771 
4772 	if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4773 		/*
4774 		 * If for_sync, this hole will be filled with
4775 		 * trasnsaction commit.
4776 		 */
4777 		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4778 			ret = -EAGAIN;
4779 		else
4780 			ret = 0;
4781 		free_extent_buffer(eb);
4782 		return ret;
4783 	}
4784 
4785 	*eb_context = eb;
4786 
4787 	ret = lock_extent_buffer_for_io(eb, epd);
4788 	if (ret <= 0) {
4789 		btrfs_revert_meta_write_pointer(cache, eb);
4790 		if (cache)
4791 			btrfs_put_block_group(cache);
4792 		free_extent_buffer(eb);
4793 		return ret;
4794 	}
4795 	if (cache) {
4796 		/*
4797 		 * Implies write in zoned mode. Mark the last eb in a block group.
4798 		 */
4799 		btrfs_schedule_zone_finish_bg(cache, eb);
4800 		btrfs_put_block_group(cache);
4801 	}
4802 	ret = write_one_eb(eb, wbc, epd);
4803 	free_extent_buffer(eb);
4804 	if (ret < 0)
4805 		return ret;
4806 	return 1;
4807 }
4808 
4809 int btree_write_cache_pages(struct address_space *mapping,
4810 				   struct writeback_control *wbc)
4811 {
4812 	struct extent_buffer *eb_context = NULL;
4813 	struct extent_page_data epd = {
4814 		.bio_ctrl = { 0 },
4815 		.extent_locked = 0,
4816 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
4817 	};
4818 	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
4819 	int ret = 0;
4820 	int done = 0;
4821 	int nr_to_write_done = 0;
4822 	struct pagevec pvec;
4823 	int nr_pages;
4824 	pgoff_t index;
4825 	pgoff_t end;		/* Inclusive */
4826 	int scanned = 0;
4827 	xa_mark_t tag;
4828 
4829 	pagevec_init(&pvec);
4830 	if (wbc->range_cyclic) {
4831 		index = mapping->writeback_index; /* Start from prev offset */
4832 		end = -1;
4833 		/*
4834 		 * Start from the beginning does not need to cycle over the
4835 		 * range, mark it as scanned.
4836 		 */
4837 		scanned = (index == 0);
4838 	} else {
4839 		index = wbc->range_start >> PAGE_SHIFT;
4840 		end = wbc->range_end >> PAGE_SHIFT;
4841 		scanned = 1;
4842 	}
4843 	if (wbc->sync_mode == WB_SYNC_ALL)
4844 		tag = PAGECACHE_TAG_TOWRITE;
4845 	else
4846 		tag = PAGECACHE_TAG_DIRTY;
4847 	btrfs_zoned_meta_io_lock(fs_info);
4848 retry:
4849 	if (wbc->sync_mode == WB_SYNC_ALL)
4850 		tag_pages_for_writeback(mapping, index, end);
4851 	while (!done && !nr_to_write_done && (index <= end) &&
4852 	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
4853 			tag))) {
4854 		unsigned i;
4855 
4856 		for (i = 0; i < nr_pages; i++) {
4857 			struct page *page = pvec.pages[i];
4858 
4859 			ret = submit_eb_page(page, wbc, &epd, &eb_context);
4860 			if (ret == 0)
4861 				continue;
4862 			if (ret < 0) {
4863 				done = 1;
4864 				break;
4865 			}
4866 
4867 			/*
4868 			 * the filesystem may choose to bump up nr_to_write.
4869 			 * We have to make sure to honor the new nr_to_write
4870 			 * at any time
4871 			 */
4872 			nr_to_write_done = wbc->nr_to_write <= 0;
4873 		}
4874 		pagevec_release(&pvec);
4875 		cond_resched();
4876 	}
4877 	if (!scanned && !done) {
4878 		/*
4879 		 * We hit the last page and there is more work to be done: wrap
4880 		 * back to the start of the file
4881 		 */
4882 		scanned = 1;
4883 		index = 0;
4884 		goto retry;
4885 	}
4886 	/*
4887 	 * If something went wrong, don't allow any metadata write bio to be
4888 	 * submitted.
4889 	 *
4890 	 * This would prevent use-after-free if we had dirty pages not
4891 	 * cleaned up, which can still happen by fuzzed images.
4892 	 *
4893 	 * - Bad extent tree
4894 	 *   Allowing existing tree block to be allocated for other trees.
4895 	 *
4896 	 * - Log tree operations
4897 	 *   Exiting tree blocks get allocated to log tree, bumps its
4898 	 *   generation, then get cleaned in tree re-balance.
4899 	 *   Such tree block will not be written back, since it's clean,
4900 	 *   thus no WRITTEN flag set.
4901 	 *   And after log writes back, this tree block is not traced by
4902 	 *   any dirty extent_io_tree.
4903 	 *
4904 	 * - Offending tree block gets re-dirtied from its original owner
4905 	 *   Since it has bumped generation, no WRITTEN flag, it can be
4906 	 *   reused without COWing. This tree block will not be traced
4907 	 *   by btrfs_transaction::dirty_pages.
4908 	 *
4909 	 *   Now such dirty tree block will not be cleaned by any dirty
4910 	 *   extent io tree. Thus we don't want to submit such wild eb
4911 	 *   if the fs already has error.
4912 	 *
4913 	 * We can get ret > 0 from submit_extent_page() indicating how many ebs
4914 	 * were submitted. Reset it to 0 to avoid false alerts for the caller.
4915 	 */
4916 	if (ret > 0)
4917 		ret = 0;
4918 	if (!ret && BTRFS_FS_ERROR(fs_info))
4919 		ret = -EROFS;
4920 	submit_write_bio(&epd, ret);
4921 
4922 	btrfs_zoned_meta_io_unlock(fs_info);
4923 	return ret;
4924 }
4925 
4926 /**
4927  * Walk the list of dirty pages of the given address space and write all of them.
4928  *
4929  * @mapping: address space structure to write
4930  * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
4931  * @epd:     holds context for the write, namely the bio
4932  *
4933  * If a page is already under I/O, write_cache_pages() skips it, even
4934  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4935  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4936  * and msync() need to guarantee that all the data which was dirty at the time
4937  * the call was made get new I/O started against them.  If wbc->sync_mode is
4938  * WB_SYNC_ALL then we were called for data integrity and we must wait for
4939  * existing IO to complete.
4940  */
4941 static int extent_write_cache_pages(struct address_space *mapping,
4942 			     struct writeback_control *wbc,
4943 			     struct extent_page_data *epd)
4944 {
4945 	struct inode *inode = mapping->host;
4946 	int ret = 0;
4947 	int done = 0;			/* set once a page write returned an error */
4948 	int nr_to_write_done = 0;	/* set once wbc->nr_to_write dropped to <= 0 */
4949 	struct pagevec pvec;
4950 	int nr_pages;
4951 	pgoff_t index;
4952 	pgoff_t end;		/* Inclusive */
4953 	pgoff_t done_index;	/* index just past the last page we looked at */
4954 	int range_whole = 0;
4955 	int scanned = 0;	/* non-zero when no wrap-around pass is needed */
4956 	xa_mark_t tag;
4957 
4958 	/*
4959 	 * We have to hold onto the inode so that ordered extents can do their
4960 	 * work when the IO finishes.  The alternative to this is failing to add
4961 	 * an ordered extent if the igrab() fails there and that is a huge pain
4962 	 * to deal with, so instead just hold onto the inode throughout the
4963 	 * writepages operation.  If it fails here we are freeing up the inode
4964 	 * anyway and we'd rather not waste our time writing out stuff that is
4965 	 * going to be truncated anyway.
4966 	 */
4967 	if (!igrab(inode))
4968 		return 0;
4969 
4970 	pagevec_init(&pvec);
4971 	if (wbc->range_cyclic) {
4972 		index = mapping->writeback_index; /* Start from prev offset */
4973 		end = -1;
4974 		/*
4975 		 * Start from the beginning does not need to cycle over the
4976 		 * range, mark it as scanned.
4977 		 */
4978 		scanned = (index == 0);
4979 	} else {
4980 		index = wbc->range_start >> PAGE_SHIFT;
4981 		end = wbc->range_end >> PAGE_SHIFT;
4982 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
4983 			range_whole = 1;
4984 		scanned = 1;
4985 	}
4986 
4987 	/*
4988 	 * We do the tagged writepage as long as the snapshot flush bit is set
4989 	 * and we are the first one who do the filemap_flush() on this inode.
4990 	 *
4991 	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
4992 	 * not race in and drop the bit.
4993 	 */
4994 	if (range_whole && wbc->nr_to_write == LONG_MAX &&
4995 	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
4996 			       &BTRFS_I(inode)->runtime_flags))
4997 		wbc->tagged_writepages = 1;
4998 
	/*
	 * Integrity and tagged writeback walk pages carrying the TOWRITE tag
	 * (set by tag_pages_for_writeback() below); everything else walks the
	 * DIRTY tag directly.
	 */
4999 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5000 		tag = PAGECACHE_TAG_TOWRITE;
5001 	else
5002 		tag = PAGECACHE_TAG_DIRTY;
5003 retry:
5004 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5005 		tag_pages_for_writeback(mapping, index, end);
5006 	done_index = index;
	/*
	 * Look up tagged pages one pagevec at a time.  @index is passed by
	 * address, so the lookup presumably advances it past the pages it
	 * returned.
	 */
5007 	while (!done && !nr_to_write_done && (index <= end) &&
5008 			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5009 						&index, end, tag))) {
5010 		unsigned i;
5011 
5012 		for (i = 0; i < nr_pages; i++) {
5013 			struct page *page = pvec.pages[i];
5014 
5015 			done_index = page->index + 1;
5016 			/*
5017 			 * At this point we hold neither the i_pages lock nor
5018 			 * the page lock: the page may be truncated or
5019 			 * invalidated (changing page->mapping to NULL),
5020 			 * or even swizzled back from swapper_space to
5021 			 * tmpfs file mapping
5022 			 */
			/*
			 * If the page lock is contended, push out the bio we
			 * have built so far before sleeping on the lock.
			 */
5023 			if (!trylock_page(page)) {
5024 				submit_write_bio(epd, 0);
5025 				lock_page(page);
5026 			}
5027 
			/* The page left this mapping while it was unlocked */
5028 			if (unlikely(page->mapping != mapping)) {
5029 				unlock_page(page);
5030 				continue;
5031 			}
5032 
			/*
			 * Unless this is WB_SYNC_NONE, wait for writeback that
			 * is already in flight on the page, submitting our own
			 * pending bio first.
			 */
5033 			if (wbc->sync_mode != WB_SYNC_NONE) {
5034 				if (PageWriteback(page))
5035 					submit_write_bio(epd, 0);
5036 				wait_on_page_writeback(page);
5037 			}
5038 
			/*
			 * Skip pages that are still under writeback or that
			 * are no longer dirty.
			 */
5039 			if (PageWriteback(page) ||
5040 			    !clear_page_dirty_for_io(page)) {
5041 				unlock_page(page);
5042 				continue;
5043 			}
5044 
5045 			ret = __extent_writepage(page, wbc, epd);
			/*
			 * Stop at the first error.  The break only leaves the
			 * for loop, so the pagevec is still released below and
			 * the outer loop ends because @done is set.
			 */
5046 			if (ret < 0) {
5047 				done = 1;
5048 				break;
5049 			}
5050 
5051 			/*
5052 			 * the filesystem may choose to bump up nr_to_write.
5053 			 * We have to make sure to honor the new nr_to_write
5054 			 * at any time
5055 			 */
5056 			nr_to_write_done = wbc->nr_to_write <= 0;
5057 		}
5058 		pagevec_release(&pvec);
5059 		cond_resched();
5060 	}
5061 	if (!scanned && !done) {
5062 		/*
5063 		 * We hit the last page and there is more work to be done: wrap
5064 		 * back to the start of the file
5065 		 */
5066 		scanned = 1;
5067 		index = 0;
5068 
5069 		/*
5070 		 * If we're looping we could run into a page that is locked by a
5071 		 * writer and that writer could be waiting on writeback for a
5072 		 * page in our current bio, and thus deadlock, so flush the
5073 		 * write bio here.
5074 		 */
5075 		submit_write_bio(epd, 0);
5076 		goto retry;
5077 	}
5078 
	/* Remember where to resume the next cyclic / whole-file pass */
5079 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5080 		mapping->writeback_index = done_index;
5081 
	/* Pairs with the igrab() at the top of the function */
5082 	btrfs_add_delayed_iput(inode);
5083 	return ret;
5084 }
5085 
5086 /*
5087  * Submit the pages in the range to bio for call sites which delalloc range has
5088  * already been ran (aka, ordered extent inserted) and all pages are still
5089  * locked.
5090  */
5091 int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
5092 {
5093 	bool found_error = false;
5094 	int first_error = 0;
5095 	int ret = 0;
5096 	struct address_space *mapping = inode->i_mapping;
5097 	struct page *page;
5098 	u64 cur = start;
5099 	unsigned long nr_pages;
5100 	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
5101 	struct extent_page_data epd = {
5102 		.bio_ctrl = { 0 },
5103 		.extent_locked = 1,
5104 		.sync_io = 1,
5105 	};
5106 	struct writeback_control wbc_writepages = {
5107 		.sync_mode	= WB_SYNC_ALL,
5108 		.range_start	= start,
5109 		.range_end	= end + 1,
5110 		/* We're called from an async helper function */
5111 		.punt_to_cgroup	= 1,
5112 		.no_cgroup_owner = 1,
5113 	};
5114 
5115 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5116 	nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5117 		   PAGE_SHIFT;
5118 	wbc_writepages.nr_to_write = nr_pages * 2;
5119 
5120 	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
5121 	while (cur <= end) {
5122 		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5123 
5124 		page = find_get_page(mapping, cur >> PAGE_SHIFT);
5125 		/*
5126 		 * All pages in the range are locked since
5127 		 * btrfs_run_delalloc_range(), thus there is no way to clear
5128 		 * the page dirty flag.
5129 		 */
5130 		ASSERT(PageLocked(page));
5131 		ASSERT(PageDirty(page));
5132 		clear_page_dirty_for_io(page);
5133 		ret = __extent_writepage(page, &wbc_writepages, &epd);
5134 		ASSERT(ret <= 0);
5135 		if (ret < 0) {
5136 			found_error = true;
5137 			first_error = ret;
5138 		}
5139 		put_page(page);
5140 		cur = cur_end + 1;
5141 	}
5142 
5143 	submit_write_bio(&epd, found_error ? ret : 0);
5144 
5145 	wbc_detach_inode(&wbc_writepages);
5146 	if (found_error)
5147 		return first_error;
5148 	return ret;
5149 }
5150 
/*
 * Write back the dirty pages of @mapping as directed by @wbc.
 *
 * Builds a fresh write context, lets extent_write_cache_pages() walk the
 * pages, then passes the walk's result to submit_write_bio() together with
 * whatever bio is still pending.  The whole operation runs under the zoned
 * data relocation lock of the inode.
 *
 * Return: the value returned by extent_write_cache_pages().
 */
5151 int extent_writepages(struct address_space *mapping,
5152 		      struct writeback_control *wbc)
5153 {
5154 	struct inode *inode = mapping->host;
5155 	int ret = 0;
5156 	struct extent_page_data epd = {
5157 		.bio_ctrl = { 0 },
5158 		.extent_locked = 0,
		/* Only data-integrity writeback is flagged as sync IO */
5159 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
5160 	};
5161 
5162 	/*
5163 	 * Allow only a single thread to do the reloc work in zoned mode to
5164 	 * protect the write pointer updates.
5165 	 */
5166 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
5167 	ret = extent_write_cache_pages(mapping, wbc, &epd);
5168 	submit_write_bio(&epd, ret);
5169 	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
5170 	return ret;
5171 }
5172 
5173 void extent_readahead(struct readahead_control *rac)
5174 {
5175 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
5176 	struct page *pagepool[16];
5177 	struct extent_map *em_cached = NULL;
5178 	u64 prev_em_start = (u64)-1;
5179 	int nr;
5180 
5181 	while ((nr = readahead_page_batch(rac, pagepool))) {
5182 		u64 contig_start = readahead_pos(rac);
5183 		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
5184 
5185 		contiguous_readpages(pagepool, nr, contig_start, contig_end,
5186 				&em_cached, &bio_ctrl, &prev_em_start);
5187 	}
5188 
5189 	if (em_cached)
5190 		free_extent_map(em_cached);
5191 	submit_one_bio(&bio_ctrl);
5192 }
5193 
5194 /*
5195  * basic invalidate_folio code, this waits on any locked or writeback
5196  * ranges corresponding to the folio, and then deletes any extent state
5197  * records from the tree
5198  */
5199 int extent_invalidate_folio(struct extent_io_tree *tree,
5200 			  struct folio *folio, size_t offset)
5201 {
5202 	struct extent_state *cached_state = NULL;
5203 	u64 start = folio_pos(folio);
5204 	u64 end = start + folio_size(folio) - 1;
5205 	size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
5206 
5207 	/* This function is only called for the btree inode */
5208 	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5209 
5210 	start += ALIGN(offset, blocksize);
5211 	if (start > end)
5212 		return 0;
5213 
5214 	lock_extent_bits(tree, start, end, &cached_state);
5215 	folio_wait_writeback(folio);
5216 
5217 	/*
5218 	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5219 	 * so here we only need to unlock the extent range to free any
5220 	 * existing extent state.
5221 	 */
5222 	unlock_extent_cached(tree, start, end, &cached_state);
5223 	return 0;
5224 }
5225 
5226 /*
5227  * a helper for release_folio, this tests for areas of the page that
5228  * are locked or under IO and drops the related state bits if it is safe
5229  * to drop the page.
5230  */
5231 static int try_release_extent_state(struct extent_io_tree *tree,
5232 				    struct page *page, gfp_t mask)
5233 {
5234 	u64 start = page_offset(page);
5235 	u64 end = start + PAGE_SIZE - 1;
5236 	int ret = 1;
5237 
5238 	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
5239 		ret = 0;
5240 	} else {
5241 		/*
5242 		 * At this point we can safely clear everything except the
5243 		 * locked bit, the nodatasum bit and the delalloc new bit.
5244 		 * The delalloc new bit will be cleared by ordered extent
5245 		 * completion.
5246 		 */
5247 		ret = __clear_extent_bit(tree, start, end,
5248 			 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5249 			 0, 0, NULL, mask, NULL);
5250 
5251 		/* if clear_extent_bit failed for enomem reasons,
5252 		 * we can't allow the release to continue.
5253 		 */
5254 		if (ret < 0)
5255 			ret = 0;
5256 		else
5257 			ret = 1;
5258 	}
5259 	return ret;
5260 }
5261 
5262 /*
5263  * a helper for release_folio.  As long as there are no locked extents
5264  * in the range corresponding to the page, both state records and extent
5265  * map records are removed
 *
 * Extent maps are only dropped when @mask allows blocking and the file is
 * larger than 16M; in every case the final verdict comes from
 * try_release_extent_state().
 *
 * Return: 1 if the page can be released, 0 otherwise.
5266  */
5267 int try_release_extent_mapping(struct page *page, gfp_t mask)
5268 {
5269 	struct extent_map *em;
5270 	u64 start = page_offset(page);
5271 	u64 end = start + PAGE_SIZE - 1;
5272 	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5273 	struct extent_io_tree *tree = &btrfs_inode->io_tree;
5274 	struct extent_map_tree *map = &btrfs_inode->extent_tree;
5275 
5276 	if (gfpflags_allow_blocking(mask) &&
5277 	    page->mapping->host->i_size > SZ_16M) {
5278 		u64 len;
		/* Walk the extent maps covering the page, one per iteration */
5279 		while (start <= end) {
5280 			struct btrfs_fs_info *fs_info;
5281 			u64 cur_gen;
5282 
5283 			len = end - start + 1;
5284 			write_lock(&map->lock);
5285 			em = lookup_extent_mapping(map, start, len);
5286 			if (!em) {
5287 				write_unlock(&map->lock);
5288 				break;
5289 			}
			/*
			 * Stop at a pinned extent map or at one that does not
			 * begin exactly at the current position.
			 */
5290 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5291 			    em->start != start) {
5292 				write_unlock(&map->lock);
5293 				free_extent_map(em);
5294 				break;
5295 			}
			/* Leave extent maps alone while their range is locked */
5296 			if (test_range_bit(tree, em->start,
5297 					   extent_map_end(em) - 1,
5298 					   EXTENT_LOCKED, 0, NULL))
5299 				goto next;
5300 			/*
5301 			 * If it's not in the list of modified extents, used
5302 			 * by a fast fsync, we can remove it. If it's being
5303 			 * logged we can safely remove it since fsync took an
5304 			 * extra reference on the em.
5305 			 */
5306 			if (list_empty(&em->list) ||
5307 			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5308 				goto remove_em;
5309 			/*
5310 			 * If it's in the list of modified extents, remove it
5311 			 * only if its generation is older then the current one,
5312 			 * in which case we don't need it for a fast fsync.
5313 			 * Otherwise don't remove it, we could be racing with an
5314 			 * ongoing fast fsync that could miss the new extent.
5315 			 */
5316 			fs_info = btrfs_inode->root->fs_info;
5317 			spin_lock(&fs_info->trans_lock);
5318 			cur_gen = fs_info->generation;
5319 			spin_unlock(&fs_info->trans_lock);
5320 			if (em->generation >= cur_gen)
5321 				goto next;
5322 remove_em:
5323 			/*
5324 			 * We only remove extent maps that are not in the list of
5325 			 * modified extents or that are in the list but with a
5326 			 * generation lower then the current generation, so there
5327 			 * is no need to set the full fsync flag on the inode (it
5328 			 * hurts the fsync performance for workloads with a data
5329 			 * size that exceeds or is close to the system's memory).
5330 			 */
5331 			remove_extent_mapping(map, em);
5332 			/* once for the rb tree */
5333 			free_extent_map(em);
5334 next:
			/*
			 * @em is still safe to read here: the reference taken
			 * by the lookup is only dropped below.
			 */
5335 			start = extent_map_end(em);
5336 			write_unlock(&map->lock);
5337 
5338 			/* once for us */
5339 			free_extent_map(em);
5340 
5341 			cond_resched(); /* Allow large-extent preemption. */
5342 		}
5343 	}
5344 	return try_release_extent_state(tree, page, mask);
5345 }
5346 
5347 /*
5348  * helper function for fiemap, which doesn't want to see any holes.
5349  * This maps until we find something past 'last'
5350  */
5351 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
5352 						u64 offset, u64 last)
5353 {
5354 	u64 sectorsize = btrfs_inode_sectorsize(inode);
5355 	struct extent_map *em;
5356 	u64 len;
5357 
5358 	if (offset >= last)
5359 		return NULL;
5360 
5361 	while (1) {
5362 		len = last - offset;
5363 		if (len == 0)
5364 			break;
5365 		len = ALIGN(len, sectorsize);
5366 		em = btrfs_get_extent_fiemap(inode, offset, len);
5367 		if (IS_ERR(em))
5368 			return em;
5369 
5370 		/* if this isn't a hole return it */
5371 		if (em->block_start != EXTENT_MAP_HOLE)
5372 			return em;
5373 
5374 		/* this is a hole, advance to the next extent */
5375 		offset = extent_map_end(em);
5376 		free_extent_map(em);
5377 		if (offset >= last)
5378 			break;
5379 	}
5380 	return NULL;
5381 }
5382 
5383 /*
5384  * Cache for the previous fiemap extent.
5385  *
5386  * The cached extent is held back so that the next extent can be merged into
 * it before it is passed to fiemap_fill_next_extent().
5387  */
5388 struct fiemap_cache {
5389 	u64 offset;	/* logical start of the cached extent */
5390 	u64 phys;	/* physical start of the cached extent */
5391 	u64 len;	/* length of the cached extent */
5392 	u32 flags;	/* FIEMAP_EXTENT_* flags of the cached extent */
5393 	bool cached;	/* true while an extent is held and not yet emitted */
5394 };
5395 
5396 /*
5397  * Helper to submit fiemap extent.
5398  *
5399  * Will try to merge current fiemap extent specified by @offset, @phys,
5400  * @len and @flags with cached one.
5401  * And only when we fails to merge, cached one will be submitted as
5402  * fiemap extent.
5403  *
5404  * Return value is the same as fiemap_fill_next_extent().
5405  */
5406 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5407 				struct fiemap_cache *cache,
5408 				u64 offset, u64 phys, u64 len, u32 flags)
5409 {
5410 	int ret = 0;
5411 
5412 	if (!cache->cached)
5413 		goto assign;
5414 
5415 	/*
5416 	 * Sanity check, extent_fiemap() should have ensured that new
5417 	 * fiemap extent won't overlap with cached one.
5418 	 * Not recoverable.
5419 	 *
5420 	 * NOTE: Physical address can overlap, due to compression
5421 	 */
5422 	if (cache->offset + cache->len > offset) {
5423 		WARN_ON(1);
5424 		return -EINVAL;
5425 	}
5426 
5427 	/*
5428 	 * Only merges fiemap extents if
5429 	 * 1) Their logical addresses are continuous
5430 	 *
5431 	 * 2) Their physical addresses are continuous
5432 	 *    So truly compressed (physical size smaller than logical size)
5433 	 *    extents won't get merged with each other
5434 	 *
5435 	 * 3) Share same flags except FIEMAP_EXTENT_LAST
5436 	 *    So regular extent won't get merged with prealloc extent
5437 	 */
5438 	if (cache->offset + cache->len  == offset &&
5439 	    cache->phys + cache->len == phys  &&
5440 	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5441 			(flags & ~FIEMAP_EXTENT_LAST)) {
5442 		cache->len += len;
5443 		cache->flags |= flags;
5444 		goto try_submit_last;
5445 	}
5446 
5447 	/* Not mergeable, need to submit cached one */
5448 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5449 				      cache->len, cache->flags);
5450 	cache->cached = false;
5451 	if (ret)
5452 		return ret;
5453 assign:
5454 	cache->cached = true;
5455 	cache->offset = offset;
5456 	cache->phys = phys;
5457 	cache->len = len;
5458 	cache->flags = flags;
5459 try_submit_last:
5460 	if (cache->flags & FIEMAP_EXTENT_LAST) {
5461 		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5462 				cache->phys, cache->len, cache->flags);
5463 		cache->cached = false;
5464 	}
5465 	return ret;
5466 }
5467 
5468 /*
5469  * Emit last fiemap cache
5470  *
5471  * The last fiemap cache may still be cached in the following case:
5472  * 0		      4k		    8k
5473  * |<- Fiemap range ->|
5474  * |<------------  First extent ----------->|
5475  *
5476  * In this case, the first extent range will be cached but not emitted.
5477  * So we must emit it before ending extent_fiemap().
5478  */
5479 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
5480 				  struct fiemap_cache *cache)
5481 {
5482 	int ret;
5483 
5484 	if (!cache->cached)
5485 		return 0;
5486 
5487 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5488 				      cache->len, cache->flags);
5489 	cache->cached = false;
5490 	if (ret > 0)
5491 		ret = 0;
5492 	return ret;
5493 }
5494 
/*
 * Report the extents of @inode that overlap [@start, @start + @len) through
 * @fieinfo.
 *
 * The function first determines where the last file extent item starts, locks
 * the (sector aligned) range in the inode's io tree and then walks the extent
 * maps with get_extent_skip_holes(), translating each one into fiemap flags
 * and passing it to emit_fiemap_extent(), which merges adjacent extents.
 *
 * Return: 0 on success, -EINVAL if @len is 0, -ENOMEM if the path or one of
 * the ulists cannot be allocated, or a negative error from the file extent
 * lookup, the extent map lookup, btrfs_check_shared() or the emit helpers.
 */
5495 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
5496 		  u64 start, u64 len)
5497 {
5498 	int ret = 0;
5499 	u64 off;
5500 	u64 max = start + len;
5501 	u32 flags = 0;
5502 	u32 found_type;
5503 	u64 last;
5504 	u64 last_for_get_extent = 0;
5505 	u64 disko = 0;
5506 	u64 isize = i_size_read(&inode->vfs_inode);
5507 	struct btrfs_key found_key;
5508 	struct extent_map *em = NULL;
5509 	struct extent_state *cached_state = NULL;
5510 	struct btrfs_path *path;
5511 	struct btrfs_root *root = inode->root;
5512 	struct fiemap_cache cache = { 0 };
5513 	struct ulist *roots;
5514 	struct ulist *tmp_ulist;
5515 	int end = 0;
5516 	u64 em_start = 0;
5517 	u64 em_len = 0;
5518 	u64 em_end = 0;
5519 
5520 	if (len == 0)
5521 		return -EINVAL;
5522 
5523 	path = btrfs_alloc_path();
5524 	if (!path)
5525 		return -ENOMEM;
5526 
	/* Scratch lists handed to btrfs_check_shared() for every extent */
5527 	roots = ulist_alloc(GFP_KERNEL);
5528 	tmp_ulist = ulist_alloc(GFP_KERNEL);
5529 	if (!roots || !tmp_ulist) {
5530 		ret = -ENOMEM;
5531 		goto out_free_ulist;
5532 	}
5533 
5534 	/*
5535 	 * We can't initialize that to 'start' as this could miss extents due
5536 	 * to extent item merging
5537 	 */
5538 	off = 0;
	/* Expand the requested range to sector boundaries */
5539 	start = round_down(start, btrfs_inode_sectorsize(inode));
5540 	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
5541 
5542 	/*
5543 	 * lookup the last file extent.  We're not using i_size here
5544 	 * because there might be preallocation past i_size
5545 	 */
5546 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5547 				       0);
5548 	if (ret < 0) {
5549 		goto out_free_ulist;
5550 	} else {
		/* An exact match for offset (u64)-1 is not expected */
5551 		WARN_ON(!ret);
5552 		if (ret == 1)
5553 			ret = 0;
5554 	}
5555 
	/* Step back to the item in front of the position the search returned */
5556 	path->slots[0]--;
5557 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5558 	found_type = found_key.type;
5559 
5560 	/* No extents, but there might be delalloc bits */
5561 	if (found_key.objectid != btrfs_ino(inode) ||
5562 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5563 		/* have to trust i_size as the end */
5564 		last = (u64)-1;
5565 		last_for_get_extent = isize;
5566 	} else {
5567 		/*
5568 		 * remember the start of the last extent.  There are a
5569 		 * bunch of different factors that go into the length of the
5570 		 * extent, so its much less complex to remember where it started
5571 		 */
5572 		last = found_key.offset;
5573 		last_for_get_extent = last + 1;
5574 	}
5575 	btrfs_release_path(path);
5576 
5577 	/*
5578 	 * we might have some extents allocated but more delalloc past those
5579 	 * extents.  so, we trust isize unless the start of the last extent is
5580 	 * beyond isize
5581 	 */
5582 	if (last < isize) {
5583 		last = (u64)-1;
5584 		last_for_get_extent = isize;
5585 	}
5586 
5587 	lock_extent_bits(&inode->io_tree, start, start + len - 1,
5588 			 &cached_state);
5589 
	/*
	 * Fetch the first non-hole extent.  Both exits below go to 'out', so
	 * nothing is emitted and @em (NULL or an error pointer) is not freed.
	 */
5590 	em = get_extent_skip_holes(inode, start, last_for_get_extent);
5591 	if (!em)
5592 		goto out;
5593 	if (IS_ERR(em)) {
5594 		ret = PTR_ERR(em);
5595 		goto out;
5596 	}
5597 
5598 	while (!end) {
5599 		u64 offset_in_extent = 0;
5600 
5601 		/* break if the extent we found is outside the range */
5602 		if (em->start >= max || extent_map_end(em) < off)
5603 			break;
5604 
5605 		/*
5606 		 * get_extent may return an extent that starts before our
5607 		 * requested range.  We have to make sure the ranges
5608 		 * we return to fiemap always move forward and don't
5609 		 * overlap, so adjust the offsets here
5610 		 */
5611 		em_start = max(em->start, off);
5612 
5613 		/*
5614 		 * record the offset from the start of the extent
5615 		 * for adjusting the disk offset below.  Only do this if the
5616 		 * extent isn't compressed since our in ram offset may be past
5617 		 * what we have actually allocated on disk.
5618 		 */
5619 		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5620 			offset_in_extent = em_start - em->start;
5621 		em_end = extent_map_end(em);
5622 		em_len = em_end - em_start;
5623 		flags = 0;
		/* Special block_start values have no disk address to report */
5624 		if (em->block_start < EXTENT_MAP_LAST_BYTE)
5625 			disko = em->block_start + offset_in_extent;
5626 		else
5627 			disko = 0;
5628 
5629 		/*
5630 		 * bump off for our next call to get_extent
5631 		 */
5632 		off = extent_map_end(em);
5633 		if (off >= max)
5634 			end = 1;
5635 
		/* Translate the kind of extent map into fiemap flags */
5636 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
5637 			end = 1;
5638 			flags |= FIEMAP_EXTENT_LAST;
5639 		} else if (em->block_start == EXTENT_MAP_INLINE) {
5640 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
5641 				  FIEMAP_EXTENT_NOT_ALIGNED);
5642 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
5643 			flags |= (FIEMAP_EXTENT_DELALLOC |
5644 				  FIEMAP_EXTENT_UNKNOWN);
5645 		} else if (fieinfo->fi_extents_max) {
5646 			u64 bytenr = em->block_start -
5647 				(em->start - em->orig_start);
5648 
5649 			/*
5650 			 * As btrfs supports shared space, this information
5651 			 * can be exported to userspace tools via
5652 			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
5653 			 * then we're just getting a count and we can skip the
5654 			 * lookup stuff.
5655 			 */
5656 			ret = btrfs_check_shared(root, btrfs_ino(inode),
5657 						 bytenr, roots, tmp_ulist);
5658 			if (ret < 0)
5659 				goto out_free;
5660 			if (ret)
5661 				flags |= FIEMAP_EXTENT_SHARED;
5662 			ret = 0;
5663 		}
5664 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5665 			flags |= FIEMAP_EXTENT_ENCODED;
5666 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5667 			flags |= FIEMAP_EXTENT_UNWRITTEN;
5668 
		/*
		 * Everything needed from this extent map has been copied into
		 * locals; drop it before looking up the next one.
		 */
5669 		free_extent_map(em);
5670 		em = NULL;
5671 		if ((em_start >= last) || em_len == (u64)-1 ||
5672 		   (last == (u64)-1 && isize <= em_end)) {
5673 			flags |= FIEMAP_EXTENT_LAST;
5674 			end = 1;
5675 		}
5676 
5677 		/* now scan forward to see if this is really the last extent. */
5678 		em = get_extent_skip_holes(inode, off, last_for_get_extent);
5679 		if (IS_ERR(em)) {
5680 			ret = PTR_ERR(em);
			/* @em is an error pointer, so skip the free at out_free */
5681 			goto out;
5682 		}
5683 		if (!em) {
5684 			flags |= FIEMAP_EXTENT_LAST;
5685 			end = 1;
5686 		}
5687 		ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5688 					   em_len, flags);
5689 		if (ret) {
			/*
			 * A return of 1 is not a failure (presumably it means
			 * no further extents can be taken - confirm against
			 * fiemap_fill_next_extent()); stop and report success.
			 */
5690 			if (ret == 1)
5691 				ret = 0;
5692 			goto out_free;
5693 		}
5694 	}
5695 out_free:
	/* On success push out the extent that may still sit in the cache */
5696 	if (!ret)
5697 		ret = emit_last_fiemap_cache(fieinfo, &cache);
	/* @em may be NULL at this point */
5698 	free_extent_map(em);
5699 out:
5700 	unlock_extent_cached(&inode->io_tree, start, start + len - 1,
5701 			     &cached_state);
5702 
5703 out_free_ulist:
5704 	btrfs_free_path(path);
5705 	ulist_free(roots);
5706 	ulist_free(tmp_ulist);
5707 	return ret;
5708 }
5709 
/*
 * Return the memory of @eb to extent_buffer_cache.
 *
 * Only the structure itself is freed: pages are not released and the eb is
 * not removed from the leak debug list here.
 */
5710 static void __free_extent_buffer(struct extent_buffer *eb)
5711 {
5712 	kmem_cache_free(extent_buffer_cache, eb);
5713 }
5714 
5715 int extent_buffer_under_io(const struct extent_buffer *eb)
5716 {
5717 	return (atomic_read(&eb->io_pages) ||
5718 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5719 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5720 }
5721 
5722 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
5723 {
5724 	struct btrfs_subpage *subpage;
5725 
5726 	lockdep_assert_held(&page->mapping->private_lock);
5727 
5728 	if (PagePrivate(page)) {
5729 		subpage = (struct btrfs_subpage *)page->private;
5730 		if (atomic_read(&subpage->eb_refs))
5731 			return true;
5732 		/*
5733 		 * Even there is no eb refs here, we may still have
5734 		 * end_page_read() call relying on page::private.
5735 		 */
5736 		if (atomic_read(&subpage->readers))
5737 			return true;
5738 	}
5739 	return false;
5740 }
5741 
/*
 * Break the link between @eb and @page that is kept in the page's private
 * data.
 *
 * Three cases are handled:
 * - nodesize >= PAGE_SIZE: the private data is detached only if it still
 *   points at @eb;
 * - subpage, eb flagged UNMAPPED: the subpage data is detached directly;
 * - subpage, mapped eb: the eb reference count kept in the subpage data is
 *   dropped and the subpage data is detached once page_range_has_eb() reports
 *   no remaining user.
 *
 * For a mapped eb the work is done under page->mapping->private_lock.  The
 * page reference itself is not dropped here.
 */
5742 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5743 {
5744 	struct btrfs_fs_info *fs_info = eb->fs_info;
5745 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5746 
5747 	/*
5748 	 * For mapped eb, we're going to change the page private, which should
5749 	 * be done under the private_lock.
5750 	 */
5751 	if (mapped)
5752 		spin_lock(&page->mapping->private_lock);
5753 
	/* Nothing is attached to the page, so there is nothing to detach */
5754 	if (!PagePrivate(page)) {
5755 		if (mapped)
5756 			spin_unlock(&page->mapping->private_lock);
5757 		return;
5758 	}
5759 
5760 	if (fs_info->nodesize >= PAGE_SIZE) {
5761 		/*
5762 		 * We do this since we'll remove the pages after we've
5763 		 * removed the eb from the radix tree, so we could race
5764 		 * and have this page now attached to the new eb.  So
5765 		 * only clear page_private if it's still connected to
5766 		 * this eb.
5767 		 */
5768 		if (PagePrivate(page) &&
5769 		    page->private == (unsigned long)eb) {
5770 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5771 			BUG_ON(PageDirty(page));
5772 			BUG_ON(PageWriteback(page));
5773 			/*
5774 			 * We need to make sure we haven't be attached
5775 			 * to a new eb.
5776 			 */
5777 			detach_page_private(page);
5778 		}
5779 		if (mapped)
5780 			spin_unlock(&page->mapping->private_lock);
5781 		return;
5782 	}
5783 
5784 	/*
5785 	 * For subpage, we can have dummy eb with page private.  In this case,
5786 	 * we can directly detach the private as such page is only attached to
5787 	 * one dummy eb, no sharing.
5788 	 */
5789 	if (!mapped) {
		/* The lock was never taken for an unmapped eb */
5790 		btrfs_detach_subpage(fs_info, page);
5791 		return;
5792 	}
5793 
5794 	btrfs_page_dec_eb_refs(fs_info, page);
5795 
5796 	/*
5797 	 * We can only detach the page private if there are no other ebs in the
5798 	 * page range and no unfinished IO.
5799 	 */
5800 	if (!page_range_has_eb(fs_info, page))
5801 		btrfs_detach_subpage(fs_info, page);
5802 
5803 	spin_unlock(&page->mapping->private_lock);
5804 }
5805 
5806 /* Release all pages attached to the extent buffer */
5807 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5808 {
5809 	int i;
5810 	int num_pages;
5811 
5812 	ASSERT(!extent_buffer_under_io(eb));
5813 
5814 	num_pages = num_extent_pages(eb);
5815 	for (i = 0; i < num_pages; i++) {
5816 		struct page *page = eb->pages[i];
5817 
5818 		if (!page)
5819 			continue;
5820 
5821 		detach_extent_buffer_page(eb, page);
5822 
5823 		/* One for when we allocated the page */
5824 		put_page(page);
5825 	}
5826 }
5827 
5828 /*
5829  * Helper for releasing the extent buffer.
 *
 * Releases the pages of @eb, removes it from the leak debug list it was put
 * on by __alloc_extent_buffer() and finally frees the structure.
5830  */
5831 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5832 {
5833 	btrfs_release_extent_buffer_pages(eb);
5834 	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5835 	__free_extent_buffer(eb);
5836 }
5837 
/*
 * Allocate and initialise a bare extent buffer for the range
 * [@start, @start + @len).
 *
 * The new eb starts with one reference, no IO pages, cleared flags and no
 * pages attached, and it is added to fs_info->allocated_ebs for leak
 * debugging.  The allocation uses __GFP_NOFAIL and its result is used without
 * a NULL check.
 */
5838 static struct extent_buffer *
5839 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5840 		      unsigned long len)
5841 {
5842 	struct extent_buffer *eb = NULL;
5843 
5844 	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5845 	eb->start = start;
5846 	eb->len = len;
5847 	eb->fs_info = fs_info;
5848 	eb->bflags = 0;
5849 	init_rwsem(&eb->lock);
5850 
	/* Undone by btrfs_leak_debug_del() when the eb is released */
5851 	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5852 			     &fs_info->allocated_ebs);
5853 	INIT_LIST_HEAD(&eb->release_list);
5854 
5855 	spin_lock_init(&eb->refs_lock);
	/* The caller owns the initial reference */
5856 	atomic_set(&eb->refs, 1);
5857 	atomic_set(&eb->io_pages, 0);
5858 
5859 	ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5860 
5861 	return eb;
5862 }
5863 
5864 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5865 {
5866 	int i;
5867 	struct extent_buffer *new;
5868 	int num_pages = num_extent_pages(src);
5869 	int ret;
5870 
5871 	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5872 	if (new == NULL)
5873 		return NULL;
5874 
5875 	/*
5876 	 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5877 	 * btrfs_release_extent_buffer() have different behavior for
5878 	 * UNMAPPED subpage extent buffer.
5879 	 */
5880 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5881 
5882 	memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5883 	ret = btrfs_alloc_page_array(num_pages, new->pages);
5884 	if (ret) {
5885 		btrfs_release_extent_buffer(new);
5886 		return NULL;
5887 	}
5888 
5889 	for (i = 0; i < num_pages; i++) {
5890 		int ret;
5891 		struct page *p = new->pages[i];
5892 
5893 		ret = attach_extent_buffer_page(new, p, NULL);
5894 		if (ret < 0) {
5895 			btrfs_release_extent_buffer(new);
5896 			return NULL;
5897 		}
5898 		WARN_ON(PageDirty(p));
5899 		copy_page(page_address(p), page_address(src->pages[i]));
5900 	}
5901 	set_extent_buffer_uptodate(new);
5902 
5903 	return new;
5904 }
5905 
5906 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5907 						  u64 start, unsigned long len)
5908 {
5909 	struct extent_buffer *eb;
5910 	int num_pages;
5911 	int i;
5912 	int ret;
5913 
5914 	eb = __alloc_extent_buffer(fs_info, start, len);
5915 	if (!eb)
5916 		return NULL;
5917 
5918 	num_pages = num_extent_pages(eb);
5919 	ret = btrfs_alloc_page_array(num_pages, eb->pages);
5920 	if (ret)
5921 		goto err;
5922 
5923 	for (i = 0; i < num_pages; i++) {
5924 		struct page *p = eb->pages[i];
5925 
5926 		ret = attach_extent_buffer_page(eb, p, NULL);
5927 		if (ret < 0)
5928 			goto err;
5929 	}
5930 
5931 	set_extent_buffer_uptodate(eb);
5932 	btrfs_set_header_nritems(eb, 0);
5933 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5934 
5935 	return eb;
5936 err:
5937 	for (i = 0; i < num_pages; i++) {
5938 		if (eb->pages[i]) {
5939 			detach_extent_buffer_page(eb, eb->pages[i]);
5940 			__free_page(eb->pages[i]);
5941 		}
5942 	}
5943 	__free_extent_buffer(eb);
5944 	return NULL;
5945 }
5946 
/*
 * Allocate a dummy extent buffer of fs_info->nodesize bytes starting at
 * @start.  Returns NULL on failure, see __alloc_dummy_extent_buffer().
 */
5947 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5948 						u64 start)
5949 {
5950 	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5951 }
5952 
/*
 * Make sure @eb carries the TREE_REF flag together with the reference that
 * belongs to it.
 *
 * If the flag is already set and at least two references exist nothing is
 * done.  Otherwise the flag is set under eb->refs_lock and, if it was not set
 * before, eb->refs is incremented.
 */
5953 static void check_buffer_tree_ref(struct extent_buffer *eb)
5954 {
5955 	int refs;
5956 	/*
5957 	 * The TREE_REF bit is first set when the extent_buffer is added
5958 	 * to the radix tree. It is also reset, if unset, when a new reference
5959 	 * is created by find_extent_buffer.
5960 	 *
5961 	 * It is only cleared in two cases: freeing the last non-tree
5962 	 * reference to the extent_buffer when its STALE bit is set or
5963 	 * calling release_folio when the tree reference is the only reference.
5964 	 *
5965 	 * In both cases, care is taken to ensure that the extent_buffer's
5966 	 * pages are not under io. However, release_folio can be concurrently
5967 	 * called with creating new references, which is prone to race
5968 	 * conditions between the calls to check_buffer_tree_ref in those
5969 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
5970 	 *
5971 	 * The actual lifetime of the extent_buffer in the radix tree is
5972 	 * adequately protected by the refcount, but the TREE_REF bit and
5973 	 * its corresponding reference are not. To protect against this
5974 	 * class of races, we call check_buffer_tree_ref from the codepaths
5975 	 * which trigger io after they set eb->io_pages. Note that once io is
5976 	 * initiated, TREE_REF can no longer be cleared, so that is the
5977 	 * moment at which any such race is best fixed.
5978 	 */
	/* Lockless fast path: flag set and somebody besides the tree holds a ref */
5979 	refs = atomic_read(&eb->refs);
5980 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5981 		return;
5982 
	/* Slow path: set the flag and account its reference under the lock */
5983 	spin_lock(&eb->refs_lock);
5984 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
5985 		atomic_inc(&eb->refs);
5986 	spin_unlock(&eb->refs_lock);
5987 }
5988 
5989 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
5990 		struct page *accessed)
5991 {
5992 	int num_pages, i;
5993 
5994 	check_buffer_tree_ref(eb);
5995 
5996 	num_pages = num_extent_pages(eb);
5997 	for (i = 0; i < num_pages; i++) {
5998 		struct page *p = eb->pages[i];
5999 
6000 		if (p != accessed)
6001 			mark_page_accessed(p);
6002 	}
6003 }
6004 
6005 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6006 					 u64 start)
6007 {
6008 	struct extent_buffer *eb;
6009 
6010 	eb = find_extent_buffer_nolock(fs_info, start);
6011 	if (!eb)
6012 		return NULL;
6013 	/*
6014 	 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6015 	 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6016 	 * another task running free_extent_buffer() might have seen that flag
6017 	 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6018 	 * writeback flags not set) and it's still in the tree (flag
6019 	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6020 	 * decrementing the extent buffer's reference count twice.  So here we
6021 	 * could race and increment the eb's reference count, clear its stale
6022 	 * flag, mark it as dirty and drop our reference before the other task
6023 	 * finishes executing free_extent_buffer, which would later result in
6024 	 * an attempt to free an extent buffer that is dirty.
6025 	 */
6026 	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6027 		spin_lock(&eb->refs_lock);
6028 		spin_unlock(&eb->refs_lock);
6029 	}
6030 	mark_extent_buffer_accessed(eb, NULL);
6031 	return eb;
6032 }
6033 
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: return the extent buffer at @start, creating a dummy
 * one and inserting it into fs_info->buffer_radix if none exists yet.
 *
 * Return the buffer (either the pre-existing one or the new one), or an
 * ERR_PTR: -ENOMEM if the dummy buffer cannot be allocated, or the error
 * reported by radix_tree_preload().
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					u64 start)
{
	struct extent_buffer *eb;
	struct extent_buffer *found;
	int err;

	found = find_extent_buffer(fs_info, start);
	if (found)
		return found;

	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;

	for (;;) {
		err = radix_tree_preload(GFP_NOFS);
		if (err) {
			found = ERR_PTR(err);
			goto free_eb;
		}

		spin_lock(&fs_info->buffer_lock);
		err = radix_tree_insert(&fs_info->buffer_radix,
					start >> fs_info->sectorsize_bits, eb);
		spin_unlock(&fs_info->buffer_lock);
		radix_tree_preload_end();

		if (err != -EEXIST)
			break;

		/*
		 * The slot is taken.  Use the buffer that is there if we can
		 * find it, otherwise try the insertion again.
		 */
		found = find_extent_buffer(fs_info, start);
		if (found)
			goto free_eb;
	}

	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	return eb;

free_eb:
	btrfs_release_extent_buffer(eb);
	return found;
}
#endif
6075 
6076 static struct extent_buffer *grab_extent_buffer(
6077 		struct btrfs_fs_info *fs_info, struct page *page)
6078 {
6079 	struct extent_buffer *exists;
6080 
6081 	/*
6082 	 * For subpage case, we completely rely on radix tree to ensure we
6083 	 * don't try to insert two ebs for the same bytenr.  So here we always
6084 	 * return NULL and just continue.
6085 	 */
6086 	if (fs_info->nodesize < PAGE_SIZE)
6087 		return NULL;
6088 
6089 	/* Page not yet attached to an extent buffer */
6090 	if (!PagePrivate(page))
6091 		return NULL;
6092 
6093 	/*
6094 	 * We could have already allocated an eb for this page and attached one
6095 	 * so lets see if we can get a ref on the existing eb, and if we can we
6096 	 * know it's good and we can just return that one, else we know we can
6097 	 * just overwrite page->private.
6098 	 */
6099 	exists = (struct extent_buffer *)page->private;
6100 	if (atomic_inc_not_zero(&exists->refs))
6101 		return exists;
6102 
6103 	WARN_ON(PageDirty(page));
6104 	detach_page_private(page);
6105 	return NULL;
6106 }
6107 
6108 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6109 {
6110 	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6111 		btrfs_err(fs_info, "bad tree block start %llu", start);
6112 		return -EINVAL;
6113 	}
6114 
6115 	if (fs_info->nodesize < PAGE_SIZE &&
6116 	    offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6117 		btrfs_err(fs_info,
6118 		"tree block crosses page boundary, start %llu nodesize %u",
6119 			  start, fs_info->nodesize);
6120 		return -EINVAL;
6121 	}
6122 	if (fs_info->nodesize >= PAGE_SIZE &&
6123 	    !PAGE_ALIGNED(start)) {
6124 		btrfs_err(fs_info,
6125 		"tree block is not page aligned, start %llu nodesize %u",
6126 			  start, fs_info->nodesize);
6127 		return -EINVAL;
6128 	}
6129 	return 0;
6130 }
6131 
/*
 * Find or create the extent buffer for the tree block at logical address
 * @start, backed by pages of the btree inode's mapping.
 *
 * @fs_info:    filesystem the tree block belongs to
 * @start:      logical address of the tree block
 * @owner_root: passed to btrfs_set_buffer_lockdep_class() for the new buffer
 * @level:      passed to btrfs_set_buffer_lockdep_class() for the new buffer
 *
 * Outline:
 * 1. Validate @start and return an already existing buffer if
 *    find_extent_buffer() has one.
 * 2. Allocate a new buffer and, page by page, look up or create the backing
 *    page and attach the buffer to it.  If a page already has a live extent
 *    buffer attached, that buffer is returned instead and the new one is
 *    released.
 * 3. Insert the new buffer into fs_info->buffer_radix.  If another buffer is
 *    already present at that index, return that one instead.
 * 4. Only after insertion are the pages unlocked.
 *
 * Return the extent buffer, or an ERR_PTR: -EINVAL for a misaligned @start,
 * -EOVERFLOW on 32bit when @start is beyond the page cache limit, -ENOMEM on
 * allocation failure, or the error from btrfs_alloc_subpage() or
 * radix_tree_preload().
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	unsigned long len = fs_info->nodesize;
	int num_pages;
	int i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	int uptodate = 1;
	int ret;

	if (check_eb_alignment(fs_info, start))
		return ERR_PTR(-EINVAL);

#if BITS_PER_LONG == 32
	if (start >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
		"extent buffer %llu is beyond 32bit page cache limit", start);
		btrfs_err_32bit_limit(fs_info);
		return ERR_PTR(-EOVERFLOW);
	}
	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
#endif

	/* Fast path: the buffer already exists. */
	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	btrfs_set_buffer_lockdep_class(owner_root, eb, level);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++, index++) {
		struct btrfs_subpage *prealloc = NULL;

		/*
		 * NOTE(review): with __GFP_NOFAIL this is not expected to
		 * return NULL, so the check below looks purely defensive —
		 * confirm.
		 */
		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		/*
		 * Preallocate page->private for subpage case, so that we won't
		 * allocate memory with private_lock hold.  The memory will be
		 * freed by attach_extent_buffer_page() or freed manually if
		 * we exit earlier.
		 *
		 * Although we have ensured one subpage eb can only have one
		 * page, but it may change in the future for 16K page size
		 * support, so we still preallocate the memory in the loop.
		 */
		if (fs_info->nodesize < PAGE_SIZE) {
			prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
			if (IS_ERR(prealloc)) {
				ret = PTR_ERR(prealloc);
				/*
				 * This page is not recorded in eb->pages[] yet,
				 * so the free_eb path will not unlock it; do it
				 * here.
				 */
				unlock_page(p);
				put_page(p);
				exists = ERR_PTR(ret);
				goto free_eb;
			}
		}

		spin_lock(&mapping->private_lock);
		exists = grab_extent_buffer(fs_info, p);
		if (exists) {
			/* A live eb already owns this page: use it instead. */
			spin_unlock(&mapping->private_lock);
			unlock_page(p);
			put_page(p);
			mark_extent_buffer_accessed(exists, p);
			btrfs_free_subpage(prealloc);
			goto free_eb;
		}
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_page(eb, p, prealloc);
		ASSERT(!ret);
		/*
		 * To inform we have extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the page private
		 * when the eb hasn't yet been inserted into radix tree.
		 *
		 * The ref will be decreased when the eb released the page, in
		 * detach_extent_buffer_page().
		 * Thus needs no special handling in error path.
		 */
		btrfs_page_inc_eb_refs(fs_info, p);
		spin_unlock(&mapping->private_lock);

		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_release_folio which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 */
	}
	/* The eb is uptodate only if every backing page already was. */
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/*
		 * The slot is occupied.  Return the buffer that is there if it
		 * can be found, otherwise retry the insertion.
		 */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * Now it's safe to unlock the pages because any calls to
	 * btree_release_folio will correctly detect that a page belongs to a
	 * live buffer and won't free them prematurely.
	 */
	for (i = 0; i < num_pages; i++)
		unlock_page(eb->pages[i]);
	return eb;

free_eb:
	/*
	 * Drop the reference on the new eb (warning if it was not the last
	 * one), unlock the pages recorded so far, release the eb, and return
	 * @exists, which is either the buffer to use instead or an ERR_PTR.
	 */
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}
6282 
6283 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6284 {
6285 	struct extent_buffer *eb =
6286 			container_of(head, struct extent_buffer, rcu_head);
6287 
6288 	__free_extent_buffer(eb);
6289 }
6290 
/*
 * Drop one reference on @eb and free it if that was the last one.
 *
 * Must be called with eb->refs_lock held; the lock is released on every path
 * before returning.
 *
 * Return 1 if the last reference was dropped and the buffer was freed (or
 * queued for freeing via RCU), 0 if other references remain.
 */
static int release_extent_buffer(struct extent_buffer *eb)
	__releases(&eb->refs_lock)
{
	lockdep_assert_held(&eb->refs_lock);

	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;

			/* refs_lock is dropped before buffer_lock is taken. */
			spin_unlock(&eb->refs_lock);

			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> fs_info->sectorsize_bits);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
		/* Buffers flagged UNMAPPED are freed directly, not via RCU. */
		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
			__free_extent_buffer(eb);
			return 1;
		}
#endif
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}
6327 
/*
 * Drop a reference on @eb.  A NULL @eb is a no-op.
 *
 * Fast path: while the refcount is above the slow-path threshold, the
 * reference is dropped locklessly with a cmpxchg and we return.
 *
 * Slow path, taken when refs <= 3 for a buffer without
 * EXTENT_BUFFER_UNMAPPED, or refs == 1 for a buffer with it: under
 * eb->refs_lock, additionally drop the tree reference if the buffer is
 * STALE, has exactly two references and is not under io, then hand over to
 * release_extent_buffer(), which drops our reference and releases the lock.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
			refs == 1))
			break;
		/* Retry if the refcount changed under us. */
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	release_extent_buffer(eb);
}
6359 
6360 void free_extent_buffer_stale(struct extent_buffer *eb)
6361 {
6362 	if (!eb)
6363 		return;
6364 
6365 	spin_lock(&eb->refs_lock);
6366 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6367 
6368 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
6369 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6370 		atomic_dec(&eb->refs);
6371 	release_extent_buffer(eb);
6372 }
6373 
6374 static void btree_clear_page_dirty(struct page *page)
6375 {
6376 	ASSERT(PageDirty(page));
6377 	ASSERT(PageLocked(page));
6378 	clear_page_dirty_for_io(page);
6379 	xa_lock_irq(&page->mapping->i_pages);
6380 	if (!PageDirty(page))
6381 		__xa_clear_mark(&page->mapping->i_pages,
6382 				page_index(page), PAGECACHE_TAG_DIRTY);
6383 	xa_unlock_irq(&page->mapping->i_pages);
6384 }
6385 
6386 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6387 {
6388 	struct btrfs_fs_info *fs_info = eb->fs_info;
6389 	struct page *page = eb->pages[0];
6390 	bool last;
6391 
6392 	/* btree_clear_page_dirty() needs page locked */
6393 	lock_page(page);
6394 	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6395 						  eb->len);
6396 	if (last)
6397 		btree_clear_page_dirty(page);
6398 	unlock_page(page);
6399 	WARN_ON(atomic_read(&eb->refs) == 0);
6400 }
6401 
6402 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
6403 {
6404 	int i;
6405 	int num_pages;
6406 	struct page *page;
6407 
6408 	if (eb->fs_info->nodesize < PAGE_SIZE)
6409 		return clear_subpage_extent_buffer_dirty(eb);
6410 
6411 	num_pages = num_extent_pages(eb);
6412 
6413 	for (i = 0; i < num_pages; i++) {
6414 		page = eb->pages[i];
6415 		if (!PageDirty(page))
6416 			continue;
6417 		lock_page(page);
6418 		btree_clear_page_dirty(page);
6419 		ClearPageError(page);
6420 		unlock_page(page);
6421 	}
6422 	WARN_ON(atomic_read(&eb->refs) == 0);
6423 }
6424 
6425 bool set_extent_buffer_dirty(struct extent_buffer *eb)
6426 {
6427 	int i;
6428 	int num_pages;
6429 	bool was_dirty;
6430 
6431 	check_buffer_tree_ref(eb);
6432 
6433 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
6434 
6435 	num_pages = num_extent_pages(eb);
6436 	WARN_ON(atomic_read(&eb->refs) == 0);
6437 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6438 
6439 	if (!was_dirty) {
6440 		bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
6441 
6442 		/*
6443 		 * For subpage case, we can have other extent buffers in the
6444 		 * same page, and in clear_subpage_extent_buffer_dirty() we
6445 		 * have to clear page dirty without subpage lock held.
6446 		 * This can cause race where our page gets dirty cleared after
6447 		 * we just set it.
6448 		 *
6449 		 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6450 		 * its page for other reasons, we can use page lock to prevent
6451 		 * the above race.
6452 		 */
6453 		if (subpage)
6454 			lock_page(eb->pages[0]);
6455 		for (i = 0; i < num_pages; i++)
6456 			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6457 					     eb->start, eb->len);
6458 		if (subpage)
6459 			unlock_page(eb->pages[0]);
6460 	}
6461 #ifdef CONFIG_BTRFS_DEBUG
6462 	for (i = 0; i < num_pages; i++)
6463 		ASSERT(PageDirty(eb->pages[i]));
6464 #endif
6465 
6466 	return was_dirty;
6467 }
6468 
6469 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
6470 {
6471 	struct btrfs_fs_info *fs_info = eb->fs_info;
6472 	struct page *page;
6473 	int num_pages;
6474 	int i;
6475 
6476 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6477 	num_pages = num_extent_pages(eb);
6478 	for (i = 0; i < num_pages; i++) {
6479 		page = eb->pages[i];
6480 		if (!page)
6481 			continue;
6482 
6483 		/*
6484 		 * This is special handling for metadata subpage, as regular
6485 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6486 		 */
6487 		if (fs_info->nodesize >= PAGE_SIZE)
6488 			ClearPageUptodate(page);
6489 		else
6490 			btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6491 						     eb->len);
6492 	}
6493 }
6494 
6495 void set_extent_buffer_uptodate(struct extent_buffer *eb)
6496 {
6497 	struct btrfs_fs_info *fs_info = eb->fs_info;
6498 	struct page *page;
6499 	int num_pages;
6500 	int i;
6501 
6502 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6503 	num_pages = num_extent_pages(eb);
6504 	for (i = 0; i < num_pages; i++) {
6505 		page = eb->pages[i];
6506 
6507 		/*
6508 		 * This is special handling for metadata subpage, as regular
6509 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6510 		 */
6511 		if (fs_info->nodesize >= PAGE_SIZE)
6512 			SetPageUptodate(page);
6513 		else
6514 			btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6515 						   eb->len);
6516 	}
6517 }
6518 
/*
 * Read a subpage extent buffer, i.e. one that lives in part of a single page
 * (eb->pages[0]).
 *
 * @eb:         extent buffer to read; must not be EXTENT_BUFFER_UNMAPPED
 * @wait:       WAIT_NONE only tries to lock the extent range and gives up
 *              with -EAGAIN if that fails; any other value blocks on the
 *              range lock.  WAIT_COMPLETE additionally waits for the read to
 *              finish.
 * @mirror_num: mirror number handed to the bio control structure
 *
 * Return 0 on success (for WAIT_COMPLETE: the eb is uptodate afterwards),
 * -EAGAIN as described above, -EIO if a completed read left the eb not
 * uptodate, or the error from lock_extent() or submit_extent_page().
 */
static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	struct page *page = eb->pages[0];
	struct btrfs_bio_ctrl bio_ctrl = {
		.mirror_num = mirror_num,
	};
	int ret = 0;

	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
	ASSERT(PagePrivate(page));
	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;

	if (wait == WAIT_NONE) {
		if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
			return -EAGAIN;
	} else {
		ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		if (ret < 0)
			return ret;
	}

	ret = 0;
	/* Nothing to read if the eb, the page, or the eb's range is uptodate. */
	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
	    PageUptodate(page) ||
	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		return ret;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	atomic_set(&eb->io_pages, 1);
	/* Called after io_pages is set, as check_buffer_tree_ref() expects. */
	check_buffer_tree_ref(eb);
	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);

	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
				 page, eb->start, eb->len,
				 eb->start - page_offset(page),
				 end_bio_extent_readpage, 0, true);
	if (ret) {
		/*
		 * In the endio function, if we hit something wrong we will
		 * increase the io_pages, so here we need to decrease it for
		 * error path.
		 */
		atomic_dec(&eb->io_pages);
	}
	submit_one_bio(&bio_ctrl);
	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* WAIT_COMPLETE: block until EXTENT_LOCKED is cleared on the range. */
	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		ret = -EIO;
	return ret;
}
6580 
/*
 * Read the pages of @eb that are not uptodate.
 *
 * @eb:         extent buffer to read
 * @wait:       WAIT_NONE uses trylock on the pages and silently gives up
 *              (returning 0) if a page cannot be locked; other values block
 *              on the page locks.  WAIT_COMPLETE additionally waits for all
 *              pages to be unlocked after the reads were submitted.
 * @mirror_num: mirror number handed to the bio control structure
 *
 * Subpage buffers (nodesize < PAGE_SIZE) are delegated to
 * read_extent_buffer_subpage().
 *
 * Return 0 on success or when nothing had to be done, -EIO if the buffer has
 * EXTENT_BUFFER_WRITE_ERR set or a page is still not uptodate after a
 * WAIT_COMPLETE read, or the first error from submit_extent_page().
 */
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
	int i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int num_pages;
	unsigned long num_reads = 0;
	struct btrfs_bio_ctrl bio_ctrl = {
		.mirror_num = mirror_num,
	};

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	/*
	 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
	 * operation, which could potentially still be in flight.  In this case
	 * we simply want to return an error.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
		return -EIO;

	if (eb->fs_info->nodesize < PAGE_SIZE)
		return read_extent_buffer_subpage(eb, wait, mirror_num);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			/*
			 * WAIT_NONE is only utilized by readahead. If we can't
			 * acquire the lock atomically it means either the eb
			 * is being read out or under modification.
			 * Either way the eb will be or has been cached,
			 * readahead can exit safely.
			 */
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		/* Counted so unlock_exit knows how many pages to unlock. */
		locked_pages++;
	}
	/*
	 * We need to firstly lock all pages to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	/* One pending io per page that still has to be read. */
	atomic_set(&eb->io_pages, num_reads);
	/*
	 * It is possible for release_folio to clear the TREE_REF bit before we
	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
	 */
	check_buffer_tree_ref(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				/*
				 * An earlier submission failed: do not submit
				 * this page, just undo its accounting and
				 * unlock it.
				 */
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = submit_extent_page(REQ_OP_READ, NULL,
					 &bio_ctrl, page, page_offset(page),
					 PAGE_SIZE, 0, end_bio_extent_readpage,
					 0, false);
			if (err) {
				/*
				 * We failed to submit the bio so it's the
				 * caller's responsibility to perform cleanup
				 * i.e unlock page/set error bit.
				 */
				ret = err;
				SetPageError(page);
				unlock_page(page);
				atomic_dec(&eb->io_pages);
			}
		} else {
			/* Already uptodate, no read needed for this page. */
			unlock_page(page);
		}
	}

	submit_one_bio(&bio_ctrl);

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* WAIT_COMPLETE: wait for each page to be unlocked, then check it. */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	/* Unlock, in reverse order, exactly the pages that were locked above. */
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}
6706 
/*
 * Report an out-of-range access to @eb.
 *
 * Logs a warning with the eb's bytenr and length and the offending
 * eb-relative @start and @len, and additionally triggers WARN_ON() when
 * CONFIG_BTRFS_DEBUG is enabled.
 *
 * Always returns true, so a range check can return this call's result
 * directly.
 */
static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
			    unsigned long len)
{
	btrfs_warn(eb->fs_info,
		"access to eb bytenr %llu len %lu out of range start %lu len %lu",
		eb->start, eb->len, start, len);
	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));

	return true;
}
6717 
6718 /*
6719  * Check if the [start, start + len) range is valid before reading/writing
6720  * the eb.
6721  * NOTE: @start and @len are offset inside the eb, not logical address.
6722  *
6723  * Caller should not touch the dst/src memory if this function returns error.
6724  */
6725 static inline int check_eb_range(const struct extent_buffer *eb,
6726 				 unsigned long start, unsigned long len)
6727 {
6728 	unsigned long offset;
6729 
6730 	/* start, start + len should not go beyond eb->len nor overflow */
6731 	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6732 		return report_eb_range(eb, start, len);
6733 
6734 	return false;
6735 }
6736 
6737 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6738 			unsigned long start, unsigned long len)
6739 {
6740 	size_t cur;
6741 	size_t offset;
6742 	struct page *page;
6743 	char *kaddr;
6744 	char *dst = (char *)dstv;
6745 	unsigned long i = get_eb_page_index(start);
6746 
6747 	if (check_eb_range(eb, start, len))
6748 		return;
6749 
6750 	offset = get_eb_offset_in_page(eb, start);
6751 
6752 	while (len > 0) {
6753 		page = eb->pages[i];
6754 
6755 		cur = min(len, (PAGE_SIZE - offset));
6756 		kaddr = page_address(page);
6757 		memcpy(dst, kaddr + offset, cur);
6758 
6759 		dst += cur;
6760 		len -= cur;
6761 		offset = 0;
6762 		i++;
6763 	}
6764 }
6765 
6766 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6767 				       void __user *dstv,
6768 				       unsigned long start, unsigned long len)
6769 {
6770 	size_t cur;
6771 	size_t offset;
6772 	struct page *page;
6773 	char *kaddr;
6774 	char __user *dst = (char __user *)dstv;
6775 	unsigned long i = get_eb_page_index(start);
6776 	int ret = 0;
6777 
6778 	WARN_ON(start > eb->len);
6779 	WARN_ON(start + len > eb->start + eb->len);
6780 
6781 	offset = get_eb_offset_in_page(eb, start);
6782 
6783 	while (len > 0) {
6784 		page = eb->pages[i];
6785 
6786 		cur = min(len, (PAGE_SIZE - offset));
6787 		kaddr = page_address(page);
6788 		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
6789 			ret = -EFAULT;
6790 			break;
6791 		}
6792 
6793 		dst += cur;
6794 		len -= cur;
6795 		offset = 0;
6796 		i++;
6797 	}
6798 
6799 	return ret;
6800 }
6801 
6802 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6803 			 unsigned long start, unsigned long len)
6804 {
6805 	size_t cur;
6806 	size_t offset;
6807 	struct page *page;
6808 	char *kaddr;
6809 	char *ptr = (char *)ptrv;
6810 	unsigned long i = get_eb_page_index(start);
6811 	int ret = 0;
6812 
6813 	if (check_eb_range(eb, start, len))
6814 		return -EINVAL;
6815 
6816 	offset = get_eb_offset_in_page(eb, start);
6817 
6818 	while (len > 0) {
6819 		page = eb->pages[i];
6820 
6821 		cur = min(len, (PAGE_SIZE - offset));
6822 
6823 		kaddr = page_address(page);
6824 		ret = memcmp(ptr, kaddr + offset, cur);
6825 		if (ret)
6826 			break;
6827 
6828 		ptr += cur;
6829 		len -= cur;
6830 		offset = 0;
6831 		i++;
6832 	}
6833 	return ret;
6834 }
6835 
6836 /*
6837  * Check that the extent buffer is uptodate.
6838  *
6839  * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6840  * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6841  */
6842 static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6843 				    struct page *page)
6844 {
6845 	struct btrfs_fs_info *fs_info = eb->fs_info;
6846 
6847 	/*
6848 	 * If we are using the commit root we could potentially clear a page
6849 	 * Uptodate while we're using the extent buffer that we've previously
6850 	 * looked up.  We don't want to complain in this case, as the page was
6851 	 * valid before, we just didn't write it out.  Instead we want to catch
6852 	 * the case where we didn't actually read the block properly, which
6853 	 * would have !PageUptodate && !PageError, as we clear PageError before
6854 	 * reading.
6855 	 */
6856 	if (fs_info->nodesize < PAGE_SIZE) {
6857 		bool uptodate, error;
6858 
6859 		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6860 						       eb->start, eb->len);
6861 		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6862 		WARN_ON(!uptodate && !error);
6863 	} else {
6864 		WARN_ON(!PageUptodate(page) && !PageError(page));
6865 	}
6866 }
6867 
6868 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
6869 		const void *srcv)
6870 {
6871 	char *kaddr;
6872 
6873 	assert_eb_page_uptodate(eb, eb->pages[0]);
6874 	kaddr = page_address(eb->pages[0]) +
6875 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6876 						   chunk_tree_uuid));
6877 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6878 }
6879 
6880 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
6881 {
6882 	char *kaddr;
6883 
6884 	assert_eb_page_uptodate(eb, eb->pages[0]);
6885 	kaddr = page_address(eb->pages[0]) +
6886 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6887 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6888 }
6889 
6890 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
6891 			 unsigned long start, unsigned long len)
6892 {
6893 	size_t cur;
6894 	size_t offset;
6895 	struct page *page;
6896 	char *kaddr;
6897 	char *src = (char *)srcv;
6898 	unsigned long i = get_eb_page_index(start);
6899 
6900 	WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6901 
6902 	if (check_eb_range(eb, start, len))
6903 		return;
6904 
6905 	offset = get_eb_offset_in_page(eb, start);
6906 
6907 	while (len > 0) {
6908 		page = eb->pages[i];
6909 		assert_eb_page_uptodate(eb, page);
6910 
6911 		cur = min(len, PAGE_SIZE - offset);
6912 		kaddr = page_address(page);
6913 		memcpy(kaddr + offset, src, cur);
6914 
6915 		src += cur;
6916 		len -= cur;
6917 		offset = 0;
6918 		i++;
6919 	}
6920 }
6921 
6922 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
6923 		unsigned long len)
6924 {
6925 	size_t cur;
6926 	size_t offset;
6927 	struct page *page;
6928 	char *kaddr;
6929 	unsigned long i = get_eb_page_index(start);
6930 
6931 	if (check_eb_range(eb, start, len))
6932 		return;
6933 
6934 	offset = get_eb_offset_in_page(eb, start);
6935 
6936 	while (len > 0) {
6937 		page = eb->pages[i];
6938 		assert_eb_page_uptodate(eb, page);
6939 
6940 		cur = min(len, PAGE_SIZE - offset);
6941 		kaddr = page_address(page);
6942 		memset(kaddr + offset, 0, cur);
6943 
6944 		len -= cur;
6945 		offset = 0;
6946 		i++;
6947 	}
6948 }
6949 
6950 void copy_extent_buffer_full(const struct extent_buffer *dst,
6951 			     const struct extent_buffer *src)
6952 {
6953 	int i;
6954 	int num_pages;
6955 
6956 	ASSERT(dst->len == src->len);
6957 
6958 	if (dst->fs_info->nodesize >= PAGE_SIZE) {
6959 		num_pages = num_extent_pages(dst);
6960 		for (i = 0; i < num_pages; i++)
6961 			copy_page(page_address(dst->pages[i]),
6962 				  page_address(src->pages[i]));
6963 	} else {
6964 		size_t src_offset = get_eb_offset_in_page(src, 0);
6965 		size_t dst_offset = get_eb_offset_in_page(dst, 0);
6966 
6967 		ASSERT(src->fs_info->nodesize < PAGE_SIZE);
6968 		memcpy(page_address(dst->pages[0]) + dst_offset,
6969 		       page_address(src->pages[0]) + src_offset,
6970 		       src->len);
6971 	}
6972 }
6973 
6974 void copy_extent_buffer(const struct extent_buffer *dst,
6975 			const struct extent_buffer *src,
6976 			unsigned long dst_offset, unsigned long src_offset,
6977 			unsigned long len)
6978 {
6979 	u64 dst_len = dst->len;
6980 	size_t cur;
6981 	size_t offset;
6982 	struct page *page;
6983 	char *kaddr;
6984 	unsigned long i = get_eb_page_index(dst_offset);
6985 
6986 	if (check_eb_range(dst, dst_offset, len) ||
6987 	    check_eb_range(src, src_offset, len))
6988 		return;
6989 
6990 	WARN_ON(src->len != dst_len);
6991 
6992 	offset = get_eb_offset_in_page(dst, dst_offset);
6993 
6994 	while (len > 0) {
6995 		page = dst->pages[i];
6996 		assert_eb_page_uptodate(dst, page);
6997 
6998 		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
6999 
7000 		kaddr = page_address(page);
7001 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
7002 
7003 		src_offset += cur;
7004 		len -= cur;
7005 		offset = 0;
7006 		i++;
7007 	}
7008 }
7009 
7010 /*
7011  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
7012  * given bit number
7013  * @eb: the extent buffer
7014  * @start: offset of the bitmap item in the extent buffer
7015  * @nr: bit number
7016  * @page_index: return index of the page in the extent buffer that contains the
7017  * given bit number
7018  * @page_offset: return offset into the page given by page_index
7019  *
7020  * This helper hides the ugliness of finding the byte in an extent buffer which
7021  * contains a given bit.
7022  */
7023 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
7024 				    unsigned long start, unsigned long nr,
7025 				    unsigned long *page_index,
7026 				    size_t *page_offset)
7027 {
7028 	size_t byte_offset = BIT_BYTE(nr);
7029 	size_t offset;
7030 
7031 	/*
7032 	 * The byte we want is the offset of the extent buffer + the offset of
7033 	 * the bitmap item in the extent buffer + the offset of the byte in the
7034 	 * bitmap item.
7035 	 */
7036 	offset = start + offset_in_page(eb->start) + byte_offset;
7037 
7038 	*page_index = offset >> PAGE_SHIFT;
7039 	*page_offset = offset_in_page(offset);
7040 }
7041 
7042 /**
7043  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7044  * @eb: the extent buffer
7045  * @start: offset of the bitmap item in the extent buffer
7046  * @nr: bit number to test
7047  */
7048 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
7049 			   unsigned long nr)
7050 {
7051 	u8 *kaddr;
7052 	struct page *page;
7053 	unsigned long i;
7054 	size_t offset;
7055 
7056 	eb_bitmap_offset(eb, start, nr, &i, &offset);
7057 	page = eb->pages[i];
7058 	assert_eb_page_uptodate(eb, page);
7059 	kaddr = page_address(page);
7060 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7061 }
7062 
7063 /**
7064  * extent_buffer_bitmap_set - set an area of a bitmap
7065  * @eb: the extent buffer
7066  * @start: offset of the bitmap item in the extent buffer
7067  * @pos: bit number of the first bit
7068  * @len: number of bits to set
7069  */
7070 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
7071 			      unsigned long pos, unsigned long len)
7072 {
7073 	u8 *kaddr;
7074 	struct page *page;
7075 	unsigned long i;
7076 	size_t offset;
7077 	const unsigned int size = pos + len;
7078 	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7079 	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
7080 
7081 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7082 	page = eb->pages[i];
7083 	assert_eb_page_uptodate(eb, page);
7084 	kaddr = page_address(page);
7085 
7086 	while (len >= bits_to_set) {
7087 		kaddr[offset] |= mask_to_set;
7088 		len -= bits_to_set;
7089 		bits_to_set = BITS_PER_BYTE;
7090 		mask_to_set = ~0;
7091 		if (++offset >= PAGE_SIZE && len > 0) {
7092 			offset = 0;
7093 			page = eb->pages[++i];
7094 			assert_eb_page_uptodate(eb, page);
7095 			kaddr = page_address(page);
7096 		}
7097 	}
7098 	if (len) {
7099 		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7100 		kaddr[offset] |= mask_to_set;
7101 	}
7102 }
7103 
7104 
7105 /**
7106  * extent_buffer_bitmap_clear - clear an area of a bitmap
7107  * @eb: the extent buffer
7108  * @start: offset of the bitmap item in the extent buffer
7109  * @pos: bit number of the first bit
7110  * @len: number of bits to clear
7111  */
7112 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7113 				unsigned long start, unsigned long pos,
7114 				unsigned long len)
7115 {
7116 	u8 *kaddr;
7117 	struct page *page;
7118 	unsigned long i;
7119 	size_t offset;
7120 	const unsigned int size = pos + len;
7121 	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7122 	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
7123 
7124 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7125 	page = eb->pages[i];
7126 	assert_eb_page_uptodate(eb, page);
7127 	kaddr = page_address(page);
7128 
7129 	while (len >= bits_to_clear) {
7130 		kaddr[offset] &= ~mask_to_clear;
7131 		len -= bits_to_clear;
7132 		bits_to_clear = BITS_PER_BYTE;
7133 		mask_to_clear = ~0;
7134 		if (++offset >= PAGE_SIZE && len > 0) {
7135 			offset = 0;
7136 			page = eb->pages[++i];
7137 			assert_eb_page_uptodate(eb, page);
7138 			kaddr = page_address(page);
7139 		}
7140 	}
7141 	if (len) {
7142 		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7143 		kaddr[offset] &= ~mask_to_clear;
7144 	}
7145 }
7146 
/*
 * Return true when the distance between the start offsets @src and @dst is
 * smaller than @len, i.e. when two @len byte ranges starting there intersect.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	if (src > dst)
		return src - dst < len;

	return dst - src < len;
}
7152 
/*
 * Copy @len bytes from @src_page at @src_off to @dst_page at @dst_off.
 *
 * memmove() is used only when both pages are the same page and the two byte
 * ranges overlap; every other case goes through memcpy().
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	const bool same_page = (dst_page == src_page);
	char *dst_base = page_address(dst_page);
	/* Reuse the destination mapping when source and destination coincide. */
	char *src_base = same_page ? dst_base : page_address(src_page);
	const bool overlapping = same_page &&
				 areas_overlap(src_off, dst_off, len);

	if (overlapping)
		memmove(dst_base + dst_off, src_base + src_off, len);
	else
		memcpy(dst_base + dst_off, src_base + src_off, len);
}
7174 
7175 void memcpy_extent_buffer(const struct extent_buffer *dst,
7176 			  unsigned long dst_offset, unsigned long src_offset,
7177 			  unsigned long len)
7178 {
7179 	size_t cur;
7180 	size_t dst_off_in_page;
7181 	size_t src_off_in_page;
7182 	unsigned long dst_i;
7183 	unsigned long src_i;
7184 
7185 	if (check_eb_range(dst, dst_offset, len) ||
7186 	    check_eb_range(dst, src_offset, len))
7187 		return;
7188 
7189 	while (len > 0) {
7190 		dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7191 		src_off_in_page = get_eb_offset_in_page(dst, src_offset);
7192 
7193 		dst_i = get_eb_page_index(dst_offset);
7194 		src_i = get_eb_page_index(src_offset);
7195 
7196 		cur = min(len, (unsigned long)(PAGE_SIZE -
7197 					       src_off_in_page));
7198 		cur = min_t(unsigned long, cur,
7199 			(unsigned long)(PAGE_SIZE - dst_off_in_page));
7200 
7201 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7202 			   dst_off_in_page, src_off_in_page, cur);
7203 
7204 		src_offset += cur;
7205 		dst_offset += cur;
7206 		len -= cur;
7207 	}
7208 }
7209 
7210 void memmove_extent_buffer(const struct extent_buffer *dst,
7211 			   unsigned long dst_offset, unsigned long src_offset,
7212 			   unsigned long len)
7213 {
7214 	size_t cur;
7215 	size_t dst_off_in_page;
7216 	size_t src_off_in_page;
7217 	unsigned long dst_end = dst_offset + len - 1;
7218 	unsigned long src_end = src_offset + len - 1;
7219 	unsigned long dst_i;
7220 	unsigned long src_i;
7221 
7222 	if (check_eb_range(dst, dst_offset, len) ||
7223 	    check_eb_range(dst, src_offset, len))
7224 		return;
7225 	if (dst_offset < src_offset) {
7226 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7227 		return;
7228 	}
7229 	while (len > 0) {
7230 		dst_i = get_eb_page_index(dst_end);
7231 		src_i = get_eb_page_index(src_end);
7232 
7233 		dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7234 		src_off_in_page = get_eb_offset_in_page(dst, src_end);
7235 
7236 		cur = min_t(unsigned long, len, src_off_in_page + 1);
7237 		cur = min(cur, dst_off_in_page + 1);
7238 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7239 			   dst_off_in_page - cur + 1,
7240 			   src_off_in_page - cur + 1, cur);
7241 
7242 		dst_end -= cur;
7243 		src_end -= cur;
7244 		len -= cur;
7245 	}
7246 }
7247 
7248 #define GANG_LOOKUP_SIZE	16
7249 static struct extent_buffer *get_next_extent_buffer(
7250 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7251 {
7252 	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
7253 	struct extent_buffer *found = NULL;
7254 	u64 page_start = page_offset(page);
7255 	u64 cur = page_start;
7256 
7257 	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
7258 	lockdep_assert_held(&fs_info->buffer_lock);
7259 
7260 	while (cur < page_start + PAGE_SIZE) {
7261 		int ret;
7262 		int i;
7263 
7264 		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7265 				(void **)gang, cur >> fs_info->sectorsize_bits,
7266 				min_t(unsigned int, GANG_LOOKUP_SIZE,
7267 				      PAGE_SIZE / fs_info->nodesize));
7268 		if (ret == 0)
7269 			goto out;
7270 		for (i = 0; i < ret; i++) {
7271 			/* Already beyond page end */
7272 			if (gang[i]->start >= page_start + PAGE_SIZE)
7273 				goto out;
7274 			/* Found one */
7275 			if (gang[i]->start >= bytenr) {
7276 				found = gang[i];
7277 				goto out;
7278 			}
7279 		}
7280 		cur = gang[ret - 1]->start + gang[ret - 1]->len;
7281 	}
7282 out:
7283 	return found;
7284 }
7285 
7286 static int try_release_subpage_extent_buffer(struct page *page)
7287 {
7288 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7289 	u64 cur = page_offset(page);
7290 	const u64 end = page_offset(page) + PAGE_SIZE;
7291 	int ret;
7292 
7293 	while (cur < end) {
7294 		struct extent_buffer *eb = NULL;
7295 
7296 		/*
7297 		 * Unlike try_release_extent_buffer() which uses page->private
7298 		 * to grab buffer, for subpage case we rely on radix tree, thus
7299 		 * we need to ensure radix tree consistency.
7300 		 *
7301 		 * We also want an atomic snapshot of the radix tree, thus go
7302 		 * with spinlock rather than RCU.
7303 		 */
7304 		spin_lock(&fs_info->buffer_lock);
7305 		eb = get_next_extent_buffer(fs_info, page, cur);
7306 		if (!eb) {
7307 			/* No more eb in the page range after or at cur */
7308 			spin_unlock(&fs_info->buffer_lock);
7309 			break;
7310 		}
7311 		cur = eb->start + eb->len;
7312 
7313 		/*
7314 		 * The same as try_release_extent_buffer(), to ensure the eb
7315 		 * won't disappear out from under us.
7316 		 */
7317 		spin_lock(&eb->refs_lock);
7318 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7319 			spin_unlock(&eb->refs_lock);
7320 			spin_unlock(&fs_info->buffer_lock);
7321 			break;
7322 		}
7323 		spin_unlock(&fs_info->buffer_lock);
7324 
7325 		/*
7326 		 * If tree ref isn't set then we know the ref on this eb is a
7327 		 * real ref, so just return, this eb will likely be freed soon
7328 		 * anyway.
7329 		 */
7330 		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7331 			spin_unlock(&eb->refs_lock);
7332 			break;
7333 		}
7334 
7335 		/*
7336 		 * Here we don't care about the return value, we will always
7337 		 * check the page private at the end.  And
7338 		 * release_extent_buffer() will release the refs_lock.
7339 		 */
7340 		release_extent_buffer(eb);
7341 	}
7342 	/*
7343 	 * Finally to check if we have cleared page private, as if we have
7344 	 * released all ebs in the page, the page private should be cleared now.
7345 	 */
7346 	spin_lock(&page->mapping->private_lock);
7347 	if (!PagePrivate(page))
7348 		ret = 1;
7349 	else
7350 		ret = 0;
7351 	spin_unlock(&page->mapping->private_lock);
7352 	return ret;
7353 
7354 }
7355 
7356 int try_release_extent_buffer(struct page *page)
7357 {
7358 	struct extent_buffer *eb;
7359 
7360 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
7361 		return try_release_subpage_extent_buffer(page);
7362 
7363 	/*
7364 	 * We need to make sure nobody is changing page->private, as we rely on
7365 	 * page->private as the pointer to extent buffer.
7366 	 */
7367 	spin_lock(&page->mapping->private_lock);
7368 	if (!PagePrivate(page)) {
7369 		spin_unlock(&page->mapping->private_lock);
7370 		return 1;
7371 	}
7372 
7373 	eb = (struct extent_buffer *)page->private;
7374 	BUG_ON(!eb);
7375 
7376 	/*
7377 	 * This is a little awful but should be ok, we need to make sure that
7378 	 * the eb doesn't disappear out from under us while we're looking at
7379 	 * this page.
7380 	 */
7381 	spin_lock(&eb->refs_lock);
7382 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7383 		spin_unlock(&eb->refs_lock);
7384 		spin_unlock(&page->mapping->private_lock);
7385 		return 0;
7386 	}
7387 	spin_unlock(&page->mapping->private_lock);
7388 
7389 	/*
7390 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
7391 	 * so just return, this page will likely be freed soon anyway.
7392 	 */
7393 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7394 		spin_unlock(&eb->refs_lock);
7395 		return 0;
7396 	}
7397 
7398 	return release_extent_buffer(eb);
7399 }
7400 
7401 /*
7402  * btrfs_readahead_tree_block - attempt to readahead a child block
7403  * @fs_info:	the fs_info
7404  * @bytenr:	bytenr to read
7405  * @owner_root: objectid of the root that owns this eb
7406  * @gen:	generation for the uptodate check, can be 0
7407  * @level:	level for the eb
7408  *
7409  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
7410  * normal uptodate check of the eb, without checking the generation.  If we have
7411  * to read the block we will not block on anything.
7412  */
7413 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
7414 				u64 bytenr, u64 owner_root, u64 gen, int level)
7415 {
7416 	struct extent_buffer *eb;
7417 	int ret;
7418 
7419 	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
7420 	if (IS_ERR(eb))
7421 		return;
7422 
7423 	if (btrfs_buffer_uptodate(eb, gen, 1)) {
7424 		free_extent_buffer(eb);
7425 		return;
7426 	}
7427 
7428 	ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7429 	if (ret < 0)
7430 		free_extent_buffer_stale(eb);
7431 	else
7432 		free_extent_buffer(eb);
7433 }
7434 
7435 /*
7436  * btrfs_readahead_node_child - readahead a node's child block
7437  * @node:	parent node we're reading from
7438  * @slot:	slot in the parent node for the child we want to read
7439  *
7440  * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7441  * the slot in the node provided.
7442  */
7443 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7444 {
7445 	btrfs_readahead_tree_block(node->fs_info,
7446 				   btrfs_node_blockptr(node, slot),
7447 				   btrfs_header_owner(node),
7448 				   btrfs_node_ptr_generation(node, slot),
7449 				   btrfs_header_level(node) - 1);
7450 }
7451