xref: /openbmc/linux/fs/btrfs/extent_io.c (revision 0153682e)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/bitops.h>
4 #include <linux/slab.h>
5 #include <linux/bio.h>
6 #include <linux/mm.h>
7 #include <linux/pagemap.h>
8 #include <linux/page-flags.h>
9 #include <linux/sched/mm.h>
10 #include <linux/spinlock.h>
11 #include <linux/blkdev.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include <linux/prefetch.h>
16 #include <linux/fsverity.h>
17 #include "misc.h"
18 #include "extent_io.h"
19 #include "extent-io-tree.h"
20 #include "extent_map.h"
21 #include "ctree.h"
22 #include "btrfs_inode.h"
23 #include "volumes.h"
24 #include "check-integrity.h"
25 #include "locking.h"
26 #include "rcu-string.h"
27 #include "backref.h"
28 #include "disk-io.h"
29 #include "subpage.h"
30 #include "zoned.h"
31 #include "block-group.h"
32 #include "compression.h"
33 
34 static struct kmem_cache *extent_state_cache;
35 static struct kmem_cache *extent_buffer_cache;
36 static struct bio_set btrfs_bioset;
37 
38 static inline bool extent_state_in_tree(const struct extent_state *state)
39 {
40 	return !RB_EMPTY_NODE(&state->rb_node);
41 }
42 
43 #ifdef CONFIG_BTRFS_DEBUG
/* List head on which extent_state objects are tracked for leak reporting */
static LIST_HEAD(states);
/* Spinlock passed to the leak-debug add/del helpers when touching 'states' */
static DEFINE_SPINLOCK(leak_lock);
46 
47 static inline void btrfs_leak_debug_add(spinlock_t *lock,
48 					struct list_head *new,
49 					struct list_head *head)
50 {
51 	unsigned long flags;
52 
53 	spin_lock_irqsave(lock, flags);
54 	list_add(new, head);
55 	spin_unlock_irqrestore(lock, flags);
56 }
57 
58 static inline void btrfs_leak_debug_del(spinlock_t *lock,
59 					struct list_head *entry)
60 {
61 	unsigned long flags;
62 
63 	spin_lock_irqsave(lock, flags);
64 	list_del(entry);
65 	spin_unlock_irqrestore(lock, flags);
66 }
67 
68 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
69 {
70 	struct extent_buffer *eb;
71 	unsigned long flags;
72 
73 	/*
74 	 * If we didn't get into open_ctree our allocated_ebs will not be
75 	 * initialized, so just skip this.
76 	 */
77 	if (!fs_info->allocated_ebs.next)
78 		return;
79 
80 	WARN_ON(!list_empty(&fs_info->allocated_ebs));
81 	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
82 	while (!list_empty(&fs_info->allocated_ebs)) {
83 		eb = list_first_entry(&fs_info->allocated_ebs,
84 				      struct extent_buffer, leak_list);
85 		pr_err(
86 	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
87 		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
88 		       btrfs_header_owner(eb));
89 		list_del(&eb->leak_list);
90 		kmem_cache_free(extent_buffer_cache, eb);
91 	}
92 	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
93 }
94 
95 static inline void btrfs_extent_state_leak_debug_check(void)
96 {
97 	struct extent_state *state;
98 
99 	while (!list_empty(&states)) {
100 		state = list_entry(states.next, struct extent_state, leak_list);
101 		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
102 		       state->start, state->end, state->state,
103 		       extent_state_in_tree(state),
104 		       refcount_read(&state->refs));
105 		list_del(&state->leak_list);
106 		kmem_cache_free(extent_state_cache, state);
107 	}
108 }
109 
110 #define btrfs_debug_check_extent_io_range(tree, start, end)		\
111 	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
112 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
113 		struct extent_io_tree *tree, u64 start, u64 end)
114 {
115 	struct inode *inode = tree->private_data;
116 	u64 isize;
117 
118 	if (!inode || !is_data_inode(inode))
119 		return;
120 
121 	isize = i_size_read(inode);
122 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
123 		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
124 		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
125 			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
126 	}
127 }
128 #else
/*
 * Stub versions of the debug hooks: each one expands to an empty statement
 * and does not evaluate its arguments.
 */
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
133 #endif
134 
/*
 * Minimal view of a tree node used by the rb-tree walking code in this file:
 * an inclusive [start, end] byte range plus the rb_node linkage.
 *
 * NOTE(review): nodes located through this view are later converted with
 * rb_entry(..., struct extent_state, rb_node), so this presumably has to
 * mirror the leading fields of struct extent_state -- confirm before changing
 * the field order or types.
 */
struct tree_entry {
	u64 start;	/* first byte of the range (inclusive) */
	u64 end;	/* last byte of the range (inclusive) */
	struct rb_node rb_node;
};
140 
/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are left before the stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* The bio currently being built; NULL when nothing is pending */
	struct bio *bio;
	/* Mirror number handed to the submission helpers along with the bio */
	int mirror_num;
	/* Compression type passed along when submitting a data read bio */
	enum btrfs_compression_type compress_type;
	/* Bytes remaining before the stripe boundary */
	u32 len_to_stripe_boundary;
	/* Bytes remaining before the ordered extent boundary */
	u32 len_to_oe_boundary;
};
152 
/*
 * Context carried through the write path helpers: the bio control block plus
 * two flags that steer locking and submission.
 */
struct extent_page_data {
	struct btrfs_bio_ctrl bio_ctrl;
	/*
	 * Tells writepage not to lock the state bits for this range;
	 * it still does the unlocking.
	 */
	unsigned int extent_locked:1;

	/* Tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};
163 
164 static int add_extent_changeset(struct extent_state *state, u32 bits,
165 				 struct extent_changeset *changeset,
166 				 int set)
167 {
168 	int ret;
169 
170 	if (!changeset)
171 		return 0;
172 	if (set && (state->state & bits) == bits)
173 		return 0;
174 	if (!set && (state->state & bits) == 0)
175 		return 0;
176 	changeset->bytes_changed += state->end - state->start + 1;
177 	ret = ulist_add(&changeset->range_changed, state->start, state->end,
178 			GFP_ATOMIC);
179 	return ret;
180 }
181 
182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
183 {
184 	struct bio *bio;
185 	struct bio_vec *bv;
186 	struct inode *inode;
187 	int mirror_num;
188 
189 	if (!bio_ctrl->bio)
190 		return;
191 
192 	bio = bio_ctrl->bio;
193 	bv = bio_first_bvec_all(bio);
194 	inode = bv->bv_page->mapping->host;
195 	mirror_num = bio_ctrl->mirror_num;
196 
197 	/* Caller should ensure the bio has at least some range added */
198 	ASSERT(bio->bi_iter.bi_size);
199 
200 	btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
201 
202 	if (!is_data_inode(inode))
203 		btrfs_submit_metadata_bio(inode, bio, mirror_num);
204 	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
205 		btrfs_submit_data_write_bio(inode, bio, mirror_num);
206 	else
207 		btrfs_submit_data_read_bio(inode, bio, mirror_num,
208 					   bio_ctrl->compress_type);
209 
210 	/* The bio is owned by the bi_end_io handler now */
211 	bio_ctrl->bio = NULL;
212 }
213 
214 /*
215  * Submit or fail the current bio in an extent_page_data structure.
216  */
217 static void submit_write_bio(struct extent_page_data *epd, int ret)
218 {
219 	struct bio *bio = epd->bio_ctrl.bio;
220 
221 	if (!bio)
222 		return;
223 
224 	if (ret) {
225 		ASSERT(ret < 0);
226 		bio->bi_status = errno_to_blk_status(ret);
227 		bio_endio(bio);
228 		/* The bio is owned by the bi_end_io handler now */
229 		epd->bio_ctrl.bio = NULL;
230 	} else {
231 		submit_one_bio(&epd->bio_ctrl);
232 	}
233 }
234 
235 int __init extent_state_cache_init(void)
236 {
237 	extent_state_cache = kmem_cache_create("btrfs_extent_state",
238 			sizeof(struct extent_state), 0,
239 			SLAB_MEM_SPREAD, NULL);
240 	if (!extent_state_cache)
241 		return -ENOMEM;
242 	return 0;
243 }
244 
245 int __init extent_io_init(void)
246 {
247 	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
248 			sizeof(struct extent_buffer), 0,
249 			SLAB_MEM_SPREAD, NULL);
250 	if (!extent_buffer_cache)
251 		return -ENOMEM;
252 
253 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
254 			offsetof(struct btrfs_bio, bio),
255 			BIOSET_NEED_BVECS))
256 		goto free_buffer_cache;
257 
258 	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
259 		goto free_bioset;
260 
261 	return 0;
262 
263 free_bioset:
264 	bioset_exit(&btrfs_bioset);
265 
266 free_buffer_cache:
267 	kmem_cache_destroy(extent_buffer_cache);
268 	extent_buffer_cache = NULL;
269 	return -ENOMEM;
270 }
271 
/*
 * Tear down the extent_state slab cache.  The leak check runs first so that
 * any remaining states are handled before the cache is destroyed.
 */
void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}
277 
/*
 * Release the extent buffer slab cache and the btrfs bio set.
 */
void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}
288 
/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because for our io_tree we
 * hold the tree lock and then take the inode lock when setting delalloc.
 * These two things are unrelated, so give the file_extent_tree its own lock
 * class so the two locking patterns don't get mixed up.
 */
static struct lock_class_key file_extent_tree_class;
297 
298 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
299 			 struct extent_io_tree *tree, unsigned int owner,
300 			 void *private_data)
301 {
302 	tree->fs_info = fs_info;
303 	tree->state = RB_ROOT;
304 	tree->dirty_bytes = 0;
305 	spin_lock_init(&tree->lock);
306 	tree->private_data = private_data;
307 	tree->owner = owner;
308 	if (owner == IO_TREE_INODE_FILE_EXTENT)
309 		lockdep_set_class(&tree->lock, &file_extent_tree_class);
310 }
311 
312 void extent_io_tree_release(struct extent_io_tree *tree)
313 {
314 	spin_lock(&tree->lock);
315 	/*
316 	 * Do a single barrier for the waitqueue_active check here, the state
317 	 * of the waitqueue should not change once extent_io_tree_release is
318 	 * called.
319 	 */
320 	smp_mb();
321 	while (!RB_EMPTY_ROOT(&tree->state)) {
322 		struct rb_node *node;
323 		struct extent_state *state;
324 
325 		node = rb_first(&tree->state);
326 		state = rb_entry(node, struct extent_state, rb_node);
327 		rb_erase(&state->rb_node, &tree->state);
328 		RB_CLEAR_NODE(&state->rb_node);
329 		/*
330 		 * btree io trees aren't supposed to have tasks waiting for
331 		 * changes in the flags of extent states ever.
332 		 */
333 		ASSERT(!waitqueue_active(&state->wq));
334 		free_extent_state(state);
335 
336 		cond_resched_lock(&tree->lock);
337 	}
338 	spin_unlock(&tree->lock);
339 }
340 
341 static struct extent_state *alloc_extent_state(gfp_t mask)
342 {
343 	struct extent_state *state;
344 
345 	/*
346 	 * The given mask might be not appropriate for the slab allocator,
347 	 * drop the unsupported bits
348 	 */
349 	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
350 	state = kmem_cache_alloc(extent_state_cache, mask);
351 	if (!state)
352 		return state;
353 	state->state = 0;
354 	state->failrec = NULL;
355 	RB_CLEAR_NODE(&state->rb_node);
356 	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
357 	refcount_set(&state->refs, 1);
358 	init_waitqueue_head(&state->wq);
359 	trace_alloc_extent_state(state, mask, _RET_IP_);
360 	return state;
361 }
362 
363 void free_extent_state(struct extent_state *state)
364 {
365 	if (!state)
366 		return;
367 	if (refcount_dec_and_test(&state->refs)) {
368 		WARN_ON(extent_state_in_tree(state));
369 		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
370 		trace_free_extent_state(state, _RET_IP_);
371 		kmem_cache_free(extent_state_cache, state);
372 	}
373 }
374 
/**
 * Search @tree for an entry that contains @offset. Such entry would have
 * entry->start <= offset && entry->end >= offset.
 *
 * @tree:       the tree to search
 * @offset:     offset that should fall within an entry in @tree
 * @node_ret:   pointer where new node should be anchored (used when inserting an
 *	        entry in the tree); may be NULL if the caller doesn't need it
 * @parent_ret: points to entry which would have been the parent of the entry,
 *               containing @offset; may be NULL if the caller doesn't need it
 *
 * Return a pointer to the entry that contains @offset byte address and don't change
 * @node_ret and @parent_ret.
 *
 * If no such entry exists, fill @node_ret and @parent_ret (when non-NULL) and
 * return the first entry that ends at or after @offset, i.e. the next entry
 * past @offset. Return NULL when there is no such entry, which includes the
 * case of an empty tree.
 */
static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
					             u64 offset,
						     struct rb_node ***node_ret,
						     struct rb_node **parent_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct rb_node *prev = NULL;
	struct tree_entry *entry;

	/* Standard descent; 'prev' tracks the last node visited */
	while (*node) {
		prev = *node;
		entry = rb_entry(prev, struct tree_entry, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return *node;
	}

	/* Not found: 'node' is the empty slot and 'prev' its would-be parent */
	if (node_ret)
		*node_ret = node;
	if (parent_ret)
		*parent_ret = prev;

	/*
	 * Search neighbors until we find the first one past the end.
	 * 'entry' is only read while 'prev' is non-NULL, so it is never
	 * dereferenced for an empty tree or after running off the end.
	 */
	while (prev && offset > entry->end) {
		prev = rb_next(prev);
		entry = rb_entry(prev, struct tree_entry, rb_node);
	}

	return prev;
}
427 
428 /*
429  * Inexact rb-tree search, return the next entry if @offset is not found
430  */
431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
432 {
433 	return tree_search_for_insert(tree, offset, NULL, NULL);
434 }
435 
/**
 * Search offset in the tree or fill neighbor rbtree node pointers.
 *
 * @tree:      the tree to search
 * @offset:    offset that should fall within an entry in @tree
 * @prev_ret:  set to the closest entry whose range begins before @offset, or
 *             NULL if there is none; must not be NULL itself
 * @next_ret:  set to the first entry whose range ends after @offset, or NULL
 *             if there is none; must not be NULL itself
 *
 * Return a pointer to the entry that contains @offset byte address. If no
 * such entry exists, then return NULL and fill @prev_ret and @next_ret.
 * Otherwise return the found entry and other pointers are left untouched.
 */
static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
					     u64 offset,
					     struct rb_node **prev_ret,
					     struct rb_node **next_ret)
{
	struct rb_root *root = &tree->state;
	struct rb_node **node = &root->rb_node;
	struct rb_node *prev = NULL;
	struct rb_node *orig_prev = NULL;
	struct tree_entry *entry;

	ASSERT(prev_ret);
	ASSERT(next_ret);

	/* Standard descent; 'prev' tracks the last node visited */
	while (*node) {
		prev = *node;
		entry = rb_entry(prev, struct tree_entry, rb_node);

		if (offset < entry->start)
			node = &(*node)->rb_left;
		else if (offset > entry->end)
			node = &(*node)->rb_right;
		else
			return *node;
	}

	/* Walk forward from the last visited node to find the next entry */
	orig_prev = prev;
	while (prev && offset > entry->end) {
		prev = rb_next(prev);
		entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*next_ret = prev;
	prev = orig_prev;

	/*
	 * Walk backward from the same starting node to find the previous
	 * entry. 'entry' is only read while 'prev' is non-NULL.
	 */
	entry = rb_entry(prev, struct tree_entry, rb_node);
	while (prev && offset < entry->start) {
		prev = rb_prev(prev);
		entry = rb_entry(prev, struct tree_entry, rb_node);
	}
	*prev_ret = prev;

	return NULL;
}
491 
492 /*
493  * utility function to look for merge candidates inside a given range.
494  * Any extents with matching state are merged together into a single
495  * extent in the tree.  Extents with EXTENT_IO in their state field
496  * are not merged because the end_io handlers need to be able to do
497  * operations on them without sleeping (or doing allocations/splits).
498  *
499  * This should be called with the tree lock held.
500  */
501 static void merge_state(struct extent_io_tree *tree,
502 		        struct extent_state *state)
503 {
504 	struct extent_state *other;
505 	struct rb_node *other_node;
506 
507 	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
508 		return;
509 
510 	other_node = rb_prev(&state->rb_node);
511 	if (other_node) {
512 		other = rb_entry(other_node, struct extent_state, rb_node);
513 		if (other->end == state->start - 1 &&
514 		    other->state == state->state) {
515 			if (tree->private_data &&
516 			    is_data_inode(tree->private_data))
517 				btrfs_merge_delalloc_extent(tree->private_data,
518 							    state, other);
519 			state->start = other->start;
520 			rb_erase(&other->rb_node, &tree->state);
521 			RB_CLEAR_NODE(&other->rb_node);
522 			free_extent_state(other);
523 		}
524 	}
525 	other_node = rb_next(&state->rb_node);
526 	if (other_node) {
527 		other = rb_entry(other_node, struct extent_state, rb_node);
528 		if (other->start == state->end + 1 &&
529 		    other->state == state->state) {
530 			if (tree->private_data &&
531 			    is_data_inode(tree->private_data))
532 				btrfs_merge_delalloc_extent(tree->private_data,
533 							    state, other);
534 			state->end = other->end;
535 			rb_erase(&other->rb_node, &tree->state);
536 			RB_CLEAR_NODE(&other->rb_node);
537 			free_extent_state(other);
538 		}
539 	}
540 }
541 
/*
 * Forward declaration: set_state_bits() is defined further down but is
 * already needed by insert_state() and insert_state_fast().
 */
static void set_state_bits(struct extent_io_tree *tree,
			   struct extent_state *state, u32 bits,
			   struct extent_changeset *changeset);
545 
546 /*
547  * insert an extent_state struct into the tree.  'bits' are set on the
548  * struct before it is inserted.
549  *
550  * This may return -EEXIST if the extent is already there, in which case the
551  * state struct is freed.
552  *
553  * The tree lock is not taken internally.  This is a utility function and
554  * probably isn't what you want to call (see set/clear_extent_bit).
555  */
556 static int insert_state(struct extent_io_tree *tree,
557 			struct extent_state *state,
558 			u32 bits, struct extent_changeset *changeset)
559 {
560 	struct rb_node **node;
561 	struct rb_node *parent;
562 	const u64 end = state->end;
563 
564 	set_state_bits(tree, state, bits, changeset);
565 
566 	node = &tree->state.rb_node;
567 	while (*node) {
568 		struct tree_entry *entry;
569 
570 		parent = *node;
571 		entry = rb_entry(parent, struct tree_entry, rb_node);
572 
573 		if (end < entry->start) {
574 			node = &(*node)->rb_left;
575 		} else if (end > entry->end) {
576 			node = &(*node)->rb_right;
577 		} else {
578 			btrfs_err(tree->fs_info,
579 			       "found node %llu %llu on insert of %llu %llu",
580 			       entry->start, entry->end, state->start, end);
581 			return -EEXIST;
582 		}
583 	}
584 
585 	rb_link_node(&state->rb_node, parent, node);
586 	rb_insert_color(&state->rb_node, &tree->state);
587 
588 	merge_state(tree, state);
589 	return 0;
590 }
591 
592 /*
593  * Insert state to @tree to the location given by @node and @parent.
594  */
595 static void insert_state_fast(struct extent_io_tree *tree,
596 			      struct extent_state *state, struct rb_node **node,
597 			      struct rb_node *parent, unsigned bits,
598 			      struct extent_changeset *changeset)
599 {
600 	set_state_bits(tree, state, bits, changeset);
601 	rb_link_node(&state->rb_node, parent, node);
602 	rb_insert_color(&state->rb_node, &tree->state);
603 	merge_state(tree, state);
604 }
605 
/*
 * split a given extent state struct in two, inserting the preallocated
 * struct 'prealloc' as the newly created second half.  'split' indicates an
 * offset inside 'orig' where it should be split.
 *
 * Before calling,
 * the tree has 'orig' at [orig->start, orig->end].  After calling, there
 * are two extent state structs in the tree:
 * prealloc: [orig->start, split - 1]
 * orig: [ split, orig->end ]
 *
 * Returns 0 on success.  Returns -EEXIST if the end of 'prealloc' falls
 * inside another entry; in that case 'prealloc' has been released with
 * free_extent_state() and 'orig' has already been shrunk.
 *
 * The tree locks are not taken by this function. They need to be held
 * by the caller.
 */
static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
		       struct extent_state *prealloc, u64 split)
{
	struct rb_node *parent = NULL;
	struct rb_node **node;

	if (tree->private_data && is_data_inode(tree->private_data))
		btrfs_split_delalloc_extent(tree->private_data, orig, split);

	/* 'prealloc' takes over the front part and inherits the state bits */
	prealloc->start = orig->start;
	prealloc->end = split - 1;
	prealloc->state = orig->state;
	orig->start = split;

	/*
	 * Start the descent at 'orig' instead of the tree root. 'node'
	 * initially points at the local 'parent' variable, so the first
	 * iteration examines 'orig' itself before moving into its subtree.
	 */
	parent = &orig->rb_node;
	node = &parent;
	while (*node) {
		struct tree_entry *entry;

		parent = *node;
		entry = rb_entry(parent, struct tree_entry, rb_node);

		if (prealloc->end < entry->start) {
			node = &(*node)->rb_left;
		} else if (prealloc->end > entry->end) {
			node = &(*node)->rb_right;
		} else {
			free_extent_state(prealloc);
			return -EEXIST;
		}
	}

	rb_link_node(&prealloc->rb_node, parent, node);
	rb_insert_color(&prealloc->rb_node, &tree->state);

	return 0;
}
657 
658 static struct extent_state *next_state(struct extent_state *state)
659 {
660 	struct rb_node *next = rb_next(&state->rb_node);
661 	if (next)
662 		return rb_entry(next, struct extent_state, rb_node);
663 	else
664 		return NULL;
665 }
666 
667 /*
668  * utility function to clear some bits in an extent state struct.
669  * it will optionally wake up anyone waiting on this state (wake == 1).
670  *
671  * If no bits are set on the state struct after clearing things, the
672  * struct is freed and removed from the tree
673  */
674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
675 					    struct extent_state *state,
676 					    u32 bits, int wake,
677 					    struct extent_changeset *changeset)
678 {
679 	struct extent_state *next;
680 	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
681 	int ret;
682 
683 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
684 		u64 range = state->end - state->start + 1;
685 		WARN_ON(range > tree->dirty_bytes);
686 		tree->dirty_bytes -= range;
687 	}
688 
689 	if (tree->private_data && is_data_inode(tree->private_data))
690 		btrfs_clear_delalloc_extent(tree->private_data, state, bits);
691 
692 	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
693 	BUG_ON(ret < 0);
694 	state->state &= ~bits_to_clear;
695 	if (wake)
696 		wake_up(&state->wq);
697 	if (state->state == 0) {
698 		next = next_state(state);
699 		if (extent_state_in_tree(state)) {
700 			rb_erase(&state->rb_node, &tree->state);
701 			RB_CLEAR_NODE(&state->rb_node);
702 			free_extent_state(state);
703 		} else {
704 			WARN_ON(1);
705 		}
706 	} else {
707 		merge_state(tree, state);
708 		next = next_state(state);
709 	}
710 	return next;
711 }
712 
713 static struct extent_state *
714 alloc_extent_state_atomic(struct extent_state *prealloc)
715 {
716 	if (!prealloc)
717 		prealloc = alloc_extent_state(GFP_ATOMIC);
718 
719 	return prealloc;
720 }
721 
/*
 * Report an unexpected failure while modifying @tree through btrfs_panic().
 *
 * @tree: tree on which the failing operation ran
 * @err:  error code passed on to btrfs_panic()
 */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}
727 
/*
 * clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove
 * the given range from the tree regardless of state (ie for truncate).
 *
 * the range [start, end] is inclusive.
 *
 * If 'cached_state' is given and points at a state covering 'start', the
 * tree search is skipped.  When EXTENT_LOCKED or EXTENT_BOUNDARY is among the
 * bits being cleared, '*cached_state' is reset to NULL and the reference it
 * held is dropped.
 *
 * This takes the tree lock.  As written the function always returns 0: a
 * failed atomic allocation hits BUG_ON() and a failed split is reported
 * through extent_io_tree_panic().
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	/* Clearing delalloc always clears the noreserve bit as well */
	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	/* 'delete' clears every bit that is not a control bit */
	if (delete)
		bits |= ~EXTENT_CTLBITS;

	/* Remember that the caller's cached state has to be released */
	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		/*
		 * Detach the cached state from the caller now; the reference
		 * it held is dropped below on either path.
		 */
		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		/* split_state() either linked or released the prealloc */
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, bits, wake, changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		/* After the split 'prealloc' is the half inside our range */
		clear_state_bit(tree, prealloc, bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, bits, wake, changeset);
next:
	/* Avoid wrapping 'start' around when the state ends at the maximum */
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}
893 
/*
 * Sleep until somebody wakes up @state's wait queue.
 *
 * Must be called with the tree lock held. The lock is dropped while sleeping
 * and taken again before returning, so the tree may have changed by the time
 * this returns.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	/* Queue ourselves before dropping the lock so no wake-up is missed */
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
906 
/*
 * waits for one or more bits to clear on a range in the state tree.
 * The range [start, end] is inclusive.
 * The tree lock is taken by this function
 */
static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
			    u32 bits)
{
	struct extent_state *state;
	struct rb_node *node;

	btrfs_debug_check_extent_io_range(tree, start, end);

	spin_lock(&tree->lock);
again:
	while (1) {
		/*
		 * this search will find all the extents that end after
		 * our range starts
		 */
		node = tree_search(tree, start);
process_node:
		if (!node)
			break;

		state = rb_entry(node, struct extent_state, rb_node);

		if (state->start > end)
			goto out;

		if (state->state & bits) {
			/*
			 * Hold a reference across the sleep, then restart the
			 * search from this state's start because the tree lock
			 * was dropped while waiting.
			 */
			start = state->start;
			refcount_inc(&state->refs);
			wait_on_state(tree, state);
			free_extent_state(state);
			goto again;
		}
		start = state->end + 1;

		if (start > end)
			break;

		/*
		 * If cond_resched_lock() returned zero, continue straight
		 * with the next node; otherwise fall through to a fresh
		 * tree search at the top of the loop.
		 */
		if (!cond_resched_lock(&tree->lock)) {
			node = rb_next(node);
			goto process_node;
		}
	}
out:
	spin_unlock(&tree->lock);
}
957 
958 static void set_state_bits(struct extent_io_tree *tree,
959 			   struct extent_state *state,
960 			   u32 bits, struct extent_changeset *changeset)
961 {
962 	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
963 	int ret;
964 
965 	if (tree->private_data && is_data_inode(tree->private_data))
966 		btrfs_set_delalloc_extent(tree->private_data, state, bits);
967 
968 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
969 		u64 range = state->end - state->start + 1;
970 		tree->dirty_bytes += range;
971 	}
972 	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
973 	BUG_ON(ret < 0);
974 	state->state |= bits_to_set;
975 }
976 
977 static void cache_state_if_flags(struct extent_state *state,
978 				 struct extent_state **cached_ptr,
979 				 unsigned flags)
980 {
981 	if (cached_ptr && !(*cached_ptr)) {
982 		if (!flags || (state->state & flags)) {
983 			*cached_ptr = state;
984 			refcount_inc(&state->refs);
985 		}
986 	}
987 }
988 
989 static void cache_state(struct extent_state *state,
990 			struct extent_state **cached_ptr)
991 {
992 	return cache_state_if_flags(state, cached_ptr,
993 				    EXTENT_LOCKED | EXTENT_BOUNDARY);
994 }
995 
996 /*
997  * set some bits on a range in the tree.  This may require allocations or
998  * sleeping, so the gfp mask is used to indicate what is allowed.
999  *
1000  * If any of the exclusive bits are set, this will fail with -EEXIST if some
1001  * part of the range already has the desired bits set.  The start of the
1002  * existing range is returned in failed_start in this case.
1003  *
1004  * [start, end] is inclusive This takes the tree lock.
1005  */
1006 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
1007 		   u32 exclusive_bits, u64 *failed_start,
1008 		   struct extent_state **cached_state, gfp_t mask,
1009 		   struct extent_changeset *changeset)
1010 {
1011 	struct extent_state *state;
1012 	struct extent_state *prealloc = NULL;
1013 	struct rb_node *node;
1014 	struct rb_node **p;
1015 	struct rb_node *parent;
1016 	int err = 0;
1017 	u64 last_start;
1018 	u64 last_end;
1019 
1020 	btrfs_debug_check_extent_io_range(tree, start, end);
1021 	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);
1022 
1023 	if (exclusive_bits)
1024 		ASSERT(failed_start);
1025 	else
1026 		ASSERT(failed_start == NULL);
1027 again:
1028 	if (!prealloc && gfpflags_allow_blocking(mask)) {
1029 		/*
1030 		 * Don't care for allocation failure here because we might end
1031 		 * up not needing the pre-allocated extent state at all, which
1032 		 * is the case if we only have in the tree extent states that
1033 		 * cover our input range and don't cover too any other range.
1034 		 * If we end up needing a new extent state we allocate it later.
1035 		 */
1036 		prealloc = alloc_extent_state(mask);
1037 	}
1038 
1039 	spin_lock(&tree->lock);
1040 	if (cached_state && *cached_state) {
1041 		state = *cached_state;
1042 		if (state->start <= start && state->end > start &&
1043 		    extent_state_in_tree(state)) {
1044 			node = &state->rb_node;
1045 			goto hit_next;
1046 		}
1047 	}
1048 	/*
1049 	 * this search will find all the extents that end after
1050 	 * our range starts.
1051 	 */
1052 	node = tree_search_for_insert(tree, start, &p, &parent);
1053 	if (!node) {
1054 		prealloc = alloc_extent_state_atomic(prealloc);
1055 		BUG_ON(!prealloc);
1056 		prealloc->start = start;
1057 		prealloc->end = end;
1058 		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
1059 		cache_state(prealloc, cached_state);
1060 		prealloc = NULL;
1061 		goto out;
1062 	}
1063 	state = rb_entry(node, struct extent_state, rb_node);
1064 hit_next:
1065 	last_start = state->start;
1066 	last_end = state->end;
1067 
1068 	/*
1069 	 * | ---- desired range ---- |
1070 	 * | state |
1071 	 *
1072 	 * Just lock what we found and keep going
1073 	 */
1074 	if (state->start == start && state->end <= end) {
1075 		if (state->state & exclusive_bits) {
1076 			*failed_start = state->start;
1077 			err = -EEXIST;
1078 			goto out;
1079 		}
1080 
1081 		set_state_bits(tree, state, bits, changeset);
1082 		cache_state(state, cached_state);
1083 		merge_state(tree, state);
1084 		if (last_end == (u64)-1)
1085 			goto out;
1086 		start = last_end + 1;
1087 		state = next_state(state);
1088 		if (start < end && state && state->start == start &&
1089 		    !need_resched())
1090 			goto hit_next;
1091 		goto search_again;
1092 	}
1093 
1094 	/*
1095 	 *     | ---- desired range ---- |
1096 	 * | state |
1097 	 *   or
1098 	 * | ------------- state -------------- |
1099 	 *
1100 	 * We need to split the extent we found, and may flip bits on
1101 	 * second half.
1102 	 *
1103 	 * If the extent we found extends past our
1104 	 * range, we just split and search again.  It'll get split
1105 	 * again the next time though.
1106 	 *
1107 	 * If the extent we found is inside our range, we set the
1108 	 * desired bit on it.
1109 	 */
1110 	if (state->start < start) {
1111 		if (state->state & exclusive_bits) {
1112 			*failed_start = start;
1113 			err = -EEXIST;
1114 			goto out;
1115 		}
1116 
1117 		/*
1118 		 * If this extent already has all the bits we want set, then
1119 		 * skip it, not necessary to split it or do anything with it.
1120 		 */
1121 		if ((state->state & bits) == bits) {
1122 			start = state->end + 1;
1123 			cache_state(state, cached_state);
1124 			goto search_again;
1125 		}
1126 
1127 		prealloc = alloc_extent_state_atomic(prealloc);
1128 		BUG_ON(!prealloc);
1129 		err = split_state(tree, state, prealloc, start);
1130 		if (err)
1131 			extent_io_tree_panic(tree, err);
1132 
1133 		prealloc = NULL;
1134 		if (err)
1135 			goto out;
1136 		if (state->end <= end) {
1137 			set_state_bits(tree, state, bits, changeset);
1138 			cache_state(state, cached_state);
1139 			merge_state(tree, state);
1140 			if (last_end == (u64)-1)
1141 				goto out;
1142 			start = last_end + 1;
1143 			state = next_state(state);
1144 			if (start < end && state && state->start == start &&
1145 			    !need_resched())
1146 				goto hit_next;
1147 		}
1148 		goto search_again;
1149 	}
1150 	/*
1151 	 * | ---- desired range ---- |
1152 	 *     | state | or               | state |
1153 	 *
1154 	 * There's a hole, we need to insert something in it and
1155 	 * ignore the extent we found.
1156 	 */
1157 	if (state->start > start) {
1158 		u64 this_end;
1159 		if (end < last_start)
1160 			this_end = end;
1161 		else
1162 			this_end = last_start - 1;
1163 
1164 		prealloc = alloc_extent_state_atomic(prealloc);
1165 		BUG_ON(!prealloc);
1166 
1167 		/*
1168 		 * Avoid to free 'prealloc' if it can be merged with
1169 		 * the later extent.
1170 		 */
1171 		prealloc->start = start;
1172 		prealloc->end = this_end;
1173 		err = insert_state(tree, prealloc, bits, changeset);
1174 		if (err)
1175 			extent_io_tree_panic(tree, err);
1176 
1177 		cache_state(prealloc, cached_state);
1178 		prealloc = NULL;
1179 		start = this_end + 1;
1180 		goto search_again;
1181 	}
1182 	/*
1183 	 * | ---- desired range ---- |
1184 	 *                        | state |
1185 	 * We need to split the extent, and set the bit
1186 	 * on the first half
1187 	 */
1188 	if (state->start <= end && state->end > end) {
1189 		if (state->state & exclusive_bits) {
1190 			*failed_start = start;
1191 			err = -EEXIST;
1192 			goto out;
1193 		}
1194 
1195 		prealloc = alloc_extent_state_atomic(prealloc);
1196 		BUG_ON(!prealloc);
1197 		err = split_state(tree, state, prealloc, end + 1);
1198 		if (err)
1199 			extent_io_tree_panic(tree, err);
1200 
1201 		set_state_bits(tree, prealloc, bits, changeset);
1202 		cache_state(prealloc, cached_state);
1203 		merge_state(tree, prealloc);
1204 		prealloc = NULL;
1205 		goto out;
1206 	}
1207 
1208 search_again:
1209 	if (start > end)
1210 		goto out;
1211 	spin_unlock(&tree->lock);
1212 	if (gfpflags_allow_blocking(mask))
1213 		cond_resched();
1214 	goto again;
1215 
1216 out:
1217 	spin_unlock(&tree->lock);
1218 	if (prealloc)
1219 		free_extent_state(prealloc);
1220 
1221 	return err;
1222 
1223 }
1224 
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 * 			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * All allocations are done with GFP_NOFS.
 *
 * Return: 0 on success, -ENOMEM if an extent state could not be allocated.
 * The tree lock is taken and released internally.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	/* Try the caller's cached state first to skip the tree search. */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		/* No state found: insert a single new one for [start, end]. */
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	/* Remember the bounds now; 'state' is reassigned further down. */
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Set and clear the bits on what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, bits, NULL);
		cache_state(state, cached_state);
		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
		/* Stop here so that 'last_end + 1' cannot wrap around. */
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		/* Not ours to free any more, whatever split_state() returned. */
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		/* The new state ends at @end or just before the found state. */
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		err = insert_state(tree, prealloc, bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		set_state_bits(tree, prealloc, bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	if (start > end)
		goto out;
	/* Unlock before looping so the GFP_NOFS allocation at 'again' runs unlocked. */
	spin_unlock(&tree->lock);
	cond_resched();
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
1441 
1442 /* wrappers around set/clear extent bit */
1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1444 			   u32 bits, struct extent_changeset *changeset)
1445 {
1446 	/*
1447 	 * We don't support EXTENT_LOCKED yet, as current changeset will
1448 	 * record any bits changed, so for EXTENT_LOCKED case, it will
1449 	 * either fail with -EEXIST or changeset will record the whole
1450 	 * range.
1451 	 */
1452 	BUG_ON(bits & EXTENT_LOCKED);
1453 
1454 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1455 			      changeset);
1456 }
1457 
1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1459 			   u32 bits)
1460 {
1461 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1462 			      GFP_NOWAIT, NULL);
1463 }
1464 
1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1466 		     u32 bits, int wake, int delete,
1467 		     struct extent_state **cached)
1468 {
1469 	return __clear_extent_bit(tree, start, end, bits, wake, delete,
1470 				  cached, GFP_NOFS, NULL);
1471 }
1472 
1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1474 		u32 bits, struct extent_changeset *changeset)
1475 {
1476 	/*
1477 	 * Don't support EXTENT_LOCKED case, same reason as
1478 	 * set_record_extent_bits().
1479 	 */
1480 	BUG_ON(bits & EXTENT_LOCKED);
1481 
1482 	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1483 				  changeset);
1484 }
1485 
1486 /*
1487  * either insert or lock state struct between start and end use mask to tell
1488  * us if waiting is desired.
1489  */
1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1491 		     struct extent_state **cached_state)
1492 {
1493 	int err;
1494 	u64 failed_start;
1495 
1496 	while (1) {
1497 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1498 				     EXTENT_LOCKED, &failed_start,
1499 				     cached_state, GFP_NOFS, NULL);
1500 		if (err == -EEXIST) {
1501 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1502 			start = failed_start;
1503 		} else
1504 			break;
1505 		WARN_ON(start > end);
1506 	}
1507 	return err;
1508 }
1509 
1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1511 {
1512 	int err;
1513 	u64 failed_start;
1514 
1515 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1516 			     &failed_start, NULL, GFP_NOFS, NULL);
1517 	if (err == -EEXIST) {
1518 		if (failed_start > start)
1519 			clear_extent_bit(tree, start, failed_start - 1,
1520 					 EXTENT_LOCKED, 1, 0, NULL);
1521 		return 0;
1522 	}
1523 	return 1;
1524 }
1525 
1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1527 {
1528 	unsigned long index = start >> PAGE_SHIFT;
1529 	unsigned long end_index = end >> PAGE_SHIFT;
1530 	struct page *page;
1531 
1532 	while (index <= end_index) {
1533 		page = find_get_page(inode->i_mapping, index);
1534 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1535 		clear_page_dirty_for_io(page);
1536 		put_page(page);
1537 		index++;
1538 	}
1539 }
1540 
1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1542 {
1543 	struct address_space *mapping = inode->i_mapping;
1544 	unsigned long index = start >> PAGE_SHIFT;
1545 	unsigned long end_index = end >> PAGE_SHIFT;
1546 	struct folio *folio;
1547 
1548 	while (index <= end_index) {
1549 		folio = filemap_get_folio(mapping, index);
1550 		filemap_dirty_folio(mapping, folio);
1551 		folio_account_redirty(folio);
1552 		index += folio_nr_pages(folio);
1553 		folio_put(folio);
1554 	}
1555 }
1556 
1557 /* find the first state struct with 'bits' set after 'start', and
1558  * return it.  tree->lock must be held.  NULL will returned if
1559  * nothing was found after 'start'
1560  */
1561 static struct extent_state *
1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1563 {
1564 	struct rb_node *node;
1565 	struct extent_state *state;
1566 
1567 	/*
1568 	 * this search will find all the extents that end after
1569 	 * our range starts.
1570 	 */
1571 	node = tree_search(tree, start);
1572 	if (!node)
1573 		goto out;
1574 
1575 	while (1) {
1576 		state = rb_entry(node, struct extent_state, rb_node);
1577 		if (state->end >= start && (state->state & bits))
1578 			return state;
1579 
1580 		node = rb_next(node);
1581 		if (!node)
1582 			break;
1583 	}
1584 out:
1585 	return NULL;
1586 }
1587 
1588 /*
1589  * Find the first offset in the io tree with one or more @bits set.
1590  *
1591  * Note: If there are multiple bits set in @bits, any of them will match.
1592  *
1593  * Return 0 if we find something, and update @start_ret and @end_ret.
1594  * Return 1 if we found nothing.
1595  */
1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1597 			  u64 *start_ret, u64 *end_ret, u32 bits,
1598 			  struct extent_state **cached_state)
1599 {
1600 	struct extent_state *state;
1601 	int ret = 1;
1602 
1603 	spin_lock(&tree->lock);
1604 	if (cached_state && *cached_state) {
1605 		state = *cached_state;
1606 		if (state->end == start - 1 && extent_state_in_tree(state)) {
1607 			while ((state = next_state(state)) != NULL) {
1608 				if (state->state & bits)
1609 					goto got_it;
1610 			}
1611 			free_extent_state(*cached_state);
1612 			*cached_state = NULL;
1613 			goto out;
1614 		}
1615 		free_extent_state(*cached_state);
1616 		*cached_state = NULL;
1617 	}
1618 
1619 	state = find_first_extent_bit_state(tree, start, bits);
1620 got_it:
1621 	if (state) {
1622 		cache_state_if_flags(state, cached_state, 0);
1623 		*start_ret = state->start;
1624 		*end_ret = state->end;
1625 		ret = 0;
1626 	}
1627 out:
1628 	spin_unlock(&tree->lock);
1629 	return ret;
1630 }
1631 
1632 /**
1633  * Find a contiguous area of bits
1634  *
1635  * @tree:      io tree to check
1636  * @start:     offset to start the search from
1637  * @start_ret: the first offset we found with the bits set
1638  * @end_ret:   the final contiguous range of the bits that were set
1639  * @bits:      bits to look for
1640  *
1641  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1642  * to set bits appropriately, and then merge them again.  During this time it
1643  * will drop the tree->lock, so use this helper if you want to find the actual
1644  * contiguous area for given bits.  We will search to the first bit we find, and
1645  * then walk down the tree until we find a non-contiguous area.  The area
1646  * returned will be the full contiguous area with the bits set.
1647  */
1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1649 			       u64 *start_ret, u64 *end_ret, u32 bits)
1650 {
1651 	struct extent_state *state;
1652 	int ret = 1;
1653 
1654 	spin_lock(&tree->lock);
1655 	state = find_first_extent_bit_state(tree, start, bits);
1656 	if (state) {
1657 		*start_ret = state->start;
1658 		*end_ret = state->end;
1659 		while ((state = next_state(state)) != NULL) {
1660 			if (state->start > (*end_ret + 1))
1661 				break;
1662 			*end_ret = state->end;
1663 		}
1664 		ret = 0;
1665 	}
1666 	spin_unlock(&tree->lock);
1667 	return ret;
1668 }
1669 
/**
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 *
 * Takes and releases tree->lock internally.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = tree_search_prev_next(tree, start, &prev, &next);
		if (!node && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!node && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			state = rb_entry(prev, struct extent_state, rb_node);
			*start_ret = state->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!node) {
			/* Nothing at 'start' itself, continue from the next node. */
			node = next;
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 *
				 * Skip past this state and search again.
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as
				 * the beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 *                        or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *    start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			/* The range ends right before this state. */
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}
1786 
1787 /*
1788  * find a contiguous range of bytes in the file marked as delalloc, not
1789  * more than 'max_bytes'.  start and end are used to return the range,
1790  *
1791  * true is returned if we find something, false if nothing was in the tree
1792  */
1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1794 			       u64 *end, u64 max_bytes,
1795 			       struct extent_state **cached_state)
1796 {
1797 	struct rb_node *node;
1798 	struct extent_state *state;
1799 	u64 cur_start = *start;
1800 	bool found = false;
1801 	u64 total_bytes = 0;
1802 
1803 	spin_lock(&tree->lock);
1804 
1805 	/*
1806 	 * this search will find all the extents that end after
1807 	 * our range starts.
1808 	 */
1809 	node = tree_search(tree, cur_start);
1810 	if (!node) {
1811 		*end = (u64)-1;
1812 		goto out;
1813 	}
1814 
1815 	while (1) {
1816 		state = rb_entry(node, struct extent_state, rb_node);
1817 		if (found && (state->start != cur_start ||
1818 			      (state->state & EXTENT_BOUNDARY))) {
1819 			goto out;
1820 		}
1821 		if (!(state->state & EXTENT_DELALLOC)) {
1822 			if (!found)
1823 				*end = state->end;
1824 			goto out;
1825 		}
1826 		if (!found) {
1827 			*start = state->start;
1828 			*cached_state = state;
1829 			refcount_inc(&state->refs);
1830 		}
1831 		found = true;
1832 		*end = state->end;
1833 		cur_start = state->end + 1;
1834 		node = rb_next(node);
1835 		total_bytes += state->end - state->start + 1;
1836 		if (total_bytes >= max_bytes)
1837 			break;
1838 		if (!node)
1839 			break;
1840 	}
1841 out:
1842 	spin_unlock(&tree->lock);
1843 	return found;
1844 }
1845 
1846 /*
1847  * Process one page for __process_pages_contig().
1848  *
1849  * Return >0 if we hit @page == @locked_page.
1850  * Return 0 if we updated the page status.
1851  * Return -EGAIN if the we need to try again.
1852  * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
1853  */
1854 static int process_one_page(struct btrfs_fs_info *fs_info,
1855 			    struct address_space *mapping,
1856 			    struct page *page, struct page *locked_page,
1857 			    unsigned long page_ops, u64 start, u64 end)
1858 {
1859 	u32 len;
1860 
1861 	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1862 	len = end + 1 - start;
1863 
1864 	if (page_ops & PAGE_SET_ORDERED)
1865 		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
1866 	if (page_ops & PAGE_SET_ERROR)
1867 		btrfs_page_clamp_set_error(fs_info, page, start, len);
1868 	if (page_ops & PAGE_START_WRITEBACK) {
1869 		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1870 		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
1871 	}
1872 	if (page_ops & PAGE_END_WRITEBACK)
1873 		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
1874 
1875 	if (page == locked_page)
1876 		return 1;
1877 
1878 	if (page_ops & PAGE_LOCK) {
1879 		int ret;
1880 
1881 		ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1882 		if (ret)
1883 			return ret;
1884 		if (!PageDirty(page) || page->mapping != mapping) {
1885 			btrfs_page_end_writer_lock(fs_info, page, start, len);
1886 			return -EAGAIN;
1887 		}
1888 	}
1889 	if (page_ops & PAGE_UNLOCK)
1890 		btrfs_page_end_writer_lock(fs_info, page, start, len);
1891 	return 0;
1892 }
1893 
1894 static int __process_pages_contig(struct address_space *mapping,
1895 				  struct page *locked_page,
1896 				  u64 start, u64 end, unsigned long page_ops,
1897 				  u64 *processed_end)
1898 {
1899 	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
1900 	pgoff_t start_index = start >> PAGE_SHIFT;
1901 	pgoff_t end_index = end >> PAGE_SHIFT;
1902 	pgoff_t index = start_index;
1903 	unsigned long nr_pages = end_index - start_index + 1;
1904 	unsigned long pages_processed = 0;
1905 	struct page *pages[16];
1906 	int err = 0;
1907 	int i;
1908 
1909 	if (page_ops & PAGE_LOCK) {
1910 		ASSERT(page_ops == PAGE_LOCK);
1911 		ASSERT(processed_end && *processed_end == start);
1912 	}
1913 
1914 	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
1915 		mapping_set_error(mapping, -EIO);
1916 
1917 	while (nr_pages > 0) {
1918 		int found_pages;
1919 
1920 		found_pages = find_get_pages_contig(mapping, index,
1921 				     min_t(unsigned long,
1922 				     nr_pages, ARRAY_SIZE(pages)), pages);
1923 		if (found_pages == 0) {
1924 			/*
1925 			 * Only if we're going to lock these pages, we can find
1926 			 * nothing at @index.
1927 			 */
1928 			ASSERT(page_ops & PAGE_LOCK);
1929 			err = -EAGAIN;
1930 			goto out;
1931 		}
1932 
1933 		for (i = 0; i < found_pages; i++) {
1934 			int process_ret;
1935 
1936 			process_ret = process_one_page(fs_info, mapping,
1937 					pages[i], locked_page, page_ops,
1938 					start, end);
1939 			if (process_ret < 0) {
1940 				for (; i < found_pages; i++)
1941 					put_page(pages[i]);
1942 				err = -EAGAIN;
1943 				goto out;
1944 			}
1945 			put_page(pages[i]);
1946 			pages_processed++;
1947 		}
1948 		nr_pages -= found_pages;
1949 		index += found_pages;
1950 		cond_resched();
1951 	}
1952 out:
1953 	if (err && processed_end) {
1954 		/*
1955 		 * Update @processed_end. I know this is awful since it has
1956 		 * two different return value patterns (inclusive vs exclusive).
1957 		 *
1958 		 * But the exclusive pattern is necessary if @start is 0, or we
1959 		 * underflow and check against processed_end won't work as
1960 		 * expected.
1961 		 */
1962 		if (pages_processed)
1963 			*processed_end = min(end,
1964 			((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
1965 		else
1966 			*processed_end = start;
1967 	}
1968 	return err;
1969 }
1970 
1971 static noinline void __unlock_for_delalloc(struct inode *inode,
1972 					   struct page *locked_page,
1973 					   u64 start, u64 end)
1974 {
1975 	unsigned long index = start >> PAGE_SHIFT;
1976 	unsigned long end_index = end >> PAGE_SHIFT;
1977 
1978 	ASSERT(locked_page);
1979 	if (index == locked_page->index && end_index == index)
1980 		return;
1981 
1982 	__process_pages_contig(inode->i_mapping, locked_page, start, end,
1983 			       PAGE_UNLOCK, NULL);
1984 }
1985 
1986 static noinline int lock_delalloc_pages(struct inode *inode,
1987 					struct page *locked_page,
1988 					u64 delalloc_start,
1989 					u64 delalloc_end)
1990 {
1991 	unsigned long index = delalloc_start >> PAGE_SHIFT;
1992 	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1993 	u64 processed_end = delalloc_start;
1994 	int ret;
1995 
1996 	ASSERT(locked_page);
1997 	if (index == locked_page->index && index == end_index)
1998 		return 0;
1999 
2000 	ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
2001 				     delalloc_end, PAGE_LOCK, &processed_end);
2002 	if (ret == -EAGAIN && processed_end > delalloc_start)
2003 		__unlock_for_delalloc(inode, locked_page, delalloc_start,
2004 				      processed_end);
2005 	return ret;
2006 }
2007 
2008 /*
2009  * Find and lock a contiguous range of bytes in the file marked as delalloc, no
2010  * more than @max_bytes.
2011  *
2012  * @start:	The original start bytenr to search.
2013  *		Will store the extent range start bytenr.
2014  * @end:	The original end bytenr of the search range
2015  *		Will store the extent range end bytenr.
2016  *
2017  * Return true if we find a delalloc range which starts inside the original
2018  * range, and @start/@end will store the delalloc range start/end.
2019  *
2020  * Return false if we can't find any delalloc range which starts inside the
2021  * original range, and @start/@end will be the non-delalloc range start/end.
2022  */
2023 EXPORT_FOR_TESTS
2024 noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
2025 				    struct page *locked_page, u64 *start,
2026 				    u64 *end)
2027 {
2028 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2029 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2030 	const u64 orig_start = *start;
2031 	const u64 orig_end = *end;
2032 	/* The sanity tests may not set a valid fs_info. */
2033 	u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
2034 	u64 delalloc_start;
2035 	u64 delalloc_end;
2036 	bool found;
2037 	struct extent_state *cached_state = NULL;
2038 	int ret;
2039 	int loops = 0;
2040 
2041 	/* Caller should pass a valid @end to indicate the search range end */
2042 	ASSERT(orig_end > orig_start);
2043 
2044 	/* The range should at least cover part of the page */
2045 	ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
2046 		 orig_end <= page_offset(locked_page)));
2047 again:
2048 	/* step one, find a bunch of delalloc bytes starting at start */
2049 	delalloc_start = *start;
2050 	delalloc_end = 0;
2051 	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
2052 					  max_bytes, &cached_state);
2053 	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
2054 		*start = delalloc_start;
2055 
2056 		/* @delalloc_end can be -1, never go beyond @orig_end */
2057 		*end = min(delalloc_end, orig_end);
2058 		free_extent_state(cached_state);
2059 		return false;
2060 	}
2061 
2062 	/*
2063 	 * start comes from the offset of locked_page.  We have to lock
2064 	 * pages in order, so we can't process delalloc bytes before
2065 	 * locked_page
2066 	 */
2067 	if (delalloc_start < *start)
2068 		delalloc_start = *start;
2069 
2070 	/*
2071 	 * make sure to limit the number of pages we try to lock down
2072 	 */
2073 	if (delalloc_end + 1 - delalloc_start > max_bytes)
2074 		delalloc_end = delalloc_start + max_bytes - 1;
2075 
2076 	/* step two, lock all the pages after the page that has start */
2077 	ret = lock_delalloc_pages(inode, locked_page,
2078 				  delalloc_start, delalloc_end);
2079 	ASSERT(!ret || ret == -EAGAIN);
2080 	if (ret == -EAGAIN) {
2081 		/* some of the pages are gone, lets avoid looping by
2082 		 * shortening the size of the delalloc range we're searching
2083 		 */
2084 		free_extent_state(cached_state);
2085 		cached_state = NULL;
2086 		if (!loops) {
2087 			max_bytes = PAGE_SIZE;
2088 			loops = 1;
2089 			goto again;
2090 		} else {
2091 			found = false;
2092 			goto out_failed;
2093 		}
2094 	}
2095 
2096 	/* step three, lock the state bits for the whole range */
2097 	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
2098 
2099 	/* then test to make sure it is all still delalloc */
2100 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
2101 			     EXTENT_DELALLOC, 1, cached_state);
2102 	if (!ret) {
2103 		unlock_extent_cached(tree, delalloc_start, delalloc_end,
2104 				     &cached_state);
2105 		__unlock_for_delalloc(inode, locked_page,
2106 			      delalloc_start, delalloc_end);
2107 		cond_resched();
2108 		goto again;
2109 	}
2110 	free_extent_state(cached_state);
2111 	*start = delalloc_start;
2112 	*end = delalloc_end;
2113 out_failed:
2114 	return found;
2115 }
2116 
2117 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2118 				  struct page *locked_page,
2119 				  u32 clear_bits, unsigned long page_ops)
2120 {
2121 	clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2122 
2123 	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2124 			       start, end, page_ops, NULL);
2125 }
2126 
2127 /*
2128  * count the number of bytes in the tree that have a given bit(s)
2129  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2130  * cached.  The total number found is returned.
2131  */
2132 u64 count_range_bits(struct extent_io_tree *tree,
2133 		     u64 *start, u64 search_end, u64 max_bytes,
2134 		     u32 bits, int contig)
2135 {
2136 	struct rb_node *node;
2137 	struct extent_state *state;
2138 	u64 cur_start = *start;
2139 	u64 total_bytes = 0;
2140 	u64 last = 0;
2141 	int found = 0;
2142 
2143 	if (WARN_ON(search_end <= cur_start))
2144 		return 0;
2145 
2146 	spin_lock(&tree->lock);
2147 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
2148 		total_bytes = tree->dirty_bytes;
2149 		goto out;
2150 	}
2151 	/*
2152 	 * this search will find all the extents that end after
2153 	 * our range starts.
2154 	 */
2155 	node = tree_search(tree, cur_start);
2156 	if (!node)
2157 		goto out;
2158 
2159 	while (1) {
2160 		state = rb_entry(node, struct extent_state, rb_node);
2161 		if (state->start > search_end)
2162 			break;
2163 		if (contig && found && state->start > last + 1)
2164 			break;
2165 		if (state->end >= cur_start && (state->state & bits) == bits) {
2166 			total_bytes += min(search_end, state->end) + 1 -
2167 				       max(cur_start, state->start);
2168 			if (total_bytes >= max_bytes)
2169 				break;
2170 			if (!found) {
2171 				*start = max(cur_start, state->start);
2172 				found = 1;
2173 			}
2174 			last = state->end;
2175 		} else if (contig && found) {
2176 			break;
2177 		}
2178 		node = rb_next(node);
2179 		if (!node)
2180 			break;
2181 	}
2182 out:
2183 	spin_unlock(&tree->lock);
2184 	return total_bytes;
2185 }
2186 
2187 /*
2188  * set the private field for a given byte offset in the tree.  If there isn't
2189  * an extent_state there already, this does nothing.
2190  */
2191 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2192 		      struct io_failure_record *failrec)
2193 {
2194 	struct rb_node *node;
2195 	struct extent_state *state;
2196 	int ret = 0;
2197 
2198 	spin_lock(&tree->lock);
2199 	/*
2200 	 * this search will find all the extents that end after
2201 	 * our range starts.
2202 	 */
2203 	node = tree_search(tree, start);
2204 	if (!node) {
2205 		ret = -ENOENT;
2206 		goto out;
2207 	}
2208 	state = rb_entry(node, struct extent_state, rb_node);
2209 	if (state->start != start) {
2210 		ret = -ENOENT;
2211 		goto out;
2212 	}
2213 	state->failrec = failrec;
2214 out:
2215 	spin_unlock(&tree->lock);
2216 	return ret;
2217 }
2218 
2219 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2220 {
2221 	struct rb_node *node;
2222 	struct extent_state *state;
2223 	struct io_failure_record *failrec;
2224 
2225 	spin_lock(&tree->lock);
2226 	/*
2227 	 * this search will find all the extents that end after
2228 	 * our range starts.
2229 	 */
2230 	node = tree_search(tree, start);
2231 	if (!node) {
2232 		failrec = ERR_PTR(-ENOENT);
2233 		goto out;
2234 	}
2235 	state = rb_entry(node, struct extent_state, rb_node);
2236 	if (state->start != start) {
2237 		failrec = ERR_PTR(-ENOENT);
2238 		goto out;
2239 	}
2240 
2241 	failrec = state->failrec;
2242 out:
2243 	spin_unlock(&tree->lock);
2244 	return failrec;
2245 }
2246 
2247 /*
2248  * searches a range in the state tree for a given mask.
2249  * If 'filled' == 1, this returns 1 only if every extent in the tree
2250  * has the bits set.  Otherwise, 1 is returned if any bit in the
2251  * range is found set.
2252  */
2253 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2254 		   u32 bits, int filled, struct extent_state *cached)
2255 {
2256 	struct extent_state *state = NULL;
2257 	struct rb_node *node;
2258 	int bitset = 0;
2259 
2260 	spin_lock(&tree->lock);
2261 	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2262 	    cached->end > start)
2263 		node = &cached->rb_node;
2264 	else
2265 		node = tree_search(tree, start);
2266 	while (node && start <= end) {
2267 		state = rb_entry(node, struct extent_state, rb_node);
2268 
2269 		if (filled && state->start > start) {
2270 			bitset = 0;
2271 			break;
2272 		}
2273 
2274 		if (state->start > end)
2275 			break;
2276 
2277 		if (state->state & bits) {
2278 			bitset = 1;
2279 			if (!filled)
2280 				break;
2281 		} else if (filled) {
2282 			bitset = 0;
2283 			break;
2284 		}
2285 
2286 		if (state->end == (u64)-1)
2287 			break;
2288 
2289 		start = state->end + 1;
2290 		if (start > end)
2291 			break;
2292 		node = rb_next(node);
2293 		if (!node) {
2294 			if (filled)
2295 				bitset = 0;
2296 			break;
2297 		}
2298 	}
2299 	spin_unlock(&tree->lock);
2300 	return bitset;
2301 }
2302 
2303 int free_io_failure(struct extent_io_tree *failure_tree,
2304 		    struct extent_io_tree *io_tree,
2305 		    struct io_failure_record *rec)
2306 {
2307 	int ret;
2308 	int err = 0;
2309 
2310 	set_state_failrec(failure_tree, rec->start, NULL);
2311 	ret = clear_extent_bits(failure_tree, rec->start,
2312 				rec->start + rec->len - 1,
2313 				EXTENT_LOCKED | EXTENT_DIRTY);
2314 	if (ret)
2315 		err = ret;
2316 
2317 	ret = clear_extent_bits(io_tree, rec->start,
2318 				rec->start + rec->len - 1,
2319 				EXTENT_DAMAGED);
2320 	if (ret && !err)
2321 		err = ret;
2322 
2323 	kfree(rec);
2324 	return err;
2325 }
2326 
2327 /*
2328  * this bypasses the standard btrfs submit functions deliberately, as
2329  * the standard behavior is to write all copies in a raid setup. here we only
2330  * want to write the one bad copy. so we do the mapping for ourselves and issue
2331  * submit_bio directly.
2332  * to avoid any synchronization issues, wait for the data after writing, which
2333  * actually prevents the read that triggered the error from finishing.
2334  * currently, there can be no more than two copies of every data bit. thus,
2335  * exactly one rewrite is required.
2336  */
2337 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2338 			     u64 length, u64 logical, struct page *page,
2339 			     unsigned int pg_offset, int mirror_num)
2340 {
2341 	struct btrfs_device *dev;
2342 	struct bio_vec bvec;
2343 	struct bio bio;
2344 	u64 map_length = 0;
2345 	u64 sector;
2346 	struct btrfs_io_context *bioc = NULL;
2347 	int ret = 0;
2348 
2349 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2350 	BUG_ON(!mirror_num);
2351 
2352 	if (btrfs_repair_one_zone(fs_info, logical))
2353 		return 0;
2354 
2355 	map_length = length;
2356 
2357 	/*
2358 	 * Avoid races with device replace and make sure our bioc has devices
2359 	 * associated to its stripes that don't go away while we are doing the
2360 	 * read repair operation.
2361 	 */
2362 	btrfs_bio_counter_inc_blocked(fs_info);
2363 	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2364 		/*
2365 		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2366 		 * to update all raid stripes, but here we just want to correct
2367 		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2368 		 * stripe's dev and sector.
2369 		 */
2370 		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2371 				      &map_length, &bioc, 0);
2372 		if (ret)
2373 			goto out_counter_dec;
2374 		ASSERT(bioc->mirror_num == 1);
2375 	} else {
2376 		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2377 				      &map_length, &bioc, mirror_num);
2378 		if (ret)
2379 			goto out_counter_dec;
2380 		BUG_ON(mirror_num != bioc->mirror_num);
2381 	}
2382 
2383 	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
2384 	dev = bioc->stripes[bioc->mirror_num - 1].dev;
2385 	btrfs_put_bioc(bioc);
2386 
2387 	if (!dev || !dev->bdev ||
2388 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2389 		ret = -EIO;
2390 		goto out_counter_dec;
2391 	}
2392 
2393 	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2394 	bio.bi_iter.bi_sector = sector;
2395 	__bio_add_page(&bio, page, length, pg_offset);
2396 
2397 	btrfsic_check_bio(&bio);
2398 	ret = submit_bio_wait(&bio);
2399 	if (ret) {
2400 		/* try to remap that extent elsewhere? */
2401 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2402 		goto out_bio_uninit;
2403 	}
2404 
2405 	btrfs_info_rl_in_rcu(fs_info,
2406 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
2407 				  ino, start,
2408 				  rcu_str_deref(dev->name), sector);
2409 	ret = 0;
2410 
2411 out_bio_uninit:
2412 	bio_uninit(&bio);
2413 out_counter_dec:
2414 	btrfs_bio_counter_dec(fs_info);
2415 	return ret;
2416 }
2417 
2418 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2419 {
2420 	struct btrfs_fs_info *fs_info = eb->fs_info;
2421 	u64 start = eb->start;
2422 	int i, num_pages = num_extent_pages(eb);
2423 	int ret = 0;
2424 
2425 	if (sb_rdonly(fs_info->sb))
2426 		return -EROFS;
2427 
2428 	for (i = 0; i < num_pages; i++) {
2429 		struct page *p = eb->pages[i];
2430 
2431 		ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2432 					start - page_offset(p), mirror_num);
2433 		if (ret)
2434 			break;
2435 		start += PAGE_SIZE;
2436 	}
2437 
2438 	return ret;
2439 }
2440 
2441 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
2442 {
2443 	if (cur_mirror == failrec->num_copies)
2444 		return cur_mirror + 1 - failrec->num_copies;
2445 	return cur_mirror + 1;
2446 }
2447 
2448 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
2449 {
2450 	if (cur_mirror == 1)
2451 		return failrec->num_copies;
2452 	return cur_mirror - 1;
2453 }
2454 
2455 /*
2456  * each time an IO finishes, we do a fast check in the IO failure tree
2457  * to see if we need to process or clean up an io_failure_record
2458  */
2459 int clean_io_failure(struct btrfs_fs_info *fs_info,
2460 		     struct extent_io_tree *failure_tree,
2461 		     struct extent_io_tree *io_tree, u64 start,
2462 		     struct page *page, u64 ino, unsigned int pg_offset)
2463 {
2464 	u64 private;
2465 	struct io_failure_record *failrec;
2466 	struct extent_state *state;
2467 	int mirror;
2468 	int ret;
2469 
2470 	private = 0;
2471 	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2472 			       EXTENT_DIRTY, 0);
2473 	if (!ret)
2474 		return 0;
2475 
2476 	failrec = get_state_failrec(failure_tree, start);
2477 	if (IS_ERR(failrec))
2478 		return 0;
2479 
2480 	BUG_ON(!failrec->this_mirror);
2481 
2482 	if (sb_rdonly(fs_info->sb))
2483 		goto out;
2484 
2485 	spin_lock(&io_tree->lock);
2486 	state = find_first_extent_bit_state(io_tree,
2487 					    failrec->start,
2488 					    EXTENT_LOCKED);
2489 	spin_unlock(&io_tree->lock);
2490 
2491 	if (!state || state->start > failrec->start ||
2492 	    state->end < failrec->start + failrec->len - 1)
2493 		goto out;
2494 
2495 	mirror = failrec->this_mirror;
2496 	do {
2497 		mirror = prev_mirror(failrec, mirror);
2498 		repair_io_failure(fs_info, ino, start, failrec->len,
2499 				  failrec->logical, page, pg_offset, mirror);
2500 	} while (mirror != failrec->failed_mirror);
2501 
2502 out:
2503 	free_io_failure(failure_tree, io_tree, failrec);
2504 	return 0;
2505 }
2506 
2507 /*
2508  * Can be called when
2509  * - hold extent lock
2510  * - under ordered extent
2511  * - the inode is freeing
2512  */
2513 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2514 {
2515 	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2516 	struct io_failure_record *failrec;
2517 	struct extent_state *state, *next;
2518 
2519 	if (RB_EMPTY_ROOT(&failure_tree->state))
2520 		return;
2521 
2522 	spin_lock(&failure_tree->lock);
2523 	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2524 	while (state) {
2525 		if (state->start > end)
2526 			break;
2527 
2528 		ASSERT(state->end <= end);
2529 
2530 		next = next_state(state);
2531 
2532 		failrec = state->failrec;
2533 		free_extent_state(state);
2534 		kfree(failrec);
2535 
2536 		state = next;
2537 	}
2538 	spin_unlock(&failure_tree->lock);
2539 }
2540 
2541 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2542 							     struct btrfs_bio *bbio,
2543 							     unsigned int bio_offset)
2544 {
2545 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2546 	u64 start = bbio->file_offset + bio_offset;
2547 	struct io_failure_record *failrec;
2548 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2549 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2550 	const u32 sectorsize = fs_info->sectorsize;
2551 	int ret;
2552 
2553 	failrec = get_state_failrec(failure_tree, start);
2554 	if (!IS_ERR(failrec)) {
2555 		btrfs_debug(fs_info,
2556 	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2557 			failrec->logical, failrec->start, failrec->len);
2558 		/*
2559 		 * when data can be on disk more than twice, add to failrec here
2560 		 * (e.g. with a list for failed_mirror) to make
2561 		 * clean_io_failure() clean all those errors at once.
2562 		 */
2563 		ASSERT(failrec->this_mirror == bbio->mirror_num);
2564 		ASSERT(failrec->len == fs_info->sectorsize);
2565 		return failrec;
2566 	}
2567 
2568 	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2569 	if (!failrec)
2570 		return ERR_PTR(-ENOMEM);
2571 
2572 	failrec->start = start;
2573 	failrec->len = sectorsize;
2574 	failrec->failed_mirror = bbio->mirror_num;
2575 	failrec->this_mirror = bbio->mirror_num;
2576 	failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
2577 
2578 	btrfs_debug(fs_info,
2579 		    "new io failure record logical %llu start %llu",
2580 		    failrec->logical, start);
2581 
2582 	failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
2583 	if (failrec->num_copies == 1) {
2584 		/*
2585 		 * We only have a single copy of the data, so don't bother with
2586 		 * all the retry and error correction code that follows. No
2587 		 * matter what the error is, it is very likely to persist.
2588 		 */
2589 		btrfs_debug(fs_info,
2590 			"cannot repair logical %llu num_copies %d",
2591 			failrec->logical, failrec->num_copies);
2592 		kfree(failrec);
2593 		return ERR_PTR(-EIO);
2594 	}
2595 
2596 	/* Set the bits in the private failure tree */
2597 	ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
2598 			      EXTENT_LOCKED | EXTENT_DIRTY);
2599 	if (ret >= 0) {
2600 		ret = set_state_failrec(failure_tree, start, failrec);
2601 		/* Set the bits in the inode's tree */
2602 		ret = set_extent_bits(tree, start, start + sectorsize - 1,
2603 				      EXTENT_DAMAGED);
2604 	} else if (ret < 0) {
2605 		kfree(failrec);
2606 		return ERR_PTR(ret);
2607 	}
2608 
2609 	return failrec;
2610 }
2611 
2612 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
2613 			    u32 bio_offset, struct page *page, unsigned int pgoff,
2614 			    submit_bio_hook_t *submit_bio_hook)
2615 {
2616 	u64 start = failed_bbio->file_offset + bio_offset;
2617 	struct io_failure_record *failrec;
2618 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2619 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2620 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2621 	struct bio *failed_bio = &failed_bbio->bio;
2622 	const int icsum = bio_offset >> fs_info->sectorsize_bits;
2623 	struct bio *repair_bio;
2624 	struct btrfs_bio *repair_bbio;
2625 
2626 	btrfs_debug(fs_info,
2627 		   "repair read error: read error at %llu", start);
2628 
2629 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2630 
2631 	failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
2632 	if (IS_ERR(failrec))
2633 		return PTR_ERR(failrec);
2634 
2635 	/*
2636 	 * There are two premises:
2637 	 * a) deliver good data to the caller
2638 	 * b) correct the bad sectors on disk
2639 	 *
2640 	 * Since we're only doing repair for one sector, we only need to get
2641 	 * a good copy of the failed sector and if we succeed, we have setup
2642 	 * everything for repair_io_failure to do the rest for us.
2643 	 */
2644 	failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
2645 	if (failrec->this_mirror == failrec->failed_mirror) {
2646 		btrfs_debug(fs_info,
2647 			"failed to repair num_copies %d this_mirror %d failed_mirror %d",
2648 			failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
2649 		free_io_failure(failure_tree, tree, failrec);
2650 		return -EIO;
2651 	}
2652 
2653 	repair_bio = btrfs_bio_alloc(1);
2654 	repair_bbio = btrfs_bio(repair_bio);
2655 	repair_bbio->file_offset = start;
2656 	repair_bio->bi_opf = REQ_OP_READ;
2657 	repair_bio->bi_end_io = failed_bio->bi_end_io;
2658 	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2659 	repair_bio->bi_private = failed_bio->bi_private;
2660 
2661 	if (failed_bbio->csum) {
2662 		const u32 csum_size = fs_info->csum_size;
2663 
2664 		repair_bbio->csum = repair_bbio->csum_inline;
2665 		memcpy(repair_bbio->csum,
2666 		       failed_bbio->csum + csum_size * icsum, csum_size);
2667 	}
2668 
2669 	bio_add_page(repair_bio, page, failrec->len, pgoff);
2670 	repair_bbio->iter = repair_bio->bi_iter;
2671 
2672 	btrfs_debug(btrfs_sb(inode->i_sb),
2673 		    "repair read error: submitting new read to mirror %d",
2674 		    failrec->this_mirror);
2675 
2676 	/*
2677 	 * At this point we have a bio, so any errors from submit_bio_hook()
2678 	 * will be handled by the endio on the repair_bio, so we can't return an
2679 	 * error here.
2680 	 */
2681 	submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
2682 	return BLK_STS_OK;
2683 }
2684 
2685 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2686 {
2687 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2688 
2689 	ASSERT(page_offset(page) <= start &&
2690 	       start + len <= page_offset(page) + PAGE_SIZE);
2691 
2692 	if (uptodate) {
2693 		if (fsverity_active(page->mapping->host) &&
2694 		    !PageError(page) &&
2695 		    !PageUptodate(page) &&
2696 		    start < i_size_read(page->mapping->host) &&
2697 		    !fsverity_verify_page(page)) {
2698 			btrfs_page_set_error(fs_info, page, start, len);
2699 		} else {
2700 			btrfs_page_set_uptodate(fs_info, page, start, len);
2701 		}
2702 	} else {
2703 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2704 		btrfs_page_set_error(fs_info, page, start, len);
2705 	}
2706 
2707 	if (!btrfs_is_subpage(fs_info, page))
2708 		unlock_page(page);
2709 	else
2710 		btrfs_subpage_end_reader(fs_info, page, start, len);
2711 }
2712 
2713 static void end_sector_io(struct page *page, u64 offset, bool uptodate)
2714 {
2715 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2716 	const u32 sectorsize = inode->root->fs_info->sectorsize;
2717 	struct extent_state *cached = NULL;
2718 
2719 	end_page_read(page, uptodate, offset, sectorsize);
2720 	if (uptodate)
2721 		set_extent_uptodate(&inode->io_tree, offset,
2722 				    offset + sectorsize - 1, &cached, GFP_ATOMIC);
2723 	unlock_extent_cached_atomic(&inode->io_tree, offset,
2724 				    offset + sectorsize - 1, &cached);
2725 }
2726 
2727 static void submit_data_read_repair(struct inode *inode,
2728 				    struct btrfs_bio *failed_bbio,
2729 				    u32 bio_offset, const struct bio_vec *bvec,
2730 				    unsigned int error_bitmap)
2731 {
2732 	const unsigned int pgoff = bvec->bv_offset;
2733 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2734 	struct page *page = bvec->bv_page;
2735 	const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
2736 	const u64 end = start + bvec->bv_len - 1;
2737 	const u32 sectorsize = fs_info->sectorsize;
2738 	const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2739 	int i;
2740 
2741 	BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
2742 
2743 	/* This repair is only for data */
2744 	ASSERT(is_data_inode(inode));
2745 
2746 	/* We're here because we had some read errors or csum mismatch */
2747 	ASSERT(error_bitmap);
2748 
2749 	/*
2750 	 * We only get called on buffered IO, thus page must be mapped and bio
2751 	 * must not be cloned.
2752 	 */
2753 	ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
2754 
2755 	/* Iterate through all the sectors in the range */
2756 	for (i = 0; i < nr_bits; i++) {
2757 		const unsigned int offset = i * sectorsize;
2758 		bool uptodate = false;
2759 		int ret;
2760 
2761 		if (!(error_bitmap & (1U << i))) {
2762 			/*
2763 			 * This sector has no error, just end the page read
2764 			 * and unlock the range.
2765 			 */
2766 			uptodate = true;
2767 			goto next;
2768 		}
2769 
2770 		ret = btrfs_repair_one_sector(inode, failed_bbio,
2771 				bio_offset + offset, page, pgoff + offset,
2772 				btrfs_submit_data_read_bio);
2773 		if (!ret) {
2774 			/*
2775 			 * We have submitted the read repair, the page release
2776 			 * will be handled by the endio function of the
2777 			 * submitted repair bio.
2778 			 * Thus we don't need to do any thing here.
2779 			 */
2780 			continue;
2781 		}
2782 		/*
2783 		 * Continue on failed repair, otherwise the remaining sectors
2784 		 * will not be properly unlocked.
2785 		 */
2786 next:
2787 		end_sector_io(page, start + offset, uptodate);
2788 	}
2789 }
2790 
2791 /* lots and lots of room for performance fixes in the end_bio funcs */
2792 
2793 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2794 {
2795 	struct btrfs_inode *inode;
2796 	const bool uptodate = (err == 0);
2797 	int ret = 0;
2798 
2799 	ASSERT(page && page->mapping);
2800 	inode = BTRFS_I(page->mapping->host);
2801 	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
2802 
2803 	if (!uptodate) {
2804 		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2805 		u32 len;
2806 
2807 		ASSERT(end + 1 - start <= U32_MAX);
2808 		len = end + 1 - start;
2809 
2810 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2811 		btrfs_page_set_error(fs_info, page, start, len);
2812 		ret = err < 0 ? err : -EIO;
2813 		mapping_set_error(page->mapping, ret);
2814 	}
2815 }
2816 
2817 /*
2818  * after a writepage IO is done, we need to:
2819  * clear the uptodate bits on error
2820  * clear the writeback bits in the extent tree for this IO
2821  * end_page_writeback if the page has no more pending IO
2822  *
2823  * Scheduling is not allowed, so the extent state tree is expected
2824  * to have one and only one object corresponding to this IO.
2825  */
2826 static void end_bio_extent_writepage(struct bio *bio)
2827 {
2828 	int error = blk_status_to_errno(bio->bi_status);
2829 	struct bio_vec *bvec;
2830 	u64 start;
2831 	u64 end;
2832 	struct bvec_iter_all iter_all;
2833 	bool first_bvec = true;
2834 
2835 	ASSERT(!bio_flagged(bio, BIO_CLONED));
2836 	bio_for_each_segment_all(bvec, bio, iter_all) {
2837 		struct page *page = bvec->bv_page;
2838 		struct inode *inode = page->mapping->host;
2839 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2840 		const u32 sectorsize = fs_info->sectorsize;
2841 
2842 		/* Our read/write should always be sector aligned. */
2843 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2844 			btrfs_err(fs_info,
2845 		"partial page write in btrfs with offset %u and length %u",
2846 				  bvec->bv_offset, bvec->bv_len);
2847 		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2848 			btrfs_info(fs_info,
2849 		"incomplete page write with offset %u and length %u",
2850 				   bvec->bv_offset, bvec->bv_len);
2851 
2852 		start = page_offset(page) + bvec->bv_offset;
2853 		end = start + bvec->bv_len - 1;
2854 
2855 		if (first_bvec) {
2856 			btrfs_record_physical_zoned(inode, start, bio);
2857 			first_bvec = false;
2858 		}
2859 
2860 		end_extent_writepage(page, error, start, end);
2861 
2862 		btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2863 	}
2864 
2865 	bio_put(bio);
2866 }
2867 
/*
 * Record previously processed extent range
 *
 * For endio_readpage_release_extent() to handle a full extent range, reducing
 * the extent io operations.
 */
struct processed_extent {
	/* Inode owning the recorded range; NULL while nothing is recorded yet */
	struct btrfs_inode *inode;
	/* Start of the range in @inode */
	u64 start;
	/* End of the range in @inode */
	u64 end;
	/* Read result shared by the whole recorded range */
	bool uptodate;
};
2882 
2883 /*
2884  * Try to release processed extent range
2885  *
2886  * May not release the extent range right now if the current range is
2887  * contiguous to processed extent.
2888  *
2889  * Will release processed extent when any of @inode, @uptodate, the range is
2890  * no longer contiguous to the processed range.
2891  *
2892  * Passing @inode == NULL will force processed extent to be released.
2893  */
2894 static void endio_readpage_release_extent(struct processed_extent *processed,
2895 			      struct btrfs_inode *inode, u64 start, u64 end,
2896 			      bool uptodate)
2897 {
2898 	struct extent_state *cached = NULL;
2899 	struct extent_io_tree *tree;
2900 
2901 	/* The first extent, initialize @processed */
2902 	if (!processed->inode)
2903 		goto update;
2904 
2905 	/*
2906 	 * Contiguous to processed extent, just uptodate the end.
2907 	 *
2908 	 * Several things to notice:
2909 	 *
2910 	 * - bio can be merged as long as on-disk bytenr is contiguous
2911 	 *   This means we can have page belonging to other inodes, thus need to
2912 	 *   check if the inode still matches.
2913 	 * - bvec can contain range beyond current page for multi-page bvec
2914 	 *   Thus we need to do processed->end + 1 >= start check
2915 	 */
2916 	if (processed->inode == inode && processed->uptodate == uptodate &&
2917 	    processed->end + 1 >= start && end >= processed->end) {
2918 		processed->end = end;
2919 		return;
2920 	}
2921 
2922 	tree = &processed->inode->io_tree;
2923 	/*
2924 	 * Now we don't have range contiguous to the processed range, release
2925 	 * the processed range now.
2926 	 */
2927 	if (processed->uptodate && tree->track_uptodate)
2928 		set_extent_uptodate(tree, processed->start, processed->end,
2929 				    &cached, GFP_ATOMIC);
2930 	unlock_extent_cached_atomic(tree, processed->start, processed->end,
2931 				    &cached);
2932 
2933 update:
2934 	/* Update processed to current range */
2935 	processed->inode = inode;
2936 	processed->start = start;
2937 	processed->end = end;
2938 	processed->uptodate = uptodate;
2939 }
2940 
2941 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2942 {
2943 	ASSERT(PageLocked(page));
2944 	if (!btrfs_is_subpage(fs_info, page))
2945 		return;
2946 
2947 	ASSERT(PagePrivate(page));
2948 	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2949 }
2950 
2951 /*
2952  * Find extent buffer for a givne bytenr.
2953  *
2954  * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2955  * in endio context.
2956  */
2957 static struct extent_buffer *find_extent_buffer_readpage(
2958 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2959 {
2960 	struct extent_buffer *eb;
2961 
2962 	/*
2963 	 * For regular sectorsize, we can use page->private to grab extent
2964 	 * buffer
2965 	 */
2966 	if (fs_info->nodesize >= PAGE_SIZE) {
2967 		ASSERT(PagePrivate(page) && page->private);
2968 		return (struct extent_buffer *)page->private;
2969 	}
2970 
2971 	/* For subpage case, we need to lookup buffer radix tree */
2972 	rcu_read_lock();
2973 	eb = radix_tree_lookup(&fs_info->buffer_radix,
2974 			       bytenr >> fs_info->sectorsize_bits);
2975 	rcu_read_unlock();
2976 	ASSERT(eb);
2977 	return eb;
2978 }
2979 
2980 /*
2981  * after a readpage IO is done, we need to:
2982  * clear the uptodate bits on error
2983  * set the uptodate bits if things worked
2984  * set the page up to date if all extents in the tree are uptodate
2985  * clear the lock bit in the extent tree
2986  * unlock the page if there are no other extents locked for it
2987  *
2988  * Scheduling is not allowed, so the extent state tree is expected
2989  * to have one and only one object corresponding to this IO.
2990  */
2991 static void end_bio_extent_readpage(struct bio *bio)
2992 {
2993 	struct bio_vec *bvec;
2994 	struct btrfs_bio *bbio = btrfs_bio(bio);
2995 	struct extent_io_tree *tree, *failure_tree;
2996 	struct processed_extent processed = { 0 };
2997 	/*
2998 	 * The offset to the beginning of a bio, since one bio can never be
2999 	 * larger than UINT_MAX, u32 here is enough.
3000 	 */
3001 	u32 bio_offset = 0;
3002 	int mirror;
3003 	struct bvec_iter_all iter_all;
3004 
3005 	ASSERT(!bio_flagged(bio, BIO_CLONED));
3006 	bio_for_each_segment_all(bvec, bio, iter_all) {
3007 		bool uptodate = !bio->bi_status;
3008 		struct page *page = bvec->bv_page;
3009 		struct inode *inode = page->mapping->host;
3010 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3011 		const u32 sectorsize = fs_info->sectorsize;
3012 		unsigned int error_bitmap = (unsigned int)-1;
3013 		bool repair = false;
3014 		u64 start;
3015 		u64 end;
3016 		u32 len;
3017 
3018 		btrfs_debug(fs_info,
3019 			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
3020 			bio->bi_iter.bi_sector, bio->bi_status,
3021 			bbio->mirror_num);
3022 		tree = &BTRFS_I(inode)->io_tree;
3023 		failure_tree = &BTRFS_I(inode)->io_failure_tree;
3024 
3025 		/*
3026 		 * We always issue full-sector reads, but if some block in a
3027 		 * page fails to read, blk_update_request() will advance
3028 		 * bv_offset and adjust bv_len to compensate.  Print a warning
3029 		 * for unaligned offsets, and an error if they don't add up to
3030 		 * a full sector.
3031 		 */
3032 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3033 			btrfs_err(fs_info,
3034 		"partial page read in btrfs with offset %u and length %u",
3035 				  bvec->bv_offset, bvec->bv_len);
3036 		else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3037 				     sectorsize))
3038 			btrfs_info(fs_info,
3039 		"incomplete page read with offset %u and length %u",
3040 				   bvec->bv_offset, bvec->bv_len);
3041 
3042 		start = page_offset(page) + bvec->bv_offset;
3043 		end = start + bvec->bv_len - 1;
3044 		len = bvec->bv_len;
3045 
3046 		mirror = bbio->mirror_num;
3047 		if (likely(uptodate)) {
3048 			if (is_data_inode(inode)) {
3049 				error_bitmap = btrfs_verify_data_csum(bbio,
3050 						bio_offset, page, start, end);
3051 				if (error_bitmap)
3052 					uptodate = false;
3053 			} else {
3054 				if (btrfs_validate_metadata_buffer(bbio,
3055 						page, start, end, mirror))
3056 					uptodate = false;
3057 			}
3058 		}
3059 
3060 		if (likely(uptodate)) {
3061 			loff_t i_size = i_size_read(inode);
3062 			pgoff_t end_index = i_size >> PAGE_SHIFT;
3063 
3064 			clean_io_failure(BTRFS_I(inode)->root->fs_info,
3065 					 failure_tree, tree, start, page,
3066 					 btrfs_ino(BTRFS_I(inode)), 0);
3067 
3068 			/*
3069 			 * Zero out the remaining part if this range straddles
3070 			 * i_size.
3071 			 *
3072 			 * Here we should only zero the range inside the bvec,
3073 			 * not touch anything else.
3074 			 *
3075 			 * NOTE: i_size is exclusive while end is inclusive.
3076 			 */
3077 			if (page->index == end_index && i_size <= end) {
3078 				u32 zero_start = max(offset_in_page(i_size),
3079 						     offset_in_page(start));
3080 
3081 				zero_user_segment(page, zero_start,
3082 						  offset_in_page(end) + 1);
3083 			}
3084 		} else if (is_data_inode(inode)) {
3085 			/*
3086 			 * Only try to repair bios that actually made it to a
3087 			 * device.  If the bio failed to be submitted mirror
3088 			 * is 0 and we need to fail it without retrying.
3089 			 *
3090 			 * This also includes the high level bios for compressed
3091 			 * extents - these never make it to a device and repair
3092 			 * is already handled on the lower compressed bio.
3093 			 */
3094 			if (mirror > 0)
3095 				repair = true;
3096 		} else {
3097 			struct extent_buffer *eb;
3098 
3099 			eb = find_extent_buffer_readpage(fs_info, page, start);
3100 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3101 			eb->read_mirror = mirror;
3102 			atomic_dec(&eb->io_pages);
3103 		}
3104 
3105 		if (repair) {
3106 			/*
3107 			 * submit_data_read_repair() will handle all the good
3108 			 * and bad sectors, we just continue to the next bvec.
3109 			 */
3110 			submit_data_read_repair(inode, bbio, bio_offset, bvec,
3111 						error_bitmap);
3112 		} else {
3113 			/* Update page status and unlock */
3114 			end_page_read(page, uptodate, start, len);
3115 			endio_readpage_release_extent(&processed, BTRFS_I(inode),
3116 					start, end, PageUptodate(page));
3117 		}
3118 
3119 		ASSERT(bio_offset + len > bio_offset);
3120 		bio_offset += len;
3121 
3122 	}
3123 	/* Release the last extent */
3124 	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
3125 	btrfs_bio_free_csum(bbio);
3126 	bio_put(bio);
3127 }
3128 
3129 /**
3130  * Populate every free slot in a provided array with pages.
3131  *
3132  * @nr_pages:   number of pages to allocate
3133  * @page_array: the array to fill with pages; any existing non-null entries in
3134  * 		the array will be skipped
3135  *
3136  * Return: 0        if all pages were able to be allocated;
3137  *         -ENOMEM  otherwise, and the caller is responsible for freeing all
3138  *                  non-null page pointers in the array.
3139  */
3140 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3141 {
3142 	unsigned int allocated;
3143 
3144 	for (allocated = 0; allocated < nr_pages;) {
3145 		unsigned int last = allocated;
3146 
3147 		allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3148 
3149 		if (allocated == nr_pages)
3150 			return 0;
3151 
3152 		/*
3153 		 * During this iteration, no page could be allocated, even
3154 		 * though alloc_pages_bulk_array() falls back to alloc_page()
3155 		 * if  it could not bulk-allocate. So we must be out of memory.
3156 		 */
3157 		if (allocated == last)
3158 			return -ENOMEM;
3159 
3160 		memalloc_retry_wait(GFP_NOFS);
3161 	}
3162 	return 0;
3163 }
3164 
3165 /*
3166  * Initialize the members up to but not including 'bio'. Use after allocating a
3167  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3168  * 'bio' because use of __GFP_ZERO is not supported.
3169  */
3170 static inline void btrfs_bio_init(struct btrfs_bio *bbio)
3171 {
3172 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
3173 }
3174 
3175 /*
3176  * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
3177  *
3178  * The bio allocation is backed by bioset and does not fail.
3179  */
3180 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
3181 {
3182 	struct bio *bio;
3183 
3184 	ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
3185 	bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
3186 	btrfs_bio_init(btrfs_bio(bio));
3187 	return bio;
3188 }
3189 
3190 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
3191 {
3192 	struct bio *bio;
3193 	struct btrfs_bio *bbio;
3194 
3195 	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3196 
3197 	/* this will never fail when it's backed by a bioset */
3198 	bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
3199 	ASSERT(bio);
3200 
3201 	bbio = btrfs_bio(bio);
3202 	btrfs_bio_init(bbio);
3203 
3204 	bio_trim(bio, offset >> 9, size >> 9);
3205 	bbio->iter = bio->bi_iter;
3206 	return bio;
3207 }
3208 
3209 /**
3210  * Attempt to add a page to bio
3211  *
3212  * @bio_ctrl:	record both the bio, and its bio_flags
3213  * @page:	page to add to the bio
3214  * @disk_bytenr:  offset of the new bio or to check whether we are adding
3215  *                a contiguous page to the previous one
3216  * @size:	portion of page that we want to write
3217  * @pg_offset:	starting offset in the page
3218  * @compress_type:   compression type of the current bio to see if we can merge them
3219  *
3220  * Attempt to add a page to bio considering stripe alignment etc.
3221  *
3222  * Return >= 0 for the number of bytes added to the bio.
3223  * Can return 0 if the current bio is already at stripe/zone boundary.
3224  * Return <0 for error.
3225  */
3226 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3227 			      struct page *page,
3228 			      u64 disk_bytenr, unsigned int size,
3229 			      unsigned int pg_offset,
3230 			      enum btrfs_compression_type compress_type)
3231 {
3232 	struct bio *bio = bio_ctrl->bio;
3233 	u32 bio_size = bio->bi_iter.bi_size;
3234 	u32 real_size;
3235 	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3236 	bool contig = false;
3237 	int ret;
3238 
3239 	ASSERT(bio);
3240 	/* The limit should be calculated when bio_ctrl->bio is allocated */
3241 	ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3242 	if (bio_ctrl->compress_type != compress_type)
3243 		return 0;
3244 
3245 
3246 	if (bio->bi_iter.bi_size == 0) {
3247 		/* We can always add a page into an empty bio. */
3248 		contig = true;
3249 	} else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
3250 		struct bio_vec *bvec = bio_last_bvec_all(bio);
3251 
3252 		/*
3253 		 * The contig check requires the following conditions to be met:
3254 		 * 1) The pages are belonging to the same inode
3255 		 *    This is implied by the call chain.
3256 		 *
3257 		 * 2) The range has adjacent logical bytenr
3258 		 *
3259 		 * 3) The range has adjacent file offset
3260 		 *    This is required for the usage of btrfs_bio->file_offset.
3261 		 */
3262 		if (bio_end_sector(bio) == sector &&
3263 		    page_offset(bvec->bv_page) + bvec->bv_offset +
3264 		    bvec->bv_len == page_offset(page) + pg_offset)
3265 			contig = true;
3266 	} else {
3267 		/*
3268 		 * For compression, all IO should have its logical bytenr
3269 		 * set to the starting bytenr of the compressed extent.
3270 		 */
3271 		contig = bio->bi_iter.bi_sector == sector;
3272 	}
3273 
3274 	if (!contig)
3275 		return 0;
3276 
3277 	real_size = min(bio_ctrl->len_to_oe_boundary,
3278 			bio_ctrl->len_to_stripe_boundary) - bio_size;
3279 	real_size = min(real_size, size);
3280 
3281 	/*
3282 	 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3283 	 * bio will still execute its endio function on the page!
3284 	 */
3285 	if (real_size == 0)
3286 		return 0;
3287 
3288 	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
3289 		ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
3290 	else
3291 		ret = bio_add_page(bio, page, real_size, pg_offset);
3292 
3293 	return ret;
3294 }
3295 
3296 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3297 			       struct btrfs_inode *inode, u64 file_offset)
3298 {
3299 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3300 	struct btrfs_io_geometry geom;
3301 	struct btrfs_ordered_extent *ordered;
3302 	struct extent_map *em;
3303 	u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3304 	int ret;
3305 
3306 	/*
3307 	 * Pages for compressed extent are never submitted to disk directly,
3308 	 * thus it has no real boundary, just set them to U32_MAX.
3309 	 *
3310 	 * The split happens for real compressed bio, which happens in
3311 	 * btrfs_submit_compressed_read/write().
3312 	 */
3313 	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
3314 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3315 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3316 		return 0;
3317 	}
3318 	em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3319 	if (IS_ERR(em))
3320 		return PTR_ERR(em);
3321 	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3322 				    logical, &geom);
3323 	free_extent_map(em);
3324 	if (ret < 0) {
3325 		return ret;
3326 	}
3327 	if (geom.len > U32_MAX)
3328 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3329 	else
3330 		bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3331 
3332 	if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3333 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3334 		return 0;
3335 	}
3336 
3337 	/* Ordered extent not yet created, so we're good */
3338 	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
3339 	if (!ordered) {
3340 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3341 		return 0;
3342 	}
3343 
3344 	bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3345 		ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3346 	btrfs_put_ordered_extent(ordered);
3347 	return 0;
3348 }
3349 
3350 static int alloc_new_bio(struct btrfs_inode *inode,
3351 			 struct btrfs_bio_ctrl *bio_ctrl,
3352 			 struct writeback_control *wbc,
3353 			 blk_opf_t opf,
3354 			 bio_end_io_t end_io_func,
3355 			 u64 disk_bytenr, u32 offset, u64 file_offset,
3356 			 enum btrfs_compression_type compress_type)
3357 {
3358 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3359 	struct bio *bio;
3360 	int ret;
3361 
3362 	bio = btrfs_bio_alloc(BIO_MAX_VECS);
3363 	/*
3364 	 * For compressed page range, its disk_bytenr is always @disk_bytenr
3365 	 * passed in, no matter if we have added any range into previous bio.
3366 	 */
3367 	if (compress_type != BTRFS_COMPRESS_NONE)
3368 		bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
3369 	else
3370 		bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
3371 	bio_ctrl->bio = bio;
3372 	bio_ctrl->compress_type = compress_type;
3373 	bio->bi_end_io = end_io_func;
3374 	bio->bi_opf = opf;
3375 	ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3376 	if (ret < 0)
3377 		goto error;
3378 
3379 	if (wbc) {
3380 		/*
3381 		 * For Zone append we need the correct block_device that we are
3382 		 * going to write to set in the bio to be able to respect the
3383 		 * hardware limitation.  Look it up here:
3384 		 */
3385 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3386 			struct btrfs_device *dev;
3387 
3388 			dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3389 						     fs_info->sectorsize);
3390 			if (IS_ERR(dev)) {
3391 				ret = PTR_ERR(dev);
3392 				goto error;
3393 			}
3394 
3395 			bio_set_dev(bio, dev->bdev);
3396 		} else {
3397 			/*
3398 			 * Otherwise pick the last added device to support
3399 			 * cgroup writeback.  For multi-device file systems this
3400 			 * means blk-cgroup policies have to always be set on the
3401 			 * last added/replaced device.  This is a bit odd but has
3402 			 * been like that for a long time.
3403 			 */
3404 			bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
3405 		}
3406 		wbc_init_bio(wbc, bio);
3407 	} else {
3408 		ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
3409 	}
3410 	return 0;
3411 error:
3412 	bio_ctrl->bio = NULL;
3413 	bio->bi_status = errno_to_blk_status(ret);
3414 	bio_endio(bio);
3415 	return ret;
3416 }
3417 
3418 /*
3419  * @opf:	bio REQ_OP_* and REQ_* flags as one value
3420  * @wbc:	optional writeback control for io accounting
3421  * @page:	page to add to the bio
3422  * @disk_bytenr: logical bytenr where the write will be
3423  * @size:	portion of page that we want to write to
3424  * @pg_offset:	offset of the new bio or to check whether we are adding
3425  *              a contiguous page to the previous one
3426  * @bio_ret:	must be valid pointer, newly allocated bio will be stored there
3427  * @end_io_func:     end_io callback for new bio
3428  * @mirror_num:	     desired mirror to read/write
3429  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3430  * @compress_type:   compress type for current bio
3431  */
3432 static int submit_extent_page(blk_opf_t opf,
3433 			      struct writeback_control *wbc,
3434 			      struct btrfs_bio_ctrl *bio_ctrl,
3435 			      struct page *page, u64 disk_bytenr,
3436 			      size_t size, unsigned long pg_offset,
3437 			      bio_end_io_t end_io_func,
3438 			      enum btrfs_compression_type compress_type,
3439 			      bool force_bio_submit)
3440 {
3441 	int ret = 0;
3442 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3443 	unsigned int cur = pg_offset;
3444 
3445 	ASSERT(bio_ctrl);
3446 
3447 	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3448 	       pg_offset + size <= PAGE_SIZE);
3449 	if (force_bio_submit)
3450 		submit_one_bio(bio_ctrl);
3451 
3452 	while (cur < pg_offset + size) {
3453 		u32 offset = cur - pg_offset;
3454 		int added;
3455 
3456 		/* Allocate new bio if needed */
3457 		if (!bio_ctrl->bio) {
3458 			ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3459 					    end_io_func, disk_bytenr, offset,
3460 					    page_offset(page) + cur,
3461 					    compress_type);
3462 			if (ret < 0)
3463 				return ret;
3464 		}
3465 		/*
3466 		 * We must go through btrfs_bio_add_page() to ensure each
3467 		 * page range won't cross various boundaries.
3468 		 */
3469 		if (compress_type != BTRFS_COMPRESS_NONE)
3470 			added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3471 					size - offset, pg_offset + offset,
3472 					compress_type);
3473 		else
3474 			added = btrfs_bio_add_page(bio_ctrl, page,
3475 					disk_bytenr + offset, size - offset,
3476 					pg_offset + offset, compress_type);
3477 
3478 		/* Metadata page range should never be split */
3479 		if (!is_data_inode(&inode->vfs_inode))
3480 			ASSERT(added == 0 || added == size - offset);
3481 
3482 		/* At least we added some page, update the account */
3483 		if (wbc && added)
3484 			wbc_account_cgroup_owner(wbc, page, added);
3485 
3486 		/* We have reached boundary, submit right now */
3487 		if (added < size - offset) {
3488 			/* The bio should contain some page(s) */
3489 			ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3490 			submit_one_bio(bio_ctrl);
3491 		}
3492 		cur += added;
3493 	}
3494 	return 0;
3495 }
3496 
3497 static int attach_extent_buffer_page(struct extent_buffer *eb,
3498 				     struct page *page,
3499 				     struct btrfs_subpage *prealloc)
3500 {
3501 	struct btrfs_fs_info *fs_info = eb->fs_info;
3502 	int ret = 0;
3503 
3504 	/*
3505 	 * If the page is mapped to btree inode, we should hold the private
3506 	 * lock to prevent race.
3507 	 * For cloned or dummy extent buffers, their pages are not mapped and
3508 	 * will not race with any other ebs.
3509 	 */
3510 	if (page->mapping)
3511 		lockdep_assert_held(&page->mapping->private_lock);
3512 
3513 	if (fs_info->nodesize >= PAGE_SIZE) {
3514 		if (!PagePrivate(page))
3515 			attach_page_private(page, eb);
3516 		else
3517 			WARN_ON(page->private != (unsigned long)eb);
3518 		return 0;
3519 	}
3520 
3521 	/* Already mapped, just free prealloc */
3522 	if (PagePrivate(page)) {
3523 		btrfs_free_subpage(prealloc);
3524 		return 0;
3525 	}
3526 
3527 	if (prealloc)
3528 		/* Has preallocated memory for subpage */
3529 		attach_page_private(page, prealloc);
3530 	else
3531 		/* Do new allocation to attach subpage */
3532 		ret = btrfs_attach_subpage(fs_info, page,
3533 					   BTRFS_SUBPAGE_METADATA);
3534 	return ret;
3535 }
3536 
3537 int set_page_extent_mapped(struct page *page)
3538 {
3539 	struct btrfs_fs_info *fs_info;
3540 
3541 	ASSERT(page->mapping);
3542 
3543 	if (PagePrivate(page))
3544 		return 0;
3545 
3546 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3547 
3548 	if (btrfs_is_subpage(fs_info, page))
3549 		return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3550 
3551 	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3552 	return 0;
3553 }
3554 
3555 void clear_page_extent_mapped(struct page *page)
3556 {
3557 	struct btrfs_fs_info *fs_info;
3558 
3559 	ASSERT(page->mapping);
3560 
3561 	if (!PagePrivate(page))
3562 		return;
3563 
3564 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3565 	if (btrfs_is_subpage(fs_info, page))
3566 		return btrfs_detach_subpage(fs_info, page);
3567 
3568 	detach_page_private(page);
3569 }
3570 
3571 static struct extent_map *
3572 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3573 		 u64 start, u64 len, struct extent_map **em_cached)
3574 {
3575 	struct extent_map *em;
3576 
3577 	if (em_cached && *em_cached) {
3578 		em = *em_cached;
3579 		if (extent_map_in_tree(em) && start >= em->start &&
3580 		    start < extent_map_end(em)) {
3581 			refcount_inc(&em->refs);
3582 			return em;
3583 		}
3584 
3585 		free_extent_map(em);
3586 		*em_cached = NULL;
3587 	}
3588 
3589 	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3590 	if (em_cached && !IS_ERR(em)) {
3591 		BUG_ON(*em_cached);
3592 		refcount_inc(&em->refs);
3593 		*em_cached = em;
3594 	}
3595 	return em;
3596 }
3597 /*
3598  * basic readpage implementation.  Locked extent state structs are inserted
3599  * into the tree that are removed when the IO is done (by the end_io
3600  * handlers)
3601  * XXX JDM: This needs looking at to ensure proper page locking
3602  * return 0 on success, otherwise return error
3603  */
3604 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3605 		      struct btrfs_bio_ctrl *bio_ctrl,
3606 		      blk_opf_t read_flags, u64 *prev_em_start)
3607 {
3608 	struct inode *inode = page->mapping->host;
3609 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3610 	u64 start = page_offset(page);
3611 	const u64 end = start + PAGE_SIZE - 1;
3612 	u64 cur = start;
3613 	u64 extent_offset;
3614 	u64 last_byte = i_size_read(inode);
3615 	u64 block_start;
3616 	u64 cur_end;
3617 	struct extent_map *em;
3618 	int ret = 0;
3619 	size_t pg_offset = 0;
3620 	size_t iosize;
3621 	size_t blocksize = inode->i_sb->s_blocksize;
3622 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3623 
3624 	ret = set_page_extent_mapped(page);
3625 	if (ret < 0) {
3626 		unlock_extent(tree, start, end);
3627 		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3628 		unlock_page(page);
3629 		goto out;
3630 	}
3631 
3632 	if (page->index == last_byte >> PAGE_SHIFT) {
3633 		size_t zero_offset = offset_in_page(last_byte);
3634 
3635 		if (zero_offset) {
3636 			iosize = PAGE_SIZE - zero_offset;
3637 			memzero_page(page, zero_offset, iosize);
3638 		}
3639 	}
3640 	begin_page_read(fs_info, page);
3641 	while (cur <= end) {
3642 		unsigned long this_bio_flag = 0;
3643 		bool force_bio_submit = false;
3644 		u64 disk_bytenr;
3645 
3646 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
3647 		if (cur >= last_byte) {
3648 			struct extent_state *cached = NULL;
3649 
3650 			iosize = PAGE_SIZE - pg_offset;
3651 			memzero_page(page, pg_offset, iosize);
3652 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3653 					    &cached, GFP_NOFS);
3654 			unlock_extent_cached(tree, cur,
3655 					     cur + iosize - 1, &cached);
3656 			end_page_read(page, true, cur, iosize);
3657 			break;
3658 		}
3659 		em = __get_extent_map(inode, page, pg_offset, cur,
3660 				      end - cur + 1, em_cached);
3661 		if (IS_ERR(em)) {
3662 			unlock_extent(tree, cur, end);
3663 			end_page_read(page, false, cur, end + 1 - cur);
3664 			ret = PTR_ERR(em);
3665 			break;
3666 		}
3667 		extent_offset = cur - em->start;
3668 		BUG_ON(extent_map_end(em) <= cur);
3669 		BUG_ON(end < cur);
3670 
3671 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3672 			this_bio_flag = em->compress_type;
3673 
3674 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
3675 		cur_end = min(extent_map_end(em) - 1, end);
3676 		iosize = ALIGN(iosize, blocksize);
3677 		if (this_bio_flag != BTRFS_COMPRESS_NONE)
3678 			disk_bytenr = em->block_start;
3679 		else
3680 			disk_bytenr = em->block_start + extent_offset;
3681 		block_start = em->block_start;
3682 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3683 			block_start = EXTENT_MAP_HOLE;
3684 
3685 		/*
3686 		 * If we have a file range that points to a compressed extent
3687 		 * and it's followed by a consecutive file range that points
3688 		 * to the same compressed extent (possibly with a different
3689 		 * offset and/or length, so it either points to the whole extent
3690 		 * or only part of it), we must make sure we do not submit a
3691 		 * single bio to populate the pages for the 2 ranges because
3692 		 * this makes the compressed extent read zero out the pages
3693 		 * belonging to the 2nd range. Imagine the following scenario:
3694 		 *
3695 		 *  File layout
3696 		 *  [0 - 8K]                     [8K - 24K]
3697 		 *    |                               |
3698 		 *    |                               |
3699 		 * points to extent X,         points to extent X,
3700 		 * offset 4K, length of 8K     offset 0, length 16K
3701 		 *
3702 		 * [extent X, compressed length = 4K uncompressed length = 16K]
3703 		 *
3704 		 * If the bio to read the compressed extent covers both ranges,
3705 		 * it will decompress extent X into the pages belonging to the
3706 		 * first range and then it will stop, zeroing out the remaining
3707 		 * pages that belong to the other range that points to extent X.
3708 		 * So here we make sure we submit 2 bios, one for the first
3709 		 * range and another one for the third range. Both will target
3710 		 * the same physical extent from disk, but we can't currently
3711 		 * make the compressed bio endio callback populate the pages
3712 		 * for both ranges because each compressed bio is tightly
3713 		 * coupled with a single extent map, and each range can have
3714 		 * an extent map with a different offset value relative to the
3715 		 * uncompressed data of our extent and different lengths. This
3716 		 * is a corner case so we prioritize correctness over
3717 		 * non-optimal behavior (submitting 2 bios for the same extent).
3718 		 */
3719 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3720 		    prev_em_start && *prev_em_start != (u64)-1 &&
3721 		    *prev_em_start != em->start)
3722 			force_bio_submit = true;
3723 
3724 		if (prev_em_start)
3725 			*prev_em_start = em->start;
3726 
3727 		free_extent_map(em);
3728 		em = NULL;
3729 
3730 		/* we've found a hole, just zero and go on */
3731 		if (block_start == EXTENT_MAP_HOLE) {
3732 			struct extent_state *cached = NULL;
3733 
3734 			memzero_page(page, pg_offset, iosize);
3735 
3736 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3737 					    &cached, GFP_NOFS);
3738 			unlock_extent_cached(tree, cur,
3739 					     cur + iosize - 1, &cached);
3740 			end_page_read(page, true, cur, iosize);
3741 			cur = cur + iosize;
3742 			pg_offset += iosize;
3743 			continue;
3744 		}
3745 		/* the get_extent function already copied into the page */
3746 		if (test_range_bit(tree, cur, cur_end,
3747 				   EXTENT_UPTODATE, 1, NULL)) {
3748 			unlock_extent(tree, cur, cur + iosize - 1);
3749 			end_page_read(page, true, cur, iosize);
3750 			cur = cur + iosize;
3751 			pg_offset += iosize;
3752 			continue;
3753 		}
3754 		/* we have an inline extent but it didn't get marked up
3755 		 * to date.  Error out
3756 		 */
3757 		if (block_start == EXTENT_MAP_INLINE) {
3758 			unlock_extent(tree, cur, cur + iosize - 1);
3759 			end_page_read(page, false, cur, iosize);
3760 			cur = cur + iosize;
3761 			pg_offset += iosize;
3762 			continue;
3763 		}
3764 
3765 		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3766 					 bio_ctrl, page, disk_bytenr, iosize,
3767 					 pg_offset, end_bio_extent_readpage,
3768 					 this_bio_flag, force_bio_submit);
3769 		if (ret) {
3770 			/*
3771 			 * We have to unlock the remaining range, or the page
3772 			 * will never be unlocked.
3773 			 */
3774 			unlock_extent(tree, cur, end);
3775 			end_page_read(page, false, cur, end + 1 - cur);
3776 			goto out;
3777 		}
3778 		cur = cur + iosize;
3779 		pg_offset += iosize;
3780 	}
3781 out:
3782 	return ret;
3783 }
3784 
3785 int btrfs_read_folio(struct file *file, struct folio *folio)
3786 {
3787 	struct page *page = &folio->page;
3788 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3789 	u64 start = page_offset(page);
3790 	u64 end = start + PAGE_SIZE - 1;
3791 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
3792 	int ret;
3793 
3794 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3795 
3796 	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3797 	/*
3798 	 * If btrfs_do_readpage() failed we will want to submit the assembled
3799 	 * bio to do the cleanup.
3800 	 */
3801 	submit_one_bio(&bio_ctrl);
3802 	return ret;
3803 }
3804 
3805 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3806 					u64 start, u64 end,
3807 					struct extent_map **em_cached,
3808 					struct btrfs_bio_ctrl *bio_ctrl,
3809 					u64 *prev_em_start)
3810 {
3811 	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3812 	int index;
3813 
3814 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3815 
3816 	for (index = 0; index < nr_pages; index++) {
3817 		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3818 				  REQ_RAHEAD, prev_em_start);
3819 		put_page(pages[index]);
3820 	}
3821 }
3822 
3823 /*
3824  * helper for __extent_writepage, doing all of the delayed allocation setup.
3825  *
3826  * This returns 1 if btrfs_run_delalloc_range function did all the work required
3827  * to write the page (copy into inline extent).  In this case the IO has
3828  * been started and the page is already unlocked.
3829  *
3830  * This returns 0 if all went well (page still locked)
3831  * This returns < 0 if there were errors (page still locked)
3832  */
3833 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3834 		struct page *page, struct writeback_control *wbc)
3835 {
3836 	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3837 	u64 delalloc_start = page_offset(page);
3838 	u64 delalloc_to_write = 0;
3839 	/* How many pages are started by btrfs_run_delalloc_range() */
3840 	unsigned long nr_written = 0;
3841 	int ret;
3842 	int page_started = 0;
3843 
3844 	while (delalloc_start < page_end) {
3845 		u64 delalloc_end = page_end;
3846 		bool found;
3847 
3848 		found = find_lock_delalloc_range(&inode->vfs_inode, page,
3849 					       &delalloc_start,
3850 					       &delalloc_end);
3851 		if (!found) {
3852 			delalloc_start = delalloc_end + 1;
3853 			continue;
3854 		}
3855 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3856 				delalloc_end, &page_started, &nr_written, wbc);
3857 		if (ret) {
3858 			btrfs_page_set_error(inode->root->fs_info, page,
3859 					     page_offset(page), PAGE_SIZE);
3860 			return ret;
3861 		}
3862 		/*
3863 		 * delalloc_end is already one less than the total length, so
3864 		 * we don't subtract one from PAGE_SIZE
3865 		 */
3866 		delalloc_to_write += (delalloc_end - delalloc_start +
3867 				      PAGE_SIZE) >> PAGE_SHIFT;
3868 		delalloc_start = delalloc_end + 1;
3869 	}
3870 	if (wbc->nr_to_write < delalloc_to_write) {
3871 		int thresh = 8192;
3872 
3873 		if (delalloc_to_write < thresh * 2)
3874 			thresh = delalloc_to_write;
3875 		wbc->nr_to_write = min_t(u64, delalloc_to_write,
3876 					 thresh);
3877 	}
3878 
3879 	/* Did btrfs_run_dealloc_range() already unlock and start the IO? */
3880 	if (page_started) {
3881 		/*
3882 		 * We've unlocked the page, so we can't update the mapping's
3883 		 * writeback index, just update nr_to_write.
3884 		 */
3885 		wbc->nr_to_write -= nr_written;
3886 		return 1;
3887 	}
3888 
3889 	return 0;
3890 }
3891 
3892 /*
3893  * Find the first byte we need to write.
3894  *
3895  * For subpage, one page can contain several sectors, and
3896  * __extent_writepage_io() will just grab all extent maps in the page
3897  * range and try to submit all non-inline/non-compressed extents.
3898  *
3899  * This is a big problem for subpage, we shouldn't re-submit already written
3900  * data at all.
3901  * This function will lookup subpage dirty bit to find which range we really
3902  * need to submit.
3903  *
3904  * Return the next dirty range in [@start, @end).
3905  * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3906  */
3907 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3908 				 struct page *page, u64 *start, u64 *end)
3909 {
3910 	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
3911 	struct btrfs_subpage_info *spi = fs_info->subpage_info;
3912 	u64 orig_start = *start;
3913 	/* Declare as unsigned long so we can use bitmap ops */
3914 	unsigned long flags;
3915 	int range_start_bit;
3916 	int range_end_bit;
3917 
3918 	/*
3919 	 * For regular sector size == page size case, since one page only
3920 	 * contains one sector, we return the page offset directly.
3921 	 */
3922 	if (!btrfs_is_subpage(fs_info, page)) {
3923 		*start = page_offset(page);
3924 		*end = page_offset(page) + PAGE_SIZE;
3925 		return;
3926 	}
3927 
3928 	range_start_bit = spi->dirty_offset +
3929 			  (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3930 
3931 	/* We should have the page locked, but just in case */
3932 	spin_lock_irqsave(&subpage->lock, flags);
3933 	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3934 			       spi->dirty_offset + spi->bitmap_nr_bits);
3935 	spin_unlock_irqrestore(&subpage->lock, flags);
3936 
3937 	range_start_bit -= spi->dirty_offset;
3938 	range_end_bit -= spi->dirty_offset;
3939 
3940 	*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3941 	*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3942 }
3943 
/*
 * Helper for __extent_writepage().  Runs the COW fixup check, then walks the
 * dirty ranges of @page, looks up the extent map for each one and adds the
 * range to the write bio through submit_extent_page().
 *
 * @inode:  inode that @page belongs to
 * @page:   page to write back
 * @wbc:    writeback control, nr_to_write is decremented once for this page
 * @epd:    write context, ranges are added to epd->bio_ctrl
 * @i_size: inode size to use, ranges starting at or beyond it are not written
 * @nr_ret: output, number of ranges counted for this page.  Compressed ranges
 *          are counted although they are not submitted here.  Not written
 *          when 1 is returned.
 *
 * We return 1 if btrfs_writepage_cow_fixup() returned non-zero, in which case
 * the page has been redirtied and unlocked,
 * 0 if all went well (page still locked)
 * < 0 if there were errors (page still locked), the first error hit is the
 * one that gets returned.
 */
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
				 struct page *page,
				 struct writeback_control *wbc,
				 struct extent_page_data *epd,
				 loff_t i_size,
				 int *nr_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	/* [cur, end] is the not yet processed part of the page, end inclusive */
	u64 cur = page_offset(page);
	u64 end = cur + PAGE_SIZE - 1;
	u64 extent_offset;
	u64 block_start;
	struct extent_map *em;
	/* First error seen, later errors do not overwrite it */
	int saved_ret = 0;
	int ret = 0;
	int nr = 0;
	enum req_op op = REQ_OP_WRITE;
	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
	bool has_error = false;
	bool compressed;

	ret = btrfs_writepage_cow_fixup(page);
	if (ret) {
		/* Fixup worker will requeue */
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 1;
	}

	/*
	 * we don't want to touch the inode after unlocking the page,
	 * so we update the mapping writeback index now
	 */
	wbc->nr_to_write--;

	while (cur <= end) {
		u64 disk_bytenr;
		u64 em_end;
		u64 dirty_range_start = cur;
		u64 dirty_range_end;
		u32 iosize;

		if (cur >= i_size) {
			btrfs_writepage_endio_finish_ordered(inode, page, cur,
							     end, true);
			/*
			 * This range is beyond i_size, thus we don't need to
			 * bother writing back.
			 * But we still need to clear the dirty subpage bit, or
			 * the next time the page gets dirtied, we will try to
			 * writeback the sectors with subpage dirty bits,
			 * causing writeback without ordered extent.
			 */
			btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
			break;
		}

		find_next_dirty_byte(fs_info, page, &dirty_range_start,
				     &dirty_range_end);
		/* Nothing dirty at @cur, jump to where the dirty range starts */
		if (cur < dirty_range_start) {
			cur = dirty_range_start;
			continue;
		}

		em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
		if (IS_ERR(em)) {
			/* Flag the whole remaining part of the page and stop */
			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
			ret = PTR_ERR_OR_ZERO(em);
			has_error = true;
			if (!saved_ret)
				saved_ret = ret;
			break;
		}

		extent_offset = cur - em->start;
		em_end = extent_map_end(em);
		ASSERT(cur <= em_end);
		ASSERT(cur < end);
		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
		/* Copy what we need out of @em, it is released just below */
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		disk_bytenr = em->block_start + extent_offset;

		/*
		 * Note that em_end from extent_map_end() and dirty_range_end from
		 * find_next_dirty_byte() are all exclusive
		 */
		iosize = min(min(em_end, end + 1), dirty_range_end) - cur;

		/*
		 * @op lives outside the loop and is never set back to
		 * REQ_OP_WRITE, so once zone append is chosen it is used for
		 * the rest of this page.
		 */
		if (btrfs_use_zone_append(inode, em->block_start))
			op = REQ_OP_ZONE_APPEND;

		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			if (compressed)
				nr++;
			else
				btrfs_writepage_endio_finish_ordered(inode,
						page, cur, cur + iosize - 1, true);
			btrfs_page_clear_dirty(fs_info, page, cur, iosize);
			cur += iosize;
			continue;
		}

		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
		/* Only reported, the range is still submitted below */
		if (!PageWriteback(page)) {
			btrfs_err(inode->root->fs_info,
				   "page %lu not writeback, cur %llu end %llu",
			       page->index, cur, end);
		}

		/*
		 * Although the PageDirty bit is cleared before entering this
		 * function, subpage dirty bit is not cleared.
		 * So clear subpage dirty bit here so next time we won't submit
		 * page for range already written to disk.
		 */
		btrfs_page_clear_dirty(fs_info, page, cur, iosize);

		ret = submit_extent_page(op | write_flags, wbc,
					 &epd->bio_ctrl, page,
					 disk_bytenr, iosize,
					 cur - page_offset(page),
					 end_bio_extent_writepage,
					 0, false);
		if (ret) {
			/*
			 * Record the failure for this range but keep going, the
			 * remaining ranges of the page are still processed.
			 */
			has_error = true;
			if (!saved_ret)
				saved_ret = ret;

			btrfs_page_set_error(fs_info, page, cur, iosize);
			if (PageWriteback(page))
				btrfs_page_clear_writeback(fs_info, page, cur,
							   iosize);
		}

		cur += iosize;
		nr++;
	}
	/*
	 * If we finish without problem, we should not only clear page dirty,
	 * but also empty subpage dirty bits
	 */
	if (!has_error)
		btrfs_page_assert_not_dirty(fs_info, page);
	else
		ret = saved_ret;
	*nr_ret = nr;
	return ret;
}
4110 
4111 /*
4112  * the writepage semantics are similar to regular writepage.  extent
4113  * records are inserted to lock ranges in the tree, and as dirty areas
4114  * are found, they are marked writeback.  Then the lock bits are removed
4115  * and the end_io handler clears the writeback ranges
4116  *
4117  * Return 0 if everything goes well.
4118  * Return <0 for error.
4119  */
4120 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
4121 			      struct extent_page_data *epd)
4122 {
4123 	struct folio *folio = page_folio(page);
4124 	struct inode *inode = page->mapping->host;
4125 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4126 	const u64 page_start = page_offset(page);
4127 	const u64 page_end = page_start + PAGE_SIZE - 1;
4128 	int ret;
4129 	int nr = 0;
4130 	size_t pg_offset;
4131 	loff_t i_size = i_size_read(inode);
4132 	unsigned long end_index = i_size >> PAGE_SHIFT;
4133 
4134 	trace___extent_writepage(page, inode, wbc);
4135 
4136 	WARN_ON(!PageLocked(page));
4137 
4138 	btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4139 			       page_offset(page), PAGE_SIZE);
4140 
4141 	pg_offset = offset_in_page(i_size);
4142 	if (page->index > end_index ||
4143 	   (page->index == end_index && !pg_offset)) {
4144 		folio_invalidate(folio, 0, folio_size(folio));
4145 		folio_unlock(folio);
4146 		return 0;
4147 	}
4148 
4149 	if (page->index == end_index)
4150 		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
4151 
4152 	ret = set_page_extent_mapped(page);
4153 	if (ret < 0) {
4154 		SetPageError(page);
4155 		goto done;
4156 	}
4157 
4158 	if (!epd->extent_locked) {
4159 		ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
4160 		if (ret == 1)
4161 			return 0;
4162 		if (ret)
4163 			goto done;
4164 	}
4165 
4166 	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
4167 				    &nr);
4168 	if (ret == 1)
4169 		return 0;
4170 
4171 done:
4172 	if (nr == 0) {
4173 		/* make sure the mapping tag for page dirty gets cleared */
4174 		set_page_writeback(page);
4175 		end_page_writeback(page);
4176 	}
4177 	/*
4178 	 * Here we used to have a check for PageError() and then set @ret and
4179 	 * call end_extent_writepage().
4180 	 *
4181 	 * But in fact setting @ret here will cause different error paths
4182 	 * between subpage and regular sectorsize.
4183 	 *
4184 	 * For regular page size, we never submit current page, but only add
4185 	 * current page to current bio.
4186 	 * The bio submission can only happen in next page.
4187 	 * Thus if we hit the PageError() branch, @ret is already set to
4188 	 * non-zero value and will not get updated for regular sectorsize.
4189 	 *
4190 	 * But for subpage case, it's possible we submit part of current page,
4191 	 * thus can get PageError() set by submitted bio of the same page,
4192 	 * while our @ret is still 0.
4193 	 *
4194 	 * So here we unify the behavior and don't set @ret.
4195 	 * Error can still be properly passed to higher layer as page will
4196 	 * be set error, here we just don't handle the IO failure.
4197 	 *
4198 	 * NOTE: This is just a hotfix for subpage.
4199 	 * The root fix will be properly ending ordered extent when we hit
4200 	 * an error during writeback.
4201 	 *
4202 	 * But that needs a bigger refactoring, as we not only need to grab the
4203 	 * submitted OE, but also need to know exactly at which bytenr we hit
4204 	 * the error.
4205 	 * Currently the full page based __extent_writepage_io() is not
4206 	 * capable of that.
4207 	 */
4208 	if (PageError(page))
4209 		end_extent_writepage(page, ret, page_start, page_end);
4210 	if (epd->extent_locked) {
4211 		/*
4212 		 * If epd->extent_locked, it's from extent_write_locked_range(),
4213 		 * the page can either be locked by lock_page() or
4214 		 * process_one_page().
4215 		 * Let btrfs_page_unlock_writer() handle both cases.
4216 		 */
4217 		ASSERT(wbc);
4218 		btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4219 					 wbc->range_end + 1 - wbc->range_start);
4220 	} else {
4221 		unlock_page(page);
4222 	}
4223 	ASSERT(ret <= 0);
4224 	return ret;
4225 }
4226 
/*
 * Sleep in TASK_UNINTERRUPTIBLE state until the EXTENT_BUFFER_WRITEBACK bit
 * of eb->bflags is clear.  The bit is cleared, and waiters are woken, by
 * end_extent_buffer_writeback().
 */
void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
{
	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
		       TASK_UNINTERRUPTIBLE);
}
4232 
/*
 * Mark writeback of @eb as finished: clear EXTENT_BUFFER_WRITEBACK in
 * eb->bflags and wake up whoever sleeps on that bit, see
 * wait_on_extent_buffer_writeback().
 */
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	/*
	 * Full barrier between clearing the bit and waking the waiters,
	 * presumably so a woken waiter is guaranteed to see the bit clear.
	 */
	smp_mb__after_atomic();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
4239 
/*
 * Lock extent buffer status and pages for writeback.
 *
 * Moves @eb from DIRTY to WRITEBACK under the tree lock and, for the
 * nodesize >= PAGE_SIZE case, locks all its pages.
 *
 * May try to flush write bio if we can't get the lock.
 *
 * @eb:  extent buffer to prepare
 * @epd: write context, its pending bio is submitted before we block, and
 *       epd->sync_io decides whether we wait for writeback already running
 *
 * Return  0 if the extent buffer doesn't need to be submitted.
 *           (E.g. the extent buffer is not dirty, or it is already under
 *           writeback and this is not sync IO)
 * Return >0 if the extent buffer should be submitted to bio.
 * Return <0 if something went wrong, no page is locked.
 *
 * NOTE(review): no path in this function returns a negative value right now,
 * callers nevertheless check for it.
 */
static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
			  struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	int i, num_pages;
	/* Set once the pending write bio has been submitted, done at most once */
	int flush = 0;
	int ret = 0;

	/* Submit our own pending bio before blocking on the tree lock */
	if (!btrfs_try_tree_write_lock(eb)) {
		submit_write_bio(epd, 0);
		flush = 1;
		btrfs_tree_lock(eb);
	}

	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
		btrfs_tree_unlock(eb);
		if (!epd->sync_io)
			return 0;
		if (!flush) {
			submit_write_bio(epd, 0);
			flush = 1;
		}
		/*
		 * Wait without the tree lock held, then re-check under the
		 * lock as writeback may have been started again meanwhile.
		 */
		while (1) {
			wait_on_extent_buffer_writeback(eb);
			btrfs_tree_lock(eb);
			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
				break;
			btrfs_tree_unlock(eb);
		}
	}

	/*
	 * We need to do this to prevent races in people who check if the eb is
	 * under IO since we can end up having no IO bits set for a short period
	 * of time.
	 */
	spin_lock(&eb->refs_lock);
	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
		spin_unlock(&eb->refs_lock);
		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
		/* The eb is no longer dirty, take it out of the dirty counter */
		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
					 -eb->len,
					 fs_info->dirty_metadata_batch);
		ret = 1;
	} else {
		spin_unlock(&eb->refs_lock);
	}

	btrfs_tree_unlock(eb);

	/*
	 * Either we don't need to submit any tree block, or we're submitting
	 * subpage eb.
	 * Subpage metadata doesn't use page locking at all, so we can skip
	 * the page locking.
	 */
	if (!ret || fs_info->nodesize < PAGE_SIZE)
		return ret;

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		struct page *p = eb->pages[i];

		/* Same rule as above: submit our bio before sleeping on a lock */
		if (!trylock_page(p)) {
			if (!flush) {
				submit_write_bio(epd, 0);
				flush = 1;
			}
			lock_page(p);
		}
	}

	return ret;
}
4325 
4326 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
4327 {
4328 	struct btrfs_fs_info *fs_info = eb->fs_info;
4329 
4330 	btrfs_page_set_error(fs_info, page, eb->start, eb->len);
4331 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4332 		return;
4333 
4334 	/*
4335 	 * A read may stumble upon this buffer later, make sure that it gets an
4336 	 * error and knows there was an error.
4337 	 */
4338 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4339 
4340 	/*
4341 	 * We need to set the mapping with the io error as well because a write
4342 	 * error will flip the file system readonly, and then syncfs() will
4343 	 * return a 0 because we are readonly if we don't modify the err seq for
4344 	 * the superblock.
4345 	 */
4346 	mapping_set_error(page->mapping, -EIO);
4347 
4348 	/*
4349 	 * If we error out, we should add back the dirty_metadata_bytes
4350 	 * to make it consistent.
4351 	 */
4352 	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4353 				 eb->len, fs_info->dirty_metadata_batch);
4354 
4355 	/*
4356 	 * If writeback for a btree extent that doesn't belong to a log tree
4357 	 * failed, increment the counter transaction->eb_write_errors.
4358 	 * We do this because while the transaction is running and before it's
4359 	 * committing (when we call filemap_fdata[write|wait]_range against
4360 	 * the btree inode), we might have
4361 	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4362 	 * returns an error or an error happens during writeback, when we're
4363 	 * committing the transaction we wouldn't know about it, since the pages
4364 	 * can be no longer dirty nor marked anymore for writeback (if a
4365 	 * subsequent modification to the extent buffer didn't happen before the
4366 	 * transaction commit), which makes filemap_fdata[write|wait]_range not
4367 	 * able to find the pages tagged with SetPageError at transaction
4368 	 * commit time. So if this happens we must abort the transaction,
4369 	 * otherwise we commit a super block with btree roots that point to
4370 	 * btree nodes/leafs whose content on disk is invalid - either garbage
4371 	 * or the content of some node/leaf from a past generation that got
4372 	 * cowed or deleted and is no longer valid.
4373 	 *
4374 	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4375 	 * not be enough - we need to distinguish between log tree extents vs
4376 	 * non-log tree extents, and the next filemap_fdatawait_range() call
4377 	 * will catch and clear such errors in the mapping - and that call might
4378 	 * be from a log sync and not from a transaction commit. Also, checking
4379 	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4380 	 * not done and would not be reliable - the eb might have been released
4381 	 * from memory and reading it back again means that flag would not be
4382 	 * set (since it's a runtime flag, not persisted on disk).
4383 	 *
4384 	 * Using the flags below in the btree inode also makes us achieve the
4385 	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4386 	 * writeback for all dirty pages and before filemap_fdatawait_range()
4387 	 * is called, the writeback for all dirty pages had already finished
4388 	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4389 	 * filemap_fdatawait_range() would return success, as it could not know
4390 	 * that writeback errors happened (the pages were no longer tagged for
4391 	 * writeback).
4392 	 */
4393 	switch (eb->log_index) {
4394 	case -1:
4395 		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
4396 		break;
4397 	case 0:
4398 		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
4399 		break;
4400 	case 1:
4401 		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
4402 		break;
4403 	default:
4404 		BUG(); /* unexpected, logic error */
4405 	}
4406 }
4407 
4408 /*
4409  * The endio specific version which won't touch any unsafe spinlock in endio
4410  * context.
4411  */
4412 static struct extent_buffer *find_extent_buffer_nolock(
4413 		struct btrfs_fs_info *fs_info, u64 start)
4414 {
4415 	struct extent_buffer *eb;
4416 
4417 	rcu_read_lock();
4418 	eb = radix_tree_lookup(&fs_info->buffer_radix,
4419 			       start >> fs_info->sectorsize_bits);
4420 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4421 		rcu_read_unlock();
4422 		return eb;
4423 	}
4424 	rcu_read_unlock();
4425 	return NULL;
4426 }
4427 
4428 /*
4429  * The endio function for subpage extent buffer write.
4430  *
4431  * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4432  * after all extent buffers in the page has finished their writeback.
4433  */
4434 static void end_bio_subpage_eb_writepage(struct bio *bio)
4435 {
4436 	struct btrfs_fs_info *fs_info;
4437 	struct bio_vec *bvec;
4438 	struct bvec_iter_all iter_all;
4439 
4440 	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4441 	ASSERT(fs_info->nodesize < PAGE_SIZE);
4442 
4443 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4444 	bio_for_each_segment_all(bvec, bio, iter_all) {
4445 		struct page *page = bvec->bv_page;
4446 		u64 bvec_start = page_offset(page) + bvec->bv_offset;
4447 		u64 bvec_end = bvec_start + bvec->bv_len - 1;
4448 		u64 cur_bytenr = bvec_start;
4449 
4450 		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4451 
4452 		/* Iterate through all extent buffers in the range */
4453 		while (cur_bytenr <= bvec_end) {
4454 			struct extent_buffer *eb;
4455 			int done;
4456 
4457 			/*
4458 			 * Here we can't use find_extent_buffer(), as it may
4459 			 * try to lock eb->refs_lock, which is not safe in endio
4460 			 * context.
4461 			 */
4462 			eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4463 			ASSERT(eb);
4464 
4465 			cur_bytenr = eb->start + eb->len;
4466 
4467 			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4468 			done = atomic_dec_and_test(&eb->io_pages);
4469 			ASSERT(done);
4470 
4471 			if (bio->bi_status ||
4472 			    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4473 				ClearPageUptodate(page);
4474 				set_btree_ioerr(page, eb);
4475 			}
4476 
4477 			btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4478 						      eb->len);
4479 			end_extent_buffer_writeback(eb);
4480 			/*
4481 			 * free_extent_buffer() will grab spinlock which is not
4482 			 * safe in endio context. Thus here we manually dec
4483 			 * the ref.
4484 			 */
4485 			atomic_dec(&eb->refs);
4486 		}
4487 	}
4488 	bio_put(bio);
4489 }
4490 
4491 static void end_bio_extent_buffer_writepage(struct bio *bio)
4492 {
4493 	struct bio_vec *bvec;
4494 	struct extent_buffer *eb;
4495 	int done;
4496 	struct bvec_iter_all iter_all;
4497 
4498 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4499 	bio_for_each_segment_all(bvec, bio, iter_all) {
4500 		struct page *page = bvec->bv_page;
4501 
4502 		eb = (struct extent_buffer *)page->private;
4503 		BUG_ON(!eb);
4504 		done = atomic_dec_and_test(&eb->io_pages);
4505 
4506 		if (bio->bi_status ||
4507 		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4508 			ClearPageUptodate(page);
4509 			set_btree_ioerr(page, eb);
4510 		}
4511 
4512 		end_page_writeback(page);
4513 
4514 		if (!done)
4515 			continue;
4516 
4517 		end_extent_buffer_writeback(eb);
4518 	}
4519 
4520 	bio_put(bio);
4521 }
4522 
4523 static void prepare_eb_write(struct extent_buffer *eb)
4524 {
4525 	u32 nritems;
4526 	unsigned long start;
4527 	unsigned long end;
4528 
4529 	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4530 	atomic_set(&eb->io_pages, num_extent_pages(eb));
4531 
4532 	/* Set btree blocks beyond nritems with 0 to avoid stale content */
4533 	nritems = btrfs_header_nritems(eb);
4534 	if (btrfs_header_level(eb) > 0) {
4535 		end = btrfs_node_key_ptr_offset(nritems);
4536 		memzero_extent_buffer(eb, end, eb->len - end);
4537 	} else {
4538 		/*
4539 		 * Leaf:
4540 		 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4541 		 */
4542 		start = btrfs_item_nr_offset(nritems);
4543 		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4544 		memzero_extent_buffer(eb, start, end - start);
4545 	}
4546 }
4547 
4548 /*
4549  * Unlike the work in write_one_eb(), we rely completely on extent locking.
4550  * Page locking is only utilized at minimum to keep the VMM code happy.
4551  */
4552 static int write_one_subpage_eb(struct extent_buffer *eb,
4553 				struct writeback_control *wbc,
4554 				struct extent_page_data *epd)
4555 {
4556 	struct btrfs_fs_info *fs_info = eb->fs_info;
4557 	struct page *page = eb->pages[0];
4558 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4559 	bool no_dirty_ebs = false;
4560 	int ret;
4561 
4562 	prepare_eb_write(eb);
4563 
4564 	/* clear_page_dirty_for_io() in subpage helper needs page locked */
4565 	lock_page(page);
4566 	btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4567 
4568 	/* Check if this is the last dirty bit to update nr_written */
4569 	no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4570 							  eb->start, eb->len);
4571 	if (no_dirty_ebs)
4572 		clear_page_dirty_for_io(page);
4573 
4574 	ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4575 			&epd->bio_ctrl, page, eb->start, eb->len,
4576 			eb->start - page_offset(page),
4577 			end_bio_subpage_eb_writepage, 0, false);
4578 	if (ret) {
4579 		btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4580 		set_btree_ioerr(page, eb);
4581 		unlock_page(page);
4582 
4583 		if (atomic_dec_and_test(&eb->io_pages))
4584 			end_extent_buffer_writeback(eb);
4585 		return -EIO;
4586 	}
4587 	unlock_page(page);
4588 	/*
4589 	 * Submission finished without problem, if no range of the page is
4590 	 * dirty anymore, we have submitted a page.  Update nr_written in wbc.
4591 	 */
4592 	if (no_dirty_ebs)
4593 		wbc->nr_to_write--;
4594 	return ret;
4595 }
4596 
4597 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
4598 			struct writeback_control *wbc,
4599 			struct extent_page_data *epd)
4600 {
4601 	u64 disk_bytenr = eb->start;
4602 	int i, num_pages;
4603 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4604 	int ret = 0;
4605 
4606 	prepare_eb_write(eb);
4607 
4608 	num_pages = num_extent_pages(eb);
4609 	for (i = 0; i < num_pages; i++) {
4610 		struct page *p = eb->pages[i];
4611 
4612 		clear_page_dirty_for_io(p);
4613 		set_page_writeback(p);
4614 		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4615 					 &epd->bio_ctrl, p, disk_bytenr,
4616 					 PAGE_SIZE, 0,
4617 					 end_bio_extent_buffer_writepage,
4618 					 0, false);
4619 		if (ret) {
4620 			set_btree_ioerr(p, eb);
4621 			if (PageWriteback(p))
4622 				end_page_writeback(p);
4623 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4624 				end_extent_buffer_writeback(eb);
4625 			ret = -EIO;
4626 			break;
4627 		}
4628 		disk_bytenr += PAGE_SIZE;
4629 		wbc->nr_to_write--;
4630 		unlock_page(p);
4631 	}
4632 
4633 	if (unlikely(ret)) {
4634 		for (; i < num_pages; i++) {
4635 			struct page *p = eb->pages[i];
4636 			clear_page_dirty_for_io(p);
4637 			unlock_page(p);
4638 		}
4639 	}
4640 
4641 	return ret;
4642 }
4643 
/*
 * Submit one subpage btree page.
 *
 * Walks the dirty bits of @page and writes every dirty extent buffer found
 * through write_one_subpage_eb().
 *
 * The main difference to submit_eb_page() is:
 * - Page locking
 *   For subpage, we don't rely on page locking at all.
 *
 * - Flush write bio
 *   We only flush bio if we may be unable to fit current extent buffers into
 *   current bio.
 *
 * Return >=0 for the number of submitted extent buffers.
 * Return <0 for fatal error.  The pending write bio is ended with that error
 * before returning.
 */
static int submit_eb_subpage(struct page *page,
			     struct writeback_control *wbc,
			     struct extent_page_data *epd)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
	int submitted = 0;
	u64 page_start = page_offset(page);
	/* Sector index inside the page that is examined next */
	int bit_start = 0;
	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
	int ret;

	/* Lock and write each dirty extent buffers in the range */
	while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
		/*
		 * NOTE(review): page->private is read here, before private_lock
		 * is taken, and only dereferenced after the PagePrivate() check
		 * below - confirm it can not be replaced in between.
		 */
		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
		struct extent_buffer *eb;
		unsigned long flags;
		u64 start;

		/*
		 * Take private lock to ensure the subpage won't be detached
		 * in the meantime.
		 */
		spin_lock(&page->mapping->private_lock);
		if (!PagePrivate(page)) {
			spin_unlock(&page->mapping->private_lock);
			break;
		}
		spin_lock_irqsave(&subpage->lock, flags);
		/* Sector not dirty, move on to the next one */
		if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
			      subpage->bitmaps)) {
			spin_unlock_irqrestore(&subpage->lock, flags);
			spin_unlock(&page->mapping->private_lock);
			bit_start++;
			continue;
		}

		/* A dirty sector marks the start of an eb, skip over all of it */
		start = page_start + bit_start * fs_info->sectorsize;
		bit_start += sectors_per_node;

		/*
		 * Here we just want to grab the eb without touching extra
		 * spin locks, so call find_extent_buffer_nolock().
		 */
		eb = find_extent_buffer_nolock(fs_info, start);
		spin_unlock_irqrestore(&subpage->lock, flags);
		spin_unlock(&page->mapping->private_lock);

		/*
		 * The eb has already reached 0 refs thus find_extent_buffer()
		 * doesn't return it. We don't need to write back such eb
		 * anyway.
		 */
		if (!eb)
			continue;

		ret = lock_extent_buffer_for_io(eb, epd);
		/* 0 means the eb does not need to be written */
		if (ret == 0) {
			free_extent_buffer(eb);
			continue;
		}
		if (ret < 0) {
			free_extent_buffer(eb);
			goto cleanup;
		}
		ret = write_one_subpage_eb(eb, wbc, epd);
		/* Drop the ref taken by find_extent_buffer_nolock() */
		free_extent_buffer(eb);
		if (ret < 0)
			goto cleanup;
		submitted++;
	}
	return submitted;

cleanup:
	/* We hit error, end bio for the submitted extent buffers */
	submit_write_bio(epd, ret);
	return ret;
}
4735 
4736 /*
4737  * Submit all page(s) of one extent buffer.
4738  *
4739  * @page:	the page of one extent buffer
4740  * @eb_context:	to determine if we need to submit this page, if current page
4741  *		belongs to this eb, we don't need to submit
4742  *
4743  * The caller should pass each page in their bytenr order, and here we use
4744  * @eb_context to determine if we have submitted pages of one extent buffer.
4745  *
4746  * If we have, we just skip until we hit a new page that doesn't belong to
4747  * current @eb_context.
4748  *
4749  * If not, we submit all the page(s) of the extent buffer.
4750  *
4751  * Return >0 if we have submitted the extent buffer successfully.
4752  * Return 0 if we don't need to submit the page, as it's already submitted by
4753  * previous call.
4754  * Return <0 for fatal error.
4755  */
4756 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4757 			  struct extent_page_data *epd,
4758 			  struct extent_buffer **eb_context)
4759 {
4760 	struct address_space *mapping = page->mapping;
4761 	struct btrfs_block_group *cache = NULL;
4762 	struct extent_buffer *eb;
4763 	int ret;
4764 
4765 	if (!PagePrivate(page))
4766 		return 0;
4767 
4768 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
4769 		return submit_eb_subpage(page, wbc, epd);
4770 
4771 	spin_lock(&mapping->private_lock);
4772 	if (!PagePrivate(page)) {
4773 		spin_unlock(&mapping->private_lock);
4774 		return 0;
4775 	}
4776 
4777 	eb = (struct extent_buffer *)page->private;
4778 
4779 	/*
4780 	 * Shouldn't happen and normally this would be a BUG_ON but no point
4781 	 * crashing the machine for something we can survive anyway.
4782 	 */
4783 	if (WARN_ON(!eb)) {
4784 		spin_unlock(&mapping->private_lock);
4785 		return 0;
4786 	}
4787 
4788 	if (eb == *eb_context) {
4789 		spin_unlock(&mapping->private_lock);
4790 		return 0;
4791 	}
4792 	ret = atomic_inc_not_zero(&eb->refs);
4793 	spin_unlock(&mapping->private_lock);
4794 	if (!ret)
4795 		return 0;
4796 
4797 	if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4798 		/*
4799 		 * If for_sync, this hole will be filled with
4800 		 * trasnsaction commit.
4801 		 */
4802 		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4803 			ret = -EAGAIN;
4804 		else
4805 			ret = 0;
4806 		free_extent_buffer(eb);
4807 		return ret;
4808 	}
4809 
4810 	*eb_context = eb;
4811 
4812 	ret = lock_extent_buffer_for_io(eb, epd);
4813 	if (ret <= 0) {
4814 		btrfs_revert_meta_write_pointer(cache, eb);
4815 		if (cache)
4816 			btrfs_put_block_group(cache);
4817 		free_extent_buffer(eb);
4818 		return ret;
4819 	}
4820 	if (cache) {
4821 		/*
4822 		 * Implies write in zoned mode. Mark the last eb in a block group.
4823 		 */
4824 		btrfs_schedule_zone_finish_bg(cache, eb);
4825 		btrfs_put_block_group(cache);
4826 	}
4827 	ret = write_one_eb(eb, wbc, epd);
4828 	free_extent_buffer(eb);
4829 	if (ret < 0)
4830 		return ret;
4831 	return 1;
4832 }
4833 
/*
 * Write back dirty btree pages of @mapping.
 *
 * Looks up tagged pages in the range given by @wbc and hands each one to
 * submit_eb_page(), which writes the extent buffer the page belongs to.
 * The walk stops on the first negative return value, or once
 * wbc->nr_to_write has dropped to zero or below.
 *
 * @mapping: address space of the btree inode
 * @wbc:     writeback control.  range_cyclic, range_start/range_end,
 *           sync_mode and nr_to_write are used here.
 *
 * Return 0 on success, <0 on error.  -EROFS is returned if no error was hit
 * here but BTRFS_FS_ERROR() is set for the filesystem.
 */
int btree_write_cache_pages(struct address_space *mapping,
				   struct writeback_control *wbc)
{
	/* Last extent buffer submitted, lets submit_eb_page() skip its pages */
	struct extent_buffer *eb_context = NULL;
	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		/*
		 * Start from the beginning does not need to cycle over the
		 * range, mark it as scanned.
		 */
		scanned = (index == 0);
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		scanned = 1;
	}
	/*
	 * For WB_SYNC_ALL the pages are tagged TOWRITE by
	 * tag_pages_for_writeback() below and only those are looked up.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	btrfs_zoned_meta_io_lock(fs_info);
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
			tag))) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			ret = submit_eb_page(page, wbc, &epd, &eb_context);
			/* Nothing was submitted for this page */
			if (ret == 0)
				continue;
			if (ret < 0) {
				done = 1;
				break;
			}

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	/*
	 * If something went wrong, don't allow any metadata write bio to be
	 * submitted.
	 *
	 * This would prevent use-after-free if we had dirty pages not
	 * cleaned up, which can still happen by fuzzed images.
	 *
	 * - Bad extent tree
	 *   Allowing existing tree block to be allocated for other trees.
	 *
	 * - Log tree operations
	 *   Exiting tree blocks get allocated to log tree, bumps its
	 *   generation, then get cleaned in tree re-balance.
	 *   Such tree block will not be written back, since it's clean,
	 *   thus no WRITTEN flag set.
	 *   And after log writes back, this tree block is not traced by
	 *   any dirty extent_io_tree.
	 *
	 * - Offending tree block gets re-dirtied from its original owner
	 *   Since it has bumped generation, no WRITTEN flag, it can be
	 *   reused without COWing. This tree block will not be traced
	 *   by btrfs_transaction::dirty_pages.
	 *
	 *   Now such dirty tree block will not be cleaned by any dirty
	 *   extent io tree. Thus we don't want to submit such wild eb
	 *   if the fs already has error.
	 *
	 * We can get ret > 0 from submit_eb_page() indicating that ebs were
	 * submitted. Reset it to 0 to avoid false alerts for the caller.
	 */
	if (ret > 0)
		ret = 0;
	if (!ret && BTRFS_FS_ERROR(fs_info))
		ret = -EROFS;
	/* A non-zero @ret is passed on to submit_write_bio() for the pending bio */
	submit_write_bio(&epd, ret);

	btrfs_zoned_meta_io_unlock(fs_info);
	return ret;
}
4950 
4951 /**
4952  * Walk the list of dirty pages of the given address space and write all of them.
4953  *
4954  * @mapping: address space structure to write
4955  * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
4956  * @epd:     holds context for the write, namely the bio
4957  *
4958  * If a page is already under I/O, write_cache_pages() skips it, even
4959  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4960  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4961  * and msync() need to guarantee that all the data which was dirty at the time
4962  * the call was made get new I/O started against them.  If wbc->sync_mode is
4963  * WB_SYNC_ALL then we were called for data integrity and we must wait for
4964  * existing IO to complete.
4965  */
4966 static int extent_write_cache_pages(struct address_space *mapping,
4967 			     struct writeback_control *wbc,
4968 			     struct extent_page_data *epd)
4969 {
4970 	struct inode *inode = mapping->host;
4971 	int ret = 0;
4972 	int done = 0;
4973 	int nr_to_write_done = 0;
4974 	struct pagevec pvec;
4975 	int nr_pages;
4976 	pgoff_t index;
4977 	pgoff_t end;		/* Inclusive */
4978 	pgoff_t done_index;
4979 	int range_whole = 0;
4980 	int scanned = 0;
4981 	xa_mark_t tag;
4982 
4983 	/*
4984 	 * We have to hold onto the inode so that ordered extents can do their
4985 	 * work when the IO finishes.  The alternative to this is failing to add
4986 	 * an ordered extent if the igrab() fails there and that is a huge pain
4987 	 * to deal with, so instead just hold onto the inode throughout the
4988 	 * writepages operation.  If it fails here we are freeing up the inode
4989 	 * anyway and we'd rather not waste our time writing out stuff that is
4990 	 * going to be truncated anyway.
4991 	 */
4992 	if (!igrab(inode))
4993 		return 0;
4994 
4995 	pagevec_init(&pvec);
4996 	if (wbc->range_cyclic) {
4997 		index = mapping->writeback_index; /* Start from prev offset */
4998 		end = -1;
4999 		/*
5000 		 * Start from the beginning does not need to cycle over the
5001 		 * range, mark it as scanned.
5002 		 */
5003 		scanned = (index == 0);
5004 	} else {
5005 		index = wbc->range_start >> PAGE_SHIFT;
5006 		end = wbc->range_end >> PAGE_SHIFT;
5007 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
5008 			range_whole = 1;
5009 		scanned = 1;
5010 	}
5011 
5012 	/*
5013 	 * We do the tagged writepage as long as the snapshot flush bit is set
5014 	 * and we are the first one who do the filemap_flush() on this inode.
5015 	 *
5016 	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
5017 	 * not race in and drop the bit.
5018 	 */
5019 	if (range_whole && wbc->nr_to_write == LONG_MAX &&
5020 	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
5021 			       &BTRFS_I(inode)->runtime_flags))
5022 		wbc->tagged_writepages = 1;
5023 
5024 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5025 		tag = PAGECACHE_TAG_TOWRITE;
5026 	else
5027 		tag = PAGECACHE_TAG_DIRTY;
5028 retry:
5029 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5030 		tag_pages_for_writeback(mapping, index, end);
5031 	done_index = index;
5032 	while (!done && !nr_to_write_done && (index <= end) &&
5033 			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5034 						&index, end, tag))) {
5035 		unsigned i;
5036 
5037 		for (i = 0; i < nr_pages; i++) {
5038 			struct page *page = pvec.pages[i];
5039 
5040 			done_index = page->index + 1;
5041 			/*
5042 			 * At this point we hold neither the i_pages lock nor
5043 			 * the page lock: the page may be truncated or
5044 			 * invalidated (changing page->mapping to NULL),
5045 			 * or even swizzled back from swapper_space to
5046 			 * tmpfs file mapping
5047 			 */
5048 			if (!trylock_page(page)) {
5049 				submit_write_bio(epd, 0);
5050 				lock_page(page);
5051 			}
5052 
5053 			if (unlikely(page->mapping != mapping)) {
5054 				unlock_page(page);
5055 				continue;
5056 			}
5057 
5058 			if (wbc->sync_mode != WB_SYNC_NONE) {
5059 				if (PageWriteback(page))
5060 					submit_write_bio(epd, 0);
5061 				wait_on_page_writeback(page);
5062 			}
5063 
5064 			if (PageWriteback(page) ||
5065 			    !clear_page_dirty_for_io(page)) {
5066 				unlock_page(page);
5067 				continue;
5068 			}
5069 
5070 			ret = __extent_writepage(page, wbc, epd);
5071 			if (ret < 0) {
5072 				done = 1;
5073 				break;
5074 			}
5075 
5076 			/*
5077 			 * the filesystem may choose to bump up nr_to_write.
5078 			 * We have to make sure to honor the new nr_to_write
5079 			 * at any time
5080 			 */
5081 			nr_to_write_done = wbc->nr_to_write <= 0;
5082 		}
5083 		pagevec_release(&pvec);
5084 		cond_resched();
5085 	}
5086 	if (!scanned && !done) {
5087 		/*
5088 		 * We hit the last page and there is more work to be done: wrap
5089 		 * back to the start of the file
5090 		 */
5091 		scanned = 1;
5092 		index = 0;
5093 
5094 		/*
5095 		 * If we're looping we could run into a page that is locked by a
5096 		 * writer and that writer could be waiting on writeback for a
5097 		 * page in our current bio, and thus deadlock, so flush the
5098 		 * write bio here.
5099 		 */
5100 		submit_write_bio(epd, 0);
5101 		goto retry;
5102 	}
5103 
5104 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5105 		mapping->writeback_index = done_index;
5106 
5107 	btrfs_add_delayed_iput(inode);
5108 	return ret;
5109 }
5110 
5111 /*
5112  * Submit the pages in the range to bio for call sites which delalloc range has
5113  * already been ran (aka, ordered extent inserted) and all pages are still
5114  * locked.
5115  */
5116 int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
5117 {
5118 	bool found_error = false;
5119 	int first_error = 0;
5120 	int ret = 0;
5121 	struct address_space *mapping = inode->i_mapping;
5122 	struct page *page;
5123 	u64 cur = start;
5124 	unsigned long nr_pages;
5125 	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
5126 	struct extent_page_data epd = {
5127 		.bio_ctrl = { 0 },
5128 		.extent_locked = 1,
5129 		.sync_io = 1,
5130 	};
5131 	struct writeback_control wbc_writepages = {
5132 		.sync_mode	= WB_SYNC_ALL,
5133 		.range_start	= start,
5134 		.range_end	= end + 1,
5135 		/* We're called from an async helper function */
5136 		.punt_to_cgroup	= 1,
5137 		.no_cgroup_owner = 1,
5138 	};
5139 
5140 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5141 	nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5142 		   PAGE_SHIFT;
5143 	wbc_writepages.nr_to_write = nr_pages * 2;
5144 
5145 	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
5146 	while (cur <= end) {
5147 		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5148 
5149 		page = find_get_page(mapping, cur >> PAGE_SHIFT);
5150 		/*
5151 		 * All pages in the range are locked since
5152 		 * btrfs_run_delalloc_range(), thus there is no way to clear
5153 		 * the page dirty flag.
5154 		 */
5155 		ASSERT(PageLocked(page));
5156 		ASSERT(PageDirty(page));
5157 		clear_page_dirty_for_io(page);
5158 		ret = __extent_writepage(page, &wbc_writepages, &epd);
5159 		ASSERT(ret <= 0);
5160 		if (ret < 0) {
5161 			found_error = true;
5162 			first_error = ret;
5163 		}
5164 		put_page(page);
5165 		cur = cur_end + 1;
5166 	}
5167 
5168 	submit_write_bio(&epd, found_error ? ret : 0);
5169 
5170 	wbc_detach_inode(&wbc_writepages);
5171 	if (found_error)
5172 		return first_error;
5173 	return ret;
5174 }
5175 
5176 int extent_writepages(struct address_space *mapping,
5177 		      struct writeback_control *wbc)
5178 {
5179 	struct inode *inode = mapping->host;
5180 	int ret = 0;
5181 	struct extent_page_data epd = {
5182 		.bio_ctrl = { 0 },
5183 		.extent_locked = 0,
5184 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
5185 	};
5186 
5187 	/*
5188 	 * Allow only a single thread to do the reloc work in zoned mode to
5189 	 * protect the write pointer updates.
5190 	 */
5191 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
5192 	ret = extent_write_cache_pages(mapping, wbc, &epd);
5193 	submit_write_bio(&epd, ret);
5194 	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
5195 	return ret;
5196 }
5197 
5198 void extent_readahead(struct readahead_control *rac)
5199 {
5200 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
5201 	struct page *pagepool[16];
5202 	struct extent_map *em_cached = NULL;
5203 	u64 prev_em_start = (u64)-1;
5204 	int nr;
5205 
5206 	while ((nr = readahead_page_batch(rac, pagepool))) {
5207 		u64 contig_start = readahead_pos(rac);
5208 		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
5209 
5210 		contiguous_readpages(pagepool, nr, contig_start, contig_end,
5211 				&em_cached, &bio_ctrl, &prev_em_start);
5212 	}
5213 
5214 	if (em_cached)
5215 		free_extent_map(em_cached);
5216 	submit_one_bio(&bio_ctrl);
5217 }
5218 
5219 /*
5220  * basic invalidate_folio code, this waits on any locked or writeback
5221  * ranges corresponding to the folio, and then deletes any extent state
5222  * records from the tree
5223  */
5224 int extent_invalidate_folio(struct extent_io_tree *tree,
5225 			  struct folio *folio, size_t offset)
5226 {
5227 	struct extent_state *cached_state = NULL;
5228 	u64 start = folio_pos(folio);
5229 	u64 end = start + folio_size(folio) - 1;
5230 	size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
5231 
5232 	/* This function is only called for the btree inode */
5233 	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5234 
5235 	start += ALIGN(offset, blocksize);
5236 	if (start > end)
5237 		return 0;
5238 
5239 	lock_extent_bits(tree, start, end, &cached_state);
5240 	folio_wait_writeback(folio);
5241 
5242 	/*
5243 	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5244 	 * so here we only need to unlock the extent range to free any
5245 	 * existing extent state.
5246 	 */
5247 	unlock_extent_cached(tree, start, end, &cached_state);
5248 	return 0;
5249 }
5250 
5251 /*
5252  * a helper for release_folio, this tests for areas of the page that
5253  * are locked or under IO and drops the related state bits if it is safe
5254  * to drop the page.
5255  */
5256 static int try_release_extent_state(struct extent_io_tree *tree,
5257 				    struct page *page, gfp_t mask)
5258 {
5259 	u64 start = page_offset(page);
5260 	u64 end = start + PAGE_SIZE - 1;
5261 	int ret = 1;
5262 
5263 	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
5264 		ret = 0;
5265 	} else {
5266 		/*
5267 		 * At this point we can safely clear everything except the
5268 		 * locked bit, the nodatasum bit and the delalloc new bit.
5269 		 * The delalloc new bit will be cleared by ordered extent
5270 		 * completion.
5271 		 */
5272 		ret = __clear_extent_bit(tree, start, end,
5273 			 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5274 			 0, 0, NULL, mask, NULL);
5275 
5276 		/* if clear_extent_bit failed for enomem reasons,
5277 		 * we can't allow the release to continue.
5278 		 */
5279 		if (ret < 0)
5280 			ret = 0;
5281 		else
5282 			ret = 1;
5283 	}
5284 	return ret;
5285 }
5286 
5287 /*
5288  * a helper for release_folio.  As long as there are no locked extents
5289  * in the range corresponding to the page, both state records and extent
5290  * map records are removed
5291  */
5292 int try_release_extent_mapping(struct page *page, gfp_t mask)
5293 {
5294 	struct extent_map *em;
5295 	u64 start = page_offset(page);
5296 	u64 end = start + PAGE_SIZE - 1;
5297 	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5298 	struct extent_io_tree *tree = &btrfs_inode->io_tree;
5299 	struct extent_map_tree *map = &btrfs_inode->extent_tree;
5300 
5301 	if (gfpflags_allow_blocking(mask) &&
5302 	    page->mapping->host->i_size > SZ_16M) {
5303 		u64 len;
5304 		while (start <= end) {
5305 			struct btrfs_fs_info *fs_info;
5306 			u64 cur_gen;
5307 
5308 			len = end - start + 1;
5309 			write_lock(&map->lock);
5310 			em = lookup_extent_mapping(map, start, len);
5311 			if (!em) {
5312 				write_unlock(&map->lock);
5313 				break;
5314 			}
5315 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5316 			    em->start != start) {
5317 				write_unlock(&map->lock);
5318 				free_extent_map(em);
5319 				break;
5320 			}
5321 			if (test_range_bit(tree, em->start,
5322 					   extent_map_end(em) - 1,
5323 					   EXTENT_LOCKED, 0, NULL))
5324 				goto next;
5325 			/*
5326 			 * If it's not in the list of modified extents, used
5327 			 * by a fast fsync, we can remove it. If it's being
5328 			 * logged we can safely remove it since fsync took an
5329 			 * extra reference on the em.
5330 			 */
5331 			if (list_empty(&em->list) ||
5332 			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5333 				goto remove_em;
5334 			/*
5335 			 * If it's in the list of modified extents, remove it
5336 			 * only if its generation is older then the current one,
5337 			 * in which case we don't need it for a fast fsync.
5338 			 * Otherwise don't remove it, we could be racing with an
5339 			 * ongoing fast fsync that could miss the new extent.
5340 			 */
5341 			fs_info = btrfs_inode->root->fs_info;
5342 			spin_lock(&fs_info->trans_lock);
5343 			cur_gen = fs_info->generation;
5344 			spin_unlock(&fs_info->trans_lock);
5345 			if (em->generation >= cur_gen)
5346 				goto next;
5347 remove_em:
5348 			/*
5349 			 * We only remove extent maps that are not in the list of
5350 			 * modified extents or that are in the list but with a
5351 			 * generation lower then the current generation, so there
5352 			 * is no need to set the full fsync flag on the inode (it
5353 			 * hurts the fsync performance for workloads with a data
5354 			 * size that exceeds or is close to the system's memory).
5355 			 */
5356 			remove_extent_mapping(map, em);
5357 			/* once for the rb tree */
5358 			free_extent_map(em);
5359 next:
5360 			start = extent_map_end(em);
5361 			write_unlock(&map->lock);
5362 
5363 			/* once for us */
5364 			free_extent_map(em);
5365 
5366 			cond_resched(); /* Allow large-extent preemption. */
5367 		}
5368 	}
5369 	return try_release_extent_state(tree, page, mask);
5370 }
5371 
5372 /*
5373  * helper function for fiemap, which doesn't want to see any holes.
5374  * This maps until we find something past 'last'
5375  */
5376 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
5377 						u64 offset, u64 last)
5378 {
5379 	u64 sectorsize = btrfs_inode_sectorsize(inode);
5380 	struct extent_map *em;
5381 	u64 len;
5382 
5383 	if (offset >= last)
5384 		return NULL;
5385 
5386 	while (1) {
5387 		len = last - offset;
5388 		if (len == 0)
5389 			break;
5390 		len = ALIGN(len, sectorsize);
5391 		em = btrfs_get_extent_fiemap(inode, offset, len);
5392 		if (IS_ERR(em))
5393 			return em;
5394 
5395 		/* if this isn't a hole return it */
5396 		if (em->block_start != EXTENT_MAP_HOLE)
5397 			return em;
5398 
5399 		/* this is a hole, advance to the next extent */
5400 		offset = extent_map_end(em);
5401 		free_extent_map(em);
5402 		if (offset >= last)
5403 			break;
5404 	}
5405 	return NULL;
5406 }
5407 
5408 /*
5409  * To cache previous fiemap extent
5410  *
5411  * Will be used for merging fiemap extent
5412  */
5413 struct fiemap_cache {
5414 	u64 offset;
5415 	u64 phys;
5416 	u64 len;
5417 	u32 flags;
5418 	bool cached;
5419 };
5420 
5421 /*
5422  * Helper to submit fiemap extent.
5423  *
5424  * Will try to merge current fiemap extent specified by @offset, @phys,
5425  * @len and @flags with cached one.
5426  * And only when we fails to merge, cached one will be submitted as
5427  * fiemap extent.
5428  *
5429  * Return value is the same as fiemap_fill_next_extent().
5430  */
5431 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5432 				struct fiemap_cache *cache,
5433 				u64 offset, u64 phys, u64 len, u32 flags)
5434 {
5435 	int ret = 0;
5436 
5437 	if (!cache->cached)
5438 		goto assign;
5439 
5440 	/*
5441 	 * Sanity check, extent_fiemap() should have ensured that new
5442 	 * fiemap extent won't overlap with cached one.
5443 	 * Not recoverable.
5444 	 *
5445 	 * NOTE: Physical address can overlap, due to compression
5446 	 */
5447 	if (cache->offset + cache->len > offset) {
5448 		WARN_ON(1);
5449 		return -EINVAL;
5450 	}
5451 
5452 	/*
5453 	 * Only merges fiemap extents if
5454 	 * 1) Their logical addresses are continuous
5455 	 *
5456 	 * 2) Their physical addresses are continuous
5457 	 *    So truly compressed (physical size smaller than logical size)
5458 	 *    extents won't get merged with each other
5459 	 *
5460 	 * 3) Share same flags except FIEMAP_EXTENT_LAST
5461 	 *    So regular extent won't get merged with prealloc extent
5462 	 */
5463 	if (cache->offset + cache->len  == offset &&
5464 	    cache->phys + cache->len == phys  &&
5465 	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5466 			(flags & ~FIEMAP_EXTENT_LAST)) {
5467 		cache->len += len;
5468 		cache->flags |= flags;
5469 		goto try_submit_last;
5470 	}
5471 
5472 	/* Not mergeable, need to submit cached one */
5473 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5474 				      cache->len, cache->flags);
5475 	cache->cached = false;
5476 	if (ret)
5477 		return ret;
5478 assign:
5479 	cache->cached = true;
5480 	cache->offset = offset;
5481 	cache->phys = phys;
5482 	cache->len = len;
5483 	cache->flags = flags;
5484 try_submit_last:
5485 	if (cache->flags & FIEMAP_EXTENT_LAST) {
5486 		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5487 				cache->phys, cache->len, cache->flags);
5488 		cache->cached = false;
5489 	}
5490 	return ret;
5491 }
5492 
5493 /*
5494  * Emit last fiemap cache
5495  *
5496  * The last fiemap cache may still be cached in the following case:
5497  * 0		      4k		    8k
5498  * |<- Fiemap range ->|
5499  * |<------------  First extent ----------->|
5500  *
5501  * In this case, the first extent range will be cached but not emitted.
5502  * So we must emit it before ending extent_fiemap().
5503  */
5504 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
5505 				  struct fiemap_cache *cache)
5506 {
5507 	int ret;
5508 
5509 	if (!cache->cached)
5510 		return 0;
5511 
5512 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5513 				      cache->len, cache->flags);
5514 	cache->cached = false;
5515 	if (ret > 0)
5516 		ret = 0;
5517 	return ret;
5518 }
5519 
5520 int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
5521 		  u64 start, u64 len)
5522 {
5523 	int ret = 0;
5524 	u64 off;
5525 	u64 max = start + len;
5526 	u32 flags = 0;
5527 	u32 found_type;
5528 	u64 last;
5529 	u64 last_for_get_extent = 0;
5530 	u64 disko = 0;
5531 	u64 isize = i_size_read(&inode->vfs_inode);
5532 	struct btrfs_key found_key;
5533 	struct extent_map *em = NULL;
5534 	struct extent_state *cached_state = NULL;
5535 	struct btrfs_path *path;
5536 	struct btrfs_root *root = inode->root;
5537 	struct fiemap_cache cache = { 0 };
5538 	struct ulist *roots;
5539 	struct ulist *tmp_ulist;
5540 	int end = 0;
5541 	u64 em_start = 0;
5542 	u64 em_len = 0;
5543 	u64 em_end = 0;
5544 
5545 	if (len == 0)
5546 		return -EINVAL;
5547 
5548 	path = btrfs_alloc_path();
5549 	if (!path)
5550 		return -ENOMEM;
5551 
5552 	roots = ulist_alloc(GFP_KERNEL);
5553 	tmp_ulist = ulist_alloc(GFP_KERNEL);
5554 	if (!roots || !tmp_ulist) {
5555 		ret = -ENOMEM;
5556 		goto out_free_ulist;
5557 	}
5558 
5559 	/*
5560 	 * We can't initialize that to 'start' as this could miss extents due
5561 	 * to extent item merging
5562 	 */
5563 	off = 0;
5564 	start = round_down(start, btrfs_inode_sectorsize(inode));
5565 	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
5566 
5567 	/*
5568 	 * lookup the last file extent.  We're not using i_size here
5569 	 * because there might be preallocation past i_size
5570 	 */
5571 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
5572 				       0);
5573 	if (ret < 0) {
5574 		goto out_free_ulist;
5575 	} else {
5576 		WARN_ON(!ret);
5577 		if (ret == 1)
5578 			ret = 0;
5579 	}
5580 
5581 	path->slots[0]--;
5582 	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
5583 	found_type = found_key.type;
5584 
5585 	/* No extents, but there might be delalloc bits */
5586 	if (found_key.objectid != btrfs_ino(inode) ||
5587 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5588 		/* have to trust i_size as the end */
5589 		last = (u64)-1;
5590 		last_for_get_extent = isize;
5591 	} else {
5592 		/*
5593 		 * remember the start of the last extent.  There are a
5594 		 * bunch of different factors that go into the length of the
5595 		 * extent, so its much less complex to remember where it started
5596 		 */
5597 		last = found_key.offset;
5598 		last_for_get_extent = last + 1;
5599 	}
5600 	btrfs_release_path(path);
5601 
5602 	/*
5603 	 * we might have some extents allocated but more delalloc past those
5604 	 * extents.  so, we trust isize unless the start of the last extent is
5605 	 * beyond isize
5606 	 */
5607 	if (last < isize) {
5608 		last = (u64)-1;
5609 		last_for_get_extent = isize;
5610 	}
5611 
5612 	lock_extent_bits(&inode->io_tree, start, start + len - 1,
5613 			 &cached_state);
5614 
5615 	em = get_extent_skip_holes(inode, start, last_for_get_extent);
5616 	if (!em)
5617 		goto out;
5618 	if (IS_ERR(em)) {
5619 		ret = PTR_ERR(em);
5620 		goto out;
5621 	}
5622 
5623 	while (!end) {
5624 		u64 offset_in_extent = 0;
5625 
5626 		/* break if the extent we found is outside the range */
5627 		if (em->start >= max || extent_map_end(em) < off)
5628 			break;
5629 
5630 		/*
5631 		 * get_extent may return an extent that starts before our
5632 		 * requested range.  We have to make sure the ranges
5633 		 * we return to fiemap always move forward and don't
5634 		 * overlap, so adjust the offsets here
5635 		 */
5636 		em_start = max(em->start, off);
5637 
5638 		/*
5639 		 * record the offset from the start of the extent
5640 		 * for adjusting the disk offset below.  Only do this if the
5641 		 * extent isn't compressed since our in ram offset may be past
5642 		 * what we have actually allocated on disk.
5643 		 */
5644 		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5645 			offset_in_extent = em_start - em->start;
5646 		em_end = extent_map_end(em);
5647 		em_len = em_end - em_start;
5648 		flags = 0;
5649 		if (em->block_start < EXTENT_MAP_LAST_BYTE)
5650 			disko = em->block_start + offset_in_extent;
5651 		else
5652 			disko = 0;
5653 
5654 		/*
5655 		 * bump off for our next call to get_extent
5656 		 */
5657 		off = extent_map_end(em);
5658 		if (off >= max)
5659 			end = 1;
5660 
5661 		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
5662 			end = 1;
5663 			flags |= FIEMAP_EXTENT_LAST;
5664 		} else if (em->block_start == EXTENT_MAP_INLINE) {
5665 			flags |= (FIEMAP_EXTENT_DATA_INLINE |
5666 				  FIEMAP_EXTENT_NOT_ALIGNED);
5667 		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
5668 			flags |= (FIEMAP_EXTENT_DELALLOC |
5669 				  FIEMAP_EXTENT_UNKNOWN);
5670 		} else if (fieinfo->fi_extents_max) {
5671 			u64 bytenr = em->block_start -
5672 				(em->start - em->orig_start);
5673 
5674 			/*
5675 			 * As btrfs supports shared space, this information
5676 			 * can be exported to userspace tools via
5677 			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
5678 			 * then we're just getting a count and we can skip the
5679 			 * lookup stuff.
5680 			 */
5681 			ret = btrfs_check_shared(root, btrfs_ino(inode),
5682 						 bytenr, roots, tmp_ulist);
5683 			if (ret < 0)
5684 				goto out_free;
5685 			if (ret)
5686 				flags |= FIEMAP_EXTENT_SHARED;
5687 			ret = 0;
5688 		}
5689 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
5690 			flags |= FIEMAP_EXTENT_ENCODED;
5691 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5692 			flags |= FIEMAP_EXTENT_UNWRITTEN;
5693 
5694 		free_extent_map(em);
5695 		em = NULL;
5696 		if ((em_start >= last) || em_len == (u64)-1 ||
5697 		   (last == (u64)-1 && isize <= em_end)) {
5698 			flags |= FIEMAP_EXTENT_LAST;
5699 			end = 1;
5700 		}
5701 
5702 		/* now scan forward to see if this is really the last extent. */
5703 		em = get_extent_skip_holes(inode, off, last_for_get_extent);
5704 		if (IS_ERR(em)) {
5705 			ret = PTR_ERR(em);
5706 			goto out;
5707 		}
5708 		if (!em) {
5709 			flags |= FIEMAP_EXTENT_LAST;
5710 			end = 1;
5711 		}
5712 		ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
5713 					   em_len, flags);
5714 		if (ret) {
5715 			if (ret == 1)
5716 				ret = 0;
5717 			goto out_free;
5718 		}
5719 	}
5720 out_free:
5721 	if (!ret)
5722 		ret = emit_last_fiemap_cache(fieinfo, &cache);
5723 	free_extent_map(em);
5724 out:
5725 	unlock_extent_cached(&inode->io_tree, start, start + len - 1,
5726 			     &cached_state);
5727 
5728 out_free_ulist:
5729 	btrfs_free_path(path);
5730 	ulist_free(roots);
5731 	ulist_free(tmp_ulist);
5732 	return ret;
5733 }
5734 
5735 static void __free_extent_buffer(struct extent_buffer *eb)
5736 {
5737 	kmem_cache_free(extent_buffer_cache, eb);
5738 }
5739 
5740 int extent_buffer_under_io(const struct extent_buffer *eb)
5741 {
5742 	return (atomic_read(&eb->io_pages) ||
5743 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5744 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5745 }
5746 
5747 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
5748 {
5749 	struct btrfs_subpage *subpage;
5750 
5751 	lockdep_assert_held(&page->mapping->private_lock);
5752 
5753 	if (PagePrivate(page)) {
5754 		subpage = (struct btrfs_subpage *)page->private;
5755 		if (atomic_read(&subpage->eb_refs))
5756 			return true;
5757 		/*
5758 		 * Even there is no eb refs here, we may still have
5759 		 * end_page_read() call relying on page::private.
5760 		 */
5761 		if (atomic_read(&subpage->readers))
5762 			return true;
5763 	}
5764 	return false;
5765 }
5766 
5767 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5768 {
5769 	struct btrfs_fs_info *fs_info = eb->fs_info;
5770 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5771 
5772 	/*
5773 	 * For mapped eb, we're going to change the page private, which should
5774 	 * be done under the private_lock.
5775 	 */
5776 	if (mapped)
5777 		spin_lock(&page->mapping->private_lock);
5778 
5779 	if (!PagePrivate(page)) {
5780 		if (mapped)
5781 			spin_unlock(&page->mapping->private_lock);
5782 		return;
5783 	}
5784 
5785 	if (fs_info->nodesize >= PAGE_SIZE) {
5786 		/*
5787 		 * We do this since we'll remove the pages after we've
5788 		 * removed the eb from the radix tree, so we could race
5789 		 * and have this page now attached to the new eb.  So
5790 		 * only clear page_private if it's still connected to
5791 		 * this eb.
5792 		 */
5793 		if (PagePrivate(page) &&
5794 		    page->private == (unsigned long)eb) {
5795 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5796 			BUG_ON(PageDirty(page));
5797 			BUG_ON(PageWriteback(page));
5798 			/*
5799 			 * We need to make sure we haven't be attached
5800 			 * to a new eb.
5801 			 */
5802 			detach_page_private(page);
5803 		}
5804 		if (mapped)
5805 			spin_unlock(&page->mapping->private_lock);
5806 		return;
5807 	}
5808 
5809 	/*
5810 	 * For subpage, we can have dummy eb with page private.  In this case,
5811 	 * we can directly detach the private as such page is only attached to
5812 	 * one dummy eb, no sharing.
5813 	 */
5814 	if (!mapped) {
5815 		btrfs_detach_subpage(fs_info, page);
5816 		return;
5817 	}
5818 
5819 	btrfs_page_dec_eb_refs(fs_info, page);
5820 
5821 	/*
5822 	 * We can only detach the page private if there are no other ebs in the
5823 	 * page range and no unfinished IO.
5824 	 */
5825 	if (!page_range_has_eb(fs_info, page))
5826 		btrfs_detach_subpage(fs_info, page);
5827 
5828 	spin_unlock(&page->mapping->private_lock);
5829 }
5830 
5831 /* Release all pages attached to the extent buffer */
5832 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5833 {
5834 	int i;
5835 	int num_pages;
5836 
5837 	ASSERT(!extent_buffer_under_io(eb));
5838 
5839 	num_pages = num_extent_pages(eb);
5840 	for (i = 0; i < num_pages; i++) {
5841 		struct page *page = eb->pages[i];
5842 
5843 		if (!page)
5844 			continue;
5845 
5846 		detach_extent_buffer_page(eb, page);
5847 
5848 		/* One for when we allocated the page */
5849 		put_page(page);
5850 	}
5851 }
5852 
5853 /*
5854  * Helper for releasing the extent buffer.
5855  */
5856 static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
5857 {
5858 	btrfs_release_extent_buffer_pages(eb);
5859 	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
5860 	__free_extent_buffer(eb);
5861 }
5862 
5863 static struct extent_buffer *
5864 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5865 		      unsigned long len)
5866 {
5867 	struct extent_buffer *eb = NULL;
5868 
5869 	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5870 	eb->start = start;
5871 	eb->len = len;
5872 	eb->fs_info = fs_info;
5873 	eb->bflags = 0;
5874 	init_rwsem(&eb->lock);
5875 
5876 	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5877 			     &fs_info->allocated_ebs);
5878 	INIT_LIST_HEAD(&eb->release_list);
5879 
5880 	spin_lock_init(&eb->refs_lock);
5881 	atomic_set(&eb->refs, 1);
5882 	atomic_set(&eb->io_pages, 0);
5883 
5884 	ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5885 
5886 	return eb;
5887 }
5888 
5889 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5890 {
5891 	int i;
5892 	struct extent_buffer *new;
5893 	int num_pages = num_extent_pages(src);
5894 	int ret;
5895 
5896 	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5897 	if (new == NULL)
5898 		return NULL;
5899 
5900 	/*
5901 	 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5902 	 * btrfs_release_extent_buffer() have different behavior for
5903 	 * UNMAPPED subpage extent buffer.
5904 	 */
5905 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5906 
5907 	memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5908 	ret = btrfs_alloc_page_array(num_pages, new->pages);
5909 	if (ret) {
5910 		btrfs_release_extent_buffer(new);
5911 		return NULL;
5912 	}
5913 
5914 	for (i = 0; i < num_pages; i++) {
5915 		int ret;
5916 		struct page *p = new->pages[i];
5917 
5918 		ret = attach_extent_buffer_page(new, p, NULL);
5919 		if (ret < 0) {
5920 			btrfs_release_extent_buffer(new);
5921 			return NULL;
5922 		}
5923 		WARN_ON(PageDirty(p));
5924 		copy_page(page_address(p), page_address(src->pages[i]));
5925 	}
5926 	set_extent_buffer_uptodate(new);
5927 
5928 	return new;
5929 }
5930 
5931 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5932 						  u64 start, unsigned long len)
5933 {
5934 	struct extent_buffer *eb;
5935 	int num_pages;
5936 	int i;
5937 	int ret;
5938 
5939 	eb = __alloc_extent_buffer(fs_info, start, len);
5940 	if (!eb)
5941 		return NULL;
5942 
5943 	num_pages = num_extent_pages(eb);
5944 	ret = btrfs_alloc_page_array(num_pages, eb->pages);
5945 	if (ret)
5946 		goto err;
5947 
5948 	for (i = 0; i < num_pages; i++) {
5949 		struct page *p = eb->pages[i];
5950 
5951 		ret = attach_extent_buffer_page(eb, p, NULL);
5952 		if (ret < 0)
5953 			goto err;
5954 	}
5955 
5956 	set_extent_buffer_uptodate(eb);
5957 	btrfs_set_header_nritems(eb, 0);
5958 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5959 
5960 	return eb;
5961 err:
5962 	for (i = 0; i < num_pages; i++) {
5963 		if (eb->pages[i]) {
5964 			detach_extent_buffer_page(eb, eb->pages[i]);
5965 			__free_page(eb->pages[i]);
5966 		}
5967 	}
5968 	__free_extent_buffer(eb);
5969 	return NULL;
5970 }
5971 
5972 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5973 						u64 start)
5974 {
5975 	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5976 }
5977 
5978 static void check_buffer_tree_ref(struct extent_buffer *eb)
5979 {
5980 	int refs;
5981 	/*
5982 	 * The TREE_REF bit is first set when the extent_buffer is added
5983 	 * to the radix tree. It is also reset, if unset, when a new reference
5984 	 * is created by find_extent_buffer.
5985 	 *
5986 	 * It is only cleared in two cases: freeing the last non-tree
5987 	 * reference to the extent_buffer when its STALE bit is set or
5988 	 * calling release_folio when the tree reference is the only reference.
5989 	 *
5990 	 * In both cases, care is taken to ensure that the extent_buffer's
5991 	 * pages are not under io. However, release_folio can be concurrently
5992 	 * called with creating new references, which is prone to race
5993 	 * conditions between the calls to check_buffer_tree_ref in those
5994 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
5995 	 *
5996 	 * The actual lifetime of the extent_buffer in the radix tree is
5997 	 * adequately protected by the refcount, but the TREE_REF bit and
5998 	 * its corresponding reference are not. To protect against this
5999 	 * class of races, we call check_buffer_tree_ref from the codepaths
6000 	 * which trigger io after they set eb->io_pages. Note that once io is
6001 	 * initiated, TREE_REF can no longer be cleared, so that is the
6002 	 * moment at which any such race is best fixed.
6003 	 */
6004 	refs = atomic_read(&eb->refs);
6005 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6006 		return;
6007 
6008 	spin_lock(&eb->refs_lock);
6009 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6010 		atomic_inc(&eb->refs);
6011 	spin_unlock(&eb->refs_lock);
6012 }
6013 
6014 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
6015 		struct page *accessed)
6016 {
6017 	int num_pages, i;
6018 
6019 	check_buffer_tree_ref(eb);
6020 
6021 	num_pages = num_extent_pages(eb);
6022 	for (i = 0; i < num_pages; i++) {
6023 		struct page *p = eb->pages[i];
6024 
6025 		if (p != accessed)
6026 			mark_page_accessed(p);
6027 	}
6028 }
6029 
6030 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6031 					 u64 start)
6032 {
6033 	struct extent_buffer *eb;
6034 
6035 	eb = find_extent_buffer_nolock(fs_info, start);
6036 	if (!eb)
6037 		return NULL;
6038 	/*
6039 	 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6040 	 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6041 	 * another task running free_extent_buffer() might have seen that flag
6042 	 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6043 	 * writeback flags not set) and it's still in the tree (flag
6044 	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6045 	 * decrementing the extent buffer's reference count twice.  So here we
6046 	 * could race and increment the eb's reference count, clear its stale
6047 	 * flag, mark it as dirty and drop our reference before the other task
6048 	 * finishes executing free_extent_buffer, which would later result in
6049 	 * an attempt to free an extent buffer that is dirty.
6050 	 */
6051 	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6052 		spin_lock(&eb->refs_lock);
6053 		spin_unlock(&eb->refs_lock);
6054 	}
6055 	mark_extent_buffer_accessed(eb, NULL);
6056 	return eb;
6057 }
6058 
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Sanity-test helper: return the extent buffer for @start, creating a dummy
 * one and inserting it into fs_info->buffer_radix if none exists yet.
 *
 * Return the extent buffer (new or pre-existing), or an ERR_PTR() if the
 * dummy buffer allocation or the radix tree preload fails.
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					u64 start)
{
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;
	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;

	/* Retry the insertion until we win or can grab the competing eb. */
	for (;;) {
		ret = radix_tree_preload(GFP_NOFS);
		if (ret) {
			exists = ERR_PTR(ret);
			break;
		}
		spin_lock(&fs_info->buffer_lock);
		ret = radix_tree_insert(&fs_info->buffer_radix,
					start >> fs_info->sectorsize_bits, eb);
		spin_unlock(&fs_info->buffer_lock);
		radix_tree_preload_end();
		if (ret != -EEXIST) {
			check_buffer_tree_ref(eb);
			set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
			return eb;
		}

		exists = find_extent_buffer(fs_info, start);
		if (exists)
			break;
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif
6100 
6101 static struct extent_buffer *grab_extent_buffer(
6102 		struct btrfs_fs_info *fs_info, struct page *page)
6103 {
6104 	struct extent_buffer *exists;
6105 
6106 	/*
6107 	 * For subpage case, we completely rely on radix tree to ensure we
6108 	 * don't try to insert two ebs for the same bytenr.  So here we always
6109 	 * return NULL and just continue.
6110 	 */
6111 	if (fs_info->nodesize < PAGE_SIZE)
6112 		return NULL;
6113 
6114 	/* Page not yet attached to an extent buffer */
6115 	if (!PagePrivate(page))
6116 		return NULL;
6117 
6118 	/*
6119 	 * We could have already allocated an eb for this page and attached one
6120 	 * so lets see if we can get a ref on the existing eb, and if we can we
6121 	 * know it's good and we can just return that one, else we know we can
6122 	 * just overwrite page->private.
6123 	 */
6124 	exists = (struct extent_buffer *)page->private;
6125 	if (atomic_inc_not_zero(&exists->refs))
6126 		return exists;
6127 
6128 	WARN_ON(PageDirty(page));
6129 	detach_page_private(page);
6130 	return NULL;
6131 }
6132 
6133 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6134 {
6135 	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6136 		btrfs_err(fs_info, "bad tree block start %llu", start);
6137 		return -EINVAL;
6138 	}
6139 
6140 	if (fs_info->nodesize < PAGE_SIZE &&
6141 	    offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6142 		btrfs_err(fs_info,
6143 		"tree block crosses page boundary, start %llu nodesize %u",
6144 			  start, fs_info->nodesize);
6145 		return -EINVAL;
6146 	}
6147 	if (fs_info->nodesize >= PAGE_SIZE &&
6148 	    !PAGE_ALIGNED(start)) {
6149 		btrfs_err(fs_info,
6150 		"tree block is not page aligned, start %llu nodesize %u",
6151 			  start, fs_info->nodesize);
6152 		return -EINVAL;
6153 	}
6154 	return 0;
6155 }
6156 
/*
 * Find or create the extent buffer for the tree block at @start.
 *
 * The buffer is fs_info->nodesize bytes long and is backed by pages of the
 * btree inode's mapping.  @owner_root and @level are only used here to select
 * the lockdep class of the new buffer.
 *
 * Returns the extent buffer.  This may be an already existing one, either
 * found up front, found attached to one of the pages, or found after losing
 * the radix tree insertion race.  On failure an ERR_PTR() is returned:
 * -EINVAL for a misaligned @start, -EOVERFLOW on 32bit when @start is beyond
 * the page cache limit, -ENOMEM if the buffer cannot be allocated, or the
 * error from the subpage allocation / radix tree preload.
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	unsigned long len = fs_info->nodesize;
	int num_pages;
	int i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	u64 lockdep_owner = owner_root;
	int uptodate = 1;
	int ret;

	if (check_eb_alignment(fs_info, start))
		return ERR_PTR(-EINVAL);

#if BITS_PER_LONG == 32
	if (start >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
		"extent buffer %llu is beyond 32bit page cache limit", start);
		btrfs_err_32bit_limit(fs_info);
		return ERR_PTR(-EOVERFLOW);
	}
	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
#endif

	/* Fast path: the buffer already exists. */
	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);

	/*
	 * The reloc trees are just snapshots, so we need them to appear to be
	 * just like any other fs tree WRT lockdep.
	 */
	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
		lockdep_owner = BTRFS_FS_TREE_OBJECTID;

	btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++, index++) {
		struct btrfs_subpage *prealloc = NULL;

		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		/*
		 * Preallocate page->private for subpage case, so that we won't
		 * allocate memory with private_lock hold.  The memory will be
		 * freed by attach_extent_buffer_page() or freed manually if
		 * we exit earlier.
		 *
		 * Although we have ensured one subpage eb can only have one
		 * page, but it may change in the future for 16K page size
		 * support, so we still preallocate the memory in the loop.
		 */
		if (fs_info->nodesize < PAGE_SIZE) {
			prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
			if (IS_ERR(prealloc)) {
				ret = PTR_ERR(prealloc);
				/* This page is not yet in eb->pages[]; drop it here. */
				unlock_page(p);
				put_page(p);
				exists = ERR_PTR(ret);
				goto free_eb;
			}
		}

		spin_lock(&mapping->private_lock);
		exists = grab_extent_buffer(fs_info, p);
		if (exists) {
			/* The page already belongs to a live eb; return that one. */
			spin_unlock(&mapping->private_lock);
			unlock_page(p);
			put_page(p);
			mark_extent_buffer_accessed(exists, p);
			btrfs_free_subpage(prealloc);
			goto free_eb;
		}
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_page(eb, p, prealloc);
		ASSERT(!ret);
		/*
		 * To inform we have extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the page private
		 * when the eb hasn't yet been inserted into radix tree.
		 *
		 * The ref will be decreased when the eb released the page, in
		 * detach_extent_buffer_page().
		 * Thus needs no special handling in error path.
		 */
		btrfs_page_inc_eb_refs(fs_info, p);
		spin_unlock(&mapping->private_lock);

		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
		eb->pages[i] = p;
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_release_folio which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 */
	}
	/* Only flag the eb uptodate if every backing page already was. */
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/*
		 * Somebody else inserted an eb for this bytenr first.  Use
		 * theirs if we can still find it, otherwise retry the insert.
		 */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * Now it's safe to unlock the pages because any calls to
	 * btree_release_folio will correctly detect that a page belongs to a
	 * live buffer and won't free them prematurely.
	 */
	for (i = 0; i < num_pages; i++)
		unlock_page(eb->pages[i]);
	return eb;

free_eb:
	/* We expect to hold the only reference to the new eb here. */
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	return exists;
}
6316 
6317 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6318 {
6319 	struct extent_buffer *eb =
6320 			container_of(head, struct extent_buffer, rcu_head);
6321 
6322 	__free_extent_buffer(eb);
6323 }
6324 
6325 static int release_extent_buffer(struct extent_buffer *eb)
6326 	__releases(&eb->refs_lock)
6327 {
6328 	lockdep_assert_held(&eb->refs_lock);
6329 
6330 	WARN_ON(atomic_read(&eb->refs) == 0);
6331 	if (atomic_dec_and_test(&eb->refs)) {
6332 		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
6333 			struct btrfs_fs_info *fs_info = eb->fs_info;
6334 
6335 			spin_unlock(&eb->refs_lock);
6336 
6337 			spin_lock(&fs_info->buffer_lock);
6338 			radix_tree_delete(&fs_info->buffer_radix,
6339 					  eb->start >> fs_info->sectorsize_bits);
6340 			spin_unlock(&fs_info->buffer_lock);
6341 		} else {
6342 			spin_unlock(&eb->refs_lock);
6343 		}
6344 
6345 		btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
6346 		/* Should be safe to release our pages at this point */
6347 		btrfs_release_extent_buffer_pages(eb);
6348 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
6349 		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
6350 			__free_extent_buffer(eb);
6351 			return 1;
6352 		}
6353 #endif
6354 		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
6355 		return 1;
6356 	}
6357 	spin_unlock(&eb->refs_lock);
6358 
6359 	return 0;
6360 }
6361 
6362 void free_extent_buffer(struct extent_buffer *eb)
6363 {
6364 	int refs;
6365 	int old;
6366 	if (!eb)
6367 		return;
6368 
6369 	while (1) {
6370 		refs = atomic_read(&eb->refs);
6371 		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
6372 		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
6373 			refs == 1))
6374 			break;
6375 		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
6376 		if (old == refs)
6377 			return;
6378 	}
6379 
6380 	spin_lock(&eb->refs_lock);
6381 	if (atomic_read(&eb->refs) == 2 &&
6382 	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
6383 	    !extent_buffer_under_io(eb) &&
6384 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6385 		atomic_dec(&eb->refs);
6386 
6387 	/*
6388 	 * I know this is terrible, but it's temporary until we stop tracking
6389 	 * the uptodate bits and such for the extent buffers.
6390 	 */
6391 	release_extent_buffer(eb);
6392 }
6393 
6394 void free_extent_buffer_stale(struct extent_buffer *eb)
6395 {
6396 	if (!eb)
6397 		return;
6398 
6399 	spin_lock(&eb->refs_lock);
6400 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6401 
6402 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
6403 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6404 		atomic_dec(&eb->refs);
6405 	release_extent_buffer(eb);
6406 }
6407 
6408 static void btree_clear_page_dirty(struct page *page)
6409 {
6410 	ASSERT(PageDirty(page));
6411 	ASSERT(PageLocked(page));
6412 	clear_page_dirty_for_io(page);
6413 	xa_lock_irq(&page->mapping->i_pages);
6414 	if (!PageDirty(page))
6415 		__xa_clear_mark(&page->mapping->i_pages,
6416 				page_index(page), PAGECACHE_TAG_DIRTY);
6417 	xa_unlock_irq(&page->mapping->i_pages);
6418 }
6419 
6420 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6421 {
6422 	struct btrfs_fs_info *fs_info = eb->fs_info;
6423 	struct page *page = eb->pages[0];
6424 	bool last;
6425 
6426 	/* btree_clear_page_dirty() needs page locked */
6427 	lock_page(page);
6428 	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6429 						  eb->len);
6430 	if (last)
6431 		btree_clear_page_dirty(page);
6432 	unlock_page(page);
6433 	WARN_ON(atomic_read(&eb->refs) == 0);
6434 }
6435 
6436 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
6437 {
6438 	int i;
6439 	int num_pages;
6440 	struct page *page;
6441 
6442 	if (eb->fs_info->nodesize < PAGE_SIZE)
6443 		return clear_subpage_extent_buffer_dirty(eb);
6444 
6445 	num_pages = num_extent_pages(eb);
6446 
6447 	for (i = 0; i < num_pages; i++) {
6448 		page = eb->pages[i];
6449 		if (!PageDirty(page))
6450 			continue;
6451 		lock_page(page);
6452 		btree_clear_page_dirty(page);
6453 		ClearPageError(page);
6454 		unlock_page(page);
6455 	}
6456 	WARN_ON(atomic_read(&eb->refs) == 0);
6457 }
6458 
6459 bool set_extent_buffer_dirty(struct extent_buffer *eb)
6460 {
6461 	int i;
6462 	int num_pages;
6463 	bool was_dirty;
6464 
6465 	check_buffer_tree_ref(eb);
6466 
6467 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
6468 
6469 	num_pages = num_extent_pages(eb);
6470 	WARN_ON(atomic_read(&eb->refs) == 0);
6471 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6472 
6473 	if (!was_dirty) {
6474 		bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
6475 
6476 		/*
6477 		 * For subpage case, we can have other extent buffers in the
6478 		 * same page, and in clear_subpage_extent_buffer_dirty() we
6479 		 * have to clear page dirty without subpage lock held.
6480 		 * This can cause race where our page gets dirty cleared after
6481 		 * we just set it.
6482 		 *
6483 		 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6484 		 * its page for other reasons, we can use page lock to prevent
6485 		 * the above race.
6486 		 */
6487 		if (subpage)
6488 			lock_page(eb->pages[0]);
6489 		for (i = 0; i < num_pages; i++)
6490 			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6491 					     eb->start, eb->len);
6492 		if (subpage)
6493 			unlock_page(eb->pages[0]);
6494 	}
6495 #ifdef CONFIG_BTRFS_DEBUG
6496 	for (i = 0; i < num_pages; i++)
6497 		ASSERT(PageDirty(eb->pages[i]));
6498 #endif
6499 
6500 	return was_dirty;
6501 }
6502 
6503 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
6504 {
6505 	struct btrfs_fs_info *fs_info = eb->fs_info;
6506 	struct page *page;
6507 	int num_pages;
6508 	int i;
6509 
6510 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6511 	num_pages = num_extent_pages(eb);
6512 	for (i = 0; i < num_pages; i++) {
6513 		page = eb->pages[i];
6514 		if (!page)
6515 			continue;
6516 
6517 		/*
6518 		 * This is special handling for metadata subpage, as regular
6519 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6520 		 */
6521 		if (fs_info->nodesize >= PAGE_SIZE)
6522 			ClearPageUptodate(page);
6523 		else
6524 			btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6525 						     eb->len);
6526 	}
6527 }
6528 
6529 void set_extent_buffer_uptodate(struct extent_buffer *eb)
6530 {
6531 	struct btrfs_fs_info *fs_info = eb->fs_info;
6532 	struct page *page;
6533 	int num_pages;
6534 	int i;
6535 
6536 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6537 	num_pages = num_extent_pages(eb);
6538 	for (i = 0; i < num_pages; i++) {
6539 		page = eb->pages[i];
6540 
6541 		/*
6542 		 * This is special handling for metadata subpage, as regular
6543 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6544 		 */
6545 		if (fs_info->nodesize >= PAGE_SIZE)
6546 			SetPageUptodate(page);
6547 		else
6548 			btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6549 						   eb->len);
6550 	}
6551 }
6552 
6553 static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
6554 				      int mirror_num)
6555 {
6556 	struct btrfs_fs_info *fs_info = eb->fs_info;
6557 	struct extent_io_tree *io_tree;
6558 	struct page *page = eb->pages[0];
6559 	struct btrfs_bio_ctrl bio_ctrl = {
6560 		.mirror_num = mirror_num,
6561 	};
6562 	int ret = 0;
6563 
6564 	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
6565 	ASSERT(PagePrivate(page));
6566 	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
6567 
6568 	if (wait == WAIT_NONE) {
6569 		if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
6570 			return -EAGAIN;
6571 	} else {
6572 		ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6573 		if (ret < 0)
6574 			return ret;
6575 	}
6576 
6577 	ret = 0;
6578 	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
6579 	    PageUptodate(page) ||
6580 	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
6581 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6582 		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
6583 		return ret;
6584 	}
6585 
6586 	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
6587 	eb->read_mirror = 0;
6588 	atomic_set(&eb->io_pages, 1);
6589 	check_buffer_tree_ref(eb);
6590 	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);
6591 
6592 	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
6593 	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
6594 				 page, eb->start, eb->len,
6595 				 eb->start - page_offset(page),
6596 				 end_bio_extent_readpage, 0, true);
6597 	if (ret) {
6598 		/*
6599 		 * In the endio function, if we hit something wrong we will
6600 		 * increase the io_pages, so here we need to decrease it for
6601 		 * error path.
6602 		 */
6603 		atomic_dec(&eb->io_pages);
6604 	}
6605 	submit_one_bio(&bio_ctrl);
6606 	if (ret || wait != WAIT_COMPLETE)
6607 		return ret;
6608 
6609 	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
6610 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
6611 		ret = -EIO;
6612 	return ret;
6613 }
6614 
/*
 * Read the pages of @eb that are not yet uptodate.
 *
 * @wait:       WAIT_NONE only trylocks the pages and gives up (returning 0)
 *              as soon as one page lock is contended.  WAIT_COMPLETE waits
 *              for the submitted reads and checks that every page ended up
 *              uptodate.  Any other value submits the reads and returns.
 * @mirror_num: stored in the bio control structure used for submission.
 *
 * Buffers with nodesize < PAGE_SIZE are handed to
 * read_extent_buffer_subpage().
 *
 * Return 0 on success or if the buffer is already uptodate.  Return -EIO if
 * EXTENT_BUFFER_WRITE_ERR is set, or if a page is not uptodate after waiting
 * with WAIT_COMPLETE.  Otherwise return the error from submit_extent_page().
 */
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
	int i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int num_pages;
	unsigned long num_reads = 0;
	struct btrfs_bio_ctrl bio_ctrl = {
		.mirror_num = mirror_num,
	};

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	/*
	 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
	 * operation, which could potentially still be in flight.  In this case
	 * we simply want to return an error.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
		return -EIO;

	if (eb->fs_info->nodesize < PAGE_SIZE)
		return read_extent_buffer_subpage(eb, wait, mirror_num);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			/*
			 * WAIT_NONE is only utilized by readahead. If we can't
			 * acquire the lock atomically it means either the eb
			 * is being read out or under modification.
			 * Either way the eb will be or has been cached,
			 * readahead can exit safely.
			 */
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		/* Count locked pages so unlock_exit only unlocks those. */
		locked_pages++;
	}
	/*
	 * We need to firstly lock all pages to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	/* One pending IO per page that still needs to be read. */
	atomic_set(&eb->io_pages, num_reads);
	/*
	 * It is possible for release_folio to clear the TREE_REF bit before we
	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
	 */
	check_buffer_tree_ref(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			if (ret) {
				/*
				 * An earlier submission failed: do not submit
				 * this page, just undo its accounting and
				 * unlock it.
				 */
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = submit_extent_page(REQ_OP_READ, NULL,
					 &bio_ctrl, page, page_offset(page),
					 PAGE_SIZE, 0, end_bio_extent_readpage,
					 0, false);
			if (err) {
				/*
				 * We failed to submit the bio so it's the
				 * caller's responsibility to perform cleanup
				 * i.e unlock page/set error bit.
				 */
				ret = err;
				SetPageError(page);
				unlock_page(page);
				atomic_dec(&eb->io_pages);
			}
		} else {
			/* Already uptodate, nothing to read for this page. */
			unlock_page(page);
		}
	}

	submit_one_bio(&bio_ctrl);

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* Wait for each page to be unlocked, then check the read result. */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	/* Unlock, in reverse order, exactly the pages we managed to lock. */
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}
6740 
6741 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6742 			    unsigned long len)
6743 {
6744 	btrfs_warn(eb->fs_info,
6745 		"access to eb bytenr %llu len %lu out of range start %lu len %lu",
6746 		eb->start, eb->len, start, len);
6747 	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6748 
6749 	return true;
6750 }
6751 
6752 /*
6753  * Check if the [start, start + len) range is valid before reading/writing
6754  * the eb.
6755  * NOTE: @start and @len are offset inside the eb, not logical address.
6756  *
6757  * Caller should not touch the dst/src memory if this function returns error.
6758  */
6759 static inline int check_eb_range(const struct extent_buffer *eb,
6760 				 unsigned long start, unsigned long len)
6761 {
6762 	unsigned long offset;
6763 
6764 	/* start, start + len should not go beyond eb->len nor overflow */
6765 	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6766 		return report_eb_range(eb, start, len);
6767 
6768 	return false;
6769 }
6770 
6771 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6772 			unsigned long start, unsigned long len)
6773 {
6774 	size_t cur;
6775 	size_t offset;
6776 	struct page *page;
6777 	char *kaddr;
6778 	char *dst = (char *)dstv;
6779 	unsigned long i = get_eb_page_index(start);
6780 
6781 	if (check_eb_range(eb, start, len))
6782 		return;
6783 
6784 	offset = get_eb_offset_in_page(eb, start);
6785 
6786 	while (len > 0) {
6787 		page = eb->pages[i];
6788 
6789 		cur = min(len, (PAGE_SIZE - offset));
6790 		kaddr = page_address(page);
6791 		memcpy(dst, kaddr + offset, cur);
6792 
6793 		dst += cur;
6794 		len -= cur;
6795 		offset = 0;
6796 		i++;
6797 	}
6798 }
6799 
6800 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6801 				       void __user *dstv,
6802 				       unsigned long start, unsigned long len)
6803 {
6804 	size_t cur;
6805 	size_t offset;
6806 	struct page *page;
6807 	char *kaddr;
6808 	char __user *dst = (char __user *)dstv;
6809 	unsigned long i = get_eb_page_index(start);
6810 	int ret = 0;
6811 
6812 	WARN_ON(start > eb->len);
6813 	WARN_ON(start + len > eb->start + eb->len);
6814 
6815 	offset = get_eb_offset_in_page(eb, start);
6816 
6817 	while (len > 0) {
6818 		page = eb->pages[i];
6819 
6820 		cur = min(len, (PAGE_SIZE - offset));
6821 		kaddr = page_address(page);
6822 		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
6823 			ret = -EFAULT;
6824 			break;
6825 		}
6826 
6827 		dst += cur;
6828 		len -= cur;
6829 		offset = 0;
6830 		i++;
6831 	}
6832 
6833 	return ret;
6834 }
6835 
6836 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6837 			 unsigned long start, unsigned long len)
6838 {
6839 	size_t cur;
6840 	size_t offset;
6841 	struct page *page;
6842 	char *kaddr;
6843 	char *ptr = (char *)ptrv;
6844 	unsigned long i = get_eb_page_index(start);
6845 	int ret = 0;
6846 
6847 	if (check_eb_range(eb, start, len))
6848 		return -EINVAL;
6849 
6850 	offset = get_eb_offset_in_page(eb, start);
6851 
6852 	while (len > 0) {
6853 		page = eb->pages[i];
6854 
6855 		cur = min(len, (PAGE_SIZE - offset));
6856 
6857 		kaddr = page_address(page);
6858 		ret = memcmp(ptr, kaddr + offset, cur);
6859 		if (ret)
6860 			break;
6861 
6862 		ptr += cur;
6863 		len -= cur;
6864 		offset = 0;
6865 		i++;
6866 	}
6867 	return ret;
6868 }
6869 
6870 /*
6871  * Check that the extent buffer is uptodate.
6872  *
6873  * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6874  * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6875  */
6876 static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6877 				    struct page *page)
6878 {
6879 	struct btrfs_fs_info *fs_info = eb->fs_info;
6880 
6881 	/*
6882 	 * If we are using the commit root we could potentially clear a page
6883 	 * Uptodate while we're using the extent buffer that we've previously
6884 	 * looked up.  We don't want to complain in this case, as the page was
6885 	 * valid before, we just didn't write it out.  Instead we want to catch
6886 	 * the case where we didn't actually read the block properly, which
6887 	 * would have !PageUptodate && !PageError, as we clear PageError before
6888 	 * reading.
6889 	 */
6890 	if (fs_info->nodesize < PAGE_SIZE) {
6891 		bool uptodate, error;
6892 
6893 		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6894 						       eb->start, eb->len);
6895 		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6896 		WARN_ON(!uptodate && !error);
6897 	} else {
6898 		WARN_ON(!PageUptodate(page) && !PageError(page));
6899 	}
6900 }
6901 
6902 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
6903 		const void *srcv)
6904 {
6905 	char *kaddr;
6906 
6907 	assert_eb_page_uptodate(eb, eb->pages[0]);
6908 	kaddr = page_address(eb->pages[0]) +
6909 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6910 						   chunk_tree_uuid));
6911 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6912 }
6913 
6914 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
6915 {
6916 	char *kaddr;
6917 
6918 	assert_eb_page_uptodate(eb, eb->pages[0]);
6919 	kaddr = page_address(eb->pages[0]) +
6920 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6921 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6922 }
6923 
6924 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
6925 			 unsigned long start, unsigned long len)
6926 {
6927 	size_t cur;
6928 	size_t offset;
6929 	struct page *page;
6930 	char *kaddr;
6931 	char *src = (char *)srcv;
6932 	unsigned long i = get_eb_page_index(start);
6933 
6934 	WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6935 
6936 	if (check_eb_range(eb, start, len))
6937 		return;
6938 
6939 	offset = get_eb_offset_in_page(eb, start);
6940 
6941 	while (len > 0) {
6942 		page = eb->pages[i];
6943 		assert_eb_page_uptodate(eb, page);
6944 
6945 		cur = min(len, PAGE_SIZE - offset);
6946 		kaddr = page_address(page);
6947 		memcpy(kaddr + offset, src, cur);
6948 
6949 		src += cur;
6950 		len -= cur;
6951 		offset = 0;
6952 		i++;
6953 	}
6954 }
6955 
6956 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
6957 		unsigned long len)
6958 {
6959 	size_t cur;
6960 	size_t offset;
6961 	struct page *page;
6962 	char *kaddr;
6963 	unsigned long i = get_eb_page_index(start);
6964 
6965 	if (check_eb_range(eb, start, len))
6966 		return;
6967 
6968 	offset = get_eb_offset_in_page(eb, start);
6969 
6970 	while (len > 0) {
6971 		page = eb->pages[i];
6972 		assert_eb_page_uptodate(eb, page);
6973 
6974 		cur = min(len, PAGE_SIZE - offset);
6975 		kaddr = page_address(page);
6976 		memset(kaddr + offset, 0, cur);
6977 
6978 		len -= cur;
6979 		offset = 0;
6980 		i++;
6981 	}
6982 }
6983 
6984 void copy_extent_buffer_full(const struct extent_buffer *dst,
6985 			     const struct extent_buffer *src)
6986 {
6987 	int i;
6988 	int num_pages;
6989 
6990 	ASSERT(dst->len == src->len);
6991 
6992 	if (dst->fs_info->nodesize >= PAGE_SIZE) {
6993 		num_pages = num_extent_pages(dst);
6994 		for (i = 0; i < num_pages; i++)
6995 			copy_page(page_address(dst->pages[i]),
6996 				  page_address(src->pages[i]));
6997 	} else {
6998 		size_t src_offset = get_eb_offset_in_page(src, 0);
6999 		size_t dst_offset = get_eb_offset_in_page(dst, 0);
7000 
7001 		ASSERT(src->fs_info->nodesize < PAGE_SIZE);
7002 		memcpy(page_address(dst->pages[0]) + dst_offset,
7003 		       page_address(src->pages[0]) + src_offset,
7004 		       src->len);
7005 	}
7006 }
7007 
7008 void copy_extent_buffer(const struct extent_buffer *dst,
7009 			const struct extent_buffer *src,
7010 			unsigned long dst_offset, unsigned long src_offset,
7011 			unsigned long len)
7012 {
7013 	u64 dst_len = dst->len;
7014 	size_t cur;
7015 	size_t offset;
7016 	struct page *page;
7017 	char *kaddr;
7018 	unsigned long i = get_eb_page_index(dst_offset);
7019 
7020 	if (check_eb_range(dst, dst_offset, len) ||
7021 	    check_eb_range(src, src_offset, len))
7022 		return;
7023 
7024 	WARN_ON(src->len != dst_len);
7025 
7026 	offset = get_eb_offset_in_page(dst, dst_offset);
7027 
7028 	while (len > 0) {
7029 		page = dst->pages[i];
7030 		assert_eb_page_uptodate(dst, page);
7031 
7032 		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
7033 
7034 		kaddr = page_address(page);
7035 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
7036 
7037 		src_offset += cur;
7038 		len -= cur;
7039 		offset = 0;
7040 		i++;
7041 	}
7042 }
7043 
7044 /*
7045  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
7046  * given bit number
7047  * @eb: the extent buffer
7048  * @start: offset of the bitmap item in the extent buffer
7049  * @nr: bit number
7050  * @page_index: return index of the page in the extent buffer that contains the
7051  * given bit number
7052  * @page_offset: return offset into the page given by page_index
7053  *
7054  * This helper hides the ugliness of finding the byte in an extent buffer which
7055  * contains a given bit.
7056  */
7057 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
7058 				    unsigned long start, unsigned long nr,
7059 				    unsigned long *page_index,
7060 				    size_t *page_offset)
7061 {
7062 	size_t byte_offset = BIT_BYTE(nr);
7063 	size_t offset;
7064 
7065 	/*
7066 	 * The byte we want is the offset of the extent buffer + the offset of
7067 	 * the bitmap item in the extent buffer + the offset of the byte in the
7068 	 * bitmap item.
7069 	 */
7070 	offset = start + offset_in_page(eb->start) + byte_offset;
7071 
7072 	*page_index = offset >> PAGE_SHIFT;
7073 	*page_offset = offset_in_page(offset);
7074 }
7075 
7076 /**
7077  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7078  * @eb: the extent buffer
7079  * @start: offset of the bitmap item in the extent buffer
7080  * @nr: bit number to test
7081  */
7082 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
7083 			   unsigned long nr)
7084 {
7085 	u8 *kaddr;
7086 	struct page *page;
7087 	unsigned long i;
7088 	size_t offset;
7089 
7090 	eb_bitmap_offset(eb, start, nr, &i, &offset);
7091 	page = eb->pages[i];
7092 	assert_eb_page_uptodate(eb, page);
7093 	kaddr = page_address(page);
7094 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7095 }
7096 
7097 /**
7098  * extent_buffer_bitmap_set - set an area of a bitmap
7099  * @eb: the extent buffer
7100  * @start: offset of the bitmap item in the extent buffer
7101  * @pos: bit number of the first bit
7102  * @len: number of bits to set
7103  */
7104 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
7105 			      unsigned long pos, unsigned long len)
7106 {
7107 	u8 *kaddr;
7108 	struct page *page;
7109 	unsigned long i;
7110 	size_t offset;
7111 	const unsigned int size = pos + len;
7112 	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7113 	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
7114 
7115 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7116 	page = eb->pages[i];
7117 	assert_eb_page_uptodate(eb, page);
7118 	kaddr = page_address(page);
7119 
7120 	while (len >= bits_to_set) {
7121 		kaddr[offset] |= mask_to_set;
7122 		len -= bits_to_set;
7123 		bits_to_set = BITS_PER_BYTE;
7124 		mask_to_set = ~0;
7125 		if (++offset >= PAGE_SIZE && len > 0) {
7126 			offset = 0;
7127 			page = eb->pages[++i];
7128 			assert_eb_page_uptodate(eb, page);
7129 			kaddr = page_address(page);
7130 		}
7131 	}
7132 	if (len) {
7133 		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7134 		kaddr[offset] |= mask_to_set;
7135 	}
7136 }
7137 
7138 
7139 /**
7140  * extent_buffer_bitmap_clear - clear an area of a bitmap
7141  * @eb: the extent buffer
7142  * @start: offset of the bitmap item in the extent buffer
7143  * @pos: bit number of the first bit
7144  * @len: number of bits to clear
7145  */
7146 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7147 				unsigned long start, unsigned long pos,
7148 				unsigned long len)
7149 {
7150 	u8 *kaddr;
7151 	struct page *page;
7152 	unsigned long i;
7153 	size_t offset;
7154 	const unsigned int size = pos + len;
7155 	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7156 	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
7157 
7158 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7159 	page = eb->pages[i];
7160 	assert_eb_page_uptodate(eb, page);
7161 	kaddr = page_address(page);
7162 
7163 	while (len >= bits_to_clear) {
7164 		kaddr[offset] &= ~mask_to_clear;
7165 		len -= bits_to_clear;
7166 		bits_to_clear = BITS_PER_BYTE;
7167 		mask_to_clear = ~0;
7168 		if (++offset >= PAGE_SIZE && len > 0) {
7169 			offset = 0;
7170 			page = eb->pages[++i];
7171 			assert_eb_page_uptodate(eb, page);
7172 			kaddr = page_address(page);
7173 		}
7174 	}
7175 	if (len) {
7176 		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7177 		kaddr[offset] &= ~mask_to_clear;
7178 	}
7179 }
7180 
/*
 * Tell whether two areas of @len bytes starting at @src and @dst share at
 * least one byte, i.e. whether their start offsets are less than @len apart.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	if (src > dst)
		return src - dst < len;

	return dst - src < len;
}
7186 
7187 static void copy_pages(struct page *dst_page, struct page *src_page,
7188 		       unsigned long dst_off, unsigned long src_off,
7189 		       unsigned long len)
7190 {
7191 	char *dst_kaddr = page_address(dst_page);
7192 	char *src_kaddr;
7193 	int must_memmove = 0;
7194 
7195 	if (dst_page != src_page) {
7196 		src_kaddr = page_address(src_page);
7197 	} else {
7198 		src_kaddr = dst_kaddr;
7199 		if (areas_overlap(src_off, dst_off, len))
7200 			must_memmove = 1;
7201 	}
7202 
7203 	if (must_memmove)
7204 		memmove(dst_kaddr + dst_off, src_kaddr + src_off, len);
7205 	else
7206 		memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
7207 }
7208 
7209 void memcpy_extent_buffer(const struct extent_buffer *dst,
7210 			  unsigned long dst_offset, unsigned long src_offset,
7211 			  unsigned long len)
7212 {
7213 	size_t cur;
7214 	size_t dst_off_in_page;
7215 	size_t src_off_in_page;
7216 	unsigned long dst_i;
7217 	unsigned long src_i;
7218 
7219 	if (check_eb_range(dst, dst_offset, len) ||
7220 	    check_eb_range(dst, src_offset, len))
7221 		return;
7222 
7223 	while (len > 0) {
7224 		dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7225 		src_off_in_page = get_eb_offset_in_page(dst, src_offset);
7226 
7227 		dst_i = get_eb_page_index(dst_offset);
7228 		src_i = get_eb_page_index(src_offset);
7229 
7230 		cur = min(len, (unsigned long)(PAGE_SIZE -
7231 					       src_off_in_page));
7232 		cur = min_t(unsigned long, cur,
7233 			(unsigned long)(PAGE_SIZE - dst_off_in_page));
7234 
7235 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7236 			   dst_off_in_page, src_off_in_page, cur);
7237 
7238 		src_offset += cur;
7239 		dst_offset += cur;
7240 		len -= cur;
7241 	}
7242 }
7243 
7244 void memmove_extent_buffer(const struct extent_buffer *dst,
7245 			   unsigned long dst_offset, unsigned long src_offset,
7246 			   unsigned long len)
7247 {
7248 	size_t cur;
7249 	size_t dst_off_in_page;
7250 	size_t src_off_in_page;
7251 	unsigned long dst_end = dst_offset + len - 1;
7252 	unsigned long src_end = src_offset + len - 1;
7253 	unsigned long dst_i;
7254 	unsigned long src_i;
7255 
7256 	if (check_eb_range(dst, dst_offset, len) ||
7257 	    check_eb_range(dst, src_offset, len))
7258 		return;
7259 	if (dst_offset < src_offset) {
7260 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7261 		return;
7262 	}
7263 	while (len > 0) {
7264 		dst_i = get_eb_page_index(dst_end);
7265 		src_i = get_eb_page_index(src_end);
7266 
7267 		dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7268 		src_off_in_page = get_eb_offset_in_page(dst, src_end);
7269 
7270 		cur = min_t(unsigned long, len, src_off_in_page + 1);
7271 		cur = min(cur, dst_off_in_page + 1);
7272 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7273 			   dst_off_in_page - cur + 1,
7274 			   src_off_in_page - cur + 1, cur);
7275 
7276 		dst_end -= cur;
7277 		src_end -= cur;
7278 		len -= cur;
7279 	}
7280 }
7281 
7282 #define GANG_LOOKUP_SIZE	16
7283 static struct extent_buffer *get_next_extent_buffer(
7284 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7285 {
7286 	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
7287 	struct extent_buffer *found = NULL;
7288 	u64 page_start = page_offset(page);
7289 	u64 cur = page_start;
7290 
7291 	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
7292 	lockdep_assert_held(&fs_info->buffer_lock);
7293 
7294 	while (cur < page_start + PAGE_SIZE) {
7295 		int ret;
7296 		int i;
7297 
7298 		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7299 				(void **)gang, cur >> fs_info->sectorsize_bits,
7300 				min_t(unsigned int, GANG_LOOKUP_SIZE,
7301 				      PAGE_SIZE / fs_info->nodesize));
7302 		if (ret == 0)
7303 			goto out;
7304 		for (i = 0; i < ret; i++) {
7305 			/* Already beyond page end */
7306 			if (gang[i]->start >= page_start + PAGE_SIZE)
7307 				goto out;
7308 			/* Found one */
7309 			if (gang[i]->start >= bytenr) {
7310 				found = gang[i];
7311 				goto out;
7312 			}
7313 		}
7314 		cur = gang[ret - 1]->start + gang[ret - 1]->len;
7315 	}
7316 out:
7317 	return found;
7318 }
7319 
7320 static int try_release_subpage_extent_buffer(struct page *page)
7321 {
7322 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7323 	u64 cur = page_offset(page);
7324 	const u64 end = page_offset(page) + PAGE_SIZE;
7325 	int ret;
7326 
7327 	while (cur < end) {
7328 		struct extent_buffer *eb = NULL;
7329 
7330 		/*
7331 		 * Unlike try_release_extent_buffer() which uses page->private
7332 		 * to grab buffer, for subpage case we rely on radix tree, thus
7333 		 * we need to ensure radix tree consistency.
7334 		 *
7335 		 * We also want an atomic snapshot of the radix tree, thus go
7336 		 * with spinlock rather than RCU.
7337 		 */
7338 		spin_lock(&fs_info->buffer_lock);
7339 		eb = get_next_extent_buffer(fs_info, page, cur);
7340 		if (!eb) {
7341 			/* No more eb in the page range after or at cur */
7342 			spin_unlock(&fs_info->buffer_lock);
7343 			break;
7344 		}
7345 		cur = eb->start + eb->len;
7346 
7347 		/*
7348 		 * The same as try_release_extent_buffer(), to ensure the eb
7349 		 * won't disappear out from under us.
7350 		 */
7351 		spin_lock(&eb->refs_lock);
7352 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7353 			spin_unlock(&eb->refs_lock);
7354 			spin_unlock(&fs_info->buffer_lock);
7355 			break;
7356 		}
7357 		spin_unlock(&fs_info->buffer_lock);
7358 
7359 		/*
7360 		 * If tree ref isn't set then we know the ref on this eb is a
7361 		 * real ref, so just return, this eb will likely be freed soon
7362 		 * anyway.
7363 		 */
7364 		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7365 			spin_unlock(&eb->refs_lock);
7366 			break;
7367 		}
7368 
7369 		/*
7370 		 * Here we don't care about the return value, we will always
7371 		 * check the page private at the end.  And
7372 		 * release_extent_buffer() will release the refs_lock.
7373 		 */
7374 		release_extent_buffer(eb);
7375 	}
7376 	/*
7377 	 * Finally to check if we have cleared page private, as if we have
7378 	 * released all ebs in the page, the page private should be cleared now.
7379 	 */
7380 	spin_lock(&page->mapping->private_lock);
7381 	if (!PagePrivate(page))
7382 		ret = 1;
7383 	else
7384 		ret = 0;
7385 	spin_unlock(&page->mapping->private_lock);
7386 	return ret;
7387 
7388 }
7389 
7390 int try_release_extent_buffer(struct page *page)
7391 {
7392 	struct extent_buffer *eb;
7393 
7394 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
7395 		return try_release_subpage_extent_buffer(page);
7396 
7397 	/*
7398 	 * We need to make sure nobody is changing page->private, as we rely on
7399 	 * page->private as the pointer to extent buffer.
7400 	 */
7401 	spin_lock(&page->mapping->private_lock);
7402 	if (!PagePrivate(page)) {
7403 		spin_unlock(&page->mapping->private_lock);
7404 		return 1;
7405 	}
7406 
7407 	eb = (struct extent_buffer *)page->private;
7408 	BUG_ON(!eb);
7409 
7410 	/*
7411 	 * This is a little awful but should be ok, we need to make sure that
7412 	 * the eb doesn't disappear out from under us while we're looking at
7413 	 * this page.
7414 	 */
7415 	spin_lock(&eb->refs_lock);
7416 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7417 		spin_unlock(&eb->refs_lock);
7418 		spin_unlock(&page->mapping->private_lock);
7419 		return 0;
7420 	}
7421 	spin_unlock(&page->mapping->private_lock);
7422 
7423 	/*
7424 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
7425 	 * so just return, this page will likely be freed soon anyway.
7426 	 */
7427 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7428 		spin_unlock(&eb->refs_lock);
7429 		return 0;
7430 	}
7431 
7432 	return release_extent_buffer(eb);
7433 }
7434 
7435 /*
7436  * btrfs_readahead_tree_block - attempt to readahead a child block
7437  * @fs_info:	the fs_info
7438  * @bytenr:	bytenr to read
7439  * @owner_root: objectid of the root that owns this eb
7440  * @gen:	generation for the uptodate check, can be 0
7441  * @level:	level for the eb
7442  *
7443  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
7444  * normal uptodate check of the eb, without checking the generation.  If we have
7445  * to read the block we will not block on anything.
7446  */
7447 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
7448 				u64 bytenr, u64 owner_root, u64 gen, int level)
7449 {
7450 	struct extent_buffer *eb;
7451 	int ret;
7452 
7453 	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
7454 	if (IS_ERR(eb))
7455 		return;
7456 
7457 	if (btrfs_buffer_uptodate(eb, gen, 1)) {
7458 		free_extent_buffer(eb);
7459 		return;
7460 	}
7461 
7462 	ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7463 	if (ret < 0)
7464 		free_extent_buffer_stale(eb);
7465 	else
7466 		free_extent_buffer(eb);
7467 }
7468 
7469 /*
7470  * btrfs_readahead_node_child - readahead a node's child block
7471  * @node:	parent node we're reading from
7472  * @slot:	slot in the parent node for the child we want to read
7473  *
7474  * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7475  * the slot in the node provided.
7476  */
7477 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7478 {
7479 	btrfs_readahead_tree_block(node->fs_info,
7480 				   btrfs_node_blockptr(node, slot),
7481 				   btrfs_header_owner(node),
7482 				   btrfs_node_ptr_generation(node, slot),
7483 				   btrfs_header_level(node) - 1);
7484 }
7485