xref: /openbmc/linux/fs/btrfs/extent_io.c (revision 54a611b6)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/bitops.h>
4 #include <linux/slab.h>
5 #include <linux/bio.h>
6 #include <linux/mm.h>
7 #include <linux/pagemap.h>
8 #include <linux/page-flags.h>
9 #include <linux/sched/mm.h>
10 #include <linux/spinlock.h>
11 #include <linux/blkdev.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/pagevec.h>
15 #include <linux/prefetch.h>
16 #include <linux/fsverity.h>
17 #include "misc.h"
18 #include "extent_io.h"
19 #include "extent-io-tree.h"
20 #include "extent_map.h"
21 #include "ctree.h"
22 #include "btrfs_inode.h"
23 #include "volumes.h"
24 #include "check-integrity.h"
25 #include "locking.h"
26 #include "rcu-string.h"
27 #include "backref.h"
28 #include "disk-io.h"
29 #include "subpage.h"
30 #include "zoned.h"
31 #include "block-group.h"
32 #include "compression.h"
33 
/* Slab cache for struct extent_state, created in extent_state_cache_init() */
static struct kmem_cache *extent_state_cache;
/* Slab cache for struct extent_buffer, created in extent_io_init() */
static struct kmem_cache *extent_buffer_cache;
/*
 * bio_set initialized in extent_io_init(); offsetof(struct btrfs_bio, bio) is
 * passed to bioset_init() there.
 */
static struct bio_set btrfs_bioset;
37 
38 static inline bool extent_state_in_tree(const struct extent_state *state)
39 {
40 	return !RB_EMPTY_NODE(&state->rb_node);
41 }
42 
43 #ifdef CONFIG_BTRFS_DEBUG
/* Debug list of extent states; alloc_extent_state() links each new one here */
static LIST_HEAD(states);
/* Lock handed to the leak-debug add/del helpers for updates of @states */
static DEFINE_SPINLOCK(leak_lock);
46 
47 static inline void btrfs_leak_debug_add(spinlock_t *lock,
48 					struct list_head *new,
49 					struct list_head *head)
50 {
51 	unsigned long flags;
52 
53 	spin_lock_irqsave(lock, flags);
54 	list_add(new, head);
55 	spin_unlock_irqrestore(lock, flags);
56 }
57 
58 static inline void btrfs_leak_debug_del(spinlock_t *lock,
59 					struct list_head *entry)
60 {
61 	unsigned long flags;
62 
63 	spin_lock_irqsave(lock, flags);
64 	list_del(entry);
65 	spin_unlock_irqrestore(lock, flags);
66 }
67 
68 void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info)
69 {
70 	struct extent_buffer *eb;
71 	unsigned long flags;
72 
73 	/*
74 	 * If we didn't get into open_ctree our allocated_ebs will not be
75 	 * initialized, so just skip this.
76 	 */
77 	if (!fs_info->allocated_ebs.next)
78 		return;
79 
80 	WARN_ON(!list_empty(&fs_info->allocated_ebs));
81 	spin_lock_irqsave(&fs_info->eb_leak_lock, flags);
82 	while (!list_empty(&fs_info->allocated_ebs)) {
83 		eb = list_first_entry(&fs_info->allocated_ebs,
84 				      struct extent_buffer, leak_list);
85 		pr_err(
86 	"BTRFS: buffer leak start %llu len %lu refs %d bflags %lu owner %llu\n",
87 		       eb->start, eb->len, atomic_read(&eb->refs), eb->bflags,
88 		       btrfs_header_owner(eb));
89 		list_del(&eb->leak_list);
90 		kmem_cache_free(extent_buffer_cache, eb);
91 	}
92 	spin_unlock_irqrestore(&fs_info->eb_leak_lock, flags);
93 }
94 
95 static inline void btrfs_extent_state_leak_debug_check(void)
96 {
97 	struct extent_state *state;
98 
99 	while (!list_empty(&states)) {
100 		state = list_entry(states.next, struct extent_state, leak_list);
101 		pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
102 		       state->start, state->end, state->state,
103 		       extent_state_in_tree(state),
104 		       refcount_read(&state->refs));
105 		list_del(&state->leak_list);
106 		kmem_cache_free(extent_state_cache, state);
107 	}
108 }
109 
110 #define btrfs_debug_check_extent_io_range(tree, start, end)		\
111 	__btrfs_debug_check_extent_io_range(__func__, (tree), (start), (end))
112 static inline void __btrfs_debug_check_extent_io_range(const char *caller,
113 		struct extent_io_tree *tree, u64 start, u64 end)
114 {
115 	struct inode *inode = tree->private_data;
116 	u64 isize;
117 
118 	if (!inode || !is_data_inode(inode))
119 		return;
120 
121 	isize = i_size_read(inode);
122 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
123 		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
124 		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
125 			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
126 	}
127 }
128 #else
/* Without CONFIG_BTRFS_DEBUG the debug helpers expand to empty statements */
#define btrfs_leak_debug_add(lock, new, head)	do {} while (0)
#define btrfs_leak_debug_del(lock, entry)	do {} while (0)
#define btrfs_extent_state_leak_debug_check()	do {} while (0)
#define btrfs_debug_check_extent_io_range(c, s, e)	do {} while (0)
133 #endif
134 
/*
 * Minimal view of a tree node used by the rb-tree search and insert helpers:
 * only the inclusive [start, end] range and the rb_node are accessed.
 *
 * NOTE(review): rb nodes embedded in struct extent_state are interpreted
 * through this type, so the leading layout presumably has to match that
 * struct - confirm against its definition before changing either.
 */
struct tree_entry {
	u64 start;
	u64 end;
	struct rb_node rb_node;
};
140 
/*
 * Structure to record info about the bio being assembled, and other info like
 * how many bytes are there before stripe/ordered extent boundary.
 */
struct btrfs_bio_ctrl {
	/* Bio being assembled; set back to NULL once it is submitted or ended */
	struct bio *bio;
	/* Mirror number passed along to the btrfs submit helpers */
	int mirror_num;
	/* Compression type passed along when a data read bio is submitted */
	enum btrfs_compression_type compress_type;
	/* Bytes left before the stripe boundary */
	u32 len_to_stripe_boundary;
	/* Bytes left before the ordered extent boundary */
	u32 len_to_oe_boundary;
};
152 
/*
 * Write path context: the bio control structure plus two flags described
 * next to each field.
 */
struct extent_page_data {
	/* The bio currently being built for this write, see submit_write_bio() */
	struct btrfs_bio_ctrl bio_ctrl;
	/*
	 * Tells writepage not to lock the state bits for this range; it still
	 * does the unlocking.
	 */
	unsigned int extent_locked:1;

	/* Tells the submit_bio code to use REQ_SYNC */
	unsigned int sync_io:1;
};
163 
164 static int add_extent_changeset(struct extent_state *state, u32 bits,
165 				 struct extent_changeset *changeset,
166 				 int set)
167 {
168 	int ret;
169 
170 	if (!changeset)
171 		return 0;
172 	if (set && (state->state & bits) == bits)
173 		return 0;
174 	if (!set && (state->state & bits) == 0)
175 		return 0;
176 	changeset->bytes_changed += state->end - state->start + 1;
177 	ret = ulist_add(&changeset->range_changed, state->start, state->end,
178 			GFP_ATOMIC);
179 	return ret;
180 }
181 
182 static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl)
183 {
184 	struct bio *bio;
185 	struct bio_vec *bv;
186 	struct inode *inode;
187 	int mirror_num;
188 
189 	if (!bio_ctrl->bio)
190 		return;
191 
192 	bio = bio_ctrl->bio;
193 	bv = bio_first_bvec_all(bio);
194 	inode = bv->bv_page->mapping->host;
195 	mirror_num = bio_ctrl->mirror_num;
196 
197 	/* Caller should ensure the bio has at least some range added */
198 	ASSERT(bio->bi_iter.bi_size);
199 
200 	btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset;
201 
202 	if (!is_data_inode(inode))
203 		btrfs_submit_metadata_bio(inode, bio, mirror_num);
204 	else if (btrfs_op(bio) == BTRFS_MAP_WRITE)
205 		btrfs_submit_data_write_bio(inode, bio, mirror_num);
206 	else
207 		btrfs_submit_data_read_bio(inode, bio, mirror_num,
208 					   bio_ctrl->compress_type);
209 
210 	/* The bio is owned by the bi_end_io handler now */
211 	bio_ctrl->bio = NULL;
212 }
213 
214 /*
215  * Submit or fail the current bio in an extent_page_data structure.
216  */
217 static void submit_write_bio(struct extent_page_data *epd, int ret)
218 {
219 	struct bio *bio = epd->bio_ctrl.bio;
220 
221 	if (!bio)
222 		return;
223 
224 	if (ret) {
225 		ASSERT(ret < 0);
226 		bio->bi_status = errno_to_blk_status(ret);
227 		bio_endio(bio);
228 		/* The bio is owned by the bi_end_io handler now */
229 		epd->bio_ctrl.bio = NULL;
230 	} else {
231 		submit_one_bio(&epd->bio_ctrl);
232 	}
233 }
234 
235 int __init extent_state_cache_init(void)
236 {
237 	extent_state_cache = kmem_cache_create("btrfs_extent_state",
238 			sizeof(struct extent_state), 0,
239 			SLAB_MEM_SPREAD, NULL);
240 	if (!extent_state_cache)
241 		return -ENOMEM;
242 	return 0;
243 }
244 
245 int __init extent_io_init(void)
246 {
247 	extent_buffer_cache = kmem_cache_create("btrfs_extent_buffer",
248 			sizeof(struct extent_buffer), 0,
249 			SLAB_MEM_SPREAD, NULL);
250 	if (!extent_buffer_cache)
251 		return -ENOMEM;
252 
253 	if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
254 			offsetof(struct btrfs_bio, bio),
255 			BIOSET_NEED_BVECS))
256 		goto free_buffer_cache;
257 
258 	if (bioset_integrity_create(&btrfs_bioset, BIO_POOL_SIZE))
259 		goto free_bioset;
260 
261 	return 0;
262 
263 free_bioset:
264 	bioset_exit(&btrfs_bioset);
265 
266 free_buffer_cache:
267 	kmem_cache_destroy(extent_buffer_cache);
268 	extent_buffer_cache = NULL;
269 	return -ENOMEM;
270 }
271 
/*
 * Tear down the extent state slab cache.  With CONFIG_BTRFS_DEBUG, states
 * still on the leak list are reported and freed before the cache is
 * destroyed.
 */
void __cold extent_state_cache_exit(void)
{
	btrfs_extent_state_leak_debug_check();
	kmem_cache_destroy(extent_state_cache);
}
277 
/*
 * Counterpart of extent_io_init(): destroy the extent buffer slab cache and
 * release the btrfs bio set.
 */
void __cold extent_io_exit(void)
{
	/*
	 * Make sure all delayed rcu free are flushed before we
	 * destroy caches.
	 */
	rcu_barrier();
	kmem_cache_destroy(extent_buffer_cache);
	bioset_exit(&btrfs_bioset);
}
288 
289 /*
290  * For the file_extent_tree, we want to hold the inode lock when we lookup and
291  * update the disk_i_size, but lockdep will complain because our io_tree we hold
292  * the tree lock and get the inode lock when setting delalloc.  These two things
293  * are unrelated, so make a class for the file_extent_tree so we don't get the
294  * two locking patterns mixed up.
295  */
296 static struct lock_class_key file_extent_tree_class;
297 
298 void extent_io_tree_init(struct btrfs_fs_info *fs_info,
299 			 struct extent_io_tree *tree, unsigned int owner,
300 			 void *private_data)
301 {
302 	tree->fs_info = fs_info;
303 	tree->state = RB_ROOT;
304 	tree->dirty_bytes = 0;
305 	spin_lock_init(&tree->lock);
306 	tree->private_data = private_data;
307 	tree->owner = owner;
308 	if (owner == IO_TREE_INODE_FILE_EXTENT)
309 		lockdep_set_class(&tree->lock, &file_extent_tree_class);
310 }
311 
312 void extent_io_tree_release(struct extent_io_tree *tree)
313 {
314 	spin_lock(&tree->lock);
315 	/*
316 	 * Do a single barrier for the waitqueue_active check here, the state
317 	 * of the waitqueue should not change once extent_io_tree_release is
318 	 * called.
319 	 */
320 	smp_mb();
321 	while (!RB_EMPTY_ROOT(&tree->state)) {
322 		struct rb_node *node;
323 		struct extent_state *state;
324 
325 		node = rb_first(&tree->state);
326 		state = rb_entry(node, struct extent_state, rb_node);
327 		rb_erase(&state->rb_node, &tree->state);
328 		RB_CLEAR_NODE(&state->rb_node);
329 		/*
330 		 * btree io trees aren't supposed to have tasks waiting for
331 		 * changes in the flags of extent states ever.
332 		 */
333 		ASSERT(!waitqueue_active(&state->wq));
334 		free_extent_state(state);
335 
336 		cond_resched_lock(&tree->lock);
337 	}
338 	spin_unlock(&tree->lock);
339 }
340 
341 static struct extent_state *alloc_extent_state(gfp_t mask)
342 {
343 	struct extent_state *state;
344 
345 	/*
346 	 * The given mask might be not appropriate for the slab allocator,
347 	 * drop the unsupported bits
348 	 */
349 	mask &= ~(__GFP_DMA32|__GFP_HIGHMEM);
350 	state = kmem_cache_alloc(extent_state_cache, mask);
351 	if (!state)
352 		return state;
353 	state->state = 0;
354 	state->failrec = NULL;
355 	RB_CLEAR_NODE(&state->rb_node);
356 	btrfs_leak_debug_add(&leak_lock, &state->leak_list, &states);
357 	refcount_set(&state->refs, 1);
358 	init_waitqueue_head(&state->wq);
359 	trace_alloc_extent_state(state, mask, _RET_IP_);
360 	return state;
361 }
362 
363 void free_extent_state(struct extent_state *state)
364 {
365 	if (!state)
366 		return;
367 	if (refcount_dec_and_test(&state->refs)) {
368 		WARN_ON(extent_state_in_tree(state));
369 		btrfs_leak_debug_del(&leak_lock, &state->leak_list);
370 		trace_free_extent_state(state, _RET_IP_);
371 		kmem_cache_free(extent_state_cache, state);
372 	}
373 }
374 
375 /**
376  * Search @tree for an entry that contains @offset. Such entry would have
377  * entry->start <= offset && entry->end >= offset.
378  *
379  * @tree:       the tree to search
380  * @offset:     offset that should fall within an entry in @tree
381  * @node_ret:   pointer where new node should be anchored (used when inserting an
382  *	        entry in the tree)
383  * @parent_ret: points to entry which would have been the parent of the entry,
384  *               containing @offset
385  *
386  * Return a pointer to the entry that contains @offset byte address and don't change
387  * @node_ret and @parent_ret.
388  *
389  * If no such entry exists, return pointer to entry that ends before @offset
390  * and fill parameters @node_ret and @parent_ret, ie. does not return NULL.
391  */
392 static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree,
393 					             u64 offset,
394 						     struct rb_node ***node_ret,
395 						     struct rb_node **parent_ret)
396 {
397 	struct rb_root *root = &tree->state;
398 	struct rb_node **node = &root->rb_node;
399 	struct rb_node *prev = NULL;
400 	struct tree_entry *entry;
401 
402 	while (*node) {
403 		prev = *node;
404 		entry = rb_entry(prev, struct tree_entry, rb_node);
405 
406 		if (offset < entry->start)
407 			node = &(*node)->rb_left;
408 		else if (offset > entry->end)
409 			node = &(*node)->rb_right;
410 		else
411 			return *node;
412 	}
413 
414 	if (node_ret)
415 		*node_ret = node;
416 	if (parent_ret)
417 		*parent_ret = prev;
418 
419 	/* Search neighbors until we find the first one past the end */
420 	while (prev && offset > entry->end) {
421 		prev = rb_next(prev);
422 		entry = rb_entry(prev, struct tree_entry, rb_node);
423 	}
424 
425 	return prev;
426 }
427 
428 /*
429  * Inexact rb-tree search, return the next entry if @offset is not found
430  */
431 static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset)
432 {
433 	return tree_search_for_insert(tree, offset, NULL, NULL);
434 }
435 
436 /**
437  * Search offset in the tree or fill neighbor rbtree node pointers.
438  *
439  * @tree:      the tree to search
440  * @offset:    offset that should fall within an entry in @tree
441  * @next_ret:  pointer to the first entry whose range ends after @offset
442  * @prev_ret:  pointer to the first entry whose range begins before @offset
443  *
444  * Return a pointer to the entry that contains @offset byte address. If no
445  * such entry exists, then return NULL and fill @prev_ret and @next_ret.
446  * Otherwise return the found entry and other pointers are left untouched.
447  */
448 static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree,
449 					     u64 offset,
450 					     struct rb_node **prev_ret,
451 					     struct rb_node **next_ret)
452 {
453 	struct rb_root *root = &tree->state;
454 	struct rb_node **node = &root->rb_node;
455 	struct rb_node *prev = NULL;
456 	struct rb_node *orig_prev = NULL;
457 	struct tree_entry *entry;
458 
459 	ASSERT(prev_ret);
460 	ASSERT(next_ret);
461 
462 	while (*node) {
463 		prev = *node;
464 		entry = rb_entry(prev, struct tree_entry, rb_node);
465 
466 		if (offset < entry->start)
467 			node = &(*node)->rb_left;
468 		else if (offset > entry->end)
469 			node = &(*node)->rb_right;
470 		else
471 			return *node;
472 	}
473 
474 	orig_prev = prev;
475 	while (prev && offset > entry->end) {
476 		prev = rb_next(prev);
477 		entry = rb_entry(prev, struct tree_entry, rb_node);
478 	}
479 	*next_ret = prev;
480 	prev = orig_prev;
481 
482 	entry = rb_entry(prev, struct tree_entry, rb_node);
483 	while (prev && offset < entry->start) {
484 		prev = rb_prev(prev);
485 		entry = rb_entry(prev, struct tree_entry, rb_node);
486 	}
487 	*prev_ret = prev;
488 
489 	return NULL;
490 }
491 
492 /*
493  * utility function to look for merge candidates inside a given range.
494  * Any extents with matching state are merged together into a single
495  * extent in the tree.  Extents with EXTENT_IO in their state field
496  * are not merged because the end_io handlers need to be able to do
497  * operations on them without sleeping (or doing allocations/splits).
498  *
499  * This should be called with the tree lock held.
500  */
501 static void merge_state(struct extent_io_tree *tree,
502 		        struct extent_state *state)
503 {
504 	struct extent_state *other;
505 	struct rb_node *other_node;
506 
507 	if (state->state & (EXTENT_LOCKED | EXTENT_BOUNDARY))
508 		return;
509 
510 	other_node = rb_prev(&state->rb_node);
511 	if (other_node) {
512 		other = rb_entry(other_node, struct extent_state, rb_node);
513 		if (other->end == state->start - 1 &&
514 		    other->state == state->state) {
515 			if (tree->private_data &&
516 			    is_data_inode(tree->private_data))
517 				btrfs_merge_delalloc_extent(tree->private_data,
518 							    state, other);
519 			state->start = other->start;
520 			rb_erase(&other->rb_node, &tree->state);
521 			RB_CLEAR_NODE(&other->rb_node);
522 			free_extent_state(other);
523 		}
524 	}
525 	other_node = rb_next(&state->rb_node);
526 	if (other_node) {
527 		other = rb_entry(other_node, struct extent_state, rb_node);
528 		if (other->start == state->end + 1 &&
529 		    other->state == state->state) {
530 			if (tree->private_data &&
531 			    is_data_inode(tree->private_data))
532 				btrfs_merge_delalloc_extent(tree->private_data,
533 							    state, other);
534 			state->end = other->end;
535 			rb_erase(&other->rb_node, &tree->state);
536 			RB_CLEAR_NODE(&other->rb_node);
537 			free_extent_state(other);
538 		}
539 	}
540 }
541 
542 static void set_state_bits(struct extent_io_tree *tree,
543 			   struct extent_state *state, u32 bits,
544 			   struct extent_changeset *changeset);
545 
546 /*
547  * insert an extent_state struct into the tree.  'bits' are set on the
548  * struct before it is inserted.
549  *
550  * This may return -EEXIST if the extent is already there, in which case the
551  * state struct is freed.
552  *
553  * The tree lock is not taken internally.  This is a utility function and
554  * probably isn't what you want to call (see set/clear_extent_bit).
555  */
556 static int insert_state(struct extent_io_tree *tree,
557 			struct extent_state *state,
558 			u32 bits, struct extent_changeset *changeset)
559 {
560 	struct rb_node **node;
561 	struct rb_node *parent;
562 	const u64 end = state->end;
563 
564 	set_state_bits(tree, state, bits, changeset);
565 
566 	node = &tree->state.rb_node;
567 	while (*node) {
568 		struct tree_entry *entry;
569 
570 		parent = *node;
571 		entry = rb_entry(parent, struct tree_entry, rb_node);
572 
573 		if (end < entry->start) {
574 			node = &(*node)->rb_left;
575 		} else if (end > entry->end) {
576 			node = &(*node)->rb_right;
577 		} else {
578 			btrfs_err(tree->fs_info,
579 			       "found node %llu %llu on insert of %llu %llu",
580 			       entry->start, entry->end, state->start, end);
581 			return -EEXIST;
582 		}
583 	}
584 
585 	rb_link_node(&state->rb_node, parent, node);
586 	rb_insert_color(&state->rb_node, &tree->state);
587 
588 	merge_state(tree, state);
589 	return 0;
590 }
591 
/*
 * Insert state to @tree to the location given by @node and @parent.
 *
 * No search is done here: @node and @parent are used as given, e.g. as
 * filled in by tree_search_for_insert().  @bits are set on @state before it
 * is linked, and the state is merged with its neighbors afterwards.
 */
static void insert_state_fast(struct extent_io_tree *tree,
			      struct extent_state *state, struct rb_node **node,
			      struct rb_node *parent, unsigned bits,
			      struct extent_changeset *changeset)
{
	set_state_bits(tree, state, bits, changeset);
	rb_link_node(&state->rb_node, parent, node);
	rb_insert_color(&state->rb_node, &tree->state);
	merge_state(tree, state);
}
605 
606 /*
607  * split a given extent state struct in two, inserting the preallocated
608  * struct 'prealloc' as the newly created second half.  'split' indicates an
609  * offset inside 'orig' where it should be split.
610  *
611  * Before calling,
612  * the tree has 'orig' at [orig->start, orig->end].  After calling, there
613  * are two extent state structs in the tree:
614  * prealloc: [orig->start, split - 1]
615  * orig: [ split, orig->end ]
616  *
617  * The tree locks are not taken by this function. They need to be held
618  * by the caller.
619  */
620 static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
621 		       struct extent_state *prealloc, u64 split)
622 {
623 	struct rb_node *parent = NULL;
624 	struct rb_node **node;
625 
626 	if (tree->private_data && is_data_inode(tree->private_data))
627 		btrfs_split_delalloc_extent(tree->private_data, orig, split);
628 
629 	prealloc->start = orig->start;
630 	prealloc->end = split - 1;
631 	prealloc->state = orig->state;
632 	orig->start = split;
633 
634 	parent = &orig->rb_node;
635 	node = &parent;
636 	while (*node) {
637 		struct tree_entry *entry;
638 
639 		parent = *node;
640 		entry = rb_entry(parent, struct tree_entry, rb_node);
641 
642 		if (prealloc->end < entry->start) {
643 			node = &(*node)->rb_left;
644 		} else if (prealloc->end > entry->end) {
645 			node = &(*node)->rb_right;
646 		} else {
647 			free_extent_state(prealloc);
648 			return -EEXIST;
649 		}
650 	}
651 
652 	rb_link_node(&prealloc->rb_node, parent, node);
653 	rb_insert_color(&prealloc->rb_node, &tree->state);
654 
655 	return 0;
656 }
657 
658 static struct extent_state *next_state(struct extent_state *state)
659 {
660 	struct rb_node *next = rb_next(&state->rb_node);
661 	if (next)
662 		return rb_entry(next, struct extent_state, rb_node);
663 	else
664 		return NULL;
665 }
666 
667 /*
668  * utility function to clear some bits in an extent state struct.
669  * it will optionally wake up anyone waiting on this state (wake == 1).
670  *
671  * If no bits are set on the state struct after clearing things, the
672  * struct is freed and removed from the tree
673  */
674 static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
675 					    struct extent_state *state,
676 					    u32 bits, int wake,
677 					    struct extent_changeset *changeset)
678 {
679 	struct extent_state *next;
680 	u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
681 	int ret;
682 
683 	if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
684 		u64 range = state->end - state->start + 1;
685 		WARN_ON(range > tree->dirty_bytes);
686 		tree->dirty_bytes -= range;
687 	}
688 
689 	if (tree->private_data && is_data_inode(tree->private_data))
690 		btrfs_clear_delalloc_extent(tree->private_data, state, bits);
691 
692 	ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
693 	BUG_ON(ret < 0);
694 	state->state &= ~bits_to_clear;
695 	if (wake)
696 		wake_up(&state->wq);
697 	if (state->state == 0) {
698 		next = next_state(state);
699 		if (extent_state_in_tree(state)) {
700 			rb_erase(&state->rb_node, &tree->state);
701 			RB_CLEAR_NODE(&state->rb_node);
702 			free_extent_state(state);
703 		} else {
704 			WARN_ON(1);
705 		}
706 	} else {
707 		merge_state(tree, state);
708 		next = next_state(state);
709 	}
710 	return next;
711 }
712 
713 static struct extent_state *
714 alloc_extent_state_atomic(struct extent_state *prealloc)
715 {
716 	if (!prealloc)
717 		prealloc = alloc_extent_state(GFP_ATOMIC);
718 
719 	return prealloc;
720 }
721 
/*
 * Report through btrfs_panic() that @tree was found in an unexpected shape,
 * passing @err along as the error code.
 */
static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
{
	btrfs_panic(tree->fs_info, err,
	"locking error: extent tree was modified by another thread while locked");
}
727 
/*
 * Clear some bits on a range in the tree.  This may require splitting
 * or inserting elements in the tree, so the gfp mask is used to
 * indicate which allocations or sleeping are allowed.
 *
 * @tree:         tree to operate on
 * @start:        first byte of the range
 * @end:          last byte of the range, the range [start, end] is inclusive
 * @bits:         bits to clear; EXTENT_NORESERVE is added when
 *                EXTENT_DELALLOC is requested
 * @wake:         pass 1 to kick any sleepers
 * @delete:       pass 1 to remove the given range from the tree regardless of
 *                state (ie for truncate); done by adding every non-control
 *                bit to @bits
 * @cached_state: optional hint, used as the starting state when it is still
 *                in the tree and satisfies start <= @start < end.  When
 *                EXTENT_LOCKED or EXTENT_BOUNDARY is being cleared,
 *                *cached_state is reset to NULL and its reference is dropped.
 * @mask:         gfp mask used for the preallocation done outside the lock
 * @changeset:    optional record of the ranges whose bits changed
 *
 * This takes the tree lock.  Every return path visible in this function
 * returns 0.
 */
int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, int wake, int delete,
		       struct extent_state **cached_state,
		       gfp_t mask, struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *cached;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	u64 last_end;
	int err;
	int clear = 0;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_clear_extent_bit(tree, start, end - start + 1, bits);

	if (bits & EXTENT_DELALLOC)
		bits |= EXTENT_NORESERVE;

	if (delete)
		bits |= ~EXTENT_CTLBITS;

	/* In this case the caller's cached state gets dropped as well */
	if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
		clear = 1;
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	if (cached_state) {
		cached = *cached_state;

		/* Detach the cached state from the caller, done only once */
		if (clear) {
			*cached_state = NULL;
			cached_state = NULL;
		}

		if (cached && extent_state_in_tree(cached) &&
		    cached->start <= start && cached->end > start) {
			if (clear)
				refcount_dec(&cached->refs);
			state = cached;
			goto hit_next;
		}
		if (clear)
			free_extent_state(cached);
	}
	/*
	 * this search will find the extents that end after
	 * our range starts
	 */
	node = tree_search(tree, start);
	if (!node)
		goto out;
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	if (state->start > end)
		goto out;
	WARN_ON(state->end < start);
	/* Remember the end now, the state may be freed or merged below */
	last_end = state->end;

	/* the state doesn't have the wanted bits, go ahead */
	if (!(state->state & bits)) {
		state = next_state(state);
		goto next;
	}

	/*
	 *     | ---- desired range ---- |
	 *  | state | or
	 *  | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip
	 * bits on second half.
	 *
	 * If the extent we found extends past our range, we
	 * just split and search again.  It'll get split again
	 * the next time though.
	 *
	 * If the extent we found is inside our range, we clear
	 * the desired bit on it.
	 */

	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		/* prealloc was consumed by split_state() either way */
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			state = clear_state_bit(tree, state, bits, wake, changeset);
			goto next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and clear the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		if (wake)
			wake_up(&state->wq);

		/* After the split prealloc is the part inside our range */
		clear_state_bit(tree, prealloc, bits, wake, changeset);

		prealloc = NULL;
		goto out;
	}

	state = clear_state_bit(tree, state, bits, wake, changeset);
next:
	/* Avoid wrapping around when the state ended at the maximum offset */
	if (last_end == (u64)-1)
		goto out;
	start = last_end + 1;
	if (start <= end && state && !need_resched())
		goto hit_next;

search_again:
	if (start > end)
		goto out;
	/* Drop the lock so we may reschedule and preallocate again */
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	if (prealloc)
		free_extent_state(prealloc);

	return 0;

}
893 
/*
 * Sleep uninterruptibly on the wait queue of @state.
 *
 * Called with the tree lock held.  The lock is released before scheduling
 * and taken again before returning, as the annotations below state.
 */
static void wait_on_state(struct extent_io_tree *tree,
			  struct extent_state *state)
		__releases(tree->lock)
		__acquires(tree->lock)
{
	DEFINE_WAIT(wait);
	/* Queue ourselves before dropping the lock */
	prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE);
	spin_unlock(&tree->lock);
	schedule();
	spin_lock(&tree->lock);
	finish_wait(&state->wq, &wait);
}
906 
907 /*
908  * waits for one or more bits to clear on a range in the state tree.
909  * The range [start, end] is inclusive.
910  * The tree lock is taken by this function
911  */
912 static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
913 			    u32 bits)
914 {
915 	struct extent_state *state;
916 	struct rb_node *node;
917 
918 	btrfs_debug_check_extent_io_range(tree, start, end);
919 
920 	spin_lock(&tree->lock);
921 again:
922 	while (1) {
923 		/*
924 		 * this search will find all the extents that end after
925 		 * our range starts
926 		 */
927 		node = tree_search(tree, start);
928 process_node:
929 		if (!node)
930 			break;
931 
932 		state = rb_entry(node, struct extent_state, rb_node);
933 
934 		if (state->start > end)
935 			goto out;
936 
937 		if (state->state & bits) {
938 			start = state->start;
939 			refcount_inc(&state->refs);
940 			wait_on_state(tree, state);
941 			free_extent_state(state);
942 			goto again;
943 		}
944 		start = state->end + 1;
945 
946 		if (start > end)
947 			break;
948 
949 		if (!cond_resched_lock(&tree->lock)) {
950 			node = rb_next(node);
951 			goto process_node;
952 		}
953 	}
954 out:
955 	spin_unlock(&tree->lock);
956 }
957 
958 static void set_state_bits(struct extent_io_tree *tree,
959 			   struct extent_state *state,
960 			   u32 bits, struct extent_changeset *changeset)
961 {
962 	u32 bits_to_set = bits & ~EXTENT_CTLBITS;
963 	int ret;
964 
965 	if (tree->private_data && is_data_inode(tree->private_data))
966 		btrfs_set_delalloc_extent(tree->private_data, state, bits);
967 
968 	if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
969 		u64 range = state->end - state->start + 1;
970 		tree->dirty_bytes += range;
971 	}
972 	ret = add_extent_changeset(state, bits_to_set, changeset, 1);
973 	BUG_ON(ret < 0);
974 	state->state |= bits_to_set;
975 }
976 
977 static void cache_state_if_flags(struct extent_state *state,
978 				 struct extent_state **cached_ptr,
979 				 unsigned flags)
980 {
981 	if (cached_ptr && !(*cached_ptr)) {
982 		if (!flags || (state->state & flags)) {
983 			*cached_ptr = state;
984 			refcount_inc(&state->refs);
985 		}
986 	}
987 }
988 
989 static void cache_state(struct extent_state *state,
990 			struct extent_state **cached_ptr)
991 {
992 	return cache_state_if_flags(state, cached_ptr,
993 				    EXTENT_LOCKED | EXTENT_BOUNDARY);
994 }
995 
/*
 * Set @bits on the inclusive range [@start, @end] of @tree.
 *
 * Existing extent states are split where they straddle the range boundaries
 * and new states are inserted into the holes, so this may need to allocate
 * extent states.  @mask tells what kind of allocation is allowed: a state is
 * preallocated outside the tree lock only if @mask allows blocking, otherwise
 * the allocation is left to alloc_extent_state_atomic() under the lock.
 *
 * If @exclusive_bits is non-zero, @failed_start must be non-NULL.  As soon as
 * a state overlapping the remaining range already has one of the exclusive
 * bits set, the call stops with -EEXIST and stores in @failed_start the offset
 * at which the conflict was found.  Bits that were already set on the part of
 * the range before that offset are left set.  If @exclusive_bits is zero,
 * @failed_start must be NULL.
 *
 * @cached_state, if non-NULL and pointing to a state still in the tree that
 * covers @start, is used to skip the initial tree search.  Every state that
 * receives the bits is handed to cache_state() together with @cached_state.
 *
 * @changeset is passed through to the helpers that set the bits.
 *
 * This takes and releases tree->lock internally.
 *
 * Return 0 on success and -EEXIST on an exclusive bit conflict.
 */
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
		   u32 exclusive_bits, u64 *failed_start,
		   struct extent_state **cached_state, gfp_t mask,
		   struct extent_changeset *changeset)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_set_extent_bit(tree, start, end - start + 1, bits);

	/* @failed_start is required with exclusive bits, forbidden without. */
	if (exclusive_bits)
		ASSERT(failed_start);
	else
		ASSERT(failed_start == NULL);
again:
	if (!prealloc && gfpflags_allow_blocking(mask)) {
		/*
		 * Don't care for allocation failure here because we might end
		 * up not needing the pre-allocated extent state at all, which
		 * is the case if we only have in the tree extent states that
		 * cover our input range and don't cover too any other range.
		 * If we end up needing a new extent state we allocate it later.
		 */
		prealloc = alloc_extent_state(mask);
	}

	spin_lock(&tree->lock);
	/*
	 * Shortcut: if the cached state is still in the tree and covers
	 * @start, continue from it instead of searching the tree.
	 */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}
	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		/*
		 * No state found at all: insert one new state for the whole
		 * remaining range at the position returned by the search.
		 */
		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, changeset);
		cache_state(prealloc, cached_state);
		/* Clear the pointer so the state is not freed at 'out'. */
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	/*
	 * Remember the boundaries now, the helpers called below are given
	 * 'state' and the saved values are what the code relies on afterwards.
	 */
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		if (state->state & exclusive_bits) {
			*failed_start = state->start;
			err = -EEXIST;
			goto out;
		}

		set_state_bits(tree, state, bits, changeset);
		cache_state(state, cached_state);
		merge_state(tree, state);
		/* last_end + 1 would wrap around, nothing can follow. */
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		state = next_state(state);
		/*
		 * Fast path: the next state starts exactly where we are now,
		 * so continue with it without dropping the lock.
		 */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		/*
		 * If this extent already has all the bits we want set, then
		 * skip it, not necessary to split it or do anything with it.
		 */
		if ((state->state & bits) == bits) {
			start = state->end + 1;
			cache_state(state, cached_state);
			goto search_again;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);

		/*
		 * prealloc was passed to split_state(), drop our reference to
		 * it so that it is not freed at 'out'.
		 */
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, changeset);
			cache_state(state, cached_state);
			merge_state(tree, state);
			/* last_end + 1 would wrap around, nothing can follow. */
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			state = next_state(state);
			/* Same fast path as in the first case above. */
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		/* The new state ends at @end or right before the found state. */
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		err = insert_state(tree, prealloc, bits, changeset);
		if (err)
			extent_io_tree_panic(tree, err);

		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		if (state->state & exclusive_bits) {
			*failed_start = start;
			err = -EEXIST;
			goto out;
		}

		prealloc = alloc_extent_state_atomic(prealloc);
		BUG_ON(!prealloc);
		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		/* The bits go on prealloc, i.e. on the half that ends at @end. */
		set_state_bits(tree, prealloc, bits, changeset);
		cache_state(prealloc, cached_state);
		merge_state(tree, prealloc);
		prealloc = NULL;
		goto out;
	}

search_again:
	/* The whole range has been processed. */
	if (start > end)
		goto out;
	/*
	 * Drop the lock before looping so that 'again' can preallocate and,
	 * if the mask allows it, so that we can reschedule.
	 */
	spin_unlock(&tree->lock);
	if (gfpflags_allow_blocking(mask))
		cond_resched();
	goto again;

out:
	spin_unlock(&tree->lock);
	/* Free the preallocated state if it ended up unused. */
	if (prealloc)
		free_extent_state(prealloc);

	return err;

}
1224 
/**
 * convert_extent_bit - convert all bits in a given range from one bit to
 *			another
 * @tree:	the io tree to search
 * @start:	the start offset in bytes
 * @end:	the end offset in bytes (inclusive)
 * @bits:	the bits to set in this range
 * @clear_bits:	the bits to clear in this range
 * @cached_state:	state that we're going to cache
 *
 * This will go through and set bits for the given range.  If any states exist
 * already in this range they are set with the given bit and cleared of the
 * clear_bits.  This is only meant to be used by things that are mergeable, ie
 * converting from say DELALLOC to DIRTY.  This is not meant to be used with
 * boundary bits like LOCK.
 *
 * Holes in the range get a new state with only @bits set.
 *
 * All allocations are done with GFP_NOFS.
 *
 * This takes and releases tree->lock internally.
 *
 * Return: 0 on success, -ENOMEM if an extent state could not be allocated.
 */
int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
		       u32 bits, u32 clear_bits,
		       struct extent_state **cached_state)
{
	struct extent_state *state;
	struct extent_state *prealloc = NULL;
	struct rb_node *node;
	struct rb_node **p;
	struct rb_node *parent;
	int err = 0;
	u64 last_start;
	u64 last_end;
	bool first_iteration = true;

	btrfs_debug_check_extent_io_range(tree, start, end);
	trace_btrfs_convert_extent_bit(tree, start, end - start + 1, bits,
				       clear_bits);

again:
	if (!prealloc) {
		/*
		 * Best effort, don't worry if extent state allocation fails
		 * here for the first iteration. We might have a cached state
		 * that matches exactly the target range, in which case no
		 * extent state allocations are needed. We'll only know this
		 * after locking the tree.
		 */
		prealloc = alloc_extent_state(GFP_NOFS);
		/* The tree lock is not held here, so a plain return is fine. */
		if (!prealloc && !first_iteration)
			return -ENOMEM;
	}

	spin_lock(&tree->lock);
	/*
	 * Shortcut: if the cached state is still in the tree and covers
	 * @start, continue from it instead of searching the tree.
	 */
	if (cached_state && *cached_state) {
		state = *cached_state;
		if (state->start <= start && state->end > start &&
		    extent_state_in_tree(state)) {
			node = &state->rb_node;
			goto hit_next;
		}
	}

	/*
	 * this search will find all the extents that end after
	 * our range starts.
	 */
	node = tree_search_for_insert(tree, start, &p, &parent);
	if (!node) {
		/*
		 * No state found at all: insert one new state for the whole
		 * remaining range at the position returned by the search.
		 */
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		prealloc->start = start;
		prealloc->end = end;
		insert_state_fast(tree, prealloc, p, parent, bits, NULL);
		cache_state(prealloc, cached_state);
		/* Clear the pointer so the state is not freed at 'out'. */
		prealloc = NULL;
		goto out;
	}
	state = rb_entry(node, struct extent_state, rb_node);
hit_next:
	/* Saved up front, 'state' is reassigned by clear_state_bit() below. */
	last_start = state->start;
	last_end = state->end;

	/*
	 * | ---- desired range ---- |
	 * | state |
	 *
	 * Just lock what we found and keep going
	 */
	if (state->start == start && state->end <= end) {
		set_state_bits(tree, state, bits, NULL);
		cache_state(state, cached_state);
		/*
		 * The return value is used as the state to continue with,
		 * see the 'state->start == start' check below.
		 */
		state = clear_state_bit(tree, state, clear_bits, 0, NULL);
		/* last_end + 1 would wrap around, nothing can follow. */
		if (last_end == (u64)-1)
			goto out;
		start = last_end + 1;
		/*
		 * Fast path: the next state starts exactly where we are now,
		 * so continue with it without dropping the lock.
		 */
		if (start < end && state && state->start == start &&
		    !need_resched())
			goto hit_next;
		goto search_again;
	}

	/*
	 *     | ---- desired range ---- |
	 * | state |
	 *   or
	 * | ------------- state -------------- |
	 *
	 * We need to split the extent we found, and may flip bits on
	 * second half.
	 *
	 * If the extent we found extends past our
	 * range, we just split and search again.  It'll get split
	 * again the next time though.
	 *
	 * If the extent we found is inside our range, we set the
	 * desired bit on it.
	 */
	if (state->start < start) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}
		err = split_state(tree, state, prealloc, start);
		if (err)
			extent_io_tree_panic(tree, err);
		/*
		 * prealloc was passed to split_state(), drop our reference to
		 * it so that it is not freed at 'out'.
		 */
		prealloc = NULL;
		if (err)
			goto out;
		if (state->end <= end) {
			set_state_bits(tree, state, bits, NULL);
			cache_state(state, cached_state);
			state = clear_state_bit(tree, state, clear_bits, 0, NULL);
			/* last_end + 1 would wrap around, nothing can follow. */
			if (last_end == (u64)-1)
				goto out;
			start = last_end + 1;
			/* Same fast path as in the first case above. */
			if (start < end && state && state->start == start &&
			    !need_resched())
				goto hit_next;
		}
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *     | state | or               | state |
	 *
	 * There's a hole, we need to insert something in it and
	 * ignore the extent we found.
	 */
	if (state->start > start) {
		u64 this_end;
		/* The new state ends at @end or right before the found state. */
		if (end < last_start)
			this_end = end;
		else
			this_end = last_start - 1;

		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		/*
		 * Avoid to free 'prealloc' if it can be merged with
		 * the later extent.
		 */
		prealloc->start = start;
		prealloc->end = this_end;
		err = insert_state(tree, prealloc, bits, NULL);
		if (err)
			extent_io_tree_panic(tree, err);
		cache_state(prealloc, cached_state);
		prealloc = NULL;
		start = this_end + 1;
		goto search_again;
	}
	/*
	 * | ---- desired range ---- |
	 *                        | state |
	 * We need to split the extent, and set the bit
	 * on the first half
	 */
	if (state->start <= end && state->end > end) {
		prealloc = alloc_extent_state_atomic(prealloc);
		if (!prealloc) {
			err = -ENOMEM;
			goto out;
		}

		err = split_state(tree, state, prealloc, end + 1);
		if (err)
			extent_io_tree_panic(tree, err);

		/* The bits go on prealloc, i.e. on the half that ends at @end. */
		set_state_bits(tree, prealloc, bits, NULL);
		cache_state(prealloc, cached_state);
		clear_state_bit(tree, prealloc, clear_bits, 0, NULL);
		prealloc = NULL;
		goto out;
	}

search_again:
	/* The whole range has been processed. */
	if (start > end)
		goto out;
	spin_unlock(&tree->lock);
	cond_resched();
	/* From now on a failed preallocation at 'again' is fatal. */
	first_iteration = false;
	goto again;

out:
	spin_unlock(&tree->lock);
	/* Free the preallocated state if it ended up unused. */
	if (prealloc)
		free_extent_state(prealloc);

	return err;
}
1441 
1442 /* wrappers around set/clear extent bit */
1443 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1444 			   u32 bits, struct extent_changeset *changeset)
1445 {
1446 	/*
1447 	 * We don't support EXTENT_LOCKED yet, as current changeset will
1448 	 * record any bits changed, so for EXTENT_LOCKED case, it will
1449 	 * either fail with -EEXIST or changeset will record the whole
1450 	 * range.
1451 	 */
1452 	BUG_ON(bits & EXTENT_LOCKED);
1453 
1454 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL, GFP_NOFS,
1455 			      changeset);
1456 }
1457 
1458 int set_extent_bits_nowait(struct extent_io_tree *tree, u64 start, u64 end,
1459 			   u32 bits)
1460 {
1461 	return set_extent_bit(tree, start, end, bits, 0, NULL, NULL,
1462 			      GFP_NOWAIT, NULL);
1463 }
1464 
1465 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1466 		     u32 bits, int wake, int delete,
1467 		     struct extent_state **cached)
1468 {
1469 	return __clear_extent_bit(tree, start, end, bits, wake, delete,
1470 				  cached, GFP_NOFS, NULL);
1471 }
1472 
1473 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1474 		u32 bits, struct extent_changeset *changeset)
1475 {
1476 	/*
1477 	 * Don't support EXTENT_LOCKED case, same reason as
1478 	 * set_record_extent_bits().
1479 	 */
1480 	BUG_ON(bits & EXTENT_LOCKED);
1481 
1482 	return __clear_extent_bit(tree, start, end, bits, 0, 0, NULL, GFP_NOFS,
1483 				  changeset);
1484 }
1485 
1486 /*
1487  * either insert or lock state struct between start and end use mask to tell
1488  * us if waiting is desired.
1489  */
1490 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1491 		     struct extent_state **cached_state)
1492 {
1493 	int err;
1494 	u64 failed_start;
1495 
1496 	while (1) {
1497 		err = set_extent_bit(tree, start, end, EXTENT_LOCKED,
1498 				     EXTENT_LOCKED, &failed_start,
1499 				     cached_state, GFP_NOFS, NULL);
1500 		if (err == -EEXIST) {
1501 			wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
1502 			start = failed_start;
1503 		} else
1504 			break;
1505 		WARN_ON(start > end);
1506 	}
1507 	return err;
1508 }
1509 
1510 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
1511 {
1512 	int err;
1513 	u64 failed_start;
1514 
1515 	err = set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED,
1516 			     &failed_start, NULL, GFP_NOFS, NULL);
1517 	if (err == -EEXIST) {
1518 		if (failed_start > start)
1519 			clear_extent_bit(tree, start, failed_start - 1,
1520 					 EXTENT_LOCKED, 1, 0, NULL);
1521 		return 0;
1522 	}
1523 	return 1;
1524 }
1525 
1526 void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
1527 {
1528 	unsigned long index = start >> PAGE_SHIFT;
1529 	unsigned long end_index = end >> PAGE_SHIFT;
1530 	struct page *page;
1531 
1532 	while (index <= end_index) {
1533 		page = find_get_page(inode->i_mapping, index);
1534 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
1535 		clear_page_dirty_for_io(page);
1536 		put_page(page);
1537 		index++;
1538 	}
1539 }
1540 
1541 void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1542 {
1543 	struct address_space *mapping = inode->i_mapping;
1544 	unsigned long index = start >> PAGE_SHIFT;
1545 	unsigned long end_index = end >> PAGE_SHIFT;
1546 	struct folio *folio;
1547 
1548 	while (index <= end_index) {
1549 		folio = filemap_get_folio(mapping, index);
1550 		filemap_dirty_folio(mapping, folio);
1551 		folio_account_redirty(folio);
1552 		index += folio_nr_pages(folio);
1553 		folio_put(folio);
1554 	}
1555 }
1556 
1557 /* find the first state struct with 'bits' set after 'start', and
1558  * return it.  tree->lock must be held.  NULL will returned if
1559  * nothing was found after 'start'
1560  */
1561 static struct extent_state *
1562 find_first_extent_bit_state(struct extent_io_tree *tree, u64 start, u32 bits)
1563 {
1564 	struct rb_node *node;
1565 	struct extent_state *state;
1566 
1567 	/*
1568 	 * this search will find all the extents that end after
1569 	 * our range starts.
1570 	 */
1571 	node = tree_search(tree, start);
1572 	if (!node)
1573 		goto out;
1574 
1575 	while (1) {
1576 		state = rb_entry(node, struct extent_state, rb_node);
1577 		if (state->end >= start && (state->state & bits))
1578 			return state;
1579 
1580 		node = rb_next(node);
1581 		if (!node)
1582 			break;
1583 	}
1584 out:
1585 	return NULL;
1586 }
1587 
1588 /*
1589  * Find the first offset in the io tree with one or more @bits set.
1590  *
1591  * Note: If there are multiple bits set in @bits, any of them will match.
1592  *
1593  * Return 0 if we find something, and update @start_ret and @end_ret.
1594  * Return 1 if we found nothing.
1595  */
1596 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1597 			  u64 *start_ret, u64 *end_ret, u32 bits,
1598 			  struct extent_state **cached_state)
1599 {
1600 	struct extent_state *state;
1601 	int ret = 1;
1602 
1603 	spin_lock(&tree->lock);
1604 	if (cached_state && *cached_state) {
1605 		state = *cached_state;
1606 		if (state->end == start - 1 && extent_state_in_tree(state)) {
1607 			while ((state = next_state(state)) != NULL) {
1608 				if (state->state & bits)
1609 					goto got_it;
1610 			}
1611 			free_extent_state(*cached_state);
1612 			*cached_state = NULL;
1613 			goto out;
1614 		}
1615 		free_extent_state(*cached_state);
1616 		*cached_state = NULL;
1617 	}
1618 
1619 	state = find_first_extent_bit_state(tree, start, bits);
1620 got_it:
1621 	if (state) {
1622 		cache_state_if_flags(state, cached_state, 0);
1623 		*start_ret = state->start;
1624 		*end_ret = state->end;
1625 		ret = 0;
1626 	}
1627 out:
1628 	spin_unlock(&tree->lock);
1629 	return ret;
1630 }
1631 
1632 /**
1633  * Find a contiguous area of bits
1634  *
1635  * @tree:      io tree to check
1636  * @start:     offset to start the search from
1637  * @start_ret: the first offset we found with the bits set
1638  * @end_ret:   the final contiguous range of the bits that were set
1639  * @bits:      bits to look for
1640  *
1641  * set_extent_bit and clear_extent_bit can temporarily split contiguous ranges
1642  * to set bits appropriately, and then merge them again.  During this time it
1643  * will drop the tree->lock, so use this helper if you want to find the actual
1644  * contiguous area for given bits.  We will search to the first bit we find, and
1645  * then walk down the tree until we find a non-contiguous area.  The area
1646  * returned will be the full contiguous area with the bits set.
1647  */
1648 int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
1649 			       u64 *start_ret, u64 *end_ret, u32 bits)
1650 {
1651 	struct extent_state *state;
1652 	int ret = 1;
1653 
1654 	spin_lock(&tree->lock);
1655 	state = find_first_extent_bit_state(tree, start, bits);
1656 	if (state) {
1657 		*start_ret = state->start;
1658 		*end_ret = state->end;
1659 		while ((state = next_state(state)) != NULL) {
1660 			if (state->start > (*end_ret + 1))
1661 				break;
1662 			*end_ret = state->end;
1663 		}
1664 		ret = 0;
1665 	}
1666 	spin_unlock(&tree->lock);
1667 	return ret;
1668 }
1669 
/**
 * Find the first range that has @bits not set. This range could start before
 * @start.
 *
 * @tree:      the tree to search
 * @start:     offset at/after which the found extent should start
 * @start_ret: records the beginning of the range
 * @end_ret:   records the end of the range (inclusive)
 * @bits:      the set of bits which must be unset
 *
 * Since unallocated range is also considered one which doesn't have the bits
 * set it's possible that @end_ret contains -1, this happens in case the range
 * spans (last_range_end, end of device]. In this case it's up to the caller to
 * trim @end_ret to the appropriate size.
 *
 * The search is done in two steps under tree->lock: the first loop determines
 * @start_ret, the second loop extends @end_ret until a state with one of
 * @bits set is found or the tree has no more entries.
 */
void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
				 u64 *start_ret, u64 *end_ret, u32 bits)
{
	struct extent_state *state;
	struct rb_node *node, *prev = NULL, *next;

	spin_lock(&tree->lock);

	/* Find first extent with bits cleared */
	while (1) {
		node = tree_search_prev_next(tree, start, &prev, &next);
		if (!node && !next && !prev) {
			/*
			 * Tree is completely empty, send full range and let
			 * caller deal with it
			 */
			*start_ret = 0;
			*end_ret = -1;
			goto out;
		} else if (!node && !next) {
			/*
			 * We are past the last allocated chunk, set start at
			 * the end of the last extent.
			 */
			state = rb_entry(prev, struct extent_state, rb_node);
			*start_ret = state->end + 1;
			*end_ret = -1;
			goto out;
		} else if (!node) {
			/* Nothing at 'start', continue with the entry after it. */
			node = next;
		}
		/*
		 * At this point 'node' either contains 'start' or start is
		 * before 'node'
		 */
		state = rb_entry(node, struct extent_state, rb_node);

		if (in_range(start, state->start, state->end - state->start + 1)) {
			if (state->state & bits) {
				/*
				 * |--range with bits sets--|
				 *    |
				 *    start
				 *
				 * Skip past this state and search again.
				 */
				start = state->end + 1;
			} else {
				/*
				 * 'start' falls within a range that doesn't
				 * have the bits set, so take its start as
				 * the beginning of the desired range
				 *
				 * |--range with bits cleared----|
				 *      |
				 *      start
				 */
				*start_ret = state->start;
				break;
			}
		} else {
			/*
			 * |---prev range---|---hole/unset---|---node range---|
			 *                          |
			 *                        start
			 *
			 *                        or
			 *
			 * |---hole/unset--||--first node--|
			 * 0   |
			 *    start
			 */
			if (prev) {
				state = rb_entry(prev, struct extent_state,
						 rb_node);
				*start_ret = state->end + 1;
			} else {
				*start_ret = 0;
			}
			break;
		}
	}

	/*
	 * Find the longest stretch from start until an entry which has the
	 * bits set
	 */
	while (1) {
		state = rb_entry(node, struct extent_state, rb_node);
		if (state->end >= start && !(state->state & bits)) {
			*end_ret = state->end;
		} else {
			/* The range ends right before this state. */
			*end_ret = state->start - 1;
			break;
		}

		node = rb_next(node);
		/* No more entries, @end_ret keeps the end of the last state. */
		if (!node)
			break;
	}
out:
	spin_unlock(&tree->lock);
}
1786 
1787 /*
1788  * find a contiguous range of bytes in the file marked as delalloc, not
1789  * more than 'max_bytes'.  start and end are used to return the range,
1790  *
1791  * true is returned if we find something, false if nothing was in the tree
1792  */
1793 bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
1794 			       u64 *end, u64 max_bytes,
1795 			       struct extent_state **cached_state)
1796 {
1797 	struct rb_node *node;
1798 	struct extent_state *state;
1799 	u64 cur_start = *start;
1800 	bool found = false;
1801 	u64 total_bytes = 0;
1802 
1803 	spin_lock(&tree->lock);
1804 
1805 	/*
1806 	 * this search will find all the extents that end after
1807 	 * our range starts.
1808 	 */
1809 	node = tree_search(tree, cur_start);
1810 	if (!node) {
1811 		*end = (u64)-1;
1812 		goto out;
1813 	}
1814 
1815 	while (1) {
1816 		state = rb_entry(node, struct extent_state, rb_node);
1817 		if (found && (state->start != cur_start ||
1818 			      (state->state & EXTENT_BOUNDARY))) {
1819 			goto out;
1820 		}
1821 		if (!(state->state & EXTENT_DELALLOC)) {
1822 			if (!found)
1823 				*end = state->end;
1824 			goto out;
1825 		}
1826 		if (!found) {
1827 			*start = state->start;
1828 			*cached_state = state;
1829 			refcount_inc(&state->refs);
1830 		}
1831 		found = true;
1832 		*end = state->end;
1833 		cur_start = state->end + 1;
1834 		node = rb_next(node);
1835 		total_bytes += state->end - state->start + 1;
1836 		if (total_bytes >= max_bytes)
1837 			break;
1838 		if (!node)
1839 			break;
1840 	}
1841 out:
1842 	spin_unlock(&tree->lock);
1843 	return found;
1844 }
1845 
1846 /*
1847  * Process one page for __process_pages_contig().
1848  *
1849  * Return >0 if we hit @page == @locked_page.
1850  * Return 0 if we updated the page status.
1851  * Return -EGAIN if the we need to try again.
1852  * (For PAGE_LOCK case but got dirty page or page not belong to mapping)
1853  */
1854 static int process_one_page(struct btrfs_fs_info *fs_info,
1855 			    struct address_space *mapping,
1856 			    struct page *page, struct page *locked_page,
1857 			    unsigned long page_ops, u64 start, u64 end)
1858 {
1859 	u32 len;
1860 
1861 	ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX);
1862 	len = end + 1 - start;
1863 
1864 	if (page_ops & PAGE_SET_ORDERED)
1865 		btrfs_page_clamp_set_ordered(fs_info, page, start, len);
1866 	if (page_ops & PAGE_SET_ERROR)
1867 		btrfs_page_clamp_set_error(fs_info, page, start, len);
1868 	if (page_ops & PAGE_START_WRITEBACK) {
1869 		btrfs_page_clamp_clear_dirty(fs_info, page, start, len);
1870 		btrfs_page_clamp_set_writeback(fs_info, page, start, len);
1871 	}
1872 	if (page_ops & PAGE_END_WRITEBACK)
1873 		btrfs_page_clamp_clear_writeback(fs_info, page, start, len);
1874 
1875 	if (page == locked_page)
1876 		return 1;
1877 
1878 	if (page_ops & PAGE_LOCK) {
1879 		int ret;
1880 
1881 		ret = btrfs_page_start_writer_lock(fs_info, page, start, len);
1882 		if (ret)
1883 			return ret;
1884 		if (!PageDirty(page) || page->mapping != mapping) {
1885 			btrfs_page_end_writer_lock(fs_info, page, start, len);
1886 			return -EAGAIN;
1887 		}
1888 	}
1889 	if (page_ops & PAGE_UNLOCK)
1890 		btrfs_page_end_writer_lock(fs_info, page, start, len);
1891 	return 0;
1892 }
1893 
/*
 * Apply @page_ops to the pages of @mapping covering the inclusive byte range
 * [@start, @end], looking the folios up in contiguous batches.
 *
 * @locked_page:   passed to process_one_page() for every page
 * @processed_end: optional, only written when an error is returned, see the
 *                 comment at the 'out' label
 *
 * If PAGE_LOCK is requested it must be the only operation, and
 * @processed_end must be non-NULL and hold @start on entry (both asserted).
 *
 * If PAGE_SET_ERROR is requested and the range covers at least one page
 * index, -EIO is recorded on the mapping before the pages are processed.
 *
 * Return 0 on success.  Return -EAGAIN if a lookup found no folio at the
 * current index or if process_one_page() returned a negative value.
 */
static int __process_pages_contig(struct address_space *mapping,
				  struct page *locked_page,
				  u64 start, u64 end, unsigned long page_ops,
				  u64 *processed_end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb);
	pgoff_t start_index = start >> PAGE_SHIFT;
	pgoff_t end_index = end >> PAGE_SHIFT;
	pgoff_t index = start_index;
	unsigned long pages_processed = 0;
	struct folio_batch fbatch;
	int err = 0;
	int i;

	if (page_ops & PAGE_LOCK) {
		ASSERT(page_ops == PAGE_LOCK);
		ASSERT(processed_end && *processed_end == start);
	}

	if ((page_ops & PAGE_SET_ERROR) && start_index <= end_index)
		mapping_set_error(mapping, -EIO);

	folio_batch_init(&fbatch);
	while (index <= end_index) {
		int found_folios;

		/* @index is passed by pointer so the lookup can advance it. */
		found_folios = filemap_get_folios_contig(mapping, &index,
				end_index, &fbatch);

		if (found_folios == 0) {
			/*
			 * Only if we're going to lock these pages, we can find
			 * nothing at @index.
			 */
			ASSERT(page_ops & PAGE_LOCK);
			err = -EAGAIN;
			goto out;
		}

		for (i = 0; i < found_folios; i++) {
			int process_ret;
			struct folio *folio = fbatch.folios[i];
			process_ret = process_one_page(fs_info, mapping,
					&folio->page, locked_page, page_ops,
					start, end);
			if (process_ret < 0) {
				/* Any failure is reported as -EAGAIN. */
				err = -EAGAIN;
				/* Release the batch before bailing out. */
				folio_batch_release(&fbatch);
				goto out;
			}
			/*
			 * Counted for a return value of 0 and for >0 (the
			 * locked page) alike.
			 */
			pages_processed += folio_nr_pages(folio);
		}
		folio_batch_release(&fbatch);
		cond_resched();
	}
out:
	if (err && processed_end) {
		/*
		 * Update @processed_end. I know this is awful since it has
		 * two different return value patterns (inclusive vs exclusive).
		 *
		 * But the exclusive pattern is necessary if @start is 0, or we
		 * underflow and check against processed_end won't work as
		 * expected.
		 */
		if (pages_processed)
			*processed_end = min(end,
			((u64)(start_index + pages_processed) << PAGE_SHIFT) - 1);
		else
			*processed_end = start;
	}
	return err;
}
1967 
1968 static noinline void __unlock_for_delalloc(struct inode *inode,
1969 					   struct page *locked_page,
1970 					   u64 start, u64 end)
1971 {
1972 	unsigned long index = start >> PAGE_SHIFT;
1973 	unsigned long end_index = end >> PAGE_SHIFT;
1974 
1975 	ASSERT(locked_page);
1976 	if (index == locked_page->index && end_index == index)
1977 		return;
1978 
1979 	__process_pages_contig(inode->i_mapping, locked_page, start, end,
1980 			       PAGE_UNLOCK, NULL);
1981 }
1982 
1983 static noinline int lock_delalloc_pages(struct inode *inode,
1984 					struct page *locked_page,
1985 					u64 delalloc_start,
1986 					u64 delalloc_end)
1987 {
1988 	unsigned long index = delalloc_start >> PAGE_SHIFT;
1989 	unsigned long end_index = delalloc_end >> PAGE_SHIFT;
1990 	u64 processed_end = delalloc_start;
1991 	int ret;
1992 
1993 	ASSERT(locked_page);
1994 	if (index == locked_page->index && index == end_index)
1995 		return 0;
1996 
1997 	ret = __process_pages_contig(inode->i_mapping, locked_page, delalloc_start,
1998 				     delalloc_end, PAGE_LOCK, &processed_end);
1999 	if (ret == -EAGAIN && processed_end > delalloc_start)
2000 		__unlock_for_delalloc(inode, locked_page, delalloc_start,
2001 				      processed_end);
2002 	return ret;
2003 }
2004 
/*
 * Find and lock a contiguous range of bytes in the file marked as delalloc, no
 * more than @max_bytes.
 *
 * @inode:	 The inode whose io_tree is searched.
 * @locked_page: The page the caller passes down to the page locking helpers.
 *		 The search range must cover at least part of it (asserted).
 * @start:	The original start bytenr to search.
 *		Will store the extent range start bytenr.
 * @end:	The original end bytenr of the search range
 *		Will store the extent range end bytenr.
 *
 * The size limit is fs_info->max_extent_size, or BTRFS_MAX_EXTENT_SIZE if
 * there is no fs_info.
 *
 * Return true if we find a delalloc range which starts inside the original
 * range, and @start/@end will store the delalloc range start/end.
 *
 * Return false if we can't find any delalloc range which starts inside the
 * original range, and @start/@end will be the non-delalloc range start/end.
 * False is also returned, with @start/@end untouched, if locking the pages
 * failed with -EAGAIN twice.
 */
EXPORT_FOR_TESTS
noinline_for_stack bool find_lock_delalloc_range(struct inode *inode,
				    struct page *locked_page, u64 *start,
				    u64 *end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
	const u64 orig_start = *start;
	const u64 orig_end = *end;
	/* The sanity tests may not set a valid fs_info. */
	u64 max_bytes = fs_info ? fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE;
	u64 delalloc_start;
	u64 delalloc_end;
	bool found;
	struct extent_state *cached_state = NULL;
	int ret;
	int loops = 0;

	/* Caller should pass a valid @end to indicate the search range end */
	ASSERT(orig_end > orig_start);

	/* The range should at least cover part of the page */
	ASSERT(!(orig_start >= page_offset(locked_page) + PAGE_SIZE ||
		 orig_end <= page_offset(locked_page)));
again:
	/* step one, find a bunch of delalloc bytes starting at start */
	delalloc_start = *start;
	delalloc_end = 0;
	found = btrfs_find_delalloc_range(tree, &delalloc_start, &delalloc_end,
					  max_bytes, &cached_state);
	/*
	 * Nothing usable: no delalloc at all, or the range found ends at or
	 * before @start, or it starts after the original search range.
	 */
	if (!found || delalloc_end <= *start || delalloc_start > orig_end) {
		*start = delalloc_start;

		/* @delalloc_end can be -1, never go beyond @orig_end */
		*end = min(delalloc_end, orig_end);
		free_extent_state(cached_state);
		return false;
	}

	/*
	 * start comes from the offset of locked_page.  We have to lock
	 * pages in order, so we can't process delalloc bytes before
	 * locked_page
	 */
	if (delalloc_start < *start)
		delalloc_start = *start;

	/*
	 * make sure to limit the number of pages we try to lock down
	 */
	if (delalloc_end + 1 - delalloc_start > max_bytes)
		delalloc_end = delalloc_start + max_bytes - 1;

	/* step two, lock all the pages after the page that has start */
	ret = lock_delalloc_pages(inode, locked_page,
				  delalloc_start, delalloc_end);
	ASSERT(!ret || ret == -EAGAIN);
	if (ret == -EAGAIN) {
		/*
		 * some of the pages are gone, lets avoid looping by
		 * shortening the size of the delalloc range we're searching
		 */
		free_extent_state(cached_state);
		cached_state = NULL;
		if (!loops) {
			/* Retry exactly once, with a one page limit. */
			max_bytes = PAGE_SIZE;
			loops = 1;
			goto again;
		} else {
			found = false;
			goto out_failed;
		}
	}

	/* step three, lock the state bits for the whole range */
	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);

	/* then test to make sure it is all still delalloc */
	ret = test_range_bit(tree, delalloc_start, delalloc_end,
			     EXTENT_DELALLOC, 1, cached_state);
	if (!ret) {
		/*
		 * The range changed under us: undo the extent lock and the
		 * page locks, then start over.
		 */
		unlock_extent_cached(tree, delalloc_start, delalloc_end,
				     &cached_state);
		__unlock_for_delalloc(inode, locked_page,
			      delalloc_start, delalloc_end);
		cond_resched();
		goto again;
	}
	free_extent_state(cached_state);
	*start = delalloc_start;
	*end = delalloc_end;
out_failed:
	return found;
}
2113 
2114 void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2115 				  struct page *locked_page,
2116 				  u32 clear_bits, unsigned long page_ops)
2117 {
2118 	clear_extent_bit(&inode->io_tree, start, end, clear_bits, 1, 0, NULL);
2119 
2120 	__process_pages_contig(inode->vfs_inode.i_mapping, locked_page,
2121 			       start, end, page_ops, NULL);
2122 }
2123 
2124 /*
2125  * count the number of bytes in the tree that have a given bit(s)
2126  * set.  This can be fairly slow, except for EXTENT_DIRTY which is
2127  * cached.  The total number found is returned.
2128  */
2129 u64 count_range_bits(struct extent_io_tree *tree,
2130 		     u64 *start, u64 search_end, u64 max_bytes,
2131 		     u32 bits, int contig)
2132 {
2133 	struct rb_node *node;
2134 	struct extent_state *state;
2135 	u64 cur_start = *start;
2136 	u64 total_bytes = 0;
2137 	u64 last = 0;
2138 	int found = 0;
2139 
2140 	if (WARN_ON(search_end <= cur_start))
2141 		return 0;
2142 
2143 	spin_lock(&tree->lock);
2144 	if (cur_start == 0 && bits == EXTENT_DIRTY) {
2145 		total_bytes = tree->dirty_bytes;
2146 		goto out;
2147 	}
2148 	/*
2149 	 * this search will find all the extents that end after
2150 	 * our range starts.
2151 	 */
2152 	node = tree_search(tree, cur_start);
2153 	if (!node)
2154 		goto out;
2155 
2156 	while (1) {
2157 		state = rb_entry(node, struct extent_state, rb_node);
2158 		if (state->start > search_end)
2159 			break;
2160 		if (contig && found && state->start > last + 1)
2161 			break;
2162 		if (state->end >= cur_start && (state->state & bits) == bits) {
2163 			total_bytes += min(search_end, state->end) + 1 -
2164 				       max(cur_start, state->start);
2165 			if (total_bytes >= max_bytes)
2166 				break;
2167 			if (!found) {
2168 				*start = max(cur_start, state->start);
2169 				found = 1;
2170 			}
2171 			last = state->end;
2172 		} else if (contig && found) {
2173 			break;
2174 		}
2175 		node = rb_next(node);
2176 		if (!node)
2177 			break;
2178 	}
2179 out:
2180 	spin_unlock(&tree->lock);
2181 	return total_bytes;
2182 }
2183 
2184 /*
2185  * set the private field for a given byte offset in the tree.  If there isn't
2186  * an extent_state there already, this does nothing.
2187  */
2188 int set_state_failrec(struct extent_io_tree *tree, u64 start,
2189 		      struct io_failure_record *failrec)
2190 {
2191 	struct rb_node *node;
2192 	struct extent_state *state;
2193 	int ret = 0;
2194 
2195 	spin_lock(&tree->lock);
2196 	/*
2197 	 * this search will find all the extents that end after
2198 	 * our range starts.
2199 	 */
2200 	node = tree_search(tree, start);
2201 	if (!node) {
2202 		ret = -ENOENT;
2203 		goto out;
2204 	}
2205 	state = rb_entry(node, struct extent_state, rb_node);
2206 	if (state->start != start) {
2207 		ret = -ENOENT;
2208 		goto out;
2209 	}
2210 	state->failrec = failrec;
2211 out:
2212 	spin_unlock(&tree->lock);
2213 	return ret;
2214 }
2215 
2216 struct io_failure_record *get_state_failrec(struct extent_io_tree *tree, u64 start)
2217 {
2218 	struct rb_node *node;
2219 	struct extent_state *state;
2220 	struct io_failure_record *failrec;
2221 
2222 	spin_lock(&tree->lock);
2223 	/*
2224 	 * this search will find all the extents that end after
2225 	 * our range starts.
2226 	 */
2227 	node = tree_search(tree, start);
2228 	if (!node) {
2229 		failrec = ERR_PTR(-ENOENT);
2230 		goto out;
2231 	}
2232 	state = rb_entry(node, struct extent_state, rb_node);
2233 	if (state->start != start) {
2234 		failrec = ERR_PTR(-ENOENT);
2235 		goto out;
2236 	}
2237 
2238 	failrec = state->failrec;
2239 out:
2240 	spin_unlock(&tree->lock);
2241 	return failrec;
2242 }
2243 
2244 /*
2245  * searches a range in the state tree for a given mask.
2246  * If 'filled' == 1, this returns 1 only if every extent in the tree
2247  * has the bits set.  Otherwise, 1 is returned if any bit in the
2248  * range is found set.
2249  */
2250 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
2251 		   u32 bits, int filled, struct extent_state *cached)
2252 {
2253 	struct extent_state *state = NULL;
2254 	struct rb_node *node;
2255 	int bitset = 0;
2256 
2257 	spin_lock(&tree->lock);
2258 	if (cached && extent_state_in_tree(cached) && cached->start <= start &&
2259 	    cached->end > start)
2260 		node = &cached->rb_node;
2261 	else
2262 		node = tree_search(tree, start);
2263 	while (node && start <= end) {
2264 		state = rb_entry(node, struct extent_state, rb_node);
2265 
2266 		if (filled && state->start > start) {
2267 			bitset = 0;
2268 			break;
2269 		}
2270 
2271 		if (state->start > end)
2272 			break;
2273 
2274 		if (state->state & bits) {
2275 			bitset = 1;
2276 			if (!filled)
2277 				break;
2278 		} else if (filled) {
2279 			bitset = 0;
2280 			break;
2281 		}
2282 
2283 		if (state->end == (u64)-1)
2284 			break;
2285 
2286 		start = state->end + 1;
2287 		if (start > end)
2288 			break;
2289 		node = rb_next(node);
2290 		if (!node) {
2291 			if (filled)
2292 				bitset = 0;
2293 			break;
2294 		}
2295 	}
2296 	spin_unlock(&tree->lock);
2297 	return bitset;
2298 }
2299 
2300 int free_io_failure(struct extent_io_tree *failure_tree,
2301 		    struct extent_io_tree *io_tree,
2302 		    struct io_failure_record *rec)
2303 {
2304 	int ret;
2305 	int err = 0;
2306 
2307 	set_state_failrec(failure_tree, rec->start, NULL);
2308 	ret = clear_extent_bits(failure_tree, rec->start,
2309 				rec->start + rec->len - 1,
2310 				EXTENT_LOCKED | EXTENT_DIRTY);
2311 	if (ret)
2312 		err = ret;
2313 
2314 	ret = clear_extent_bits(io_tree, rec->start,
2315 				rec->start + rec->len - 1,
2316 				EXTENT_DAMAGED);
2317 	if (ret && !err)
2318 		err = ret;
2319 
2320 	kfree(rec);
2321 	return err;
2322 }
2323 
2324 /*
2325  * this bypasses the standard btrfs submit functions deliberately, as
2326  * the standard behavior is to write all copies in a raid setup. here we only
2327  * want to write the one bad copy. so we do the mapping for ourselves and issue
2328  * submit_bio directly.
2329  * to avoid any synchronization issues, wait for the data after writing, which
2330  * actually prevents the read that triggered the error from finishing.
2331  * currently, there can be no more than two copies of every data bit. thus,
2332  * exactly one rewrite is required.
2333  */
2334 static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
2335 			     u64 length, u64 logical, struct page *page,
2336 			     unsigned int pg_offset, int mirror_num)
2337 {
2338 	struct btrfs_device *dev;
2339 	struct bio_vec bvec;
2340 	struct bio bio;
2341 	u64 map_length = 0;
2342 	u64 sector;
2343 	struct btrfs_io_context *bioc = NULL;
2344 	int ret = 0;
2345 
2346 	ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
2347 	BUG_ON(!mirror_num);
2348 
2349 	if (btrfs_repair_one_zone(fs_info, logical))
2350 		return 0;
2351 
2352 	map_length = length;
2353 
2354 	/*
2355 	 * Avoid races with device replace and make sure our bioc has devices
2356 	 * associated to its stripes that don't go away while we are doing the
2357 	 * read repair operation.
2358 	 */
2359 	btrfs_bio_counter_inc_blocked(fs_info);
2360 	if (btrfs_is_parity_mirror(fs_info, logical, length)) {
2361 		/*
2362 		 * Note that we don't use BTRFS_MAP_WRITE because it's supposed
2363 		 * to update all raid stripes, but here we just want to correct
2364 		 * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
2365 		 * stripe's dev and sector.
2366 		 */
2367 		ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
2368 				      &map_length, &bioc, 0);
2369 		if (ret)
2370 			goto out_counter_dec;
2371 		ASSERT(bioc->mirror_num == 1);
2372 	} else {
2373 		ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
2374 				      &map_length, &bioc, mirror_num);
2375 		if (ret)
2376 			goto out_counter_dec;
2377 		BUG_ON(mirror_num != bioc->mirror_num);
2378 	}
2379 
2380 	sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
2381 	dev = bioc->stripes[bioc->mirror_num - 1].dev;
2382 	btrfs_put_bioc(bioc);
2383 
2384 	if (!dev || !dev->bdev ||
2385 	    !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
2386 		ret = -EIO;
2387 		goto out_counter_dec;
2388 	}
2389 
2390 	bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
2391 	bio.bi_iter.bi_sector = sector;
2392 	__bio_add_page(&bio, page, length, pg_offset);
2393 
2394 	btrfsic_check_bio(&bio);
2395 	ret = submit_bio_wait(&bio);
2396 	if (ret) {
2397 		/* try to remap that extent elsewhere? */
2398 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
2399 		goto out_bio_uninit;
2400 	}
2401 
2402 	btrfs_info_rl_in_rcu(fs_info,
2403 		"read error corrected: ino %llu off %llu (dev %s sector %llu)",
2404 				  ino, start,
2405 				  rcu_str_deref(dev->name), sector);
2406 	ret = 0;
2407 
2408 out_bio_uninit:
2409 	bio_uninit(&bio);
2410 out_counter_dec:
2411 	btrfs_bio_counter_dec(fs_info);
2412 	return ret;
2413 }
2414 
2415 int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num)
2416 {
2417 	struct btrfs_fs_info *fs_info = eb->fs_info;
2418 	u64 start = eb->start;
2419 	int i, num_pages = num_extent_pages(eb);
2420 	int ret = 0;
2421 
2422 	if (sb_rdonly(fs_info->sb))
2423 		return -EROFS;
2424 
2425 	for (i = 0; i < num_pages; i++) {
2426 		struct page *p = eb->pages[i];
2427 
2428 		ret = repair_io_failure(fs_info, 0, start, PAGE_SIZE, start, p,
2429 					start - page_offset(p), mirror_num);
2430 		if (ret)
2431 			break;
2432 		start += PAGE_SIZE;
2433 	}
2434 
2435 	return ret;
2436 }
2437 
2438 static int next_mirror(const struct io_failure_record *failrec, int cur_mirror)
2439 {
2440 	if (cur_mirror == failrec->num_copies)
2441 		return cur_mirror + 1 - failrec->num_copies;
2442 	return cur_mirror + 1;
2443 }
2444 
2445 static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror)
2446 {
2447 	if (cur_mirror == 1)
2448 		return failrec->num_copies;
2449 	return cur_mirror - 1;
2450 }
2451 
2452 /*
2453  * each time an IO finishes, we do a fast check in the IO failure tree
2454  * to see if we need to process or clean up an io_failure_record
2455  */
2456 int clean_io_failure(struct btrfs_fs_info *fs_info,
2457 		     struct extent_io_tree *failure_tree,
2458 		     struct extent_io_tree *io_tree, u64 start,
2459 		     struct page *page, u64 ino, unsigned int pg_offset)
2460 {
2461 	u64 private;
2462 	struct io_failure_record *failrec;
2463 	struct extent_state *state;
2464 	int mirror;
2465 	int ret;
2466 
2467 	private = 0;
2468 	ret = count_range_bits(failure_tree, &private, (u64)-1, 1,
2469 			       EXTENT_DIRTY, 0);
2470 	if (!ret)
2471 		return 0;
2472 
2473 	failrec = get_state_failrec(failure_tree, start);
2474 	if (IS_ERR(failrec))
2475 		return 0;
2476 
2477 	BUG_ON(!failrec->this_mirror);
2478 
2479 	if (sb_rdonly(fs_info->sb))
2480 		goto out;
2481 
2482 	spin_lock(&io_tree->lock);
2483 	state = find_first_extent_bit_state(io_tree,
2484 					    failrec->start,
2485 					    EXTENT_LOCKED);
2486 	spin_unlock(&io_tree->lock);
2487 
2488 	if (!state || state->start > failrec->start ||
2489 	    state->end < failrec->start + failrec->len - 1)
2490 		goto out;
2491 
2492 	mirror = failrec->this_mirror;
2493 	do {
2494 		mirror = prev_mirror(failrec, mirror);
2495 		repair_io_failure(fs_info, ino, start, failrec->len,
2496 				  failrec->logical, page, pg_offset, mirror);
2497 	} while (mirror != failrec->failed_mirror);
2498 
2499 out:
2500 	free_io_failure(failure_tree, io_tree, failrec);
2501 	return 0;
2502 }
2503 
2504 /*
2505  * Can be called when
2506  * - hold extent lock
2507  * - under ordered extent
2508  * - the inode is freeing
2509  */
2510 void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end)
2511 {
2512 	struct extent_io_tree *failure_tree = &inode->io_failure_tree;
2513 	struct io_failure_record *failrec;
2514 	struct extent_state *state, *next;
2515 
2516 	if (RB_EMPTY_ROOT(&failure_tree->state))
2517 		return;
2518 
2519 	spin_lock(&failure_tree->lock);
2520 	state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY);
2521 	while (state) {
2522 		if (state->start > end)
2523 			break;
2524 
2525 		ASSERT(state->end <= end);
2526 
2527 		next = next_state(state);
2528 
2529 		failrec = state->failrec;
2530 		free_extent_state(state);
2531 		kfree(failrec);
2532 
2533 		state = next;
2534 	}
2535 	spin_unlock(&failure_tree->lock);
2536 }
2537 
2538 static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode,
2539 							     struct btrfs_bio *bbio,
2540 							     unsigned int bio_offset)
2541 {
2542 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2543 	u64 start = bbio->file_offset + bio_offset;
2544 	struct io_failure_record *failrec;
2545 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2546 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2547 	const u32 sectorsize = fs_info->sectorsize;
2548 	int ret;
2549 
2550 	failrec = get_state_failrec(failure_tree, start);
2551 	if (!IS_ERR(failrec)) {
2552 		btrfs_debug(fs_info,
2553 	"Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu",
2554 			failrec->logical, failrec->start, failrec->len);
2555 		/*
2556 		 * when data can be on disk more than twice, add to failrec here
2557 		 * (e.g. with a list for failed_mirror) to make
2558 		 * clean_io_failure() clean all those errors at once.
2559 		 */
2560 		ASSERT(failrec->this_mirror == bbio->mirror_num);
2561 		ASSERT(failrec->len == fs_info->sectorsize);
2562 		return failrec;
2563 	}
2564 
2565 	failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2566 	if (!failrec)
2567 		return ERR_PTR(-ENOMEM);
2568 
2569 	failrec->start = start;
2570 	failrec->len = sectorsize;
2571 	failrec->failed_mirror = bbio->mirror_num;
2572 	failrec->this_mirror = bbio->mirror_num;
2573 	failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset;
2574 
2575 	btrfs_debug(fs_info,
2576 		    "new io failure record logical %llu start %llu",
2577 		    failrec->logical, start);
2578 
2579 	failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize);
2580 	if (failrec->num_copies == 1) {
2581 		/*
2582 		 * We only have a single copy of the data, so don't bother with
2583 		 * all the retry and error correction code that follows. No
2584 		 * matter what the error is, it is very likely to persist.
2585 		 */
2586 		btrfs_debug(fs_info,
2587 			"cannot repair logical %llu num_copies %d",
2588 			failrec->logical, failrec->num_copies);
2589 		kfree(failrec);
2590 		return ERR_PTR(-EIO);
2591 	}
2592 
2593 	/* Set the bits in the private failure tree */
2594 	ret = set_extent_bits(failure_tree, start, start + sectorsize - 1,
2595 			      EXTENT_LOCKED | EXTENT_DIRTY);
2596 	if (ret >= 0) {
2597 		ret = set_state_failrec(failure_tree, start, failrec);
2598 		/* Set the bits in the inode's tree */
2599 		ret = set_extent_bits(tree, start, start + sectorsize - 1,
2600 				      EXTENT_DAMAGED);
2601 	} else if (ret < 0) {
2602 		kfree(failrec);
2603 		return ERR_PTR(ret);
2604 	}
2605 
2606 	return failrec;
2607 }
2608 
2609 int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
2610 			    u32 bio_offset, struct page *page, unsigned int pgoff,
2611 			    submit_bio_hook_t *submit_bio_hook)
2612 {
2613 	u64 start = failed_bbio->file_offset + bio_offset;
2614 	struct io_failure_record *failrec;
2615 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2616 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
2617 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
2618 	struct bio *failed_bio = &failed_bbio->bio;
2619 	const int icsum = bio_offset >> fs_info->sectorsize_bits;
2620 	struct bio *repair_bio;
2621 	struct btrfs_bio *repair_bbio;
2622 
2623 	btrfs_debug(fs_info,
2624 		   "repair read error: read error at %llu", start);
2625 
2626 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
2627 
2628 	failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset);
2629 	if (IS_ERR(failrec))
2630 		return PTR_ERR(failrec);
2631 
2632 	/*
2633 	 * There are two premises:
2634 	 * a) deliver good data to the caller
2635 	 * b) correct the bad sectors on disk
2636 	 *
2637 	 * Since we're only doing repair for one sector, we only need to get
2638 	 * a good copy of the failed sector and if we succeed, we have setup
2639 	 * everything for repair_io_failure to do the rest for us.
2640 	 */
2641 	failrec->this_mirror = next_mirror(failrec, failrec->this_mirror);
2642 	if (failrec->this_mirror == failrec->failed_mirror) {
2643 		btrfs_debug(fs_info,
2644 			"failed to repair num_copies %d this_mirror %d failed_mirror %d",
2645 			failrec->num_copies, failrec->this_mirror, failrec->failed_mirror);
2646 		free_io_failure(failure_tree, tree, failrec);
2647 		return -EIO;
2648 	}
2649 
2650 	repair_bio = btrfs_bio_alloc(1);
2651 	repair_bbio = btrfs_bio(repair_bio);
2652 	repair_bbio->file_offset = start;
2653 	repair_bio->bi_opf = REQ_OP_READ;
2654 	repair_bio->bi_end_io = failed_bio->bi_end_io;
2655 	repair_bio->bi_iter.bi_sector = failrec->logical >> 9;
2656 	repair_bio->bi_private = failed_bio->bi_private;
2657 
2658 	if (failed_bbio->csum) {
2659 		const u32 csum_size = fs_info->csum_size;
2660 
2661 		repair_bbio->csum = repair_bbio->csum_inline;
2662 		memcpy(repair_bbio->csum,
2663 		       failed_bbio->csum + csum_size * icsum, csum_size);
2664 	}
2665 
2666 	bio_add_page(repair_bio, page, failrec->len, pgoff);
2667 	repair_bbio->iter = repair_bio->bi_iter;
2668 
2669 	btrfs_debug(btrfs_sb(inode->i_sb),
2670 		    "repair read error: submitting new read to mirror %d",
2671 		    failrec->this_mirror);
2672 
2673 	/*
2674 	 * At this point we have a bio, so any errors from submit_bio_hook()
2675 	 * will be handled by the endio on the repair_bio, so we can't return an
2676 	 * error here.
2677 	 */
2678 	submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0);
2679 	return BLK_STS_OK;
2680 }
2681 
2682 static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len)
2683 {
2684 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
2685 
2686 	ASSERT(page_offset(page) <= start &&
2687 	       start + len <= page_offset(page) + PAGE_SIZE);
2688 
2689 	if (uptodate) {
2690 		if (fsverity_active(page->mapping->host) &&
2691 		    !PageError(page) &&
2692 		    !PageUptodate(page) &&
2693 		    start < i_size_read(page->mapping->host) &&
2694 		    !fsverity_verify_page(page)) {
2695 			btrfs_page_set_error(fs_info, page, start, len);
2696 		} else {
2697 			btrfs_page_set_uptodate(fs_info, page, start, len);
2698 		}
2699 	} else {
2700 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2701 		btrfs_page_set_error(fs_info, page, start, len);
2702 	}
2703 
2704 	if (!btrfs_is_subpage(fs_info, page))
2705 		unlock_page(page);
2706 	else
2707 		btrfs_subpage_end_reader(fs_info, page, start, len);
2708 }
2709 
2710 static void end_sector_io(struct page *page, u64 offset, bool uptodate)
2711 {
2712 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
2713 	const u32 sectorsize = inode->root->fs_info->sectorsize;
2714 	struct extent_state *cached = NULL;
2715 
2716 	end_page_read(page, uptodate, offset, sectorsize);
2717 	if (uptodate)
2718 		set_extent_uptodate(&inode->io_tree, offset,
2719 				    offset + sectorsize - 1, &cached, GFP_ATOMIC);
2720 	unlock_extent_cached_atomic(&inode->io_tree, offset,
2721 				    offset + sectorsize - 1, &cached);
2722 }
2723 
2724 static void submit_data_read_repair(struct inode *inode,
2725 				    struct btrfs_bio *failed_bbio,
2726 				    u32 bio_offset, const struct bio_vec *bvec,
2727 				    unsigned int error_bitmap)
2728 {
2729 	const unsigned int pgoff = bvec->bv_offset;
2730 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2731 	struct page *page = bvec->bv_page;
2732 	const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset;
2733 	const u64 end = start + bvec->bv_len - 1;
2734 	const u32 sectorsize = fs_info->sectorsize;
2735 	const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits;
2736 	int i;
2737 
2738 	BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE);
2739 
2740 	/* This repair is only for data */
2741 	ASSERT(is_data_inode(inode));
2742 
2743 	/* We're here because we had some read errors or csum mismatch */
2744 	ASSERT(error_bitmap);
2745 
2746 	/*
2747 	 * We only get called on buffered IO, thus page must be mapped and bio
2748 	 * must not be cloned.
2749 	 */
2750 	ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED));
2751 
2752 	/* Iterate through all the sectors in the range */
2753 	for (i = 0; i < nr_bits; i++) {
2754 		const unsigned int offset = i * sectorsize;
2755 		bool uptodate = false;
2756 		int ret;
2757 
2758 		if (!(error_bitmap & (1U << i))) {
2759 			/*
2760 			 * This sector has no error, just end the page read
2761 			 * and unlock the range.
2762 			 */
2763 			uptodate = true;
2764 			goto next;
2765 		}
2766 
2767 		ret = btrfs_repair_one_sector(inode, failed_bbio,
2768 				bio_offset + offset, page, pgoff + offset,
2769 				btrfs_submit_data_read_bio);
2770 		if (!ret) {
2771 			/*
2772 			 * We have submitted the read repair, the page release
2773 			 * will be handled by the endio function of the
2774 			 * submitted repair bio.
2775 			 * Thus we don't need to do any thing here.
2776 			 */
2777 			continue;
2778 		}
2779 		/*
2780 		 * Continue on failed repair, otherwise the remaining sectors
2781 		 * will not be properly unlocked.
2782 		 */
2783 next:
2784 		end_sector_io(page, start + offset, uptodate);
2785 	}
2786 }
2787 
2788 /* lots and lots of room for performance fixes in the end_bio funcs */
2789 
2790 void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
2791 {
2792 	struct btrfs_inode *inode;
2793 	const bool uptodate = (err == 0);
2794 	int ret = 0;
2795 
2796 	ASSERT(page && page->mapping);
2797 	inode = BTRFS_I(page->mapping->host);
2798 	btrfs_writepage_endio_finish_ordered(inode, page, start, end, uptodate);
2799 
2800 	if (!uptodate) {
2801 		const struct btrfs_fs_info *fs_info = inode->root->fs_info;
2802 		u32 len;
2803 
2804 		ASSERT(end + 1 - start <= U32_MAX);
2805 		len = end + 1 - start;
2806 
2807 		btrfs_page_clear_uptodate(fs_info, page, start, len);
2808 		btrfs_page_set_error(fs_info, page, start, len);
2809 		ret = err < 0 ? err : -EIO;
2810 		mapping_set_error(page->mapping, ret);
2811 	}
2812 }
2813 
2814 /*
2815  * after a writepage IO is done, we need to:
2816  * clear the uptodate bits on error
2817  * clear the writeback bits in the extent tree for this IO
2818  * end_page_writeback if the page has no more pending IO
2819  *
2820  * Scheduling is not allowed, so the extent state tree is expected
2821  * to have one and only one object corresponding to this IO.
2822  */
2823 static void end_bio_extent_writepage(struct bio *bio)
2824 {
2825 	int error = blk_status_to_errno(bio->bi_status);
2826 	struct bio_vec *bvec;
2827 	u64 start;
2828 	u64 end;
2829 	struct bvec_iter_all iter_all;
2830 	bool first_bvec = true;
2831 
2832 	ASSERT(!bio_flagged(bio, BIO_CLONED));
2833 	bio_for_each_segment_all(bvec, bio, iter_all) {
2834 		struct page *page = bvec->bv_page;
2835 		struct inode *inode = page->mapping->host;
2836 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2837 		const u32 sectorsize = fs_info->sectorsize;
2838 
2839 		/* Our read/write should always be sector aligned. */
2840 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
2841 			btrfs_err(fs_info,
2842 		"partial page write in btrfs with offset %u and length %u",
2843 				  bvec->bv_offset, bvec->bv_len);
2844 		else if (!IS_ALIGNED(bvec->bv_len, sectorsize))
2845 			btrfs_info(fs_info,
2846 		"incomplete page write with offset %u and length %u",
2847 				   bvec->bv_offset, bvec->bv_len);
2848 
2849 		start = page_offset(page) + bvec->bv_offset;
2850 		end = start + bvec->bv_len - 1;
2851 
2852 		if (first_bvec) {
2853 			btrfs_record_physical_zoned(inode, start, bio);
2854 			first_bvec = false;
2855 		}
2856 
2857 		end_extent_writepage(page, error, start, end);
2858 
2859 		btrfs_page_clear_writeback(fs_info, page, start, bvec->bv_len);
2860 	}
2861 
2862 	bio_put(bio);
2863 }
2864 
2865 /*
2866  * Record previously processed extent range
2867  *
2868  * For endio_readpage_release_extent() to handle a full extent range, reducing
2869  * the extent io operations.
2870  */
2871 struct processed_extent {
2872 	struct btrfs_inode *inode;
2873 	/* Start of the range in @inode */
2874 	u64 start;
2875 	/* End of the range in @inode */
2876 	u64 end;
2877 	bool uptodate;
2878 };
2879 
2880 /*
2881  * Try to release processed extent range
2882  *
2883  * May not release the extent range right now if the current range is
2884  * contiguous to processed extent.
2885  *
2886  * Will release processed extent when any of @inode, @uptodate, the range is
2887  * no longer contiguous to the processed range.
2888  *
2889  * Passing @inode == NULL will force processed extent to be released.
2890  */
2891 static void endio_readpage_release_extent(struct processed_extent *processed,
2892 			      struct btrfs_inode *inode, u64 start, u64 end,
2893 			      bool uptodate)
2894 {
2895 	struct extent_state *cached = NULL;
2896 	struct extent_io_tree *tree;
2897 
2898 	/* The first extent, initialize @processed */
2899 	if (!processed->inode)
2900 		goto update;
2901 
2902 	/*
2903 	 * Contiguous to processed extent, just uptodate the end.
2904 	 *
2905 	 * Several things to notice:
2906 	 *
2907 	 * - bio can be merged as long as on-disk bytenr is contiguous
2908 	 *   This means we can have page belonging to other inodes, thus need to
2909 	 *   check if the inode still matches.
2910 	 * - bvec can contain range beyond current page for multi-page bvec
2911 	 *   Thus we need to do processed->end + 1 >= start check
2912 	 */
2913 	if (processed->inode == inode && processed->uptodate == uptodate &&
2914 	    processed->end + 1 >= start && end >= processed->end) {
2915 		processed->end = end;
2916 		return;
2917 	}
2918 
2919 	tree = &processed->inode->io_tree;
2920 	/*
2921 	 * Now we don't have range contiguous to the processed range, release
2922 	 * the processed range now.
2923 	 */
2924 	if (processed->uptodate && tree->track_uptodate)
2925 		set_extent_uptodate(tree, processed->start, processed->end,
2926 				    &cached, GFP_ATOMIC);
2927 	unlock_extent_cached_atomic(tree, processed->start, processed->end,
2928 				    &cached);
2929 
2930 update:
2931 	/* Update processed to current range */
2932 	processed->inode = inode;
2933 	processed->start = start;
2934 	processed->end = end;
2935 	processed->uptodate = uptodate;
2936 }
2937 
2938 static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page)
2939 {
2940 	ASSERT(PageLocked(page));
2941 	if (!btrfs_is_subpage(fs_info, page))
2942 		return;
2943 
2944 	ASSERT(PagePrivate(page));
2945 	btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE);
2946 }
2947 
2948 /*
2949  * Find extent buffer for a givne bytenr.
2950  *
2951  * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking
2952  * in endio context.
2953  */
2954 static struct extent_buffer *find_extent_buffer_readpage(
2955 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
2956 {
2957 	struct extent_buffer *eb;
2958 
2959 	/*
2960 	 * For regular sectorsize, we can use page->private to grab extent
2961 	 * buffer
2962 	 */
2963 	if (fs_info->nodesize >= PAGE_SIZE) {
2964 		ASSERT(PagePrivate(page) && page->private);
2965 		return (struct extent_buffer *)page->private;
2966 	}
2967 
2968 	/* For subpage case, we need to lookup buffer radix tree */
2969 	rcu_read_lock();
2970 	eb = radix_tree_lookup(&fs_info->buffer_radix,
2971 			       bytenr >> fs_info->sectorsize_bits);
2972 	rcu_read_unlock();
2973 	ASSERT(eb);
2974 	return eb;
2975 }
2976 
2977 /*
2978  * after a readpage IO is done, we need to:
2979  * clear the uptodate bits on error
2980  * set the uptodate bits if things worked
2981  * set the page up to date if all extents in the tree are uptodate
2982  * clear the lock bit in the extent tree
2983  * unlock the page if there are no other extents locked for it
2984  *
2985  * Scheduling is not allowed, so the extent state tree is expected
2986  * to have one and only one object corresponding to this IO.
2987  */
2988 static void end_bio_extent_readpage(struct bio *bio)
2989 {
2990 	struct bio_vec *bvec;
2991 	struct btrfs_bio *bbio = btrfs_bio(bio);
2992 	struct extent_io_tree *tree, *failure_tree;
2993 	struct processed_extent processed = { 0 };
2994 	/*
2995 	 * The offset to the beginning of a bio, since one bio can never be
2996 	 * larger than UINT_MAX, u32 here is enough.
2997 	 */
2998 	u32 bio_offset = 0;
2999 	int mirror;
3000 	struct bvec_iter_all iter_all;
3001 
3002 	ASSERT(!bio_flagged(bio, BIO_CLONED));
3003 	bio_for_each_segment_all(bvec, bio, iter_all) {
3004 		bool uptodate = !bio->bi_status;
3005 		struct page *page = bvec->bv_page;
3006 		struct inode *inode = page->mapping->host;
3007 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3008 		const u32 sectorsize = fs_info->sectorsize;
3009 		unsigned int error_bitmap = (unsigned int)-1;
3010 		bool repair = false;
3011 		u64 start;
3012 		u64 end;
3013 		u32 len;
3014 
3015 		btrfs_debug(fs_info,
3016 			"end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u",
3017 			bio->bi_iter.bi_sector, bio->bi_status,
3018 			bbio->mirror_num);
3019 		tree = &BTRFS_I(inode)->io_tree;
3020 		failure_tree = &BTRFS_I(inode)->io_failure_tree;
3021 
3022 		/*
3023 		 * We always issue full-sector reads, but if some block in a
3024 		 * page fails to read, blk_update_request() will advance
3025 		 * bv_offset and adjust bv_len to compensate.  Print a warning
3026 		 * for unaligned offsets, and an error if they don't add up to
3027 		 * a full sector.
3028 		 */
3029 		if (!IS_ALIGNED(bvec->bv_offset, sectorsize))
3030 			btrfs_err(fs_info,
3031 		"partial page read in btrfs with offset %u and length %u",
3032 				  bvec->bv_offset, bvec->bv_len);
3033 		else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len,
3034 				     sectorsize))
3035 			btrfs_info(fs_info,
3036 		"incomplete page read with offset %u and length %u",
3037 				   bvec->bv_offset, bvec->bv_len);
3038 
3039 		start = page_offset(page) + bvec->bv_offset;
3040 		end = start + bvec->bv_len - 1;
3041 		len = bvec->bv_len;
3042 
3043 		mirror = bbio->mirror_num;
3044 		if (likely(uptodate)) {
3045 			if (is_data_inode(inode)) {
3046 				error_bitmap = btrfs_verify_data_csum(bbio,
3047 						bio_offset, page, start, end);
3048 				if (error_bitmap)
3049 					uptodate = false;
3050 			} else {
3051 				if (btrfs_validate_metadata_buffer(bbio,
3052 						page, start, end, mirror))
3053 					uptodate = false;
3054 			}
3055 		}
3056 
3057 		if (likely(uptodate)) {
3058 			loff_t i_size = i_size_read(inode);
3059 			pgoff_t end_index = i_size >> PAGE_SHIFT;
3060 
3061 			clean_io_failure(BTRFS_I(inode)->root->fs_info,
3062 					 failure_tree, tree, start, page,
3063 					 btrfs_ino(BTRFS_I(inode)), 0);
3064 
3065 			/*
3066 			 * Zero out the remaining part if this range straddles
3067 			 * i_size.
3068 			 *
3069 			 * Here we should only zero the range inside the bvec,
3070 			 * not touch anything else.
3071 			 *
3072 			 * NOTE: i_size is exclusive while end is inclusive.
3073 			 */
3074 			if (page->index == end_index && i_size <= end) {
3075 				u32 zero_start = max(offset_in_page(i_size),
3076 						     offset_in_page(start));
3077 
3078 				zero_user_segment(page, zero_start,
3079 						  offset_in_page(end) + 1);
3080 			}
3081 		} else if (is_data_inode(inode)) {
3082 			/*
3083 			 * Only try to repair bios that actually made it to a
3084 			 * device.  If the bio failed to be submitted mirror
3085 			 * is 0 and we need to fail it without retrying.
3086 			 *
3087 			 * This also includes the high level bios for compressed
3088 			 * extents - these never make it to a device and repair
3089 			 * is already handled on the lower compressed bio.
3090 			 */
3091 			if (mirror > 0)
3092 				repair = true;
3093 		} else {
3094 			struct extent_buffer *eb;
3095 
3096 			eb = find_extent_buffer_readpage(fs_info, page, start);
3097 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
3098 			eb->read_mirror = mirror;
3099 			atomic_dec(&eb->io_pages);
3100 		}
3101 
3102 		if (repair) {
3103 			/*
3104 			 * submit_data_read_repair() will handle all the good
3105 			 * and bad sectors, we just continue to the next bvec.
3106 			 */
3107 			submit_data_read_repair(inode, bbio, bio_offset, bvec,
3108 						error_bitmap);
3109 		} else {
3110 			/* Update page status and unlock */
3111 			end_page_read(page, uptodate, start, len);
3112 			endio_readpage_release_extent(&processed, BTRFS_I(inode),
3113 					start, end, PageUptodate(page));
3114 		}
3115 
3116 		ASSERT(bio_offset + len > bio_offset);
3117 		bio_offset += len;
3118 
3119 	}
3120 	/* Release the last extent */
3121 	endio_readpage_release_extent(&processed, NULL, 0, 0, false);
3122 	btrfs_bio_free_csum(bbio);
3123 	bio_put(bio);
3124 }
3125 
3126 /**
3127  * Populate every free slot in a provided array with pages.
3128  *
3129  * @nr_pages:   number of pages to allocate
3130  * @page_array: the array to fill with pages; any existing non-null entries in
3131  * 		the array will be skipped
3132  *
3133  * Return: 0        if all pages were able to be allocated;
3134  *         -ENOMEM  otherwise, and the caller is responsible for freeing all
3135  *                  non-null page pointers in the array.
3136  */
3137 int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array)
3138 {
3139 	unsigned int allocated;
3140 
3141 	for (allocated = 0; allocated < nr_pages;) {
3142 		unsigned int last = allocated;
3143 
3144 		allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array);
3145 
3146 		if (allocated == nr_pages)
3147 			return 0;
3148 
3149 		/*
3150 		 * During this iteration, no page could be allocated, even
3151 		 * though alloc_pages_bulk_array() falls back to alloc_page()
3152 		 * if  it could not bulk-allocate. So we must be out of memory.
3153 		 */
3154 		if (allocated == last)
3155 			return -ENOMEM;
3156 
3157 		memalloc_retry_wait(GFP_NOFS);
3158 	}
3159 	return 0;
3160 }
3161 
3162 /*
3163  * Initialize the members up to but not including 'bio'. Use after allocating a
3164  * new bio by bio_alloc_bioset as it does not initialize the bytes outside of
3165  * 'bio' because use of __GFP_ZERO is not supported.
3166  */
3167 static inline void btrfs_bio_init(struct btrfs_bio *bbio)
3168 {
3169 	memset(bbio, 0, offsetof(struct btrfs_bio, bio));
3170 }
3171 
3172 /*
3173  * Allocate a btrfs_io_bio, with @nr_iovecs as maximum number of iovecs.
3174  *
3175  * The bio allocation is backed by bioset and does not fail.
3176  */
3177 struct bio *btrfs_bio_alloc(unsigned int nr_iovecs)
3178 {
3179 	struct bio *bio;
3180 
3181 	ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS);
3182 	bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset);
3183 	btrfs_bio_init(btrfs_bio(bio));
3184 	return bio;
3185 }
3186 
3187 struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size)
3188 {
3189 	struct bio *bio;
3190 	struct btrfs_bio *bbio;
3191 
3192 	ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
3193 
3194 	/* this will never fail when it's backed by a bioset */
3195 	bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
3196 	ASSERT(bio);
3197 
3198 	bbio = btrfs_bio(bio);
3199 	btrfs_bio_init(bbio);
3200 
3201 	bio_trim(bio, offset >> 9, size >> 9);
3202 	bbio->iter = bio->bi_iter;
3203 	return bio;
3204 }
3205 
3206 /**
3207  * Attempt to add a page to bio
3208  *
3209  * @bio_ctrl:	record both the bio, and its bio_flags
3210  * @page:	page to add to the bio
3211  * @disk_bytenr:  offset of the new bio or to check whether we are adding
3212  *                a contiguous page to the previous one
3213  * @size:	portion of page that we want to write
3214  * @pg_offset:	starting offset in the page
3215  * @compress_type:   compression type of the current bio to see if we can merge them
3216  *
3217  * Attempt to add a page to bio considering stripe alignment etc.
3218  *
3219  * Return >= 0 for the number of bytes added to the bio.
3220  * Can return 0 if the current bio is already at stripe/zone boundary.
3221  * Return <0 for error.
3222  */
3223 static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl,
3224 			      struct page *page,
3225 			      u64 disk_bytenr, unsigned int size,
3226 			      unsigned int pg_offset,
3227 			      enum btrfs_compression_type compress_type)
3228 {
3229 	struct bio *bio = bio_ctrl->bio;
3230 	u32 bio_size = bio->bi_iter.bi_size;
3231 	u32 real_size;
3232 	const sector_t sector = disk_bytenr >> SECTOR_SHIFT;
3233 	bool contig = false;
3234 	int ret;
3235 
3236 	ASSERT(bio);
3237 	/* The limit should be calculated when bio_ctrl->bio is allocated */
3238 	ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary);
3239 	if (bio_ctrl->compress_type != compress_type)
3240 		return 0;
3241 
3242 
3243 	if (bio->bi_iter.bi_size == 0) {
3244 		/* We can always add a page into an empty bio. */
3245 		contig = true;
3246 	} else if (bio_ctrl->compress_type == BTRFS_COMPRESS_NONE) {
3247 		struct bio_vec *bvec = bio_last_bvec_all(bio);
3248 
3249 		/*
3250 		 * The contig check requires the following conditions to be met:
3251 		 * 1) The pages are belonging to the same inode
3252 		 *    This is implied by the call chain.
3253 		 *
3254 		 * 2) The range has adjacent logical bytenr
3255 		 *
3256 		 * 3) The range has adjacent file offset
3257 		 *    This is required for the usage of btrfs_bio->file_offset.
3258 		 */
3259 		if (bio_end_sector(bio) == sector &&
3260 		    page_offset(bvec->bv_page) + bvec->bv_offset +
3261 		    bvec->bv_len == page_offset(page) + pg_offset)
3262 			contig = true;
3263 	} else {
3264 		/*
3265 		 * For compression, all IO should have its logical bytenr
3266 		 * set to the starting bytenr of the compressed extent.
3267 		 */
3268 		contig = bio->bi_iter.bi_sector == sector;
3269 	}
3270 
3271 	if (!contig)
3272 		return 0;
3273 
3274 	real_size = min(bio_ctrl->len_to_oe_boundary,
3275 			bio_ctrl->len_to_stripe_boundary) - bio_size;
3276 	real_size = min(real_size, size);
3277 
3278 	/*
3279 	 * If real_size is 0, never call bio_add_*_page(), as even size is 0,
3280 	 * bio will still execute its endio function on the page!
3281 	 */
3282 	if (real_size == 0)
3283 		return 0;
3284 
3285 	if (bio_op(bio) == REQ_OP_ZONE_APPEND)
3286 		ret = bio_add_zone_append_page(bio, page, real_size, pg_offset);
3287 	else
3288 		ret = bio_add_page(bio, page, real_size, pg_offset);
3289 
3290 	return ret;
3291 }
3292 
3293 static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl,
3294 			       struct btrfs_inode *inode, u64 file_offset)
3295 {
3296 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3297 	struct btrfs_io_geometry geom;
3298 	struct btrfs_ordered_extent *ordered;
3299 	struct extent_map *em;
3300 	u64 logical = (bio_ctrl->bio->bi_iter.bi_sector << SECTOR_SHIFT);
3301 	int ret;
3302 
3303 	/*
3304 	 * Pages for compressed extent are never submitted to disk directly,
3305 	 * thus it has no real boundary, just set them to U32_MAX.
3306 	 *
3307 	 * The split happens for real compressed bio, which happens in
3308 	 * btrfs_submit_compressed_read/write().
3309 	 */
3310 	if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) {
3311 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3312 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3313 		return 0;
3314 	}
3315 	em = btrfs_get_chunk_map(fs_info, logical, fs_info->sectorsize);
3316 	if (IS_ERR(em))
3317 		return PTR_ERR(em);
3318 	ret = btrfs_get_io_geometry(fs_info, em, btrfs_op(bio_ctrl->bio),
3319 				    logical, &geom);
3320 	free_extent_map(em);
3321 	if (ret < 0) {
3322 		return ret;
3323 	}
3324 	if (geom.len > U32_MAX)
3325 		bio_ctrl->len_to_stripe_boundary = U32_MAX;
3326 	else
3327 		bio_ctrl->len_to_stripe_boundary = (u32)geom.len;
3328 
3329 	if (bio_op(bio_ctrl->bio) != REQ_OP_ZONE_APPEND) {
3330 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3331 		return 0;
3332 	}
3333 
3334 	/* Ordered extent not yet created, so we're good */
3335 	ordered = btrfs_lookup_ordered_extent(inode, file_offset);
3336 	if (!ordered) {
3337 		bio_ctrl->len_to_oe_boundary = U32_MAX;
3338 		return 0;
3339 	}
3340 
3341 	bio_ctrl->len_to_oe_boundary = min_t(u32, U32_MAX,
3342 		ordered->disk_bytenr + ordered->disk_num_bytes - logical);
3343 	btrfs_put_ordered_extent(ordered);
3344 	return 0;
3345 }
3346 
3347 static int alloc_new_bio(struct btrfs_inode *inode,
3348 			 struct btrfs_bio_ctrl *bio_ctrl,
3349 			 struct writeback_control *wbc,
3350 			 blk_opf_t opf,
3351 			 bio_end_io_t end_io_func,
3352 			 u64 disk_bytenr, u32 offset, u64 file_offset,
3353 			 enum btrfs_compression_type compress_type)
3354 {
3355 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3356 	struct bio *bio;
3357 	int ret;
3358 
3359 	bio = btrfs_bio_alloc(BIO_MAX_VECS);
3360 	/*
3361 	 * For compressed page range, its disk_bytenr is always @disk_bytenr
3362 	 * passed in, no matter if we have added any range into previous bio.
3363 	 */
3364 	if (compress_type != BTRFS_COMPRESS_NONE)
3365 		bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
3366 	else
3367 		bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT;
3368 	bio_ctrl->bio = bio;
3369 	bio_ctrl->compress_type = compress_type;
3370 	bio->bi_end_io = end_io_func;
3371 	bio->bi_opf = opf;
3372 	ret = calc_bio_boundaries(bio_ctrl, inode, file_offset);
3373 	if (ret < 0)
3374 		goto error;
3375 
3376 	if (wbc) {
3377 		/*
3378 		 * For Zone append we need the correct block_device that we are
3379 		 * going to write to set in the bio to be able to respect the
3380 		 * hardware limitation.  Look it up here:
3381 		 */
3382 		if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
3383 			struct btrfs_device *dev;
3384 
3385 			dev = btrfs_zoned_get_device(fs_info, disk_bytenr,
3386 						     fs_info->sectorsize);
3387 			if (IS_ERR(dev)) {
3388 				ret = PTR_ERR(dev);
3389 				goto error;
3390 			}
3391 
3392 			bio_set_dev(bio, dev->bdev);
3393 		} else {
3394 			/*
3395 			 * Otherwise pick the last added device to support
3396 			 * cgroup writeback.  For multi-device file systems this
3397 			 * means blk-cgroup policies have to always be set on the
3398 			 * last added/replaced device.  This is a bit odd but has
3399 			 * been like that for a long time.
3400 			 */
3401 			bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev);
3402 		}
3403 		wbc_init_bio(wbc, bio);
3404 	} else {
3405 		ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND);
3406 	}
3407 	return 0;
3408 error:
3409 	bio_ctrl->bio = NULL;
3410 	bio->bi_status = errno_to_blk_status(ret);
3411 	bio_endio(bio);
3412 	return ret;
3413 }
3414 
3415 /*
3416  * @opf:	bio REQ_OP_* and REQ_* flags as one value
3417  * @wbc:	optional writeback control for io accounting
3418  * @page:	page to add to the bio
3419  * @disk_bytenr: logical bytenr where the write will be
3420  * @size:	portion of page that we want to write to
3421  * @pg_offset:	offset of the new bio or to check whether we are adding
3422  *              a contiguous page to the previous one
3423  * @bio_ret:	must be valid pointer, newly allocated bio will be stored there
3424  * @end_io_func:     end_io callback for new bio
3425  * @mirror_num:	     desired mirror to read/write
3426  * @prev_bio_flags:  flags of previous bio to see if we can merge the current one
3427  * @compress_type:   compress type for current bio
3428  */
3429 static int submit_extent_page(blk_opf_t opf,
3430 			      struct writeback_control *wbc,
3431 			      struct btrfs_bio_ctrl *bio_ctrl,
3432 			      struct page *page, u64 disk_bytenr,
3433 			      size_t size, unsigned long pg_offset,
3434 			      bio_end_io_t end_io_func,
3435 			      enum btrfs_compression_type compress_type,
3436 			      bool force_bio_submit)
3437 {
3438 	int ret = 0;
3439 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3440 	unsigned int cur = pg_offset;
3441 
3442 	ASSERT(bio_ctrl);
3443 
3444 	ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE &&
3445 	       pg_offset + size <= PAGE_SIZE);
3446 	if (force_bio_submit)
3447 		submit_one_bio(bio_ctrl);
3448 
3449 	while (cur < pg_offset + size) {
3450 		u32 offset = cur - pg_offset;
3451 		int added;
3452 
3453 		/* Allocate new bio if needed */
3454 		if (!bio_ctrl->bio) {
3455 			ret = alloc_new_bio(inode, bio_ctrl, wbc, opf,
3456 					    end_io_func, disk_bytenr, offset,
3457 					    page_offset(page) + cur,
3458 					    compress_type);
3459 			if (ret < 0)
3460 				return ret;
3461 		}
3462 		/*
3463 		 * We must go through btrfs_bio_add_page() to ensure each
3464 		 * page range won't cross various boundaries.
3465 		 */
3466 		if (compress_type != BTRFS_COMPRESS_NONE)
3467 			added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr,
3468 					size - offset, pg_offset + offset,
3469 					compress_type);
3470 		else
3471 			added = btrfs_bio_add_page(bio_ctrl, page,
3472 					disk_bytenr + offset, size - offset,
3473 					pg_offset + offset, compress_type);
3474 
3475 		/* Metadata page range should never be split */
3476 		if (!is_data_inode(&inode->vfs_inode))
3477 			ASSERT(added == 0 || added == size - offset);
3478 
3479 		/* At least we added some page, update the account */
3480 		if (wbc && added)
3481 			wbc_account_cgroup_owner(wbc, page, added);
3482 
3483 		/* We have reached boundary, submit right now */
3484 		if (added < size - offset) {
3485 			/* The bio should contain some page(s) */
3486 			ASSERT(bio_ctrl->bio->bi_iter.bi_size);
3487 			submit_one_bio(bio_ctrl);
3488 		}
3489 		cur += added;
3490 	}
3491 	return 0;
3492 }
3493 
3494 static int attach_extent_buffer_page(struct extent_buffer *eb,
3495 				     struct page *page,
3496 				     struct btrfs_subpage *prealloc)
3497 {
3498 	struct btrfs_fs_info *fs_info = eb->fs_info;
3499 	int ret = 0;
3500 
3501 	/*
3502 	 * If the page is mapped to btree inode, we should hold the private
3503 	 * lock to prevent race.
3504 	 * For cloned or dummy extent buffers, their pages are not mapped and
3505 	 * will not race with any other ebs.
3506 	 */
3507 	if (page->mapping)
3508 		lockdep_assert_held(&page->mapping->private_lock);
3509 
3510 	if (fs_info->nodesize >= PAGE_SIZE) {
3511 		if (!PagePrivate(page))
3512 			attach_page_private(page, eb);
3513 		else
3514 			WARN_ON(page->private != (unsigned long)eb);
3515 		return 0;
3516 	}
3517 
3518 	/* Already mapped, just free prealloc */
3519 	if (PagePrivate(page)) {
3520 		btrfs_free_subpage(prealloc);
3521 		return 0;
3522 	}
3523 
3524 	if (prealloc)
3525 		/* Has preallocated memory for subpage */
3526 		attach_page_private(page, prealloc);
3527 	else
3528 		/* Do new allocation to attach subpage */
3529 		ret = btrfs_attach_subpage(fs_info, page,
3530 					   BTRFS_SUBPAGE_METADATA);
3531 	return ret;
3532 }
3533 
3534 int set_page_extent_mapped(struct page *page)
3535 {
3536 	struct btrfs_fs_info *fs_info;
3537 
3538 	ASSERT(page->mapping);
3539 
3540 	if (PagePrivate(page))
3541 		return 0;
3542 
3543 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3544 
3545 	if (btrfs_is_subpage(fs_info, page))
3546 		return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA);
3547 
3548 	attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE);
3549 	return 0;
3550 }
3551 
3552 void clear_page_extent_mapped(struct page *page)
3553 {
3554 	struct btrfs_fs_info *fs_info;
3555 
3556 	ASSERT(page->mapping);
3557 
3558 	if (!PagePrivate(page))
3559 		return;
3560 
3561 	fs_info = btrfs_sb(page->mapping->host->i_sb);
3562 	if (btrfs_is_subpage(fs_info, page))
3563 		return btrfs_detach_subpage(fs_info, page);
3564 
3565 	detach_page_private(page);
3566 }
3567 
3568 static struct extent_map *
3569 __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
3570 		 u64 start, u64 len, struct extent_map **em_cached)
3571 {
3572 	struct extent_map *em;
3573 
3574 	if (em_cached && *em_cached) {
3575 		em = *em_cached;
3576 		if (extent_map_in_tree(em) && start >= em->start &&
3577 		    start < extent_map_end(em)) {
3578 			refcount_inc(&em->refs);
3579 			return em;
3580 		}
3581 
3582 		free_extent_map(em);
3583 		*em_cached = NULL;
3584 	}
3585 
3586 	em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len);
3587 	if (em_cached && !IS_ERR(em)) {
3588 		BUG_ON(*em_cached);
3589 		refcount_inc(&em->refs);
3590 		*em_cached = em;
3591 	}
3592 	return em;
3593 }
3594 /*
3595  * basic readpage implementation.  Locked extent state structs are inserted
3596  * into the tree that are removed when the IO is done (by the end_io
3597  * handlers)
3598  * XXX JDM: This needs looking at to ensure proper page locking
3599  * return 0 on success, otherwise return error
3600  */
3601 static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
3602 		      struct btrfs_bio_ctrl *bio_ctrl,
3603 		      blk_opf_t read_flags, u64 *prev_em_start)
3604 {
3605 	struct inode *inode = page->mapping->host;
3606 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3607 	u64 start = page_offset(page);
3608 	const u64 end = start + PAGE_SIZE - 1;
3609 	u64 cur = start;
3610 	u64 extent_offset;
3611 	u64 last_byte = i_size_read(inode);
3612 	u64 block_start;
3613 	u64 cur_end;
3614 	struct extent_map *em;
3615 	int ret = 0;
3616 	size_t pg_offset = 0;
3617 	size_t iosize;
3618 	size_t blocksize = inode->i_sb->s_blocksize;
3619 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
3620 
3621 	ret = set_page_extent_mapped(page);
3622 	if (ret < 0) {
3623 		unlock_extent(tree, start, end);
3624 		btrfs_page_set_error(fs_info, page, start, PAGE_SIZE);
3625 		unlock_page(page);
3626 		goto out;
3627 	}
3628 
3629 	if (page->index == last_byte >> PAGE_SHIFT) {
3630 		size_t zero_offset = offset_in_page(last_byte);
3631 
3632 		if (zero_offset) {
3633 			iosize = PAGE_SIZE - zero_offset;
3634 			memzero_page(page, zero_offset, iosize);
3635 		}
3636 	}
3637 	begin_page_read(fs_info, page);
3638 	while (cur <= end) {
3639 		unsigned long this_bio_flag = 0;
3640 		bool force_bio_submit = false;
3641 		u64 disk_bytenr;
3642 
3643 		ASSERT(IS_ALIGNED(cur, fs_info->sectorsize));
3644 		if (cur >= last_byte) {
3645 			struct extent_state *cached = NULL;
3646 
3647 			iosize = PAGE_SIZE - pg_offset;
3648 			memzero_page(page, pg_offset, iosize);
3649 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3650 					    &cached, GFP_NOFS);
3651 			unlock_extent_cached(tree, cur,
3652 					     cur + iosize - 1, &cached);
3653 			end_page_read(page, true, cur, iosize);
3654 			break;
3655 		}
3656 		em = __get_extent_map(inode, page, pg_offset, cur,
3657 				      end - cur + 1, em_cached);
3658 		if (IS_ERR(em)) {
3659 			unlock_extent(tree, cur, end);
3660 			end_page_read(page, false, cur, end + 1 - cur);
3661 			ret = PTR_ERR(em);
3662 			break;
3663 		}
3664 		extent_offset = cur - em->start;
3665 		BUG_ON(extent_map_end(em) <= cur);
3666 		BUG_ON(end < cur);
3667 
3668 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3669 			this_bio_flag = em->compress_type;
3670 
3671 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
3672 		cur_end = min(extent_map_end(em) - 1, end);
3673 		iosize = ALIGN(iosize, blocksize);
3674 		if (this_bio_flag != BTRFS_COMPRESS_NONE)
3675 			disk_bytenr = em->block_start;
3676 		else
3677 			disk_bytenr = em->block_start + extent_offset;
3678 		block_start = em->block_start;
3679 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3680 			block_start = EXTENT_MAP_HOLE;
3681 
3682 		/*
3683 		 * If we have a file range that points to a compressed extent
3684 		 * and it's followed by a consecutive file range that points
3685 		 * to the same compressed extent (possibly with a different
3686 		 * offset and/or length, so it either points to the whole extent
3687 		 * or only part of it), we must make sure we do not submit a
3688 		 * single bio to populate the pages for the 2 ranges because
3689 		 * this makes the compressed extent read zero out the pages
3690 		 * belonging to the 2nd range. Imagine the following scenario:
3691 		 *
3692 		 *  File layout
3693 		 *  [0 - 8K]                     [8K - 24K]
3694 		 *    |                               |
3695 		 *    |                               |
3696 		 * points to extent X,         points to extent X,
3697 		 * offset 4K, length of 8K     offset 0, length 16K
3698 		 *
3699 		 * [extent X, compressed length = 4K uncompressed length = 16K]
3700 		 *
3701 		 * If the bio to read the compressed extent covers both ranges,
3702 		 * it will decompress extent X into the pages belonging to the
3703 		 * first range and then it will stop, zeroing out the remaining
3704 		 * pages that belong to the other range that points to extent X.
3705 		 * So here we make sure we submit 2 bios, one for the first
3706 		 * range and another one for the third range. Both will target
3707 		 * the same physical extent from disk, but we can't currently
3708 		 * make the compressed bio endio callback populate the pages
3709 		 * for both ranges because each compressed bio is tightly
3710 		 * coupled with a single extent map, and each range can have
3711 		 * an extent map with a different offset value relative to the
3712 		 * uncompressed data of our extent and different lengths. This
3713 		 * is a corner case so we prioritize correctness over
3714 		 * non-optimal behavior (submitting 2 bios for the same extent).
3715 		 */
3716 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
3717 		    prev_em_start && *prev_em_start != (u64)-1 &&
3718 		    *prev_em_start != em->start)
3719 			force_bio_submit = true;
3720 
3721 		if (prev_em_start)
3722 			*prev_em_start = em->start;
3723 
3724 		free_extent_map(em);
3725 		em = NULL;
3726 
3727 		/* we've found a hole, just zero and go on */
3728 		if (block_start == EXTENT_MAP_HOLE) {
3729 			struct extent_state *cached = NULL;
3730 
3731 			memzero_page(page, pg_offset, iosize);
3732 
3733 			set_extent_uptodate(tree, cur, cur + iosize - 1,
3734 					    &cached, GFP_NOFS);
3735 			unlock_extent_cached(tree, cur,
3736 					     cur + iosize - 1, &cached);
3737 			end_page_read(page, true, cur, iosize);
3738 			cur = cur + iosize;
3739 			pg_offset += iosize;
3740 			continue;
3741 		}
3742 		/* the get_extent function already copied into the page */
3743 		if (test_range_bit(tree, cur, cur_end,
3744 				   EXTENT_UPTODATE, 1, NULL)) {
3745 			unlock_extent(tree, cur, cur + iosize - 1);
3746 			end_page_read(page, true, cur, iosize);
3747 			cur = cur + iosize;
3748 			pg_offset += iosize;
3749 			continue;
3750 		}
3751 		/* we have an inline extent but it didn't get marked up
3752 		 * to date.  Error out
3753 		 */
3754 		if (block_start == EXTENT_MAP_INLINE) {
3755 			unlock_extent(tree, cur, cur + iosize - 1);
3756 			end_page_read(page, false, cur, iosize);
3757 			cur = cur + iosize;
3758 			pg_offset += iosize;
3759 			continue;
3760 		}
3761 
3762 		ret = submit_extent_page(REQ_OP_READ | read_flags, NULL,
3763 					 bio_ctrl, page, disk_bytenr, iosize,
3764 					 pg_offset, end_bio_extent_readpage,
3765 					 this_bio_flag, force_bio_submit);
3766 		if (ret) {
3767 			/*
3768 			 * We have to unlock the remaining range, or the page
3769 			 * will never be unlocked.
3770 			 */
3771 			unlock_extent(tree, cur, end);
3772 			end_page_read(page, false, cur, end + 1 - cur);
3773 			goto out;
3774 		}
3775 		cur = cur + iosize;
3776 		pg_offset += iosize;
3777 	}
3778 out:
3779 	return ret;
3780 }
3781 
3782 int btrfs_read_folio(struct file *file, struct folio *folio)
3783 {
3784 	struct page *page = &folio->page;
3785 	struct btrfs_inode *inode = BTRFS_I(page->mapping->host);
3786 	u64 start = page_offset(page);
3787 	u64 end = start + PAGE_SIZE - 1;
3788 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
3789 	int ret;
3790 
3791 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3792 
3793 	ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL);
3794 	/*
3795 	 * If btrfs_do_readpage() failed we will want to submit the assembled
3796 	 * bio to do the cleanup.
3797 	 */
3798 	submit_one_bio(&bio_ctrl);
3799 	return ret;
3800 }
3801 
3802 static inline void contiguous_readpages(struct page *pages[], int nr_pages,
3803 					u64 start, u64 end,
3804 					struct extent_map **em_cached,
3805 					struct btrfs_bio_ctrl *bio_ctrl,
3806 					u64 *prev_em_start)
3807 {
3808 	struct btrfs_inode *inode = BTRFS_I(pages[0]->mapping->host);
3809 	int index;
3810 
3811 	btrfs_lock_and_flush_ordered_range(inode, start, end, NULL);
3812 
3813 	for (index = 0; index < nr_pages; index++) {
3814 		btrfs_do_readpage(pages[index], em_cached, bio_ctrl,
3815 				  REQ_RAHEAD, prev_em_start);
3816 		put_page(pages[index]);
3817 	}
3818 }
3819 
3820 /*
3821  * helper for __extent_writepage, doing all of the delayed allocation setup.
3822  *
3823  * This returns 1 if btrfs_run_delalloc_range function did all the work required
3824  * to write the page (copy into inline extent).  In this case the IO has
3825  * been started and the page is already unlocked.
3826  *
3827  * This returns 0 if all went well (page still locked)
3828  * This returns < 0 if there were errors (page still locked)
3829  */
3830 static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode,
3831 		struct page *page, struct writeback_control *wbc)
3832 {
3833 	const u64 page_end = page_offset(page) + PAGE_SIZE - 1;
3834 	u64 delalloc_start = page_offset(page);
3835 	u64 delalloc_to_write = 0;
3836 	/* How many pages are started by btrfs_run_delalloc_range() */
3837 	unsigned long nr_written = 0;
3838 	int ret;
3839 	int page_started = 0;
3840 
3841 	while (delalloc_start < page_end) {
3842 		u64 delalloc_end = page_end;
3843 		bool found;
3844 
3845 		found = find_lock_delalloc_range(&inode->vfs_inode, page,
3846 					       &delalloc_start,
3847 					       &delalloc_end);
3848 		if (!found) {
3849 			delalloc_start = delalloc_end + 1;
3850 			continue;
3851 		}
3852 		ret = btrfs_run_delalloc_range(inode, page, delalloc_start,
3853 				delalloc_end, &page_started, &nr_written, wbc);
3854 		if (ret) {
3855 			btrfs_page_set_error(inode->root->fs_info, page,
3856 					     page_offset(page), PAGE_SIZE);
3857 			return ret;
3858 		}
3859 		/*
3860 		 * delalloc_end is already one less than the total length, so
3861 		 * we don't subtract one from PAGE_SIZE
3862 		 */
3863 		delalloc_to_write += (delalloc_end - delalloc_start +
3864 				      PAGE_SIZE) >> PAGE_SHIFT;
3865 		delalloc_start = delalloc_end + 1;
3866 	}
3867 	if (wbc->nr_to_write < delalloc_to_write) {
3868 		int thresh = 8192;
3869 
3870 		if (delalloc_to_write < thresh * 2)
3871 			thresh = delalloc_to_write;
3872 		wbc->nr_to_write = min_t(u64, delalloc_to_write,
3873 					 thresh);
3874 	}
3875 
3876 	/* Did btrfs_run_dealloc_range() already unlock and start the IO? */
3877 	if (page_started) {
3878 		/*
3879 		 * We've unlocked the page, so we can't update the mapping's
3880 		 * writeback index, just update nr_to_write.
3881 		 */
3882 		wbc->nr_to_write -= nr_written;
3883 		return 1;
3884 	}
3885 
3886 	return 0;
3887 }
3888 
3889 /*
3890  * Find the first byte we need to write.
3891  *
3892  * For subpage, one page can contain several sectors, and
3893  * __extent_writepage_io() will just grab all extent maps in the page
3894  * range and try to submit all non-inline/non-compressed extents.
3895  *
3896  * This is a big problem for subpage, we shouldn't re-submit already written
3897  * data at all.
3898  * This function will lookup subpage dirty bit to find which range we really
3899  * need to submit.
3900  *
3901  * Return the next dirty range in [@start, @end).
3902  * If no dirty range is found, @start will be page_offset(page) + PAGE_SIZE.
3903  */
3904 static void find_next_dirty_byte(struct btrfs_fs_info *fs_info,
3905 				 struct page *page, u64 *start, u64 *end)
3906 {
3907 	struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
3908 	struct btrfs_subpage_info *spi = fs_info->subpage_info;
3909 	u64 orig_start = *start;
3910 	/* Declare as unsigned long so we can use bitmap ops */
3911 	unsigned long flags;
3912 	int range_start_bit;
3913 	int range_end_bit;
3914 
3915 	/*
3916 	 * For regular sector size == page size case, since one page only
3917 	 * contains one sector, we return the page offset directly.
3918 	 */
3919 	if (!btrfs_is_subpage(fs_info, page)) {
3920 		*start = page_offset(page);
3921 		*end = page_offset(page) + PAGE_SIZE;
3922 		return;
3923 	}
3924 
3925 	range_start_bit = spi->dirty_offset +
3926 			  (offset_in_page(orig_start) >> fs_info->sectorsize_bits);
3927 
3928 	/* We should have the page locked, but just in case */
3929 	spin_lock_irqsave(&subpage->lock, flags);
3930 	bitmap_next_set_region(subpage->bitmaps, &range_start_bit, &range_end_bit,
3931 			       spi->dirty_offset + spi->bitmap_nr_bits);
3932 	spin_unlock_irqrestore(&subpage->lock, flags);
3933 
3934 	range_start_bit -= spi->dirty_offset;
3935 	range_end_bit -= spi->dirty_offset;
3936 
3937 	*start = page_offset(page) + range_start_bit * fs_info->sectorsize;
3938 	*end = page_offset(page) + range_end_bit * fs_info->sectorsize;
3939 }
3940 
/*
 * Helper for __extent_writepage().  This calls the writepage start hook
 * (COW fixup) and then loops over the page, mapping each dirty range to an
 * extent and adding it to the write bio kept in @epd.
 *
 * @inode:	inode that owns @page
 * @page:	locked page to write back
 * @wbc:	writeback control; nr_to_write is decremented once for the page
 * @epd:	write context, its bio_ctrl receives the submitted ranges
 * @i_size:	inode size as sampled by the caller; ranges starting at or
 *		beyond it are not written
 * @nr_ret:	on return, number of ranges handed to submit_extent_page()
 *		(failed attempts included) plus the number of compressed
 *		ranges found.  Not written on the "return 1" path.
 *
 * Return 1 if btrfs_writepage_cow_fixup() returned non-zero: the page has
 * been redirtied and unlocked here.
 * Return 0 if all went well (page still locked).
 * Return <0 if there were errors (page still locked).  The first error hit
 * is the one returned.
 */
static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
				 struct page *page,
				 struct writeback_control *wbc,
				 struct extent_page_data *epd,
				 loff_t i_size,
				 int *nr_ret)
{
	struct btrfs_fs_info *fs_info = inode->root->fs_info;
	u64 cur = page_offset(page);
	u64 end = cur + PAGE_SIZE - 1;
	u64 extent_offset;
	u64 block_start;
	struct extent_map *em;
	int saved_ret = 0;
	int ret = 0;
	int nr = 0;
	/*
	 * Declared outside the loop and only ever switched to
	 * REQ_OP_ZONE_APPEND below, never back: once one range of the page
	 * uses zone append, every later range of this page does too.
	 */
	enum req_op op = REQ_OP_WRITE;
	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
	bool has_error = false;
	bool compressed;

	ret = btrfs_writepage_cow_fixup(page);
	if (ret) {
		/* Fixup worker will requeue */
		redirty_page_for_writepage(wbc, page);
		unlock_page(page);
		return 1;
	}

	/*
	 * We don't want to touch the inode after unlocking the page, so the
	 * page is accounted against the writeback budget now, before any
	 * range is submitted.
	 */
	wbc->nr_to_write--;

	/* @cur and @end are file offsets; @end is inclusive. */
	while (cur <= end) {
		u64 disk_bytenr;
		u64 em_end;
		/* In/out for find_next_dirty_byte(): search starts at @cur */
		u64 dirty_range_start = cur;
		u64 dirty_range_end;
		u32 iosize;

		if (cur >= i_size) {
			btrfs_writepage_endio_finish_ordered(inode, page, cur,
							     end, true);
			/*
			 * This range is beyond i_size, thus we don't need to
			 * bother writing back.
			 * But we still need to clear the dirty subpage bit, or
			 * the next time the page gets dirtied, we will try to
			 * writeback the sectors with subpage dirty bits,
			 * causing writeback without ordered extent.
			 */
			btrfs_page_clear_dirty(fs_info, page, cur, end + 1 - cur);
			break;
		}

		find_next_dirty_byte(fs_info, page, &dirty_range_start,
				     &dirty_range_end);
		/* Skip the clean gap in front of the next dirty range */
		if (cur < dirty_range_start) {
			cur = dirty_range_start;
			continue;
		}

		em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1);
		if (IS_ERR(em)) {
			/* The whole rest of the page is marked as failed */
			btrfs_page_set_error(fs_info, page, cur, end - cur + 1);
			ret = PTR_ERR_OR_ZERO(em);
			has_error = true;
			if (!saved_ret)
				saved_ret = ret;
			break;
		}

		extent_offset = cur - em->start;
		em_end = extent_map_end(em);
		ASSERT(cur <= em_end);
		ASSERT(cur < end);
		ASSERT(IS_ALIGNED(em->start, fs_info->sectorsize));
		ASSERT(IS_ALIGNED(em->len, fs_info->sectorsize));
		block_start = em->block_start;
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		disk_bytenr = em->block_start + extent_offset;

		/*
		 * Note that em_end from extent_map_end() and dirty_range_end from
		 * find_next_dirty_byte() are all exclusive
		 */
		iosize = min(min(em_end, end + 1), dirty_range_end) - cur;

		if (btrfs_use_zone_append(inode, em->block_start))
			op = REQ_OP_ZONE_APPEND;

		/*
		 * Everything needed from @em has been copied into locals
		 * above, so the reference can be dropped before submission.
		 */
		free_extent_map(em);
		em = NULL;

		/*
		 * compressed and inline extents are written through other
		 * paths in the FS
		 */
		if (compressed || block_start == EXTENT_MAP_HOLE ||
		    block_start == EXTENT_MAP_INLINE) {
			/*
			 * Compressed ranges still count in @nr; hole and
			 * inline ranges get their ordered extent finished
			 * here instead.
			 */
			if (compressed)
				nr++;
			else
				btrfs_writepage_endio_finish_ordered(inode,
						page, cur, cur + iosize - 1, true);
			btrfs_page_clear_dirty(fs_info, page, cur, iosize);
			cur += iosize;
			continue;
		}

		btrfs_set_range_writeback(inode, cur, cur + iosize - 1);
		/* Only reported, the range is still submitted below */
		if (!PageWriteback(page)) {
			btrfs_err(inode->root->fs_info,
				   "page %lu not writeback, cur %llu end %llu",
			       page->index, cur, end);
		}

		/*
		 * Although the PageDirty bit is cleared before entering this
		 * function, subpage dirty bit is not cleared.
		 * So clear subpage dirty bit here so next time we won't submit
		 * page for range already written to disk.
		 */
		btrfs_page_clear_dirty(fs_info, page, cur, iosize);

		ret = submit_extent_page(op | write_flags, wbc,
					 &epd->bio_ctrl, page,
					 disk_bytenr, iosize,
					 cur - page_offset(page),
					 end_bio_extent_writepage,
					 0, false);
		if (ret) {
			/*
			 * Remember the first failure but keep going with the
			 * remaining ranges of the page.
			 */
			has_error = true;
			if (!saved_ret)
				saved_ret = ret;

			btrfs_page_set_error(fs_info, page, cur, iosize);
			if (PageWriteback(page))
				btrfs_page_clear_writeback(fs_info, page, cur,
							   iosize);
		}

		cur += iosize;
		nr++;
	}
	/*
	 * If we finish without problem, we should not only clear page dirty,
	 * but also empty subpage dirty bits
	 */
	if (!has_error)
		btrfs_page_assert_not_dirty(fs_info, page);
	else
		ret = saved_ret;
	*nr_ret = nr;
	return ret;
}
4107 
4108 /*
4109  * the writepage semantics are similar to regular writepage.  extent
4110  * records are inserted to lock ranges in the tree, and as dirty areas
4111  * are found, they are marked writeback.  Then the lock bits are removed
4112  * and the end_io handler clears the writeback ranges
4113  *
4114  * Return 0 if everything goes well.
4115  * Return <0 for error.
4116  */
4117 static int __extent_writepage(struct page *page, struct writeback_control *wbc,
4118 			      struct extent_page_data *epd)
4119 {
4120 	struct folio *folio = page_folio(page);
4121 	struct inode *inode = page->mapping->host;
4122 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4123 	const u64 page_start = page_offset(page);
4124 	const u64 page_end = page_start + PAGE_SIZE - 1;
4125 	int ret;
4126 	int nr = 0;
4127 	size_t pg_offset;
4128 	loff_t i_size = i_size_read(inode);
4129 	unsigned long end_index = i_size >> PAGE_SHIFT;
4130 
4131 	trace___extent_writepage(page, inode, wbc);
4132 
4133 	WARN_ON(!PageLocked(page));
4134 
4135 	btrfs_page_clear_error(btrfs_sb(inode->i_sb), page,
4136 			       page_offset(page), PAGE_SIZE);
4137 
4138 	pg_offset = offset_in_page(i_size);
4139 	if (page->index > end_index ||
4140 	   (page->index == end_index && !pg_offset)) {
4141 		folio_invalidate(folio, 0, folio_size(folio));
4142 		folio_unlock(folio);
4143 		return 0;
4144 	}
4145 
4146 	if (page->index == end_index)
4147 		memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
4148 
4149 	ret = set_page_extent_mapped(page);
4150 	if (ret < 0) {
4151 		SetPageError(page);
4152 		goto done;
4153 	}
4154 
4155 	if (!epd->extent_locked) {
4156 		ret = writepage_delalloc(BTRFS_I(inode), page, wbc);
4157 		if (ret == 1)
4158 			return 0;
4159 		if (ret)
4160 			goto done;
4161 	}
4162 
4163 	ret = __extent_writepage_io(BTRFS_I(inode), page, wbc, epd, i_size,
4164 				    &nr);
4165 	if (ret == 1)
4166 		return 0;
4167 
4168 done:
4169 	if (nr == 0) {
4170 		/* make sure the mapping tag for page dirty gets cleared */
4171 		set_page_writeback(page);
4172 		end_page_writeback(page);
4173 	}
4174 	/*
4175 	 * Here we used to have a check for PageError() and then set @ret and
4176 	 * call end_extent_writepage().
4177 	 *
4178 	 * But in fact setting @ret here will cause different error paths
4179 	 * between subpage and regular sectorsize.
4180 	 *
4181 	 * For regular page size, we never submit current page, but only add
4182 	 * current page to current bio.
4183 	 * The bio submission can only happen in next page.
4184 	 * Thus if we hit the PageError() branch, @ret is already set to
4185 	 * non-zero value and will not get updated for regular sectorsize.
4186 	 *
4187 	 * But for subpage case, it's possible we submit part of current page,
4188 	 * thus can get PageError() set by submitted bio of the same page,
4189 	 * while our @ret is still 0.
4190 	 *
4191 	 * So here we unify the behavior and don't set @ret.
4192 	 * Error can still be properly passed to higher layer as page will
4193 	 * be set error, here we just don't handle the IO failure.
4194 	 *
4195 	 * NOTE: This is just a hotfix for subpage.
4196 	 * The root fix will be properly ending ordered extent when we hit
4197 	 * an error during writeback.
4198 	 *
4199 	 * But that needs a bigger refactoring, as we not only need to grab the
4200 	 * submitted OE, but also need to know exactly at which bytenr we hit
4201 	 * the error.
4202 	 * Currently the full page based __extent_writepage_io() is not
4203 	 * capable of that.
4204 	 */
4205 	if (PageError(page))
4206 		end_extent_writepage(page, ret, page_start, page_end);
4207 	if (epd->extent_locked) {
4208 		/*
4209 		 * If epd->extent_locked, it's from extent_write_locked_range(),
4210 		 * the page can either be locked by lock_page() or
4211 		 * process_one_page().
4212 		 * Let btrfs_page_unlock_writer() handle both cases.
4213 		 */
4214 		ASSERT(wbc);
4215 		btrfs_page_unlock_writer(fs_info, page, wbc->range_start,
4216 					 wbc->range_end + 1 - wbc->range_start);
4217 	} else {
4218 		unlock_page(page);
4219 	}
4220 	ASSERT(ret <= 0);
4221 	return ret;
4222 }
4223 
4224 void wait_on_extent_buffer_writeback(struct extent_buffer *eb)
4225 {
4226 	wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_WRITEBACK,
4227 		       TASK_UNINTERRUPTIBLE);
4228 }
4229 
/*
 * Mark writeback of @eb as finished: clear EXTENT_BUFFER_WRITEBACK and wake
 * up whoever waits on that bit, e.g. in wait_on_extent_buffer_writeback().
 *
 * The three statements must stay in this order.
 */
static void end_extent_buffer_writeback(struct extent_buffer *eb)
{
	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
	/*
	 * Barrier between the bit update and the wake-up; see the kernel-doc
	 * of wake_up_bit() for why one is required before calling it.
	 */
	smp_mb__after_atomic();
	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
}
4236 
4237 /*
4238  * Lock extent buffer status and pages for writeback.
4239  *
4240  * May try to flush write bio if we can't get the lock.
4241  *
4242  * Return  0 if the extent buffer doesn't need to be submitted.
4243  *           (E.g. the extent buffer is not dirty)
4244  * Return >0 is the extent buffer is submitted to bio.
4245  * Return <0 if something went wrong, no page is locked.
4246  */
4247 static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb,
4248 			  struct extent_page_data *epd)
4249 {
4250 	struct btrfs_fs_info *fs_info = eb->fs_info;
4251 	int i, num_pages;
4252 	int flush = 0;
4253 	int ret = 0;
4254 
4255 	if (!btrfs_try_tree_write_lock(eb)) {
4256 		submit_write_bio(epd, 0);
4257 		flush = 1;
4258 		btrfs_tree_lock(eb);
4259 	}
4260 
4261 	if (test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)) {
4262 		btrfs_tree_unlock(eb);
4263 		if (!epd->sync_io)
4264 			return 0;
4265 		if (!flush) {
4266 			submit_write_bio(epd, 0);
4267 			flush = 1;
4268 		}
4269 		while (1) {
4270 			wait_on_extent_buffer_writeback(eb);
4271 			btrfs_tree_lock(eb);
4272 			if (!test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags))
4273 				break;
4274 			btrfs_tree_unlock(eb);
4275 		}
4276 	}
4277 
4278 	/*
4279 	 * We need to do this to prevent races in people who check if the eb is
4280 	 * under IO since we can end up having no IO bits set for a short period
4281 	 * of time.
4282 	 */
4283 	spin_lock(&eb->refs_lock);
4284 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
4285 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
4286 		spin_unlock(&eb->refs_lock);
4287 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
4288 		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4289 					 -eb->len,
4290 					 fs_info->dirty_metadata_batch);
4291 		ret = 1;
4292 	} else {
4293 		spin_unlock(&eb->refs_lock);
4294 	}
4295 
4296 	btrfs_tree_unlock(eb);
4297 
4298 	/*
4299 	 * Either we don't need to submit any tree block, or we're submitting
4300 	 * subpage eb.
4301 	 * Subpage metadata doesn't use page locking at all, so we can skip
4302 	 * the page locking.
4303 	 */
4304 	if (!ret || fs_info->nodesize < PAGE_SIZE)
4305 		return ret;
4306 
4307 	num_pages = num_extent_pages(eb);
4308 	for (i = 0; i < num_pages; i++) {
4309 		struct page *p = eb->pages[i];
4310 
4311 		if (!trylock_page(p)) {
4312 			if (!flush) {
4313 				submit_write_bio(epd, 0);
4314 				flush = 1;
4315 			}
4316 			lock_page(p);
4317 		}
4318 	}
4319 
4320 	return ret;
4321 }
4322 
4323 static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
4324 {
4325 	struct btrfs_fs_info *fs_info = eb->fs_info;
4326 
4327 	btrfs_page_set_error(fs_info, page, eb->start, eb->len);
4328 	if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
4329 		return;
4330 
4331 	/*
4332 	 * A read may stumble upon this buffer later, make sure that it gets an
4333 	 * error and knows there was an error.
4334 	 */
4335 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
4336 
4337 	/*
4338 	 * We need to set the mapping with the io error as well because a write
4339 	 * error will flip the file system readonly, and then syncfs() will
4340 	 * return a 0 because we are readonly if we don't modify the err seq for
4341 	 * the superblock.
4342 	 */
4343 	mapping_set_error(page->mapping, -EIO);
4344 
4345 	/*
4346 	 * If we error out, we should add back the dirty_metadata_bytes
4347 	 * to make it consistent.
4348 	 */
4349 	percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
4350 				 eb->len, fs_info->dirty_metadata_batch);
4351 
4352 	/*
4353 	 * If writeback for a btree extent that doesn't belong to a log tree
4354 	 * failed, increment the counter transaction->eb_write_errors.
4355 	 * We do this because while the transaction is running and before it's
4356 	 * committing (when we call filemap_fdata[write|wait]_range against
4357 	 * the btree inode), we might have
4358 	 * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it
4359 	 * returns an error or an error happens during writeback, when we're
4360 	 * committing the transaction we wouldn't know about it, since the pages
4361 	 * can be no longer dirty nor marked anymore for writeback (if a
4362 	 * subsequent modification to the extent buffer didn't happen before the
4363 	 * transaction commit), which makes filemap_fdata[write|wait]_range not
4364 	 * able to find the pages tagged with SetPageError at transaction
4365 	 * commit time. So if this happens we must abort the transaction,
4366 	 * otherwise we commit a super block with btree roots that point to
4367 	 * btree nodes/leafs whose content on disk is invalid - either garbage
4368 	 * or the content of some node/leaf from a past generation that got
4369 	 * cowed or deleted and is no longer valid.
4370 	 *
4371 	 * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would
4372 	 * not be enough - we need to distinguish between log tree extents vs
4373 	 * non-log tree extents, and the next filemap_fdatawait_range() call
4374 	 * will catch and clear such errors in the mapping - and that call might
4375 	 * be from a log sync and not from a transaction commit. Also, checking
4376 	 * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is
4377 	 * not done and would not be reliable - the eb might have been released
4378 	 * from memory and reading it back again means that flag would not be
4379 	 * set (since it's a runtime flag, not persisted on disk).
4380 	 *
4381 	 * Using the flags below in the btree inode also makes us achieve the
4382 	 * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started
4383 	 * writeback for all dirty pages and before filemap_fdatawait_range()
4384 	 * is called, the writeback for all dirty pages had already finished
4385 	 * with errors - because we were not using AS_EIO/AS_ENOSPC,
4386 	 * filemap_fdatawait_range() would return success, as it could not know
4387 	 * that writeback errors happened (the pages were no longer tagged for
4388 	 * writeback).
4389 	 */
4390 	switch (eb->log_index) {
4391 	case -1:
4392 		set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
4393 		break;
4394 	case 0:
4395 		set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
4396 		break;
4397 	case 1:
4398 		set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
4399 		break;
4400 	default:
4401 		BUG(); /* unexpected, logic error */
4402 	}
4403 }
4404 
4405 /*
4406  * The endio specific version which won't touch any unsafe spinlock in endio
4407  * context.
4408  */
4409 static struct extent_buffer *find_extent_buffer_nolock(
4410 		struct btrfs_fs_info *fs_info, u64 start)
4411 {
4412 	struct extent_buffer *eb;
4413 
4414 	rcu_read_lock();
4415 	eb = radix_tree_lookup(&fs_info->buffer_radix,
4416 			       start >> fs_info->sectorsize_bits);
4417 	if (eb && atomic_inc_not_zero(&eb->refs)) {
4418 		rcu_read_unlock();
4419 		return eb;
4420 	}
4421 	rcu_read_unlock();
4422 	return NULL;
4423 }
4424 
4425 /*
4426  * The endio function for subpage extent buffer write.
4427  *
4428  * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
4429  * after all extent buffers in the page has finished their writeback.
4430  */
4431 static void end_bio_subpage_eb_writepage(struct bio *bio)
4432 {
4433 	struct btrfs_fs_info *fs_info;
4434 	struct bio_vec *bvec;
4435 	struct bvec_iter_all iter_all;
4436 
4437 	fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
4438 	ASSERT(fs_info->nodesize < PAGE_SIZE);
4439 
4440 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4441 	bio_for_each_segment_all(bvec, bio, iter_all) {
4442 		struct page *page = bvec->bv_page;
4443 		u64 bvec_start = page_offset(page) + bvec->bv_offset;
4444 		u64 bvec_end = bvec_start + bvec->bv_len - 1;
4445 		u64 cur_bytenr = bvec_start;
4446 
4447 		ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
4448 
4449 		/* Iterate through all extent buffers in the range */
4450 		while (cur_bytenr <= bvec_end) {
4451 			struct extent_buffer *eb;
4452 			int done;
4453 
4454 			/*
4455 			 * Here we can't use find_extent_buffer(), as it may
4456 			 * try to lock eb->refs_lock, which is not safe in endio
4457 			 * context.
4458 			 */
4459 			eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
4460 			ASSERT(eb);
4461 
4462 			cur_bytenr = eb->start + eb->len;
4463 
4464 			ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
4465 			done = atomic_dec_and_test(&eb->io_pages);
4466 			ASSERT(done);
4467 
4468 			if (bio->bi_status ||
4469 			    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4470 				ClearPageUptodate(page);
4471 				set_btree_ioerr(page, eb);
4472 			}
4473 
4474 			btrfs_subpage_clear_writeback(fs_info, page, eb->start,
4475 						      eb->len);
4476 			end_extent_buffer_writeback(eb);
4477 			/*
4478 			 * free_extent_buffer() will grab spinlock which is not
4479 			 * safe in endio context. Thus here we manually dec
4480 			 * the ref.
4481 			 */
4482 			atomic_dec(&eb->refs);
4483 		}
4484 	}
4485 	bio_put(bio);
4486 }
4487 
4488 static void end_bio_extent_buffer_writepage(struct bio *bio)
4489 {
4490 	struct bio_vec *bvec;
4491 	struct extent_buffer *eb;
4492 	int done;
4493 	struct bvec_iter_all iter_all;
4494 
4495 	ASSERT(!bio_flagged(bio, BIO_CLONED));
4496 	bio_for_each_segment_all(bvec, bio, iter_all) {
4497 		struct page *page = bvec->bv_page;
4498 
4499 		eb = (struct extent_buffer *)page->private;
4500 		BUG_ON(!eb);
4501 		done = atomic_dec_and_test(&eb->io_pages);
4502 
4503 		if (bio->bi_status ||
4504 		    test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
4505 			ClearPageUptodate(page);
4506 			set_btree_ioerr(page, eb);
4507 		}
4508 
4509 		end_page_writeback(page);
4510 
4511 		if (!done)
4512 			continue;
4513 
4514 		end_extent_buffer_writeback(eb);
4515 	}
4516 
4517 	bio_put(bio);
4518 }
4519 
4520 static void prepare_eb_write(struct extent_buffer *eb)
4521 {
4522 	u32 nritems;
4523 	unsigned long start;
4524 	unsigned long end;
4525 
4526 	clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags);
4527 	atomic_set(&eb->io_pages, num_extent_pages(eb));
4528 
4529 	/* Set btree blocks beyond nritems with 0 to avoid stale content */
4530 	nritems = btrfs_header_nritems(eb);
4531 	if (btrfs_header_level(eb) > 0) {
4532 		end = btrfs_node_key_ptr_offset(nritems);
4533 		memzero_extent_buffer(eb, end, eb->len - end);
4534 	} else {
4535 		/*
4536 		 * Leaf:
4537 		 * header 0 1 2 .. N ... data_N .. data_2 data_1 data_0
4538 		 */
4539 		start = btrfs_item_nr_offset(nritems);
4540 		end = BTRFS_LEAF_DATA_OFFSET + leaf_data_end(eb);
4541 		memzero_extent_buffer(eb, start, end - start);
4542 	}
4543 }
4544 
4545 /*
4546  * Unlike the work in write_one_eb(), we rely completely on extent locking.
4547  * Page locking is only utilized at minimum to keep the VMM code happy.
4548  */
4549 static int write_one_subpage_eb(struct extent_buffer *eb,
4550 				struct writeback_control *wbc,
4551 				struct extent_page_data *epd)
4552 {
4553 	struct btrfs_fs_info *fs_info = eb->fs_info;
4554 	struct page *page = eb->pages[0];
4555 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4556 	bool no_dirty_ebs = false;
4557 	int ret;
4558 
4559 	prepare_eb_write(eb);
4560 
4561 	/* clear_page_dirty_for_io() in subpage helper needs page locked */
4562 	lock_page(page);
4563 	btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
4564 
4565 	/* Check if this is the last dirty bit to update nr_written */
4566 	no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
4567 							  eb->start, eb->len);
4568 	if (no_dirty_ebs)
4569 		clear_page_dirty_for_io(page);
4570 
4571 	ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4572 			&epd->bio_ctrl, page, eb->start, eb->len,
4573 			eb->start - page_offset(page),
4574 			end_bio_subpage_eb_writepage, 0, false);
4575 	if (ret) {
4576 		btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
4577 		set_btree_ioerr(page, eb);
4578 		unlock_page(page);
4579 
4580 		if (atomic_dec_and_test(&eb->io_pages))
4581 			end_extent_buffer_writeback(eb);
4582 		return -EIO;
4583 	}
4584 	unlock_page(page);
4585 	/*
4586 	 * Submission finished without problem, if no range of the page is
4587 	 * dirty anymore, we have submitted a page.  Update nr_written in wbc.
4588 	 */
4589 	if (no_dirty_ebs)
4590 		wbc->nr_to_write--;
4591 	return ret;
4592 }
4593 
4594 static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
4595 			struct writeback_control *wbc,
4596 			struct extent_page_data *epd)
4597 {
4598 	u64 disk_bytenr = eb->start;
4599 	int i, num_pages;
4600 	blk_opf_t write_flags = wbc_to_write_flags(wbc);
4601 	int ret = 0;
4602 
4603 	prepare_eb_write(eb);
4604 
4605 	num_pages = num_extent_pages(eb);
4606 	for (i = 0; i < num_pages; i++) {
4607 		struct page *p = eb->pages[i];
4608 
4609 		clear_page_dirty_for_io(p);
4610 		set_page_writeback(p);
4611 		ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc,
4612 					 &epd->bio_ctrl, p, disk_bytenr,
4613 					 PAGE_SIZE, 0,
4614 					 end_bio_extent_buffer_writepage,
4615 					 0, false);
4616 		if (ret) {
4617 			set_btree_ioerr(p, eb);
4618 			if (PageWriteback(p))
4619 				end_page_writeback(p);
4620 			if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
4621 				end_extent_buffer_writeback(eb);
4622 			ret = -EIO;
4623 			break;
4624 		}
4625 		disk_bytenr += PAGE_SIZE;
4626 		wbc->nr_to_write--;
4627 		unlock_page(p);
4628 	}
4629 
4630 	if (unlikely(ret)) {
4631 		for (; i < num_pages; i++) {
4632 			struct page *p = eb->pages[i];
4633 			clear_page_dirty_for_io(p);
4634 			unlock_page(p);
4635 		}
4636 	}
4637 
4638 	return ret;
4639 }
4640 
4641 /*
4642  * Submit one subpage btree page.
4643  *
4644  * The main difference to submit_eb_page() is:
4645  * - Page locking
4646  *   For subpage, we don't rely on page locking at all.
4647  *
4648  * - Flush write bio
4649  *   We only flush bio if we may be unable to fit current extent buffers into
4650  *   current bio.
4651  *
4652  * Return >=0 for the number of submitted extent buffers.
4653  * Return <0 for fatal error.
4654  */
4655 static int submit_eb_subpage(struct page *page,
4656 			     struct writeback_control *wbc,
4657 			     struct extent_page_data *epd)
4658 {
4659 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
4660 	int submitted = 0;
4661 	u64 page_start = page_offset(page);
4662 	int bit_start = 0;
4663 	int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
4664 	int ret;
4665 
4666 	/* Lock and write each dirty extent buffers in the range */
4667 	while (bit_start < fs_info->subpage_info->bitmap_nr_bits) {
4668 		struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
4669 		struct extent_buffer *eb;
4670 		unsigned long flags;
4671 		u64 start;
4672 
4673 		/*
4674 		 * Take private lock to ensure the subpage won't be detached
4675 		 * in the meantime.
4676 		 */
4677 		spin_lock(&page->mapping->private_lock);
4678 		if (!PagePrivate(page)) {
4679 			spin_unlock(&page->mapping->private_lock);
4680 			break;
4681 		}
4682 		spin_lock_irqsave(&subpage->lock, flags);
4683 		if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset,
4684 			      subpage->bitmaps)) {
4685 			spin_unlock_irqrestore(&subpage->lock, flags);
4686 			spin_unlock(&page->mapping->private_lock);
4687 			bit_start++;
4688 			continue;
4689 		}
4690 
4691 		start = page_start + bit_start * fs_info->sectorsize;
4692 		bit_start += sectors_per_node;
4693 
4694 		/*
4695 		 * Here we just want to grab the eb without touching extra
4696 		 * spin locks, so call find_extent_buffer_nolock().
4697 		 */
4698 		eb = find_extent_buffer_nolock(fs_info, start);
4699 		spin_unlock_irqrestore(&subpage->lock, flags);
4700 		spin_unlock(&page->mapping->private_lock);
4701 
4702 		/*
4703 		 * The eb has already reached 0 refs thus find_extent_buffer()
4704 		 * doesn't return it. We don't need to write back such eb
4705 		 * anyway.
4706 		 */
4707 		if (!eb)
4708 			continue;
4709 
4710 		ret = lock_extent_buffer_for_io(eb, epd);
4711 		if (ret == 0) {
4712 			free_extent_buffer(eb);
4713 			continue;
4714 		}
4715 		if (ret < 0) {
4716 			free_extent_buffer(eb);
4717 			goto cleanup;
4718 		}
4719 		ret = write_one_subpage_eb(eb, wbc, epd);
4720 		free_extent_buffer(eb);
4721 		if (ret < 0)
4722 			goto cleanup;
4723 		submitted++;
4724 	}
4725 	return submitted;
4726 
4727 cleanup:
4728 	/* We hit error, end bio for the submitted extent buffers */
4729 	submit_write_bio(epd, ret);
4730 	return ret;
4731 }
4732 
4733 /*
4734  * Submit all page(s) of one extent buffer.
4735  *
4736  * @page:	the page of one extent buffer
4737  * @eb_context:	to determine if we need to submit this page, if current page
4738  *		belongs to this eb, we don't need to submit
4739  *
4740  * The caller should pass each page in their bytenr order, and here we use
4741  * @eb_context to determine if we have submitted pages of one extent buffer.
4742  *
4743  * If we have, we just skip until we hit a new page that doesn't belong to
4744  * current @eb_context.
4745  *
4746  * If not, we submit all the page(s) of the extent buffer.
4747  *
4748  * Return >0 if we have submitted the extent buffer successfully.
4749  * Return 0 if we don't need to submit the page, as it's already submitted by
4750  * previous call.
4751  * Return <0 for fatal error.
4752  */
4753 static int submit_eb_page(struct page *page, struct writeback_control *wbc,
4754 			  struct extent_page_data *epd,
4755 			  struct extent_buffer **eb_context)
4756 {
4757 	struct address_space *mapping = page->mapping;
4758 	struct btrfs_block_group *cache = NULL;
4759 	struct extent_buffer *eb;
4760 	int ret;
4761 
4762 	if (!PagePrivate(page))
4763 		return 0;
4764 
4765 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
4766 		return submit_eb_subpage(page, wbc, epd);
4767 
4768 	spin_lock(&mapping->private_lock);
4769 	if (!PagePrivate(page)) {
4770 		spin_unlock(&mapping->private_lock);
4771 		return 0;
4772 	}
4773 
4774 	eb = (struct extent_buffer *)page->private;
4775 
4776 	/*
4777 	 * Shouldn't happen and normally this would be a BUG_ON but no point
4778 	 * crashing the machine for something we can survive anyway.
4779 	 */
4780 	if (WARN_ON(!eb)) {
4781 		spin_unlock(&mapping->private_lock);
4782 		return 0;
4783 	}
4784 
4785 	if (eb == *eb_context) {
4786 		spin_unlock(&mapping->private_lock);
4787 		return 0;
4788 	}
4789 	ret = atomic_inc_not_zero(&eb->refs);
4790 	spin_unlock(&mapping->private_lock);
4791 	if (!ret)
4792 		return 0;
4793 
4794 	if (!btrfs_check_meta_write_pointer(eb->fs_info, eb, &cache)) {
4795 		/*
4796 		 * If for_sync, this hole will be filled with
4797 		 * trasnsaction commit.
4798 		 */
4799 		if (wbc->sync_mode == WB_SYNC_ALL && !wbc->for_sync)
4800 			ret = -EAGAIN;
4801 		else
4802 			ret = 0;
4803 		free_extent_buffer(eb);
4804 		return ret;
4805 	}
4806 
4807 	*eb_context = eb;
4808 
4809 	ret = lock_extent_buffer_for_io(eb, epd);
4810 	if (ret <= 0) {
4811 		btrfs_revert_meta_write_pointer(cache, eb);
4812 		if (cache)
4813 			btrfs_put_block_group(cache);
4814 		free_extent_buffer(eb);
4815 		return ret;
4816 	}
4817 	if (cache) {
4818 		/*
4819 		 * Implies write in zoned mode. Mark the last eb in a block group.
4820 		 */
4821 		btrfs_schedule_zone_finish_bg(cache, eb);
4822 		btrfs_put_block_group(cache);
4823 	}
4824 	ret = write_one_eb(eb, wbc, epd);
4825 	free_extent_buffer(eb);
4826 	if (ret < 0)
4827 		return ret;
4828 	return 1;
4829 }
4830 
/*
 * Walk the pages of the btree inode's @mapping that carry the writeback tag
 * and submit the extent buffers they belong to.
 *
 * @mapping:	address space of the btree inode
 * @wbc:	writeback control; selects the page range (range_cyclic or
 *		range_start/range_end), the tag to look for (sync_mode) and
 *		the nr_to_write budget
 *
 * The whole walk, including the final bio submission, runs between
 * btrfs_zoned_meta_io_lock() and btrfs_zoned_meta_io_unlock().
 *
 * Return 0 on success.
 * Return <0 on error: the first negative value from submit_eb_page(), or
 * -EROFS if nothing failed here but the filesystem is already in an error
 * state.  The same value is passed to the final submit_write_bio().
 *
 * NOTE(review): mapping->writeback_index is read for range_cyclic but is not
 * updated anywhere in this function.
 */
int btree_write_cache_pages(struct address_space *mapping,
				   struct writeback_control *wbc)
{
	/* Last eb handed to submit_eb_page(), used to skip its other pages */
	struct extent_buffer *eb_context = NULL;
	struct extent_page_data epd = {
		.bio_ctrl = { 0 },
		.extent_locked = 0,
		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
	};
	struct btrfs_fs_info *fs_info = BTRFS_I(mapping->host)->root->fs_info;
	int ret = 0;
	int done = 0;
	int nr_to_write_done = 0;
	struct pagevec pvec;
	int nr_pages;
	pgoff_t index;
	pgoff_t end;		/* Inclusive */
	int scanned = 0;
	xa_mark_t tag;

	pagevec_init(&pvec);
	if (wbc->range_cyclic) {
		index = mapping->writeback_index; /* Start from prev offset */
		end = -1;
		/*
		 * Start from the beginning does not need to cycle over the
		 * range, mark it as scanned.
		 */
		scanned = (index == 0);
	} else {
		index = wbc->range_start >> PAGE_SHIFT;
		end = wbc->range_end >> PAGE_SHIFT;
		scanned = 1;
	}
	/*
	 * For WB_SYNC_ALL the pages are first tagged TOWRITE (see the
	 * tag_pages_for_writeback() call below) and only tagged pages are
	 * looked up; otherwise the DIRTY tag is used directly.
	 */
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag = PAGECACHE_TAG_TOWRITE;
	else
		tag = PAGECACHE_TAG_DIRTY;
	btrfs_zoned_meta_io_lock(fs_info);
retry:
	if (wbc->sync_mode == WB_SYNC_ALL)
		tag_pages_for_writeback(mapping, index, end);
	/*
	 * The lookup in the loop condition fills @pvec and advances @index.
	 * @nr_to_write_done is only tested here, so the pages of a batch that
	 * has already been looked up are all still processed.
	 */
	while (!done && !nr_to_write_done && (index <= end) &&
	       (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
			tag))) {
		unsigned i;

		for (i = 0; i < nr_pages; i++) {
			struct page *page = pvec.pages[i];

			ret = submit_eb_page(page, wbc, &epd, &eb_context);
			/* Nothing submitted for this page */
			if (ret == 0)
				continue;
			/* Fatal error: stop the walk, @ret is kept */
			if (ret < 0) {
				done = 1;
				break;
			}

			/*
			 * the filesystem may choose to bump up nr_to_write.
			 * We have to make sure to honor the new nr_to_write
			 * at any time
			 */
			nr_to_write_done = wbc->nr_to_write <= 0;
		}
		pagevec_release(&pvec);
		cond_resched();
	}
	if (!scanned && !done) {
		/*
		 * We hit the last page and there is more work to be done: wrap
		 * back to the start of the file
		 */
		scanned = 1;
		index = 0;
		goto retry;
	}
	/*
	 * If something went wrong, don't allow any metadata write bio to be
	 * submitted.
	 *
	 * This would prevent use-after-free if we had dirty pages not
	 * cleaned up, which can still happen by fuzzed images.
	 *
	 * - Bad extent tree
	 *   Allowing existing tree block to be allocated for other trees.
	 *
	 * - Log tree operations
	 *   Existing tree blocks get allocated to log tree, bumps its
	 *   generation, then get cleaned in tree re-balance.
	 *   Such tree block will not be written back, since it's clean,
	 *   thus no WRITTEN flag set.
	 *   And after log writes back, this tree block is not traced by
	 *   any dirty extent_io_tree.
	 *
	 * - Offending tree block gets re-dirtied from its original owner
	 *   Since it has bumped generation, no WRITTEN flag, it can be
	 *   reused without COWing. This tree block will not be traced
	 *   by btrfs_transaction::dirty_pages.
	 *
	 *   Now such dirty tree block will not be cleaned by any dirty
	 *   extent io tree. Thus we don't want to submit such wild eb
	 *   if the fs already has error.
	 *
	 * We can get ret > 0 from submit_eb_page() indicating that extent
	 * buffers were submitted. Reset it to 0 to avoid false alerts for the
	 * caller.
	 */
	if (ret > 0)
		ret = 0;
	if (!ret && BTRFS_FS_ERROR(fs_info))
		ret = -EROFS;
	submit_write_bio(&epd, ret);

	btrfs_zoned_meta_io_unlock(fs_info);
	return ret;
}
4947 
4948 /**
4949  * Walk the list of dirty pages of the given address space and write all of them.
4950  *
4951  * @mapping: address space structure to write
4952  * @wbc:     subtract the number of written pages from *@wbc->nr_to_write
4953  * @epd:     holds context for the write, namely the bio
4954  *
4955  * If a page is already under I/O, write_cache_pages() skips it, even
4956  * if it's dirty.  This is desirable behaviour for memory-cleaning writeback,
4957  * but it is INCORRECT for data-integrity system calls such as fsync().  fsync()
4958  * and msync() need to guarantee that all the data which was dirty at the time
4959  * the call was made get new I/O started against them.  If wbc->sync_mode is
4960  * WB_SYNC_ALL then we were called for data integrity and we must wait for
4961  * existing IO to complete.
4962  */
4963 static int extent_write_cache_pages(struct address_space *mapping,
4964 			     struct writeback_control *wbc,
4965 			     struct extent_page_data *epd)
4966 {
4967 	struct inode *inode = mapping->host;
4968 	int ret = 0;
4969 	int done = 0;
4970 	int nr_to_write_done = 0;
4971 	struct pagevec pvec;
4972 	int nr_pages;
4973 	pgoff_t index;
4974 	pgoff_t end;		/* Inclusive */
4975 	pgoff_t done_index;
4976 	int range_whole = 0;
4977 	int scanned = 0;
4978 	xa_mark_t tag;
4979 
4980 	/*
4981 	 * We have to hold onto the inode so that ordered extents can do their
4982 	 * work when the IO finishes.  The alternative to this is failing to add
4983 	 * an ordered extent if the igrab() fails there and that is a huge pain
4984 	 * to deal with, so instead just hold onto the inode throughout the
4985 	 * writepages operation.  If it fails here we are freeing up the inode
4986 	 * anyway and we'd rather not waste our time writing out stuff that is
4987 	 * going to be truncated anyway.
4988 	 */
4989 	if (!igrab(inode))
4990 		return 0;
4991 
4992 	pagevec_init(&pvec);
4993 	if (wbc->range_cyclic) {
4994 		index = mapping->writeback_index; /* Start from prev offset */
4995 		end = -1;
4996 		/*
4997 		 * Start from the beginning does not need to cycle over the
4998 		 * range, mark it as scanned.
4999 		 */
5000 		scanned = (index == 0);
5001 	} else {
5002 		index = wbc->range_start >> PAGE_SHIFT;
5003 		end = wbc->range_end >> PAGE_SHIFT;
5004 		if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
5005 			range_whole = 1;
5006 		scanned = 1;
5007 	}
5008 
5009 	/*
5010 	 * We do the tagged writepage as long as the snapshot flush bit is set
5011 	 * and we are the first one who do the filemap_flush() on this inode.
5012 	 *
5013 	 * The nr_to_write == LONG_MAX is needed to make sure other flushers do
5014 	 * not race in and drop the bit.
5015 	 */
5016 	if (range_whole && wbc->nr_to_write == LONG_MAX &&
5017 	    test_and_clear_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
5018 			       &BTRFS_I(inode)->runtime_flags))
5019 		wbc->tagged_writepages = 1;
5020 
5021 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5022 		tag = PAGECACHE_TAG_TOWRITE;
5023 	else
5024 		tag = PAGECACHE_TAG_DIRTY;
5025 retry:
5026 	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
5027 		tag_pages_for_writeback(mapping, index, end);
5028 	done_index = index;
5029 	while (!done && !nr_to_write_done && (index <= end) &&
5030 			(nr_pages = pagevec_lookup_range_tag(&pvec, mapping,
5031 						&index, end, tag))) {
5032 		unsigned i;
5033 
5034 		for (i = 0; i < nr_pages; i++) {
5035 			struct page *page = pvec.pages[i];
5036 
5037 			done_index = page->index + 1;
5038 			/*
5039 			 * At this point we hold neither the i_pages lock nor
5040 			 * the page lock: the page may be truncated or
5041 			 * invalidated (changing page->mapping to NULL),
5042 			 * or even swizzled back from swapper_space to
5043 			 * tmpfs file mapping
5044 			 */
5045 			if (!trylock_page(page)) {
5046 				submit_write_bio(epd, 0);
5047 				lock_page(page);
5048 			}
5049 
5050 			if (unlikely(page->mapping != mapping)) {
5051 				unlock_page(page);
5052 				continue;
5053 			}
5054 
5055 			if (wbc->sync_mode != WB_SYNC_NONE) {
5056 				if (PageWriteback(page))
5057 					submit_write_bio(epd, 0);
5058 				wait_on_page_writeback(page);
5059 			}
5060 
5061 			if (PageWriteback(page) ||
5062 			    !clear_page_dirty_for_io(page)) {
5063 				unlock_page(page);
5064 				continue;
5065 			}
5066 
5067 			ret = __extent_writepage(page, wbc, epd);
5068 			if (ret < 0) {
5069 				done = 1;
5070 				break;
5071 			}
5072 
5073 			/*
5074 			 * the filesystem may choose to bump up nr_to_write.
5075 			 * We have to make sure to honor the new nr_to_write
5076 			 * at any time
5077 			 */
5078 			nr_to_write_done = wbc->nr_to_write <= 0;
5079 		}
5080 		pagevec_release(&pvec);
5081 		cond_resched();
5082 	}
5083 	if (!scanned && !done) {
5084 		/*
5085 		 * We hit the last page and there is more work to be done: wrap
5086 		 * back to the start of the file
5087 		 */
5088 		scanned = 1;
5089 		index = 0;
5090 
5091 		/*
5092 		 * If we're looping we could run into a page that is locked by a
5093 		 * writer and that writer could be waiting on writeback for a
5094 		 * page in our current bio, and thus deadlock, so flush the
5095 		 * write bio here.
5096 		 */
5097 		submit_write_bio(epd, 0);
5098 		goto retry;
5099 	}
5100 
5101 	if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole))
5102 		mapping->writeback_index = done_index;
5103 
5104 	btrfs_add_delayed_iput(inode);
5105 	return ret;
5106 }
5107 
5108 /*
5109  * Submit the pages in the range to bio for call sites which delalloc range has
5110  * already been ran (aka, ordered extent inserted) and all pages are still
5111  * locked.
5112  */
5113 int extent_write_locked_range(struct inode *inode, u64 start, u64 end)
5114 {
5115 	bool found_error = false;
5116 	int first_error = 0;
5117 	int ret = 0;
5118 	struct address_space *mapping = inode->i_mapping;
5119 	struct page *page;
5120 	u64 cur = start;
5121 	unsigned long nr_pages;
5122 	const u32 sectorsize = btrfs_sb(inode->i_sb)->sectorsize;
5123 	struct extent_page_data epd = {
5124 		.bio_ctrl = { 0 },
5125 		.extent_locked = 1,
5126 		.sync_io = 1,
5127 	};
5128 	struct writeback_control wbc_writepages = {
5129 		.sync_mode	= WB_SYNC_ALL,
5130 		.range_start	= start,
5131 		.range_end	= end + 1,
5132 		/* We're called from an async helper function */
5133 		.punt_to_cgroup	= 1,
5134 		.no_cgroup_owner = 1,
5135 	};
5136 
5137 	ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize));
5138 	nr_pages = (round_up(end, PAGE_SIZE) - round_down(start, PAGE_SIZE)) >>
5139 		   PAGE_SHIFT;
5140 	wbc_writepages.nr_to_write = nr_pages * 2;
5141 
5142 	wbc_attach_fdatawrite_inode(&wbc_writepages, inode);
5143 	while (cur <= end) {
5144 		u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end);
5145 
5146 		page = find_get_page(mapping, cur >> PAGE_SHIFT);
5147 		/*
5148 		 * All pages in the range are locked since
5149 		 * btrfs_run_delalloc_range(), thus there is no way to clear
5150 		 * the page dirty flag.
5151 		 */
5152 		ASSERT(PageLocked(page));
5153 		ASSERT(PageDirty(page));
5154 		clear_page_dirty_for_io(page);
5155 		ret = __extent_writepage(page, &wbc_writepages, &epd);
5156 		ASSERT(ret <= 0);
5157 		if (ret < 0) {
5158 			found_error = true;
5159 			first_error = ret;
5160 		}
5161 		put_page(page);
5162 		cur = cur_end + 1;
5163 	}
5164 
5165 	submit_write_bio(&epd, found_error ? ret : 0);
5166 
5167 	wbc_detach_inode(&wbc_writepages);
5168 	if (found_error)
5169 		return first_error;
5170 	return ret;
5171 }
5172 
5173 int extent_writepages(struct address_space *mapping,
5174 		      struct writeback_control *wbc)
5175 {
5176 	struct inode *inode = mapping->host;
5177 	int ret = 0;
5178 	struct extent_page_data epd = {
5179 		.bio_ctrl = { 0 },
5180 		.extent_locked = 0,
5181 		.sync_io = wbc->sync_mode == WB_SYNC_ALL,
5182 	};
5183 
5184 	/*
5185 	 * Allow only a single thread to do the reloc work in zoned mode to
5186 	 * protect the write pointer updates.
5187 	 */
5188 	btrfs_zoned_data_reloc_lock(BTRFS_I(inode));
5189 	ret = extent_write_cache_pages(mapping, wbc, &epd);
5190 	submit_write_bio(&epd, ret);
5191 	btrfs_zoned_data_reloc_unlock(BTRFS_I(inode));
5192 	return ret;
5193 }
5194 
5195 void extent_readahead(struct readahead_control *rac)
5196 {
5197 	struct btrfs_bio_ctrl bio_ctrl = { 0 };
5198 	struct page *pagepool[16];
5199 	struct extent_map *em_cached = NULL;
5200 	u64 prev_em_start = (u64)-1;
5201 	int nr;
5202 
5203 	while ((nr = readahead_page_batch(rac, pagepool))) {
5204 		u64 contig_start = readahead_pos(rac);
5205 		u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
5206 
5207 		contiguous_readpages(pagepool, nr, contig_start, contig_end,
5208 				&em_cached, &bio_ctrl, &prev_em_start);
5209 	}
5210 
5211 	if (em_cached)
5212 		free_extent_map(em_cached);
5213 	submit_one_bio(&bio_ctrl);
5214 }
5215 
5216 /*
5217  * basic invalidate_folio code, this waits on any locked or writeback
5218  * ranges corresponding to the folio, and then deletes any extent state
5219  * records from the tree
5220  */
5221 int extent_invalidate_folio(struct extent_io_tree *tree,
5222 			  struct folio *folio, size_t offset)
5223 {
5224 	struct extent_state *cached_state = NULL;
5225 	u64 start = folio_pos(folio);
5226 	u64 end = start + folio_size(folio) - 1;
5227 	size_t blocksize = folio->mapping->host->i_sb->s_blocksize;
5228 
5229 	/* This function is only called for the btree inode */
5230 	ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO);
5231 
5232 	start += ALIGN(offset, blocksize);
5233 	if (start > end)
5234 		return 0;
5235 
5236 	lock_extent_bits(tree, start, end, &cached_state);
5237 	folio_wait_writeback(folio);
5238 
5239 	/*
5240 	 * Currently for btree io tree, only EXTENT_LOCKED is utilized,
5241 	 * so here we only need to unlock the extent range to free any
5242 	 * existing extent state.
5243 	 */
5244 	unlock_extent_cached(tree, start, end, &cached_state);
5245 	return 0;
5246 }
5247 
5248 /*
5249  * a helper for release_folio, this tests for areas of the page that
5250  * are locked or under IO and drops the related state bits if it is safe
5251  * to drop the page.
5252  */
5253 static int try_release_extent_state(struct extent_io_tree *tree,
5254 				    struct page *page, gfp_t mask)
5255 {
5256 	u64 start = page_offset(page);
5257 	u64 end = start + PAGE_SIZE - 1;
5258 	int ret = 1;
5259 
5260 	if (test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) {
5261 		ret = 0;
5262 	} else {
5263 		/*
5264 		 * At this point we can safely clear everything except the
5265 		 * locked bit, the nodatasum bit and the delalloc new bit.
5266 		 * The delalloc new bit will be cleared by ordered extent
5267 		 * completion.
5268 		 */
5269 		ret = __clear_extent_bit(tree, start, end,
5270 			 ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW),
5271 			 0, 0, NULL, mask, NULL);
5272 
5273 		/* if clear_extent_bit failed for enomem reasons,
5274 		 * we can't allow the release to continue.
5275 		 */
5276 		if (ret < 0)
5277 			ret = 0;
5278 		else
5279 			ret = 1;
5280 	}
5281 	return ret;
5282 }
5283 
5284 /*
5285  * a helper for release_folio.  As long as there are no locked extents
5286  * in the range corresponding to the page, both state records and extent
5287  * map records are removed
5288  */
5289 int try_release_extent_mapping(struct page *page, gfp_t mask)
5290 {
5291 	struct extent_map *em;
5292 	u64 start = page_offset(page);
5293 	u64 end = start + PAGE_SIZE - 1;
5294 	struct btrfs_inode *btrfs_inode = BTRFS_I(page->mapping->host);
5295 	struct extent_io_tree *tree = &btrfs_inode->io_tree;
5296 	struct extent_map_tree *map = &btrfs_inode->extent_tree;
5297 
5298 	if (gfpflags_allow_blocking(mask) &&
5299 	    page->mapping->host->i_size > SZ_16M) {
5300 		u64 len;
5301 		while (start <= end) {
5302 			struct btrfs_fs_info *fs_info;
5303 			u64 cur_gen;
5304 
5305 			len = end - start + 1;
5306 			write_lock(&map->lock);
5307 			em = lookup_extent_mapping(map, start, len);
5308 			if (!em) {
5309 				write_unlock(&map->lock);
5310 				break;
5311 			}
5312 			if (test_bit(EXTENT_FLAG_PINNED, &em->flags) ||
5313 			    em->start != start) {
5314 				write_unlock(&map->lock);
5315 				free_extent_map(em);
5316 				break;
5317 			}
5318 			if (test_range_bit(tree, em->start,
5319 					   extent_map_end(em) - 1,
5320 					   EXTENT_LOCKED, 0, NULL))
5321 				goto next;
5322 			/*
5323 			 * If it's not in the list of modified extents, used
5324 			 * by a fast fsync, we can remove it. If it's being
5325 			 * logged we can safely remove it since fsync took an
5326 			 * extra reference on the em.
5327 			 */
5328 			if (list_empty(&em->list) ||
5329 			    test_bit(EXTENT_FLAG_LOGGING, &em->flags))
5330 				goto remove_em;
5331 			/*
5332 			 * If it's in the list of modified extents, remove it
5333 			 * only if its generation is older then the current one,
5334 			 * in which case we don't need it for a fast fsync.
5335 			 * Otherwise don't remove it, we could be racing with an
5336 			 * ongoing fast fsync that could miss the new extent.
5337 			 */
5338 			fs_info = btrfs_inode->root->fs_info;
5339 			spin_lock(&fs_info->trans_lock);
5340 			cur_gen = fs_info->generation;
5341 			spin_unlock(&fs_info->trans_lock);
5342 			if (em->generation >= cur_gen)
5343 				goto next;
5344 remove_em:
5345 			/*
5346 			 * We only remove extent maps that are not in the list of
5347 			 * modified extents or that are in the list but with a
5348 			 * generation lower then the current generation, so there
5349 			 * is no need to set the full fsync flag on the inode (it
5350 			 * hurts the fsync performance for workloads with a data
5351 			 * size that exceeds or is close to the system's memory).
5352 			 */
5353 			remove_extent_mapping(map, em);
5354 			/* once for the rb tree */
5355 			free_extent_map(em);
5356 next:
5357 			start = extent_map_end(em);
5358 			write_unlock(&map->lock);
5359 
5360 			/* once for us */
5361 			free_extent_map(em);
5362 
5363 			cond_resched(); /* Allow large-extent preemption. */
5364 		}
5365 	}
5366 	return try_release_extent_state(tree, page, mask);
5367 }
5368 
5369 /*
5370  * helper function for fiemap, which doesn't want to see any holes.
5371  * This maps until we find something past 'last'
5372  */
5373 static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode,
5374 						u64 offset, u64 last)
5375 {
5376 	u64 sectorsize = btrfs_inode_sectorsize(inode);
5377 	struct extent_map *em;
5378 	u64 len;
5379 
5380 	if (offset >= last)
5381 		return NULL;
5382 
5383 	while (1) {
5384 		len = last - offset;
5385 		if (len == 0)
5386 			break;
5387 		len = ALIGN(len, sectorsize);
5388 		em = btrfs_get_extent_fiemap(inode, offset, len);
5389 		if (IS_ERR(em))
5390 			return em;
5391 
5392 		/* if this isn't a hole return it */
5393 		if (em->block_start != EXTENT_MAP_HOLE)
5394 			return em;
5395 
5396 		/* this is a hole, advance to the next extent */
5397 		offset = extent_map_end(em);
5398 		free_extent_map(em);
5399 		if (offset >= last)
5400 			break;
5401 	}
5402 	return NULL;
5403 }
5404 
/*
 * To cache previous fiemap extent
 *
 * Will be used for merging fiemap extent: emit_fiemap_extent() holds one
 * extent back here so that a following, contiguous extent with compatible
 * flags can be merged into it before it is passed to
 * fiemap_fill_next_extent().
 */
struct fiemap_cache {
	u64 offset;	/* logical offset of the cached extent */
	u64 phys;	/* physical offset of the cached extent */
	u64 len;	/* length of the cached extent, grows on merge */
	u32 flags;	/* FIEMAP_EXTENT_* flags of the cached extent */
	bool cached;	/* true while the fields above hold an unemitted extent */
};
5417 
5418 /*
5419  * Helper to submit fiemap extent.
5420  *
5421  * Will try to merge current fiemap extent specified by @offset, @phys,
5422  * @len and @flags with cached one.
5423  * And only when we fails to merge, cached one will be submitted as
5424  * fiemap extent.
5425  *
5426  * Return value is the same as fiemap_fill_next_extent().
5427  */
5428 static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo,
5429 				struct fiemap_cache *cache,
5430 				u64 offset, u64 phys, u64 len, u32 flags)
5431 {
5432 	int ret = 0;
5433 
5434 	if (!cache->cached)
5435 		goto assign;
5436 
5437 	/*
5438 	 * Sanity check, extent_fiemap() should have ensured that new
5439 	 * fiemap extent won't overlap with cached one.
5440 	 * Not recoverable.
5441 	 *
5442 	 * NOTE: Physical address can overlap, due to compression
5443 	 */
5444 	if (cache->offset + cache->len > offset) {
5445 		WARN_ON(1);
5446 		return -EINVAL;
5447 	}
5448 
5449 	/*
5450 	 * Only merges fiemap extents if
5451 	 * 1) Their logical addresses are continuous
5452 	 *
5453 	 * 2) Their physical addresses are continuous
5454 	 *    So truly compressed (physical size smaller than logical size)
5455 	 *    extents won't get merged with each other
5456 	 *
5457 	 * 3) Share same flags except FIEMAP_EXTENT_LAST
5458 	 *    So regular extent won't get merged with prealloc extent
5459 	 */
5460 	if (cache->offset + cache->len  == offset &&
5461 	    cache->phys + cache->len == phys  &&
5462 	    (cache->flags & ~FIEMAP_EXTENT_LAST) ==
5463 			(flags & ~FIEMAP_EXTENT_LAST)) {
5464 		cache->len += len;
5465 		cache->flags |= flags;
5466 		goto try_submit_last;
5467 	}
5468 
5469 	/* Not mergeable, need to submit cached one */
5470 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5471 				      cache->len, cache->flags);
5472 	cache->cached = false;
5473 	if (ret)
5474 		return ret;
5475 assign:
5476 	cache->cached = true;
5477 	cache->offset = offset;
5478 	cache->phys = phys;
5479 	cache->len = len;
5480 	cache->flags = flags;
5481 try_submit_last:
5482 	if (cache->flags & FIEMAP_EXTENT_LAST) {
5483 		ret = fiemap_fill_next_extent(fieinfo, cache->offset,
5484 				cache->phys, cache->len, cache->flags);
5485 		cache->cached = false;
5486 	}
5487 	return ret;
5488 }
5489 
5490 /*
5491  * Emit last fiemap cache
5492  *
5493  * The last fiemap cache may still be cached in the following case:
5494  * 0		      4k		    8k
5495  * |<- Fiemap range ->|
5496  * |<------------  First extent ----------->|
5497  *
5498  * In this case, the first extent range will be cached but not emitted.
5499  * So we must emit it before ending extent_fiemap().
5500  */
5501 static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo,
5502 				  struct fiemap_cache *cache)
5503 {
5504 	int ret;
5505 
5506 	if (!cache->cached)
5507 		return 0;
5508 
5509 	ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys,
5510 				      cache->len, cache->flags);
5511 	cache->cached = false;
5512 	if (ret > 0)
5513 		ret = 0;
5514 	return ret;
5515 }
5516 
/*
 * Report the extents of @inode that overlap the byte range
 * [@start, @start + @len) to @fieinfo.
 *
 * The range is widened to sector boundaries and kept locked in the inode's
 * io_tree while it is walked.  Holes are skipped by get_extent_skip_holes();
 * every other extent goes through emit_fiemap_extent() so that adjacent,
 * compatible extents are merged before they are handed to
 * fiemap_fill_next_extent().
 *
 * Return 0 on success, -EINVAL if @len is 0, -ENOMEM if the path or the
 * ulists cannot be allocated, or a negative value propagated from the lookup
 * and emit helpers.  A return of 1 from emit_fiemap_extent() ends the walk and
 * is reported as 0.
 */
int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
		  u64 start, u64 len)
{
	int ret = 0;
	u64 off;
	u64 max = start + len;
	u32 flags = 0;
	u32 found_type;
	u64 last;
	u64 last_for_get_extent = 0;
	u64 disko = 0;
	u64 isize = i_size_read(&inode->vfs_inode);
	struct btrfs_key found_key;
	struct extent_map *em = NULL;
	struct extent_state *cached_state = NULL;
	struct btrfs_path *path;
	struct btrfs_root *root = inode->root;
	struct fiemap_cache cache = { 0 };
	struct ulist *roots;
	struct ulist *tmp_ulist;
	int end = 0;
	u64 em_start = 0;
	u64 em_len = 0;
	u64 em_end = 0;

	if (len == 0)
		return -EINVAL;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* Scratch lists handed to btrfs_check_shared() for every extent */
	roots = ulist_alloc(GFP_KERNEL);
	tmp_ulist = ulist_alloc(GFP_KERNEL);
	if (!roots || !tmp_ulist) {
		ret = -ENOMEM;
		goto out_free_ulist;
	}

	/*
	 * We can't initialize that to 'start' as this could miss extents due
	 * to extent item merging
	 */
	off = 0;
	start = round_down(start, btrfs_inode_sectorsize(inode));
	len = round_up(max, btrfs_inode_sectorsize(inode)) - start;

	/*
	 * lookup the last file extent.  We're not using i_size here
	 * because there might be preallocation past i_size
	 */
	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1,
				       0);
	if (ret < 0) {
		goto out_free_ulist;
	} else {
		/* An exact match (ret == 0) for offset -1 is not expected */
		WARN_ON(!ret);
		if (ret == 1)
			ret = 0;
	}

	/* Step back to the item just before the slot the lookup stopped at */
	path->slots[0]--;
	btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
	found_type = found_key.type;

	/* No extents, but there might be delalloc bits */
	if (found_key.objectid != btrfs_ino(inode) ||
	    found_type != BTRFS_EXTENT_DATA_KEY) {
		/* have to trust i_size as the end */
		last = (u64)-1;
		last_for_get_extent = isize;
	} else {
		/*
		 * remember the start of the last extent.  There are a
		 * bunch of different factors that go into the length of the
		 * extent, so its much less complex to remember where it started
		 */
		last = found_key.offset;
		last_for_get_extent = last + 1;
	}
	btrfs_release_path(path);

	/*
	 * we might have some extents allocated but more delalloc past those
	 * extents.  so, we trust isize unless the start of the last extent is
	 * beyond isize
	 */
	if (last < isize) {
		last = (u64)-1;
		last_for_get_extent = isize;
	}

	lock_extent_bits(&inode->io_tree, start, start + len - 1,
			 &cached_state);

	em = get_extent_skip_holes(inode, start, last_for_get_extent);
	if (!em)
		goto out;
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out;
	}

	while (!end) {
		u64 offset_in_extent = 0;

		/* break if the extent we found is outside the range */
		if (em->start >= max || extent_map_end(em) < off)
			break;

		/*
		 * get_extent may return an extent that starts before our
		 * requested range.  We have to make sure the ranges
		 * we return to fiemap always move forward and don't
		 * overlap, so adjust the offsets here
		 */
		em_start = max(em->start, off);

		/*
		 * record the offset from the start of the extent
		 * for adjusting the disk offset below.  Only do this if the
		 * extent isn't compressed since our in ram offset may be past
		 * what we have actually allocated on disk.
		 */
		if (!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			offset_in_extent = em_start - em->start;
		em_end = extent_map_end(em);
		em_len = em_end - em_start;
		flags = 0;
		/* Only block_start values below LAST_BYTE get a disk offset */
		if (em->block_start < EXTENT_MAP_LAST_BYTE)
			disko = em->block_start + offset_in_extent;
		else
			disko = 0;

		/*
		 * bump off for our next call to get_extent
		 */
		off = extent_map_end(em);
		if (off >= max)
			end = 1;

		if (em->block_start == EXTENT_MAP_LAST_BYTE) {
			end = 1;
			flags |= FIEMAP_EXTENT_LAST;
		} else if (em->block_start == EXTENT_MAP_INLINE) {
			flags |= (FIEMAP_EXTENT_DATA_INLINE |
				  FIEMAP_EXTENT_NOT_ALIGNED);
		} else if (em->block_start == EXTENT_MAP_DELALLOC) {
			flags |= (FIEMAP_EXTENT_DELALLOC |
				  FIEMAP_EXTENT_UNKNOWN);
		} else if (fieinfo->fi_extents_max) {
			u64 bytenr = em->block_start -
				(em->start - em->orig_start);

			/*
			 * As btrfs supports shared space, this information
			 * can be exported to userspace tools via
			 * flag FIEMAP_EXTENT_SHARED.  If fi_extents_max == 0
			 * then we're just getting a count and we can skip the
			 * lookup stuff.
			 */
			ret = btrfs_check_shared(root, btrfs_ino(inode),
						 bytenr, roots, tmp_ulist);
			if (ret < 0)
				goto out_free;
			if (ret)
				flags |= FIEMAP_EXTENT_SHARED;
			ret = 0;
		}
		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
			flags |= FIEMAP_EXTENT_ENCODED;
		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
			flags |= FIEMAP_EXTENT_UNWRITTEN;

		/* Everything needed from em is saved in locals, drop our ref */
		free_extent_map(em);
		em = NULL;
		if ((em_start >= last) || em_len == (u64)-1 ||
		   (last == (u64)-1 && isize <= em_end)) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}

		/* now scan forward to see if this is really the last extent. */
		em = get_extent_skip_holes(inode, off, last_for_get_extent);
		if (IS_ERR(em)) {
			ret = PTR_ERR(em);
			/* em is an error pointer here, so skip the out_free label */
			goto out;
		}
		if (!em) {
			flags |= FIEMAP_EXTENT_LAST;
			end = 1;
		}
		ret = emit_fiemap_extent(fieinfo, &cache, em_start, disko,
					   em_len, flags);
		if (ret) {
			/* ret == 1 stops the walk without being an error */
			if (ret == 1)
				ret = 0;
			goto out_free;
		}
	}
out_free:
	/* Flush an extent still held back for merging, unless we failed */
	if (!ret)
		ret = emit_last_fiemap_cache(fieinfo, &cache);
	free_extent_map(em);
out:
	unlock_extent_cached(&inode->io_tree, start, start + len - 1,
			     &cached_state);

out_free_ulist:
	btrfs_free_path(path);
	ulist_free(roots);
	ulist_free(tmp_ulist);
	return ret;
}
5731 
/*
 * Give the memory of @eb back to extent_buffer_cache.
 *
 * Only the structure itself is freed here: the pages attached to the buffer
 * and its leak-debug list entry are not touched by this helper.
 */
static void __free_extent_buffer(struct extent_buffer *eb)
{
	kmem_cache_free(extent_buffer_cache, eb);
}
5736 
5737 int extent_buffer_under_io(const struct extent_buffer *eb)
5738 {
5739 	return (atomic_read(&eb->io_pages) ||
5740 		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
5741 		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5742 }
5743 
5744 static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page)
5745 {
5746 	struct btrfs_subpage *subpage;
5747 
5748 	lockdep_assert_held(&page->mapping->private_lock);
5749 
5750 	if (PagePrivate(page)) {
5751 		subpage = (struct btrfs_subpage *)page->private;
5752 		if (atomic_read(&subpage->eb_refs))
5753 			return true;
5754 		/*
5755 		 * Even there is no eb refs here, we may still have
5756 		 * end_page_read() call relying on page::private.
5757 		 */
5758 		if (atomic_read(&subpage->readers))
5759 			return true;
5760 	}
5761 	return false;
5762 }
5763 
5764 static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page)
5765 {
5766 	struct btrfs_fs_info *fs_info = eb->fs_info;
5767 	const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5768 
5769 	/*
5770 	 * For mapped eb, we're going to change the page private, which should
5771 	 * be done under the private_lock.
5772 	 */
5773 	if (mapped)
5774 		spin_lock(&page->mapping->private_lock);
5775 
5776 	if (!PagePrivate(page)) {
5777 		if (mapped)
5778 			spin_unlock(&page->mapping->private_lock);
5779 		return;
5780 	}
5781 
5782 	if (fs_info->nodesize >= PAGE_SIZE) {
5783 		/*
5784 		 * We do this since we'll remove the pages after we've
5785 		 * removed the eb from the radix tree, so we could race
5786 		 * and have this page now attached to the new eb.  So
5787 		 * only clear page_private if it's still connected to
5788 		 * this eb.
5789 		 */
5790 		if (PagePrivate(page) &&
5791 		    page->private == (unsigned long)eb) {
5792 			BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
5793 			BUG_ON(PageDirty(page));
5794 			BUG_ON(PageWriteback(page));
5795 			/*
5796 			 * We need to make sure we haven't be attached
5797 			 * to a new eb.
5798 			 */
5799 			detach_page_private(page);
5800 		}
5801 		if (mapped)
5802 			spin_unlock(&page->mapping->private_lock);
5803 		return;
5804 	}
5805 
5806 	/*
5807 	 * For subpage, we can have dummy eb with page private.  In this case,
5808 	 * we can directly detach the private as such page is only attached to
5809 	 * one dummy eb, no sharing.
5810 	 */
5811 	if (!mapped) {
5812 		btrfs_detach_subpage(fs_info, page);
5813 		return;
5814 	}
5815 
5816 	btrfs_page_dec_eb_refs(fs_info, page);
5817 
5818 	/*
5819 	 * We can only detach the page private if there are no other ebs in the
5820 	 * page range and no unfinished IO.
5821 	 */
5822 	if (!page_range_has_eb(fs_info, page))
5823 		btrfs_detach_subpage(fs_info, page);
5824 
5825 	spin_unlock(&page->mapping->private_lock);
5826 }
5827 
5828 /* Release all pages attached to the extent buffer */
5829 static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb)
5830 {
5831 	int i;
5832 	int num_pages;
5833 
5834 	ASSERT(!extent_buffer_under_io(eb));
5835 
5836 	num_pages = num_extent_pages(eb);
5837 	for (i = 0; i < num_pages; i++) {
5838 		struct page *page = eb->pages[i];
5839 
5840 		if (!page)
5841 			continue;
5842 
5843 		detach_extent_buffer_page(eb, page);
5844 
5845 		/* One for when we allocated the page */
5846 		put_page(page);
5847 	}
5848 }
5849 
/*
 * Helper for releasing the extent buffer.
 *
 * Releases all pages attached to @eb, takes it off the leak-debug list it was
 * put on by __alloc_extent_buffer() and finally frees the structure itself.
 * @eb must not be used afterwards.
 */
static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
{
	btrfs_release_extent_buffer_pages(eb);
	btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
	__free_extent_buffer(eb);
}
5859 
5860 static struct extent_buffer *
5861 __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
5862 		      unsigned long len)
5863 {
5864 	struct extent_buffer *eb = NULL;
5865 
5866 	eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL);
5867 	eb->start = start;
5868 	eb->len = len;
5869 	eb->fs_info = fs_info;
5870 	eb->bflags = 0;
5871 	init_rwsem(&eb->lock);
5872 
5873 	btrfs_leak_debug_add(&fs_info->eb_leak_lock, &eb->leak_list,
5874 			     &fs_info->allocated_ebs);
5875 	INIT_LIST_HEAD(&eb->release_list);
5876 
5877 	spin_lock_init(&eb->refs_lock);
5878 	atomic_set(&eb->refs, 1);
5879 	atomic_set(&eb->io_pages, 0);
5880 
5881 	ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE);
5882 
5883 	return eb;
5884 }
5885 
5886 struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src)
5887 {
5888 	int i;
5889 	struct extent_buffer *new;
5890 	int num_pages = num_extent_pages(src);
5891 	int ret;
5892 
5893 	new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
5894 	if (new == NULL)
5895 		return NULL;
5896 
5897 	/*
5898 	 * Set UNMAPPED before calling btrfs_release_extent_buffer(), as
5899 	 * btrfs_release_extent_buffer() have different behavior for
5900 	 * UNMAPPED subpage extent buffer.
5901 	 */
5902 	set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags);
5903 
5904 	memset(new->pages, 0, sizeof(*new->pages) * num_pages);
5905 	ret = btrfs_alloc_page_array(num_pages, new->pages);
5906 	if (ret) {
5907 		btrfs_release_extent_buffer(new);
5908 		return NULL;
5909 	}
5910 
5911 	for (i = 0; i < num_pages; i++) {
5912 		int ret;
5913 		struct page *p = new->pages[i];
5914 
5915 		ret = attach_extent_buffer_page(new, p, NULL);
5916 		if (ret < 0) {
5917 			btrfs_release_extent_buffer(new);
5918 			return NULL;
5919 		}
5920 		WARN_ON(PageDirty(p));
5921 		copy_page(page_address(p), page_address(src->pages[i]));
5922 	}
5923 	set_extent_buffer_uptodate(new);
5924 
5925 	return new;
5926 }
5927 
5928 struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5929 						  u64 start, unsigned long len)
5930 {
5931 	struct extent_buffer *eb;
5932 	int num_pages;
5933 	int i;
5934 	int ret;
5935 
5936 	eb = __alloc_extent_buffer(fs_info, start, len);
5937 	if (!eb)
5938 		return NULL;
5939 
5940 	num_pages = num_extent_pages(eb);
5941 	ret = btrfs_alloc_page_array(num_pages, eb->pages);
5942 	if (ret)
5943 		goto err;
5944 
5945 	for (i = 0; i < num_pages; i++) {
5946 		struct page *p = eb->pages[i];
5947 
5948 		ret = attach_extent_buffer_page(eb, p, NULL);
5949 		if (ret < 0)
5950 			goto err;
5951 	}
5952 
5953 	set_extent_buffer_uptodate(eb);
5954 	btrfs_set_header_nritems(eb, 0);
5955 	set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags);
5956 
5957 	return eb;
5958 err:
5959 	for (i = 0; i < num_pages; i++) {
5960 		if (eb->pages[i]) {
5961 			detach_extent_buffer_page(eb, eb->pages[i]);
5962 			__free_page(eb->pages[i]);
5963 		}
5964 	}
5965 	__free_extent_buffer(eb);
5966 	return NULL;
5967 }
5968 
5969 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
5970 						u64 start)
5971 {
5972 	return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize);
5973 }
5974 
5975 static void check_buffer_tree_ref(struct extent_buffer *eb)
5976 {
5977 	int refs;
5978 	/*
5979 	 * The TREE_REF bit is first set when the extent_buffer is added
5980 	 * to the radix tree. It is also reset, if unset, when a new reference
5981 	 * is created by find_extent_buffer.
5982 	 *
5983 	 * It is only cleared in two cases: freeing the last non-tree
5984 	 * reference to the extent_buffer when its STALE bit is set or
5985 	 * calling release_folio when the tree reference is the only reference.
5986 	 *
5987 	 * In both cases, care is taken to ensure that the extent_buffer's
5988 	 * pages are not under io. However, release_folio can be concurrently
5989 	 * called with creating new references, which is prone to race
5990 	 * conditions between the calls to check_buffer_tree_ref in those
5991 	 * codepaths and clearing TREE_REF in try_release_extent_buffer.
5992 	 *
5993 	 * The actual lifetime of the extent_buffer in the radix tree is
5994 	 * adequately protected by the refcount, but the TREE_REF bit and
5995 	 * its corresponding reference are not. To protect against this
5996 	 * class of races, we call check_buffer_tree_ref from the codepaths
5997 	 * which trigger io after they set eb->io_pages. Note that once io is
5998 	 * initiated, TREE_REF can no longer be cleared, so that is the
5999 	 * moment at which any such race is best fixed.
6000 	 */
6001 	refs = atomic_read(&eb->refs);
6002 	if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6003 		return;
6004 
6005 	spin_lock(&eb->refs_lock);
6006 	if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6007 		atomic_inc(&eb->refs);
6008 	spin_unlock(&eb->refs_lock);
6009 }
6010 
6011 static void mark_extent_buffer_accessed(struct extent_buffer *eb,
6012 		struct page *accessed)
6013 {
6014 	int num_pages, i;
6015 
6016 	check_buffer_tree_ref(eb);
6017 
6018 	num_pages = num_extent_pages(eb);
6019 	for (i = 0; i < num_pages; i++) {
6020 		struct page *p = eb->pages[i];
6021 
6022 		if (p != accessed)
6023 			mark_page_accessed(p);
6024 	}
6025 }
6026 
6027 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
6028 					 u64 start)
6029 {
6030 	struct extent_buffer *eb;
6031 
6032 	eb = find_extent_buffer_nolock(fs_info, start);
6033 	if (!eb)
6034 		return NULL;
6035 	/*
6036 	 * Lock our eb's refs_lock to avoid races with free_extent_buffer().
6037 	 * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
6038 	 * another task running free_extent_buffer() might have seen that flag
6039 	 * set, eb->refs == 2, that the buffer isn't under IO (dirty and
6040 	 * writeback flags not set) and it's still in the tree (flag
6041 	 * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
6042 	 * decrementing the extent buffer's reference count twice.  So here we
6043 	 * could race and increment the eb's reference count, clear its stale
6044 	 * flag, mark it as dirty and drop our reference before the other task
6045 	 * finishes executing free_extent_buffer, which would later result in
6046 	 * an attempt to free an extent buffer that is dirty.
6047 	 */
6048 	if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
6049 		spin_lock(&eb->refs_lock);
6050 		spin_unlock(&eb->refs_lock);
6051 	}
6052 	mark_extent_buffer_accessed(eb, NULL);
6053 	return eb;
6054 }
6055 
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
 * Find the extent buffer at @start, or create a dummy one and insert it into
 * fs_info->buffer_radix.
 *
 * Return an existing or newly inserted extent buffer on success,
 * ERR_PTR(-ENOMEM) if the dummy buffer cannot be allocated, or the
 * ERR_PTR-encoded failure of radix_tree_preload().
 */
struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
					u64 start)
{
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	int ret;

	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = alloc_dummy_extent_buffer(fs_info, start);
	if (!eb)
		return ERR_PTR(-ENOMEM);
	eb->fs_info = fs_info;

	/* Keep trying until we insert our eb or can grab the competing one. */
	for (;;) {
		ret = radix_tree_preload(GFP_NOFS);
		if (ret) {
			exists = ERR_PTR(ret);
			goto free_eb;
		}

		spin_lock(&fs_info->buffer_lock);
		ret = radix_tree_insert(&fs_info->buffer_radix,
					start >> fs_info->sectorsize_bits, eb);
		spin_unlock(&fs_info->buffer_lock);
		radix_tree_preload_end();

		if (ret != -EEXIST)
			break;

		/* Slot already taken: use the existing eb if still findable. */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
	}

	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	return eb;

free_eb:
	btrfs_release_extent_buffer(eb);
	return exists;
}
#endif
6097 
6098 static struct extent_buffer *grab_extent_buffer(
6099 		struct btrfs_fs_info *fs_info, struct page *page)
6100 {
6101 	struct extent_buffer *exists;
6102 
6103 	/*
6104 	 * For subpage case, we completely rely on radix tree to ensure we
6105 	 * don't try to insert two ebs for the same bytenr.  So here we always
6106 	 * return NULL and just continue.
6107 	 */
6108 	if (fs_info->nodesize < PAGE_SIZE)
6109 		return NULL;
6110 
6111 	/* Page not yet attached to an extent buffer */
6112 	if (!PagePrivate(page))
6113 		return NULL;
6114 
6115 	/*
6116 	 * We could have already allocated an eb for this page and attached one
6117 	 * so lets see if we can get a ref on the existing eb, and if we can we
6118 	 * know it's good and we can just return that one, else we know we can
6119 	 * just overwrite page->private.
6120 	 */
6121 	exists = (struct extent_buffer *)page->private;
6122 	if (atomic_inc_not_zero(&exists->refs))
6123 		return exists;
6124 
6125 	WARN_ON(PageDirty(page));
6126 	detach_page_private(page);
6127 	return NULL;
6128 }
6129 
6130 static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start)
6131 {
6132 	if (!IS_ALIGNED(start, fs_info->sectorsize)) {
6133 		btrfs_err(fs_info, "bad tree block start %llu", start);
6134 		return -EINVAL;
6135 	}
6136 
6137 	if (fs_info->nodesize < PAGE_SIZE &&
6138 	    offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) {
6139 		btrfs_err(fs_info,
6140 		"tree block crosses page boundary, start %llu nodesize %u",
6141 			  start, fs_info->nodesize);
6142 		return -EINVAL;
6143 	}
6144 	if (fs_info->nodesize >= PAGE_SIZE &&
6145 	    !PAGE_ALIGNED(start)) {
6146 		btrfs_err(fs_info,
6147 		"tree block is not page aligned, start %llu nodesize %u",
6148 			  start, fs_info->nodesize);
6149 		return -EINVAL;
6150 	}
6151 	return 0;
6152 }
6153 
/*
 * Find or create the nodesize-long extent buffer starting at @start.
 *
 * @fs_info:    filesystem the buffer belongs to
 * @start:      start offset of the tree block, validated by
 *              check_eb_alignment()
 * @owner_root: root id used to pick the lockdep class (the reloc tree is
 *              mapped to the fs tree class)
 * @level:      tree level, also used for the lockdep class
 *
 * Return an already existing extent buffer (found through
 * find_extent_buffer() or through the private data of one of the pages), or
 * a new extent buffer inserted into fs_info->buffer_radix.  On failure an
 * ERR_PTR is returned: -EINVAL for a bad @start, -EOVERFLOW on 32bit when
 * @start is at or beyond MAX_LFS_FILESIZE, -ENOMEM, or the error reported by
 * btrfs_alloc_subpage() / radix_tree_preload().
 */
struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
					  u64 start, u64 owner_root, int level)
{
	unsigned long len = fs_info->nodesize;
	int num_pages;
	int i;
	unsigned long index = start >> PAGE_SHIFT;
	struct extent_buffer *eb;
	struct extent_buffer *exists = NULL;
	struct page *p;
	struct address_space *mapping = fs_info->btree_inode->i_mapping;
	u64 lockdep_owner = owner_root;
	int uptodate = 1;
	int ret;

	if (check_eb_alignment(fs_info, start))
		return ERR_PTR(-EINVAL);

#if BITS_PER_LONG == 32
	if (start >= MAX_LFS_FILESIZE) {
		btrfs_err_rl(fs_info,
		"extent buffer %llu is beyond 32bit page cache limit", start);
		btrfs_err_32bit_limit(fs_info);
		return ERR_PTR(-EOVERFLOW);
	}
	if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
		btrfs_warn_32bit_limit(fs_info);
#endif

	/* Fast path: the extent buffer already exists. */
	eb = find_extent_buffer(fs_info, start);
	if (eb)
		return eb;

	eb = __alloc_extent_buffer(fs_info, start, len);
	if (!eb)
		return ERR_PTR(-ENOMEM);

	/*
	 * The reloc trees are just snapshots, so we need them to appear to be
	 * just like any other fs tree WRT lockdep.
	 */
	if (lockdep_owner == BTRFS_TREE_RELOC_OBJECTID)
		lockdep_owner = BTRFS_FS_TREE_OBJECTID;

	btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level);

	num_pages = num_extent_pages(eb);
	for (i = 0; i < num_pages; i++, index++) {
		struct btrfs_subpage *prealloc = NULL;

		p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
		if (!p) {
			exists = ERR_PTR(-ENOMEM);
			goto free_eb;
		}

		/*
		 * Preallocate page->private for subpage case, so that we won't
		 * allocate memory with private_lock hold.  The memory will be
		 * freed by attach_extent_buffer_page() or freed manually if
		 * we exit earlier.
		 *
		 * Although we have ensured one subpage eb can only have one
		 * page, but it may change in the future for 16K page size
		 * support, so we still preallocate the memory in the loop.
		 */
		if (fs_info->nodesize < PAGE_SIZE) {
			prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA);
			if (IS_ERR(prealloc)) {
				ret = PTR_ERR(prealloc);
				/* This page is not in eb->pages[] yet, release it here. */
				unlock_page(p);
				put_page(p);
				exists = ERR_PTR(ret);
				goto free_eb;
			}
		}

		spin_lock(&mapping->private_lock);
		exists = grab_extent_buffer(fs_info, p);
		if (exists) {
			/* Another eb owns this page: return it instead of ours. */
			spin_unlock(&mapping->private_lock);
			unlock_page(p);
			put_page(p);
			mark_extent_buffer_accessed(exists, p);
			btrfs_free_subpage(prealloc);
			goto free_eb;
		}
		/* Should not fail, as we have preallocated the memory */
		ret = attach_extent_buffer_page(eb, p, prealloc);
		ASSERT(!ret);
		/*
		 * To inform we have extra eb under allocation, so that
		 * detach_extent_buffer_page() won't release the page private
		 * when the eb hasn't yet been inserted into radix tree.
		 *
		 * The ref will be decreased when the eb released the page, in
		 * detach_extent_buffer_page().
		 * Thus needs no special handling in error path.
		 */
		btrfs_page_inc_eb_refs(fs_info, p);
		spin_unlock(&mapping->private_lock);

		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
		eb->pages[i] = p;
		/* The eb is only uptodate if every one of its pages is. */
		if (!PageUptodate(p))
			uptodate = 0;

		/*
		 * We can't unlock the pages just yet since the extent buffer
		 * hasn't been properly inserted in the radix tree, this
		 * opens a race with btree_release_folio which can free a page
		 * while we are still filling in all pages for the buffer and
		 * we could crash.
		 */
	}
	if (uptodate)
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
again:
	ret = radix_tree_preload(GFP_NOFS);
	if (ret) {
		exists = ERR_PTR(ret);
		goto free_eb;
	}

	spin_lock(&fs_info->buffer_lock);
	ret = radix_tree_insert(&fs_info->buffer_radix,
				start >> fs_info->sectorsize_bits, eb);
	spin_unlock(&fs_info->buffer_lock);
	radix_tree_preload_end();
	if (ret == -EEXIST) {
		/*
		 * Somebody inserted an eb for this slot first.  Use it if it
		 * can still be found, otherwise retry our own insertion.
		 */
		exists = find_extent_buffer(fs_info, start);
		if (exists)
			goto free_eb;
		else
			goto again;
	}
	/* add one reference for the tree */
	check_buffer_tree_ref(eb);
	set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);

	/*
	 * Now it's safe to unlock the pages because any calls to
	 * btree_release_folio will correctly detect that a page belongs to a
	 * live buffer and won't free them prematurely.
	 */
	for (i = 0; i < num_pages; i++)
		unlock_page(eb->pages[i]);
	return eb;

free_eb:
	/* We must hold the only reference to our never-published eb. */
	WARN_ON(!atomic_dec_and_test(&eb->refs));
	/* Unlock the pages that were already stored in eb->pages[]. */
	for (i = 0; i < num_pages; i++) {
		if (eb->pages[i])
			unlock_page(eb->pages[i]);
	}

	btrfs_release_extent_buffer(eb);
	/* Either the existing eb that won the race, or an ERR_PTR. */
	return exists;
}
6313 
6314 static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
6315 {
6316 	struct extent_buffer *eb =
6317 			container_of(head, struct extent_buffer, rcu_head);
6318 
6319 	__free_extent_buffer(eb);
6320 }
6321 
/*
 * Drop one reference on @eb and free it if that was the last one.
 *
 * Must be called with eb->refs_lock held; the lock is released on every
 * path before returning.
 *
 * Return 1 if the extent buffer was freed (or queued for freeing through
 * RCU), 0 if other references remain.
 */
static int release_extent_buffer(struct extent_buffer *eb)
	__releases(&eb->refs_lock)
{
	lockdep_assert_held(&eb->refs_lock);

	WARN_ON(atomic_read(&eb->refs) == 0);
	if (atomic_dec_and_test(&eb->refs)) {
		if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
			struct btrfs_fs_info *fs_info = eb->fs_info;

			/* Drop refs_lock before taking buffer_lock. */
			spin_unlock(&eb->refs_lock);

			/* Remove the eb from the radix tree it was inserted in. */
			spin_lock(&fs_info->buffer_lock);
			radix_tree_delete(&fs_info->buffer_radix,
					  eb->start >> fs_info->sectorsize_bits);
			spin_unlock(&fs_info->buffer_lock);
		} else {
			spin_unlock(&eb->refs_lock);
		}

		btrfs_leak_debug_del(&eb->fs_info->eb_leak_lock, &eb->leak_list);
		/* Should be safe to release our pages at this point */
		btrfs_release_extent_buffer_pages(eb);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
		/* UNMAPPED buffers are freed right away instead of via RCU. */
		if (unlikely(test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags))) {
			__free_extent_buffer(eb);
			return 1;
		}
#endif
		call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
		return 1;
	}
	spin_unlock(&eb->refs_lock);

	return 0;
}
6358 
/*
 * Drop the caller's reference on @eb.  A NULL @eb is ignored.
 *
 * While enough references remain, the count is decremented without taking
 * eb->refs_lock.  Otherwise the lock is taken, the tree reference of a stale
 * buffer is dropped when we are its last other user, and
 * release_extent_buffer() drops our reference and releases the lock.
 */
void free_extent_buffer(struct extent_buffer *eb)
{
	int refs;
	int old;
	if (!eb)
		return;

	while (1) {
		refs = atomic_read(&eb->refs);
		/*
		 * Leave the lockless path when the count is low enough that
		 * the locked slow path below has to decide what happens:
		 * refs <= 3 for a regular eb, refs == 1 for an UNMAPPED one.
		 */
		if ((!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) && refs <= 3)
		    || (test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags) &&
			refs == 1))
			break;
		/* Try to drop one reference; retry if the count changed under us. */
		old = atomic_cmpxchg(&eb->refs, refs, refs - 1);
		if (old == refs)
			return;
	}

	spin_lock(&eb->refs_lock);
	/*
	 * Only our reference and the tree reference are left on a stale eb
	 * that is not under IO: drop the tree reference as well.
	 */
	if (atomic_read(&eb->refs) == 2 &&
	    test_bit(EXTENT_BUFFER_STALE, &eb->bflags) &&
	    !extent_buffer_under_io(eb) &&
	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
		atomic_dec(&eb->refs);

	/*
	 * I know this is terrible, but it's temporary until we stop tracking
	 * the uptodate bits and such for the extent buffers.
	 */
	/* Drops our reference and releases eb->refs_lock. */
	release_extent_buffer(eb);
}
6390 
6391 void free_extent_buffer_stale(struct extent_buffer *eb)
6392 {
6393 	if (!eb)
6394 		return;
6395 
6396 	spin_lock(&eb->refs_lock);
6397 	set_bit(EXTENT_BUFFER_STALE, &eb->bflags);
6398 
6399 	if (atomic_read(&eb->refs) == 2 && !extent_buffer_under_io(eb) &&
6400 	    test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
6401 		atomic_dec(&eb->refs);
6402 	release_extent_buffer(eb);
6403 }
6404 
6405 static void btree_clear_page_dirty(struct page *page)
6406 {
6407 	ASSERT(PageDirty(page));
6408 	ASSERT(PageLocked(page));
6409 	clear_page_dirty_for_io(page);
6410 	xa_lock_irq(&page->mapping->i_pages);
6411 	if (!PageDirty(page))
6412 		__xa_clear_mark(&page->mapping->i_pages,
6413 				page_index(page), PAGECACHE_TAG_DIRTY);
6414 	xa_unlock_irq(&page->mapping->i_pages);
6415 }
6416 
6417 static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
6418 {
6419 	struct btrfs_fs_info *fs_info = eb->fs_info;
6420 	struct page *page = eb->pages[0];
6421 	bool last;
6422 
6423 	/* btree_clear_page_dirty() needs page locked */
6424 	lock_page(page);
6425 	last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
6426 						  eb->len);
6427 	if (last)
6428 		btree_clear_page_dirty(page);
6429 	unlock_page(page);
6430 	WARN_ON(atomic_read(&eb->refs) == 0);
6431 }
6432 
6433 void clear_extent_buffer_dirty(const struct extent_buffer *eb)
6434 {
6435 	int i;
6436 	int num_pages;
6437 	struct page *page;
6438 
6439 	if (eb->fs_info->nodesize < PAGE_SIZE)
6440 		return clear_subpage_extent_buffer_dirty(eb);
6441 
6442 	num_pages = num_extent_pages(eb);
6443 
6444 	for (i = 0; i < num_pages; i++) {
6445 		page = eb->pages[i];
6446 		if (!PageDirty(page))
6447 			continue;
6448 		lock_page(page);
6449 		btree_clear_page_dirty(page);
6450 		ClearPageError(page);
6451 		unlock_page(page);
6452 	}
6453 	WARN_ON(atomic_read(&eb->refs) == 0);
6454 }
6455 
6456 bool set_extent_buffer_dirty(struct extent_buffer *eb)
6457 {
6458 	int i;
6459 	int num_pages;
6460 	bool was_dirty;
6461 
6462 	check_buffer_tree_ref(eb);
6463 
6464 	was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
6465 
6466 	num_pages = num_extent_pages(eb);
6467 	WARN_ON(atomic_read(&eb->refs) == 0);
6468 	WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
6469 
6470 	if (!was_dirty) {
6471 		bool subpage = eb->fs_info->nodesize < PAGE_SIZE;
6472 
6473 		/*
6474 		 * For subpage case, we can have other extent buffers in the
6475 		 * same page, and in clear_subpage_extent_buffer_dirty() we
6476 		 * have to clear page dirty without subpage lock held.
6477 		 * This can cause race where our page gets dirty cleared after
6478 		 * we just set it.
6479 		 *
6480 		 * Thankfully, clear_subpage_extent_buffer_dirty() has locked
6481 		 * its page for other reasons, we can use page lock to prevent
6482 		 * the above race.
6483 		 */
6484 		if (subpage)
6485 			lock_page(eb->pages[0]);
6486 		for (i = 0; i < num_pages; i++)
6487 			btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
6488 					     eb->start, eb->len);
6489 		if (subpage)
6490 			unlock_page(eb->pages[0]);
6491 	}
6492 #ifdef CONFIG_BTRFS_DEBUG
6493 	for (i = 0; i < num_pages; i++)
6494 		ASSERT(PageDirty(eb->pages[i]));
6495 #endif
6496 
6497 	return was_dirty;
6498 }
6499 
6500 void clear_extent_buffer_uptodate(struct extent_buffer *eb)
6501 {
6502 	struct btrfs_fs_info *fs_info = eb->fs_info;
6503 	struct page *page;
6504 	int num_pages;
6505 	int i;
6506 
6507 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6508 	num_pages = num_extent_pages(eb);
6509 	for (i = 0; i < num_pages; i++) {
6510 		page = eb->pages[i];
6511 		if (!page)
6512 			continue;
6513 
6514 		/*
6515 		 * This is special handling for metadata subpage, as regular
6516 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6517 		 */
6518 		if (fs_info->nodesize >= PAGE_SIZE)
6519 			ClearPageUptodate(page);
6520 		else
6521 			btrfs_subpage_clear_uptodate(fs_info, page, eb->start,
6522 						     eb->len);
6523 	}
6524 }
6525 
6526 void set_extent_buffer_uptodate(struct extent_buffer *eb)
6527 {
6528 	struct btrfs_fs_info *fs_info = eb->fs_info;
6529 	struct page *page;
6530 	int num_pages;
6531 	int i;
6532 
6533 	set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
6534 	num_pages = num_extent_pages(eb);
6535 	for (i = 0; i < num_pages; i++) {
6536 		page = eb->pages[i];
6537 
6538 		/*
6539 		 * This is special handling for metadata subpage, as regular
6540 		 * btrfs_is_subpage() can not handle cloned/dummy metadata.
6541 		 */
6542 		if (fs_info->nodesize >= PAGE_SIZE)
6543 			SetPageUptodate(page);
6544 		else
6545 			btrfs_subpage_set_uptodate(fs_info, page, eb->start,
6546 						   eb->len);
6547 	}
6548 }
6549 
/*
 * Read a subpage (nodesize < PAGE_SIZE) extent buffer, which lives in the
 * single page eb->pages[0].
 *
 * @eb:         extent buffer to read, must not be UNMAPPED
 * @wait:       WAIT_NONE only tries to lock the extent range and gives up
 *              with -EAGAIN if that fails; WAIT_COMPLETE additionally waits
 *              for the read to finish; other values submit without waiting
 * @mirror_num: mirror number stored in the bio control structure
 *
 * Return 0 on success or when the buffer is already uptodate, -EAGAIN as
 * described above, the error from lock_extent() or submit_extent_page(), or
 * -EIO if the buffer is not uptodate after a WAIT_COMPLETE read.
 */
static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait,
				      int mirror_num)
{
	struct btrfs_fs_info *fs_info = eb->fs_info;
	struct extent_io_tree *io_tree;
	struct page *page = eb->pages[0];
	struct btrfs_bio_ctrl bio_ctrl = {
		.mirror_num = mirror_num,
	};
	int ret = 0;

	ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags));
	ASSERT(PagePrivate(page));
	io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;

	/* Lock the eb's range in the btree inode's io tree. */
	if (wait == WAIT_NONE) {
		if (!try_lock_extent(io_tree, eb->start, eb->start + eb->len - 1))
			return -EAGAIN;
	} else {
		ret = lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		if (ret < 0)
			return ret;
	}

	ret = 0;
	/* Nothing to read if the eb, the page or the eb's range is uptodate. */
	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags) ||
	    PageUptodate(page) ||
	    btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len)) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		unlock_extent(io_tree, eb->start, eb->start + eb->len - 1);
		return ret;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	/* A subpage eb is read with a single submission. */
	atomic_set(&eb->io_pages, 1);
	/* Re-check the tree ref now that io_pages is set. */
	check_buffer_tree_ref(eb);
	btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len);

	btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len);
	ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl,
				 page, eb->start, eb->len,
				 eb->start - page_offset(page),
				 end_bio_extent_readpage, 0, true);
	if (ret) {
		/*
		 * In the endio function, if we hit something wrong we will
		 * increase the io_pages, so here we need to decrease it for
		 * error path.
		 */
		atomic_dec(&eb->io_pages);
	}
	submit_one_bio(&bio_ctrl);
	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* Wait until EXTENT_LOCKED is gone from the eb's range. */
	wait_extent_bit(io_tree, eb->start, eb->start + eb->len - 1, EXTENT_LOCKED);
	if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		ret = -EIO;
	return ret;
}
6611 
/*
 * Read the pages of @eb that are not uptodate yet.
 *
 * @eb:         extent buffer to read
 * @wait:       WAIT_NONE only trylocks the pages and returns 0 without
 *              submitting anything if a page lock cannot be taken;
 *              WAIT_COMPLETE waits for the submitted reads to finish; other
 *              values submit the reads without waiting
 * @mirror_num: mirror number stored in the bio control structure
 *
 * Subpage extent buffers (nodesize < PAGE_SIZE) are handed over to
 * read_extent_buffer_subpage().
 *
 * Return 0 if the buffer is already uptodate or the reads were submitted
 * (and, for WAIT_COMPLETE, completed with all pages uptodate), -EIO if
 * EXTENT_BUFFER_WRITE_ERR is set or a page is not uptodate after waiting, or
 * the error returned by submit_extent_page().
 */
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
{
	int i;
	struct page *page;
	int err;
	int ret = 0;
	int locked_pages = 0;
	int all_uptodate = 1;
	int num_pages;
	unsigned long num_reads = 0;
	struct btrfs_bio_ctrl bio_ctrl = {
		.mirror_num = mirror_num,
	};

	if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
		return 0;

	/*
	 * We could have had EXTENT_BUFFER_UPTODATE cleared by the write
	 * operation, which could potentially still be in flight.  In this case
	 * we simply want to return an error.
	 */
	if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)))
		return -EIO;

	if (eb->fs_info->nodesize < PAGE_SIZE)
		return read_extent_buffer_subpage(eb, wait, mirror_num);

	num_pages = num_extent_pages(eb);
	/* locked_pages counts the locks taken so unlock_exit can undo them. */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (wait == WAIT_NONE) {
			/*
			 * WAIT_NONE is only utilized by readahead. If we can't
			 * acquire the lock atomically it means either the eb
			 * is being read out or under modification.
			 * Either way the eb will be or has been cached,
			 * readahead can exit safely.
			 */
			if (!trylock_page(page))
				goto unlock_exit;
		} else {
			lock_page(page);
		}
		locked_pages++;
	}
	/*
	 * We need to firstly lock all pages to make sure that
	 * the uptodate bit of our pages won't be affected by
	 * clear_extent_buffer_uptodate().
	 */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		if (!PageUptodate(page)) {
			num_reads++;
			all_uptodate = 0;
		}
	}

	if (all_uptodate) {
		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
		goto unlock_exit;
	}

	clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
	eb->read_mirror = 0;
	/* One pending io per page that still has to be read. */
	atomic_set(&eb->io_pages, num_reads);
	/*
	 * It is possible for release_folio to clear the TREE_REF bit before we
	 * set io_pages. See check_buffer_tree_ref for a more detailed comment.
	 */
	check_buffer_tree_ref(eb);
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];

		if (!PageUptodate(page)) {
			/*
			 * An earlier submission failed: do not submit this
			 * page, just drop its io_pages count and unlock it.
			 */
			if (ret) {
				atomic_dec(&eb->io_pages);
				unlock_page(page);
				continue;
			}

			ClearPageError(page);
			err = submit_extent_page(REQ_OP_READ, NULL,
					 &bio_ctrl, page, page_offset(page),
					 PAGE_SIZE, 0, end_bio_extent_readpage,
					 0, false);
			if (err) {
				/*
				 * We failed to submit the bio so it's the
				 * caller's responsibility to perform cleanup
				 * i.e unlock page/set error bit.
				 */
				ret = err;
				SetPageError(page);
				unlock_page(page);
				atomic_dec(&eb->io_pages);
			}
		} else {
			/* Already uptodate, no read needed for this page. */
			unlock_page(page);
		}
	}

	submit_one_bio(&bio_ctrl);

	if (ret || wait != WAIT_COMPLETE)
		return ret;

	/* Wait for each page to be unlocked, then check that it is uptodate. */
	for (i = 0; i < num_pages; i++) {
		page = eb->pages[i];
		wait_on_page_locked(page);
		if (!PageUptodate(page))
			ret = -EIO;
	}

	return ret;

unlock_exit:
	/* Unlock, in reverse order, exactly the pages we managed to lock. */
	while (locked_pages > 0) {
		locked_pages--;
		page = eb->pages[locked_pages];
		unlock_page(page);
	}
	return ret;
}
6737 
6738 static bool report_eb_range(const struct extent_buffer *eb, unsigned long start,
6739 			    unsigned long len)
6740 {
6741 	btrfs_warn(eb->fs_info,
6742 		"access to eb bytenr %llu len %lu out of range start %lu len %lu",
6743 		eb->start, eb->len, start, len);
6744 	WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
6745 
6746 	return true;
6747 }
6748 
6749 /*
6750  * Check if the [start, start + len) range is valid before reading/writing
6751  * the eb.
6752  * NOTE: @start and @len are offset inside the eb, not logical address.
6753  *
6754  * Caller should not touch the dst/src memory if this function returns error.
6755  */
6756 static inline int check_eb_range(const struct extent_buffer *eb,
6757 				 unsigned long start, unsigned long len)
6758 {
6759 	unsigned long offset;
6760 
6761 	/* start, start + len should not go beyond eb->len nor overflow */
6762 	if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len))
6763 		return report_eb_range(eb, start, len);
6764 
6765 	return false;
6766 }
6767 
6768 void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
6769 			unsigned long start, unsigned long len)
6770 {
6771 	size_t cur;
6772 	size_t offset;
6773 	struct page *page;
6774 	char *kaddr;
6775 	char *dst = (char *)dstv;
6776 	unsigned long i = get_eb_page_index(start);
6777 
6778 	if (check_eb_range(eb, start, len))
6779 		return;
6780 
6781 	offset = get_eb_offset_in_page(eb, start);
6782 
6783 	while (len > 0) {
6784 		page = eb->pages[i];
6785 
6786 		cur = min(len, (PAGE_SIZE - offset));
6787 		kaddr = page_address(page);
6788 		memcpy(dst, kaddr + offset, cur);
6789 
6790 		dst += cur;
6791 		len -= cur;
6792 		offset = 0;
6793 		i++;
6794 	}
6795 }
6796 
6797 int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
6798 				       void __user *dstv,
6799 				       unsigned long start, unsigned long len)
6800 {
6801 	size_t cur;
6802 	size_t offset;
6803 	struct page *page;
6804 	char *kaddr;
6805 	char __user *dst = (char __user *)dstv;
6806 	unsigned long i = get_eb_page_index(start);
6807 	int ret = 0;
6808 
6809 	WARN_ON(start > eb->len);
6810 	WARN_ON(start + len > eb->start + eb->len);
6811 
6812 	offset = get_eb_offset_in_page(eb, start);
6813 
6814 	while (len > 0) {
6815 		page = eb->pages[i];
6816 
6817 		cur = min(len, (PAGE_SIZE - offset));
6818 		kaddr = page_address(page);
6819 		if (copy_to_user_nofault(dst, kaddr + offset, cur)) {
6820 			ret = -EFAULT;
6821 			break;
6822 		}
6823 
6824 		dst += cur;
6825 		len -= cur;
6826 		offset = 0;
6827 		i++;
6828 	}
6829 
6830 	return ret;
6831 }
6832 
6833 int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
6834 			 unsigned long start, unsigned long len)
6835 {
6836 	size_t cur;
6837 	size_t offset;
6838 	struct page *page;
6839 	char *kaddr;
6840 	char *ptr = (char *)ptrv;
6841 	unsigned long i = get_eb_page_index(start);
6842 	int ret = 0;
6843 
6844 	if (check_eb_range(eb, start, len))
6845 		return -EINVAL;
6846 
6847 	offset = get_eb_offset_in_page(eb, start);
6848 
6849 	while (len > 0) {
6850 		page = eb->pages[i];
6851 
6852 		cur = min(len, (PAGE_SIZE - offset));
6853 
6854 		kaddr = page_address(page);
6855 		ret = memcmp(ptr, kaddr + offset, cur);
6856 		if (ret)
6857 			break;
6858 
6859 		ptr += cur;
6860 		len -= cur;
6861 		offset = 0;
6862 		i++;
6863 	}
6864 	return ret;
6865 }
6866 
6867 /*
6868  * Check that the extent buffer is uptodate.
6869  *
6870  * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
6871  * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
6872  */
6873 static void assert_eb_page_uptodate(const struct extent_buffer *eb,
6874 				    struct page *page)
6875 {
6876 	struct btrfs_fs_info *fs_info = eb->fs_info;
6877 
6878 	/*
6879 	 * If we are using the commit root we could potentially clear a page
6880 	 * Uptodate while we're using the extent buffer that we've previously
6881 	 * looked up.  We don't want to complain in this case, as the page was
6882 	 * valid before, we just didn't write it out.  Instead we want to catch
6883 	 * the case where we didn't actually read the block properly, which
6884 	 * would have !PageUptodate && !PageError, as we clear PageError before
6885 	 * reading.
6886 	 */
6887 	if (fs_info->nodesize < PAGE_SIZE) {
6888 		bool uptodate, error;
6889 
6890 		uptodate = btrfs_subpage_test_uptodate(fs_info, page,
6891 						       eb->start, eb->len);
6892 		error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len);
6893 		WARN_ON(!uptodate && !error);
6894 	} else {
6895 		WARN_ON(!PageUptodate(page) && !PageError(page));
6896 	}
6897 }
6898 
6899 void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
6900 		const void *srcv)
6901 {
6902 	char *kaddr;
6903 
6904 	assert_eb_page_uptodate(eb, eb->pages[0]);
6905 	kaddr = page_address(eb->pages[0]) +
6906 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header,
6907 						   chunk_tree_uuid));
6908 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6909 }
6910 
6911 void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
6912 {
6913 	char *kaddr;
6914 
6915 	assert_eb_page_uptodate(eb, eb->pages[0]);
6916 	kaddr = page_address(eb->pages[0]) +
6917 		get_eb_offset_in_page(eb, offsetof(struct btrfs_header, fsid));
6918 	memcpy(kaddr, srcv, BTRFS_FSID_SIZE);
6919 }
6920 
6921 void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
6922 			 unsigned long start, unsigned long len)
6923 {
6924 	size_t cur;
6925 	size_t offset;
6926 	struct page *page;
6927 	char *kaddr;
6928 	char *src = (char *)srcv;
6929 	unsigned long i = get_eb_page_index(start);
6930 
6931 	WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags));
6932 
6933 	if (check_eb_range(eb, start, len))
6934 		return;
6935 
6936 	offset = get_eb_offset_in_page(eb, start);
6937 
6938 	while (len > 0) {
6939 		page = eb->pages[i];
6940 		assert_eb_page_uptodate(eb, page);
6941 
6942 		cur = min(len, PAGE_SIZE - offset);
6943 		kaddr = page_address(page);
6944 		memcpy(kaddr + offset, src, cur);
6945 
6946 		src += cur;
6947 		len -= cur;
6948 		offset = 0;
6949 		i++;
6950 	}
6951 }
6952 
6953 void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
6954 		unsigned long len)
6955 {
6956 	size_t cur;
6957 	size_t offset;
6958 	struct page *page;
6959 	char *kaddr;
6960 	unsigned long i = get_eb_page_index(start);
6961 
6962 	if (check_eb_range(eb, start, len))
6963 		return;
6964 
6965 	offset = get_eb_offset_in_page(eb, start);
6966 
6967 	while (len > 0) {
6968 		page = eb->pages[i];
6969 		assert_eb_page_uptodate(eb, page);
6970 
6971 		cur = min(len, PAGE_SIZE - offset);
6972 		kaddr = page_address(page);
6973 		memset(kaddr + offset, 0, cur);
6974 
6975 		len -= cur;
6976 		offset = 0;
6977 		i++;
6978 	}
6979 }
6980 
6981 void copy_extent_buffer_full(const struct extent_buffer *dst,
6982 			     const struct extent_buffer *src)
6983 {
6984 	int i;
6985 	int num_pages;
6986 
6987 	ASSERT(dst->len == src->len);
6988 
6989 	if (dst->fs_info->nodesize >= PAGE_SIZE) {
6990 		num_pages = num_extent_pages(dst);
6991 		for (i = 0; i < num_pages; i++)
6992 			copy_page(page_address(dst->pages[i]),
6993 				  page_address(src->pages[i]));
6994 	} else {
6995 		size_t src_offset = get_eb_offset_in_page(src, 0);
6996 		size_t dst_offset = get_eb_offset_in_page(dst, 0);
6997 
6998 		ASSERT(src->fs_info->nodesize < PAGE_SIZE);
6999 		memcpy(page_address(dst->pages[0]) + dst_offset,
7000 		       page_address(src->pages[0]) + src_offset,
7001 		       src->len);
7002 	}
7003 }
7004 
7005 void copy_extent_buffer(const struct extent_buffer *dst,
7006 			const struct extent_buffer *src,
7007 			unsigned long dst_offset, unsigned long src_offset,
7008 			unsigned long len)
7009 {
7010 	u64 dst_len = dst->len;
7011 	size_t cur;
7012 	size_t offset;
7013 	struct page *page;
7014 	char *kaddr;
7015 	unsigned long i = get_eb_page_index(dst_offset);
7016 
7017 	if (check_eb_range(dst, dst_offset, len) ||
7018 	    check_eb_range(src, src_offset, len))
7019 		return;
7020 
7021 	WARN_ON(src->len != dst_len);
7022 
7023 	offset = get_eb_offset_in_page(dst, dst_offset);
7024 
7025 	while (len > 0) {
7026 		page = dst->pages[i];
7027 		assert_eb_page_uptodate(dst, page);
7028 
7029 		cur = min(len, (unsigned long)(PAGE_SIZE - offset));
7030 
7031 		kaddr = page_address(page);
7032 		read_extent_buffer(src, kaddr + offset, src_offset, cur);
7033 
7034 		src_offset += cur;
7035 		len -= cur;
7036 		offset = 0;
7037 		i++;
7038 	}
7039 }
7040 
7041 /*
7042  * eb_bitmap_offset() - calculate the page and offset of the byte containing the
7043  * given bit number
7044  * @eb: the extent buffer
7045  * @start: offset of the bitmap item in the extent buffer
7046  * @nr: bit number
7047  * @page_index: return index of the page in the extent buffer that contains the
7048  * given bit number
7049  * @page_offset: return offset into the page given by page_index
7050  *
7051  * This helper hides the ugliness of finding the byte in an extent buffer which
7052  * contains a given bit.
7053  */
7054 static inline void eb_bitmap_offset(const struct extent_buffer *eb,
7055 				    unsigned long start, unsigned long nr,
7056 				    unsigned long *page_index,
7057 				    size_t *page_offset)
7058 {
7059 	size_t byte_offset = BIT_BYTE(nr);
7060 	size_t offset;
7061 
7062 	/*
7063 	 * The byte we want is the offset of the extent buffer + the offset of
7064 	 * the bitmap item in the extent buffer + the offset of the byte in the
7065 	 * bitmap item.
7066 	 */
7067 	offset = start + offset_in_page(eb->start) + byte_offset;
7068 
7069 	*page_index = offset >> PAGE_SHIFT;
7070 	*page_offset = offset_in_page(offset);
7071 }
7072 
7073 /**
7074  * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
7075  * @eb: the extent buffer
7076  * @start: offset of the bitmap item in the extent buffer
7077  * @nr: bit number to test
7078  */
7079 int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
7080 			   unsigned long nr)
7081 {
7082 	u8 *kaddr;
7083 	struct page *page;
7084 	unsigned long i;
7085 	size_t offset;
7086 
7087 	eb_bitmap_offset(eb, start, nr, &i, &offset);
7088 	page = eb->pages[i];
7089 	assert_eb_page_uptodate(eb, page);
7090 	kaddr = page_address(page);
7091 	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
7092 }
7093 
7094 /**
7095  * extent_buffer_bitmap_set - set an area of a bitmap
7096  * @eb: the extent buffer
7097  * @start: offset of the bitmap item in the extent buffer
7098  * @pos: bit number of the first bit
7099  * @len: number of bits to set
7100  */
7101 void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long start,
7102 			      unsigned long pos, unsigned long len)
7103 {
7104 	u8 *kaddr;
7105 	struct page *page;
7106 	unsigned long i;
7107 	size_t offset;
7108 	const unsigned int size = pos + len;
7109 	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7110 	u8 mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
7111 
7112 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7113 	page = eb->pages[i];
7114 	assert_eb_page_uptodate(eb, page);
7115 	kaddr = page_address(page);
7116 
7117 	while (len >= bits_to_set) {
7118 		kaddr[offset] |= mask_to_set;
7119 		len -= bits_to_set;
7120 		bits_to_set = BITS_PER_BYTE;
7121 		mask_to_set = ~0;
7122 		if (++offset >= PAGE_SIZE && len > 0) {
7123 			offset = 0;
7124 			page = eb->pages[++i];
7125 			assert_eb_page_uptodate(eb, page);
7126 			kaddr = page_address(page);
7127 		}
7128 	}
7129 	if (len) {
7130 		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
7131 		kaddr[offset] |= mask_to_set;
7132 	}
7133 }
7134 
7135 
7136 /**
7137  * extent_buffer_bitmap_clear - clear an area of a bitmap
7138  * @eb: the extent buffer
7139  * @start: offset of the bitmap item in the extent buffer
7140  * @pos: bit number of the first bit
7141  * @len: number of bits to clear
7142  */
7143 void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
7144 				unsigned long start, unsigned long pos,
7145 				unsigned long len)
7146 {
7147 	u8 *kaddr;
7148 	struct page *page;
7149 	unsigned long i;
7150 	size_t offset;
7151 	const unsigned int size = pos + len;
7152 	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);
7153 	u8 mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
7154 
7155 	eb_bitmap_offset(eb, start, pos, &i, &offset);
7156 	page = eb->pages[i];
7157 	assert_eb_page_uptodate(eb, page);
7158 	kaddr = page_address(page);
7159 
7160 	while (len >= bits_to_clear) {
7161 		kaddr[offset] &= ~mask_to_clear;
7162 		len -= bits_to_clear;
7163 		bits_to_clear = BITS_PER_BYTE;
7164 		mask_to_clear = ~0;
7165 		if (++offset >= PAGE_SIZE && len > 0) {
7166 			offset = 0;
7167 			page = eb->pages[++i];
7168 			assert_eb_page_uptodate(eb, page);
7169 			kaddr = page_address(page);
7170 		}
7171 	}
7172 	if (len) {
7173 		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
7174 		kaddr[offset] &= ~mask_to_clear;
7175 	}
7176 }
7177 
/*
 * Tell whether two @len byte ranges starting at @src and @dst share any byte.
 * They do exactly when their start offsets are less than @len apart.
 */
static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
{
	unsigned long low = src;
	unsigned long high = dst;

	if (low > high) {
		low = dst;
		high = src;
	}

	return high - low < len;
}
7183 
/*
 * Copy @len bytes from @src_off inside @src_page to @dst_off inside @dst_page.
 *
 * memmove() is used only when both pages are the same page and the two byte
 * ranges overlap; every other case goes through memcpy().
 */
static void copy_pages(struct page *dst_page, struct page *src_page,
		       unsigned long dst_off, unsigned long src_off,
		       unsigned long len)
{
	char *to = page_address(dst_page);
	char *from = to;
	int overlapping = 0;

	if (dst_page == src_page)
		overlapping = areas_overlap(src_off, dst_off, len) ? 1 : 0;
	else
		from = page_address(src_page);

	if (overlapping) {
		memmove(to + dst_off, from + src_off, len);
		return;
	}

	memcpy(to + dst_off, from + src_off, len);
}
7205 
7206 void memcpy_extent_buffer(const struct extent_buffer *dst,
7207 			  unsigned long dst_offset, unsigned long src_offset,
7208 			  unsigned long len)
7209 {
7210 	size_t cur;
7211 	size_t dst_off_in_page;
7212 	size_t src_off_in_page;
7213 	unsigned long dst_i;
7214 	unsigned long src_i;
7215 
7216 	if (check_eb_range(dst, dst_offset, len) ||
7217 	    check_eb_range(dst, src_offset, len))
7218 		return;
7219 
7220 	while (len > 0) {
7221 		dst_off_in_page = get_eb_offset_in_page(dst, dst_offset);
7222 		src_off_in_page = get_eb_offset_in_page(dst, src_offset);
7223 
7224 		dst_i = get_eb_page_index(dst_offset);
7225 		src_i = get_eb_page_index(src_offset);
7226 
7227 		cur = min(len, (unsigned long)(PAGE_SIZE -
7228 					       src_off_in_page));
7229 		cur = min_t(unsigned long, cur,
7230 			(unsigned long)(PAGE_SIZE - dst_off_in_page));
7231 
7232 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7233 			   dst_off_in_page, src_off_in_page, cur);
7234 
7235 		src_offset += cur;
7236 		dst_offset += cur;
7237 		len -= cur;
7238 	}
7239 }
7240 
7241 void memmove_extent_buffer(const struct extent_buffer *dst,
7242 			   unsigned long dst_offset, unsigned long src_offset,
7243 			   unsigned long len)
7244 {
7245 	size_t cur;
7246 	size_t dst_off_in_page;
7247 	size_t src_off_in_page;
7248 	unsigned long dst_end = dst_offset + len - 1;
7249 	unsigned long src_end = src_offset + len - 1;
7250 	unsigned long dst_i;
7251 	unsigned long src_i;
7252 
7253 	if (check_eb_range(dst, dst_offset, len) ||
7254 	    check_eb_range(dst, src_offset, len))
7255 		return;
7256 	if (dst_offset < src_offset) {
7257 		memcpy_extent_buffer(dst, dst_offset, src_offset, len);
7258 		return;
7259 	}
7260 	while (len > 0) {
7261 		dst_i = get_eb_page_index(dst_end);
7262 		src_i = get_eb_page_index(src_end);
7263 
7264 		dst_off_in_page = get_eb_offset_in_page(dst, dst_end);
7265 		src_off_in_page = get_eb_offset_in_page(dst, src_end);
7266 
7267 		cur = min_t(unsigned long, len, src_off_in_page + 1);
7268 		cur = min(cur, dst_off_in_page + 1);
7269 		copy_pages(dst->pages[dst_i], dst->pages[src_i],
7270 			   dst_off_in_page - cur + 1,
7271 			   src_off_in_page - cur + 1, cur);
7272 
7273 		dst_end -= cur;
7274 		src_end -= cur;
7275 		len -= cur;
7276 	}
7277 }
7278 
7279 #define GANG_LOOKUP_SIZE	16
7280 static struct extent_buffer *get_next_extent_buffer(
7281 		struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr)
7282 {
7283 	struct extent_buffer *gang[GANG_LOOKUP_SIZE];
7284 	struct extent_buffer *found = NULL;
7285 	u64 page_start = page_offset(page);
7286 	u64 cur = page_start;
7287 
7288 	ASSERT(in_range(bytenr, page_start, PAGE_SIZE));
7289 	lockdep_assert_held(&fs_info->buffer_lock);
7290 
7291 	while (cur < page_start + PAGE_SIZE) {
7292 		int ret;
7293 		int i;
7294 
7295 		ret = radix_tree_gang_lookup(&fs_info->buffer_radix,
7296 				(void **)gang, cur >> fs_info->sectorsize_bits,
7297 				min_t(unsigned int, GANG_LOOKUP_SIZE,
7298 				      PAGE_SIZE / fs_info->nodesize));
7299 		if (ret == 0)
7300 			goto out;
7301 		for (i = 0; i < ret; i++) {
7302 			/* Already beyond page end */
7303 			if (gang[i]->start >= page_start + PAGE_SIZE)
7304 				goto out;
7305 			/* Found one */
7306 			if (gang[i]->start >= bytenr) {
7307 				found = gang[i];
7308 				goto out;
7309 			}
7310 		}
7311 		cur = gang[ret - 1]->start + gang[ret - 1]->len;
7312 	}
7313 out:
7314 	return found;
7315 }
7316 
7317 static int try_release_subpage_extent_buffer(struct page *page)
7318 {
7319 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7320 	u64 cur = page_offset(page);
7321 	const u64 end = page_offset(page) + PAGE_SIZE;
7322 	int ret;
7323 
7324 	while (cur < end) {
7325 		struct extent_buffer *eb = NULL;
7326 
7327 		/*
7328 		 * Unlike try_release_extent_buffer() which uses page->private
7329 		 * to grab buffer, for subpage case we rely on radix tree, thus
7330 		 * we need to ensure radix tree consistency.
7331 		 *
7332 		 * We also want an atomic snapshot of the radix tree, thus go
7333 		 * with spinlock rather than RCU.
7334 		 */
7335 		spin_lock(&fs_info->buffer_lock);
7336 		eb = get_next_extent_buffer(fs_info, page, cur);
7337 		if (!eb) {
7338 			/* No more eb in the page range after or at cur */
7339 			spin_unlock(&fs_info->buffer_lock);
7340 			break;
7341 		}
7342 		cur = eb->start + eb->len;
7343 
7344 		/*
7345 		 * The same as try_release_extent_buffer(), to ensure the eb
7346 		 * won't disappear out from under us.
7347 		 */
7348 		spin_lock(&eb->refs_lock);
7349 		if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7350 			spin_unlock(&eb->refs_lock);
7351 			spin_unlock(&fs_info->buffer_lock);
7352 			break;
7353 		}
7354 		spin_unlock(&fs_info->buffer_lock);
7355 
7356 		/*
7357 		 * If tree ref isn't set then we know the ref on this eb is a
7358 		 * real ref, so just return, this eb will likely be freed soon
7359 		 * anyway.
7360 		 */
7361 		if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7362 			spin_unlock(&eb->refs_lock);
7363 			break;
7364 		}
7365 
7366 		/*
7367 		 * Here we don't care about the return value, we will always
7368 		 * check the page private at the end.  And
7369 		 * release_extent_buffer() will release the refs_lock.
7370 		 */
7371 		release_extent_buffer(eb);
7372 	}
7373 	/*
7374 	 * Finally to check if we have cleared page private, as if we have
7375 	 * released all ebs in the page, the page private should be cleared now.
7376 	 */
7377 	spin_lock(&page->mapping->private_lock);
7378 	if (!PagePrivate(page))
7379 		ret = 1;
7380 	else
7381 		ret = 0;
7382 	spin_unlock(&page->mapping->private_lock);
7383 	return ret;
7384 
7385 }
7386 
7387 int try_release_extent_buffer(struct page *page)
7388 {
7389 	struct extent_buffer *eb;
7390 
7391 	if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
7392 		return try_release_subpage_extent_buffer(page);
7393 
7394 	/*
7395 	 * We need to make sure nobody is changing page->private, as we rely on
7396 	 * page->private as the pointer to extent buffer.
7397 	 */
7398 	spin_lock(&page->mapping->private_lock);
7399 	if (!PagePrivate(page)) {
7400 		spin_unlock(&page->mapping->private_lock);
7401 		return 1;
7402 	}
7403 
7404 	eb = (struct extent_buffer *)page->private;
7405 	BUG_ON(!eb);
7406 
7407 	/*
7408 	 * This is a little awful but should be ok, we need to make sure that
7409 	 * the eb doesn't disappear out from under us while we're looking at
7410 	 * this page.
7411 	 */
7412 	spin_lock(&eb->refs_lock);
7413 	if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
7414 		spin_unlock(&eb->refs_lock);
7415 		spin_unlock(&page->mapping->private_lock);
7416 		return 0;
7417 	}
7418 	spin_unlock(&page->mapping->private_lock);
7419 
7420 	/*
7421 	 * If tree ref isn't set then we know the ref on this eb is a real ref,
7422 	 * so just return, this page will likely be freed soon anyway.
7423 	 */
7424 	if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
7425 		spin_unlock(&eb->refs_lock);
7426 		return 0;
7427 	}
7428 
7429 	return release_extent_buffer(eb);
7430 }
7431 
7432 /*
7433  * btrfs_readahead_tree_block - attempt to readahead a child block
7434  * @fs_info:	the fs_info
7435  * @bytenr:	bytenr to read
7436  * @owner_root: objectid of the root that owns this eb
7437  * @gen:	generation for the uptodate check, can be 0
7438  * @level:	level for the eb
7439  *
7440  * Attempt to readahead a tree block at @bytenr.  If @gen is 0 then we do a
7441  * normal uptodate check of the eb, without checking the generation.  If we have
7442  * to read the block we will not block on anything.
7443  */
7444 void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
7445 				u64 bytenr, u64 owner_root, u64 gen, int level)
7446 {
7447 	struct extent_buffer *eb;
7448 	int ret;
7449 
7450 	eb = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
7451 	if (IS_ERR(eb))
7452 		return;
7453 
7454 	if (btrfs_buffer_uptodate(eb, gen, 1)) {
7455 		free_extent_buffer(eb);
7456 		return;
7457 	}
7458 
7459 	ret = read_extent_buffer_pages(eb, WAIT_NONE, 0);
7460 	if (ret < 0)
7461 		free_extent_buffer_stale(eb);
7462 	else
7463 		free_extent_buffer(eb);
7464 }
7465 
7466 /*
7467  * btrfs_readahead_node_child - readahead a node's child block
7468  * @node:	parent node we're reading from
7469  * @slot:	slot in the parent node for the child we want to read
7470  *
7471  * A helper for btrfs_readahead_tree_block, we simply read the bytenr pointed at
7472  * the slot in the node provided.
7473  */
7474 void btrfs_readahead_node_child(struct extent_buffer *node, int slot)
7475 {
7476 	btrfs_readahead_tree_block(node->fs_info,
7477 				   btrfs_node_blockptr(node, slot),
7478 				   btrfs_header_owner(node),
7479 				   btrfs_node_ptr_generation(node, slot),
7480 				   btrfs_header_level(node) - 1);
7481 }
7482