xref: /openbmc/linux/fs/btrfs/file.c (revision 48cc39c3)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/writeback.h>
14 #include <linux/compat.h>
15 #include <linux/slab.h>
16 #include <linux/btrfs.h>
17 #include <linux/uio.h>
18 #include <linux/iversion.h>
19 #include <linux/fsverity.h>
20 #include "ctree.h"
21 #include "disk-io.h"
22 #include "transaction.h"
23 #include "btrfs_inode.h"
24 #include "print-tree.h"
25 #include "tree-log.h"
26 #include "locking.h"
27 #include "volumes.h"
28 #include "qgroup.h"
29 #include "compression.h"
30 #include "delalloc-space.h"
31 #include "reflink.h"
32 #include "subpage.h"
33 
34 static struct kmem_cache *btrfs_inode_defrag_cachep;
35 /*
36  * When auto defrag is enabled we queue up these
37  * defrag structs to remember which inodes need
38  * defragging passes.
39  */
40 struct inode_defrag {
41 	struct rb_node rb_node;
42 	/* objectid */
43 	u64 ino;
44 	/*
45 	 * transid where the defrag was added, we search for
46 	 * extents newer than this
47 	 */
48 	u64 transid;
49 
50 	/* root objectid */
51 	u64 root;
52 
53 	/* last offset we were able to defrag */
54 	u64 last_offset;
55 
56 	/* if we've wrapped around back to zero once already */
57 	int cycled;
58 };
59 
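/*
 * Defrag records are ordered by root objectid first and then by inode
 * number, so the rbtree of pending defrag work can be searched by the
 * (root, ino) pair.
 */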
60 static int __compare_inode_defrag(struct inode_defrag *defrag1,
61 				  struct inode_defrag *defrag2)
62 {
63 	if (defrag1->root > defrag2->root)
64 		return 1;
65 	else if (defrag1->root < defrag2->root)
66 		return -1;
67 	else if (defrag1->ino > defrag2->ino)
68 		return 1;
69 	else if (defrag1->ino < defrag2->ino)
70 		return -1;
71 	else
72 		return 0;
73 }
74 
75 /* Insert a record for an inode into the defrag tree.  The
76  * fs_info->defrag_inodes_lock must already be held.
77  *
78  * If you're inserting a record for an older transid than an
79  * existing record, the transid already in the tree is lowered.
80  *
81  * If an existing record is found, -EEXIST is returned and the
82  * caller is expected to free the defrag item it passed in.
83  */
84 static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
85 				    struct inode_defrag *defrag)
86 {
87 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
88 	struct inode_defrag *entry;
89 	struct rb_node **p;
90 	struct rb_node *parent = NULL;
91 	int ret;
92 
93 	p = &fs_info->defrag_inodes.rb_node;
94 	while (*p) {
95 		parent = *p;
96 		entry = rb_entry(parent, struct inode_defrag, rb_node);
97 
98 		ret = __compare_inode_defrag(defrag, entry);
99 		if (ret < 0)
100 			p = &parent->rb_left;
101 		else if (ret > 0)
102 			p = &parent->rb_right;
103 		else {
104 			/* if we're reinserting an entry for
105 			 * an old defrag run, make sure to
106 			 * lower the transid of our existing record
107 			 */
108 			if (defrag->transid < entry->transid)
109 				entry->transid = defrag->transid;
110 			if (defrag->last_offset > entry->last_offset)
111 				entry->last_offset = defrag->last_offset;
112 			return -EEXIST;
113 		}
114 	}
115 	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
116 	rb_link_node(&defrag->rb_node, parent, p);
117 	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
118 	return 0;
119 }
120 
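/*
 * Auto defrag is only performed while the autodefrag mount option is set
 * and the filesystem is not being closed.
 */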
121 static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
122 {
123 	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
124 		return 0;
125 
126 	if (btrfs_fs_closing(fs_info))
127 		return 0;
128 
129 	return 1;
130 }
131 
132 /*
133  * insert a defrag record for this inode if auto defrag is
134  * enabled
135  */
136 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
137 			   struct btrfs_inode *inode)
138 {
139 	struct btrfs_root *root = inode->root;
140 	struct btrfs_fs_info *fs_info = root->fs_info;
141 	struct inode_defrag *defrag;
142 	u64 transid;
143 	int ret;
144 
145 	if (!__need_auto_defrag(fs_info))
146 		return 0;
147 
148 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
149 		return 0;
150 
151 	if (trans)
152 		transid = trans->transid;
153 	else
154 		transid = inode->root->last_trans;
155 
156 	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
157 	if (!defrag)
158 		return -ENOMEM;
159 
160 	defrag->ino = btrfs_ino(inode);
161 	defrag->transid = transid;
162 	defrag->root = root->root_key.objectid;
163 
164 	spin_lock(&fs_info->defrag_inodes_lock);
165 	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
166 		/*
167 		 * If we set the IN_DEFRAG flag and the inode is evicted from
168 		 * memory and then re-read, the new in-memory inode won't have
169 		 * the flag set. In that case, we may find an existing defrag.
170 		 */
171 		ret = __btrfs_add_inode_defrag(inode, defrag);
172 		if (ret)
173 			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
174 	} else {
175 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
176 	}
177 	spin_unlock(&fs_info->defrag_inodes_lock);
178 	return 0;
179 }
180 
181 /*
182  * Requeue the defrag object. If there is a defrag object that points to
183  * the same inode in the tree, we will merge them together (by
184  * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
185  */
186 static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
187 				       struct inode_defrag *defrag)
188 {
189 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
190 	int ret;
191 
192 	if (!__need_auto_defrag(fs_info))
193 		goto out;
194 
195 	/*
196 	 * Here we don't check the IN_DEFRAG flag, because we need to
197 	 * merge the records together.
198 	 */
199 	spin_lock(&fs_info->defrag_inodes_lock);
200 	ret = __btrfs_add_inode_defrag(inode, defrag);
201 	spin_unlock(&fs_info->defrag_inodes_lock);
202 	if (ret)
203 		goto out;
204 	return;
205 out:
206 	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
207 }
208 
209 /*
210  * Pick the defrag record for the inode we want; if it doesn't exist, we will
211  * get the next one. The returned record is removed from the tree.
212  */
213 static struct inode_defrag *
214 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
215 {
216 	struct inode_defrag *entry = NULL;
217 	struct inode_defrag tmp;
218 	struct rb_node *p;
219 	struct rb_node *parent = NULL;
220 	int ret;
221 
222 	tmp.ino = ino;
223 	tmp.root = root;
224 
225 	spin_lock(&fs_info->defrag_inodes_lock);
226 	p = fs_info->defrag_inodes.rb_node;
227 	while (p) {
228 		parent = p;
229 		entry = rb_entry(parent, struct inode_defrag, rb_node);
230 
231 		ret = __compare_inode_defrag(&tmp, entry);
232 		if (ret < 0)
233 			p = parent->rb_left;
234 		else if (ret > 0)
235 			p = parent->rb_right;
236 		else
237 			goto out;
238 	}
239 
240 	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
241 		parent = rb_next(parent);
242 		if (parent)
243 			entry = rb_entry(parent, struct inode_defrag, rb_node);
244 		else
245 			entry = NULL;
246 	}
247 out:
248 	if (entry)
249 		rb_erase(parent, &fs_info->defrag_inodes);
250 	spin_unlock(&fs_info->defrag_inodes_lock);
251 	return entry;
252 }
253 
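/*
 * Free all queued defrag records, emptying the defrag_inodes tree. The
 * spinlock is dropped periodically via cond_resched_lock() so a large
 * tree doesn't monopolize the CPU.
 */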
254 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
255 {
256 	struct inode_defrag *defrag;
257 	struct rb_node *node;
258 
259 	spin_lock(&fs_info->defrag_inodes_lock);
260 	node = rb_first(&fs_info->defrag_inodes);
261 	while (node) {
262 		rb_erase(node, &fs_info->defrag_inodes);
263 		defrag = rb_entry(node, struct inode_defrag, rb_node);
264 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
265 
266 		cond_resched_lock(&fs_info->defrag_inodes_lock);
267 
268 		node = rb_first(&fs_info->defrag_inodes);
269 	}
270 	spin_unlock(&fs_info->defrag_inodes_lock);
271 }
272 
273 #define BTRFS_DEFRAG_BATCH	1024
274 
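/*
 * Defrag one queued inode in chunks of up to BTRFS_DEFRAG_BATCH extents.
 * The record is requeued if a full batch was processed (more work likely
 * remains) or if we still have to wrap around to the start of the file.
 */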
275 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
276 				    struct inode_defrag *defrag)
277 {
278 	struct btrfs_root *inode_root;
279 	struct inode *inode;
280 	struct btrfs_ioctl_defrag_range_args range;
281 	int num_defrag;
282 	int ret;
283 
284 	/* get the inode */
285 	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
286 	if (IS_ERR(inode_root)) {
287 		ret = PTR_ERR(inode_root);
288 		goto cleanup;
289 	}
290 
291 	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
292 	btrfs_put_root(inode_root);
293 	if (IS_ERR(inode)) {
294 		ret = PTR_ERR(inode);
295 		goto cleanup;
296 	}
297 
298 	/* do a chunk of defrag */
299 	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
300 	memset(&range, 0, sizeof(range));
301 	range.len = (u64)-1;
302 	range.start = defrag->last_offset;
303 
304 	sb_start_write(fs_info->sb);
305 	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
306 				       BTRFS_DEFRAG_BATCH);
307 	sb_end_write(fs_info->sb);
308 	/*
309 	 * if we filled the whole defrag batch, there
310 	 * must be more work to do.  Queue this defrag
311 	 * again
312 	 */
313 	if (num_defrag == BTRFS_DEFRAG_BATCH) {
314 		defrag->last_offset = range.start;
315 		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
316 	} else if (defrag->last_offset && !defrag->cycled) {
317 		/*
318 		 * we didn't fill our defrag batch, but
319 		 * we didn't start at zero.  Make sure we loop
320 		 * around to the start of the file.
321 		 */
322 		defrag->last_offset = 0;
323 		defrag->cycled = 1;
324 		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
325 	} else {
326 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
327 	}
328 
329 	iput(inode);
330 	return 0;
331 cleanup:
332 	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
333 	return ret;
334 }
335 
336 /*
337  * run through the list of inodes in the FS that need
338  * defragging
339  */
340 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
341 {
342 	struct inode_defrag *defrag;
343 	u64 first_ino = 0;
344 	u64 root_objectid = 0;
345 
346 	atomic_inc(&fs_info->defrag_running);
347 	while (1) {
348 		/* Pause the auto defragger. */
349 		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
350 			     &fs_info->fs_state))
351 			break;
352 
353 		if (!__need_auto_defrag(fs_info))
354 			break;
355 
356 		/* find an inode to defrag */
357 		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
358 						 first_ino);
359 		if (!defrag) {
360 			if (root_objectid || first_ino) {
361 				root_objectid = 0;
362 				first_ino = 0;
363 				continue;
364 			} else {
365 				break;
366 			}
367 		}
368 
369 		first_ino = defrag->ino + 1;
370 		root_objectid = defrag->root;
371 
372 		__btrfs_run_defrag_inode(fs_info, defrag);
373 	}
374 	atomic_dec(&fs_info->defrag_running);
375 
376 	/*
377 	 * during unmount, we use the transaction_wait queue to
378 	 * wait for the defragger to stop
379 	 */
380 	wake_up(&fs_info->transaction_wait);
381 	return 0;
382 }
383 
384 /* simple helper to fault in pages and copy.  This should go away
385  * and be replaced with calls into generic code.
386  */
387 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
388 					 struct page **prepared_pages,
389 					 struct iov_iter *i)
390 {
391 	size_t copied = 0;
392 	size_t total_copied = 0;
393 	int pg = 0;
394 	int offset = offset_in_page(pos);
395 
396 	while (write_bytes > 0) {
397 		size_t count = min_t(size_t,
398 				     PAGE_SIZE - offset, write_bytes);
399 		struct page *page = prepared_pages[pg];
400 		/*
401 		 * Copy data from userspace to the current page
402 		 */
403 		copied = copy_page_from_iter_atomic(page, offset, count, i);
404 
405 		/* Flush processor's dcache for this page */
406 		flush_dcache_page(page);
407 
408 		/*
409 		 * if we get a partial write, we can end up with
410 		 * partially up to date pages.  These add
411 		 * a lot of complexity, so make sure they don't
412 		 * happen by forcing this copy to be retried.
413 		 *
414 		 * The rest of the btrfs_file_write code will fall
415 		 * back to page at a time copies after we return 0.
416 		 */
417 		if (unlikely(copied < count)) {
418 			if (!PageUptodate(page)) {
419 				iov_iter_revert(i, copied);
420 				copied = 0;
421 			}
422 			if (!copied)
423 				break;
424 		}
425 
426 		write_bytes -= copied;
427 		total_copied += copied;
428 		offset += copied;
429 		if (offset == PAGE_SIZE) {
430 			pg++;
431 			offset = 0;
432 		}
433 	}
434 	return total_copied;
435 }
436 
437 /*
438  * unlocks pages after btrfs_file_write is done with them
439  */
440 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
441 {
442 	size_t i;
443 	for (i = 0; i < num_pages; i++) {
444 		/* PageChecked is some magic around finding pages that
445 		 * have been modified without going through btrfs_set_page_dirty;
446 		 * clear it here. There should be no need to mark the pages
447 		 * accessed, as prepare_pages() should have already marked them
448 		 * accessed via find_or_create_page().
449 		 */
450 		ClearPageChecked(pages[i]);
451 		unlock_page(pages[i]);
452 		put_page(pages[i]);
453 	}
454 }
455 
456 /*
457  * After btrfs_copy_from_user(), update the following things for delalloc:
458  * - Mark newly dirtied pages as DELALLOC in the io tree.
459  *   Used to advise which range is to be written back.
460  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
461  * - Update inode size for past EOF write
462  */
463 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
464 		      size_t num_pages, loff_t pos, size_t write_bytes,
465 		      struct extent_state **cached, bool noreserve)
466 {
467 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
468 	int err = 0;
469 	int i;
470 	u64 num_bytes;
471 	u64 start_pos;
472 	u64 end_of_last_block;
473 	u64 end_pos = pos + write_bytes;
474 	loff_t isize = i_size_read(&inode->vfs_inode);
475 	unsigned int extra_bits = 0;
476 
477 	if (write_bytes == 0)
478 		return 0;
479 
480 	if (noreserve)
481 		extra_bits |= EXTENT_NORESERVE;
482 
483 	start_pos = round_down(pos, fs_info->sectorsize);
484 	num_bytes = round_up(write_bytes + pos - start_pos,
485 			     fs_info->sectorsize);
486 	ASSERT(num_bytes <= U32_MAX);
487 
488 	end_of_last_block = start_pos + num_bytes - 1;
489 
490 	/*
491 	 * The pages may have already been dirty, clear out old accounting so
492 	 * we can set things up properly
493 	 */
494 	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
495 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
496 			 0, 0, cached);
497 
498 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
499 					extra_bits, cached);
500 	if (err)
501 		return err;
502 
503 	for (i = 0; i < num_pages; i++) {
504 		struct page *p = pages[i];
505 
506 		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
507 		ClearPageChecked(p);
508 		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
509 	}
510 
511 	/*
512 	 * We've only changed i_size in memory, and we haven't updated
513 	 * the disk i_size.  There is no need to log the inode
514 	 * at this time.
515 	 */
516 	if (end_pos > isize)
517 		i_size_write(&inode->vfs_inode, end_pos);
518 	return 0;
519 }
520 
521 /*
522  * this drops all the extents in the cache that intersect the range
523  * [start, end].  Existing extents are split as required.
524  */
525 void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
526 			     int skip_pinned)
527 {
528 	struct extent_map *em;
529 	struct extent_map *split = NULL;
530 	struct extent_map *split2 = NULL;
531 	struct extent_map_tree *em_tree = &inode->extent_tree;
532 	u64 len = end - start + 1;
533 	u64 gen;
534 	int ret;
535 	int testend = 1;
536 	unsigned long flags;
537 	int compressed = 0;
538 	bool modified;
539 
540 	WARN_ON(end < start);
541 	if (end == (u64)-1) {
542 		len = (u64)-1;
543 		testend = 0;
544 	}
545 	while (1) {
546 		int no_splits = 0;
547 
548 		modified = false;
549 		if (!split)
550 			split = alloc_extent_map();
551 		if (!split2)
552 			split2 = alloc_extent_map();
553 		if (!split || !split2)
554 			no_splits = 1;
555 
556 		write_lock(&em_tree->lock);
557 		em = lookup_extent_mapping(em_tree, start, len);
558 		if (!em) {
559 			write_unlock(&em_tree->lock);
560 			break;
561 		}
562 		flags = em->flags;
563 		gen = em->generation;
564 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
565 			if (testend && em->start + em->len >= start + len) {
566 				free_extent_map(em);
567 				write_unlock(&em_tree->lock);
568 				break;
569 			}
570 			start = em->start + em->len;
571 			if (testend)
572 				len = start + len - (em->start + em->len);
573 			free_extent_map(em);
574 			write_unlock(&em_tree->lock);
575 			continue;
576 		}
577 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
578 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
579 		clear_bit(EXTENT_FLAG_LOGGING, &flags);
580 		modified = !list_empty(&em->list);
581 		if (no_splits)
582 			goto next;
583 
584 		if (em->start < start) {
585 			split->start = em->start;
586 			split->len = start - em->start;
587 
588 			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
589 				split->orig_start = em->orig_start;
590 				split->block_start = em->block_start;
591 
592 				if (compressed)
593 					split->block_len = em->block_len;
594 				else
595 					split->block_len = split->len;
596 				split->orig_block_len = max(split->block_len,
597 						em->orig_block_len);
598 				split->ram_bytes = em->ram_bytes;
599 			} else {
600 				split->orig_start = split->start;
601 				split->block_len = 0;
602 				split->block_start = em->block_start;
603 				split->orig_block_len = 0;
604 				split->ram_bytes = split->len;
605 			}
606 
607 			split->generation = gen;
608 			split->flags = flags;
609 			split->compress_type = em->compress_type;
610 			replace_extent_mapping(em_tree, em, split, modified);
611 			free_extent_map(split);
612 			split = split2;
613 			split2 = NULL;
614 		}
615 		if (testend && em->start + em->len > start + len) {
616 			u64 diff = start + len - em->start;
617 
618 			split->start = start + len;
619 			split->len = em->start + em->len - (start + len);
620 			split->flags = flags;
621 			split->compress_type = em->compress_type;
622 			split->generation = gen;
623 
624 			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
625 				split->orig_block_len = max(em->block_len,
626 						    em->orig_block_len);
627 
628 				split->ram_bytes = em->ram_bytes;
629 				if (compressed) {
630 					split->block_len = em->block_len;
631 					split->block_start = em->block_start;
632 					split->orig_start = em->orig_start;
633 				} else {
634 					split->block_len = split->len;
635 					split->block_start = em->block_start
636 						+ diff;
637 					split->orig_start = em->orig_start;
638 				}
639 			} else {
640 				split->ram_bytes = split->len;
641 				split->orig_start = split->start;
642 				split->block_len = 0;
643 				split->block_start = em->block_start;
644 				split->orig_block_len = 0;
645 			}
646 
647 			if (extent_map_in_tree(em)) {
648 				replace_extent_mapping(em_tree, em, split,
649 						       modified);
650 			} else {
651 				ret = add_extent_mapping(em_tree, split,
652 							 modified);
653 				ASSERT(ret == 0); /* Logic error */
654 			}
655 			free_extent_map(split);
656 			split = NULL;
657 		}
658 next:
659 		if (extent_map_in_tree(em))
660 			remove_extent_mapping(em_tree, em);
661 		write_unlock(&em_tree->lock);
662 
663 		/* once for us */
664 		free_extent_map(em);
665 		/* once for the tree */
666 		free_extent_map(em);
667 	}
668 	if (split)
669 		free_extent_map(split);
670 	if (split2)
671 		free_extent_map(split2);
672 }
673 
674 /*
675  * this is very complex, but the basic idea is to drop all extents
676  * in the range start - end.  hint_block is filled in with a block number
677  * that would be a good hint to the block allocator for this file.
678  *
679  * If an extent intersects the range but is not entirely inside the range
680  * it is either truncated or split.  Anything entirely inside the range
681  * is deleted from the tree.
682  *
683  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
684  * to deal with that. We set the field 'bytes_found' of the arguments structure
685  * with the number of allocated bytes found in the target range, so that the
686  * caller can update the inode's number of bytes in an atomic way when
687  * replacing extents in a range to avoid races with stat(2).
688  */
689 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
690 		       struct btrfs_root *root, struct btrfs_inode *inode,
691 		       struct btrfs_drop_extents_args *args)
692 {
693 	struct btrfs_fs_info *fs_info = root->fs_info;
694 	struct extent_buffer *leaf;
695 	struct btrfs_file_extent_item *fi;
696 	struct btrfs_ref ref = { 0 };
697 	struct btrfs_key key;
698 	struct btrfs_key new_key;
699 	u64 ino = btrfs_ino(inode);
700 	u64 search_start = args->start;
701 	u64 disk_bytenr = 0;
702 	u64 num_bytes = 0;
703 	u64 extent_offset = 0;
704 	u64 extent_end = 0;
705 	u64 last_end = args->start;
706 	int del_nr = 0;
707 	int del_slot = 0;
708 	int extent_type;
709 	int recow;
710 	int ret;
711 	int modify_tree = -1;
712 	int update_refs;
713 	int found = 0;
714 	int leafs_visited = 0;
715 	struct btrfs_path *path = args->path;
716 
717 	args->bytes_found = 0;
718 	args->extent_inserted = false;
719 
720 	/* Must always have a path if ->replace_extent is true */
721 	ASSERT(!(args->replace_extent && !args->path));
722 
723 	if (!path) {
724 		path = btrfs_alloc_path();
725 		if (!path) {
726 			ret = -ENOMEM;
727 			goto out;
728 		}
729 	}
730 
731 	if (args->drop_cache)
732 		btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
733 
734 	if (args->start >= inode->disk_i_size && !args->replace_extent)
735 		modify_tree = 0;
736 
737 	update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
738 		       root == fs_info->tree_root);
739 	while (1) {
740 		recow = 0;
741 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
742 					       search_start, modify_tree);
743 		if (ret < 0)
744 			break;
745 		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
746 			leaf = path->nodes[0];
747 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
748 			if (key.objectid == ino &&
749 			    key.type == BTRFS_EXTENT_DATA_KEY)
750 				path->slots[0]--;
751 		}
752 		ret = 0;
753 		leafs_visited++;
754 next_slot:
755 		leaf = path->nodes[0];
756 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
757 			BUG_ON(del_nr > 0);
758 			ret = btrfs_next_leaf(root, path);
759 			if (ret < 0)
760 				break;
761 			if (ret > 0) {
762 				ret = 0;
763 				break;
764 			}
765 			leafs_visited++;
766 			leaf = path->nodes[0];
767 			recow = 1;
768 		}
769 
770 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
771 
772 		if (key.objectid > ino)
773 			break;
774 		if (WARN_ON_ONCE(key.objectid < ino) ||
775 		    key.type < BTRFS_EXTENT_DATA_KEY) {
776 			ASSERT(del_nr == 0);
777 			path->slots[0]++;
778 			goto next_slot;
779 		}
780 		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
781 			break;
782 
783 		fi = btrfs_item_ptr(leaf, path->slots[0],
784 				    struct btrfs_file_extent_item);
785 		extent_type = btrfs_file_extent_type(leaf, fi);
786 
787 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
788 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
789 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
790 			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
791 			extent_offset = btrfs_file_extent_offset(leaf, fi);
792 			extent_end = key.offset +
793 				btrfs_file_extent_num_bytes(leaf, fi);
794 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
795 			extent_end = key.offset +
796 				btrfs_file_extent_ram_bytes(leaf, fi);
797 		} else {
798 			/* can't happen */
799 			BUG();
800 		}
801 
802 		/*
803 		 * Don't skip extent items representing 0 byte lengths. They
804 		 * used to be created (due to a bug) when we hit an -ENOSPC
805 		 * condition while punching holes. So if we find one here, just
806 		 * ensure we delete it, otherwise we would insert a new file
807 		 * extent item with the same key (offset) as that 0 byte length
808 		 * file extent item in the call to setup_items_for_insert()
809 		 * later in this function.
810 		 */
811 		if (extent_end == key.offset && extent_end >= search_start) {
812 			last_end = extent_end;
813 			goto delete_extent_item;
814 		}
815 
816 		if (extent_end <= search_start) {
817 			path->slots[0]++;
818 			goto next_slot;
819 		}
820 
821 		found = 1;
822 		search_start = max(key.offset, args->start);
823 		if (recow || !modify_tree) {
824 			modify_tree = -1;
825 			btrfs_release_path(path);
826 			continue;
827 		}
828 
829 		/*
830 		 *     | - range to drop - |
831 		 *  | -------- extent -------- |
832 		 */
833 		if (args->start > key.offset && args->end < extent_end) {
834 			BUG_ON(del_nr > 0);
835 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
836 				ret = -EOPNOTSUPP;
837 				break;
838 			}
839 
840 			memcpy(&new_key, &key, sizeof(new_key));
841 			new_key.offset = args->start;
842 			ret = btrfs_duplicate_item(trans, root, path,
843 						   &new_key);
844 			if (ret == -EAGAIN) {
845 				btrfs_release_path(path);
846 				continue;
847 			}
848 			if (ret < 0)
849 				break;
850 
851 			leaf = path->nodes[0];
852 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
853 					    struct btrfs_file_extent_item);
854 			btrfs_set_file_extent_num_bytes(leaf, fi,
855 							args->start - key.offset);
856 
857 			fi = btrfs_item_ptr(leaf, path->slots[0],
858 					    struct btrfs_file_extent_item);
859 
860 			extent_offset += args->start - key.offset;
861 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
862 			btrfs_set_file_extent_num_bytes(leaf, fi,
863 							extent_end - args->start);
864 			btrfs_mark_buffer_dirty(leaf);
865 
866 			if (update_refs && disk_bytenr > 0) {
867 				btrfs_init_generic_ref(&ref,
868 						BTRFS_ADD_DELAYED_REF,
869 						disk_bytenr, num_bytes, 0);
870 				btrfs_init_data_ref(&ref,
871 						root->root_key.objectid,
872 						new_key.objectid,
873 						args->start - extent_offset);
874 				ret = btrfs_inc_extent_ref(trans, &ref);
875 				BUG_ON(ret); /* -ENOMEM */
876 			}
877 			key.offset = args->start;
878 		}
879 		/*
880 		 * From here on out we will have actually dropped something, so
881 		 * last_end can be updated.
882 		 */
883 		last_end = extent_end;
884 
885 		/*
886 		 *  | ---- range to drop ----- |
887 		 *      | -------- extent -------- |
888 		 */
889 		if (args->start <= key.offset && args->end < extent_end) {
890 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
891 				ret = -EOPNOTSUPP;
892 				break;
893 			}
894 
895 			memcpy(&new_key, &key, sizeof(new_key));
896 			new_key.offset = args->end;
897 			btrfs_set_item_key_safe(fs_info, path, &new_key);
898 
899 			extent_offset += args->end - key.offset;
900 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
901 			btrfs_set_file_extent_num_bytes(leaf, fi,
902 							extent_end - args->end);
903 			btrfs_mark_buffer_dirty(leaf);
904 			if (update_refs && disk_bytenr > 0)
905 				args->bytes_found += args->end - key.offset;
906 			break;
907 		}
908 
909 		search_start = extent_end;
910 		/*
911 		 *       | ---- range to drop ----- |
912 		 *  | -------- extent -------- |
913 		 */
914 		if (args->start > key.offset && args->end >= extent_end) {
915 			BUG_ON(del_nr > 0);
916 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
917 				ret = -EOPNOTSUPP;
918 				break;
919 			}
920 
921 			btrfs_set_file_extent_num_bytes(leaf, fi,
922 							args->start - key.offset);
923 			btrfs_mark_buffer_dirty(leaf);
924 			if (update_refs && disk_bytenr > 0)
925 				args->bytes_found += extent_end - args->start;
926 			if (args->end == extent_end)
927 				break;
928 
929 			path->slots[0]++;
930 			goto next_slot;
931 		}
932 
933 		/*
934 		 *  | ---- range to drop ----- |
935 		 *    | ------ extent ------ |
936 		 */
937 		if (args->start <= key.offset && args->end >= extent_end) {
938 delete_extent_item:
939 			if (del_nr == 0) {
940 				del_slot = path->slots[0];
941 				del_nr = 1;
942 			} else {
943 				BUG_ON(del_slot + del_nr != path->slots[0]);
944 				del_nr++;
945 			}
946 
947 			if (update_refs &&
948 			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
949 				args->bytes_found += extent_end - key.offset;
950 				extent_end = ALIGN(extent_end,
951 						   fs_info->sectorsize);
952 			} else if (update_refs && disk_bytenr > 0) {
953 				btrfs_init_generic_ref(&ref,
954 						BTRFS_DROP_DELAYED_REF,
955 						disk_bytenr, num_bytes, 0);
956 				btrfs_init_data_ref(&ref,
957 						root->root_key.objectid,
958 						key.objectid,
959 						key.offset - extent_offset);
960 				ret = btrfs_free_extent(trans, &ref);
961 				BUG_ON(ret); /* -ENOMEM */
962 				args->bytes_found += extent_end - key.offset;
963 			}
964 
965 			if (args->end == extent_end)
966 				break;
967 
968 			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
969 				path->slots[0]++;
970 				goto next_slot;
971 			}
972 
973 			ret = btrfs_del_items(trans, root, path, del_slot,
974 					      del_nr);
975 			if (ret) {
976 				btrfs_abort_transaction(trans, ret);
977 				break;
978 			}
979 
980 			del_nr = 0;
981 			del_slot = 0;
982 
983 			btrfs_release_path(path);
984 			continue;
985 		}
986 
987 		BUG();
988 	}
989 
990 	if (!ret && del_nr > 0) {
991 		/*
992 		 * Set path->slots[0] to the first slot, so that after the delete,
993 		 * if items are moved off from our leaf to its immediate left or
994 		 * right neighbor leaves, we end up with a correct and adjusted
995 		 * path->slots[0] for our insertion (if args->replace_extent).
996 		 */
997 		path->slots[0] = del_slot;
998 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
999 		if (ret)
1000 			btrfs_abort_transaction(trans, ret);
1001 	}
1002 
1003 	leaf = path->nodes[0];
1004 	/*
1005 	 * If btrfs_del_items() was called, it might have deleted a leaf, in
1006 	 * which case it unlocked our path, so check path->locks[0] matches a
1007 	 * write lock.
1008 	 */
1009 	if (!ret && args->replace_extent && leafs_visited == 1 &&
1010 	    path->locks[0] == BTRFS_WRITE_LOCK &&
1011 	    btrfs_leaf_free_space(leaf) >=
1012 	    sizeof(struct btrfs_item) + args->extent_item_size) {
1013 
1014 		key.objectid = ino;
1015 		key.type = BTRFS_EXTENT_DATA_KEY;
1016 		key.offset = args->start;
1017 		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
1018 			struct btrfs_key slot_key;
1019 
1020 			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
1021 			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1022 				path->slots[0]++;
1023 		}
1024 		setup_items_for_insert(root, path, &key,
1025 				       &args->extent_item_size, 1);
1026 		args->extent_inserted = true;
1027 	}
1028 
1029 	if (!args->path)
1030 		btrfs_free_path(path);
1031 	else if (!args->extent_inserted)
1032 		btrfs_release_path(path);
1033 out:
1034 	args->drop_end = found ? min(args->end, last_end) : args->end;
1035 
1036 	return ret;
1037 }
1038 
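/*
 * Check whether the file extent item at @slot refers to the same
 * uncompressed, unencrypted extent (@bytenr at @orig_offset) and matches
 * the expected @start/@end boundaries (when non-zero), i.e. whether it can
 * be merged with the extent the caller is operating on.
 */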
1039 static int extent_mergeable(struct extent_buffer *leaf, int slot,
1040 			    u64 objectid, u64 bytenr, u64 orig_offset,
1041 			    u64 *start, u64 *end)
1042 {
1043 	struct btrfs_file_extent_item *fi;
1044 	struct btrfs_key key;
1045 	u64 extent_end;
1046 
1047 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1048 		return 0;
1049 
1050 	btrfs_item_key_to_cpu(leaf, &key, slot);
1051 	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1052 		return 0;
1053 
1054 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1055 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1056 	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1057 	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1058 	    btrfs_file_extent_compression(leaf, fi) ||
1059 	    btrfs_file_extent_encryption(leaf, fi) ||
1060 	    btrfs_file_extent_other_encoding(leaf, fi))
1061 		return 0;
1062 
1063 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1064 	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1065 		return 0;
1066 
1067 	*start = key.offset;
1068 	*end = extent_end;
1069 	return 1;
1070 }
1071 
1072 /*
1073  * Mark the extent in the range start - end as written.
1074  *
1075  * This changes the extent type from 'pre-allocated' to 'regular'. If only
1076  * part of the extent is marked as written, the extent will be split into
1077  * two or three.
1078  */
1079 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1080 			      struct btrfs_inode *inode, u64 start, u64 end)
1081 {
1082 	struct btrfs_fs_info *fs_info = trans->fs_info;
1083 	struct btrfs_root *root = inode->root;
1084 	struct extent_buffer *leaf;
1085 	struct btrfs_path *path;
1086 	struct btrfs_file_extent_item *fi;
1087 	struct btrfs_ref ref = { 0 };
1088 	struct btrfs_key key;
1089 	struct btrfs_key new_key;
1090 	u64 bytenr;
1091 	u64 num_bytes;
1092 	u64 extent_end;
1093 	u64 orig_offset;
1094 	u64 other_start;
1095 	u64 other_end;
1096 	u64 split;
1097 	int del_nr = 0;
1098 	int del_slot = 0;
1099 	int recow;
1100 	int ret = 0;
1101 	u64 ino = btrfs_ino(inode);
1102 
1103 	path = btrfs_alloc_path();
1104 	if (!path)
1105 		return -ENOMEM;
1106 again:
1107 	recow = 0;
1108 	split = start;
1109 	key.objectid = ino;
1110 	key.type = BTRFS_EXTENT_DATA_KEY;
1111 	key.offset = split;
1112 
1113 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1114 	if (ret < 0)
1115 		goto out;
1116 	if (ret > 0 && path->slots[0] > 0)
1117 		path->slots[0]--;
1118 
1119 	leaf = path->nodes[0];
1120 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1121 	if (key.objectid != ino ||
1122 	    key.type != BTRFS_EXTENT_DATA_KEY) {
1123 		ret = -EINVAL;
1124 		btrfs_abort_transaction(trans, ret);
1125 		goto out;
1126 	}
1127 	fi = btrfs_item_ptr(leaf, path->slots[0],
1128 			    struct btrfs_file_extent_item);
1129 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1130 		ret = -EINVAL;
1131 		btrfs_abort_transaction(trans, ret);
1132 		goto out;
1133 	}
1134 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1135 	if (key.offset > start || extent_end < end) {
1136 		ret = -EINVAL;
1137 		btrfs_abort_transaction(trans, ret);
1138 		goto out;
1139 	}
1140 
1141 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1142 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1143 	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1144 	memcpy(&new_key, &key, sizeof(new_key));
1145 
1146 	if (start == key.offset && end < extent_end) {
1147 		other_start = 0;
1148 		other_end = start;
1149 		if (extent_mergeable(leaf, path->slots[0] - 1,
1150 				     ino, bytenr, orig_offset,
1151 				     &other_start, &other_end)) {
1152 			new_key.offset = end;
1153 			btrfs_set_item_key_safe(fs_info, path, &new_key);
1154 			fi = btrfs_item_ptr(leaf, path->slots[0],
1155 					    struct btrfs_file_extent_item);
1156 			btrfs_set_file_extent_generation(leaf, fi,
1157 							 trans->transid);
1158 			btrfs_set_file_extent_num_bytes(leaf, fi,
1159 							extent_end - end);
1160 			btrfs_set_file_extent_offset(leaf, fi,
1161 						     end - orig_offset);
1162 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1163 					    struct btrfs_file_extent_item);
1164 			btrfs_set_file_extent_generation(leaf, fi,
1165 							 trans->transid);
1166 			btrfs_set_file_extent_num_bytes(leaf, fi,
1167 							end - other_start);
1168 			btrfs_mark_buffer_dirty(leaf);
1169 			goto out;
1170 		}
1171 	}
1172 
1173 	if (start > key.offset && end == extent_end) {
1174 		other_start = end;
1175 		other_end = 0;
1176 		if (extent_mergeable(leaf, path->slots[0] + 1,
1177 				     ino, bytenr, orig_offset,
1178 				     &other_start, &other_end)) {
1179 			fi = btrfs_item_ptr(leaf, path->slots[0],
1180 					    struct btrfs_file_extent_item);
1181 			btrfs_set_file_extent_num_bytes(leaf, fi,
1182 							start - key.offset);
1183 			btrfs_set_file_extent_generation(leaf, fi,
1184 							 trans->transid);
1185 			path->slots[0]++;
1186 			new_key.offset = start;
1187 			btrfs_set_item_key_safe(fs_info, path, &new_key);
1188 
1189 			fi = btrfs_item_ptr(leaf, path->slots[0],
1190 					    struct btrfs_file_extent_item);
1191 			btrfs_set_file_extent_generation(leaf, fi,
1192 							 trans->transid);
1193 			btrfs_set_file_extent_num_bytes(leaf, fi,
1194 							other_end - start);
1195 			btrfs_set_file_extent_offset(leaf, fi,
1196 						     start - orig_offset);
1197 			btrfs_mark_buffer_dirty(leaf);
1198 			goto out;
1199 		}
1200 	}
1201 
1202 	while (start > key.offset || end < extent_end) {
1203 		if (key.offset == start)
1204 			split = end;
1205 
1206 		new_key.offset = split;
1207 		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1208 		if (ret == -EAGAIN) {
1209 			btrfs_release_path(path);
1210 			goto again;
1211 		}
1212 		if (ret < 0) {
1213 			btrfs_abort_transaction(trans, ret);
1214 			goto out;
1215 		}
1216 
1217 		leaf = path->nodes[0];
1218 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1219 				    struct btrfs_file_extent_item);
1220 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1221 		btrfs_set_file_extent_num_bytes(leaf, fi,
1222 						split - key.offset);
1223 
1224 		fi = btrfs_item_ptr(leaf, path->slots[0],
1225 				    struct btrfs_file_extent_item);
1226 
1227 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1228 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1229 		btrfs_set_file_extent_num_bytes(leaf, fi,
1230 						extent_end - split);
1231 		btrfs_mark_buffer_dirty(leaf);
1232 
1233 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1234 				       num_bytes, 0);
1235 		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1236 				    orig_offset);
1237 		ret = btrfs_inc_extent_ref(trans, &ref);
1238 		if (ret) {
1239 			btrfs_abort_transaction(trans, ret);
1240 			goto out;
1241 		}
1242 
1243 		if (split == start) {
1244 			key.offset = start;
1245 		} else {
1246 			if (start != key.offset) {
1247 				ret = -EINVAL;
1248 				btrfs_abort_transaction(trans, ret);
1249 				goto out;
1250 			}
1251 			path->slots[0]--;
1252 			extent_end = end;
1253 		}
1254 		recow = 1;
1255 	}
1256 
1257 	other_start = end;
1258 	other_end = 0;
1259 	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1260 			       num_bytes, 0);
1261 	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1262 	if (extent_mergeable(leaf, path->slots[0] + 1,
1263 			     ino, bytenr, orig_offset,
1264 			     &other_start, &other_end)) {
1265 		if (recow) {
1266 			btrfs_release_path(path);
1267 			goto again;
1268 		}
1269 		extent_end = other_end;
1270 		del_slot = path->slots[0] + 1;
1271 		del_nr++;
1272 		ret = btrfs_free_extent(trans, &ref);
1273 		if (ret) {
1274 			btrfs_abort_transaction(trans, ret);
1275 			goto out;
1276 		}
1277 	}
1278 	other_start = 0;
1279 	other_end = start;
1280 	if (extent_mergeable(leaf, path->slots[0] - 1,
1281 			     ino, bytenr, orig_offset,
1282 			     &other_start, &other_end)) {
1283 		if (recow) {
1284 			btrfs_release_path(path);
1285 			goto again;
1286 		}
1287 		key.offset = other_start;
1288 		del_slot = path->slots[0];
1289 		del_nr++;
1290 		ret = btrfs_free_extent(trans, &ref);
1291 		if (ret) {
1292 			btrfs_abort_transaction(trans, ret);
1293 			goto out;
1294 		}
1295 	}
1296 	if (del_nr == 0) {
1297 		fi = btrfs_item_ptr(leaf, path->slots[0],
1298 			   struct btrfs_file_extent_item);
1299 		btrfs_set_file_extent_type(leaf, fi,
1300 					   BTRFS_FILE_EXTENT_REG);
1301 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1302 		btrfs_mark_buffer_dirty(leaf);
1303 	} else {
1304 		fi = btrfs_item_ptr(leaf, del_slot - 1,
1305 			   struct btrfs_file_extent_item);
1306 		btrfs_set_file_extent_type(leaf, fi,
1307 					   BTRFS_FILE_EXTENT_REG);
1308 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1309 		btrfs_set_file_extent_num_bytes(leaf, fi,
1310 						extent_end - key.offset);
1311 		btrfs_mark_buffer_dirty(leaf);
1312 
1313 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1314 		if (ret < 0) {
1315 			btrfs_abort_transaction(trans, ret);
1316 			goto out;
1317 		}
1318 	}
1319 out:
1320 	btrfs_free_path(path);
1321 	return ret;
1322 }
1323 
1324 /*
1325  * On error we return an unlocked page and the error value;
1326  * on success we return a locked page and 0.
1327  */
1328 static int prepare_uptodate_page(struct inode *inode,
1329 				 struct page *page, u64 pos,
1330 				 bool force_uptodate)
1331 {
1332 	int ret = 0;
1333 
1334 	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1335 	    !PageUptodate(page)) {
1336 		ret = btrfs_readpage(NULL, page);
1337 		if (ret)
1338 			return ret;
1339 		lock_page(page);
1340 		if (!PageUptodate(page)) {
1341 			unlock_page(page);
1342 			return -EIO;
1343 		}
1344 
1345 		/*
1346 		 * Since btrfs_readpage() will unlock the page before it
1347 		 * returns, there is a window where btrfs_releasepage() can be
1348 		 * called to release the page.  Here we check both inode
1349 		 * mapping and PagePrivate() to make sure the page was not
1350 		 * released.
1351 		 *
1352 		 * The private flag check is essential for subpage as we need
1353 		 * to store extra bitmap using page->private.
1354 		 */
1355 		if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
1356 			unlock_page(page);
1357 			return -EAGAIN;
1358 		}
1359 	}
1360 	return 0;
1361 }
1362 
1363 /*
1364  * this just gets pages into the page cache and locks them down.
1365  */
1366 static noinline int prepare_pages(struct inode *inode, struct page **pages,
1367 				  size_t num_pages, loff_t pos,
1368 				  size_t write_bytes, bool force_uptodate)
1369 {
1370 	int i;
1371 	unsigned long index = pos >> PAGE_SHIFT;
1372 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1373 	int err = 0;
1374 	int faili;
1375 
1376 	for (i = 0; i < num_pages; i++) {
1377 again:
1378 		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1379 					       mask | __GFP_WRITE);
1380 		if (!pages[i]) {
1381 			faili = i - 1;
1382 			err = -ENOMEM;
1383 			goto fail;
1384 		}
1385 
1386 		err = set_page_extent_mapped(pages[i]);
1387 		if (err < 0) {
1388 			faili = i;
1389 			goto fail;
1390 		}
1391 
1392 		if (i == 0)
1393 			err = prepare_uptodate_page(inode, pages[i], pos,
1394 						    force_uptodate);
1395 		if (!err && i == num_pages - 1)
1396 			err = prepare_uptodate_page(inode, pages[i],
1397 						    pos + write_bytes, false);
1398 		if (err) {
1399 			put_page(pages[i]);
1400 			if (err == -EAGAIN) {
1401 				err = 0;
1402 				goto again;
1403 			}
1404 			faili = i - 1;
1405 			goto fail;
1406 		}
1407 		wait_on_page_writeback(pages[i]);
1408 	}
1409 
1410 	return 0;
1411 fail:
1412 	while (faili >= 0) {
1413 		unlock_page(pages[faili]);
1414 		put_page(pages[faili]);
1415 		faili--;
1416 	}
1417 	return err;
1418 
1419 }
1420 
1421 /*
1422  * This function locks the extent and properly waits for data=ordered extents
1423  * to finish before allowing the pages to be modified if needed.
1424  *
1425  * The return value:
1426  * 1 - the extent is locked
1427  * 0 - the extent is not locked, and everything is OK
1428  * -EAGAIN - the pages need to be re-prepared
1429  * any other < 0 value - something went wrong
1430  */
1431 static noinline int
1432 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1433 				size_t num_pages, loff_t pos,
1434 				size_t write_bytes,
1435 				u64 *lockstart, u64 *lockend,
1436 				struct extent_state **cached_state)
1437 {
1438 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1439 	u64 start_pos;
1440 	u64 last_pos;
1441 	int i;
1442 	int ret = 0;
1443 
1444 	start_pos = round_down(pos, fs_info->sectorsize);
1445 	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1446 
1447 	if (start_pos < inode->vfs_inode.i_size) {
1448 		struct btrfs_ordered_extent *ordered;
1449 
1450 		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1451 				cached_state);
1452 		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1453 						     last_pos - start_pos + 1);
1454 		if (ordered &&
1455 		    ordered->file_offset + ordered->num_bytes > start_pos &&
1456 		    ordered->file_offset <= last_pos) {
1457 			unlock_extent_cached(&inode->io_tree, start_pos,
1458 					last_pos, cached_state);
1459 			for (i = 0; i < num_pages; i++) {
1460 				unlock_page(pages[i]);
1461 				put_page(pages[i]);
1462 			}
1463 			btrfs_start_ordered_extent(ordered, 1);
1464 			btrfs_put_ordered_extent(ordered);
1465 			return -EAGAIN;
1466 		}
1467 		if (ordered)
1468 			btrfs_put_ordered_extent(ordered);
1469 
1470 		*lockstart = start_pos;
1471 		*lockend = last_pos;
1472 		ret = 1;
1473 	}
1474 
1475 	/*
1476 	 * We should be called after prepare_pages() which should have locked
1477 	 * all pages in the range.
1478 	 */
1479 	for (i = 0; i < num_pages; i++)
1480 		WARN_ON(!PageLocked(pages[i]));
1481 
1482 	return ret;
1483 }
1484 
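/*
 * Common helper for btrfs_check_nocow_lock() and check_nocow_nolock():
 * decide whether [@pos, @pos + @write_bytes) can be written without COW.
 * In the nowait case we back off with -EAGAIN instead of waiting for the
 * extent lock or for ordered extents to complete.  A positive return from
 * btrfs_check_nocow_lock() must be paired with btrfs_check_nocow_unlock(),
 * roughly:
 *
 *	if (btrfs_check_nocow_lock(inode, pos, &write_bytes) > 0) {
 *		... do the nocow write ...
 *		btrfs_check_nocow_unlock(inode);
 *	}
 */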
1485 static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1486 			   size_t *write_bytes, bool nowait)
1487 {
1488 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1489 	struct btrfs_root *root = inode->root;
1490 	u64 lockstart, lockend;
1491 	u64 num_bytes;
1492 	int ret;
1493 
1494 	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1495 		return 0;
1496 
1497 	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1498 		return -EAGAIN;
1499 
1500 	lockstart = round_down(pos, fs_info->sectorsize);
1501 	lockend = round_up(pos + *write_bytes,
1502 			   fs_info->sectorsize) - 1;
1503 	num_bytes = lockend - lockstart + 1;
1504 
1505 	if (nowait) {
1506 		struct btrfs_ordered_extent *ordered;
1507 
1508 		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1509 			return -EAGAIN;
1510 
1511 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1512 						     num_bytes);
1513 		if (ordered) {
1514 			btrfs_put_ordered_extent(ordered);
1515 			ret = -EAGAIN;
1516 			goto out_unlock;
1517 		}
1518 	} else {
1519 		btrfs_lock_and_flush_ordered_range(inode, lockstart,
1520 						   lockend, NULL);
1521 	}
1522 
1523 	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1524 			NULL, NULL, NULL, false);
1525 	if (ret <= 0) {
1526 		ret = 0;
1527 		if (!nowait)
1528 			btrfs_drew_write_unlock(&root->snapshot_lock);
1529 	} else {
1530 		*write_bytes = min_t(size_t, *write_bytes ,
1531 				     num_bytes - pos + lockstart);
1532 	}
1533 out_unlock:
1534 	unlock_extent(&inode->io_tree, lockstart, lockend);
1535 
1536 	return ret;
1537 }
1538 
1539 static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1540 			      size_t *write_bytes)
1541 {
1542 	return check_can_nocow(inode, pos, write_bytes, true);
1543 }
1544 
1545 /*
1546  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1547  *
1548  * @pos:	 File offset
1549  * @write_bytes: The length to write, will be updated to the nocow writeable
1550  *		 range
1551  *
1552  * This function will flush ordered extents in the range to ensure proper
1553  * nocow checks.
1554  *
1555  * Return:
1556  * >0		and update @write_bytes if we can do nocow write
1557  *  0		if we can't do nocow write
1558  * -EAGAIN	if we can't get the needed lock or there are ordered extents
1559  * 		for the (nowait == true) case
1560  * <0		if other error happened
1561  *
1562  * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1563  */
1564 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1565 			   size_t *write_bytes)
1566 {
1567 	return check_can_nocow(inode, pos, write_bytes, false);
1568 }
1569 
1570 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1571 {
1572 	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1573 }
1574 
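/*
 * Update mtime/ctime and bump the inode version (when enabled) for a write,
 * unless the inode is flagged NOCMTIME.
 */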
1575 static void update_time_for_write(struct inode *inode)
1576 {
1577 	struct timespec64 now;
1578 
1579 	if (IS_NOCMTIME(inode))
1580 		return;
1581 
1582 	now = current_time(inode);
1583 	if (!timespec64_equal(&inode->i_mtime, &now))
1584 		inode->i_mtime = now;
1585 
1586 	if (!timespec64_equal(&inode->i_ctime, &now))
1587 		inode->i_ctime = now;
1588 
1589 	if (IS_I_VERSION(inode))
1590 		inode_inc_iversion(inode);
1591 }
1592 
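/*
 * Common checks and setup shared by the buffered and direct write paths:
 * for IOCB_NOWAIT, bail out with -EAGAIN unless the whole range can be
 * written without COW; then remove privileges, update timestamps and
 * expand any hole when writing past the current i_size.
 */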
1593 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1594 			     size_t count)
1595 {
1596 	struct file *file = iocb->ki_filp;
1597 	struct inode *inode = file_inode(file);
1598 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1599 	loff_t pos = iocb->ki_pos;
1600 	int ret;
1601 	loff_t oldsize;
1602 	loff_t start_pos;
1603 
1604 	if (iocb->ki_flags & IOCB_NOWAIT) {
1605 		size_t nocow_bytes = count;
1606 
1607 		/* We will allocate space in case nodatacow is not set, so bail */
1608 		if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
1609 			return -EAGAIN;
1610 		/*
1611 		 * There are holes in the range or parts of the range that must
1612 		 * be COWed (shared extents, RO block groups, etc), so just bail
1613 		 * out.
1614 		 */
1615 		if (nocow_bytes < count)
1616 			return -EAGAIN;
1617 	}
1618 
1619 	current->backing_dev_info = inode_to_bdi(inode);
1620 	ret = file_remove_privs(file);
1621 	if (ret)
1622 		return ret;
1623 
1624 	/*
1625 	 * We reserve space for updating the inode when we reserve space for the
1626 	 * extent we are going to write, so we will enospc out there.  We don't
1627 	 * need to start yet another transaction to update the inode as we will
1628 	 * update the inode when we finish writing whatever data we write.
1629 	 */
1630 	update_time_for_write(inode);
1631 
1632 	start_pos = round_down(pos, fs_info->sectorsize);
1633 	oldsize = i_size_read(inode);
1634 	if (start_pos > oldsize) {
1635 		/* Expand hole size to cover write data, preventing empty gap */
1636 		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1637 
1638 		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1639 		if (ret) {
1640 			current->backing_dev_info = NULL;
1641 			return ret;
1642 		}
1643 	}
1644 
1645 	return 0;
1646 }
1647 
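/*
 * The buffered write path: reserve data and metadata space (falling back
 * to a metadata-only, nocow reservation when possible), prepare and lock
 * the page cache pages, copy from the iov_iter and mark the range as
 * delalloc for later writeback.
 */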
1648 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1649 					       struct iov_iter *i)
1650 {
1651 	struct file *file = iocb->ki_filp;
1652 	loff_t pos;
1653 	struct inode *inode = file_inode(file);
1654 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1655 	struct page **pages = NULL;
1656 	struct extent_changeset *data_reserved = NULL;
1657 	u64 release_bytes = 0;
1658 	u64 lockstart;
1659 	u64 lockend;
1660 	size_t num_written = 0;
1661 	int nrptrs;
1662 	ssize_t ret;
1663 	bool only_release_metadata = false;
1664 	bool force_page_uptodate = false;
1665 	loff_t old_isize = i_size_read(inode);
1666 	unsigned int ilock_flags = 0;
1667 
1668 	if (iocb->ki_flags & IOCB_NOWAIT)
1669 		ilock_flags |= BTRFS_ILOCK_TRY;
1670 
1671 	ret = btrfs_inode_lock(inode, ilock_flags);
1672 	if (ret < 0)
1673 		return ret;
1674 
1675 	ret = generic_write_checks(iocb, i);
1676 	if (ret <= 0)
1677 		goto out;
1678 
1679 	ret = btrfs_write_check(iocb, i, ret);
1680 	if (ret < 0)
1681 		goto out;
1682 
1683 	pos = iocb->ki_pos;
1684 	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1685 			PAGE_SIZE / (sizeof(struct page *)));
1686 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1687 	nrptrs = max(nrptrs, 8);
1688 	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1689 	if (!pages) {
1690 		ret = -ENOMEM;
1691 		goto out;
1692 	}
1693 
1694 	while (iov_iter_count(i) > 0) {
1695 		struct extent_state *cached_state = NULL;
1696 		size_t offset = offset_in_page(pos);
1697 		size_t sector_offset;
1698 		size_t write_bytes = min(iov_iter_count(i),
1699 					 nrptrs * (size_t)PAGE_SIZE -
1700 					 offset);
1701 		size_t num_pages;
1702 		size_t reserve_bytes;
1703 		size_t dirty_pages;
1704 		size_t copied;
1705 		size_t dirty_sectors;
1706 		size_t num_sectors;
1707 		int extents_locked;
1708 
1709 		/*
1710 		 * Fault pages before locking them in prepare_pages
1711 		 * to avoid recursive lock
1712 		 */
1713 		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1714 			ret = -EFAULT;
1715 			break;
1716 		}
1717 
1718 		only_release_metadata = false;
1719 		sector_offset = pos & (fs_info->sectorsize - 1);
1720 
1721 		extent_changeset_release(data_reserved);
1722 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1723 						  &data_reserved, pos,
1724 						  write_bytes);
1725 		if (ret < 0) {
1726 			/*
1727 			 * If we don't have to COW at the offset, reserve
1728 			 * metadata only. write_bytes may get smaller than
1729 			 * requested here.
1730 			 */
1731 			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1732 						   &write_bytes) > 0)
1733 				only_release_metadata = true;
1734 			else
1735 				break;
1736 		}
1737 
1738 		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1739 		WARN_ON(num_pages > nrptrs);
1740 		reserve_bytes = round_up(write_bytes + sector_offset,
1741 					 fs_info->sectorsize);
1742 		WARN_ON(reserve_bytes == 0);
1743 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1744 				reserve_bytes);
1745 		if (ret) {
1746 			if (!only_release_metadata)
1747 				btrfs_free_reserved_data_space(BTRFS_I(inode),
1748 						data_reserved, pos,
1749 						write_bytes);
1750 			else
1751 				btrfs_check_nocow_unlock(BTRFS_I(inode));
1752 			break;
1753 		}
1754 
1755 		release_bytes = reserve_bytes;
1756 again:
1757 		/*
1758 		 * This is going to set up the pages array with the number of
1759 		 * pages we want, so we don't really need to worry about the
1760 		 * contents of pages from loop to loop
1761 		 */
1762 		ret = prepare_pages(inode, pages, num_pages,
1763 				    pos, write_bytes,
1764 				    force_page_uptodate);
1765 		if (ret) {
1766 			btrfs_delalloc_release_extents(BTRFS_I(inode),
1767 						       reserve_bytes);
1768 			break;
1769 		}
1770 
1771 		extents_locked = lock_and_cleanup_extent_if_need(
1772 				BTRFS_I(inode), pages,
1773 				num_pages, pos, write_bytes, &lockstart,
1774 				&lockend, &cached_state);
1775 		if (extents_locked < 0) {
1776 			if (extents_locked == -EAGAIN)
1777 				goto again;
1778 			btrfs_delalloc_release_extents(BTRFS_I(inode),
1779 						       reserve_bytes);
1780 			ret = extents_locked;
1781 			break;
1782 		}
1783 
1784 		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1785 
1786 		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1787 		dirty_sectors = round_up(copied + sector_offset,
1788 					fs_info->sectorsize);
1789 		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1790 
1791 		/*
1792 		 * if we have trouble faulting in the pages, fall
1793 		 * back to one page at a time
1794 		 */
1795 		if (copied < write_bytes)
1796 			nrptrs = 1;
1797 
1798 		if (copied == 0) {
1799 			force_page_uptodate = true;
1800 			dirty_sectors = 0;
1801 			dirty_pages = 0;
1802 		} else {
1803 			force_page_uptodate = false;
1804 			dirty_pages = DIV_ROUND_UP(copied + offset,
1805 						   PAGE_SIZE);
1806 		}
1807 
1808 		if (num_sectors > dirty_sectors) {
1809 			/* release everything except the sectors we dirtied */
1810 			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1811 			if (only_release_metadata) {
1812 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1813 							release_bytes, true);
1814 			} else {
1815 				u64 __pos;
1816 
1817 				__pos = round_down(pos,
1818 						   fs_info->sectorsize) +
1819 					(dirty_pages << PAGE_SHIFT);
1820 				btrfs_delalloc_release_space(BTRFS_I(inode),
1821 						data_reserved, __pos,
1822 						release_bytes, true);
1823 			}
1824 		}
1825 
1826 		release_bytes = round_up(copied + sector_offset,
1827 					fs_info->sectorsize);
1828 
1829 		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1830 					dirty_pages, pos, copied,
1831 					&cached_state, only_release_metadata);
1832 
1833 		/*
1834 		 * If we have not locked the extent range, because the range's
1835 		 * start offset is >= i_size, we might still have a non-NULL
1836 		 * cached extent state, acquired while marking the extent range
1837 		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1838 		 * possible cached extent state to avoid a memory leak.
1839 		 */
1840 		if (extents_locked)
1841 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1842 					     lockstart, lockend, &cached_state);
1843 		else
1844 			free_extent_state(cached_state);
1845 
1846 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1847 		if (ret) {
1848 			btrfs_drop_pages(pages, num_pages);
1849 			break;
1850 		}
1851 
1852 		release_bytes = 0;
1853 		if (only_release_metadata)
1854 			btrfs_check_nocow_unlock(BTRFS_I(inode));
1855 
1856 		btrfs_drop_pages(pages, num_pages);
1857 
1858 		cond_resched();
1859 
1860 		balance_dirty_pages_ratelimited(inode->i_mapping);
1861 
1862 		pos += copied;
1863 		num_written += copied;
1864 	}
1865 
1866 	kfree(pages);
1867 
1868 	if (release_bytes) {
1869 		if (only_release_metadata) {
1870 			btrfs_check_nocow_unlock(BTRFS_I(inode));
1871 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1872 					release_bytes, true);
1873 		} else {
1874 			btrfs_delalloc_release_space(BTRFS_I(inode),
1875 					data_reserved,
1876 					round_down(pos, fs_info->sectorsize),
1877 					release_bytes, true);
1878 		}
1879 	}
1880 
1881 	extent_changeset_free(data_reserved);
1882 	if (num_written > 0) {
1883 		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1884 		iocb->ki_pos += num_written;
1885 	}
1886 out:
1887 	btrfs_inode_unlock(inode, ilock_flags);
1888 	return num_written ? num_written : ret;
1889 }
1890 
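/*
 * Check that a direct I/O request is properly aligned: both the file offset
 * and the iov_iter memory segments must be aligned to the fs sector size,
 * otherwise return -EINVAL so callers can fall back to buffered I/O.
 */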
1891 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1892 			       const struct iov_iter *iter, loff_t offset)
1893 {
1894 	const u32 blocksize_mask = fs_info->sectorsize - 1;
1895 
1896 	if (offset & blocksize_mask)
1897 		return -EINVAL;
1898 
1899 	if (iov_iter_alignment(iter) & blocksize_mask)
1900 		return -EINVAL;
1901 
1902 	return 0;
1903 }
1904 
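/*
 * Handle a direct write: try the iomap direct I/O path first and, if the
 * request is not properly aligned or the direct write did not consume all of
 * the requested bytes, fall back to a buffered write for the remainder, then
 * flush and invalidate that range so later direct reads see the new data.
 */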
1905 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1906 {
1907 	struct file *file = iocb->ki_filp;
1908 	struct inode *inode = file_inode(file);
1909 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1910 	loff_t pos;
1911 	ssize_t written = 0;
1912 	ssize_t written_buffered;
1913 	loff_t endbyte;
1914 	ssize_t err;
1915 	unsigned int ilock_flags = 0;
1916 	struct iomap_dio *dio = NULL;
1917 
1918 	if (iocb->ki_flags & IOCB_NOWAIT)
1919 		ilock_flags |= BTRFS_ILOCK_TRY;
1920 
1921 	/* If the write DIO is within EOF, use a shared lock */
1922 	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
1923 		ilock_flags |= BTRFS_ILOCK_SHARED;
1924 
1925 relock:
1926 	err = btrfs_inode_lock(inode, ilock_flags);
1927 	if (err < 0)
1928 		return err;
1929 
1930 	err = generic_write_checks(iocb, from);
1931 	if (err <= 0) {
1932 		btrfs_inode_unlock(inode, ilock_flags);
1933 		return err;
1934 	}
1935 
1936 	err = btrfs_write_check(iocb, from, err);
1937 	if (err < 0) {
1938 		btrfs_inode_unlock(inode, ilock_flags);
1939 		goto out;
1940 	}
1941 
1942 	pos = iocb->ki_pos;
1943 	/*
1944 	 * Re-check since file size may have changed just before taking the
1945 	 * lock, or pos may have changed because of O_APPEND in generic_write_checks()
1946 	 */
1947 	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1948 	    pos + iov_iter_count(from) > i_size_read(inode)) {
1949 		btrfs_inode_unlock(inode, ilock_flags);
1950 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1951 		goto relock;
1952 	}
1953 
1954 	if (check_direct_IO(fs_info, from, pos)) {
1955 		btrfs_inode_unlock(inode, ilock_flags);
1956 		goto buffered;
1957 	}
1958 
1959 	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1960 			     0);
1961 
1962 	btrfs_inode_unlock(inode, ilock_flags);
1963 
1964 	if (IS_ERR_OR_NULL(dio)) {
1965 		err = PTR_ERR_OR_ZERO(dio);
1966 		if (err < 0 && err != -ENOTBLK)
1967 			goto out;
1968 	} else {
1969 		written = iomap_dio_complete(dio);
1970 	}
1971 
1972 	if (written < 0 || !iov_iter_count(from)) {
1973 		err = written;
1974 		goto out;
1975 	}
1976 
1977 buffered:
1978 	pos = iocb->ki_pos;
1979 	written_buffered = btrfs_buffered_write(iocb, from);
1980 	if (written_buffered < 0) {
1981 		err = written_buffered;
1982 		goto out;
1983 	}
1984 	/*
1985 	 * Ensure all data is persisted. We want the next direct IO read to be
1986 	 * able to read what was just written.
1987 	 */
1988 	endbyte = pos + written_buffered - 1;
1989 	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1990 	if (err)
1991 		goto out;
1992 	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1993 	if (err)
1994 		goto out;
1995 	written += written_buffered;
1996 	iocb->ki_pos = pos + written_buffered;
1997 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1998 				 endbyte >> PAGE_SHIFT);
1999 out:
2000 	return written ? written : err;
2001 }
2002 
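/*
 * Entry point for write(2) and friends: reject writes once the filesystem has
 * turned read-only due to errors, dispatch to the direct or buffered write
 * path, and handle the sync_writers accounting plus generic_write_sync() for
 * IOCB_DSYNC writes.
 */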
2003 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
2004 				    struct iov_iter *from)
2005 {
2006 	struct file *file = iocb->ki_filp;
2007 	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
2008 	ssize_t num_written = 0;
2009 	const bool sync = iocb->ki_flags & IOCB_DSYNC;
2010 
2011 	/*
2012 	 * If the fs flips readonly due to some impossible error, although we
2013 	 * have opened a file as writable, we have to stop this write operation
2014 	 * to ensure consistency.
2015 	 */
2016 	if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
2017 		return -EROFS;
2018 
2019 	if (!(iocb->ki_flags & IOCB_DIRECT) &&
2020 	    (iocb->ki_flags & IOCB_NOWAIT))
2021 		return -EOPNOTSUPP;
2022 
2023 	if (sync)
2024 		atomic_inc(&inode->sync_writers);
2025 
2026 	if (iocb->ki_flags & IOCB_DIRECT)
2027 		num_written = btrfs_direct_write(iocb, from);
2028 	else
2029 		num_written = btrfs_buffered_write(iocb, from);
2030 
2031 	btrfs_set_inode_last_sub_trans(inode);
2032 
2033 	if (num_written > 0)
2034 		num_written = generic_write_sync(iocb, num_written);
2035 
2036 	if (sync)
2037 		atomic_dec(&inode->sync_writers);
2038 
2039 	current->backing_dev_info = NULL;
2040 	return num_written;
2041 }
2042 
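/*
 * Called when the last reference to the struct file is dropped: free the
 * private filldir buffer and, if BTRFS_INODE_FLUSH_ON_CLOSE was set by a
 * truncate to zero size, flush the inode's dirty pages (see the comment
 * below).
 */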
2043 int btrfs_release_file(struct inode *inode, struct file *filp)
2044 {
2045 	struct btrfs_file_private *private = filp->private_data;
2046 
2047 	if (private && private->filldir_buf)
2048 		kfree(private->filldir_buf);
2049 	kfree(private);
2050 	filp->private_data = NULL;
2051 
2052 	/*
2053 	 * Set by setattr when we are about to truncate a file from a non-zero
2054 	 * size to a zero size.  This tries to flush down new bytes that may
2055 	 * have been written if the application were using truncate to replace
2056 	 * a file in place.
2057 	 */
2058 	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2059 			       &BTRFS_I(inode)->runtime_flags))
2060 		filemap_flush(inode->i_mapping);
2061 	return 0;
2062 }
2063 
2064 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2065 {
2066 	int ret;
2067 	struct blk_plug plug;
2068 
2069 	/*
2070 	 * This is only called in fsync, which would do synchronous writes, so
2071 	 * a plug can merge adjacent IOs as much as possible.  Especially in the
2072 	 * case of multiple disks using a raid profile, a large IO can be split into
2073 	 * several segments of stripe length (currently 64K).
2074 	 */
2075 	blk_start_plug(&plug);
2076 	atomic_inc(&BTRFS_I(inode)->sync_writers);
2077 	ret = btrfs_fdatawrite_range(inode, start, end);
2078 	atomic_dec(&BTRFS_I(inode)->sync_writers);
2079 	blk_finish_plug(&plug);
2080 
2081 	return ret;
2082 }
2083 
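/*
 * Return true when fsync can skip logging the inode: either it is already in
 * the current log and there are no ordered extents attached to the context,
 * or its last_trans is not newer than the last committed transaction and we
 * either need a full sync or have no attached ordered extents.
 */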
2084 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2085 {
2086 	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2087 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2088 
2089 	if (btrfs_inode_in_log(inode, fs_info->generation) &&
2090 	    list_empty(&ctx->ordered_extents))
2091 		return true;
2092 
2093 	/*
2094 	 * If we are doing a fast fsync we can not bail out if the inode's
2095 	 * last_trans is <= the last committed transaction, because we only
2096 	 * update the last_trans of the inode during ordered extent completion,
2097 	 * and for a fast fsync we don't wait for that; we only wait for the
2098 	 * writeback to complete.
2099 	 */
2100 	if (inode->last_trans <= fs_info->last_trans_committed &&
2101 	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2102 	     list_empty(&ctx->ordered_extents)))
2103 		return true;
2104 
2105 	return false;
2106 }
2107 
2108 /*
2109  * fsync call for both files and directories.  This logs the inode into
2110  * the tree log instead of forcing full commits whenever possible.
2111  *
2112 	 * It needs to call filemap_fdatawait so that all ordered extent updates
2113  * in the metadata btree are up to date for copying to the log.
2114  *
2115  * It drops the inode mutex before doing the tree log commit.  This is an
2116  * important optimization for directories because holding the mutex prevents
2117  * new operations on the dir while we write to disk.
2118  */
2119 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2120 {
2121 	struct dentry *dentry = file_dentry(file);
2122 	struct inode *inode = d_inode(dentry);
2123 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2124 	struct btrfs_root *root = BTRFS_I(inode)->root;
2125 	struct btrfs_trans_handle *trans;
2126 	struct btrfs_log_ctx ctx;
2127 	int ret = 0, err;
2128 	u64 len;
2129 	bool full_sync;
2130 
2131 	trace_btrfs_sync_file(file, datasync);
2132 
2133 	btrfs_init_log_ctx(&ctx, inode);
2134 
2135 	/*
2136 	 * Always set the range to a full range, otherwise we can get into
2137 	 * several problems, from missing file extent items to represent holes
2138 	 * when not using the NO_HOLES feature, to log tree corruption due to
2139 	 * races between hole detection during logging and completion of ordered
2140 	 * extents outside the range, to missing checksums due to ordered extents
2141 	 * for which we flushed only a subset of their pages.
2142 	 */
2143 	start = 0;
2144 	end = LLONG_MAX;
2145 	len = (u64)LLONG_MAX + 1;
2146 
2147 	/*
2148 	 * We write the dirty pages in the range and wait until they complete
2149 	 * outside of the ->i_mutex, so that the dirty pages can be flushed by
2150 	 * multiple tasks and performance improves.  See
2151 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2152 	 */
2153 	ret = start_ordered_ops(inode, start, end);
2154 	if (ret)
2155 		goto out;
2156 
2157 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2158 
2159 	atomic_inc(&root->log_batch);
2160 
2161 	/*
2162 	 * Always check for the full sync flag while holding the inode's lock,
2163 	 * to avoid races with other tasks. The flag must be either set all the
2164 	 * time during logging or off all the time while logging.
2165 	 */
2166 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2167 			     &BTRFS_I(inode)->runtime_flags);
2168 
2169 	/*
2170 	 * Before we acquired the inode's lock and the mmap lock, someone may
2171 	 * have dirtied more pages in the target range. We need to make sure
2172 	 * that writeback for any such pages does not start while we are logging
2173 	 * the inode, because if it does, any of the following might happen when
2174 	 * we are not doing a full inode sync:
2175 	 *
2176 	 * 1) We log an extent after its writeback finishes but before its
2177 	 *    checksums are added to the csum tree, leading to -EIO errors
2178 	 *    when attempting to read the extent after a log replay.
2179 	 *
2180 	 * 2) We can end up logging an extent before its writeback finishes.
2181 	 *    Therefore after the log replay we will have a file extent item
2182 	 *    pointing to an unwritten extent (and no data checksums as well).
2183 	 *
2184 	 * So trigger writeback for any eventual new dirty pages and then we
2185 	 * wait for all ordered extents to complete below.
2186 	 */
2187 	ret = start_ordered_ops(inode, start, end);
2188 	if (ret) {
2189 		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2190 		goto out;
2191 	}
2192 
2193 	/*
2194 	 * We have to do this here to avoid the priority inversion of waiting on
2195 	 * IO of a lower priority task while holding a transaction open.
2196 	 *
2197 	 * For a full fsync we wait for the ordered extents to complete while
2198 	 * for a fast fsync we wait just for writeback to complete, and then
2199 	 * attach the ordered extents to the transaction so that a transaction
2200 	 * commit waits for their completion, to avoid data loss if we fsync,
2201 	 * the current transaction commits before the ordered extents complete
2202 	 * and a power failure happens right after that.
2203 	 *
2204 	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
2205 	 * logical address recorded in the ordered extent may change. We need
2206 	 * to wait for the IO to stabilize the logical address.
2207 	 */
2208 	if (full_sync || btrfs_is_zoned(fs_info)) {
2209 		ret = btrfs_wait_ordered_range(inode, start, len);
2210 	} else {
2211 		/*
2212 		 * Get our ordered extents as soon as possible to avoid doing
2213 		 * checksum lookups in the csum tree, and use instead the
2214 		 * checksums attached to the ordered extents.
2215 		 */
2216 		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2217 						      &ctx.ordered_extents);
2218 		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2219 	}
2220 
2221 	if (ret)
2222 		goto out_release_extents;
2223 
2224 	atomic_inc(&root->log_batch);
2225 
2226 	smp_mb();
2227 	if (skip_inode_logging(&ctx)) {
2228 		/*
2229 		 * We've had everything committed since the last time we were
2230 		 * modified, so clear this flag in case it was set for whatever
2231 		 * reason; it's no longer relevant.
2232 		 */
2233 		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2234 			  &BTRFS_I(inode)->runtime_flags);
2235 		/*
2236 		 * An ordered extent might have started before and completed
2237 		 * already with io errors, in which case the inode was not
2238 		 * updated and we end up here. So check the inode's mapping
2239 		 * for any errors that might have happened since we last
2240 		 * checked called fsync.
2241 		 * called fsync.
2242 		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2243 		goto out_release_extents;
2244 	}
2245 
2246 	/*
2247 	 * We use start here because we will need to wait on the IO to complete
2248 	 * in btrfs_sync_log, which could require joining a transaction (for
2249 	 * example checking cross references in the nocow path).  If we use join
2250 	 * here we could get into a situation where we're waiting on IO to
2251 	 * happen that is blocked on a transaction trying to commit.  With start
2252 	 * we inc the extwriter counter, so we wait for all extwriters to exit
2253 	 * before we start blocking joiners.  This comment is to keep somebody
2254 	 * from thinking they are super smart and changing this to
2255 	 * btrfs_join_transaction *cough*Josef*cough*.
2256 	 */
2257 	trans = btrfs_start_transaction(root, 0);
2258 	if (IS_ERR(trans)) {
2259 		ret = PTR_ERR(trans);
2260 		goto out_release_extents;
2261 	}
2262 	trans->in_fsync = true;
2263 
2264 	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2265 	btrfs_release_log_ctx_extents(&ctx);
2266 	if (ret < 0) {
2267 		/* Fallthrough and commit/free transaction. */
2268 		ret = 1;
2269 	}
2270 
2271 	/* we've logged all the items and now have a consistent
2272 	 * version of the file in the log.  It is possible that
2273 	 * someone will come in and modify the file, but that's
2274 	 * fine because the log is consistent on disk, and we
2275 	 * have references to all of the file's extents
2276 	 *
2277 	 * It is possible that someone will come in and log the
2278 	 * file again, but that will end up using the synchronization
2279 	 * inside btrfs_sync_log to keep things safe.
2280 	 */
2281 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2282 
2283 	if (ret != BTRFS_NO_LOG_SYNC) {
2284 		if (!ret) {
2285 			ret = btrfs_sync_log(trans, root, &ctx);
2286 			if (!ret) {
2287 				ret = btrfs_end_transaction(trans);
2288 				goto out;
2289 			}
2290 		}
2291 		if (!full_sync) {
2292 			ret = btrfs_wait_ordered_range(inode, start, len);
2293 			if (ret) {
2294 				btrfs_end_transaction(trans);
2295 				goto out;
2296 			}
2297 		}
2298 		ret = btrfs_commit_transaction(trans);
2299 	} else {
2300 		ret = btrfs_end_transaction(trans);
2301 	}
2302 out:
2303 	ASSERT(list_empty(&ctx.list));
2304 	err = file_check_and_advance_wb_err(file);
2305 	if (!ret)
2306 		ret = err;
2307 	return ret > 0 ? -EIO : ret;
2308 
2309 out_release_extents:
2310 	btrfs_release_log_ctx_extents(&ctx);
2311 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2312 	goto out;
2313 }
2314 
2315 static const struct vm_operations_struct btrfs_file_vm_ops = {
2316 	.fault		= filemap_fault,
2317 	.map_pages	= filemap_map_pages,
2318 	.page_mkwrite	= btrfs_page_mkwrite,
2319 };
2320 
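/*
 * Hook up btrfs_file_vm_ops so that write faults go through
 * btrfs_page_mkwrite(); mappings are refused if the address space has no
 * readpage operation.
 */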
2321 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
2322 {
2323 	struct address_space *mapping = filp->f_mapping;
2324 
2325 	if (!mapping->a_ops->readpage)
2326 		return -ENOEXEC;
2327 
2328 	file_accessed(filp);
2329 	vma->vm_ops = &btrfs_file_vm_ops;
2330 
2331 	return 0;
2332 }
2333 
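/*
 * Check whether the file extent item at @slot is an existing hole (a regular
 * extent with a zero disk bytenr) adjacent to [start, end), in which case it
 * can be extended instead of inserting a new hole item.
 */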
2334 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2335 			  int slot, u64 start, u64 end)
2336 {
2337 	struct btrfs_file_extent_item *fi;
2338 	struct btrfs_key key;
2339 
2340 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2341 		return 0;
2342 
2343 	btrfs_item_key_to_cpu(leaf, &key, slot);
2344 	if (key.objectid != btrfs_ino(inode) ||
2345 	    key.type != BTRFS_EXTENT_DATA_KEY)
2346 		return 0;
2347 
2348 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2349 
2350 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2351 		return 0;
2352 
2353 	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2354 		return 0;
2355 
2356 	if (key.offset == end)
2357 		return 1;
2358 	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2359 		return 1;
2360 	return 0;
2361 }
2362 
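/*
 * After dropping a file range, insert or extend a hole file extent item
 * covering [offset, end) and update the extent map cache with a matching
 * hole. With the NO_HOLES feature only the extent maps are updated, since
 * no hole items are needed.
 */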
2363 static int fill_holes(struct btrfs_trans_handle *trans,
2364 		struct btrfs_inode *inode,
2365 		struct btrfs_path *path, u64 offset, u64 end)
2366 {
2367 	struct btrfs_fs_info *fs_info = trans->fs_info;
2368 	struct btrfs_root *root = inode->root;
2369 	struct extent_buffer *leaf;
2370 	struct btrfs_file_extent_item *fi;
2371 	struct extent_map *hole_em;
2372 	struct extent_map_tree *em_tree = &inode->extent_tree;
2373 	struct btrfs_key key;
2374 	int ret;
2375 
2376 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2377 		goto out;
2378 
2379 	key.objectid = btrfs_ino(inode);
2380 	key.type = BTRFS_EXTENT_DATA_KEY;
2381 	key.offset = offset;
2382 
2383 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2384 	if (ret <= 0) {
2385 		/*
2386 		 * We should have dropped this offset, so if we find it then
2387 		 * something has gone horribly wrong.
2388 		 */
2389 		if (ret == 0)
2390 			ret = -EINVAL;
2391 		return ret;
2392 	}
2393 
2394 	leaf = path->nodes[0];
2395 	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2396 		u64 num_bytes;
2397 
2398 		path->slots[0]--;
2399 		fi = btrfs_item_ptr(leaf, path->slots[0],
2400 				    struct btrfs_file_extent_item);
2401 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2402 			end - offset;
2403 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2404 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2405 		btrfs_set_file_extent_offset(leaf, fi, 0);
2406 		btrfs_mark_buffer_dirty(leaf);
2407 		goto out;
2408 	}
2409 
2410 	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2411 		u64 num_bytes;
2412 
2413 		key.offset = offset;
2414 		btrfs_set_item_key_safe(fs_info, path, &key);
2415 		fi = btrfs_item_ptr(leaf, path->slots[0],
2416 				    struct btrfs_file_extent_item);
2417 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2418 			offset;
2419 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2420 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2421 		btrfs_set_file_extent_offset(leaf, fi, 0);
2422 		btrfs_mark_buffer_dirty(leaf);
2423 		goto out;
2424 	}
2425 	btrfs_release_path(path);
2426 
2427 	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2428 			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2429 	if (ret)
2430 		return ret;
2431 
2432 out:
2433 	btrfs_release_path(path);
2434 
2435 	hole_em = alloc_extent_map();
2436 	if (!hole_em) {
2437 		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2438 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2439 	} else {
2440 		hole_em->start = offset;
2441 		hole_em->len = end - offset;
2442 		hole_em->ram_bytes = hole_em->len;
2443 		hole_em->orig_start = offset;
2444 
2445 		hole_em->block_start = EXTENT_MAP_HOLE;
2446 		hole_em->block_len = 0;
2447 		hole_em->orig_block_len = 0;
2448 		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2449 		hole_em->generation = trans->transid;
2450 
2451 		do {
2452 			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2453 			write_lock(&em_tree->lock);
2454 			ret = add_extent_mapping(em_tree, hole_em, 1);
2455 			write_unlock(&em_tree->lock);
2456 		} while (ret == -EEXIST);
2457 		free_extent_map(hole_em);
2458 		if (ret)
2459 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2460 					&inode->runtime_flags);
2461 	}
2462 
2463 	return 0;
2464 }
2465 
2466 /*
2467  * Find a hole extent on the given inode and change start/len to the end of
2468  * the hole extent (a hole/vacuum extent is one whose em->start <= start &&
2469  * em->start + em->len > start).
2470  * When a hole extent is found, return 1 and modify start/len.
2471  */
2472 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2473 {
2474 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2475 	struct extent_map *em;
2476 	int ret = 0;
2477 
2478 	em = btrfs_get_extent(inode, NULL, 0,
2479 			      round_down(*start, fs_info->sectorsize),
2480 			      round_up(*len, fs_info->sectorsize));
2481 	if (IS_ERR(em))
2482 		return PTR_ERR(em);
2483 
2484 	/* Hole or vacuum extent(only exists in no-hole mode) */
2485 	if (em->block_start == EXTENT_MAP_HOLE) {
2486 		ret = 1;
2487 		*len = em->start + em->len > *start + *len ?
2488 		       0 : *start + *len - em->start - em->len;
2489 		*start = em->start + em->len;
2490 	}
2491 	free_extent_map(em);
2492 	return ret;
2493 }
2494 
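/*
 * Flush the page cache for [lockstart, lockend] and lock that extent range,
 * retrying until there are no ordered extents and no pages left in the range.
 */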
2495 static int btrfs_punch_hole_lock_range(struct inode *inode,
2496 				       const u64 lockstart,
2497 				       const u64 lockend,
2498 				       struct extent_state **cached_state)
2499 {
2500 	/*
2501 	 * For the subpage case, if the range is not at a page boundary, we could
2502 	 * have pages at the leading/trailing parts of the range.
2503 	 * This could lead to an endless loop since filemap_range_has_page()
2504 	 * would always return true.
2505 	 * So here we need to do extra page alignment for
2506 	 * filemap_range_has_page().
2507 	 */
2508 	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2509 	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2510 
2511 	while (1) {
2512 		struct btrfs_ordered_extent *ordered;
2513 		int ret;
2514 
2515 		truncate_pagecache_range(inode, lockstart, lockend);
2516 
2517 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2518 				 cached_state);
2519 		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2520 							    lockend);
2521 
2522 		/*
2523 		 * We need to make sure we have no ordered extents in this range
2524 		 * and that nobody raced in and read a page in this range; if either
2525 		 * happened, we need to try again.
2526 		 */
2527 		if ((!ordered ||
2528 		    (ordered->file_offset + ordered->num_bytes <= lockstart ||
2529 		     ordered->file_offset > lockend)) &&
2530 		     !filemap_range_has_page(inode->i_mapping,
2531 					     page_lockstart, page_lockend)) {
2532 			if (ordered)
2533 				btrfs_put_ordered_extent(ordered);
2534 			break;
2535 		}
2536 		if (ordered)
2537 			btrfs_put_ordered_extent(ordered);
2538 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2539 				     lockend, cached_state);
2540 		ret = btrfs_wait_ordered_range(inode, lockstart,
2541 					       lockend - lockstart + 1);
2542 		if (ret)
2543 			return ret;
2544 	}
2545 	return 0;
2546 }
2547 
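/*
 * Insert the file extent item described by @extent_info at its file_offset,
 * covering @replace_len bytes, and add the matching extent reference (a new
 * backref for a freshly allocated extent, or an incremented refcount for an
 * existing one). For holes, no references are taken and only the inode's
 * byte accounting is updated.
 */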
2548 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2549 				     struct btrfs_inode *inode,
2550 				     struct btrfs_path *path,
2551 				     struct btrfs_replace_extent_info *extent_info,
2552 				     const u64 replace_len,
2553 				     const u64 bytes_to_drop)
2554 {
2555 	struct btrfs_fs_info *fs_info = trans->fs_info;
2556 	struct btrfs_root *root = inode->root;
2557 	struct btrfs_file_extent_item *extent;
2558 	struct extent_buffer *leaf;
2559 	struct btrfs_key key;
2560 	int slot;
2561 	struct btrfs_ref ref = { 0 };
2562 	int ret;
2563 
2564 	if (replace_len == 0)
2565 		return 0;
2566 
2567 	if (extent_info->disk_offset == 0 &&
2568 	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2569 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2570 		return 0;
2571 	}
2572 
2573 	key.objectid = btrfs_ino(inode);
2574 	key.type = BTRFS_EXTENT_DATA_KEY;
2575 	key.offset = extent_info->file_offset;
2576 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2577 				      sizeof(struct btrfs_file_extent_item));
2578 	if (ret)
2579 		return ret;
2580 	leaf = path->nodes[0];
2581 	slot = path->slots[0];
2582 	write_extent_buffer(leaf, extent_info->extent_buf,
2583 			    btrfs_item_ptr_offset(leaf, slot),
2584 			    sizeof(struct btrfs_file_extent_item));
2585 	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2586 	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2587 	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2588 	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2589 	if (extent_info->is_new_extent)
2590 		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2591 	btrfs_mark_buffer_dirty(leaf);
2592 	btrfs_release_path(path);
2593 
2594 	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2595 						replace_len);
2596 	if (ret)
2597 		return ret;
2598 
2599 	/* If it's a hole, nothing more needs to be done. */
2600 	if (extent_info->disk_offset == 0) {
2601 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2602 		return 0;
2603 	}
2604 
2605 	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2606 
2607 	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2608 		key.objectid = extent_info->disk_offset;
2609 		key.type = BTRFS_EXTENT_ITEM_KEY;
2610 		key.offset = extent_info->disk_len;
2611 		ret = btrfs_alloc_reserved_file_extent(trans, root,
2612 						       btrfs_ino(inode),
2613 						       extent_info->file_offset,
2614 						       extent_info->qgroup_reserved,
2615 						       &key);
2616 	} else {
2617 		u64 ref_offset;
2618 
2619 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2620 				       extent_info->disk_offset,
2621 				       extent_info->disk_len, 0);
2622 		ref_offset = extent_info->file_offset - extent_info->data_offset;
2623 		btrfs_init_data_ref(&ref, root->root_key.objectid,
2624 				    btrfs_ino(inode), ref_offset);
2625 		ret = btrfs_inc_extent_ref(trans, &ref);
2626 	}
2627 
2628 	extent_info->insertions++;
2629 
2630 	return ret;
2631 }
2632 
2633 /*
2634  * The respective range must have been previously locked, as well as the inode.
2635  * The end offset is inclusive (last byte of the range).
2636  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2637  * the file range with an extent.
2638  * When not punching a hole, we don't want to end up in a state where we dropped
2639  * extents without inserting a new one, so we must abort the transaction to avoid
2640  * corruption.
2641  */
2642 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2643 			       struct btrfs_path *path, const u64 start,
2644 			       const u64 end,
2645 			       struct btrfs_replace_extent_info *extent_info,
2646 			       struct btrfs_trans_handle **trans_out)
2647 {
2648 	struct btrfs_drop_extents_args drop_args = { 0 };
2649 	struct btrfs_root *root = inode->root;
2650 	struct btrfs_fs_info *fs_info = root->fs_info;
2651 	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2652 	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2653 	struct btrfs_trans_handle *trans = NULL;
2654 	struct btrfs_block_rsv *rsv;
2655 	unsigned int rsv_count;
2656 	u64 cur_offset;
2657 	u64 len = end - start;
2658 	int ret = 0;
2659 
2660 	if (end <= start)
2661 		return -EINVAL;
2662 
2663 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2664 	if (!rsv) {
2665 		ret = -ENOMEM;
2666 		goto out;
2667 	}
2668 	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2669 	rsv->failfast = 1;
2670 
2671 	/*
2672 	 * 1 - updating the inode
2673 	 * 1 - removing the extents in the range
2674 	 * 1 - adding the hole extent if no_holes isn't set or if we are
2675 	 *     replacing the range with a new extent
2676 	 */
2677 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2678 		rsv_count = 3;
2679 	else
2680 		rsv_count = 2;
2681 
2682 	trans = btrfs_start_transaction(root, rsv_count);
2683 	if (IS_ERR(trans)) {
2684 		ret = PTR_ERR(trans);
2685 		trans = NULL;
2686 		goto out_free;
2687 	}
2688 
2689 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2690 				      min_size, false);
2691 	BUG_ON(ret);
2692 	trans->block_rsv = rsv;
2693 
2694 	cur_offset = start;
2695 	drop_args.path = path;
2696 	drop_args.end = end + 1;
2697 	drop_args.drop_cache = true;
2698 	while (cur_offset < end) {
2699 		drop_args.start = cur_offset;
2700 		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2701 		/* If we are punching a hole, decrement the inode's byte count */
2702 		if (!extent_info)
2703 			btrfs_update_inode_bytes(inode, 0,
2704 						 drop_args.bytes_found);
2705 		if (ret != -ENOSPC) {
2706 			/*
2707 			 * When cloning we want to avoid transaction aborts when
2708 			 * nothing was done and we are attempting to clone parts
2709 			 * of inline extents; in such cases -EOPNOTSUPP is
2710 			 * returned by __btrfs_drop_extents() without having
2711 			 * changed anything in the file.
2712 			 */
2713 			if (extent_info && !extent_info->is_new_extent &&
2714 			    ret && ret != -EOPNOTSUPP)
2715 				btrfs_abort_transaction(trans, ret);
2716 			break;
2717 		}
2718 
2719 		trans->block_rsv = &fs_info->trans_block_rsv;
2720 
2721 		if (!extent_info && cur_offset < drop_args.drop_end &&
2722 		    cur_offset < ino_size) {
2723 			ret = fill_holes(trans, inode, path, cur_offset,
2724 					 drop_args.drop_end);
2725 			if (ret) {
2726 				/*
2727 				 * If we failed then we didn't insert our hole
2728 				 * entries for the area we dropped, so now the
2729 				 * fs is corrupted, so we must abort the
2730 				 * transaction.
2731 				 */
2732 				btrfs_abort_transaction(trans, ret);
2733 				break;
2734 			}
2735 		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2736 			/*
2737 			 * We are past the i_size here, but since we didn't
2738 			 * insert holes we need to clear the mapped area so we
2739 			 * know to not set disk_i_size in this area until a new
2740 			 * file extent is inserted here.
2741 			 */
2742 			ret = btrfs_inode_clear_file_extent_range(inode,
2743 					cur_offset,
2744 					drop_args.drop_end - cur_offset);
2745 			if (ret) {
2746 				/*
2747 				 * We couldn't clear our area, so we could
2748 				 * presumably adjust up and corrupt the fs, so
2749 				 * we need to abort.
2750 				 */
2751 				btrfs_abort_transaction(trans, ret);
2752 				break;
2753 			}
2754 		}
2755 
2756 		if (extent_info &&
2757 		    drop_args.drop_end > extent_info->file_offset) {
2758 			u64 replace_len = drop_args.drop_end -
2759 					  extent_info->file_offset;
2760 
2761 			ret = btrfs_insert_replace_extent(trans, inode,	path,
2762 					extent_info, replace_len,
2763 					drop_args.bytes_found);
2764 			if (ret) {
2765 				btrfs_abort_transaction(trans, ret);
2766 				break;
2767 			}
2768 			extent_info->data_len -= replace_len;
2769 			extent_info->data_offset += replace_len;
2770 			extent_info->file_offset += replace_len;
2771 		}
2772 
2773 		ret = btrfs_update_inode(trans, root, inode);
2774 		if (ret)
2775 			break;
2776 
2777 		btrfs_end_transaction(trans);
2778 		btrfs_btree_balance_dirty(fs_info);
2779 
2780 		trans = btrfs_start_transaction(root, rsv_count);
2781 		if (IS_ERR(trans)) {
2782 			ret = PTR_ERR(trans);
2783 			trans = NULL;
2784 			break;
2785 		}
2786 
2787 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2788 					      rsv, min_size, false);
2789 		BUG_ON(ret);	/* shouldn't happen */
2790 		trans->block_rsv = rsv;
2791 
2792 		cur_offset = drop_args.drop_end;
2793 		len = end - cur_offset;
2794 		if (!extent_info && len) {
2795 			ret = find_first_non_hole(inode, &cur_offset, &len);
2796 			if (unlikely(ret < 0))
2797 				break;
2798 			if (ret && !len) {
2799 				ret = 0;
2800 				break;
2801 			}
2802 		}
2803 	}
2804 
2805 	/*
2806 	 * If we were cloning, force the next fsync to be a full one since we
2807 	 * replaced (or just dropped in the case of cloning holes when
2808 	 * NO_HOLES is enabled) file extent items and did not set up new extent
2809 	 * maps for the replacement extents (or holes).
2810 	 */
2811 	if (extent_info && !extent_info->is_new_extent)
2812 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2813 
2814 	if (ret)
2815 		goto out_trans;
2816 
2817 	trans->block_rsv = &fs_info->trans_block_rsv;
2818 	/*
2819 	 * If we are using the NO_HOLES feature we might already have had a
2820 	 * hole that overlaps a part of the region [lockstart, lockend] and
2821 	 * ends at (or beyond) lockend. Since we have no file extent items to
2822 	 * represent holes, drop_end can be less than lockend and so we must
2823 	 * make sure we have an extent map representing the existing hole (the
2824 	 * call to __btrfs_drop_extents() might have dropped the existing extent
2825 	 * map representing the existing hole), otherwise the fast fsync path
2826 	 * will not record the existence of the hole region
2827 	 * [existing_hole_start, lockend].
2828 	 */
2829 	if (drop_args.drop_end <= end)
2830 		drop_args.drop_end = end + 1;
2831 	/*
2832 	 * Don't insert file hole extent item if it's for a range beyond eof
2833 	 * (because it's useless) or if it represents a 0 byte range (when
2834 	 * cur_offset == drop_end).
2835 	 */
2836 	if (!extent_info && cur_offset < ino_size &&
2837 	    cur_offset < drop_args.drop_end) {
2838 		ret = fill_holes(trans, inode, path, cur_offset,
2839 				 drop_args.drop_end);
2840 		if (ret) {
2841 			/* Same comment as above. */
2842 			btrfs_abort_transaction(trans, ret);
2843 			goto out_trans;
2844 		}
2845 	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2846 		/* See the comment in the loop above for the reasoning here. */
2847 		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2848 					drop_args.drop_end - cur_offset);
2849 		if (ret) {
2850 			btrfs_abort_transaction(trans, ret);
2851 			goto out_trans;
2852 		}
2853 
2854 	}
2855 	if (extent_info) {
2856 		ret = btrfs_insert_replace_extent(trans, inode, path,
2857 				extent_info, extent_info->data_len,
2858 				drop_args.bytes_found);
2859 		if (ret) {
2860 			btrfs_abort_transaction(trans, ret);
2861 			goto out_trans;
2862 		}
2863 	}
2864 
2865 out_trans:
2866 	if (!trans)
2867 		goto out_free;
2868 
2869 	trans->block_rsv = &fs_info->trans_block_rsv;
2870 	if (ret)
2871 		btrfs_end_transaction(trans);
2872 	else
2873 		*trans_out = trans;
2874 out_free:
2875 	btrfs_free_block_rsv(fs_info, rsv);
2876 out:
2877 	return ret;
2878 }
2879 
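/*
 * Punch a hole in the range [offset, offset + len): zero any partial blocks
 * at the edges, then drop the fully covered extents and insert hole items
 * (or clear the cached extent range when NO_HOLES is enabled) via
 * btrfs_replace_file_extents().
 */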
2880 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2881 {
2882 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2883 	struct btrfs_root *root = BTRFS_I(inode)->root;
2884 	struct extent_state *cached_state = NULL;
2885 	struct btrfs_path *path;
2886 	struct btrfs_trans_handle *trans = NULL;
2887 	u64 lockstart;
2888 	u64 lockend;
2889 	u64 tail_start;
2890 	u64 tail_len;
2891 	u64 orig_start = offset;
2892 	int ret = 0;
2893 	bool same_block;
2894 	u64 ino_size;
2895 	bool truncated_block = false;
2896 	bool updated_inode = false;
2897 
2898 	ret = btrfs_wait_ordered_range(inode, offset, len);
2899 	if (ret)
2900 		return ret;
2901 
2902 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2903 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2904 	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2905 	if (ret < 0)
2906 		goto out_only_mutex;
2907 	if (ret && !len) {
2908 		/* Already in a large hole */
2909 		ret = 0;
2910 		goto out_only_mutex;
2911 	}
2912 
2913 	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2914 	lockend = round_down(offset + len,
2915 			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2916 	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2917 		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2918 	/*
2919 	 * We needn't truncate any block which is beyond the end of the file
2920 	 * because we are sure there is no data there.
2921 	 */
2922 	/*
2923 	 * Only do this if we are in the same block and we aren't doing the
2924 	 * entire block.
2925 	 */
2926 	if (same_block && len < fs_info->sectorsize) {
2927 		if (offset < ino_size) {
2928 			truncated_block = true;
2929 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2930 						   0);
2931 		} else {
2932 			ret = 0;
2933 		}
2934 		goto out_only_mutex;
2935 	}
2936 
2937 	/* zero back part of the first block */
2938 	if (offset < ino_size) {
2939 		truncated_block = true;
2940 		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2941 		if (ret) {
2942 			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2943 			return ret;
2944 		}
2945 	}
2946 
2947 	/* Check the aligned pages after the first unaligned page.
2948 	 * If offset != orig_start, the first unaligned page and
2949 	 * several following pages are already within holes, so
2950 	 * the extra check can be skipped. */
2951 	if (offset == orig_start) {
2952 		/* after truncating the page, check for a hole again */
2953 		len = offset + len - lockstart;
2954 		offset = lockstart;
2955 		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2956 		if (ret < 0)
2957 			goto out_only_mutex;
2958 		if (ret && !len) {
2959 			ret = 0;
2960 			goto out_only_mutex;
2961 		}
2962 		lockstart = offset;
2963 	}
2964 
2965 	/* Check whether the unaligned tail part is in a hole */
2966 	tail_start = lockend + 1;
2967 	tail_len = offset + len - tail_start;
2968 	if (tail_len) {
2969 		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2970 		if (unlikely(ret < 0))
2971 			goto out_only_mutex;
2972 		if (!ret) {
2973 			/* zero the front end of the last page */
2974 			if (tail_start + tail_len < ino_size) {
2975 				truncated_block = true;
2976 				ret = btrfs_truncate_block(BTRFS_I(inode),
2977 							tail_start + tail_len,
2978 							0, 1);
2979 				if (ret)
2980 					goto out_only_mutex;
2981 			}
2982 		}
2983 	}
2984 
2985 	if (lockend < lockstart) {
2986 		ret = 0;
2987 		goto out_only_mutex;
2988 	}
2989 
2990 	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2991 					  &cached_state);
2992 	if (ret)
2993 		goto out_only_mutex;
2994 
2995 	path = btrfs_alloc_path();
2996 	if (!path) {
2997 		ret = -ENOMEM;
2998 		goto out;
2999 	}
3000 
3001 	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
3002 					 lockend, NULL, &trans);
3003 	btrfs_free_path(path);
3004 	if (ret)
3005 		goto out;
3006 
3007 	ASSERT(trans != NULL);
3008 	inode_inc_iversion(inode);
3009 	inode->i_mtime = inode->i_ctime = current_time(inode);
3010 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3011 	updated_inode = true;
3012 	btrfs_end_transaction(trans);
3013 	btrfs_btree_balance_dirty(fs_info);
3014 out:
3015 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3016 			     &cached_state);
3017 out_only_mutex:
3018 	if (!updated_inode && truncated_block && !ret) {
3019 		/*
3020 		 * If we only end up zeroing part of a page, we still need to
3021 		 * update the inode item, so that all the time fields are
3022 		 * updated as well as the necessary btrfs inode in memory fields
3023 		 * for detecting, at fsync time, if the inode isn't yet in the
3024 		 * log tree or it's there but not up to date.
3025 		 */
3026 		struct timespec64 now = current_time(inode);
3027 
3028 		inode_inc_iversion(inode);
3029 		inode->i_mtime = now;
3030 		inode->i_ctime = now;
3031 		trans = btrfs_start_transaction(root, 1);
3032 		if (IS_ERR(trans)) {
3033 			ret = PTR_ERR(trans);
3034 		} else {
3035 			int ret2;
3036 
3037 			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3038 			ret2 = btrfs_end_transaction(trans);
3039 			if (!ret)
3040 				ret = ret2;
3041 		}
3042 	}
3043 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3044 	return ret;
3045 }
3046 
3047 /* Helper structure to record which range is already reserved */
3048 struct falloc_range {
3049 	struct list_head list;
3050 	u64 start;
3051 	u64 len;
3052 };
3053 
3054 /*
3055  * Helper function to add falloc range
3056  *
3057  * The caller should have locked the larger extent range containing
3058  * [start, len).
3059  */
3060 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3061 {
3062 	struct falloc_range *range = NULL;
3063 
3064 	if (!list_empty(head)) {
3065 		/*
3066 		 * As fallocate iterates by bytenr order, we only need to check
3067 		 * the last range.
3068 		 */
3069 		range = list_last_entry(head, struct falloc_range, list);
3070 		if (range->start + range->len == start) {
3071 			range->len += len;
3072 			return 0;
3073 		}
3074 	}
3075 
3076 	range = kmalloc(sizeof(*range), GFP_KERNEL);
3077 	if (!range)
3078 		return -ENOMEM;
3079 	range->start = start;
3080 	range->len = len;
3081 	list_add_tail(&range->list, head);
3082 	return 0;
3083 }
3084 
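/*
 * After fallocate extended the file, update i_size (and the inode item) to
 * @end, unless FALLOC_FL_KEEP_SIZE was requested or the file is already at
 * least that large.
 */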
3085 static int btrfs_fallocate_update_isize(struct inode *inode,
3086 					const u64 end,
3087 					const int mode)
3088 {
3089 	struct btrfs_trans_handle *trans;
3090 	struct btrfs_root *root = BTRFS_I(inode)->root;
3091 	int ret;
3092 	int ret2;
3093 
3094 	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3095 		return 0;
3096 
3097 	trans = btrfs_start_transaction(root, 1);
3098 	if (IS_ERR(trans))
3099 		return PTR_ERR(trans);
3100 
3101 	inode->i_ctime = current_time(inode);
3102 	i_size_write(inode, end);
3103 	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3104 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3105 	ret2 = btrfs_end_transaction(trans);
3106 
3107 	return ret ? ret : ret2;
3108 }
3109 
3110 enum {
3111 	RANGE_BOUNDARY_WRITTEN_EXTENT,
3112 	RANGE_BOUNDARY_PREALLOC_EXTENT,
3113 	RANGE_BOUNDARY_HOLE,
3114 };
3115 
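/*
 * Classify the block containing @offset as a hole, a prealloc extent or a
 * written extent, so the zero range operation knows whether the boundary
 * block needs zeroing or must be covered by the allocation.
 */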
3116 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3117 						 u64 offset)
3118 {
3119 	const u64 sectorsize = btrfs_inode_sectorsize(inode);
3120 	struct extent_map *em;
3121 	int ret;
3122 
3123 	offset = round_down(offset, sectorsize);
3124 	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3125 	if (IS_ERR(em))
3126 		return PTR_ERR(em);
3127 
3128 	if (em->block_start == EXTENT_MAP_HOLE)
3129 		ret = RANGE_BOUNDARY_HOLE;
3130 	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3131 		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3132 	else
3133 		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3134 
3135 	free_extent_map(em);
3136 	return ret;
3137 }
3138 
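/*
 * Implement FALLOC_FL_ZERO_RANGE: zero out partial boundary blocks that map
 * to written extents and replace the rest of the range with a preallocated
 * (unwritten) extent, skipping any prealloc extent that already covers the
 * start of the range.
 */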
3139 static int btrfs_zero_range(struct inode *inode,
3140 			    loff_t offset,
3141 			    loff_t len,
3142 			    const int mode)
3143 {
3144 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3145 	struct extent_map *em;
3146 	struct extent_changeset *data_reserved = NULL;
3147 	int ret;
3148 	u64 alloc_hint = 0;
3149 	const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3150 	u64 alloc_start = round_down(offset, sectorsize);
3151 	u64 alloc_end = round_up(offset + len, sectorsize);
3152 	u64 bytes_to_reserve = 0;
3153 	bool space_reserved = false;
3154 
3155 	inode_dio_wait(inode);
3156 
3157 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3158 			      alloc_end - alloc_start);
3159 	if (IS_ERR(em)) {
3160 		ret = PTR_ERR(em);
3161 		goto out;
3162 	}
3163 
3164 	/*
3165 	 * Avoid hole punching and extent allocation for some cases. More cases
3166 	 * could be considered, but these are unlikely to be common and we keep things
3167 	 * as simple as possible for now. Also, intentionally, if the target
3168 	 * range contains one or more prealloc extents together with regular
3169 	 * extents and holes, we drop all the existing extents and allocate a
3170 	 * new prealloc extent, so that we get a larger contiguous disk extent.
3171 	 */
3172 	if (em->start <= alloc_start &&
3173 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3174 		const u64 em_end = em->start + em->len;
3175 
3176 		if (em_end >= offset + len) {
3177 			/*
3178 			 * The whole range is already a prealloc extent,
3179 			 * do nothing except updating the inode's i_size if
3180 			 * needed.
3181 			 */
3182 			free_extent_map(em);
3183 			ret = btrfs_fallocate_update_isize(inode, offset + len,
3184 							   mode);
3185 			goto out;
3186 		}
3187 		/*
3188 		 * Part of the range is already a prealloc extent, so operate
3189 		 * only on the remaining part of the range.
3190 		 */
3191 		alloc_start = em_end;
3192 		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3193 		len = offset + len - alloc_start;
3194 		offset = alloc_start;
3195 		alloc_hint = em->block_start + em->len;
3196 	}
3197 	free_extent_map(em);
3198 
3199 	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3200 	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3201 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3202 				      sectorsize);
3203 		if (IS_ERR(em)) {
3204 			ret = PTR_ERR(em);
3205 			goto out;
3206 		}
3207 
3208 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3209 			free_extent_map(em);
3210 			ret = btrfs_fallocate_update_isize(inode, offset + len,
3211 							   mode);
3212 			goto out;
3213 		}
3214 		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3215 			free_extent_map(em);
3216 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3217 						   0);
3218 			if (!ret)
3219 				ret = btrfs_fallocate_update_isize(inode,
3220 								   offset + len,
3221 								   mode);
3222 			return ret;
3223 		}
3224 		free_extent_map(em);
3225 		alloc_start = round_down(offset, sectorsize);
3226 		alloc_end = alloc_start + sectorsize;
3227 		goto reserve_space;
3228 	}
3229 
3230 	alloc_start = round_up(offset, sectorsize);
3231 	alloc_end = round_down(offset + len, sectorsize);
3232 
3233 	/*
3234 	 * For unaligned ranges, check the pages at the boundaries: they might
3235 	 * map to an extent, in which case we need to partially zero them, or
3236 	 * they might map to a hole, in which case we need our allocation range
3237 	 * to cover them.
3238 	 */
3239 	if (!IS_ALIGNED(offset, sectorsize)) {
3240 		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3241 							    offset);
3242 		if (ret < 0)
3243 			goto out;
3244 		if (ret == RANGE_BOUNDARY_HOLE) {
3245 			alloc_start = round_down(offset, sectorsize);
3246 			ret = 0;
3247 		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3248 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3249 			if (ret)
3250 				goto out;
3251 		} else {
3252 			ret = 0;
3253 		}
3254 	}
3255 
3256 	if (!IS_ALIGNED(offset + len, sectorsize)) {
3257 		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3258 							    offset + len);
3259 		if (ret < 0)
3260 			goto out;
3261 		if (ret == RANGE_BOUNDARY_HOLE) {
3262 			alloc_end = round_up(offset + len, sectorsize);
3263 			ret = 0;
3264 		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3265 			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3266 						   0, 1);
3267 			if (ret)
3268 				goto out;
3269 		} else {
3270 			ret = 0;
3271 		}
3272 	}
3273 
3274 reserve_space:
3275 	if (alloc_start < alloc_end) {
3276 		struct extent_state *cached_state = NULL;
3277 		const u64 lockstart = alloc_start;
3278 		const u64 lockend = alloc_end - 1;
3279 
3280 		bytes_to_reserve = alloc_end - alloc_start;
3281 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3282 						      bytes_to_reserve);
3283 		if (ret < 0)
3284 			goto out;
3285 		space_reserved = true;
3286 		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3287 						  &cached_state);
3288 		if (ret)
3289 			goto out;
3290 		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3291 						alloc_start, bytes_to_reserve);
3292 		if (ret) {
3293 			unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3294 					     lockend, &cached_state);
3295 			goto out;
3296 		}
3297 		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3298 						alloc_end - alloc_start,
3299 						i_blocksize(inode),
3300 						offset + len, &alloc_hint);
3301 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3302 				     lockend, &cached_state);
3303 		/* btrfs_prealloc_file_range releases reserved space on error */
3304 		if (ret) {
3305 			space_reserved = false;
3306 			goto out;
3307 		}
3308 	}
3309 	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3310 out:
3311 	if (ret && space_reserved)
3312 		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3313 					       alloc_start, bytes_to_reserve);
3314 	extent_changeset_free(data_reserved);
3315 
3316 	return ret;
3317 }
3318 
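/*
 * fallocate entry point. Hole punching and zero range are handled by their
 * own helpers; plain preallocation reserves data space, waits for ordered
 * I/O, locks the range, and allocates unwritten extents for every hole in
 * it, updating i_size at the end unless FALLOC_FL_KEEP_SIZE is set.
 */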
3319 static long btrfs_fallocate(struct file *file, int mode,
3320 			    loff_t offset, loff_t len)
3321 {
3322 	struct inode *inode = file_inode(file);
3323 	struct extent_state *cached_state = NULL;
3324 	struct extent_changeset *data_reserved = NULL;
3325 	struct falloc_range *range;
3326 	struct falloc_range *tmp;
3327 	struct list_head reserve_list;
3328 	u64 cur_offset;
3329 	u64 last_byte;
3330 	u64 alloc_start;
3331 	u64 alloc_end;
3332 	u64 alloc_hint = 0;
3333 	u64 locked_end;
3334 	u64 actual_end = 0;
3335 	struct extent_map *em;
3336 	int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3337 	int ret;
3338 
3339 	/* Do not allow fallocate in ZONED mode */
3340 	if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3341 		return -EOPNOTSUPP;
3342 
3343 	alloc_start = round_down(offset, blocksize);
3344 	alloc_end = round_up(offset + len, blocksize);
3345 	cur_offset = alloc_start;
3346 
3347 	/* Make sure we aren't being given some crap mode */
3348 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3349 		     FALLOC_FL_ZERO_RANGE))
3350 		return -EOPNOTSUPP;
3351 
3352 	if (mode & FALLOC_FL_PUNCH_HOLE)
3353 		return btrfs_punch_hole(inode, offset, len);
3354 
3355 	/*
3356 	 * Only trigger disk allocation, don't trigger qgroup reserve.
3357 	 *
3358 	 * Qgroup space will be checked later.
3359 	 */
3360 	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3361 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3362 						      alloc_end - alloc_start);
3363 		if (ret < 0)
3364 			return ret;
3365 	}
3366 
3367 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
3368 
3369 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3370 		ret = inode_newsize_ok(inode, offset + len);
3371 		if (ret)
3372 			goto out;
3373 	}
3374 
3375 	/*
3376 	 * TODO: Move these two operations after we have checked
3377 	 * accurate reserved space, or fallocate can still fail but
3378 	 * with the page truncated or the size expanded.
3379 	 *
3380 	 * But that's a minor problem and won't do much harm BTW.
3381 	 */
3382 	if (alloc_start > inode->i_size) {
3383 		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3384 					alloc_start);
3385 		if (ret)
3386 			goto out;
3387 	} else if (offset + len > inode->i_size) {
3388 		/*
3389 		 * If we are fallocating from the end of the file onward we
3390 		 * need to zero out the end of the block if i_size lands in the
3391 		 * middle of a block.
3392 		 */
3393 		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3394 		if (ret)
3395 			goto out;
3396 	}
3397 
3398 	/*
3399 	 * wait for ordered IO before we have any locks.  We'll loop again
3400 	 * below with the locks held.
3401 	 */
3402 	ret = btrfs_wait_ordered_range(inode, alloc_start,
3403 				       alloc_end - alloc_start);
3404 	if (ret)
3405 		goto out;
3406 
3407 	if (mode & FALLOC_FL_ZERO_RANGE) {
3408 		ret = btrfs_zero_range(inode, offset, len, mode);
3409 		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3410 		return ret;
3411 	}
3412 
3413 	locked_end = alloc_end - 1;
3414 	while (1) {
3415 		struct btrfs_ordered_extent *ordered;
3416 
3417 		/* the extent lock is ordered inside the running
3418 		 * transaction
3419 		 */
3420 		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3421 				 locked_end, &cached_state);
3422 		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3423 							    locked_end);
3424 
3425 		if (ordered &&
3426 		    ordered->file_offset + ordered->num_bytes > alloc_start &&
3427 		    ordered->file_offset < alloc_end) {
3428 			btrfs_put_ordered_extent(ordered);
3429 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3430 					     alloc_start, locked_end,
3431 					     &cached_state);
3432 			/*
3433 			 * we can't wait on the range with the transaction
3434 			 * running or with the extent lock held
3435 			 */
3436 			ret = btrfs_wait_ordered_range(inode, alloc_start,
3437 						       alloc_end - alloc_start);
3438 			if (ret)
3439 				goto out;
3440 		} else {
3441 			if (ordered)
3442 				btrfs_put_ordered_extent(ordered);
3443 			break;
3444 		}
3445 	}
3446 
3447 	/* First, check if we exceed the qgroup limit */
3448 	INIT_LIST_HEAD(&reserve_list);
3449 	while (cur_offset < alloc_end) {
3450 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3451 				      alloc_end - cur_offset);
3452 		if (IS_ERR(em)) {
3453 			ret = PTR_ERR(em);
3454 			break;
3455 		}
3456 		last_byte = min(extent_map_end(em), alloc_end);
3457 		actual_end = min_t(u64, extent_map_end(em), offset + len);
3458 		last_byte = ALIGN(last_byte, blocksize);
3459 		if (em->block_start == EXTENT_MAP_HOLE ||
3460 		    (cur_offset >= inode->i_size &&
3461 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3462 			ret = add_falloc_range(&reserve_list, cur_offset,
3463 					       last_byte - cur_offset);
3464 			if (ret < 0) {
3465 				free_extent_map(em);
3466 				break;
3467 			}
3468 			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3469 					&data_reserved, cur_offset,
3470 					last_byte - cur_offset);
3471 			if (ret < 0) {
3472 				cur_offset = last_byte;
3473 				free_extent_map(em);
3474 				break;
3475 			}
3476 		} else {
3477 			/*
3478 			 * No need to reserve an unwritten extent for this
3479 			 * range; free the reserved data space first, otherwise
3480 			 * it'll result in a false ENOSPC error.
3481 			 */
3482 			btrfs_free_reserved_data_space(BTRFS_I(inode),
3483 				data_reserved, cur_offset,
3484 				last_byte - cur_offset);
3485 		}
3486 		free_extent_map(em);
3487 		cur_offset = last_byte;
3488 	}
3489 
3490 	/*
3491 	 * If ret is still 0, it means we're OK to fallocate.
3492 	 * Otherwise just clean up the list and exit.
3493 	 */
3494 	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3495 		if (!ret)
3496 			ret = btrfs_prealloc_file_range(inode, mode,
3497 					range->start,
3498 					range->len, i_blocksize(inode),
3499 					offset + len, &alloc_hint);
3500 		else
3501 			btrfs_free_reserved_data_space(BTRFS_I(inode),
3502 					data_reserved, range->start,
3503 					range->len);
3504 		list_del(&range->list);
3505 		kfree(range);
3506 	}
3507 	if (ret < 0)
3508 		goto out_unlock;
3509 
3510 	/*
3511 	 * We didn't need to allocate any more space, but we still extended the
3512 	 * size of the file so we need to update i_size and the inode item.
3513 	 */
3514 	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3515 out_unlock:
3516 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3517 			     &cached_state);
3518 out:
3519 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3520 	/* Let go of our reservation. */
3521 	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3522 		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3523 				cur_offset, alloc_end - cur_offset);
3524 	extent_changeset_free(data_reserved);
3525 	return ret;
3526 }
3527 
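/*
 * Walk the extent maps starting at @offset to find the next data or hole
 * position (depending on @whence, SEEK_DATA or SEEK_HOLE) for lseek;
 * prealloc (unwritten) extents count as holes.
 */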
3528 static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
3529 				  int whence)
3530 {
3531 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3532 	struct extent_map *em = NULL;
3533 	struct extent_state *cached_state = NULL;
3534 	loff_t i_size = inode->vfs_inode.i_size;
3535 	u64 lockstart;
3536 	u64 lockend;
3537 	u64 start;
3538 	u64 len;
3539 	int ret = 0;
3540 
3541 	if (i_size == 0 || offset >= i_size)
3542 		return -ENXIO;
3543 
3544 	/*
3545 	 * offset can be negative; in this case we start finding DATA/HOLE from
3546 	 * the very start of the file.
3547 	 */
3548 	start = max_t(loff_t, 0, offset);
3549 
3550 	lockstart = round_down(start, fs_info->sectorsize);
3551 	lockend = round_up(i_size, fs_info->sectorsize);
3552 	if (lockend <= lockstart)
3553 		lockend = lockstart + fs_info->sectorsize;
3554 	lockend--;
3555 	len = lockend - lockstart + 1;
3556 
3557 	lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
3558 
3559 	while (start < i_size) {
3560 		em = btrfs_get_extent_fiemap(inode, start, len);
3561 		if (IS_ERR(em)) {
3562 			ret = PTR_ERR(em);
3563 			em = NULL;
3564 			break;
3565 		}
3566 
3567 		if (whence == SEEK_HOLE &&
3568 		    (em->block_start == EXTENT_MAP_HOLE ||
3569 		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3570 			break;
3571 		else if (whence == SEEK_DATA &&
3572 			   (em->block_start != EXTENT_MAP_HOLE &&
3573 			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3574 			break;
3575 
3576 		start = em->start + em->len;
3577 		free_extent_map(em);
3578 		em = NULL;
3579 		cond_resched();
3580 	}
3581 	free_extent_map(em);
3582 	unlock_extent_cached(&inode->io_tree, lockstart, lockend,
3583 			     &cached_state);
3584 	if (ret) {
3585 		offset = ret;
3586 	} else {
3587 		if (whence == SEEK_DATA && start >= i_size)
3588 			offset = -ENXIO;
3589 		else
3590 			offset = min_t(loff_t, start, i_size);
3591 	}
3592 
3593 	return offset;
3594 }
3595 
3596 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3597 {
3598 	struct inode *inode = file->f_mapping->host;
3599 
3600 	switch (whence) {
3601 	default:
3602 		return generic_file_llseek(file, offset, whence);
3603 	case SEEK_DATA:
3604 	case SEEK_HOLE:
3605 		btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3606 		offset = find_desired_extent(BTRFS_I(inode), offset, whence);
3607 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3608 		break;
3609 	}
3610 
3611 	if (offset < 0)
3612 		return offset;
3613 
3614 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3615 }
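
/*
 * Illustrative user-space sketch (not part of this file): the SEEK_DATA /
 * SEEK_HOLE handling above is what lseek(2) exercises when enumerating the
 * data regions of a sparse file.  Assuming an already-open fd:
 *
 *	#define _GNU_SOURCE
 *	#include <unistd.h>
 *	#include <stdio.h>
 *
 *	off_t data = 0, hole;
 *
 *	while ((data = lseek(fd, data, SEEK_DATA)) != (off_t)-1) {
 *		hole = lseek(fd, data, SEEK_HOLE);
 *		printf("data: [%lld, %lld)\n", (long long)data,
 *		       (long long)hole);
 *		data = hole;
 *	}
 *	// lseek() fails with ENXIO once no data remains past the offset,
 *	// matching the -ENXIO returns in find_desired_extent().
 */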
3616 
3617 static int btrfs_file_open(struct inode *inode, struct file *filp)
3618 {
3619 	int ret;
3620 
3621 	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3622 
3623 	ret = fsverity_file_open(inode, filp);
3624 	if (ret)
3625 		return ret;
3626 	return generic_file_open(inode, filp);
3627 }
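
/*
 * Illustrative user-space sketch (not part of this file): because the open
 * path above sets FMODE_NOWAIT, callers may pass RWF_NOWAIT to preadv2(2) or
 * pwritev2(2) and may get EAGAIN instead of blocking when the request cannot
 * be served immediately.  Assuming an already-open fd:
 *
 *	#define _GNU_SOURCE
 *	#include <sys/uio.h>
 *	#include <errno.h>
 *
 *	char buf[4096];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	ssize_t n = preadv2(fd, &iov, 1, 0, RWF_NOWAIT);
 *
 *	if (n < 0 && errno == EAGAIN)
 *		;	// would have blocked; retry later or fall back
 */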
3628 
3629 static int check_direct_read(struct btrfs_fs_info *fs_info,
3630 			     const struct iov_iter *iter, loff_t offset)
3631 {
3632 	int ret;
3633 	int i, seg;
3634 
3635 	ret = check_direct_IO(fs_info, iter, offset);
3636 	if (ret < 0)
3637 		return ret;
3638 
3639 	if (!iter_is_iovec(iter))
3640 		return 0;
3641 
3642 	for (seg = 0; seg < iter->nr_segs; seg++)
3643 		for (i = seg + 1; i < iter->nr_segs; i++)
3644 			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
3645 				return -EINVAL;
3646 	return 0;
3647 }
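
/*
 * Illustrative sketch (not part of this file): the duplicate-segment check
 * above rejects an iovec array in which two segments share a base address,
 * for example:
 *
 *	char buf[4096];
 *	struct iovec iov[2] = {
 *		{ .iov_base = buf, .iov_len = 2048 },
 *		{ .iov_base = buf, .iov_len = 2048 },	// same base: rejected
 *	};
 *
 * For an O_DIRECT read built from such an array, btrfs_direct_read() below
 * returns 0 and btrfs_file_read_iter() falls back to the buffered path.
 */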
3648 
3649 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3650 {
3651 	struct inode *inode = file_inode(iocb->ki_filp);
3652 	ssize_t ret;
3653 
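	/*
	 * fs-verity verification is done on page cache pages, which direct
	 * I/O bypasses, so force the buffered fallback by returning 0 here.
	 */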
3654 	if (fsverity_active(inode))
3655 		return 0;
3656 
3657 	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3658 		return 0;
3659 
3660 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3661 	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
3662 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3663 	return ret;
3664 }
3665 
3666 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3667 {
3668 	ssize_t ret = 0;
3669 
3670 	if (iocb->ki_flags & IOCB_DIRECT) {
3671 		ret = btrfs_direct_read(iocb, to);
3672 		if (ret < 0 || !iov_iter_count(to) ||
3673 		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3674 			return ret;
3675 	}
3676 
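	/*
	 * Buffered read; this also picks up whatever a short direct read above
	 * did not cover, with 'ret' carrying the bytes already read.
	 */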
3677 	return filemap_read(iocb, to, ret);
3678 }
3679 
3680 const struct file_operations btrfs_file_operations = {
3681 	.llseek		= btrfs_file_llseek,
3682 	.read_iter      = btrfs_file_read_iter,
3683 	.splice_read	= generic_file_splice_read,
3684 	.write_iter	= btrfs_file_write_iter,
3685 	.splice_write	= iter_file_splice_write,
3686 	.mmap		= btrfs_file_mmap,
3687 	.open		= btrfs_file_open,
3688 	.release	= btrfs_release_file,
3689 	.fsync		= btrfs_sync_file,
3690 	.fallocate	= btrfs_fallocate,
3691 	.unlocked_ioctl	= btrfs_ioctl,
3692 #ifdef CONFIG_COMPAT
3693 	.compat_ioctl	= btrfs_compat_ioctl,
3694 #endif
3695 	.remap_file_range = btrfs_remap_file_range,
3696 };
3697 
3698 void __cold btrfs_auto_defrag_exit(void)
3699 {
3700 	kmem_cache_destroy(btrfs_inode_defrag_cachep);
3701 }
3702 
3703 int __init btrfs_auto_defrag_init(void)
3704 {
3705 	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3706 					sizeof(struct inode_defrag), 0,
3707 					SLAB_MEM_SPREAD,
3708 					NULL);
3709 	if (!btrfs_inode_defrag_cachep)
3710 		return -ENOMEM;
3711 
3712 	return 0;
3713 }
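
/*
 * Illustrative sketch (not part of this file) of how a defrag record might be
 * allocated from and returned to this cache; error handling and the
 * surrounding locking are omitted:
 *
 *	struct inode_defrag *defrag;
 *
 *	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
 *	if (!defrag)
 *		return -ENOMEM;
 *	...
 *	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
 */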
3714 
3715 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3716 {
3717 	int ret;
3718 
3719 	/*
3720 	 * With compression we will find and lock a dirty page and clear the
3721 	 * first one as dirty, set up an async extent, and immediately return
3722 	 * with the entire range locked but with nobody actually marked as
3723 	 * writeback.  So we can't just filemap_write_and_wait_range() and
3724 	 * expect it to work, since it will just kick off a thread to do the
3725 	 * actual work.  So we need to call filemap_fdatawrite_range() _again_,
3726 	 * since it will wait on the page lock, which won't be unlocked until
3727 	 * after the pages have been marked as writeback, and then we're good
3728 	 * to go from there.  We have to do this, otherwise we'll miss the
3729 	 * ordered extents and that results in badness.  Please Josef, do not
3730 	 * think you know better and pull this out at some point in the
3731 	 * future; it is right and you are wrong.
3732 	 */
3733 	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3734 	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3735 			     &BTRFS_I(inode)->runtime_flags))
3736 		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3737 
3738 	return ret;
3739 }
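
/*
 * Illustrative sketch (not part of this file): a hypothetical caller that
 * wants the range written out and stable could pair the flush above with a
 * wait, along the lines of (inode, start, end and ret assumed from the
 * caller's context):
 *
 *	ret = btrfs_fdatawrite_range(inode, start, end);
 *	if (!ret)
 *		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
 */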
3740