xref: /openbmc/linux/fs/btrfs/file.c (revision 1d7a0395)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <linux/fs.h>
7 #include <linux/pagemap.h>
8 #include <linux/time.h>
9 #include <linux/init.h>
10 #include <linux/string.h>
11 #include <linux/backing-dev.h>
12 #include <linux/falloc.h>
13 #include <linux/writeback.h>
14 #include <linux/compat.h>
15 #include <linux/slab.h>
16 #include <linux/btrfs.h>
17 #include <linux/uio.h>
18 #include <linux/iversion.h>
19 #include "ctree.h"
20 #include "disk-io.h"
21 #include "transaction.h"
22 #include "btrfs_inode.h"
23 #include "print-tree.h"
24 #include "tree-log.h"
25 #include "locking.h"
26 #include "volumes.h"
27 #include "qgroup.h"
28 #include "compression.h"
29 #include "delalloc-space.h"
30 #include "reflink.h"
31 #include "subpage.h"
32 
33 static struct kmem_cache *btrfs_inode_defrag_cachep;
34 /*
35  * when auto defrag is enabled we
36  * queue up these defrag structs to remember which
37  * inodes need defragging passes
38  */
39 struct inode_defrag {
40 	struct rb_node rb_node;
41 	/* objectid */
42 	u64 ino;
43 	/*
44 	 * transid where the defrag was added, we search for
45 	 * extents newer than this
46 	 */
47 	u64 transid;
48 
49 	/* root objectid */
50 	u64 root;
51 
52 	/* last offset we were able to defrag */
53 	u64 last_offset;
54 
55 	/* if we've wrapped around back to zero once already */
56 	int cycled;
57 };
58 
59 static int __compare_inode_defrag(struct inode_defrag *defrag1,
60 				  struct inode_defrag *defrag2)
61 {
62 	if (defrag1->root > defrag2->root)
63 		return 1;
64 	else if (defrag1->root < defrag2->root)
65 		return -1;
66 	else if (defrag1->ino > defrag2->ino)
67 		return 1;
68 	else if (defrag1->ino < defrag2->ino)
69 		return -1;
70 	else
71 		return 0;
72 }
73 
74 /* pop a record for an inode into the defrag tree.  The lock
75  * must be held already
76  *
77  * If you're inserting a record for an older transid than an
78  * existing record, the transid already in the tree is lowered
79  *
80  * If an existing record is found the defrag item you
81  * pass in is freed
82  */
83 static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
84 				    struct inode_defrag *defrag)
85 {
86 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
87 	struct inode_defrag *entry;
88 	struct rb_node **p;
89 	struct rb_node *parent = NULL;
90 	int ret;
91 
92 	p = &fs_info->defrag_inodes.rb_node;
93 	while (*p) {
94 		parent = *p;
95 		entry = rb_entry(parent, struct inode_defrag, rb_node);
96 
97 		ret = __compare_inode_defrag(defrag, entry);
98 		if (ret < 0)
99 			p = &parent->rb_left;
100 		else if (ret > 0)
101 			p = &parent->rb_right;
102 		else {
103 			/* if we're reinserting an entry for
104 			 * an old defrag run, make sure to
105 			 * lower the transid of our existing record
106 			 */
107 			if (defrag->transid < entry->transid)
108 				entry->transid = defrag->transid;
109 			if (defrag->last_offset > entry->last_offset)
110 				entry->last_offset = defrag->last_offset;
111 			return -EEXIST;
112 		}
113 	}
114 	set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
115 	rb_link_node(&defrag->rb_node, parent, p);
116 	rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
117 	return 0;
118 }
119 
120 static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
121 {
122 	if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
123 		return 0;
124 
125 	if (btrfs_fs_closing(fs_info))
126 		return 0;
127 
128 	return 1;
129 }
130 
131 /*
132  * insert a defrag record for this inode if auto defrag is
133  * enabled
134  */
135 int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
136 			   struct btrfs_inode *inode)
137 {
138 	struct btrfs_root *root = inode->root;
139 	struct btrfs_fs_info *fs_info = root->fs_info;
140 	struct inode_defrag *defrag;
141 	u64 transid;
142 	int ret;
143 
144 	if (!__need_auto_defrag(fs_info))
145 		return 0;
146 
147 	if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
148 		return 0;
149 
150 	if (trans)
151 		transid = trans->transid;
152 	else
153 		transid = inode->root->last_trans;
154 
155 	defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
156 	if (!defrag)
157 		return -ENOMEM;
158 
159 	defrag->ino = btrfs_ino(inode);
160 	defrag->transid = transid;
161 	defrag->root = root->root_key.objectid;
162 
163 	spin_lock(&fs_info->defrag_inodes_lock);
164 	if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
165 		/*
166 		 * If we set IN_DEFRAG flag and evict the inode from memory,
167 		 * and then re-read this inode, this new inode doesn't have
168 		 * IN_DEFRAG flag. At the case, we may find the existed defrag.
169 		 */
170 		ret = __btrfs_add_inode_defrag(inode, defrag);
171 		if (ret)
172 			kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
173 	} else {
174 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
175 	}
176 	spin_unlock(&fs_info->defrag_inodes_lock);
177 	return 0;
178 }
179 
180 /*
181  * Requeue the defrag object. If there is a defrag object that points to
182  * the same inode in the tree, we will merge them together (by
183  * __btrfs_add_inode_defrag()) and free the one that we want to requeue.
184  */
185 static void btrfs_requeue_inode_defrag(struct btrfs_inode *inode,
186 				       struct inode_defrag *defrag)
187 {
188 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
189 	int ret;
190 
191 	if (!__need_auto_defrag(fs_info))
192 		goto out;
193 
194 	/*
195 	 * Here we don't check the IN_DEFRAG flag, because we need merge
196 	 * them together.
197 	 */
198 	spin_lock(&fs_info->defrag_inodes_lock);
199 	ret = __btrfs_add_inode_defrag(inode, defrag);
200 	spin_unlock(&fs_info->defrag_inodes_lock);
201 	if (ret)
202 		goto out;
203 	return;
204 out:
205 	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
206 }
207 
208 /*
209  * pick the defragable inode that we want, if it doesn't exist, we will get
210  * the next one.
211  */
212 static struct inode_defrag *
213 btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
214 {
215 	struct inode_defrag *entry = NULL;
216 	struct inode_defrag tmp;
217 	struct rb_node *p;
218 	struct rb_node *parent = NULL;
219 	int ret;
220 
221 	tmp.ino = ino;
222 	tmp.root = root;
223 
224 	spin_lock(&fs_info->defrag_inodes_lock);
225 	p = fs_info->defrag_inodes.rb_node;
226 	while (p) {
227 		parent = p;
228 		entry = rb_entry(parent, struct inode_defrag, rb_node);
229 
230 		ret = __compare_inode_defrag(&tmp, entry);
231 		if (ret < 0)
232 			p = parent->rb_left;
233 		else if (ret > 0)
234 			p = parent->rb_right;
235 		else
236 			goto out;
237 	}
238 
239 	if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
240 		parent = rb_next(parent);
241 		if (parent)
242 			entry = rb_entry(parent, struct inode_defrag, rb_node);
243 		else
244 			entry = NULL;
245 	}
246 out:
247 	if (entry)
248 		rb_erase(parent, &fs_info->defrag_inodes);
249 	spin_unlock(&fs_info->defrag_inodes_lock);
250 	return entry;
251 }
252 
253 void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
254 {
255 	struct inode_defrag *defrag;
256 	struct rb_node *node;
257 
258 	spin_lock(&fs_info->defrag_inodes_lock);
259 	node = rb_first(&fs_info->defrag_inodes);
260 	while (node) {
261 		rb_erase(node, &fs_info->defrag_inodes);
262 		defrag = rb_entry(node, struct inode_defrag, rb_node);
263 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
264 
265 		cond_resched_lock(&fs_info->defrag_inodes_lock);
266 
267 		node = rb_first(&fs_info->defrag_inodes);
268 	}
269 	spin_unlock(&fs_info->defrag_inodes_lock);
270 }
271 
272 #define BTRFS_DEFRAG_BATCH	1024
273 
274 static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
275 				    struct inode_defrag *defrag)
276 {
277 	struct btrfs_root *inode_root;
278 	struct inode *inode;
279 	struct btrfs_ioctl_defrag_range_args range;
280 	int num_defrag;
281 	int ret;
282 
283 	/* get the inode */
284 	inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
285 	if (IS_ERR(inode_root)) {
286 		ret = PTR_ERR(inode_root);
287 		goto cleanup;
288 	}
289 
290 	inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
291 	btrfs_put_root(inode_root);
292 	if (IS_ERR(inode)) {
293 		ret = PTR_ERR(inode);
294 		goto cleanup;
295 	}
296 
297 	/* do a chunk of defrag */
298 	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
299 	memset(&range, 0, sizeof(range));
300 	range.len = (u64)-1;
301 	range.start = defrag->last_offset;
302 
303 	sb_start_write(fs_info->sb);
304 	num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
305 				       BTRFS_DEFRAG_BATCH);
306 	sb_end_write(fs_info->sb);
307 	/*
308 	 * if we filled the whole defrag batch, there
309 	 * must be more work to do.  Queue this defrag
310 	 * again
311 	 */
312 	if (num_defrag == BTRFS_DEFRAG_BATCH) {
313 		defrag->last_offset = range.start;
314 		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
315 	} else if (defrag->last_offset && !defrag->cycled) {
316 		/*
317 		 * we didn't fill our defrag batch, but
318 		 * we didn't start at zero.  Make sure we loop
319 		 * around to the start of the file.
320 		 */
321 		defrag->last_offset = 0;
322 		defrag->cycled = 1;
323 		btrfs_requeue_inode_defrag(BTRFS_I(inode), defrag);
324 	} else {
325 		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
326 	}
327 
328 	iput(inode);
329 	return 0;
330 cleanup:
331 	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
332 	return ret;
333 }
334 
335 /*
336  * run through the list of inodes in the FS that need
337  * defragging
338  */
339 int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
340 {
341 	struct inode_defrag *defrag;
342 	u64 first_ino = 0;
343 	u64 root_objectid = 0;
344 
345 	atomic_inc(&fs_info->defrag_running);
346 	while (1) {
347 		/* Pause the auto defragger. */
348 		if (test_bit(BTRFS_FS_STATE_REMOUNTING,
349 			     &fs_info->fs_state))
350 			break;
351 
352 		if (!__need_auto_defrag(fs_info))
353 			break;
354 
355 		/* find an inode to defrag */
356 		defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
357 						 first_ino);
358 		if (!defrag) {
359 			if (root_objectid || first_ino) {
360 				root_objectid = 0;
361 				first_ino = 0;
362 				continue;
363 			} else {
364 				break;
365 			}
366 		}
367 
368 		first_ino = defrag->ino + 1;
369 		root_objectid = defrag->root;
370 
371 		__btrfs_run_defrag_inode(fs_info, defrag);
372 	}
373 	atomic_dec(&fs_info->defrag_running);
374 
375 	/*
376 	 * during unmount, we use the transaction_wait queue to
377 	 * wait for the defragger to stop
378 	 */
379 	wake_up(&fs_info->transaction_wait);
380 	return 0;
381 }
382 
383 /* simple helper to fault in pages and copy.  This should go away
384  * and be replaced with calls into generic code.
385  */
386 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
387 					 struct page **prepared_pages,
388 					 struct iov_iter *i)
389 {
390 	size_t copied = 0;
391 	size_t total_copied = 0;
392 	int pg = 0;
393 	int offset = offset_in_page(pos);
394 
395 	while (write_bytes > 0) {
396 		size_t count = min_t(size_t,
397 				     PAGE_SIZE - offset, write_bytes);
398 		struct page *page = prepared_pages[pg];
399 		/*
400 		 * Copy data from userspace to the current page
401 		 */
402 		copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
403 
404 		/* Flush processor's dcache for this page */
405 		flush_dcache_page(page);
406 
407 		/*
408 		 * if we get a partial write, we can end up with
409 		 * partially up to date pages.  These add
410 		 * a lot of complexity, so make sure they don't
411 		 * happen by forcing this copy to be retried.
412 		 *
413 		 * The rest of the btrfs_file_write code will fall
414 		 * back to page at a time copies after we return 0.
415 		 */
416 		if (!PageUptodate(page) && copied < count)
417 			copied = 0;
418 
419 		iov_iter_advance(i, copied);
420 		write_bytes -= copied;
421 		total_copied += copied;
422 
423 		/* Return to btrfs_file_write_iter to fault page */
424 		if (unlikely(copied == 0))
425 			break;
426 
427 		if (copied < PAGE_SIZE - offset) {
428 			offset += copied;
429 		} else {
430 			pg++;
431 			offset = 0;
432 		}
433 	}
434 	return total_copied;
435 }
436 
437 /*
438  * unlocks pages after btrfs_file_write is done with them
439  */
440 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
441 {
442 	size_t i;
443 	for (i = 0; i < num_pages; i++) {
444 		/* page checked is some magic around finding pages that
445 		 * have been modified without going through btrfs_set_page_dirty
446 		 * clear it here. There should be no need to mark the pages
447 		 * accessed as prepare_pages should have marked them accessed
448 		 * in prepare_pages via find_or_create_page()
449 		 */
450 		ClearPageChecked(pages[i]);
451 		unlock_page(pages[i]);
452 		put_page(pages[i]);
453 	}
454 }
455 
456 /*
457  * After btrfs_copy_from_user(), update the following things for delalloc:
458  * - Mark newly dirtied pages as DELALLOC in the io tree.
459  *   Used to advise which range is to be written back.
460  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
461  * - Update inode size for past EOF write
462  */
463 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
464 		      size_t num_pages, loff_t pos, size_t write_bytes,
465 		      struct extent_state **cached, bool noreserve)
466 {
467 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
468 	int err = 0;
469 	int i;
470 	u64 num_bytes;
471 	u64 start_pos;
472 	u64 end_of_last_block;
473 	u64 end_pos = pos + write_bytes;
474 	loff_t isize = i_size_read(&inode->vfs_inode);
475 	unsigned int extra_bits = 0;
476 
477 	if (write_bytes == 0)
478 		return 0;
479 
480 	if (noreserve)
481 		extra_bits |= EXTENT_NORESERVE;
482 
483 	start_pos = round_down(pos, fs_info->sectorsize);
484 	num_bytes = round_up(write_bytes + pos - start_pos,
485 			     fs_info->sectorsize);
486 	ASSERT(num_bytes <= U32_MAX);
487 
488 	end_of_last_block = start_pos + num_bytes - 1;
489 
490 	/*
491 	 * The pages may have already been dirty, clear out old accounting so
492 	 * we can set things up properly
493 	 */
494 	clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
495 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
496 			 0, 0, cached);
497 
498 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
499 					extra_bits, cached);
500 	if (err)
501 		return err;
502 
503 	for (i = 0; i < num_pages; i++) {
504 		struct page *p = pages[i];
505 
506 		btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
507 		ClearPageChecked(p);
508 		btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
509 	}
510 
511 	/*
512 	 * we've only changed i_size in ram, and we haven't updated
513 	 * the disk i_size.  There is no need to log the inode
514 	 * at this time.
515 	 */
516 	if (end_pos > isize)
517 		i_size_write(&inode->vfs_inode, end_pos);
518 	return 0;
519 }
520 
521 /*
522  * this drops all the extents in the cache that intersect the range
523  * [start, end].  Existing extents are split as required.
524  */
525 void btrfs_drop_extent_cache(struct btrfs_inode *inode, u64 start, u64 end,
526 			     int skip_pinned)
527 {
528 	struct extent_map *em;
529 	struct extent_map *split = NULL;
530 	struct extent_map *split2 = NULL;
531 	struct extent_map_tree *em_tree = &inode->extent_tree;
532 	u64 len = end - start + 1;
533 	u64 gen;
534 	int ret;
535 	int testend = 1;
536 	unsigned long flags;
537 	int compressed = 0;
538 	bool modified;
539 
540 	WARN_ON(end < start);
541 	if (end == (u64)-1) {
542 		len = (u64)-1;
543 		testend = 0;
544 	}
545 	while (1) {
546 		int no_splits = 0;
547 
548 		modified = false;
549 		if (!split)
550 			split = alloc_extent_map();
551 		if (!split2)
552 			split2 = alloc_extent_map();
553 		if (!split || !split2)
554 			no_splits = 1;
555 
556 		write_lock(&em_tree->lock);
557 		em = lookup_extent_mapping(em_tree, start, len);
558 		if (!em) {
559 			write_unlock(&em_tree->lock);
560 			break;
561 		}
562 		flags = em->flags;
563 		gen = em->generation;
564 		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
565 			if (testend && em->start + em->len >= start + len) {
566 				free_extent_map(em);
567 				write_unlock(&em_tree->lock);
568 				break;
569 			}
570 			start = em->start + em->len;
571 			if (testend)
572 				len = start + len - (em->start + em->len);
573 			free_extent_map(em);
574 			write_unlock(&em_tree->lock);
575 			continue;
576 		}
577 		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
578 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
579 		clear_bit(EXTENT_FLAG_LOGGING, &flags);
580 		modified = !list_empty(&em->list);
581 		if (no_splits)
582 			goto next;
583 
584 		if (em->start < start) {
585 			split->start = em->start;
586 			split->len = start - em->start;
587 
588 			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
589 				split->orig_start = em->orig_start;
590 				split->block_start = em->block_start;
591 
592 				if (compressed)
593 					split->block_len = em->block_len;
594 				else
595 					split->block_len = split->len;
596 				split->orig_block_len = max(split->block_len,
597 						em->orig_block_len);
598 				split->ram_bytes = em->ram_bytes;
599 			} else {
600 				split->orig_start = split->start;
601 				split->block_len = 0;
602 				split->block_start = em->block_start;
603 				split->orig_block_len = 0;
604 				split->ram_bytes = split->len;
605 			}
606 
607 			split->generation = gen;
608 			split->flags = flags;
609 			split->compress_type = em->compress_type;
610 			replace_extent_mapping(em_tree, em, split, modified);
611 			free_extent_map(split);
612 			split = split2;
613 			split2 = NULL;
614 		}
615 		if (testend && em->start + em->len > start + len) {
616 			u64 diff = start + len - em->start;
617 
618 			split->start = start + len;
619 			split->len = em->start + em->len - (start + len);
620 			split->flags = flags;
621 			split->compress_type = em->compress_type;
622 			split->generation = gen;
623 
624 			if (em->block_start < EXTENT_MAP_LAST_BYTE) {
625 				split->orig_block_len = max(em->block_len,
626 						    em->orig_block_len);
627 
628 				split->ram_bytes = em->ram_bytes;
629 				if (compressed) {
630 					split->block_len = em->block_len;
631 					split->block_start = em->block_start;
632 					split->orig_start = em->orig_start;
633 				} else {
634 					split->block_len = split->len;
635 					split->block_start = em->block_start
636 						+ diff;
637 					split->orig_start = em->orig_start;
638 				}
639 			} else {
640 				split->ram_bytes = split->len;
641 				split->orig_start = split->start;
642 				split->block_len = 0;
643 				split->block_start = em->block_start;
644 				split->orig_block_len = 0;
645 			}
646 
647 			if (extent_map_in_tree(em)) {
648 				replace_extent_mapping(em_tree, em, split,
649 						       modified);
650 			} else {
651 				ret = add_extent_mapping(em_tree, split,
652 							 modified);
653 				ASSERT(ret == 0); /* Logic error */
654 			}
655 			free_extent_map(split);
656 			split = NULL;
657 		}
658 next:
659 		if (extent_map_in_tree(em))
660 			remove_extent_mapping(em_tree, em);
661 		write_unlock(&em_tree->lock);
662 
663 		/* once for us */
664 		free_extent_map(em);
665 		/* once for the tree*/
666 		free_extent_map(em);
667 	}
668 	if (split)
669 		free_extent_map(split);
670 	if (split2)
671 		free_extent_map(split2);
672 }
673 
674 /*
675  * this is very complex, but the basic idea is to drop all extents
676  * in the range start - end.  hint_block is filled in with a block number
677  * that would be a good hint to the block allocator for this file.
678  *
679  * If an extent intersects the range but is not entirely inside the range
680  * it is either truncated or split.  Anything entirely inside the range
681  * is deleted from the tree.
682  *
683  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
684  * to deal with that. We set the field 'bytes_found' of the arguments structure
685  * with the number of allocated bytes found in the target range, so that the
686  * caller can update the inode's number of bytes in an atomic way when
687  * replacing extents in a range to avoid races with stat(2).
688  */
689 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
690 		       struct btrfs_root *root, struct btrfs_inode *inode,
691 		       struct btrfs_drop_extents_args *args)
692 {
693 	struct btrfs_fs_info *fs_info = root->fs_info;
694 	struct extent_buffer *leaf;
695 	struct btrfs_file_extent_item *fi;
696 	struct btrfs_ref ref = { 0 };
697 	struct btrfs_key key;
698 	struct btrfs_key new_key;
699 	u64 ino = btrfs_ino(inode);
700 	u64 search_start = args->start;
701 	u64 disk_bytenr = 0;
702 	u64 num_bytes = 0;
703 	u64 extent_offset = 0;
704 	u64 extent_end = 0;
705 	u64 last_end = args->start;
706 	int del_nr = 0;
707 	int del_slot = 0;
708 	int extent_type;
709 	int recow;
710 	int ret;
711 	int modify_tree = -1;
712 	int update_refs;
713 	int found = 0;
714 	int leafs_visited = 0;
715 	struct btrfs_path *path = args->path;
716 
717 	args->bytes_found = 0;
718 	args->extent_inserted = false;
719 
720 	/* Must always have a path if ->replace_extent is true */
721 	ASSERT(!(args->replace_extent && !args->path));
722 
723 	if (!path) {
724 		path = btrfs_alloc_path();
725 		if (!path) {
726 			ret = -ENOMEM;
727 			goto out;
728 		}
729 	}
730 
731 	if (args->drop_cache)
732 		btrfs_drop_extent_cache(inode, args->start, args->end - 1, 0);
733 
734 	if (args->start >= inode->disk_i_size && !args->replace_extent)
735 		modify_tree = 0;
736 
737 	update_refs = (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) ||
738 		       root == fs_info->tree_root);
739 	while (1) {
740 		recow = 0;
741 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
742 					       search_start, modify_tree);
743 		if (ret < 0)
744 			break;
745 		if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
746 			leaf = path->nodes[0];
747 			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
748 			if (key.objectid == ino &&
749 			    key.type == BTRFS_EXTENT_DATA_KEY)
750 				path->slots[0]--;
751 		}
752 		ret = 0;
753 		leafs_visited++;
754 next_slot:
755 		leaf = path->nodes[0];
756 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
757 			BUG_ON(del_nr > 0);
758 			ret = btrfs_next_leaf(root, path);
759 			if (ret < 0)
760 				break;
761 			if (ret > 0) {
762 				ret = 0;
763 				break;
764 			}
765 			leafs_visited++;
766 			leaf = path->nodes[0];
767 			recow = 1;
768 		}
769 
770 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
771 
772 		if (key.objectid > ino)
773 			break;
774 		if (WARN_ON_ONCE(key.objectid < ino) ||
775 		    key.type < BTRFS_EXTENT_DATA_KEY) {
776 			ASSERT(del_nr == 0);
777 			path->slots[0]++;
778 			goto next_slot;
779 		}
780 		if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
781 			break;
782 
783 		fi = btrfs_item_ptr(leaf, path->slots[0],
784 				    struct btrfs_file_extent_item);
785 		extent_type = btrfs_file_extent_type(leaf, fi);
786 
787 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
788 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
789 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
790 			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
791 			extent_offset = btrfs_file_extent_offset(leaf, fi);
792 			extent_end = key.offset +
793 				btrfs_file_extent_num_bytes(leaf, fi);
794 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
795 			extent_end = key.offset +
796 				btrfs_file_extent_ram_bytes(leaf, fi);
797 		} else {
798 			/* can't happen */
799 			BUG();
800 		}
801 
802 		/*
803 		 * Don't skip extent items representing 0 byte lengths. They
804 		 * used to be created (bug) if while punching holes we hit
805 		 * -ENOSPC condition. So if we find one here, just ensure we
806 		 * delete it, otherwise we would insert a new file extent item
807 		 * with the same key (offset) as that 0 bytes length file
808 		 * extent item in the call to setup_items_for_insert() later
809 		 * in this function.
810 		 */
811 		if (extent_end == key.offset && extent_end >= search_start) {
812 			last_end = extent_end;
813 			goto delete_extent_item;
814 		}
815 
816 		if (extent_end <= search_start) {
817 			path->slots[0]++;
818 			goto next_slot;
819 		}
820 
821 		found = 1;
822 		search_start = max(key.offset, args->start);
823 		if (recow || !modify_tree) {
824 			modify_tree = -1;
825 			btrfs_release_path(path);
826 			continue;
827 		}
828 
829 		/*
830 		 *     | - range to drop - |
831 		 *  | -------- extent -------- |
832 		 */
833 		if (args->start > key.offset && args->end < extent_end) {
834 			BUG_ON(del_nr > 0);
835 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
836 				ret = -EOPNOTSUPP;
837 				break;
838 			}
839 
840 			memcpy(&new_key, &key, sizeof(new_key));
841 			new_key.offset = args->start;
842 			ret = btrfs_duplicate_item(trans, root, path,
843 						   &new_key);
844 			if (ret == -EAGAIN) {
845 				btrfs_release_path(path);
846 				continue;
847 			}
848 			if (ret < 0)
849 				break;
850 
851 			leaf = path->nodes[0];
852 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
853 					    struct btrfs_file_extent_item);
854 			btrfs_set_file_extent_num_bytes(leaf, fi,
855 							args->start - key.offset);
856 
857 			fi = btrfs_item_ptr(leaf, path->slots[0],
858 					    struct btrfs_file_extent_item);
859 
860 			extent_offset += args->start - key.offset;
861 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
862 			btrfs_set_file_extent_num_bytes(leaf, fi,
863 							extent_end - args->start);
864 			btrfs_mark_buffer_dirty(leaf);
865 
866 			if (update_refs && disk_bytenr > 0) {
867 				btrfs_init_generic_ref(&ref,
868 						BTRFS_ADD_DELAYED_REF,
869 						disk_bytenr, num_bytes, 0);
870 				btrfs_init_data_ref(&ref,
871 						root->root_key.objectid,
872 						new_key.objectid,
873 						args->start - extent_offset);
874 				ret = btrfs_inc_extent_ref(trans, &ref);
875 				BUG_ON(ret); /* -ENOMEM */
876 			}
877 			key.offset = args->start;
878 		}
879 		/*
880 		 * From here on out we will have actually dropped something, so
881 		 * last_end can be updated.
882 		 */
883 		last_end = extent_end;
884 
885 		/*
886 		 *  | ---- range to drop ----- |
887 		 *      | -------- extent -------- |
888 		 */
889 		if (args->start <= key.offset && args->end < extent_end) {
890 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
891 				ret = -EOPNOTSUPP;
892 				break;
893 			}
894 
895 			memcpy(&new_key, &key, sizeof(new_key));
896 			new_key.offset = args->end;
897 			btrfs_set_item_key_safe(fs_info, path, &new_key);
898 
899 			extent_offset += args->end - key.offset;
900 			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
901 			btrfs_set_file_extent_num_bytes(leaf, fi,
902 							extent_end - args->end);
903 			btrfs_mark_buffer_dirty(leaf);
904 			if (update_refs && disk_bytenr > 0)
905 				args->bytes_found += args->end - key.offset;
906 			break;
907 		}
908 
909 		search_start = extent_end;
910 		/*
911 		 *       | ---- range to drop ----- |
912 		 *  | -------- extent -------- |
913 		 */
914 		if (args->start > key.offset && args->end >= extent_end) {
915 			BUG_ON(del_nr > 0);
916 			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
917 				ret = -EOPNOTSUPP;
918 				break;
919 			}
920 
921 			btrfs_set_file_extent_num_bytes(leaf, fi,
922 							args->start - key.offset);
923 			btrfs_mark_buffer_dirty(leaf);
924 			if (update_refs && disk_bytenr > 0)
925 				args->bytes_found += extent_end - args->start;
926 			if (args->end == extent_end)
927 				break;
928 
929 			path->slots[0]++;
930 			goto next_slot;
931 		}
932 
933 		/*
934 		 *  | ---- range to drop ----- |
935 		 *    | ------ extent ------ |
936 		 */
937 		if (args->start <= key.offset && args->end >= extent_end) {
938 delete_extent_item:
939 			if (del_nr == 0) {
940 				del_slot = path->slots[0];
941 				del_nr = 1;
942 			} else {
943 				BUG_ON(del_slot + del_nr != path->slots[0]);
944 				del_nr++;
945 			}
946 
947 			if (update_refs &&
948 			    extent_type == BTRFS_FILE_EXTENT_INLINE) {
949 				args->bytes_found += extent_end - key.offset;
950 				extent_end = ALIGN(extent_end,
951 						   fs_info->sectorsize);
952 			} else if (update_refs && disk_bytenr > 0) {
953 				btrfs_init_generic_ref(&ref,
954 						BTRFS_DROP_DELAYED_REF,
955 						disk_bytenr, num_bytes, 0);
956 				btrfs_init_data_ref(&ref,
957 						root->root_key.objectid,
958 						key.objectid,
959 						key.offset - extent_offset);
960 				ret = btrfs_free_extent(trans, &ref);
961 				BUG_ON(ret); /* -ENOMEM */
962 				args->bytes_found += extent_end - key.offset;
963 			}
964 
965 			if (args->end == extent_end)
966 				break;
967 
968 			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
969 				path->slots[0]++;
970 				goto next_slot;
971 			}
972 
973 			ret = btrfs_del_items(trans, root, path, del_slot,
974 					      del_nr);
975 			if (ret) {
976 				btrfs_abort_transaction(trans, ret);
977 				break;
978 			}
979 
980 			del_nr = 0;
981 			del_slot = 0;
982 
983 			btrfs_release_path(path);
984 			continue;
985 		}
986 
987 		BUG();
988 	}
989 
990 	if (!ret && del_nr > 0) {
991 		/*
992 		 * Set path->slots[0] to first slot, so that after the delete
993 		 * if items are move off from our leaf to its immediate left or
994 		 * right neighbor leafs, we end up with a correct and adjusted
995 		 * path->slots[0] for our insertion (if args->replace_extent).
996 		 */
997 		path->slots[0] = del_slot;
998 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
999 		if (ret)
1000 			btrfs_abort_transaction(trans, ret);
1001 	}
1002 
1003 	leaf = path->nodes[0];
1004 	/*
1005 	 * If btrfs_del_items() was called, it might have deleted a leaf, in
1006 	 * which case it unlocked our path, so check path->locks[0] matches a
1007 	 * write lock.
1008 	 */
1009 	if (!ret && args->replace_extent && leafs_visited == 1 &&
1010 	    path->locks[0] == BTRFS_WRITE_LOCK &&
1011 	    btrfs_leaf_free_space(leaf) >=
1012 	    sizeof(struct btrfs_item) + args->extent_item_size) {
1013 
1014 		key.objectid = ino;
1015 		key.type = BTRFS_EXTENT_DATA_KEY;
1016 		key.offset = args->start;
1017 		if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
1018 			struct btrfs_key slot_key;
1019 
1020 			btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
1021 			if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
1022 				path->slots[0]++;
1023 		}
1024 		setup_items_for_insert(root, path, &key,
1025 				       &args->extent_item_size, 1);
1026 		args->extent_inserted = true;
1027 	}
1028 
1029 	if (!args->path)
1030 		btrfs_free_path(path);
1031 	else if (!args->extent_inserted)
1032 		btrfs_release_path(path);
1033 out:
1034 	args->drop_end = found ? min(args->end, last_end) : args->end;
1035 
1036 	return ret;
1037 }
1038 
1039 static int extent_mergeable(struct extent_buffer *leaf, int slot,
1040 			    u64 objectid, u64 bytenr, u64 orig_offset,
1041 			    u64 *start, u64 *end)
1042 {
1043 	struct btrfs_file_extent_item *fi;
1044 	struct btrfs_key key;
1045 	u64 extent_end;
1046 
1047 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
1048 		return 0;
1049 
1050 	btrfs_item_key_to_cpu(leaf, &key, slot);
1051 	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
1052 		return 0;
1053 
1054 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
1055 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
1056 	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
1057 	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
1058 	    btrfs_file_extent_compression(leaf, fi) ||
1059 	    btrfs_file_extent_encryption(leaf, fi) ||
1060 	    btrfs_file_extent_other_encoding(leaf, fi))
1061 		return 0;
1062 
1063 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1064 	if ((*start && *start != key.offset) || (*end && *end != extent_end))
1065 		return 0;
1066 
1067 	*start = key.offset;
1068 	*end = extent_end;
1069 	return 1;
1070 }
1071 
1072 /*
1073  * Mark extent in the range start - end as written.
1074  *
1075  * This changes extent type from 'pre-allocated' to 'regular'. If only
1076  * part of extent is marked as written, the extent will be split into
1077  * two or three.
1078  */
1079 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
1080 			      struct btrfs_inode *inode, u64 start, u64 end)
1081 {
1082 	struct btrfs_fs_info *fs_info = trans->fs_info;
1083 	struct btrfs_root *root = inode->root;
1084 	struct extent_buffer *leaf;
1085 	struct btrfs_path *path;
1086 	struct btrfs_file_extent_item *fi;
1087 	struct btrfs_ref ref = { 0 };
1088 	struct btrfs_key key;
1089 	struct btrfs_key new_key;
1090 	u64 bytenr;
1091 	u64 num_bytes;
1092 	u64 extent_end;
1093 	u64 orig_offset;
1094 	u64 other_start;
1095 	u64 other_end;
1096 	u64 split;
1097 	int del_nr = 0;
1098 	int del_slot = 0;
1099 	int recow;
1100 	int ret = 0;
1101 	u64 ino = btrfs_ino(inode);
1102 
1103 	path = btrfs_alloc_path();
1104 	if (!path)
1105 		return -ENOMEM;
1106 again:
1107 	recow = 0;
1108 	split = start;
1109 	key.objectid = ino;
1110 	key.type = BTRFS_EXTENT_DATA_KEY;
1111 	key.offset = split;
1112 
1113 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1114 	if (ret < 0)
1115 		goto out;
1116 	if (ret > 0 && path->slots[0] > 0)
1117 		path->slots[0]--;
1118 
1119 	leaf = path->nodes[0];
1120 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1121 	if (key.objectid != ino ||
1122 	    key.type != BTRFS_EXTENT_DATA_KEY) {
1123 		ret = -EINVAL;
1124 		btrfs_abort_transaction(trans, ret);
1125 		goto out;
1126 	}
1127 	fi = btrfs_item_ptr(leaf, path->slots[0],
1128 			    struct btrfs_file_extent_item);
1129 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
1130 		ret = -EINVAL;
1131 		btrfs_abort_transaction(trans, ret);
1132 		goto out;
1133 	}
1134 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
1135 	if (key.offset > start || extent_end < end) {
1136 		ret = -EINVAL;
1137 		btrfs_abort_transaction(trans, ret);
1138 		goto out;
1139 	}
1140 
1141 	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1142 	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1143 	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
1144 	memcpy(&new_key, &key, sizeof(new_key));
1145 
1146 	if (start == key.offset && end < extent_end) {
1147 		other_start = 0;
1148 		other_end = start;
1149 		if (extent_mergeable(leaf, path->slots[0] - 1,
1150 				     ino, bytenr, orig_offset,
1151 				     &other_start, &other_end)) {
1152 			new_key.offset = end;
1153 			btrfs_set_item_key_safe(fs_info, path, &new_key);
1154 			fi = btrfs_item_ptr(leaf, path->slots[0],
1155 					    struct btrfs_file_extent_item);
1156 			btrfs_set_file_extent_generation(leaf, fi,
1157 							 trans->transid);
1158 			btrfs_set_file_extent_num_bytes(leaf, fi,
1159 							extent_end - end);
1160 			btrfs_set_file_extent_offset(leaf, fi,
1161 						     end - orig_offset);
1162 			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1163 					    struct btrfs_file_extent_item);
1164 			btrfs_set_file_extent_generation(leaf, fi,
1165 							 trans->transid);
1166 			btrfs_set_file_extent_num_bytes(leaf, fi,
1167 							end - other_start);
1168 			btrfs_mark_buffer_dirty(leaf);
1169 			goto out;
1170 		}
1171 	}
1172 
1173 	if (start > key.offset && end == extent_end) {
1174 		other_start = end;
1175 		other_end = 0;
1176 		if (extent_mergeable(leaf, path->slots[0] + 1,
1177 				     ino, bytenr, orig_offset,
1178 				     &other_start, &other_end)) {
1179 			fi = btrfs_item_ptr(leaf, path->slots[0],
1180 					    struct btrfs_file_extent_item);
1181 			btrfs_set_file_extent_num_bytes(leaf, fi,
1182 							start - key.offset);
1183 			btrfs_set_file_extent_generation(leaf, fi,
1184 							 trans->transid);
1185 			path->slots[0]++;
1186 			new_key.offset = start;
1187 			btrfs_set_item_key_safe(fs_info, path, &new_key);
1188 
1189 			fi = btrfs_item_ptr(leaf, path->slots[0],
1190 					    struct btrfs_file_extent_item);
1191 			btrfs_set_file_extent_generation(leaf, fi,
1192 							 trans->transid);
1193 			btrfs_set_file_extent_num_bytes(leaf, fi,
1194 							other_end - start);
1195 			btrfs_set_file_extent_offset(leaf, fi,
1196 						     start - orig_offset);
1197 			btrfs_mark_buffer_dirty(leaf);
1198 			goto out;
1199 		}
1200 	}
1201 
1202 	while (start > key.offset || end < extent_end) {
1203 		if (key.offset == start)
1204 			split = end;
1205 
1206 		new_key.offset = split;
1207 		ret = btrfs_duplicate_item(trans, root, path, &new_key);
1208 		if (ret == -EAGAIN) {
1209 			btrfs_release_path(path);
1210 			goto again;
1211 		}
1212 		if (ret < 0) {
1213 			btrfs_abort_transaction(trans, ret);
1214 			goto out;
1215 		}
1216 
1217 		leaf = path->nodes[0];
1218 		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
1219 				    struct btrfs_file_extent_item);
1220 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1221 		btrfs_set_file_extent_num_bytes(leaf, fi,
1222 						split - key.offset);
1223 
1224 		fi = btrfs_item_ptr(leaf, path->slots[0],
1225 				    struct btrfs_file_extent_item);
1226 
1227 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1228 		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
1229 		btrfs_set_file_extent_num_bytes(leaf, fi,
1230 						extent_end - split);
1231 		btrfs_mark_buffer_dirty(leaf);
1232 
1233 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
1234 				       num_bytes, 0);
1235 		btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
1236 				    orig_offset);
1237 		ret = btrfs_inc_extent_ref(trans, &ref);
1238 		if (ret) {
1239 			btrfs_abort_transaction(trans, ret);
1240 			goto out;
1241 		}
1242 
1243 		if (split == start) {
1244 			key.offset = start;
1245 		} else {
1246 			if (start != key.offset) {
1247 				ret = -EINVAL;
1248 				btrfs_abort_transaction(trans, ret);
1249 				goto out;
1250 			}
1251 			path->slots[0]--;
1252 			extent_end = end;
1253 		}
1254 		recow = 1;
1255 	}
1256 
1257 	other_start = end;
1258 	other_end = 0;
1259 	btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1260 			       num_bytes, 0);
1261 	btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset);
1262 	if (extent_mergeable(leaf, path->slots[0] + 1,
1263 			     ino, bytenr, orig_offset,
1264 			     &other_start, &other_end)) {
1265 		if (recow) {
1266 			btrfs_release_path(path);
1267 			goto again;
1268 		}
1269 		extent_end = other_end;
1270 		del_slot = path->slots[0] + 1;
1271 		del_nr++;
1272 		ret = btrfs_free_extent(trans, &ref);
1273 		if (ret) {
1274 			btrfs_abort_transaction(trans, ret);
1275 			goto out;
1276 		}
1277 	}
1278 	other_start = 0;
1279 	other_end = start;
1280 	if (extent_mergeable(leaf, path->slots[0] - 1,
1281 			     ino, bytenr, orig_offset,
1282 			     &other_start, &other_end)) {
1283 		if (recow) {
1284 			btrfs_release_path(path);
1285 			goto again;
1286 		}
1287 		key.offset = other_start;
1288 		del_slot = path->slots[0];
1289 		del_nr++;
1290 		ret = btrfs_free_extent(trans, &ref);
1291 		if (ret) {
1292 			btrfs_abort_transaction(trans, ret);
1293 			goto out;
1294 		}
1295 	}
1296 	if (del_nr == 0) {
1297 		fi = btrfs_item_ptr(leaf, path->slots[0],
1298 			   struct btrfs_file_extent_item);
1299 		btrfs_set_file_extent_type(leaf, fi,
1300 					   BTRFS_FILE_EXTENT_REG);
1301 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1302 		btrfs_mark_buffer_dirty(leaf);
1303 	} else {
1304 		fi = btrfs_item_ptr(leaf, del_slot - 1,
1305 			   struct btrfs_file_extent_item);
1306 		btrfs_set_file_extent_type(leaf, fi,
1307 					   BTRFS_FILE_EXTENT_REG);
1308 		btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1309 		btrfs_set_file_extent_num_bytes(leaf, fi,
1310 						extent_end - key.offset);
1311 		btrfs_mark_buffer_dirty(leaf);
1312 
1313 		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
1314 		if (ret < 0) {
1315 			btrfs_abort_transaction(trans, ret);
1316 			goto out;
1317 		}
1318 	}
1319 out:
1320 	btrfs_free_path(path);
1321 	return ret;
1322 }
1323 
1324 /*
1325  * on error we return an unlocked page and the error value
1326  * on success we return a locked page and 0
1327  */
1328 static int prepare_uptodate_page(struct inode *inode,
1329 				 struct page *page, u64 pos,
1330 				 bool force_uptodate)
1331 {
1332 	int ret = 0;
1333 
1334 	if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
1335 	    !PageUptodate(page)) {
1336 		ret = btrfs_readpage(NULL, page);
1337 		if (ret)
1338 			return ret;
1339 		lock_page(page);
1340 		if (!PageUptodate(page)) {
1341 			unlock_page(page);
1342 			return -EIO;
1343 		}
1344 		if (page->mapping != inode->i_mapping) {
1345 			unlock_page(page);
1346 			return -EAGAIN;
1347 		}
1348 	}
1349 	return 0;
1350 }
1351 
1352 /*
1353  * this just gets pages into the page cache and locks them down.
1354  */
1355 static noinline int prepare_pages(struct inode *inode, struct page **pages,
1356 				  size_t num_pages, loff_t pos,
1357 				  size_t write_bytes, bool force_uptodate)
1358 {
1359 	int i;
1360 	unsigned long index = pos >> PAGE_SHIFT;
1361 	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1362 	int err = 0;
1363 	int faili;
1364 
1365 	for (i = 0; i < num_pages; i++) {
1366 again:
1367 		pages[i] = find_or_create_page(inode->i_mapping, index + i,
1368 					       mask | __GFP_WRITE);
1369 		if (!pages[i]) {
1370 			faili = i - 1;
1371 			err = -ENOMEM;
1372 			goto fail;
1373 		}
1374 
1375 		err = set_page_extent_mapped(pages[i]);
1376 		if (err < 0) {
1377 			faili = i;
1378 			goto fail;
1379 		}
1380 
1381 		if (i == 0)
1382 			err = prepare_uptodate_page(inode, pages[i], pos,
1383 						    force_uptodate);
1384 		if (!err && i == num_pages - 1)
1385 			err = prepare_uptodate_page(inode, pages[i],
1386 						    pos + write_bytes, false);
1387 		if (err) {
1388 			put_page(pages[i]);
1389 			if (err == -EAGAIN) {
1390 				err = 0;
1391 				goto again;
1392 			}
1393 			faili = i - 1;
1394 			goto fail;
1395 		}
1396 		wait_on_page_writeback(pages[i]);
1397 	}
1398 
1399 	return 0;
1400 fail:
1401 	while (faili >= 0) {
1402 		unlock_page(pages[faili]);
1403 		put_page(pages[faili]);
1404 		faili--;
1405 	}
1406 	return err;
1407 
1408 }
1409 
1410 /*
1411  * This function locks the extent and properly waits for data=ordered extents
1412  * to finish before allowing the pages to be modified if need.
1413  *
1414  * The return value:
1415  * 1 - the extent is locked
1416  * 0 - the extent is not locked, and everything is OK
1417  * -EAGAIN - need re-prepare the pages
1418  * the other < 0 number - Something wrong happens
1419  */
1420 static noinline int
1421 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
1422 				size_t num_pages, loff_t pos,
1423 				size_t write_bytes,
1424 				u64 *lockstart, u64 *lockend,
1425 				struct extent_state **cached_state)
1426 {
1427 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1428 	u64 start_pos;
1429 	u64 last_pos;
1430 	int i;
1431 	int ret = 0;
1432 
1433 	start_pos = round_down(pos, fs_info->sectorsize);
1434 	last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
1435 
1436 	if (start_pos < inode->vfs_inode.i_size) {
1437 		struct btrfs_ordered_extent *ordered;
1438 
1439 		lock_extent_bits(&inode->io_tree, start_pos, last_pos,
1440 				cached_state);
1441 		ordered = btrfs_lookup_ordered_range(inode, start_pos,
1442 						     last_pos - start_pos + 1);
1443 		if (ordered &&
1444 		    ordered->file_offset + ordered->num_bytes > start_pos &&
1445 		    ordered->file_offset <= last_pos) {
1446 			unlock_extent_cached(&inode->io_tree, start_pos,
1447 					last_pos, cached_state);
1448 			for (i = 0; i < num_pages; i++) {
1449 				unlock_page(pages[i]);
1450 				put_page(pages[i]);
1451 			}
1452 			btrfs_start_ordered_extent(ordered, 1);
1453 			btrfs_put_ordered_extent(ordered);
1454 			return -EAGAIN;
1455 		}
1456 		if (ordered)
1457 			btrfs_put_ordered_extent(ordered);
1458 
1459 		*lockstart = start_pos;
1460 		*lockend = last_pos;
1461 		ret = 1;
1462 	}
1463 
1464 	/*
1465 	 * We should be called after prepare_pages() which should have locked
1466 	 * all pages in the range.
1467 	 */
1468 	for (i = 0; i < num_pages; i++)
1469 		WARN_ON(!PageLocked(pages[i]));
1470 
1471 	return ret;
1472 }
1473 
1474 static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
1475 			   size_t *write_bytes, bool nowait)
1476 {
1477 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1478 	struct btrfs_root *root = inode->root;
1479 	u64 lockstart, lockend;
1480 	u64 num_bytes;
1481 	int ret;
1482 
1483 	if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1484 		return 0;
1485 
1486 	if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
1487 		return -EAGAIN;
1488 
1489 	lockstart = round_down(pos, fs_info->sectorsize);
1490 	lockend = round_up(pos + *write_bytes,
1491 			   fs_info->sectorsize) - 1;
1492 	num_bytes = lockend - lockstart + 1;
1493 
1494 	if (nowait) {
1495 		struct btrfs_ordered_extent *ordered;
1496 
1497 		if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
1498 			return -EAGAIN;
1499 
1500 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
1501 						     num_bytes);
1502 		if (ordered) {
1503 			btrfs_put_ordered_extent(ordered);
1504 			ret = -EAGAIN;
1505 			goto out_unlock;
1506 		}
1507 	} else {
1508 		btrfs_lock_and_flush_ordered_range(inode, lockstart,
1509 						   lockend, NULL);
1510 	}
1511 
1512 	ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1513 			NULL, NULL, NULL, false);
1514 	if (ret <= 0) {
1515 		ret = 0;
1516 		if (!nowait)
1517 			btrfs_drew_write_unlock(&root->snapshot_lock);
1518 	} else {
1519 		*write_bytes = min_t(size_t, *write_bytes ,
1520 				     num_bytes - pos + lockstart);
1521 	}
1522 out_unlock:
1523 	unlock_extent(&inode->io_tree, lockstart, lockend);
1524 
1525 	return ret;
1526 }
1527 
1528 static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
1529 			      size_t *write_bytes)
1530 {
1531 	return check_can_nocow(inode, pos, write_bytes, true);
1532 }
1533 
1534 /*
1535  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1536  *
1537  * @pos:	 File offset
1538  * @write_bytes: The length to write, will be updated to the nocow writeable
1539  *		 range
1540  *
1541  * This function will flush ordered extents in the range to ensure proper
1542  * nocow checks.
1543  *
1544  * Return:
1545  * >0		and update @write_bytes if we can do nocow write
1546  *  0		if we can't do nocow write
1547  * -EAGAIN	if we can't get the needed lock or there are ordered extents
1548  * 		for * (nowait == true) case
1549  * <0		if other error happened
1550  *
1551  * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
1552  */
1553 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1554 			   size_t *write_bytes)
1555 {
1556 	return check_can_nocow(inode, pos, write_bytes, false);
1557 }
1558 
1559 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1560 {
1561 	btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1562 }
1563 
1564 static void update_time_for_write(struct inode *inode)
1565 {
1566 	struct timespec64 now;
1567 
1568 	if (IS_NOCMTIME(inode))
1569 		return;
1570 
1571 	now = current_time(inode);
1572 	if (!timespec64_equal(&inode->i_mtime, &now))
1573 		inode->i_mtime = now;
1574 
1575 	if (!timespec64_equal(&inode->i_ctime, &now))
1576 		inode->i_ctime = now;
1577 
1578 	if (IS_I_VERSION(inode))
1579 		inode_inc_iversion(inode);
1580 }
1581 
1582 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1583 			     size_t count)
1584 {
1585 	struct file *file = iocb->ki_filp;
1586 	struct inode *inode = file_inode(file);
1587 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1588 	loff_t pos = iocb->ki_pos;
1589 	int ret;
1590 	loff_t oldsize;
1591 	loff_t start_pos;
1592 
1593 	if (iocb->ki_flags & IOCB_NOWAIT) {
1594 		size_t nocow_bytes = count;
1595 
1596 		/* We will allocate space in case nodatacow is not set, so bail */
1597 		if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
1598 			return -EAGAIN;
1599 		/*
1600 		 * There are holes in the range or parts of the range that must
1601 		 * be COWed (shared extents, RO block groups, etc), so just bail
1602 		 * out.
1603 		 */
1604 		if (nocow_bytes < count)
1605 			return -EAGAIN;
1606 	}
1607 
1608 	current->backing_dev_info = inode_to_bdi(inode);
1609 	ret = file_remove_privs(file);
1610 	if (ret)
1611 		return ret;
1612 
1613 	/*
1614 	 * We reserve space for updating the inode when we reserve space for the
1615 	 * extent we are going to write, so we will enospc out there.  We don't
1616 	 * need to start yet another transaction to update the inode as we will
1617 	 * update the inode when we finish writing whatever data we write.
1618 	 */
1619 	update_time_for_write(inode);
1620 
1621 	start_pos = round_down(pos, fs_info->sectorsize);
1622 	oldsize = i_size_read(inode);
1623 	if (start_pos > oldsize) {
1624 		/* Expand hole size to cover write data, preventing empty gap */
1625 		loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1626 
1627 		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1628 		if (ret) {
1629 			current->backing_dev_info = NULL;
1630 			return ret;
1631 		}
1632 	}
1633 
1634 	return 0;
1635 }
1636 
1637 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1638 					       struct iov_iter *i)
1639 {
1640 	struct file *file = iocb->ki_filp;
1641 	loff_t pos;
1642 	struct inode *inode = file_inode(file);
1643 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1644 	struct page **pages = NULL;
1645 	struct extent_changeset *data_reserved = NULL;
1646 	u64 release_bytes = 0;
1647 	u64 lockstart;
1648 	u64 lockend;
1649 	size_t num_written = 0;
1650 	int nrptrs;
1651 	ssize_t ret;
1652 	bool only_release_metadata = false;
1653 	bool force_page_uptodate = false;
1654 	loff_t old_isize = i_size_read(inode);
1655 	unsigned int ilock_flags = 0;
1656 
1657 	if (iocb->ki_flags & IOCB_NOWAIT)
1658 		ilock_flags |= BTRFS_ILOCK_TRY;
1659 
1660 	ret = btrfs_inode_lock(inode, ilock_flags);
1661 	if (ret < 0)
1662 		return ret;
1663 
1664 	ret = generic_write_checks(iocb, i);
1665 	if (ret <= 0)
1666 		goto out;
1667 
1668 	ret = btrfs_write_check(iocb, i, ret);
1669 	if (ret < 0)
1670 		goto out;
1671 
1672 	pos = iocb->ki_pos;
1673 	nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1674 			PAGE_SIZE / (sizeof(struct page *)));
1675 	nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1676 	nrptrs = max(nrptrs, 8);
1677 	pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1678 	if (!pages) {
1679 		ret = -ENOMEM;
1680 		goto out;
1681 	}
1682 
1683 	while (iov_iter_count(i) > 0) {
1684 		struct extent_state *cached_state = NULL;
1685 		size_t offset = offset_in_page(pos);
1686 		size_t sector_offset;
1687 		size_t write_bytes = min(iov_iter_count(i),
1688 					 nrptrs * (size_t)PAGE_SIZE -
1689 					 offset);
1690 		size_t num_pages;
1691 		size_t reserve_bytes;
1692 		size_t dirty_pages;
1693 		size_t copied;
1694 		size_t dirty_sectors;
1695 		size_t num_sectors;
1696 		int extents_locked;
1697 
1698 		/*
1699 		 * Fault pages before locking them in prepare_pages
1700 		 * to avoid recursive lock
1701 		 */
1702 		if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1703 			ret = -EFAULT;
1704 			break;
1705 		}
1706 
1707 		only_release_metadata = false;
1708 		sector_offset = pos & (fs_info->sectorsize - 1);
1709 
1710 		extent_changeset_release(data_reserved);
1711 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
1712 						  &data_reserved, pos,
1713 						  write_bytes);
1714 		if (ret < 0) {
1715 			/*
1716 			 * If we don't have to COW at the offset, reserve
1717 			 * metadata only. write_bytes may get smaller than
1718 			 * requested here.
1719 			 */
1720 			if (btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1721 						   &write_bytes) > 0)
1722 				only_release_metadata = true;
1723 			else
1724 				break;
1725 		}
1726 
1727 		num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1728 		WARN_ON(num_pages > nrptrs);
1729 		reserve_bytes = round_up(write_bytes + sector_offset,
1730 					 fs_info->sectorsize);
1731 		WARN_ON(reserve_bytes == 0);
1732 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1733 				reserve_bytes);
1734 		if (ret) {
1735 			if (!only_release_metadata)
1736 				btrfs_free_reserved_data_space(BTRFS_I(inode),
1737 						data_reserved, pos,
1738 						write_bytes);
1739 			else
1740 				btrfs_check_nocow_unlock(BTRFS_I(inode));
1741 			break;
1742 		}
1743 
1744 		release_bytes = reserve_bytes;
1745 again:
1746 		/*
1747 		 * This is going to setup the pages array with the number of
1748 		 * pages we want, so we don't really need to worry about the
1749 		 * contents of pages from loop to loop
1750 		 */
1751 		ret = prepare_pages(inode, pages, num_pages,
1752 				    pos, write_bytes,
1753 				    force_page_uptodate);
1754 		if (ret) {
1755 			btrfs_delalloc_release_extents(BTRFS_I(inode),
1756 						       reserve_bytes);
1757 			break;
1758 		}
1759 
1760 		extents_locked = lock_and_cleanup_extent_if_need(
1761 				BTRFS_I(inode), pages,
1762 				num_pages, pos, write_bytes, &lockstart,
1763 				&lockend, &cached_state);
1764 		if (extents_locked < 0) {
1765 			if (extents_locked == -EAGAIN)
1766 				goto again;
1767 			btrfs_delalloc_release_extents(BTRFS_I(inode),
1768 						       reserve_bytes);
1769 			ret = extents_locked;
1770 			break;
1771 		}
1772 
1773 		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1774 
1775 		num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1776 		dirty_sectors = round_up(copied + sector_offset,
1777 					fs_info->sectorsize);
1778 		dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1779 
1780 		/*
1781 		 * if we have trouble faulting in the pages, fall
1782 		 * back to one page at a time
1783 		 */
1784 		if (copied < write_bytes)
1785 			nrptrs = 1;
1786 
1787 		if (copied == 0) {
1788 			force_page_uptodate = true;
1789 			dirty_sectors = 0;
1790 			dirty_pages = 0;
1791 		} else {
1792 			force_page_uptodate = false;
1793 			dirty_pages = DIV_ROUND_UP(copied + offset,
1794 						   PAGE_SIZE);
1795 		}
1796 
1797 		if (num_sectors > dirty_sectors) {
1798 			/* release everything except the sectors we dirtied */
1799 			release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1800 			if (only_release_metadata) {
1801 				btrfs_delalloc_release_metadata(BTRFS_I(inode),
1802 							release_bytes, true);
1803 			} else {
1804 				u64 __pos;
1805 
1806 				__pos = round_down(pos,
1807 						   fs_info->sectorsize) +
1808 					(dirty_pages << PAGE_SHIFT);
1809 				btrfs_delalloc_release_space(BTRFS_I(inode),
1810 						data_reserved, __pos,
1811 						release_bytes, true);
1812 			}
1813 		}
1814 
1815 		release_bytes = round_up(copied + sector_offset,
1816 					fs_info->sectorsize);
1817 
1818 		ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1819 					dirty_pages, pos, copied,
1820 					&cached_state, only_release_metadata);
1821 
1822 		/*
1823 		 * If we have not locked the extent range, because the range's
1824 		 * start offset is >= i_size, we might still have a non-NULL
1825 		 * cached extent state, acquired while marking the extent range
1826 		 * as delalloc through btrfs_dirty_pages(). Therefore free any
1827 		 * possible cached extent state to avoid a memory leak.
1828 		 */
1829 		if (extents_locked)
1830 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1831 					     lockstart, lockend, &cached_state);
1832 		else
1833 			free_extent_state(cached_state);
1834 
1835 		btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1836 		if (ret) {
1837 			btrfs_drop_pages(pages, num_pages);
1838 			break;
1839 		}
1840 
1841 		release_bytes = 0;
1842 		if (only_release_metadata)
1843 			btrfs_check_nocow_unlock(BTRFS_I(inode));
1844 
1845 		btrfs_drop_pages(pages, num_pages);
1846 
1847 		cond_resched();
1848 
1849 		balance_dirty_pages_ratelimited(inode->i_mapping);
1850 
1851 		pos += copied;
1852 		num_written += copied;
1853 	}
1854 
1855 	kfree(pages);
1856 
1857 	if (release_bytes) {
1858 		if (only_release_metadata) {
1859 			btrfs_check_nocow_unlock(BTRFS_I(inode));
1860 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
1861 					release_bytes, true);
1862 		} else {
1863 			btrfs_delalloc_release_space(BTRFS_I(inode),
1864 					data_reserved,
1865 					round_down(pos, fs_info->sectorsize),
1866 					release_bytes, true);
1867 		}
1868 	}
1869 
1870 	extent_changeset_free(data_reserved);
1871 	if (num_written > 0) {
1872 		pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1873 		iocb->ki_pos += num_written;
1874 	}
1875 out:
1876 	btrfs_inode_unlock(inode, ilock_flags);
1877 	return num_written ? num_written : ret;
1878 }
1879 
1880 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1881 			       const struct iov_iter *iter, loff_t offset)
1882 {
1883 	const u32 blocksize_mask = fs_info->sectorsize - 1;
1884 
1885 	if (offset & blocksize_mask)
1886 		return -EINVAL;
1887 
1888 	if (iov_iter_alignment(iter) & blocksize_mask)
1889 		return -EINVAL;
1890 
1891 	return 0;
1892 }
1893 
1894 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1895 {
1896 	struct file *file = iocb->ki_filp;
1897 	struct inode *inode = file_inode(file);
1898 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1899 	loff_t pos;
1900 	ssize_t written = 0;
1901 	ssize_t written_buffered;
1902 	loff_t endbyte;
1903 	ssize_t err;
1904 	unsigned int ilock_flags = 0;
1905 	struct iomap_dio *dio = NULL;
1906 
1907 	if (iocb->ki_flags & IOCB_NOWAIT)
1908 		ilock_flags |= BTRFS_ILOCK_TRY;
1909 
1910 	/* If the write DIO is within EOF, use a shared lock */
1911 	if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode))
1912 		ilock_flags |= BTRFS_ILOCK_SHARED;
1913 
1914 relock:
1915 	err = btrfs_inode_lock(inode, ilock_flags);
1916 	if (err < 0)
1917 		return err;
1918 
1919 	err = generic_write_checks(iocb, from);
1920 	if (err <= 0) {
1921 		btrfs_inode_unlock(inode, ilock_flags);
1922 		return err;
1923 	}
1924 
1925 	err = btrfs_write_check(iocb, from, err);
1926 	if (err < 0) {
1927 		btrfs_inode_unlock(inode, ilock_flags);
1928 		goto out;
1929 	}
1930 
1931 	pos = iocb->ki_pos;
1932 	/*
1933 	 * Re-check since file size may have changed just before taking the
1934 	 * lock or pos may have changed because of O_APPEND in generic_write_check()
1935 	 */
1936 	if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1937 	    pos + iov_iter_count(from) > i_size_read(inode)) {
1938 		btrfs_inode_unlock(inode, ilock_flags);
1939 		ilock_flags &= ~BTRFS_ILOCK_SHARED;
1940 		goto relock;
1941 	}
1942 
1943 	if (check_direct_IO(fs_info, from, pos)) {
1944 		btrfs_inode_unlock(inode, ilock_flags);
1945 		goto buffered;
1946 	}
1947 
1948 	dio = __iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
1949 			     0);
1950 
1951 	btrfs_inode_unlock(inode, ilock_flags);
1952 
1953 	if (IS_ERR_OR_NULL(dio)) {
1954 		err = PTR_ERR_OR_ZERO(dio);
1955 		if (err < 0 && err != -ENOTBLK)
1956 			goto out;
1957 	} else {
1958 		written = iomap_dio_complete(dio);
1959 	}
1960 
1961 	if (written < 0 || !iov_iter_count(from)) {
1962 		err = written;
1963 		goto out;
1964 	}
1965 
1966 buffered:
1967 	pos = iocb->ki_pos;
1968 	written_buffered = btrfs_buffered_write(iocb, from);
1969 	if (written_buffered < 0) {
1970 		err = written_buffered;
1971 		goto out;
1972 	}
1973 	/*
1974 	 * Ensure all data is persisted. We want the next direct IO read to be
1975 	 * able to read what was just written.
1976 	 */
1977 	endbyte = pos + written_buffered - 1;
1978 	err = btrfs_fdatawrite_range(inode, pos, endbyte);
1979 	if (err)
1980 		goto out;
1981 	err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1982 	if (err)
1983 		goto out;
1984 	written += written_buffered;
1985 	iocb->ki_pos = pos + written_buffered;
1986 	invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1987 				 endbyte >> PAGE_SHIFT);
1988 out:
1989 	return written ? written : err;
1990 }
1991 
1992 static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1993 				    struct iov_iter *from)
1994 {
1995 	struct file *file = iocb->ki_filp;
1996 	struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1997 	ssize_t num_written = 0;
1998 	const bool sync = iocb->ki_flags & IOCB_DSYNC;
1999 
2000 	/*
2001 	 * If the fs flips readonly due to some impossible error, although we
2002 	 * have opened a file as writable, we have to stop this write operation
2003 	 * to ensure consistency.
2004 	 */
2005 	if (test_bit(BTRFS_FS_STATE_ERROR, &inode->root->fs_info->fs_state))
2006 		return -EROFS;
2007 
2008 	if (!(iocb->ki_flags & IOCB_DIRECT) &&
2009 	    (iocb->ki_flags & IOCB_NOWAIT))
2010 		return -EOPNOTSUPP;
2011 
2012 	if (sync)
2013 		atomic_inc(&inode->sync_writers);
2014 
2015 	if (iocb->ki_flags & IOCB_DIRECT)
2016 		num_written = btrfs_direct_write(iocb, from);
2017 	else
2018 		num_written = btrfs_buffered_write(iocb, from);
2019 
2020 	btrfs_set_inode_last_sub_trans(inode);
2021 
2022 	if (num_written > 0)
2023 		num_written = generic_write_sync(iocb, num_written);
2024 
2025 	if (sync)
2026 		atomic_dec(&inode->sync_writers);
2027 
2028 	current->backing_dev_info = NULL;
2029 	return num_written;
2030 }
2031 
2032 int btrfs_release_file(struct inode *inode, struct file *filp)
2033 {
2034 	struct btrfs_file_private *private = filp->private_data;
2035 
2036 	if (private && private->filldir_buf)
2037 		kfree(private->filldir_buf);
2038 	kfree(private);
2039 	filp->private_data = NULL;
2040 
2041 	/*
2042 	 * Set by setattr when we are about to truncate a file from a non-zero
2043 	 * size to a zero size.  This tries to flush down new bytes that may
2044 	 * have been written if the application were using truncate to replace
2045 	 * a file in place.
2046 	 */
2047 	if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
2048 			       &BTRFS_I(inode)->runtime_flags))
2049 			filemap_flush(inode->i_mapping);
2050 	return 0;
2051 }
2052 
2053 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
2054 {
2055 	int ret;
2056 	struct blk_plug plug;
2057 
2058 	/*
2059 	 * This is only called in fsync, which would do synchronous writes, so
2060 	 * a plug can merge adjacent IOs as much as possible.  Esp. in case of
2061 	 * multiple disks using raid profile, a large IO can be split to
2062 	 * several segments of stripe length (currently 64K).
2063 	 */
2064 	blk_start_plug(&plug);
2065 	atomic_inc(&BTRFS_I(inode)->sync_writers);
2066 	ret = btrfs_fdatawrite_range(inode, start, end);
2067 	atomic_dec(&BTRFS_I(inode)->sync_writers);
2068 	blk_finish_plug(&plug);
2069 
2070 	return ret;
2071 }
2072 
2073 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
2074 {
2075 	struct btrfs_inode *inode = BTRFS_I(ctx->inode);
2076 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2077 
2078 	if (btrfs_inode_in_log(inode, fs_info->generation) &&
2079 	    list_empty(&ctx->ordered_extents))
2080 		return true;
2081 
2082 	/*
2083 	 * If we are doing a fast fsync we can not bail out if the inode's
2084 	 * last_trans is <= then the last committed transaction, because we only
2085 	 * update the last_trans of the inode during ordered extent completion,
2086 	 * and for a fast fsync we don't wait for that, we only wait for the
2087 	 * writeback to complete.
2088 	 */
2089 	if (inode->last_trans <= fs_info->last_trans_committed &&
2090 	    (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
2091 	     list_empty(&ctx->ordered_extents)))
2092 		return true;
2093 
2094 	return false;
2095 }
2096 
2097 /*
2098  * fsync call for both files and directories.  This logs the inode into
2099  * the tree log instead of forcing full commits whenever possible.
2100  *
2101  * It needs to call filemap_fdatawait so that all ordered extent updates are
2102  * in the metadata btree are up to date for copying to the log.
2103  *
2104  * It drops the inode mutex before doing the tree log commit.  This is an
2105  * important optimization for directories because holding the mutex prevents
2106  * new operations on the dir while we write to disk.
2107  */
2108 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
2109 {
2110 	struct dentry *dentry = file_dentry(file);
2111 	struct inode *inode = d_inode(dentry);
2112 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2113 	struct btrfs_root *root = BTRFS_I(inode)->root;
2114 	struct btrfs_trans_handle *trans;
2115 	struct btrfs_log_ctx ctx;
2116 	int ret = 0, err;
2117 	u64 len;
2118 	bool full_sync;
2119 
2120 	trace_btrfs_sync_file(file, datasync);
2121 
2122 	btrfs_init_log_ctx(&ctx, inode);
2123 
2124 	/*
2125 	 * Always set the range to a full range, otherwise we can get into
2126 	 * several problems, from missing file extent items to represent holes
2127 	 * when not using the NO_HOLES feature, to log tree corruption due to
2128 	 * races between hole detection during logging and completion of ordered
2129 	 * extents outside the range, to missing checksums due to ordered extents
2130 	 * for which we flushed only a subset of their pages.
2131 	 */
2132 	start = 0;
2133 	end = LLONG_MAX;
2134 	len = (u64)LLONG_MAX + 1;
2135 
2136 	/*
2137 	 * We write the dirty pages in the range and wait until they complete
2138 	 * out of the ->i_mutex. If so, we can flush the dirty pages by
2139 	 * multi-task, and make the performance up.  See
2140 	 * btrfs_wait_ordered_range for an explanation of the ASYNC check.
2141 	 */
2142 	ret = start_ordered_ops(inode, start, end);
2143 	if (ret)
2144 		goto out;
2145 
2146 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2147 
2148 	atomic_inc(&root->log_batch);
2149 
2150 	/*
2151 	 * Always check for the full sync flag while holding the inode's lock,
2152 	 * to avoid races with other tasks. The flag must be either set all the
2153 	 * time during logging or always off all the time while logging.
2154 	 */
2155 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2156 			     &BTRFS_I(inode)->runtime_flags);
2157 
2158 	/*
2159 	 * Before we acquired the inode's lock and the mmap lock, someone may
2160 	 * have dirtied more pages in the target range. We need to make sure
2161 	 * that writeback for any such pages does not start while we are logging
2162 	 * the inode, because if it does, any of the following might happen when
2163 	 * we are not doing a full inode sync:
2164 	 *
2165 	 * 1) We log an extent after its writeback finishes but before its
2166 	 *    checksums are added to the csum tree, leading to -EIO errors
2167 	 *    when attempting to read the extent after a log replay.
2168 	 *
2169 	 * 2) We can end up logging an extent before its writeback finishes.
2170 	 *    Therefore after the log replay we will have a file extent item
2171 	 *    pointing to an unwritten extent (and no data checksums as well).
2172 	 *
2173 	 * So trigger writeback for any eventual new dirty pages and then we
2174 	 * wait for all ordered extents to complete below.
2175 	 */
2176 	ret = start_ordered_ops(inode, start, end);
2177 	if (ret) {
2178 		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2179 		goto out;
2180 	}
2181 
2182 	/*
2183 	 * We have to do this here to avoid the priority inversion of waiting on
2184 	 * IO of a lower priority task while holding a transaction open.
2185 	 *
2186 	 * For a full fsync we wait for the ordered extents to complete while
2187 	 * for a fast fsync we wait just for writeback to complete, and then
2188 	 * attach the ordered extents to the transaction so that a transaction
2189 	 * commit waits for their completion, to avoid data loss if we fsync,
2190 	 * the current transaction commits before the ordered extents complete
2191 	 * and a power failure happens right after that.
2192 	 *
2193 	 * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
2194 	 * logical address recorded in the ordered extent may change. We need
2195 	 * to wait for the IO to stabilize the logical address.
2196 	 */
2197 	if (full_sync || btrfs_is_zoned(fs_info)) {
2198 		ret = btrfs_wait_ordered_range(inode, start, len);
2199 	} else {
2200 		/*
2201 		 * Get our ordered extents as soon as possible to avoid doing
2202 		 * checksum lookups in the csum tree, and use instead the
2203 		 * checksums attached to the ordered extents.
2204 		 */
2205 		btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
2206 						      &ctx.ordered_extents);
2207 		ret = filemap_fdatawait_range(inode->i_mapping, start, end);
2208 	}
2209 
2210 	if (ret)
2211 		goto out_release_extents;
2212 
2213 	atomic_inc(&root->log_batch);
2214 
2215 	smp_mb();
2216 	if (skip_inode_logging(&ctx)) {
2217 		/*
2218 		 * We've had everything committed since the last time we were
2219 		 * modified so clear this flag in case it was set for whatever
2220 		 * reason, it's no longer relevant.
2221 		 */
2222 		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2223 			  &BTRFS_I(inode)->runtime_flags);
2224 		/*
2225 		 * An ordered extent might have started before and completed
2226 		 * already with io errors, in which case the inode was not
2227 		 * updated and we end up here. So check the inode's mapping
2228 		 * for any errors that might have happened since we last
2229 		 * checked called fsync.
2230 		 */
2231 		ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
2232 		goto out_release_extents;
2233 	}
2234 
2235 	/*
2236 	 * We use start here because we will need to wait on the IO to complete
2237 	 * in btrfs_sync_log, which could require joining a transaction (for
2238 	 * example checking cross references in the nocow path).  If we use join
2239 	 * here we could get into a situation where we're waiting on IO to
2240 	 * happen that is blocked on a transaction trying to commit.  With start
2241 	 * we inc the extwriter counter, so we wait for all extwriters to exit
2242 	 * before we start blocking joiners.  This comment is to keep somebody
2243 	 * from thinking they are super smart and changing this to
2244 	 * btrfs_join_transaction *cough*Josef*cough*.
2245 	 */
2246 	trans = btrfs_start_transaction(root, 0);
2247 	if (IS_ERR(trans)) {
2248 		ret = PTR_ERR(trans);
2249 		goto out_release_extents;
2250 	}
2251 	trans->in_fsync = true;
2252 
2253 	ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
2254 	btrfs_release_log_ctx_extents(&ctx);
2255 	if (ret < 0) {
2256 		/* Fallthrough and commit/free transaction. */
2257 		ret = 1;
2258 	}
2259 
2260 	/* we've logged all the items and now have a consistent
2261 	 * version of the file in the log.  It is possible that
2262 	 * someone will come in and modify the file, but that's
2263 	 * fine because the log is consistent on disk, and we
2264 	 * have references to all of the file's extents
2265 	 *
2266 	 * It is possible that someone will come in and log the
2267 	 * file again, but that will end up using the synchronization
2268 	 * inside btrfs_sync_log to keep things safe.
2269 	 */
2270 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2271 
2272 	if (ret != BTRFS_NO_LOG_SYNC) {
2273 		if (!ret) {
2274 			ret = btrfs_sync_log(trans, root, &ctx);
2275 			if (!ret) {
2276 				ret = btrfs_end_transaction(trans);
2277 				goto out;
2278 			}
2279 		}
2280 		if (!full_sync) {
2281 			ret = btrfs_wait_ordered_range(inode, start, len);
2282 			if (ret) {
2283 				btrfs_end_transaction(trans);
2284 				goto out;
2285 			}
2286 		}
2287 		ret = btrfs_commit_transaction(trans);
2288 	} else {
2289 		ret = btrfs_end_transaction(trans);
2290 	}
2291 out:
2292 	ASSERT(list_empty(&ctx.list));
2293 	err = file_check_and_advance_wb_err(file);
2294 	if (!ret)
2295 		ret = err;
2296 	return ret > 0 ? -EIO : ret;
2297 
2298 out_release_extents:
2299 	btrfs_release_log_ctx_extents(&ctx);
2300 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2301 	goto out;
2302 }
2303 
2304 static const struct vm_operations_struct btrfs_file_vm_ops = {
2305 	.fault		= filemap_fault,
2306 	.map_pages	= filemap_map_pages,
2307 	.page_mkwrite	= btrfs_page_mkwrite,
2308 };
2309 
2310 static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
2311 {
2312 	struct address_space *mapping = filp->f_mapping;
2313 
2314 	if (!mapping->a_ops->readpage)
2315 		return -ENOEXEC;
2316 
2317 	file_accessed(filp);
2318 	vma->vm_ops = &btrfs_file_vm_ops;
2319 
2320 	return 0;
2321 }
2322 
2323 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2324 			  int slot, u64 start, u64 end)
2325 {
2326 	struct btrfs_file_extent_item *fi;
2327 	struct btrfs_key key;
2328 
2329 	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2330 		return 0;
2331 
2332 	btrfs_item_key_to_cpu(leaf, &key, slot);
2333 	if (key.objectid != btrfs_ino(inode) ||
2334 	    key.type != BTRFS_EXTENT_DATA_KEY)
2335 		return 0;
2336 
2337 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2338 
2339 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2340 		return 0;
2341 
2342 	if (btrfs_file_extent_disk_bytenr(leaf, fi))
2343 		return 0;
2344 
2345 	if (key.offset == end)
2346 		return 1;
2347 	if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2348 		return 1;
2349 	return 0;
2350 }
2351 
2352 static int fill_holes(struct btrfs_trans_handle *trans,
2353 		struct btrfs_inode *inode,
2354 		struct btrfs_path *path, u64 offset, u64 end)
2355 {
2356 	struct btrfs_fs_info *fs_info = trans->fs_info;
2357 	struct btrfs_root *root = inode->root;
2358 	struct extent_buffer *leaf;
2359 	struct btrfs_file_extent_item *fi;
2360 	struct extent_map *hole_em;
2361 	struct extent_map_tree *em_tree = &inode->extent_tree;
2362 	struct btrfs_key key;
2363 	int ret;
2364 
2365 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
2366 		goto out;
2367 
2368 	key.objectid = btrfs_ino(inode);
2369 	key.type = BTRFS_EXTENT_DATA_KEY;
2370 	key.offset = offset;
2371 
2372 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2373 	if (ret <= 0) {
2374 		/*
2375 		 * We should have dropped this offset, so if we find it then
2376 		 * something has gone horribly wrong.
2377 		 */
2378 		if (ret == 0)
2379 			ret = -EINVAL;
2380 		return ret;
2381 	}
2382 
2383 	leaf = path->nodes[0];
2384 	if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2385 		u64 num_bytes;
2386 
2387 		path->slots[0]--;
2388 		fi = btrfs_item_ptr(leaf, path->slots[0],
2389 				    struct btrfs_file_extent_item);
2390 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2391 			end - offset;
2392 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2393 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2394 		btrfs_set_file_extent_offset(leaf, fi, 0);
2395 		btrfs_mark_buffer_dirty(leaf);
2396 		goto out;
2397 	}
2398 
2399 	if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2400 		u64 num_bytes;
2401 
2402 		key.offset = offset;
2403 		btrfs_set_item_key_safe(fs_info, path, &key);
2404 		fi = btrfs_item_ptr(leaf, path->slots[0],
2405 				    struct btrfs_file_extent_item);
2406 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2407 			offset;
2408 		btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2409 		btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2410 		btrfs_set_file_extent_offset(leaf, fi, 0);
2411 		btrfs_mark_buffer_dirty(leaf);
2412 		goto out;
2413 	}
2414 	btrfs_release_path(path);
2415 
2416 	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(inode),
2417 			offset, 0, 0, end - offset, 0, end - offset, 0, 0, 0);
2418 	if (ret)
2419 		return ret;
2420 
2421 out:
2422 	btrfs_release_path(path);
2423 
2424 	hole_em = alloc_extent_map();
2425 	if (!hole_em) {
2426 		btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2427 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2428 	} else {
2429 		hole_em->start = offset;
2430 		hole_em->len = end - offset;
2431 		hole_em->ram_bytes = hole_em->len;
2432 		hole_em->orig_start = offset;
2433 
2434 		hole_em->block_start = EXTENT_MAP_HOLE;
2435 		hole_em->block_len = 0;
2436 		hole_em->orig_block_len = 0;
2437 		hole_em->compress_type = BTRFS_COMPRESS_NONE;
2438 		hole_em->generation = trans->transid;
2439 
2440 		do {
2441 			btrfs_drop_extent_cache(inode, offset, end - 1, 0);
2442 			write_lock(&em_tree->lock);
2443 			ret = add_extent_mapping(em_tree, hole_em, 1);
2444 			write_unlock(&em_tree->lock);
2445 		} while (ret == -EEXIST);
2446 		free_extent_map(hole_em);
2447 		if (ret)
2448 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
2449 					&inode->runtime_flags);
2450 	}
2451 
2452 	return 0;
2453 }
2454 
2455 /*
2456  * Find a hole extent on given inode and change start/len to the end of hole
2457  * extent.(hole/vacuum extent whose em->start <= start &&
2458  *	   em->start + em->len > start)
2459  * When a hole extent is found, return 1 and modify start/len.
2460  */
2461 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2462 {
2463 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2464 	struct extent_map *em;
2465 	int ret = 0;
2466 
2467 	em = btrfs_get_extent(inode, NULL, 0,
2468 			      round_down(*start, fs_info->sectorsize),
2469 			      round_up(*len, fs_info->sectorsize));
2470 	if (IS_ERR(em))
2471 		return PTR_ERR(em);
2472 
2473 	/* Hole or vacuum extent(only exists in no-hole mode) */
2474 	if (em->block_start == EXTENT_MAP_HOLE) {
2475 		ret = 1;
2476 		*len = em->start + em->len > *start + *len ?
2477 		       0 : *start + *len - em->start - em->len;
2478 		*start = em->start + em->len;
2479 	}
2480 	free_extent_map(em);
2481 	return ret;
2482 }
2483 
2484 static int btrfs_punch_hole_lock_range(struct inode *inode,
2485 				       const u64 lockstart,
2486 				       const u64 lockend,
2487 				       struct extent_state **cached_state)
2488 {
2489 	/*
2490 	 * For subpage case, if the range is not at page boundary, we could
2491 	 * have pages at the leading/tailing part of the range.
2492 	 * This could lead to dead loop since filemap_range_has_page()
2493 	 * will always return true.
2494 	 * So here we need to do extra page alignment for
2495 	 * filemap_range_has_page().
2496 	 */
2497 	const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2498 	const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2499 
2500 	while (1) {
2501 		struct btrfs_ordered_extent *ordered;
2502 		int ret;
2503 
2504 		truncate_pagecache_range(inode, lockstart, lockend);
2505 
2506 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2507 				 cached_state);
2508 		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
2509 							    lockend);
2510 
2511 		/*
2512 		 * We need to make sure we have no ordered extents in this range
2513 		 * and nobody raced in and read a page in this range, if we did
2514 		 * we need to try again.
2515 		 */
2516 		if ((!ordered ||
2517 		    (ordered->file_offset + ordered->num_bytes <= lockstart ||
2518 		     ordered->file_offset > lockend)) &&
2519 		     !filemap_range_has_page(inode->i_mapping,
2520 					     page_lockstart, page_lockend)) {
2521 			if (ordered)
2522 				btrfs_put_ordered_extent(ordered);
2523 			break;
2524 		}
2525 		if (ordered)
2526 			btrfs_put_ordered_extent(ordered);
2527 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
2528 				     lockend, cached_state);
2529 		ret = btrfs_wait_ordered_range(inode, lockstart,
2530 					       lockend - lockstart + 1);
2531 		if (ret)
2532 			return ret;
2533 	}
2534 	return 0;
2535 }
2536 
2537 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2538 				     struct btrfs_inode *inode,
2539 				     struct btrfs_path *path,
2540 				     struct btrfs_replace_extent_info *extent_info,
2541 				     const u64 replace_len,
2542 				     const u64 bytes_to_drop)
2543 {
2544 	struct btrfs_fs_info *fs_info = trans->fs_info;
2545 	struct btrfs_root *root = inode->root;
2546 	struct btrfs_file_extent_item *extent;
2547 	struct extent_buffer *leaf;
2548 	struct btrfs_key key;
2549 	int slot;
2550 	struct btrfs_ref ref = { 0 };
2551 	int ret;
2552 
2553 	if (replace_len == 0)
2554 		return 0;
2555 
2556 	if (extent_info->disk_offset == 0 &&
2557 	    btrfs_fs_incompat(fs_info, NO_HOLES)) {
2558 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2559 		return 0;
2560 	}
2561 
2562 	key.objectid = btrfs_ino(inode);
2563 	key.type = BTRFS_EXTENT_DATA_KEY;
2564 	key.offset = extent_info->file_offset;
2565 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2566 				      sizeof(struct btrfs_file_extent_item));
2567 	if (ret)
2568 		return ret;
2569 	leaf = path->nodes[0];
2570 	slot = path->slots[0];
2571 	write_extent_buffer(leaf, extent_info->extent_buf,
2572 			    btrfs_item_ptr_offset(leaf, slot),
2573 			    sizeof(struct btrfs_file_extent_item));
2574 	extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2575 	ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2576 	btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2577 	btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2578 	if (extent_info->is_new_extent)
2579 		btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2580 	btrfs_mark_buffer_dirty(leaf);
2581 	btrfs_release_path(path);
2582 
2583 	ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2584 						replace_len);
2585 	if (ret)
2586 		return ret;
2587 
2588 	/* If it's a hole, nothing more needs to be done. */
2589 	if (extent_info->disk_offset == 0) {
2590 		btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2591 		return 0;
2592 	}
2593 
2594 	btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2595 
2596 	if (extent_info->is_new_extent && extent_info->insertions == 0) {
2597 		key.objectid = extent_info->disk_offset;
2598 		key.type = BTRFS_EXTENT_ITEM_KEY;
2599 		key.offset = extent_info->disk_len;
2600 		ret = btrfs_alloc_reserved_file_extent(trans, root,
2601 						       btrfs_ino(inode),
2602 						       extent_info->file_offset,
2603 						       extent_info->qgroup_reserved,
2604 						       &key);
2605 	} else {
2606 		u64 ref_offset;
2607 
2608 		btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2609 				       extent_info->disk_offset,
2610 				       extent_info->disk_len, 0);
2611 		ref_offset = extent_info->file_offset - extent_info->data_offset;
2612 		btrfs_init_data_ref(&ref, root->root_key.objectid,
2613 				    btrfs_ino(inode), ref_offset);
2614 		ret = btrfs_inc_extent_ref(trans, &ref);
2615 	}
2616 
2617 	extent_info->insertions++;
2618 
2619 	return ret;
2620 }
2621 
2622 /*
2623  * The respective range must have been previously locked, as well as the inode.
2624  * The end offset is inclusive (last byte of the range).
2625  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2626  * the file range with an extent.
2627  * When not punching a hole, we don't want to end up in a state where we dropped
2628  * extents without inserting a new one, so we must abort the transaction to avoid
2629  * a corruption.
2630  */
2631 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2632 			       struct btrfs_path *path, const u64 start,
2633 			       const u64 end,
2634 			       struct btrfs_replace_extent_info *extent_info,
2635 			       struct btrfs_trans_handle **trans_out)
2636 {
2637 	struct btrfs_drop_extents_args drop_args = { 0 };
2638 	struct btrfs_root *root = inode->root;
2639 	struct btrfs_fs_info *fs_info = root->fs_info;
2640 	u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2641 	u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2642 	struct btrfs_trans_handle *trans = NULL;
2643 	struct btrfs_block_rsv *rsv;
2644 	unsigned int rsv_count;
2645 	u64 cur_offset;
2646 	u64 len = end - start;
2647 	int ret = 0;
2648 
2649 	if (end <= start)
2650 		return -EINVAL;
2651 
2652 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2653 	if (!rsv) {
2654 		ret = -ENOMEM;
2655 		goto out;
2656 	}
2657 	rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2658 	rsv->failfast = 1;
2659 
2660 	/*
2661 	 * 1 - update the inode
2662 	 * 1 - removing the extents in the range
2663 	 * 1 - adding the hole extent if no_holes isn't set or if we are
2664 	 *     replacing the range with a new extent
2665 	 */
2666 	if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2667 		rsv_count = 3;
2668 	else
2669 		rsv_count = 2;
2670 
2671 	trans = btrfs_start_transaction(root, rsv_count);
2672 	if (IS_ERR(trans)) {
2673 		ret = PTR_ERR(trans);
2674 		trans = NULL;
2675 		goto out_free;
2676 	}
2677 
2678 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2679 				      min_size, false);
2680 	BUG_ON(ret);
2681 	trans->block_rsv = rsv;
2682 
2683 	cur_offset = start;
2684 	drop_args.path = path;
2685 	drop_args.end = end + 1;
2686 	drop_args.drop_cache = true;
2687 	while (cur_offset < end) {
2688 		drop_args.start = cur_offset;
2689 		ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2690 		/* If we are punching a hole decrement the inode's byte count */
2691 		if (!extent_info)
2692 			btrfs_update_inode_bytes(inode, 0,
2693 						 drop_args.bytes_found);
2694 		if (ret != -ENOSPC) {
2695 			/*
2696 			 * When cloning we want to avoid transaction aborts when
2697 			 * nothing was done and we are attempting to clone parts
2698 			 * of inline extents, in such cases -EOPNOTSUPP is
2699 			 * returned by __btrfs_drop_extents() without having
2700 			 * changed anything in the file.
2701 			 */
2702 			if (extent_info && !extent_info->is_new_extent &&
2703 			    ret && ret != -EOPNOTSUPP)
2704 				btrfs_abort_transaction(trans, ret);
2705 			break;
2706 		}
2707 
2708 		trans->block_rsv = &fs_info->trans_block_rsv;
2709 
2710 		if (!extent_info && cur_offset < drop_args.drop_end &&
2711 		    cur_offset < ino_size) {
2712 			ret = fill_holes(trans, inode, path, cur_offset,
2713 					 drop_args.drop_end);
2714 			if (ret) {
2715 				/*
2716 				 * If we failed then we didn't insert our hole
2717 				 * entries for the area we dropped, so now the
2718 				 * fs is corrupted, so we must abort the
2719 				 * transaction.
2720 				 */
2721 				btrfs_abort_transaction(trans, ret);
2722 				break;
2723 			}
2724 		} else if (!extent_info && cur_offset < drop_args.drop_end) {
2725 			/*
2726 			 * We are past the i_size here, but since we didn't
2727 			 * insert holes we need to clear the mapped area so we
2728 			 * know to not set disk_i_size in this area until a new
2729 			 * file extent is inserted here.
2730 			 */
2731 			ret = btrfs_inode_clear_file_extent_range(inode,
2732 					cur_offset,
2733 					drop_args.drop_end - cur_offset);
2734 			if (ret) {
2735 				/*
2736 				 * We couldn't clear our area, so we could
2737 				 * presumably adjust up and corrupt the fs, so
2738 				 * we need to abort.
2739 				 */
2740 				btrfs_abort_transaction(trans, ret);
2741 				break;
2742 			}
2743 		}
2744 
2745 		if (extent_info &&
2746 		    drop_args.drop_end > extent_info->file_offset) {
2747 			u64 replace_len = drop_args.drop_end -
2748 					  extent_info->file_offset;
2749 
2750 			ret = btrfs_insert_replace_extent(trans, inode,	path,
2751 					extent_info, replace_len,
2752 					drop_args.bytes_found);
2753 			if (ret) {
2754 				btrfs_abort_transaction(trans, ret);
2755 				break;
2756 			}
2757 			extent_info->data_len -= replace_len;
2758 			extent_info->data_offset += replace_len;
2759 			extent_info->file_offset += replace_len;
2760 		}
2761 
2762 		ret = btrfs_update_inode(trans, root, inode);
2763 		if (ret)
2764 			break;
2765 
2766 		btrfs_end_transaction(trans);
2767 		btrfs_btree_balance_dirty(fs_info);
2768 
2769 		trans = btrfs_start_transaction(root, rsv_count);
2770 		if (IS_ERR(trans)) {
2771 			ret = PTR_ERR(trans);
2772 			trans = NULL;
2773 			break;
2774 		}
2775 
2776 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2777 					      rsv, min_size, false);
2778 		BUG_ON(ret);	/* shouldn't happen */
2779 		trans->block_rsv = rsv;
2780 
2781 		cur_offset = drop_args.drop_end;
2782 		len = end - cur_offset;
2783 		if (!extent_info && len) {
2784 			ret = find_first_non_hole(inode, &cur_offset, &len);
2785 			if (unlikely(ret < 0))
2786 				break;
2787 			if (ret && !len) {
2788 				ret = 0;
2789 				break;
2790 			}
2791 		}
2792 	}
2793 
2794 	/*
2795 	 * If we were cloning, force the next fsync to be a full one since we
2796 	 * we replaced (or just dropped in the case of cloning holes when
2797 	 * NO_HOLES is enabled) file extent items and did not setup new extent
2798 	 * maps for the replacement extents (or holes).
2799 	 */
2800 	if (extent_info && !extent_info->is_new_extent)
2801 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
2802 
2803 	if (ret)
2804 		goto out_trans;
2805 
2806 	trans->block_rsv = &fs_info->trans_block_rsv;
2807 	/*
2808 	 * If we are using the NO_HOLES feature we might have had already an
2809 	 * hole that overlaps a part of the region [lockstart, lockend] and
2810 	 * ends at (or beyond) lockend. Since we have no file extent items to
2811 	 * represent holes, drop_end can be less than lockend and so we must
2812 	 * make sure we have an extent map representing the existing hole (the
2813 	 * call to __btrfs_drop_extents() might have dropped the existing extent
2814 	 * map representing the existing hole), otherwise the fast fsync path
2815 	 * will not record the existence of the hole region
2816 	 * [existing_hole_start, lockend].
2817 	 */
2818 	if (drop_args.drop_end <= end)
2819 		drop_args.drop_end = end + 1;
2820 	/*
2821 	 * Don't insert file hole extent item if it's for a range beyond eof
2822 	 * (because it's useless) or if it represents a 0 bytes range (when
2823 	 * cur_offset == drop_end).
2824 	 */
2825 	if (!extent_info && cur_offset < ino_size &&
2826 	    cur_offset < drop_args.drop_end) {
2827 		ret = fill_holes(trans, inode, path, cur_offset,
2828 				 drop_args.drop_end);
2829 		if (ret) {
2830 			/* Same comment as above. */
2831 			btrfs_abort_transaction(trans, ret);
2832 			goto out_trans;
2833 		}
2834 	} else if (!extent_info && cur_offset < drop_args.drop_end) {
2835 		/* See the comment in the loop above for the reasoning here. */
2836 		ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2837 					drop_args.drop_end - cur_offset);
2838 		if (ret) {
2839 			btrfs_abort_transaction(trans, ret);
2840 			goto out_trans;
2841 		}
2842 
2843 	}
2844 	if (extent_info) {
2845 		ret = btrfs_insert_replace_extent(trans, inode, path,
2846 				extent_info, extent_info->data_len,
2847 				drop_args.bytes_found);
2848 		if (ret) {
2849 			btrfs_abort_transaction(trans, ret);
2850 			goto out_trans;
2851 		}
2852 	}
2853 
2854 out_trans:
2855 	if (!trans)
2856 		goto out_free;
2857 
2858 	trans->block_rsv = &fs_info->trans_block_rsv;
2859 	if (ret)
2860 		btrfs_end_transaction(trans);
2861 	else
2862 		*trans_out = trans;
2863 out_free:
2864 	btrfs_free_block_rsv(fs_info, rsv);
2865 out:
2866 	return ret;
2867 }
2868 
2869 static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
2870 {
2871 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2872 	struct btrfs_root *root = BTRFS_I(inode)->root;
2873 	struct extent_state *cached_state = NULL;
2874 	struct btrfs_path *path;
2875 	struct btrfs_trans_handle *trans = NULL;
2876 	u64 lockstart;
2877 	u64 lockend;
2878 	u64 tail_start;
2879 	u64 tail_len;
2880 	u64 orig_start = offset;
2881 	int ret = 0;
2882 	bool same_block;
2883 	u64 ino_size;
2884 	bool truncated_block = false;
2885 	bool updated_inode = false;
2886 
2887 	ret = btrfs_wait_ordered_range(inode, offset, len);
2888 	if (ret)
2889 		return ret;
2890 
2891 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
2892 	ino_size = round_up(inode->i_size, fs_info->sectorsize);
2893 	ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2894 	if (ret < 0)
2895 		goto out_only_mutex;
2896 	if (ret && !len) {
2897 		/* Already in a large hole */
2898 		ret = 0;
2899 		goto out_only_mutex;
2900 	}
2901 
2902 	lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
2903 	lockend = round_down(offset + len,
2904 			     btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
2905 	same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2906 		== (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2907 	/*
2908 	 * We needn't truncate any block which is beyond the end of the file
2909 	 * because we are sure there is no data there.
2910 	 */
2911 	/*
2912 	 * Only do this if we are in the same block and we aren't doing the
2913 	 * entire block.
2914 	 */
2915 	if (same_block && len < fs_info->sectorsize) {
2916 		if (offset < ino_size) {
2917 			truncated_block = true;
2918 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2919 						   0);
2920 		} else {
2921 			ret = 0;
2922 		}
2923 		goto out_only_mutex;
2924 	}
2925 
2926 	/* zero back part of the first block */
2927 	if (offset < ino_size) {
2928 		truncated_block = true;
2929 		ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2930 		if (ret) {
2931 			btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
2932 			return ret;
2933 		}
2934 	}
2935 
2936 	/* Check the aligned pages after the first unaligned page,
2937 	 * if offset != orig_start, which means the first unaligned page
2938 	 * including several following pages are already in holes,
2939 	 * the extra check can be skipped */
2940 	if (offset == orig_start) {
2941 		/* after truncate page, check hole again */
2942 		len = offset + len - lockstart;
2943 		offset = lockstart;
2944 		ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2945 		if (ret < 0)
2946 			goto out_only_mutex;
2947 		if (ret && !len) {
2948 			ret = 0;
2949 			goto out_only_mutex;
2950 		}
2951 		lockstart = offset;
2952 	}
2953 
2954 	/* Check the tail unaligned part is in a hole */
2955 	tail_start = lockend + 1;
2956 	tail_len = offset + len - tail_start;
2957 	if (tail_len) {
2958 		ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2959 		if (unlikely(ret < 0))
2960 			goto out_only_mutex;
2961 		if (!ret) {
2962 			/* zero the front end of the last page */
2963 			if (tail_start + tail_len < ino_size) {
2964 				truncated_block = true;
2965 				ret = btrfs_truncate_block(BTRFS_I(inode),
2966 							tail_start + tail_len,
2967 							0, 1);
2968 				if (ret)
2969 					goto out_only_mutex;
2970 			}
2971 		}
2972 	}
2973 
2974 	if (lockend < lockstart) {
2975 		ret = 0;
2976 		goto out_only_mutex;
2977 	}
2978 
2979 	ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2980 					  &cached_state);
2981 	if (ret)
2982 		goto out_only_mutex;
2983 
2984 	path = btrfs_alloc_path();
2985 	if (!path) {
2986 		ret = -ENOMEM;
2987 		goto out;
2988 	}
2989 
2990 	ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2991 					 lockend, NULL, &trans);
2992 	btrfs_free_path(path);
2993 	if (ret)
2994 		goto out;
2995 
2996 	ASSERT(trans != NULL);
2997 	inode_inc_iversion(inode);
2998 	inode->i_mtime = inode->i_ctime = current_time(inode);
2999 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3000 	updated_inode = true;
3001 	btrfs_end_transaction(trans);
3002 	btrfs_btree_balance_dirty(fs_info);
3003 out:
3004 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3005 			     &cached_state);
3006 out_only_mutex:
3007 	if (!updated_inode && truncated_block && !ret) {
3008 		/*
3009 		 * If we only end up zeroing part of a page, we still need to
3010 		 * update the inode item, so that all the time fields are
3011 		 * updated as well as the necessary btrfs inode in memory fields
3012 		 * for detecting, at fsync time, if the inode isn't yet in the
3013 		 * log tree or it's there but not up to date.
3014 		 */
3015 		struct timespec64 now = current_time(inode);
3016 
3017 		inode_inc_iversion(inode);
3018 		inode->i_mtime = now;
3019 		inode->i_ctime = now;
3020 		trans = btrfs_start_transaction(root, 1);
3021 		if (IS_ERR(trans)) {
3022 			ret = PTR_ERR(trans);
3023 		} else {
3024 			int ret2;
3025 
3026 			ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3027 			ret2 = btrfs_end_transaction(trans);
3028 			if (!ret)
3029 				ret = ret2;
3030 		}
3031 	}
3032 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3033 	return ret;
3034 }
3035 
3036 /* Helper structure to record which range is already reserved */
3037 struct falloc_range {
3038 	struct list_head list;
3039 	u64 start;
3040 	u64 len;
3041 };
3042 
3043 /*
3044  * Helper function to add falloc range
3045  *
3046  * Caller should have locked the larger range of extent containing
3047  * [start, len)
3048  */
3049 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
3050 {
3051 	struct falloc_range *range = NULL;
3052 
3053 	if (!list_empty(head)) {
3054 		/*
3055 		 * As fallocate iterates by bytenr order, we only need to check
3056 		 * the last range.
3057 		 */
3058 		range = list_last_entry(head, struct falloc_range, list);
3059 		if (range->start + range->len == start) {
3060 			range->len += len;
3061 			return 0;
3062 		}
3063 	}
3064 
3065 	range = kmalloc(sizeof(*range), GFP_KERNEL);
3066 	if (!range)
3067 		return -ENOMEM;
3068 	range->start = start;
3069 	range->len = len;
3070 	list_add_tail(&range->list, head);
3071 	return 0;
3072 }
3073 
3074 static int btrfs_fallocate_update_isize(struct inode *inode,
3075 					const u64 end,
3076 					const int mode)
3077 {
3078 	struct btrfs_trans_handle *trans;
3079 	struct btrfs_root *root = BTRFS_I(inode)->root;
3080 	int ret;
3081 	int ret2;
3082 
3083 	if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
3084 		return 0;
3085 
3086 	trans = btrfs_start_transaction(root, 1);
3087 	if (IS_ERR(trans))
3088 		return PTR_ERR(trans);
3089 
3090 	inode->i_ctime = current_time(inode);
3091 	i_size_write(inode, end);
3092 	btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
3093 	ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
3094 	ret2 = btrfs_end_transaction(trans);
3095 
3096 	return ret ? ret : ret2;
3097 }
3098 
3099 enum {
3100 	RANGE_BOUNDARY_WRITTEN_EXTENT,
3101 	RANGE_BOUNDARY_PREALLOC_EXTENT,
3102 	RANGE_BOUNDARY_HOLE,
3103 };
3104 
3105 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
3106 						 u64 offset)
3107 {
3108 	const u64 sectorsize = btrfs_inode_sectorsize(inode);
3109 	struct extent_map *em;
3110 	int ret;
3111 
3112 	offset = round_down(offset, sectorsize);
3113 	em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
3114 	if (IS_ERR(em))
3115 		return PTR_ERR(em);
3116 
3117 	if (em->block_start == EXTENT_MAP_HOLE)
3118 		ret = RANGE_BOUNDARY_HOLE;
3119 	else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
3120 		ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
3121 	else
3122 		ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
3123 
3124 	free_extent_map(em);
3125 	return ret;
3126 }
3127 
3128 static int btrfs_zero_range(struct inode *inode,
3129 			    loff_t offset,
3130 			    loff_t len,
3131 			    const int mode)
3132 {
3133 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
3134 	struct extent_map *em;
3135 	struct extent_changeset *data_reserved = NULL;
3136 	int ret;
3137 	u64 alloc_hint = 0;
3138 	const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode));
3139 	u64 alloc_start = round_down(offset, sectorsize);
3140 	u64 alloc_end = round_up(offset + len, sectorsize);
3141 	u64 bytes_to_reserve = 0;
3142 	bool space_reserved = false;
3143 
3144 	inode_dio_wait(inode);
3145 
3146 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3147 			      alloc_end - alloc_start);
3148 	if (IS_ERR(em)) {
3149 		ret = PTR_ERR(em);
3150 		goto out;
3151 	}
3152 
3153 	/*
3154 	 * Avoid hole punching and extent allocation for some cases. More cases
3155 	 * could be considered, but these are unlikely common and we keep things
3156 	 * as simple as possible for now. Also, intentionally, if the target
3157 	 * range contains one or more prealloc extents together with regular
3158 	 * extents and holes, we drop all the existing extents and allocate a
3159 	 * new prealloc extent, so that we get a larger contiguous disk extent.
3160 	 */
3161 	if (em->start <= alloc_start &&
3162 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3163 		const u64 em_end = em->start + em->len;
3164 
3165 		if (em_end >= offset + len) {
3166 			/*
3167 			 * The whole range is already a prealloc extent,
3168 			 * do nothing except updating the inode's i_size if
3169 			 * needed.
3170 			 */
3171 			free_extent_map(em);
3172 			ret = btrfs_fallocate_update_isize(inode, offset + len,
3173 							   mode);
3174 			goto out;
3175 		}
3176 		/*
3177 		 * Part of the range is already a prealloc extent, so operate
3178 		 * only on the remaining part of the range.
3179 		 */
3180 		alloc_start = em_end;
3181 		ASSERT(IS_ALIGNED(alloc_start, sectorsize));
3182 		len = offset + len - alloc_start;
3183 		offset = alloc_start;
3184 		alloc_hint = em->block_start + em->len;
3185 	}
3186 	free_extent_map(em);
3187 
3188 	if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
3189 	    BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
3190 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
3191 				      sectorsize);
3192 		if (IS_ERR(em)) {
3193 			ret = PTR_ERR(em);
3194 			goto out;
3195 		}
3196 
3197 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
3198 			free_extent_map(em);
3199 			ret = btrfs_fallocate_update_isize(inode, offset + len,
3200 							   mode);
3201 			goto out;
3202 		}
3203 		if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
3204 			free_extent_map(em);
3205 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
3206 						   0);
3207 			if (!ret)
3208 				ret = btrfs_fallocate_update_isize(inode,
3209 								   offset + len,
3210 								   mode);
3211 			return ret;
3212 		}
3213 		free_extent_map(em);
3214 		alloc_start = round_down(offset, sectorsize);
3215 		alloc_end = alloc_start + sectorsize;
3216 		goto reserve_space;
3217 	}
3218 
3219 	alloc_start = round_up(offset, sectorsize);
3220 	alloc_end = round_down(offset + len, sectorsize);
3221 
3222 	/*
3223 	 * For unaligned ranges, check the pages at the boundaries, they might
3224 	 * map to an extent, in which case we need to partially zero them, or
3225 	 * they might map to a hole, in which case we need our allocation range
3226 	 * to cover them.
3227 	 */
3228 	if (!IS_ALIGNED(offset, sectorsize)) {
3229 		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3230 							    offset);
3231 		if (ret < 0)
3232 			goto out;
3233 		if (ret == RANGE_BOUNDARY_HOLE) {
3234 			alloc_start = round_down(offset, sectorsize);
3235 			ret = 0;
3236 		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3237 			ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
3238 			if (ret)
3239 				goto out;
3240 		} else {
3241 			ret = 0;
3242 		}
3243 	}
3244 
3245 	if (!IS_ALIGNED(offset + len, sectorsize)) {
3246 		ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
3247 							    offset + len);
3248 		if (ret < 0)
3249 			goto out;
3250 		if (ret == RANGE_BOUNDARY_HOLE) {
3251 			alloc_end = round_up(offset + len, sectorsize);
3252 			ret = 0;
3253 		} else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
3254 			ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
3255 						   0, 1);
3256 			if (ret)
3257 				goto out;
3258 		} else {
3259 			ret = 0;
3260 		}
3261 	}
3262 
3263 reserve_space:
3264 	if (alloc_start < alloc_end) {
3265 		struct extent_state *cached_state = NULL;
3266 		const u64 lockstart = alloc_start;
3267 		const u64 lockend = alloc_end - 1;
3268 
3269 		bytes_to_reserve = alloc_end - alloc_start;
3270 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3271 						      bytes_to_reserve);
3272 		if (ret < 0)
3273 			goto out;
3274 		space_reserved = true;
3275 		ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
3276 						  &cached_state);
3277 		if (ret)
3278 			goto out;
3279 		ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
3280 						alloc_start, bytes_to_reserve);
3281 		if (ret) {
3282 			unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3283 					     lockend, &cached_state);
3284 			goto out;
3285 		}
3286 		ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3287 						alloc_end - alloc_start,
3288 						i_blocksize(inode),
3289 						offset + len, &alloc_hint);
3290 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
3291 				     lockend, &cached_state);
3292 		/* btrfs_prealloc_file_range releases reserved space on error */
3293 		if (ret) {
3294 			space_reserved = false;
3295 			goto out;
3296 		}
3297 	}
3298 	ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3299  out:
3300 	if (ret && space_reserved)
3301 		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3302 					       alloc_start, bytes_to_reserve);
3303 	extent_changeset_free(data_reserved);
3304 
3305 	return ret;
3306 }
3307 
3308 static long btrfs_fallocate(struct file *file, int mode,
3309 			    loff_t offset, loff_t len)
3310 {
3311 	struct inode *inode = file_inode(file);
3312 	struct extent_state *cached_state = NULL;
3313 	struct extent_changeset *data_reserved = NULL;
3314 	struct falloc_range *range;
3315 	struct falloc_range *tmp;
3316 	struct list_head reserve_list;
3317 	u64 cur_offset;
3318 	u64 last_byte;
3319 	u64 alloc_start;
3320 	u64 alloc_end;
3321 	u64 alloc_hint = 0;
3322 	u64 locked_end;
3323 	u64 actual_end = 0;
3324 	struct extent_map *em;
3325 	int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
3326 	int ret;
3327 
3328 	/* Do not allow fallocate in ZONED mode */
3329 	if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3330 		return -EOPNOTSUPP;
3331 
3332 	alloc_start = round_down(offset, blocksize);
3333 	alloc_end = round_up(offset + len, blocksize);
3334 	cur_offset = alloc_start;
3335 
3336 	/* Make sure we aren't being give some crap mode */
3337 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3338 		     FALLOC_FL_ZERO_RANGE))
3339 		return -EOPNOTSUPP;
3340 
3341 	if (mode & FALLOC_FL_PUNCH_HOLE)
3342 		return btrfs_punch_hole(inode, offset, len);
3343 
3344 	/*
3345 	 * Only trigger disk allocation, don't trigger qgroup reserve
3346 	 *
3347 	 * For qgroup space, it will be checked later.
3348 	 */
3349 	if (!(mode & FALLOC_FL_ZERO_RANGE)) {
3350 		ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3351 						      alloc_end - alloc_start);
3352 		if (ret < 0)
3353 			return ret;
3354 	}
3355 
3356 	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
3357 
3358 	if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3359 		ret = inode_newsize_ok(inode, offset + len);
3360 		if (ret)
3361 			goto out;
3362 	}
3363 
3364 	/*
3365 	 * TODO: Move these two operations after we have checked
3366 	 * accurate reserved space, or fallocate can still fail but
3367 	 * with page truncated or size expanded.
3368 	 *
3369 	 * But that's a minor problem and won't do much harm BTW.
3370 	 */
3371 	if (alloc_start > inode->i_size) {
3372 		ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3373 					alloc_start);
3374 		if (ret)
3375 			goto out;
3376 	} else if (offset + len > inode->i_size) {
3377 		/*
3378 		 * If we are fallocating from the end of the file onward we
3379 		 * need to zero out the end of the block if i_size lands in the
3380 		 * middle of a block.
3381 		 */
3382 		ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3383 		if (ret)
3384 			goto out;
3385 	}
3386 
3387 	/*
3388 	 * wait for ordered IO before we have any locks.  We'll loop again
3389 	 * below with the locks held.
3390 	 */
3391 	ret = btrfs_wait_ordered_range(inode, alloc_start,
3392 				       alloc_end - alloc_start);
3393 	if (ret)
3394 		goto out;
3395 
3396 	if (mode & FALLOC_FL_ZERO_RANGE) {
3397 		ret = btrfs_zero_range(inode, offset, len, mode);
3398 		btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3399 		return ret;
3400 	}
3401 
3402 	locked_end = alloc_end - 1;
3403 	while (1) {
3404 		struct btrfs_ordered_extent *ordered;
3405 
3406 		/* the extent lock is ordered inside the running
3407 		 * transaction
3408 		 */
3409 		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
3410 				 locked_end, &cached_state);
3411 		ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
3412 							    locked_end);
3413 
3414 		if (ordered &&
3415 		    ordered->file_offset + ordered->num_bytes > alloc_start &&
3416 		    ordered->file_offset < alloc_end) {
3417 			btrfs_put_ordered_extent(ordered);
3418 			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
3419 					     alloc_start, locked_end,
3420 					     &cached_state);
3421 			/*
3422 			 * we can't wait on the range with the transaction
3423 			 * running or with the extent lock held
3424 			 */
3425 			ret = btrfs_wait_ordered_range(inode, alloc_start,
3426 						       alloc_end - alloc_start);
3427 			if (ret)
3428 				goto out;
3429 		} else {
3430 			if (ordered)
3431 				btrfs_put_ordered_extent(ordered);
3432 			break;
3433 		}
3434 	}
3435 
3436 	/* First, check if we exceed the qgroup limit */
3437 	INIT_LIST_HEAD(&reserve_list);
3438 	while (cur_offset < alloc_end) {
3439 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3440 				      alloc_end - cur_offset);
3441 		if (IS_ERR(em)) {
3442 			ret = PTR_ERR(em);
3443 			break;
3444 		}
3445 		last_byte = min(extent_map_end(em), alloc_end);
3446 		actual_end = min_t(u64, extent_map_end(em), offset + len);
3447 		last_byte = ALIGN(last_byte, blocksize);
3448 		if (em->block_start == EXTENT_MAP_HOLE ||
3449 		    (cur_offset >= inode->i_size &&
3450 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3451 			ret = add_falloc_range(&reserve_list, cur_offset,
3452 					       last_byte - cur_offset);
3453 			if (ret < 0) {
3454 				free_extent_map(em);
3455 				break;
3456 			}
3457 			ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3458 					&data_reserved, cur_offset,
3459 					last_byte - cur_offset);
3460 			if (ret < 0) {
3461 				cur_offset = last_byte;
3462 				free_extent_map(em);
3463 				break;
3464 			}
3465 		} else {
3466 			/*
3467 			 * Do not need to reserve unwritten extent for this
3468 			 * range, free reserved data space first, otherwise
3469 			 * it'll result in false ENOSPC error.
3470 			 */
3471 			btrfs_free_reserved_data_space(BTRFS_I(inode),
3472 				data_reserved, cur_offset,
3473 				last_byte - cur_offset);
3474 		}
3475 		free_extent_map(em);
3476 		cur_offset = last_byte;
3477 	}
3478 
3479 	/*
3480 	 * If ret is still 0, means we're OK to fallocate.
3481 	 * Or just cleanup the list and exit.
3482 	 */
3483 	list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3484 		if (!ret)
3485 			ret = btrfs_prealloc_file_range(inode, mode,
3486 					range->start,
3487 					range->len, i_blocksize(inode),
3488 					offset + len, &alloc_hint);
3489 		else
3490 			btrfs_free_reserved_data_space(BTRFS_I(inode),
3491 					data_reserved, range->start,
3492 					range->len);
3493 		list_del(&range->list);
3494 		kfree(range);
3495 	}
3496 	if (ret < 0)
3497 		goto out_unlock;
3498 
3499 	/*
3500 	 * We didn't need to allocate any more space, but we still extended the
3501 	 * size of the file so we need to update i_size and the inode item.
3502 	 */
3503 	ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3504 out_unlock:
3505 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3506 			     &cached_state);
3507 out:
3508 	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
3509 	/* Let go of our reservation. */
3510 	if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
3511 		btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3512 				cur_offset, alloc_end - cur_offset);
3513 	extent_changeset_free(data_reserved);
3514 	return ret;
3515 }
3516 
3517 static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
3518 				  int whence)
3519 {
3520 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3521 	struct extent_map *em = NULL;
3522 	struct extent_state *cached_state = NULL;
3523 	loff_t i_size = inode->vfs_inode.i_size;
3524 	u64 lockstart;
3525 	u64 lockend;
3526 	u64 start;
3527 	u64 len;
3528 	int ret = 0;
3529 
3530 	if (i_size == 0 || offset >= i_size)
3531 		return -ENXIO;
3532 
3533 	/*
3534 	 * offset can be negative, in this case we start finding DATA/HOLE from
3535 	 * the very start of the file.
3536 	 */
3537 	start = max_t(loff_t, 0, offset);
3538 
3539 	lockstart = round_down(start, fs_info->sectorsize);
3540 	lockend = round_up(i_size, fs_info->sectorsize);
3541 	if (lockend <= lockstart)
3542 		lockend = lockstart + fs_info->sectorsize;
3543 	lockend--;
3544 	len = lockend - lockstart + 1;
3545 
3546 	lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
3547 
3548 	while (start < i_size) {
3549 		em = btrfs_get_extent_fiemap(inode, start, len);
3550 		if (IS_ERR(em)) {
3551 			ret = PTR_ERR(em);
3552 			em = NULL;
3553 			break;
3554 		}
3555 
3556 		if (whence == SEEK_HOLE &&
3557 		    (em->block_start == EXTENT_MAP_HOLE ||
3558 		     test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3559 			break;
3560 		else if (whence == SEEK_DATA &&
3561 			   (em->block_start != EXTENT_MAP_HOLE &&
3562 			    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)))
3563 			break;
3564 
3565 		start = em->start + em->len;
3566 		free_extent_map(em);
3567 		em = NULL;
3568 		cond_resched();
3569 	}
3570 	free_extent_map(em);
3571 	unlock_extent_cached(&inode->io_tree, lockstart, lockend,
3572 			     &cached_state);
3573 	if (ret) {
3574 		offset = ret;
3575 	} else {
3576 		if (whence == SEEK_DATA && start >= i_size)
3577 			offset = -ENXIO;
3578 		else
3579 			offset = min_t(loff_t, start, i_size);
3580 	}
3581 
3582 	return offset;
3583 }
3584 
3585 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3586 {
3587 	struct inode *inode = file->f_mapping->host;
3588 
3589 	switch (whence) {
3590 	default:
3591 		return generic_file_llseek(file, offset, whence);
3592 	case SEEK_DATA:
3593 	case SEEK_HOLE:
3594 		btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3595 		offset = find_desired_extent(BTRFS_I(inode), offset, whence);
3596 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3597 		break;
3598 	}
3599 
3600 	if (offset < 0)
3601 		return offset;
3602 
3603 	return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3604 }
3605 
3606 static int btrfs_file_open(struct inode *inode, struct file *filp)
3607 {
3608 	filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC;
3609 	return generic_file_open(inode, filp);
3610 }
3611 
3612 static int check_direct_read(struct btrfs_fs_info *fs_info,
3613 			     const struct iov_iter *iter, loff_t offset)
3614 {
3615 	int ret;
3616 	int i, seg;
3617 
3618 	ret = check_direct_IO(fs_info, iter, offset);
3619 	if (ret < 0)
3620 		return ret;
3621 
3622 	if (!iter_is_iovec(iter))
3623 		return 0;
3624 
3625 	for (seg = 0; seg < iter->nr_segs; seg++)
3626 		for (i = seg + 1; i < iter->nr_segs; i++)
3627 			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
3628 				return -EINVAL;
3629 	return 0;
3630 }
3631 
3632 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3633 {
3634 	struct inode *inode = file_inode(iocb->ki_filp);
3635 	ssize_t ret;
3636 
3637 	if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3638 		return 0;
3639 
3640 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
3641 	ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, 0);
3642 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
3643 	return ret;
3644 }
3645 
3646 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3647 {
3648 	ssize_t ret = 0;
3649 
3650 	if (iocb->ki_flags & IOCB_DIRECT) {
3651 		ret = btrfs_direct_read(iocb, to);
3652 		if (ret < 0 || !iov_iter_count(to) ||
3653 		    iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3654 			return ret;
3655 	}
3656 
3657 	return filemap_read(iocb, to, ret);
3658 }
3659 
3660 const struct file_operations btrfs_file_operations = {
3661 	.llseek		= btrfs_file_llseek,
3662 	.read_iter      = btrfs_file_read_iter,
3663 	.splice_read	= generic_file_splice_read,
3664 	.write_iter	= btrfs_file_write_iter,
3665 	.splice_write	= iter_file_splice_write,
3666 	.mmap		= btrfs_file_mmap,
3667 	.open		= btrfs_file_open,
3668 	.release	= btrfs_release_file,
3669 	.fsync		= btrfs_sync_file,
3670 	.fallocate	= btrfs_fallocate,
3671 	.unlocked_ioctl	= btrfs_ioctl,
3672 #ifdef CONFIG_COMPAT
3673 	.compat_ioctl	= btrfs_compat_ioctl,
3674 #endif
3675 	.remap_file_range = btrfs_remap_file_range,
3676 };
3677 
3678 void __cold btrfs_auto_defrag_exit(void)
3679 {
3680 	kmem_cache_destroy(btrfs_inode_defrag_cachep);
3681 }
3682 
3683 int __init btrfs_auto_defrag_init(void)
3684 {
3685 	btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
3686 					sizeof(struct inode_defrag), 0,
3687 					SLAB_MEM_SPREAD,
3688 					NULL);
3689 	if (!btrfs_inode_defrag_cachep)
3690 		return -ENOMEM;
3691 
3692 	return 0;
3693 }
3694 
3695 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3696 {
3697 	int ret;
3698 
3699 	/*
3700 	 * So with compression we will find and lock a dirty page and clear the
3701 	 * first one as dirty, setup an async extent, and immediately return
3702 	 * with the entire range locked but with nobody actually marked with
3703 	 * writeback.  So we can't just filemap_write_and_wait_range() and
3704 	 * expect it to work since it will just kick off a thread to do the
3705 	 * actual work.  So we need to call filemap_fdatawrite_range _again_
3706 	 * since it will wait on the page lock, which won't be unlocked until
3707 	 * after the pages have been marked as writeback and so we're good to go
3708 	 * from there.  We have to do this otherwise we'll miss the ordered
3709 	 * extents and that results in badness.  Please Josef, do not think you
3710 	 * know better and pull this out at some point in the future, it is
3711 	 * right and you are wrong.
3712 	 */
3713 	ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3714 	if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3715 			     &BTRFS_I(inode)->runtime_flags))
3716 		ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3717 
3718 	return ret;
3719 }
3720