xref: /openbmc/linux/fs/btrfs/inode.c (revision e4c881d2)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <linux/time.h>
15 #include <linux/init.h>
16 #include <linux/string.h>
17 #include <linux/backing-dev.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/xattr.h>
21 #include <linux/posix_acl.h>
22 #include <linux/falloc.h>
23 #include <linux/slab.h>
24 #include <linux/ratelimit.h>
25 #include <linux/btrfs.h>
26 #include <linux/blkdev.h>
27 #include <linux/posix_acl_xattr.h>
28 #include <linux/uio.h>
29 #include <linux/magic.h>
30 #include <linux/iversion.h>
31 #include <linux/swap.h>
32 #include <linux/migrate.h>
33 #include <linux/sched/mm.h>
34 #include <linux/iomap.h>
35 #include <asm/unaligned.h>
36 #include <linux/fsverity.h>
37 #include "misc.h"
38 #include "ctree.h"
39 #include "disk-io.h"
40 #include "transaction.h"
41 #include "btrfs_inode.h"
42 #include "print-tree.h"
43 #include "ordered-data.h"
44 #include "xattr.h"
45 #include "tree-log.h"
46 #include "bio.h"
47 #include "compression.h"
48 #include "locking.h"
49 #include "free-space-cache.h"
50 #include "props.h"
51 #include "qgroup.h"
52 #include "delalloc-space.h"
53 #include "block-group.h"
54 #include "space-info.h"
55 #include "zoned.h"
56 #include "subpage.h"
57 #include "inode-item.h"
58 #include "fs.h"
59 #include "accessors.h"
60 #include "extent-tree.h"
61 #include "root-tree.h"
62 #include "defrag.h"
63 #include "dir-item.h"
64 #include "file-item.h"
65 #include "uuid-tree.h"
66 #include "ioctl.h"
67 #include "file.h"
68 #include "acl.h"
69 #include "relocation.h"
70 #include "verity.h"
71 #include "super.h"
72 #include "orphan.h"
73 #include "backref.h"
74 
75 struct btrfs_iget_args {
76 	u64 ino;
77 	struct btrfs_root *root;
78 };
79 
80 struct btrfs_dio_data {
81 	ssize_t submitted;
82 	struct extent_changeset *data_reserved;
83 	struct btrfs_ordered_extent *ordered;
84 	bool data_space_reserved;
85 	bool nocow_done;
86 };
87 
88 struct btrfs_dio_private {
89 	/* Range of I/O */
90 	u64 file_offset;
91 	u32 bytes;
92 
93 	/* This must be last */
94 	struct btrfs_bio bbio;
95 };
96 
97 static struct bio_set btrfs_dio_bioset;
98 
99 struct btrfs_rename_ctx {
100 	/* Output field. Stores the index number of the old directory entry. */
101 	u64 index;
102 };
103 
104 /*
105  * Used by data_reloc_print_warning_inode() to pass needed info for filename
106  * resolution and output of error message.
107  */
108 struct data_reloc_warn {
109 	struct btrfs_path path;
110 	struct btrfs_fs_info *fs_info;
111 	u64 extent_item_size;
112 	u64 logical;
113 	int mirror_num;
114 };
115 
116 static const struct inode_operations btrfs_dir_inode_operations;
117 static const struct inode_operations btrfs_symlink_inode_operations;
118 static const struct inode_operations btrfs_special_inode_operations;
119 static const struct inode_operations btrfs_file_inode_operations;
120 static const struct address_space_operations btrfs_aops;
121 static const struct file_operations btrfs_dir_file_operations;
122 
123 static struct kmem_cache *btrfs_inode_cachep;
124 
125 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
126 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
127 
128 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
129 				     struct page *locked_page, u64 start,
130 				     u64 end, struct writeback_control *wbc,
131 				     bool pages_dirty);
132 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
133 				       u64 len, u64 orig_start, u64 block_start,
134 				       u64 block_len, u64 orig_block_len,
135 				       u64 ram_bytes, int compress_type,
136 				       int type);
137 
138 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
139 					  u64 root, void *warn_ctx)
140 {
141 	struct data_reloc_warn *warn = warn_ctx;
142 	struct btrfs_fs_info *fs_info = warn->fs_info;
143 	struct extent_buffer *eb;
144 	struct btrfs_inode_item *inode_item;
145 	struct inode_fs_paths *ipath = NULL;
146 	struct btrfs_root *local_root;
147 	struct btrfs_key key;
148 	unsigned int nofs_flag;
149 	u32 nlink;
150 	int ret;
151 
152 	local_root = btrfs_get_fs_root(fs_info, root, true);
153 	if (IS_ERR(local_root)) {
154 		ret = PTR_ERR(local_root);
155 		goto err;
156 	}
157 
158 	/* This makes the path point to (inum INODE_ITEM ioff). */
159 	key.objectid = inum;
160 	key.type = BTRFS_INODE_ITEM_KEY;
161 	key.offset = 0;
162 
163 	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
164 	if (ret) {
165 		btrfs_put_root(local_root);
166 		btrfs_release_path(&warn->path);
167 		goto err;
168 	}
169 
170 	eb = warn->path.nodes[0];
171 	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
172 	nlink = btrfs_inode_nlink(eb, inode_item);
173 	btrfs_release_path(&warn->path);
174 
175 	nofs_flag = memalloc_nofs_save();
176 	ipath = init_ipath(4096, local_root, &warn->path);
177 	memalloc_nofs_restore(nofs_flag);
178 	if (IS_ERR(ipath)) {
179 		btrfs_put_root(local_root);
180 		ret = PTR_ERR(ipath);
181 		ipath = NULL;
182 		/*
183 		 * -ENOMEM, not a critical error, just output an generic error
184 		 * without filename.
185 		 */
186 		btrfs_warn(fs_info,
187 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
188 			   warn->logical, warn->mirror_num, root, inum, offset);
189 		return ret;
190 	}
191 	ret = paths_from_inode(inum, ipath);
192 	if (ret < 0)
193 		goto err;
194 
195 	/*
196 	 * We deliberately ignore the bit ipath might have been too small to
197 	 * hold all of the paths here
198 	 */
199 	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
200 		btrfs_warn(fs_info,
201 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
202 			   warn->logical, warn->mirror_num, root, inum, offset,
203 			   fs_info->sectorsize, nlink,
204 			   (char *)(unsigned long)ipath->fspath->val[i]);
205 	}
206 
207 	btrfs_put_root(local_root);
208 	free_ipath(ipath);
209 	return 0;
210 
211 err:
212 	btrfs_warn(fs_info,
213 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
214 		   warn->logical, warn->mirror_num, root, inum, offset, ret);
215 
216 	free_ipath(ipath);
217 	return ret;
218 }
219 
220 /*
221  * Do extra user-friendly error output (e.g. lookup all the affected files).
222  *
223  * Return true if we succeeded doing the backref lookup.
224  * Return false if such lookup failed, and has to fallback to the old error message.
225  */
226 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
227 				   const u8 *csum, const u8 *csum_expected,
228 				   int mirror_num)
229 {
230 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
231 	struct btrfs_path path = { 0 };
232 	struct btrfs_key found_key = { 0 };
233 	struct extent_buffer *eb;
234 	struct btrfs_extent_item *ei;
235 	const u32 csum_size = fs_info->csum_size;
236 	u64 logical;
237 	u64 flags;
238 	u32 item_size;
239 	int ret;
240 
241 	mutex_lock(&fs_info->reloc_mutex);
242 	logical = btrfs_get_reloc_bg_bytenr(fs_info);
243 	mutex_unlock(&fs_info->reloc_mutex);
244 
245 	if (logical == U64_MAX) {
246 		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
247 		btrfs_warn_rl(fs_info,
248 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
249 			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
250 			CSUM_FMT_VALUE(csum_size, csum),
251 			CSUM_FMT_VALUE(csum_size, csum_expected),
252 			mirror_num);
253 		return;
254 	}
255 
256 	logical += file_off;
257 	btrfs_warn_rl(fs_info,
258 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
259 			inode->root->root_key.objectid,
260 			btrfs_ino(inode), file_off, logical,
261 			CSUM_FMT_VALUE(csum_size, csum),
262 			CSUM_FMT_VALUE(csum_size, csum_expected),
263 			mirror_num);
264 
265 	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
266 	if (ret < 0) {
267 		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
268 			     logical, ret);
269 		return;
270 	}
271 	eb = path.nodes[0];
272 	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
273 	item_size = btrfs_item_size(eb, path.slots[0]);
274 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
275 		unsigned long ptr = 0;
276 		u64 ref_root;
277 		u8 ref_level;
278 
279 		while (true) {
280 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
281 						      item_size, &ref_root,
282 						      &ref_level);
283 			if (ret < 0) {
284 				btrfs_warn_rl(fs_info,
285 				"failed to resolve tree backref for logical %llu: %d",
286 					      logical, ret);
287 				break;
288 			}
289 			if (ret > 0)
290 				break;
291 
292 			btrfs_warn_rl(fs_info,
293 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
294 				logical, mirror_num,
295 				(ref_level ? "node" : "leaf"),
296 				ref_level, ref_root);
297 		}
298 		btrfs_release_path(&path);
299 	} else {
300 		struct btrfs_backref_walk_ctx ctx = { 0 };
301 		struct data_reloc_warn reloc_warn = { 0 };
302 
303 		btrfs_release_path(&path);
304 
305 		ctx.bytenr = found_key.objectid;
306 		ctx.extent_item_pos = logical - found_key.objectid;
307 		ctx.fs_info = fs_info;
308 
309 		reloc_warn.logical = logical;
310 		reloc_warn.extent_item_size = found_key.offset;
311 		reloc_warn.mirror_num = mirror_num;
312 		reloc_warn.fs_info = fs_info;
313 
314 		iterate_extent_inodes(&ctx, true,
315 				      data_reloc_print_warning_inode, &reloc_warn);
316 	}
317 }
318 
319 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
320 		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
321 {
322 	struct btrfs_root *root = inode->root;
323 	const u32 csum_size = root->fs_info->csum_size;
324 
325 	/* For data reloc tree, it's better to do a backref lookup instead. */
326 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
327 		return print_data_reloc_error(inode, logical_start, csum,
328 					      csum_expected, mirror_num);
329 
330 	/* Output without objectid, which is more meaningful */
331 	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
332 		btrfs_warn_rl(root->fs_info,
333 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
334 			root->root_key.objectid, btrfs_ino(inode),
335 			logical_start,
336 			CSUM_FMT_VALUE(csum_size, csum),
337 			CSUM_FMT_VALUE(csum_size, csum_expected),
338 			mirror_num);
339 	} else {
340 		btrfs_warn_rl(root->fs_info,
341 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
342 			root->root_key.objectid, btrfs_ino(inode),
343 			logical_start,
344 			CSUM_FMT_VALUE(csum_size, csum),
345 			CSUM_FMT_VALUE(csum_size, csum_expected),
346 			mirror_num);
347 	}
348 }
349 
350 /*
351  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
352  *
353  * ilock_flags can have the following bit set:
354  *
355  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
356  * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
357  *		     return -EAGAIN
358  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
359  */
360 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
361 {
362 	if (ilock_flags & BTRFS_ILOCK_SHARED) {
363 		if (ilock_flags & BTRFS_ILOCK_TRY) {
364 			if (!inode_trylock_shared(&inode->vfs_inode))
365 				return -EAGAIN;
366 			else
367 				return 0;
368 		}
369 		inode_lock_shared(&inode->vfs_inode);
370 	} else {
371 		if (ilock_flags & BTRFS_ILOCK_TRY) {
372 			if (!inode_trylock(&inode->vfs_inode))
373 				return -EAGAIN;
374 			else
375 				return 0;
376 		}
377 		inode_lock(&inode->vfs_inode);
378 	}
379 	if (ilock_flags & BTRFS_ILOCK_MMAP)
380 		down_write(&inode->i_mmap_lock);
381 	return 0;
382 }
383 
384 /*
385  * btrfs_inode_unlock - unock inode i_rwsem
386  *
387  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
388  * to decide whether the lock acquired is shared or exclusive.
389  */
390 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
391 {
392 	if (ilock_flags & BTRFS_ILOCK_MMAP)
393 		up_write(&inode->i_mmap_lock);
394 	if (ilock_flags & BTRFS_ILOCK_SHARED)
395 		inode_unlock_shared(&inode->vfs_inode);
396 	else
397 		inode_unlock(&inode->vfs_inode);
398 }
399 
400 /*
401  * Cleanup all submitted ordered extents in specified range to handle errors
402  * from the btrfs_run_delalloc_range() callback.
403  *
404  * NOTE: caller must ensure that when an error happens, it can not call
405  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
406  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
407  * to be released, which we want to happen only when finishing the ordered
408  * extent (btrfs_finish_ordered_io()).
409  */
410 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
411 						 struct page *locked_page,
412 						 u64 offset, u64 bytes)
413 {
414 	unsigned long index = offset >> PAGE_SHIFT;
415 	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
416 	u64 page_start = 0, page_end = 0;
417 	struct page *page;
418 
419 	if (locked_page) {
420 		page_start = page_offset(locked_page);
421 		page_end = page_start + PAGE_SIZE - 1;
422 	}
423 
424 	while (index <= end_index) {
425 		/*
426 		 * For locked page, we will call btrfs_mark_ordered_io_finished
427 		 * through btrfs_mark_ordered_io_finished() on it
428 		 * in run_delalloc_range() for the error handling, which will
429 		 * clear page Ordered and run the ordered extent accounting.
430 		 *
431 		 * Here we can't just clear the Ordered bit, or
432 		 * btrfs_mark_ordered_io_finished() would skip the accounting
433 		 * for the page range, and the ordered extent will never finish.
434 		 */
435 		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
436 			index++;
437 			continue;
438 		}
439 		page = find_get_page(inode->vfs_inode.i_mapping, index);
440 		index++;
441 		if (!page)
442 			continue;
443 
444 		/*
445 		 * Here we just clear all Ordered bits for every page in the
446 		 * range, then btrfs_mark_ordered_io_finished() will handle
447 		 * the ordered extent accounting for the range.
448 		 */
449 		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
450 					       offset, bytes);
451 		put_page(page);
452 	}
453 
454 	if (locked_page) {
455 		/* The locked page covers the full range, nothing needs to be done */
456 		if (bytes + offset <= page_start + PAGE_SIZE)
457 			return;
458 		/*
459 		 * In case this page belongs to the delalloc range being
460 		 * instantiated then skip it, since the first page of a range is
461 		 * going to be properly cleaned up by the caller of
462 		 * run_delalloc_range
463 		 */
464 		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
465 			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
466 			offset = page_offset(locked_page) + PAGE_SIZE;
467 		}
468 	}
469 
470 	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
471 }
472 
473 static int btrfs_dirty_inode(struct btrfs_inode *inode);
474 
475 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
476 				     struct btrfs_new_inode_args *args)
477 {
478 	int err;
479 
480 	if (args->default_acl) {
481 		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
482 				      ACL_TYPE_DEFAULT);
483 		if (err)
484 			return err;
485 	}
486 	if (args->acl) {
487 		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
488 		if (err)
489 			return err;
490 	}
491 	if (!args->default_acl && !args->acl)
492 		cache_no_acl(args->inode);
493 	return btrfs_xattr_security_init(trans, args->inode, args->dir,
494 					 &args->dentry->d_name);
495 }
496 
497 /*
498  * this does all the hard work for inserting an inline extent into
499  * the btree.  The caller should have done a btrfs_drop_extents so that
500  * no overlapping inline items exist in the btree
501  */
502 static int insert_inline_extent(struct btrfs_trans_handle *trans,
503 				struct btrfs_path *path,
504 				struct btrfs_inode *inode, bool extent_inserted,
505 				size_t size, size_t compressed_size,
506 				int compress_type,
507 				struct page **compressed_pages,
508 				bool update_i_size)
509 {
510 	struct btrfs_root *root = inode->root;
511 	struct extent_buffer *leaf;
512 	struct page *page = NULL;
513 	char *kaddr;
514 	unsigned long ptr;
515 	struct btrfs_file_extent_item *ei;
516 	int ret;
517 	size_t cur_size = size;
518 	u64 i_size;
519 
520 	ASSERT((compressed_size > 0 && compressed_pages) ||
521 	       (compressed_size == 0 && !compressed_pages));
522 
523 	if (compressed_size && compressed_pages)
524 		cur_size = compressed_size;
525 
526 	if (!extent_inserted) {
527 		struct btrfs_key key;
528 		size_t datasize;
529 
530 		key.objectid = btrfs_ino(inode);
531 		key.offset = 0;
532 		key.type = BTRFS_EXTENT_DATA_KEY;
533 
534 		datasize = btrfs_file_extent_calc_inline_size(cur_size);
535 		ret = btrfs_insert_empty_item(trans, root, path, &key,
536 					      datasize);
537 		if (ret)
538 			goto fail;
539 	}
540 	leaf = path->nodes[0];
541 	ei = btrfs_item_ptr(leaf, path->slots[0],
542 			    struct btrfs_file_extent_item);
543 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
544 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
545 	btrfs_set_file_extent_encryption(leaf, ei, 0);
546 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
547 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
548 	ptr = btrfs_file_extent_inline_start(ei);
549 
550 	if (compress_type != BTRFS_COMPRESS_NONE) {
551 		struct page *cpage;
552 		int i = 0;
553 		while (compressed_size > 0) {
554 			cpage = compressed_pages[i];
555 			cur_size = min_t(unsigned long, compressed_size,
556 				       PAGE_SIZE);
557 
558 			kaddr = kmap_local_page(cpage);
559 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
560 			kunmap_local(kaddr);
561 
562 			i++;
563 			ptr += cur_size;
564 			compressed_size -= cur_size;
565 		}
566 		btrfs_set_file_extent_compression(leaf, ei,
567 						  compress_type);
568 	} else {
569 		page = find_get_page(inode->vfs_inode.i_mapping, 0);
570 		btrfs_set_file_extent_compression(leaf, ei, 0);
571 		kaddr = kmap_local_page(page);
572 		write_extent_buffer(leaf, kaddr, ptr, size);
573 		kunmap_local(kaddr);
574 		put_page(page);
575 	}
576 	btrfs_mark_buffer_dirty(trans, leaf);
577 	btrfs_release_path(path);
578 
579 	/*
580 	 * We align size to sectorsize for inline extents just for simplicity
581 	 * sake.
582 	 */
583 	ret = btrfs_inode_set_file_extent_range(inode, 0,
584 					ALIGN(size, root->fs_info->sectorsize));
585 	if (ret)
586 		goto fail;
587 
588 	/*
589 	 * We're an inline extent, so nobody can extend the file past i_size
590 	 * without locking a page we already have locked.
591 	 *
592 	 * We must do any i_size and inode updates before we unlock the pages.
593 	 * Otherwise we could end up racing with unlink.
594 	 */
595 	i_size = i_size_read(&inode->vfs_inode);
596 	if (update_i_size && size > i_size) {
597 		i_size_write(&inode->vfs_inode, size);
598 		i_size = size;
599 	}
600 	inode->disk_i_size = i_size;
601 
602 fail:
603 	return ret;
604 }
605 
606 
607 /*
608  * conditionally insert an inline extent into the file.  This
609  * does the checks required to make sure the data is small enough
610  * to fit as an inline extent.
611  */
612 static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
613 					  size_t compressed_size,
614 					  int compress_type,
615 					  struct page **compressed_pages,
616 					  bool update_i_size)
617 {
618 	struct btrfs_drop_extents_args drop_args = { 0 };
619 	struct btrfs_root *root = inode->root;
620 	struct btrfs_fs_info *fs_info = root->fs_info;
621 	struct btrfs_trans_handle *trans;
622 	u64 data_len = (compressed_size ?: size);
623 	int ret;
624 	struct btrfs_path *path;
625 
626 	/*
627 	 * We can create an inline extent if it ends at or beyond the current
628 	 * i_size, is no larger than a sector (decompressed), and the (possibly
629 	 * compressed) data fits in a leaf and the configured maximum inline
630 	 * size.
631 	 */
632 	if (size < i_size_read(&inode->vfs_inode) ||
633 	    size > fs_info->sectorsize ||
634 	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
635 	    data_len > fs_info->max_inline)
636 		return 1;
637 
638 	path = btrfs_alloc_path();
639 	if (!path)
640 		return -ENOMEM;
641 
642 	trans = btrfs_join_transaction(root);
643 	if (IS_ERR(trans)) {
644 		btrfs_free_path(path);
645 		return PTR_ERR(trans);
646 	}
647 	trans->block_rsv = &inode->block_rsv;
648 
649 	drop_args.path = path;
650 	drop_args.start = 0;
651 	drop_args.end = fs_info->sectorsize;
652 	drop_args.drop_cache = true;
653 	drop_args.replace_extent = true;
654 	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
655 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
656 	if (ret) {
657 		btrfs_abort_transaction(trans, ret);
658 		goto out;
659 	}
660 
661 	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
662 				   size, compressed_size, compress_type,
663 				   compressed_pages, update_i_size);
664 	if (ret && ret != -ENOSPC) {
665 		btrfs_abort_transaction(trans, ret);
666 		goto out;
667 	} else if (ret == -ENOSPC) {
668 		ret = 1;
669 		goto out;
670 	}
671 
672 	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
673 	ret = btrfs_update_inode(trans, root, inode);
674 	if (ret && ret != -ENOSPC) {
675 		btrfs_abort_transaction(trans, ret);
676 		goto out;
677 	} else if (ret == -ENOSPC) {
678 		ret = 1;
679 		goto out;
680 	}
681 
682 	btrfs_set_inode_full_sync(inode);
683 out:
684 	/*
685 	 * Don't forget to free the reserved space, as for inlined extent
686 	 * it won't count as data extent, free them directly here.
687 	 * And at reserve time, it's always aligned to page size, so
688 	 * just free one page here.
689 	 */
690 	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
691 	btrfs_free_path(path);
692 	btrfs_end_transaction(trans);
693 	return ret;
694 }
695 
696 struct async_extent {
697 	u64 start;
698 	u64 ram_size;
699 	u64 compressed_size;
700 	struct page **pages;
701 	unsigned long nr_pages;
702 	int compress_type;
703 	struct list_head list;
704 };
705 
706 struct async_chunk {
707 	struct btrfs_inode *inode;
708 	struct page *locked_page;
709 	u64 start;
710 	u64 end;
711 	blk_opf_t write_flags;
712 	struct list_head extents;
713 	struct cgroup_subsys_state *blkcg_css;
714 	struct btrfs_work work;
715 	struct async_cow *async_cow;
716 };
717 
718 struct async_cow {
719 	atomic_t num_chunks;
720 	struct async_chunk chunks[];
721 };
722 
723 static noinline int add_async_extent(struct async_chunk *cow,
724 				     u64 start, u64 ram_size,
725 				     u64 compressed_size,
726 				     struct page **pages,
727 				     unsigned long nr_pages,
728 				     int compress_type)
729 {
730 	struct async_extent *async_extent;
731 
732 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
733 	BUG_ON(!async_extent); /* -ENOMEM */
734 	async_extent->start = start;
735 	async_extent->ram_size = ram_size;
736 	async_extent->compressed_size = compressed_size;
737 	async_extent->pages = pages;
738 	async_extent->nr_pages = nr_pages;
739 	async_extent->compress_type = compress_type;
740 	list_add_tail(&async_extent->list, &cow->extents);
741 	return 0;
742 }
743 
744 /*
745  * Check if the inode needs to be submitted to compression, based on mount
746  * options, defragmentation, properties or heuristics.
747  */
748 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
749 				      u64 end)
750 {
751 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
752 
753 	if (!btrfs_inode_can_compress(inode)) {
754 		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
755 			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
756 			btrfs_ino(inode));
757 		return 0;
758 	}
759 	/*
760 	 * Special check for subpage.
761 	 *
762 	 * We lock the full page then run each delalloc range in the page, thus
763 	 * for the following case, we will hit some subpage specific corner case:
764 	 *
765 	 * 0		32K		64K
766 	 * |	|///////|	|///////|
767 	 *		\- A		\- B
768 	 *
769 	 * In above case, both range A and range B will try to unlock the full
770 	 * page [0, 64K), causing the one finished later will have page
771 	 * unlocked already, triggering various page lock requirement BUG_ON()s.
772 	 *
773 	 * So here we add an artificial limit that subpage compression can only
774 	 * if the range is fully page aligned.
775 	 *
776 	 * In theory we only need to ensure the first page is fully covered, but
777 	 * the tailing partial page will be locked until the full compression
778 	 * finishes, delaying the write of other range.
779 	 *
780 	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
781 	 * first to prevent any submitted async extent to unlock the full page.
782 	 * By this, we can ensure for subpage case that only the last async_cow
783 	 * will unlock the full page.
784 	 */
785 	if (fs_info->sectorsize < PAGE_SIZE) {
786 		if (!PAGE_ALIGNED(start) ||
787 		    !PAGE_ALIGNED(end + 1))
788 			return 0;
789 	}
790 
791 	/* force compress */
792 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
793 		return 1;
794 	/* defrag ioctl */
795 	if (inode->defrag_compress)
796 		return 1;
797 	/* bad compression ratios */
798 	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
799 		return 0;
800 	if (btrfs_test_opt(fs_info, COMPRESS) ||
801 	    inode->flags & BTRFS_INODE_COMPRESS ||
802 	    inode->prop_compress)
803 		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
804 	return 0;
805 }
806 
807 static inline void inode_should_defrag(struct btrfs_inode *inode,
808 		u64 start, u64 end, u64 num_bytes, u32 small_write)
809 {
810 	/* If this is a small write inside eof, kick off a defrag */
811 	if (num_bytes < small_write &&
812 	    (start > 0 || end + 1 < inode->disk_i_size))
813 		btrfs_add_inode_defrag(NULL, inode, small_write);
814 }
815 
816 /*
817  * Work queue call back to started compression on a file and pages.
818  *
819  * This is done inside an ordered work queue, and the compression is spread
820  * across many cpus.  The actual IO submission is step two, and the ordered work
821  * queue takes care of making sure that happens in the same order things were
822  * put onto the queue by writepages and friends.
823  *
824  * If this code finds it can't get good compression, it puts an entry onto the
825  * work queue to write the uncompressed bytes.  This makes sure that both
826  * compressed inodes and uncompressed inodes are written in the same order that
827  * the flusher thread sent them down.
828  */
829 static void compress_file_range(struct btrfs_work *work)
830 {
831 	struct async_chunk *async_chunk =
832 		container_of(work, struct async_chunk, work);
833 	struct btrfs_inode *inode = async_chunk->inode;
834 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
835 	struct address_space *mapping = inode->vfs_inode.i_mapping;
836 	u64 blocksize = fs_info->sectorsize;
837 	u64 start = async_chunk->start;
838 	u64 end = async_chunk->end;
839 	u64 actual_end;
840 	u64 i_size;
841 	int ret = 0;
842 	struct page **pages;
843 	unsigned long nr_pages;
844 	unsigned long total_compressed = 0;
845 	unsigned long total_in = 0;
846 	unsigned int poff;
847 	int i;
848 	int compress_type = fs_info->compress_type;
849 
850 	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
851 
852 	/*
853 	 * We need to call clear_page_dirty_for_io on each page in the range.
854 	 * Otherwise applications with the file mmap'd can wander in and change
855 	 * the page contents while we are compressing them.
856 	 */
857 	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
858 
859 	/*
860 	 * We need to save i_size before now because it could change in between
861 	 * us evaluating the size and assigning it.  This is because we lock and
862 	 * unlock the page in truncate and fallocate, and then modify the i_size
863 	 * later on.
864 	 *
865 	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
866 	 * does that for us.
867 	 */
868 	barrier();
869 	i_size = i_size_read(&inode->vfs_inode);
870 	barrier();
871 	actual_end = min_t(u64, i_size, end + 1);
872 again:
873 	pages = NULL;
874 	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
875 	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
876 
877 	/*
878 	 * we don't want to send crud past the end of i_size through
879 	 * compression, that's just a waste of CPU time.  So, if the
880 	 * end of the file is before the start of our current
881 	 * requested range of bytes, we bail out to the uncompressed
882 	 * cleanup code that can deal with all of this.
883 	 *
884 	 * It isn't really the fastest way to fix things, but this is a
885 	 * very uncommon corner.
886 	 */
887 	if (actual_end <= start)
888 		goto cleanup_and_bail_uncompressed;
889 
890 	total_compressed = actual_end - start;
891 
892 	/*
893 	 * Skip compression for a small file range(<=blocksize) that
894 	 * isn't an inline extent, since it doesn't save disk space at all.
895 	 */
896 	if (total_compressed <= blocksize &&
897 	   (start > 0 || end + 1 < inode->disk_i_size))
898 		goto cleanup_and_bail_uncompressed;
899 
900 	/*
901 	 * For subpage case, we require full page alignment for the sector
902 	 * aligned range.
903 	 * Thus we must also check against @actual_end, not just @end.
904 	 */
905 	if (blocksize < PAGE_SIZE) {
906 		if (!PAGE_ALIGNED(start) ||
907 		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
908 			goto cleanup_and_bail_uncompressed;
909 	}
910 
911 	total_compressed = min_t(unsigned long, total_compressed,
912 			BTRFS_MAX_UNCOMPRESSED);
913 	total_in = 0;
914 	ret = 0;
915 
916 	/*
917 	 * We do compression for mount -o compress and when the inode has not
918 	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
919 	 * discover bad compression ratios.
920 	 */
921 	if (!inode_need_compress(inode, start, end))
922 		goto cleanup_and_bail_uncompressed;
923 
924 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
925 	if (!pages) {
926 		/*
927 		 * Memory allocation failure is not a fatal error, we can fall
928 		 * back to uncompressed code.
929 		 */
930 		goto cleanup_and_bail_uncompressed;
931 	}
932 
933 	if (inode->defrag_compress)
934 		compress_type = inode->defrag_compress;
935 	else if (inode->prop_compress)
936 		compress_type = inode->prop_compress;
937 
938 	/* Compression level is applied here. */
939 	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
940 				   mapping, start, pages, &nr_pages, &total_in,
941 				   &total_compressed);
942 	if (ret)
943 		goto mark_incompressible;
944 
945 	/*
946 	 * Zero the tail end of the last page, as we might be sending it down
947 	 * to disk.
948 	 */
949 	poff = offset_in_page(total_compressed);
950 	if (poff)
951 		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
952 
953 	/*
954 	 * Try to create an inline extent.
955 	 *
956 	 * If we didn't compress the entire range, try to create an uncompressed
957 	 * inline extent, else a compressed one.
958 	 *
959 	 * Check cow_file_range() for why we don't even try to create inline
960 	 * extent for the subpage case.
961 	 */
962 	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
963 		if (total_in < actual_end) {
964 			ret = cow_file_range_inline(inode, actual_end, 0,
965 						    BTRFS_COMPRESS_NONE, NULL,
966 						    false);
967 		} else {
968 			ret = cow_file_range_inline(inode, actual_end,
969 						    total_compressed,
970 						    compress_type, pages,
971 						    false);
972 		}
973 		if (ret <= 0) {
974 			unsigned long clear_flags = EXTENT_DELALLOC |
975 				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
976 				EXTENT_DO_ACCOUNTING;
977 
978 			if (ret < 0)
979 				mapping_set_error(mapping, -EIO);
980 
981 			/*
982 			 * inline extent creation worked or returned error,
983 			 * we don't need to create any more async work items.
984 			 * Unlock and free up our temp pages.
985 			 *
986 			 * We use DO_ACCOUNTING here because we need the
987 			 * delalloc_release_metadata to be done _after_ we drop
988 			 * our outstanding extent for clearing delalloc for this
989 			 * range.
990 			 */
991 			extent_clear_unlock_delalloc(inode, start, end,
992 						     NULL,
993 						     clear_flags,
994 						     PAGE_UNLOCK |
995 						     PAGE_START_WRITEBACK |
996 						     PAGE_END_WRITEBACK);
997 			goto free_pages;
998 		}
999 	}
1000 
1001 	/*
1002 	 * We aren't doing an inline extent. Round the compressed size up to a
1003 	 * block size boundary so the allocator does sane things.
1004 	 */
1005 	total_compressed = ALIGN(total_compressed, blocksize);
1006 
1007 	/*
1008 	 * One last check to make sure the compression is really a win, compare
1009 	 * the page count read with the blocks on disk, compression must free at
1010 	 * least one sector.
1011 	 */
1012 	total_in = round_up(total_in, fs_info->sectorsize);
1013 	if (total_compressed + blocksize > total_in)
1014 		goto mark_incompressible;
1015 
1016 	/*
1017 	 * The async work queues will take care of doing actual allocation on
1018 	 * disk for these compressed pages, and will submit the bios.
1019 	 */
1020 	add_async_extent(async_chunk, start, total_in, total_compressed, pages,
1021 			 nr_pages, compress_type);
1022 	if (start + total_in < end) {
1023 		start += total_in;
1024 		cond_resched();
1025 		goto again;
1026 	}
1027 	return;
1028 
1029 mark_incompressible:
1030 	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1031 		inode->flags |= BTRFS_INODE_NOCOMPRESS;
1032 cleanup_and_bail_uncompressed:
1033 	add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1034 			 BTRFS_COMPRESS_NONE);
1035 free_pages:
1036 	if (pages) {
1037 		for (i = 0; i < nr_pages; i++) {
1038 			WARN_ON(pages[i]->mapping);
1039 			put_page(pages[i]);
1040 		}
1041 		kfree(pages);
1042 	}
1043 }
1044 
1045 static void free_async_extent_pages(struct async_extent *async_extent)
1046 {
1047 	int i;
1048 
1049 	if (!async_extent->pages)
1050 		return;
1051 
1052 	for (i = 0; i < async_extent->nr_pages; i++) {
1053 		WARN_ON(async_extent->pages[i]->mapping);
1054 		put_page(async_extent->pages[i]);
1055 	}
1056 	kfree(async_extent->pages);
1057 	async_extent->nr_pages = 0;
1058 	async_extent->pages = NULL;
1059 }
1060 
1061 static void submit_uncompressed_range(struct btrfs_inode *inode,
1062 				      struct async_extent *async_extent,
1063 				      struct page *locked_page)
1064 {
1065 	u64 start = async_extent->start;
1066 	u64 end = async_extent->start + async_extent->ram_size - 1;
1067 	int ret;
1068 	struct writeback_control wbc = {
1069 		.sync_mode		= WB_SYNC_ALL,
1070 		.range_start		= start,
1071 		.range_end		= end,
1072 		.no_cgroup_owner	= 1,
1073 	};
1074 
1075 	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1076 	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
1077 	wbc_detach_inode(&wbc);
1078 	if (ret < 0) {
1079 		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
1080 		if (locked_page) {
1081 			const u64 page_start = page_offset(locked_page);
1082 
1083 			set_page_writeback(locked_page);
1084 			end_page_writeback(locked_page);
1085 			btrfs_mark_ordered_io_finished(inode, locked_page,
1086 						       page_start, PAGE_SIZE,
1087 						       !ret);
1088 			mapping_set_error(locked_page->mapping, ret);
1089 			unlock_page(locked_page);
1090 		}
1091 	}
1092 }
1093 
1094 static void submit_one_async_extent(struct async_chunk *async_chunk,
1095 				    struct async_extent *async_extent,
1096 				    u64 *alloc_hint)
1097 {
1098 	struct btrfs_inode *inode = async_chunk->inode;
1099 	struct extent_io_tree *io_tree = &inode->io_tree;
1100 	struct btrfs_root *root = inode->root;
1101 	struct btrfs_fs_info *fs_info = root->fs_info;
1102 	struct btrfs_ordered_extent *ordered;
1103 	struct btrfs_key ins;
1104 	struct page *locked_page = NULL;
1105 	struct extent_map *em;
1106 	int ret = 0;
1107 	u64 start = async_extent->start;
1108 	u64 end = async_extent->start + async_extent->ram_size - 1;
1109 
1110 	if (async_chunk->blkcg_css)
1111 		kthread_associate_blkcg(async_chunk->blkcg_css);
1112 
1113 	/*
1114 	 * If async_chunk->locked_page is in the async_extent range, we need to
1115 	 * handle it.
1116 	 */
1117 	if (async_chunk->locked_page) {
1118 		u64 locked_page_start = page_offset(async_chunk->locked_page);
1119 		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
1120 
1121 		if (!(start >= locked_page_end || end <= locked_page_start))
1122 			locked_page = async_chunk->locked_page;
1123 	}
1124 	lock_extent(io_tree, start, end, NULL);
1125 
1126 	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1127 		submit_uncompressed_range(inode, async_extent, locked_page);
1128 		goto done;
1129 	}
1130 
1131 	ret = btrfs_reserve_extent(root, async_extent->ram_size,
1132 				   async_extent->compressed_size,
1133 				   async_extent->compressed_size,
1134 				   0, *alloc_hint, &ins, 1, 1);
1135 	if (ret) {
1136 		/*
1137 		 * We can't reserve contiguous space for the compressed size.
1138 		 * Unlikely, but it's possible that we could have enough
1139 		 * non-contiguous space for the uncompressed size instead.  So
1140 		 * fall back to uncompressed.
1141 		 */
1142 		submit_uncompressed_range(inode, async_extent, locked_page);
1143 		goto done;
1144 	}
1145 
1146 	/* Here we're doing allocation and writeback of the compressed pages */
1147 	em = create_io_em(inode, start,
1148 			  async_extent->ram_size,	/* len */
1149 			  start,			/* orig_start */
1150 			  ins.objectid,			/* block_start */
1151 			  ins.offset,			/* block_len */
1152 			  ins.offset,			/* orig_block_len */
1153 			  async_extent->ram_size,	/* ram_bytes */
1154 			  async_extent->compress_type,
1155 			  BTRFS_ORDERED_COMPRESSED);
1156 	if (IS_ERR(em)) {
1157 		ret = PTR_ERR(em);
1158 		goto out_free_reserve;
1159 	}
1160 	free_extent_map(em);
1161 
1162 	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
1163 				       async_extent->ram_size,	/* num_bytes */
1164 				       async_extent->ram_size,	/* ram_bytes */
1165 				       ins.objectid,		/* disk_bytenr */
1166 				       ins.offset,		/* disk_num_bytes */
1167 				       0,			/* offset */
1168 				       1 << BTRFS_ORDERED_COMPRESSED,
1169 				       async_extent->compress_type);
1170 	if (IS_ERR(ordered)) {
1171 		btrfs_drop_extent_map_range(inode, start, end, false);
1172 		ret = PTR_ERR(ordered);
1173 		goto out_free_reserve;
1174 	}
1175 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1176 
1177 	/* Clear dirty, set writeback and unlock the pages. */
1178 	extent_clear_unlock_delalloc(inode, start, end,
1179 			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
1180 			PAGE_UNLOCK | PAGE_START_WRITEBACK);
1181 	btrfs_submit_compressed_write(ordered,
1182 			    async_extent->pages,	/* compressed_pages */
1183 			    async_extent->nr_pages,
1184 			    async_chunk->write_flags, true);
1185 	*alloc_hint = ins.objectid + ins.offset;
1186 done:
1187 	if (async_chunk->blkcg_css)
1188 		kthread_associate_blkcg(NULL);
1189 	kfree(async_extent);
1190 	return;
1191 
1192 out_free_reserve:
1193 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1194 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1195 	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1196 	extent_clear_unlock_delalloc(inode, start, end,
1197 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1198 				     EXTENT_DELALLOC_NEW |
1199 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1200 				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1201 				     PAGE_END_WRITEBACK);
1202 	free_async_extent_pages(async_extent);
1203 	if (async_chunk->blkcg_css)
1204 		kthread_associate_blkcg(NULL);
1205 	btrfs_debug(fs_info,
1206 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1207 		    root->root_key.objectid, btrfs_ino(inode), start,
1208 		    async_extent->ram_size, ret);
1209 	kfree(async_extent);
1210 }
1211 
1212 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1213 				      u64 num_bytes)
1214 {
1215 	struct extent_map_tree *em_tree = &inode->extent_tree;
1216 	struct extent_map *em;
1217 	u64 alloc_hint = 0;
1218 
1219 	read_lock(&em_tree->lock);
1220 	em = search_extent_mapping(em_tree, start, num_bytes);
1221 	if (em) {
1222 		/*
1223 		 * if block start isn't an actual block number then find the
1224 		 * first block in this inode and use that as a hint.  If that
1225 		 * block is also bogus then just don't worry about it.
1226 		 */
1227 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1228 			free_extent_map(em);
1229 			em = search_extent_mapping(em_tree, 0, 0);
1230 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1231 				alloc_hint = em->block_start;
1232 			if (em)
1233 				free_extent_map(em);
1234 		} else {
1235 			alloc_hint = em->block_start;
1236 			free_extent_map(em);
1237 		}
1238 	}
1239 	read_unlock(&em_tree->lock);
1240 
1241 	return alloc_hint;
1242 }
1243 
1244 /*
1245  * when extent_io.c finds a delayed allocation range in the file,
1246  * the call backs end up in this code.  The basic idea is to
1247  * allocate extents on disk for the range, and create ordered data structs
1248  * in ram to track those extents.
1249  *
1250  * locked_page is the page that writepage had locked already.  We use
1251  * it to make sure we don't do extra locks or unlocks.
1252  *
1253  * When this function fails, it unlocks all pages except @locked_page.
1254  *
1255  * When this function successfully creates an inline extent, it returns 1 and
1256  * unlocks all pages including locked_page and starts I/O on them.
1257  * (In reality inline extents are limited to a single page, so locked_page is
1258  * the only page handled anyway).
1259  *
1260  * When this function succeed and creates a normal extent, the page locking
1261  * status depends on the passed in flags:
1262  *
1263  * - If @keep_locked is set, all pages are kept locked.
1264  * - Else all pages except for @locked_page are unlocked.
1265  *
1266  * When a failure happens in the second or later iteration of the
1267  * while-loop, the ordered extents created in previous iterations are kept
1268  * intact. So, the caller must clean them up by calling
1269  * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1270  * example.
1271  */
1272 static noinline int cow_file_range(struct btrfs_inode *inode,
1273 				   struct page *locked_page, u64 start, u64 end,
1274 				   u64 *done_offset,
1275 				   bool keep_locked, bool no_inline)
1276 {
1277 	struct btrfs_root *root = inode->root;
1278 	struct btrfs_fs_info *fs_info = root->fs_info;
1279 	u64 alloc_hint = 0;
1280 	u64 orig_start = start;
1281 	u64 num_bytes;
1282 	unsigned long ram_size;
1283 	u64 cur_alloc_size = 0;
1284 	u64 min_alloc_size;
1285 	u64 blocksize = fs_info->sectorsize;
1286 	struct btrfs_key ins;
1287 	struct extent_map *em;
1288 	unsigned clear_bits;
1289 	unsigned long page_ops;
1290 	bool extent_reserved = false;
1291 	int ret = 0;
1292 
1293 	if (btrfs_is_free_space_inode(inode)) {
1294 		ret = -EINVAL;
1295 		goto out_unlock;
1296 	}
1297 
1298 	num_bytes = ALIGN(end - start + 1, blocksize);
1299 	num_bytes = max(blocksize,  num_bytes);
1300 	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1301 
1302 	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1303 
1304 	/*
1305 	 * Due to the page size limit, for subpage we can only trigger the
1306 	 * writeback for the dirty sectors of page, that means data writeback
1307 	 * is doing more writeback than what we want.
1308 	 *
1309 	 * This is especially unexpected for some call sites like fallocate,
1310 	 * where we only increase i_size after everything is done.
1311 	 * This means we can trigger inline extent even if we didn't want to.
1312 	 * So here we skip inline extent creation completely.
1313 	 */
1314 	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
1315 		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1316 				       end + 1);
1317 
1318 		/* lets try to make an inline extent */
1319 		ret = cow_file_range_inline(inode, actual_end, 0,
1320 					    BTRFS_COMPRESS_NONE, NULL, false);
1321 		if (ret == 0) {
1322 			/*
1323 			 * We use DO_ACCOUNTING here because we need the
1324 			 * delalloc_release_metadata to be run _after_ we drop
1325 			 * our outstanding extent for clearing delalloc for this
1326 			 * range.
1327 			 */
1328 			extent_clear_unlock_delalloc(inode, start, end,
1329 				     locked_page,
1330 				     EXTENT_LOCKED | EXTENT_DELALLOC |
1331 				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1332 				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1333 				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1334 			/*
1335 			 * locked_page is locked by the caller of
1336 			 * writepage_delalloc(), not locked by
1337 			 * __process_pages_contig().
1338 			 *
1339 			 * We can't let __process_pages_contig() to unlock it,
1340 			 * as it doesn't have any subpage::writers recorded.
1341 			 *
1342 			 * Here we manually unlock the page, since the caller
1343 			 * can't determine if it's an inline extent or a
1344 			 * compressed extent.
1345 			 */
1346 			unlock_page(locked_page);
1347 			ret = 1;
1348 			goto done;
1349 		} else if (ret < 0) {
1350 			goto out_unlock;
1351 		}
1352 	}
1353 
1354 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1355 
1356 	/*
1357 	 * Relocation relies on the relocated extents to have exactly the same
1358 	 * size as the original extents. Normally writeback for relocation data
1359 	 * extents follows a NOCOW path because relocation preallocates the
1360 	 * extents. However, due to an operation such as scrub turning a block
1361 	 * group to RO mode, it may fallback to COW mode, so we must make sure
1362 	 * an extent allocated during COW has exactly the requested size and can
1363 	 * not be split into smaller extents, otherwise relocation breaks and
1364 	 * fails during the stage where it updates the bytenr of file extent
1365 	 * items.
1366 	 */
1367 	if (btrfs_is_data_reloc_root(root))
1368 		min_alloc_size = num_bytes;
1369 	else
1370 		min_alloc_size = fs_info->sectorsize;
1371 
1372 	while (num_bytes > 0) {
1373 		struct btrfs_ordered_extent *ordered;
1374 
1375 		cur_alloc_size = num_bytes;
1376 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1377 					   min_alloc_size, 0, alloc_hint,
1378 					   &ins, 1, 1);
1379 		if (ret == -EAGAIN) {
1380 			/*
1381 			 * btrfs_reserve_extent only returns -EAGAIN for zoned
1382 			 * file systems, which is an indication that there are
1383 			 * no active zones to allocate from at the moment.
1384 			 *
1385 			 * If this is the first loop iteration, wait for at
1386 			 * least one zone to finish before retrying the
1387 			 * allocation.  Otherwise ask the caller to write out
1388 			 * the already allocated blocks before coming back to
1389 			 * us, or return -ENOSPC if it can't handle retries.
1390 			 */
1391 			ASSERT(btrfs_is_zoned(fs_info));
1392 			if (start == orig_start) {
1393 				wait_on_bit_io(&inode->root->fs_info->flags,
1394 					       BTRFS_FS_NEED_ZONE_FINISH,
1395 					       TASK_UNINTERRUPTIBLE);
1396 				continue;
1397 			}
1398 			if (done_offset) {
1399 				*done_offset = start - 1;
1400 				return 0;
1401 			}
1402 			ret = -ENOSPC;
1403 		}
1404 		if (ret < 0)
1405 			goto out_unlock;
1406 		cur_alloc_size = ins.offset;
1407 		extent_reserved = true;
1408 
1409 		ram_size = ins.offset;
1410 		em = create_io_em(inode, start, ins.offset, /* len */
1411 				  start, /* orig_start */
1412 				  ins.objectid, /* block_start */
1413 				  ins.offset, /* block_len */
1414 				  ins.offset, /* orig_block_len */
1415 				  ram_size, /* ram_bytes */
1416 				  BTRFS_COMPRESS_NONE, /* compress_type */
1417 				  BTRFS_ORDERED_REGULAR /* type */);
1418 		if (IS_ERR(em)) {
1419 			ret = PTR_ERR(em);
1420 			goto out_reserve;
1421 		}
1422 		free_extent_map(em);
1423 
1424 		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
1425 					ram_size, ins.objectid, cur_alloc_size,
1426 					0, 1 << BTRFS_ORDERED_REGULAR,
1427 					BTRFS_COMPRESS_NONE);
1428 		if (IS_ERR(ordered)) {
1429 			ret = PTR_ERR(ordered);
1430 			goto out_drop_extent_cache;
1431 		}
1432 
1433 		if (btrfs_is_data_reloc_root(root)) {
1434 			ret = btrfs_reloc_clone_csums(ordered);
1435 
1436 			/*
1437 			 * Only drop cache here, and process as normal.
1438 			 *
1439 			 * We must not allow extent_clear_unlock_delalloc()
1440 			 * at out_unlock label to free meta of this ordered
1441 			 * extent, as its meta should be freed by
1442 			 * btrfs_finish_ordered_io().
1443 			 *
1444 			 * So we must continue until @start is increased to
1445 			 * skip current ordered extent.
1446 			 */
1447 			if (ret)
1448 				btrfs_drop_extent_map_range(inode, start,
1449 							    start + ram_size - 1,
1450 							    false);
1451 		}
1452 		btrfs_put_ordered_extent(ordered);
1453 
1454 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1455 
1456 		/*
1457 		 * We're not doing compressed IO, don't unlock the first page
1458 		 * (which the caller expects to stay locked), don't clear any
1459 		 * dirty bits and don't set any writeback bits
1460 		 *
1461 		 * Do set the Ordered (Private2) bit so we know this page was
1462 		 * properly setup for writepage.
1463 		 */
1464 		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1465 		page_ops |= PAGE_SET_ORDERED;
1466 
1467 		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1468 					     locked_page,
1469 					     EXTENT_LOCKED | EXTENT_DELALLOC,
1470 					     page_ops);
1471 		if (num_bytes < cur_alloc_size)
1472 			num_bytes = 0;
1473 		else
1474 			num_bytes -= cur_alloc_size;
1475 		alloc_hint = ins.objectid + ins.offset;
1476 		start += cur_alloc_size;
1477 		extent_reserved = false;
1478 
1479 		/*
1480 		 * btrfs_reloc_clone_csums() error, since start is increased
1481 		 * extent_clear_unlock_delalloc() at out_unlock label won't
1482 		 * free metadata of current ordered extent, we're OK to exit.
1483 		 */
1484 		if (ret)
1485 			goto out_unlock;
1486 	}
1487 done:
1488 	if (done_offset)
1489 		*done_offset = end;
1490 	return ret;
1491 
1492 out_drop_extent_cache:
1493 	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1494 out_reserve:
1495 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1496 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1497 out_unlock:
1498 	/*
1499 	 * Now, we have three regions to clean up:
1500 	 *
1501 	 * |-------(1)----|---(2)---|-------------(3)----------|
1502 	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1503 	 *
1504 	 * We process each region below.
1505 	 */
1506 
1507 	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1508 		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1509 	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1510 
1511 	/*
1512 	 * For the range (1). We have already instantiated the ordered extents
1513 	 * for this region. They are cleaned up by
1514 	 * btrfs_cleanup_ordered_extents() in e.g,
1515 	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1516 	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1517 	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1518 	 * function.
1519 	 *
1520 	 * However, in case of @keep_locked, we still need to unlock the pages
1521 	 * (except @locked_page) to ensure all the pages are unlocked.
1522 	 */
1523 	if (keep_locked && orig_start < start) {
1524 		if (!locked_page)
1525 			mapping_set_error(inode->vfs_inode.i_mapping, ret);
1526 		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1527 					     locked_page, 0, page_ops);
1528 	}
1529 
1530 	/*
1531 	 * For the range (2). If we reserved an extent for our delalloc range
1532 	 * (or a subrange) and failed to create the respective ordered extent,
1533 	 * then it means that when we reserved the extent we decremented the
1534 	 * extent's size from the data space_info's bytes_may_use counter and
1535 	 * incremented the space_info's bytes_reserved counter by the same
1536 	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1537 	 * to decrement again the data space_info's bytes_may_use counter,
1538 	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1539 	 */
1540 	if (extent_reserved) {
1541 		extent_clear_unlock_delalloc(inode, start,
1542 					     start + cur_alloc_size - 1,
1543 					     locked_page,
1544 					     clear_bits,
1545 					     page_ops);
1546 		start += cur_alloc_size;
1547 	}
1548 
1549 	/*
1550 	 * For the range (3). We never touched the region. In addition to the
1551 	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1552 	 * space_info's bytes_may_use counter, reserved in
1553 	 * btrfs_check_data_free_space().
1554 	 */
1555 	if (start < end) {
1556 		clear_bits |= EXTENT_CLEAR_DATA_RESV;
1557 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1558 					     clear_bits, page_ops);
1559 	}
1560 	return ret;
1561 }
1562 
1563 /*
1564  * Phase two of compressed writeback.  This is the ordered portion of the code,
1565  * which only gets called in the order the work was queued.  We walk all the
1566  * async extents created by compress_file_range and send them down to the disk.
1567  */
1568 static noinline void submit_compressed_extents(struct btrfs_work *work)
1569 {
1570 	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1571 						     work);
1572 	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1573 	struct async_extent *async_extent;
1574 	unsigned long nr_pages;
1575 	u64 alloc_hint = 0;
1576 
1577 	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1578 		PAGE_SHIFT;
1579 
1580 	while (!list_empty(&async_chunk->extents)) {
1581 		async_extent = list_entry(async_chunk->extents.next,
1582 					  struct async_extent, list);
1583 		list_del(&async_extent->list);
1584 		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1585 	}
1586 
1587 	/* atomic_sub_return implies a barrier */
1588 	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1589 	    5 * SZ_1M)
1590 		cond_wake_up_nomb(&fs_info->async_submit_wait);
1591 }
1592 
1593 static noinline void async_cow_free(struct btrfs_work *work)
1594 {
1595 	struct async_chunk *async_chunk;
1596 	struct async_cow *async_cow;
1597 
1598 	async_chunk = container_of(work, struct async_chunk, work);
1599 	btrfs_add_delayed_iput(async_chunk->inode);
1600 	if (async_chunk->blkcg_css)
1601 		css_put(async_chunk->blkcg_css);
1602 
1603 	async_cow = async_chunk->async_cow;
1604 	if (atomic_dec_and_test(&async_cow->num_chunks))
1605 		kvfree(async_cow);
1606 }
1607 
1608 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1609 				    struct page *locked_page, u64 start,
1610 				    u64 end, struct writeback_control *wbc)
1611 {
1612 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1613 	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1614 	struct async_cow *ctx;
1615 	struct async_chunk *async_chunk;
1616 	unsigned long nr_pages;
1617 	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1618 	int i;
1619 	unsigned nofs_flag;
1620 	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1621 
1622 	nofs_flag = memalloc_nofs_save();
1623 	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1624 	memalloc_nofs_restore(nofs_flag);
1625 	if (!ctx)
1626 		return false;
1627 
1628 	unlock_extent(&inode->io_tree, start, end, NULL);
1629 	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1630 
1631 	async_chunk = ctx->chunks;
1632 	atomic_set(&ctx->num_chunks, num_chunks);
1633 
1634 	for (i = 0; i < num_chunks; i++) {
1635 		u64 cur_end = min(end, start + SZ_512K - 1);
1636 
1637 		/*
1638 		 * igrab is called higher up in the call chain, take only the
1639 		 * lightweight reference for the callback lifetime
1640 		 */
1641 		ihold(&inode->vfs_inode);
1642 		async_chunk[i].async_cow = ctx;
1643 		async_chunk[i].inode = inode;
1644 		async_chunk[i].start = start;
1645 		async_chunk[i].end = cur_end;
1646 		async_chunk[i].write_flags = write_flags;
1647 		INIT_LIST_HEAD(&async_chunk[i].extents);
1648 
1649 		/*
1650 		 * The locked_page comes all the way from writepage and its
1651 		 * the original page we were actually given.  As we spread
1652 		 * this large delalloc region across multiple async_chunk
1653 		 * structs, only the first struct needs a pointer to locked_page
1654 		 *
1655 		 * This way we don't need racey decisions about who is supposed
1656 		 * to unlock it.
1657 		 */
1658 		if (locked_page) {
1659 			/*
1660 			 * Depending on the compressibility, the pages might or
1661 			 * might not go through async.  We want all of them to
1662 			 * be accounted against wbc once.  Let's do it here
1663 			 * before the paths diverge.  wbc accounting is used
1664 			 * only for foreign writeback detection and doesn't
1665 			 * need full accuracy.  Just account the whole thing
1666 			 * against the first page.
1667 			 */
1668 			wbc_account_cgroup_owner(wbc, locked_page,
1669 						 cur_end - start);
1670 			async_chunk[i].locked_page = locked_page;
1671 			locked_page = NULL;
1672 		} else {
1673 			async_chunk[i].locked_page = NULL;
1674 		}
1675 
1676 		if (blkcg_css != blkcg_root_css) {
1677 			css_get(blkcg_css);
1678 			async_chunk[i].blkcg_css = blkcg_css;
1679 			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1680 		} else {
1681 			async_chunk[i].blkcg_css = NULL;
1682 		}
1683 
1684 		btrfs_init_work(&async_chunk[i].work, compress_file_range,
1685 				submit_compressed_extents, async_cow_free);
1686 
1687 		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1688 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1689 
1690 		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1691 
1692 		start = cur_end + 1;
1693 	}
1694 	return true;
1695 }
1696 
1697 /*
1698  * Run the delalloc range from start to end, and write back any dirty pages
1699  * covered by the range.
1700  */
1701 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1702 				     struct page *locked_page, u64 start,
1703 				     u64 end, struct writeback_control *wbc,
1704 				     bool pages_dirty)
1705 {
1706 	u64 done_offset = end;
1707 	int ret;
1708 
1709 	while (start <= end) {
1710 		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
1711 				     true, false);
1712 		if (ret)
1713 			return ret;
1714 		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
1715 					  done_offset, wbc, pages_dirty);
1716 		start = done_offset + 1;
1717 	}
1718 
1719 	return 1;
1720 }
1721 
1722 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1723 					u64 bytenr, u64 num_bytes, bool nowait)
1724 {
1725 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1726 	struct btrfs_ordered_sum *sums;
1727 	int ret;
1728 	LIST_HEAD(list);
1729 
1730 	ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
1731 				      &list, 0, nowait);
1732 	if (ret == 0 && list_empty(&list))
1733 		return 0;
1734 
1735 	while (!list_empty(&list)) {
1736 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1737 		list_del(&sums->list);
1738 		kfree(sums);
1739 	}
1740 	if (ret < 0)
1741 		return ret;
1742 	return 1;
1743 }
1744 
1745 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1746 			   const u64 start, const u64 end)
1747 {
1748 	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1749 	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1750 	const u64 range_bytes = end + 1 - start;
1751 	struct extent_io_tree *io_tree = &inode->io_tree;
1752 	u64 range_start = start;
1753 	u64 count;
1754 	int ret;
1755 
1756 	/*
1757 	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1758 	 * made we had not enough available data space and therefore we did not
1759 	 * reserve data space for it, since we though we could do NOCOW for the
1760 	 * respective file range (either there is prealloc extent or the inode
1761 	 * has the NOCOW bit set).
1762 	 *
1763 	 * However when we need to fallback to COW mode (because for example the
1764 	 * block group for the corresponding extent was turned to RO mode by a
1765 	 * scrub or relocation) we need to do the following:
1766 	 *
1767 	 * 1) We increment the bytes_may_use counter of the data space info.
1768 	 *    If COW succeeds, it allocates a new data extent and after doing
1769 	 *    that it decrements the space info's bytes_may_use counter and
1770 	 *    increments its bytes_reserved counter by the same amount (we do
1771 	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1772 	 *    bytes_may_use counter to compensate (when space is reserved at
1773 	 *    buffered write time, the bytes_may_use counter is incremented);
1774 	 *
1775 	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1776 	 *    that if the COW path fails for any reason, it decrements (through
1777 	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1778 	 *    data space info, which we incremented in the step above.
1779 	 *
1780 	 * If we need to fallback to cow and the inode corresponds to a free
1781 	 * space cache inode or an inode of the data relocation tree, we must
1782 	 * also increment bytes_may_use of the data space_info for the same
1783 	 * reason. Space caches and relocated data extents always get a prealloc
1784 	 * extent for them, however scrub or balance may have set the block
1785 	 * group that contains that extent to RO mode and therefore force COW
1786 	 * when starting writeback.
1787 	 */
1788 	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1789 				 EXTENT_NORESERVE, 0, NULL);
1790 	if (count > 0 || is_space_ino || is_reloc_ino) {
1791 		u64 bytes = count;
1792 		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1793 		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1794 
1795 		if (is_space_ino || is_reloc_ino)
1796 			bytes = range_bytes;
1797 
1798 		spin_lock(&sinfo->lock);
1799 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1800 		spin_unlock(&sinfo->lock);
1801 
1802 		if (count > 0)
1803 			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1804 					 NULL);
1805 	}
1806 
1807 	/*
1808 	 * Don't try to create inline extents, as a mix of inline extent that
1809 	 * is written out and unlocked directly and a normal NOCOW extent
1810 	 * doesn't work.
1811 	 */
1812 	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
1813 	ASSERT(ret != 1);
1814 	return ret;
1815 }
1816 
1817 struct can_nocow_file_extent_args {
1818 	/* Input fields. */
1819 
1820 	/* Start file offset of the range we want to NOCOW. */
1821 	u64 start;
1822 	/* End file offset (inclusive) of the range we want to NOCOW. */
1823 	u64 end;
1824 	bool writeback_path;
1825 	bool strict;
1826 	/*
1827 	 * Free the path passed to can_nocow_file_extent() once it's not needed
1828 	 * anymore.
1829 	 */
1830 	bool free_path;
1831 
1832 	/* Output fields. Only set when can_nocow_file_extent() returns 1. */
1833 
1834 	u64 disk_bytenr;
1835 	u64 disk_num_bytes;
1836 	u64 extent_offset;
1837 	/* Number of bytes that can be written to in NOCOW mode. */
1838 	u64 num_bytes;
1839 };
1840 
1841 /*
1842  * Check if we can NOCOW the file extent that the path points to.
1843  * This function may return with the path released, so the caller should check
1844  * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1845  *
1846  * Returns: < 0 on error
1847  *            0 if we can not NOCOW
1848  *            1 if we can NOCOW
1849  */
1850 static int can_nocow_file_extent(struct btrfs_path *path,
1851 				 struct btrfs_key *key,
1852 				 struct btrfs_inode *inode,
1853 				 struct can_nocow_file_extent_args *args)
1854 {
1855 	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1856 	struct extent_buffer *leaf = path->nodes[0];
1857 	struct btrfs_root *root = inode->root;
1858 	struct btrfs_file_extent_item *fi;
1859 	u64 extent_end;
1860 	u8 extent_type;
1861 	int can_nocow = 0;
1862 	int ret = 0;
1863 	bool nowait = path->nowait;
1864 
1865 	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1866 	extent_type = btrfs_file_extent_type(leaf, fi);
1867 
1868 	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1869 		goto out;
1870 
1871 	/* Can't access these fields unless we know it's not an inline extent. */
1872 	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1873 	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1874 	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
1875 
1876 	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1877 	    extent_type == BTRFS_FILE_EXTENT_REG)
1878 		goto out;
1879 
1880 	/*
1881 	 * If the extent was created before the generation where the last snapshot
1882 	 * for its subvolume was created, then this implies the extent is shared,
1883 	 * hence we must COW.
1884 	 */
1885 	if (!args->strict &&
1886 	    btrfs_file_extent_generation(leaf, fi) <=
1887 	    btrfs_root_last_snapshot(&root->root_item))
1888 		goto out;
1889 
1890 	/* An explicit hole, must COW. */
1891 	if (args->disk_bytenr == 0)
1892 		goto out;
1893 
1894 	/* Compressed/encrypted/encoded extents must be COWed. */
1895 	if (btrfs_file_extent_compression(leaf, fi) ||
1896 	    btrfs_file_extent_encryption(leaf, fi) ||
1897 	    btrfs_file_extent_other_encoding(leaf, fi))
1898 		goto out;
1899 
1900 	extent_end = btrfs_file_extent_end(path);
1901 
1902 	/*
1903 	 * The following checks can be expensive, as they need to take other
1904 	 * locks and do btree or rbtree searches, so release the path to avoid
1905 	 * blocking other tasks for too long.
1906 	 */
1907 	btrfs_release_path(path);
1908 
1909 	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1910 				    key->offset - args->extent_offset,
1911 				    args->disk_bytenr, args->strict, path);
1912 	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1913 	if (ret != 0)
1914 		goto out;
1915 
1916 	if (args->free_path) {
1917 		/*
1918 		 * We don't need the path anymore, plus through the
1919 		 * csum_exist_in_range() call below we will end up allocating
1920 		 * another path. So free the path to avoid unnecessary extra
1921 		 * memory usage.
1922 		 */
1923 		btrfs_free_path(path);
1924 		path = NULL;
1925 	}
1926 
1927 	/* If there are pending snapshots for this root, we must COW. */
1928 	if (args->writeback_path && !is_freespace_inode &&
1929 	    atomic_read(&root->snapshot_force_cow))
1930 		goto out;
1931 
1932 	args->disk_bytenr += args->extent_offset;
1933 	args->disk_bytenr += args->start - key->offset;
1934 	args->num_bytes = min(args->end + 1, extent_end) - args->start;
1935 
1936 	/*
1937 	 * Force COW if csums exist in the range. This ensures that csums for a
1938 	 * given extent are either valid or do not exist.
1939 	 */
1940 	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
1941 				  nowait);
1942 	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1943 	if (ret != 0)
1944 		goto out;
1945 
1946 	can_nocow = 1;
1947  out:
1948 	if (args->free_path && path)
1949 		btrfs_free_path(path);
1950 
1951 	return ret < 0 ? ret : can_nocow;
1952 }
1953 
1954 /*
1955  * when nowcow writeback call back.  This checks for snapshots or COW copies
1956  * of the extents that exist in the file, and COWs the file as required.
1957  *
1958  * If no cow copies or snapshots exist, we write directly to the existing
1959  * blocks on disk
1960  */
1961 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1962 				       struct page *locked_page,
1963 				       const u64 start, const u64 end)
1964 {
1965 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1966 	struct btrfs_root *root = inode->root;
1967 	struct btrfs_path *path;
1968 	u64 cow_start = (u64)-1;
1969 	u64 cur_offset = start;
1970 	int ret;
1971 	bool check_prev = true;
1972 	u64 ino = btrfs_ino(inode);
1973 	struct can_nocow_file_extent_args nocow_args = { 0 };
1974 
1975 	/*
1976 	 * Normally on a zoned device we're only doing COW writes, but in case
1977 	 * of relocation on a zoned filesystem serializes I/O so that we're only
1978 	 * writing sequentially and can end up here as well.
1979 	 */
1980 	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
1981 
1982 	path = btrfs_alloc_path();
1983 	if (!path) {
1984 		ret = -ENOMEM;
1985 		goto error;
1986 	}
1987 
1988 	nocow_args.end = end;
1989 	nocow_args.writeback_path = true;
1990 
1991 	while (1) {
1992 		struct btrfs_block_group *nocow_bg = NULL;
1993 		struct btrfs_ordered_extent *ordered;
1994 		struct btrfs_key found_key;
1995 		struct btrfs_file_extent_item *fi;
1996 		struct extent_buffer *leaf;
1997 		u64 extent_end;
1998 		u64 ram_bytes;
1999 		u64 nocow_end;
2000 		int extent_type;
2001 		bool is_prealloc;
2002 
2003 		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2004 					       cur_offset, 0);
2005 		if (ret < 0)
2006 			goto error;
2007 
2008 		/*
2009 		 * If there is no extent for our range when doing the initial
2010 		 * search, then go back to the previous slot as it will be the
2011 		 * one containing the search offset
2012 		 */
2013 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
2014 			leaf = path->nodes[0];
2015 			btrfs_item_key_to_cpu(leaf, &found_key,
2016 					      path->slots[0] - 1);
2017 			if (found_key.objectid == ino &&
2018 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
2019 				path->slots[0]--;
2020 		}
2021 		check_prev = false;
2022 next_slot:
2023 		/* Go to next leaf if we have exhausted the current one */
2024 		leaf = path->nodes[0];
2025 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2026 			ret = btrfs_next_leaf(root, path);
2027 			if (ret < 0)
2028 				goto error;
2029 			if (ret > 0)
2030 				break;
2031 			leaf = path->nodes[0];
2032 		}
2033 
2034 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2035 
2036 		/* Didn't find anything for our INO */
2037 		if (found_key.objectid > ino)
2038 			break;
2039 		/*
2040 		 * Keep searching until we find an EXTENT_ITEM or there are no
2041 		 * more extents for this inode
2042 		 */
2043 		if (WARN_ON_ONCE(found_key.objectid < ino) ||
2044 		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
2045 			path->slots[0]++;
2046 			goto next_slot;
2047 		}
2048 
2049 		/* Found key is not EXTENT_DATA_KEY or starts after req range */
2050 		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2051 		    found_key.offset > end)
2052 			break;
2053 
2054 		/*
2055 		 * If the found extent starts after requested offset, then
2056 		 * adjust extent_end to be right before this extent begins
2057 		 */
2058 		if (found_key.offset > cur_offset) {
2059 			extent_end = found_key.offset;
2060 			extent_type = 0;
2061 			goto must_cow;
2062 		}
2063 
2064 		/*
2065 		 * Found extent which begins before our range and potentially
2066 		 * intersect it
2067 		 */
2068 		fi = btrfs_item_ptr(leaf, path->slots[0],
2069 				    struct btrfs_file_extent_item);
2070 		extent_type = btrfs_file_extent_type(leaf, fi);
2071 		/* If this is triggered then we have a memory corruption. */
2072 		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2073 		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2074 			ret = -EUCLEAN;
2075 			goto error;
2076 		}
2077 		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
2078 		extent_end = btrfs_file_extent_end(path);
2079 
2080 		/*
2081 		 * If the extent we got ends before our current offset, skip to
2082 		 * the next extent.
2083 		 */
2084 		if (extent_end <= cur_offset) {
2085 			path->slots[0]++;
2086 			goto next_slot;
2087 		}
2088 
2089 		nocow_args.start = cur_offset;
2090 		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2091 		if (ret < 0)
2092 			goto error;
2093 		if (ret == 0)
2094 			goto must_cow;
2095 
2096 		ret = 0;
2097 		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
2098 		if (!nocow_bg) {
2099 must_cow:
2100 			/*
2101 			 * If we can't perform NOCOW writeback for the range,
2102 			 * then record the beginning of the range that needs to
2103 			 * be COWed.  It will be written out before the next
2104 			 * NOCOW range if we find one, or when exiting this
2105 			 * loop.
2106 			 */
2107 			if (cow_start == (u64)-1)
2108 				cow_start = cur_offset;
2109 			cur_offset = extent_end;
2110 			if (cur_offset > end)
2111 				break;
2112 			if (!path->nodes[0])
2113 				continue;
2114 			path->slots[0]++;
2115 			goto next_slot;
2116 		}
2117 
2118 		/*
2119 		 * COW range from cow_start to found_key.offset - 1. As the key
2120 		 * will contain the beginning of the first extent that can be
2121 		 * NOCOW, following one which needs to be COW'ed
2122 		 */
2123 		if (cow_start != (u64)-1) {
2124 			ret = fallback_to_cow(inode, locked_page,
2125 					      cow_start, found_key.offset - 1);
2126 			cow_start = (u64)-1;
2127 			if (ret) {
2128 				btrfs_dec_nocow_writers(nocow_bg);
2129 				goto error;
2130 			}
2131 		}
2132 
2133 		nocow_end = cur_offset + nocow_args.num_bytes - 1;
2134 		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2135 		if (is_prealloc) {
2136 			u64 orig_start = found_key.offset - nocow_args.extent_offset;
2137 			struct extent_map *em;
2138 
2139 			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
2140 					  orig_start,
2141 					  nocow_args.disk_bytenr, /* block_start */
2142 					  nocow_args.num_bytes, /* block_len */
2143 					  nocow_args.disk_num_bytes, /* orig_block_len */
2144 					  ram_bytes, BTRFS_COMPRESS_NONE,
2145 					  BTRFS_ORDERED_PREALLOC);
2146 			if (IS_ERR(em)) {
2147 				btrfs_dec_nocow_writers(nocow_bg);
2148 				ret = PTR_ERR(em);
2149 				goto error;
2150 			}
2151 			free_extent_map(em);
2152 		}
2153 
2154 		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2155 				nocow_args.num_bytes, nocow_args.num_bytes,
2156 				nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
2157 				is_prealloc
2158 				? (1 << BTRFS_ORDERED_PREALLOC)
2159 				: (1 << BTRFS_ORDERED_NOCOW),
2160 				BTRFS_COMPRESS_NONE);
2161 		btrfs_dec_nocow_writers(nocow_bg);
2162 		if (IS_ERR(ordered)) {
2163 			if (is_prealloc) {
2164 				btrfs_drop_extent_map_range(inode, cur_offset,
2165 							    nocow_end, false);
2166 			}
2167 			ret = PTR_ERR(ordered);
2168 			goto error;
2169 		}
2170 
2171 		if (btrfs_is_data_reloc_root(root))
2172 			/*
2173 			 * Error handled later, as we must prevent
2174 			 * extent_clear_unlock_delalloc() in error handler
2175 			 * from freeing metadata of created ordered extent.
2176 			 */
2177 			ret = btrfs_reloc_clone_csums(ordered);
2178 		btrfs_put_ordered_extent(ordered);
2179 
2180 		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2181 					     locked_page, EXTENT_LOCKED |
2182 					     EXTENT_DELALLOC |
2183 					     EXTENT_CLEAR_DATA_RESV,
2184 					     PAGE_UNLOCK | PAGE_SET_ORDERED);
2185 
2186 		cur_offset = extent_end;
2187 
2188 		/*
2189 		 * btrfs_reloc_clone_csums() error, now we're OK to call error
2190 		 * handler, as metadata for created ordered extent will only
2191 		 * be freed by btrfs_finish_ordered_io().
2192 		 */
2193 		if (ret)
2194 			goto error;
2195 		if (cur_offset > end)
2196 			break;
2197 	}
2198 	btrfs_release_path(path);
2199 
2200 	if (cur_offset <= end && cow_start == (u64)-1)
2201 		cow_start = cur_offset;
2202 
2203 	if (cow_start != (u64)-1) {
2204 		cur_offset = end;
2205 		ret = fallback_to_cow(inode, locked_page, cow_start, end);
2206 		cow_start = (u64)-1;
2207 		if (ret)
2208 			goto error;
2209 	}
2210 
2211 	btrfs_free_path(path);
2212 	return 0;
2213 
2214 error:
2215 	/*
2216 	 * If an error happened while a COW region is outstanding, cur_offset
2217 	 * needs to be reset to cow_start to ensure the COW region is unlocked
2218 	 * as well.
2219 	 */
2220 	if (cow_start != (u64)-1)
2221 		cur_offset = cow_start;
2222 	if (cur_offset < end)
2223 		extent_clear_unlock_delalloc(inode, cur_offset, end,
2224 					     locked_page, EXTENT_LOCKED |
2225 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
2226 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2227 					     PAGE_START_WRITEBACK |
2228 					     PAGE_END_WRITEBACK);
2229 	btrfs_free_path(path);
2230 	return ret;
2231 }
2232 
2233 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2234 {
2235 	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2236 		if (inode->defrag_bytes &&
2237 		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
2238 				   0, NULL))
2239 			return false;
2240 		return true;
2241 	}
2242 	return false;
2243 }
2244 
2245 /*
2246  * Function to process delayed allocation (create CoW) for ranges which are
2247  * being touched for the first time.
2248  */
2249 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2250 			     u64 start, u64 end, struct writeback_control *wbc)
2251 {
2252 	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2253 	int ret;
2254 
2255 	/*
2256 	 * The range must cover part of the @locked_page, or a return of 1
2257 	 * can confuse the caller.
2258 	 */
2259 	ASSERT(!(end <= page_offset(locked_page) ||
2260 		 start >= page_offset(locked_page) + PAGE_SIZE));
2261 
2262 	if (should_nocow(inode, start, end)) {
2263 		ret = run_delalloc_nocow(inode, locked_page, start, end);
2264 		goto out;
2265 	}
2266 
2267 	if (btrfs_inode_can_compress(inode) &&
2268 	    inode_need_compress(inode, start, end) &&
2269 	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
2270 		return 1;
2271 
2272 	if (zoned)
2273 		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
2274 				       true);
2275 	else
2276 		ret = cow_file_range(inode, locked_page, start, end, NULL,
2277 				     false, false);
2278 
2279 out:
2280 	if (ret < 0)
2281 		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2282 					      end - start + 1);
2283 	return ret;
2284 }
2285 
2286 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2287 				 struct extent_state *orig, u64 split)
2288 {
2289 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2290 	u64 size;
2291 
2292 	/* not delalloc, ignore it */
2293 	if (!(orig->state & EXTENT_DELALLOC))
2294 		return;
2295 
2296 	size = orig->end - orig->start + 1;
2297 	if (size > fs_info->max_extent_size) {
2298 		u32 num_extents;
2299 		u64 new_size;
2300 
2301 		/*
2302 		 * See the explanation in btrfs_merge_delalloc_extent, the same
2303 		 * applies here, just in reverse.
2304 		 */
2305 		new_size = orig->end - split + 1;
2306 		num_extents = count_max_extents(fs_info, new_size);
2307 		new_size = split - orig->start;
2308 		num_extents += count_max_extents(fs_info, new_size);
2309 		if (count_max_extents(fs_info, size) >= num_extents)
2310 			return;
2311 	}
2312 
2313 	spin_lock(&inode->lock);
2314 	btrfs_mod_outstanding_extents(inode, 1);
2315 	spin_unlock(&inode->lock);
2316 }
2317 
2318 /*
2319  * Handle merged delayed allocation extents so we can keep track of new extents
2320  * that are just merged onto old extents, such as when we are doing sequential
2321  * writes, so we can properly account for the metadata space we'll need.
2322  */
2323 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2324 				 struct extent_state *other)
2325 {
2326 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2327 	u64 new_size, old_size;
2328 	u32 num_extents;
2329 
2330 	/* not delalloc, ignore it */
2331 	if (!(other->state & EXTENT_DELALLOC))
2332 		return;
2333 
2334 	if (new->start > other->start)
2335 		new_size = new->end - other->start + 1;
2336 	else
2337 		new_size = other->end - new->start + 1;
2338 
2339 	/* we're not bigger than the max, unreserve the space and go */
2340 	if (new_size <= fs_info->max_extent_size) {
2341 		spin_lock(&inode->lock);
2342 		btrfs_mod_outstanding_extents(inode, -1);
2343 		spin_unlock(&inode->lock);
2344 		return;
2345 	}
2346 
2347 	/*
2348 	 * We have to add up either side to figure out how many extents were
2349 	 * accounted for before we merged into one big extent.  If the number of
2350 	 * extents we accounted for is <= the amount we need for the new range
2351 	 * then we can return, otherwise drop.  Think of it like this
2352 	 *
2353 	 * [ 4k][MAX_SIZE]
2354 	 *
2355 	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2356 	 * need 2 outstanding extents, on one side we have 1 and the other side
2357 	 * we have 1 so they are == and we can return.  But in this case
2358 	 *
2359 	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2360 	 *
2361 	 * Each range on their own accounts for 2 extents, but merged together
2362 	 * they are only 3 extents worth of accounting, so we need to drop in
2363 	 * this case.
2364 	 */
2365 	old_size = other->end - other->start + 1;
2366 	num_extents = count_max_extents(fs_info, old_size);
2367 	old_size = new->end - new->start + 1;
2368 	num_extents += count_max_extents(fs_info, old_size);
2369 	if (count_max_extents(fs_info, new_size) >= num_extents)
2370 		return;
2371 
2372 	spin_lock(&inode->lock);
2373 	btrfs_mod_outstanding_extents(inode, -1);
2374 	spin_unlock(&inode->lock);
2375 }
2376 
2377 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2378 				      struct btrfs_inode *inode)
2379 {
2380 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2381 
2382 	spin_lock(&root->delalloc_lock);
2383 	if (list_empty(&inode->delalloc_inodes)) {
2384 		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2385 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
2386 		root->nr_delalloc_inodes++;
2387 		if (root->nr_delalloc_inodes == 1) {
2388 			spin_lock(&fs_info->delalloc_root_lock);
2389 			BUG_ON(!list_empty(&root->delalloc_root));
2390 			list_add_tail(&root->delalloc_root,
2391 				      &fs_info->delalloc_roots);
2392 			spin_unlock(&fs_info->delalloc_root_lock);
2393 		}
2394 	}
2395 	spin_unlock(&root->delalloc_lock);
2396 }
2397 
2398 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2399 				struct btrfs_inode *inode)
2400 {
2401 	struct btrfs_fs_info *fs_info = root->fs_info;
2402 
2403 	if (!list_empty(&inode->delalloc_inodes)) {
2404 		list_del_init(&inode->delalloc_inodes);
2405 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2406 			  &inode->runtime_flags);
2407 		root->nr_delalloc_inodes--;
2408 		if (!root->nr_delalloc_inodes) {
2409 			ASSERT(list_empty(&root->delalloc_inodes));
2410 			spin_lock(&fs_info->delalloc_root_lock);
2411 			BUG_ON(list_empty(&root->delalloc_root));
2412 			list_del_init(&root->delalloc_root);
2413 			spin_unlock(&fs_info->delalloc_root_lock);
2414 		}
2415 	}
2416 }
2417 
2418 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2419 				     struct btrfs_inode *inode)
2420 {
2421 	spin_lock(&root->delalloc_lock);
2422 	__btrfs_del_delalloc_inode(root, inode);
2423 	spin_unlock(&root->delalloc_lock);
2424 }
2425 
2426 /*
2427  * Properly track delayed allocation bytes in the inode and to maintain the
2428  * list of inodes that have pending delalloc work to be done.
2429  */
2430 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2431 			       u32 bits)
2432 {
2433 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2434 
2435 	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2436 		WARN_ON(1);
2437 	/*
2438 	 * set_bit and clear bit hooks normally require _irqsave/restore
2439 	 * but in this case, we are only testing for the DELALLOC
2440 	 * bit, which is only set or cleared with irqs on
2441 	 */
2442 	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2443 		struct btrfs_root *root = inode->root;
2444 		u64 len = state->end + 1 - state->start;
2445 		u32 num_extents = count_max_extents(fs_info, len);
2446 		bool do_list = !btrfs_is_free_space_inode(inode);
2447 
2448 		spin_lock(&inode->lock);
2449 		btrfs_mod_outstanding_extents(inode, num_extents);
2450 		spin_unlock(&inode->lock);
2451 
2452 		/* For sanity tests */
2453 		if (btrfs_is_testing(fs_info))
2454 			return;
2455 
2456 		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2457 					 fs_info->delalloc_batch);
2458 		spin_lock(&inode->lock);
2459 		inode->delalloc_bytes += len;
2460 		if (bits & EXTENT_DEFRAG)
2461 			inode->defrag_bytes += len;
2462 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2463 					 &inode->runtime_flags))
2464 			btrfs_add_delalloc_inodes(root, inode);
2465 		spin_unlock(&inode->lock);
2466 	}
2467 
2468 	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2469 	    (bits & EXTENT_DELALLOC_NEW)) {
2470 		spin_lock(&inode->lock);
2471 		inode->new_delalloc_bytes += state->end + 1 - state->start;
2472 		spin_unlock(&inode->lock);
2473 	}
2474 }
2475 
2476 /*
2477  * Once a range is no longer delalloc this function ensures that proper
2478  * accounting happens.
2479  */
2480 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2481 				 struct extent_state *state, u32 bits)
2482 {
2483 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2484 	u64 len = state->end + 1 - state->start;
2485 	u32 num_extents = count_max_extents(fs_info, len);
2486 
2487 	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2488 		spin_lock(&inode->lock);
2489 		inode->defrag_bytes -= len;
2490 		spin_unlock(&inode->lock);
2491 	}
2492 
2493 	/*
2494 	 * set_bit and clear bit hooks normally require _irqsave/restore
2495 	 * but in this case, we are only testing for the DELALLOC
2496 	 * bit, which is only set or cleared with irqs on
2497 	 */
2498 	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2499 		struct btrfs_root *root = inode->root;
2500 		bool do_list = !btrfs_is_free_space_inode(inode);
2501 
2502 		spin_lock(&inode->lock);
2503 		btrfs_mod_outstanding_extents(inode, -num_extents);
2504 		spin_unlock(&inode->lock);
2505 
2506 		/*
2507 		 * We don't reserve metadata space for space cache inodes so we
2508 		 * don't need to call delalloc_release_metadata if there is an
2509 		 * error.
2510 		 */
2511 		if (bits & EXTENT_CLEAR_META_RESV &&
2512 		    root != fs_info->tree_root)
2513 			btrfs_delalloc_release_metadata(inode, len, false);
2514 
2515 		/* For sanity tests. */
2516 		if (btrfs_is_testing(fs_info))
2517 			return;
2518 
2519 		if (!btrfs_is_data_reloc_root(root) &&
2520 		    do_list && !(state->state & EXTENT_NORESERVE) &&
2521 		    (bits & EXTENT_CLEAR_DATA_RESV))
2522 			btrfs_free_reserved_data_space_noquota(fs_info, len);
2523 
2524 		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2525 					 fs_info->delalloc_batch);
2526 		spin_lock(&inode->lock);
2527 		inode->delalloc_bytes -= len;
2528 		if (do_list && inode->delalloc_bytes == 0 &&
2529 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2530 					&inode->runtime_flags))
2531 			btrfs_del_delalloc_inode(root, inode);
2532 		spin_unlock(&inode->lock);
2533 	}
2534 
2535 	if ((state->state & EXTENT_DELALLOC_NEW) &&
2536 	    (bits & EXTENT_DELALLOC_NEW)) {
2537 		spin_lock(&inode->lock);
2538 		ASSERT(inode->new_delalloc_bytes >= len);
2539 		inode->new_delalloc_bytes -= len;
2540 		if (bits & EXTENT_ADD_INODE_BYTES)
2541 			inode_add_bytes(&inode->vfs_inode, len);
2542 		spin_unlock(&inode->lock);
2543 	}
2544 }
2545 
2546 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2547 					struct btrfs_ordered_extent *ordered)
2548 {
2549 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2550 	u64 len = bbio->bio.bi_iter.bi_size;
2551 	struct btrfs_ordered_extent *new;
2552 	int ret;
2553 
2554 	/* Must always be called for the beginning of an ordered extent. */
2555 	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2556 		return -EINVAL;
2557 
2558 	/* No need to split if the ordered extent covers the entire bio. */
2559 	if (ordered->disk_num_bytes == len) {
2560 		refcount_inc(&ordered->refs);
2561 		bbio->ordered = ordered;
2562 		return 0;
2563 	}
2564 
2565 	/*
2566 	 * Don't split the extent_map for NOCOW extents, as we're writing into
2567 	 * a pre-existing one.
2568 	 */
2569 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2570 		ret = split_extent_map(bbio->inode, bbio->file_offset,
2571 				       ordered->num_bytes, len,
2572 				       ordered->disk_bytenr);
2573 		if (ret)
2574 			return ret;
2575 	}
2576 
2577 	new = btrfs_split_ordered_extent(ordered, len);
2578 	if (IS_ERR(new))
2579 		return PTR_ERR(new);
2580 	bbio->ordered = new;
2581 	return 0;
2582 }
2583 
2584 /*
2585  * given a list of ordered sums record them in the inode.  This happens
2586  * at IO completion time based on sums calculated at bio submission time.
2587  */
2588 static int add_pending_csums(struct btrfs_trans_handle *trans,
2589 			     struct list_head *list)
2590 {
2591 	struct btrfs_ordered_sum *sum;
2592 	struct btrfs_root *csum_root = NULL;
2593 	int ret;
2594 
2595 	list_for_each_entry(sum, list, list) {
2596 		trans->adding_csums = true;
2597 		if (!csum_root)
2598 			csum_root = btrfs_csum_root(trans->fs_info,
2599 						    sum->logical);
2600 		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2601 		trans->adding_csums = false;
2602 		if (ret)
2603 			return ret;
2604 	}
2605 	return 0;
2606 }
2607 
2608 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2609 					 const u64 start,
2610 					 const u64 len,
2611 					 struct extent_state **cached_state)
2612 {
2613 	u64 search_start = start;
2614 	const u64 end = start + len - 1;
2615 
2616 	while (search_start < end) {
2617 		const u64 search_len = end - search_start + 1;
2618 		struct extent_map *em;
2619 		u64 em_len;
2620 		int ret = 0;
2621 
2622 		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2623 		if (IS_ERR(em))
2624 			return PTR_ERR(em);
2625 
2626 		if (em->block_start != EXTENT_MAP_HOLE)
2627 			goto next;
2628 
2629 		em_len = em->len;
2630 		if (em->start < search_start)
2631 			em_len -= search_start - em->start;
2632 		if (em_len > search_len)
2633 			em_len = search_len;
2634 
2635 		ret = set_extent_bit(&inode->io_tree, search_start,
2636 				     search_start + em_len - 1,
2637 				     EXTENT_DELALLOC_NEW, cached_state);
2638 next:
2639 		search_start = extent_map_end(em);
2640 		free_extent_map(em);
2641 		if (ret)
2642 			return ret;
2643 	}
2644 	return 0;
2645 }
2646 
2647 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2648 			      unsigned int extra_bits,
2649 			      struct extent_state **cached_state)
2650 {
2651 	WARN_ON(PAGE_ALIGNED(end));
2652 
2653 	if (start >= i_size_read(&inode->vfs_inode) &&
2654 	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2655 		/*
2656 		 * There can't be any extents following eof in this case so just
2657 		 * set the delalloc new bit for the range directly.
2658 		 */
2659 		extra_bits |= EXTENT_DELALLOC_NEW;
2660 	} else {
2661 		int ret;
2662 
2663 		ret = btrfs_find_new_delalloc_bytes(inode, start,
2664 						    end + 1 - start,
2665 						    cached_state);
2666 		if (ret)
2667 			return ret;
2668 	}
2669 
2670 	return set_extent_bit(&inode->io_tree, start, end,
2671 			      EXTENT_DELALLOC | extra_bits, cached_state);
2672 }
2673 
2674 /* see btrfs_writepage_start_hook for details on why this is required */
2675 struct btrfs_writepage_fixup {
2676 	struct page *page;
2677 	struct btrfs_inode *inode;
2678 	struct btrfs_work work;
2679 };
2680 
2681 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2682 {
2683 	struct btrfs_writepage_fixup *fixup =
2684 		container_of(work, struct btrfs_writepage_fixup, work);
2685 	struct btrfs_ordered_extent *ordered;
2686 	struct extent_state *cached_state = NULL;
2687 	struct extent_changeset *data_reserved = NULL;
2688 	struct page *page = fixup->page;
2689 	struct btrfs_inode *inode = fixup->inode;
2690 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2691 	u64 page_start = page_offset(page);
2692 	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2693 	int ret = 0;
2694 	bool free_delalloc_space = true;
2695 
2696 	/*
2697 	 * This is similar to page_mkwrite, we need to reserve the space before
2698 	 * we take the page lock.
2699 	 */
2700 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2701 					   PAGE_SIZE);
2702 again:
2703 	lock_page(page);
2704 
2705 	/*
2706 	 * Before we queued this fixup, we took a reference on the page.
2707 	 * page->mapping may go NULL, but it shouldn't be moved to a different
2708 	 * address space.
2709 	 */
2710 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2711 		/*
2712 		 * Unfortunately this is a little tricky, either
2713 		 *
2714 		 * 1) We got here and our page had already been dealt with and
2715 		 *    we reserved our space, thus ret == 0, so we need to just
2716 		 *    drop our space reservation and bail.  This can happen the
2717 		 *    first time we come into the fixup worker, or could happen
2718 		 *    while waiting for the ordered extent.
2719 		 * 2) Our page was already dealt with, but we happened to get an
2720 		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2721 		 *    this case we obviously don't have anything to release, but
2722 		 *    because the page was already dealt with we don't want to
2723 		 *    mark the page with an error, so make sure we're resetting
2724 		 *    ret to 0.  This is why we have this check _before_ the ret
2725 		 *    check, because we do not want to have a surprise ENOSPC
2726 		 *    when the page was already properly dealt with.
2727 		 */
2728 		if (!ret) {
2729 			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2730 			btrfs_delalloc_release_space(inode, data_reserved,
2731 						     page_start, PAGE_SIZE,
2732 						     true);
2733 		}
2734 		ret = 0;
2735 		goto out_page;
2736 	}
2737 
2738 	/*
2739 	 * We can't mess with the page state unless it is locked, so now that
2740 	 * it is locked bail if we failed to make our space reservation.
2741 	 */
2742 	if (ret)
2743 		goto out_page;
2744 
2745 	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2746 
2747 	/* already ordered? We're done */
2748 	if (PageOrdered(page))
2749 		goto out_reserved;
2750 
2751 	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2752 	if (ordered) {
2753 		unlock_extent(&inode->io_tree, page_start, page_end,
2754 			      &cached_state);
2755 		unlock_page(page);
2756 		btrfs_start_ordered_extent(ordered);
2757 		btrfs_put_ordered_extent(ordered);
2758 		goto again;
2759 	}
2760 
2761 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2762 					&cached_state);
2763 	if (ret)
2764 		goto out_reserved;
2765 
2766 	/*
2767 	 * Everything went as planned, we're now the owner of a dirty page with
2768 	 * delayed allocation bits set and space reserved for our COW
2769 	 * destination.
2770 	 *
2771 	 * The page was dirty when we started, nothing should have cleaned it.
2772 	 */
2773 	BUG_ON(!PageDirty(page));
2774 	free_delalloc_space = false;
2775 out_reserved:
2776 	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2777 	if (free_delalloc_space)
2778 		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2779 					     PAGE_SIZE, true);
2780 	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2781 out_page:
2782 	if (ret) {
2783 		/*
2784 		 * We hit ENOSPC or other errors.  Update the mapping and page
2785 		 * to reflect the errors and clean the page.
2786 		 */
2787 		mapping_set_error(page->mapping, ret);
2788 		btrfs_mark_ordered_io_finished(inode, page, page_start,
2789 					       PAGE_SIZE, !ret);
2790 		clear_page_dirty_for_io(page);
2791 	}
2792 	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
2793 	unlock_page(page);
2794 	put_page(page);
2795 	kfree(fixup);
2796 	extent_changeset_free(data_reserved);
2797 	/*
2798 	 * As a precaution, do a delayed iput in case it would be the last iput
2799 	 * that could need flushing space. Recursing back to fixup worker would
2800 	 * deadlock.
2801 	 */
2802 	btrfs_add_delayed_iput(inode);
2803 }
2804 
2805 /*
2806  * There are a few paths in the higher layers of the kernel that directly
2807  * set the page dirty bit without asking the filesystem if it is a
2808  * good idea.  This causes problems because we want to make sure COW
2809  * properly happens and the data=ordered rules are followed.
2810  *
2811  * In our case any range that doesn't have the ORDERED bit set
2812  * hasn't been properly setup for IO.  We kick off an async process
2813  * to fix it up.  The async helper will wait for ordered extents, set
2814  * the delalloc bit and make it safe to write the page.
2815  */
2816 int btrfs_writepage_cow_fixup(struct page *page)
2817 {
2818 	struct inode *inode = page->mapping->host;
2819 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2820 	struct btrfs_writepage_fixup *fixup;
2821 
2822 	/* This page has ordered extent covering it already */
2823 	if (PageOrdered(page))
2824 		return 0;
2825 
2826 	/*
2827 	 * PageChecked is set below when we create a fixup worker for this page,
2828 	 * don't try to create another one if we're already PageChecked()
2829 	 *
2830 	 * The extent_io writepage code will redirty the page if we send back
2831 	 * EAGAIN.
2832 	 */
2833 	if (PageChecked(page))
2834 		return -EAGAIN;
2835 
2836 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2837 	if (!fixup)
2838 		return -EAGAIN;
2839 
2840 	/*
2841 	 * We are already holding a reference to this inode from
2842 	 * write_cache_pages.  We need to hold it because the space reservation
2843 	 * takes place outside of the page lock, and we can't trust
2844 	 * page->mapping outside of the page lock.
2845 	 */
2846 	ihold(inode);
2847 	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2848 	get_page(page);
2849 	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2850 	fixup->page = page;
2851 	fixup->inode = BTRFS_I(inode);
2852 	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2853 
2854 	return -EAGAIN;
2855 }
2856 
2857 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2858 				       struct btrfs_inode *inode, u64 file_pos,
2859 				       struct btrfs_file_extent_item *stack_fi,
2860 				       const bool update_inode_bytes,
2861 				       u64 qgroup_reserved)
2862 {
2863 	struct btrfs_root *root = inode->root;
2864 	const u64 sectorsize = root->fs_info->sectorsize;
2865 	struct btrfs_path *path;
2866 	struct extent_buffer *leaf;
2867 	struct btrfs_key ins;
2868 	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2869 	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2870 	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2871 	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2872 	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2873 	struct btrfs_drop_extents_args drop_args = { 0 };
2874 	int ret;
2875 
2876 	path = btrfs_alloc_path();
2877 	if (!path)
2878 		return -ENOMEM;
2879 
2880 	/*
2881 	 * we may be replacing one extent in the tree with another.
2882 	 * The new extent is pinned in the extent map, and we don't want
2883 	 * to drop it from the cache until it is completely in the btree.
2884 	 *
2885 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2886 	 * the caller is expected to unpin it and allow it to be merged
2887 	 * with the others.
2888 	 */
2889 	drop_args.path = path;
2890 	drop_args.start = file_pos;
2891 	drop_args.end = file_pos + num_bytes;
2892 	drop_args.replace_extent = true;
2893 	drop_args.extent_item_size = sizeof(*stack_fi);
2894 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2895 	if (ret)
2896 		goto out;
2897 
2898 	if (!drop_args.extent_inserted) {
2899 		ins.objectid = btrfs_ino(inode);
2900 		ins.offset = file_pos;
2901 		ins.type = BTRFS_EXTENT_DATA_KEY;
2902 
2903 		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2904 					      sizeof(*stack_fi));
2905 		if (ret)
2906 			goto out;
2907 	}
2908 	leaf = path->nodes[0];
2909 	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2910 	write_extent_buffer(leaf, stack_fi,
2911 			btrfs_item_ptr_offset(leaf, path->slots[0]),
2912 			sizeof(struct btrfs_file_extent_item));
2913 
2914 	btrfs_mark_buffer_dirty(trans, leaf);
2915 	btrfs_release_path(path);
2916 
2917 	/*
2918 	 * If we dropped an inline extent here, we know the range where it is
2919 	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2920 	 * number of bytes only for that range containing the inline extent.
2921 	 * The remaining of the range will be processed when clearning the
2922 	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2923 	 */
2924 	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2925 		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2926 
2927 		inline_size = drop_args.bytes_found - inline_size;
2928 		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2929 		drop_args.bytes_found -= inline_size;
2930 		num_bytes -= sectorsize;
2931 	}
2932 
2933 	if (update_inode_bytes)
2934 		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2935 
2936 	ins.objectid = disk_bytenr;
2937 	ins.offset = disk_num_bytes;
2938 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2939 
2940 	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2941 	if (ret)
2942 		goto out;
2943 
2944 	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2945 					       file_pos - offset,
2946 					       qgroup_reserved, &ins);
2947 out:
2948 	btrfs_free_path(path);
2949 
2950 	return ret;
2951 }
2952 
2953 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2954 					 u64 start, u64 len)
2955 {
2956 	struct btrfs_block_group *cache;
2957 
2958 	cache = btrfs_lookup_block_group(fs_info, start);
2959 	ASSERT(cache);
2960 
2961 	spin_lock(&cache->lock);
2962 	cache->delalloc_bytes -= len;
2963 	spin_unlock(&cache->lock);
2964 
2965 	btrfs_put_block_group(cache);
2966 }
2967 
2968 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2969 					     struct btrfs_ordered_extent *oe)
2970 {
2971 	struct btrfs_file_extent_item stack_fi;
2972 	bool update_inode_bytes;
2973 	u64 num_bytes = oe->num_bytes;
2974 	u64 ram_bytes = oe->ram_bytes;
2975 
2976 	memset(&stack_fi, 0, sizeof(stack_fi));
2977 	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2978 	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2979 	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2980 						   oe->disk_num_bytes);
2981 	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
2982 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
2983 		num_bytes = oe->truncated_len;
2984 		ram_bytes = num_bytes;
2985 	}
2986 	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
2987 	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
2988 	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2989 	/* Encryption and other encoding is reserved and all 0 */
2990 
2991 	/*
2992 	 * For delalloc, when completing an ordered extent we update the inode's
2993 	 * bytes when clearing the range in the inode's io tree, so pass false
2994 	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2995 	 * except if the ordered extent was truncated.
2996 	 */
2997 	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
2998 			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
2999 			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3000 
3001 	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3002 					   oe->file_offset, &stack_fi,
3003 					   update_inode_bytes, oe->qgroup_rsv);
3004 }
3005 
3006 /*
3007  * As ordered data IO finishes, this gets called so we can finish
3008  * an ordered extent if the range of bytes in the file it covers are
3009  * fully written.
3010  */
3011 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3012 {
3013 	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3014 	struct btrfs_root *root = inode->root;
3015 	struct btrfs_fs_info *fs_info = root->fs_info;
3016 	struct btrfs_trans_handle *trans = NULL;
3017 	struct extent_io_tree *io_tree = &inode->io_tree;
3018 	struct extent_state *cached_state = NULL;
3019 	u64 start, end;
3020 	int compress_type = 0;
3021 	int ret = 0;
3022 	u64 logical_len = ordered_extent->num_bytes;
3023 	bool freespace_inode;
3024 	bool truncated = false;
3025 	bool clear_reserved_extent = true;
3026 	unsigned int clear_bits = EXTENT_DEFRAG;
3027 
3028 	start = ordered_extent->file_offset;
3029 	end = start + ordered_extent->num_bytes - 1;
3030 
3031 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3032 	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3033 	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3034 	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3035 		clear_bits |= EXTENT_DELALLOC_NEW;
3036 
3037 	freespace_inode = btrfs_is_free_space_inode(inode);
3038 	if (!freespace_inode)
3039 		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3040 
3041 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3042 		ret = -EIO;
3043 		goto out;
3044 	}
3045 
3046 	if (btrfs_is_zoned(fs_info))
3047 		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3048 					ordered_extent->disk_num_bytes);
3049 
3050 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3051 		truncated = true;
3052 		logical_len = ordered_extent->truncated_len;
3053 		/* Truncated the entire extent, don't bother adding */
3054 		if (!logical_len)
3055 			goto out;
3056 	}
3057 
3058 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3059 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3060 
3061 		btrfs_inode_safe_disk_i_size_write(inode, 0);
3062 		if (freespace_inode)
3063 			trans = btrfs_join_transaction_spacecache(root);
3064 		else
3065 			trans = btrfs_join_transaction(root);
3066 		if (IS_ERR(trans)) {
3067 			ret = PTR_ERR(trans);
3068 			trans = NULL;
3069 			goto out;
3070 		}
3071 		trans->block_rsv = &inode->block_rsv;
3072 		ret = btrfs_update_inode_fallback(trans, root, inode);
3073 		if (ret) /* -ENOMEM or corruption */
3074 			btrfs_abort_transaction(trans, ret);
3075 		goto out;
3076 	}
3077 
3078 	clear_bits |= EXTENT_LOCKED;
3079 	lock_extent(io_tree, start, end, &cached_state);
3080 
3081 	if (freespace_inode)
3082 		trans = btrfs_join_transaction_spacecache(root);
3083 	else
3084 		trans = btrfs_join_transaction(root);
3085 	if (IS_ERR(trans)) {
3086 		ret = PTR_ERR(trans);
3087 		trans = NULL;
3088 		goto out;
3089 	}
3090 
3091 	trans->block_rsv = &inode->block_rsv;
3092 
3093 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3094 		compress_type = ordered_extent->compress_type;
3095 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3096 		BUG_ON(compress_type);
3097 		ret = btrfs_mark_extent_written(trans, inode,
3098 						ordered_extent->file_offset,
3099 						ordered_extent->file_offset +
3100 						logical_len);
3101 		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3102 						  ordered_extent->disk_num_bytes);
3103 	} else {
3104 		BUG_ON(root == fs_info->tree_root);
3105 		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3106 		if (!ret) {
3107 			clear_reserved_extent = false;
3108 			btrfs_release_delalloc_bytes(fs_info,
3109 						ordered_extent->disk_bytenr,
3110 						ordered_extent->disk_num_bytes);
3111 		}
3112 	}
3113 	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3114 			   ordered_extent->num_bytes, trans->transid);
3115 	if (ret < 0) {
3116 		btrfs_abort_transaction(trans, ret);
3117 		goto out;
3118 	}
3119 
3120 	ret = add_pending_csums(trans, &ordered_extent->list);
3121 	if (ret) {
3122 		btrfs_abort_transaction(trans, ret);
3123 		goto out;
3124 	}
3125 
3126 	/*
3127 	 * If this is a new delalloc range, clear its new delalloc flag to
3128 	 * update the inode's number of bytes. This needs to be done first
3129 	 * before updating the inode item.
3130 	 */
3131 	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3132 	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3133 		clear_extent_bit(&inode->io_tree, start, end,
3134 				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3135 				 &cached_state);
3136 
3137 	btrfs_inode_safe_disk_i_size_write(inode, 0);
3138 	ret = btrfs_update_inode_fallback(trans, root, inode);
3139 	if (ret) { /* -ENOMEM or corruption */
3140 		btrfs_abort_transaction(trans, ret);
3141 		goto out;
3142 	}
3143 	ret = 0;
3144 out:
3145 	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3146 			 &cached_state);
3147 
3148 	if (trans)
3149 		btrfs_end_transaction(trans);
3150 
3151 	if (ret || truncated) {
3152 		u64 unwritten_start = start;
3153 
3154 		/*
3155 		 * If we failed to finish this ordered extent for any reason we
3156 		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3157 		 * extent, and mark the inode with the error if it wasn't
3158 		 * already set.  Any error during writeback would have already
3159 		 * set the mapping error, so we need to set it if we're the ones
3160 		 * marking this ordered extent as failed.
3161 		 */
3162 		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3163 					     &ordered_extent->flags))
3164 			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3165 
3166 		if (truncated)
3167 			unwritten_start += logical_len;
3168 		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3169 
3170 		/*
3171 		 * Drop extent maps for the part of the extent we didn't write.
3172 		 *
3173 		 * We have an exception here for the free_space_inode, this is
3174 		 * because when we do btrfs_get_extent() on the free space inode
3175 		 * we will search the commit root.  If this is a new block group
3176 		 * we won't find anything, and we will trip over the assert in
3177 		 * writepage where we do ASSERT(em->block_start !=
3178 		 * EXTENT_MAP_HOLE).
3179 		 *
3180 		 * Theoretically we could also skip this for any NOCOW extent as
3181 		 * we don't mess with the extent map tree in the NOCOW case, but
3182 		 * for now simply skip this if we are the free space inode.
3183 		 */
3184 		if (!btrfs_is_free_space_inode(inode))
3185 			btrfs_drop_extent_map_range(inode, unwritten_start,
3186 						    end, false);
3187 
3188 		/*
3189 		 * If the ordered extent had an IOERR or something else went
3190 		 * wrong we need to return the space for this ordered extent
3191 		 * back to the allocator.  We only free the extent in the
3192 		 * truncated case if we didn't write out the extent at all.
3193 		 *
3194 		 * If we made it past insert_reserved_file_extent before we
3195 		 * errored out then we don't need to do this as the accounting
3196 		 * has already been done.
3197 		 */
3198 		if ((ret || !logical_len) &&
3199 		    clear_reserved_extent &&
3200 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3201 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3202 			/*
3203 			 * Discard the range before returning it back to the
3204 			 * free space pool
3205 			 */
3206 			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3207 				btrfs_discard_extent(fs_info,
3208 						ordered_extent->disk_bytenr,
3209 						ordered_extent->disk_num_bytes,
3210 						NULL);
3211 			btrfs_free_reserved_extent(fs_info,
3212 					ordered_extent->disk_bytenr,
3213 					ordered_extent->disk_num_bytes, 1);
3214 			/*
3215 			 * Actually free the qgroup rsv which was released when
3216 			 * the ordered extent was created.
3217 			 */
3218 			btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
3219 						  ordered_extent->qgroup_rsv,
3220 						  BTRFS_QGROUP_RSV_DATA);
3221 		}
3222 	}
3223 
3224 	/*
3225 	 * This needs to be done to make sure anybody waiting knows we are done
3226 	 * updating everything for this ordered extent.
3227 	 */
3228 	btrfs_remove_ordered_extent(inode, ordered_extent);
3229 
3230 	/* once for us */
3231 	btrfs_put_ordered_extent(ordered_extent);
3232 	/* once for the tree */
3233 	btrfs_put_ordered_extent(ordered_extent);
3234 
3235 	return ret;
3236 }
3237 
3238 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3239 {
3240 	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
3241 	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
3242 		btrfs_finish_ordered_zoned(ordered);
3243 	return btrfs_finish_one_ordered(ordered);
3244 }
3245 
3246 /*
3247  * Verify the checksum for a single sector without any extra action that depend
3248  * on the type of I/O.
3249  */
3250 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3251 			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
3252 {
3253 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3254 	char *kaddr;
3255 
3256 	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3257 
3258 	shash->tfm = fs_info->csum_shash;
3259 
3260 	kaddr = kmap_local_page(page) + pgoff;
3261 	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3262 	kunmap_local(kaddr);
3263 
3264 	if (memcmp(csum, csum_expected, fs_info->csum_size))
3265 		return -EIO;
3266 	return 0;
3267 }
3268 
3269 /*
3270  * Verify the checksum of a single data sector.
3271  *
3272  * @bbio:	btrfs_io_bio which contains the csum
3273  * @dev:	device the sector is on
3274  * @bio_offset:	offset to the beginning of the bio (in bytes)
3275  * @bv:		bio_vec to check
3276  *
3277  * Check if the checksum on a data block is valid.  When a checksum mismatch is
3278  * detected, report the error and fill the corrupted range with zero.
3279  *
3280  * Return %true if the sector is ok or had no checksum to start with, else %false.
3281  */
3282 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3283 			u32 bio_offset, struct bio_vec *bv)
3284 {
3285 	struct btrfs_inode *inode = bbio->inode;
3286 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3287 	u64 file_offset = bbio->file_offset + bio_offset;
3288 	u64 end = file_offset + bv->bv_len - 1;
3289 	u8 *csum_expected;
3290 	u8 csum[BTRFS_CSUM_SIZE];
3291 
3292 	ASSERT(bv->bv_len == fs_info->sectorsize);
3293 
3294 	if (!bbio->csum)
3295 		return true;
3296 
3297 	if (btrfs_is_data_reloc_root(inode->root) &&
3298 	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3299 			   1, NULL)) {
3300 		/* Skip the range without csum for data reloc inode */
3301 		clear_extent_bits(&inode->io_tree, file_offset, end,
3302 				  EXTENT_NODATASUM);
3303 		return true;
3304 	}
3305 
3306 	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3307 				fs_info->csum_size;
3308 	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3309 				    csum_expected))
3310 		goto zeroit;
3311 	return true;
3312 
3313 zeroit:
3314 	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3315 				    bbio->mirror_num);
3316 	if (dev)
3317 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3318 	memzero_bvec(bv);
3319 	return false;
3320 }
3321 
3322 /*
3323  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3324  *
3325  * @inode: The inode we want to perform iput on
3326  *
3327  * This function uses the generic vfs_inode::i_count to track whether we should
3328  * just decrement it (in case it's > 1) or if this is the last iput then link
3329  * the inode to the delayed iput machinery. Delayed iputs are processed at
3330  * transaction commit time/superblock commit/cleaner kthread.
3331  */
3332 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3333 {
3334 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3335 	unsigned long flags;
3336 
3337 	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3338 		return;
3339 
3340 	atomic_inc(&fs_info->nr_delayed_iputs);
3341 	/*
3342 	 * Need to be irq safe here because we can be called from either an irq
3343 	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3344 	 * context.
3345 	 */
3346 	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3347 	ASSERT(list_empty(&inode->delayed_iput));
3348 	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3349 	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3350 	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3351 		wake_up_process(fs_info->cleaner_kthread);
3352 }
3353 
3354 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3355 				    struct btrfs_inode *inode)
3356 {
3357 	list_del_init(&inode->delayed_iput);
3358 	spin_unlock_irq(&fs_info->delayed_iput_lock);
3359 	iput(&inode->vfs_inode);
3360 	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3361 		wake_up(&fs_info->delayed_iputs_wait);
3362 	spin_lock_irq(&fs_info->delayed_iput_lock);
3363 }
3364 
3365 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3366 				   struct btrfs_inode *inode)
3367 {
3368 	if (!list_empty(&inode->delayed_iput)) {
3369 		spin_lock_irq(&fs_info->delayed_iput_lock);
3370 		if (!list_empty(&inode->delayed_iput))
3371 			run_delayed_iput_locked(fs_info, inode);
3372 		spin_unlock_irq(&fs_info->delayed_iput_lock);
3373 	}
3374 }
3375 
3376 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3377 {
3378 	/*
3379 	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3380 	 * calls btrfs_add_delayed_iput() and that needs to lock
3381 	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3382 	 * prevent a deadlock.
3383 	 */
3384 	spin_lock_irq(&fs_info->delayed_iput_lock);
3385 	while (!list_empty(&fs_info->delayed_iputs)) {
3386 		struct btrfs_inode *inode;
3387 
3388 		inode = list_first_entry(&fs_info->delayed_iputs,
3389 				struct btrfs_inode, delayed_iput);
3390 		run_delayed_iput_locked(fs_info, inode);
3391 		if (need_resched()) {
3392 			spin_unlock_irq(&fs_info->delayed_iput_lock);
3393 			cond_resched();
3394 			spin_lock_irq(&fs_info->delayed_iput_lock);
3395 		}
3396 	}
3397 	spin_unlock_irq(&fs_info->delayed_iput_lock);
3398 }
3399 
3400 /*
3401  * Wait for flushing all delayed iputs
3402  *
3403  * @fs_info:  the filesystem
3404  *
3405  * This will wait on any delayed iputs that are currently running with KILLABLE
3406  * set.  Once they are all done running we will return, unless we are killed in
3407  * which case we return EINTR. This helps in user operations like fallocate etc
3408  * that might get blocked on the iputs.
3409  *
3410  * Return EINTR if we were killed, 0 if nothing's pending
3411  */
3412 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3413 {
3414 	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3415 			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3416 	if (ret)
3417 		return -EINTR;
3418 	return 0;
3419 }
3420 
3421 /*
3422  * This creates an orphan entry for the given inode in case something goes wrong
3423  * in the middle of an unlink.
3424  */
3425 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3426 		     struct btrfs_inode *inode)
3427 {
3428 	int ret;
3429 
3430 	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3431 	if (ret && ret != -EEXIST) {
3432 		btrfs_abort_transaction(trans, ret);
3433 		return ret;
3434 	}
3435 
3436 	return 0;
3437 }
3438 
3439 /*
3440  * We have done the delete so we can go ahead and remove the orphan item for
3441  * this particular inode.
3442  */
3443 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3444 			    struct btrfs_inode *inode)
3445 {
3446 	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3447 }
3448 
3449 /*
3450  * this cleans up any orphans that may be left on the list from the last use
3451  * of this root.
3452  */
3453 int btrfs_orphan_cleanup(struct btrfs_root *root)
3454 {
3455 	struct btrfs_fs_info *fs_info = root->fs_info;
3456 	struct btrfs_path *path;
3457 	struct extent_buffer *leaf;
3458 	struct btrfs_key key, found_key;
3459 	struct btrfs_trans_handle *trans;
3460 	struct inode *inode;
3461 	u64 last_objectid = 0;
3462 	int ret = 0, nr_unlink = 0;
3463 
3464 	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3465 		return 0;
3466 
3467 	path = btrfs_alloc_path();
3468 	if (!path) {
3469 		ret = -ENOMEM;
3470 		goto out;
3471 	}
3472 	path->reada = READA_BACK;
3473 
3474 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3475 	key.type = BTRFS_ORPHAN_ITEM_KEY;
3476 	key.offset = (u64)-1;
3477 
3478 	while (1) {
3479 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3480 		if (ret < 0)
3481 			goto out;
3482 
3483 		/*
3484 		 * if ret == 0 means we found what we were searching for, which
3485 		 * is weird, but possible, so only screw with path if we didn't
3486 		 * find the key and see if we have stuff that matches
3487 		 */
3488 		if (ret > 0) {
3489 			ret = 0;
3490 			if (path->slots[0] == 0)
3491 				break;
3492 			path->slots[0]--;
3493 		}
3494 
3495 		/* pull out the item */
3496 		leaf = path->nodes[0];
3497 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3498 
3499 		/* make sure the item matches what we want */
3500 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3501 			break;
3502 		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3503 			break;
3504 
3505 		/* release the path since we're done with it */
3506 		btrfs_release_path(path);
3507 
3508 		/*
3509 		 * this is where we are basically btrfs_lookup, without the
3510 		 * crossing root thing.  we store the inode number in the
3511 		 * offset of the orphan item.
3512 		 */
3513 
3514 		if (found_key.offset == last_objectid) {
3515 			/*
3516 			 * We found the same inode as before. This means we were
3517 			 * not able to remove its items via eviction triggered
3518 			 * by an iput(). A transaction abort may have happened,
3519 			 * due to -ENOSPC for example, so try to grab the error
3520 			 * that lead to a transaction abort, if any.
3521 			 */
3522 			btrfs_err(fs_info,
3523 				  "Error removing orphan entry, stopping orphan cleanup");
3524 			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3525 			goto out;
3526 		}
3527 
3528 		last_objectid = found_key.offset;
3529 
3530 		found_key.objectid = found_key.offset;
3531 		found_key.type = BTRFS_INODE_ITEM_KEY;
3532 		found_key.offset = 0;
3533 		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3534 		if (IS_ERR(inode)) {
3535 			ret = PTR_ERR(inode);
3536 			inode = NULL;
3537 			if (ret != -ENOENT)
3538 				goto out;
3539 		}
3540 
3541 		if (!inode && root == fs_info->tree_root) {
3542 			struct btrfs_root *dead_root;
3543 			int is_dead_root = 0;
3544 
3545 			/*
3546 			 * This is an orphan in the tree root. Currently these
3547 			 * could come from 2 sources:
3548 			 *  a) a root (snapshot/subvolume) deletion in progress
3549 			 *  b) a free space cache inode
3550 			 * We need to distinguish those two, as the orphan item
3551 			 * for a root must not get deleted before the deletion
3552 			 * of the snapshot/subvolume's tree completes.
3553 			 *
3554 			 * btrfs_find_orphan_roots() ran before us, which has
3555 			 * found all deleted roots and loaded them into
3556 			 * fs_info->fs_roots_radix. So here we can find if an
3557 			 * orphan item corresponds to a deleted root by looking
3558 			 * up the root from that radix tree.
3559 			 */
3560 
3561 			spin_lock(&fs_info->fs_roots_radix_lock);
3562 			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3563 							 (unsigned long)found_key.objectid);
3564 			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3565 				is_dead_root = 1;
3566 			spin_unlock(&fs_info->fs_roots_radix_lock);
3567 
3568 			if (is_dead_root) {
3569 				/* prevent this orphan from being found again */
3570 				key.offset = found_key.objectid - 1;
3571 				continue;
3572 			}
3573 
3574 		}
3575 
3576 		/*
3577 		 * If we have an inode with links, there are a couple of
3578 		 * possibilities:
3579 		 *
3580 		 * 1. We were halfway through creating fsverity metadata for the
3581 		 * file. In that case, the orphan item represents incomplete
3582 		 * fsverity metadata which must be cleaned up with
3583 		 * btrfs_drop_verity_items and deleting the orphan item.
3584 
3585 		 * 2. Old kernels (before v3.12) used to create an
3586 		 * orphan item for truncate indicating that there were possibly
3587 		 * extent items past i_size that needed to be deleted. In v3.12,
3588 		 * truncate was changed to update i_size in sync with the extent
3589 		 * items, but the (useless) orphan item was still created. Since
3590 		 * v4.18, we don't create the orphan item for truncate at all.
3591 		 *
3592 		 * So, this item could mean that we need to do a truncate, but
3593 		 * only if this filesystem was last used on a pre-v3.12 kernel
3594 		 * and was not cleanly unmounted. The odds of that are quite
3595 		 * slim, and it's a pain to do the truncate now, so just delete
3596 		 * the orphan item.
3597 		 *
3598 		 * It's also possible that this orphan item was supposed to be
3599 		 * deleted but wasn't. The inode number may have been reused,
3600 		 * but either way, we can delete the orphan item.
3601 		 */
3602 		if (!inode || inode->i_nlink) {
3603 			if (inode) {
3604 				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3605 				iput(inode);
3606 				inode = NULL;
3607 				if (ret)
3608 					goto out;
3609 			}
3610 			trans = btrfs_start_transaction(root, 1);
3611 			if (IS_ERR(trans)) {
3612 				ret = PTR_ERR(trans);
3613 				goto out;
3614 			}
3615 			btrfs_debug(fs_info, "auto deleting %Lu",
3616 				    found_key.objectid);
3617 			ret = btrfs_del_orphan_item(trans, root,
3618 						    found_key.objectid);
3619 			btrfs_end_transaction(trans);
3620 			if (ret)
3621 				goto out;
3622 			continue;
3623 		}
3624 
3625 		nr_unlink++;
3626 
3627 		/* this will do delete_inode and everything for us */
3628 		iput(inode);
3629 	}
3630 	/* release the path since we're done with it */
3631 	btrfs_release_path(path);
3632 
3633 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3634 		trans = btrfs_join_transaction(root);
3635 		if (!IS_ERR(trans))
3636 			btrfs_end_transaction(trans);
3637 	}
3638 
3639 	if (nr_unlink)
3640 		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3641 
3642 out:
3643 	if (ret)
3644 		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3645 	btrfs_free_path(path);
3646 	return ret;
3647 }
3648 
3649 /*
3650  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3651  * don't find any xattrs, we know there can't be any acls.
3652  *
3653  * slot is the slot the inode is in, objectid is the objectid of the inode
3654  */
3655 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3656 					  int slot, u64 objectid,
3657 					  int *first_xattr_slot)
3658 {
3659 	u32 nritems = btrfs_header_nritems(leaf);
3660 	struct btrfs_key found_key;
3661 	static u64 xattr_access = 0;
3662 	static u64 xattr_default = 0;
3663 	int scanned = 0;
3664 
3665 	if (!xattr_access) {
3666 		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3667 					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3668 		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3669 					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3670 	}
3671 
3672 	slot++;
3673 	*first_xattr_slot = -1;
3674 	while (slot < nritems) {
3675 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3676 
3677 		/* we found a different objectid, there must not be acls */
3678 		if (found_key.objectid != objectid)
3679 			return 0;
3680 
3681 		/* we found an xattr, assume we've got an acl */
3682 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3683 			if (*first_xattr_slot == -1)
3684 				*first_xattr_slot = slot;
3685 			if (found_key.offset == xattr_access ||
3686 			    found_key.offset == xattr_default)
3687 				return 1;
3688 		}
3689 
3690 		/*
3691 		 * we found a key greater than an xattr key, there can't
3692 		 * be any acls later on
3693 		 */
3694 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3695 			return 0;
3696 
3697 		slot++;
3698 		scanned++;
3699 
3700 		/*
3701 		 * it goes inode, inode backrefs, xattrs, extents,
3702 		 * so if there are a ton of hard links to an inode there can
3703 		 * be a lot of backrefs.  Don't waste time searching too hard,
3704 		 * this is just an optimization
3705 		 */
3706 		if (scanned >= 8)
3707 			break;
3708 	}
3709 	/* we hit the end of the leaf before we found an xattr or
3710 	 * something larger than an xattr.  We have to assume the inode
3711 	 * has acls
3712 	 */
3713 	if (*first_xattr_slot == -1)
3714 		*first_xattr_slot = slot;
3715 	return 1;
3716 }
3717 
3718 /*
3719  * read an inode from the btree into the in-memory inode
3720  */
3721 static int btrfs_read_locked_inode(struct inode *inode,
3722 				   struct btrfs_path *in_path)
3723 {
3724 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3725 	struct btrfs_path *path = in_path;
3726 	struct extent_buffer *leaf;
3727 	struct btrfs_inode_item *inode_item;
3728 	struct btrfs_root *root = BTRFS_I(inode)->root;
3729 	struct btrfs_key location;
3730 	unsigned long ptr;
3731 	int maybe_acls;
3732 	u32 rdev;
3733 	int ret;
3734 	bool filled = false;
3735 	int first_xattr_slot;
3736 
3737 	ret = btrfs_fill_inode(inode, &rdev);
3738 	if (!ret)
3739 		filled = true;
3740 
3741 	if (!path) {
3742 		path = btrfs_alloc_path();
3743 		if (!path)
3744 			return -ENOMEM;
3745 	}
3746 
3747 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3748 
3749 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3750 	if (ret) {
3751 		if (path != in_path)
3752 			btrfs_free_path(path);
3753 		return ret;
3754 	}
3755 
3756 	leaf = path->nodes[0];
3757 
3758 	if (filled)
3759 		goto cache_index;
3760 
3761 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3762 				    struct btrfs_inode_item);
3763 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3764 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3765 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3766 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3767 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3768 	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3769 			round_up(i_size_read(inode), fs_info->sectorsize));
3770 
3771 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3772 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3773 
3774 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3775 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3776 
3777 	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3778 			btrfs_timespec_nsec(leaf, &inode_item->ctime));
3779 
3780 	BTRFS_I(inode)->i_otime.tv_sec =
3781 		btrfs_timespec_sec(leaf, &inode_item->otime);
3782 	BTRFS_I(inode)->i_otime.tv_nsec =
3783 		btrfs_timespec_nsec(leaf, &inode_item->otime);
3784 
3785 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3786 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3787 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3788 
3789 	inode_set_iversion_queried(inode,
3790 				   btrfs_inode_sequence(leaf, inode_item));
3791 	inode->i_generation = BTRFS_I(inode)->generation;
3792 	inode->i_rdev = 0;
3793 	rdev = btrfs_inode_rdev(leaf, inode_item);
3794 
3795 	BTRFS_I(inode)->index_cnt = (u64)-1;
3796 	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3797 				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3798 
3799 cache_index:
3800 	/*
3801 	 * If we were modified in the current generation and evicted from memory
3802 	 * and then re-read we need to do a full sync since we don't have any
3803 	 * idea about which extents were modified before we were evicted from
3804 	 * cache.
3805 	 *
3806 	 * This is required for both inode re-read from disk and delayed inode
3807 	 * in delayed_nodes_tree.
3808 	 */
3809 	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3810 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3811 			&BTRFS_I(inode)->runtime_flags);
3812 
3813 	/*
3814 	 * We don't persist the id of the transaction where an unlink operation
3815 	 * against the inode was last made. So here we assume the inode might
3816 	 * have been evicted, and therefore the exact value of last_unlink_trans
3817 	 * lost, and set it to last_trans to avoid metadata inconsistencies
3818 	 * between the inode and its parent if the inode is fsync'ed and the log
3819 	 * replayed. For example, in the scenario:
3820 	 *
3821 	 * touch mydir/foo
3822 	 * ln mydir/foo mydir/bar
3823 	 * sync
3824 	 * unlink mydir/bar
3825 	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3826 	 * xfs_io -c fsync mydir/foo
3827 	 * <power failure>
3828 	 * mount fs, triggers fsync log replay
3829 	 *
3830 	 * We must make sure that when we fsync our inode foo we also log its
3831 	 * parent inode, otherwise after log replay the parent still has the
3832 	 * dentry with the "bar" name but our inode foo has a link count of 1
3833 	 * and doesn't have an inode ref with the name "bar" anymore.
3834 	 *
3835 	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3836 	 * but it guarantees correctness at the expense of occasional full
3837 	 * transaction commits on fsync if our inode is a directory, or if our
3838 	 * inode is not a directory, logging its parent unnecessarily.
3839 	 */
3840 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3841 
3842 	/*
3843 	 * Same logic as for last_unlink_trans. We don't persist the generation
3844 	 * of the last transaction where this inode was used for a reflink
3845 	 * operation, so after eviction and reloading the inode we must be
3846 	 * pessimistic and assume the last transaction that modified the inode.
3847 	 */
3848 	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3849 
3850 	path->slots[0]++;
3851 	if (inode->i_nlink != 1 ||
3852 	    path->slots[0] >= btrfs_header_nritems(leaf))
3853 		goto cache_acl;
3854 
3855 	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3856 	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3857 		goto cache_acl;
3858 
3859 	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3860 	if (location.type == BTRFS_INODE_REF_KEY) {
3861 		struct btrfs_inode_ref *ref;
3862 
3863 		ref = (struct btrfs_inode_ref *)ptr;
3864 		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3865 	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3866 		struct btrfs_inode_extref *extref;
3867 
3868 		extref = (struct btrfs_inode_extref *)ptr;
3869 		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3870 								     extref);
3871 	}
3872 cache_acl:
3873 	/*
3874 	 * try to precache a NULL acl entry for files that don't have
3875 	 * any xattrs or acls
3876 	 */
3877 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3878 			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3879 	if (first_xattr_slot != -1) {
3880 		path->slots[0] = first_xattr_slot;
3881 		ret = btrfs_load_inode_props(inode, path);
3882 		if (ret)
3883 			btrfs_err(fs_info,
3884 				  "error loading props for ino %llu (root %llu): %d",
3885 				  btrfs_ino(BTRFS_I(inode)),
3886 				  root->root_key.objectid, ret);
3887 	}
3888 	if (path != in_path)
3889 		btrfs_free_path(path);
3890 
3891 	if (!maybe_acls)
3892 		cache_no_acl(inode);
3893 
3894 	switch (inode->i_mode & S_IFMT) {
3895 	case S_IFREG:
3896 		inode->i_mapping->a_ops = &btrfs_aops;
3897 		inode->i_fop = &btrfs_file_operations;
3898 		inode->i_op = &btrfs_file_inode_operations;
3899 		break;
3900 	case S_IFDIR:
3901 		inode->i_fop = &btrfs_dir_file_operations;
3902 		inode->i_op = &btrfs_dir_inode_operations;
3903 		break;
3904 	case S_IFLNK:
3905 		inode->i_op = &btrfs_symlink_inode_operations;
3906 		inode_nohighmem(inode);
3907 		inode->i_mapping->a_ops = &btrfs_aops;
3908 		break;
3909 	default:
3910 		inode->i_op = &btrfs_special_inode_operations;
3911 		init_special_inode(inode, inode->i_mode, rdev);
3912 		break;
3913 	}
3914 
3915 	btrfs_sync_inode_flags_to_i_flags(inode);
3916 	return 0;
3917 }
3918 
3919 /*
3920  * given a leaf and an inode, copy the inode fields into the leaf
3921  */
3922 static void fill_inode_item(struct btrfs_trans_handle *trans,
3923 			    struct extent_buffer *leaf,
3924 			    struct btrfs_inode_item *item,
3925 			    struct inode *inode)
3926 {
3927 	struct btrfs_map_token token;
3928 	u64 flags;
3929 
3930 	btrfs_init_map_token(&token, leaf);
3931 
3932 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3933 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3934 	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3935 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3936 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3937 
3938 	btrfs_set_token_timespec_sec(&token, &item->atime,
3939 				     inode->i_atime.tv_sec);
3940 	btrfs_set_token_timespec_nsec(&token, &item->atime,
3941 				      inode->i_atime.tv_nsec);
3942 
3943 	btrfs_set_token_timespec_sec(&token, &item->mtime,
3944 				     inode->i_mtime.tv_sec);
3945 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3946 				      inode->i_mtime.tv_nsec);
3947 
3948 	btrfs_set_token_timespec_sec(&token, &item->ctime,
3949 				     inode_get_ctime(inode).tv_sec);
3950 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3951 				      inode_get_ctime(inode).tv_nsec);
3952 
3953 	btrfs_set_token_timespec_sec(&token, &item->otime,
3954 				     BTRFS_I(inode)->i_otime.tv_sec);
3955 	btrfs_set_token_timespec_nsec(&token, &item->otime,
3956 				      BTRFS_I(inode)->i_otime.tv_nsec);
3957 
3958 	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3959 	btrfs_set_token_inode_generation(&token, item,
3960 					 BTRFS_I(inode)->generation);
3961 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3962 	btrfs_set_token_inode_transid(&token, item, trans->transid);
3963 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3964 	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3965 					  BTRFS_I(inode)->ro_flags);
3966 	btrfs_set_token_inode_flags(&token, item, flags);
3967 	btrfs_set_token_inode_block_group(&token, item, 0);
3968 }
3969 
3970 /*
3971  * copy everything in the in-memory inode into the btree.
3972  */
3973 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3974 				struct btrfs_root *root,
3975 				struct btrfs_inode *inode)
3976 {
3977 	struct btrfs_inode_item *inode_item;
3978 	struct btrfs_path *path;
3979 	struct extent_buffer *leaf;
3980 	int ret;
3981 
3982 	path = btrfs_alloc_path();
3983 	if (!path)
3984 		return -ENOMEM;
3985 
3986 	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3987 	if (ret) {
3988 		if (ret > 0)
3989 			ret = -ENOENT;
3990 		goto failed;
3991 	}
3992 
3993 	leaf = path->nodes[0];
3994 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3995 				    struct btrfs_inode_item);
3996 
3997 	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
3998 	btrfs_mark_buffer_dirty(trans, leaf);
3999 	btrfs_set_inode_last_trans(trans, inode);
4000 	ret = 0;
4001 failed:
4002 	btrfs_free_path(path);
4003 	return ret;
4004 }
4005 
4006 /*
4007  * copy everything in the in-memory inode into the btree.
4008  */
4009 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4010 				struct btrfs_root *root,
4011 				struct btrfs_inode *inode)
4012 {
4013 	struct btrfs_fs_info *fs_info = root->fs_info;
4014 	int ret;
4015 
4016 	/*
4017 	 * If the inode is a free space inode, we can deadlock during commit
4018 	 * if we put it into the delayed code.
4019 	 *
4020 	 * The data relocation inode should also be directly updated
4021 	 * without delay
4022 	 */
4023 	if (!btrfs_is_free_space_inode(inode)
4024 	    && !btrfs_is_data_reloc_root(root)
4025 	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4026 		btrfs_update_root_times(trans, root);
4027 
4028 		ret = btrfs_delayed_update_inode(trans, root, inode);
4029 		if (!ret)
4030 			btrfs_set_inode_last_trans(trans, inode);
4031 		return ret;
4032 	}
4033 
4034 	return btrfs_update_inode_item(trans, root, inode);
4035 }
4036 
4037 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4038 				struct btrfs_root *root, struct btrfs_inode *inode)
4039 {
4040 	int ret;
4041 
4042 	ret = btrfs_update_inode(trans, root, inode);
4043 	if (ret == -ENOSPC)
4044 		return btrfs_update_inode_item(trans, root, inode);
4045 	return ret;
4046 }
4047 
4048 /*
4049  * unlink helper that gets used here in inode.c and in the tree logging
4050  * recovery code.  It remove a link in a directory with a given name, and
4051  * also drops the back refs in the inode to the directory
4052  */
4053 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4054 				struct btrfs_inode *dir,
4055 				struct btrfs_inode *inode,
4056 				const struct fscrypt_str *name,
4057 				struct btrfs_rename_ctx *rename_ctx)
4058 {
4059 	struct btrfs_root *root = dir->root;
4060 	struct btrfs_fs_info *fs_info = root->fs_info;
4061 	struct btrfs_path *path;
4062 	int ret = 0;
4063 	struct btrfs_dir_item *di;
4064 	u64 index;
4065 	u64 ino = btrfs_ino(inode);
4066 	u64 dir_ino = btrfs_ino(dir);
4067 
4068 	path = btrfs_alloc_path();
4069 	if (!path) {
4070 		ret = -ENOMEM;
4071 		goto out;
4072 	}
4073 
4074 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4075 	if (IS_ERR_OR_NULL(di)) {
4076 		ret = di ? PTR_ERR(di) : -ENOENT;
4077 		goto err;
4078 	}
4079 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4080 	if (ret)
4081 		goto err;
4082 	btrfs_release_path(path);
4083 
4084 	/*
4085 	 * If we don't have dir index, we have to get it by looking up
4086 	 * the inode ref, since we get the inode ref, remove it directly,
4087 	 * it is unnecessary to do delayed deletion.
4088 	 *
4089 	 * But if we have dir index, needn't search inode ref to get it.
4090 	 * Since the inode ref is close to the inode item, it is better
4091 	 * that we delay to delete it, and just do this deletion when
4092 	 * we update the inode item.
4093 	 */
4094 	if (inode->dir_index) {
4095 		ret = btrfs_delayed_delete_inode_ref(inode);
4096 		if (!ret) {
4097 			index = inode->dir_index;
4098 			goto skip_backref;
4099 		}
4100 	}
4101 
4102 	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4103 	if (ret) {
4104 		btrfs_info(fs_info,
4105 			"failed to delete reference to %.*s, inode %llu parent %llu",
4106 			name->len, name->name, ino, dir_ino);
4107 		btrfs_abort_transaction(trans, ret);
4108 		goto err;
4109 	}
4110 skip_backref:
4111 	if (rename_ctx)
4112 		rename_ctx->index = index;
4113 
4114 	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4115 	if (ret) {
4116 		btrfs_abort_transaction(trans, ret);
4117 		goto err;
4118 	}
4119 
4120 	/*
4121 	 * If we are in a rename context, we don't need to update anything in the
4122 	 * log. That will be done later during the rename by btrfs_log_new_name().
4123 	 * Besides that, doing it here would only cause extra unnecessary btree
4124 	 * operations on the log tree, increasing latency for applications.
4125 	 */
4126 	if (!rename_ctx) {
4127 		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4128 		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4129 	}
4130 
4131 	/*
4132 	 * If we have a pending delayed iput we could end up with the final iput
4133 	 * being run in btrfs-cleaner context.  If we have enough of these built
4134 	 * up we can end up burning a lot of time in btrfs-cleaner without any
4135 	 * way to throttle the unlinks.  Since we're currently holding a ref on
4136 	 * the inode we can run the delayed iput here without any issues as the
4137 	 * final iput won't be done until after we drop the ref we're currently
4138 	 * holding.
4139 	 */
4140 	btrfs_run_delayed_iput(fs_info, inode);
4141 err:
4142 	btrfs_free_path(path);
4143 	if (ret)
4144 		goto out;
4145 
4146 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4147 	inode_inc_iversion(&inode->vfs_inode);
4148 	inode_inc_iversion(&dir->vfs_inode);
4149 	inode_set_ctime_current(&inode->vfs_inode);
4150 	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4151 	ret = btrfs_update_inode(trans, root, dir);
4152 out:
4153 	return ret;
4154 }
4155 
4156 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4157 		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4158 		       const struct fscrypt_str *name)
4159 {
4160 	int ret;
4161 
4162 	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4163 	if (!ret) {
4164 		drop_nlink(&inode->vfs_inode);
4165 		ret = btrfs_update_inode(trans, inode->root, inode);
4166 	}
4167 	return ret;
4168 }
4169 
4170 /*
4171  * helper to start transaction for unlink and rmdir.
4172  *
4173  * unlink and rmdir are special in btrfs, they do not always free space, so
4174  * if we cannot make our reservations the normal way try and see if there is
4175  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4176  * allow the unlink to occur.
4177  */
4178 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4179 {
4180 	struct btrfs_root *root = dir->root;
4181 
4182 	return btrfs_start_transaction_fallback_global_rsv(root,
4183 						   BTRFS_UNLINK_METADATA_UNITS);
4184 }
4185 
4186 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4187 {
4188 	struct btrfs_trans_handle *trans;
4189 	struct inode *inode = d_inode(dentry);
4190 	int ret;
4191 	struct fscrypt_name fname;
4192 
4193 	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4194 	if (ret)
4195 		return ret;
4196 
4197 	/* This needs to handle no-key deletions later on */
4198 
4199 	trans = __unlink_start_trans(BTRFS_I(dir));
4200 	if (IS_ERR(trans)) {
4201 		ret = PTR_ERR(trans);
4202 		goto fscrypt_free;
4203 	}
4204 
4205 	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4206 				false);
4207 
4208 	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4209 				 &fname.disk_name);
4210 	if (ret)
4211 		goto end_trans;
4212 
4213 	if (inode->i_nlink == 0) {
4214 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4215 		if (ret)
4216 			goto end_trans;
4217 	}
4218 
4219 end_trans:
4220 	btrfs_end_transaction(trans);
4221 	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4222 fscrypt_free:
4223 	fscrypt_free_filename(&fname);
4224 	return ret;
4225 }
4226 
4227 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4228 			       struct btrfs_inode *dir, struct dentry *dentry)
4229 {
4230 	struct btrfs_root *root = dir->root;
4231 	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4232 	struct btrfs_path *path;
4233 	struct extent_buffer *leaf;
4234 	struct btrfs_dir_item *di;
4235 	struct btrfs_key key;
4236 	u64 index;
4237 	int ret;
4238 	u64 objectid;
4239 	u64 dir_ino = btrfs_ino(dir);
4240 	struct fscrypt_name fname;
4241 
4242 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4243 	if (ret)
4244 		return ret;
4245 
4246 	/* This needs to handle no-key deletions later on */
4247 
4248 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4249 		objectid = inode->root->root_key.objectid;
4250 	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4251 		objectid = inode->location.objectid;
4252 	} else {
4253 		WARN_ON(1);
4254 		fscrypt_free_filename(&fname);
4255 		return -EINVAL;
4256 	}
4257 
4258 	path = btrfs_alloc_path();
4259 	if (!path) {
4260 		ret = -ENOMEM;
4261 		goto out;
4262 	}
4263 
4264 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4265 				   &fname.disk_name, -1);
4266 	if (IS_ERR_OR_NULL(di)) {
4267 		ret = di ? PTR_ERR(di) : -ENOENT;
4268 		goto out;
4269 	}
4270 
4271 	leaf = path->nodes[0];
4272 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4273 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4274 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4275 	if (ret) {
4276 		btrfs_abort_transaction(trans, ret);
4277 		goto out;
4278 	}
4279 	btrfs_release_path(path);
4280 
4281 	/*
4282 	 * This is a placeholder inode for a subvolume we didn't have a
4283 	 * reference to at the time of the snapshot creation.  In the meantime
4284 	 * we could have renamed the real subvol link into our snapshot, so
4285 	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4286 	 * Instead simply lookup the dir_index_item for this entry so we can
4287 	 * remove it.  Otherwise we know we have a ref to the root and we can
4288 	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4289 	 */
4290 	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4291 		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4292 		if (IS_ERR_OR_NULL(di)) {
4293 			if (!di)
4294 				ret = -ENOENT;
4295 			else
4296 				ret = PTR_ERR(di);
4297 			btrfs_abort_transaction(trans, ret);
4298 			goto out;
4299 		}
4300 
4301 		leaf = path->nodes[0];
4302 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4303 		index = key.offset;
4304 		btrfs_release_path(path);
4305 	} else {
4306 		ret = btrfs_del_root_ref(trans, objectid,
4307 					 root->root_key.objectid, dir_ino,
4308 					 &index, &fname.disk_name);
4309 		if (ret) {
4310 			btrfs_abort_transaction(trans, ret);
4311 			goto out;
4312 		}
4313 	}
4314 
4315 	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4316 	if (ret) {
4317 		btrfs_abort_transaction(trans, ret);
4318 		goto out;
4319 	}
4320 
4321 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4322 	inode_inc_iversion(&dir->vfs_inode);
4323 	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4324 	ret = btrfs_update_inode_fallback(trans, root, dir);
4325 	if (ret)
4326 		btrfs_abort_transaction(trans, ret);
4327 out:
4328 	btrfs_free_path(path);
4329 	fscrypt_free_filename(&fname);
4330 	return ret;
4331 }
4332 
4333 /*
4334  * Helper to check if the subvolume references other subvolumes or if it's
4335  * default.
4336  */
4337 static noinline int may_destroy_subvol(struct btrfs_root *root)
4338 {
4339 	struct btrfs_fs_info *fs_info = root->fs_info;
4340 	struct btrfs_path *path;
4341 	struct btrfs_dir_item *di;
4342 	struct btrfs_key key;
4343 	struct fscrypt_str name = FSTR_INIT("default", 7);
4344 	u64 dir_id;
4345 	int ret;
4346 
4347 	path = btrfs_alloc_path();
4348 	if (!path)
4349 		return -ENOMEM;
4350 
4351 	/* Make sure this root isn't set as the default subvol */
4352 	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4353 	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4354 				   dir_id, &name, 0);
4355 	if (di && !IS_ERR(di)) {
4356 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4357 		if (key.objectid == root->root_key.objectid) {
4358 			ret = -EPERM;
4359 			btrfs_err(fs_info,
4360 				  "deleting default subvolume %llu is not allowed",
4361 				  key.objectid);
4362 			goto out;
4363 		}
4364 		btrfs_release_path(path);
4365 	}
4366 
4367 	key.objectid = root->root_key.objectid;
4368 	key.type = BTRFS_ROOT_REF_KEY;
4369 	key.offset = (u64)-1;
4370 
4371 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4372 	if (ret < 0)
4373 		goto out;
4374 	BUG_ON(ret == 0);
4375 
4376 	ret = 0;
4377 	if (path->slots[0] > 0) {
4378 		path->slots[0]--;
4379 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4380 		if (key.objectid == root->root_key.objectid &&
4381 		    key.type == BTRFS_ROOT_REF_KEY)
4382 			ret = -ENOTEMPTY;
4383 	}
4384 out:
4385 	btrfs_free_path(path);
4386 	return ret;
4387 }
4388 
4389 /* Delete all dentries for inodes belonging to the root */
4390 static void btrfs_prune_dentries(struct btrfs_root *root)
4391 {
4392 	struct btrfs_fs_info *fs_info = root->fs_info;
4393 	struct rb_node *node;
4394 	struct rb_node *prev;
4395 	struct btrfs_inode *entry;
4396 	struct inode *inode;
4397 	u64 objectid = 0;
4398 
4399 	if (!BTRFS_FS_ERROR(fs_info))
4400 		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4401 
4402 	spin_lock(&root->inode_lock);
4403 again:
4404 	node = root->inode_tree.rb_node;
4405 	prev = NULL;
4406 	while (node) {
4407 		prev = node;
4408 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4409 
4410 		if (objectid < btrfs_ino(entry))
4411 			node = node->rb_left;
4412 		else if (objectid > btrfs_ino(entry))
4413 			node = node->rb_right;
4414 		else
4415 			break;
4416 	}
4417 	if (!node) {
4418 		while (prev) {
4419 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4420 			if (objectid <= btrfs_ino(entry)) {
4421 				node = prev;
4422 				break;
4423 			}
4424 			prev = rb_next(prev);
4425 		}
4426 	}
4427 	while (node) {
4428 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4429 		objectid = btrfs_ino(entry) + 1;
4430 		inode = igrab(&entry->vfs_inode);
4431 		if (inode) {
4432 			spin_unlock(&root->inode_lock);
4433 			if (atomic_read(&inode->i_count) > 1)
4434 				d_prune_aliases(inode);
4435 			/*
4436 			 * btrfs_drop_inode will have it removed from the inode
4437 			 * cache when its usage count hits zero.
4438 			 */
4439 			iput(inode);
4440 			cond_resched();
4441 			spin_lock(&root->inode_lock);
4442 			goto again;
4443 		}
4444 
4445 		if (cond_resched_lock(&root->inode_lock))
4446 			goto again;
4447 
4448 		node = rb_next(node);
4449 	}
4450 	spin_unlock(&root->inode_lock);
4451 }
4452 
4453 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4454 {
4455 	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4456 	struct btrfs_root *root = dir->root;
4457 	struct inode *inode = d_inode(dentry);
4458 	struct btrfs_root *dest = BTRFS_I(inode)->root;
4459 	struct btrfs_trans_handle *trans;
4460 	struct btrfs_block_rsv block_rsv;
4461 	u64 root_flags;
4462 	u64 qgroup_reserved = 0;
4463 	int ret;
4464 
4465 	down_write(&fs_info->subvol_sem);
4466 
4467 	/*
4468 	 * Don't allow to delete a subvolume with send in progress. This is
4469 	 * inside the inode lock so the error handling that has to drop the bit
4470 	 * again is not run concurrently.
4471 	 */
4472 	spin_lock(&dest->root_item_lock);
4473 	if (dest->send_in_progress) {
4474 		spin_unlock(&dest->root_item_lock);
4475 		btrfs_warn(fs_info,
4476 			   "attempt to delete subvolume %llu during send",
4477 			   dest->root_key.objectid);
4478 		ret = -EPERM;
4479 		goto out_up_write;
4480 	}
4481 	if (atomic_read(&dest->nr_swapfiles)) {
4482 		spin_unlock(&dest->root_item_lock);
4483 		btrfs_warn(fs_info,
4484 			   "attempt to delete subvolume %llu with active swapfile",
4485 			   root->root_key.objectid);
4486 		ret = -EPERM;
4487 		goto out_up_write;
4488 	}
4489 	root_flags = btrfs_root_flags(&dest->root_item);
4490 	btrfs_set_root_flags(&dest->root_item,
4491 			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4492 	spin_unlock(&dest->root_item_lock);
4493 
4494 	ret = may_destroy_subvol(dest);
4495 	if (ret)
4496 		goto out_undead;
4497 
4498 	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4499 	/*
4500 	 * One for dir inode,
4501 	 * two for dir entries,
4502 	 * two for root ref/backref.
4503 	 */
4504 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4505 	if (ret)
4506 		goto out_undead;
4507 	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4508 
4509 	trans = btrfs_start_transaction(root, 0);
4510 	if (IS_ERR(trans)) {
4511 		ret = PTR_ERR(trans);
4512 		goto out_release;
4513 	}
4514 	ret = btrfs_record_root_in_trans(trans, root);
4515 	if (ret) {
4516 		btrfs_abort_transaction(trans, ret);
4517 		goto out_end_trans;
4518 	}
4519 	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4520 	qgroup_reserved = 0;
4521 	trans->block_rsv = &block_rsv;
4522 	trans->bytes_reserved = block_rsv.size;
4523 
4524 	btrfs_record_snapshot_destroy(trans, dir);
4525 
4526 	ret = btrfs_unlink_subvol(trans, dir, dentry);
4527 	if (ret) {
4528 		btrfs_abort_transaction(trans, ret);
4529 		goto out_end_trans;
4530 	}
4531 
4532 	ret = btrfs_record_root_in_trans(trans, dest);
4533 	if (ret) {
4534 		btrfs_abort_transaction(trans, ret);
4535 		goto out_end_trans;
4536 	}
4537 
4538 	memset(&dest->root_item.drop_progress, 0,
4539 		sizeof(dest->root_item.drop_progress));
4540 	btrfs_set_root_drop_level(&dest->root_item, 0);
4541 	btrfs_set_root_refs(&dest->root_item, 0);
4542 
4543 	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4544 		ret = btrfs_insert_orphan_item(trans,
4545 					fs_info->tree_root,
4546 					dest->root_key.objectid);
4547 		if (ret) {
4548 			btrfs_abort_transaction(trans, ret);
4549 			goto out_end_trans;
4550 		}
4551 	}
4552 
4553 	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4554 				  BTRFS_UUID_KEY_SUBVOL,
4555 				  dest->root_key.objectid);
4556 	if (ret && ret != -ENOENT) {
4557 		btrfs_abort_transaction(trans, ret);
4558 		goto out_end_trans;
4559 	}
4560 	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4561 		ret = btrfs_uuid_tree_remove(trans,
4562 					  dest->root_item.received_uuid,
4563 					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4564 					  dest->root_key.objectid);
4565 		if (ret && ret != -ENOENT) {
4566 			btrfs_abort_transaction(trans, ret);
4567 			goto out_end_trans;
4568 		}
4569 	}
4570 
4571 	free_anon_bdev(dest->anon_dev);
4572 	dest->anon_dev = 0;
4573 out_end_trans:
4574 	trans->block_rsv = NULL;
4575 	trans->bytes_reserved = 0;
4576 	ret = btrfs_end_transaction(trans);
4577 	inode->i_flags |= S_DEAD;
4578 out_release:
4579 	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4580 	if (qgroup_reserved)
4581 		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4582 out_undead:
4583 	if (ret) {
4584 		spin_lock(&dest->root_item_lock);
4585 		root_flags = btrfs_root_flags(&dest->root_item);
4586 		btrfs_set_root_flags(&dest->root_item,
4587 				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4588 		spin_unlock(&dest->root_item_lock);
4589 	}
4590 out_up_write:
4591 	up_write(&fs_info->subvol_sem);
4592 	if (!ret) {
4593 		d_invalidate(dentry);
4594 		btrfs_prune_dentries(dest);
4595 		ASSERT(dest->send_in_progress == 0);
4596 	}
4597 
4598 	return ret;
4599 }
4600 
4601 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4602 {
4603 	struct inode *inode = d_inode(dentry);
4604 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4605 	int err = 0;
4606 	struct btrfs_trans_handle *trans;
4607 	u64 last_unlink_trans;
4608 	struct fscrypt_name fname;
4609 
4610 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4611 		return -ENOTEMPTY;
4612 	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4613 		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4614 			btrfs_err(fs_info,
4615 			"extent tree v2 doesn't support snapshot deletion yet");
4616 			return -EOPNOTSUPP;
4617 		}
4618 		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4619 	}
4620 
4621 	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4622 	if (err)
4623 		return err;
4624 
4625 	/* This needs to handle no-key deletions later on */
4626 
4627 	trans = __unlink_start_trans(BTRFS_I(dir));
4628 	if (IS_ERR(trans)) {
4629 		err = PTR_ERR(trans);
4630 		goto out_notrans;
4631 	}
4632 
4633 	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4634 		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4635 		goto out;
4636 	}
4637 
4638 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4639 	if (err)
4640 		goto out;
4641 
4642 	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4643 
4644 	/* now the directory is empty */
4645 	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4646 				 &fname.disk_name);
4647 	if (!err) {
4648 		btrfs_i_size_write(BTRFS_I(inode), 0);
4649 		/*
4650 		 * Propagate the last_unlink_trans value of the deleted dir to
4651 		 * its parent directory. This is to prevent an unrecoverable
4652 		 * log tree in the case we do something like this:
4653 		 * 1) create dir foo
4654 		 * 2) create snapshot under dir foo
4655 		 * 3) delete the snapshot
4656 		 * 4) rmdir foo
4657 		 * 5) mkdir foo
4658 		 * 6) fsync foo or some file inside foo
4659 		 */
4660 		if (last_unlink_trans >= trans->transid)
4661 			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4662 	}
4663 out:
4664 	btrfs_end_transaction(trans);
4665 out_notrans:
4666 	btrfs_btree_balance_dirty(fs_info);
4667 	fscrypt_free_filename(&fname);
4668 
4669 	return err;
4670 }
4671 
4672 /*
4673  * btrfs_truncate_block - read, zero a chunk and write a block
4674  * @inode - inode that we're zeroing
4675  * @from - the offset to start zeroing
4676  * @len - the length to zero, 0 to zero the entire range respective to the
4677  *	offset
4678  * @front - zero up to the offset instead of from the offset on
4679  *
4680  * This will find the block for the "from" offset and cow the block and zero the
4681  * part we want to zero.  This is used with truncate and hole punching.
4682  */
4683 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4684 			 int front)
4685 {
4686 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4687 	struct address_space *mapping = inode->vfs_inode.i_mapping;
4688 	struct extent_io_tree *io_tree = &inode->io_tree;
4689 	struct btrfs_ordered_extent *ordered;
4690 	struct extent_state *cached_state = NULL;
4691 	struct extent_changeset *data_reserved = NULL;
4692 	bool only_release_metadata = false;
4693 	u32 blocksize = fs_info->sectorsize;
4694 	pgoff_t index = from >> PAGE_SHIFT;
4695 	unsigned offset = from & (blocksize - 1);
4696 	struct page *page;
4697 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4698 	size_t write_bytes = blocksize;
4699 	int ret = 0;
4700 	u64 block_start;
4701 	u64 block_end;
4702 
4703 	if (IS_ALIGNED(offset, blocksize) &&
4704 	    (!len || IS_ALIGNED(len, blocksize)))
4705 		goto out;
4706 
4707 	block_start = round_down(from, blocksize);
4708 	block_end = block_start + blocksize - 1;
4709 
4710 	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4711 					  blocksize, false);
4712 	if (ret < 0) {
4713 		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4714 			/* For nocow case, no need to reserve data space */
4715 			only_release_metadata = true;
4716 		} else {
4717 			goto out;
4718 		}
4719 	}
4720 	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4721 	if (ret < 0) {
4722 		if (!only_release_metadata)
4723 			btrfs_free_reserved_data_space(inode, data_reserved,
4724 						       block_start, blocksize);
4725 		goto out;
4726 	}
4727 again:
4728 	page = find_or_create_page(mapping, index, mask);
4729 	if (!page) {
4730 		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4731 					     blocksize, true);
4732 		btrfs_delalloc_release_extents(inode, blocksize);
4733 		ret = -ENOMEM;
4734 		goto out;
4735 	}
4736 
4737 	if (!PageUptodate(page)) {
4738 		ret = btrfs_read_folio(NULL, page_folio(page));
4739 		lock_page(page);
4740 		if (page->mapping != mapping) {
4741 			unlock_page(page);
4742 			put_page(page);
4743 			goto again;
4744 		}
4745 		if (!PageUptodate(page)) {
4746 			ret = -EIO;
4747 			goto out_unlock;
4748 		}
4749 	}
4750 
4751 	/*
4752 	 * We unlock the page after the io is completed and then re-lock it
4753 	 * above.  release_folio() could have come in between that and cleared
4754 	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
4755 	 * here to make sure it's properly set for the subpage stuff.
4756 	 */
4757 	ret = set_page_extent_mapped(page);
4758 	if (ret < 0)
4759 		goto out_unlock;
4760 
4761 	wait_on_page_writeback(page);
4762 
4763 	lock_extent(io_tree, block_start, block_end, &cached_state);
4764 
4765 	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4766 	if (ordered) {
4767 		unlock_extent(io_tree, block_start, block_end, &cached_state);
4768 		unlock_page(page);
4769 		put_page(page);
4770 		btrfs_start_ordered_extent(ordered);
4771 		btrfs_put_ordered_extent(ordered);
4772 		goto again;
4773 	}
4774 
4775 	clear_extent_bit(&inode->io_tree, block_start, block_end,
4776 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4777 			 &cached_state);
4778 
4779 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4780 					&cached_state);
4781 	if (ret) {
4782 		unlock_extent(io_tree, block_start, block_end, &cached_state);
4783 		goto out_unlock;
4784 	}
4785 
4786 	if (offset != blocksize) {
4787 		if (!len)
4788 			len = blocksize - offset;
4789 		if (front)
4790 			memzero_page(page, (block_start - page_offset(page)),
4791 				     offset);
4792 		else
4793 			memzero_page(page, (block_start - page_offset(page)) + offset,
4794 				     len);
4795 	}
4796 	btrfs_page_clear_checked(fs_info, page, block_start,
4797 				 block_end + 1 - block_start);
4798 	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4799 	unlock_extent(io_tree, block_start, block_end, &cached_state);
4800 
4801 	if (only_release_metadata)
4802 		set_extent_bit(&inode->io_tree, block_start, block_end,
4803 			       EXTENT_NORESERVE, NULL);
4804 
4805 out_unlock:
4806 	if (ret) {
4807 		if (only_release_metadata)
4808 			btrfs_delalloc_release_metadata(inode, blocksize, true);
4809 		else
4810 			btrfs_delalloc_release_space(inode, data_reserved,
4811 					block_start, blocksize, true);
4812 	}
4813 	btrfs_delalloc_release_extents(inode, blocksize);
4814 	unlock_page(page);
4815 	put_page(page);
4816 out:
4817 	if (only_release_metadata)
4818 		btrfs_check_nocow_unlock(inode);
4819 	extent_changeset_free(data_reserved);
4820 	return ret;
4821 }
4822 
4823 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4824 			     u64 offset, u64 len)
4825 {
4826 	struct btrfs_fs_info *fs_info = root->fs_info;
4827 	struct btrfs_trans_handle *trans;
4828 	struct btrfs_drop_extents_args drop_args = { 0 };
4829 	int ret;
4830 
4831 	/*
4832 	 * If NO_HOLES is enabled, we don't need to do anything.
4833 	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4834 	 * or btrfs_update_inode() will be called, which guarantee that the next
4835 	 * fsync will know this inode was changed and needs to be logged.
4836 	 */
4837 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4838 		return 0;
4839 
4840 	/*
4841 	 * 1 - for the one we're dropping
4842 	 * 1 - for the one we're adding
4843 	 * 1 - for updating the inode.
4844 	 */
4845 	trans = btrfs_start_transaction(root, 3);
4846 	if (IS_ERR(trans))
4847 		return PTR_ERR(trans);
4848 
4849 	drop_args.start = offset;
4850 	drop_args.end = offset + len;
4851 	drop_args.drop_cache = true;
4852 
4853 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4854 	if (ret) {
4855 		btrfs_abort_transaction(trans, ret);
4856 		btrfs_end_transaction(trans);
4857 		return ret;
4858 	}
4859 
4860 	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4861 	if (ret) {
4862 		btrfs_abort_transaction(trans, ret);
4863 	} else {
4864 		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4865 		btrfs_update_inode(trans, root, inode);
4866 	}
4867 	btrfs_end_transaction(trans);
4868 	return ret;
4869 }
4870 
4871 /*
4872  * This function puts in dummy file extents for the area we're creating a hole
4873  * for.  So if we are truncating this file to a larger size we need to insert
4874  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4875  * the range between oldsize and size
4876  */
4877 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4878 {
4879 	struct btrfs_root *root = inode->root;
4880 	struct btrfs_fs_info *fs_info = root->fs_info;
4881 	struct extent_io_tree *io_tree = &inode->io_tree;
4882 	struct extent_map *em = NULL;
4883 	struct extent_state *cached_state = NULL;
4884 	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4885 	u64 block_end = ALIGN(size, fs_info->sectorsize);
4886 	u64 last_byte;
4887 	u64 cur_offset;
4888 	u64 hole_size;
4889 	int err = 0;
4890 
4891 	/*
4892 	 * If our size started in the middle of a block we need to zero out the
4893 	 * rest of the block before we expand the i_size, otherwise we could
4894 	 * expose stale data.
4895 	 */
4896 	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4897 	if (err)
4898 		return err;
4899 
4900 	if (size <= hole_start)
4901 		return 0;
4902 
4903 	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4904 					   &cached_state);
4905 	cur_offset = hole_start;
4906 	while (1) {
4907 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4908 				      block_end - cur_offset);
4909 		if (IS_ERR(em)) {
4910 			err = PTR_ERR(em);
4911 			em = NULL;
4912 			break;
4913 		}
4914 		last_byte = min(extent_map_end(em), block_end);
4915 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4916 		hole_size = last_byte - cur_offset;
4917 
4918 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4919 			struct extent_map *hole_em;
4920 
4921 			err = maybe_insert_hole(root, inode, cur_offset,
4922 						hole_size);
4923 			if (err)
4924 				break;
4925 
4926 			err = btrfs_inode_set_file_extent_range(inode,
4927 							cur_offset, hole_size);
4928 			if (err)
4929 				break;
4930 
4931 			hole_em = alloc_extent_map();
4932 			if (!hole_em) {
4933 				btrfs_drop_extent_map_range(inode, cur_offset,
4934 						    cur_offset + hole_size - 1,
4935 						    false);
4936 				btrfs_set_inode_full_sync(inode);
4937 				goto next;
4938 			}
4939 			hole_em->start = cur_offset;
4940 			hole_em->len = hole_size;
4941 			hole_em->orig_start = cur_offset;
4942 
4943 			hole_em->block_start = EXTENT_MAP_HOLE;
4944 			hole_em->block_len = 0;
4945 			hole_em->orig_block_len = 0;
4946 			hole_em->ram_bytes = hole_size;
4947 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4948 			hole_em->generation = fs_info->generation;
4949 
4950 			err = btrfs_replace_extent_map_range(inode, hole_em, true);
4951 			free_extent_map(hole_em);
4952 		} else {
4953 			err = btrfs_inode_set_file_extent_range(inode,
4954 							cur_offset, hole_size);
4955 			if (err)
4956 				break;
4957 		}
4958 next:
4959 		free_extent_map(em);
4960 		em = NULL;
4961 		cur_offset = last_byte;
4962 		if (cur_offset >= block_end)
4963 			break;
4964 	}
4965 	free_extent_map(em);
4966 	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
4967 	return err;
4968 }
4969 
4970 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4971 {
4972 	struct btrfs_root *root = BTRFS_I(inode)->root;
4973 	struct btrfs_trans_handle *trans;
4974 	loff_t oldsize = i_size_read(inode);
4975 	loff_t newsize = attr->ia_size;
4976 	int mask = attr->ia_valid;
4977 	int ret;
4978 
4979 	/*
4980 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4981 	 * special case where we need to update the times despite not having
4982 	 * these flags set.  For all other operations the VFS set these flags
4983 	 * explicitly if it wants a timestamp update.
4984 	 */
4985 	if (newsize != oldsize) {
4986 		inode_inc_iversion(inode);
4987 		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
4988 			inode->i_mtime = inode_set_ctime_current(inode);
4989 		}
4990 	}
4991 
4992 	if (newsize > oldsize) {
4993 		/*
4994 		 * Don't do an expanding truncate while snapshotting is ongoing.
4995 		 * This is to ensure the snapshot captures a fully consistent
4996 		 * state of this file - if the snapshot captures this expanding
4997 		 * truncation, it must capture all writes that happened before
4998 		 * this truncation.
4999 		 */
5000 		btrfs_drew_write_lock(&root->snapshot_lock);
5001 		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5002 		if (ret) {
5003 			btrfs_drew_write_unlock(&root->snapshot_lock);
5004 			return ret;
5005 		}
5006 
5007 		trans = btrfs_start_transaction(root, 1);
5008 		if (IS_ERR(trans)) {
5009 			btrfs_drew_write_unlock(&root->snapshot_lock);
5010 			return PTR_ERR(trans);
5011 		}
5012 
5013 		i_size_write(inode, newsize);
5014 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5015 		pagecache_isize_extended(inode, oldsize, newsize);
5016 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5017 		btrfs_drew_write_unlock(&root->snapshot_lock);
5018 		btrfs_end_transaction(trans);
5019 	} else {
5020 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5021 
5022 		if (btrfs_is_zoned(fs_info)) {
5023 			ret = btrfs_wait_ordered_range(inode,
5024 					ALIGN(newsize, fs_info->sectorsize),
5025 					(u64)-1);
5026 			if (ret)
5027 				return ret;
5028 		}
5029 
5030 		/*
5031 		 * We're truncating a file that used to have good data down to
5032 		 * zero. Make sure any new writes to the file get on disk
5033 		 * on close.
5034 		 */
5035 		if (newsize == 0)
5036 			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5037 				&BTRFS_I(inode)->runtime_flags);
5038 
5039 		truncate_setsize(inode, newsize);
5040 
5041 		inode_dio_wait(inode);
5042 
5043 		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5044 		if (ret && inode->i_nlink) {
5045 			int err;
5046 
5047 			/*
5048 			 * Truncate failed, so fix up the in-memory size. We
5049 			 * adjusted disk_i_size down as we removed extents, so
5050 			 * wait for disk_i_size to be stable and then update the
5051 			 * in-memory size to match.
5052 			 */
5053 			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5054 			if (err)
5055 				return err;
5056 			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5057 		}
5058 	}
5059 
5060 	return ret;
5061 }
5062 
5063 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5064 			 struct iattr *attr)
5065 {
5066 	struct inode *inode = d_inode(dentry);
5067 	struct btrfs_root *root = BTRFS_I(inode)->root;
5068 	int err;
5069 
5070 	if (btrfs_root_readonly(root))
5071 		return -EROFS;
5072 
5073 	err = setattr_prepare(idmap, dentry, attr);
5074 	if (err)
5075 		return err;
5076 
5077 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5078 		err = btrfs_setsize(inode, attr);
5079 		if (err)
5080 			return err;
5081 	}
5082 
5083 	if (attr->ia_valid) {
5084 		setattr_copy(idmap, inode, attr);
5085 		inode_inc_iversion(inode);
5086 		err = btrfs_dirty_inode(BTRFS_I(inode));
5087 
5088 		if (!err && attr->ia_valid & ATTR_MODE)
5089 			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5090 	}
5091 
5092 	return err;
5093 }
5094 
5095 /*
5096  * While truncating the inode pages during eviction, we get the VFS
5097  * calling btrfs_invalidate_folio() against each folio of the inode. This
5098  * is slow because the calls to btrfs_invalidate_folio() result in a
5099  * huge amount of calls to lock_extent() and clear_extent_bit(),
5100  * which keep merging and splitting extent_state structures over and over,
5101  * wasting lots of time.
5102  *
5103  * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5104  * skip all those expensive operations on a per folio basis and do only
5105  * the ordered io finishing, while we release here the extent_map and
5106  * extent_state structures, without the excessive merging and splitting.
5107  */
5108 static void evict_inode_truncate_pages(struct inode *inode)
5109 {
5110 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5111 	struct rb_node *node;
5112 
5113 	ASSERT(inode->i_state & I_FREEING);
5114 	truncate_inode_pages_final(&inode->i_data);
5115 
5116 	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5117 
5118 	/*
5119 	 * Keep looping until we have no more ranges in the io tree.
5120 	 * We can have ongoing bios started by readahead that have
5121 	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5122 	 * still in progress (unlocked the pages in the bio but did not yet
5123 	 * unlocked the ranges in the io tree). Therefore this means some
5124 	 * ranges can still be locked and eviction started because before
5125 	 * submitting those bios, which are executed by a separate task (work
5126 	 * queue kthread), inode references (inode->i_count) were not taken
5127 	 * (which would be dropped in the end io callback of each bio).
5128 	 * Therefore here we effectively end up waiting for those bios and
5129 	 * anyone else holding locked ranges without having bumped the inode's
5130 	 * reference count - if we don't do it, when they access the inode's
5131 	 * io_tree to unlock a range it may be too late, leading to an
5132 	 * use-after-free issue.
5133 	 */
5134 	spin_lock(&io_tree->lock);
5135 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5136 		struct extent_state *state;
5137 		struct extent_state *cached_state = NULL;
5138 		u64 start;
5139 		u64 end;
5140 		unsigned state_flags;
5141 
5142 		node = rb_first(&io_tree->state);
5143 		state = rb_entry(node, struct extent_state, rb_node);
5144 		start = state->start;
5145 		end = state->end;
5146 		state_flags = state->state;
5147 		spin_unlock(&io_tree->lock);
5148 
5149 		lock_extent(io_tree, start, end, &cached_state);
5150 
5151 		/*
5152 		 * If still has DELALLOC flag, the extent didn't reach disk,
5153 		 * and its reserved space won't be freed by delayed_ref.
5154 		 * So we need to free its reserved space here.
5155 		 * (Refer to comment in btrfs_invalidate_folio, case 2)
5156 		 *
5157 		 * Note, end is the bytenr of last byte, so we need + 1 here.
5158 		 */
5159 		if (state_flags & EXTENT_DELALLOC)
5160 			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5161 					       end - start + 1, NULL);
5162 
5163 		clear_extent_bit(io_tree, start, end,
5164 				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5165 				 &cached_state);
5166 
5167 		cond_resched();
5168 		spin_lock(&io_tree->lock);
5169 	}
5170 	spin_unlock(&io_tree->lock);
5171 }
5172 
5173 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5174 							struct btrfs_block_rsv *rsv)
5175 {
5176 	struct btrfs_fs_info *fs_info = root->fs_info;
5177 	struct btrfs_trans_handle *trans;
5178 	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5179 	int ret;
5180 
5181 	/*
5182 	 * Eviction should be taking place at some place safe because of our
5183 	 * delayed iputs.  However the normal flushing code will run delayed
5184 	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5185 	 *
5186 	 * We reserve the delayed_refs_extra here again because we can't use
5187 	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5188 	 * above.  We reserve our extra bit here because we generate a ton of
5189 	 * delayed refs activity by truncating.
5190 	 *
5191 	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5192 	 * if we fail to make this reservation we can re-try without the
5193 	 * delayed_refs_extra so we can make some forward progress.
5194 	 */
5195 	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5196 				     BTRFS_RESERVE_FLUSH_EVICT);
5197 	if (ret) {
5198 		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5199 					     BTRFS_RESERVE_FLUSH_EVICT);
5200 		if (ret) {
5201 			btrfs_warn(fs_info,
5202 				   "could not allocate space for delete; will truncate on mount");
5203 			return ERR_PTR(-ENOSPC);
5204 		}
5205 		delayed_refs_extra = 0;
5206 	}
5207 
5208 	trans = btrfs_join_transaction(root);
5209 	if (IS_ERR(trans))
5210 		return trans;
5211 
5212 	if (delayed_refs_extra) {
5213 		trans->block_rsv = &fs_info->trans_block_rsv;
5214 		trans->bytes_reserved = delayed_refs_extra;
5215 		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5216 					delayed_refs_extra, true);
5217 	}
5218 	return trans;
5219 }
5220 
5221 void btrfs_evict_inode(struct inode *inode)
5222 {
5223 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5224 	struct btrfs_trans_handle *trans;
5225 	struct btrfs_root *root = BTRFS_I(inode)->root;
5226 	struct btrfs_block_rsv *rsv = NULL;
5227 	int ret;
5228 
5229 	trace_btrfs_inode_evict(inode);
5230 
5231 	if (!root) {
5232 		fsverity_cleanup_inode(inode);
5233 		clear_inode(inode);
5234 		return;
5235 	}
5236 
5237 	evict_inode_truncate_pages(inode);
5238 
5239 	if (inode->i_nlink &&
5240 	    ((btrfs_root_refs(&root->root_item) != 0 &&
5241 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5242 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5243 		goto out;
5244 
5245 	if (is_bad_inode(inode))
5246 		goto out;
5247 
5248 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5249 		goto out;
5250 
5251 	if (inode->i_nlink > 0) {
5252 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5253 		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5254 		goto out;
5255 	}
5256 
5257 	/*
5258 	 * This makes sure the inode item in tree is uptodate and the space for
5259 	 * the inode update is released.
5260 	 */
5261 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5262 	if (ret)
5263 		goto out;
5264 
5265 	/*
5266 	 * This drops any pending insert or delete operations we have for this
5267 	 * inode.  We could have a delayed dir index deletion queued up, but
5268 	 * we're removing the inode completely so that'll be taken care of in
5269 	 * the truncate.
5270 	 */
5271 	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5272 
5273 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5274 	if (!rsv)
5275 		goto out;
5276 	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5277 	rsv->failfast = true;
5278 
5279 	btrfs_i_size_write(BTRFS_I(inode), 0);
5280 
5281 	while (1) {
5282 		struct btrfs_truncate_control control = {
5283 			.inode = BTRFS_I(inode),
5284 			.ino = btrfs_ino(BTRFS_I(inode)),
5285 			.new_size = 0,
5286 			.min_type = 0,
5287 		};
5288 
5289 		trans = evict_refill_and_join(root, rsv);
5290 		if (IS_ERR(trans))
5291 			goto out;
5292 
5293 		trans->block_rsv = rsv;
5294 
5295 		ret = btrfs_truncate_inode_items(trans, root, &control);
5296 		trans->block_rsv = &fs_info->trans_block_rsv;
5297 		btrfs_end_transaction(trans);
5298 		/*
5299 		 * We have not added new delayed items for our inode after we
5300 		 * have flushed its delayed items, so no need to throttle on
5301 		 * delayed items. However we have modified extent buffers.
5302 		 */
5303 		btrfs_btree_balance_dirty_nodelay(fs_info);
5304 		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5305 			goto out;
5306 		else if (!ret)
5307 			break;
5308 	}
5309 
5310 	/*
5311 	 * Errors here aren't a big deal, it just means we leave orphan items in
5312 	 * the tree. They will be cleaned up on the next mount. If the inode
5313 	 * number gets reused, cleanup deletes the orphan item without doing
5314 	 * anything, and unlink reuses the existing orphan item.
5315 	 *
5316 	 * If it turns out that we are dropping too many of these, we might want
5317 	 * to add a mechanism for retrying these after a commit.
5318 	 */
5319 	trans = evict_refill_and_join(root, rsv);
5320 	if (!IS_ERR(trans)) {
5321 		trans->block_rsv = rsv;
5322 		btrfs_orphan_del(trans, BTRFS_I(inode));
5323 		trans->block_rsv = &fs_info->trans_block_rsv;
5324 		btrfs_end_transaction(trans);
5325 	}
5326 
5327 out:
5328 	btrfs_free_block_rsv(fs_info, rsv);
5329 	/*
5330 	 * If we didn't successfully delete, the orphan item will still be in
5331 	 * the tree and we'll retry on the next mount. Again, we might also want
5332 	 * to retry these periodically in the future.
5333 	 */
5334 	btrfs_remove_delayed_node(BTRFS_I(inode));
5335 	fsverity_cleanup_inode(inode);
5336 	clear_inode(inode);
5337 }
5338 
5339 /*
5340  * Return the key found in the dir entry in the location pointer, fill @type
5341  * with BTRFS_FT_*, and return 0.
5342  *
5343  * If no dir entries were found, returns -ENOENT.
5344  * If found a corrupted location in dir entry, returns -EUCLEAN.
5345  */
5346 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5347 			       struct btrfs_key *location, u8 *type)
5348 {
5349 	struct btrfs_dir_item *di;
5350 	struct btrfs_path *path;
5351 	struct btrfs_root *root = dir->root;
5352 	int ret = 0;
5353 	struct fscrypt_name fname;
5354 
5355 	path = btrfs_alloc_path();
5356 	if (!path)
5357 		return -ENOMEM;
5358 
5359 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5360 	if (ret < 0)
5361 		goto out;
5362 	/*
5363 	 * fscrypt_setup_filename() should never return a positive value, but
5364 	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5365 	 */
5366 	ASSERT(ret == 0);
5367 
5368 	/* This needs to handle no-key deletions later on */
5369 
5370 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5371 				   &fname.disk_name, 0);
5372 	if (IS_ERR_OR_NULL(di)) {
5373 		ret = di ? PTR_ERR(di) : -ENOENT;
5374 		goto out;
5375 	}
5376 
5377 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5378 	if (location->type != BTRFS_INODE_ITEM_KEY &&
5379 	    location->type != BTRFS_ROOT_ITEM_KEY) {
5380 		ret = -EUCLEAN;
5381 		btrfs_warn(root->fs_info,
5382 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5383 			   __func__, fname.disk_name.name, btrfs_ino(dir),
5384 			   location->objectid, location->type, location->offset);
5385 	}
5386 	if (!ret)
5387 		*type = btrfs_dir_ftype(path->nodes[0], di);
5388 out:
5389 	fscrypt_free_filename(&fname);
5390 	btrfs_free_path(path);
5391 	return ret;
5392 }
5393 
5394 /*
5395  * when we hit a tree root in a directory, the btrfs part of the inode
5396  * needs to be changed to reflect the root directory of the tree root.  This
5397  * is kind of like crossing a mount point.
5398  */
5399 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5400 				    struct btrfs_inode *dir,
5401 				    struct dentry *dentry,
5402 				    struct btrfs_key *location,
5403 				    struct btrfs_root **sub_root)
5404 {
5405 	struct btrfs_path *path;
5406 	struct btrfs_root *new_root;
5407 	struct btrfs_root_ref *ref;
5408 	struct extent_buffer *leaf;
5409 	struct btrfs_key key;
5410 	int ret;
5411 	int err = 0;
5412 	struct fscrypt_name fname;
5413 
5414 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5415 	if (ret)
5416 		return ret;
5417 
5418 	path = btrfs_alloc_path();
5419 	if (!path) {
5420 		err = -ENOMEM;
5421 		goto out;
5422 	}
5423 
5424 	err = -ENOENT;
5425 	key.objectid = dir->root->root_key.objectid;
5426 	key.type = BTRFS_ROOT_REF_KEY;
5427 	key.offset = location->objectid;
5428 
5429 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5430 	if (ret) {
5431 		if (ret < 0)
5432 			err = ret;
5433 		goto out;
5434 	}
5435 
5436 	leaf = path->nodes[0];
5437 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5438 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5439 	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5440 		goto out;
5441 
5442 	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5443 				   (unsigned long)(ref + 1), fname.disk_name.len);
5444 	if (ret)
5445 		goto out;
5446 
5447 	btrfs_release_path(path);
5448 
5449 	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5450 	if (IS_ERR(new_root)) {
5451 		err = PTR_ERR(new_root);
5452 		goto out;
5453 	}
5454 
5455 	*sub_root = new_root;
5456 	location->objectid = btrfs_root_dirid(&new_root->root_item);
5457 	location->type = BTRFS_INODE_ITEM_KEY;
5458 	location->offset = 0;
5459 	err = 0;
5460 out:
5461 	btrfs_free_path(path);
5462 	fscrypt_free_filename(&fname);
5463 	return err;
5464 }
5465 
5466 static void inode_tree_add(struct btrfs_inode *inode)
5467 {
5468 	struct btrfs_root *root = inode->root;
5469 	struct btrfs_inode *entry;
5470 	struct rb_node **p;
5471 	struct rb_node *parent;
5472 	struct rb_node *new = &inode->rb_node;
5473 	u64 ino = btrfs_ino(inode);
5474 
5475 	if (inode_unhashed(&inode->vfs_inode))
5476 		return;
5477 	parent = NULL;
5478 	spin_lock(&root->inode_lock);
5479 	p = &root->inode_tree.rb_node;
5480 	while (*p) {
5481 		parent = *p;
5482 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5483 
5484 		if (ino < btrfs_ino(entry))
5485 			p = &parent->rb_left;
5486 		else if (ino > btrfs_ino(entry))
5487 			p = &parent->rb_right;
5488 		else {
5489 			WARN_ON(!(entry->vfs_inode.i_state &
5490 				  (I_WILL_FREE | I_FREEING)));
5491 			rb_replace_node(parent, new, &root->inode_tree);
5492 			RB_CLEAR_NODE(parent);
5493 			spin_unlock(&root->inode_lock);
5494 			return;
5495 		}
5496 	}
5497 	rb_link_node(new, parent, p);
5498 	rb_insert_color(new, &root->inode_tree);
5499 	spin_unlock(&root->inode_lock);
5500 }
5501 
5502 static void inode_tree_del(struct btrfs_inode *inode)
5503 {
5504 	struct btrfs_root *root = inode->root;
5505 	int empty = 0;
5506 
5507 	spin_lock(&root->inode_lock);
5508 	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5509 		rb_erase(&inode->rb_node, &root->inode_tree);
5510 		RB_CLEAR_NODE(&inode->rb_node);
5511 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5512 	}
5513 	spin_unlock(&root->inode_lock);
5514 
5515 	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5516 		spin_lock(&root->inode_lock);
5517 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5518 		spin_unlock(&root->inode_lock);
5519 		if (empty)
5520 			btrfs_add_dead_root(root);
5521 	}
5522 }
5523 
5524 
5525 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5526 {
5527 	struct btrfs_iget_args *args = p;
5528 
5529 	inode->i_ino = args->ino;
5530 	BTRFS_I(inode)->location.objectid = args->ino;
5531 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5532 	BTRFS_I(inode)->location.offset = 0;
5533 	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5534 	BUG_ON(args->root && !BTRFS_I(inode)->root);
5535 
5536 	if (args->root && args->root == args->root->fs_info->tree_root &&
5537 	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
5538 		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5539 			&BTRFS_I(inode)->runtime_flags);
5540 	return 0;
5541 }
5542 
5543 static int btrfs_find_actor(struct inode *inode, void *opaque)
5544 {
5545 	struct btrfs_iget_args *args = opaque;
5546 
5547 	return args->ino == BTRFS_I(inode)->location.objectid &&
5548 		args->root == BTRFS_I(inode)->root;
5549 }
5550 
5551 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5552 				       struct btrfs_root *root)
5553 {
5554 	struct inode *inode;
5555 	struct btrfs_iget_args args;
5556 	unsigned long hashval = btrfs_inode_hash(ino, root);
5557 
5558 	args.ino = ino;
5559 	args.root = root;
5560 
5561 	inode = iget5_locked(s, hashval, btrfs_find_actor,
5562 			     btrfs_init_locked_inode,
5563 			     (void *)&args);
5564 	return inode;
5565 }
5566 
5567 /*
5568  * Get an inode object given its inode number and corresponding root.
5569  * Path can be preallocated to prevent recursing back to iget through
5570  * allocator. NULL is also valid but may require an additional allocation
5571  * later.
5572  */
5573 struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5574 			      struct btrfs_root *root, struct btrfs_path *path)
5575 {
5576 	struct inode *inode;
5577 
5578 	inode = btrfs_iget_locked(s, ino, root);
5579 	if (!inode)
5580 		return ERR_PTR(-ENOMEM);
5581 
5582 	if (inode->i_state & I_NEW) {
5583 		int ret;
5584 
5585 		ret = btrfs_read_locked_inode(inode, path);
5586 		if (!ret) {
5587 			inode_tree_add(BTRFS_I(inode));
5588 			unlock_new_inode(inode);
5589 		} else {
5590 			iget_failed(inode);
5591 			/*
5592 			 * ret > 0 can come from btrfs_search_slot called by
5593 			 * btrfs_read_locked_inode, this means the inode item
5594 			 * was not found.
5595 			 */
5596 			if (ret > 0)
5597 				ret = -ENOENT;
5598 			inode = ERR_PTR(ret);
5599 		}
5600 	}
5601 
5602 	return inode;
5603 }
5604 
5605 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5606 {
5607 	return btrfs_iget_path(s, ino, root, NULL);
5608 }
5609 
5610 static struct inode *new_simple_dir(struct inode *dir,
5611 				    struct btrfs_key *key,
5612 				    struct btrfs_root *root)
5613 {
5614 	struct inode *inode = new_inode(dir->i_sb);
5615 
5616 	if (!inode)
5617 		return ERR_PTR(-ENOMEM);
5618 
5619 	BTRFS_I(inode)->root = btrfs_grab_root(root);
5620 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5621 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5622 
5623 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5624 	/*
5625 	 * We only need lookup, the rest is read-only and there's no inode
5626 	 * associated with the dentry
5627 	 */
5628 	inode->i_op = &simple_dir_inode_operations;
5629 	inode->i_opflags &= ~IOP_XATTR;
5630 	inode->i_fop = &simple_dir_operations;
5631 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5632 	inode->i_mtime = inode_set_ctime_current(inode);
5633 	inode->i_atime = dir->i_atime;
5634 	BTRFS_I(inode)->i_otime = inode->i_mtime;
5635 	inode->i_uid = dir->i_uid;
5636 	inode->i_gid = dir->i_gid;
5637 
5638 	return inode;
5639 }
5640 
5641 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5642 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5643 static_assert(BTRFS_FT_DIR == FT_DIR);
5644 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5645 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5646 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5647 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5648 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5649 
5650 static inline u8 btrfs_inode_type(struct inode *inode)
5651 {
5652 	return fs_umode_to_ftype(inode->i_mode);
5653 }
5654 
5655 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5656 {
5657 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5658 	struct inode *inode;
5659 	struct btrfs_root *root = BTRFS_I(dir)->root;
5660 	struct btrfs_root *sub_root = root;
5661 	struct btrfs_key location;
5662 	u8 di_type = 0;
5663 	int ret = 0;
5664 
5665 	if (dentry->d_name.len > BTRFS_NAME_LEN)
5666 		return ERR_PTR(-ENAMETOOLONG);
5667 
5668 	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5669 	if (ret < 0)
5670 		return ERR_PTR(ret);
5671 
5672 	if (location.type == BTRFS_INODE_ITEM_KEY) {
5673 		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5674 		if (IS_ERR(inode))
5675 			return inode;
5676 
5677 		/* Do extra check against inode mode with di_type */
5678 		if (btrfs_inode_type(inode) != di_type) {
5679 			btrfs_crit(fs_info,
5680 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5681 				  inode->i_mode, btrfs_inode_type(inode),
5682 				  di_type);
5683 			iput(inode);
5684 			return ERR_PTR(-EUCLEAN);
5685 		}
5686 		return inode;
5687 	}
5688 
5689 	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5690 				       &location, &sub_root);
5691 	if (ret < 0) {
5692 		if (ret != -ENOENT)
5693 			inode = ERR_PTR(ret);
5694 		else
5695 			inode = new_simple_dir(dir, &location, root);
5696 	} else {
5697 		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5698 		btrfs_put_root(sub_root);
5699 
5700 		if (IS_ERR(inode))
5701 			return inode;
5702 
5703 		down_read(&fs_info->cleanup_work_sem);
5704 		if (!sb_rdonly(inode->i_sb))
5705 			ret = btrfs_orphan_cleanup(sub_root);
5706 		up_read(&fs_info->cleanup_work_sem);
5707 		if (ret) {
5708 			iput(inode);
5709 			inode = ERR_PTR(ret);
5710 		}
5711 	}
5712 
5713 	return inode;
5714 }
5715 
5716 static int btrfs_dentry_delete(const struct dentry *dentry)
5717 {
5718 	struct btrfs_root *root;
5719 	struct inode *inode = d_inode(dentry);
5720 
5721 	if (!inode && !IS_ROOT(dentry))
5722 		inode = d_inode(dentry->d_parent);
5723 
5724 	if (inode) {
5725 		root = BTRFS_I(inode)->root;
5726 		if (btrfs_root_refs(&root->root_item) == 0)
5727 			return 1;
5728 
5729 		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5730 			return 1;
5731 	}
5732 	return 0;
5733 }
5734 
5735 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5736 				   unsigned int flags)
5737 {
5738 	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5739 
5740 	if (inode == ERR_PTR(-ENOENT))
5741 		inode = NULL;
5742 	return d_splice_alias(inode, dentry);
5743 }
5744 
5745 /*
5746  * Find the highest existing sequence number in a directory and then set the
5747  * in-memory index_cnt variable to the first free sequence number.
5748  */
5749 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5750 {
5751 	struct btrfs_root *root = inode->root;
5752 	struct btrfs_key key, found_key;
5753 	struct btrfs_path *path;
5754 	struct extent_buffer *leaf;
5755 	int ret;
5756 
5757 	key.objectid = btrfs_ino(inode);
5758 	key.type = BTRFS_DIR_INDEX_KEY;
5759 	key.offset = (u64)-1;
5760 
5761 	path = btrfs_alloc_path();
5762 	if (!path)
5763 		return -ENOMEM;
5764 
5765 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5766 	if (ret < 0)
5767 		goto out;
5768 	/* FIXME: we should be able to handle this */
5769 	if (ret == 0)
5770 		goto out;
5771 	ret = 0;
5772 
5773 	if (path->slots[0] == 0) {
5774 		inode->index_cnt = BTRFS_DIR_START_INDEX;
5775 		goto out;
5776 	}
5777 
5778 	path->slots[0]--;
5779 
5780 	leaf = path->nodes[0];
5781 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5782 
5783 	if (found_key.objectid != btrfs_ino(inode) ||
5784 	    found_key.type != BTRFS_DIR_INDEX_KEY) {
5785 		inode->index_cnt = BTRFS_DIR_START_INDEX;
5786 		goto out;
5787 	}
5788 
5789 	inode->index_cnt = found_key.offset + 1;
5790 out:
5791 	btrfs_free_path(path);
5792 	return ret;
5793 }
5794 
5795 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5796 {
5797 	int ret = 0;
5798 
5799 	btrfs_inode_lock(dir, 0);
5800 	if (dir->index_cnt == (u64)-1) {
5801 		ret = btrfs_inode_delayed_dir_index_count(dir);
5802 		if (ret) {
5803 			ret = btrfs_set_inode_index_count(dir);
5804 			if (ret)
5805 				goto out;
5806 		}
5807 	}
5808 
5809 	/* index_cnt is the index number of next new entry, so decrement it. */
5810 	*index = dir->index_cnt - 1;
5811 out:
5812 	btrfs_inode_unlock(dir, 0);
5813 
5814 	return ret;
5815 }
5816 
5817 /*
5818  * All this infrastructure exists because dir_emit can fault, and we are holding
5819  * the tree lock when doing readdir.  For now just allocate a buffer and copy
5820  * our information into that, and then dir_emit from the buffer.  This is
5821  * similar to what NFS does, only we don't keep the buffer around in pagecache
5822  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5823  * copy_to_user_inatomic so we don't have to worry about page faulting under the
5824  * tree lock.
5825  */
5826 static int btrfs_opendir(struct inode *inode, struct file *file)
5827 {
5828 	struct btrfs_file_private *private;
5829 	u64 last_index;
5830 	int ret;
5831 
5832 	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5833 	if (ret)
5834 		return ret;
5835 
5836 	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5837 	if (!private)
5838 		return -ENOMEM;
5839 	private->last_index = last_index;
5840 	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5841 	if (!private->filldir_buf) {
5842 		kfree(private);
5843 		return -ENOMEM;
5844 	}
5845 	file->private_data = private;
5846 	return 0;
5847 }
5848 
5849 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5850 {
5851 	struct btrfs_file_private *private = file->private_data;
5852 	int ret;
5853 
5854 	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5855 				       &private->last_index);
5856 	if (ret)
5857 		return ret;
5858 
5859 	return generic_file_llseek(file, offset, whence);
5860 }
5861 
5862 struct dir_entry {
5863 	u64 ino;
5864 	u64 offset;
5865 	unsigned type;
5866 	int name_len;
5867 };
5868 
5869 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5870 {
5871 	while (entries--) {
5872 		struct dir_entry *entry = addr;
5873 		char *name = (char *)(entry + 1);
5874 
5875 		ctx->pos = get_unaligned(&entry->offset);
5876 		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5877 					 get_unaligned(&entry->ino),
5878 					 get_unaligned(&entry->type)))
5879 			return 1;
5880 		addr += sizeof(struct dir_entry) +
5881 			get_unaligned(&entry->name_len);
5882 		ctx->pos++;
5883 	}
5884 	return 0;
5885 }
5886 
5887 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5888 {
5889 	struct inode *inode = file_inode(file);
5890 	struct btrfs_root *root = BTRFS_I(inode)->root;
5891 	struct btrfs_file_private *private = file->private_data;
5892 	struct btrfs_dir_item *di;
5893 	struct btrfs_key key;
5894 	struct btrfs_key found_key;
5895 	struct btrfs_path *path;
5896 	void *addr;
5897 	LIST_HEAD(ins_list);
5898 	LIST_HEAD(del_list);
5899 	int ret;
5900 	char *name_ptr;
5901 	int name_len;
5902 	int entries = 0;
5903 	int total_len = 0;
5904 	bool put = false;
5905 	struct btrfs_key location;
5906 
5907 	if (!dir_emit_dots(file, ctx))
5908 		return 0;
5909 
5910 	path = btrfs_alloc_path();
5911 	if (!path)
5912 		return -ENOMEM;
5913 
5914 	addr = private->filldir_buf;
5915 	path->reada = READA_FORWARD;
5916 
5917 	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
5918 					      &ins_list, &del_list);
5919 
5920 again:
5921 	key.type = BTRFS_DIR_INDEX_KEY;
5922 	key.offset = ctx->pos;
5923 	key.objectid = btrfs_ino(BTRFS_I(inode));
5924 
5925 	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5926 		struct dir_entry *entry;
5927 		struct extent_buffer *leaf = path->nodes[0];
5928 		u8 ftype;
5929 
5930 		if (found_key.objectid != key.objectid)
5931 			break;
5932 		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5933 			break;
5934 		if (found_key.offset < ctx->pos)
5935 			continue;
5936 		if (found_key.offset > private->last_index)
5937 			break;
5938 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5939 			continue;
5940 		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5941 		name_len = btrfs_dir_name_len(leaf, di);
5942 		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5943 		    PAGE_SIZE) {
5944 			btrfs_release_path(path);
5945 			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5946 			if (ret)
5947 				goto nopos;
5948 			addr = private->filldir_buf;
5949 			entries = 0;
5950 			total_len = 0;
5951 			goto again;
5952 		}
5953 
5954 		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5955 		entry = addr;
5956 		name_ptr = (char *)(entry + 1);
5957 		read_extent_buffer(leaf, name_ptr,
5958 				   (unsigned long)(di + 1), name_len);
5959 		put_unaligned(name_len, &entry->name_len);
5960 		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5961 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5962 		put_unaligned(location.objectid, &entry->ino);
5963 		put_unaligned(found_key.offset, &entry->offset);
5964 		entries++;
5965 		addr += sizeof(struct dir_entry) + name_len;
5966 		total_len += sizeof(struct dir_entry) + name_len;
5967 	}
5968 	/* Catch error encountered during iteration */
5969 	if (ret < 0)
5970 		goto err;
5971 
5972 	btrfs_release_path(path);
5973 
5974 	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5975 	if (ret)
5976 		goto nopos;
5977 
5978 	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5979 	if (ret)
5980 		goto nopos;
5981 
5982 	/*
5983 	 * Stop new entries from being returned after we return the last
5984 	 * entry.
5985 	 *
5986 	 * New directory entries are assigned a strictly increasing
5987 	 * offset.  This means that new entries created during readdir
5988 	 * are *guaranteed* to be seen in the future by that readdir.
5989 	 * This has broken buggy programs which operate on names as
5990 	 * they're returned by readdir.  Until we re-use freed offsets
5991 	 * we have this hack to stop new entries from being returned
5992 	 * under the assumption that they'll never reach this huge
5993 	 * offset.
5994 	 *
5995 	 * This is being careful not to overflow 32bit loff_t unless the
5996 	 * last entry requires it because doing so has broken 32bit apps
5997 	 * in the past.
5998 	 */
5999 	if (ctx->pos >= INT_MAX)
6000 		ctx->pos = LLONG_MAX;
6001 	else
6002 		ctx->pos = INT_MAX;
6003 nopos:
6004 	ret = 0;
6005 err:
6006 	if (put)
6007 		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6008 	btrfs_free_path(path);
6009 	return ret;
6010 }
6011 
6012 /*
6013  * This is somewhat expensive, updating the tree every time the
6014  * inode changes.  But, it is most likely to find the inode in cache.
6015  * FIXME, needs more benchmarking...there are no reasons other than performance
6016  * to keep or drop this code.
6017  */
6018 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6019 {
6020 	struct btrfs_root *root = inode->root;
6021 	struct btrfs_fs_info *fs_info = root->fs_info;
6022 	struct btrfs_trans_handle *trans;
6023 	int ret;
6024 
6025 	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6026 		return 0;
6027 
6028 	trans = btrfs_join_transaction(root);
6029 	if (IS_ERR(trans))
6030 		return PTR_ERR(trans);
6031 
6032 	ret = btrfs_update_inode(trans, root, inode);
6033 	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6034 		/* whoops, lets try again with the full transaction */
6035 		btrfs_end_transaction(trans);
6036 		trans = btrfs_start_transaction(root, 1);
6037 		if (IS_ERR(trans))
6038 			return PTR_ERR(trans);
6039 
6040 		ret = btrfs_update_inode(trans, root, inode);
6041 	}
6042 	btrfs_end_transaction(trans);
6043 	if (inode->delayed_node)
6044 		btrfs_balance_delayed_items(fs_info);
6045 
6046 	return ret;
6047 }
6048 
6049 /*
6050  * This is a copy of file_update_time.  We need this so we can return error on
6051  * ENOSPC for updating the inode in the case of file write and mmap writes.
6052  */
6053 static int btrfs_update_time(struct inode *inode, int flags)
6054 {
6055 	struct btrfs_root *root = BTRFS_I(inode)->root;
6056 	bool dirty = flags & ~S_VERSION;
6057 
6058 	if (btrfs_root_readonly(root))
6059 		return -EROFS;
6060 
6061 	dirty = inode_update_timestamps(inode, flags);
6062 	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6063 }
6064 
6065 /*
6066  * helper to find a free sequence number in a given directory.  This current
6067  * code is very simple, later versions will do smarter things in the btree
6068  */
6069 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6070 {
6071 	int ret = 0;
6072 
6073 	if (dir->index_cnt == (u64)-1) {
6074 		ret = btrfs_inode_delayed_dir_index_count(dir);
6075 		if (ret) {
6076 			ret = btrfs_set_inode_index_count(dir);
6077 			if (ret)
6078 				return ret;
6079 		}
6080 	}
6081 
6082 	*index = dir->index_cnt;
6083 	dir->index_cnt++;
6084 
6085 	return ret;
6086 }
6087 
6088 static int btrfs_insert_inode_locked(struct inode *inode)
6089 {
6090 	struct btrfs_iget_args args;
6091 
6092 	args.ino = BTRFS_I(inode)->location.objectid;
6093 	args.root = BTRFS_I(inode)->root;
6094 
6095 	return insert_inode_locked4(inode,
6096 		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6097 		   btrfs_find_actor, &args);
6098 }
6099 
6100 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6101 			    unsigned int *trans_num_items)
6102 {
6103 	struct inode *dir = args->dir;
6104 	struct inode *inode = args->inode;
6105 	int ret;
6106 
6107 	if (!args->orphan) {
6108 		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6109 					     &args->fname);
6110 		if (ret)
6111 			return ret;
6112 	}
6113 
6114 	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6115 	if (ret) {
6116 		fscrypt_free_filename(&args->fname);
6117 		return ret;
6118 	}
6119 
6120 	/* 1 to add inode item */
6121 	*trans_num_items = 1;
6122 	/* 1 to add compression property */
6123 	if (BTRFS_I(dir)->prop_compress)
6124 		(*trans_num_items)++;
6125 	/* 1 to add default ACL xattr */
6126 	if (args->default_acl)
6127 		(*trans_num_items)++;
6128 	/* 1 to add access ACL xattr */
6129 	if (args->acl)
6130 		(*trans_num_items)++;
6131 #ifdef CONFIG_SECURITY
6132 	/* 1 to add LSM xattr */
6133 	if (dir->i_security)
6134 		(*trans_num_items)++;
6135 #endif
6136 	if (args->orphan) {
6137 		/* 1 to add orphan item */
6138 		(*trans_num_items)++;
6139 	} else {
6140 		/*
6141 		 * 1 to add dir item
6142 		 * 1 to add dir index
6143 		 * 1 to update parent inode item
6144 		 *
6145 		 * No need for 1 unit for the inode ref item because it is
6146 		 * inserted in a batch together with the inode item at
6147 		 * btrfs_create_new_inode().
6148 		 */
6149 		*trans_num_items += 3;
6150 	}
6151 	return 0;
6152 }
6153 
6154 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6155 {
6156 	posix_acl_release(args->acl);
6157 	posix_acl_release(args->default_acl);
6158 	fscrypt_free_filename(&args->fname);
6159 }
6160 
6161 /*
6162  * Inherit flags from the parent inode.
6163  *
6164  * Currently only the compression flags and the cow flags are inherited.
6165  */
6166 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6167 {
6168 	unsigned int flags;
6169 
6170 	flags = dir->flags;
6171 
6172 	if (flags & BTRFS_INODE_NOCOMPRESS) {
6173 		inode->flags &= ~BTRFS_INODE_COMPRESS;
6174 		inode->flags |= BTRFS_INODE_NOCOMPRESS;
6175 	} else if (flags & BTRFS_INODE_COMPRESS) {
6176 		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6177 		inode->flags |= BTRFS_INODE_COMPRESS;
6178 	}
6179 
6180 	if (flags & BTRFS_INODE_NODATACOW) {
6181 		inode->flags |= BTRFS_INODE_NODATACOW;
6182 		if (S_ISREG(inode->vfs_inode.i_mode))
6183 			inode->flags |= BTRFS_INODE_NODATASUM;
6184 	}
6185 
6186 	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6187 }
6188 
6189 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6190 			   struct btrfs_new_inode_args *args)
6191 {
6192 	struct inode *dir = args->dir;
6193 	struct inode *inode = args->inode;
6194 	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6195 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6196 	struct btrfs_root *root;
6197 	struct btrfs_inode_item *inode_item;
6198 	struct btrfs_key *location;
6199 	struct btrfs_path *path;
6200 	u64 objectid;
6201 	struct btrfs_inode_ref *ref;
6202 	struct btrfs_key key[2];
6203 	u32 sizes[2];
6204 	struct btrfs_item_batch batch;
6205 	unsigned long ptr;
6206 	int ret;
6207 
6208 	path = btrfs_alloc_path();
6209 	if (!path)
6210 		return -ENOMEM;
6211 
6212 	if (!args->subvol)
6213 		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6214 	root = BTRFS_I(inode)->root;
6215 
6216 	ret = btrfs_get_free_objectid(root, &objectid);
6217 	if (ret)
6218 		goto out;
6219 	inode->i_ino = objectid;
6220 
6221 	if (args->orphan) {
6222 		/*
6223 		 * O_TMPFILE, set link count to 0, so that after this point, we
6224 		 * fill in an inode item with the correct link count.
6225 		 */
6226 		set_nlink(inode, 0);
6227 	} else {
6228 		trace_btrfs_inode_request(dir);
6229 
6230 		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6231 		if (ret)
6232 			goto out;
6233 	}
6234 	/* index_cnt is ignored for everything but a dir. */
6235 	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6236 	BTRFS_I(inode)->generation = trans->transid;
6237 	inode->i_generation = BTRFS_I(inode)->generation;
6238 
6239 	/*
6240 	 * Subvolumes don't inherit flags from their parent directory.
6241 	 * Originally this was probably by accident, but we probably can't
6242 	 * change it now without compatibility issues.
6243 	 */
6244 	if (!args->subvol)
6245 		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6246 
6247 	if (S_ISREG(inode->i_mode)) {
6248 		if (btrfs_test_opt(fs_info, NODATASUM))
6249 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6250 		if (btrfs_test_opt(fs_info, NODATACOW))
6251 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6252 				BTRFS_INODE_NODATASUM;
6253 	}
6254 
6255 	location = &BTRFS_I(inode)->location;
6256 	location->objectid = objectid;
6257 	location->offset = 0;
6258 	location->type = BTRFS_INODE_ITEM_KEY;
6259 
6260 	ret = btrfs_insert_inode_locked(inode);
6261 	if (ret < 0) {
6262 		if (!args->orphan)
6263 			BTRFS_I(dir)->index_cnt--;
6264 		goto out;
6265 	}
6266 
6267 	/*
6268 	 * We could have gotten an inode number from somebody who was fsynced
6269 	 * and then removed in this same transaction, so let's just set full
6270 	 * sync since it will be a full sync anyway and this will blow away the
6271 	 * old info in the log.
6272 	 */
6273 	btrfs_set_inode_full_sync(BTRFS_I(inode));
6274 
6275 	key[0].objectid = objectid;
6276 	key[0].type = BTRFS_INODE_ITEM_KEY;
6277 	key[0].offset = 0;
6278 
6279 	sizes[0] = sizeof(struct btrfs_inode_item);
6280 
6281 	if (!args->orphan) {
6282 		/*
6283 		 * Start new inodes with an inode_ref. This is slightly more
6284 		 * efficient for small numbers of hard links since they will
6285 		 * be packed into one item. Extended refs will kick in if we
6286 		 * add more hard links than can fit in the ref item.
6287 		 */
6288 		key[1].objectid = objectid;
6289 		key[1].type = BTRFS_INODE_REF_KEY;
6290 		if (args->subvol) {
6291 			key[1].offset = objectid;
6292 			sizes[1] = 2 + sizeof(*ref);
6293 		} else {
6294 			key[1].offset = btrfs_ino(BTRFS_I(dir));
6295 			sizes[1] = name->len + sizeof(*ref);
6296 		}
6297 	}
6298 
6299 	batch.keys = &key[0];
6300 	batch.data_sizes = &sizes[0];
6301 	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6302 	batch.nr = args->orphan ? 1 : 2;
6303 	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6304 	if (ret != 0) {
6305 		btrfs_abort_transaction(trans, ret);
6306 		goto discard;
6307 	}
6308 
6309 	inode->i_mtime = inode_set_ctime_current(inode);
6310 	inode->i_atime = inode->i_mtime;
6311 	BTRFS_I(inode)->i_otime = inode->i_mtime;
6312 
6313 	/*
6314 	 * We're going to fill the inode item now, so at this point the inode
6315 	 * must be fully initialized.
6316 	 */
6317 
6318 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6319 				  struct btrfs_inode_item);
6320 	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6321 			     sizeof(*inode_item));
6322 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6323 
6324 	if (!args->orphan) {
6325 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6326 				     struct btrfs_inode_ref);
6327 		ptr = (unsigned long)(ref + 1);
6328 		if (args->subvol) {
6329 			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6330 			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6331 			write_extent_buffer(path->nodes[0], "..", ptr, 2);
6332 		} else {
6333 			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6334 						     name->len);
6335 			btrfs_set_inode_ref_index(path->nodes[0], ref,
6336 						  BTRFS_I(inode)->dir_index);
6337 			write_extent_buffer(path->nodes[0], name->name, ptr,
6338 					    name->len);
6339 		}
6340 	}
6341 
6342 	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
6343 	/*
6344 	 * We don't need the path anymore, plus inheriting properties, adding
6345 	 * ACLs, security xattrs, orphan item or adding the link, will result in
6346 	 * allocating yet another path. So just free our path.
6347 	 */
6348 	btrfs_free_path(path);
6349 	path = NULL;
6350 
6351 	if (args->subvol) {
6352 		struct inode *parent;
6353 
6354 		/*
6355 		 * Subvolumes inherit properties from their parent subvolume,
6356 		 * not the directory they were created in.
6357 		 */
6358 		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
6359 				    BTRFS_I(dir)->root);
6360 		if (IS_ERR(parent)) {
6361 			ret = PTR_ERR(parent);
6362 		} else {
6363 			ret = btrfs_inode_inherit_props(trans, inode, parent);
6364 			iput(parent);
6365 		}
6366 	} else {
6367 		ret = btrfs_inode_inherit_props(trans, inode, dir);
6368 	}
6369 	if (ret) {
6370 		btrfs_err(fs_info,
6371 			  "error inheriting props for ino %llu (root %llu): %d",
6372 			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
6373 			  ret);
6374 	}
6375 
6376 	/*
6377 	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6378 	 * probably a bug.
6379 	 */
6380 	if (!args->subvol) {
6381 		ret = btrfs_init_inode_security(trans, args);
6382 		if (ret) {
6383 			btrfs_abort_transaction(trans, ret);
6384 			goto discard;
6385 		}
6386 	}
6387 
6388 	inode_tree_add(BTRFS_I(inode));
6389 
6390 	trace_btrfs_inode_new(inode);
6391 	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6392 
6393 	btrfs_update_root_times(trans, root);
6394 
6395 	if (args->orphan) {
6396 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6397 	} else {
6398 		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6399 				     0, BTRFS_I(inode)->dir_index);
6400 	}
6401 	if (ret) {
6402 		btrfs_abort_transaction(trans, ret);
6403 		goto discard;
6404 	}
6405 
6406 	return 0;
6407 
6408 discard:
6409 	/*
6410 	 * discard_new_inode() calls iput(), but the caller owns the reference
6411 	 * to the inode.
6412 	 */
6413 	ihold(inode);
6414 	discard_new_inode(inode);
6415 out:
6416 	btrfs_free_path(path);
6417 	return ret;
6418 }
6419 
6420 /*
6421  * utility function to add 'inode' into 'parent_inode' with
6422  * a give name and a given sequence number.
6423  * if 'add_backref' is true, also insert a backref from the
6424  * inode to the parent directory.
6425  */
6426 int btrfs_add_link(struct btrfs_trans_handle *trans,
6427 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6428 		   const struct fscrypt_str *name, int add_backref, u64 index)
6429 {
6430 	int ret = 0;
6431 	struct btrfs_key key;
6432 	struct btrfs_root *root = parent_inode->root;
6433 	u64 ino = btrfs_ino(inode);
6434 	u64 parent_ino = btrfs_ino(parent_inode);
6435 
6436 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6437 		memcpy(&key, &inode->root->root_key, sizeof(key));
6438 	} else {
6439 		key.objectid = ino;
6440 		key.type = BTRFS_INODE_ITEM_KEY;
6441 		key.offset = 0;
6442 	}
6443 
6444 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6445 		ret = btrfs_add_root_ref(trans, key.objectid,
6446 					 root->root_key.objectid, parent_ino,
6447 					 index, name);
6448 	} else if (add_backref) {
6449 		ret = btrfs_insert_inode_ref(trans, root, name,
6450 					     ino, parent_ino, index);
6451 	}
6452 
6453 	/* Nothing to clean up yet */
6454 	if (ret)
6455 		return ret;
6456 
6457 	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6458 				    btrfs_inode_type(&inode->vfs_inode), index);
6459 	if (ret == -EEXIST || ret == -EOVERFLOW)
6460 		goto fail_dir_item;
6461 	else if (ret) {
6462 		btrfs_abort_transaction(trans, ret);
6463 		return ret;
6464 	}
6465 
6466 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6467 			   name->len * 2);
6468 	inode_inc_iversion(&parent_inode->vfs_inode);
6469 	/*
6470 	 * If we are replaying a log tree, we do not want to update the mtime
6471 	 * and ctime of the parent directory with the current time, since the
6472 	 * log replay procedure is responsible for setting them to their correct
6473 	 * values (the ones it had when the fsync was done).
6474 	 */
6475 	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6476 		parent_inode->vfs_inode.i_mtime =
6477 			inode_set_ctime_current(&parent_inode->vfs_inode);
6478 
6479 	ret = btrfs_update_inode(trans, root, parent_inode);
6480 	if (ret)
6481 		btrfs_abort_transaction(trans, ret);
6482 	return ret;
6483 
6484 fail_dir_item:
6485 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6486 		u64 local_index;
6487 		int err;
6488 		err = btrfs_del_root_ref(trans, key.objectid,
6489 					 root->root_key.objectid, parent_ino,
6490 					 &local_index, name);
6491 		if (err)
6492 			btrfs_abort_transaction(trans, err);
6493 	} else if (add_backref) {
6494 		u64 local_index;
6495 		int err;
6496 
6497 		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6498 					  &local_index);
6499 		if (err)
6500 			btrfs_abort_transaction(trans, err);
6501 	}
6502 
6503 	/* Return the original error code */
6504 	return ret;
6505 }
6506 
6507 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6508 			       struct inode *inode)
6509 {
6510 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6511 	struct btrfs_root *root = BTRFS_I(dir)->root;
6512 	struct btrfs_new_inode_args new_inode_args = {
6513 		.dir = dir,
6514 		.dentry = dentry,
6515 		.inode = inode,
6516 	};
6517 	unsigned int trans_num_items;
6518 	struct btrfs_trans_handle *trans;
6519 	int err;
6520 
6521 	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6522 	if (err)
6523 		goto out_inode;
6524 
6525 	trans = btrfs_start_transaction(root, trans_num_items);
6526 	if (IS_ERR(trans)) {
6527 		err = PTR_ERR(trans);
6528 		goto out_new_inode_args;
6529 	}
6530 
6531 	err = btrfs_create_new_inode(trans, &new_inode_args);
6532 	if (!err)
6533 		d_instantiate_new(dentry, inode);
6534 
6535 	btrfs_end_transaction(trans);
6536 	btrfs_btree_balance_dirty(fs_info);
6537 out_new_inode_args:
6538 	btrfs_new_inode_args_destroy(&new_inode_args);
6539 out_inode:
6540 	if (err)
6541 		iput(inode);
6542 	return err;
6543 }
6544 
6545 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6546 		       struct dentry *dentry, umode_t mode, dev_t rdev)
6547 {
6548 	struct inode *inode;
6549 
6550 	inode = new_inode(dir->i_sb);
6551 	if (!inode)
6552 		return -ENOMEM;
6553 	inode_init_owner(idmap, inode, dir, mode);
6554 	inode->i_op = &btrfs_special_inode_operations;
6555 	init_special_inode(inode, inode->i_mode, rdev);
6556 	return btrfs_create_common(dir, dentry, inode);
6557 }
6558 
6559 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6560 			struct dentry *dentry, umode_t mode, bool excl)
6561 {
6562 	struct inode *inode;
6563 
6564 	inode = new_inode(dir->i_sb);
6565 	if (!inode)
6566 		return -ENOMEM;
6567 	inode_init_owner(idmap, inode, dir, mode);
6568 	inode->i_fop = &btrfs_file_operations;
6569 	inode->i_op = &btrfs_file_inode_operations;
6570 	inode->i_mapping->a_ops = &btrfs_aops;
6571 	return btrfs_create_common(dir, dentry, inode);
6572 }
6573 
6574 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6575 		      struct dentry *dentry)
6576 {
6577 	struct btrfs_trans_handle *trans = NULL;
6578 	struct btrfs_root *root = BTRFS_I(dir)->root;
6579 	struct inode *inode = d_inode(old_dentry);
6580 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6581 	struct fscrypt_name fname;
6582 	u64 index;
6583 	int err;
6584 	int drop_inode = 0;
6585 
6586 	/* do not allow sys_link's with other subvols of the same device */
6587 	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6588 		return -EXDEV;
6589 
6590 	if (inode->i_nlink >= BTRFS_LINK_MAX)
6591 		return -EMLINK;
6592 
6593 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6594 	if (err)
6595 		goto fail;
6596 
6597 	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6598 	if (err)
6599 		goto fail;
6600 
6601 	/*
6602 	 * 2 items for inode and inode ref
6603 	 * 2 items for dir items
6604 	 * 1 item for parent inode
6605 	 * 1 item for orphan item deletion if O_TMPFILE
6606 	 */
6607 	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6608 	if (IS_ERR(trans)) {
6609 		err = PTR_ERR(trans);
6610 		trans = NULL;
6611 		goto fail;
6612 	}
6613 
6614 	/* There are several dir indexes for this inode, clear the cache. */
6615 	BTRFS_I(inode)->dir_index = 0ULL;
6616 	inc_nlink(inode);
6617 	inode_inc_iversion(inode);
6618 	inode_set_ctime_current(inode);
6619 	ihold(inode);
6620 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6621 
6622 	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6623 			     &fname.disk_name, 1, index);
6624 
6625 	if (err) {
6626 		drop_inode = 1;
6627 	} else {
6628 		struct dentry *parent = dentry->d_parent;
6629 
6630 		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6631 		if (err)
6632 			goto fail;
6633 		if (inode->i_nlink == 1) {
6634 			/*
6635 			 * If new hard link count is 1, it's a file created
6636 			 * with open(2) O_TMPFILE flag.
6637 			 */
6638 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6639 			if (err)
6640 				goto fail;
6641 		}
6642 		d_instantiate(dentry, inode);
6643 		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6644 	}
6645 
6646 fail:
6647 	fscrypt_free_filename(&fname);
6648 	if (trans)
6649 		btrfs_end_transaction(trans);
6650 	if (drop_inode) {
6651 		inode_dec_link_count(inode);
6652 		iput(inode);
6653 	}
6654 	btrfs_btree_balance_dirty(fs_info);
6655 	return err;
6656 }
6657 
6658 static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6659 		       struct dentry *dentry, umode_t mode)
6660 {
6661 	struct inode *inode;
6662 
6663 	inode = new_inode(dir->i_sb);
6664 	if (!inode)
6665 		return -ENOMEM;
6666 	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6667 	inode->i_op = &btrfs_dir_inode_operations;
6668 	inode->i_fop = &btrfs_dir_file_operations;
6669 	return btrfs_create_common(dir, dentry, inode);
6670 }
6671 
6672 static noinline int uncompress_inline(struct btrfs_path *path,
6673 				      struct page *page,
6674 				      struct btrfs_file_extent_item *item)
6675 {
6676 	int ret;
6677 	struct extent_buffer *leaf = path->nodes[0];
6678 	char *tmp;
6679 	size_t max_size;
6680 	unsigned long inline_size;
6681 	unsigned long ptr;
6682 	int compress_type;
6683 
6684 	compress_type = btrfs_file_extent_compression(leaf, item);
6685 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6686 	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6687 	tmp = kmalloc(inline_size, GFP_NOFS);
6688 	if (!tmp)
6689 		return -ENOMEM;
6690 	ptr = btrfs_file_extent_inline_start(item);
6691 
6692 	read_extent_buffer(leaf, tmp, ptr, inline_size);
6693 
6694 	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6695 	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
6696 
6697 	/*
6698 	 * decompression code contains a memset to fill in any space between the end
6699 	 * of the uncompressed data and the end of max_size in case the decompressed
6700 	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6701 	 * the end of an inline extent and the beginning of the next block, so we
6702 	 * cover that region here.
6703 	 */
6704 
6705 	if (max_size < PAGE_SIZE)
6706 		memzero_page(page, max_size, PAGE_SIZE - max_size);
6707 	kfree(tmp);
6708 	return ret;
6709 }
6710 
6711 static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6712 			      struct page *page)
6713 {
6714 	struct btrfs_file_extent_item *fi;
6715 	void *kaddr;
6716 	size_t copy_size;
6717 
6718 	if (!page || PageUptodate(page))
6719 		return 0;
6720 
6721 	ASSERT(page_offset(page) == 0);
6722 
6723 	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6724 			    struct btrfs_file_extent_item);
6725 	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6726 		return uncompress_inline(path, page, fi);
6727 
6728 	copy_size = min_t(u64, PAGE_SIZE,
6729 			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6730 	kaddr = kmap_local_page(page);
6731 	read_extent_buffer(path->nodes[0], kaddr,
6732 			   btrfs_file_extent_inline_start(fi), copy_size);
6733 	kunmap_local(kaddr);
6734 	if (copy_size < PAGE_SIZE)
6735 		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
6736 	return 0;
6737 }
6738 
6739 /*
6740  * Lookup the first extent overlapping a range in a file.
6741  *
6742  * @inode:	file to search in
6743  * @page:	page to read extent data into if the extent is inline
6744  * @pg_offset:	offset into @page to copy to
6745  * @start:	file offset
6746  * @len:	length of range starting at @start
6747  *
6748  * Return the first &struct extent_map which overlaps the given range, reading
6749  * it from the B-tree and caching it if necessary. Note that there may be more
6750  * extents which overlap the given range after the returned extent_map.
6751  *
6752  * If @page is not NULL and the extent is inline, this also reads the extent
6753  * data directly into the page and marks the extent up to date in the io_tree.
6754  *
6755  * Return: ERR_PTR on error, non-NULL extent_map on success.
6756  */
6757 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6758 				    struct page *page, size_t pg_offset,
6759 				    u64 start, u64 len)
6760 {
6761 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6762 	int ret = 0;
6763 	u64 extent_start = 0;
6764 	u64 extent_end = 0;
6765 	u64 objectid = btrfs_ino(inode);
6766 	int extent_type = -1;
6767 	struct btrfs_path *path = NULL;
6768 	struct btrfs_root *root = inode->root;
6769 	struct btrfs_file_extent_item *item;
6770 	struct extent_buffer *leaf;
6771 	struct btrfs_key found_key;
6772 	struct extent_map *em = NULL;
6773 	struct extent_map_tree *em_tree = &inode->extent_tree;
6774 
6775 	read_lock(&em_tree->lock);
6776 	em = lookup_extent_mapping(em_tree, start, len);
6777 	read_unlock(&em_tree->lock);
6778 
6779 	if (em) {
6780 		if (em->start > start || em->start + em->len <= start)
6781 			free_extent_map(em);
6782 		else if (em->block_start == EXTENT_MAP_INLINE && page)
6783 			free_extent_map(em);
6784 		else
6785 			goto out;
6786 	}
6787 	em = alloc_extent_map();
6788 	if (!em) {
6789 		ret = -ENOMEM;
6790 		goto out;
6791 	}
6792 	em->start = EXTENT_MAP_HOLE;
6793 	em->orig_start = EXTENT_MAP_HOLE;
6794 	em->len = (u64)-1;
6795 	em->block_len = (u64)-1;
6796 
6797 	path = btrfs_alloc_path();
6798 	if (!path) {
6799 		ret = -ENOMEM;
6800 		goto out;
6801 	}
6802 
6803 	/* Chances are we'll be called again, so go ahead and do readahead */
6804 	path->reada = READA_FORWARD;
6805 
6806 	/*
6807 	 * The same explanation in load_free_space_cache applies here as well,
6808 	 * we only read when we're loading the free space cache, and at that
6809 	 * point the commit_root has everything we need.
6810 	 */
6811 	if (btrfs_is_free_space_inode(inode)) {
6812 		path->search_commit_root = 1;
6813 		path->skip_locking = 1;
6814 	}
6815 
6816 	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6817 	if (ret < 0) {
6818 		goto out;
6819 	} else if (ret > 0) {
6820 		if (path->slots[0] == 0)
6821 			goto not_found;
6822 		path->slots[0]--;
6823 		ret = 0;
6824 	}
6825 
6826 	leaf = path->nodes[0];
6827 	item = btrfs_item_ptr(leaf, path->slots[0],
6828 			      struct btrfs_file_extent_item);
6829 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6830 	if (found_key.objectid != objectid ||
6831 	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6832 		/*
6833 		 * If we backup past the first extent we want to move forward
6834 		 * and see if there is an extent in front of us, otherwise we'll
6835 		 * say there is a hole for our whole search range which can
6836 		 * cause problems.
6837 		 */
6838 		extent_end = start;
6839 		goto next;
6840 	}
6841 
6842 	extent_type = btrfs_file_extent_type(leaf, item);
6843 	extent_start = found_key.offset;
6844 	extent_end = btrfs_file_extent_end(path);
6845 	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6846 	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6847 		/* Only regular file could have regular/prealloc extent */
6848 		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6849 			ret = -EUCLEAN;
6850 			btrfs_crit(fs_info,
6851 		"regular/prealloc extent found for non-regular inode %llu",
6852 				   btrfs_ino(inode));
6853 			goto out;
6854 		}
6855 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6856 						       extent_start);
6857 	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6858 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6859 						      path->slots[0],
6860 						      extent_start);
6861 	}
6862 next:
6863 	if (start >= extent_end) {
6864 		path->slots[0]++;
6865 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6866 			ret = btrfs_next_leaf(root, path);
6867 			if (ret < 0)
6868 				goto out;
6869 			else if (ret > 0)
6870 				goto not_found;
6871 
6872 			leaf = path->nodes[0];
6873 		}
6874 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6875 		if (found_key.objectid != objectid ||
6876 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6877 			goto not_found;
6878 		if (start + len <= found_key.offset)
6879 			goto not_found;
6880 		if (start > found_key.offset)
6881 			goto next;
6882 
6883 		/* New extent overlaps with existing one */
6884 		em->start = start;
6885 		em->orig_start = start;
6886 		em->len = found_key.offset - start;
6887 		em->block_start = EXTENT_MAP_HOLE;
6888 		goto insert;
6889 	}
6890 
6891 	btrfs_extent_item_to_extent_map(inode, path, item, em);
6892 
6893 	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6894 	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6895 		goto insert;
6896 	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6897 		/*
6898 		 * Inline extent can only exist at file offset 0. This is
6899 		 * ensured by tree-checker and inline extent creation path.
6900 		 * Thus all members representing file offsets should be zero.
6901 		 */
6902 		ASSERT(pg_offset == 0);
6903 		ASSERT(extent_start == 0);
6904 		ASSERT(em->start == 0);
6905 
6906 		/*
6907 		 * btrfs_extent_item_to_extent_map() should have properly
6908 		 * initialized em members already.
6909 		 *
6910 		 * Other members are not utilized for inline extents.
6911 		 */
6912 		ASSERT(em->block_start == EXTENT_MAP_INLINE);
6913 		ASSERT(em->len == fs_info->sectorsize);
6914 
6915 		ret = read_inline_extent(inode, path, page);
6916 		if (ret < 0)
6917 			goto out;
6918 		goto insert;
6919 	}
6920 not_found:
6921 	em->start = start;
6922 	em->orig_start = start;
6923 	em->len = len;
6924 	em->block_start = EXTENT_MAP_HOLE;
6925 insert:
6926 	ret = 0;
6927 	btrfs_release_path(path);
6928 	if (em->start > start || extent_map_end(em) <= start) {
6929 		btrfs_err(fs_info,
6930 			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
6931 			  em->start, em->len, start, len);
6932 		ret = -EIO;
6933 		goto out;
6934 	}
6935 
6936 	write_lock(&em_tree->lock);
6937 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6938 	write_unlock(&em_tree->lock);
6939 out:
6940 	btrfs_free_path(path);
6941 
6942 	trace_btrfs_get_extent(root, inode, em);
6943 
6944 	if (ret) {
6945 		free_extent_map(em);
6946 		return ERR_PTR(ret);
6947 	}
6948 	return em;
6949 }
6950 
6951 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
6952 						  struct btrfs_dio_data *dio_data,
6953 						  const u64 start,
6954 						  const u64 len,
6955 						  const u64 orig_start,
6956 						  const u64 block_start,
6957 						  const u64 block_len,
6958 						  const u64 orig_block_len,
6959 						  const u64 ram_bytes,
6960 						  const int type)
6961 {
6962 	struct extent_map *em = NULL;
6963 	struct btrfs_ordered_extent *ordered;
6964 
6965 	if (type != BTRFS_ORDERED_NOCOW) {
6966 		em = create_io_em(inode, start, len, orig_start, block_start,
6967 				  block_len, orig_block_len, ram_bytes,
6968 				  BTRFS_COMPRESS_NONE, /* compress_type */
6969 				  type);
6970 		if (IS_ERR(em))
6971 			goto out;
6972 	}
6973 	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
6974 					     block_start, block_len, 0,
6975 					     (1 << type) |
6976 					     (1 << BTRFS_ORDERED_DIRECT),
6977 					     BTRFS_COMPRESS_NONE);
6978 	if (IS_ERR(ordered)) {
6979 		if (em) {
6980 			free_extent_map(em);
6981 			btrfs_drop_extent_map_range(inode, start,
6982 						    start + len - 1, false);
6983 		}
6984 		em = ERR_CAST(ordered);
6985 	} else {
6986 		ASSERT(!dio_data->ordered);
6987 		dio_data->ordered = ordered;
6988 	}
6989  out:
6990 
6991 	return em;
6992 }
6993 
6994 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
6995 						  struct btrfs_dio_data *dio_data,
6996 						  u64 start, u64 len)
6997 {
6998 	struct btrfs_root *root = inode->root;
6999 	struct btrfs_fs_info *fs_info = root->fs_info;
7000 	struct extent_map *em;
7001 	struct btrfs_key ins;
7002 	u64 alloc_hint;
7003 	int ret;
7004 
7005 	alloc_hint = get_extent_allocation_hint(inode, start, len);
7006 again:
7007 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7008 				   0, alloc_hint, &ins, 1, 1);
7009 	if (ret == -EAGAIN) {
7010 		ASSERT(btrfs_is_zoned(fs_info));
7011 		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
7012 			       TASK_UNINTERRUPTIBLE);
7013 		goto again;
7014 	}
7015 	if (ret)
7016 		return ERR_PTR(ret);
7017 
7018 	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
7019 				     ins.objectid, ins.offset, ins.offset,
7020 				     ins.offset, BTRFS_ORDERED_REGULAR);
7021 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7022 	if (IS_ERR(em))
7023 		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7024 					   1);
7025 
7026 	return em;
7027 }
7028 
7029 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7030 {
7031 	struct btrfs_block_group *block_group;
7032 	bool readonly = false;
7033 
7034 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
7035 	if (!block_group || block_group->ro)
7036 		readonly = true;
7037 	if (block_group)
7038 		btrfs_put_block_group(block_group);
7039 	return readonly;
7040 }
7041 
7042 /*
7043  * Check if we can do nocow write into the range [@offset, @offset + @len)
7044  *
7045  * @offset:	File offset
7046  * @len:	The length to write, will be updated to the nocow writeable
7047  *		range
7048  * @orig_start:	(optional) Return the original file offset of the file extent
7049  * @orig_len:	(optional) Return the original on-disk length of the file extent
7050  * @ram_bytes:	(optional) Return the ram_bytes of the file extent
7051  * @strict:	if true, omit optimizations that might force us into unnecessary
7052  *		cow. e.g., don't trust generation number.
7053  *
7054  * Return:
7055  * >0	and update @len if we can do nocow write
7056  *  0	if we can't do nocow write
7057  * <0	if error happened
7058  *
7059  * NOTE: This only checks the file extents, caller is responsible to wait for
7060  *	 any ordered extents.
7061  */
7062 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7063 			      u64 *orig_start, u64 *orig_block_len,
7064 			      u64 *ram_bytes, bool nowait, bool strict)
7065 {
7066 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7067 	struct can_nocow_file_extent_args nocow_args = { 0 };
7068 	struct btrfs_path *path;
7069 	int ret;
7070 	struct extent_buffer *leaf;
7071 	struct btrfs_root *root = BTRFS_I(inode)->root;
7072 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7073 	struct btrfs_file_extent_item *fi;
7074 	struct btrfs_key key;
7075 	int found_type;
7076 
7077 	path = btrfs_alloc_path();
7078 	if (!path)
7079 		return -ENOMEM;
7080 	path->nowait = nowait;
7081 
7082 	ret = btrfs_lookup_file_extent(NULL, root, path,
7083 			btrfs_ino(BTRFS_I(inode)), offset, 0);
7084 	if (ret < 0)
7085 		goto out;
7086 
7087 	if (ret == 1) {
7088 		if (path->slots[0] == 0) {
7089 			/* can't find the item, must cow */
7090 			ret = 0;
7091 			goto out;
7092 		}
7093 		path->slots[0]--;
7094 	}
7095 	ret = 0;
7096 	leaf = path->nodes[0];
7097 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7098 	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7099 	    key.type != BTRFS_EXTENT_DATA_KEY) {
7100 		/* not our file or wrong item type, must cow */
7101 		goto out;
7102 	}
7103 
7104 	if (key.offset > offset) {
7105 		/* Wrong offset, must cow */
7106 		goto out;
7107 	}
7108 
7109 	if (btrfs_file_extent_end(path) <= offset)
7110 		goto out;
7111 
7112 	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7113 	found_type = btrfs_file_extent_type(leaf, fi);
7114 	if (ram_bytes)
7115 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7116 
7117 	nocow_args.start = offset;
7118 	nocow_args.end = offset + *len - 1;
7119 	nocow_args.strict = strict;
7120 	nocow_args.free_path = true;
7121 
7122 	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7123 	/* can_nocow_file_extent() has freed the path. */
7124 	path = NULL;
7125 
7126 	if (ret != 1) {
7127 		/* Treat errors as not being able to NOCOW. */
7128 		ret = 0;
7129 		goto out;
7130 	}
7131 
7132 	ret = 0;
7133 	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
7134 		goto out;
7135 
7136 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7137 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7138 		u64 range_end;
7139 
7140 		range_end = round_up(offset + nocow_args.num_bytes,
7141 				     root->fs_info->sectorsize) - 1;
7142 		ret = test_range_bit(io_tree, offset, range_end,
7143 				     EXTENT_DELALLOC, 0, NULL);
7144 		if (ret) {
7145 			ret = -EAGAIN;
7146 			goto out;
7147 		}
7148 	}
7149 
7150 	if (orig_start)
7151 		*orig_start = key.offset - nocow_args.extent_offset;
7152 	if (orig_block_len)
7153 		*orig_block_len = nocow_args.disk_num_bytes;
7154 
7155 	*len = nocow_args.num_bytes;
7156 	ret = 1;
7157 out:
7158 	btrfs_free_path(path);
7159 	return ret;
7160 }
7161 
7162 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7163 			      struct extent_state **cached_state,
7164 			      unsigned int iomap_flags)
7165 {
7166 	const bool writing = (iomap_flags & IOMAP_WRITE);
7167 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7168 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7169 	struct btrfs_ordered_extent *ordered;
7170 	int ret = 0;
7171 
7172 	while (1) {
7173 		if (nowait) {
7174 			if (!try_lock_extent(io_tree, lockstart, lockend,
7175 					     cached_state))
7176 				return -EAGAIN;
7177 		} else {
7178 			lock_extent(io_tree, lockstart, lockend, cached_state);
7179 		}
7180 		/*
7181 		 * We're concerned with the entire range that we're going to be
7182 		 * doing DIO to, so we need to make sure there's no ordered
7183 		 * extents in this range.
7184 		 */
7185 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7186 						     lockend - lockstart + 1);
7187 
7188 		/*
7189 		 * We need to make sure there are no buffered pages in this
7190 		 * range either, we could have raced between the invalidate in
7191 		 * generic_file_direct_write and locking the extent.  The
7192 		 * invalidate needs to happen so that reads after a write do not
7193 		 * get stale data.
7194 		 */
7195 		if (!ordered &&
7196 		    (!writing || !filemap_range_has_page(inode->i_mapping,
7197 							 lockstart, lockend)))
7198 			break;
7199 
7200 		unlock_extent(io_tree, lockstart, lockend, cached_state);
7201 
7202 		if (ordered) {
7203 			if (nowait) {
7204 				btrfs_put_ordered_extent(ordered);
7205 				ret = -EAGAIN;
7206 				break;
7207 			}
7208 			/*
7209 			 * If we are doing a DIO read and the ordered extent we
7210 			 * found is for a buffered write, we can not wait for it
7211 			 * to complete and retry, because if we do so we can
7212 			 * deadlock with concurrent buffered writes on page
7213 			 * locks. This happens only if our DIO read covers more
7214 			 * than one extent map, if at this point has already
7215 			 * created an ordered extent for a previous extent map
7216 			 * and locked its range in the inode's io tree, and a
7217 			 * concurrent write against that previous extent map's
7218 			 * range and this range started (we unlock the ranges
7219 			 * in the io tree only when the bios complete and
7220 			 * buffered writes always lock pages before attempting
7221 			 * to lock range in the io tree).
7222 			 */
7223 			if (writing ||
7224 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7225 				btrfs_start_ordered_extent(ordered);
7226 			else
7227 				ret = nowait ? -EAGAIN : -ENOTBLK;
7228 			btrfs_put_ordered_extent(ordered);
7229 		} else {
7230 			/*
7231 			 * We could trigger writeback for this range (and wait
7232 			 * for it to complete) and then invalidate the pages for
7233 			 * this range (through invalidate_inode_pages2_range()),
7234 			 * but that can lead us to a deadlock with a concurrent
7235 			 * call to readahead (a buffered read or a defrag call
7236 			 * triggered a readahead) on a page lock due to an
7237 			 * ordered dio extent we created before but did not have
7238 			 * yet a corresponding bio submitted (whence it can not
7239 			 * complete), which makes readahead wait for that
7240 			 * ordered extent to complete while holding a lock on
7241 			 * that page.
7242 			 */
7243 			ret = nowait ? -EAGAIN : -ENOTBLK;
7244 		}
7245 
7246 		if (ret)
7247 			break;
7248 
7249 		cond_resched();
7250 	}
7251 
7252 	return ret;
7253 }
7254 
7255 /* The callers of this must take lock_extent() */
7256 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7257 				       u64 len, u64 orig_start, u64 block_start,
7258 				       u64 block_len, u64 orig_block_len,
7259 				       u64 ram_bytes, int compress_type,
7260 				       int type)
7261 {
7262 	struct extent_map *em;
7263 	int ret;
7264 
7265 	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7266 	       type == BTRFS_ORDERED_COMPRESSED ||
7267 	       type == BTRFS_ORDERED_NOCOW ||
7268 	       type == BTRFS_ORDERED_REGULAR);
7269 
7270 	em = alloc_extent_map();
7271 	if (!em)
7272 		return ERR_PTR(-ENOMEM);
7273 
7274 	em->start = start;
7275 	em->orig_start = orig_start;
7276 	em->len = len;
7277 	em->block_len = block_len;
7278 	em->block_start = block_start;
7279 	em->orig_block_len = orig_block_len;
7280 	em->ram_bytes = ram_bytes;
7281 	em->generation = -1;
7282 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7283 	if (type == BTRFS_ORDERED_PREALLOC) {
7284 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7285 	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7286 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7287 		em->compress_type = compress_type;
7288 	}
7289 
7290 	ret = btrfs_replace_extent_map_range(inode, em, true);
7291 	if (ret) {
7292 		free_extent_map(em);
7293 		return ERR_PTR(ret);
7294 	}
7295 
7296 	/* em got 2 refs now, callers needs to do free_extent_map once. */
7297 	return em;
7298 }
7299 
7300 
7301 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7302 					 struct inode *inode,
7303 					 struct btrfs_dio_data *dio_data,
7304 					 u64 start, u64 *lenp,
7305 					 unsigned int iomap_flags)
7306 {
7307 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7308 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7309 	struct extent_map *em = *map;
7310 	int type;
7311 	u64 block_start, orig_start, orig_block_len, ram_bytes;
7312 	struct btrfs_block_group *bg;
7313 	bool can_nocow = false;
7314 	bool space_reserved = false;
7315 	u64 len = *lenp;
7316 	u64 prev_len;
7317 	int ret = 0;
7318 
7319 	/*
7320 	 * We don't allocate a new extent in the following cases
7321 	 *
7322 	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
7323 	 * existing extent.
7324 	 * 2) The extent is marked as PREALLOC. We're good to go here and can
7325 	 * just use the extent.
7326 	 *
7327 	 */
7328 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7329 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7330 	     em->block_start != EXTENT_MAP_HOLE)) {
7331 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7332 			type = BTRFS_ORDERED_PREALLOC;
7333 		else
7334 			type = BTRFS_ORDERED_NOCOW;
7335 		len = min(len, em->len - (start - em->start));
7336 		block_start = em->block_start + (start - em->start);
7337 
7338 		if (can_nocow_extent(inode, start, &len, &orig_start,
7339 				     &orig_block_len, &ram_bytes, false, false) == 1) {
7340 			bg = btrfs_inc_nocow_writers(fs_info, block_start);
7341 			if (bg)
7342 				can_nocow = true;
7343 		}
7344 	}
7345 
7346 	prev_len = len;
7347 	if (can_nocow) {
7348 		struct extent_map *em2;
7349 
7350 		/* We can NOCOW, so only need to reserve metadata space. */
7351 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7352 						      nowait);
7353 		if (ret < 0) {
7354 			/* Our caller expects us to free the input extent map. */
7355 			free_extent_map(em);
7356 			*map = NULL;
7357 			btrfs_dec_nocow_writers(bg);
7358 			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
7359 				ret = -EAGAIN;
7360 			goto out;
7361 		}
7362 		space_reserved = true;
7363 
7364 		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
7365 					      orig_start, block_start,
7366 					      len, orig_block_len,
7367 					      ram_bytes, type);
7368 		btrfs_dec_nocow_writers(bg);
7369 		if (type == BTRFS_ORDERED_PREALLOC) {
7370 			free_extent_map(em);
7371 			*map = em2;
7372 			em = em2;
7373 		}
7374 
7375 		if (IS_ERR(em2)) {
7376 			ret = PTR_ERR(em2);
7377 			goto out;
7378 		}
7379 
7380 		dio_data->nocow_done = true;
7381 	} else {
7382 		/* Our caller expects us to free the input extent map. */
7383 		free_extent_map(em);
7384 		*map = NULL;
7385 
7386 		if (nowait) {
7387 			ret = -EAGAIN;
7388 			goto out;
7389 		}
7390 
7391 		/*
7392 		 * If we could not allocate data space before locking the file
7393 		 * range and we can't do a NOCOW write, then we have to fail.
7394 		 */
7395 		if (!dio_data->data_space_reserved) {
7396 			ret = -ENOSPC;
7397 			goto out;
7398 		}
7399 
7400 		/*
7401 		 * We have to COW and we have already reserved data space before,
7402 		 * so now we reserve only metadata.
7403 		 */
7404 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7405 						      false);
7406 		if (ret < 0)
7407 			goto out;
7408 		space_reserved = true;
7409 
7410 		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
7411 		if (IS_ERR(em)) {
7412 			ret = PTR_ERR(em);
7413 			goto out;
7414 		}
7415 		*map = em;
7416 		len = min(len, em->len - (start - em->start));
7417 		if (len < prev_len)
7418 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
7419 							prev_len - len, true);
7420 	}
7421 
7422 	/*
7423 	 * We have created our ordered extent, so we can now release our reservation
7424 	 * for an outstanding extent.
7425 	 */
7426 	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7427 
7428 	/*
7429 	 * Need to update the i_size under the extent lock so buffered
7430 	 * readers will get the updated i_size when we unlock.
7431 	 */
7432 	if (start + len > i_size_read(inode))
7433 		i_size_write(inode, start + len);
7434 out:
7435 	if (ret && space_reserved) {
7436 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7437 		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7438 	}
7439 	*lenp = len;
7440 	return ret;
7441 }
7442 
7443 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7444 		loff_t length, unsigned int flags, struct iomap *iomap,
7445 		struct iomap *srcmap)
7446 {
7447 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7448 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7449 	struct extent_map *em;
7450 	struct extent_state *cached_state = NULL;
7451 	struct btrfs_dio_data *dio_data = iter->private;
7452 	u64 lockstart, lockend;
7453 	const bool write = !!(flags & IOMAP_WRITE);
7454 	int ret = 0;
7455 	u64 len = length;
7456 	const u64 data_alloc_len = length;
7457 	bool unlock_extents = false;
7458 
7459 	/*
7460 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
7461 	 * we're NOWAIT we may submit a bio for a partial range and return
7462 	 * EIOCBQUEUED, which would result in an errant short read.
7463 	 *
7464 	 * The best way to handle this would be to allow for partial completions
7465 	 * of iocb's, so we could submit the partial bio, return and fault in
7466 	 * the rest of the pages, and then submit the io for the rest of the
7467 	 * range.  However we don't have that currently, so simply return
7468 	 * -EAGAIN at this point so that the normal path is used.
7469 	 */
7470 	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
7471 		return -EAGAIN;
7472 
7473 	/*
7474 	 * Cap the size of reads to that usually seen in buffered I/O as we need
7475 	 * to allocate a contiguous array for the checksums.
7476 	 */
7477 	if (!write)
7478 		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
7479 
7480 	lockstart = start;
7481 	lockend = start + len - 1;
7482 
7483 	/*
7484 	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
7485 	 * enough if we've written compressed pages to this area, so we need to
7486 	 * flush the dirty pages again to make absolutely sure that any
7487 	 * outstanding dirty pages are on disk - the first flush only starts
7488 	 * compression on the data, while keeping the pages locked, so by the
7489 	 * time the second flush returns we know bios for the compressed pages
7490 	 * were submitted and finished, and the pages no longer under writeback.
7491 	 *
7492 	 * If we have a NOWAIT request and we have any pages in the range that
7493 	 * are locked, likely due to compression still in progress, we don't want
7494 	 * to block on page locks. We also don't want to block on pages marked as
7495 	 * dirty or under writeback (same as for the non-compression case).
7496 	 * iomap_dio_rw() did the same check, but after that and before we got
7497 	 * here, mmap'ed writes may have happened or buffered reads started
7498 	 * (readpage() and readahead(), which lock pages), as we haven't locked
7499 	 * the file range yet.
7500 	 */
7501 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7502 		     &BTRFS_I(inode)->runtime_flags)) {
7503 		if (flags & IOMAP_NOWAIT) {
7504 			if (filemap_range_needs_writeback(inode->i_mapping,
7505 							  lockstart, lockend))
7506 				return -EAGAIN;
7507 		} else {
7508 			ret = filemap_fdatawrite_range(inode->i_mapping, start,
7509 						       start + length - 1);
7510 			if (ret)
7511 				return ret;
7512 		}
7513 	}
7514 
7515 	memset(dio_data, 0, sizeof(*dio_data));
7516 
7517 	/*
7518 	 * We always try to allocate data space and must do it before locking
7519 	 * the file range, to avoid deadlocks with concurrent writes to the same
7520 	 * range if the range has several extents and the writes don't expand the
7521 	 * current i_size (the inode lock is taken in shared mode). If we fail to
7522 	 * allocate data space here we continue and later, after locking the
7523 	 * file range, we fail with ENOSPC only if we figure out we can not do a
7524 	 * NOCOW write.
7525 	 */
7526 	if (write && !(flags & IOMAP_NOWAIT)) {
7527 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
7528 						  &dio_data->data_reserved,
7529 						  start, data_alloc_len, false);
7530 		if (!ret)
7531 			dio_data->data_space_reserved = true;
7532 		else if (ret && !(BTRFS_I(inode)->flags &
7533 				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
7534 			goto err;
7535 	}
7536 
7537 	/*
7538 	 * If this errors out it's because we couldn't invalidate pagecache for
7539 	 * this range and we need to fallback to buffered IO, or we are doing a
7540 	 * NOWAIT read/write and we need to block.
7541 	 */
7542 	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
7543 	if (ret < 0)
7544 		goto err;
7545 
7546 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7547 	if (IS_ERR(em)) {
7548 		ret = PTR_ERR(em);
7549 		goto unlock_err;
7550 	}
7551 
7552 	/*
7553 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7554 	 * io.  INLINE is special, and we could probably kludge it in here, but
7555 	 * it's still buffered so for safety lets just fall back to the generic
7556 	 * buffered path.
7557 	 *
7558 	 * For COMPRESSED we _have_ to read the entire extent in so we can
7559 	 * decompress it, so there will be buffering required no matter what we
7560 	 * do, so go ahead and fallback to buffered.
7561 	 *
7562 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7563 	 * to buffered IO.  Don't blame me, this is the price we pay for using
7564 	 * the generic code.
7565 	 */
7566 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7567 	    em->block_start == EXTENT_MAP_INLINE) {
7568 		free_extent_map(em);
7569 		/*
7570 		 * If we are in a NOWAIT context, return -EAGAIN in order to
7571 		 * fallback to buffered IO. This is not only because we can
7572 		 * block with buffered IO (no support for NOWAIT semantics at
7573 		 * the moment) but also to avoid returning short reads to user
7574 		 * space - this happens if we were able to read some data from
7575 		 * previous non-compressed extents and then when we fallback to
7576 		 * buffered IO, at btrfs_file_read_iter() by calling
7577 		 * filemap_read(), we fail to fault in pages for the read buffer,
7578 		 * in which case filemap_read() returns a short read (the number
7579 		 * of bytes previously read is > 0, so it does not return -EFAULT).
7580 		 */
7581 		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
7582 		goto unlock_err;
7583 	}
7584 
7585 	len = min(len, em->len - (start - em->start));
7586 
7587 	/*
7588 	 * If we have a NOWAIT request and the range contains multiple extents
7589 	 * (or a mix of extents and holes), then we return -EAGAIN to make the
7590 	 * caller fallback to a context where it can do a blocking (without
7591 	 * NOWAIT) request. This way we avoid doing partial IO and returning
7592 	 * success to the caller, which is not optimal for writes and for reads
7593 	 * it can result in unexpected behaviour for an application.
7594 	 *
7595 	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7596 	 * iomap_dio_rw(), we can end up returning less data then what the caller
7597 	 * asked for, resulting in an unexpected, and incorrect, short read.
7598 	 * That is, the caller asked to read N bytes and we return less than that,
7599 	 * which is wrong unless we are crossing EOF. This happens if we get a
7600 	 * page fault error when trying to fault in pages for the buffer that is
7601 	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7602 	 * have previously submitted bios for other extents in the range, in
7603 	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7604 	 * those bios have completed by the time we get the page fault error,
7605 	 * which we return back to our caller - we should only return EIOCBQUEUED
7606 	 * after we have submitted bios for all the extents in the range.
7607 	 */
7608 	if ((flags & IOMAP_NOWAIT) && len < length) {
7609 		free_extent_map(em);
7610 		ret = -EAGAIN;
7611 		goto unlock_err;
7612 	}
7613 
7614 	if (write) {
7615 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7616 						    start, &len, flags);
7617 		if (ret < 0)
7618 			goto unlock_err;
7619 		unlock_extents = true;
7620 		/* Recalc len in case the new em is smaller than requested */
7621 		len = min(len, em->len - (start - em->start));
7622 		if (dio_data->data_space_reserved) {
7623 			u64 release_offset;
7624 			u64 release_len = 0;
7625 
7626 			if (dio_data->nocow_done) {
7627 				release_offset = start;
7628 				release_len = data_alloc_len;
7629 			} else if (len < data_alloc_len) {
7630 				release_offset = start + len;
7631 				release_len = data_alloc_len - len;
7632 			}
7633 
7634 			if (release_len > 0)
7635 				btrfs_free_reserved_data_space(BTRFS_I(inode),
7636 							       dio_data->data_reserved,
7637 							       release_offset,
7638 							       release_len);
7639 		}
7640 	} else {
7641 		/*
7642 		 * We need to unlock only the end area that we aren't using.
7643 		 * The rest is going to be unlocked by the endio routine.
7644 		 */
7645 		lockstart = start + len;
7646 		if (lockstart < lockend)
7647 			unlock_extents = true;
7648 	}
7649 
7650 	if (unlock_extents)
7651 		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7652 			      &cached_state);
7653 	else
7654 		free_extent_state(cached_state);
7655 
7656 	/*
7657 	 * Translate extent map information to iomap.
7658 	 * We trim the extents (and move the addr) even though iomap code does
7659 	 * that, since we have locked only the parts we are performing I/O in.
7660 	 */
7661 	if ((em->block_start == EXTENT_MAP_HOLE) ||
7662 	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7663 		iomap->addr = IOMAP_NULL_ADDR;
7664 		iomap->type = IOMAP_HOLE;
7665 	} else {
7666 		iomap->addr = em->block_start + (start - em->start);
7667 		iomap->type = IOMAP_MAPPED;
7668 	}
7669 	iomap->offset = start;
7670 	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7671 	iomap->length = len;
7672 	free_extent_map(em);
7673 
7674 	return 0;
7675 
7676 unlock_err:
7677 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7678 		      &cached_state);
7679 err:
7680 	if (dio_data->data_space_reserved) {
7681 		btrfs_free_reserved_data_space(BTRFS_I(inode),
7682 					       dio_data->data_reserved,
7683 					       start, data_alloc_len);
7684 		extent_changeset_free(dio_data->data_reserved);
7685 	}
7686 
7687 	return ret;
7688 }
7689 
7690 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7691 		ssize_t written, unsigned int flags, struct iomap *iomap)
7692 {
7693 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7694 	struct btrfs_dio_data *dio_data = iter->private;
7695 	size_t submitted = dio_data->submitted;
7696 	const bool write = !!(flags & IOMAP_WRITE);
7697 	int ret = 0;
7698 
7699 	if (!write && (iomap->type == IOMAP_HOLE)) {
7700 		/* If reading from a hole, unlock and return */
7701 		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
7702 			      NULL);
7703 		return 0;
7704 	}
7705 
7706 	if (submitted < length) {
7707 		pos += submitted;
7708 		length -= submitted;
7709 		if (write)
7710 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7711 						    pos, length, false);
7712 		else
7713 			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7714 				      pos + length - 1, NULL);
7715 		ret = -ENOTBLK;
7716 	}
7717 	if (write) {
7718 		btrfs_put_ordered_extent(dio_data->ordered);
7719 		dio_data->ordered = NULL;
7720 	}
7721 
7722 	if (write)
7723 		extent_changeset_free(dio_data->data_reserved);
7724 	return ret;
7725 }
7726 
7727 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
7728 {
7729 	struct btrfs_dio_private *dip =
7730 		container_of(bbio, struct btrfs_dio_private, bbio);
7731 	struct btrfs_inode *inode = bbio->inode;
7732 	struct bio *bio = &bbio->bio;
7733 
7734 	if (bio->bi_status) {
7735 		btrfs_warn(inode->root->fs_info,
7736 		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
7737 			   btrfs_ino(inode), bio->bi_opf,
7738 			   dip->file_offset, dip->bytes, bio->bi_status);
7739 	}
7740 
7741 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
7742 		btrfs_finish_ordered_extent(bbio->ordered, NULL,
7743 					    dip->file_offset, dip->bytes,
7744 					    !bio->bi_status);
7745 	} else {
7746 		unlock_extent(&inode->io_tree, dip->file_offset,
7747 			      dip->file_offset + dip->bytes - 1, NULL);
7748 	}
7749 
7750 	bbio->bio.bi_private = bbio->private;
7751 	iomap_dio_bio_end_io(bio);
7752 }
7753 
7754 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
7755 				loff_t file_offset)
7756 {
7757 	struct btrfs_bio *bbio = btrfs_bio(bio);
7758 	struct btrfs_dio_private *dip =
7759 		container_of(bbio, struct btrfs_dio_private, bbio);
7760 	struct btrfs_dio_data *dio_data = iter->private;
7761 
7762 	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
7763 		       btrfs_dio_end_io, bio->bi_private);
7764 	bbio->inode = BTRFS_I(iter->inode);
7765 	bbio->file_offset = file_offset;
7766 
7767 	dip->file_offset = file_offset;
7768 	dip->bytes = bio->bi_iter.bi_size;
7769 
7770 	dio_data->submitted += bio->bi_iter.bi_size;
7771 
7772 	/*
7773 	 * Check if we are doing a partial write.  If we are, we need to split
7774 	 * the ordered extent to match the submitted bio.  Hang on to the
7775 	 * remaining unfinishable ordered_extent in dio_data so that it can be
7776 	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
7777 	 * remaining pages is blocked on the outstanding ordered extent.
7778 	 */
7779 	if (iter->flags & IOMAP_WRITE) {
7780 		int ret;
7781 
7782 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
7783 		if (ret) {
7784 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7785 						    file_offset, dip->bytes,
7786 						    !ret);
7787 			bio->bi_status = errno_to_blk_status(ret);
7788 			iomap_dio_bio_end_io(bio);
7789 			return;
7790 		}
7791 	}
7792 
7793 	btrfs_submit_bio(bbio, 0);
7794 }
7795 
7796 static const struct iomap_ops btrfs_dio_iomap_ops = {
7797 	.iomap_begin            = btrfs_dio_iomap_begin,
7798 	.iomap_end              = btrfs_dio_iomap_end,
7799 };
7800 
7801 static const struct iomap_dio_ops btrfs_dio_ops = {
7802 	.submit_io		= btrfs_dio_submit_io,
7803 	.bio_set		= &btrfs_dio_bioset,
7804 };
7805 
7806 ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
7807 {
7808 	struct btrfs_dio_data data = { 0 };
7809 
7810 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7811 			    IOMAP_DIO_PARTIAL, &data, done_before);
7812 }
7813 
7814 struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
7815 				  size_t done_before)
7816 {
7817 	struct btrfs_dio_data data = { 0 };
7818 
7819 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7820 			    IOMAP_DIO_PARTIAL, &data, done_before);
7821 }
7822 
7823 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7824 			u64 start, u64 len)
7825 {
7826 	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
7827 	int	ret;
7828 
7829 	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
7830 	if (ret)
7831 		return ret;
7832 
7833 	/*
7834 	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
7835 	 * file range (0 to LLONG_MAX), but that is not enough if we have
7836 	 * compression enabled. The first filemap_fdatawrite_range() only kicks
7837 	 * in the compression of data (in an async thread) and will return
7838 	 * before the compression is done and writeback is started. A second
7839 	 * filemap_fdatawrite_range() is needed to wait for the compression to
7840 	 * complete and writeback to start. We also need to wait for ordered
7841 	 * extents to complete, because our fiemap implementation uses mainly
7842 	 * file extent items to list the extents, searching for extent maps
7843 	 * only for file ranges with holes or prealloc extents to figure out
7844 	 * if we have delalloc in those ranges.
7845 	 */
7846 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7847 		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7848 		if (ret)
7849 			return ret;
7850 	}
7851 
7852 	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
7853 
7854 	/*
7855 	 * We did an initial flush to avoid holding the inode's lock while
7856 	 * triggering writeback and waiting for the completion of IO and ordered
7857 	 * extents. Now after we locked the inode we do it again, because it's
7858 	 * possible a new write may have happened in between those two steps.
7859 	 */
7860 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7861 		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7862 		if (ret) {
7863 			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7864 			return ret;
7865 		}
7866 	}
7867 
7868 	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
7869 	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7870 
7871 	return ret;
7872 }
7873 
7874 static int btrfs_writepages(struct address_space *mapping,
7875 			    struct writeback_control *wbc)
7876 {
7877 	return extent_writepages(mapping, wbc);
7878 }
7879 
7880 static void btrfs_readahead(struct readahead_control *rac)
7881 {
7882 	extent_readahead(rac);
7883 }
7884 
7885 /*
7886  * For release_folio() and invalidate_folio() we have a race window where
7887  * folio_end_writeback() is called but the subpage spinlock is not yet released.
7888  * If we continue to release/invalidate the page, we could cause use-after-free
7889  * for subpage spinlock.  So this function is to spin and wait for subpage
7890  * spinlock.
7891  */
7892 static void wait_subpage_spinlock(struct page *page)
7893 {
7894 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7895 	struct btrfs_subpage *subpage;
7896 
7897 	if (!btrfs_is_subpage(fs_info, page))
7898 		return;
7899 
7900 	ASSERT(PagePrivate(page) && page->private);
7901 	subpage = (struct btrfs_subpage *)page->private;
7902 
7903 	/*
7904 	 * This may look insane as we just acquire the spinlock and release it,
7905 	 * without doing anything.  But we just want to make sure no one is
7906 	 * still holding the subpage spinlock.
7907 	 * And since the page is not dirty nor writeback, and we have page
7908 	 * locked, the only possible way to hold a spinlock is from the endio
7909 	 * function to clear page writeback.
7910 	 *
7911 	 * Here we just acquire the spinlock so that all existing callers
7912 	 * should exit and we're safe to release/invalidate the page.
7913 	 */
7914 	spin_lock_irq(&subpage->lock);
7915 	spin_unlock_irq(&subpage->lock);
7916 }
7917 
7918 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7919 {
7920 	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
7921 
7922 	if (ret == 1) {
7923 		wait_subpage_spinlock(&folio->page);
7924 		clear_page_extent_mapped(&folio->page);
7925 	}
7926 	return ret;
7927 }
7928 
7929 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7930 {
7931 	if (folio_test_writeback(folio) || folio_test_dirty(folio))
7932 		return false;
7933 	return __btrfs_release_folio(folio, gfp_flags);
7934 }
7935 
7936 #ifdef CONFIG_MIGRATION
7937 static int btrfs_migrate_folio(struct address_space *mapping,
7938 			     struct folio *dst, struct folio *src,
7939 			     enum migrate_mode mode)
7940 {
7941 	int ret = filemap_migrate_folio(mapping, dst, src, mode);
7942 
7943 	if (ret != MIGRATEPAGE_SUCCESS)
7944 		return ret;
7945 
7946 	if (folio_test_ordered(src)) {
7947 		folio_clear_ordered(src);
7948 		folio_set_ordered(dst);
7949 	}
7950 
7951 	return MIGRATEPAGE_SUCCESS;
7952 }
7953 #else
7954 #define btrfs_migrate_folio NULL
7955 #endif
7956 
7957 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7958 				 size_t length)
7959 {
7960 	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
7961 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
7962 	struct extent_io_tree *tree = &inode->io_tree;
7963 	struct extent_state *cached_state = NULL;
7964 	u64 page_start = folio_pos(folio);
7965 	u64 page_end = page_start + folio_size(folio) - 1;
7966 	u64 cur;
7967 	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7968 
7969 	/*
7970 	 * We have folio locked so no new ordered extent can be created on this
7971 	 * page, nor bio can be submitted for this folio.
7972 	 *
7973 	 * But already submitted bio can still be finished on this folio.
7974 	 * Furthermore, endio function won't skip folio which has Ordered
7975 	 * (Private2) already cleared, so it's possible for endio and
7976 	 * invalidate_folio to do the same ordered extent accounting twice
7977 	 * on one folio.
7978 	 *
7979 	 * So here we wait for any submitted bios to finish, so that we won't
7980 	 * do double ordered extent accounting on the same folio.
7981 	 */
7982 	folio_wait_writeback(folio);
7983 	wait_subpage_spinlock(&folio->page);
7984 
7985 	/*
7986 	 * For subpage case, we have call sites like
7987 	 * btrfs_punch_hole_lock_range() which passes range not aligned to
7988 	 * sectorsize.
7989 	 * If the range doesn't cover the full folio, we don't need to and
7990 	 * shouldn't clear page extent mapped, as folio->private can still
7991 	 * record subpage dirty bits for other part of the range.
7992 	 *
7993 	 * For cases that invalidate the full folio even the range doesn't
7994 	 * cover the full folio, like invalidating the last folio, we're
7995 	 * still safe to wait for ordered extent to finish.
7996 	 */
7997 	if (!(offset == 0 && length == folio_size(folio))) {
7998 		btrfs_release_folio(folio, GFP_NOFS);
7999 		return;
8000 	}
8001 
8002 	if (!inode_evicting)
8003 		lock_extent(tree, page_start, page_end, &cached_state);
8004 
8005 	cur = page_start;
8006 	while (cur < page_end) {
8007 		struct btrfs_ordered_extent *ordered;
8008 		u64 range_end;
8009 		u32 range_len;
8010 		u32 extra_flags = 0;
8011 
8012 		ordered = btrfs_lookup_first_ordered_range(inode, cur,
8013 							   page_end + 1 - cur);
8014 		if (!ordered) {
8015 			range_end = page_end;
8016 			/*
8017 			 * No ordered extent covering this range, we are safe
8018 			 * to delete all extent states in the range.
8019 			 */
8020 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8021 			goto next;
8022 		}
8023 		if (ordered->file_offset > cur) {
8024 			/*
8025 			 * There is a range between [cur, oe->file_offset) not
8026 			 * covered by any ordered extent.
8027 			 * We are safe to delete all extent states, and handle
8028 			 * the ordered extent in the next iteration.
8029 			 */
8030 			range_end = ordered->file_offset - 1;
8031 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8032 			goto next;
8033 		}
8034 
8035 		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8036 				page_end);
8037 		ASSERT(range_end + 1 - cur < U32_MAX);
8038 		range_len = range_end + 1 - cur;
8039 		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8040 			/*
8041 			 * If Ordered (Private2) is cleared, it means endio has
8042 			 * already been executed for the range.
8043 			 * We can't delete the extent states as
8044 			 * btrfs_finish_ordered_io() may still use some of them.
8045 			 */
8046 			goto next;
8047 		}
8048 		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8049 
8050 		/*
8051 		 * IO on this page will never be started, so we need to account
8052 		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8053 		 * here, must leave that up for the ordered extent completion.
8054 		 *
8055 		 * This will also unlock the range for incoming
8056 		 * btrfs_finish_ordered_io().
8057 		 */
8058 		if (!inode_evicting)
8059 			clear_extent_bit(tree, cur, range_end,
8060 					 EXTENT_DELALLOC |
8061 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8062 					 EXTENT_DEFRAG, &cached_state);
8063 
8064 		spin_lock_irq(&inode->ordered_tree.lock);
8065 		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8066 		ordered->truncated_len = min(ordered->truncated_len,
8067 					     cur - ordered->file_offset);
8068 		spin_unlock_irq(&inode->ordered_tree.lock);
8069 
8070 		/*
8071 		 * If the ordered extent has finished, we're safe to delete all
8072 		 * the extent states of the range, otherwise
8073 		 * btrfs_finish_ordered_io() will get executed by endio for
8074 		 * other pages, so we can't delete extent states.
8075 		 */
8076 		if (btrfs_dec_test_ordered_pending(inode, &ordered,
8077 						   cur, range_end + 1 - cur)) {
8078 			btrfs_finish_ordered_io(ordered);
8079 			/*
8080 			 * The ordered extent has finished, now we're again
8081 			 * safe to delete all extent states of the range.
8082 			 */
8083 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8084 		}
8085 next:
8086 		if (ordered)
8087 			btrfs_put_ordered_extent(ordered);
8088 		/*
8089 		 * Qgroup reserved space handler
8090 		 * Sector(s) here will be either:
8091 		 *
8092 		 * 1) Already written to disk or bio already finished
8093 		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8094 		 *    Qgroup will be handled by its qgroup_record then.
8095 		 *    btrfs_qgroup_free_data() call will do nothing here.
8096 		 *
8097 		 * 2) Not written to disk yet
8098 		 *    Then btrfs_qgroup_free_data() call will clear the
8099 		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8100 		 *    reserved data space.
8101 		 *    Since the IO will never happen for this page.
8102 		 */
8103 		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
8104 		if (!inode_evicting) {
8105 			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8106 				 EXTENT_DELALLOC | EXTENT_UPTODATE |
8107 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
8108 				 extra_flags, &cached_state);
8109 		}
8110 		cur = range_end + 1;
8111 	}
8112 	/*
8113 	 * We have iterated through all ordered extents of the page, the page
8114 	 * should not have Ordered (Private2) anymore, or the above iteration
8115 	 * did something wrong.
8116 	 */
8117 	ASSERT(!folio_test_ordered(folio));
8118 	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8119 	if (!inode_evicting)
8120 		__btrfs_release_folio(folio, GFP_NOFS);
8121 	clear_page_extent_mapped(&folio->page);
8122 }
8123 
8124 /*
8125  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8126  * called from a page fault handler when a page is first dirtied. Hence we must
8127  * be careful to check for EOF conditions here. We set the page up correctly
8128  * for a written page which means we get ENOSPC checking when writing into
8129  * holes and correct delalloc and unwritten extent mapping on filesystems that
8130  * support these features.
8131  *
8132  * We are not allowed to take the i_mutex here so we have to play games to
8133  * protect against truncate races as the page could now be beyond EOF.  Because
8134  * truncate_setsize() writes the inode size before removing pages, once we have
8135  * the page lock we can determine safely if the page is beyond EOF. If it is not
8136  * beyond EOF, then the page is guaranteed safe against truncation until we
8137  * unlock the page.
8138  */
8139 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8140 {
8141 	struct page *page = vmf->page;
8142 	struct inode *inode = file_inode(vmf->vma->vm_file);
8143 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8144 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8145 	struct btrfs_ordered_extent *ordered;
8146 	struct extent_state *cached_state = NULL;
8147 	struct extent_changeset *data_reserved = NULL;
8148 	unsigned long zero_start;
8149 	loff_t size;
8150 	vm_fault_t ret;
8151 	int ret2;
8152 	int reserved = 0;
8153 	u64 reserved_space;
8154 	u64 page_start;
8155 	u64 page_end;
8156 	u64 end;
8157 
8158 	reserved_space = PAGE_SIZE;
8159 
8160 	sb_start_pagefault(inode->i_sb);
8161 	page_start = page_offset(page);
8162 	page_end = page_start + PAGE_SIZE - 1;
8163 	end = page_end;
8164 
8165 	/*
8166 	 * Reserving delalloc space after obtaining the page lock can lead to
8167 	 * deadlock. For example, if a dirty page is locked by this function
8168 	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8169 	 * dirty page write out, then the btrfs_writepages() function could
8170 	 * end up waiting indefinitely to get a lock on the page currently
8171 	 * being processed by btrfs_page_mkwrite() function.
8172 	 */
8173 	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8174 					    page_start, reserved_space);
8175 	if (!ret2) {
8176 		ret2 = file_update_time(vmf->vma->vm_file);
8177 		reserved = 1;
8178 	}
8179 	if (ret2) {
8180 		ret = vmf_error(ret2);
8181 		if (reserved)
8182 			goto out;
8183 		goto out_noreserve;
8184 	}
8185 
8186 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8187 again:
8188 	down_read(&BTRFS_I(inode)->i_mmap_lock);
8189 	lock_page(page);
8190 	size = i_size_read(inode);
8191 
8192 	if ((page->mapping != inode->i_mapping) ||
8193 	    (page_start >= size)) {
8194 		/* page got truncated out from underneath us */
8195 		goto out_unlock;
8196 	}
8197 	wait_on_page_writeback(page);
8198 
8199 	lock_extent(io_tree, page_start, page_end, &cached_state);
8200 	ret2 = set_page_extent_mapped(page);
8201 	if (ret2 < 0) {
8202 		ret = vmf_error(ret2);
8203 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8204 		goto out_unlock;
8205 	}
8206 
8207 	/*
8208 	 * we can't set the delalloc bits if there are pending ordered
8209 	 * extents.  Drop our locks and wait for them to finish
8210 	 */
8211 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8212 			PAGE_SIZE);
8213 	if (ordered) {
8214 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8215 		unlock_page(page);
8216 		up_read(&BTRFS_I(inode)->i_mmap_lock);
8217 		btrfs_start_ordered_extent(ordered);
8218 		btrfs_put_ordered_extent(ordered);
8219 		goto again;
8220 	}
8221 
8222 	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8223 		reserved_space = round_up(size - page_start,
8224 					  fs_info->sectorsize);
8225 		if (reserved_space < PAGE_SIZE) {
8226 			end = page_start + reserved_space - 1;
8227 			btrfs_delalloc_release_space(BTRFS_I(inode),
8228 					data_reserved, page_start,
8229 					PAGE_SIZE - reserved_space, true);
8230 		}
8231 	}
8232 
8233 	/*
8234 	 * page_mkwrite gets called when the page is firstly dirtied after it's
8235 	 * faulted in, but write(2) could also dirty a page and set delalloc
8236 	 * bits, thus in this case for space account reason, we still need to
8237 	 * clear any delalloc bits within this page range since we have to
8238 	 * reserve data&meta space before lock_page() (see above comments).
8239 	 */
8240 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8241 			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8242 			  EXTENT_DEFRAG, &cached_state);
8243 
8244 	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8245 					&cached_state);
8246 	if (ret2) {
8247 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8248 		ret = VM_FAULT_SIGBUS;
8249 		goto out_unlock;
8250 	}
8251 
8252 	/* page is wholly or partially inside EOF */
8253 	if (page_start + PAGE_SIZE > size)
8254 		zero_start = offset_in_page(size);
8255 	else
8256 		zero_start = PAGE_SIZE;
8257 
8258 	if (zero_start != PAGE_SIZE)
8259 		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8260 
8261 	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8262 	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8263 	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8264 
8265 	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8266 
8267 	unlock_extent(io_tree, page_start, page_end, &cached_state);
8268 	up_read(&BTRFS_I(inode)->i_mmap_lock);
8269 
8270 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8271 	sb_end_pagefault(inode->i_sb);
8272 	extent_changeset_free(data_reserved);
8273 	return VM_FAULT_LOCKED;
8274 
8275 out_unlock:
8276 	unlock_page(page);
8277 	up_read(&BTRFS_I(inode)->i_mmap_lock);
8278 out:
8279 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8280 	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8281 				     reserved_space, (ret != 0));
8282 out_noreserve:
8283 	sb_end_pagefault(inode->i_sb);
8284 	extent_changeset_free(data_reserved);
8285 	return ret;
8286 }
8287 
8288 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8289 {
8290 	struct btrfs_truncate_control control = {
8291 		.inode = inode,
8292 		.ino = btrfs_ino(inode),
8293 		.min_type = BTRFS_EXTENT_DATA_KEY,
8294 		.clear_extent_range = true,
8295 	};
8296 	struct btrfs_root *root = inode->root;
8297 	struct btrfs_fs_info *fs_info = root->fs_info;
8298 	struct btrfs_block_rsv *rsv;
8299 	int ret;
8300 	struct btrfs_trans_handle *trans;
8301 	u64 mask = fs_info->sectorsize - 1;
8302 	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8303 
8304 	if (!skip_writeback) {
8305 		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
8306 					       inode->vfs_inode.i_size & (~mask),
8307 					       (u64)-1);
8308 		if (ret)
8309 			return ret;
8310 	}
8311 
8312 	/*
8313 	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8314 	 * things going on here:
8315 	 *
8316 	 * 1) We need to reserve space to update our inode.
8317 	 *
8318 	 * 2) We need to have something to cache all the space that is going to
8319 	 * be free'd up by the truncate operation, but also have some slack
8320 	 * space reserved in case it uses space during the truncate (thank you
8321 	 * very much snapshotting).
8322 	 *
8323 	 * And we need these to be separate.  The fact is we can use a lot of
8324 	 * space doing the truncate, and we have no earthly idea how much space
8325 	 * we will use, so we need the truncate reservation to be separate so it
8326 	 * doesn't end up using space reserved for updating the inode.  We also
8327 	 * need to be able to stop the transaction and start a new one, which
8328 	 * means we need to be able to update the inode several times, and we
8329 	 * have no idea of knowing how many times that will be, so we can't just
8330 	 * reserve 1 item for the entirety of the operation, so that has to be
8331 	 * done separately as well.
8332 	 *
8333 	 * So that leaves us with
8334 	 *
8335 	 * 1) rsv - for the truncate reservation, which we will steal from the
8336 	 * transaction reservation.
8337 	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8338 	 * updating the inode.
8339 	 */
8340 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8341 	if (!rsv)
8342 		return -ENOMEM;
8343 	rsv->size = min_size;
8344 	rsv->failfast = true;
8345 
8346 	/*
8347 	 * 1 for the truncate slack space
8348 	 * 1 for updating the inode.
8349 	 */
8350 	trans = btrfs_start_transaction(root, 2);
8351 	if (IS_ERR(trans)) {
8352 		ret = PTR_ERR(trans);
8353 		goto out;
8354 	}
8355 
8356 	/* Migrate the slack space for the truncate to our reserve */
8357 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8358 				      min_size, false);
8359 	/*
8360 	 * We have reserved 2 metadata units when we started the transaction and
8361 	 * min_size matches 1 unit, so this should never fail, but if it does,
8362 	 * it's not critical we just fail truncation.
8363 	 */
8364 	if (WARN_ON(ret)) {
8365 		btrfs_end_transaction(trans);
8366 		goto out;
8367 	}
8368 
8369 	trans->block_rsv = rsv;
8370 
8371 	while (1) {
8372 		struct extent_state *cached_state = NULL;
8373 		const u64 new_size = inode->vfs_inode.i_size;
8374 		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8375 
8376 		control.new_size = new_size;
8377 		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8378 		/*
8379 		 * We want to drop from the next block forward in case this new
8380 		 * size is not block aligned since we will be keeping the last
8381 		 * block of the extent just the way it is.
8382 		 */
8383 		btrfs_drop_extent_map_range(inode,
8384 					    ALIGN(new_size, fs_info->sectorsize),
8385 					    (u64)-1, false);
8386 
8387 		ret = btrfs_truncate_inode_items(trans, root, &control);
8388 
8389 		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
8390 		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
8391 
8392 		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8393 
8394 		trans->block_rsv = &fs_info->trans_block_rsv;
8395 		if (ret != -ENOSPC && ret != -EAGAIN)
8396 			break;
8397 
8398 		ret = btrfs_update_inode(trans, root, inode);
8399 		if (ret)
8400 			break;
8401 
8402 		btrfs_end_transaction(trans);
8403 		btrfs_btree_balance_dirty(fs_info);
8404 
8405 		trans = btrfs_start_transaction(root, 2);
8406 		if (IS_ERR(trans)) {
8407 			ret = PTR_ERR(trans);
8408 			trans = NULL;
8409 			break;
8410 		}
8411 
8412 		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8413 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8414 					      rsv, min_size, false);
8415 		/*
8416 		 * We have reserved 2 metadata units when we started the
8417 		 * transaction and min_size matches 1 unit, so this should never
8418 		 * fail, but if it does, it's not critical we just fail truncation.
8419 		 */
8420 		if (WARN_ON(ret))
8421 			break;
8422 
8423 		trans->block_rsv = rsv;
8424 	}
8425 
8426 	/*
8427 	 * We can't call btrfs_truncate_block inside a trans handle as we could
8428 	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
8429 	 * know we've truncated everything except the last little bit, and can
8430 	 * do btrfs_truncate_block and then update the disk_i_size.
8431 	 */
8432 	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8433 		btrfs_end_transaction(trans);
8434 		btrfs_btree_balance_dirty(fs_info);
8435 
8436 		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
8437 		if (ret)
8438 			goto out;
8439 		trans = btrfs_start_transaction(root, 1);
8440 		if (IS_ERR(trans)) {
8441 			ret = PTR_ERR(trans);
8442 			goto out;
8443 		}
8444 		btrfs_inode_safe_disk_i_size_write(inode, 0);
8445 	}
8446 
8447 	if (trans) {
8448 		int ret2;
8449 
8450 		trans->block_rsv = &fs_info->trans_block_rsv;
8451 		ret2 = btrfs_update_inode(trans, root, inode);
8452 		if (ret2 && !ret)
8453 			ret = ret2;
8454 
8455 		ret2 = btrfs_end_transaction(trans);
8456 		if (ret2 && !ret)
8457 			ret = ret2;
8458 		btrfs_btree_balance_dirty(fs_info);
8459 	}
8460 out:
8461 	btrfs_free_block_rsv(fs_info, rsv);
8462 	/*
8463 	 * So if we truncate and then write and fsync we normally would just
8464 	 * write the extents that changed, which is a problem if we need to
8465 	 * first truncate that entire inode.  So set this flag so we write out
8466 	 * all of the extents in the inode to the sync log so we're completely
8467 	 * safe.
8468 	 *
8469 	 * If no extents were dropped or trimmed we don't need to force the next
8470 	 * fsync to truncate all the inode's items from the log and re-log them
8471 	 * all. This means the truncate operation did not change the file size,
8472 	 * or changed it to a smaller size but there was only an implicit hole
8473 	 * between the old i_size and the new i_size, and there were no prealloc
8474 	 * extents beyond i_size to drop.
8475 	 */
8476 	if (control.extents_found > 0)
8477 		btrfs_set_inode_full_sync(inode);
8478 
8479 	return ret;
8480 }
8481 
8482 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
8483 				     struct inode *dir)
8484 {
8485 	struct inode *inode;
8486 
8487 	inode = new_inode(dir->i_sb);
8488 	if (inode) {
8489 		/*
8490 		 * Subvolumes don't inherit the sgid bit or the parent's gid if
8491 		 * the parent's sgid bit is set. This is probably a bug.
8492 		 */
8493 		inode_init_owner(idmap, inode, NULL,
8494 				 S_IFDIR | (~current_umask() & S_IRWXUGO));
8495 		inode->i_op = &btrfs_dir_inode_operations;
8496 		inode->i_fop = &btrfs_dir_file_operations;
8497 	}
8498 	return inode;
8499 }
8500 
8501 struct inode *btrfs_alloc_inode(struct super_block *sb)
8502 {
8503 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8504 	struct btrfs_inode *ei;
8505 	struct inode *inode;
8506 
8507 	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8508 	if (!ei)
8509 		return NULL;
8510 
8511 	ei->root = NULL;
8512 	ei->generation = 0;
8513 	ei->last_trans = 0;
8514 	ei->last_sub_trans = 0;
8515 	ei->logged_trans = 0;
8516 	ei->delalloc_bytes = 0;
8517 	ei->new_delalloc_bytes = 0;
8518 	ei->defrag_bytes = 0;
8519 	ei->disk_i_size = 0;
8520 	ei->flags = 0;
8521 	ei->ro_flags = 0;
8522 	ei->csum_bytes = 0;
8523 	ei->index_cnt = (u64)-1;
8524 	ei->dir_index = 0;
8525 	ei->last_unlink_trans = 0;
8526 	ei->last_reflink_trans = 0;
8527 	ei->last_log_commit = 0;
8528 
8529 	spin_lock_init(&ei->lock);
8530 	ei->outstanding_extents = 0;
8531 	if (sb->s_magic != BTRFS_TEST_MAGIC)
8532 		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8533 					      BTRFS_BLOCK_RSV_DELALLOC);
8534 	ei->runtime_flags = 0;
8535 	ei->prop_compress = BTRFS_COMPRESS_NONE;
8536 	ei->defrag_compress = BTRFS_COMPRESS_NONE;
8537 
8538 	ei->delayed_node = NULL;
8539 
8540 	ei->i_otime.tv_sec = 0;
8541 	ei->i_otime.tv_nsec = 0;
8542 
8543 	inode = &ei->vfs_inode;
8544 	extent_map_tree_init(&ei->extent_tree);
8545 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
8546 	ei->io_tree.inode = ei;
8547 	extent_io_tree_init(fs_info, &ei->file_extent_tree,
8548 			    IO_TREE_INODE_FILE_EXTENT);
8549 	mutex_init(&ei->log_mutex);
8550 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8551 	INIT_LIST_HEAD(&ei->delalloc_inodes);
8552 	INIT_LIST_HEAD(&ei->delayed_iput);
8553 	RB_CLEAR_NODE(&ei->rb_node);
8554 	init_rwsem(&ei->i_mmap_lock);
8555 
8556 	return inode;
8557 }
8558 
8559 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8560 void btrfs_test_destroy_inode(struct inode *inode)
8561 {
8562 	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
8563 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8564 }
8565 #endif
8566 
8567 void btrfs_free_inode(struct inode *inode)
8568 {
8569 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8570 }
8571 
8572 void btrfs_destroy_inode(struct inode *vfs_inode)
8573 {
8574 	struct btrfs_ordered_extent *ordered;
8575 	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8576 	struct btrfs_root *root = inode->root;
8577 	bool freespace_inode;
8578 
8579 	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8580 	WARN_ON(vfs_inode->i_data.nrpages);
8581 	WARN_ON(inode->block_rsv.reserved);
8582 	WARN_ON(inode->block_rsv.size);
8583 	WARN_ON(inode->outstanding_extents);
8584 	if (!S_ISDIR(vfs_inode->i_mode)) {
8585 		WARN_ON(inode->delalloc_bytes);
8586 		WARN_ON(inode->new_delalloc_bytes);
8587 	}
8588 	WARN_ON(inode->csum_bytes);
8589 	WARN_ON(inode->defrag_bytes);
8590 
8591 	/*
8592 	 * This can happen where we create an inode, but somebody else also
8593 	 * created the same inode and we need to destroy the one we already
8594 	 * created.
8595 	 */
8596 	if (!root)
8597 		return;
8598 
8599 	/*
8600 	 * If this is a free space inode do not take the ordered extents lockdep
8601 	 * map.
8602 	 */
8603 	freespace_inode = btrfs_is_free_space_inode(inode);
8604 
8605 	while (1) {
8606 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8607 		if (!ordered)
8608 			break;
8609 		else {
8610 			btrfs_err(root->fs_info,
8611 				  "found ordered extent %llu %llu on inode cleanup",
8612 				  ordered->file_offset, ordered->num_bytes);
8613 
8614 			if (!freespace_inode)
8615 				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
8616 
8617 			btrfs_remove_ordered_extent(inode, ordered);
8618 			btrfs_put_ordered_extent(ordered);
8619 			btrfs_put_ordered_extent(ordered);
8620 		}
8621 	}
8622 	btrfs_qgroup_check_reserved_leak(inode);
8623 	inode_tree_del(inode);
8624 	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
8625 	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8626 	btrfs_put_root(inode->root);
8627 }
8628 
8629 int btrfs_drop_inode(struct inode *inode)
8630 {
8631 	struct btrfs_root *root = BTRFS_I(inode)->root;
8632 
8633 	if (root == NULL)
8634 		return 1;
8635 
8636 	/* the snap/subvol tree is on deleting */
8637 	if (btrfs_root_refs(&root->root_item) == 0)
8638 		return 1;
8639 	else
8640 		return generic_drop_inode(inode);
8641 }
8642 
8643 static void init_once(void *foo)
8644 {
8645 	struct btrfs_inode *ei = foo;
8646 
8647 	inode_init_once(&ei->vfs_inode);
8648 }
8649 
8650 void __cold btrfs_destroy_cachep(void)
8651 {
8652 	/*
8653 	 * Make sure all delayed rcu free inodes are flushed before we
8654 	 * destroy cache.
8655 	 */
8656 	rcu_barrier();
8657 	bioset_exit(&btrfs_dio_bioset);
8658 	kmem_cache_destroy(btrfs_inode_cachep);
8659 }
8660 
8661 int __init btrfs_init_cachep(void)
8662 {
8663 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8664 			sizeof(struct btrfs_inode), 0,
8665 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8666 			init_once);
8667 	if (!btrfs_inode_cachep)
8668 		goto fail;
8669 
8670 	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
8671 			offsetof(struct btrfs_dio_private, bbio.bio),
8672 			BIOSET_NEED_BVECS))
8673 		goto fail;
8674 
8675 	return 0;
8676 fail:
8677 	btrfs_destroy_cachep();
8678 	return -ENOMEM;
8679 }
8680 
8681 static int btrfs_getattr(struct mnt_idmap *idmap,
8682 			 const struct path *path, struct kstat *stat,
8683 			 u32 request_mask, unsigned int flags)
8684 {
8685 	u64 delalloc_bytes;
8686 	u64 inode_bytes;
8687 	struct inode *inode = d_inode(path->dentry);
8688 	u32 blocksize = inode->i_sb->s_blocksize;
8689 	u32 bi_flags = BTRFS_I(inode)->flags;
8690 	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8691 
8692 	stat->result_mask |= STATX_BTIME;
8693 	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
8694 	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
8695 	if (bi_flags & BTRFS_INODE_APPEND)
8696 		stat->attributes |= STATX_ATTR_APPEND;
8697 	if (bi_flags & BTRFS_INODE_COMPRESS)
8698 		stat->attributes |= STATX_ATTR_COMPRESSED;
8699 	if (bi_flags & BTRFS_INODE_IMMUTABLE)
8700 		stat->attributes |= STATX_ATTR_IMMUTABLE;
8701 	if (bi_flags & BTRFS_INODE_NODUMP)
8702 		stat->attributes |= STATX_ATTR_NODUMP;
8703 	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8704 		stat->attributes |= STATX_ATTR_VERITY;
8705 
8706 	stat->attributes_mask |= (STATX_ATTR_APPEND |
8707 				  STATX_ATTR_COMPRESSED |
8708 				  STATX_ATTR_IMMUTABLE |
8709 				  STATX_ATTR_NODUMP);
8710 
8711 	generic_fillattr(idmap, request_mask, inode, stat);
8712 	stat->dev = BTRFS_I(inode)->root->anon_dev;
8713 
8714 	spin_lock(&BTRFS_I(inode)->lock);
8715 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8716 	inode_bytes = inode_get_bytes(inode);
8717 	spin_unlock(&BTRFS_I(inode)->lock);
8718 	stat->blocks = (ALIGN(inode_bytes, blocksize) +
8719 			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8720 	return 0;
8721 }
8722 
8723 static int btrfs_rename_exchange(struct inode *old_dir,
8724 			      struct dentry *old_dentry,
8725 			      struct inode *new_dir,
8726 			      struct dentry *new_dentry)
8727 {
8728 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8729 	struct btrfs_trans_handle *trans;
8730 	unsigned int trans_num_items;
8731 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8732 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8733 	struct inode *new_inode = new_dentry->d_inode;
8734 	struct inode *old_inode = old_dentry->d_inode;
8735 	struct btrfs_rename_ctx old_rename_ctx;
8736 	struct btrfs_rename_ctx new_rename_ctx;
8737 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8738 	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8739 	u64 old_idx = 0;
8740 	u64 new_idx = 0;
8741 	int ret;
8742 	int ret2;
8743 	bool need_abort = false;
8744 	struct fscrypt_name old_fname, new_fname;
8745 	struct fscrypt_str *old_name, *new_name;
8746 
8747 	/*
8748 	 * For non-subvolumes allow exchange only within one subvolume, in the
8749 	 * same inode namespace. Two subvolumes (represented as directory) can
8750 	 * be exchanged as they're a logical link and have a fixed inode number.
8751 	 */
8752 	if (root != dest &&
8753 	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8754 	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
8755 		return -EXDEV;
8756 
8757 	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8758 	if (ret)
8759 		return ret;
8760 
8761 	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8762 	if (ret) {
8763 		fscrypt_free_filename(&old_fname);
8764 		return ret;
8765 	}
8766 
8767 	old_name = &old_fname.disk_name;
8768 	new_name = &new_fname.disk_name;
8769 
8770 	/* close the race window with snapshot create/destroy ioctl */
8771 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8772 	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
8773 		down_read(&fs_info->subvol_sem);
8774 
8775 	/*
8776 	 * For each inode:
8777 	 * 1 to remove old dir item
8778 	 * 1 to remove old dir index
8779 	 * 1 to add new dir item
8780 	 * 1 to add new dir index
8781 	 * 1 to update parent inode
8782 	 *
8783 	 * If the parents are the same, we only need to account for one
8784 	 */
8785 	trans_num_items = (old_dir == new_dir ? 9 : 10);
8786 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8787 		/*
8788 		 * 1 to remove old root ref
8789 		 * 1 to remove old root backref
8790 		 * 1 to add new root ref
8791 		 * 1 to add new root backref
8792 		 */
8793 		trans_num_items += 4;
8794 	} else {
8795 		/*
8796 		 * 1 to update inode item
8797 		 * 1 to remove old inode ref
8798 		 * 1 to add new inode ref
8799 		 */
8800 		trans_num_items += 3;
8801 	}
8802 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8803 		trans_num_items += 4;
8804 	else
8805 		trans_num_items += 3;
8806 	trans = btrfs_start_transaction(root, trans_num_items);
8807 	if (IS_ERR(trans)) {
8808 		ret = PTR_ERR(trans);
8809 		goto out_notrans;
8810 	}
8811 
8812 	if (dest != root) {
8813 		ret = btrfs_record_root_in_trans(trans, dest);
8814 		if (ret)
8815 			goto out_fail;
8816 	}
8817 
8818 	/*
8819 	 * We need to find a free sequence number both in the source and
8820 	 * in the destination directory for the exchange.
8821 	 */
8822 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8823 	if (ret)
8824 		goto out_fail;
8825 	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8826 	if (ret)
8827 		goto out_fail;
8828 
8829 	BTRFS_I(old_inode)->dir_index = 0ULL;
8830 	BTRFS_I(new_inode)->dir_index = 0ULL;
8831 
8832 	/* Reference for the source. */
8833 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8834 		/* force full log commit if subvolume involved. */
8835 		btrfs_set_log_full_commit(trans);
8836 	} else {
8837 		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8838 					     btrfs_ino(BTRFS_I(new_dir)),
8839 					     old_idx);
8840 		if (ret)
8841 			goto out_fail;
8842 		need_abort = true;
8843 	}
8844 
8845 	/* And now for the dest. */
8846 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8847 		/* force full log commit if subvolume involved. */
8848 		btrfs_set_log_full_commit(trans);
8849 	} else {
8850 		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8851 					     btrfs_ino(BTRFS_I(old_dir)),
8852 					     new_idx);
8853 		if (ret) {
8854 			if (need_abort)
8855 				btrfs_abort_transaction(trans, ret);
8856 			goto out_fail;
8857 		}
8858 	}
8859 
8860 	/* Update inode version and ctime/mtime. */
8861 	inode_inc_iversion(old_dir);
8862 	inode_inc_iversion(new_dir);
8863 	inode_inc_iversion(old_inode);
8864 	inode_inc_iversion(new_inode);
8865 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8866 
8867 	if (old_dentry->d_parent != new_dentry->d_parent) {
8868 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8869 					BTRFS_I(old_inode), true);
8870 		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8871 					BTRFS_I(new_inode), true);
8872 	}
8873 
8874 	/* src is a subvolume */
8875 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8876 		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8877 	} else { /* src is an inode */
8878 		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8879 					   BTRFS_I(old_dentry->d_inode),
8880 					   old_name, &old_rename_ctx);
8881 		if (!ret)
8882 			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
8883 	}
8884 	if (ret) {
8885 		btrfs_abort_transaction(trans, ret);
8886 		goto out_fail;
8887 	}
8888 
8889 	/* dest is a subvolume */
8890 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8891 		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8892 	} else { /* dest is an inode */
8893 		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8894 					   BTRFS_I(new_dentry->d_inode),
8895 					   new_name, &new_rename_ctx);
8896 		if (!ret)
8897 			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
8898 	}
8899 	if (ret) {
8900 		btrfs_abort_transaction(trans, ret);
8901 		goto out_fail;
8902 	}
8903 
8904 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8905 			     new_name, 0, old_idx);
8906 	if (ret) {
8907 		btrfs_abort_transaction(trans, ret);
8908 		goto out_fail;
8909 	}
8910 
8911 	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8912 			     old_name, 0, new_idx);
8913 	if (ret) {
8914 		btrfs_abort_transaction(trans, ret);
8915 		goto out_fail;
8916 	}
8917 
8918 	if (old_inode->i_nlink == 1)
8919 		BTRFS_I(old_inode)->dir_index = old_idx;
8920 	if (new_inode->i_nlink == 1)
8921 		BTRFS_I(new_inode)->dir_index = new_idx;
8922 
8923 	/*
8924 	 * Now pin the logs of the roots. We do it to ensure that no other task
8925 	 * can sync the logs while we are in progress with the rename, because
8926 	 * that could result in an inconsistency in case any of the inodes that
8927 	 * are part of this rename operation were logged before.
8928 	 */
8929 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8930 		btrfs_pin_log_trans(root);
8931 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8932 		btrfs_pin_log_trans(dest);
8933 
8934 	/* Do the log updates for all inodes. */
8935 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8936 		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8937 				   old_rename_ctx.index, new_dentry->d_parent);
8938 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8939 		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8940 				   new_rename_ctx.index, old_dentry->d_parent);
8941 
8942 	/* Now unpin the logs. */
8943 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8944 		btrfs_end_log_trans(root);
8945 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8946 		btrfs_end_log_trans(dest);
8947 out_fail:
8948 	ret2 = btrfs_end_transaction(trans);
8949 	ret = ret ? ret : ret2;
8950 out_notrans:
8951 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8952 	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
8953 		up_read(&fs_info->subvol_sem);
8954 
8955 	fscrypt_free_filename(&new_fname);
8956 	fscrypt_free_filename(&old_fname);
8957 	return ret;
8958 }
8959 
8960 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8961 					struct inode *dir)
8962 {
8963 	struct inode *inode;
8964 
8965 	inode = new_inode(dir->i_sb);
8966 	if (inode) {
8967 		inode_init_owner(idmap, inode, dir,
8968 				 S_IFCHR | WHITEOUT_MODE);
8969 		inode->i_op = &btrfs_special_inode_operations;
8970 		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8971 	}
8972 	return inode;
8973 }
8974 
8975 static int btrfs_rename(struct mnt_idmap *idmap,
8976 			struct inode *old_dir, struct dentry *old_dentry,
8977 			struct inode *new_dir, struct dentry *new_dentry,
8978 			unsigned int flags)
8979 {
8980 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8981 	struct btrfs_new_inode_args whiteout_args = {
8982 		.dir = old_dir,
8983 		.dentry = old_dentry,
8984 	};
8985 	struct btrfs_trans_handle *trans;
8986 	unsigned int trans_num_items;
8987 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8988 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8989 	struct inode *new_inode = d_inode(new_dentry);
8990 	struct inode *old_inode = d_inode(old_dentry);
8991 	struct btrfs_rename_ctx rename_ctx;
8992 	u64 index = 0;
8993 	int ret;
8994 	int ret2;
8995 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8996 	struct fscrypt_name old_fname, new_fname;
8997 
8998 	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8999 		return -EPERM;
9000 
9001 	/* we only allow rename subvolume link between subvolumes */
9002 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9003 		return -EXDEV;
9004 
9005 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9006 	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9007 		return -ENOTEMPTY;
9008 
9009 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9010 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9011 		return -ENOTEMPTY;
9012 
9013 	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
9014 	if (ret)
9015 		return ret;
9016 
9017 	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
9018 	if (ret) {
9019 		fscrypt_free_filename(&old_fname);
9020 		return ret;
9021 	}
9022 
9023 	/* check for collisions, even if the  name isn't there */
9024 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
9025 	if (ret) {
9026 		if (ret == -EEXIST) {
9027 			/* we shouldn't get
9028 			 * eexist without a new_inode */
9029 			if (WARN_ON(!new_inode)) {
9030 				goto out_fscrypt_names;
9031 			}
9032 		} else {
9033 			/* maybe -EOVERFLOW */
9034 			goto out_fscrypt_names;
9035 		}
9036 	}
9037 	ret = 0;
9038 
9039 	/*
9040 	 * we're using rename to replace one file with another.  Start IO on it
9041 	 * now so  we don't add too much work to the end of the transaction
9042 	 */
9043 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9044 		filemap_flush(old_inode->i_mapping);
9045 
9046 	if (flags & RENAME_WHITEOUT) {
9047 		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
9048 		if (!whiteout_args.inode) {
9049 			ret = -ENOMEM;
9050 			goto out_fscrypt_names;
9051 		}
9052 		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
9053 		if (ret)
9054 			goto out_whiteout_inode;
9055 	} else {
9056 		/* 1 to update the old parent inode. */
9057 		trans_num_items = 1;
9058 	}
9059 
9060 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9061 		/* Close the race window with snapshot create/destroy ioctl */
9062 		down_read(&fs_info->subvol_sem);
9063 		/*
9064 		 * 1 to remove old root ref
9065 		 * 1 to remove old root backref
9066 		 * 1 to add new root ref
9067 		 * 1 to add new root backref
9068 		 */
9069 		trans_num_items += 4;
9070 	} else {
9071 		/*
9072 		 * 1 to update inode
9073 		 * 1 to remove old inode ref
9074 		 * 1 to add new inode ref
9075 		 */
9076 		trans_num_items += 3;
9077 	}
9078 	/*
9079 	 * 1 to remove old dir item
9080 	 * 1 to remove old dir index
9081 	 * 1 to add new dir item
9082 	 * 1 to add new dir index
9083 	 */
9084 	trans_num_items += 4;
9085 	/* 1 to update new parent inode if it's not the same as the old parent */
9086 	if (new_dir != old_dir)
9087 		trans_num_items++;
9088 	if (new_inode) {
9089 		/*
9090 		 * 1 to update inode
9091 		 * 1 to remove inode ref
9092 		 * 1 to remove dir item
9093 		 * 1 to remove dir index
9094 		 * 1 to possibly add orphan item
9095 		 */
9096 		trans_num_items += 5;
9097 	}
9098 	trans = btrfs_start_transaction(root, trans_num_items);
9099 	if (IS_ERR(trans)) {
9100 		ret = PTR_ERR(trans);
9101 		goto out_notrans;
9102 	}
9103 
9104 	if (dest != root) {
9105 		ret = btrfs_record_root_in_trans(trans, dest);
9106 		if (ret)
9107 			goto out_fail;
9108 	}
9109 
9110 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9111 	if (ret)
9112 		goto out_fail;
9113 
9114 	BTRFS_I(old_inode)->dir_index = 0ULL;
9115 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9116 		/* force full log commit if subvolume involved. */
9117 		btrfs_set_log_full_commit(trans);
9118 	} else {
9119 		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
9120 					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
9121 					     index);
9122 		if (ret)
9123 			goto out_fail;
9124 	}
9125 
9126 	inode_inc_iversion(old_dir);
9127 	inode_inc_iversion(new_dir);
9128 	inode_inc_iversion(old_inode);
9129 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
9130 
9131 	if (old_dentry->d_parent != new_dentry->d_parent)
9132 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9133 					BTRFS_I(old_inode), true);
9134 
9135 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9136 		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
9137 	} else {
9138 		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9139 					   BTRFS_I(d_inode(old_dentry)),
9140 					   &old_fname.disk_name, &rename_ctx);
9141 		if (!ret)
9142 			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9143 	}
9144 	if (ret) {
9145 		btrfs_abort_transaction(trans, ret);
9146 		goto out_fail;
9147 	}
9148 
9149 	if (new_inode) {
9150 		inode_inc_iversion(new_inode);
9151 		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9152 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9153 			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
9154 			BUG_ON(new_inode->i_nlink == 0);
9155 		} else {
9156 			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9157 						 BTRFS_I(d_inode(new_dentry)),
9158 						 &new_fname.disk_name);
9159 		}
9160 		if (!ret && new_inode->i_nlink == 0)
9161 			ret = btrfs_orphan_add(trans,
9162 					BTRFS_I(d_inode(new_dentry)));
9163 		if (ret) {
9164 			btrfs_abort_transaction(trans, ret);
9165 			goto out_fail;
9166 		}
9167 	}
9168 
9169 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9170 			     &new_fname.disk_name, 0, index);
9171 	if (ret) {
9172 		btrfs_abort_transaction(trans, ret);
9173 		goto out_fail;
9174 	}
9175 
9176 	if (old_inode->i_nlink == 1)
9177 		BTRFS_I(old_inode)->dir_index = index;
9178 
9179 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9180 		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9181 				   rename_ctx.index, new_dentry->d_parent);
9182 
9183 	if (flags & RENAME_WHITEOUT) {
9184 		ret = btrfs_create_new_inode(trans, &whiteout_args);
9185 		if (ret) {
9186 			btrfs_abort_transaction(trans, ret);
9187 			goto out_fail;
9188 		} else {
9189 			unlock_new_inode(whiteout_args.inode);
9190 			iput(whiteout_args.inode);
9191 			whiteout_args.inode = NULL;
9192 		}
9193 	}
9194 out_fail:
9195 	ret2 = btrfs_end_transaction(trans);
9196 	ret = ret ? ret : ret2;
9197 out_notrans:
9198 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9199 		up_read(&fs_info->subvol_sem);
9200 	if (flags & RENAME_WHITEOUT)
9201 		btrfs_new_inode_args_destroy(&whiteout_args);
9202 out_whiteout_inode:
9203 	if (flags & RENAME_WHITEOUT)
9204 		iput(whiteout_args.inode);
9205 out_fscrypt_names:
9206 	fscrypt_free_filename(&old_fname);
9207 	fscrypt_free_filename(&new_fname);
9208 	return ret;
9209 }
9210 
9211 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
9212 			 struct dentry *old_dentry, struct inode *new_dir,
9213 			 struct dentry *new_dentry, unsigned int flags)
9214 {
9215 	int ret;
9216 
9217 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9218 		return -EINVAL;
9219 
9220 	if (flags & RENAME_EXCHANGE)
9221 		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9222 					    new_dentry);
9223 	else
9224 		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
9225 				   new_dentry, flags);
9226 
9227 	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
9228 
9229 	return ret;
9230 }
9231 
9232 struct btrfs_delalloc_work {
9233 	struct inode *inode;
9234 	struct completion completion;
9235 	struct list_head list;
9236 	struct btrfs_work work;
9237 };
9238 
9239 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9240 {
9241 	struct btrfs_delalloc_work *delalloc_work;
9242 	struct inode *inode;
9243 
9244 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
9245 				     work);
9246 	inode = delalloc_work->inode;
9247 	filemap_flush(inode->i_mapping);
9248 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9249 				&BTRFS_I(inode)->runtime_flags))
9250 		filemap_flush(inode->i_mapping);
9251 
9252 	iput(inode);
9253 	complete(&delalloc_work->completion);
9254 }
9255 
9256 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9257 {
9258 	struct btrfs_delalloc_work *work;
9259 
9260 	work = kmalloc(sizeof(*work), GFP_NOFS);
9261 	if (!work)
9262 		return NULL;
9263 
9264 	init_completion(&work->completion);
9265 	INIT_LIST_HEAD(&work->list);
9266 	work->inode = inode;
9267 	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9268 
9269 	return work;
9270 }
9271 
9272 /*
9273  * some fairly slow code that needs optimization. This walks the list
9274  * of all the inodes with pending delalloc and forces them to disk.
9275  */
9276 static int start_delalloc_inodes(struct btrfs_root *root,
9277 				 struct writeback_control *wbc, bool snapshot,
9278 				 bool in_reclaim_context)
9279 {
9280 	struct btrfs_inode *binode;
9281 	struct inode *inode;
9282 	struct btrfs_delalloc_work *work, *next;
9283 	LIST_HEAD(works);
9284 	LIST_HEAD(splice);
9285 	int ret = 0;
9286 	bool full_flush = wbc->nr_to_write == LONG_MAX;
9287 
9288 	mutex_lock(&root->delalloc_mutex);
9289 	spin_lock(&root->delalloc_lock);
9290 	list_splice_init(&root->delalloc_inodes, &splice);
9291 	while (!list_empty(&splice)) {
9292 		binode = list_entry(splice.next, struct btrfs_inode,
9293 				    delalloc_inodes);
9294 
9295 		list_move_tail(&binode->delalloc_inodes,
9296 			       &root->delalloc_inodes);
9297 
9298 		if (in_reclaim_context &&
9299 		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9300 			continue;
9301 
9302 		inode = igrab(&binode->vfs_inode);
9303 		if (!inode) {
9304 			cond_resched_lock(&root->delalloc_lock);
9305 			continue;
9306 		}
9307 		spin_unlock(&root->delalloc_lock);
9308 
9309 		if (snapshot)
9310 			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9311 				&binode->runtime_flags);
9312 		if (full_flush) {
9313 			work = btrfs_alloc_delalloc_work(inode);
9314 			if (!work) {
9315 				iput(inode);
9316 				ret = -ENOMEM;
9317 				goto out;
9318 			}
9319 			list_add_tail(&work->list, &works);
9320 			btrfs_queue_work(root->fs_info->flush_workers,
9321 					 &work->work);
9322 		} else {
9323 			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9324 			btrfs_add_delayed_iput(BTRFS_I(inode));
9325 			if (ret || wbc->nr_to_write <= 0)
9326 				goto out;
9327 		}
9328 		cond_resched();
9329 		spin_lock(&root->delalloc_lock);
9330 	}
9331 	spin_unlock(&root->delalloc_lock);
9332 
9333 out:
9334 	list_for_each_entry_safe(work, next, &works, list) {
9335 		list_del_init(&work->list);
9336 		wait_for_completion(&work->completion);
9337 		kfree(work);
9338 	}
9339 
9340 	if (!list_empty(&splice)) {
9341 		spin_lock(&root->delalloc_lock);
9342 		list_splice_tail(&splice, &root->delalloc_inodes);
9343 		spin_unlock(&root->delalloc_lock);
9344 	}
9345 	mutex_unlock(&root->delalloc_mutex);
9346 	return ret;
9347 }
9348 
9349 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9350 {
9351 	struct writeback_control wbc = {
9352 		.nr_to_write = LONG_MAX,
9353 		.sync_mode = WB_SYNC_NONE,
9354 		.range_start = 0,
9355 		.range_end = LLONG_MAX,
9356 	};
9357 	struct btrfs_fs_info *fs_info = root->fs_info;
9358 
9359 	if (BTRFS_FS_ERROR(fs_info))
9360 		return -EROFS;
9361 
9362 	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9363 }
9364 
9365 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9366 			       bool in_reclaim_context)
9367 {
9368 	struct writeback_control wbc = {
9369 		.nr_to_write = nr,
9370 		.sync_mode = WB_SYNC_NONE,
9371 		.range_start = 0,
9372 		.range_end = LLONG_MAX,
9373 	};
9374 	struct btrfs_root *root;
9375 	LIST_HEAD(splice);
9376 	int ret;
9377 
9378 	if (BTRFS_FS_ERROR(fs_info))
9379 		return -EROFS;
9380 
9381 	mutex_lock(&fs_info->delalloc_root_mutex);
9382 	spin_lock(&fs_info->delalloc_root_lock);
9383 	list_splice_init(&fs_info->delalloc_roots, &splice);
9384 	while (!list_empty(&splice)) {
9385 		/*
9386 		 * Reset nr_to_write here so we know that we're doing a full
9387 		 * flush.
9388 		 */
9389 		if (nr == LONG_MAX)
9390 			wbc.nr_to_write = LONG_MAX;
9391 
9392 		root = list_first_entry(&splice, struct btrfs_root,
9393 					delalloc_root);
9394 		root = btrfs_grab_root(root);
9395 		BUG_ON(!root);
9396 		list_move_tail(&root->delalloc_root,
9397 			       &fs_info->delalloc_roots);
9398 		spin_unlock(&fs_info->delalloc_root_lock);
9399 
9400 		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9401 		btrfs_put_root(root);
9402 		if (ret < 0 || wbc.nr_to_write <= 0)
9403 			goto out;
9404 		spin_lock(&fs_info->delalloc_root_lock);
9405 	}
9406 	spin_unlock(&fs_info->delalloc_root_lock);
9407 
9408 	ret = 0;
9409 out:
9410 	if (!list_empty(&splice)) {
9411 		spin_lock(&fs_info->delalloc_root_lock);
9412 		list_splice_tail(&splice, &fs_info->delalloc_roots);
9413 		spin_unlock(&fs_info->delalloc_root_lock);
9414 	}
9415 	mutex_unlock(&fs_info->delalloc_root_mutex);
9416 	return ret;
9417 }
9418 
9419 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
9420 			 struct dentry *dentry, const char *symname)
9421 {
9422 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9423 	struct btrfs_trans_handle *trans;
9424 	struct btrfs_root *root = BTRFS_I(dir)->root;
9425 	struct btrfs_path *path;
9426 	struct btrfs_key key;
9427 	struct inode *inode;
9428 	struct btrfs_new_inode_args new_inode_args = {
9429 		.dir = dir,
9430 		.dentry = dentry,
9431 	};
9432 	unsigned int trans_num_items;
9433 	int err;
9434 	int name_len;
9435 	int datasize;
9436 	unsigned long ptr;
9437 	struct btrfs_file_extent_item *ei;
9438 	struct extent_buffer *leaf;
9439 
9440 	name_len = strlen(symname);
9441 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9442 		return -ENAMETOOLONG;
9443 
9444 	inode = new_inode(dir->i_sb);
9445 	if (!inode)
9446 		return -ENOMEM;
9447 	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
9448 	inode->i_op = &btrfs_symlink_inode_operations;
9449 	inode_nohighmem(inode);
9450 	inode->i_mapping->a_ops = &btrfs_aops;
9451 	btrfs_i_size_write(BTRFS_I(inode), name_len);
9452 	inode_set_bytes(inode, name_len);
9453 
9454 	new_inode_args.inode = inode;
9455 	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9456 	if (err)
9457 		goto out_inode;
9458 	/* 1 additional item for the inline extent */
9459 	trans_num_items++;
9460 
9461 	trans = btrfs_start_transaction(root, trans_num_items);
9462 	if (IS_ERR(trans)) {
9463 		err = PTR_ERR(trans);
9464 		goto out_new_inode_args;
9465 	}
9466 
9467 	err = btrfs_create_new_inode(trans, &new_inode_args);
9468 	if (err)
9469 		goto out;
9470 
9471 	path = btrfs_alloc_path();
9472 	if (!path) {
9473 		err = -ENOMEM;
9474 		btrfs_abort_transaction(trans, err);
9475 		discard_new_inode(inode);
9476 		inode = NULL;
9477 		goto out;
9478 	}
9479 	key.objectid = btrfs_ino(BTRFS_I(inode));
9480 	key.offset = 0;
9481 	key.type = BTRFS_EXTENT_DATA_KEY;
9482 	datasize = btrfs_file_extent_calc_inline_size(name_len);
9483 	err = btrfs_insert_empty_item(trans, root, path, &key,
9484 				      datasize);
9485 	if (err) {
9486 		btrfs_abort_transaction(trans, err);
9487 		btrfs_free_path(path);
9488 		discard_new_inode(inode);
9489 		inode = NULL;
9490 		goto out;
9491 	}
9492 	leaf = path->nodes[0];
9493 	ei = btrfs_item_ptr(leaf, path->slots[0],
9494 			    struct btrfs_file_extent_item);
9495 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9496 	btrfs_set_file_extent_type(leaf, ei,
9497 				   BTRFS_FILE_EXTENT_INLINE);
9498 	btrfs_set_file_extent_encryption(leaf, ei, 0);
9499 	btrfs_set_file_extent_compression(leaf, ei, 0);
9500 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9501 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9502 
9503 	ptr = btrfs_file_extent_inline_start(ei);
9504 	write_extent_buffer(leaf, symname, ptr, name_len);
9505 	btrfs_mark_buffer_dirty(trans, leaf);
9506 	btrfs_free_path(path);
9507 
9508 	d_instantiate_new(dentry, inode);
9509 	err = 0;
9510 out:
9511 	btrfs_end_transaction(trans);
9512 	btrfs_btree_balance_dirty(fs_info);
9513 out_new_inode_args:
9514 	btrfs_new_inode_args_destroy(&new_inode_args);
9515 out_inode:
9516 	if (err)
9517 		iput(inode);
9518 	return err;
9519 }
9520 
9521 static struct btrfs_trans_handle *insert_prealloc_file_extent(
9522 				       struct btrfs_trans_handle *trans_in,
9523 				       struct btrfs_inode *inode,
9524 				       struct btrfs_key *ins,
9525 				       u64 file_offset)
9526 {
9527 	struct btrfs_file_extent_item stack_fi;
9528 	struct btrfs_replace_extent_info extent_info;
9529 	struct btrfs_trans_handle *trans = trans_in;
9530 	struct btrfs_path *path;
9531 	u64 start = ins->objectid;
9532 	u64 len = ins->offset;
9533 	u64 qgroup_released = 0;
9534 	int ret;
9535 
9536 	memset(&stack_fi, 0, sizeof(stack_fi));
9537 
9538 	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9539 	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9540 	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9541 	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9542 	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9543 	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9544 	/* Encryption and other encoding is reserved and all 0 */
9545 
9546 	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
9547 	if (ret < 0)
9548 		return ERR_PTR(ret);
9549 
9550 	if (trans) {
9551 		ret = insert_reserved_file_extent(trans, inode,
9552 						  file_offset, &stack_fi,
9553 						  true, qgroup_released);
9554 		if (ret)
9555 			goto free_qgroup;
9556 		return trans;
9557 	}
9558 
9559 	extent_info.disk_offset = start;
9560 	extent_info.disk_len = len;
9561 	extent_info.data_offset = 0;
9562 	extent_info.data_len = len;
9563 	extent_info.file_offset = file_offset;
9564 	extent_info.extent_buf = (char *)&stack_fi;
9565 	extent_info.is_new_extent = true;
9566 	extent_info.update_times = true;
9567 	extent_info.qgroup_reserved = qgroup_released;
9568 	extent_info.insertions = 0;
9569 
9570 	path = btrfs_alloc_path();
9571 	if (!path) {
9572 		ret = -ENOMEM;
9573 		goto free_qgroup;
9574 	}
9575 
9576 	ret = btrfs_replace_file_extents(inode, path, file_offset,
9577 				     file_offset + len - 1, &extent_info,
9578 				     &trans);
9579 	btrfs_free_path(path);
9580 	if (ret)
9581 		goto free_qgroup;
9582 	return trans;
9583 
9584 free_qgroup:
9585 	/*
9586 	 * We have released qgroup data range at the beginning of the function,
9587 	 * and normally qgroup_released bytes will be freed when committing
9588 	 * transaction.
9589 	 * But if we error out early, we have to free what we have released
9590 	 * or we leak qgroup data reservation.
9591 	 */
9592 	btrfs_qgroup_free_refroot(inode->root->fs_info,
9593 			inode->root->root_key.objectid, qgroup_released,
9594 			BTRFS_QGROUP_RSV_DATA);
9595 	return ERR_PTR(ret);
9596 }
9597 
9598 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9599 				       u64 start, u64 num_bytes, u64 min_size,
9600 				       loff_t actual_len, u64 *alloc_hint,
9601 				       struct btrfs_trans_handle *trans)
9602 {
9603 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9604 	struct extent_map *em;
9605 	struct btrfs_root *root = BTRFS_I(inode)->root;
9606 	struct btrfs_key ins;
9607 	u64 cur_offset = start;
9608 	u64 clear_offset = start;
9609 	u64 i_size;
9610 	u64 cur_bytes;
9611 	u64 last_alloc = (u64)-1;
9612 	int ret = 0;
9613 	bool own_trans = true;
9614 	u64 end = start + num_bytes - 1;
9615 
9616 	if (trans)
9617 		own_trans = false;
9618 	while (num_bytes > 0) {
9619 		cur_bytes = min_t(u64, num_bytes, SZ_256M);
9620 		cur_bytes = max(cur_bytes, min_size);
9621 		/*
9622 		 * If we are severely fragmented we could end up with really
9623 		 * small allocations, so if the allocator is returning small
9624 		 * chunks lets make its job easier by only searching for those
9625 		 * sized chunks.
9626 		 */
9627 		cur_bytes = min(cur_bytes, last_alloc);
9628 		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9629 				min_size, 0, *alloc_hint, &ins, 1, 0);
9630 		if (ret)
9631 			break;
9632 
9633 		/*
9634 		 * We've reserved this space, and thus converted it from
9635 		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9636 		 * from here on out we will only need to clear our reservation
9637 		 * for the remaining unreserved area, so advance our
9638 		 * clear_offset by our extent size.
9639 		 */
9640 		clear_offset += ins.offset;
9641 
9642 		last_alloc = ins.offset;
9643 		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9644 						    &ins, cur_offset);
9645 		/*
9646 		 * Now that we inserted the prealloc extent we can finally
9647 		 * decrement the number of reservations in the block group.
9648 		 * If we did it before, we could race with relocation and have
9649 		 * relocation miss the reserved extent, making it fail later.
9650 		 */
9651 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9652 		if (IS_ERR(trans)) {
9653 			ret = PTR_ERR(trans);
9654 			btrfs_free_reserved_extent(fs_info, ins.objectid,
9655 						   ins.offset, 0);
9656 			break;
9657 		}
9658 
9659 		em = alloc_extent_map();
9660 		if (!em) {
9661 			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9662 					    cur_offset + ins.offset - 1, false);
9663 			btrfs_set_inode_full_sync(BTRFS_I(inode));
9664 			goto next;
9665 		}
9666 
9667 		em->start = cur_offset;
9668 		em->orig_start = cur_offset;
9669 		em->len = ins.offset;
9670 		em->block_start = ins.objectid;
9671 		em->block_len = ins.offset;
9672 		em->orig_block_len = ins.offset;
9673 		em->ram_bytes = ins.offset;
9674 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9675 		em->generation = trans->transid;
9676 
9677 		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9678 		free_extent_map(em);
9679 next:
9680 		num_bytes -= ins.offset;
9681 		cur_offset += ins.offset;
9682 		*alloc_hint = ins.objectid + ins.offset;
9683 
9684 		inode_inc_iversion(inode);
9685 		inode_set_ctime_current(inode);
9686 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9687 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9688 		    (actual_len > inode->i_size) &&
9689 		    (cur_offset > inode->i_size)) {
9690 			if (cur_offset > actual_len)
9691 				i_size = actual_len;
9692 			else
9693 				i_size = cur_offset;
9694 			i_size_write(inode, i_size);
9695 			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9696 		}
9697 
9698 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9699 
9700 		if (ret) {
9701 			btrfs_abort_transaction(trans, ret);
9702 			if (own_trans)
9703 				btrfs_end_transaction(trans);
9704 			break;
9705 		}
9706 
9707 		if (own_trans) {
9708 			btrfs_end_transaction(trans);
9709 			trans = NULL;
9710 		}
9711 	}
9712 	if (clear_offset < end)
9713 		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9714 			end - clear_offset + 1);
9715 	return ret;
9716 }
9717 
9718 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9719 			      u64 start, u64 num_bytes, u64 min_size,
9720 			      loff_t actual_len, u64 *alloc_hint)
9721 {
9722 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9723 					   min_size, actual_len, alloc_hint,
9724 					   NULL);
9725 }
9726 
9727 int btrfs_prealloc_file_range_trans(struct inode *inode,
9728 				    struct btrfs_trans_handle *trans, int mode,
9729 				    u64 start, u64 num_bytes, u64 min_size,
9730 				    loff_t actual_len, u64 *alloc_hint)
9731 {
9732 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9733 					   min_size, actual_len, alloc_hint, trans);
9734 }
9735 
9736 static int btrfs_permission(struct mnt_idmap *idmap,
9737 			    struct inode *inode, int mask)
9738 {
9739 	struct btrfs_root *root = BTRFS_I(inode)->root;
9740 	umode_t mode = inode->i_mode;
9741 
9742 	if (mask & MAY_WRITE &&
9743 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9744 		if (btrfs_root_readonly(root))
9745 			return -EROFS;
9746 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9747 			return -EACCES;
9748 	}
9749 	return generic_permission(idmap, inode, mask);
9750 }
9751 
9752 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9753 			 struct file *file, umode_t mode)
9754 {
9755 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9756 	struct btrfs_trans_handle *trans;
9757 	struct btrfs_root *root = BTRFS_I(dir)->root;
9758 	struct inode *inode;
9759 	struct btrfs_new_inode_args new_inode_args = {
9760 		.dir = dir,
9761 		.dentry = file->f_path.dentry,
9762 		.orphan = true,
9763 	};
9764 	unsigned int trans_num_items;
9765 	int ret;
9766 
9767 	inode = new_inode(dir->i_sb);
9768 	if (!inode)
9769 		return -ENOMEM;
9770 	inode_init_owner(idmap, inode, dir, mode);
9771 	inode->i_fop = &btrfs_file_operations;
9772 	inode->i_op = &btrfs_file_inode_operations;
9773 	inode->i_mapping->a_ops = &btrfs_aops;
9774 
9775 	new_inode_args.inode = inode;
9776 	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9777 	if (ret)
9778 		goto out_inode;
9779 
9780 	trans = btrfs_start_transaction(root, trans_num_items);
9781 	if (IS_ERR(trans)) {
9782 		ret = PTR_ERR(trans);
9783 		goto out_new_inode_args;
9784 	}
9785 
9786 	ret = btrfs_create_new_inode(trans, &new_inode_args);
9787 
9788 	/*
9789 	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9790 	 * set it to 1 because d_tmpfile() will issue a warning if the count is
9791 	 * 0, through:
9792 	 *
9793 	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9794 	 */
9795 	set_nlink(inode, 1);
9796 
9797 	if (!ret) {
9798 		d_tmpfile(file, inode);
9799 		unlock_new_inode(inode);
9800 		mark_inode_dirty(inode);
9801 	}
9802 
9803 	btrfs_end_transaction(trans);
9804 	btrfs_btree_balance_dirty(fs_info);
9805 out_new_inode_args:
9806 	btrfs_new_inode_args_destroy(&new_inode_args);
9807 out_inode:
9808 	if (ret)
9809 		iput(inode);
9810 	return finish_open_simple(file, ret);
9811 }
9812 
9813 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
9814 {
9815 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9816 	unsigned long index = start >> PAGE_SHIFT;
9817 	unsigned long end_index = end >> PAGE_SHIFT;
9818 	struct page *page;
9819 	u32 len;
9820 
9821 	ASSERT(end + 1 - start <= U32_MAX);
9822 	len = end + 1 - start;
9823 	while (index <= end_index) {
9824 		page = find_get_page(inode->vfs_inode.i_mapping, index);
9825 		ASSERT(page); /* Pages should be in the extent_io_tree */
9826 
9827 		btrfs_page_set_writeback(fs_info, page, start, len);
9828 		put_page(page);
9829 		index++;
9830 	}
9831 }
9832 
9833 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9834 					     int compress_type)
9835 {
9836 	switch (compress_type) {
9837 	case BTRFS_COMPRESS_NONE:
9838 		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9839 	case BTRFS_COMPRESS_ZLIB:
9840 		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9841 	case BTRFS_COMPRESS_LZO:
9842 		/*
9843 		 * The LZO format depends on the sector size. 64K is the maximum
9844 		 * sector size that we support.
9845 		 */
9846 		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9847 			return -EINVAL;
9848 		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9849 		       (fs_info->sectorsize_bits - 12);
9850 	case BTRFS_COMPRESS_ZSTD:
9851 		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9852 	default:
9853 		return -EUCLEAN;
9854 	}
9855 }
9856 
9857 static ssize_t btrfs_encoded_read_inline(
9858 				struct kiocb *iocb,
9859 				struct iov_iter *iter, u64 start,
9860 				u64 lockend,
9861 				struct extent_state **cached_state,
9862 				u64 extent_start, size_t count,
9863 				struct btrfs_ioctl_encoded_io_args *encoded,
9864 				bool *unlocked)
9865 {
9866 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9867 	struct btrfs_root *root = inode->root;
9868 	struct btrfs_fs_info *fs_info = root->fs_info;
9869 	struct extent_io_tree *io_tree = &inode->io_tree;
9870 	struct btrfs_path *path;
9871 	struct extent_buffer *leaf;
9872 	struct btrfs_file_extent_item *item;
9873 	u64 ram_bytes;
9874 	unsigned long ptr;
9875 	void *tmp;
9876 	ssize_t ret;
9877 
9878 	path = btrfs_alloc_path();
9879 	if (!path) {
9880 		ret = -ENOMEM;
9881 		goto out;
9882 	}
9883 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9884 				       extent_start, 0);
9885 	if (ret) {
9886 		if (ret > 0) {
9887 			/* The extent item disappeared? */
9888 			ret = -EIO;
9889 		}
9890 		goto out;
9891 	}
9892 	leaf = path->nodes[0];
9893 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9894 
9895 	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9896 	ptr = btrfs_file_extent_inline_start(item);
9897 
9898 	encoded->len = min_t(u64, extent_start + ram_bytes,
9899 			     inode->vfs_inode.i_size) - iocb->ki_pos;
9900 	ret = btrfs_encoded_io_compression_from_extent(fs_info,
9901 				 btrfs_file_extent_compression(leaf, item));
9902 	if (ret < 0)
9903 		goto out;
9904 	encoded->compression = ret;
9905 	if (encoded->compression) {
9906 		size_t inline_size;
9907 
9908 		inline_size = btrfs_file_extent_inline_item_len(leaf,
9909 								path->slots[0]);
9910 		if (inline_size > count) {
9911 			ret = -ENOBUFS;
9912 			goto out;
9913 		}
9914 		count = inline_size;
9915 		encoded->unencoded_len = ram_bytes;
9916 		encoded->unencoded_offset = iocb->ki_pos - extent_start;
9917 	} else {
9918 		count = min_t(u64, count, encoded->len);
9919 		encoded->len = count;
9920 		encoded->unencoded_len = count;
9921 		ptr += iocb->ki_pos - extent_start;
9922 	}
9923 
9924 	tmp = kmalloc(count, GFP_NOFS);
9925 	if (!tmp) {
9926 		ret = -ENOMEM;
9927 		goto out;
9928 	}
9929 	read_extent_buffer(leaf, tmp, ptr, count);
9930 	btrfs_release_path(path);
9931 	unlock_extent(io_tree, start, lockend, cached_state);
9932 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9933 	*unlocked = true;
9934 
9935 	ret = copy_to_iter(tmp, count, iter);
9936 	if (ret != count)
9937 		ret = -EFAULT;
9938 	kfree(tmp);
9939 out:
9940 	btrfs_free_path(path);
9941 	return ret;
9942 }
9943 
9944 struct btrfs_encoded_read_private {
9945 	wait_queue_head_t wait;
9946 	atomic_t pending;
9947 	blk_status_t status;
9948 };
9949 
9950 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9951 {
9952 	struct btrfs_encoded_read_private *priv = bbio->private;
9953 
9954 	if (bbio->bio.bi_status) {
9955 		/*
9956 		 * The memory barrier implied by the atomic_dec_return() here
9957 		 * pairs with the memory barrier implied by the
9958 		 * atomic_dec_return() or io_wait_event() in
9959 		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9960 		 * write is observed before the load of status in
9961 		 * btrfs_encoded_read_regular_fill_pages().
9962 		 */
9963 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
9964 	}
9965 	if (!atomic_dec_return(&priv->pending))
9966 		wake_up(&priv->wait);
9967 	bio_put(&bbio->bio);
9968 }
9969 
9970 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9971 					  u64 file_offset, u64 disk_bytenr,
9972 					  u64 disk_io_size, struct page **pages)
9973 {
9974 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9975 	struct btrfs_encoded_read_private priv = {
9976 		.pending = ATOMIC_INIT(1),
9977 	};
9978 	unsigned long i = 0;
9979 	struct btrfs_bio *bbio;
9980 
9981 	init_waitqueue_head(&priv.wait);
9982 
9983 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9984 			       btrfs_encoded_read_endio, &priv);
9985 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9986 	bbio->inode = inode;
9987 
9988 	do {
9989 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9990 
9991 		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9992 			atomic_inc(&priv.pending);
9993 			btrfs_submit_bio(bbio, 0);
9994 
9995 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9996 					       btrfs_encoded_read_endio, &priv);
9997 			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9998 			bbio->inode = inode;
9999 			continue;
10000 		}
10001 
10002 		i++;
10003 		disk_bytenr += bytes;
10004 		disk_io_size -= bytes;
10005 	} while (disk_io_size);
10006 
10007 	atomic_inc(&priv.pending);
10008 	btrfs_submit_bio(bbio, 0);
10009 
10010 	if (atomic_dec_return(&priv.pending))
10011 		io_wait_event(priv.wait, !atomic_read(&priv.pending));
10012 	/* See btrfs_encoded_read_endio() for ordering. */
10013 	return blk_status_to_errno(READ_ONCE(priv.status));
10014 }
10015 
10016 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
10017 					  struct iov_iter *iter,
10018 					  u64 start, u64 lockend,
10019 					  struct extent_state **cached_state,
10020 					  u64 disk_bytenr, u64 disk_io_size,
10021 					  size_t count, bool compressed,
10022 					  bool *unlocked)
10023 {
10024 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10025 	struct extent_io_tree *io_tree = &inode->io_tree;
10026 	struct page **pages;
10027 	unsigned long nr_pages, i;
10028 	u64 cur;
10029 	size_t page_offset;
10030 	ssize_t ret;
10031 
10032 	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10033 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10034 	if (!pages)
10035 		return -ENOMEM;
10036 	ret = btrfs_alloc_page_array(nr_pages, pages);
10037 	if (ret) {
10038 		ret = -ENOMEM;
10039 		goto out;
10040 		}
10041 
10042 	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10043 						    disk_io_size, pages);
10044 	if (ret)
10045 		goto out;
10046 
10047 	unlock_extent(io_tree, start, lockend, cached_state);
10048 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10049 	*unlocked = true;
10050 
10051 	if (compressed) {
10052 		i = 0;
10053 		page_offset = 0;
10054 	} else {
10055 		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10056 		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10057 	}
10058 	cur = 0;
10059 	while (cur < count) {
10060 		size_t bytes = min_t(size_t, count - cur,
10061 				     PAGE_SIZE - page_offset);
10062 
10063 		if (copy_page_to_iter(pages[i], page_offset, bytes,
10064 				      iter) != bytes) {
10065 			ret = -EFAULT;
10066 			goto out;
10067 		}
10068 		i++;
10069 		cur += bytes;
10070 		page_offset = 0;
10071 	}
10072 	ret = count;
10073 out:
10074 	for (i = 0; i < nr_pages; i++) {
10075 		if (pages[i])
10076 			__free_page(pages[i]);
10077 	}
10078 	kfree(pages);
10079 	return ret;
10080 }
10081 
10082 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10083 			   struct btrfs_ioctl_encoded_io_args *encoded)
10084 {
10085 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10086 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
10087 	struct extent_io_tree *io_tree = &inode->io_tree;
10088 	ssize_t ret;
10089 	size_t count = iov_iter_count(iter);
10090 	u64 start, lockend, disk_bytenr, disk_io_size;
10091 	struct extent_state *cached_state = NULL;
10092 	struct extent_map *em;
10093 	bool unlocked = false;
10094 
10095 	file_accessed(iocb->ki_filp);
10096 
10097 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
10098 
10099 	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10100 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10101 		return 0;
10102 	}
10103 	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10104 	/*
10105 	 * We don't know how long the extent containing iocb->ki_pos is, but if
10106 	 * it's compressed we know that it won't be longer than this.
10107 	 */
10108 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10109 
10110 	for (;;) {
10111 		struct btrfs_ordered_extent *ordered;
10112 
10113 		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10114 					       lockend - start + 1);
10115 		if (ret)
10116 			goto out_unlock_inode;
10117 		lock_extent(io_tree, start, lockend, &cached_state);
10118 		ordered = btrfs_lookup_ordered_range(inode, start,
10119 						     lockend - start + 1);
10120 		if (!ordered)
10121 			break;
10122 		btrfs_put_ordered_extent(ordered);
10123 		unlock_extent(io_tree, start, lockend, &cached_state);
10124 		cond_resched();
10125 	}
10126 
10127 	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10128 	if (IS_ERR(em)) {
10129 		ret = PTR_ERR(em);
10130 		goto out_unlock_extent;
10131 	}
10132 
10133 	if (em->block_start == EXTENT_MAP_INLINE) {
10134 		u64 extent_start = em->start;
10135 
10136 		/*
10137 		 * For inline extents we get everything we need out of the
10138 		 * extent item.
10139 		 */
10140 		free_extent_map(em);
10141 		em = NULL;
10142 		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10143 						&cached_state, extent_start,
10144 						count, encoded, &unlocked);
10145 		goto out;
10146 	}
10147 
10148 	/*
10149 	 * We only want to return up to EOF even if the extent extends beyond
10150 	 * that.
10151 	 */
10152 	encoded->len = min_t(u64, extent_map_end(em),
10153 			     inode->vfs_inode.i_size) - iocb->ki_pos;
10154 	if (em->block_start == EXTENT_MAP_HOLE ||
10155 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10156 		disk_bytenr = EXTENT_MAP_HOLE;
10157 		count = min_t(u64, count, encoded->len);
10158 		encoded->len = count;
10159 		encoded->unencoded_len = count;
10160 	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10161 		disk_bytenr = em->block_start;
10162 		/*
10163 		 * Bail if the buffer isn't large enough to return the whole
10164 		 * compressed extent.
10165 		 */
10166 		if (em->block_len > count) {
10167 			ret = -ENOBUFS;
10168 			goto out_em;
10169 		}
10170 		disk_io_size = em->block_len;
10171 		count = em->block_len;
10172 		encoded->unencoded_len = em->ram_bytes;
10173 		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10174 		ret = btrfs_encoded_io_compression_from_extent(fs_info,
10175 							     em->compress_type);
10176 		if (ret < 0)
10177 			goto out_em;
10178 		encoded->compression = ret;
10179 	} else {
10180 		disk_bytenr = em->block_start + (start - em->start);
10181 		if (encoded->len > count)
10182 			encoded->len = count;
10183 		/*
10184 		 * Don't read beyond what we locked. This also limits the page
10185 		 * allocations that we'll do.
10186 		 */
10187 		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10188 		count = start + disk_io_size - iocb->ki_pos;
10189 		encoded->len = count;
10190 		encoded->unencoded_len = count;
10191 		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10192 	}
10193 	free_extent_map(em);
10194 	em = NULL;
10195 
10196 	if (disk_bytenr == EXTENT_MAP_HOLE) {
10197 		unlock_extent(io_tree, start, lockend, &cached_state);
10198 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10199 		unlocked = true;
10200 		ret = iov_iter_zero(count, iter);
10201 		if (ret != count)
10202 			ret = -EFAULT;
10203 	} else {
10204 		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10205 						 &cached_state, disk_bytenr,
10206 						 disk_io_size, count,
10207 						 encoded->compression,
10208 						 &unlocked);
10209 	}
10210 
10211 out:
10212 	if (ret >= 0)
10213 		iocb->ki_pos += encoded->len;
10214 out_em:
10215 	free_extent_map(em);
10216 out_unlock_extent:
10217 	if (!unlocked)
10218 		unlock_extent(io_tree, start, lockend, &cached_state);
10219 out_unlock_inode:
10220 	if (!unlocked)
10221 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10222 	return ret;
10223 }
10224 
10225 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10226 			       const struct btrfs_ioctl_encoded_io_args *encoded)
10227 {
10228 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10229 	struct btrfs_root *root = inode->root;
10230 	struct btrfs_fs_info *fs_info = root->fs_info;
10231 	struct extent_io_tree *io_tree = &inode->io_tree;
10232 	struct extent_changeset *data_reserved = NULL;
10233 	struct extent_state *cached_state = NULL;
10234 	struct btrfs_ordered_extent *ordered;
10235 	int compression;
10236 	size_t orig_count;
10237 	u64 start, end;
10238 	u64 num_bytes, ram_bytes, disk_num_bytes;
10239 	unsigned long nr_pages, i;
10240 	struct page **pages;
10241 	struct btrfs_key ins;
10242 	bool extent_reserved = false;
10243 	struct extent_map *em;
10244 	ssize_t ret;
10245 
10246 	switch (encoded->compression) {
10247 	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10248 		compression = BTRFS_COMPRESS_ZLIB;
10249 		break;
10250 	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10251 		compression = BTRFS_COMPRESS_ZSTD;
10252 		break;
10253 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10254 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10255 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10256 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10257 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10258 		/* The sector size must match for LZO. */
10259 		if (encoded->compression -
10260 		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10261 		    fs_info->sectorsize_bits)
10262 			return -EINVAL;
10263 		compression = BTRFS_COMPRESS_LZO;
10264 		break;
10265 	default:
10266 		return -EINVAL;
10267 	}
10268 	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10269 		return -EINVAL;
10270 
10271 	/*
10272 	 * Compressed extents should always have checksums, so error out if we
10273 	 * have a NOCOW file or inode was created while mounted with NODATASUM.
10274 	 */
10275 	if (inode->flags & BTRFS_INODE_NODATASUM)
10276 		return -EINVAL;
10277 
10278 	orig_count = iov_iter_count(from);
10279 
10280 	/* The extent size must be sane. */
10281 	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10282 	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10283 		return -EINVAL;
10284 
10285 	/*
10286 	 * The compressed data must be smaller than the decompressed data.
10287 	 *
10288 	 * It's of course possible for data to compress to larger or the same
10289 	 * size, but the buffered I/O path falls back to no compression for such
10290 	 * data, and we don't want to break any assumptions by creating these
10291 	 * extents.
10292 	 *
10293 	 * Note that this is less strict than the current check we have that the
10294 	 * compressed data must be at least one sector smaller than the
10295 	 * decompressed data. We only want to enforce the weaker requirement
10296 	 * from old kernels that it is at least one byte smaller.
10297 	 */
10298 	if (orig_count >= encoded->unencoded_len)
10299 		return -EINVAL;
10300 
10301 	/* The extent must start on a sector boundary. */
10302 	start = iocb->ki_pos;
10303 	if (!IS_ALIGNED(start, fs_info->sectorsize))
10304 		return -EINVAL;
10305 
10306 	/*
10307 	 * The extent must end on a sector boundary. However, we allow a write
10308 	 * which ends at or extends i_size to have an unaligned length; we round
10309 	 * up the extent size and set i_size to the unaligned end.
10310 	 */
10311 	if (start + encoded->len < inode->vfs_inode.i_size &&
10312 	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10313 		return -EINVAL;
10314 
10315 	/* Finally, the offset in the unencoded data must be sector-aligned. */
10316 	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10317 		return -EINVAL;
10318 
10319 	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10320 	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10321 	end = start + num_bytes - 1;
10322 
10323 	/*
10324 	 * If the extent cannot be inline, the compressed data on disk must be
10325 	 * sector-aligned. For convenience, we extend it with zeroes if it
10326 	 * isn't.
10327 	 */
10328 	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10329 	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10330 	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10331 	if (!pages)
10332 		return -ENOMEM;
10333 	for (i = 0; i < nr_pages; i++) {
10334 		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10335 		char *kaddr;
10336 
10337 		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10338 		if (!pages[i]) {
10339 			ret = -ENOMEM;
10340 			goto out_pages;
10341 		}
10342 		kaddr = kmap_local_page(pages[i]);
10343 		if (copy_from_iter(kaddr, bytes, from) != bytes) {
10344 			kunmap_local(kaddr);
10345 			ret = -EFAULT;
10346 			goto out_pages;
10347 		}
10348 		if (bytes < PAGE_SIZE)
10349 			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10350 		kunmap_local(kaddr);
10351 	}
10352 
10353 	for (;;) {
10354 		struct btrfs_ordered_extent *ordered;
10355 
10356 		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10357 		if (ret)
10358 			goto out_pages;
10359 		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10360 						    start >> PAGE_SHIFT,
10361 						    end >> PAGE_SHIFT);
10362 		if (ret)
10363 			goto out_pages;
10364 		lock_extent(io_tree, start, end, &cached_state);
10365 		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10366 		if (!ordered &&
10367 		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10368 			break;
10369 		if (ordered)
10370 			btrfs_put_ordered_extent(ordered);
10371 		unlock_extent(io_tree, start, end, &cached_state);
10372 		cond_resched();
10373 	}
10374 
10375 	/*
10376 	 * We don't use the higher-level delalloc space functions because our
10377 	 * num_bytes and disk_num_bytes are different.
10378 	 */
10379 	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10380 	if (ret)
10381 		goto out_unlock;
10382 	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10383 	if (ret)
10384 		goto out_free_data_space;
10385 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10386 					      false);
10387 	if (ret)
10388 		goto out_qgroup_free_data;
10389 
10390 	/* Try an inline extent first. */
10391 	if (start == 0 && encoded->unencoded_len == encoded->len &&
10392 	    encoded->unencoded_offset == 0) {
10393 		ret = cow_file_range_inline(inode, encoded->len, orig_count,
10394 					    compression, pages, true);
10395 		if (ret <= 0) {
10396 			if (ret == 0)
10397 				ret = orig_count;
10398 			goto out_delalloc_release;
10399 		}
10400 	}
10401 
10402 	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10403 				   disk_num_bytes, 0, 0, &ins, 1, 1);
10404 	if (ret)
10405 		goto out_delalloc_release;
10406 	extent_reserved = true;
10407 
10408 	em = create_io_em(inode, start, num_bytes,
10409 			  start - encoded->unencoded_offset, ins.objectid,
10410 			  ins.offset, ins.offset, ram_bytes, compression,
10411 			  BTRFS_ORDERED_COMPRESSED);
10412 	if (IS_ERR(em)) {
10413 		ret = PTR_ERR(em);
10414 		goto out_free_reserved;
10415 	}
10416 	free_extent_map(em);
10417 
10418 	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10419 				       ins.objectid, ins.offset,
10420 				       encoded->unencoded_offset,
10421 				       (1 << BTRFS_ORDERED_ENCODED) |
10422 				       (1 << BTRFS_ORDERED_COMPRESSED),
10423 				       compression);
10424 	if (IS_ERR(ordered)) {
10425 		btrfs_drop_extent_map_range(inode, start, end, false);
10426 		ret = PTR_ERR(ordered);
10427 		goto out_free_reserved;
10428 	}
10429 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10430 
10431 	if (start + encoded->len > inode->vfs_inode.i_size)
10432 		i_size_write(&inode->vfs_inode, start + encoded->len);
10433 
10434 	unlock_extent(io_tree, start, end, &cached_state);
10435 
10436 	btrfs_delalloc_release_extents(inode, num_bytes);
10437 
10438 	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10439 	ret = orig_count;
10440 	goto out;
10441 
10442 out_free_reserved:
10443 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10444 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10445 out_delalloc_release:
10446 	btrfs_delalloc_release_extents(inode, num_bytes);
10447 	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10448 out_qgroup_free_data:
10449 	if (ret < 0)
10450 		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
10451 out_free_data_space:
10452 	/*
10453 	 * If btrfs_reserve_extent() succeeded, then we already decremented
10454 	 * bytes_may_use.
10455 	 */
10456 	if (!extent_reserved)
10457 		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10458 out_unlock:
10459 	unlock_extent(io_tree, start, end, &cached_state);
10460 out_pages:
10461 	for (i = 0; i < nr_pages; i++) {
10462 		if (pages[i])
10463 			__free_page(pages[i]);
10464 	}
10465 	kvfree(pages);
10466 out:
10467 	if (ret >= 0)
10468 		iocb->ki_pos += encoded->len;
10469 	return ret;
10470 }
10471 
10472 #ifdef CONFIG_SWAP
10473 /*
10474  * Add an entry indicating a block group or device which is pinned by a
10475  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10476  * negative errno on failure.
10477  */
10478 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10479 				  bool is_block_group)
10480 {
10481 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10482 	struct btrfs_swapfile_pin *sp, *entry;
10483 	struct rb_node **p;
10484 	struct rb_node *parent = NULL;
10485 
10486 	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10487 	if (!sp)
10488 		return -ENOMEM;
10489 	sp->ptr = ptr;
10490 	sp->inode = inode;
10491 	sp->is_block_group = is_block_group;
10492 	sp->bg_extent_count = 1;
10493 
10494 	spin_lock(&fs_info->swapfile_pins_lock);
10495 	p = &fs_info->swapfile_pins.rb_node;
10496 	while (*p) {
10497 		parent = *p;
10498 		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10499 		if (sp->ptr < entry->ptr ||
10500 		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10501 			p = &(*p)->rb_left;
10502 		} else if (sp->ptr > entry->ptr ||
10503 			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10504 			p = &(*p)->rb_right;
10505 		} else {
10506 			if (is_block_group)
10507 				entry->bg_extent_count++;
10508 			spin_unlock(&fs_info->swapfile_pins_lock);
10509 			kfree(sp);
10510 			return 1;
10511 		}
10512 	}
10513 	rb_link_node(&sp->node, parent, p);
10514 	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10515 	spin_unlock(&fs_info->swapfile_pins_lock);
10516 	return 0;
10517 }
10518 
10519 /* Free all of the entries pinned by this swapfile. */
10520 static void btrfs_free_swapfile_pins(struct inode *inode)
10521 {
10522 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10523 	struct btrfs_swapfile_pin *sp;
10524 	struct rb_node *node, *next;
10525 
10526 	spin_lock(&fs_info->swapfile_pins_lock);
10527 	node = rb_first(&fs_info->swapfile_pins);
10528 	while (node) {
10529 		next = rb_next(node);
10530 		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10531 		if (sp->inode == inode) {
10532 			rb_erase(&sp->node, &fs_info->swapfile_pins);
10533 			if (sp->is_block_group) {
10534 				btrfs_dec_block_group_swap_extents(sp->ptr,
10535 							   sp->bg_extent_count);
10536 				btrfs_put_block_group(sp->ptr);
10537 			}
10538 			kfree(sp);
10539 		}
10540 		node = next;
10541 	}
10542 	spin_unlock(&fs_info->swapfile_pins_lock);
10543 }
10544 
10545 struct btrfs_swap_info {
10546 	u64 start;
10547 	u64 block_start;
10548 	u64 block_len;
10549 	u64 lowest_ppage;
10550 	u64 highest_ppage;
10551 	unsigned long nr_pages;
10552 	int nr_extents;
10553 };
10554 
10555 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10556 				 struct btrfs_swap_info *bsi)
10557 {
10558 	unsigned long nr_pages;
10559 	unsigned long max_pages;
10560 	u64 first_ppage, first_ppage_reported, next_ppage;
10561 	int ret;
10562 
10563 	/*
10564 	 * Our swapfile may have had its size extended after the swap header was
10565 	 * written. In that case activating the swapfile should not go beyond
10566 	 * the max size set in the swap header.
10567 	 */
10568 	if (bsi->nr_pages >= sis->max)
10569 		return 0;
10570 
10571 	max_pages = sis->max - bsi->nr_pages;
10572 	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10573 	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10574 
10575 	if (first_ppage >= next_ppage)
10576 		return 0;
10577 	nr_pages = next_ppage - first_ppage;
10578 	nr_pages = min(nr_pages, max_pages);
10579 
10580 	first_ppage_reported = first_ppage;
10581 	if (bsi->start == 0)
10582 		first_ppage_reported++;
10583 	if (bsi->lowest_ppage > first_ppage_reported)
10584 		bsi->lowest_ppage = first_ppage_reported;
10585 	if (bsi->highest_ppage < (next_ppage - 1))
10586 		bsi->highest_ppage = next_ppage - 1;
10587 
10588 	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10589 	if (ret < 0)
10590 		return ret;
10591 	bsi->nr_extents += ret;
10592 	bsi->nr_pages += nr_pages;
10593 	return 0;
10594 }
10595 
10596 static void btrfs_swap_deactivate(struct file *file)
10597 {
10598 	struct inode *inode = file_inode(file);
10599 
10600 	btrfs_free_swapfile_pins(inode);
10601 	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10602 }
10603 
10604 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10605 			       sector_t *span)
10606 {
10607 	struct inode *inode = file_inode(file);
10608 	struct btrfs_root *root = BTRFS_I(inode)->root;
10609 	struct btrfs_fs_info *fs_info = root->fs_info;
10610 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10611 	struct extent_state *cached_state = NULL;
10612 	struct extent_map *em = NULL;
10613 	struct btrfs_device *device = NULL;
10614 	struct btrfs_swap_info bsi = {
10615 		.lowest_ppage = (sector_t)-1ULL,
10616 	};
10617 	int ret = 0;
10618 	u64 isize;
10619 	u64 start;
10620 
10621 	/*
10622 	 * If the swap file was just created, make sure delalloc is done. If the
10623 	 * file changes again after this, the user is doing something stupid and
10624 	 * we don't really care.
10625 	 */
10626 	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10627 	if (ret)
10628 		return ret;
10629 
10630 	/*
10631 	 * The inode is locked, so these flags won't change after we check them.
10632 	 */
10633 	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10634 		btrfs_warn(fs_info, "swapfile must not be compressed");
10635 		return -EINVAL;
10636 	}
10637 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10638 		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10639 		return -EINVAL;
10640 	}
10641 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10642 		btrfs_warn(fs_info, "swapfile must not be checksummed");
10643 		return -EINVAL;
10644 	}
10645 
10646 	/*
10647 	 * Balance or device remove/replace/resize can move stuff around from
10648 	 * under us. The exclop protection makes sure they aren't running/won't
10649 	 * run concurrently while we are mapping the swap extents, and
10650 	 * fs_info->swapfile_pins prevents them from running while the swap
10651 	 * file is active and moving the extents. Note that this also prevents
10652 	 * a concurrent device add which isn't actually necessary, but it's not
10653 	 * really worth the trouble to allow it.
10654 	 */
10655 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10656 		btrfs_warn(fs_info,
10657 	   "cannot activate swapfile while exclusive operation is running");
10658 		return -EBUSY;
10659 	}
10660 
10661 	/*
10662 	 * Prevent snapshot creation while we are activating the swap file.
10663 	 * We do not want to race with snapshot creation. If snapshot creation
10664 	 * already started before we bumped nr_swapfiles from 0 to 1 and
10665 	 * completes before the first write into the swap file after it is
10666 	 * activated, than that write would fallback to COW.
10667 	 */
10668 	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10669 		btrfs_exclop_finish(fs_info);
10670 		btrfs_warn(fs_info,
10671 	   "cannot activate swapfile because snapshot creation is in progress");
10672 		return -EINVAL;
10673 	}
10674 	/*
10675 	 * Snapshots can create extents which require COW even if NODATACOW is
10676 	 * set. We use this counter to prevent snapshots. We must increment it
10677 	 * before walking the extents because we don't want a concurrent
10678 	 * snapshot to run after we've already checked the extents.
10679 	 *
10680 	 * It is possible that subvolume is marked for deletion but still not
10681 	 * removed yet. To prevent this race, we check the root status before
10682 	 * activating the swapfile.
10683 	 */
10684 	spin_lock(&root->root_item_lock);
10685 	if (btrfs_root_dead(root)) {
10686 		spin_unlock(&root->root_item_lock);
10687 
10688 		btrfs_exclop_finish(fs_info);
10689 		btrfs_warn(fs_info,
10690 		"cannot activate swapfile because subvolume %llu is being deleted",
10691 			root->root_key.objectid);
10692 		return -EPERM;
10693 	}
10694 	atomic_inc(&root->nr_swapfiles);
10695 	spin_unlock(&root->root_item_lock);
10696 
10697 	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10698 
10699 	lock_extent(io_tree, 0, isize - 1, &cached_state);
10700 	start = 0;
10701 	while (start < isize) {
10702 		u64 logical_block_start, physical_block_start;
10703 		struct btrfs_block_group *bg;
10704 		u64 len = isize - start;
10705 
10706 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10707 		if (IS_ERR(em)) {
10708 			ret = PTR_ERR(em);
10709 			goto out;
10710 		}
10711 
10712 		if (em->block_start == EXTENT_MAP_HOLE) {
10713 			btrfs_warn(fs_info, "swapfile must not have holes");
10714 			ret = -EINVAL;
10715 			goto out;
10716 		}
10717 		if (em->block_start == EXTENT_MAP_INLINE) {
10718 			/*
10719 			 * It's unlikely we'll ever actually find ourselves
10720 			 * here, as a file small enough to fit inline won't be
10721 			 * big enough to store more than the swap header, but in
10722 			 * case something changes in the future, let's catch it
10723 			 * here rather than later.
10724 			 */
10725 			btrfs_warn(fs_info, "swapfile must not be inline");
10726 			ret = -EINVAL;
10727 			goto out;
10728 		}
10729 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10730 			btrfs_warn(fs_info, "swapfile must not be compressed");
10731 			ret = -EINVAL;
10732 			goto out;
10733 		}
10734 
10735 		logical_block_start = em->block_start + (start - em->start);
10736 		len = min(len, em->len - (start - em->start));
10737 		free_extent_map(em);
10738 		em = NULL;
10739 
10740 		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10741 		if (ret < 0) {
10742 			goto out;
10743 		} else if (ret) {
10744 			ret = 0;
10745 		} else {
10746 			btrfs_warn(fs_info,
10747 				   "swapfile must not be copy-on-write");
10748 			ret = -EINVAL;
10749 			goto out;
10750 		}
10751 
10752 		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10753 		if (IS_ERR(em)) {
10754 			ret = PTR_ERR(em);
10755 			goto out;
10756 		}
10757 
10758 		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10759 			btrfs_warn(fs_info,
10760 				   "swapfile must have single data profile");
10761 			ret = -EINVAL;
10762 			goto out;
10763 		}
10764 
10765 		if (device == NULL) {
10766 			device = em->map_lookup->stripes[0].dev;
10767 			ret = btrfs_add_swapfile_pin(inode, device, false);
10768 			if (ret == 1)
10769 				ret = 0;
10770 			else if (ret)
10771 				goto out;
10772 		} else if (device != em->map_lookup->stripes[0].dev) {
10773 			btrfs_warn(fs_info, "swapfile must be on one device");
10774 			ret = -EINVAL;
10775 			goto out;
10776 		}
10777 
10778 		physical_block_start = (em->map_lookup->stripes[0].physical +
10779 					(logical_block_start - em->start));
10780 		len = min(len, em->len - (logical_block_start - em->start));
10781 		free_extent_map(em);
10782 		em = NULL;
10783 
10784 		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10785 		if (!bg) {
10786 			btrfs_warn(fs_info,
10787 			   "could not find block group containing swapfile");
10788 			ret = -EINVAL;
10789 			goto out;
10790 		}
10791 
10792 		if (!btrfs_inc_block_group_swap_extents(bg)) {
10793 			btrfs_warn(fs_info,
10794 			   "block group for swapfile at %llu is read-only%s",
10795 			   bg->start,
10796 			   atomic_read(&fs_info->scrubs_running) ?
10797 				       " (scrub running)" : "");
10798 			btrfs_put_block_group(bg);
10799 			ret = -EINVAL;
10800 			goto out;
10801 		}
10802 
10803 		ret = btrfs_add_swapfile_pin(inode, bg, true);
10804 		if (ret) {
10805 			btrfs_put_block_group(bg);
10806 			if (ret == 1)
10807 				ret = 0;
10808 			else
10809 				goto out;
10810 		}
10811 
10812 		if (bsi.block_len &&
10813 		    bsi.block_start + bsi.block_len == physical_block_start) {
10814 			bsi.block_len += len;
10815 		} else {
10816 			if (bsi.block_len) {
10817 				ret = btrfs_add_swap_extent(sis, &bsi);
10818 				if (ret)
10819 					goto out;
10820 			}
10821 			bsi.start = start;
10822 			bsi.block_start = physical_block_start;
10823 			bsi.block_len = len;
10824 		}
10825 
10826 		start += len;
10827 	}
10828 
10829 	if (bsi.block_len)
10830 		ret = btrfs_add_swap_extent(sis, &bsi);
10831 
10832 out:
10833 	if (!IS_ERR_OR_NULL(em))
10834 		free_extent_map(em);
10835 
10836 	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10837 
10838 	if (ret)
10839 		btrfs_swap_deactivate(file);
10840 
10841 	btrfs_drew_write_unlock(&root->snapshot_lock);
10842 
10843 	btrfs_exclop_finish(fs_info);
10844 
10845 	if (ret)
10846 		return ret;
10847 
10848 	if (device)
10849 		sis->bdev = device->bdev;
10850 	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10851 	sis->max = bsi.nr_pages;
10852 	sis->pages = bsi.nr_pages - 1;
10853 	sis->highest_bit = bsi.nr_pages - 1;
10854 	return bsi.nr_extents;
10855 }
10856 #else
10857 static void btrfs_swap_deactivate(struct file *file)
10858 {
10859 }
10860 
10861 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10862 			       sector_t *span)
10863 {
10864 	return -EOPNOTSUPP;
10865 }
10866 #endif
10867 
10868 /*
10869  * Update the number of bytes used in the VFS' inode. When we replace extents in
10870  * a range (clone, dedupe, fallocate's zero range), we must update the number of
10871  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10872  * always get a correct value.
10873  */
10874 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10875 			      const u64 add_bytes,
10876 			      const u64 del_bytes)
10877 {
10878 	if (add_bytes == del_bytes)
10879 		return;
10880 
10881 	spin_lock(&inode->lock);
10882 	if (del_bytes > 0)
10883 		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10884 	if (add_bytes > 0)
10885 		inode_add_bytes(&inode->vfs_inode, add_bytes);
10886 	spin_unlock(&inode->lock);
10887 }
10888 
10889 /*
10890  * Verify that there are no ordered extents for a given file range.
10891  *
10892  * @inode:   The target inode.
10893  * @start:   Start offset of the file range, should be sector size aligned.
10894  * @end:     End offset (inclusive) of the file range, its value +1 should be
10895  *           sector size aligned.
10896  *
10897  * This should typically be used for cases where we locked an inode's VFS lock in
10898  * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10899  * we have flushed all delalloc in the range, we have waited for all ordered
10900  * extents in the range to complete and finally we have locked the file range in
10901  * the inode's io_tree.
10902  */
10903 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10904 {
10905 	struct btrfs_root *root = inode->root;
10906 	struct btrfs_ordered_extent *ordered;
10907 
10908 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10909 		return;
10910 
10911 	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10912 	if (ordered) {
10913 		btrfs_err(root->fs_info,
10914 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10915 			  start, end, btrfs_ino(inode), root->root_key.objectid,
10916 			  ordered->file_offset,
10917 			  ordered->file_offset + ordered->num_bytes - 1);
10918 		btrfs_put_ordered_extent(ordered);
10919 	}
10920 
10921 	ASSERT(ordered == NULL);
10922 }
10923 
10924 static const struct inode_operations btrfs_dir_inode_operations = {
10925 	.getattr	= btrfs_getattr,
10926 	.lookup		= btrfs_lookup,
10927 	.create		= btrfs_create,
10928 	.unlink		= btrfs_unlink,
10929 	.link		= btrfs_link,
10930 	.mkdir		= btrfs_mkdir,
10931 	.rmdir		= btrfs_rmdir,
10932 	.rename		= btrfs_rename2,
10933 	.symlink	= btrfs_symlink,
10934 	.setattr	= btrfs_setattr,
10935 	.mknod		= btrfs_mknod,
10936 	.listxattr	= btrfs_listxattr,
10937 	.permission	= btrfs_permission,
10938 	.get_inode_acl	= btrfs_get_acl,
10939 	.set_acl	= btrfs_set_acl,
10940 	.update_time	= btrfs_update_time,
10941 	.tmpfile        = btrfs_tmpfile,
10942 	.fileattr_get	= btrfs_fileattr_get,
10943 	.fileattr_set	= btrfs_fileattr_set,
10944 };
10945 
10946 static const struct file_operations btrfs_dir_file_operations = {
10947 	.llseek		= btrfs_dir_llseek,
10948 	.read		= generic_read_dir,
10949 	.iterate_shared	= btrfs_real_readdir,
10950 	.open		= btrfs_opendir,
10951 	.unlocked_ioctl	= btrfs_ioctl,
10952 #ifdef CONFIG_COMPAT
10953 	.compat_ioctl	= btrfs_compat_ioctl,
10954 #endif
10955 	.release        = btrfs_release_file,
10956 	.fsync		= btrfs_sync_file,
10957 };
10958 
10959 /*
10960  * btrfs doesn't support the bmap operation because swapfiles
10961  * use bmap to make a mapping of extents in the file.  They assume
10962  * these extents won't change over the life of the file and they
10963  * use the bmap result to do IO directly to the drive.
10964  *
10965  * the btrfs bmap call would return logical addresses that aren't
10966  * suitable for IO and they also will change frequently as COW
10967  * operations happen.  So, swapfile + btrfs == corruption.
10968  *
10969  * For now we're avoiding this by dropping bmap.
10970  */
10971 static const struct address_space_operations btrfs_aops = {
10972 	.read_folio	= btrfs_read_folio,
10973 	.writepages	= btrfs_writepages,
10974 	.readahead	= btrfs_readahead,
10975 	.invalidate_folio = btrfs_invalidate_folio,
10976 	.release_folio	= btrfs_release_folio,
10977 	.migrate_folio	= btrfs_migrate_folio,
10978 	.dirty_folio	= filemap_dirty_folio,
10979 	.error_remove_page = generic_error_remove_page,
10980 	.swap_activate	= btrfs_swap_activate,
10981 	.swap_deactivate = btrfs_swap_deactivate,
10982 };
10983 
10984 static const struct inode_operations btrfs_file_inode_operations = {
10985 	.getattr	= btrfs_getattr,
10986 	.setattr	= btrfs_setattr,
10987 	.listxattr      = btrfs_listxattr,
10988 	.permission	= btrfs_permission,
10989 	.fiemap		= btrfs_fiemap,
10990 	.get_inode_acl	= btrfs_get_acl,
10991 	.set_acl	= btrfs_set_acl,
10992 	.update_time	= btrfs_update_time,
10993 	.fileattr_get	= btrfs_fileattr_get,
10994 	.fileattr_set	= btrfs_fileattr_set,
10995 };
10996 static const struct inode_operations btrfs_special_inode_operations = {
10997 	.getattr	= btrfs_getattr,
10998 	.setattr	= btrfs_setattr,
10999 	.permission	= btrfs_permission,
11000 	.listxattr	= btrfs_listxattr,
11001 	.get_inode_acl	= btrfs_get_acl,
11002 	.set_acl	= btrfs_set_acl,
11003 	.update_time	= btrfs_update_time,
11004 };
11005 static const struct inode_operations btrfs_symlink_inode_operations = {
11006 	.get_link	= page_get_link,
11007 	.getattr	= btrfs_getattr,
11008 	.setattr	= btrfs_setattr,
11009 	.permission	= btrfs_permission,
11010 	.listxattr	= btrfs_listxattr,
11011 	.update_time	= btrfs_update_time,
11012 };
11013 
11014 const struct dentry_operations btrfs_dentry_operations = {
11015 	.d_delete	= btrfs_dentry_delete,
11016 };
11017