xref: /openbmc/linux/fs/btrfs/inode.c (revision 9144f784f852f9a125cabe9927b986d909bfa439)
1  // SPDX-License-Identifier: GPL-2.0
2  /*
3   * Copyright (C) 2007 Oracle.  All rights reserved.
4   */
5  
6  #include <crypto/hash.h>
7  #include <linux/kernel.h>
8  #include <linux/bio.h>
9  #include <linux/blk-cgroup.h>
10  #include <linux/file.h>
11  #include <linux/fs.h>
12  #include <linux/pagemap.h>
13  #include <linux/highmem.h>
14  #include <linux/time.h>
15  #include <linux/init.h>
16  #include <linux/string.h>
17  #include <linux/backing-dev.h>
18  #include <linux/writeback.h>
19  #include <linux/compat.h>
20  #include <linux/xattr.h>
21  #include <linux/posix_acl.h>
22  #include <linux/falloc.h>
23  #include <linux/slab.h>
24  #include <linux/ratelimit.h>
25  #include <linux/btrfs.h>
26  #include <linux/blkdev.h>
27  #include <linux/posix_acl_xattr.h>
28  #include <linux/uio.h>
29  #include <linux/magic.h>
30  #include <linux/iversion.h>
31  #include <linux/swap.h>
32  #include <linux/migrate.h>
33  #include <linux/sched/mm.h>
34  #include <linux/iomap.h>
35  #include <asm/unaligned.h>
36  #include <linux/fsverity.h>
37  #include "misc.h"
38  #include "ctree.h"
39  #include "disk-io.h"
40  #include "transaction.h"
41  #include "btrfs_inode.h"
42  #include "print-tree.h"
43  #include "ordered-data.h"
44  #include "xattr.h"
45  #include "tree-log.h"
46  #include "bio.h"
47  #include "compression.h"
48  #include "locking.h"
49  #include "free-space-cache.h"
50  #include "props.h"
51  #include "qgroup.h"
52  #include "delalloc-space.h"
53  #include "block-group.h"
54  #include "space-info.h"
55  #include "zoned.h"
56  #include "subpage.h"
57  #include "inode-item.h"
58  #include "fs.h"
59  #include "accessors.h"
60  #include "extent-tree.h"
61  #include "root-tree.h"
62  #include "defrag.h"
63  #include "dir-item.h"
64  #include "file-item.h"
65  #include "uuid-tree.h"
66  #include "ioctl.h"
67  #include "file.h"
68  #include "acl.h"
69  #include "relocation.h"
70  #include "verity.h"
71  #include "super.h"
72  #include "orphan.h"
73  #include "backref.h"
74  
75  struct btrfs_iget_args {
76  	u64 ino;
77  	struct btrfs_root *root;
78  };
79  
80  struct btrfs_dio_data {
81  	ssize_t submitted;
82  	struct extent_changeset *data_reserved;
83  	struct btrfs_ordered_extent *ordered;
84  	bool data_space_reserved;
85  	bool nocow_done;
86  };
87  
88  struct btrfs_dio_private {
89  	/* Range of I/O */
90  	u64 file_offset;
91  	u32 bytes;
92  
93  	/* This must be last */
94  	struct btrfs_bio bbio;
95  };
96  
97  static struct bio_set btrfs_dio_bioset;
98  
99  struct btrfs_rename_ctx {
100  	/* Output field. Stores the index number of the old directory entry. */
101  	u64 index;
102  };
103  
104  /*
105   * Used by data_reloc_print_warning_inode() to pass needed info for filename
106   * resolution and output of error message.
107   */
108  struct data_reloc_warn {
109  	struct btrfs_path path;
110  	struct btrfs_fs_info *fs_info;
111  	u64 extent_item_size;
112  	u64 logical;
113  	int mirror_num;
114  };
115  
116  static const struct inode_operations btrfs_dir_inode_operations;
117  static const struct inode_operations btrfs_symlink_inode_operations;
118  static const struct inode_operations btrfs_special_inode_operations;
119  static const struct inode_operations btrfs_file_inode_operations;
120  static const struct address_space_operations btrfs_aops;
121  static const struct file_operations btrfs_dir_file_operations;
122  
123  static struct kmem_cache *btrfs_inode_cachep;
124  
125  static int btrfs_setsize(struct inode *inode, struct iattr *attr);
126  static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
127  
128  static noinline int run_delalloc_cow(struct btrfs_inode *inode,
129  				     struct page *locked_page, u64 start,
130  				     u64 end, struct writeback_control *wbc,
131  				     bool pages_dirty);
132  static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
133  				       u64 len, u64 orig_start, u64 block_start,
134  				       u64 block_len, u64 orig_block_len,
135  				       u64 ram_bytes, int compress_type,
136  				       int type);
137  
138  static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
139  					  u64 root, void *warn_ctx)
140  {
141  	struct data_reloc_warn *warn = warn_ctx;
142  	struct btrfs_fs_info *fs_info = warn->fs_info;
143  	struct extent_buffer *eb;
144  	struct btrfs_inode_item *inode_item;
145  	struct inode_fs_paths *ipath = NULL;
146  	struct btrfs_root *local_root;
147  	struct btrfs_key key;
148  	unsigned int nofs_flag;
149  	u32 nlink;
150  	int ret;
151  
152  	local_root = btrfs_get_fs_root(fs_info, root, true);
153  	if (IS_ERR(local_root)) {
154  		ret = PTR_ERR(local_root);
155  		goto err;
156  	}
157  
158  	/* This makes the path point to (inum INODE_ITEM ioff). */
159  	key.objectid = inum;
160  	key.type = BTRFS_INODE_ITEM_KEY;
161  	key.offset = 0;
162  
163  	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
164  	if (ret) {
165  		btrfs_put_root(local_root);
166  		btrfs_release_path(&warn->path);
167  		goto err;
168  	}
169  
170  	eb = warn->path.nodes[0];
171  	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
172  	nlink = btrfs_inode_nlink(eb, inode_item);
173  	btrfs_release_path(&warn->path);
174  
175  	nofs_flag = memalloc_nofs_save();
176  	ipath = init_ipath(4096, local_root, &warn->path);
177  	memalloc_nofs_restore(nofs_flag);
178  	if (IS_ERR(ipath)) {
179  		btrfs_put_root(local_root);
180  		ret = PTR_ERR(ipath);
181  		ipath = NULL;
182  		/*
183  		 * -ENOMEM, not a critical error, just output an generic error
184  		 * without filename.
185  		 */
186  		btrfs_warn(fs_info,
187  "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
188  			   warn->logical, warn->mirror_num, root, inum, offset);
189  		return ret;
190  	}
191  	ret = paths_from_inode(inum, ipath);
192  	if (ret < 0)
193  		goto err;
194  
195  	/*
196  	 * We deliberately ignore the bit ipath might have been too small to
197  	 * hold all of the paths here
198  	 */
199  	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
200  		btrfs_warn(fs_info,
201  "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
202  			   warn->logical, warn->mirror_num, root, inum, offset,
203  			   fs_info->sectorsize, nlink,
204  			   (char *)(unsigned long)ipath->fspath->val[i]);
205  	}
206  
207  	btrfs_put_root(local_root);
208  	free_ipath(ipath);
209  	return 0;
210  
211  err:
212  	btrfs_warn(fs_info,
213  "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
214  		   warn->logical, warn->mirror_num, root, inum, offset, ret);
215  
216  	free_ipath(ipath);
217  	return ret;
218  }
219  
220  /*
221   * Do extra user-friendly error output (e.g. lookup all the affected files).
222   *
223   * Return true if we succeeded doing the backref lookup.
224   * Return false if such lookup failed, and has to fallback to the old error message.
225   */
226  static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
227  				   const u8 *csum, const u8 *csum_expected,
228  				   int mirror_num)
229  {
230  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
231  	struct btrfs_path path = { 0 };
232  	struct btrfs_key found_key = { 0 };
233  	struct extent_buffer *eb;
234  	struct btrfs_extent_item *ei;
235  	const u32 csum_size = fs_info->csum_size;
236  	u64 logical;
237  	u64 flags;
238  	u32 item_size;
239  	int ret;
240  
241  	mutex_lock(&fs_info->reloc_mutex);
242  	logical = btrfs_get_reloc_bg_bytenr(fs_info);
243  	mutex_unlock(&fs_info->reloc_mutex);
244  
245  	if (logical == U64_MAX) {
246  		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
247  		btrfs_warn_rl(fs_info,
248  "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
249  			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
250  			CSUM_FMT_VALUE(csum_size, csum),
251  			CSUM_FMT_VALUE(csum_size, csum_expected),
252  			mirror_num);
253  		return;
254  	}
255  
256  	logical += file_off;
257  	btrfs_warn_rl(fs_info,
258  "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
259  			inode->root->root_key.objectid,
260  			btrfs_ino(inode), file_off, logical,
261  			CSUM_FMT_VALUE(csum_size, csum),
262  			CSUM_FMT_VALUE(csum_size, csum_expected),
263  			mirror_num);
264  
265  	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
266  	if (ret < 0) {
267  		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
268  			     logical, ret);
269  		return;
270  	}
271  	eb = path.nodes[0];
272  	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
273  	item_size = btrfs_item_size(eb, path.slots[0]);
274  	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
275  		unsigned long ptr = 0;
276  		u64 ref_root;
277  		u8 ref_level;
278  
279  		while (true) {
280  			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
281  						      item_size, &ref_root,
282  						      &ref_level);
283  			if (ret < 0) {
284  				btrfs_warn_rl(fs_info,
285  				"failed to resolve tree backref for logical %llu: %d",
286  					      logical, ret);
287  				break;
288  			}
289  			if (ret > 0)
290  				break;
291  
292  			btrfs_warn_rl(fs_info,
293  "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
294  				logical, mirror_num,
295  				(ref_level ? "node" : "leaf"),
296  				ref_level, ref_root);
297  		}
298  		btrfs_release_path(&path);
299  	} else {
300  		struct btrfs_backref_walk_ctx ctx = { 0 };
301  		struct data_reloc_warn reloc_warn = { 0 };
302  
303  		btrfs_release_path(&path);
304  
305  		ctx.bytenr = found_key.objectid;
306  		ctx.extent_item_pos = logical - found_key.objectid;
307  		ctx.fs_info = fs_info;
308  
309  		reloc_warn.logical = logical;
310  		reloc_warn.extent_item_size = found_key.offset;
311  		reloc_warn.mirror_num = mirror_num;
312  		reloc_warn.fs_info = fs_info;
313  
314  		iterate_extent_inodes(&ctx, true,
315  				      data_reloc_print_warning_inode, &reloc_warn);
316  	}
317  }
318  
319  static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
320  		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
321  {
322  	struct btrfs_root *root = inode->root;
323  	const u32 csum_size = root->fs_info->csum_size;
324  
325  	/* For data reloc tree, it's better to do a backref lookup instead. */
326  	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
327  		return print_data_reloc_error(inode, logical_start, csum,
328  					      csum_expected, mirror_num);
329  
330  	/* Output without objectid, which is more meaningful */
331  	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
332  		btrfs_warn_rl(root->fs_info,
333  "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
334  			root->root_key.objectid, btrfs_ino(inode),
335  			logical_start,
336  			CSUM_FMT_VALUE(csum_size, csum),
337  			CSUM_FMT_VALUE(csum_size, csum_expected),
338  			mirror_num);
339  	} else {
340  		btrfs_warn_rl(root->fs_info,
341  "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
342  			root->root_key.objectid, btrfs_ino(inode),
343  			logical_start,
344  			CSUM_FMT_VALUE(csum_size, csum),
345  			CSUM_FMT_VALUE(csum_size, csum_expected),
346  			mirror_num);
347  	}
348  }
349  
350  /*
351   * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
352   *
353   * ilock_flags can have the following bit set:
354   *
355   * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
356   * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
357   *		     return -EAGAIN
358   * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
359   */
360  int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
361  {
362  	if (ilock_flags & BTRFS_ILOCK_SHARED) {
363  		if (ilock_flags & BTRFS_ILOCK_TRY) {
364  			if (!inode_trylock_shared(&inode->vfs_inode))
365  				return -EAGAIN;
366  			else
367  				return 0;
368  		}
369  		inode_lock_shared(&inode->vfs_inode);
370  	} else {
371  		if (ilock_flags & BTRFS_ILOCK_TRY) {
372  			if (!inode_trylock(&inode->vfs_inode))
373  				return -EAGAIN;
374  			else
375  				return 0;
376  		}
377  		inode_lock(&inode->vfs_inode);
378  	}
379  	if (ilock_flags & BTRFS_ILOCK_MMAP)
380  		down_write(&inode->i_mmap_lock);
381  	return 0;
382  }
383  
384  /*
385   * btrfs_inode_unlock - unock inode i_rwsem
386   *
387   * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
388   * to decide whether the lock acquired is shared or exclusive.
389   */
390  void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
391  {
392  	if (ilock_flags & BTRFS_ILOCK_MMAP)
393  		up_write(&inode->i_mmap_lock);
394  	if (ilock_flags & BTRFS_ILOCK_SHARED)
395  		inode_unlock_shared(&inode->vfs_inode);
396  	else
397  		inode_unlock(&inode->vfs_inode);
398  }
399  
400  /*
401   * Cleanup all submitted ordered extents in specified range to handle errors
402   * from the btrfs_run_delalloc_range() callback.
403   *
404   * NOTE: caller must ensure that when an error happens, it can not call
405   * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
406   * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
407   * to be released, which we want to happen only when finishing the ordered
408   * extent (btrfs_finish_ordered_io()).
409   */
410  static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
411  						 struct page *locked_page,
412  						 u64 offset, u64 bytes)
413  {
414  	unsigned long index = offset >> PAGE_SHIFT;
415  	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
416  	u64 page_start = 0, page_end = 0;
417  	struct page *page;
418  
419  	if (locked_page) {
420  		page_start = page_offset(locked_page);
421  		page_end = page_start + PAGE_SIZE - 1;
422  	}
423  
424  	while (index <= end_index) {
425  		/*
426  		 * For locked page, we will call btrfs_mark_ordered_io_finished
427  		 * through btrfs_mark_ordered_io_finished() on it
428  		 * in run_delalloc_range() for the error handling, which will
429  		 * clear page Ordered and run the ordered extent accounting.
430  		 *
431  		 * Here we can't just clear the Ordered bit, or
432  		 * btrfs_mark_ordered_io_finished() would skip the accounting
433  		 * for the page range, and the ordered extent will never finish.
434  		 */
435  		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
436  			index++;
437  			continue;
438  		}
439  		page = find_get_page(inode->vfs_inode.i_mapping, index);
440  		index++;
441  		if (!page)
442  			continue;
443  
444  		/*
445  		 * Here we just clear all Ordered bits for every page in the
446  		 * range, then btrfs_mark_ordered_io_finished() will handle
447  		 * the ordered extent accounting for the range.
448  		 */
449  		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
450  					       offset, bytes);
451  		put_page(page);
452  	}
453  
454  	if (locked_page) {
455  		/* The locked page covers the full range, nothing needs to be done */
456  		if (bytes + offset <= page_start + PAGE_SIZE)
457  			return;
458  		/*
459  		 * In case this page belongs to the delalloc range being
460  		 * instantiated then skip it, since the first page of a range is
461  		 * going to be properly cleaned up by the caller of
462  		 * run_delalloc_range
463  		 */
464  		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
465  			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
466  			offset = page_offset(locked_page) + PAGE_SIZE;
467  		}
468  	}
469  
470  	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
471  }
472  
473  static int btrfs_dirty_inode(struct btrfs_inode *inode);
474  
475  static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
476  				     struct btrfs_new_inode_args *args)
477  {
478  	int err;
479  
480  	if (args->default_acl) {
481  		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
482  				      ACL_TYPE_DEFAULT);
483  		if (err)
484  			return err;
485  	}
486  	if (args->acl) {
487  		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
488  		if (err)
489  			return err;
490  	}
491  	if (!args->default_acl && !args->acl)
492  		cache_no_acl(args->inode);
493  	return btrfs_xattr_security_init(trans, args->inode, args->dir,
494  					 &args->dentry->d_name);
495  }
496  
497  /*
498   * this does all the hard work for inserting an inline extent into
499   * the btree.  The caller should have done a btrfs_drop_extents so that
500   * no overlapping inline items exist in the btree
501   */
502  static int insert_inline_extent(struct btrfs_trans_handle *trans,
503  				struct btrfs_path *path,
504  				struct btrfs_inode *inode, bool extent_inserted,
505  				size_t size, size_t compressed_size,
506  				int compress_type,
507  				struct page **compressed_pages,
508  				bool update_i_size)
509  {
510  	struct btrfs_root *root = inode->root;
511  	struct extent_buffer *leaf;
512  	struct page *page = NULL;
513  	char *kaddr;
514  	unsigned long ptr;
515  	struct btrfs_file_extent_item *ei;
516  	int ret;
517  	size_t cur_size = size;
518  	u64 i_size;
519  
520  	ASSERT((compressed_size > 0 && compressed_pages) ||
521  	       (compressed_size == 0 && !compressed_pages));
522  
523  	if (compressed_size && compressed_pages)
524  		cur_size = compressed_size;
525  
526  	if (!extent_inserted) {
527  		struct btrfs_key key;
528  		size_t datasize;
529  
530  		key.objectid = btrfs_ino(inode);
531  		key.offset = 0;
532  		key.type = BTRFS_EXTENT_DATA_KEY;
533  
534  		datasize = btrfs_file_extent_calc_inline_size(cur_size);
535  		ret = btrfs_insert_empty_item(trans, root, path, &key,
536  					      datasize);
537  		if (ret)
538  			goto fail;
539  	}
540  	leaf = path->nodes[0];
541  	ei = btrfs_item_ptr(leaf, path->slots[0],
542  			    struct btrfs_file_extent_item);
543  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
544  	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
545  	btrfs_set_file_extent_encryption(leaf, ei, 0);
546  	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
547  	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
548  	ptr = btrfs_file_extent_inline_start(ei);
549  
550  	if (compress_type != BTRFS_COMPRESS_NONE) {
551  		struct page *cpage;
552  		int i = 0;
553  		while (compressed_size > 0) {
554  			cpage = compressed_pages[i];
555  			cur_size = min_t(unsigned long, compressed_size,
556  				       PAGE_SIZE);
557  
558  			kaddr = kmap_local_page(cpage);
559  			write_extent_buffer(leaf, kaddr, ptr, cur_size);
560  			kunmap_local(kaddr);
561  
562  			i++;
563  			ptr += cur_size;
564  			compressed_size -= cur_size;
565  		}
566  		btrfs_set_file_extent_compression(leaf, ei,
567  						  compress_type);
568  	} else {
569  		page = find_get_page(inode->vfs_inode.i_mapping, 0);
570  		btrfs_set_file_extent_compression(leaf, ei, 0);
571  		kaddr = kmap_local_page(page);
572  		write_extent_buffer(leaf, kaddr, ptr, size);
573  		kunmap_local(kaddr);
574  		put_page(page);
575  	}
576  	btrfs_mark_buffer_dirty(trans, leaf);
577  	btrfs_release_path(path);
578  
579  	/*
580  	 * We align size to sectorsize for inline extents just for simplicity
581  	 * sake.
582  	 */
583  	ret = btrfs_inode_set_file_extent_range(inode, 0,
584  					ALIGN(size, root->fs_info->sectorsize));
585  	if (ret)
586  		goto fail;
587  
588  	/*
589  	 * We're an inline extent, so nobody can extend the file past i_size
590  	 * without locking a page we already have locked.
591  	 *
592  	 * We must do any i_size and inode updates before we unlock the pages.
593  	 * Otherwise we could end up racing with unlink.
594  	 */
595  	i_size = i_size_read(&inode->vfs_inode);
596  	if (update_i_size && size > i_size) {
597  		i_size_write(&inode->vfs_inode, size);
598  		i_size = size;
599  	}
600  	inode->disk_i_size = i_size;
601  
602  fail:
603  	return ret;
604  }
605  
606  
607  /*
608   * conditionally insert an inline extent into the file.  This
609   * does the checks required to make sure the data is small enough
610   * to fit as an inline extent.
611   */
612  static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
613  					  size_t compressed_size,
614  					  int compress_type,
615  					  struct page **compressed_pages,
616  					  bool update_i_size)
617  {
618  	struct btrfs_drop_extents_args drop_args = { 0 };
619  	struct btrfs_root *root = inode->root;
620  	struct btrfs_fs_info *fs_info = root->fs_info;
621  	struct btrfs_trans_handle *trans;
622  	u64 data_len = (compressed_size ?: size);
623  	int ret;
624  	struct btrfs_path *path;
625  
626  	/*
627  	 * We can create an inline extent if it ends at or beyond the current
628  	 * i_size, is no larger than a sector (decompressed), and the (possibly
629  	 * compressed) data fits in a leaf and the configured maximum inline
630  	 * size.
631  	 */
632  	if (size < i_size_read(&inode->vfs_inode) ||
633  	    size > fs_info->sectorsize ||
634  	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
635  	    data_len > fs_info->max_inline)
636  		return 1;
637  
638  	path = btrfs_alloc_path();
639  	if (!path)
640  		return -ENOMEM;
641  
642  	trans = btrfs_join_transaction(root);
643  	if (IS_ERR(trans)) {
644  		btrfs_free_path(path);
645  		return PTR_ERR(trans);
646  	}
647  	trans->block_rsv = &inode->block_rsv;
648  
649  	drop_args.path = path;
650  	drop_args.start = 0;
651  	drop_args.end = fs_info->sectorsize;
652  	drop_args.drop_cache = true;
653  	drop_args.replace_extent = true;
654  	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
655  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
656  	if (ret) {
657  		btrfs_abort_transaction(trans, ret);
658  		goto out;
659  	}
660  
661  	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
662  				   size, compressed_size, compress_type,
663  				   compressed_pages, update_i_size);
664  	if (ret && ret != -ENOSPC) {
665  		btrfs_abort_transaction(trans, ret);
666  		goto out;
667  	} else if (ret == -ENOSPC) {
668  		ret = 1;
669  		goto out;
670  	}
671  
672  	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
673  	ret = btrfs_update_inode(trans, root, inode);
674  	if (ret && ret != -ENOSPC) {
675  		btrfs_abort_transaction(trans, ret);
676  		goto out;
677  	} else if (ret == -ENOSPC) {
678  		ret = 1;
679  		goto out;
680  	}
681  
682  	btrfs_set_inode_full_sync(inode);
683  out:
684  	/*
685  	 * Don't forget to free the reserved space, as for inlined extent
686  	 * it won't count as data extent, free them directly here.
687  	 * And at reserve time, it's always aligned to page size, so
688  	 * just free one page here.
689  	 */
690  	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
691  	btrfs_free_path(path);
692  	btrfs_end_transaction(trans);
693  	return ret;
694  }
695  
696  struct async_extent {
697  	u64 start;
698  	u64 ram_size;
699  	u64 compressed_size;
700  	struct page **pages;
701  	unsigned long nr_pages;
702  	int compress_type;
703  	struct list_head list;
704  };
705  
706  struct async_chunk {
707  	struct btrfs_inode *inode;
708  	struct page *locked_page;
709  	u64 start;
710  	u64 end;
711  	blk_opf_t write_flags;
712  	struct list_head extents;
713  	struct cgroup_subsys_state *blkcg_css;
714  	struct btrfs_work work;
715  	struct async_cow *async_cow;
716  };
717  
718  struct async_cow {
719  	atomic_t num_chunks;
720  	struct async_chunk chunks[];
721  };
722  
723  static noinline int add_async_extent(struct async_chunk *cow,
724  				     u64 start, u64 ram_size,
725  				     u64 compressed_size,
726  				     struct page **pages,
727  				     unsigned long nr_pages,
728  				     int compress_type)
729  {
730  	struct async_extent *async_extent;
731  
732  	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
733  	if (!async_extent)
734  		return -ENOMEM;
735  	async_extent->start = start;
736  	async_extent->ram_size = ram_size;
737  	async_extent->compressed_size = compressed_size;
738  	async_extent->pages = pages;
739  	async_extent->nr_pages = nr_pages;
740  	async_extent->compress_type = compress_type;
741  	list_add_tail(&async_extent->list, &cow->extents);
742  	return 0;
743  }
744  
745  /*
746   * Check if the inode needs to be submitted to compression, based on mount
747   * options, defragmentation, properties or heuristics.
748   */
749  static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
750  				      u64 end)
751  {
752  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
753  
754  	if (!btrfs_inode_can_compress(inode)) {
755  		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
756  			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
757  			btrfs_ino(inode));
758  		return 0;
759  	}
760  	/*
761  	 * Special check for subpage.
762  	 *
763  	 * We lock the full page then run each delalloc range in the page, thus
764  	 * for the following case, we will hit some subpage specific corner case:
765  	 *
766  	 * 0		32K		64K
767  	 * |	|///////|	|///////|
768  	 *		\- A		\- B
769  	 *
770  	 * In above case, both range A and range B will try to unlock the full
771  	 * page [0, 64K), causing the one finished later will have page
772  	 * unlocked already, triggering various page lock requirement BUG_ON()s.
773  	 *
774  	 * So here we add an artificial limit that subpage compression can only
775  	 * if the range is fully page aligned.
776  	 *
777  	 * In theory we only need to ensure the first page is fully covered, but
778  	 * the tailing partial page will be locked until the full compression
779  	 * finishes, delaying the write of other range.
780  	 *
781  	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
782  	 * first to prevent any submitted async extent to unlock the full page.
783  	 * By this, we can ensure for subpage case that only the last async_cow
784  	 * will unlock the full page.
785  	 */
786  	if (fs_info->sectorsize < PAGE_SIZE) {
787  		if (!PAGE_ALIGNED(start) ||
788  		    !PAGE_ALIGNED(end + 1))
789  			return 0;
790  	}
791  
792  	/* force compress */
793  	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
794  		return 1;
795  	/* defrag ioctl */
796  	if (inode->defrag_compress)
797  		return 1;
798  	/* bad compression ratios */
799  	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
800  		return 0;
801  	if (btrfs_test_opt(fs_info, COMPRESS) ||
802  	    inode->flags & BTRFS_INODE_COMPRESS ||
803  	    inode->prop_compress)
804  		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
805  	return 0;
806  }
807  
808  static inline void inode_should_defrag(struct btrfs_inode *inode,
809  		u64 start, u64 end, u64 num_bytes, u32 small_write)
810  {
811  	/* If this is a small write inside eof, kick off a defrag */
812  	if (num_bytes < small_write &&
813  	    (start > 0 || end + 1 < inode->disk_i_size))
814  		btrfs_add_inode_defrag(NULL, inode, small_write);
815  }
816  
817  /*
818   * Work queue call back to started compression on a file and pages.
819   *
820   * This is done inside an ordered work queue, and the compression is spread
821   * across many cpus.  The actual IO submission is step two, and the ordered work
822   * queue takes care of making sure that happens in the same order things were
823   * put onto the queue by writepages and friends.
824   *
825   * If this code finds it can't get good compression, it puts an entry onto the
826   * work queue to write the uncompressed bytes.  This makes sure that both
827   * compressed inodes and uncompressed inodes are written in the same order that
828   * the flusher thread sent them down.
829   */
830  static void compress_file_range(struct btrfs_work *work)
831  {
832  	struct async_chunk *async_chunk =
833  		container_of(work, struct async_chunk, work);
834  	struct btrfs_inode *inode = async_chunk->inode;
835  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
836  	struct address_space *mapping = inode->vfs_inode.i_mapping;
837  	u64 blocksize = fs_info->sectorsize;
838  	u64 start = async_chunk->start;
839  	u64 end = async_chunk->end;
840  	u64 actual_end;
841  	u64 i_size;
842  	int ret = 0;
843  	struct page **pages;
844  	unsigned long nr_pages;
845  	unsigned long total_compressed = 0;
846  	unsigned long total_in = 0;
847  	unsigned int poff;
848  	int i;
849  	int compress_type = fs_info->compress_type;
850  
851  	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
852  
853  	/*
854  	 * We need to call clear_page_dirty_for_io on each page in the range.
855  	 * Otherwise applications with the file mmap'd can wander in and change
856  	 * the page contents while we are compressing them.
857  	 */
858  	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
859  
860  	/*
861  	 * We need to save i_size before now because it could change in between
862  	 * us evaluating the size and assigning it.  This is because we lock and
863  	 * unlock the page in truncate and fallocate, and then modify the i_size
864  	 * later on.
865  	 *
866  	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
867  	 * does that for us.
868  	 */
869  	barrier();
870  	i_size = i_size_read(&inode->vfs_inode);
871  	barrier();
872  	actual_end = min_t(u64, i_size, end + 1);
873  again:
874  	pages = NULL;
875  	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
876  	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
877  
878  	/*
879  	 * we don't want to send crud past the end of i_size through
880  	 * compression, that's just a waste of CPU time.  So, if the
881  	 * end of the file is before the start of our current
882  	 * requested range of bytes, we bail out to the uncompressed
883  	 * cleanup code that can deal with all of this.
884  	 *
885  	 * It isn't really the fastest way to fix things, but this is a
886  	 * very uncommon corner.
887  	 */
888  	if (actual_end <= start)
889  		goto cleanup_and_bail_uncompressed;
890  
891  	total_compressed = actual_end - start;
892  
893  	/*
894  	 * Skip compression for a small file range(<=blocksize) that
895  	 * isn't an inline extent, since it doesn't save disk space at all.
896  	 */
897  	if (total_compressed <= blocksize &&
898  	   (start > 0 || end + 1 < inode->disk_i_size))
899  		goto cleanup_and_bail_uncompressed;
900  
901  	/*
902  	 * For subpage case, we require full page alignment for the sector
903  	 * aligned range.
904  	 * Thus we must also check against @actual_end, not just @end.
905  	 */
906  	if (blocksize < PAGE_SIZE) {
907  		if (!PAGE_ALIGNED(start) ||
908  		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
909  			goto cleanup_and_bail_uncompressed;
910  	}
911  
912  	total_compressed = min_t(unsigned long, total_compressed,
913  			BTRFS_MAX_UNCOMPRESSED);
914  	total_in = 0;
915  	ret = 0;
916  
917  	/*
918  	 * We do compression for mount -o compress and when the inode has not
919  	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
920  	 * discover bad compression ratios.
921  	 */
922  	if (!inode_need_compress(inode, start, end))
923  		goto cleanup_and_bail_uncompressed;
924  
925  	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
926  	if (!pages) {
927  		/*
928  		 * Memory allocation failure is not a fatal error, we can fall
929  		 * back to uncompressed code.
930  		 */
931  		goto cleanup_and_bail_uncompressed;
932  	}
933  
934  	if (inode->defrag_compress)
935  		compress_type = inode->defrag_compress;
936  	else if (inode->prop_compress)
937  		compress_type = inode->prop_compress;
938  
939  	/* Compression level is applied here. */
940  	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
941  				   mapping, start, pages, &nr_pages, &total_in,
942  				   &total_compressed);
943  	if (ret)
944  		goto mark_incompressible;
945  
946  	/*
947  	 * Zero the tail end of the last page, as we might be sending it down
948  	 * to disk.
949  	 */
950  	poff = offset_in_page(total_compressed);
951  	if (poff)
952  		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
953  
954  	/*
955  	 * Try to create an inline extent.
956  	 *
957  	 * If we didn't compress the entire range, try to create an uncompressed
958  	 * inline extent, else a compressed one.
959  	 *
960  	 * Check cow_file_range() for why we don't even try to create inline
961  	 * extent for the subpage case.
962  	 */
963  	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
964  		if (total_in < actual_end) {
965  			ret = cow_file_range_inline(inode, actual_end, 0,
966  						    BTRFS_COMPRESS_NONE, NULL,
967  						    false);
968  		} else {
969  			ret = cow_file_range_inline(inode, actual_end,
970  						    total_compressed,
971  						    compress_type, pages,
972  						    false);
973  		}
974  		if (ret <= 0) {
975  			unsigned long clear_flags = EXTENT_DELALLOC |
976  				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
977  				EXTENT_DO_ACCOUNTING;
978  
979  			if (ret < 0)
980  				mapping_set_error(mapping, -EIO);
981  
982  			/*
983  			 * inline extent creation worked or returned error,
984  			 * we don't need to create any more async work items.
985  			 * Unlock and free up our temp pages.
986  			 *
987  			 * We use DO_ACCOUNTING here because we need the
988  			 * delalloc_release_metadata to be done _after_ we drop
989  			 * our outstanding extent for clearing delalloc for this
990  			 * range.
991  			 */
992  			extent_clear_unlock_delalloc(inode, start, end,
993  						     NULL,
994  						     clear_flags,
995  						     PAGE_UNLOCK |
996  						     PAGE_START_WRITEBACK |
997  						     PAGE_END_WRITEBACK);
998  			goto free_pages;
999  		}
1000  	}
1001  
1002  	/*
1003  	 * We aren't doing an inline extent. Round the compressed size up to a
1004  	 * block size boundary so the allocator does sane things.
1005  	 */
1006  	total_compressed = ALIGN(total_compressed, blocksize);
1007  
1008  	/*
1009  	 * One last check to make sure the compression is really a win, compare
1010  	 * the page count read with the blocks on disk, compression must free at
1011  	 * least one sector.
1012  	 */
1013  	total_in = round_up(total_in, fs_info->sectorsize);
1014  	if (total_compressed + blocksize > total_in)
1015  		goto mark_incompressible;
1016  
1017  	/*
1018  	 * The async work queues will take care of doing actual allocation on
1019  	 * disk for these compressed pages, and will submit the bios.
1020  	 */
1021  	ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages,
1022  			       nr_pages, compress_type);
1023  	BUG_ON(ret);
1024  	if (start + total_in < end) {
1025  		start += total_in;
1026  		cond_resched();
1027  		goto again;
1028  	}
1029  	return;
1030  
1031  mark_incompressible:
1032  	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1033  		inode->flags |= BTRFS_INODE_NOCOMPRESS;
1034  cleanup_and_bail_uncompressed:
1035  	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1036  			       BTRFS_COMPRESS_NONE);
1037  	BUG_ON(ret);
1038  free_pages:
1039  	if (pages) {
1040  		for (i = 0; i < nr_pages; i++) {
1041  			WARN_ON(pages[i]->mapping);
1042  			put_page(pages[i]);
1043  		}
1044  		kfree(pages);
1045  	}
1046  }
1047  
1048  static void free_async_extent_pages(struct async_extent *async_extent)
1049  {
1050  	int i;
1051  
1052  	if (!async_extent->pages)
1053  		return;
1054  
1055  	for (i = 0; i < async_extent->nr_pages; i++) {
1056  		WARN_ON(async_extent->pages[i]->mapping);
1057  		put_page(async_extent->pages[i]);
1058  	}
1059  	kfree(async_extent->pages);
1060  	async_extent->nr_pages = 0;
1061  	async_extent->pages = NULL;
1062  }
1063  
1064  static void submit_uncompressed_range(struct btrfs_inode *inode,
1065  				      struct async_extent *async_extent,
1066  				      struct page *locked_page)
1067  {
1068  	u64 start = async_extent->start;
1069  	u64 end = async_extent->start + async_extent->ram_size - 1;
1070  	int ret;
1071  	struct writeback_control wbc = {
1072  		.sync_mode		= WB_SYNC_ALL,
1073  		.range_start		= start,
1074  		.range_end		= end,
1075  		.no_cgroup_owner	= 1,
1076  	};
1077  
1078  	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1079  	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
1080  	wbc_detach_inode(&wbc);
1081  	if (ret < 0) {
1082  		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
1083  		if (locked_page) {
1084  			const u64 page_start = page_offset(locked_page);
1085  
1086  			set_page_writeback(locked_page);
1087  			end_page_writeback(locked_page);
1088  			btrfs_mark_ordered_io_finished(inode, locked_page,
1089  						       page_start, PAGE_SIZE,
1090  						       !ret);
1091  			mapping_set_error(locked_page->mapping, ret);
1092  			unlock_page(locked_page);
1093  		}
1094  	}
1095  }
1096  
1097  static void submit_one_async_extent(struct async_chunk *async_chunk,
1098  				    struct async_extent *async_extent,
1099  				    u64 *alloc_hint)
1100  {
1101  	struct btrfs_inode *inode = async_chunk->inode;
1102  	struct extent_io_tree *io_tree = &inode->io_tree;
1103  	struct btrfs_root *root = inode->root;
1104  	struct btrfs_fs_info *fs_info = root->fs_info;
1105  	struct btrfs_ordered_extent *ordered;
1106  	struct btrfs_key ins;
1107  	struct page *locked_page = NULL;
1108  	struct extent_map *em;
1109  	int ret = 0;
1110  	u64 start = async_extent->start;
1111  	u64 end = async_extent->start + async_extent->ram_size - 1;
1112  
1113  	if (async_chunk->blkcg_css)
1114  		kthread_associate_blkcg(async_chunk->blkcg_css);
1115  
1116  	/*
1117  	 * If async_chunk->locked_page is in the async_extent range, we need to
1118  	 * handle it.
1119  	 */
1120  	if (async_chunk->locked_page) {
1121  		u64 locked_page_start = page_offset(async_chunk->locked_page);
1122  		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
1123  
1124  		if (!(start >= locked_page_end || end <= locked_page_start))
1125  			locked_page = async_chunk->locked_page;
1126  	}
1127  	lock_extent(io_tree, start, end, NULL);
1128  
1129  	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1130  		submit_uncompressed_range(inode, async_extent, locked_page);
1131  		goto done;
1132  	}
1133  
1134  	ret = btrfs_reserve_extent(root, async_extent->ram_size,
1135  				   async_extent->compressed_size,
1136  				   async_extent->compressed_size,
1137  				   0, *alloc_hint, &ins, 1, 1);
1138  	if (ret) {
1139  		/*
1140  		 * We can't reserve contiguous space for the compressed size.
1141  		 * Unlikely, but it's possible that we could have enough
1142  		 * non-contiguous space for the uncompressed size instead.  So
1143  		 * fall back to uncompressed.
1144  		 */
1145  		submit_uncompressed_range(inode, async_extent, locked_page);
1146  		goto done;
1147  	}
1148  
1149  	/* Here we're doing allocation and writeback of the compressed pages */
1150  	em = create_io_em(inode, start,
1151  			  async_extent->ram_size,	/* len */
1152  			  start,			/* orig_start */
1153  			  ins.objectid,			/* block_start */
1154  			  ins.offset,			/* block_len */
1155  			  ins.offset,			/* orig_block_len */
1156  			  async_extent->ram_size,	/* ram_bytes */
1157  			  async_extent->compress_type,
1158  			  BTRFS_ORDERED_COMPRESSED);
1159  	if (IS_ERR(em)) {
1160  		ret = PTR_ERR(em);
1161  		goto out_free_reserve;
1162  	}
1163  	free_extent_map(em);
1164  
1165  	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
1166  				       async_extent->ram_size,	/* num_bytes */
1167  				       async_extent->ram_size,	/* ram_bytes */
1168  				       ins.objectid,		/* disk_bytenr */
1169  				       ins.offset,		/* disk_num_bytes */
1170  				       0,			/* offset */
1171  				       1 << BTRFS_ORDERED_COMPRESSED,
1172  				       async_extent->compress_type);
1173  	if (IS_ERR(ordered)) {
1174  		btrfs_drop_extent_map_range(inode, start, end, false);
1175  		ret = PTR_ERR(ordered);
1176  		goto out_free_reserve;
1177  	}
1178  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1179  
1180  	/* Clear dirty, set writeback and unlock the pages. */
1181  	extent_clear_unlock_delalloc(inode, start, end,
1182  			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
1183  			PAGE_UNLOCK | PAGE_START_WRITEBACK);
1184  	btrfs_submit_compressed_write(ordered,
1185  			    async_extent->pages,	/* compressed_pages */
1186  			    async_extent->nr_pages,
1187  			    async_chunk->write_flags, true);
1188  	*alloc_hint = ins.objectid + ins.offset;
1189  done:
1190  	if (async_chunk->blkcg_css)
1191  		kthread_associate_blkcg(NULL);
1192  	kfree(async_extent);
1193  	return;
1194  
1195  out_free_reserve:
1196  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1197  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1198  	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1199  	extent_clear_unlock_delalloc(inode, start, end,
1200  				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1201  				     EXTENT_DELALLOC_NEW |
1202  				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1203  				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1204  				     PAGE_END_WRITEBACK);
1205  	free_async_extent_pages(async_extent);
1206  	if (async_chunk->blkcg_css)
1207  		kthread_associate_blkcg(NULL);
1208  	btrfs_debug(fs_info,
1209  "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1210  		    root->root_key.objectid, btrfs_ino(inode), start,
1211  		    async_extent->ram_size, ret);
1212  	kfree(async_extent);
1213  }
1214  
1215  static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1216  				      u64 num_bytes)
1217  {
1218  	struct extent_map_tree *em_tree = &inode->extent_tree;
1219  	struct extent_map *em;
1220  	u64 alloc_hint = 0;
1221  
1222  	read_lock(&em_tree->lock);
1223  	em = search_extent_mapping(em_tree, start, num_bytes);
1224  	if (em) {
1225  		/*
1226  		 * if block start isn't an actual block number then find the
1227  		 * first block in this inode and use that as a hint.  If that
1228  		 * block is also bogus then just don't worry about it.
1229  		 */
1230  		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1231  			free_extent_map(em);
1232  			em = search_extent_mapping(em_tree, 0, 0);
1233  			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1234  				alloc_hint = em->block_start;
1235  			if (em)
1236  				free_extent_map(em);
1237  		} else {
1238  			alloc_hint = em->block_start;
1239  			free_extent_map(em);
1240  		}
1241  	}
1242  	read_unlock(&em_tree->lock);
1243  
1244  	return alloc_hint;
1245  }
1246  
1247  /*
1248   * when extent_io.c finds a delayed allocation range in the file,
1249   * the call backs end up in this code.  The basic idea is to
1250   * allocate extents on disk for the range, and create ordered data structs
1251   * in ram to track those extents.
1252   *
1253   * locked_page is the page that writepage had locked already.  We use
1254   * it to make sure we don't do extra locks or unlocks.
1255   *
1256   * When this function fails, it unlocks all pages except @locked_page.
1257   *
1258   * When this function successfully creates an inline extent, it returns 1 and
1259   * unlocks all pages including locked_page and starts I/O on them.
1260   * (In reality inline extents are limited to a single page, so locked_page is
1261   * the only page handled anyway).
1262   *
1263   * When this function succeed and creates a normal extent, the page locking
1264   * status depends on the passed in flags:
1265   *
1266   * - If @keep_locked is set, all pages are kept locked.
1267   * - Else all pages except for @locked_page are unlocked.
1268   *
1269   * When a failure happens in the second or later iteration of the
1270   * while-loop, the ordered extents created in previous iterations are kept
1271   * intact. So, the caller must clean them up by calling
1272   * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1273   * example.
1274   */
1275  static noinline int cow_file_range(struct btrfs_inode *inode,
1276  				   struct page *locked_page, u64 start, u64 end,
1277  				   u64 *done_offset,
1278  				   bool keep_locked, bool no_inline)
1279  {
1280  	struct btrfs_root *root = inode->root;
1281  	struct btrfs_fs_info *fs_info = root->fs_info;
1282  	u64 alloc_hint = 0;
1283  	u64 orig_start = start;
1284  	u64 num_bytes;
1285  	unsigned long ram_size;
1286  	u64 cur_alloc_size = 0;
1287  	u64 min_alloc_size;
1288  	u64 blocksize = fs_info->sectorsize;
1289  	struct btrfs_key ins;
1290  	struct extent_map *em;
1291  	unsigned clear_bits;
1292  	unsigned long page_ops;
1293  	bool extent_reserved = false;
1294  	int ret = 0;
1295  
1296  	if (btrfs_is_free_space_inode(inode)) {
1297  		ret = -EINVAL;
1298  		goto out_unlock;
1299  	}
1300  
1301  	num_bytes = ALIGN(end - start + 1, blocksize);
1302  	num_bytes = max(blocksize,  num_bytes);
1303  	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1304  
1305  	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1306  
1307  	/*
1308  	 * Due to the page size limit, for subpage we can only trigger the
1309  	 * writeback for the dirty sectors of page, that means data writeback
1310  	 * is doing more writeback than what we want.
1311  	 *
1312  	 * This is especially unexpected for some call sites like fallocate,
1313  	 * where we only increase i_size after everything is done.
1314  	 * This means we can trigger inline extent even if we didn't want to.
1315  	 * So here we skip inline extent creation completely.
1316  	 */
1317  	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
1318  		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1319  				       end + 1);
1320  
1321  		/* lets try to make an inline extent */
1322  		ret = cow_file_range_inline(inode, actual_end, 0,
1323  					    BTRFS_COMPRESS_NONE, NULL, false);
1324  		if (ret == 0) {
1325  			/*
1326  			 * We use DO_ACCOUNTING here because we need the
1327  			 * delalloc_release_metadata to be run _after_ we drop
1328  			 * our outstanding extent for clearing delalloc for this
1329  			 * range.
1330  			 */
1331  			extent_clear_unlock_delalloc(inode, start, end,
1332  				     locked_page,
1333  				     EXTENT_LOCKED | EXTENT_DELALLOC |
1334  				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1335  				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1336  				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1337  			/*
1338  			 * locked_page is locked by the caller of
1339  			 * writepage_delalloc(), not locked by
1340  			 * __process_pages_contig().
1341  			 *
1342  			 * We can't let __process_pages_contig() to unlock it,
1343  			 * as it doesn't have any subpage::writers recorded.
1344  			 *
1345  			 * Here we manually unlock the page, since the caller
1346  			 * can't determine if it's an inline extent or a
1347  			 * compressed extent.
1348  			 */
1349  			unlock_page(locked_page);
1350  			ret = 1;
1351  			goto done;
1352  		} else if (ret < 0) {
1353  			goto out_unlock;
1354  		}
1355  	}
1356  
1357  	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1358  
1359  	/*
1360  	 * Relocation relies on the relocated extents to have exactly the same
1361  	 * size as the original extents. Normally writeback for relocation data
1362  	 * extents follows a NOCOW path because relocation preallocates the
1363  	 * extents. However, due to an operation such as scrub turning a block
1364  	 * group to RO mode, it may fallback to COW mode, so we must make sure
1365  	 * an extent allocated during COW has exactly the requested size and can
1366  	 * not be split into smaller extents, otherwise relocation breaks and
1367  	 * fails during the stage where it updates the bytenr of file extent
1368  	 * items.
1369  	 */
1370  	if (btrfs_is_data_reloc_root(root))
1371  		min_alloc_size = num_bytes;
1372  	else
1373  		min_alloc_size = fs_info->sectorsize;
1374  
1375  	while (num_bytes > 0) {
1376  		struct btrfs_ordered_extent *ordered;
1377  
1378  		cur_alloc_size = num_bytes;
1379  		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1380  					   min_alloc_size, 0, alloc_hint,
1381  					   &ins, 1, 1);
1382  		if (ret == -EAGAIN) {
1383  			/*
1384  			 * btrfs_reserve_extent only returns -EAGAIN for zoned
1385  			 * file systems, which is an indication that there are
1386  			 * no active zones to allocate from at the moment.
1387  			 *
1388  			 * If this is the first loop iteration, wait for at
1389  			 * least one zone to finish before retrying the
1390  			 * allocation.  Otherwise ask the caller to write out
1391  			 * the already allocated blocks before coming back to
1392  			 * us, or return -ENOSPC if it can't handle retries.
1393  			 */
1394  			ASSERT(btrfs_is_zoned(fs_info));
1395  			if (start == orig_start) {
1396  				wait_on_bit_io(&inode->root->fs_info->flags,
1397  					       BTRFS_FS_NEED_ZONE_FINISH,
1398  					       TASK_UNINTERRUPTIBLE);
1399  				continue;
1400  			}
1401  			if (done_offset) {
1402  				*done_offset = start - 1;
1403  				return 0;
1404  			}
1405  			ret = -ENOSPC;
1406  		}
1407  		if (ret < 0)
1408  			goto out_unlock;
1409  		cur_alloc_size = ins.offset;
1410  		extent_reserved = true;
1411  
1412  		ram_size = ins.offset;
1413  		em = create_io_em(inode, start, ins.offset, /* len */
1414  				  start, /* orig_start */
1415  				  ins.objectid, /* block_start */
1416  				  ins.offset, /* block_len */
1417  				  ins.offset, /* orig_block_len */
1418  				  ram_size, /* ram_bytes */
1419  				  BTRFS_COMPRESS_NONE, /* compress_type */
1420  				  BTRFS_ORDERED_REGULAR /* type */);
1421  		if (IS_ERR(em)) {
1422  			ret = PTR_ERR(em);
1423  			goto out_reserve;
1424  		}
1425  		free_extent_map(em);
1426  
1427  		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
1428  					ram_size, ins.objectid, cur_alloc_size,
1429  					0, 1 << BTRFS_ORDERED_REGULAR,
1430  					BTRFS_COMPRESS_NONE);
1431  		if (IS_ERR(ordered)) {
1432  			ret = PTR_ERR(ordered);
1433  			goto out_drop_extent_cache;
1434  		}
1435  
1436  		if (btrfs_is_data_reloc_root(root)) {
1437  			ret = btrfs_reloc_clone_csums(ordered);
1438  
1439  			/*
1440  			 * Only drop cache here, and process as normal.
1441  			 *
1442  			 * We must not allow extent_clear_unlock_delalloc()
1443  			 * at out_unlock label to free meta of this ordered
1444  			 * extent, as its meta should be freed by
1445  			 * btrfs_finish_ordered_io().
1446  			 *
1447  			 * So we must continue until @start is increased to
1448  			 * skip current ordered extent.
1449  			 */
1450  			if (ret)
1451  				btrfs_drop_extent_map_range(inode, start,
1452  							    start + ram_size - 1,
1453  							    false);
1454  		}
1455  		btrfs_put_ordered_extent(ordered);
1456  
1457  		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1458  
1459  		/*
1460  		 * We're not doing compressed IO, don't unlock the first page
1461  		 * (which the caller expects to stay locked), don't clear any
1462  		 * dirty bits and don't set any writeback bits
1463  		 *
1464  		 * Do set the Ordered (Private2) bit so we know this page was
1465  		 * properly setup for writepage.
1466  		 */
1467  		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1468  		page_ops |= PAGE_SET_ORDERED;
1469  
1470  		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1471  					     locked_page,
1472  					     EXTENT_LOCKED | EXTENT_DELALLOC,
1473  					     page_ops);
1474  		if (num_bytes < cur_alloc_size)
1475  			num_bytes = 0;
1476  		else
1477  			num_bytes -= cur_alloc_size;
1478  		alloc_hint = ins.objectid + ins.offset;
1479  		start += cur_alloc_size;
1480  		extent_reserved = false;
1481  
1482  		/*
1483  		 * btrfs_reloc_clone_csums() error, since start is increased
1484  		 * extent_clear_unlock_delalloc() at out_unlock label won't
1485  		 * free metadata of current ordered extent, we're OK to exit.
1486  		 */
1487  		if (ret)
1488  			goto out_unlock;
1489  	}
1490  done:
1491  	if (done_offset)
1492  		*done_offset = end;
1493  	return ret;
1494  
1495  out_drop_extent_cache:
1496  	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1497  out_reserve:
1498  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1499  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1500  out_unlock:
1501  	/*
1502  	 * Now, we have three regions to clean up:
1503  	 *
1504  	 * |-------(1)----|---(2)---|-------------(3)----------|
1505  	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1506  	 *
1507  	 * We process each region below.
1508  	 */
1509  
1510  	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1511  		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1512  	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1513  
1514  	/*
1515  	 * For the range (1). We have already instantiated the ordered extents
1516  	 * for this region. They are cleaned up by
1517  	 * btrfs_cleanup_ordered_extents() in e.g,
1518  	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1519  	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1520  	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1521  	 * function.
1522  	 *
1523  	 * However, in case of @keep_locked, we still need to unlock the pages
1524  	 * (except @locked_page) to ensure all the pages are unlocked.
1525  	 */
1526  	if (keep_locked && orig_start < start) {
1527  		if (!locked_page)
1528  			mapping_set_error(inode->vfs_inode.i_mapping, ret);
1529  		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1530  					     locked_page, 0, page_ops);
1531  	}
1532  
1533  	/*
1534  	 * For the range (2). If we reserved an extent for our delalloc range
1535  	 * (or a subrange) and failed to create the respective ordered extent,
1536  	 * then it means that when we reserved the extent we decremented the
1537  	 * extent's size from the data space_info's bytes_may_use counter and
1538  	 * incremented the space_info's bytes_reserved counter by the same
1539  	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1540  	 * to decrement again the data space_info's bytes_may_use counter,
1541  	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1542  	 */
1543  	if (extent_reserved) {
1544  		extent_clear_unlock_delalloc(inode, start,
1545  					     start + cur_alloc_size - 1,
1546  					     locked_page,
1547  					     clear_bits,
1548  					     page_ops);
1549  		start += cur_alloc_size;
1550  	}
1551  
1552  	/*
1553  	 * For the range (3). We never touched the region. In addition to the
1554  	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1555  	 * space_info's bytes_may_use counter, reserved in
1556  	 * btrfs_check_data_free_space().
1557  	 */
1558  	if (start < end) {
1559  		clear_bits |= EXTENT_CLEAR_DATA_RESV;
1560  		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1561  					     clear_bits, page_ops);
1562  	}
1563  	return ret;
1564  }
1565  
1566  /*
1567   * Phase two of compressed writeback.  This is the ordered portion of the code,
1568   * which only gets called in the order the work was queued.  We walk all the
1569   * async extents created by compress_file_range and send them down to the disk.
1570   */
1571  static noinline void submit_compressed_extents(struct btrfs_work *work)
1572  {
1573  	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1574  						     work);
1575  	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1576  	struct async_extent *async_extent;
1577  	unsigned long nr_pages;
1578  	u64 alloc_hint = 0;
1579  
1580  	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1581  		PAGE_SHIFT;
1582  
1583  	while (!list_empty(&async_chunk->extents)) {
1584  		async_extent = list_entry(async_chunk->extents.next,
1585  					  struct async_extent, list);
1586  		list_del(&async_extent->list);
1587  		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1588  	}
1589  
1590  	/* atomic_sub_return implies a barrier */
1591  	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1592  	    5 * SZ_1M)
1593  		cond_wake_up_nomb(&fs_info->async_submit_wait);
1594  }
1595  
1596  static noinline void async_cow_free(struct btrfs_work *work)
1597  {
1598  	struct async_chunk *async_chunk;
1599  	struct async_cow *async_cow;
1600  
1601  	async_chunk = container_of(work, struct async_chunk, work);
1602  	btrfs_add_delayed_iput(async_chunk->inode);
1603  	if (async_chunk->blkcg_css)
1604  		css_put(async_chunk->blkcg_css);
1605  
1606  	async_cow = async_chunk->async_cow;
1607  	if (atomic_dec_and_test(&async_cow->num_chunks))
1608  		kvfree(async_cow);
1609  }
1610  
1611  static bool run_delalloc_compressed(struct btrfs_inode *inode,
1612  				    struct page *locked_page, u64 start,
1613  				    u64 end, struct writeback_control *wbc)
1614  {
1615  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1616  	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1617  	struct async_cow *ctx;
1618  	struct async_chunk *async_chunk;
1619  	unsigned long nr_pages;
1620  	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1621  	int i;
1622  	unsigned nofs_flag;
1623  	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1624  
1625  	nofs_flag = memalloc_nofs_save();
1626  	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1627  	memalloc_nofs_restore(nofs_flag);
1628  	if (!ctx)
1629  		return false;
1630  
1631  	unlock_extent(&inode->io_tree, start, end, NULL);
1632  	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1633  
1634  	async_chunk = ctx->chunks;
1635  	atomic_set(&ctx->num_chunks, num_chunks);
1636  
1637  	for (i = 0; i < num_chunks; i++) {
1638  		u64 cur_end = min(end, start + SZ_512K - 1);
1639  
1640  		/*
1641  		 * igrab is called higher up in the call chain, take only the
1642  		 * lightweight reference for the callback lifetime
1643  		 */
1644  		ihold(&inode->vfs_inode);
1645  		async_chunk[i].async_cow = ctx;
1646  		async_chunk[i].inode = inode;
1647  		async_chunk[i].start = start;
1648  		async_chunk[i].end = cur_end;
1649  		async_chunk[i].write_flags = write_flags;
1650  		INIT_LIST_HEAD(&async_chunk[i].extents);
1651  
1652  		/*
1653  		 * The locked_page comes all the way from writepage and its
1654  		 * the original page we were actually given.  As we spread
1655  		 * this large delalloc region across multiple async_chunk
1656  		 * structs, only the first struct needs a pointer to locked_page
1657  		 *
1658  		 * This way we don't need racey decisions about who is supposed
1659  		 * to unlock it.
1660  		 */
1661  		if (locked_page) {
1662  			/*
1663  			 * Depending on the compressibility, the pages might or
1664  			 * might not go through async.  We want all of them to
1665  			 * be accounted against wbc once.  Let's do it here
1666  			 * before the paths diverge.  wbc accounting is used
1667  			 * only for foreign writeback detection and doesn't
1668  			 * need full accuracy.  Just account the whole thing
1669  			 * against the first page.
1670  			 */
1671  			wbc_account_cgroup_owner(wbc, locked_page,
1672  						 cur_end - start);
1673  			async_chunk[i].locked_page = locked_page;
1674  			locked_page = NULL;
1675  		} else {
1676  			async_chunk[i].locked_page = NULL;
1677  		}
1678  
1679  		if (blkcg_css != blkcg_root_css) {
1680  			css_get(blkcg_css);
1681  			async_chunk[i].blkcg_css = blkcg_css;
1682  			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1683  		} else {
1684  			async_chunk[i].blkcg_css = NULL;
1685  		}
1686  
1687  		btrfs_init_work(&async_chunk[i].work, compress_file_range,
1688  				submit_compressed_extents, async_cow_free);
1689  
1690  		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1691  		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1692  
1693  		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1694  
1695  		start = cur_end + 1;
1696  	}
1697  	return true;
1698  }
1699  
1700  /*
1701   * Run the delalloc range from start to end, and write back any dirty pages
1702   * covered by the range.
1703   */
1704  static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1705  				     struct page *locked_page, u64 start,
1706  				     u64 end, struct writeback_control *wbc,
1707  				     bool pages_dirty)
1708  {
1709  	u64 done_offset = end;
1710  	int ret;
1711  
1712  	while (start <= end) {
1713  		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
1714  				     true, false);
1715  		if (ret)
1716  			return ret;
1717  		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
1718  					  done_offset, wbc, pages_dirty);
1719  		start = done_offset + 1;
1720  	}
1721  
1722  	return 1;
1723  }
1724  
1725  static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1726  					u64 bytenr, u64 num_bytes, bool nowait)
1727  {
1728  	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1729  	struct btrfs_ordered_sum *sums;
1730  	int ret;
1731  	LIST_HEAD(list);
1732  
1733  	ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
1734  				      &list, 0, nowait);
1735  	if (ret == 0 && list_empty(&list))
1736  		return 0;
1737  
1738  	while (!list_empty(&list)) {
1739  		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1740  		list_del(&sums->list);
1741  		kfree(sums);
1742  	}
1743  	if (ret < 0)
1744  		return ret;
1745  	return 1;
1746  }
1747  
1748  static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1749  			   const u64 start, const u64 end)
1750  {
1751  	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1752  	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1753  	const u64 range_bytes = end + 1 - start;
1754  	struct extent_io_tree *io_tree = &inode->io_tree;
1755  	u64 range_start = start;
1756  	u64 count;
1757  	int ret;
1758  
1759  	/*
1760  	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1761  	 * made we had not enough available data space and therefore we did not
1762  	 * reserve data space for it, since we though we could do NOCOW for the
1763  	 * respective file range (either there is prealloc extent or the inode
1764  	 * has the NOCOW bit set).
1765  	 *
1766  	 * However when we need to fallback to COW mode (because for example the
1767  	 * block group for the corresponding extent was turned to RO mode by a
1768  	 * scrub or relocation) we need to do the following:
1769  	 *
1770  	 * 1) We increment the bytes_may_use counter of the data space info.
1771  	 *    If COW succeeds, it allocates a new data extent and after doing
1772  	 *    that it decrements the space info's bytes_may_use counter and
1773  	 *    increments its bytes_reserved counter by the same amount (we do
1774  	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1775  	 *    bytes_may_use counter to compensate (when space is reserved at
1776  	 *    buffered write time, the bytes_may_use counter is incremented);
1777  	 *
1778  	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1779  	 *    that if the COW path fails for any reason, it decrements (through
1780  	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1781  	 *    data space info, which we incremented in the step above.
1782  	 *
1783  	 * If we need to fallback to cow and the inode corresponds to a free
1784  	 * space cache inode or an inode of the data relocation tree, we must
1785  	 * also increment bytes_may_use of the data space_info for the same
1786  	 * reason. Space caches and relocated data extents always get a prealloc
1787  	 * extent for them, however scrub or balance may have set the block
1788  	 * group that contains that extent to RO mode and therefore force COW
1789  	 * when starting writeback.
1790  	 */
1791  	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1792  				 EXTENT_NORESERVE, 0, NULL);
1793  	if (count > 0 || is_space_ino || is_reloc_ino) {
1794  		u64 bytes = count;
1795  		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1796  		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1797  
1798  		if (is_space_ino || is_reloc_ino)
1799  			bytes = range_bytes;
1800  
1801  		spin_lock(&sinfo->lock);
1802  		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1803  		spin_unlock(&sinfo->lock);
1804  
1805  		if (count > 0)
1806  			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1807  					 NULL);
1808  	}
1809  
1810  	/*
1811  	 * Don't try to create inline extents, as a mix of inline extent that
1812  	 * is written out and unlocked directly and a normal NOCOW extent
1813  	 * doesn't work.
1814  	 */
1815  	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
1816  	ASSERT(ret != 1);
1817  	return ret;
1818  }
1819  
1820  struct can_nocow_file_extent_args {
1821  	/* Input fields. */
1822  
1823  	/* Start file offset of the range we want to NOCOW. */
1824  	u64 start;
1825  	/* End file offset (inclusive) of the range we want to NOCOW. */
1826  	u64 end;
1827  	bool writeback_path;
1828  	bool strict;
1829  	/*
1830  	 * Free the path passed to can_nocow_file_extent() once it's not needed
1831  	 * anymore.
1832  	 */
1833  	bool free_path;
1834  
1835  	/* Output fields. Only set when can_nocow_file_extent() returns 1. */
1836  
1837  	u64 disk_bytenr;
1838  	u64 disk_num_bytes;
1839  	u64 extent_offset;
1840  	/* Number of bytes that can be written to in NOCOW mode. */
1841  	u64 num_bytes;
1842  };
1843  
1844  /*
1845   * Check if we can NOCOW the file extent that the path points to.
1846   * This function may return with the path released, so the caller should check
1847   * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1848   *
1849   * Returns: < 0 on error
1850   *            0 if we can not NOCOW
1851   *            1 if we can NOCOW
1852   */
1853  static int can_nocow_file_extent(struct btrfs_path *path,
1854  				 struct btrfs_key *key,
1855  				 struct btrfs_inode *inode,
1856  				 struct can_nocow_file_extent_args *args)
1857  {
1858  	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1859  	struct extent_buffer *leaf = path->nodes[0];
1860  	struct btrfs_root *root = inode->root;
1861  	struct btrfs_file_extent_item *fi;
1862  	u64 extent_end;
1863  	u8 extent_type;
1864  	int can_nocow = 0;
1865  	int ret = 0;
1866  	bool nowait = path->nowait;
1867  
1868  	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1869  	extent_type = btrfs_file_extent_type(leaf, fi);
1870  
1871  	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1872  		goto out;
1873  
1874  	/* Can't access these fields unless we know it's not an inline extent. */
1875  	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1876  	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1877  	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
1878  
1879  	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1880  	    extent_type == BTRFS_FILE_EXTENT_REG)
1881  		goto out;
1882  
1883  	/*
1884  	 * If the extent was created before the generation where the last snapshot
1885  	 * for its subvolume was created, then this implies the extent is shared,
1886  	 * hence we must COW.
1887  	 */
1888  	if (!args->strict &&
1889  	    btrfs_file_extent_generation(leaf, fi) <=
1890  	    btrfs_root_last_snapshot(&root->root_item))
1891  		goto out;
1892  
1893  	/* An explicit hole, must COW. */
1894  	if (args->disk_bytenr == 0)
1895  		goto out;
1896  
1897  	/* Compressed/encrypted/encoded extents must be COWed. */
1898  	if (btrfs_file_extent_compression(leaf, fi) ||
1899  	    btrfs_file_extent_encryption(leaf, fi) ||
1900  	    btrfs_file_extent_other_encoding(leaf, fi))
1901  		goto out;
1902  
1903  	extent_end = btrfs_file_extent_end(path);
1904  
1905  	/*
1906  	 * The following checks can be expensive, as they need to take other
1907  	 * locks and do btree or rbtree searches, so release the path to avoid
1908  	 * blocking other tasks for too long.
1909  	 */
1910  	btrfs_release_path(path);
1911  
1912  	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1913  				    key->offset - args->extent_offset,
1914  				    args->disk_bytenr, args->strict, path);
1915  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1916  	if (ret != 0)
1917  		goto out;
1918  
1919  	if (args->free_path) {
1920  		/*
1921  		 * We don't need the path anymore, plus through the
1922  		 * csum_exist_in_range() call below we will end up allocating
1923  		 * another path. So free the path to avoid unnecessary extra
1924  		 * memory usage.
1925  		 */
1926  		btrfs_free_path(path);
1927  		path = NULL;
1928  	}
1929  
1930  	/* If there are pending snapshots for this root, we must COW. */
1931  	if (args->writeback_path && !is_freespace_inode &&
1932  	    atomic_read(&root->snapshot_force_cow))
1933  		goto out;
1934  
1935  	args->disk_bytenr += args->extent_offset;
1936  	args->disk_bytenr += args->start - key->offset;
1937  	args->num_bytes = min(args->end + 1, extent_end) - args->start;
1938  
1939  	/*
1940  	 * Force COW if csums exist in the range. This ensures that csums for a
1941  	 * given extent are either valid or do not exist.
1942  	 */
1943  	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
1944  				  nowait);
1945  	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1946  	if (ret != 0)
1947  		goto out;
1948  
1949  	can_nocow = 1;
1950   out:
1951  	if (args->free_path && path)
1952  		btrfs_free_path(path);
1953  
1954  	return ret < 0 ? ret : can_nocow;
1955  }
1956  
1957  /*
1958   * when nowcow writeback call back.  This checks for snapshots or COW copies
1959   * of the extents that exist in the file, and COWs the file as required.
1960   *
1961   * If no cow copies or snapshots exist, we write directly to the existing
1962   * blocks on disk
1963   */
1964  static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1965  				       struct page *locked_page,
1966  				       const u64 start, const u64 end)
1967  {
1968  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1969  	struct btrfs_root *root = inode->root;
1970  	struct btrfs_path *path;
1971  	u64 cow_start = (u64)-1;
1972  	u64 cur_offset = start;
1973  	int ret;
1974  	bool check_prev = true;
1975  	u64 ino = btrfs_ino(inode);
1976  	struct can_nocow_file_extent_args nocow_args = { 0 };
1977  
1978  	/*
1979  	 * Normally on a zoned device we're only doing COW writes, but in case
1980  	 * of relocation on a zoned filesystem serializes I/O so that we're only
1981  	 * writing sequentially and can end up here as well.
1982  	 */
1983  	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
1984  
1985  	path = btrfs_alloc_path();
1986  	if (!path) {
1987  		ret = -ENOMEM;
1988  		goto error;
1989  	}
1990  
1991  	nocow_args.end = end;
1992  	nocow_args.writeback_path = true;
1993  
1994  	while (1) {
1995  		struct btrfs_block_group *nocow_bg = NULL;
1996  		struct btrfs_ordered_extent *ordered;
1997  		struct btrfs_key found_key;
1998  		struct btrfs_file_extent_item *fi;
1999  		struct extent_buffer *leaf;
2000  		u64 extent_end;
2001  		u64 ram_bytes;
2002  		u64 nocow_end;
2003  		int extent_type;
2004  		bool is_prealloc;
2005  
2006  		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2007  					       cur_offset, 0);
2008  		if (ret < 0)
2009  			goto error;
2010  
2011  		/*
2012  		 * If there is no extent for our range when doing the initial
2013  		 * search, then go back to the previous slot as it will be the
2014  		 * one containing the search offset
2015  		 */
2016  		if (ret > 0 && path->slots[0] > 0 && check_prev) {
2017  			leaf = path->nodes[0];
2018  			btrfs_item_key_to_cpu(leaf, &found_key,
2019  					      path->slots[0] - 1);
2020  			if (found_key.objectid == ino &&
2021  			    found_key.type == BTRFS_EXTENT_DATA_KEY)
2022  				path->slots[0]--;
2023  		}
2024  		check_prev = false;
2025  next_slot:
2026  		/* Go to next leaf if we have exhausted the current one */
2027  		leaf = path->nodes[0];
2028  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2029  			ret = btrfs_next_leaf(root, path);
2030  			if (ret < 0)
2031  				goto error;
2032  			if (ret > 0)
2033  				break;
2034  			leaf = path->nodes[0];
2035  		}
2036  
2037  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2038  
2039  		/* Didn't find anything for our INO */
2040  		if (found_key.objectid > ino)
2041  			break;
2042  		/*
2043  		 * Keep searching until we find an EXTENT_ITEM or there are no
2044  		 * more extents for this inode
2045  		 */
2046  		if (WARN_ON_ONCE(found_key.objectid < ino) ||
2047  		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
2048  			path->slots[0]++;
2049  			goto next_slot;
2050  		}
2051  
2052  		/* Found key is not EXTENT_DATA_KEY or starts after req range */
2053  		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2054  		    found_key.offset > end)
2055  			break;
2056  
2057  		/*
2058  		 * If the found extent starts after requested offset, then
2059  		 * adjust extent_end to be right before this extent begins
2060  		 */
2061  		if (found_key.offset > cur_offset) {
2062  			extent_end = found_key.offset;
2063  			extent_type = 0;
2064  			goto must_cow;
2065  		}
2066  
2067  		/*
2068  		 * Found extent which begins before our range and potentially
2069  		 * intersect it
2070  		 */
2071  		fi = btrfs_item_ptr(leaf, path->slots[0],
2072  				    struct btrfs_file_extent_item);
2073  		extent_type = btrfs_file_extent_type(leaf, fi);
2074  		/* If this is triggered then we have a memory corruption. */
2075  		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2076  		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2077  			ret = -EUCLEAN;
2078  			goto error;
2079  		}
2080  		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
2081  		extent_end = btrfs_file_extent_end(path);
2082  
2083  		/*
2084  		 * If the extent we got ends before our current offset, skip to
2085  		 * the next extent.
2086  		 */
2087  		if (extent_end <= cur_offset) {
2088  			path->slots[0]++;
2089  			goto next_slot;
2090  		}
2091  
2092  		nocow_args.start = cur_offset;
2093  		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2094  		if (ret < 0)
2095  			goto error;
2096  		if (ret == 0)
2097  			goto must_cow;
2098  
2099  		ret = 0;
2100  		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
2101  		if (!nocow_bg) {
2102  must_cow:
2103  			/*
2104  			 * If we can't perform NOCOW writeback for the range,
2105  			 * then record the beginning of the range that needs to
2106  			 * be COWed.  It will be written out before the next
2107  			 * NOCOW range if we find one, or when exiting this
2108  			 * loop.
2109  			 */
2110  			if (cow_start == (u64)-1)
2111  				cow_start = cur_offset;
2112  			cur_offset = extent_end;
2113  			if (cur_offset > end)
2114  				break;
2115  			if (!path->nodes[0])
2116  				continue;
2117  			path->slots[0]++;
2118  			goto next_slot;
2119  		}
2120  
2121  		/*
2122  		 * COW range from cow_start to found_key.offset - 1. As the key
2123  		 * will contain the beginning of the first extent that can be
2124  		 * NOCOW, following one which needs to be COW'ed
2125  		 */
2126  		if (cow_start != (u64)-1) {
2127  			ret = fallback_to_cow(inode, locked_page,
2128  					      cow_start, found_key.offset - 1);
2129  			cow_start = (u64)-1;
2130  			if (ret) {
2131  				btrfs_dec_nocow_writers(nocow_bg);
2132  				goto error;
2133  			}
2134  		}
2135  
2136  		nocow_end = cur_offset + nocow_args.num_bytes - 1;
2137  		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2138  		if (is_prealloc) {
2139  			u64 orig_start = found_key.offset - nocow_args.extent_offset;
2140  			struct extent_map *em;
2141  
2142  			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
2143  					  orig_start,
2144  					  nocow_args.disk_bytenr, /* block_start */
2145  					  nocow_args.num_bytes, /* block_len */
2146  					  nocow_args.disk_num_bytes, /* orig_block_len */
2147  					  ram_bytes, BTRFS_COMPRESS_NONE,
2148  					  BTRFS_ORDERED_PREALLOC);
2149  			if (IS_ERR(em)) {
2150  				btrfs_dec_nocow_writers(nocow_bg);
2151  				ret = PTR_ERR(em);
2152  				goto error;
2153  			}
2154  			free_extent_map(em);
2155  		}
2156  
2157  		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2158  				nocow_args.num_bytes, nocow_args.num_bytes,
2159  				nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
2160  				is_prealloc
2161  				? (1 << BTRFS_ORDERED_PREALLOC)
2162  				: (1 << BTRFS_ORDERED_NOCOW),
2163  				BTRFS_COMPRESS_NONE);
2164  		btrfs_dec_nocow_writers(nocow_bg);
2165  		if (IS_ERR(ordered)) {
2166  			if (is_prealloc) {
2167  				btrfs_drop_extent_map_range(inode, cur_offset,
2168  							    nocow_end, false);
2169  			}
2170  			ret = PTR_ERR(ordered);
2171  			goto error;
2172  		}
2173  
2174  		if (btrfs_is_data_reloc_root(root))
2175  			/*
2176  			 * Error handled later, as we must prevent
2177  			 * extent_clear_unlock_delalloc() in error handler
2178  			 * from freeing metadata of created ordered extent.
2179  			 */
2180  			ret = btrfs_reloc_clone_csums(ordered);
2181  		btrfs_put_ordered_extent(ordered);
2182  
2183  		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2184  					     locked_page, EXTENT_LOCKED |
2185  					     EXTENT_DELALLOC |
2186  					     EXTENT_CLEAR_DATA_RESV,
2187  					     PAGE_UNLOCK | PAGE_SET_ORDERED);
2188  
2189  		cur_offset = extent_end;
2190  
2191  		/*
2192  		 * btrfs_reloc_clone_csums() error, now we're OK to call error
2193  		 * handler, as metadata for created ordered extent will only
2194  		 * be freed by btrfs_finish_ordered_io().
2195  		 */
2196  		if (ret)
2197  			goto error;
2198  		if (cur_offset > end)
2199  			break;
2200  	}
2201  	btrfs_release_path(path);
2202  
2203  	if (cur_offset <= end && cow_start == (u64)-1)
2204  		cow_start = cur_offset;
2205  
2206  	if (cow_start != (u64)-1) {
2207  		cur_offset = end;
2208  		ret = fallback_to_cow(inode, locked_page, cow_start, end);
2209  		cow_start = (u64)-1;
2210  		if (ret)
2211  			goto error;
2212  	}
2213  
2214  	btrfs_free_path(path);
2215  	return 0;
2216  
2217  error:
2218  	/*
2219  	 * If an error happened while a COW region is outstanding, cur_offset
2220  	 * needs to be reset to cow_start to ensure the COW region is unlocked
2221  	 * as well.
2222  	 */
2223  	if (cow_start != (u64)-1)
2224  		cur_offset = cow_start;
2225  	if (cur_offset < end)
2226  		extent_clear_unlock_delalloc(inode, cur_offset, end,
2227  					     locked_page, EXTENT_LOCKED |
2228  					     EXTENT_DELALLOC | EXTENT_DEFRAG |
2229  					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2230  					     PAGE_START_WRITEBACK |
2231  					     PAGE_END_WRITEBACK);
2232  	btrfs_free_path(path);
2233  	return ret;
2234  }
2235  
2236  static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2237  {
2238  	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2239  		if (inode->defrag_bytes &&
2240  		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
2241  				   0, NULL))
2242  			return false;
2243  		return true;
2244  	}
2245  	return false;
2246  }
2247  
2248  /*
2249   * Function to process delayed allocation (create CoW) for ranges which are
2250   * being touched for the first time.
2251   */
2252  int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2253  			     u64 start, u64 end, struct writeback_control *wbc)
2254  {
2255  	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2256  	int ret;
2257  
2258  	/*
2259  	 * The range must cover part of the @locked_page, or a return of 1
2260  	 * can confuse the caller.
2261  	 */
2262  	ASSERT(!(end <= page_offset(locked_page) ||
2263  		 start >= page_offset(locked_page) + PAGE_SIZE));
2264  
2265  	if (should_nocow(inode, start, end)) {
2266  		ret = run_delalloc_nocow(inode, locked_page, start, end);
2267  		goto out;
2268  	}
2269  
2270  	if (btrfs_inode_can_compress(inode) &&
2271  	    inode_need_compress(inode, start, end) &&
2272  	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
2273  		return 1;
2274  
2275  	if (zoned)
2276  		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
2277  				       true);
2278  	else
2279  		ret = cow_file_range(inode, locked_page, start, end, NULL,
2280  				     false, false);
2281  
2282  out:
2283  	if (ret < 0)
2284  		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2285  					      end - start + 1);
2286  	return ret;
2287  }
2288  
2289  void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2290  				 struct extent_state *orig, u64 split)
2291  {
2292  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2293  	u64 size;
2294  
2295  	/* not delalloc, ignore it */
2296  	if (!(orig->state & EXTENT_DELALLOC))
2297  		return;
2298  
2299  	size = orig->end - orig->start + 1;
2300  	if (size > fs_info->max_extent_size) {
2301  		u32 num_extents;
2302  		u64 new_size;
2303  
2304  		/*
2305  		 * See the explanation in btrfs_merge_delalloc_extent, the same
2306  		 * applies here, just in reverse.
2307  		 */
2308  		new_size = orig->end - split + 1;
2309  		num_extents = count_max_extents(fs_info, new_size);
2310  		new_size = split - orig->start;
2311  		num_extents += count_max_extents(fs_info, new_size);
2312  		if (count_max_extents(fs_info, size) >= num_extents)
2313  			return;
2314  	}
2315  
2316  	spin_lock(&inode->lock);
2317  	btrfs_mod_outstanding_extents(inode, 1);
2318  	spin_unlock(&inode->lock);
2319  }
2320  
2321  /*
2322   * Handle merged delayed allocation extents so we can keep track of new extents
2323   * that are just merged onto old extents, such as when we are doing sequential
2324   * writes, so we can properly account for the metadata space we'll need.
2325   */
2326  void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2327  				 struct extent_state *other)
2328  {
2329  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2330  	u64 new_size, old_size;
2331  	u32 num_extents;
2332  
2333  	/* not delalloc, ignore it */
2334  	if (!(other->state & EXTENT_DELALLOC))
2335  		return;
2336  
2337  	if (new->start > other->start)
2338  		new_size = new->end - other->start + 1;
2339  	else
2340  		new_size = other->end - new->start + 1;
2341  
2342  	/* we're not bigger than the max, unreserve the space and go */
2343  	if (new_size <= fs_info->max_extent_size) {
2344  		spin_lock(&inode->lock);
2345  		btrfs_mod_outstanding_extents(inode, -1);
2346  		spin_unlock(&inode->lock);
2347  		return;
2348  	}
2349  
2350  	/*
2351  	 * We have to add up either side to figure out how many extents were
2352  	 * accounted for before we merged into one big extent.  If the number of
2353  	 * extents we accounted for is <= the amount we need for the new range
2354  	 * then we can return, otherwise drop.  Think of it like this
2355  	 *
2356  	 * [ 4k][MAX_SIZE]
2357  	 *
2358  	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2359  	 * need 2 outstanding extents, on one side we have 1 and the other side
2360  	 * we have 1 so they are == and we can return.  But in this case
2361  	 *
2362  	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2363  	 *
2364  	 * Each range on their own accounts for 2 extents, but merged together
2365  	 * they are only 3 extents worth of accounting, so we need to drop in
2366  	 * this case.
2367  	 */
2368  	old_size = other->end - other->start + 1;
2369  	num_extents = count_max_extents(fs_info, old_size);
2370  	old_size = new->end - new->start + 1;
2371  	num_extents += count_max_extents(fs_info, old_size);
2372  	if (count_max_extents(fs_info, new_size) >= num_extents)
2373  		return;
2374  
2375  	spin_lock(&inode->lock);
2376  	btrfs_mod_outstanding_extents(inode, -1);
2377  	spin_unlock(&inode->lock);
2378  }
2379  
2380  static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2381  				      struct btrfs_inode *inode)
2382  {
2383  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2384  
2385  	spin_lock(&root->delalloc_lock);
2386  	if (list_empty(&inode->delalloc_inodes)) {
2387  		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2388  		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
2389  		root->nr_delalloc_inodes++;
2390  		if (root->nr_delalloc_inodes == 1) {
2391  			spin_lock(&fs_info->delalloc_root_lock);
2392  			BUG_ON(!list_empty(&root->delalloc_root));
2393  			list_add_tail(&root->delalloc_root,
2394  				      &fs_info->delalloc_roots);
2395  			spin_unlock(&fs_info->delalloc_root_lock);
2396  		}
2397  	}
2398  	spin_unlock(&root->delalloc_lock);
2399  }
2400  
2401  void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2402  				struct btrfs_inode *inode)
2403  {
2404  	struct btrfs_fs_info *fs_info = root->fs_info;
2405  
2406  	if (!list_empty(&inode->delalloc_inodes)) {
2407  		list_del_init(&inode->delalloc_inodes);
2408  		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2409  			  &inode->runtime_flags);
2410  		root->nr_delalloc_inodes--;
2411  		if (!root->nr_delalloc_inodes) {
2412  			ASSERT(list_empty(&root->delalloc_inodes));
2413  			spin_lock(&fs_info->delalloc_root_lock);
2414  			BUG_ON(list_empty(&root->delalloc_root));
2415  			list_del_init(&root->delalloc_root);
2416  			spin_unlock(&fs_info->delalloc_root_lock);
2417  		}
2418  	}
2419  }
2420  
2421  static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2422  				     struct btrfs_inode *inode)
2423  {
2424  	spin_lock(&root->delalloc_lock);
2425  	__btrfs_del_delalloc_inode(root, inode);
2426  	spin_unlock(&root->delalloc_lock);
2427  }
2428  
2429  /*
2430   * Properly track delayed allocation bytes in the inode and to maintain the
2431   * list of inodes that have pending delalloc work to be done.
2432   */
2433  void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2434  			       u32 bits)
2435  {
2436  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2437  
2438  	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2439  		WARN_ON(1);
2440  	/*
2441  	 * set_bit and clear bit hooks normally require _irqsave/restore
2442  	 * but in this case, we are only testing for the DELALLOC
2443  	 * bit, which is only set or cleared with irqs on
2444  	 */
2445  	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2446  		struct btrfs_root *root = inode->root;
2447  		u64 len = state->end + 1 - state->start;
2448  		u32 num_extents = count_max_extents(fs_info, len);
2449  		bool do_list = !btrfs_is_free_space_inode(inode);
2450  
2451  		spin_lock(&inode->lock);
2452  		btrfs_mod_outstanding_extents(inode, num_extents);
2453  		spin_unlock(&inode->lock);
2454  
2455  		/* For sanity tests */
2456  		if (btrfs_is_testing(fs_info))
2457  			return;
2458  
2459  		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2460  					 fs_info->delalloc_batch);
2461  		spin_lock(&inode->lock);
2462  		inode->delalloc_bytes += len;
2463  		if (bits & EXTENT_DEFRAG)
2464  			inode->defrag_bytes += len;
2465  		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2466  					 &inode->runtime_flags))
2467  			btrfs_add_delalloc_inodes(root, inode);
2468  		spin_unlock(&inode->lock);
2469  	}
2470  
2471  	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2472  	    (bits & EXTENT_DELALLOC_NEW)) {
2473  		spin_lock(&inode->lock);
2474  		inode->new_delalloc_bytes += state->end + 1 - state->start;
2475  		spin_unlock(&inode->lock);
2476  	}
2477  }
2478  
2479  /*
2480   * Once a range is no longer delalloc this function ensures that proper
2481   * accounting happens.
2482   */
2483  void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2484  				 struct extent_state *state, u32 bits)
2485  {
2486  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2487  	u64 len = state->end + 1 - state->start;
2488  	u32 num_extents = count_max_extents(fs_info, len);
2489  
2490  	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2491  		spin_lock(&inode->lock);
2492  		inode->defrag_bytes -= len;
2493  		spin_unlock(&inode->lock);
2494  	}
2495  
2496  	/*
2497  	 * set_bit and clear bit hooks normally require _irqsave/restore
2498  	 * but in this case, we are only testing for the DELALLOC
2499  	 * bit, which is only set or cleared with irqs on
2500  	 */
2501  	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2502  		struct btrfs_root *root = inode->root;
2503  		bool do_list = !btrfs_is_free_space_inode(inode);
2504  
2505  		spin_lock(&inode->lock);
2506  		btrfs_mod_outstanding_extents(inode, -num_extents);
2507  		spin_unlock(&inode->lock);
2508  
2509  		/*
2510  		 * We don't reserve metadata space for space cache inodes so we
2511  		 * don't need to call delalloc_release_metadata if there is an
2512  		 * error.
2513  		 */
2514  		if (bits & EXTENT_CLEAR_META_RESV &&
2515  		    root != fs_info->tree_root)
2516  			btrfs_delalloc_release_metadata(inode, len, true);
2517  
2518  		/* For sanity tests. */
2519  		if (btrfs_is_testing(fs_info))
2520  			return;
2521  
2522  		if (!btrfs_is_data_reloc_root(root) &&
2523  		    do_list && !(state->state & EXTENT_NORESERVE) &&
2524  		    (bits & EXTENT_CLEAR_DATA_RESV))
2525  			btrfs_free_reserved_data_space_noquota(fs_info, len);
2526  
2527  		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2528  					 fs_info->delalloc_batch);
2529  		spin_lock(&inode->lock);
2530  		inode->delalloc_bytes -= len;
2531  		if (do_list && inode->delalloc_bytes == 0 &&
2532  		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2533  					&inode->runtime_flags))
2534  			btrfs_del_delalloc_inode(root, inode);
2535  		spin_unlock(&inode->lock);
2536  	}
2537  
2538  	if ((state->state & EXTENT_DELALLOC_NEW) &&
2539  	    (bits & EXTENT_DELALLOC_NEW)) {
2540  		spin_lock(&inode->lock);
2541  		ASSERT(inode->new_delalloc_bytes >= len);
2542  		inode->new_delalloc_bytes -= len;
2543  		if (bits & EXTENT_ADD_INODE_BYTES)
2544  			inode_add_bytes(&inode->vfs_inode, len);
2545  		spin_unlock(&inode->lock);
2546  	}
2547  }
2548  
2549  static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2550  					struct btrfs_ordered_extent *ordered)
2551  {
2552  	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2553  	u64 len = bbio->bio.bi_iter.bi_size;
2554  	struct btrfs_ordered_extent *new;
2555  	int ret;
2556  
2557  	/* Must always be called for the beginning of an ordered extent. */
2558  	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2559  		return -EINVAL;
2560  
2561  	/* No need to split if the ordered extent covers the entire bio. */
2562  	if (ordered->disk_num_bytes == len) {
2563  		refcount_inc(&ordered->refs);
2564  		bbio->ordered = ordered;
2565  		return 0;
2566  	}
2567  
2568  	/*
2569  	 * Don't split the extent_map for NOCOW extents, as we're writing into
2570  	 * a pre-existing one.
2571  	 */
2572  	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2573  		ret = split_extent_map(bbio->inode, bbio->file_offset,
2574  				       ordered->num_bytes, len,
2575  				       ordered->disk_bytenr);
2576  		if (ret)
2577  			return ret;
2578  	}
2579  
2580  	new = btrfs_split_ordered_extent(ordered, len);
2581  	if (IS_ERR(new))
2582  		return PTR_ERR(new);
2583  	bbio->ordered = new;
2584  	return 0;
2585  }
2586  
2587  /*
2588   * given a list of ordered sums record them in the inode.  This happens
2589   * at IO completion time based on sums calculated at bio submission time.
2590   */
2591  static int add_pending_csums(struct btrfs_trans_handle *trans,
2592  			     struct list_head *list)
2593  {
2594  	struct btrfs_ordered_sum *sum;
2595  	struct btrfs_root *csum_root = NULL;
2596  	int ret;
2597  
2598  	list_for_each_entry(sum, list, list) {
2599  		trans->adding_csums = true;
2600  		if (!csum_root)
2601  			csum_root = btrfs_csum_root(trans->fs_info,
2602  						    sum->logical);
2603  		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2604  		trans->adding_csums = false;
2605  		if (ret)
2606  			return ret;
2607  	}
2608  	return 0;
2609  }
2610  
2611  static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2612  					 const u64 start,
2613  					 const u64 len,
2614  					 struct extent_state **cached_state)
2615  {
2616  	u64 search_start = start;
2617  	const u64 end = start + len - 1;
2618  
2619  	while (search_start < end) {
2620  		const u64 search_len = end - search_start + 1;
2621  		struct extent_map *em;
2622  		u64 em_len;
2623  		int ret = 0;
2624  
2625  		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2626  		if (IS_ERR(em))
2627  			return PTR_ERR(em);
2628  
2629  		if (em->block_start != EXTENT_MAP_HOLE)
2630  			goto next;
2631  
2632  		em_len = em->len;
2633  		if (em->start < search_start)
2634  			em_len -= search_start - em->start;
2635  		if (em_len > search_len)
2636  			em_len = search_len;
2637  
2638  		ret = set_extent_bit(&inode->io_tree, search_start,
2639  				     search_start + em_len - 1,
2640  				     EXTENT_DELALLOC_NEW, cached_state);
2641  next:
2642  		search_start = extent_map_end(em);
2643  		free_extent_map(em);
2644  		if (ret)
2645  			return ret;
2646  	}
2647  	return 0;
2648  }
2649  
2650  int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2651  			      unsigned int extra_bits,
2652  			      struct extent_state **cached_state)
2653  {
2654  	WARN_ON(PAGE_ALIGNED(end));
2655  
2656  	if (start >= i_size_read(&inode->vfs_inode) &&
2657  	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2658  		/*
2659  		 * There can't be any extents following eof in this case so just
2660  		 * set the delalloc new bit for the range directly.
2661  		 */
2662  		extra_bits |= EXTENT_DELALLOC_NEW;
2663  	} else {
2664  		int ret;
2665  
2666  		ret = btrfs_find_new_delalloc_bytes(inode, start,
2667  						    end + 1 - start,
2668  						    cached_state);
2669  		if (ret)
2670  			return ret;
2671  	}
2672  
2673  	return set_extent_bit(&inode->io_tree, start, end,
2674  			      EXTENT_DELALLOC | extra_bits, cached_state);
2675  }
2676  
2677  /* see btrfs_writepage_start_hook for details on why this is required */
2678  struct btrfs_writepage_fixup {
2679  	struct page *page;
2680  	struct btrfs_inode *inode;
2681  	struct btrfs_work work;
2682  };
2683  
2684  static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2685  {
2686  	struct btrfs_writepage_fixup *fixup =
2687  		container_of(work, struct btrfs_writepage_fixup, work);
2688  	struct btrfs_ordered_extent *ordered;
2689  	struct extent_state *cached_state = NULL;
2690  	struct extent_changeset *data_reserved = NULL;
2691  	struct page *page = fixup->page;
2692  	struct btrfs_inode *inode = fixup->inode;
2693  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2694  	u64 page_start = page_offset(page);
2695  	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2696  	int ret = 0;
2697  	bool free_delalloc_space = true;
2698  
2699  	/*
2700  	 * This is similar to page_mkwrite, we need to reserve the space before
2701  	 * we take the page lock.
2702  	 */
2703  	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2704  					   PAGE_SIZE);
2705  again:
2706  	lock_page(page);
2707  
2708  	/*
2709  	 * Before we queued this fixup, we took a reference on the page.
2710  	 * page->mapping may go NULL, but it shouldn't be moved to a different
2711  	 * address space.
2712  	 */
2713  	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2714  		/*
2715  		 * Unfortunately this is a little tricky, either
2716  		 *
2717  		 * 1) We got here and our page had already been dealt with and
2718  		 *    we reserved our space, thus ret == 0, so we need to just
2719  		 *    drop our space reservation and bail.  This can happen the
2720  		 *    first time we come into the fixup worker, or could happen
2721  		 *    while waiting for the ordered extent.
2722  		 * 2) Our page was already dealt with, but we happened to get an
2723  		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2724  		 *    this case we obviously don't have anything to release, but
2725  		 *    because the page was already dealt with we don't want to
2726  		 *    mark the page with an error, so make sure we're resetting
2727  		 *    ret to 0.  This is why we have this check _before_ the ret
2728  		 *    check, because we do not want to have a surprise ENOSPC
2729  		 *    when the page was already properly dealt with.
2730  		 */
2731  		if (!ret) {
2732  			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2733  			btrfs_delalloc_release_space(inode, data_reserved,
2734  						     page_start, PAGE_SIZE,
2735  						     true);
2736  		}
2737  		ret = 0;
2738  		goto out_page;
2739  	}
2740  
2741  	/*
2742  	 * We can't mess with the page state unless it is locked, so now that
2743  	 * it is locked bail if we failed to make our space reservation.
2744  	 */
2745  	if (ret)
2746  		goto out_page;
2747  
2748  	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2749  
2750  	/* already ordered? We're done */
2751  	if (PageOrdered(page))
2752  		goto out_reserved;
2753  
2754  	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2755  	if (ordered) {
2756  		unlock_extent(&inode->io_tree, page_start, page_end,
2757  			      &cached_state);
2758  		unlock_page(page);
2759  		btrfs_start_ordered_extent(ordered);
2760  		btrfs_put_ordered_extent(ordered);
2761  		goto again;
2762  	}
2763  
2764  	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2765  					&cached_state);
2766  	if (ret)
2767  		goto out_reserved;
2768  
2769  	/*
2770  	 * Everything went as planned, we're now the owner of a dirty page with
2771  	 * delayed allocation bits set and space reserved for our COW
2772  	 * destination.
2773  	 *
2774  	 * The page was dirty when we started, nothing should have cleaned it.
2775  	 */
2776  	BUG_ON(!PageDirty(page));
2777  	free_delalloc_space = false;
2778  out_reserved:
2779  	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2780  	if (free_delalloc_space)
2781  		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2782  					     PAGE_SIZE, true);
2783  	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2784  out_page:
2785  	if (ret) {
2786  		/*
2787  		 * We hit ENOSPC or other errors.  Update the mapping and page
2788  		 * to reflect the errors and clean the page.
2789  		 */
2790  		mapping_set_error(page->mapping, ret);
2791  		btrfs_mark_ordered_io_finished(inode, page, page_start,
2792  					       PAGE_SIZE, !ret);
2793  		clear_page_dirty_for_io(page);
2794  	}
2795  	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
2796  	unlock_page(page);
2797  	put_page(page);
2798  	kfree(fixup);
2799  	extent_changeset_free(data_reserved);
2800  	/*
2801  	 * As a precaution, do a delayed iput in case it would be the last iput
2802  	 * that could need flushing space. Recursing back to fixup worker would
2803  	 * deadlock.
2804  	 */
2805  	btrfs_add_delayed_iput(inode);
2806  }
2807  
2808  /*
2809   * There are a few paths in the higher layers of the kernel that directly
2810   * set the page dirty bit without asking the filesystem if it is a
2811   * good idea.  This causes problems because we want to make sure COW
2812   * properly happens and the data=ordered rules are followed.
2813   *
2814   * In our case any range that doesn't have the ORDERED bit set
2815   * hasn't been properly setup for IO.  We kick off an async process
2816   * to fix it up.  The async helper will wait for ordered extents, set
2817   * the delalloc bit and make it safe to write the page.
2818   */
2819  int btrfs_writepage_cow_fixup(struct page *page)
2820  {
2821  	struct inode *inode = page->mapping->host;
2822  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2823  	struct btrfs_writepage_fixup *fixup;
2824  
2825  	/* This page has ordered extent covering it already */
2826  	if (PageOrdered(page))
2827  		return 0;
2828  
2829  	/*
2830  	 * PageChecked is set below when we create a fixup worker for this page,
2831  	 * don't try to create another one if we're already PageChecked()
2832  	 *
2833  	 * The extent_io writepage code will redirty the page if we send back
2834  	 * EAGAIN.
2835  	 */
2836  	if (PageChecked(page))
2837  		return -EAGAIN;
2838  
2839  	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2840  	if (!fixup)
2841  		return -EAGAIN;
2842  
2843  	/*
2844  	 * We are already holding a reference to this inode from
2845  	 * write_cache_pages.  We need to hold it because the space reservation
2846  	 * takes place outside of the page lock, and we can't trust
2847  	 * page->mapping outside of the page lock.
2848  	 */
2849  	ihold(inode);
2850  	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2851  	get_page(page);
2852  	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2853  	fixup->page = page;
2854  	fixup->inode = BTRFS_I(inode);
2855  	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2856  
2857  	return -EAGAIN;
2858  }
2859  
2860  static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2861  				       struct btrfs_inode *inode, u64 file_pos,
2862  				       struct btrfs_file_extent_item *stack_fi,
2863  				       const bool update_inode_bytes,
2864  				       u64 qgroup_reserved)
2865  {
2866  	struct btrfs_root *root = inode->root;
2867  	const u64 sectorsize = root->fs_info->sectorsize;
2868  	struct btrfs_path *path;
2869  	struct extent_buffer *leaf;
2870  	struct btrfs_key ins;
2871  	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2872  	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2873  	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2874  	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2875  	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2876  	struct btrfs_drop_extents_args drop_args = { 0 };
2877  	int ret;
2878  
2879  	path = btrfs_alloc_path();
2880  	if (!path)
2881  		return -ENOMEM;
2882  
2883  	/*
2884  	 * we may be replacing one extent in the tree with another.
2885  	 * The new extent is pinned in the extent map, and we don't want
2886  	 * to drop it from the cache until it is completely in the btree.
2887  	 *
2888  	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2889  	 * the caller is expected to unpin it and allow it to be merged
2890  	 * with the others.
2891  	 */
2892  	drop_args.path = path;
2893  	drop_args.start = file_pos;
2894  	drop_args.end = file_pos + num_bytes;
2895  	drop_args.replace_extent = true;
2896  	drop_args.extent_item_size = sizeof(*stack_fi);
2897  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2898  	if (ret)
2899  		goto out;
2900  
2901  	if (!drop_args.extent_inserted) {
2902  		ins.objectid = btrfs_ino(inode);
2903  		ins.offset = file_pos;
2904  		ins.type = BTRFS_EXTENT_DATA_KEY;
2905  
2906  		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2907  					      sizeof(*stack_fi));
2908  		if (ret)
2909  			goto out;
2910  	}
2911  	leaf = path->nodes[0];
2912  	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2913  	write_extent_buffer(leaf, stack_fi,
2914  			btrfs_item_ptr_offset(leaf, path->slots[0]),
2915  			sizeof(struct btrfs_file_extent_item));
2916  
2917  	btrfs_mark_buffer_dirty(trans, leaf);
2918  	btrfs_release_path(path);
2919  
2920  	/*
2921  	 * If we dropped an inline extent here, we know the range where it is
2922  	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2923  	 * number of bytes only for that range containing the inline extent.
2924  	 * The remaining of the range will be processed when clearning the
2925  	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2926  	 */
2927  	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2928  		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2929  
2930  		inline_size = drop_args.bytes_found - inline_size;
2931  		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2932  		drop_args.bytes_found -= inline_size;
2933  		num_bytes -= sectorsize;
2934  	}
2935  
2936  	if (update_inode_bytes)
2937  		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2938  
2939  	ins.objectid = disk_bytenr;
2940  	ins.offset = disk_num_bytes;
2941  	ins.type = BTRFS_EXTENT_ITEM_KEY;
2942  
2943  	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2944  	if (ret)
2945  		goto out;
2946  
2947  	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2948  					       file_pos - offset,
2949  					       qgroup_reserved, &ins);
2950  out:
2951  	btrfs_free_path(path);
2952  
2953  	return ret;
2954  }
2955  
2956  static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2957  					 u64 start, u64 len)
2958  {
2959  	struct btrfs_block_group *cache;
2960  
2961  	cache = btrfs_lookup_block_group(fs_info, start);
2962  	ASSERT(cache);
2963  
2964  	spin_lock(&cache->lock);
2965  	cache->delalloc_bytes -= len;
2966  	spin_unlock(&cache->lock);
2967  
2968  	btrfs_put_block_group(cache);
2969  }
2970  
2971  static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2972  					     struct btrfs_ordered_extent *oe)
2973  {
2974  	struct btrfs_file_extent_item stack_fi;
2975  	bool update_inode_bytes;
2976  	u64 num_bytes = oe->num_bytes;
2977  	u64 ram_bytes = oe->ram_bytes;
2978  
2979  	memset(&stack_fi, 0, sizeof(stack_fi));
2980  	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2981  	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2982  	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2983  						   oe->disk_num_bytes);
2984  	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
2985  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
2986  		num_bytes = oe->truncated_len;
2987  		ram_bytes = num_bytes;
2988  	}
2989  	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
2990  	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
2991  	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2992  	/* Encryption and other encoding is reserved and all 0 */
2993  
2994  	/*
2995  	 * For delalloc, when completing an ordered extent we update the inode's
2996  	 * bytes when clearing the range in the inode's io tree, so pass false
2997  	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
2998  	 * except if the ordered extent was truncated.
2999  	 */
3000  	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3001  			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3002  			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3003  
3004  	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3005  					   oe->file_offset, &stack_fi,
3006  					   update_inode_bytes, oe->qgroup_rsv);
3007  }
3008  
3009  /*
3010   * As ordered data IO finishes, this gets called so we can finish
3011   * an ordered extent if the range of bytes in the file it covers are
3012   * fully written.
3013   */
3014  int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3015  {
3016  	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3017  	struct btrfs_root *root = inode->root;
3018  	struct btrfs_fs_info *fs_info = root->fs_info;
3019  	struct btrfs_trans_handle *trans = NULL;
3020  	struct extent_io_tree *io_tree = &inode->io_tree;
3021  	struct extent_state *cached_state = NULL;
3022  	u64 start, end;
3023  	int compress_type = 0;
3024  	int ret = 0;
3025  	u64 logical_len = ordered_extent->num_bytes;
3026  	bool freespace_inode;
3027  	bool truncated = false;
3028  	bool clear_reserved_extent = true;
3029  	unsigned int clear_bits = EXTENT_DEFRAG;
3030  
3031  	start = ordered_extent->file_offset;
3032  	end = start + ordered_extent->num_bytes - 1;
3033  
3034  	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3035  	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3036  	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3037  	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3038  		clear_bits |= EXTENT_DELALLOC_NEW;
3039  
3040  	freespace_inode = btrfs_is_free_space_inode(inode);
3041  	if (!freespace_inode)
3042  		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3043  
3044  	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3045  		ret = -EIO;
3046  		goto out;
3047  	}
3048  
3049  	if (btrfs_is_zoned(fs_info))
3050  		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3051  					ordered_extent->disk_num_bytes);
3052  
3053  	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3054  		truncated = true;
3055  		logical_len = ordered_extent->truncated_len;
3056  		/* Truncated the entire extent, don't bother adding */
3057  		if (!logical_len)
3058  			goto out;
3059  	}
3060  
3061  	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3062  		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3063  
3064  		btrfs_inode_safe_disk_i_size_write(inode, 0);
3065  		if (freespace_inode)
3066  			trans = btrfs_join_transaction_spacecache(root);
3067  		else
3068  			trans = btrfs_join_transaction(root);
3069  		if (IS_ERR(trans)) {
3070  			ret = PTR_ERR(trans);
3071  			trans = NULL;
3072  			goto out;
3073  		}
3074  		trans->block_rsv = &inode->block_rsv;
3075  		ret = btrfs_update_inode_fallback(trans, root, inode);
3076  		if (ret) /* -ENOMEM or corruption */
3077  			btrfs_abort_transaction(trans, ret);
3078  		goto out;
3079  	}
3080  
3081  	clear_bits |= EXTENT_LOCKED;
3082  	lock_extent(io_tree, start, end, &cached_state);
3083  
3084  	if (freespace_inode)
3085  		trans = btrfs_join_transaction_spacecache(root);
3086  	else
3087  		trans = btrfs_join_transaction(root);
3088  	if (IS_ERR(trans)) {
3089  		ret = PTR_ERR(trans);
3090  		trans = NULL;
3091  		goto out;
3092  	}
3093  
3094  	trans->block_rsv = &inode->block_rsv;
3095  
3096  	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3097  		compress_type = ordered_extent->compress_type;
3098  	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3099  		BUG_ON(compress_type);
3100  		ret = btrfs_mark_extent_written(trans, inode,
3101  						ordered_extent->file_offset,
3102  						ordered_extent->file_offset +
3103  						logical_len);
3104  		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3105  						  ordered_extent->disk_num_bytes);
3106  	} else {
3107  		BUG_ON(root == fs_info->tree_root);
3108  		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3109  		if (!ret) {
3110  			clear_reserved_extent = false;
3111  			btrfs_release_delalloc_bytes(fs_info,
3112  						ordered_extent->disk_bytenr,
3113  						ordered_extent->disk_num_bytes);
3114  		}
3115  	}
3116  	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3117  			   ordered_extent->num_bytes, trans->transid);
3118  	if (ret < 0) {
3119  		btrfs_abort_transaction(trans, ret);
3120  		goto out;
3121  	}
3122  
3123  	ret = add_pending_csums(trans, &ordered_extent->list);
3124  	if (ret) {
3125  		btrfs_abort_transaction(trans, ret);
3126  		goto out;
3127  	}
3128  
3129  	/*
3130  	 * If this is a new delalloc range, clear its new delalloc flag to
3131  	 * update the inode's number of bytes. This needs to be done first
3132  	 * before updating the inode item.
3133  	 */
3134  	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3135  	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3136  		clear_extent_bit(&inode->io_tree, start, end,
3137  				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3138  				 &cached_state);
3139  
3140  	btrfs_inode_safe_disk_i_size_write(inode, 0);
3141  	ret = btrfs_update_inode_fallback(trans, root, inode);
3142  	if (ret) { /* -ENOMEM or corruption */
3143  		btrfs_abort_transaction(trans, ret);
3144  		goto out;
3145  	}
3146  	ret = 0;
3147  out:
3148  	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3149  			 &cached_state);
3150  
3151  	if (trans)
3152  		btrfs_end_transaction(trans);
3153  
3154  	if (ret || truncated) {
3155  		u64 unwritten_start = start;
3156  
3157  		/*
3158  		 * If we failed to finish this ordered extent for any reason we
3159  		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3160  		 * extent, and mark the inode with the error if it wasn't
3161  		 * already set.  Any error during writeback would have already
3162  		 * set the mapping error, so we need to set it if we're the ones
3163  		 * marking this ordered extent as failed.
3164  		 */
3165  		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3166  					     &ordered_extent->flags))
3167  			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3168  
3169  		if (truncated)
3170  			unwritten_start += logical_len;
3171  		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3172  
3173  		/*
3174  		 * Drop extent maps for the part of the extent we didn't write.
3175  		 *
3176  		 * We have an exception here for the free_space_inode, this is
3177  		 * because when we do btrfs_get_extent() on the free space inode
3178  		 * we will search the commit root.  If this is a new block group
3179  		 * we won't find anything, and we will trip over the assert in
3180  		 * writepage where we do ASSERT(em->block_start !=
3181  		 * EXTENT_MAP_HOLE).
3182  		 *
3183  		 * Theoretically we could also skip this for any NOCOW extent as
3184  		 * we don't mess with the extent map tree in the NOCOW case, but
3185  		 * for now simply skip this if we are the free space inode.
3186  		 */
3187  		if (!btrfs_is_free_space_inode(inode))
3188  			btrfs_drop_extent_map_range(inode, unwritten_start,
3189  						    end, false);
3190  
3191  		/*
3192  		 * If the ordered extent had an IOERR or something else went
3193  		 * wrong we need to return the space for this ordered extent
3194  		 * back to the allocator.  We only free the extent in the
3195  		 * truncated case if we didn't write out the extent at all.
3196  		 *
3197  		 * If we made it past insert_reserved_file_extent before we
3198  		 * errored out then we don't need to do this as the accounting
3199  		 * has already been done.
3200  		 */
3201  		if ((ret || !logical_len) &&
3202  		    clear_reserved_extent &&
3203  		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3204  		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3205  			/*
3206  			 * Discard the range before returning it back to the
3207  			 * free space pool
3208  			 */
3209  			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3210  				btrfs_discard_extent(fs_info,
3211  						ordered_extent->disk_bytenr,
3212  						ordered_extent->disk_num_bytes,
3213  						NULL);
3214  			btrfs_free_reserved_extent(fs_info,
3215  					ordered_extent->disk_bytenr,
3216  					ordered_extent->disk_num_bytes, 1);
3217  			/*
3218  			 * Actually free the qgroup rsv which was released when
3219  			 * the ordered extent was created.
3220  			 */
3221  			btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
3222  						  ordered_extent->qgroup_rsv,
3223  						  BTRFS_QGROUP_RSV_DATA);
3224  		}
3225  	}
3226  
3227  	/*
3228  	 * This needs to be done to make sure anybody waiting knows we are done
3229  	 * updating everything for this ordered extent.
3230  	 */
3231  	btrfs_remove_ordered_extent(inode, ordered_extent);
3232  
3233  	/* once for us */
3234  	btrfs_put_ordered_extent(ordered_extent);
3235  	/* once for the tree */
3236  	btrfs_put_ordered_extent(ordered_extent);
3237  
3238  	return ret;
3239  }
3240  
3241  int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3242  {
3243  	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
3244  	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
3245  		btrfs_finish_ordered_zoned(ordered);
3246  	return btrfs_finish_one_ordered(ordered);
3247  }
3248  
3249  /*
3250   * Verify the checksum for a single sector without any extra action that depend
3251   * on the type of I/O.
3252   */
3253  int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3254  			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
3255  {
3256  	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3257  	char *kaddr;
3258  
3259  	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3260  
3261  	shash->tfm = fs_info->csum_shash;
3262  
3263  	kaddr = kmap_local_page(page) + pgoff;
3264  	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3265  	kunmap_local(kaddr);
3266  
3267  	if (memcmp(csum, csum_expected, fs_info->csum_size))
3268  		return -EIO;
3269  	return 0;
3270  }
3271  
3272  /*
3273   * Verify the checksum of a single data sector.
3274   *
3275   * @bbio:	btrfs_io_bio which contains the csum
3276   * @dev:	device the sector is on
3277   * @bio_offset:	offset to the beginning of the bio (in bytes)
3278   * @bv:		bio_vec to check
3279   *
3280   * Check if the checksum on a data block is valid.  When a checksum mismatch is
3281   * detected, report the error and fill the corrupted range with zero.
3282   *
3283   * Return %true if the sector is ok or had no checksum to start with, else %false.
3284   */
3285  bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3286  			u32 bio_offset, struct bio_vec *bv)
3287  {
3288  	struct btrfs_inode *inode = bbio->inode;
3289  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3290  	u64 file_offset = bbio->file_offset + bio_offset;
3291  	u64 end = file_offset + bv->bv_len - 1;
3292  	u8 *csum_expected;
3293  	u8 csum[BTRFS_CSUM_SIZE];
3294  
3295  	ASSERT(bv->bv_len == fs_info->sectorsize);
3296  
3297  	if (!bbio->csum)
3298  		return true;
3299  
3300  	if (btrfs_is_data_reloc_root(inode->root) &&
3301  	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3302  			   1, NULL)) {
3303  		/* Skip the range without csum for data reloc inode */
3304  		clear_extent_bits(&inode->io_tree, file_offset, end,
3305  				  EXTENT_NODATASUM);
3306  		return true;
3307  	}
3308  
3309  	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3310  				fs_info->csum_size;
3311  	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3312  				    csum_expected))
3313  		goto zeroit;
3314  	return true;
3315  
3316  zeroit:
3317  	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3318  				    bbio->mirror_num);
3319  	if (dev)
3320  		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3321  	memzero_bvec(bv);
3322  	return false;
3323  }
3324  
3325  /*
3326   * btrfs_add_delayed_iput - perform a delayed iput on @inode
3327   *
3328   * @inode: The inode we want to perform iput on
3329   *
3330   * This function uses the generic vfs_inode::i_count to track whether we should
3331   * just decrement it (in case it's > 1) or if this is the last iput then link
3332   * the inode to the delayed iput machinery. Delayed iputs are processed at
3333   * transaction commit time/superblock commit/cleaner kthread.
3334   */
3335  void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3336  {
3337  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3338  	unsigned long flags;
3339  
3340  	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3341  		return;
3342  
3343  	atomic_inc(&fs_info->nr_delayed_iputs);
3344  	/*
3345  	 * Need to be irq safe here because we can be called from either an irq
3346  	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3347  	 * context.
3348  	 */
3349  	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3350  	ASSERT(list_empty(&inode->delayed_iput));
3351  	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3352  	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3353  	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3354  		wake_up_process(fs_info->cleaner_kthread);
3355  }
3356  
3357  static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3358  				    struct btrfs_inode *inode)
3359  {
3360  	list_del_init(&inode->delayed_iput);
3361  	spin_unlock_irq(&fs_info->delayed_iput_lock);
3362  	iput(&inode->vfs_inode);
3363  	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3364  		wake_up(&fs_info->delayed_iputs_wait);
3365  	spin_lock_irq(&fs_info->delayed_iput_lock);
3366  }
3367  
3368  static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3369  				   struct btrfs_inode *inode)
3370  {
3371  	if (!list_empty(&inode->delayed_iput)) {
3372  		spin_lock_irq(&fs_info->delayed_iput_lock);
3373  		if (!list_empty(&inode->delayed_iput))
3374  			run_delayed_iput_locked(fs_info, inode);
3375  		spin_unlock_irq(&fs_info->delayed_iput_lock);
3376  	}
3377  }
3378  
3379  void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3380  {
3381  	/*
3382  	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3383  	 * calls btrfs_add_delayed_iput() and that needs to lock
3384  	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3385  	 * prevent a deadlock.
3386  	 */
3387  	spin_lock_irq(&fs_info->delayed_iput_lock);
3388  	while (!list_empty(&fs_info->delayed_iputs)) {
3389  		struct btrfs_inode *inode;
3390  
3391  		inode = list_first_entry(&fs_info->delayed_iputs,
3392  				struct btrfs_inode, delayed_iput);
3393  		run_delayed_iput_locked(fs_info, inode);
3394  		if (need_resched()) {
3395  			spin_unlock_irq(&fs_info->delayed_iput_lock);
3396  			cond_resched();
3397  			spin_lock_irq(&fs_info->delayed_iput_lock);
3398  		}
3399  	}
3400  	spin_unlock_irq(&fs_info->delayed_iput_lock);
3401  }
3402  
3403  /*
3404   * Wait for flushing all delayed iputs
3405   *
3406   * @fs_info:  the filesystem
3407   *
3408   * This will wait on any delayed iputs that are currently running with KILLABLE
3409   * set.  Once they are all done running we will return, unless we are killed in
3410   * which case we return EINTR. This helps in user operations like fallocate etc
3411   * that might get blocked on the iputs.
3412   *
3413   * Return EINTR if we were killed, 0 if nothing's pending
3414   */
3415  int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3416  {
3417  	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3418  			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3419  	if (ret)
3420  		return -EINTR;
3421  	return 0;
3422  }
3423  
3424  /*
3425   * This creates an orphan entry for the given inode in case something goes wrong
3426   * in the middle of an unlink.
3427   */
3428  int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3429  		     struct btrfs_inode *inode)
3430  {
3431  	int ret;
3432  
3433  	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3434  	if (ret && ret != -EEXIST) {
3435  		btrfs_abort_transaction(trans, ret);
3436  		return ret;
3437  	}
3438  
3439  	return 0;
3440  }
3441  
3442  /*
3443   * We have done the delete so we can go ahead and remove the orphan item for
3444   * this particular inode.
3445   */
3446  static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3447  			    struct btrfs_inode *inode)
3448  {
3449  	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3450  }
3451  
3452  /*
3453   * this cleans up any orphans that may be left on the list from the last use
3454   * of this root.
3455   */
3456  int btrfs_orphan_cleanup(struct btrfs_root *root)
3457  {
3458  	struct btrfs_fs_info *fs_info = root->fs_info;
3459  	struct btrfs_path *path;
3460  	struct extent_buffer *leaf;
3461  	struct btrfs_key key, found_key;
3462  	struct btrfs_trans_handle *trans;
3463  	struct inode *inode;
3464  	u64 last_objectid = 0;
3465  	int ret = 0, nr_unlink = 0;
3466  
3467  	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3468  		return 0;
3469  
3470  	path = btrfs_alloc_path();
3471  	if (!path) {
3472  		ret = -ENOMEM;
3473  		goto out;
3474  	}
3475  	path->reada = READA_BACK;
3476  
3477  	key.objectid = BTRFS_ORPHAN_OBJECTID;
3478  	key.type = BTRFS_ORPHAN_ITEM_KEY;
3479  	key.offset = (u64)-1;
3480  
3481  	while (1) {
3482  		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3483  		if (ret < 0)
3484  			goto out;
3485  
3486  		/*
3487  		 * if ret == 0 means we found what we were searching for, which
3488  		 * is weird, but possible, so only screw with path if we didn't
3489  		 * find the key and see if we have stuff that matches
3490  		 */
3491  		if (ret > 0) {
3492  			ret = 0;
3493  			if (path->slots[0] == 0)
3494  				break;
3495  			path->slots[0]--;
3496  		}
3497  
3498  		/* pull out the item */
3499  		leaf = path->nodes[0];
3500  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3501  
3502  		/* make sure the item matches what we want */
3503  		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3504  			break;
3505  		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3506  			break;
3507  
3508  		/* release the path since we're done with it */
3509  		btrfs_release_path(path);
3510  
3511  		/*
3512  		 * this is where we are basically btrfs_lookup, without the
3513  		 * crossing root thing.  we store the inode number in the
3514  		 * offset of the orphan item.
3515  		 */
3516  
3517  		if (found_key.offset == last_objectid) {
3518  			/*
3519  			 * We found the same inode as before. This means we were
3520  			 * not able to remove its items via eviction triggered
3521  			 * by an iput(). A transaction abort may have happened,
3522  			 * due to -ENOSPC for example, so try to grab the error
3523  			 * that lead to a transaction abort, if any.
3524  			 */
3525  			btrfs_err(fs_info,
3526  				  "Error removing orphan entry, stopping orphan cleanup");
3527  			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3528  			goto out;
3529  		}
3530  
3531  		last_objectid = found_key.offset;
3532  
3533  		found_key.objectid = found_key.offset;
3534  		found_key.type = BTRFS_INODE_ITEM_KEY;
3535  		found_key.offset = 0;
3536  		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3537  		if (IS_ERR(inode)) {
3538  			ret = PTR_ERR(inode);
3539  			inode = NULL;
3540  			if (ret != -ENOENT)
3541  				goto out;
3542  		}
3543  
3544  		if (!inode && root == fs_info->tree_root) {
3545  			struct btrfs_root *dead_root;
3546  			int is_dead_root = 0;
3547  
3548  			/*
3549  			 * This is an orphan in the tree root. Currently these
3550  			 * could come from 2 sources:
3551  			 *  a) a root (snapshot/subvolume) deletion in progress
3552  			 *  b) a free space cache inode
3553  			 * We need to distinguish those two, as the orphan item
3554  			 * for a root must not get deleted before the deletion
3555  			 * of the snapshot/subvolume's tree completes.
3556  			 *
3557  			 * btrfs_find_orphan_roots() ran before us, which has
3558  			 * found all deleted roots and loaded them into
3559  			 * fs_info->fs_roots_radix. So here we can find if an
3560  			 * orphan item corresponds to a deleted root by looking
3561  			 * up the root from that radix tree.
3562  			 */
3563  
3564  			spin_lock(&fs_info->fs_roots_radix_lock);
3565  			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3566  							 (unsigned long)found_key.objectid);
3567  			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3568  				is_dead_root = 1;
3569  			spin_unlock(&fs_info->fs_roots_radix_lock);
3570  
3571  			if (is_dead_root) {
3572  				/* prevent this orphan from being found again */
3573  				key.offset = found_key.objectid - 1;
3574  				continue;
3575  			}
3576  
3577  		}
3578  
3579  		/*
3580  		 * If we have an inode with links, there are a couple of
3581  		 * possibilities:
3582  		 *
3583  		 * 1. We were halfway through creating fsverity metadata for the
3584  		 * file. In that case, the orphan item represents incomplete
3585  		 * fsverity metadata which must be cleaned up with
3586  		 * btrfs_drop_verity_items and deleting the orphan item.
3587  
3588  		 * 2. Old kernels (before v3.12) used to create an
3589  		 * orphan item for truncate indicating that there were possibly
3590  		 * extent items past i_size that needed to be deleted. In v3.12,
3591  		 * truncate was changed to update i_size in sync with the extent
3592  		 * items, but the (useless) orphan item was still created. Since
3593  		 * v4.18, we don't create the orphan item for truncate at all.
3594  		 *
3595  		 * So, this item could mean that we need to do a truncate, but
3596  		 * only if this filesystem was last used on a pre-v3.12 kernel
3597  		 * and was not cleanly unmounted. The odds of that are quite
3598  		 * slim, and it's a pain to do the truncate now, so just delete
3599  		 * the orphan item.
3600  		 *
3601  		 * It's also possible that this orphan item was supposed to be
3602  		 * deleted but wasn't. The inode number may have been reused,
3603  		 * but either way, we can delete the orphan item.
3604  		 */
3605  		if (!inode || inode->i_nlink) {
3606  			if (inode) {
3607  				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3608  				iput(inode);
3609  				inode = NULL;
3610  				if (ret)
3611  					goto out;
3612  			}
3613  			trans = btrfs_start_transaction(root, 1);
3614  			if (IS_ERR(trans)) {
3615  				ret = PTR_ERR(trans);
3616  				goto out;
3617  			}
3618  			btrfs_debug(fs_info, "auto deleting %Lu",
3619  				    found_key.objectid);
3620  			ret = btrfs_del_orphan_item(trans, root,
3621  						    found_key.objectid);
3622  			btrfs_end_transaction(trans);
3623  			if (ret)
3624  				goto out;
3625  			continue;
3626  		}
3627  
3628  		nr_unlink++;
3629  
3630  		/* this will do delete_inode and everything for us */
3631  		iput(inode);
3632  	}
3633  	/* release the path since we're done with it */
3634  	btrfs_release_path(path);
3635  
3636  	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3637  		trans = btrfs_join_transaction(root);
3638  		if (!IS_ERR(trans))
3639  			btrfs_end_transaction(trans);
3640  	}
3641  
3642  	if (nr_unlink)
3643  		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3644  
3645  out:
3646  	if (ret)
3647  		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3648  	btrfs_free_path(path);
3649  	return ret;
3650  }
3651  
3652  /*
3653   * very simple check to peek ahead in the leaf looking for xattrs.  If we
3654   * don't find any xattrs, we know there can't be any acls.
3655   *
3656   * slot is the slot the inode is in, objectid is the objectid of the inode
3657   */
3658  static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3659  					  int slot, u64 objectid,
3660  					  int *first_xattr_slot)
3661  {
3662  	u32 nritems = btrfs_header_nritems(leaf);
3663  	struct btrfs_key found_key;
3664  	static u64 xattr_access = 0;
3665  	static u64 xattr_default = 0;
3666  	int scanned = 0;
3667  
3668  	if (!xattr_access) {
3669  		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3670  					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3671  		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3672  					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3673  	}
3674  
3675  	slot++;
3676  	*first_xattr_slot = -1;
3677  	while (slot < nritems) {
3678  		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3679  
3680  		/* we found a different objectid, there must not be acls */
3681  		if (found_key.objectid != objectid)
3682  			return 0;
3683  
3684  		/* we found an xattr, assume we've got an acl */
3685  		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3686  			if (*first_xattr_slot == -1)
3687  				*first_xattr_slot = slot;
3688  			if (found_key.offset == xattr_access ||
3689  			    found_key.offset == xattr_default)
3690  				return 1;
3691  		}
3692  
3693  		/*
3694  		 * we found a key greater than an xattr key, there can't
3695  		 * be any acls later on
3696  		 */
3697  		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3698  			return 0;
3699  
3700  		slot++;
3701  		scanned++;
3702  
3703  		/*
3704  		 * it goes inode, inode backrefs, xattrs, extents,
3705  		 * so if there are a ton of hard links to an inode there can
3706  		 * be a lot of backrefs.  Don't waste time searching too hard,
3707  		 * this is just an optimization
3708  		 */
3709  		if (scanned >= 8)
3710  			break;
3711  	}
3712  	/* we hit the end of the leaf before we found an xattr or
3713  	 * something larger than an xattr.  We have to assume the inode
3714  	 * has acls
3715  	 */
3716  	if (*first_xattr_slot == -1)
3717  		*first_xattr_slot = slot;
3718  	return 1;
3719  }
3720  
3721  /*
3722   * read an inode from the btree into the in-memory inode
3723   */
3724  static int btrfs_read_locked_inode(struct inode *inode,
3725  				   struct btrfs_path *in_path)
3726  {
3727  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3728  	struct btrfs_path *path = in_path;
3729  	struct extent_buffer *leaf;
3730  	struct btrfs_inode_item *inode_item;
3731  	struct btrfs_root *root = BTRFS_I(inode)->root;
3732  	struct btrfs_key location;
3733  	unsigned long ptr;
3734  	int maybe_acls;
3735  	u32 rdev;
3736  	int ret;
3737  	bool filled = false;
3738  	int first_xattr_slot;
3739  
3740  	ret = btrfs_fill_inode(inode, &rdev);
3741  	if (!ret)
3742  		filled = true;
3743  
3744  	if (!path) {
3745  		path = btrfs_alloc_path();
3746  		if (!path)
3747  			return -ENOMEM;
3748  	}
3749  
3750  	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3751  
3752  	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3753  	if (ret) {
3754  		if (path != in_path)
3755  			btrfs_free_path(path);
3756  		return ret;
3757  	}
3758  
3759  	leaf = path->nodes[0];
3760  
3761  	if (filled)
3762  		goto cache_index;
3763  
3764  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3765  				    struct btrfs_inode_item);
3766  	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3767  	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3768  	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3769  	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3770  	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3771  	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3772  			round_up(i_size_read(inode), fs_info->sectorsize));
3773  
3774  	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3775  	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3776  
3777  	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3778  	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3779  
3780  	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3781  			btrfs_timespec_nsec(leaf, &inode_item->ctime));
3782  
3783  	BTRFS_I(inode)->i_otime.tv_sec =
3784  		btrfs_timespec_sec(leaf, &inode_item->otime);
3785  	BTRFS_I(inode)->i_otime.tv_nsec =
3786  		btrfs_timespec_nsec(leaf, &inode_item->otime);
3787  
3788  	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3789  	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3790  	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3791  
3792  	inode_set_iversion_queried(inode,
3793  				   btrfs_inode_sequence(leaf, inode_item));
3794  	inode->i_generation = BTRFS_I(inode)->generation;
3795  	inode->i_rdev = 0;
3796  	rdev = btrfs_inode_rdev(leaf, inode_item);
3797  
3798  	BTRFS_I(inode)->index_cnt = (u64)-1;
3799  	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3800  				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3801  
3802  cache_index:
3803  	/*
3804  	 * If we were modified in the current generation and evicted from memory
3805  	 * and then re-read we need to do a full sync since we don't have any
3806  	 * idea about which extents were modified before we were evicted from
3807  	 * cache.
3808  	 *
3809  	 * This is required for both inode re-read from disk and delayed inode
3810  	 * in delayed_nodes_tree.
3811  	 */
3812  	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3813  		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3814  			&BTRFS_I(inode)->runtime_flags);
3815  
3816  	/*
3817  	 * We don't persist the id of the transaction where an unlink operation
3818  	 * against the inode was last made. So here we assume the inode might
3819  	 * have been evicted, and therefore the exact value of last_unlink_trans
3820  	 * lost, and set it to last_trans to avoid metadata inconsistencies
3821  	 * between the inode and its parent if the inode is fsync'ed and the log
3822  	 * replayed. For example, in the scenario:
3823  	 *
3824  	 * touch mydir/foo
3825  	 * ln mydir/foo mydir/bar
3826  	 * sync
3827  	 * unlink mydir/bar
3828  	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3829  	 * xfs_io -c fsync mydir/foo
3830  	 * <power failure>
3831  	 * mount fs, triggers fsync log replay
3832  	 *
3833  	 * We must make sure that when we fsync our inode foo we also log its
3834  	 * parent inode, otherwise after log replay the parent still has the
3835  	 * dentry with the "bar" name but our inode foo has a link count of 1
3836  	 * and doesn't have an inode ref with the name "bar" anymore.
3837  	 *
3838  	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3839  	 * but it guarantees correctness at the expense of occasional full
3840  	 * transaction commits on fsync if our inode is a directory, or if our
3841  	 * inode is not a directory, logging its parent unnecessarily.
3842  	 */
3843  	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3844  
3845  	/*
3846  	 * Same logic as for last_unlink_trans. We don't persist the generation
3847  	 * of the last transaction where this inode was used for a reflink
3848  	 * operation, so after eviction and reloading the inode we must be
3849  	 * pessimistic and assume the last transaction that modified the inode.
3850  	 */
3851  	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3852  
3853  	path->slots[0]++;
3854  	if (inode->i_nlink != 1 ||
3855  	    path->slots[0] >= btrfs_header_nritems(leaf))
3856  		goto cache_acl;
3857  
3858  	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3859  	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3860  		goto cache_acl;
3861  
3862  	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3863  	if (location.type == BTRFS_INODE_REF_KEY) {
3864  		struct btrfs_inode_ref *ref;
3865  
3866  		ref = (struct btrfs_inode_ref *)ptr;
3867  		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3868  	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3869  		struct btrfs_inode_extref *extref;
3870  
3871  		extref = (struct btrfs_inode_extref *)ptr;
3872  		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3873  								     extref);
3874  	}
3875  cache_acl:
3876  	/*
3877  	 * try to precache a NULL acl entry for files that don't have
3878  	 * any xattrs or acls
3879  	 */
3880  	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3881  			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3882  	if (first_xattr_slot != -1) {
3883  		path->slots[0] = first_xattr_slot;
3884  		ret = btrfs_load_inode_props(inode, path);
3885  		if (ret)
3886  			btrfs_err(fs_info,
3887  				  "error loading props for ino %llu (root %llu): %d",
3888  				  btrfs_ino(BTRFS_I(inode)),
3889  				  root->root_key.objectid, ret);
3890  	}
3891  	if (path != in_path)
3892  		btrfs_free_path(path);
3893  
3894  	if (!maybe_acls)
3895  		cache_no_acl(inode);
3896  
3897  	switch (inode->i_mode & S_IFMT) {
3898  	case S_IFREG:
3899  		inode->i_mapping->a_ops = &btrfs_aops;
3900  		inode->i_fop = &btrfs_file_operations;
3901  		inode->i_op = &btrfs_file_inode_operations;
3902  		break;
3903  	case S_IFDIR:
3904  		inode->i_fop = &btrfs_dir_file_operations;
3905  		inode->i_op = &btrfs_dir_inode_operations;
3906  		break;
3907  	case S_IFLNK:
3908  		inode->i_op = &btrfs_symlink_inode_operations;
3909  		inode_nohighmem(inode);
3910  		inode->i_mapping->a_ops = &btrfs_aops;
3911  		break;
3912  	default:
3913  		inode->i_op = &btrfs_special_inode_operations;
3914  		init_special_inode(inode, inode->i_mode, rdev);
3915  		break;
3916  	}
3917  
3918  	btrfs_sync_inode_flags_to_i_flags(inode);
3919  	return 0;
3920  }
3921  
3922  /*
3923   * given a leaf and an inode, copy the inode fields into the leaf
3924   */
3925  static void fill_inode_item(struct btrfs_trans_handle *trans,
3926  			    struct extent_buffer *leaf,
3927  			    struct btrfs_inode_item *item,
3928  			    struct inode *inode)
3929  {
3930  	struct btrfs_map_token token;
3931  	u64 flags;
3932  
3933  	btrfs_init_map_token(&token, leaf);
3934  
3935  	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3936  	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3937  	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3938  	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3939  	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3940  
3941  	btrfs_set_token_timespec_sec(&token, &item->atime,
3942  				     inode->i_atime.tv_sec);
3943  	btrfs_set_token_timespec_nsec(&token, &item->atime,
3944  				      inode->i_atime.tv_nsec);
3945  
3946  	btrfs_set_token_timespec_sec(&token, &item->mtime,
3947  				     inode->i_mtime.tv_sec);
3948  	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3949  				      inode->i_mtime.tv_nsec);
3950  
3951  	btrfs_set_token_timespec_sec(&token, &item->ctime,
3952  				     inode_get_ctime(inode).tv_sec);
3953  	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3954  				      inode_get_ctime(inode).tv_nsec);
3955  
3956  	btrfs_set_token_timespec_sec(&token, &item->otime,
3957  				     BTRFS_I(inode)->i_otime.tv_sec);
3958  	btrfs_set_token_timespec_nsec(&token, &item->otime,
3959  				      BTRFS_I(inode)->i_otime.tv_nsec);
3960  
3961  	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3962  	btrfs_set_token_inode_generation(&token, item,
3963  					 BTRFS_I(inode)->generation);
3964  	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3965  	btrfs_set_token_inode_transid(&token, item, trans->transid);
3966  	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3967  	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3968  					  BTRFS_I(inode)->ro_flags);
3969  	btrfs_set_token_inode_flags(&token, item, flags);
3970  	btrfs_set_token_inode_block_group(&token, item, 0);
3971  }
3972  
3973  /*
3974   * copy everything in the in-memory inode into the btree.
3975   */
3976  static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3977  				struct btrfs_root *root,
3978  				struct btrfs_inode *inode)
3979  {
3980  	struct btrfs_inode_item *inode_item;
3981  	struct btrfs_path *path;
3982  	struct extent_buffer *leaf;
3983  	int ret;
3984  
3985  	path = btrfs_alloc_path();
3986  	if (!path)
3987  		return -ENOMEM;
3988  
3989  	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3990  	if (ret) {
3991  		if (ret > 0)
3992  			ret = -ENOENT;
3993  		goto failed;
3994  	}
3995  
3996  	leaf = path->nodes[0];
3997  	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3998  				    struct btrfs_inode_item);
3999  
4000  	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4001  	btrfs_mark_buffer_dirty(trans, leaf);
4002  	btrfs_set_inode_last_trans(trans, inode);
4003  	ret = 0;
4004  failed:
4005  	btrfs_free_path(path);
4006  	return ret;
4007  }
4008  
4009  /*
4010   * copy everything in the in-memory inode into the btree.
4011   */
4012  noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4013  				struct btrfs_root *root,
4014  				struct btrfs_inode *inode)
4015  {
4016  	struct btrfs_fs_info *fs_info = root->fs_info;
4017  	int ret;
4018  
4019  	/*
4020  	 * If the inode is a free space inode, we can deadlock during commit
4021  	 * if we put it into the delayed code.
4022  	 *
4023  	 * The data relocation inode should also be directly updated
4024  	 * without delay
4025  	 */
4026  	if (!btrfs_is_free_space_inode(inode)
4027  	    && !btrfs_is_data_reloc_root(root)
4028  	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4029  		btrfs_update_root_times(trans, root);
4030  
4031  		ret = btrfs_delayed_update_inode(trans, root, inode);
4032  		if (!ret)
4033  			btrfs_set_inode_last_trans(trans, inode);
4034  		return ret;
4035  	}
4036  
4037  	return btrfs_update_inode_item(trans, root, inode);
4038  }
4039  
4040  int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4041  				struct btrfs_root *root, struct btrfs_inode *inode)
4042  {
4043  	int ret;
4044  
4045  	ret = btrfs_update_inode(trans, root, inode);
4046  	if (ret == -ENOSPC)
4047  		return btrfs_update_inode_item(trans, root, inode);
4048  	return ret;
4049  }
4050  
4051  /*
4052   * unlink helper that gets used here in inode.c and in the tree logging
4053   * recovery code.  It remove a link in a directory with a given name, and
4054   * also drops the back refs in the inode to the directory
4055   */
4056  static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4057  				struct btrfs_inode *dir,
4058  				struct btrfs_inode *inode,
4059  				const struct fscrypt_str *name,
4060  				struct btrfs_rename_ctx *rename_ctx)
4061  {
4062  	struct btrfs_root *root = dir->root;
4063  	struct btrfs_fs_info *fs_info = root->fs_info;
4064  	struct btrfs_path *path;
4065  	int ret = 0;
4066  	struct btrfs_dir_item *di;
4067  	u64 index;
4068  	u64 ino = btrfs_ino(inode);
4069  	u64 dir_ino = btrfs_ino(dir);
4070  
4071  	path = btrfs_alloc_path();
4072  	if (!path) {
4073  		ret = -ENOMEM;
4074  		goto out;
4075  	}
4076  
4077  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4078  	if (IS_ERR_OR_NULL(di)) {
4079  		ret = di ? PTR_ERR(di) : -ENOENT;
4080  		goto err;
4081  	}
4082  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4083  	if (ret)
4084  		goto err;
4085  	btrfs_release_path(path);
4086  
4087  	/*
4088  	 * If we don't have dir index, we have to get it by looking up
4089  	 * the inode ref, since we get the inode ref, remove it directly,
4090  	 * it is unnecessary to do delayed deletion.
4091  	 *
4092  	 * But if we have dir index, needn't search inode ref to get it.
4093  	 * Since the inode ref is close to the inode item, it is better
4094  	 * that we delay to delete it, and just do this deletion when
4095  	 * we update the inode item.
4096  	 */
4097  	if (inode->dir_index) {
4098  		ret = btrfs_delayed_delete_inode_ref(inode);
4099  		if (!ret) {
4100  			index = inode->dir_index;
4101  			goto skip_backref;
4102  		}
4103  	}
4104  
4105  	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4106  	if (ret) {
4107  		btrfs_info(fs_info,
4108  			"failed to delete reference to %.*s, inode %llu parent %llu",
4109  			name->len, name->name, ino, dir_ino);
4110  		btrfs_abort_transaction(trans, ret);
4111  		goto err;
4112  	}
4113  skip_backref:
4114  	if (rename_ctx)
4115  		rename_ctx->index = index;
4116  
4117  	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4118  	if (ret) {
4119  		btrfs_abort_transaction(trans, ret);
4120  		goto err;
4121  	}
4122  
4123  	/*
4124  	 * If we are in a rename context, we don't need to update anything in the
4125  	 * log. That will be done later during the rename by btrfs_log_new_name().
4126  	 * Besides that, doing it here would only cause extra unnecessary btree
4127  	 * operations on the log tree, increasing latency for applications.
4128  	 */
4129  	if (!rename_ctx) {
4130  		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4131  		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4132  	}
4133  
4134  	/*
4135  	 * If we have a pending delayed iput we could end up with the final iput
4136  	 * being run in btrfs-cleaner context.  If we have enough of these built
4137  	 * up we can end up burning a lot of time in btrfs-cleaner without any
4138  	 * way to throttle the unlinks.  Since we're currently holding a ref on
4139  	 * the inode we can run the delayed iput here without any issues as the
4140  	 * final iput won't be done until after we drop the ref we're currently
4141  	 * holding.
4142  	 */
4143  	btrfs_run_delayed_iput(fs_info, inode);
4144  err:
4145  	btrfs_free_path(path);
4146  	if (ret)
4147  		goto out;
4148  
4149  	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4150  	inode_inc_iversion(&inode->vfs_inode);
4151  	inode_set_ctime_current(&inode->vfs_inode);
4152  	inode_inc_iversion(&dir->vfs_inode);
4153  	inode_set_ctime_current(&inode->vfs_inode);
4154  	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4155  	ret = btrfs_update_inode(trans, root, dir);
4156  out:
4157  	return ret;
4158  }
4159  
4160  int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4161  		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4162  		       const struct fscrypt_str *name)
4163  {
4164  	int ret;
4165  
4166  	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4167  	if (!ret) {
4168  		drop_nlink(&inode->vfs_inode);
4169  		ret = btrfs_update_inode(trans, inode->root, inode);
4170  	}
4171  	return ret;
4172  }
4173  
4174  /*
4175   * helper to start transaction for unlink and rmdir.
4176   *
4177   * unlink and rmdir are special in btrfs, they do not always free space, so
4178   * if we cannot make our reservations the normal way try and see if there is
4179   * plenty of slack room in the global reserve to migrate, otherwise we cannot
4180   * allow the unlink to occur.
4181   */
4182  static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4183  {
4184  	struct btrfs_root *root = dir->root;
4185  
4186  	return btrfs_start_transaction_fallback_global_rsv(root,
4187  						   BTRFS_UNLINK_METADATA_UNITS);
4188  }
4189  
4190  static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4191  {
4192  	struct btrfs_trans_handle *trans;
4193  	struct inode *inode = d_inode(dentry);
4194  	int ret;
4195  	struct fscrypt_name fname;
4196  
4197  	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4198  	if (ret)
4199  		return ret;
4200  
4201  	/* This needs to handle no-key deletions later on */
4202  
4203  	trans = __unlink_start_trans(BTRFS_I(dir));
4204  	if (IS_ERR(trans)) {
4205  		ret = PTR_ERR(trans);
4206  		goto fscrypt_free;
4207  	}
4208  
4209  	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4210  				false);
4211  
4212  	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4213  				 &fname.disk_name);
4214  	if (ret)
4215  		goto end_trans;
4216  
4217  	if (inode->i_nlink == 0) {
4218  		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4219  		if (ret)
4220  			goto end_trans;
4221  	}
4222  
4223  end_trans:
4224  	btrfs_end_transaction(trans);
4225  	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4226  fscrypt_free:
4227  	fscrypt_free_filename(&fname);
4228  	return ret;
4229  }
4230  
4231  static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4232  			       struct btrfs_inode *dir, struct dentry *dentry)
4233  {
4234  	struct btrfs_root *root = dir->root;
4235  	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4236  	struct btrfs_path *path;
4237  	struct extent_buffer *leaf;
4238  	struct btrfs_dir_item *di;
4239  	struct btrfs_key key;
4240  	u64 index;
4241  	int ret;
4242  	u64 objectid;
4243  	u64 dir_ino = btrfs_ino(dir);
4244  	struct fscrypt_name fname;
4245  
4246  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4247  	if (ret)
4248  		return ret;
4249  
4250  	/* This needs to handle no-key deletions later on */
4251  
4252  	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4253  		objectid = inode->root->root_key.objectid;
4254  	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4255  		objectid = inode->location.objectid;
4256  	} else {
4257  		WARN_ON(1);
4258  		fscrypt_free_filename(&fname);
4259  		return -EINVAL;
4260  	}
4261  
4262  	path = btrfs_alloc_path();
4263  	if (!path) {
4264  		ret = -ENOMEM;
4265  		goto out;
4266  	}
4267  
4268  	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4269  				   &fname.disk_name, -1);
4270  	if (IS_ERR_OR_NULL(di)) {
4271  		ret = di ? PTR_ERR(di) : -ENOENT;
4272  		goto out;
4273  	}
4274  
4275  	leaf = path->nodes[0];
4276  	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4277  	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4278  	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4279  	if (ret) {
4280  		btrfs_abort_transaction(trans, ret);
4281  		goto out;
4282  	}
4283  	btrfs_release_path(path);
4284  
4285  	/*
4286  	 * This is a placeholder inode for a subvolume we didn't have a
4287  	 * reference to at the time of the snapshot creation.  In the meantime
4288  	 * we could have renamed the real subvol link into our snapshot, so
4289  	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4290  	 * Instead simply lookup the dir_index_item for this entry so we can
4291  	 * remove it.  Otherwise we know we have a ref to the root and we can
4292  	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4293  	 */
4294  	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4295  		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4296  		if (IS_ERR(di)) {
4297  			ret = PTR_ERR(di);
4298  			btrfs_abort_transaction(trans, ret);
4299  			goto out;
4300  		}
4301  
4302  		leaf = path->nodes[0];
4303  		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4304  		index = key.offset;
4305  		btrfs_release_path(path);
4306  	} else {
4307  		ret = btrfs_del_root_ref(trans, objectid,
4308  					 root->root_key.objectid, dir_ino,
4309  					 &index, &fname.disk_name);
4310  		if (ret) {
4311  			btrfs_abort_transaction(trans, ret);
4312  			goto out;
4313  		}
4314  	}
4315  
4316  	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4317  	if (ret) {
4318  		btrfs_abort_transaction(trans, ret);
4319  		goto out;
4320  	}
4321  
4322  	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4323  	inode_inc_iversion(&dir->vfs_inode);
4324  	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4325  	ret = btrfs_update_inode_fallback(trans, root, dir);
4326  	if (ret)
4327  		btrfs_abort_transaction(trans, ret);
4328  out:
4329  	btrfs_free_path(path);
4330  	fscrypt_free_filename(&fname);
4331  	return ret;
4332  }
4333  
4334  /*
4335   * Helper to check if the subvolume references other subvolumes or if it's
4336   * default.
4337   */
4338  static noinline int may_destroy_subvol(struct btrfs_root *root)
4339  {
4340  	struct btrfs_fs_info *fs_info = root->fs_info;
4341  	struct btrfs_path *path;
4342  	struct btrfs_dir_item *di;
4343  	struct btrfs_key key;
4344  	struct fscrypt_str name = FSTR_INIT("default", 7);
4345  	u64 dir_id;
4346  	int ret;
4347  
4348  	path = btrfs_alloc_path();
4349  	if (!path)
4350  		return -ENOMEM;
4351  
4352  	/* Make sure this root isn't set as the default subvol */
4353  	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4354  	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4355  				   dir_id, &name, 0);
4356  	if (di && !IS_ERR(di)) {
4357  		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4358  		if (key.objectid == root->root_key.objectid) {
4359  			ret = -EPERM;
4360  			btrfs_err(fs_info,
4361  				  "deleting default subvolume %llu is not allowed",
4362  				  key.objectid);
4363  			goto out;
4364  		}
4365  		btrfs_release_path(path);
4366  	}
4367  
4368  	key.objectid = root->root_key.objectid;
4369  	key.type = BTRFS_ROOT_REF_KEY;
4370  	key.offset = (u64)-1;
4371  
4372  	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4373  	if (ret < 0)
4374  		goto out;
4375  	if (ret == 0) {
4376  		/*
4377  		 * Key with offset -1 found, there would have to exist a root
4378  		 * with such id, but this is out of valid range.
4379  		 */
4380  		ret = -EUCLEAN;
4381  		goto out;
4382  	}
4383  
4384  	ret = 0;
4385  	if (path->slots[0] > 0) {
4386  		path->slots[0]--;
4387  		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4388  		if (key.objectid == root->root_key.objectid &&
4389  		    key.type == BTRFS_ROOT_REF_KEY)
4390  			ret = -ENOTEMPTY;
4391  	}
4392  out:
4393  	btrfs_free_path(path);
4394  	return ret;
4395  }
4396  
4397  /* Delete all dentries for inodes belonging to the root */
4398  static void btrfs_prune_dentries(struct btrfs_root *root)
4399  {
4400  	struct btrfs_fs_info *fs_info = root->fs_info;
4401  	struct rb_node *node;
4402  	struct rb_node *prev;
4403  	struct btrfs_inode *entry;
4404  	struct inode *inode;
4405  	u64 objectid = 0;
4406  
4407  	if (!BTRFS_FS_ERROR(fs_info))
4408  		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4409  
4410  	spin_lock(&root->inode_lock);
4411  again:
4412  	node = root->inode_tree.rb_node;
4413  	prev = NULL;
4414  	while (node) {
4415  		prev = node;
4416  		entry = rb_entry(node, struct btrfs_inode, rb_node);
4417  
4418  		if (objectid < btrfs_ino(entry))
4419  			node = node->rb_left;
4420  		else if (objectid > btrfs_ino(entry))
4421  			node = node->rb_right;
4422  		else
4423  			break;
4424  	}
4425  	if (!node) {
4426  		while (prev) {
4427  			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4428  			if (objectid <= btrfs_ino(entry)) {
4429  				node = prev;
4430  				break;
4431  			}
4432  			prev = rb_next(prev);
4433  		}
4434  	}
4435  	while (node) {
4436  		entry = rb_entry(node, struct btrfs_inode, rb_node);
4437  		objectid = btrfs_ino(entry) + 1;
4438  		inode = igrab(&entry->vfs_inode);
4439  		if (inode) {
4440  			spin_unlock(&root->inode_lock);
4441  			if (atomic_read(&inode->i_count) > 1)
4442  				d_prune_aliases(inode);
4443  			/*
4444  			 * btrfs_drop_inode will have it removed from the inode
4445  			 * cache when its usage count hits zero.
4446  			 */
4447  			iput(inode);
4448  			cond_resched();
4449  			spin_lock(&root->inode_lock);
4450  			goto again;
4451  		}
4452  
4453  		if (cond_resched_lock(&root->inode_lock))
4454  			goto again;
4455  
4456  		node = rb_next(node);
4457  	}
4458  	spin_unlock(&root->inode_lock);
4459  }
4460  
4461  int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4462  {
4463  	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4464  	struct btrfs_root *root = dir->root;
4465  	struct inode *inode = d_inode(dentry);
4466  	struct btrfs_root *dest = BTRFS_I(inode)->root;
4467  	struct btrfs_trans_handle *trans;
4468  	struct btrfs_block_rsv block_rsv;
4469  	u64 root_flags;
4470  	u64 qgroup_reserved = 0;
4471  	int ret;
4472  
4473  	down_write(&fs_info->subvol_sem);
4474  
4475  	/*
4476  	 * Don't allow to delete a subvolume with send in progress. This is
4477  	 * inside the inode lock so the error handling that has to drop the bit
4478  	 * again is not run concurrently.
4479  	 */
4480  	spin_lock(&dest->root_item_lock);
4481  	if (dest->send_in_progress) {
4482  		spin_unlock(&dest->root_item_lock);
4483  		btrfs_warn(fs_info,
4484  			   "attempt to delete subvolume %llu during send",
4485  			   dest->root_key.objectid);
4486  		ret = -EPERM;
4487  		goto out_up_write;
4488  	}
4489  	if (atomic_read(&dest->nr_swapfiles)) {
4490  		spin_unlock(&dest->root_item_lock);
4491  		btrfs_warn(fs_info,
4492  			   "attempt to delete subvolume %llu with active swapfile",
4493  			   root->root_key.objectid);
4494  		ret = -EPERM;
4495  		goto out_up_write;
4496  	}
4497  	root_flags = btrfs_root_flags(&dest->root_item);
4498  	btrfs_set_root_flags(&dest->root_item,
4499  			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4500  	spin_unlock(&dest->root_item_lock);
4501  
4502  	ret = may_destroy_subvol(dest);
4503  	if (ret)
4504  		goto out_undead;
4505  
4506  	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4507  	/*
4508  	 * One for dir inode,
4509  	 * two for dir entries,
4510  	 * two for root ref/backref.
4511  	 */
4512  	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4513  	if (ret)
4514  		goto out_undead;
4515  	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4516  
4517  	trans = btrfs_start_transaction(root, 0);
4518  	if (IS_ERR(trans)) {
4519  		ret = PTR_ERR(trans);
4520  		goto out_release;
4521  	}
4522  	ret = btrfs_record_root_in_trans(trans, root);
4523  	if (ret) {
4524  		btrfs_abort_transaction(trans, ret);
4525  		goto out_end_trans;
4526  	}
4527  	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4528  	qgroup_reserved = 0;
4529  	trans->block_rsv = &block_rsv;
4530  	trans->bytes_reserved = block_rsv.size;
4531  
4532  	btrfs_record_snapshot_destroy(trans, dir);
4533  
4534  	ret = btrfs_unlink_subvol(trans, dir, dentry);
4535  	if (ret) {
4536  		btrfs_abort_transaction(trans, ret);
4537  		goto out_end_trans;
4538  	}
4539  
4540  	ret = btrfs_record_root_in_trans(trans, dest);
4541  	if (ret) {
4542  		btrfs_abort_transaction(trans, ret);
4543  		goto out_end_trans;
4544  	}
4545  
4546  	memset(&dest->root_item.drop_progress, 0,
4547  		sizeof(dest->root_item.drop_progress));
4548  	btrfs_set_root_drop_level(&dest->root_item, 0);
4549  	btrfs_set_root_refs(&dest->root_item, 0);
4550  
4551  	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4552  		ret = btrfs_insert_orphan_item(trans,
4553  					fs_info->tree_root,
4554  					dest->root_key.objectid);
4555  		if (ret) {
4556  			btrfs_abort_transaction(trans, ret);
4557  			goto out_end_trans;
4558  		}
4559  	}
4560  
4561  	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4562  				  BTRFS_UUID_KEY_SUBVOL,
4563  				  dest->root_key.objectid);
4564  	if (ret && ret != -ENOENT) {
4565  		btrfs_abort_transaction(trans, ret);
4566  		goto out_end_trans;
4567  	}
4568  	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4569  		ret = btrfs_uuid_tree_remove(trans,
4570  					  dest->root_item.received_uuid,
4571  					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4572  					  dest->root_key.objectid);
4573  		if (ret && ret != -ENOENT) {
4574  			btrfs_abort_transaction(trans, ret);
4575  			goto out_end_trans;
4576  		}
4577  	}
4578  
4579  	free_anon_bdev(dest->anon_dev);
4580  	dest->anon_dev = 0;
4581  out_end_trans:
4582  	trans->block_rsv = NULL;
4583  	trans->bytes_reserved = 0;
4584  	ret = btrfs_end_transaction(trans);
4585  	inode->i_flags |= S_DEAD;
4586  out_release:
4587  	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4588  	if (qgroup_reserved)
4589  		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4590  out_undead:
4591  	if (ret) {
4592  		spin_lock(&dest->root_item_lock);
4593  		root_flags = btrfs_root_flags(&dest->root_item);
4594  		btrfs_set_root_flags(&dest->root_item,
4595  				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4596  		spin_unlock(&dest->root_item_lock);
4597  	}
4598  out_up_write:
4599  	up_write(&fs_info->subvol_sem);
4600  	if (!ret) {
4601  		d_invalidate(dentry);
4602  		btrfs_prune_dentries(dest);
4603  		ASSERT(dest->send_in_progress == 0);
4604  	}
4605  
4606  	return ret;
4607  }
4608  
4609  static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4610  {
4611  	struct inode *inode = d_inode(dentry);
4612  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4613  	int err = 0;
4614  	struct btrfs_trans_handle *trans;
4615  	u64 last_unlink_trans;
4616  	struct fscrypt_name fname;
4617  
4618  	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4619  		return -ENOTEMPTY;
4620  	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4621  		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4622  			btrfs_err(fs_info,
4623  			"extent tree v2 doesn't support snapshot deletion yet");
4624  			return -EOPNOTSUPP;
4625  		}
4626  		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4627  	}
4628  
4629  	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4630  	if (err)
4631  		return err;
4632  
4633  	/* This needs to handle no-key deletions later on */
4634  
4635  	trans = __unlink_start_trans(BTRFS_I(dir));
4636  	if (IS_ERR(trans)) {
4637  		err = PTR_ERR(trans);
4638  		goto out_notrans;
4639  	}
4640  
4641  	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4642  		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4643  		goto out;
4644  	}
4645  
4646  	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4647  	if (err)
4648  		goto out;
4649  
4650  	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4651  
4652  	/* now the directory is empty */
4653  	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4654  				 &fname.disk_name);
4655  	if (!err) {
4656  		btrfs_i_size_write(BTRFS_I(inode), 0);
4657  		/*
4658  		 * Propagate the last_unlink_trans value of the deleted dir to
4659  		 * its parent directory. This is to prevent an unrecoverable
4660  		 * log tree in the case we do something like this:
4661  		 * 1) create dir foo
4662  		 * 2) create snapshot under dir foo
4663  		 * 3) delete the snapshot
4664  		 * 4) rmdir foo
4665  		 * 5) mkdir foo
4666  		 * 6) fsync foo or some file inside foo
4667  		 */
4668  		if (last_unlink_trans >= trans->transid)
4669  			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4670  	}
4671  out:
4672  	btrfs_end_transaction(trans);
4673  out_notrans:
4674  	btrfs_btree_balance_dirty(fs_info);
4675  	fscrypt_free_filename(&fname);
4676  
4677  	return err;
4678  }
4679  
4680  /*
4681   * btrfs_truncate_block - read, zero a chunk and write a block
4682   * @inode - inode that we're zeroing
4683   * @from - the offset to start zeroing
4684   * @len - the length to zero, 0 to zero the entire range respective to the
4685   *	offset
4686   * @front - zero up to the offset instead of from the offset on
4687   *
4688   * This will find the block for the "from" offset and cow the block and zero the
4689   * part we want to zero.  This is used with truncate and hole punching.
4690   */
4691  int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4692  			 int front)
4693  {
4694  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4695  	struct address_space *mapping = inode->vfs_inode.i_mapping;
4696  	struct extent_io_tree *io_tree = &inode->io_tree;
4697  	struct btrfs_ordered_extent *ordered;
4698  	struct extent_state *cached_state = NULL;
4699  	struct extent_changeset *data_reserved = NULL;
4700  	bool only_release_metadata = false;
4701  	u32 blocksize = fs_info->sectorsize;
4702  	pgoff_t index = from >> PAGE_SHIFT;
4703  	unsigned offset = from & (blocksize - 1);
4704  	struct page *page;
4705  	gfp_t mask = btrfs_alloc_write_mask(mapping);
4706  	size_t write_bytes = blocksize;
4707  	int ret = 0;
4708  	u64 block_start;
4709  	u64 block_end;
4710  
4711  	if (IS_ALIGNED(offset, blocksize) &&
4712  	    (!len || IS_ALIGNED(len, blocksize)))
4713  		goto out;
4714  
4715  	block_start = round_down(from, blocksize);
4716  	block_end = block_start + blocksize - 1;
4717  
4718  	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4719  					  blocksize, false);
4720  	if (ret < 0) {
4721  		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4722  			/* For nocow case, no need to reserve data space */
4723  			only_release_metadata = true;
4724  		} else {
4725  			goto out;
4726  		}
4727  	}
4728  	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4729  	if (ret < 0) {
4730  		if (!only_release_metadata)
4731  			btrfs_free_reserved_data_space(inode, data_reserved,
4732  						       block_start, blocksize);
4733  		goto out;
4734  	}
4735  again:
4736  	page = find_or_create_page(mapping, index, mask);
4737  	if (!page) {
4738  		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4739  					     blocksize, true);
4740  		btrfs_delalloc_release_extents(inode, blocksize);
4741  		ret = -ENOMEM;
4742  		goto out;
4743  	}
4744  
4745  	if (!PageUptodate(page)) {
4746  		ret = btrfs_read_folio(NULL, page_folio(page));
4747  		lock_page(page);
4748  		if (page->mapping != mapping) {
4749  			unlock_page(page);
4750  			put_page(page);
4751  			goto again;
4752  		}
4753  		if (!PageUptodate(page)) {
4754  			ret = -EIO;
4755  			goto out_unlock;
4756  		}
4757  	}
4758  
4759  	/*
4760  	 * We unlock the page after the io is completed and then re-lock it
4761  	 * above.  release_folio() could have come in between that and cleared
4762  	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
4763  	 * here to make sure it's properly set for the subpage stuff.
4764  	 */
4765  	ret = set_page_extent_mapped(page);
4766  	if (ret < 0)
4767  		goto out_unlock;
4768  
4769  	wait_on_page_writeback(page);
4770  
4771  	lock_extent(io_tree, block_start, block_end, &cached_state);
4772  
4773  	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4774  	if (ordered) {
4775  		unlock_extent(io_tree, block_start, block_end, &cached_state);
4776  		unlock_page(page);
4777  		put_page(page);
4778  		btrfs_start_ordered_extent(ordered);
4779  		btrfs_put_ordered_extent(ordered);
4780  		goto again;
4781  	}
4782  
4783  	clear_extent_bit(&inode->io_tree, block_start, block_end,
4784  			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4785  			 &cached_state);
4786  
4787  	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4788  					&cached_state);
4789  	if (ret) {
4790  		unlock_extent(io_tree, block_start, block_end, &cached_state);
4791  		goto out_unlock;
4792  	}
4793  
4794  	if (offset != blocksize) {
4795  		if (!len)
4796  			len = blocksize - offset;
4797  		if (front)
4798  			memzero_page(page, (block_start - page_offset(page)),
4799  				     offset);
4800  		else
4801  			memzero_page(page, (block_start - page_offset(page)) + offset,
4802  				     len);
4803  	}
4804  	btrfs_page_clear_checked(fs_info, page, block_start,
4805  				 block_end + 1 - block_start);
4806  	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4807  	unlock_extent(io_tree, block_start, block_end, &cached_state);
4808  
4809  	if (only_release_metadata)
4810  		set_extent_bit(&inode->io_tree, block_start, block_end,
4811  			       EXTENT_NORESERVE, NULL);
4812  
4813  out_unlock:
4814  	if (ret) {
4815  		if (only_release_metadata)
4816  			btrfs_delalloc_release_metadata(inode, blocksize, true);
4817  		else
4818  			btrfs_delalloc_release_space(inode, data_reserved,
4819  					block_start, blocksize, true);
4820  	}
4821  	btrfs_delalloc_release_extents(inode, blocksize);
4822  	unlock_page(page);
4823  	put_page(page);
4824  out:
4825  	if (only_release_metadata)
4826  		btrfs_check_nocow_unlock(inode);
4827  	extent_changeset_free(data_reserved);
4828  	return ret;
4829  }
4830  
4831  static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4832  			     u64 offset, u64 len)
4833  {
4834  	struct btrfs_fs_info *fs_info = root->fs_info;
4835  	struct btrfs_trans_handle *trans;
4836  	struct btrfs_drop_extents_args drop_args = { 0 };
4837  	int ret;
4838  
4839  	/*
4840  	 * If NO_HOLES is enabled, we don't need to do anything.
4841  	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4842  	 * or btrfs_update_inode() will be called, which guarantee that the next
4843  	 * fsync will know this inode was changed and needs to be logged.
4844  	 */
4845  	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4846  		return 0;
4847  
4848  	/*
4849  	 * 1 - for the one we're dropping
4850  	 * 1 - for the one we're adding
4851  	 * 1 - for updating the inode.
4852  	 */
4853  	trans = btrfs_start_transaction(root, 3);
4854  	if (IS_ERR(trans))
4855  		return PTR_ERR(trans);
4856  
4857  	drop_args.start = offset;
4858  	drop_args.end = offset + len;
4859  	drop_args.drop_cache = true;
4860  
4861  	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4862  	if (ret) {
4863  		btrfs_abort_transaction(trans, ret);
4864  		btrfs_end_transaction(trans);
4865  		return ret;
4866  	}
4867  
4868  	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4869  	if (ret) {
4870  		btrfs_abort_transaction(trans, ret);
4871  	} else {
4872  		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4873  		btrfs_update_inode(trans, root, inode);
4874  	}
4875  	btrfs_end_transaction(trans);
4876  	return ret;
4877  }
4878  
4879  /*
4880   * This function puts in dummy file extents for the area we're creating a hole
4881   * for.  So if we are truncating this file to a larger size we need to insert
4882   * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4883   * the range between oldsize and size
4884   */
4885  int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4886  {
4887  	struct btrfs_root *root = inode->root;
4888  	struct btrfs_fs_info *fs_info = root->fs_info;
4889  	struct extent_io_tree *io_tree = &inode->io_tree;
4890  	struct extent_map *em = NULL;
4891  	struct extent_state *cached_state = NULL;
4892  	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4893  	u64 block_end = ALIGN(size, fs_info->sectorsize);
4894  	u64 last_byte;
4895  	u64 cur_offset;
4896  	u64 hole_size;
4897  	int err = 0;
4898  
4899  	/*
4900  	 * If our size started in the middle of a block we need to zero out the
4901  	 * rest of the block before we expand the i_size, otherwise we could
4902  	 * expose stale data.
4903  	 */
4904  	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4905  	if (err)
4906  		return err;
4907  
4908  	if (size <= hole_start)
4909  		return 0;
4910  
4911  	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4912  					   &cached_state);
4913  	cur_offset = hole_start;
4914  	while (1) {
4915  		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4916  				      block_end - cur_offset);
4917  		if (IS_ERR(em)) {
4918  			err = PTR_ERR(em);
4919  			em = NULL;
4920  			break;
4921  		}
4922  		last_byte = min(extent_map_end(em), block_end);
4923  		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4924  		hole_size = last_byte - cur_offset;
4925  
4926  		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4927  			struct extent_map *hole_em;
4928  
4929  			err = maybe_insert_hole(root, inode, cur_offset,
4930  						hole_size);
4931  			if (err)
4932  				break;
4933  
4934  			err = btrfs_inode_set_file_extent_range(inode,
4935  							cur_offset, hole_size);
4936  			if (err)
4937  				break;
4938  
4939  			hole_em = alloc_extent_map();
4940  			if (!hole_em) {
4941  				btrfs_drop_extent_map_range(inode, cur_offset,
4942  						    cur_offset + hole_size - 1,
4943  						    false);
4944  				btrfs_set_inode_full_sync(inode);
4945  				goto next;
4946  			}
4947  			hole_em->start = cur_offset;
4948  			hole_em->len = hole_size;
4949  			hole_em->orig_start = cur_offset;
4950  
4951  			hole_em->block_start = EXTENT_MAP_HOLE;
4952  			hole_em->block_len = 0;
4953  			hole_em->orig_block_len = 0;
4954  			hole_em->ram_bytes = hole_size;
4955  			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4956  			hole_em->generation = fs_info->generation;
4957  
4958  			err = btrfs_replace_extent_map_range(inode, hole_em, true);
4959  			free_extent_map(hole_em);
4960  		} else {
4961  			err = btrfs_inode_set_file_extent_range(inode,
4962  							cur_offset, hole_size);
4963  			if (err)
4964  				break;
4965  		}
4966  next:
4967  		free_extent_map(em);
4968  		em = NULL;
4969  		cur_offset = last_byte;
4970  		if (cur_offset >= block_end)
4971  			break;
4972  	}
4973  	free_extent_map(em);
4974  	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
4975  	return err;
4976  }
4977  
4978  static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4979  {
4980  	struct btrfs_root *root = BTRFS_I(inode)->root;
4981  	struct btrfs_trans_handle *trans;
4982  	loff_t oldsize = i_size_read(inode);
4983  	loff_t newsize = attr->ia_size;
4984  	int mask = attr->ia_valid;
4985  	int ret;
4986  
4987  	/*
4988  	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4989  	 * special case where we need to update the times despite not having
4990  	 * these flags set.  For all other operations the VFS set these flags
4991  	 * explicitly if it wants a timestamp update.
4992  	 */
4993  	if (newsize != oldsize) {
4994  		inode_inc_iversion(inode);
4995  		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
4996  			inode->i_mtime = inode_set_ctime_current(inode);
4997  		}
4998  	}
4999  
5000  	if (newsize > oldsize) {
5001  		/*
5002  		 * Don't do an expanding truncate while snapshotting is ongoing.
5003  		 * This is to ensure the snapshot captures a fully consistent
5004  		 * state of this file - if the snapshot captures this expanding
5005  		 * truncation, it must capture all writes that happened before
5006  		 * this truncation.
5007  		 */
5008  		btrfs_drew_write_lock(&root->snapshot_lock);
5009  		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5010  		if (ret) {
5011  			btrfs_drew_write_unlock(&root->snapshot_lock);
5012  			return ret;
5013  		}
5014  
5015  		trans = btrfs_start_transaction(root, 1);
5016  		if (IS_ERR(trans)) {
5017  			btrfs_drew_write_unlock(&root->snapshot_lock);
5018  			return PTR_ERR(trans);
5019  		}
5020  
5021  		i_size_write(inode, newsize);
5022  		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5023  		pagecache_isize_extended(inode, oldsize, newsize);
5024  		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5025  		btrfs_drew_write_unlock(&root->snapshot_lock);
5026  		btrfs_end_transaction(trans);
5027  	} else {
5028  		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5029  
5030  		if (btrfs_is_zoned(fs_info)) {
5031  			ret = btrfs_wait_ordered_range(inode,
5032  					ALIGN(newsize, fs_info->sectorsize),
5033  					(u64)-1);
5034  			if (ret)
5035  				return ret;
5036  		}
5037  
5038  		/*
5039  		 * We're truncating a file that used to have good data down to
5040  		 * zero. Make sure any new writes to the file get on disk
5041  		 * on close.
5042  		 */
5043  		if (newsize == 0)
5044  			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5045  				&BTRFS_I(inode)->runtime_flags);
5046  
5047  		truncate_setsize(inode, newsize);
5048  
5049  		inode_dio_wait(inode);
5050  
5051  		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5052  		if (ret && inode->i_nlink) {
5053  			int err;
5054  
5055  			/*
5056  			 * Truncate failed, so fix up the in-memory size. We
5057  			 * adjusted disk_i_size down as we removed extents, so
5058  			 * wait for disk_i_size to be stable and then update the
5059  			 * in-memory size to match.
5060  			 */
5061  			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5062  			if (err)
5063  				return err;
5064  			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5065  		}
5066  	}
5067  
5068  	return ret;
5069  }
5070  
5071  static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5072  			 struct iattr *attr)
5073  {
5074  	struct inode *inode = d_inode(dentry);
5075  	struct btrfs_root *root = BTRFS_I(inode)->root;
5076  	int err;
5077  
5078  	if (btrfs_root_readonly(root))
5079  		return -EROFS;
5080  
5081  	err = setattr_prepare(idmap, dentry, attr);
5082  	if (err)
5083  		return err;
5084  
5085  	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5086  		err = btrfs_setsize(inode, attr);
5087  		if (err)
5088  			return err;
5089  	}
5090  
5091  	if (attr->ia_valid) {
5092  		setattr_copy(idmap, inode, attr);
5093  		inode_inc_iversion(inode);
5094  		err = btrfs_dirty_inode(BTRFS_I(inode));
5095  
5096  		if (!err && attr->ia_valid & ATTR_MODE)
5097  			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5098  	}
5099  
5100  	return err;
5101  }
5102  
5103  /*
5104   * While truncating the inode pages during eviction, we get the VFS
5105   * calling btrfs_invalidate_folio() against each folio of the inode. This
5106   * is slow because the calls to btrfs_invalidate_folio() result in a
5107   * huge amount of calls to lock_extent() and clear_extent_bit(),
5108   * which keep merging and splitting extent_state structures over and over,
5109   * wasting lots of time.
5110   *
5111   * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5112   * skip all those expensive operations on a per folio basis and do only
5113   * the ordered io finishing, while we release here the extent_map and
5114   * extent_state structures, without the excessive merging and splitting.
5115   */
5116  static void evict_inode_truncate_pages(struct inode *inode)
5117  {
5118  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5119  	struct rb_node *node;
5120  
5121  	ASSERT(inode->i_state & I_FREEING);
5122  	truncate_inode_pages_final(&inode->i_data);
5123  
5124  	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5125  
5126  	/*
5127  	 * Keep looping until we have no more ranges in the io tree.
5128  	 * We can have ongoing bios started by readahead that have
5129  	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5130  	 * still in progress (unlocked the pages in the bio but did not yet
5131  	 * unlocked the ranges in the io tree). Therefore this means some
5132  	 * ranges can still be locked and eviction started because before
5133  	 * submitting those bios, which are executed by a separate task (work
5134  	 * queue kthread), inode references (inode->i_count) were not taken
5135  	 * (which would be dropped in the end io callback of each bio).
5136  	 * Therefore here we effectively end up waiting for those bios and
5137  	 * anyone else holding locked ranges without having bumped the inode's
5138  	 * reference count - if we don't do it, when they access the inode's
5139  	 * io_tree to unlock a range it may be too late, leading to an
5140  	 * use-after-free issue.
5141  	 */
5142  	spin_lock(&io_tree->lock);
5143  	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5144  		struct extent_state *state;
5145  		struct extent_state *cached_state = NULL;
5146  		u64 start;
5147  		u64 end;
5148  		unsigned state_flags;
5149  
5150  		node = rb_first(&io_tree->state);
5151  		state = rb_entry(node, struct extent_state, rb_node);
5152  		start = state->start;
5153  		end = state->end;
5154  		state_flags = state->state;
5155  		spin_unlock(&io_tree->lock);
5156  
5157  		lock_extent(io_tree, start, end, &cached_state);
5158  
5159  		/*
5160  		 * If still has DELALLOC flag, the extent didn't reach disk,
5161  		 * and its reserved space won't be freed by delayed_ref.
5162  		 * So we need to free its reserved space here.
5163  		 * (Refer to comment in btrfs_invalidate_folio, case 2)
5164  		 *
5165  		 * Note, end is the bytenr of last byte, so we need + 1 here.
5166  		 */
5167  		if (state_flags & EXTENT_DELALLOC)
5168  			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5169  					       end - start + 1, NULL);
5170  
5171  		clear_extent_bit(io_tree, start, end,
5172  				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5173  				 &cached_state);
5174  
5175  		cond_resched();
5176  		spin_lock(&io_tree->lock);
5177  	}
5178  	spin_unlock(&io_tree->lock);
5179  }
5180  
5181  static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5182  							struct btrfs_block_rsv *rsv)
5183  {
5184  	struct btrfs_fs_info *fs_info = root->fs_info;
5185  	struct btrfs_trans_handle *trans;
5186  	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5187  	int ret;
5188  
5189  	/*
5190  	 * Eviction should be taking place at some place safe because of our
5191  	 * delayed iputs.  However the normal flushing code will run delayed
5192  	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5193  	 *
5194  	 * We reserve the delayed_refs_extra here again because we can't use
5195  	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5196  	 * above.  We reserve our extra bit here because we generate a ton of
5197  	 * delayed refs activity by truncating.
5198  	 *
5199  	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5200  	 * if we fail to make this reservation we can re-try without the
5201  	 * delayed_refs_extra so we can make some forward progress.
5202  	 */
5203  	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5204  				     BTRFS_RESERVE_FLUSH_EVICT);
5205  	if (ret) {
5206  		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5207  					     BTRFS_RESERVE_FLUSH_EVICT);
5208  		if (ret) {
5209  			btrfs_warn(fs_info,
5210  				   "could not allocate space for delete; will truncate on mount");
5211  			return ERR_PTR(-ENOSPC);
5212  		}
5213  		delayed_refs_extra = 0;
5214  	}
5215  
5216  	trans = btrfs_join_transaction(root);
5217  	if (IS_ERR(trans))
5218  		return trans;
5219  
5220  	if (delayed_refs_extra) {
5221  		trans->block_rsv = &fs_info->trans_block_rsv;
5222  		trans->bytes_reserved = delayed_refs_extra;
5223  		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5224  					delayed_refs_extra, true);
5225  	}
5226  	return trans;
5227  }
5228  
5229  void btrfs_evict_inode(struct inode *inode)
5230  {
5231  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5232  	struct btrfs_trans_handle *trans;
5233  	struct btrfs_root *root = BTRFS_I(inode)->root;
5234  	struct btrfs_block_rsv *rsv = NULL;
5235  	int ret;
5236  
5237  	trace_btrfs_inode_evict(inode);
5238  
5239  	if (!root) {
5240  		fsverity_cleanup_inode(inode);
5241  		clear_inode(inode);
5242  		return;
5243  	}
5244  
5245  	evict_inode_truncate_pages(inode);
5246  
5247  	if (inode->i_nlink &&
5248  	    ((btrfs_root_refs(&root->root_item) != 0 &&
5249  	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5250  	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5251  		goto out;
5252  
5253  	if (is_bad_inode(inode))
5254  		goto out;
5255  
5256  	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5257  		goto out;
5258  
5259  	if (inode->i_nlink > 0) {
5260  		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5261  		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5262  		goto out;
5263  	}
5264  
5265  	/*
5266  	 * This makes sure the inode item in tree is uptodate and the space for
5267  	 * the inode update is released.
5268  	 */
5269  	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5270  	if (ret)
5271  		goto out;
5272  
5273  	/*
5274  	 * This drops any pending insert or delete operations we have for this
5275  	 * inode.  We could have a delayed dir index deletion queued up, but
5276  	 * we're removing the inode completely so that'll be taken care of in
5277  	 * the truncate.
5278  	 */
5279  	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5280  
5281  	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5282  	if (!rsv)
5283  		goto out;
5284  	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5285  	rsv->failfast = true;
5286  
5287  	btrfs_i_size_write(BTRFS_I(inode), 0);
5288  
5289  	while (1) {
5290  		struct btrfs_truncate_control control = {
5291  			.inode = BTRFS_I(inode),
5292  			.ino = btrfs_ino(BTRFS_I(inode)),
5293  			.new_size = 0,
5294  			.min_type = 0,
5295  		};
5296  
5297  		trans = evict_refill_and_join(root, rsv);
5298  		if (IS_ERR(trans))
5299  			goto out;
5300  
5301  		trans->block_rsv = rsv;
5302  
5303  		ret = btrfs_truncate_inode_items(trans, root, &control);
5304  		trans->block_rsv = &fs_info->trans_block_rsv;
5305  		btrfs_end_transaction(trans);
5306  		/*
5307  		 * We have not added new delayed items for our inode after we
5308  		 * have flushed its delayed items, so no need to throttle on
5309  		 * delayed items. However we have modified extent buffers.
5310  		 */
5311  		btrfs_btree_balance_dirty_nodelay(fs_info);
5312  		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5313  			goto out;
5314  		else if (!ret)
5315  			break;
5316  	}
5317  
5318  	/*
5319  	 * Errors here aren't a big deal, it just means we leave orphan items in
5320  	 * the tree. They will be cleaned up on the next mount. If the inode
5321  	 * number gets reused, cleanup deletes the orphan item without doing
5322  	 * anything, and unlink reuses the existing orphan item.
5323  	 *
5324  	 * If it turns out that we are dropping too many of these, we might want
5325  	 * to add a mechanism for retrying these after a commit.
5326  	 */
5327  	trans = evict_refill_and_join(root, rsv);
5328  	if (!IS_ERR(trans)) {
5329  		trans->block_rsv = rsv;
5330  		btrfs_orphan_del(trans, BTRFS_I(inode));
5331  		trans->block_rsv = &fs_info->trans_block_rsv;
5332  		btrfs_end_transaction(trans);
5333  	}
5334  
5335  out:
5336  	btrfs_free_block_rsv(fs_info, rsv);
5337  	/*
5338  	 * If we didn't successfully delete, the orphan item will still be in
5339  	 * the tree and we'll retry on the next mount. Again, we might also want
5340  	 * to retry these periodically in the future.
5341  	 */
5342  	btrfs_remove_delayed_node(BTRFS_I(inode));
5343  	fsverity_cleanup_inode(inode);
5344  	clear_inode(inode);
5345  }
5346  
5347  /*
5348   * Return the key found in the dir entry in the location pointer, fill @type
5349   * with BTRFS_FT_*, and return 0.
5350   *
5351   * If no dir entries were found, returns -ENOENT.
5352   * If found a corrupted location in dir entry, returns -EUCLEAN.
5353   */
5354  static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5355  			       struct btrfs_key *location, u8 *type)
5356  {
5357  	struct btrfs_dir_item *di;
5358  	struct btrfs_path *path;
5359  	struct btrfs_root *root = dir->root;
5360  	int ret = 0;
5361  	struct fscrypt_name fname;
5362  
5363  	path = btrfs_alloc_path();
5364  	if (!path)
5365  		return -ENOMEM;
5366  
5367  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5368  	if (ret < 0)
5369  		goto out;
5370  	/*
5371  	 * fscrypt_setup_filename() should never return a positive value, but
5372  	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5373  	 */
5374  	ASSERT(ret == 0);
5375  
5376  	/* This needs to handle no-key deletions later on */
5377  
5378  	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5379  				   &fname.disk_name, 0);
5380  	if (IS_ERR_OR_NULL(di)) {
5381  		ret = di ? PTR_ERR(di) : -ENOENT;
5382  		goto out;
5383  	}
5384  
5385  	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5386  	if (location->type != BTRFS_INODE_ITEM_KEY &&
5387  	    location->type != BTRFS_ROOT_ITEM_KEY) {
5388  		ret = -EUCLEAN;
5389  		btrfs_warn(root->fs_info,
5390  "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5391  			   __func__, fname.disk_name.name, btrfs_ino(dir),
5392  			   location->objectid, location->type, location->offset);
5393  	}
5394  	if (!ret)
5395  		*type = btrfs_dir_ftype(path->nodes[0], di);
5396  out:
5397  	fscrypt_free_filename(&fname);
5398  	btrfs_free_path(path);
5399  	return ret;
5400  }
5401  
5402  /*
5403   * when we hit a tree root in a directory, the btrfs part of the inode
5404   * needs to be changed to reflect the root directory of the tree root.  This
5405   * is kind of like crossing a mount point.
5406   */
5407  static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5408  				    struct btrfs_inode *dir,
5409  				    struct dentry *dentry,
5410  				    struct btrfs_key *location,
5411  				    struct btrfs_root **sub_root)
5412  {
5413  	struct btrfs_path *path;
5414  	struct btrfs_root *new_root;
5415  	struct btrfs_root_ref *ref;
5416  	struct extent_buffer *leaf;
5417  	struct btrfs_key key;
5418  	int ret;
5419  	int err = 0;
5420  	struct fscrypt_name fname;
5421  
5422  	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5423  	if (ret)
5424  		return ret;
5425  
5426  	path = btrfs_alloc_path();
5427  	if (!path) {
5428  		err = -ENOMEM;
5429  		goto out;
5430  	}
5431  
5432  	err = -ENOENT;
5433  	key.objectid = dir->root->root_key.objectid;
5434  	key.type = BTRFS_ROOT_REF_KEY;
5435  	key.offset = location->objectid;
5436  
5437  	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5438  	if (ret) {
5439  		if (ret < 0)
5440  			err = ret;
5441  		goto out;
5442  	}
5443  
5444  	leaf = path->nodes[0];
5445  	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5446  	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5447  	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5448  		goto out;
5449  
5450  	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5451  				   (unsigned long)(ref + 1), fname.disk_name.len);
5452  	if (ret)
5453  		goto out;
5454  
5455  	btrfs_release_path(path);
5456  
5457  	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5458  	if (IS_ERR(new_root)) {
5459  		err = PTR_ERR(new_root);
5460  		goto out;
5461  	}
5462  
5463  	*sub_root = new_root;
5464  	location->objectid = btrfs_root_dirid(&new_root->root_item);
5465  	location->type = BTRFS_INODE_ITEM_KEY;
5466  	location->offset = 0;
5467  	err = 0;
5468  out:
5469  	btrfs_free_path(path);
5470  	fscrypt_free_filename(&fname);
5471  	return err;
5472  }
5473  
5474  static void inode_tree_add(struct btrfs_inode *inode)
5475  {
5476  	struct btrfs_root *root = inode->root;
5477  	struct btrfs_inode *entry;
5478  	struct rb_node **p;
5479  	struct rb_node *parent;
5480  	struct rb_node *new = &inode->rb_node;
5481  	u64 ino = btrfs_ino(inode);
5482  
5483  	if (inode_unhashed(&inode->vfs_inode))
5484  		return;
5485  	parent = NULL;
5486  	spin_lock(&root->inode_lock);
5487  	p = &root->inode_tree.rb_node;
5488  	while (*p) {
5489  		parent = *p;
5490  		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5491  
5492  		if (ino < btrfs_ino(entry))
5493  			p = &parent->rb_left;
5494  		else if (ino > btrfs_ino(entry))
5495  			p = &parent->rb_right;
5496  		else {
5497  			WARN_ON(!(entry->vfs_inode.i_state &
5498  				  (I_WILL_FREE | I_FREEING)));
5499  			rb_replace_node(parent, new, &root->inode_tree);
5500  			RB_CLEAR_NODE(parent);
5501  			spin_unlock(&root->inode_lock);
5502  			return;
5503  		}
5504  	}
5505  	rb_link_node(new, parent, p);
5506  	rb_insert_color(new, &root->inode_tree);
5507  	spin_unlock(&root->inode_lock);
5508  }
5509  
5510  static void inode_tree_del(struct btrfs_inode *inode)
5511  {
5512  	struct btrfs_root *root = inode->root;
5513  	int empty = 0;
5514  
5515  	spin_lock(&root->inode_lock);
5516  	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5517  		rb_erase(&inode->rb_node, &root->inode_tree);
5518  		RB_CLEAR_NODE(&inode->rb_node);
5519  		empty = RB_EMPTY_ROOT(&root->inode_tree);
5520  	}
5521  	spin_unlock(&root->inode_lock);
5522  
5523  	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5524  		spin_lock(&root->inode_lock);
5525  		empty = RB_EMPTY_ROOT(&root->inode_tree);
5526  		spin_unlock(&root->inode_lock);
5527  		if (empty)
5528  			btrfs_add_dead_root(root);
5529  	}
5530  }
5531  
5532  
5533  static int btrfs_init_locked_inode(struct inode *inode, void *p)
5534  {
5535  	struct btrfs_iget_args *args = p;
5536  
5537  	inode->i_ino = args->ino;
5538  	BTRFS_I(inode)->location.objectid = args->ino;
5539  	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5540  	BTRFS_I(inode)->location.offset = 0;
5541  	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5542  	BUG_ON(args->root && !BTRFS_I(inode)->root);
5543  
5544  	if (args->root && args->root == args->root->fs_info->tree_root &&
5545  	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
5546  		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5547  			&BTRFS_I(inode)->runtime_flags);
5548  	return 0;
5549  }
5550  
5551  static int btrfs_find_actor(struct inode *inode, void *opaque)
5552  {
5553  	struct btrfs_iget_args *args = opaque;
5554  
5555  	return args->ino == BTRFS_I(inode)->location.objectid &&
5556  		args->root == BTRFS_I(inode)->root;
5557  }
5558  
5559  static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5560  				       struct btrfs_root *root)
5561  {
5562  	struct inode *inode;
5563  	struct btrfs_iget_args args;
5564  	unsigned long hashval = btrfs_inode_hash(ino, root);
5565  
5566  	args.ino = ino;
5567  	args.root = root;
5568  
5569  	inode = iget5_locked(s, hashval, btrfs_find_actor,
5570  			     btrfs_init_locked_inode,
5571  			     (void *)&args);
5572  	return inode;
5573  }
5574  
5575  /*
5576   * Get an inode object given its inode number and corresponding root.
5577   * Path can be preallocated to prevent recursing back to iget through
5578   * allocator. NULL is also valid but may require an additional allocation
5579   * later.
5580   */
5581  struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5582  			      struct btrfs_root *root, struct btrfs_path *path)
5583  {
5584  	struct inode *inode;
5585  
5586  	inode = btrfs_iget_locked(s, ino, root);
5587  	if (!inode)
5588  		return ERR_PTR(-ENOMEM);
5589  
5590  	if (inode->i_state & I_NEW) {
5591  		int ret;
5592  
5593  		ret = btrfs_read_locked_inode(inode, path);
5594  		if (!ret) {
5595  			inode_tree_add(BTRFS_I(inode));
5596  			unlock_new_inode(inode);
5597  		} else {
5598  			iget_failed(inode);
5599  			/*
5600  			 * ret > 0 can come from btrfs_search_slot called by
5601  			 * btrfs_read_locked_inode, this means the inode item
5602  			 * was not found.
5603  			 */
5604  			if (ret > 0)
5605  				ret = -ENOENT;
5606  			inode = ERR_PTR(ret);
5607  		}
5608  	}
5609  
5610  	return inode;
5611  }
5612  
5613  struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5614  {
5615  	return btrfs_iget_path(s, ino, root, NULL);
5616  }
5617  
5618  static struct inode *new_simple_dir(struct inode *dir,
5619  				    struct btrfs_key *key,
5620  				    struct btrfs_root *root)
5621  {
5622  	struct inode *inode = new_inode(dir->i_sb);
5623  
5624  	if (!inode)
5625  		return ERR_PTR(-ENOMEM);
5626  
5627  	BTRFS_I(inode)->root = btrfs_grab_root(root);
5628  	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5629  	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5630  
5631  	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5632  	/*
5633  	 * We only need lookup, the rest is read-only and there's no inode
5634  	 * associated with the dentry
5635  	 */
5636  	inode->i_op = &simple_dir_inode_operations;
5637  	inode->i_opflags &= ~IOP_XATTR;
5638  	inode->i_fop = &simple_dir_operations;
5639  	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5640  	inode->i_mtime = inode_set_ctime_current(inode);
5641  	inode->i_atime = dir->i_atime;
5642  	BTRFS_I(inode)->i_otime = inode->i_mtime;
5643  	inode->i_uid = dir->i_uid;
5644  	inode->i_gid = dir->i_gid;
5645  
5646  	return inode;
5647  }
5648  
5649  static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5650  static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5651  static_assert(BTRFS_FT_DIR == FT_DIR);
5652  static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5653  static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5654  static_assert(BTRFS_FT_FIFO == FT_FIFO);
5655  static_assert(BTRFS_FT_SOCK == FT_SOCK);
5656  static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5657  
5658  static inline u8 btrfs_inode_type(struct inode *inode)
5659  {
5660  	return fs_umode_to_ftype(inode->i_mode);
5661  }
5662  
5663  struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5664  {
5665  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5666  	struct inode *inode;
5667  	struct btrfs_root *root = BTRFS_I(dir)->root;
5668  	struct btrfs_root *sub_root = root;
5669  	struct btrfs_key location = { 0 };
5670  	u8 di_type = 0;
5671  	int ret = 0;
5672  
5673  	if (dentry->d_name.len > BTRFS_NAME_LEN)
5674  		return ERR_PTR(-ENAMETOOLONG);
5675  
5676  	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5677  	if (ret < 0)
5678  		return ERR_PTR(ret);
5679  
5680  	if (location.type == BTRFS_INODE_ITEM_KEY) {
5681  		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5682  		if (IS_ERR(inode))
5683  			return inode;
5684  
5685  		/* Do extra check against inode mode with di_type */
5686  		if (btrfs_inode_type(inode) != di_type) {
5687  			btrfs_crit(fs_info,
5688  "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5689  				  inode->i_mode, btrfs_inode_type(inode),
5690  				  di_type);
5691  			iput(inode);
5692  			return ERR_PTR(-EUCLEAN);
5693  		}
5694  		return inode;
5695  	}
5696  
5697  	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5698  				       &location, &sub_root);
5699  	if (ret < 0) {
5700  		if (ret != -ENOENT)
5701  			inode = ERR_PTR(ret);
5702  		else
5703  			inode = new_simple_dir(dir, &location, root);
5704  	} else {
5705  		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5706  		btrfs_put_root(sub_root);
5707  
5708  		if (IS_ERR(inode))
5709  			return inode;
5710  
5711  		down_read(&fs_info->cleanup_work_sem);
5712  		if (!sb_rdonly(inode->i_sb))
5713  			ret = btrfs_orphan_cleanup(sub_root);
5714  		up_read(&fs_info->cleanup_work_sem);
5715  		if (ret) {
5716  			iput(inode);
5717  			inode = ERR_PTR(ret);
5718  		}
5719  	}
5720  
5721  	return inode;
5722  }
5723  
5724  static int btrfs_dentry_delete(const struct dentry *dentry)
5725  {
5726  	struct btrfs_root *root;
5727  	struct inode *inode = d_inode(dentry);
5728  
5729  	if (!inode && !IS_ROOT(dentry))
5730  		inode = d_inode(dentry->d_parent);
5731  
5732  	if (inode) {
5733  		root = BTRFS_I(inode)->root;
5734  		if (btrfs_root_refs(&root->root_item) == 0)
5735  			return 1;
5736  
5737  		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5738  			return 1;
5739  	}
5740  	return 0;
5741  }
5742  
5743  static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5744  				   unsigned int flags)
5745  {
5746  	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5747  
5748  	if (inode == ERR_PTR(-ENOENT))
5749  		inode = NULL;
5750  	return d_splice_alias(inode, dentry);
5751  }
5752  
5753  /*
5754   * Find the highest existing sequence number in a directory and then set the
5755   * in-memory index_cnt variable to the first free sequence number.
5756   */
5757  static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5758  {
5759  	struct btrfs_root *root = inode->root;
5760  	struct btrfs_key key, found_key;
5761  	struct btrfs_path *path;
5762  	struct extent_buffer *leaf;
5763  	int ret;
5764  
5765  	key.objectid = btrfs_ino(inode);
5766  	key.type = BTRFS_DIR_INDEX_KEY;
5767  	key.offset = (u64)-1;
5768  
5769  	path = btrfs_alloc_path();
5770  	if (!path)
5771  		return -ENOMEM;
5772  
5773  	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5774  	if (ret < 0)
5775  		goto out;
5776  	/* FIXME: we should be able to handle this */
5777  	if (ret == 0)
5778  		goto out;
5779  	ret = 0;
5780  
5781  	if (path->slots[0] == 0) {
5782  		inode->index_cnt = BTRFS_DIR_START_INDEX;
5783  		goto out;
5784  	}
5785  
5786  	path->slots[0]--;
5787  
5788  	leaf = path->nodes[0];
5789  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5790  
5791  	if (found_key.objectid != btrfs_ino(inode) ||
5792  	    found_key.type != BTRFS_DIR_INDEX_KEY) {
5793  		inode->index_cnt = BTRFS_DIR_START_INDEX;
5794  		goto out;
5795  	}
5796  
5797  	inode->index_cnt = found_key.offset + 1;
5798  out:
5799  	btrfs_free_path(path);
5800  	return ret;
5801  }
5802  
5803  static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5804  {
5805  	int ret = 0;
5806  
5807  	btrfs_inode_lock(dir, 0);
5808  	if (dir->index_cnt == (u64)-1) {
5809  		ret = btrfs_inode_delayed_dir_index_count(dir);
5810  		if (ret) {
5811  			ret = btrfs_set_inode_index_count(dir);
5812  			if (ret)
5813  				goto out;
5814  		}
5815  	}
5816  
5817  	/* index_cnt is the index number of next new entry, so decrement it. */
5818  	*index = dir->index_cnt - 1;
5819  out:
5820  	btrfs_inode_unlock(dir, 0);
5821  
5822  	return ret;
5823  }
5824  
5825  /*
5826   * All this infrastructure exists because dir_emit can fault, and we are holding
5827   * the tree lock when doing readdir.  For now just allocate a buffer and copy
5828   * our information into that, and then dir_emit from the buffer.  This is
5829   * similar to what NFS does, only we don't keep the buffer around in pagecache
5830   * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5831   * copy_to_user_inatomic so we don't have to worry about page faulting under the
5832   * tree lock.
5833   */
5834  static int btrfs_opendir(struct inode *inode, struct file *file)
5835  {
5836  	struct btrfs_file_private *private;
5837  	u64 last_index;
5838  	int ret;
5839  
5840  	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5841  	if (ret)
5842  		return ret;
5843  
5844  	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5845  	if (!private)
5846  		return -ENOMEM;
5847  	private->last_index = last_index;
5848  	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5849  	if (!private->filldir_buf) {
5850  		kfree(private);
5851  		return -ENOMEM;
5852  	}
5853  	file->private_data = private;
5854  	return 0;
5855  }
5856  
5857  static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5858  {
5859  	struct btrfs_file_private *private = file->private_data;
5860  	int ret;
5861  
5862  	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5863  				       &private->last_index);
5864  	if (ret)
5865  		return ret;
5866  
5867  	return generic_file_llseek(file, offset, whence);
5868  }
5869  
5870  struct dir_entry {
5871  	u64 ino;
5872  	u64 offset;
5873  	unsigned type;
5874  	int name_len;
5875  };
5876  
5877  static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5878  {
5879  	while (entries--) {
5880  		struct dir_entry *entry = addr;
5881  		char *name = (char *)(entry + 1);
5882  
5883  		ctx->pos = get_unaligned(&entry->offset);
5884  		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5885  					 get_unaligned(&entry->ino),
5886  					 get_unaligned(&entry->type)))
5887  			return 1;
5888  		addr += sizeof(struct dir_entry) +
5889  			get_unaligned(&entry->name_len);
5890  		ctx->pos++;
5891  	}
5892  	return 0;
5893  }
5894  
5895  static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5896  {
5897  	struct inode *inode = file_inode(file);
5898  	struct btrfs_root *root = BTRFS_I(inode)->root;
5899  	struct btrfs_file_private *private = file->private_data;
5900  	struct btrfs_dir_item *di;
5901  	struct btrfs_key key;
5902  	struct btrfs_key found_key;
5903  	struct btrfs_path *path;
5904  	void *addr;
5905  	LIST_HEAD(ins_list);
5906  	LIST_HEAD(del_list);
5907  	int ret;
5908  	char *name_ptr;
5909  	int name_len;
5910  	int entries = 0;
5911  	int total_len = 0;
5912  	bool put = false;
5913  	struct btrfs_key location;
5914  
5915  	if (!dir_emit_dots(file, ctx))
5916  		return 0;
5917  
5918  	path = btrfs_alloc_path();
5919  	if (!path)
5920  		return -ENOMEM;
5921  
5922  	addr = private->filldir_buf;
5923  	path->reada = READA_FORWARD;
5924  
5925  	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
5926  					      &ins_list, &del_list);
5927  
5928  again:
5929  	key.type = BTRFS_DIR_INDEX_KEY;
5930  	key.offset = ctx->pos;
5931  	key.objectid = btrfs_ino(BTRFS_I(inode));
5932  
5933  	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5934  		struct dir_entry *entry;
5935  		struct extent_buffer *leaf = path->nodes[0];
5936  		u8 ftype;
5937  
5938  		if (found_key.objectid != key.objectid)
5939  			break;
5940  		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5941  			break;
5942  		if (found_key.offset < ctx->pos)
5943  			continue;
5944  		if (found_key.offset > private->last_index)
5945  			break;
5946  		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5947  			continue;
5948  		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5949  		name_len = btrfs_dir_name_len(leaf, di);
5950  		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5951  		    PAGE_SIZE) {
5952  			btrfs_release_path(path);
5953  			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5954  			if (ret)
5955  				goto nopos;
5956  			addr = private->filldir_buf;
5957  			entries = 0;
5958  			total_len = 0;
5959  			goto again;
5960  		}
5961  
5962  		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5963  		entry = addr;
5964  		name_ptr = (char *)(entry + 1);
5965  		read_extent_buffer(leaf, name_ptr,
5966  				   (unsigned long)(di + 1), name_len);
5967  		put_unaligned(name_len, &entry->name_len);
5968  		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5969  		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5970  		put_unaligned(location.objectid, &entry->ino);
5971  		put_unaligned(found_key.offset, &entry->offset);
5972  		entries++;
5973  		addr += sizeof(struct dir_entry) + name_len;
5974  		total_len += sizeof(struct dir_entry) + name_len;
5975  	}
5976  	/* Catch error encountered during iteration */
5977  	if (ret < 0)
5978  		goto err;
5979  
5980  	btrfs_release_path(path);
5981  
5982  	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5983  	if (ret)
5984  		goto nopos;
5985  
5986  	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5987  	if (ret)
5988  		goto nopos;
5989  
5990  	/*
5991  	 * Stop new entries from being returned after we return the last
5992  	 * entry.
5993  	 *
5994  	 * New directory entries are assigned a strictly increasing
5995  	 * offset.  This means that new entries created during readdir
5996  	 * are *guaranteed* to be seen in the future by that readdir.
5997  	 * This has broken buggy programs which operate on names as
5998  	 * they're returned by readdir.  Until we re-use freed offsets
5999  	 * we have this hack to stop new entries from being returned
6000  	 * under the assumption that they'll never reach this huge
6001  	 * offset.
6002  	 *
6003  	 * This is being careful not to overflow 32bit loff_t unless the
6004  	 * last entry requires it because doing so has broken 32bit apps
6005  	 * in the past.
6006  	 */
6007  	if (ctx->pos >= INT_MAX)
6008  		ctx->pos = LLONG_MAX;
6009  	else
6010  		ctx->pos = INT_MAX;
6011  nopos:
6012  	ret = 0;
6013  err:
6014  	if (put)
6015  		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6016  	btrfs_free_path(path);
6017  	return ret;
6018  }
6019  
6020  /*
6021   * This is somewhat expensive, updating the tree every time the
6022   * inode changes.  But, it is most likely to find the inode in cache.
6023   * FIXME, needs more benchmarking...there are no reasons other than performance
6024   * to keep or drop this code.
6025   */
6026  static int btrfs_dirty_inode(struct btrfs_inode *inode)
6027  {
6028  	struct btrfs_root *root = inode->root;
6029  	struct btrfs_fs_info *fs_info = root->fs_info;
6030  	struct btrfs_trans_handle *trans;
6031  	int ret;
6032  
6033  	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6034  		return 0;
6035  
6036  	trans = btrfs_join_transaction(root);
6037  	if (IS_ERR(trans))
6038  		return PTR_ERR(trans);
6039  
6040  	ret = btrfs_update_inode(trans, root, inode);
6041  	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6042  		/* whoops, lets try again with the full transaction */
6043  		btrfs_end_transaction(trans);
6044  		trans = btrfs_start_transaction(root, 1);
6045  		if (IS_ERR(trans))
6046  			return PTR_ERR(trans);
6047  
6048  		ret = btrfs_update_inode(trans, root, inode);
6049  	}
6050  	btrfs_end_transaction(trans);
6051  	if (inode->delayed_node)
6052  		btrfs_balance_delayed_items(fs_info);
6053  
6054  	return ret;
6055  }
6056  
6057  /*
6058   * This is a copy of file_update_time.  We need this so we can return error on
6059   * ENOSPC for updating the inode in the case of file write and mmap writes.
6060   */
6061  static int btrfs_update_time(struct inode *inode, int flags)
6062  {
6063  	struct btrfs_root *root = BTRFS_I(inode)->root;
6064  	bool dirty = flags & ~S_VERSION;
6065  
6066  	if (btrfs_root_readonly(root))
6067  		return -EROFS;
6068  
6069  	dirty = inode_update_timestamps(inode, flags);
6070  	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6071  }
6072  
6073  /*
6074   * helper to find a free sequence number in a given directory.  This current
6075   * code is very simple, later versions will do smarter things in the btree
6076   */
6077  int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6078  {
6079  	int ret = 0;
6080  
6081  	if (dir->index_cnt == (u64)-1) {
6082  		ret = btrfs_inode_delayed_dir_index_count(dir);
6083  		if (ret) {
6084  			ret = btrfs_set_inode_index_count(dir);
6085  			if (ret)
6086  				return ret;
6087  		}
6088  	}
6089  
6090  	*index = dir->index_cnt;
6091  	dir->index_cnt++;
6092  
6093  	return ret;
6094  }
6095  
6096  static int btrfs_insert_inode_locked(struct inode *inode)
6097  {
6098  	struct btrfs_iget_args args;
6099  
6100  	args.ino = BTRFS_I(inode)->location.objectid;
6101  	args.root = BTRFS_I(inode)->root;
6102  
6103  	return insert_inode_locked4(inode,
6104  		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6105  		   btrfs_find_actor, &args);
6106  }
6107  
6108  int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6109  			    unsigned int *trans_num_items)
6110  {
6111  	struct inode *dir = args->dir;
6112  	struct inode *inode = args->inode;
6113  	int ret;
6114  
6115  	if (!args->orphan) {
6116  		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6117  					     &args->fname);
6118  		if (ret)
6119  			return ret;
6120  	}
6121  
6122  	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6123  	if (ret) {
6124  		fscrypt_free_filename(&args->fname);
6125  		return ret;
6126  	}
6127  
6128  	/* 1 to add inode item */
6129  	*trans_num_items = 1;
6130  	/* 1 to add compression property */
6131  	if (BTRFS_I(dir)->prop_compress)
6132  		(*trans_num_items)++;
6133  	/* 1 to add default ACL xattr */
6134  	if (args->default_acl)
6135  		(*trans_num_items)++;
6136  	/* 1 to add access ACL xattr */
6137  	if (args->acl)
6138  		(*trans_num_items)++;
6139  #ifdef CONFIG_SECURITY
6140  	/* 1 to add LSM xattr */
6141  	if (dir->i_security)
6142  		(*trans_num_items)++;
6143  #endif
6144  	if (args->orphan) {
6145  		/* 1 to add orphan item */
6146  		(*trans_num_items)++;
6147  	} else {
6148  		/*
6149  		 * 1 to add dir item
6150  		 * 1 to add dir index
6151  		 * 1 to update parent inode item
6152  		 *
6153  		 * No need for 1 unit for the inode ref item because it is
6154  		 * inserted in a batch together with the inode item at
6155  		 * btrfs_create_new_inode().
6156  		 */
6157  		*trans_num_items += 3;
6158  	}
6159  	return 0;
6160  }
6161  
6162  void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6163  {
6164  	posix_acl_release(args->acl);
6165  	posix_acl_release(args->default_acl);
6166  	fscrypt_free_filename(&args->fname);
6167  }
6168  
6169  /*
6170   * Inherit flags from the parent inode.
6171   *
6172   * Currently only the compression flags and the cow flags are inherited.
6173   */
6174  static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6175  {
6176  	unsigned int flags;
6177  
6178  	flags = dir->flags;
6179  
6180  	if (flags & BTRFS_INODE_NOCOMPRESS) {
6181  		inode->flags &= ~BTRFS_INODE_COMPRESS;
6182  		inode->flags |= BTRFS_INODE_NOCOMPRESS;
6183  	} else if (flags & BTRFS_INODE_COMPRESS) {
6184  		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6185  		inode->flags |= BTRFS_INODE_COMPRESS;
6186  	}
6187  
6188  	if (flags & BTRFS_INODE_NODATACOW) {
6189  		inode->flags |= BTRFS_INODE_NODATACOW;
6190  		if (S_ISREG(inode->vfs_inode.i_mode))
6191  			inode->flags |= BTRFS_INODE_NODATASUM;
6192  	}
6193  
6194  	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6195  }
6196  
6197  int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6198  			   struct btrfs_new_inode_args *args)
6199  {
6200  	struct inode *dir = args->dir;
6201  	struct inode *inode = args->inode;
6202  	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6203  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6204  	struct btrfs_root *root;
6205  	struct btrfs_inode_item *inode_item;
6206  	struct btrfs_key *location;
6207  	struct btrfs_path *path;
6208  	u64 objectid;
6209  	struct btrfs_inode_ref *ref;
6210  	struct btrfs_key key[2];
6211  	u32 sizes[2];
6212  	struct btrfs_item_batch batch;
6213  	unsigned long ptr;
6214  	int ret;
6215  
6216  	path = btrfs_alloc_path();
6217  	if (!path)
6218  		return -ENOMEM;
6219  
6220  	if (!args->subvol)
6221  		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6222  	root = BTRFS_I(inode)->root;
6223  
6224  	ret = btrfs_get_free_objectid(root, &objectid);
6225  	if (ret)
6226  		goto out;
6227  	inode->i_ino = objectid;
6228  
6229  	if (args->orphan) {
6230  		/*
6231  		 * O_TMPFILE, set link count to 0, so that after this point, we
6232  		 * fill in an inode item with the correct link count.
6233  		 */
6234  		set_nlink(inode, 0);
6235  	} else {
6236  		trace_btrfs_inode_request(dir);
6237  
6238  		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6239  		if (ret)
6240  			goto out;
6241  	}
6242  	/* index_cnt is ignored for everything but a dir. */
6243  	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6244  	BTRFS_I(inode)->generation = trans->transid;
6245  	inode->i_generation = BTRFS_I(inode)->generation;
6246  
6247  	/*
6248  	 * Subvolumes don't inherit flags from their parent directory.
6249  	 * Originally this was probably by accident, but we probably can't
6250  	 * change it now without compatibility issues.
6251  	 */
6252  	if (!args->subvol)
6253  		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6254  
6255  	if (S_ISREG(inode->i_mode)) {
6256  		if (btrfs_test_opt(fs_info, NODATASUM))
6257  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6258  		if (btrfs_test_opt(fs_info, NODATACOW))
6259  			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6260  				BTRFS_INODE_NODATASUM;
6261  	}
6262  
6263  	location = &BTRFS_I(inode)->location;
6264  	location->objectid = objectid;
6265  	location->offset = 0;
6266  	location->type = BTRFS_INODE_ITEM_KEY;
6267  
6268  	ret = btrfs_insert_inode_locked(inode);
6269  	if (ret < 0) {
6270  		if (!args->orphan)
6271  			BTRFS_I(dir)->index_cnt--;
6272  		goto out;
6273  	}
6274  
6275  	/*
6276  	 * We could have gotten an inode number from somebody who was fsynced
6277  	 * and then removed in this same transaction, so let's just set full
6278  	 * sync since it will be a full sync anyway and this will blow away the
6279  	 * old info in the log.
6280  	 */
6281  	btrfs_set_inode_full_sync(BTRFS_I(inode));
6282  
6283  	key[0].objectid = objectid;
6284  	key[0].type = BTRFS_INODE_ITEM_KEY;
6285  	key[0].offset = 0;
6286  
6287  	sizes[0] = sizeof(struct btrfs_inode_item);
6288  
6289  	if (!args->orphan) {
6290  		/*
6291  		 * Start new inodes with an inode_ref. This is slightly more
6292  		 * efficient for small numbers of hard links since they will
6293  		 * be packed into one item. Extended refs will kick in if we
6294  		 * add more hard links than can fit in the ref item.
6295  		 */
6296  		key[1].objectid = objectid;
6297  		key[1].type = BTRFS_INODE_REF_KEY;
6298  		if (args->subvol) {
6299  			key[1].offset = objectid;
6300  			sizes[1] = 2 + sizeof(*ref);
6301  		} else {
6302  			key[1].offset = btrfs_ino(BTRFS_I(dir));
6303  			sizes[1] = name->len + sizeof(*ref);
6304  		}
6305  	}
6306  
6307  	batch.keys = &key[0];
6308  	batch.data_sizes = &sizes[0];
6309  	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6310  	batch.nr = args->orphan ? 1 : 2;
6311  	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6312  	if (ret != 0) {
6313  		btrfs_abort_transaction(trans, ret);
6314  		goto discard;
6315  	}
6316  
6317  	inode->i_mtime = inode_set_ctime_current(inode);
6318  	inode->i_atime = inode->i_mtime;
6319  	BTRFS_I(inode)->i_otime = inode->i_mtime;
6320  
6321  	/*
6322  	 * We're going to fill the inode item now, so at this point the inode
6323  	 * must be fully initialized.
6324  	 */
6325  
6326  	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6327  				  struct btrfs_inode_item);
6328  	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6329  			     sizeof(*inode_item));
6330  	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6331  
6332  	if (!args->orphan) {
6333  		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6334  				     struct btrfs_inode_ref);
6335  		ptr = (unsigned long)(ref + 1);
6336  		if (args->subvol) {
6337  			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6338  			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6339  			write_extent_buffer(path->nodes[0], "..", ptr, 2);
6340  		} else {
6341  			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6342  						     name->len);
6343  			btrfs_set_inode_ref_index(path->nodes[0], ref,
6344  						  BTRFS_I(inode)->dir_index);
6345  			write_extent_buffer(path->nodes[0], name->name, ptr,
6346  					    name->len);
6347  		}
6348  	}
6349  
6350  	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
6351  	/*
6352  	 * We don't need the path anymore, plus inheriting properties, adding
6353  	 * ACLs, security xattrs, orphan item or adding the link, will result in
6354  	 * allocating yet another path. So just free our path.
6355  	 */
6356  	btrfs_free_path(path);
6357  	path = NULL;
6358  
6359  	if (args->subvol) {
6360  		struct inode *parent;
6361  
6362  		/*
6363  		 * Subvolumes inherit properties from their parent subvolume,
6364  		 * not the directory they were created in.
6365  		 */
6366  		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
6367  				    BTRFS_I(dir)->root);
6368  		if (IS_ERR(parent)) {
6369  			ret = PTR_ERR(parent);
6370  		} else {
6371  			ret = btrfs_inode_inherit_props(trans, inode, parent);
6372  			iput(parent);
6373  		}
6374  	} else {
6375  		ret = btrfs_inode_inherit_props(trans, inode, dir);
6376  	}
6377  	if (ret) {
6378  		btrfs_err(fs_info,
6379  			  "error inheriting props for ino %llu (root %llu): %d",
6380  			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
6381  			  ret);
6382  	}
6383  
6384  	/*
6385  	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6386  	 * probably a bug.
6387  	 */
6388  	if (!args->subvol) {
6389  		ret = btrfs_init_inode_security(trans, args);
6390  		if (ret) {
6391  			btrfs_abort_transaction(trans, ret);
6392  			goto discard;
6393  		}
6394  	}
6395  
6396  	inode_tree_add(BTRFS_I(inode));
6397  
6398  	trace_btrfs_inode_new(inode);
6399  	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6400  
6401  	btrfs_update_root_times(trans, root);
6402  
6403  	if (args->orphan) {
6404  		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6405  	} else {
6406  		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6407  				     0, BTRFS_I(inode)->dir_index);
6408  	}
6409  	if (ret) {
6410  		btrfs_abort_transaction(trans, ret);
6411  		goto discard;
6412  	}
6413  
6414  	return 0;
6415  
6416  discard:
6417  	/*
6418  	 * discard_new_inode() calls iput(), but the caller owns the reference
6419  	 * to the inode.
6420  	 */
6421  	ihold(inode);
6422  	discard_new_inode(inode);
6423  out:
6424  	btrfs_free_path(path);
6425  	return ret;
6426  }
6427  
6428  /*
6429   * utility function to add 'inode' into 'parent_inode' with
6430   * a give name and a given sequence number.
6431   * if 'add_backref' is true, also insert a backref from the
6432   * inode to the parent directory.
6433   */
6434  int btrfs_add_link(struct btrfs_trans_handle *trans,
6435  		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6436  		   const struct fscrypt_str *name, int add_backref, u64 index)
6437  {
6438  	int ret = 0;
6439  	struct btrfs_key key;
6440  	struct btrfs_root *root = parent_inode->root;
6441  	u64 ino = btrfs_ino(inode);
6442  	u64 parent_ino = btrfs_ino(parent_inode);
6443  
6444  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6445  		memcpy(&key, &inode->root->root_key, sizeof(key));
6446  	} else {
6447  		key.objectid = ino;
6448  		key.type = BTRFS_INODE_ITEM_KEY;
6449  		key.offset = 0;
6450  	}
6451  
6452  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6453  		ret = btrfs_add_root_ref(trans, key.objectid,
6454  					 root->root_key.objectid, parent_ino,
6455  					 index, name);
6456  	} else if (add_backref) {
6457  		ret = btrfs_insert_inode_ref(trans, root, name,
6458  					     ino, parent_ino, index);
6459  	}
6460  
6461  	/* Nothing to clean up yet */
6462  	if (ret)
6463  		return ret;
6464  
6465  	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6466  				    btrfs_inode_type(&inode->vfs_inode), index);
6467  	if (ret == -EEXIST || ret == -EOVERFLOW)
6468  		goto fail_dir_item;
6469  	else if (ret) {
6470  		btrfs_abort_transaction(trans, ret);
6471  		return ret;
6472  	}
6473  
6474  	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6475  			   name->len * 2);
6476  	inode_inc_iversion(&parent_inode->vfs_inode);
6477  	/*
6478  	 * If we are replaying a log tree, we do not want to update the mtime
6479  	 * and ctime of the parent directory with the current time, since the
6480  	 * log replay procedure is responsible for setting them to their correct
6481  	 * values (the ones it had when the fsync was done).
6482  	 */
6483  	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6484  		parent_inode->vfs_inode.i_mtime =
6485  			inode_set_ctime_current(&parent_inode->vfs_inode);
6486  
6487  	ret = btrfs_update_inode(trans, root, parent_inode);
6488  	if (ret)
6489  		btrfs_abort_transaction(trans, ret);
6490  	return ret;
6491  
6492  fail_dir_item:
6493  	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6494  		u64 local_index;
6495  		int err;
6496  		err = btrfs_del_root_ref(trans, key.objectid,
6497  					 root->root_key.objectid, parent_ino,
6498  					 &local_index, name);
6499  		if (err)
6500  			btrfs_abort_transaction(trans, err);
6501  	} else if (add_backref) {
6502  		u64 local_index;
6503  		int err;
6504  
6505  		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6506  					  &local_index);
6507  		if (err)
6508  			btrfs_abort_transaction(trans, err);
6509  	}
6510  
6511  	/* Return the original error code */
6512  	return ret;
6513  }
6514  
6515  static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6516  			       struct inode *inode)
6517  {
6518  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6519  	struct btrfs_root *root = BTRFS_I(dir)->root;
6520  	struct btrfs_new_inode_args new_inode_args = {
6521  		.dir = dir,
6522  		.dentry = dentry,
6523  		.inode = inode,
6524  	};
6525  	unsigned int trans_num_items;
6526  	struct btrfs_trans_handle *trans;
6527  	int err;
6528  
6529  	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6530  	if (err)
6531  		goto out_inode;
6532  
6533  	trans = btrfs_start_transaction(root, trans_num_items);
6534  	if (IS_ERR(trans)) {
6535  		err = PTR_ERR(trans);
6536  		goto out_new_inode_args;
6537  	}
6538  
6539  	err = btrfs_create_new_inode(trans, &new_inode_args);
6540  	if (!err)
6541  		d_instantiate_new(dentry, inode);
6542  
6543  	btrfs_end_transaction(trans);
6544  	btrfs_btree_balance_dirty(fs_info);
6545  out_new_inode_args:
6546  	btrfs_new_inode_args_destroy(&new_inode_args);
6547  out_inode:
6548  	if (err)
6549  		iput(inode);
6550  	return err;
6551  }
6552  
6553  static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6554  		       struct dentry *dentry, umode_t mode, dev_t rdev)
6555  {
6556  	struct inode *inode;
6557  
6558  	inode = new_inode(dir->i_sb);
6559  	if (!inode)
6560  		return -ENOMEM;
6561  	inode_init_owner(idmap, inode, dir, mode);
6562  	inode->i_op = &btrfs_special_inode_operations;
6563  	init_special_inode(inode, inode->i_mode, rdev);
6564  	return btrfs_create_common(dir, dentry, inode);
6565  }
6566  
6567  static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6568  			struct dentry *dentry, umode_t mode, bool excl)
6569  {
6570  	struct inode *inode;
6571  
6572  	inode = new_inode(dir->i_sb);
6573  	if (!inode)
6574  		return -ENOMEM;
6575  	inode_init_owner(idmap, inode, dir, mode);
6576  	inode->i_fop = &btrfs_file_operations;
6577  	inode->i_op = &btrfs_file_inode_operations;
6578  	inode->i_mapping->a_ops = &btrfs_aops;
6579  	return btrfs_create_common(dir, dentry, inode);
6580  }
6581  
6582  static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6583  		      struct dentry *dentry)
6584  {
6585  	struct btrfs_trans_handle *trans = NULL;
6586  	struct btrfs_root *root = BTRFS_I(dir)->root;
6587  	struct inode *inode = d_inode(old_dentry);
6588  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6589  	struct fscrypt_name fname;
6590  	u64 index;
6591  	int err;
6592  	int drop_inode = 0;
6593  
6594  	/* do not allow sys_link's with other subvols of the same device */
6595  	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6596  		return -EXDEV;
6597  
6598  	if (inode->i_nlink >= BTRFS_LINK_MAX)
6599  		return -EMLINK;
6600  
6601  	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6602  	if (err)
6603  		goto fail;
6604  
6605  	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6606  	if (err)
6607  		goto fail;
6608  
6609  	/*
6610  	 * 2 items for inode and inode ref
6611  	 * 2 items for dir items
6612  	 * 1 item for parent inode
6613  	 * 1 item for orphan item deletion if O_TMPFILE
6614  	 */
6615  	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6616  	if (IS_ERR(trans)) {
6617  		err = PTR_ERR(trans);
6618  		trans = NULL;
6619  		goto fail;
6620  	}
6621  
6622  	/* There are several dir indexes for this inode, clear the cache. */
6623  	BTRFS_I(inode)->dir_index = 0ULL;
6624  	inc_nlink(inode);
6625  	inode_inc_iversion(inode);
6626  	inode_set_ctime_current(inode);
6627  	ihold(inode);
6628  	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6629  
6630  	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6631  			     &fname.disk_name, 1, index);
6632  
6633  	if (err) {
6634  		drop_inode = 1;
6635  	} else {
6636  		struct dentry *parent = dentry->d_parent;
6637  
6638  		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6639  		if (err)
6640  			goto fail;
6641  		if (inode->i_nlink == 1) {
6642  			/*
6643  			 * If new hard link count is 1, it's a file created
6644  			 * with open(2) O_TMPFILE flag.
6645  			 */
6646  			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6647  			if (err)
6648  				goto fail;
6649  		}
6650  		d_instantiate(dentry, inode);
6651  		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6652  	}
6653  
6654  fail:
6655  	fscrypt_free_filename(&fname);
6656  	if (trans)
6657  		btrfs_end_transaction(trans);
6658  	if (drop_inode) {
6659  		inode_dec_link_count(inode);
6660  		iput(inode);
6661  	}
6662  	btrfs_btree_balance_dirty(fs_info);
6663  	return err;
6664  }
6665  
6666  static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6667  		       struct dentry *dentry, umode_t mode)
6668  {
6669  	struct inode *inode;
6670  
6671  	inode = new_inode(dir->i_sb);
6672  	if (!inode)
6673  		return -ENOMEM;
6674  	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6675  	inode->i_op = &btrfs_dir_inode_operations;
6676  	inode->i_fop = &btrfs_dir_file_operations;
6677  	return btrfs_create_common(dir, dentry, inode);
6678  }
6679  
6680  static noinline int uncompress_inline(struct btrfs_path *path,
6681  				      struct page *page,
6682  				      struct btrfs_file_extent_item *item)
6683  {
6684  	int ret;
6685  	struct extent_buffer *leaf = path->nodes[0];
6686  	char *tmp;
6687  	size_t max_size;
6688  	unsigned long inline_size;
6689  	unsigned long ptr;
6690  	int compress_type;
6691  
6692  	compress_type = btrfs_file_extent_compression(leaf, item);
6693  	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6694  	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6695  	tmp = kmalloc(inline_size, GFP_NOFS);
6696  	if (!tmp)
6697  		return -ENOMEM;
6698  	ptr = btrfs_file_extent_inline_start(item);
6699  
6700  	read_extent_buffer(leaf, tmp, ptr, inline_size);
6701  
6702  	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6703  	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
6704  
6705  	/*
6706  	 * decompression code contains a memset to fill in any space between the end
6707  	 * of the uncompressed data and the end of max_size in case the decompressed
6708  	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6709  	 * the end of an inline extent and the beginning of the next block, so we
6710  	 * cover that region here.
6711  	 */
6712  
6713  	if (max_size < PAGE_SIZE)
6714  		memzero_page(page, max_size, PAGE_SIZE - max_size);
6715  	kfree(tmp);
6716  	return ret;
6717  }
6718  
6719  static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6720  			      struct page *page)
6721  {
6722  	struct btrfs_file_extent_item *fi;
6723  	void *kaddr;
6724  	size_t copy_size;
6725  
6726  	if (!page || PageUptodate(page))
6727  		return 0;
6728  
6729  	ASSERT(page_offset(page) == 0);
6730  
6731  	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6732  			    struct btrfs_file_extent_item);
6733  	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6734  		return uncompress_inline(path, page, fi);
6735  
6736  	copy_size = min_t(u64, PAGE_SIZE,
6737  			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6738  	kaddr = kmap_local_page(page);
6739  	read_extent_buffer(path->nodes[0], kaddr,
6740  			   btrfs_file_extent_inline_start(fi), copy_size);
6741  	kunmap_local(kaddr);
6742  	if (copy_size < PAGE_SIZE)
6743  		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
6744  	return 0;
6745  }
6746  
6747  /*
6748   * Lookup the first extent overlapping a range in a file.
6749   *
6750   * @inode:	file to search in
6751   * @page:	page to read extent data into if the extent is inline
6752   * @pg_offset:	offset into @page to copy to
6753   * @start:	file offset
6754   * @len:	length of range starting at @start
6755   *
6756   * Return the first &struct extent_map which overlaps the given range, reading
6757   * it from the B-tree and caching it if necessary. Note that there may be more
6758   * extents which overlap the given range after the returned extent_map.
6759   *
6760   * If @page is not NULL and the extent is inline, this also reads the extent
6761   * data directly into the page and marks the extent up to date in the io_tree.
6762   *
6763   * Return: ERR_PTR on error, non-NULL extent_map on success.
6764   */
6765  struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6766  				    struct page *page, size_t pg_offset,
6767  				    u64 start, u64 len)
6768  {
6769  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6770  	int ret = 0;
6771  	u64 extent_start = 0;
6772  	u64 extent_end = 0;
6773  	u64 objectid = btrfs_ino(inode);
6774  	int extent_type = -1;
6775  	struct btrfs_path *path = NULL;
6776  	struct btrfs_root *root = inode->root;
6777  	struct btrfs_file_extent_item *item;
6778  	struct extent_buffer *leaf;
6779  	struct btrfs_key found_key;
6780  	struct extent_map *em = NULL;
6781  	struct extent_map_tree *em_tree = &inode->extent_tree;
6782  
6783  	read_lock(&em_tree->lock);
6784  	em = lookup_extent_mapping(em_tree, start, len);
6785  	read_unlock(&em_tree->lock);
6786  
6787  	if (em) {
6788  		if (em->start > start || em->start + em->len <= start)
6789  			free_extent_map(em);
6790  		else if (em->block_start == EXTENT_MAP_INLINE && page)
6791  			free_extent_map(em);
6792  		else
6793  			goto out;
6794  	}
6795  	em = alloc_extent_map();
6796  	if (!em) {
6797  		ret = -ENOMEM;
6798  		goto out;
6799  	}
6800  	em->start = EXTENT_MAP_HOLE;
6801  	em->orig_start = EXTENT_MAP_HOLE;
6802  	em->len = (u64)-1;
6803  	em->block_len = (u64)-1;
6804  
6805  	path = btrfs_alloc_path();
6806  	if (!path) {
6807  		ret = -ENOMEM;
6808  		goto out;
6809  	}
6810  
6811  	/* Chances are we'll be called again, so go ahead and do readahead */
6812  	path->reada = READA_FORWARD;
6813  
6814  	/*
6815  	 * The same explanation in load_free_space_cache applies here as well,
6816  	 * we only read when we're loading the free space cache, and at that
6817  	 * point the commit_root has everything we need.
6818  	 */
6819  	if (btrfs_is_free_space_inode(inode)) {
6820  		path->search_commit_root = 1;
6821  		path->skip_locking = 1;
6822  	}
6823  
6824  	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6825  	if (ret < 0) {
6826  		goto out;
6827  	} else if (ret > 0) {
6828  		if (path->slots[0] == 0)
6829  			goto not_found;
6830  		path->slots[0]--;
6831  		ret = 0;
6832  	}
6833  
6834  	leaf = path->nodes[0];
6835  	item = btrfs_item_ptr(leaf, path->slots[0],
6836  			      struct btrfs_file_extent_item);
6837  	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6838  	if (found_key.objectid != objectid ||
6839  	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6840  		/*
6841  		 * If we backup past the first extent we want to move forward
6842  		 * and see if there is an extent in front of us, otherwise we'll
6843  		 * say there is a hole for our whole search range which can
6844  		 * cause problems.
6845  		 */
6846  		extent_end = start;
6847  		goto next;
6848  	}
6849  
6850  	extent_type = btrfs_file_extent_type(leaf, item);
6851  	extent_start = found_key.offset;
6852  	extent_end = btrfs_file_extent_end(path);
6853  	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6854  	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6855  		/* Only regular file could have regular/prealloc extent */
6856  		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6857  			ret = -EUCLEAN;
6858  			btrfs_crit(fs_info,
6859  		"regular/prealloc extent found for non-regular inode %llu",
6860  				   btrfs_ino(inode));
6861  			goto out;
6862  		}
6863  		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6864  						       extent_start);
6865  	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6866  		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6867  						      path->slots[0],
6868  						      extent_start);
6869  	}
6870  next:
6871  	if (start >= extent_end) {
6872  		path->slots[0]++;
6873  		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6874  			ret = btrfs_next_leaf(root, path);
6875  			if (ret < 0)
6876  				goto out;
6877  			else if (ret > 0)
6878  				goto not_found;
6879  
6880  			leaf = path->nodes[0];
6881  		}
6882  		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6883  		if (found_key.objectid != objectid ||
6884  		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6885  			goto not_found;
6886  		if (start + len <= found_key.offset)
6887  			goto not_found;
6888  		if (start > found_key.offset)
6889  			goto next;
6890  
6891  		/* New extent overlaps with existing one */
6892  		em->start = start;
6893  		em->orig_start = start;
6894  		em->len = found_key.offset - start;
6895  		em->block_start = EXTENT_MAP_HOLE;
6896  		goto insert;
6897  	}
6898  
6899  	btrfs_extent_item_to_extent_map(inode, path, item, em);
6900  
6901  	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6902  	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6903  		goto insert;
6904  	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6905  		/*
6906  		 * Inline extent can only exist at file offset 0. This is
6907  		 * ensured by tree-checker and inline extent creation path.
6908  		 * Thus all members representing file offsets should be zero.
6909  		 */
6910  		ASSERT(pg_offset == 0);
6911  		ASSERT(extent_start == 0);
6912  		ASSERT(em->start == 0);
6913  
6914  		/*
6915  		 * btrfs_extent_item_to_extent_map() should have properly
6916  		 * initialized em members already.
6917  		 *
6918  		 * Other members are not utilized for inline extents.
6919  		 */
6920  		ASSERT(em->block_start == EXTENT_MAP_INLINE);
6921  		ASSERT(em->len == fs_info->sectorsize);
6922  
6923  		ret = read_inline_extent(inode, path, page);
6924  		if (ret < 0)
6925  			goto out;
6926  		goto insert;
6927  	}
6928  not_found:
6929  	em->start = start;
6930  	em->orig_start = start;
6931  	em->len = len;
6932  	em->block_start = EXTENT_MAP_HOLE;
6933  insert:
6934  	ret = 0;
6935  	btrfs_release_path(path);
6936  	if (em->start > start || extent_map_end(em) <= start) {
6937  		btrfs_err(fs_info,
6938  			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
6939  			  em->start, em->len, start, len);
6940  		ret = -EIO;
6941  		goto out;
6942  	}
6943  
6944  	write_lock(&em_tree->lock);
6945  	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6946  	write_unlock(&em_tree->lock);
6947  out:
6948  	btrfs_free_path(path);
6949  
6950  	trace_btrfs_get_extent(root, inode, em);
6951  
6952  	if (ret) {
6953  		free_extent_map(em);
6954  		return ERR_PTR(ret);
6955  	}
6956  	return em;
6957  }
6958  
6959  static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
6960  						  struct btrfs_dio_data *dio_data,
6961  						  const u64 start,
6962  						  const u64 len,
6963  						  const u64 orig_start,
6964  						  const u64 block_start,
6965  						  const u64 block_len,
6966  						  const u64 orig_block_len,
6967  						  const u64 ram_bytes,
6968  						  const int type)
6969  {
6970  	struct extent_map *em = NULL;
6971  	struct btrfs_ordered_extent *ordered;
6972  
6973  	if (type != BTRFS_ORDERED_NOCOW) {
6974  		em = create_io_em(inode, start, len, orig_start, block_start,
6975  				  block_len, orig_block_len, ram_bytes,
6976  				  BTRFS_COMPRESS_NONE, /* compress_type */
6977  				  type);
6978  		if (IS_ERR(em))
6979  			goto out;
6980  	}
6981  	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
6982  					     block_start, block_len, 0,
6983  					     (1 << type) |
6984  					     (1 << BTRFS_ORDERED_DIRECT),
6985  					     BTRFS_COMPRESS_NONE);
6986  	if (IS_ERR(ordered)) {
6987  		if (em) {
6988  			free_extent_map(em);
6989  			btrfs_drop_extent_map_range(inode, start,
6990  						    start + len - 1, false);
6991  		}
6992  		em = ERR_CAST(ordered);
6993  	} else {
6994  		ASSERT(!dio_data->ordered);
6995  		dio_data->ordered = ordered;
6996  	}
6997   out:
6998  
6999  	return em;
7000  }
7001  
7002  static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7003  						  struct btrfs_dio_data *dio_data,
7004  						  u64 start, u64 len)
7005  {
7006  	struct btrfs_root *root = inode->root;
7007  	struct btrfs_fs_info *fs_info = root->fs_info;
7008  	struct extent_map *em;
7009  	struct btrfs_key ins;
7010  	u64 alloc_hint;
7011  	int ret;
7012  
7013  	alloc_hint = get_extent_allocation_hint(inode, start, len);
7014  again:
7015  	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7016  				   0, alloc_hint, &ins, 1, 1);
7017  	if (ret == -EAGAIN) {
7018  		ASSERT(btrfs_is_zoned(fs_info));
7019  		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
7020  			       TASK_UNINTERRUPTIBLE);
7021  		goto again;
7022  	}
7023  	if (ret)
7024  		return ERR_PTR(ret);
7025  
7026  	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
7027  				     ins.objectid, ins.offset, ins.offset,
7028  				     ins.offset, BTRFS_ORDERED_REGULAR);
7029  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7030  	if (IS_ERR(em))
7031  		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7032  					   1);
7033  
7034  	return em;
7035  }
7036  
7037  static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7038  {
7039  	struct btrfs_block_group *block_group;
7040  	bool readonly = false;
7041  
7042  	block_group = btrfs_lookup_block_group(fs_info, bytenr);
7043  	if (!block_group || block_group->ro)
7044  		readonly = true;
7045  	if (block_group)
7046  		btrfs_put_block_group(block_group);
7047  	return readonly;
7048  }
7049  
7050  /*
7051   * Check if we can do nocow write into the range [@offset, @offset + @len)
7052   *
7053   * @offset:	File offset
7054   * @len:	The length to write, will be updated to the nocow writeable
7055   *		range
7056   * @orig_start:	(optional) Return the original file offset of the file extent
7057   * @orig_len:	(optional) Return the original on-disk length of the file extent
7058   * @ram_bytes:	(optional) Return the ram_bytes of the file extent
7059   * @strict:	if true, omit optimizations that might force us into unnecessary
7060   *		cow. e.g., don't trust generation number.
7061   *
7062   * Return:
7063   * >0	and update @len if we can do nocow write
7064   *  0	if we can't do nocow write
7065   * <0	if error happened
7066   *
7067   * NOTE: This only checks the file extents, caller is responsible to wait for
7068   *	 any ordered extents.
7069   */
7070  noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7071  			      u64 *orig_start, u64 *orig_block_len,
7072  			      u64 *ram_bytes, bool nowait, bool strict)
7073  {
7074  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7075  	struct can_nocow_file_extent_args nocow_args = { 0 };
7076  	struct btrfs_path *path;
7077  	int ret;
7078  	struct extent_buffer *leaf;
7079  	struct btrfs_root *root = BTRFS_I(inode)->root;
7080  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7081  	struct btrfs_file_extent_item *fi;
7082  	struct btrfs_key key;
7083  	int found_type;
7084  
7085  	path = btrfs_alloc_path();
7086  	if (!path)
7087  		return -ENOMEM;
7088  	path->nowait = nowait;
7089  
7090  	ret = btrfs_lookup_file_extent(NULL, root, path,
7091  			btrfs_ino(BTRFS_I(inode)), offset, 0);
7092  	if (ret < 0)
7093  		goto out;
7094  
7095  	if (ret == 1) {
7096  		if (path->slots[0] == 0) {
7097  			/* can't find the item, must cow */
7098  			ret = 0;
7099  			goto out;
7100  		}
7101  		path->slots[0]--;
7102  	}
7103  	ret = 0;
7104  	leaf = path->nodes[0];
7105  	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7106  	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7107  	    key.type != BTRFS_EXTENT_DATA_KEY) {
7108  		/* not our file or wrong item type, must cow */
7109  		goto out;
7110  	}
7111  
7112  	if (key.offset > offset) {
7113  		/* Wrong offset, must cow */
7114  		goto out;
7115  	}
7116  
7117  	if (btrfs_file_extent_end(path) <= offset)
7118  		goto out;
7119  
7120  	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7121  	found_type = btrfs_file_extent_type(leaf, fi);
7122  	if (ram_bytes)
7123  		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7124  
7125  	nocow_args.start = offset;
7126  	nocow_args.end = offset + *len - 1;
7127  	nocow_args.strict = strict;
7128  	nocow_args.free_path = true;
7129  
7130  	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7131  	/* can_nocow_file_extent() has freed the path. */
7132  	path = NULL;
7133  
7134  	if (ret != 1) {
7135  		/* Treat errors as not being able to NOCOW. */
7136  		ret = 0;
7137  		goto out;
7138  	}
7139  
7140  	ret = 0;
7141  	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
7142  		goto out;
7143  
7144  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7145  	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7146  		u64 range_end;
7147  
7148  		range_end = round_up(offset + nocow_args.num_bytes,
7149  				     root->fs_info->sectorsize) - 1;
7150  		ret = test_range_bit(io_tree, offset, range_end,
7151  				     EXTENT_DELALLOC, 0, NULL);
7152  		if (ret) {
7153  			ret = -EAGAIN;
7154  			goto out;
7155  		}
7156  
7157  		cond_resched();
7158  	}
7159  
7160  	if (orig_start)
7161  		*orig_start = key.offset - nocow_args.extent_offset;
7162  	if (orig_block_len)
7163  		*orig_block_len = nocow_args.disk_num_bytes;
7164  
7165  	*len = nocow_args.num_bytes;
7166  	ret = 1;
7167  out:
7168  	btrfs_free_path(path);
7169  	return ret;
7170  }
7171  
7172  static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7173  			      struct extent_state **cached_state,
7174  			      unsigned int iomap_flags)
7175  {
7176  	const bool writing = (iomap_flags & IOMAP_WRITE);
7177  	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7178  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7179  	struct btrfs_ordered_extent *ordered;
7180  	int ret = 0;
7181  
7182  	while (1) {
7183  		if (nowait) {
7184  			if (!try_lock_extent(io_tree, lockstart, lockend,
7185  					     cached_state))
7186  				return -EAGAIN;
7187  		} else {
7188  			lock_extent(io_tree, lockstart, lockend, cached_state);
7189  		}
7190  		/*
7191  		 * We're concerned with the entire range that we're going to be
7192  		 * doing DIO to, so we need to make sure there's no ordered
7193  		 * extents in this range.
7194  		 */
7195  		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7196  						     lockend - lockstart + 1);
7197  
7198  		/*
7199  		 * We need to make sure there are no buffered pages in this
7200  		 * range either, we could have raced between the invalidate in
7201  		 * generic_file_direct_write and locking the extent.  The
7202  		 * invalidate needs to happen so that reads after a write do not
7203  		 * get stale data.
7204  		 */
7205  		if (!ordered &&
7206  		    (!writing || !filemap_range_has_page(inode->i_mapping,
7207  							 lockstart, lockend)))
7208  			break;
7209  
7210  		unlock_extent(io_tree, lockstart, lockend, cached_state);
7211  
7212  		if (ordered) {
7213  			if (nowait) {
7214  				btrfs_put_ordered_extent(ordered);
7215  				ret = -EAGAIN;
7216  				break;
7217  			}
7218  			/*
7219  			 * If we are doing a DIO read and the ordered extent we
7220  			 * found is for a buffered write, we can not wait for it
7221  			 * to complete and retry, because if we do so we can
7222  			 * deadlock with concurrent buffered writes on page
7223  			 * locks. This happens only if our DIO read covers more
7224  			 * than one extent map, if at this point has already
7225  			 * created an ordered extent for a previous extent map
7226  			 * and locked its range in the inode's io tree, and a
7227  			 * concurrent write against that previous extent map's
7228  			 * range and this range started (we unlock the ranges
7229  			 * in the io tree only when the bios complete and
7230  			 * buffered writes always lock pages before attempting
7231  			 * to lock range in the io tree).
7232  			 */
7233  			if (writing ||
7234  			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7235  				btrfs_start_ordered_extent(ordered);
7236  			else
7237  				ret = nowait ? -EAGAIN : -ENOTBLK;
7238  			btrfs_put_ordered_extent(ordered);
7239  		} else {
7240  			/*
7241  			 * We could trigger writeback for this range (and wait
7242  			 * for it to complete) and then invalidate the pages for
7243  			 * this range (through invalidate_inode_pages2_range()),
7244  			 * but that can lead us to a deadlock with a concurrent
7245  			 * call to readahead (a buffered read or a defrag call
7246  			 * triggered a readahead) on a page lock due to an
7247  			 * ordered dio extent we created before but did not have
7248  			 * yet a corresponding bio submitted (whence it can not
7249  			 * complete), which makes readahead wait for that
7250  			 * ordered extent to complete while holding a lock on
7251  			 * that page.
7252  			 */
7253  			ret = nowait ? -EAGAIN : -ENOTBLK;
7254  		}
7255  
7256  		if (ret)
7257  			break;
7258  
7259  		cond_resched();
7260  	}
7261  
7262  	return ret;
7263  }
7264  
7265  /* The callers of this must take lock_extent() */
7266  static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7267  				       u64 len, u64 orig_start, u64 block_start,
7268  				       u64 block_len, u64 orig_block_len,
7269  				       u64 ram_bytes, int compress_type,
7270  				       int type)
7271  {
7272  	struct extent_map *em;
7273  	int ret;
7274  
7275  	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7276  	       type == BTRFS_ORDERED_COMPRESSED ||
7277  	       type == BTRFS_ORDERED_NOCOW ||
7278  	       type == BTRFS_ORDERED_REGULAR);
7279  
7280  	em = alloc_extent_map();
7281  	if (!em)
7282  		return ERR_PTR(-ENOMEM);
7283  
7284  	em->start = start;
7285  	em->orig_start = orig_start;
7286  	em->len = len;
7287  	em->block_len = block_len;
7288  	em->block_start = block_start;
7289  	em->orig_block_len = orig_block_len;
7290  	em->ram_bytes = ram_bytes;
7291  	em->generation = -1;
7292  	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7293  	if (type == BTRFS_ORDERED_PREALLOC) {
7294  		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7295  	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7296  		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7297  		em->compress_type = compress_type;
7298  	}
7299  
7300  	ret = btrfs_replace_extent_map_range(inode, em, true);
7301  	if (ret) {
7302  		free_extent_map(em);
7303  		return ERR_PTR(ret);
7304  	}
7305  
7306  	/* em got 2 refs now, callers needs to do free_extent_map once. */
7307  	return em;
7308  }
7309  
7310  
7311  static int btrfs_get_blocks_direct_write(struct extent_map **map,
7312  					 struct inode *inode,
7313  					 struct btrfs_dio_data *dio_data,
7314  					 u64 start, u64 *lenp,
7315  					 unsigned int iomap_flags)
7316  {
7317  	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7318  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7319  	struct extent_map *em = *map;
7320  	int type;
7321  	u64 block_start, orig_start, orig_block_len, ram_bytes;
7322  	struct btrfs_block_group *bg;
7323  	bool can_nocow = false;
7324  	bool space_reserved = false;
7325  	u64 len = *lenp;
7326  	u64 prev_len;
7327  	int ret = 0;
7328  
7329  	/*
7330  	 * We don't allocate a new extent in the following cases
7331  	 *
7332  	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
7333  	 * existing extent.
7334  	 * 2) The extent is marked as PREALLOC. We're good to go here and can
7335  	 * just use the extent.
7336  	 *
7337  	 */
7338  	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7339  	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7340  	     em->block_start != EXTENT_MAP_HOLE)) {
7341  		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7342  			type = BTRFS_ORDERED_PREALLOC;
7343  		else
7344  			type = BTRFS_ORDERED_NOCOW;
7345  		len = min(len, em->len - (start - em->start));
7346  		block_start = em->block_start + (start - em->start);
7347  
7348  		if (can_nocow_extent(inode, start, &len, &orig_start,
7349  				     &orig_block_len, &ram_bytes, false, false) == 1) {
7350  			bg = btrfs_inc_nocow_writers(fs_info, block_start);
7351  			if (bg)
7352  				can_nocow = true;
7353  		}
7354  	}
7355  
7356  	prev_len = len;
7357  	if (can_nocow) {
7358  		struct extent_map *em2;
7359  
7360  		/* We can NOCOW, so only need to reserve metadata space. */
7361  		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7362  						      nowait);
7363  		if (ret < 0) {
7364  			/* Our caller expects us to free the input extent map. */
7365  			free_extent_map(em);
7366  			*map = NULL;
7367  			btrfs_dec_nocow_writers(bg);
7368  			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
7369  				ret = -EAGAIN;
7370  			goto out;
7371  		}
7372  		space_reserved = true;
7373  
7374  		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
7375  					      orig_start, block_start,
7376  					      len, orig_block_len,
7377  					      ram_bytes, type);
7378  		btrfs_dec_nocow_writers(bg);
7379  		if (type == BTRFS_ORDERED_PREALLOC) {
7380  			free_extent_map(em);
7381  			*map = em2;
7382  			em = em2;
7383  		}
7384  
7385  		if (IS_ERR(em2)) {
7386  			ret = PTR_ERR(em2);
7387  			goto out;
7388  		}
7389  
7390  		dio_data->nocow_done = true;
7391  	} else {
7392  		/* Our caller expects us to free the input extent map. */
7393  		free_extent_map(em);
7394  		*map = NULL;
7395  
7396  		if (nowait) {
7397  			ret = -EAGAIN;
7398  			goto out;
7399  		}
7400  
7401  		/*
7402  		 * If we could not allocate data space before locking the file
7403  		 * range and we can't do a NOCOW write, then we have to fail.
7404  		 */
7405  		if (!dio_data->data_space_reserved) {
7406  			ret = -ENOSPC;
7407  			goto out;
7408  		}
7409  
7410  		/*
7411  		 * We have to COW and we have already reserved data space before,
7412  		 * so now we reserve only metadata.
7413  		 */
7414  		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7415  						      false);
7416  		if (ret < 0)
7417  			goto out;
7418  		space_reserved = true;
7419  
7420  		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
7421  		if (IS_ERR(em)) {
7422  			ret = PTR_ERR(em);
7423  			goto out;
7424  		}
7425  		*map = em;
7426  		len = min(len, em->len - (start - em->start));
7427  		if (len < prev_len)
7428  			btrfs_delalloc_release_metadata(BTRFS_I(inode),
7429  							prev_len - len, true);
7430  	}
7431  
7432  	/*
7433  	 * We have created our ordered extent, so we can now release our reservation
7434  	 * for an outstanding extent.
7435  	 */
7436  	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7437  
7438  	/*
7439  	 * Need to update the i_size under the extent lock so buffered
7440  	 * readers will get the updated i_size when we unlock.
7441  	 */
7442  	if (start + len > i_size_read(inode))
7443  		i_size_write(inode, start + len);
7444  out:
7445  	if (ret && space_reserved) {
7446  		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7447  		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7448  	}
7449  	*lenp = len;
7450  	return ret;
7451  }
7452  
7453  static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7454  		loff_t length, unsigned int flags, struct iomap *iomap,
7455  		struct iomap *srcmap)
7456  {
7457  	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7458  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7459  	struct extent_map *em;
7460  	struct extent_state *cached_state = NULL;
7461  	struct btrfs_dio_data *dio_data = iter->private;
7462  	u64 lockstart, lockend;
7463  	const bool write = !!(flags & IOMAP_WRITE);
7464  	int ret = 0;
7465  	u64 len = length;
7466  	const u64 data_alloc_len = length;
7467  	bool unlock_extents = false;
7468  
7469  	/*
7470  	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
7471  	 * we're NOWAIT we may submit a bio for a partial range and return
7472  	 * EIOCBQUEUED, which would result in an errant short read.
7473  	 *
7474  	 * The best way to handle this would be to allow for partial completions
7475  	 * of iocb's, so we could submit the partial bio, return and fault in
7476  	 * the rest of the pages, and then submit the io for the rest of the
7477  	 * range.  However we don't have that currently, so simply return
7478  	 * -EAGAIN at this point so that the normal path is used.
7479  	 */
7480  	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
7481  		return -EAGAIN;
7482  
7483  	/*
7484  	 * Cap the size of reads to that usually seen in buffered I/O as we need
7485  	 * to allocate a contiguous array for the checksums.
7486  	 */
7487  	if (!write)
7488  		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
7489  
7490  	lockstart = start;
7491  	lockend = start + len - 1;
7492  
7493  	/*
7494  	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
7495  	 * enough if we've written compressed pages to this area, so we need to
7496  	 * flush the dirty pages again to make absolutely sure that any
7497  	 * outstanding dirty pages are on disk - the first flush only starts
7498  	 * compression on the data, while keeping the pages locked, so by the
7499  	 * time the second flush returns we know bios for the compressed pages
7500  	 * were submitted and finished, and the pages no longer under writeback.
7501  	 *
7502  	 * If we have a NOWAIT request and we have any pages in the range that
7503  	 * are locked, likely due to compression still in progress, we don't want
7504  	 * to block on page locks. We also don't want to block on pages marked as
7505  	 * dirty or under writeback (same as for the non-compression case).
7506  	 * iomap_dio_rw() did the same check, but after that and before we got
7507  	 * here, mmap'ed writes may have happened or buffered reads started
7508  	 * (readpage() and readahead(), which lock pages), as we haven't locked
7509  	 * the file range yet.
7510  	 */
7511  	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7512  		     &BTRFS_I(inode)->runtime_flags)) {
7513  		if (flags & IOMAP_NOWAIT) {
7514  			if (filemap_range_needs_writeback(inode->i_mapping,
7515  							  lockstart, lockend))
7516  				return -EAGAIN;
7517  		} else {
7518  			ret = filemap_fdatawrite_range(inode->i_mapping, start,
7519  						       start + length - 1);
7520  			if (ret)
7521  				return ret;
7522  		}
7523  	}
7524  
7525  	memset(dio_data, 0, sizeof(*dio_data));
7526  
7527  	/*
7528  	 * We always try to allocate data space and must do it before locking
7529  	 * the file range, to avoid deadlocks with concurrent writes to the same
7530  	 * range if the range has several extents and the writes don't expand the
7531  	 * current i_size (the inode lock is taken in shared mode). If we fail to
7532  	 * allocate data space here we continue and later, after locking the
7533  	 * file range, we fail with ENOSPC only if we figure out we can not do a
7534  	 * NOCOW write.
7535  	 */
7536  	if (write && !(flags & IOMAP_NOWAIT)) {
7537  		ret = btrfs_check_data_free_space(BTRFS_I(inode),
7538  						  &dio_data->data_reserved,
7539  						  start, data_alloc_len, false);
7540  		if (!ret)
7541  			dio_data->data_space_reserved = true;
7542  		else if (ret && !(BTRFS_I(inode)->flags &
7543  				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
7544  			goto err;
7545  	}
7546  
7547  	/*
7548  	 * If this errors out it's because we couldn't invalidate pagecache for
7549  	 * this range and we need to fallback to buffered IO, or we are doing a
7550  	 * NOWAIT read/write and we need to block.
7551  	 */
7552  	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
7553  	if (ret < 0)
7554  		goto err;
7555  
7556  	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7557  	if (IS_ERR(em)) {
7558  		ret = PTR_ERR(em);
7559  		goto unlock_err;
7560  	}
7561  
7562  	/*
7563  	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7564  	 * io.  INLINE is special, and we could probably kludge it in here, but
7565  	 * it's still buffered so for safety lets just fall back to the generic
7566  	 * buffered path.
7567  	 *
7568  	 * For COMPRESSED we _have_ to read the entire extent in so we can
7569  	 * decompress it, so there will be buffering required no matter what we
7570  	 * do, so go ahead and fallback to buffered.
7571  	 *
7572  	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7573  	 * to buffered IO.  Don't blame me, this is the price we pay for using
7574  	 * the generic code.
7575  	 */
7576  	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7577  	    em->block_start == EXTENT_MAP_INLINE) {
7578  		free_extent_map(em);
7579  		/*
7580  		 * If we are in a NOWAIT context, return -EAGAIN in order to
7581  		 * fallback to buffered IO. This is not only because we can
7582  		 * block with buffered IO (no support for NOWAIT semantics at
7583  		 * the moment) but also to avoid returning short reads to user
7584  		 * space - this happens if we were able to read some data from
7585  		 * previous non-compressed extents and then when we fallback to
7586  		 * buffered IO, at btrfs_file_read_iter() by calling
7587  		 * filemap_read(), we fail to fault in pages for the read buffer,
7588  		 * in which case filemap_read() returns a short read (the number
7589  		 * of bytes previously read is > 0, so it does not return -EFAULT).
7590  		 */
7591  		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
7592  		goto unlock_err;
7593  	}
7594  
7595  	len = min(len, em->len - (start - em->start));
7596  
7597  	/*
7598  	 * If we have a NOWAIT request and the range contains multiple extents
7599  	 * (or a mix of extents and holes), then we return -EAGAIN to make the
7600  	 * caller fallback to a context where it can do a blocking (without
7601  	 * NOWAIT) request. This way we avoid doing partial IO and returning
7602  	 * success to the caller, which is not optimal for writes and for reads
7603  	 * it can result in unexpected behaviour for an application.
7604  	 *
7605  	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7606  	 * iomap_dio_rw(), we can end up returning less data then what the caller
7607  	 * asked for, resulting in an unexpected, and incorrect, short read.
7608  	 * That is, the caller asked to read N bytes and we return less than that,
7609  	 * which is wrong unless we are crossing EOF. This happens if we get a
7610  	 * page fault error when trying to fault in pages for the buffer that is
7611  	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7612  	 * have previously submitted bios for other extents in the range, in
7613  	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7614  	 * those bios have completed by the time we get the page fault error,
7615  	 * which we return back to our caller - we should only return EIOCBQUEUED
7616  	 * after we have submitted bios for all the extents in the range.
7617  	 */
7618  	if ((flags & IOMAP_NOWAIT) && len < length) {
7619  		free_extent_map(em);
7620  		ret = -EAGAIN;
7621  		goto unlock_err;
7622  	}
7623  
7624  	if (write) {
7625  		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7626  						    start, &len, flags);
7627  		if (ret < 0)
7628  			goto unlock_err;
7629  		unlock_extents = true;
7630  		/* Recalc len in case the new em is smaller than requested */
7631  		len = min(len, em->len - (start - em->start));
7632  		if (dio_data->data_space_reserved) {
7633  			u64 release_offset;
7634  			u64 release_len = 0;
7635  
7636  			if (dio_data->nocow_done) {
7637  				release_offset = start;
7638  				release_len = data_alloc_len;
7639  			} else if (len < data_alloc_len) {
7640  				release_offset = start + len;
7641  				release_len = data_alloc_len - len;
7642  			}
7643  
7644  			if (release_len > 0)
7645  				btrfs_free_reserved_data_space(BTRFS_I(inode),
7646  							       dio_data->data_reserved,
7647  							       release_offset,
7648  							       release_len);
7649  		}
7650  	} else {
7651  		/*
7652  		 * We need to unlock only the end area that we aren't using.
7653  		 * The rest is going to be unlocked by the endio routine.
7654  		 */
7655  		lockstart = start + len;
7656  		if (lockstart < lockend)
7657  			unlock_extents = true;
7658  	}
7659  
7660  	if (unlock_extents)
7661  		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7662  			      &cached_state);
7663  	else
7664  		free_extent_state(cached_state);
7665  
7666  	/*
7667  	 * Translate extent map information to iomap.
7668  	 * We trim the extents (and move the addr) even though iomap code does
7669  	 * that, since we have locked only the parts we are performing I/O in.
7670  	 */
7671  	if ((em->block_start == EXTENT_MAP_HOLE) ||
7672  	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7673  		iomap->addr = IOMAP_NULL_ADDR;
7674  		iomap->type = IOMAP_HOLE;
7675  	} else {
7676  		iomap->addr = em->block_start + (start - em->start);
7677  		iomap->type = IOMAP_MAPPED;
7678  	}
7679  	iomap->offset = start;
7680  	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7681  	iomap->length = len;
7682  	free_extent_map(em);
7683  
7684  	return 0;
7685  
7686  unlock_err:
7687  	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7688  		      &cached_state);
7689  err:
7690  	if (dio_data->data_space_reserved) {
7691  		btrfs_free_reserved_data_space(BTRFS_I(inode),
7692  					       dio_data->data_reserved,
7693  					       start, data_alloc_len);
7694  		extent_changeset_free(dio_data->data_reserved);
7695  	}
7696  
7697  	return ret;
7698  }
7699  
7700  static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7701  		ssize_t written, unsigned int flags, struct iomap *iomap)
7702  {
7703  	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7704  	struct btrfs_dio_data *dio_data = iter->private;
7705  	size_t submitted = dio_data->submitted;
7706  	const bool write = !!(flags & IOMAP_WRITE);
7707  	int ret = 0;
7708  
7709  	if (!write && (iomap->type == IOMAP_HOLE)) {
7710  		/* If reading from a hole, unlock and return */
7711  		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
7712  			      NULL);
7713  		return 0;
7714  	}
7715  
7716  	if (submitted < length) {
7717  		pos += submitted;
7718  		length -= submitted;
7719  		if (write)
7720  			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7721  						    pos, length, false);
7722  		else
7723  			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7724  				      pos + length - 1, NULL);
7725  		ret = -ENOTBLK;
7726  	}
7727  	if (write) {
7728  		btrfs_put_ordered_extent(dio_data->ordered);
7729  		dio_data->ordered = NULL;
7730  	}
7731  
7732  	if (write)
7733  		extent_changeset_free(dio_data->data_reserved);
7734  	return ret;
7735  }
7736  
7737  static void btrfs_dio_end_io(struct btrfs_bio *bbio)
7738  {
7739  	struct btrfs_dio_private *dip =
7740  		container_of(bbio, struct btrfs_dio_private, bbio);
7741  	struct btrfs_inode *inode = bbio->inode;
7742  	struct bio *bio = &bbio->bio;
7743  
7744  	if (bio->bi_status) {
7745  		btrfs_warn(inode->root->fs_info,
7746  		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
7747  			   btrfs_ino(inode), bio->bi_opf,
7748  			   dip->file_offset, dip->bytes, bio->bi_status);
7749  	}
7750  
7751  	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
7752  		btrfs_finish_ordered_extent(bbio->ordered, NULL,
7753  					    dip->file_offset, dip->bytes,
7754  					    !bio->bi_status);
7755  	} else {
7756  		unlock_extent(&inode->io_tree, dip->file_offset,
7757  			      dip->file_offset + dip->bytes - 1, NULL);
7758  	}
7759  
7760  	bbio->bio.bi_private = bbio->private;
7761  	iomap_dio_bio_end_io(bio);
7762  }
7763  
7764  static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
7765  				loff_t file_offset)
7766  {
7767  	struct btrfs_bio *bbio = btrfs_bio(bio);
7768  	struct btrfs_dio_private *dip =
7769  		container_of(bbio, struct btrfs_dio_private, bbio);
7770  	struct btrfs_dio_data *dio_data = iter->private;
7771  
7772  	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
7773  		       btrfs_dio_end_io, bio->bi_private);
7774  	bbio->inode = BTRFS_I(iter->inode);
7775  	bbio->file_offset = file_offset;
7776  
7777  	dip->file_offset = file_offset;
7778  	dip->bytes = bio->bi_iter.bi_size;
7779  
7780  	dio_data->submitted += bio->bi_iter.bi_size;
7781  
7782  	/*
7783  	 * Check if we are doing a partial write.  If we are, we need to split
7784  	 * the ordered extent to match the submitted bio.  Hang on to the
7785  	 * remaining unfinishable ordered_extent in dio_data so that it can be
7786  	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
7787  	 * remaining pages is blocked on the outstanding ordered extent.
7788  	 */
7789  	if (iter->flags & IOMAP_WRITE) {
7790  		int ret;
7791  
7792  		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
7793  		if (ret) {
7794  			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7795  						    file_offset, dip->bytes,
7796  						    !ret);
7797  			bio->bi_status = errno_to_blk_status(ret);
7798  			iomap_dio_bio_end_io(bio);
7799  			return;
7800  		}
7801  	}
7802  
7803  	btrfs_submit_bio(bbio, 0);
7804  }
7805  
7806  static const struct iomap_ops btrfs_dio_iomap_ops = {
7807  	.iomap_begin            = btrfs_dio_iomap_begin,
7808  	.iomap_end              = btrfs_dio_iomap_end,
7809  };
7810  
7811  static const struct iomap_dio_ops btrfs_dio_ops = {
7812  	.submit_io		= btrfs_dio_submit_io,
7813  	.bio_set		= &btrfs_dio_bioset,
7814  };
7815  
7816  ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
7817  {
7818  	struct btrfs_dio_data data = { 0 };
7819  
7820  	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7821  			    IOMAP_DIO_PARTIAL, &data, done_before);
7822  }
7823  
7824  struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
7825  				  size_t done_before)
7826  {
7827  	struct btrfs_dio_data data = { 0 };
7828  
7829  	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7830  			    IOMAP_DIO_PARTIAL, &data, done_before);
7831  }
7832  
7833  static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7834  			u64 start, u64 len)
7835  {
7836  	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
7837  	int	ret;
7838  
7839  	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
7840  	if (ret)
7841  		return ret;
7842  
7843  	/*
7844  	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
7845  	 * file range (0 to LLONG_MAX), but that is not enough if we have
7846  	 * compression enabled. The first filemap_fdatawrite_range() only kicks
7847  	 * in the compression of data (in an async thread) and will return
7848  	 * before the compression is done and writeback is started. A second
7849  	 * filemap_fdatawrite_range() is needed to wait for the compression to
7850  	 * complete and writeback to start. We also need to wait for ordered
7851  	 * extents to complete, because our fiemap implementation uses mainly
7852  	 * file extent items to list the extents, searching for extent maps
7853  	 * only for file ranges with holes or prealloc extents to figure out
7854  	 * if we have delalloc in those ranges.
7855  	 */
7856  	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7857  		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7858  		if (ret)
7859  			return ret;
7860  	}
7861  
7862  	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
7863  
7864  	/*
7865  	 * We did an initial flush to avoid holding the inode's lock while
7866  	 * triggering writeback and waiting for the completion of IO and ordered
7867  	 * extents. Now after we locked the inode we do it again, because it's
7868  	 * possible a new write may have happened in between those two steps.
7869  	 */
7870  	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7871  		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7872  		if (ret) {
7873  			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7874  			return ret;
7875  		}
7876  	}
7877  
7878  	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
7879  	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7880  
7881  	return ret;
7882  }
7883  
7884  static int btrfs_writepages(struct address_space *mapping,
7885  			    struct writeback_control *wbc)
7886  {
7887  	return extent_writepages(mapping, wbc);
7888  }
7889  
7890  static void btrfs_readahead(struct readahead_control *rac)
7891  {
7892  	extent_readahead(rac);
7893  }
7894  
7895  /*
7896   * For release_folio() and invalidate_folio() we have a race window where
7897   * folio_end_writeback() is called but the subpage spinlock is not yet released.
7898   * If we continue to release/invalidate the page, we could cause use-after-free
7899   * for subpage spinlock.  So this function is to spin and wait for subpage
7900   * spinlock.
7901   */
7902  static void wait_subpage_spinlock(struct page *page)
7903  {
7904  	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7905  	struct btrfs_subpage *subpage;
7906  
7907  	if (!btrfs_is_subpage(fs_info, page))
7908  		return;
7909  
7910  	ASSERT(PagePrivate(page) && page->private);
7911  	subpage = (struct btrfs_subpage *)page->private;
7912  
7913  	/*
7914  	 * This may look insane as we just acquire the spinlock and release it,
7915  	 * without doing anything.  But we just want to make sure no one is
7916  	 * still holding the subpage spinlock.
7917  	 * And since the page is not dirty nor writeback, and we have page
7918  	 * locked, the only possible way to hold a spinlock is from the endio
7919  	 * function to clear page writeback.
7920  	 *
7921  	 * Here we just acquire the spinlock so that all existing callers
7922  	 * should exit and we're safe to release/invalidate the page.
7923  	 */
7924  	spin_lock_irq(&subpage->lock);
7925  	spin_unlock_irq(&subpage->lock);
7926  }
7927  
7928  static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7929  {
7930  	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
7931  
7932  	if (ret == 1) {
7933  		wait_subpage_spinlock(&folio->page);
7934  		clear_page_extent_mapped(&folio->page);
7935  	}
7936  	return ret;
7937  }
7938  
7939  static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7940  {
7941  	if (folio_test_writeback(folio) || folio_test_dirty(folio))
7942  		return false;
7943  	return __btrfs_release_folio(folio, gfp_flags);
7944  }
7945  
7946  #ifdef CONFIG_MIGRATION
7947  static int btrfs_migrate_folio(struct address_space *mapping,
7948  			     struct folio *dst, struct folio *src,
7949  			     enum migrate_mode mode)
7950  {
7951  	int ret = filemap_migrate_folio(mapping, dst, src, mode);
7952  
7953  	if (ret != MIGRATEPAGE_SUCCESS)
7954  		return ret;
7955  
7956  	if (folio_test_ordered(src)) {
7957  		folio_clear_ordered(src);
7958  		folio_set_ordered(dst);
7959  	}
7960  
7961  	return MIGRATEPAGE_SUCCESS;
7962  }
7963  #else
7964  #define btrfs_migrate_folio NULL
7965  #endif
7966  
7967  static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7968  				 size_t length)
7969  {
7970  	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
7971  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
7972  	struct extent_io_tree *tree = &inode->io_tree;
7973  	struct extent_state *cached_state = NULL;
7974  	u64 page_start = folio_pos(folio);
7975  	u64 page_end = page_start + folio_size(folio) - 1;
7976  	u64 cur;
7977  	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7978  
7979  	/*
7980  	 * We have folio locked so no new ordered extent can be created on this
7981  	 * page, nor bio can be submitted for this folio.
7982  	 *
7983  	 * But already submitted bio can still be finished on this folio.
7984  	 * Furthermore, endio function won't skip folio which has Ordered
7985  	 * (Private2) already cleared, so it's possible for endio and
7986  	 * invalidate_folio to do the same ordered extent accounting twice
7987  	 * on one folio.
7988  	 *
7989  	 * So here we wait for any submitted bios to finish, so that we won't
7990  	 * do double ordered extent accounting on the same folio.
7991  	 */
7992  	folio_wait_writeback(folio);
7993  	wait_subpage_spinlock(&folio->page);
7994  
7995  	/*
7996  	 * For subpage case, we have call sites like
7997  	 * btrfs_punch_hole_lock_range() which passes range not aligned to
7998  	 * sectorsize.
7999  	 * If the range doesn't cover the full folio, we don't need to and
8000  	 * shouldn't clear page extent mapped, as folio->private can still
8001  	 * record subpage dirty bits for other part of the range.
8002  	 *
8003  	 * For cases that invalidate the full folio even the range doesn't
8004  	 * cover the full folio, like invalidating the last folio, we're
8005  	 * still safe to wait for ordered extent to finish.
8006  	 */
8007  	if (!(offset == 0 && length == folio_size(folio))) {
8008  		btrfs_release_folio(folio, GFP_NOFS);
8009  		return;
8010  	}
8011  
8012  	if (!inode_evicting)
8013  		lock_extent(tree, page_start, page_end, &cached_state);
8014  
8015  	cur = page_start;
8016  	while (cur < page_end) {
8017  		struct btrfs_ordered_extent *ordered;
8018  		u64 range_end;
8019  		u32 range_len;
8020  		u32 extra_flags = 0;
8021  
8022  		ordered = btrfs_lookup_first_ordered_range(inode, cur,
8023  							   page_end + 1 - cur);
8024  		if (!ordered) {
8025  			range_end = page_end;
8026  			/*
8027  			 * No ordered extent covering this range, we are safe
8028  			 * to delete all extent states in the range.
8029  			 */
8030  			extra_flags = EXTENT_CLEAR_ALL_BITS;
8031  			goto next;
8032  		}
8033  		if (ordered->file_offset > cur) {
8034  			/*
8035  			 * There is a range between [cur, oe->file_offset) not
8036  			 * covered by any ordered extent.
8037  			 * We are safe to delete all extent states, and handle
8038  			 * the ordered extent in the next iteration.
8039  			 */
8040  			range_end = ordered->file_offset - 1;
8041  			extra_flags = EXTENT_CLEAR_ALL_BITS;
8042  			goto next;
8043  		}
8044  
8045  		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8046  				page_end);
8047  		ASSERT(range_end + 1 - cur < U32_MAX);
8048  		range_len = range_end + 1 - cur;
8049  		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8050  			/*
8051  			 * If Ordered (Private2) is cleared, it means endio has
8052  			 * already been executed for the range.
8053  			 * We can't delete the extent states as
8054  			 * btrfs_finish_ordered_io() may still use some of them.
8055  			 */
8056  			goto next;
8057  		}
8058  		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8059  
8060  		/*
8061  		 * IO on this page will never be started, so we need to account
8062  		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8063  		 * here, must leave that up for the ordered extent completion.
8064  		 *
8065  		 * This will also unlock the range for incoming
8066  		 * btrfs_finish_ordered_io().
8067  		 */
8068  		if (!inode_evicting)
8069  			clear_extent_bit(tree, cur, range_end,
8070  					 EXTENT_DELALLOC |
8071  					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8072  					 EXTENT_DEFRAG, &cached_state);
8073  
8074  		spin_lock_irq(&inode->ordered_tree.lock);
8075  		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8076  		ordered->truncated_len = min(ordered->truncated_len,
8077  					     cur - ordered->file_offset);
8078  		spin_unlock_irq(&inode->ordered_tree.lock);
8079  
8080  		/*
8081  		 * If the ordered extent has finished, we're safe to delete all
8082  		 * the extent states of the range, otherwise
8083  		 * btrfs_finish_ordered_io() will get executed by endio for
8084  		 * other pages, so we can't delete extent states.
8085  		 */
8086  		if (btrfs_dec_test_ordered_pending(inode, &ordered,
8087  						   cur, range_end + 1 - cur)) {
8088  			btrfs_finish_ordered_io(ordered);
8089  			/*
8090  			 * The ordered extent has finished, now we're again
8091  			 * safe to delete all extent states of the range.
8092  			 */
8093  			extra_flags = EXTENT_CLEAR_ALL_BITS;
8094  		}
8095  next:
8096  		if (ordered)
8097  			btrfs_put_ordered_extent(ordered);
8098  		/*
8099  		 * Qgroup reserved space handler
8100  		 * Sector(s) here will be either:
8101  		 *
8102  		 * 1) Already written to disk or bio already finished
8103  		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8104  		 *    Qgroup will be handled by its qgroup_record then.
8105  		 *    btrfs_qgroup_free_data() call will do nothing here.
8106  		 *
8107  		 * 2) Not written to disk yet
8108  		 *    Then btrfs_qgroup_free_data() call will clear the
8109  		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8110  		 *    reserved data space.
8111  		 *    Since the IO will never happen for this page.
8112  		 */
8113  		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
8114  		if (!inode_evicting) {
8115  			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8116  				 EXTENT_DELALLOC | EXTENT_UPTODATE |
8117  				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
8118  				 extra_flags, &cached_state);
8119  		}
8120  		cur = range_end + 1;
8121  	}
8122  	/*
8123  	 * We have iterated through all ordered extents of the page, the page
8124  	 * should not have Ordered (Private2) anymore, or the above iteration
8125  	 * did something wrong.
8126  	 */
8127  	ASSERT(!folio_test_ordered(folio));
8128  	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8129  	if (!inode_evicting)
8130  		__btrfs_release_folio(folio, GFP_NOFS);
8131  	clear_page_extent_mapped(&folio->page);
8132  }
8133  
8134  /*
8135   * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8136   * called from a page fault handler when a page is first dirtied. Hence we must
8137   * be careful to check for EOF conditions here. We set the page up correctly
8138   * for a written page which means we get ENOSPC checking when writing into
8139   * holes and correct delalloc and unwritten extent mapping on filesystems that
8140   * support these features.
8141   *
8142   * We are not allowed to take the i_mutex here so we have to play games to
8143   * protect against truncate races as the page could now be beyond EOF.  Because
8144   * truncate_setsize() writes the inode size before removing pages, once we have
8145   * the page lock we can determine safely if the page is beyond EOF. If it is not
8146   * beyond EOF, then the page is guaranteed safe against truncation until we
8147   * unlock the page.
8148   */
8149  vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8150  {
8151  	struct page *page = vmf->page;
8152  	struct inode *inode = file_inode(vmf->vma->vm_file);
8153  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8154  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8155  	struct btrfs_ordered_extent *ordered;
8156  	struct extent_state *cached_state = NULL;
8157  	struct extent_changeset *data_reserved = NULL;
8158  	unsigned long zero_start;
8159  	loff_t size;
8160  	vm_fault_t ret;
8161  	int ret2;
8162  	int reserved = 0;
8163  	u64 reserved_space;
8164  	u64 page_start;
8165  	u64 page_end;
8166  	u64 end;
8167  
8168  	reserved_space = PAGE_SIZE;
8169  
8170  	sb_start_pagefault(inode->i_sb);
8171  	page_start = page_offset(page);
8172  	page_end = page_start + PAGE_SIZE - 1;
8173  	end = page_end;
8174  
8175  	/*
8176  	 * Reserving delalloc space after obtaining the page lock can lead to
8177  	 * deadlock. For example, if a dirty page is locked by this function
8178  	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8179  	 * dirty page write out, then the btrfs_writepages() function could
8180  	 * end up waiting indefinitely to get a lock on the page currently
8181  	 * being processed by btrfs_page_mkwrite() function.
8182  	 */
8183  	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8184  					    page_start, reserved_space);
8185  	if (!ret2) {
8186  		ret2 = file_update_time(vmf->vma->vm_file);
8187  		reserved = 1;
8188  	}
8189  	if (ret2) {
8190  		ret = vmf_error(ret2);
8191  		if (reserved)
8192  			goto out;
8193  		goto out_noreserve;
8194  	}
8195  
8196  	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8197  again:
8198  	down_read(&BTRFS_I(inode)->i_mmap_lock);
8199  	lock_page(page);
8200  	size = i_size_read(inode);
8201  
8202  	if ((page->mapping != inode->i_mapping) ||
8203  	    (page_start >= size)) {
8204  		/* page got truncated out from underneath us */
8205  		goto out_unlock;
8206  	}
8207  	wait_on_page_writeback(page);
8208  
8209  	lock_extent(io_tree, page_start, page_end, &cached_state);
8210  	ret2 = set_page_extent_mapped(page);
8211  	if (ret2 < 0) {
8212  		ret = vmf_error(ret2);
8213  		unlock_extent(io_tree, page_start, page_end, &cached_state);
8214  		goto out_unlock;
8215  	}
8216  
8217  	/*
8218  	 * we can't set the delalloc bits if there are pending ordered
8219  	 * extents.  Drop our locks and wait for them to finish
8220  	 */
8221  	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8222  			PAGE_SIZE);
8223  	if (ordered) {
8224  		unlock_extent(io_tree, page_start, page_end, &cached_state);
8225  		unlock_page(page);
8226  		up_read(&BTRFS_I(inode)->i_mmap_lock);
8227  		btrfs_start_ordered_extent(ordered);
8228  		btrfs_put_ordered_extent(ordered);
8229  		goto again;
8230  	}
8231  
8232  	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8233  		reserved_space = round_up(size - page_start,
8234  					  fs_info->sectorsize);
8235  		if (reserved_space < PAGE_SIZE) {
8236  			end = page_start + reserved_space - 1;
8237  			btrfs_delalloc_release_space(BTRFS_I(inode),
8238  					data_reserved, page_start,
8239  					PAGE_SIZE - reserved_space, true);
8240  		}
8241  	}
8242  
8243  	/*
8244  	 * page_mkwrite gets called when the page is firstly dirtied after it's
8245  	 * faulted in, but write(2) could also dirty a page and set delalloc
8246  	 * bits, thus in this case for space account reason, we still need to
8247  	 * clear any delalloc bits within this page range since we have to
8248  	 * reserve data&meta space before lock_page() (see above comments).
8249  	 */
8250  	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8251  			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8252  			  EXTENT_DEFRAG, &cached_state);
8253  
8254  	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8255  					&cached_state);
8256  	if (ret2) {
8257  		unlock_extent(io_tree, page_start, page_end, &cached_state);
8258  		ret = VM_FAULT_SIGBUS;
8259  		goto out_unlock;
8260  	}
8261  
8262  	/* page is wholly or partially inside EOF */
8263  	if (page_start + PAGE_SIZE > size)
8264  		zero_start = offset_in_page(size);
8265  	else
8266  		zero_start = PAGE_SIZE;
8267  
8268  	if (zero_start != PAGE_SIZE)
8269  		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8270  
8271  	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8272  	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8273  	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8274  
8275  	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8276  
8277  	unlock_extent(io_tree, page_start, page_end, &cached_state);
8278  	up_read(&BTRFS_I(inode)->i_mmap_lock);
8279  
8280  	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8281  	sb_end_pagefault(inode->i_sb);
8282  	extent_changeset_free(data_reserved);
8283  	return VM_FAULT_LOCKED;
8284  
8285  out_unlock:
8286  	unlock_page(page);
8287  	up_read(&BTRFS_I(inode)->i_mmap_lock);
8288  out:
8289  	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8290  	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8291  				     reserved_space, (ret != 0));
8292  out_noreserve:
8293  	sb_end_pagefault(inode->i_sb);
8294  	extent_changeset_free(data_reserved);
8295  	return ret;
8296  }
8297  
8298  static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8299  {
8300  	struct btrfs_truncate_control control = {
8301  		.inode = inode,
8302  		.ino = btrfs_ino(inode),
8303  		.min_type = BTRFS_EXTENT_DATA_KEY,
8304  		.clear_extent_range = true,
8305  	};
8306  	struct btrfs_root *root = inode->root;
8307  	struct btrfs_fs_info *fs_info = root->fs_info;
8308  	struct btrfs_block_rsv *rsv;
8309  	int ret;
8310  	struct btrfs_trans_handle *trans;
8311  	u64 mask = fs_info->sectorsize - 1;
8312  	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8313  
8314  	if (!skip_writeback) {
8315  		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
8316  					       inode->vfs_inode.i_size & (~mask),
8317  					       (u64)-1);
8318  		if (ret)
8319  			return ret;
8320  	}
8321  
8322  	/*
8323  	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8324  	 * things going on here:
8325  	 *
8326  	 * 1) We need to reserve space to update our inode.
8327  	 *
8328  	 * 2) We need to have something to cache all the space that is going to
8329  	 * be free'd up by the truncate operation, but also have some slack
8330  	 * space reserved in case it uses space during the truncate (thank you
8331  	 * very much snapshotting).
8332  	 *
8333  	 * And we need these to be separate.  The fact is we can use a lot of
8334  	 * space doing the truncate, and we have no earthly idea how much space
8335  	 * we will use, so we need the truncate reservation to be separate so it
8336  	 * doesn't end up using space reserved for updating the inode.  We also
8337  	 * need to be able to stop the transaction and start a new one, which
8338  	 * means we need to be able to update the inode several times, and we
8339  	 * have no idea of knowing how many times that will be, so we can't just
8340  	 * reserve 1 item for the entirety of the operation, so that has to be
8341  	 * done separately as well.
8342  	 *
8343  	 * So that leaves us with
8344  	 *
8345  	 * 1) rsv - for the truncate reservation, which we will steal from the
8346  	 * transaction reservation.
8347  	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8348  	 * updating the inode.
8349  	 */
8350  	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8351  	if (!rsv)
8352  		return -ENOMEM;
8353  	rsv->size = min_size;
8354  	rsv->failfast = true;
8355  
8356  	/*
8357  	 * 1 for the truncate slack space
8358  	 * 1 for updating the inode.
8359  	 */
8360  	trans = btrfs_start_transaction(root, 2);
8361  	if (IS_ERR(trans)) {
8362  		ret = PTR_ERR(trans);
8363  		goto out;
8364  	}
8365  
8366  	/* Migrate the slack space for the truncate to our reserve */
8367  	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8368  				      min_size, false);
8369  	/*
8370  	 * We have reserved 2 metadata units when we started the transaction and
8371  	 * min_size matches 1 unit, so this should never fail, but if it does,
8372  	 * it's not critical we just fail truncation.
8373  	 */
8374  	if (WARN_ON(ret)) {
8375  		btrfs_end_transaction(trans);
8376  		goto out;
8377  	}
8378  
8379  	trans->block_rsv = rsv;
8380  
8381  	while (1) {
8382  		struct extent_state *cached_state = NULL;
8383  		const u64 new_size = inode->vfs_inode.i_size;
8384  		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8385  
8386  		control.new_size = new_size;
8387  		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8388  		/*
8389  		 * We want to drop from the next block forward in case this new
8390  		 * size is not block aligned since we will be keeping the last
8391  		 * block of the extent just the way it is.
8392  		 */
8393  		btrfs_drop_extent_map_range(inode,
8394  					    ALIGN(new_size, fs_info->sectorsize),
8395  					    (u64)-1, false);
8396  
8397  		ret = btrfs_truncate_inode_items(trans, root, &control);
8398  
8399  		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
8400  		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
8401  
8402  		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8403  
8404  		trans->block_rsv = &fs_info->trans_block_rsv;
8405  		if (ret != -ENOSPC && ret != -EAGAIN)
8406  			break;
8407  
8408  		ret = btrfs_update_inode(trans, root, inode);
8409  		if (ret)
8410  			break;
8411  
8412  		btrfs_end_transaction(trans);
8413  		btrfs_btree_balance_dirty(fs_info);
8414  
8415  		trans = btrfs_start_transaction(root, 2);
8416  		if (IS_ERR(trans)) {
8417  			ret = PTR_ERR(trans);
8418  			trans = NULL;
8419  			break;
8420  		}
8421  
8422  		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8423  		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8424  					      rsv, min_size, false);
8425  		/*
8426  		 * We have reserved 2 metadata units when we started the
8427  		 * transaction and min_size matches 1 unit, so this should never
8428  		 * fail, but if it does, it's not critical we just fail truncation.
8429  		 */
8430  		if (WARN_ON(ret))
8431  			break;
8432  
8433  		trans->block_rsv = rsv;
8434  	}
8435  
8436  	/*
8437  	 * We can't call btrfs_truncate_block inside a trans handle as we could
8438  	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
8439  	 * know we've truncated everything except the last little bit, and can
8440  	 * do btrfs_truncate_block and then update the disk_i_size.
8441  	 */
8442  	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8443  		btrfs_end_transaction(trans);
8444  		btrfs_btree_balance_dirty(fs_info);
8445  
8446  		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
8447  		if (ret)
8448  			goto out;
8449  		trans = btrfs_start_transaction(root, 1);
8450  		if (IS_ERR(trans)) {
8451  			ret = PTR_ERR(trans);
8452  			goto out;
8453  		}
8454  		btrfs_inode_safe_disk_i_size_write(inode, 0);
8455  	}
8456  
8457  	if (trans) {
8458  		int ret2;
8459  
8460  		trans->block_rsv = &fs_info->trans_block_rsv;
8461  		ret2 = btrfs_update_inode(trans, root, inode);
8462  		if (ret2 && !ret)
8463  			ret = ret2;
8464  
8465  		ret2 = btrfs_end_transaction(trans);
8466  		if (ret2 && !ret)
8467  			ret = ret2;
8468  		btrfs_btree_balance_dirty(fs_info);
8469  	}
8470  out:
8471  	btrfs_free_block_rsv(fs_info, rsv);
8472  	/*
8473  	 * So if we truncate and then write and fsync we normally would just
8474  	 * write the extents that changed, which is a problem if we need to
8475  	 * first truncate that entire inode.  So set this flag so we write out
8476  	 * all of the extents in the inode to the sync log so we're completely
8477  	 * safe.
8478  	 *
8479  	 * If no extents were dropped or trimmed we don't need to force the next
8480  	 * fsync to truncate all the inode's items from the log and re-log them
8481  	 * all. This means the truncate operation did not change the file size,
8482  	 * or changed it to a smaller size but there was only an implicit hole
8483  	 * between the old i_size and the new i_size, and there were no prealloc
8484  	 * extents beyond i_size to drop.
8485  	 */
8486  	if (control.extents_found > 0)
8487  		btrfs_set_inode_full_sync(inode);
8488  
8489  	return ret;
8490  }
8491  
8492  struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
8493  				     struct inode *dir)
8494  {
8495  	struct inode *inode;
8496  
8497  	inode = new_inode(dir->i_sb);
8498  	if (inode) {
8499  		/*
8500  		 * Subvolumes don't inherit the sgid bit or the parent's gid if
8501  		 * the parent's sgid bit is set. This is probably a bug.
8502  		 */
8503  		inode_init_owner(idmap, inode, NULL,
8504  				 S_IFDIR | (~current_umask() & S_IRWXUGO));
8505  		inode->i_op = &btrfs_dir_inode_operations;
8506  		inode->i_fop = &btrfs_dir_file_operations;
8507  	}
8508  	return inode;
8509  }
8510  
8511  struct inode *btrfs_alloc_inode(struct super_block *sb)
8512  {
8513  	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8514  	struct btrfs_inode *ei;
8515  	struct inode *inode;
8516  
8517  	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8518  	if (!ei)
8519  		return NULL;
8520  
8521  	ei->root = NULL;
8522  	ei->generation = 0;
8523  	ei->last_trans = 0;
8524  	ei->last_sub_trans = 0;
8525  	ei->logged_trans = 0;
8526  	ei->delalloc_bytes = 0;
8527  	ei->new_delalloc_bytes = 0;
8528  	ei->defrag_bytes = 0;
8529  	ei->disk_i_size = 0;
8530  	ei->flags = 0;
8531  	ei->ro_flags = 0;
8532  	ei->csum_bytes = 0;
8533  	ei->index_cnt = (u64)-1;
8534  	ei->dir_index = 0;
8535  	ei->last_unlink_trans = 0;
8536  	ei->last_reflink_trans = 0;
8537  	ei->last_log_commit = 0;
8538  
8539  	spin_lock_init(&ei->lock);
8540  	ei->outstanding_extents = 0;
8541  	if (sb->s_magic != BTRFS_TEST_MAGIC)
8542  		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8543  					      BTRFS_BLOCK_RSV_DELALLOC);
8544  	ei->runtime_flags = 0;
8545  	ei->prop_compress = BTRFS_COMPRESS_NONE;
8546  	ei->defrag_compress = BTRFS_COMPRESS_NONE;
8547  
8548  	ei->delayed_node = NULL;
8549  
8550  	ei->i_otime.tv_sec = 0;
8551  	ei->i_otime.tv_nsec = 0;
8552  
8553  	inode = &ei->vfs_inode;
8554  	extent_map_tree_init(&ei->extent_tree);
8555  	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
8556  	ei->io_tree.inode = ei;
8557  	extent_io_tree_init(fs_info, &ei->file_extent_tree,
8558  			    IO_TREE_INODE_FILE_EXTENT);
8559  	mutex_init(&ei->log_mutex);
8560  	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8561  	INIT_LIST_HEAD(&ei->delalloc_inodes);
8562  	INIT_LIST_HEAD(&ei->delayed_iput);
8563  	RB_CLEAR_NODE(&ei->rb_node);
8564  	init_rwsem(&ei->i_mmap_lock);
8565  
8566  	return inode;
8567  }
8568  
8569  #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
8570  void btrfs_test_destroy_inode(struct inode *inode)
8571  {
8572  	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
8573  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8574  }
8575  #endif
8576  
8577  void btrfs_free_inode(struct inode *inode)
8578  {
8579  	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8580  }
8581  
8582  void btrfs_destroy_inode(struct inode *vfs_inode)
8583  {
8584  	struct btrfs_ordered_extent *ordered;
8585  	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8586  	struct btrfs_root *root = inode->root;
8587  	bool freespace_inode;
8588  
8589  	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8590  	WARN_ON(vfs_inode->i_data.nrpages);
8591  	WARN_ON(inode->block_rsv.reserved);
8592  	WARN_ON(inode->block_rsv.size);
8593  	WARN_ON(inode->outstanding_extents);
8594  	if (!S_ISDIR(vfs_inode->i_mode)) {
8595  		WARN_ON(inode->delalloc_bytes);
8596  		WARN_ON(inode->new_delalloc_bytes);
8597  	}
8598  	WARN_ON(inode->csum_bytes);
8599  	WARN_ON(inode->defrag_bytes);
8600  
8601  	/*
8602  	 * This can happen where we create an inode, but somebody else also
8603  	 * created the same inode and we need to destroy the one we already
8604  	 * created.
8605  	 */
8606  	if (!root)
8607  		return;
8608  
8609  	/*
8610  	 * If this is a free space inode do not take the ordered extents lockdep
8611  	 * map.
8612  	 */
8613  	freespace_inode = btrfs_is_free_space_inode(inode);
8614  
8615  	while (1) {
8616  		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8617  		if (!ordered)
8618  			break;
8619  		else {
8620  			btrfs_err(root->fs_info,
8621  				  "found ordered extent %llu %llu on inode cleanup",
8622  				  ordered->file_offset, ordered->num_bytes);
8623  
8624  			if (!freespace_inode)
8625  				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
8626  
8627  			btrfs_remove_ordered_extent(inode, ordered);
8628  			btrfs_put_ordered_extent(ordered);
8629  			btrfs_put_ordered_extent(ordered);
8630  		}
8631  	}
8632  	btrfs_qgroup_check_reserved_leak(inode);
8633  	inode_tree_del(inode);
8634  	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
8635  	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8636  	btrfs_put_root(inode->root);
8637  }
8638  
8639  int btrfs_drop_inode(struct inode *inode)
8640  {
8641  	struct btrfs_root *root = BTRFS_I(inode)->root;
8642  
8643  	if (root == NULL)
8644  		return 1;
8645  
8646  	/* the snap/subvol tree is on deleting */
8647  	if (btrfs_root_refs(&root->root_item) == 0)
8648  		return 1;
8649  	else
8650  		return generic_drop_inode(inode);
8651  }
8652  
8653  static void init_once(void *foo)
8654  {
8655  	struct btrfs_inode *ei = foo;
8656  
8657  	inode_init_once(&ei->vfs_inode);
8658  }
8659  
8660  void __cold btrfs_destroy_cachep(void)
8661  {
8662  	/*
8663  	 * Make sure all delayed rcu free inodes are flushed before we
8664  	 * destroy cache.
8665  	 */
8666  	rcu_barrier();
8667  	bioset_exit(&btrfs_dio_bioset);
8668  	kmem_cache_destroy(btrfs_inode_cachep);
8669  }
8670  
8671  int __init btrfs_init_cachep(void)
8672  {
8673  	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8674  			sizeof(struct btrfs_inode), 0,
8675  			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8676  			init_once);
8677  	if (!btrfs_inode_cachep)
8678  		goto fail;
8679  
8680  	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
8681  			offsetof(struct btrfs_dio_private, bbio.bio),
8682  			BIOSET_NEED_BVECS))
8683  		goto fail;
8684  
8685  	return 0;
8686  fail:
8687  	btrfs_destroy_cachep();
8688  	return -ENOMEM;
8689  }
8690  
8691  static int btrfs_getattr(struct mnt_idmap *idmap,
8692  			 const struct path *path, struct kstat *stat,
8693  			 u32 request_mask, unsigned int flags)
8694  {
8695  	u64 delalloc_bytes;
8696  	u64 inode_bytes;
8697  	struct inode *inode = d_inode(path->dentry);
8698  	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
8699  	u32 bi_flags = BTRFS_I(inode)->flags;
8700  	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8701  
8702  	stat->result_mask |= STATX_BTIME;
8703  	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
8704  	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
8705  	if (bi_flags & BTRFS_INODE_APPEND)
8706  		stat->attributes |= STATX_ATTR_APPEND;
8707  	if (bi_flags & BTRFS_INODE_COMPRESS)
8708  		stat->attributes |= STATX_ATTR_COMPRESSED;
8709  	if (bi_flags & BTRFS_INODE_IMMUTABLE)
8710  		stat->attributes |= STATX_ATTR_IMMUTABLE;
8711  	if (bi_flags & BTRFS_INODE_NODUMP)
8712  		stat->attributes |= STATX_ATTR_NODUMP;
8713  	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8714  		stat->attributes |= STATX_ATTR_VERITY;
8715  
8716  	stat->attributes_mask |= (STATX_ATTR_APPEND |
8717  				  STATX_ATTR_COMPRESSED |
8718  				  STATX_ATTR_IMMUTABLE |
8719  				  STATX_ATTR_NODUMP);
8720  
8721  	generic_fillattr(idmap, request_mask, inode, stat);
8722  	stat->dev = BTRFS_I(inode)->root->anon_dev;
8723  
8724  	spin_lock(&BTRFS_I(inode)->lock);
8725  	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8726  	inode_bytes = inode_get_bytes(inode);
8727  	spin_unlock(&BTRFS_I(inode)->lock);
8728  	stat->blocks = (ALIGN(inode_bytes, blocksize) +
8729  			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8730  	return 0;
8731  }
8732  
8733  static int btrfs_rename_exchange(struct inode *old_dir,
8734  			      struct dentry *old_dentry,
8735  			      struct inode *new_dir,
8736  			      struct dentry *new_dentry)
8737  {
8738  	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8739  	struct btrfs_trans_handle *trans;
8740  	unsigned int trans_num_items;
8741  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8742  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8743  	struct inode *new_inode = new_dentry->d_inode;
8744  	struct inode *old_inode = old_dentry->d_inode;
8745  	struct btrfs_rename_ctx old_rename_ctx;
8746  	struct btrfs_rename_ctx new_rename_ctx;
8747  	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8748  	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8749  	u64 old_idx = 0;
8750  	u64 new_idx = 0;
8751  	int ret;
8752  	int ret2;
8753  	bool need_abort = false;
8754  	struct fscrypt_name old_fname, new_fname;
8755  	struct fscrypt_str *old_name, *new_name;
8756  
8757  	/*
8758  	 * For non-subvolumes allow exchange only within one subvolume, in the
8759  	 * same inode namespace. Two subvolumes (represented as directory) can
8760  	 * be exchanged as they're a logical link and have a fixed inode number.
8761  	 */
8762  	if (root != dest &&
8763  	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8764  	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
8765  		return -EXDEV;
8766  
8767  	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8768  	if (ret)
8769  		return ret;
8770  
8771  	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8772  	if (ret) {
8773  		fscrypt_free_filename(&old_fname);
8774  		return ret;
8775  	}
8776  
8777  	old_name = &old_fname.disk_name;
8778  	new_name = &new_fname.disk_name;
8779  
8780  	/* close the race window with snapshot create/destroy ioctl */
8781  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8782  	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
8783  		down_read(&fs_info->subvol_sem);
8784  
8785  	/*
8786  	 * For each inode:
8787  	 * 1 to remove old dir item
8788  	 * 1 to remove old dir index
8789  	 * 1 to add new dir item
8790  	 * 1 to add new dir index
8791  	 * 1 to update parent inode
8792  	 *
8793  	 * If the parents are the same, we only need to account for one
8794  	 */
8795  	trans_num_items = (old_dir == new_dir ? 9 : 10);
8796  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8797  		/*
8798  		 * 1 to remove old root ref
8799  		 * 1 to remove old root backref
8800  		 * 1 to add new root ref
8801  		 * 1 to add new root backref
8802  		 */
8803  		trans_num_items += 4;
8804  	} else {
8805  		/*
8806  		 * 1 to update inode item
8807  		 * 1 to remove old inode ref
8808  		 * 1 to add new inode ref
8809  		 */
8810  		trans_num_items += 3;
8811  	}
8812  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8813  		trans_num_items += 4;
8814  	else
8815  		trans_num_items += 3;
8816  	trans = btrfs_start_transaction(root, trans_num_items);
8817  	if (IS_ERR(trans)) {
8818  		ret = PTR_ERR(trans);
8819  		goto out_notrans;
8820  	}
8821  
8822  	if (dest != root) {
8823  		ret = btrfs_record_root_in_trans(trans, dest);
8824  		if (ret)
8825  			goto out_fail;
8826  	}
8827  
8828  	/*
8829  	 * We need to find a free sequence number both in the source and
8830  	 * in the destination directory for the exchange.
8831  	 */
8832  	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8833  	if (ret)
8834  		goto out_fail;
8835  	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8836  	if (ret)
8837  		goto out_fail;
8838  
8839  	BTRFS_I(old_inode)->dir_index = 0ULL;
8840  	BTRFS_I(new_inode)->dir_index = 0ULL;
8841  
8842  	/* Reference for the source. */
8843  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8844  		/* force full log commit if subvolume involved. */
8845  		btrfs_set_log_full_commit(trans);
8846  	} else {
8847  		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8848  					     btrfs_ino(BTRFS_I(new_dir)),
8849  					     old_idx);
8850  		if (ret)
8851  			goto out_fail;
8852  		need_abort = true;
8853  	}
8854  
8855  	/* And now for the dest. */
8856  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8857  		/* force full log commit if subvolume involved. */
8858  		btrfs_set_log_full_commit(trans);
8859  	} else {
8860  		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8861  					     btrfs_ino(BTRFS_I(old_dir)),
8862  					     new_idx);
8863  		if (ret) {
8864  			if (need_abort)
8865  				btrfs_abort_transaction(trans, ret);
8866  			goto out_fail;
8867  		}
8868  	}
8869  
8870  	/* Update inode version and ctime/mtime. */
8871  	inode_inc_iversion(old_dir);
8872  	inode_inc_iversion(new_dir);
8873  	inode_inc_iversion(old_inode);
8874  	inode_inc_iversion(new_inode);
8875  	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8876  
8877  	if (old_dentry->d_parent != new_dentry->d_parent) {
8878  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8879  					BTRFS_I(old_inode), true);
8880  		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8881  					BTRFS_I(new_inode), true);
8882  	}
8883  
8884  	/* src is a subvolume */
8885  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8886  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8887  	} else { /* src is an inode */
8888  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8889  					   BTRFS_I(old_dentry->d_inode),
8890  					   old_name, &old_rename_ctx);
8891  		if (!ret)
8892  			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
8893  	}
8894  	if (ret) {
8895  		btrfs_abort_transaction(trans, ret);
8896  		goto out_fail;
8897  	}
8898  
8899  	/* dest is a subvolume */
8900  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8901  		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8902  	} else { /* dest is an inode */
8903  		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8904  					   BTRFS_I(new_dentry->d_inode),
8905  					   new_name, &new_rename_ctx);
8906  		if (!ret)
8907  			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
8908  	}
8909  	if (ret) {
8910  		btrfs_abort_transaction(trans, ret);
8911  		goto out_fail;
8912  	}
8913  
8914  	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8915  			     new_name, 0, old_idx);
8916  	if (ret) {
8917  		btrfs_abort_transaction(trans, ret);
8918  		goto out_fail;
8919  	}
8920  
8921  	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8922  			     old_name, 0, new_idx);
8923  	if (ret) {
8924  		btrfs_abort_transaction(trans, ret);
8925  		goto out_fail;
8926  	}
8927  
8928  	if (old_inode->i_nlink == 1)
8929  		BTRFS_I(old_inode)->dir_index = old_idx;
8930  	if (new_inode->i_nlink == 1)
8931  		BTRFS_I(new_inode)->dir_index = new_idx;
8932  
8933  	/*
8934  	 * Now pin the logs of the roots. We do it to ensure that no other task
8935  	 * can sync the logs while we are in progress with the rename, because
8936  	 * that could result in an inconsistency in case any of the inodes that
8937  	 * are part of this rename operation were logged before.
8938  	 */
8939  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8940  		btrfs_pin_log_trans(root);
8941  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8942  		btrfs_pin_log_trans(dest);
8943  
8944  	/* Do the log updates for all inodes. */
8945  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8946  		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8947  				   old_rename_ctx.index, new_dentry->d_parent);
8948  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8949  		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8950  				   new_rename_ctx.index, old_dentry->d_parent);
8951  
8952  	/* Now unpin the logs. */
8953  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8954  		btrfs_end_log_trans(root);
8955  	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8956  		btrfs_end_log_trans(dest);
8957  out_fail:
8958  	ret2 = btrfs_end_transaction(trans);
8959  	ret = ret ? ret : ret2;
8960  out_notrans:
8961  	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8962  	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
8963  		up_read(&fs_info->subvol_sem);
8964  
8965  	fscrypt_free_filename(&new_fname);
8966  	fscrypt_free_filename(&old_fname);
8967  	return ret;
8968  }
8969  
8970  static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8971  					struct inode *dir)
8972  {
8973  	struct inode *inode;
8974  
8975  	inode = new_inode(dir->i_sb);
8976  	if (inode) {
8977  		inode_init_owner(idmap, inode, dir,
8978  				 S_IFCHR | WHITEOUT_MODE);
8979  		inode->i_op = &btrfs_special_inode_operations;
8980  		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8981  	}
8982  	return inode;
8983  }
8984  
8985  static int btrfs_rename(struct mnt_idmap *idmap,
8986  			struct inode *old_dir, struct dentry *old_dentry,
8987  			struct inode *new_dir, struct dentry *new_dentry,
8988  			unsigned int flags)
8989  {
8990  	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8991  	struct btrfs_new_inode_args whiteout_args = {
8992  		.dir = old_dir,
8993  		.dentry = old_dentry,
8994  	};
8995  	struct btrfs_trans_handle *trans;
8996  	unsigned int trans_num_items;
8997  	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8998  	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8999  	struct inode *new_inode = d_inode(new_dentry);
9000  	struct inode *old_inode = d_inode(old_dentry);
9001  	struct btrfs_rename_ctx rename_ctx;
9002  	u64 index = 0;
9003  	int ret;
9004  	int ret2;
9005  	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9006  	struct fscrypt_name old_fname, new_fname;
9007  
9008  	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9009  		return -EPERM;
9010  
9011  	/* we only allow rename subvolume link between subvolumes */
9012  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9013  		return -EXDEV;
9014  
9015  	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9016  	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9017  		return -ENOTEMPTY;
9018  
9019  	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9020  	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9021  		return -ENOTEMPTY;
9022  
9023  	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
9024  	if (ret)
9025  		return ret;
9026  
9027  	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
9028  	if (ret) {
9029  		fscrypt_free_filename(&old_fname);
9030  		return ret;
9031  	}
9032  
9033  	/* check for collisions, even if the  name isn't there */
9034  	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
9035  	if (ret) {
9036  		if (ret == -EEXIST) {
9037  			/* we shouldn't get
9038  			 * eexist without a new_inode */
9039  			if (WARN_ON(!new_inode)) {
9040  				goto out_fscrypt_names;
9041  			}
9042  		} else {
9043  			/* maybe -EOVERFLOW */
9044  			goto out_fscrypt_names;
9045  		}
9046  	}
9047  	ret = 0;
9048  
9049  	/*
9050  	 * we're using rename to replace one file with another.  Start IO on it
9051  	 * now so  we don't add too much work to the end of the transaction
9052  	 */
9053  	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9054  		filemap_flush(old_inode->i_mapping);
9055  
9056  	if (flags & RENAME_WHITEOUT) {
9057  		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
9058  		if (!whiteout_args.inode) {
9059  			ret = -ENOMEM;
9060  			goto out_fscrypt_names;
9061  		}
9062  		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
9063  		if (ret)
9064  			goto out_whiteout_inode;
9065  	} else {
9066  		/* 1 to update the old parent inode. */
9067  		trans_num_items = 1;
9068  	}
9069  
9070  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9071  		/* Close the race window with snapshot create/destroy ioctl */
9072  		down_read(&fs_info->subvol_sem);
9073  		/*
9074  		 * 1 to remove old root ref
9075  		 * 1 to remove old root backref
9076  		 * 1 to add new root ref
9077  		 * 1 to add new root backref
9078  		 */
9079  		trans_num_items += 4;
9080  	} else {
9081  		/*
9082  		 * 1 to update inode
9083  		 * 1 to remove old inode ref
9084  		 * 1 to add new inode ref
9085  		 */
9086  		trans_num_items += 3;
9087  	}
9088  	/*
9089  	 * 1 to remove old dir item
9090  	 * 1 to remove old dir index
9091  	 * 1 to add new dir item
9092  	 * 1 to add new dir index
9093  	 */
9094  	trans_num_items += 4;
9095  	/* 1 to update new parent inode if it's not the same as the old parent */
9096  	if (new_dir != old_dir)
9097  		trans_num_items++;
9098  	if (new_inode) {
9099  		/*
9100  		 * 1 to update inode
9101  		 * 1 to remove inode ref
9102  		 * 1 to remove dir item
9103  		 * 1 to remove dir index
9104  		 * 1 to possibly add orphan item
9105  		 */
9106  		trans_num_items += 5;
9107  	}
9108  	trans = btrfs_start_transaction(root, trans_num_items);
9109  	if (IS_ERR(trans)) {
9110  		ret = PTR_ERR(trans);
9111  		goto out_notrans;
9112  	}
9113  
9114  	if (dest != root) {
9115  		ret = btrfs_record_root_in_trans(trans, dest);
9116  		if (ret)
9117  			goto out_fail;
9118  	}
9119  
9120  	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9121  	if (ret)
9122  		goto out_fail;
9123  
9124  	BTRFS_I(old_inode)->dir_index = 0ULL;
9125  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9126  		/* force full log commit if subvolume involved. */
9127  		btrfs_set_log_full_commit(trans);
9128  	} else {
9129  		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
9130  					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
9131  					     index);
9132  		if (ret)
9133  			goto out_fail;
9134  	}
9135  
9136  	inode_inc_iversion(old_dir);
9137  	inode_inc_iversion(new_dir);
9138  	inode_inc_iversion(old_inode);
9139  	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
9140  
9141  	if (old_dentry->d_parent != new_dentry->d_parent)
9142  		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9143  					BTRFS_I(old_inode), true);
9144  
9145  	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9146  		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
9147  	} else {
9148  		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9149  					   BTRFS_I(d_inode(old_dentry)),
9150  					   &old_fname.disk_name, &rename_ctx);
9151  		if (!ret)
9152  			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9153  	}
9154  	if (ret) {
9155  		btrfs_abort_transaction(trans, ret);
9156  		goto out_fail;
9157  	}
9158  
9159  	if (new_inode) {
9160  		inode_inc_iversion(new_inode);
9161  		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9162  			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9163  			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
9164  			BUG_ON(new_inode->i_nlink == 0);
9165  		} else {
9166  			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9167  						 BTRFS_I(d_inode(new_dentry)),
9168  						 &new_fname.disk_name);
9169  		}
9170  		if (!ret && new_inode->i_nlink == 0)
9171  			ret = btrfs_orphan_add(trans,
9172  					BTRFS_I(d_inode(new_dentry)));
9173  		if (ret) {
9174  			btrfs_abort_transaction(trans, ret);
9175  			goto out_fail;
9176  		}
9177  	}
9178  
9179  	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9180  			     &new_fname.disk_name, 0, index);
9181  	if (ret) {
9182  		btrfs_abort_transaction(trans, ret);
9183  		goto out_fail;
9184  	}
9185  
9186  	if (old_inode->i_nlink == 1)
9187  		BTRFS_I(old_inode)->dir_index = index;
9188  
9189  	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9190  		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9191  				   rename_ctx.index, new_dentry->d_parent);
9192  
9193  	if (flags & RENAME_WHITEOUT) {
9194  		ret = btrfs_create_new_inode(trans, &whiteout_args);
9195  		if (ret) {
9196  			btrfs_abort_transaction(trans, ret);
9197  			goto out_fail;
9198  		} else {
9199  			unlock_new_inode(whiteout_args.inode);
9200  			iput(whiteout_args.inode);
9201  			whiteout_args.inode = NULL;
9202  		}
9203  	}
9204  out_fail:
9205  	ret2 = btrfs_end_transaction(trans);
9206  	ret = ret ? ret : ret2;
9207  out_notrans:
9208  	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9209  		up_read(&fs_info->subvol_sem);
9210  	if (flags & RENAME_WHITEOUT)
9211  		btrfs_new_inode_args_destroy(&whiteout_args);
9212  out_whiteout_inode:
9213  	if (flags & RENAME_WHITEOUT)
9214  		iput(whiteout_args.inode);
9215  out_fscrypt_names:
9216  	fscrypt_free_filename(&old_fname);
9217  	fscrypt_free_filename(&new_fname);
9218  	return ret;
9219  }
9220  
9221  static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
9222  			 struct dentry *old_dentry, struct inode *new_dir,
9223  			 struct dentry *new_dentry, unsigned int flags)
9224  {
9225  	int ret;
9226  
9227  	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9228  		return -EINVAL;
9229  
9230  	if (flags & RENAME_EXCHANGE)
9231  		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9232  					    new_dentry);
9233  	else
9234  		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
9235  				   new_dentry, flags);
9236  
9237  	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
9238  
9239  	return ret;
9240  }
9241  
9242  struct btrfs_delalloc_work {
9243  	struct inode *inode;
9244  	struct completion completion;
9245  	struct list_head list;
9246  	struct btrfs_work work;
9247  };
9248  
9249  static void btrfs_run_delalloc_work(struct btrfs_work *work)
9250  {
9251  	struct btrfs_delalloc_work *delalloc_work;
9252  	struct inode *inode;
9253  
9254  	delalloc_work = container_of(work, struct btrfs_delalloc_work,
9255  				     work);
9256  	inode = delalloc_work->inode;
9257  	filemap_flush(inode->i_mapping);
9258  	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9259  				&BTRFS_I(inode)->runtime_flags))
9260  		filemap_flush(inode->i_mapping);
9261  
9262  	iput(inode);
9263  	complete(&delalloc_work->completion);
9264  }
9265  
9266  static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9267  {
9268  	struct btrfs_delalloc_work *work;
9269  
9270  	work = kmalloc(sizeof(*work), GFP_NOFS);
9271  	if (!work)
9272  		return NULL;
9273  
9274  	init_completion(&work->completion);
9275  	INIT_LIST_HEAD(&work->list);
9276  	work->inode = inode;
9277  	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9278  
9279  	return work;
9280  }
9281  
9282  /*
9283   * some fairly slow code that needs optimization. This walks the list
9284   * of all the inodes with pending delalloc and forces them to disk.
9285   */
9286  static int start_delalloc_inodes(struct btrfs_root *root,
9287  				 struct writeback_control *wbc, bool snapshot,
9288  				 bool in_reclaim_context)
9289  {
9290  	struct btrfs_inode *binode;
9291  	struct inode *inode;
9292  	struct btrfs_delalloc_work *work, *next;
9293  	LIST_HEAD(works);
9294  	LIST_HEAD(splice);
9295  	int ret = 0;
9296  	bool full_flush = wbc->nr_to_write == LONG_MAX;
9297  
9298  	mutex_lock(&root->delalloc_mutex);
9299  	spin_lock(&root->delalloc_lock);
9300  	list_splice_init(&root->delalloc_inodes, &splice);
9301  	while (!list_empty(&splice)) {
9302  		binode = list_entry(splice.next, struct btrfs_inode,
9303  				    delalloc_inodes);
9304  
9305  		list_move_tail(&binode->delalloc_inodes,
9306  			       &root->delalloc_inodes);
9307  
9308  		if (in_reclaim_context &&
9309  		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9310  			continue;
9311  
9312  		inode = igrab(&binode->vfs_inode);
9313  		if (!inode) {
9314  			cond_resched_lock(&root->delalloc_lock);
9315  			continue;
9316  		}
9317  		spin_unlock(&root->delalloc_lock);
9318  
9319  		if (snapshot)
9320  			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9321  				&binode->runtime_flags);
9322  		if (full_flush) {
9323  			work = btrfs_alloc_delalloc_work(inode);
9324  			if (!work) {
9325  				iput(inode);
9326  				ret = -ENOMEM;
9327  				goto out;
9328  			}
9329  			list_add_tail(&work->list, &works);
9330  			btrfs_queue_work(root->fs_info->flush_workers,
9331  					 &work->work);
9332  		} else {
9333  			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9334  			btrfs_add_delayed_iput(BTRFS_I(inode));
9335  			if (ret || wbc->nr_to_write <= 0)
9336  				goto out;
9337  		}
9338  		cond_resched();
9339  		spin_lock(&root->delalloc_lock);
9340  	}
9341  	spin_unlock(&root->delalloc_lock);
9342  
9343  out:
9344  	list_for_each_entry_safe(work, next, &works, list) {
9345  		list_del_init(&work->list);
9346  		wait_for_completion(&work->completion);
9347  		kfree(work);
9348  	}
9349  
9350  	if (!list_empty(&splice)) {
9351  		spin_lock(&root->delalloc_lock);
9352  		list_splice_tail(&splice, &root->delalloc_inodes);
9353  		spin_unlock(&root->delalloc_lock);
9354  	}
9355  	mutex_unlock(&root->delalloc_mutex);
9356  	return ret;
9357  }
9358  
9359  int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9360  {
9361  	struct writeback_control wbc = {
9362  		.nr_to_write = LONG_MAX,
9363  		.sync_mode = WB_SYNC_NONE,
9364  		.range_start = 0,
9365  		.range_end = LLONG_MAX,
9366  	};
9367  	struct btrfs_fs_info *fs_info = root->fs_info;
9368  
9369  	if (BTRFS_FS_ERROR(fs_info))
9370  		return -EROFS;
9371  
9372  	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9373  }
9374  
9375  int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9376  			       bool in_reclaim_context)
9377  {
9378  	struct writeback_control wbc = {
9379  		.nr_to_write = nr,
9380  		.sync_mode = WB_SYNC_NONE,
9381  		.range_start = 0,
9382  		.range_end = LLONG_MAX,
9383  	};
9384  	struct btrfs_root *root;
9385  	LIST_HEAD(splice);
9386  	int ret;
9387  
9388  	if (BTRFS_FS_ERROR(fs_info))
9389  		return -EROFS;
9390  
9391  	mutex_lock(&fs_info->delalloc_root_mutex);
9392  	spin_lock(&fs_info->delalloc_root_lock);
9393  	list_splice_init(&fs_info->delalloc_roots, &splice);
9394  	while (!list_empty(&splice)) {
9395  		/*
9396  		 * Reset nr_to_write here so we know that we're doing a full
9397  		 * flush.
9398  		 */
9399  		if (nr == LONG_MAX)
9400  			wbc.nr_to_write = LONG_MAX;
9401  
9402  		root = list_first_entry(&splice, struct btrfs_root,
9403  					delalloc_root);
9404  		root = btrfs_grab_root(root);
9405  		BUG_ON(!root);
9406  		list_move_tail(&root->delalloc_root,
9407  			       &fs_info->delalloc_roots);
9408  		spin_unlock(&fs_info->delalloc_root_lock);
9409  
9410  		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9411  		btrfs_put_root(root);
9412  		if (ret < 0 || wbc.nr_to_write <= 0)
9413  			goto out;
9414  		spin_lock(&fs_info->delalloc_root_lock);
9415  	}
9416  	spin_unlock(&fs_info->delalloc_root_lock);
9417  
9418  	ret = 0;
9419  out:
9420  	if (!list_empty(&splice)) {
9421  		spin_lock(&fs_info->delalloc_root_lock);
9422  		list_splice_tail(&splice, &fs_info->delalloc_roots);
9423  		spin_unlock(&fs_info->delalloc_root_lock);
9424  	}
9425  	mutex_unlock(&fs_info->delalloc_root_mutex);
9426  	return ret;
9427  }
9428  
9429  static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
9430  			 struct dentry *dentry, const char *symname)
9431  {
9432  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9433  	struct btrfs_trans_handle *trans;
9434  	struct btrfs_root *root = BTRFS_I(dir)->root;
9435  	struct btrfs_path *path;
9436  	struct btrfs_key key;
9437  	struct inode *inode;
9438  	struct btrfs_new_inode_args new_inode_args = {
9439  		.dir = dir,
9440  		.dentry = dentry,
9441  	};
9442  	unsigned int trans_num_items;
9443  	int err;
9444  	int name_len;
9445  	int datasize;
9446  	unsigned long ptr;
9447  	struct btrfs_file_extent_item *ei;
9448  	struct extent_buffer *leaf;
9449  
9450  	name_len = strlen(symname);
9451  	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9452  		return -ENAMETOOLONG;
9453  
9454  	inode = new_inode(dir->i_sb);
9455  	if (!inode)
9456  		return -ENOMEM;
9457  	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
9458  	inode->i_op = &btrfs_symlink_inode_operations;
9459  	inode_nohighmem(inode);
9460  	inode->i_mapping->a_ops = &btrfs_aops;
9461  	btrfs_i_size_write(BTRFS_I(inode), name_len);
9462  	inode_set_bytes(inode, name_len);
9463  
9464  	new_inode_args.inode = inode;
9465  	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9466  	if (err)
9467  		goto out_inode;
9468  	/* 1 additional item for the inline extent */
9469  	trans_num_items++;
9470  
9471  	trans = btrfs_start_transaction(root, trans_num_items);
9472  	if (IS_ERR(trans)) {
9473  		err = PTR_ERR(trans);
9474  		goto out_new_inode_args;
9475  	}
9476  
9477  	err = btrfs_create_new_inode(trans, &new_inode_args);
9478  	if (err)
9479  		goto out;
9480  
9481  	path = btrfs_alloc_path();
9482  	if (!path) {
9483  		err = -ENOMEM;
9484  		btrfs_abort_transaction(trans, err);
9485  		discard_new_inode(inode);
9486  		inode = NULL;
9487  		goto out;
9488  	}
9489  	key.objectid = btrfs_ino(BTRFS_I(inode));
9490  	key.offset = 0;
9491  	key.type = BTRFS_EXTENT_DATA_KEY;
9492  	datasize = btrfs_file_extent_calc_inline_size(name_len);
9493  	err = btrfs_insert_empty_item(trans, root, path, &key,
9494  				      datasize);
9495  	if (err) {
9496  		btrfs_abort_transaction(trans, err);
9497  		btrfs_free_path(path);
9498  		discard_new_inode(inode);
9499  		inode = NULL;
9500  		goto out;
9501  	}
9502  	leaf = path->nodes[0];
9503  	ei = btrfs_item_ptr(leaf, path->slots[0],
9504  			    struct btrfs_file_extent_item);
9505  	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9506  	btrfs_set_file_extent_type(leaf, ei,
9507  				   BTRFS_FILE_EXTENT_INLINE);
9508  	btrfs_set_file_extent_encryption(leaf, ei, 0);
9509  	btrfs_set_file_extent_compression(leaf, ei, 0);
9510  	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9511  	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9512  
9513  	ptr = btrfs_file_extent_inline_start(ei);
9514  	write_extent_buffer(leaf, symname, ptr, name_len);
9515  	btrfs_mark_buffer_dirty(trans, leaf);
9516  	btrfs_free_path(path);
9517  
9518  	d_instantiate_new(dentry, inode);
9519  	err = 0;
9520  out:
9521  	btrfs_end_transaction(trans);
9522  	btrfs_btree_balance_dirty(fs_info);
9523  out_new_inode_args:
9524  	btrfs_new_inode_args_destroy(&new_inode_args);
9525  out_inode:
9526  	if (err)
9527  		iput(inode);
9528  	return err;
9529  }
9530  
9531  static struct btrfs_trans_handle *insert_prealloc_file_extent(
9532  				       struct btrfs_trans_handle *trans_in,
9533  				       struct btrfs_inode *inode,
9534  				       struct btrfs_key *ins,
9535  				       u64 file_offset)
9536  {
9537  	struct btrfs_file_extent_item stack_fi;
9538  	struct btrfs_replace_extent_info extent_info;
9539  	struct btrfs_trans_handle *trans = trans_in;
9540  	struct btrfs_path *path;
9541  	u64 start = ins->objectid;
9542  	u64 len = ins->offset;
9543  	u64 qgroup_released = 0;
9544  	int ret;
9545  
9546  	memset(&stack_fi, 0, sizeof(stack_fi));
9547  
9548  	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9549  	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9550  	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9551  	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9552  	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9553  	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9554  	/* Encryption and other encoding is reserved and all 0 */
9555  
9556  	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
9557  	if (ret < 0)
9558  		return ERR_PTR(ret);
9559  
9560  	if (trans) {
9561  		ret = insert_reserved_file_extent(trans, inode,
9562  						  file_offset, &stack_fi,
9563  						  true, qgroup_released);
9564  		if (ret)
9565  			goto free_qgroup;
9566  		return trans;
9567  	}
9568  
9569  	extent_info.disk_offset = start;
9570  	extent_info.disk_len = len;
9571  	extent_info.data_offset = 0;
9572  	extent_info.data_len = len;
9573  	extent_info.file_offset = file_offset;
9574  	extent_info.extent_buf = (char *)&stack_fi;
9575  	extent_info.is_new_extent = true;
9576  	extent_info.update_times = true;
9577  	extent_info.qgroup_reserved = qgroup_released;
9578  	extent_info.insertions = 0;
9579  
9580  	path = btrfs_alloc_path();
9581  	if (!path) {
9582  		ret = -ENOMEM;
9583  		goto free_qgroup;
9584  	}
9585  
9586  	ret = btrfs_replace_file_extents(inode, path, file_offset,
9587  				     file_offset + len - 1, &extent_info,
9588  				     &trans);
9589  	btrfs_free_path(path);
9590  	if (ret)
9591  		goto free_qgroup;
9592  	return trans;
9593  
9594  free_qgroup:
9595  	/*
9596  	 * We have released qgroup data range at the beginning of the function,
9597  	 * and normally qgroup_released bytes will be freed when committing
9598  	 * transaction.
9599  	 * But if we error out early, we have to free what we have released
9600  	 * or we leak qgroup data reservation.
9601  	 */
9602  	btrfs_qgroup_free_refroot(inode->root->fs_info,
9603  			inode->root->root_key.objectid, qgroup_released,
9604  			BTRFS_QGROUP_RSV_DATA);
9605  	return ERR_PTR(ret);
9606  }
9607  
9608  static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9609  				       u64 start, u64 num_bytes, u64 min_size,
9610  				       loff_t actual_len, u64 *alloc_hint,
9611  				       struct btrfs_trans_handle *trans)
9612  {
9613  	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9614  	struct extent_map *em;
9615  	struct btrfs_root *root = BTRFS_I(inode)->root;
9616  	struct btrfs_key ins;
9617  	u64 cur_offset = start;
9618  	u64 clear_offset = start;
9619  	u64 i_size;
9620  	u64 cur_bytes;
9621  	u64 last_alloc = (u64)-1;
9622  	int ret = 0;
9623  	bool own_trans = true;
9624  	u64 end = start + num_bytes - 1;
9625  
9626  	if (trans)
9627  		own_trans = false;
9628  	while (num_bytes > 0) {
9629  		cur_bytes = min_t(u64, num_bytes, SZ_256M);
9630  		cur_bytes = max(cur_bytes, min_size);
9631  		/*
9632  		 * If we are severely fragmented we could end up with really
9633  		 * small allocations, so if the allocator is returning small
9634  		 * chunks lets make its job easier by only searching for those
9635  		 * sized chunks.
9636  		 */
9637  		cur_bytes = min(cur_bytes, last_alloc);
9638  		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9639  				min_size, 0, *alloc_hint, &ins, 1, 0);
9640  		if (ret)
9641  			break;
9642  
9643  		/*
9644  		 * We've reserved this space, and thus converted it from
9645  		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9646  		 * from here on out we will only need to clear our reservation
9647  		 * for the remaining unreserved area, so advance our
9648  		 * clear_offset by our extent size.
9649  		 */
9650  		clear_offset += ins.offset;
9651  
9652  		last_alloc = ins.offset;
9653  		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9654  						    &ins, cur_offset);
9655  		/*
9656  		 * Now that we inserted the prealloc extent we can finally
9657  		 * decrement the number of reservations in the block group.
9658  		 * If we did it before, we could race with relocation and have
9659  		 * relocation miss the reserved extent, making it fail later.
9660  		 */
9661  		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9662  		if (IS_ERR(trans)) {
9663  			ret = PTR_ERR(trans);
9664  			btrfs_free_reserved_extent(fs_info, ins.objectid,
9665  						   ins.offset, 0);
9666  			break;
9667  		}
9668  
9669  		em = alloc_extent_map();
9670  		if (!em) {
9671  			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9672  					    cur_offset + ins.offset - 1, false);
9673  			btrfs_set_inode_full_sync(BTRFS_I(inode));
9674  			goto next;
9675  		}
9676  
9677  		em->start = cur_offset;
9678  		em->orig_start = cur_offset;
9679  		em->len = ins.offset;
9680  		em->block_start = ins.objectid;
9681  		em->block_len = ins.offset;
9682  		em->orig_block_len = ins.offset;
9683  		em->ram_bytes = ins.offset;
9684  		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9685  		em->generation = trans->transid;
9686  
9687  		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9688  		free_extent_map(em);
9689  next:
9690  		num_bytes -= ins.offset;
9691  		cur_offset += ins.offset;
9692  		*alloc_hint = ins.objectid + ins.offset;
9693  
9694  		inode_inc_iversion(inode);
9695  		inode_set_ctime_current(inode);
9696  		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9697  		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9698  		    (actual_len > inode->i_size) &&
9699  		    (cur_offset > inode->i_size)) {
9700  			if (cur_offset > actual_len)
9701  				i_size = actual_len;
9702  			else
9703  				i_size = cur_offset;
9704  			i_size_write(inode, i_size);
9705  			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9706  		}
9707  
9708  		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9709  
9710  		if (ret) {
9711  			btrfs_abort_transaction(trans, ret);
9712  			if (own_trans)
9713  				btrfs_end_transaction(trans);
9714  			break;
9715  		}
9716  
9717  		if (own_trans) {
9718  			btrfs_end_transaction(trans);
9719  			trans = NULL;
9720  		}
9721  	}
9722  	if (clear_offset < end)
9723  		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9724  			end - clear_offset + 1);
9725  	return ret;
9726  }
9727  
9728  int btrfs_prealloc_file_range(struct inode *inode, int mode,
9729  			      u64 start, u64 num_bytes, u64 min_size,
9730  			      loff_t actual_len, u64 *alloc_hint)
9731  {
9732  	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9733  					   min_size, actual_len, alloc_hint,
9734  					   NULL);
9735  }
9736  
9737  int btrfs_prealloc_file_range_trans(struct inode *inode,
9738  				    struct btrfs_trans_handle *trans, int mode,
9739  				    u64 start, u64 num_bytes, u64 min_size,
9740  				    loff_t actual_len, u64 *alloc_hint)
9741  {
9742  	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9743  					   min_size, actual_len, alloc_hint, trans);
9744  }
9745  
9746  static int btrfs_permission(struct mnt_idmap *idmap,
9747  			    struct inode *inode, int mask)
9748  {
9749  	struct btrfs_root *root = BTRFS_I(inode)->root;
9750  	umode_t mode = inode->i_mode;
9751  
9752  	if (mask & MAY_WRITE &&
9753  	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9754  		if (btrfs_root_readonly(root))
9755  			return -EROFS;
9756  		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9757  			return -EACCES;
9758  	}
9759  	return generic_permission(idmap, inode, mask);
9760  }
9761  
9762  static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9763  			 struct file *file, umode_t mode)
9764  {
9765  	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9766  	struct btrfs_trans_handle *trans;
9767  	struct btrfs_root *root = BTRFS_I(dir)->root;
9768  	struct inode *inode;
9769  	struct btrfs_new_inode_args new_inode_args = {
9770  		.dir = dir,
9771  		.dentry = file->f_path.dentry,
9772  		.orphan = true,
9773  	};
9774  	unsigned int trans_num_items;
9775  	int ret;
9776  
9777  	inode = new_inode(dir->i_sb);
9778  	if (!inode)
9779  		return -ENOMEM;
9780  	inode_init_owner(idmap, inode, dir, mode);
9781  	inode->i_fop = &btrfs_file_operations;
9782  	inode->i_op = &btrfs_file_inode_operations;
9783  	inode->i_mapping->a_ops = &btrfs_aops;
9784  
9785  	new_inode_args.inode = inode;
9786  	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9787  	if (ret)
9788  		goto out_inode;
9789  
9790  	trans = btrfs_start_transaction(root, trans_num_items);
9791  	if (IS_ERR(trans)) {
9792  		ret = PTR_ERR(trans);
9793  		goto out_new_inode_args;
9794  	}
9795  
9796  	ret = btrfs_create_new_inode(trans, &new_inode_args);
9797  
9798  	/*
9799  	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9800  	 * set it to 1 because d_tmpfile() will issue a warning if the count is
9801  	 * 0, through:
9802  	 *
9803  	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9804  	 */
9805  	set_nlink(inode, 1);
9806  
9807  	if (!ret) {
9808  		d_tmpfile(file, inode);
9809  		unlock_new_inode(inode);
9810  		mark_inode_dirty(inode);
9811  	}
9812  
9813  	btrfs_end_transaction(trans);
9814  	btrfs_btree_balance_dirty(fs_info);
9815  out_new_inode_args:
9816  	btrfs_new_inode_args_destroy(&new_inode_args);
9817  out_inode:
9818  	if (ret)
9819  		iput(inode);
9820  	return finish_open_simple(file, ret);
9821  }
9822  
9823  void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
9824  {
9825  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9826  	unsigned long index = start >> PAGE_SHIFT;
9827  	unsigned long end_index = end >> PAGE_SHIFT;
9828  	struct page *page;
9829  	u32 len;
9830  
9831  	ASSERT(end + 1 - start <= U32_MAX);
9832  	len = end + 1 - start;
9833  	while (index <= end_index) {
9834  		page = find_get_page(inode->vfs_inode.i_mapping, index);
9835  		ASSERT(page); /* Pages should be in the extent_io_tree */
9836  
9837  		btrfs_page_set_writeback(fs_info, page, start, len);
9838  		put_page(page);
9839  		index++;
9840  	}
9841  }
9842  
9843  int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9844  					     int compress_type)
9845  {
9846  	switch (compress_type) {
9847  	case BTRFS_COMPRESS_NONE:
9848  		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9849  	case BTRFS_COMPRESS_ZLIB:
9850  		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9851  	case BTRFS_COMPRESS_LZO:
9852  		/*
9853  		 * The LZO format depends on the sector size. 64K is the maximum
9854  		 * sector size that we support.
9855  		 */
9856  		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9857  			return -EINVAL;
9858  		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9859  		       (fs_info->sectorsize_bits - 12);
9860  	case BTRFS_COMPRESS_ZSTD:
9861  		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9862  	default:
9863  		return -EUCLEAN;
9864  	}
9865  }
9866  
9867  static ssize_t btrfs_encoded_read_inline(
9868  				struct kiocb *iocb,
9869  				struct iov_iter *iter, u64 start,
9870  				u64 lockend,
9871  				struct extent_state **cached_state,
9872  				u64 extent_start, size_t count,
9873  				struct btrfs_ioctl_encoded_io_args *encoded,
9874  				bool *unlocked)
9875  {
9876  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9877  	struct btrfs_root *root = inode->root;
9878  	struct btrfs_fs_info *fs_info = root->fs_info;
9879  	struct extent_io_tree *io_tree = &inode->io_tree;
9880  	struct btrfs_path *path;
9881  	struct extent_buffer *leaf;
9882  	struct btrfs_file_extent_item *item;
9883  	u64 ram_bytes;
9884  	unsigned long ptr;
9885  	void *tmp;
9886  	ssize_t ret;
9887  
9888  	path = btrfs_alloc_path();
9889  	if (!path) {
9890  		ret = -ENOMEM;
9891  		goto out;
9892  	}
9893  	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9894  				       extent_start, 0);
9895  	if (ret) {
9896  		if (ret > 0) {
9897  			/* The extent item disappeared? */
9898  			ret = -EIO;
9899  		}
9900  		goto out;
9901  	}
9902  	leaf = path->nodes[0];
9903  	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9904  
9905  	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9906  	ptr = btrfs_file_extent_inline_start(item);
9907  
9908  	encoded->len = min_t(u64, extent_start + ram_bytes,
9909  			     inode->vfs_inode.i_size) - iocb->ki_pos;
9910  	ret = btrfs_encoded_io_compression_from_extent(fs_info,
9911  				 btrfs_file_extent_compression(leaf, item));
9912  	if (ret < 0)
9913  		goto out;
9914  	encoded->compression = ret;
9915  	if (encoded->compression) {
9916  		size_t inline_size;
9917  
9918  		inline_size = btrfs_file_extent_inline_item_len(leaf,
9919  								path->slots[0]);
9920  		if (inline_size > count) {
9921  			ret = -ENOBUFS;
9922  			goto out;
9923  		}
9924  		count = inline_size;
9925  		encoded->unencoded_len = ram_bytes;
9926  		encoded->unencoded_offset = iocb->ki_pos - extent_start;
9927  	} else {
9928  		count = min_t(u64, count, encoded->len);
9929  		encoded->len = count;
9930  		encoded->unencoded_len = count;
9931  		ptr += iocb->ki_pos - extent_start;
9932  	}
9933  
9934  	tmp = kmalloc(count, GFP_NOFS);
9935  	if (!tmp) {
9936  		ret = -ENOMEM;
9937  		goto out;
9938  	}
9939  	read_extent_buffer(leaf, tmp, ptr, count);
9940  	btrfs_release_path(path);
9941  	unlock_extent(io_tree, start, lockend, cached_state);
9942  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9943  	*unlocked = true;
9944  
9945  	ret = copy_to_iter(tmp, count, iter);
9946  	if (ret != count)
9947  		ret = -EFAULT;
9948  	kfree(tmp);
9949  out:
9950  	btrfs_free_path(path);
9951  	return ret;
9952  }
9953  
9954  struct btrfs_encoded_read_private {
9955  	wait_queue_head_t wait;
9956  	atomic_t pending;
9957  	blk_status_t status;
9958  };
9959  
9960  static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9961  {
9962  	struct btrfs_encoded_read_private *priv = bbio->private;
9963  
9964  	if (bbio->bio.bi_status) {
9965  		/*
9966  		 * The memory barrier implied by the atomic_dec_return() here
9967  		 * pairs with the memory barrier implied by the
9968  		 * atomic_dec_return() or io_wait_event() in
9969  		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9970  		 * write is observed before the load of status in
9971  		 * btrfs_encoded_read_regular_fill_pages().
9972  		 */
9973  		WRITE_ONCE(priv->status, bbio->bio.bi_status);
9974  	}
9975  	if (atomic_dec_and_test(&priv->pending))
9976  		wake_up(&priv->wait);
9977  	bio_put(&bbio->bio);
9978  }
9979  
9980  int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9981  					  u64 file_offset, u64 disk_bytenr,
9982  					  u64 disk_io_size, struct page **pages)
9983  {
9984  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9985  	struct btrfs_encoded_read_private priv = {
9986  		.pending = ATOMIC_INIT(1),
9987  	};
9988  	unsigned long i = 0;
9989  	struct btrfs_bio *bbio;
9990  
9991  	init_waitqueue_head(&priv.wait);
9992  
9993  	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9994  			       btrfs_encoded_read_endio, &priv);
9995  	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9996  	bbio->inode = inode;
9997  
9998  	do {
9999  		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
10000  
10001  		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
10002  			atomic_inc(&priv.pending);
10003  			btrfs_submit_bio(bbio, 0);
10004  
10005  			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
10006  					       btrfs_encoded_read_endio, &priv);
10007  			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
10008  			bbio->inode = inode;
10009  			continue;
10010  		}
10011  
10012  		i++;
10013  		disk_bytenr += bytes;
10014  		disk_io_size -= bytes;
10015  	} while (disk_io_size);
10016  
10017  	atomic_inc(&priv.pending);
10018  	btrfs_submit_bio(bbio, 0);
10019  
10020  	if (atomic_dec_return(&priv.pending))
10021  		io_wait_event(priv.wait, !atomic_read(&priv.pending));
10022  	/* See btrfs_encoded_read_endio() for ordering. */
10023  	return blk_status_to_errno(READ_ONCE(priv.status));
10024  }
10025  
10026  static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
10027  					  struct iov_iter *iter,
10028  					  u64 start, u64 lockend,
10029  					  struct extent_state **cached_state,
10030  					  u64 disk_bytenr, u64 disk_io_size,
10031  					  size_t count, bool compressed,
10032  					  bool *unlocked)
10033  {
10034  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10035  	struct extent_io_tree *io_tree = &inode->io_tree;
10036  	struct page **pages;
10037  	unsigned long nr_pages, i;
10038  	u64 cur;
10039  	size_t page_offset;
10040  	ssize_t ret;
10041  
10042  	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10043  	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10044  	if (!pages)
10045  		return -ENOMEM;
10046  	ret = btrfs_alloc_page_array(nr_pages, pages);
10047  	if (ret) {
10048  		ret = -ENOMEM;
10049  		goto out;
10050  		}
10051  
10052  	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10053  						    disk_io_size, pages);
10054  	if (ret)
10055  		goto out;
10056  
10057  	unlock_extent(io_tree, start, lockend, cached_state);
10058  	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10059  	*unlocked = true;
10060  
10061  	if (compressed) {
10062  		i = 0;
10063  		page_offset = 0;
10064  	} else {
10065  		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10066  		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10067  	}
10068  	cur = 0;
10069  	while (cur < count) {
10070  		size_t bytes = min_t(size_t, count - cur,
10071  				     PAGE_SIZE - page_offset);
10072  
10073  		if (copy_page_to_iter(pages[i], page_offset, bytes,
10074  				      iter) != bytes) {
10075  			ret = -EFAULT;
10076  			goto out;
10077  		}
10078  		i++;
10079  		cur += bytes;
10080  		page_offset = 0;
10081  	}
10082  	ret = count;
10083  out:
10084  	for (i = 0; i < nr_pages; i++) {
10085  		if (pages[i])
10086  			__free_page(pages[i]);
10087  	}
10088  	kfree(pages);
10089  	return ret;
10090  }
10091  
10092  ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10093  			   struct btrfs_ioctl_encoded_io_args *encoded)
10094  {
10095  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10096  	struct btrfs_fs_info *fs_info = inode->root->fs_info;
10097  	struct extent_io_tree *io_tree = &inode->io_tree;
10098  	ssize_t ret;
10099  	size_t count = iov_iter_count(iter);
10100  	u64 start, lockend, disk_bytenr, disk_io_size;
10101  	struct extent_state *cached_state = NULL;
10102  	struct extent_map *em;
10103  	bool unlocked = false;
10104  
10105  	file_accessed(iocb->ki_filp);
10106  
10107  	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
10108  
10109  	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10110  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10111  		return 0;
10112  	}
10113  	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10114  	/*
10115  	 * We don't know how long the extent containing iocb->ki_pos is, but if
10116  	 * it's compressed we know that it won't be longer than this.
10117  	 */
10118  	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10119  
10120  	for (;;) {
10121  		struct btrfs_ordered_extent *ordered;
10122  
10123  		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10124  					       lockend - start + 1);
10125  		if (ret)
10126  			goto out_unlock_inode;
10127  		lock_extent(io_tree, start, lockend, &cached_state);
10128  		ordered = btrfs_lookup_ordered_range(inode, start,
10129  						     lockend - start + 1);
10130  		if (!ordered)
10131  			break;
10132  		btrfs_put_ordered_extent(ordered);
10133  		unlock_extent(io_tree, start, lockend, &cached_state);
10134  		cond_resched();
10135  	}
10136  
10137  	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10138  	if (IS_ERR(em)) {
10139  		ret = PTR_ERR(em);
10140  		goto out_unlock_extent;
10141  	}
10142  
10143  	if (em->block_start == EXTENT_MAP_INLINE) {
10144  		u64 extent_start = em->start;
10145  
10146  		/*
10147  		 * For inline extents we get everything we need out of the
10148  		 * extent item.
10149  		 */
10150  		free_extent_map(em);
10151  		em = NULL;
10152  		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10153  						&cached_state, extent_start,
10154  						count, encoded, &unlocked);
10155  		goto out;
10156  	}
10157  
10158  	/*
10159  	 * We only want to return up to EOF even if the extent extends beyond
10160  	 * that.
10161  	 */
10162  	encoded->len = min_t(u64, extent_map_end(em),
10163  			     inode->vfs_inode.i_size) - iocb->ki_pos;
10164  	if (em->block_start == EXTENT_MAP_HOLE ||
10165  	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10166  		disk_bytenr = EXTENT_MAP_HOLE;
10167  		count = min_t(u64, count, encoded->len);
10168  		encoded->len = count;
10169  		encoded->unencoded_len = count;
10170  	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10171  		disk_bytenr = em->block_start;
10172  		/*
10173  		 * Bail if the buffer isn't large enough to return the whole
10174  		 * compressed extent.
10175  		 */
10176  		if (em->block_len > count) {
10177  			ret = -ENOBUFS;
10178  			goto out_em;
10179  		}
10180  		disk_io_size = em->block_len;
10181  		count = em->block_len;
10182  		encoded->unencoded_len = em->ram_bytes;
10183  		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10184  		ret = btrfs_encoded_io_compression_from_extent(fs_info,
10185  							     em->compress_type);
10186  		if (ret < 0)
10187  			goto out_em;
10188  		encoded->compression = ret;
10189  	} else {
10190  		disk_bytenr = em->block_start + (start - em->start);
10191  		if (encoded->len > count)
10192  			encoded->len = count;
10193  		/*
10194  		 * Don't read beyond what we locked. This also limits the page
10195  		 * allocations that we'll do.
10196  		 */
10197  		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10198  		count = start + disk_io_size - iocb->ki_pos;
10199  		encoded->len = count;
10200  		encoded->unencoded_len = count;
10201  		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10202  	}
10203  	free_extent_map(em);
10204  	em = NULL;
10205  
10206  	if (disk_bytenr == EXTENT_MAP_HOLE) {
10207  		unlock_extent(io_tree, start, lockend, &cached_state);
10208  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10209  		unlocked = true;
10210  		ret = iov_iter_zero(count, iter);
10211  		if (ret != count)
10212  			ret = -EFAULT;
10213  	} else {
10214  		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10215  						 &cached_state, disk_bytenr,
10216  						 disk_io_size, count,
10217  						 encoded->compression,
10218  						 &unlocked);
10219  	}
10220  
10221  out:
10222  	if (ret >= 0)
10223  		iocb->ki_pos += encoded->len;
10224  out_em:
10225  	free_extent_map(em);
10226  out_unlock_extent:
10227  	if (!unlocked)
10228  		unlock_extent(io_tree, start, lockend, &cached_state);
10229  out_unlock_inode:
10230  	if (!unlocked)
10231  		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10232  	return ret;
10233  }
10234  
10235  ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10236  			       const struct btrfs_ioctl_encoded_io_args *encoded)
10237  {
10238  	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10239  	struct btrfs_root *root = inode->root;
10240  	struct btrfs_fs_info *fs_info = root->fs_info;
10241  	struct extent_io_tree *io_tree = &inode->io_tree;
10242  	struct extent_changeset *data_reserved = NULL;
10243  	struct extent_state *cached_state = NULL;
10244  	struct btrfs_ordered_extent *ordered;
10245  	int compression;
10246  	size_t orig_count;
10247  	u64 start, end;
10248  	u64 num_bytes, ram_bytes, disk_num_bytes;
10249  	unsigned long nr_pages, i;
10250  	struct page **pages;
10251  	struct btrfs_key ins;
10252  	bool extent_reserved = false;
10253  	struct extent_map *em;
10254  	ssize_t ret;
10255  
10256  	switch (encoded->compression) {
10257  	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10258  		compression = BTRFS_COMPRESS_ZLIB;
10259  		break;
10260  	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10261  		compression = BTRFS_COMPRESS_ZSTD;
10262  		break;
10263  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10264  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10265  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10266  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10267  	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10268  		/* The sector size must match for LZO. */
10269  		if (encoded->compression -
10270  		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10271  		    fs_info->sectorsize_bits)
10272  			return -EINVAL;
10273  		compression = BTRFS_COMPRESS_LZO;
10274  		break;
10275  	default:
10276  		return -EINVAL;
10277  	}
10278  	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10279  		return -EINVAL;
10280  
10281  	/*
10282  	 * Compressed extents should always have checksums, so error out if we
10283  	 * have a NOCOW file or inode was created while mounted with NODATASUM.
10284  	 */
10285  	if (inode->flags & BTRFS_INODE_NODATASUM)
10286  		return -EINVAL;
10287  
10288  	orig_count = iov_iter_count(from);
10289  
10290  	/* The extent size must be sane. */
10291  	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10292  	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10293  		return -EINVAL;
10294  
10295  	/*
10296  	 * The compressed data must be smaller than the decompressed data.
10297  	 *
10298  	 * It's of course possible for data to compress to larger or the same
10299  	 * size, but the buffered I/O path falls back to no compression for such
10300  	 * data, and we don't want to break any assumptions by creating these
10301  	 * extents.
10302  	 *
10303  	 * Note that this is less strict than the current check we have that the
10304  	 * compressed data must be at least one sector smaller than the
10305  	 * decompressed data. We only want to enforce the weaker requirement
10306  	 * from old kernels that it is at least one byte smaller.
10307  	 */
10308  	if (orig_count >= encoded->unencoded_len)
10309  		return -EINVAL;
10310  
10311  	/* The extent must start on a sector boundary. */
10312  	start = iocb->ki_pos;
10313  	if (!IS_ALIGNED(start, fs_info->sectorsize))
10314  		return -EINVAL;
10315  
10316  	/*
10317  	 * The extent must end on a sector boundary. However, we allow a write
10318  	 * which ends at or extends i_size to have an unaligned length; we round
10319  	 * up the extent size and set i_size to the unaligned end.
10320  	 */
10321  	if (start + encoded->len < inode->vfs_inode.i_size &&
10322  	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10323  		return -EINVAL;
10324  
10325  	/* Finally, the offset in the unencoded data must be sector-aligned. */
10326  	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10327  		return -EINVAL;
10328  
10329  	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10330  	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10331  	end = start + num_bytes - 1;
10332  
10333  	/*
10334  	 * If the extent cannot be inline, the compressed data on disk must be
10335  	 * sector-aligned. For convenience, we extend it with zeroes if it
10336  	 * isn't.
10337  	 */
10338  	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10339  	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10340  	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10341  	if (!pages)
10342  		return -ENOMEM;
10343  	for (i = 0; i < nr_pages; i++) {
10344  		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10345  		char *kaddr;
10346  
10347  		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10348  		if (!pages[i]) {
10349  			ret = -ENOMEM;
10350  			goto out_pages;
10351  		}
10352  		kaddr = kmap_local_page(pages[i]);
10353  		if (copy_from_iter(kaddr, bytes, from) != bytes) {
10354  			kunmap_local(kaddr);
10355  			ret = -EFAULT;
10356  			goto out_pages;
10357  		}
10358  		if (bytes < PAGE_SIZE)
10359  			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10360  		kunmap_local(kaddr);
10361  	}
10362  
10363  	for (;;) {
10364  		struct btrfs_ordered_extent *ordered;
10365  
10366  		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10367  		if (ret)
10368  			goto out_pages;
10369  		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10370  						    start >> PAGE_SHIFT,
10371  						    end >> PAGE_SHIFT);
10372  		if (ret)
10373  			goto out_pages;
10374  		lock_extent(io_tree, start, end, &cached_state);
10375  		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10376  		if (!ordered &&
10377  		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10378  			break;
10379  		if (ordered)
10380  			btrfs_put_ordered_extent(ordered);
10381  		unlock_extent(io_tree, start, end, &cached_state);
10382  		cond_resched();
10383  	}
10384  
10385  	/*
10386  	 * We don't use the higher-level delalloc space functions because our
10387  	 * num_bytes and disk_num_bytes are different.
10388  	 */
10389  	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10390  	if (ret)
10391  		goto out_unlock;
10392  	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10393  	if (ret)
10394  		goto out_free_data_space;
10395  	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10396  					      false);
10397  	if (ret)
10398  		goto out_qgroup_free_data;
10399  
10400  	/* Try an inline extent first. */
10401  	if (start == 0 && encoded->unencoded_len == encoded->len &&
10402  	    encoded->unencoded_offset == 0) {
10403  		ret = cow_file_range_inline(inode, encoded->len, orig_count,
10404  					    compression, pages, true);
10405  		if (ret <= 0) {
10406  			if (ret == 0)
10407  				ret = orig_count;
10408  			goto out_delalloc_release;
10409  		}
10410  	}
10411  
10412  	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10413  				   disk_num_bytes, 0, 0, &ins, 1, 1);
10414  	if (ret)
10415  		goto out_delalloc_release;
10416  	extent_reserved = true;
10417  
10418  	em = create_io_em(inode, start, num_bytes,
10419  			  start - encoded->unencoded_offset, ins.objectid,
10420  			  ins.offset, ins.offset, ram_bytes, compression,
10421  			  BTRFS_ORDERED_COMPRESSED);
10422  	if (IS_ERR(em)) {
10423  		ret = PTR_ERR(em);
10424  		goto out_free_reserved;
10425  	}
10426  	free_extent_map(em);
10427  
10428  	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10429  				       ins.objectid, ins.offset,
10430  				       encoded->unencoded_offset,
10431  				       (1 << BTRFS_ORDERED_ENCODED) |
10432  				       (1 << BTRFS_ORDERED_COMPRESSED),
10433  				       compression);
10434  	if (IS_ERR(ordered)) {
10435  		btrfs_drop_extent_map_range(inode, start, end, false);
10436  		ret = PTR_ERR(ordered);
10437  		goto out_free_reserved;
10438  	}
10439  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10440  
10441  	if (start + encoded->len > inode->vfs_inode.i_size)
10442  		i_size_write(&inode->vfs_inode, start + encoded->len);
10443  
10444  	unlock_extent(io_tree, start, end, &cached_state);
10445  
10446  	btrfs_delalloc_release_extents(inode, num_bytes);
10447  
10448  	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10449  	ret = orig_count;
10450  	goto out;
10451  
10452  out_free_reserved:
10453  	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10454  	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10455  out_delalloc_release:
10456  	btrfs_delalloc_release_extents(inode, num_bytes);
10457  	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10458  out_qgroup_free_data:
10459  	if (ret < 0)
10460  		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
10461  out_free_data_space:
10462  	/*
10463  	 * If btrfs_reserve_extent() succeeded, then we already decremented
10464  	 * bytes_may_use.
10465  	 */
10466  	if (!extent_reserved)
10467  		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10468  out_unlock:
10469  	unlock_extent(io_tree, start, end, &cached_state);
10470  out_pages:
10471  	for (i = 0; i < nr_pages; i++) {
10472  		if (pages[i])
10473  			__free_page(pages[i]);
10474  	}
10475  	kvfree(pages);
10476  out:
10477  	if (ret >= 0)
10478  		iocb->ki_pos += encoded->len;
10479  	return ret;
10480  }
10481  
10482  #ifdef CONFIG_SWAP
10483  /*
10484   * Add an entry indicating a block group or device which is pinned by a
10485   * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10486   * negative errno on failure.
10487   */
10488  static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10489  				  bool is_block_group)
10490  {
10491  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10492  	struct btrfs_swapfile_pin *sp, *entry;
10493  	struct rb_node **p;
10494  	struct rb_node *parent = NULL;
10495  
10496  	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10497  	if (!sp)
10498  		return -ENOMEM;
10499  	sp->ptr = ptr;
10500  	sp->inode = inode;
10501  	sp->is_block_group = is_block_group;
10502  	sp->bg_extent_count = 1;
10503  
10504  	spin_lock(&fs_info->swapfile_pins_lock);
10505  	p = &fs_info->swapfile_pins.rb_node;
10506  	while (*p) {
10507  		parent = *p;
10508  		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10509  		if (sp->ptr < entry->ptr ||
10510  		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10511  			p = &(*p)->rb_left;
10512  		} else if (sp->ptr > entry->ptr ||
10513  			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10514  			p = &(*p)->rb_right;
10515  		} else {
10516  			if (is_block_group)
10517  				entry->bg_extent_count++;
10518  			spin_unlock(&fs_info->swapfile_pins_lock);
10519  			kfree(sp);
10520  			return 1;
10521  		}
10522  	}
10523  	rb_link_node(&sp->node, parent, p);
10524  	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10525  	spin_unlock(&fs_info->swapfile_pins_lock);
10526  	return 0;
10527  }
10528  
10529  /* Free all of the entries pinned by this swapfile. */
10530  static void btrfs_free_swapfile_pins(struct inode *inode)
10531  {
10532  	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10533  	struct btrfs_swapfile_pin *sp;
10534  	struct rb_node *node, *next;
10535  
10536  	spin_lock(&fs_info->swapfile_pins_lock);
10537  	node = rb_first(&fs_info->swapfile_pins);
10538  	while (node) {
10539  		next = rb_next(node);
10540  		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10541  		if (sp->inode == inode) {
10542  			rb_erase(&sp->node, &fs_info->swapfile_pins);
10543  			if (sp->is_block_group) {
10544  				btrfs_dec_block_group_swap_extents(sp->ptr,
10545  							   sp->bg_extent_count);
10546  				btrfs_put_block_group(sp->ptr);
10547  			}
10548  			kfree(sp);
10549  		}
10550  		node = next;
10551  	}
10552  	spin_unlock(&fs_info->swapfile_pins_lock);
10553  }
10554  
10555  struct btrfs_swap_info {
10556  	u64 start;
10557  	u64 block_start;
10558  	u64 block_len;
10559  	u64 lowest_ppage;
10560  	u64 highest_ppage;
10561  	unsigned long nr_pages;
10562  	int nr_extents;
10563  };
10564  
10565  static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10566  				 struct btrfs_swap_info *bsi)
10567  {
10568  	unsigned long nr_pages;
10569  	unsigned long max_pages;
10570  	u64 first_ppage, first_ppage_reported, next_ppage;
10571  	int ret;
10572  
10573  	/*
10574  	 * Our swapfile may have had its size extended after the swap header was
10575  	 * written. In that case activating the swapfile should not go beyond
10576  	 * the max size set in the swap header.
10577  	 */
10578  	if (bsi->nr_pages >= sis->max)
10579  		return 0;
10580  
10581  	max_pages = sis->max - bsi->nr_pages;
10582  	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10583  	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10584  
10585  	if (first_ppage >= next_ppage)
10586  		return 0;
10587  	nr_pages = next_ppage - first_ppage;
10588  	nr_pages = min(nr_pages, max_pages);
10589  
10590  	first_ppage_reported = first_ppage;
10591  	if (bsi->start == 0)
10592  		first_ppage_reported++;
10593  	if (bsi->lowest_ppage > first_ppage_reported)
10594  		bsi->lowest_ppage = first_ppage_reported;
10595  	if (bsi->highest_ppage < (next_ppage - 1))
10596  		bsi->highest_ppage = next_ppage - 1;
10597  
10598  	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10599  	if (ret < 0)
10600  		return ret;
10601  	bsi->nr_extents += ret;
10602  	bsi->nr_pages += nr_pages;
10603  	return 0;
10604  }
10605  
10606  static void btrfs_swap_deactivate(struct file *file)
10607  {
10608  	struct inode *inode = file_inode(file);
10609  
10610  	btrfs_free_swapfile_pins(inode);
10611  	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10612  }
10613  
10614  static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10615  			       sector_t *span)
10616  {
10617  	struct inode *inode = file_inode(file);
10618  	struct btrfs_root *root = BTRFS_I(inode)->root;
10619  	struct btrfs_fs_info *fs_info = root->fs_info;
10620  	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10621  	struct extent_state *cached_state = NULL;
10622  	struct extent_map *em = NULL;
10623  	struct btrfs_device *device = NULL;
10624  	struct btrfs_swap_info bsi = {
10625  		.lowest_ppage = (sector_t)-1ULL,
10626  	};
10627  	int ret = 0;
10628  	u64 isize;
10629  	u64 start;
10630  
10631  	/*
10632  	 * If the swap file was just created, make sure delalloc is done. If the
10633  	 * file changes again after this, the user is doing something stupid and
10634  	 * we don't really care.
10635  	 */
10636  	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10637  	if (ret)
10638  		return ret;
10639  
10640  	/*
10641  	 * The inode is locked, so these flags won't change after we check them.
10642  	 */
10643  	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10644  		btrfs_warn(fs_info, "swapfile must not be compressed");
10645  		return -EINVAL;
10646  	}
10647  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10648  		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10649  		return -EINVAL;
10650  	}
10651  	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10652  		btrfs_warn(fs_info, "swapfile must not be checksummed");
10653  		return -EINVAL;
10654  	}
10655  
10656  	/*
10657  	 * Balance or device remove/replace/resize can move stuff around from
10658  	 * under us. The exclop protection makes sure they aren't running/won't
10659  	 * run concurrently while we are mapping the swap extents, and
10660  	 * fs_info->swapfile_pins prevents them from running while the swap
10661  	 * file is active and moving the extents. Note that this also prevents
10662  	 * a concurrent device add which isn't actually necessary, but it's not
10663  	 * really worth the trouble to allow it.
10664  	 */
10665  	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10666  		btrfs_warn(fs_info,
10667  	   "cannot activate swapfile while exclusive operation is running");
10668  		return -EBUSY;
10669  	}
10670  
10671  	/*
10672  	 * Prevent snapshot creation while we are activating the swap file.
10673  	 * We do not want to race with snapshot creation. If snapshot creation
10674  	 * already started before we bumped nr_swapfiles from 0 to 1 and
10675  	 * completes before the first write into the swap file after it is
10676  	 * activated, than that write would fallback to COW.
10677  	 */
10678  	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10679  		btrfs_exclop_finish(fs_info);
10680  		btrfs_warn(fs_info,
10681  	   "cannot activate swapfile because snapshot creation is in progress");
10682  		return -EINVAL;
10683  	}
10684  	/*
10685  	 * Snapshots can create extents which require COW even if NODATACOW is
10686  	 * set. We use this counter to prevent snapshots. We must increment it
10687  	 * before walking the extents because we don't want a concurrent
10688  	 * snapshot to run after we've already checked the extents.
10689  	 *
10690  	 * It is possible that subvolume is marked for deletion but still not
10691  	 * removed yet. To prevent this race, we check the root status before
10692  	 * activating the swapfile.
10693  	 */
10694  	spin_lock(&root->root_item_lock);
10695  	if (btrfs_root_dead(root)) {
10696  		spin_unlock(&root->root_item_lock);
10697  
10698  		btrfs_drew_write_unlock(&root->snapshot_lock);
10699  		btrfs_exclop_finish(fs_info);
10700  		btrfs_warn(fs_info,
10701  		"cannot activate swapfile because subvolume %llu is being deleted",
10702  			root->root_key.objectid);
10703  		return -EPERM;
10704  	}
10705  	atomic_inc(&root->nr_swapfiles);
10706  	spin_unlock(&root->root_item_lock);
10707  
10708  	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10709  
10710  	lock_extent(io_tree, 0, isize - 1, &cached_state);
10711  	start = 0;
10712  	while (start < isize) {
10713  		u64 logical_block_start, physical_block_start;
10714  		struct btrfs_block_group *bg;
10715  		u64 len = isize - start;
10716  
10717  		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10718  		if (IS_ERR(em)) {
10719  			ret = PTR_ERR(em);
10720  			goto out;
10721  		}
10722  
10723  		if (em->block_start == EXTENT_MAP_HOLE) {
10724  			btrfs_warn(fs_info, "swapfile must not have holes");
10725  			ret = -EINVAL;
10726  			goto out;
10727  		}
10728  		if (em->block_start == EXTENT_MAP_INLINE) {
10729  			/*
10730  			 * It's unlikely we'll ever actually find ourselves
10731  			 * here, as a file small enough to fit inline won't be
10732  			 * big enough to store more than the swap header, but in
10733  			 * case something changes in the future, let's catch it
10734  			 * here rather than later.
10735  			 */
10736  			btrfs_warn(fs_info, "swapfile must not be inline");
10737  			ret = -EINVAL;
10738  			goto out;
10739  		}
10740  		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10741  			btrfs_warn(fs_info, "swapfile must not be compressed");
10742  			ret = -EINVAL;
10743  			goto out;
10744  		}
10745  
10746  		logical_block_start = em->block_start + (start - em->start);
10747  		len = min(len, em->len - (start - em->start));
10748  		free_extent_map(em);
10749  		em = NULL;
10750  
10751  		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10752  		if (ret < 0) {
10753  			goto out;
10754  		} else if (ret) {
10755  			ret = 0;
10756  		} else {
10757  			btrfs_warn(fs_info,
10758  				   "swapfile must not be copy-on-write");
10759  			ret = -EINVAL;
10760  			goto out;
10761  		}
10762  
10763  		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10764  		if (IS_ERR(em)) {
10765  			ret = PTR_ERR(em);
10766  			goto out;
10767  		}
10768  
10769  		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10770  			btrfs_warn(fs_info,
10771  				   "swapfile must have single data profile");
10772  			ret = -EINVAL;
10773  			goto out;
10774  		}
10775  
10776  		if (device == NULL) {
10777  			device = em->map_lookup->stripes[0].dev;
10778  			ret = btrfs_add_swapfile_pin(inode, device, false);
10779  			if (ret == 1)
10780  				ret = 0;
10781  			else if (ret)
10782  				goto out;
10783  		} else if (device != em->map_lookup->stripes[0].dev) {
10784  			btrfs_warn(fs_info, "swapfile must be on one device");
10785  			ret = -EINVAL;
10786  			goto out;
10787  		}
10788  
10789  		physical_block_start = (em->map_lookup->stripes[0].physical +
10790  					(logical_block_start - em->start));
10791  		len = min(len, em->len - (logical_block_start - em->start));
10792  		free_extent_map(em);
10793  		em = NULL;
10794  
10795  		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10796  		if (!bg) {
10797  			btrfs_warn(fs_info,
10798  			   "could not find block group containing swapfile");
10799  			ret = -EINVAL;
10800  			goto out;
10801  		}
10802  
10803  		if (!btrfs_inc_block_group_swap_extents(bg)) {
10804  			btrfs_warn(fs_info,
10805  			   "block group for swapfile at %llu is read-only%s",
10806  			   bg->start,
10807  			   atomic_read(&fs_info->scrubs_running) ?
10808  				       " (scrub running)" : "");
10809  			btrfs_put_block_group(bg);
10810  			ret = -EINVAL;
10811  			goto out;
10812  		}
10813  
10814  		ret = btrfs_add_swapfile_pin(inode, bg, true);
10815  		if (ret) {
10816  			btrfs_put_block_group(bg);
10817  			if (ret == 1)
10818  				ret = 0;
10819  			else
10820  				goto out;
10821  		}
10822  
10823  		if (bsi.block_len &&
10824  		    bsi.block_start + bsi.block_len == physical_block_start) {
10825  			bsi.block_len += len;
10826  		} else {
10827  			if (bsi.block_len) {
10828  				ret = btrfs_add_swap_extent(sis, &bsi);
10829  				if (ret)
10830  					goto out;
10831  			}
10832  			bsi.start = start;
10833  			bsi.block_start = physical_block_start;
10834  			bsi.block_len = len;
10835  		}
10836  
10837  		start += len;
10838  	}
10839  
10840  	if (bsi.block_len)
10841  		ret = btrfs_add_swap_extent(sis, &bsi);
10842  
10843  out:
10844  	if (!IS_ERR_OR_NULL(em))
10845  		free_extent_map(em);
10846  
10847  	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10848  
10849  	if (ret)
10850  		btrfs_swap_deactivate(file);
10851  
10852  	btrfs_drew_write_unlock(&root->snapshot_lock);
10853  
10854  	btrfs_exclop_finish(fs_info);
10855  
10856  	if (ret)
10857  		return ret;
10858  
10859  	if (device)
10860  		sis->bdev = device->bdev;
10861  	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10862  	sis->max = bsi.nr_pages;
10863  	sis->pages = bsi.nr_pages - 1;
10864  	sis->highest_bit = bsi.nr_pages - 1;
10865  	return bsi.nr_extents;
10866  }
10867  #else
10868  static void btrfs_swap_deactivate(struct file *file)
10869  {
10870  }
10871  
10872  static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10873  			       sector_t *span)
10874  {
10875  	return -EOPNOTSUPP;
10876  }
10877  #endif
10878  
10879  /*
10880   * Update the number of bytes used in the VFS' inode. When we replace extents in
10881   * a range (clone, dedupe, fallocate's zero range), we must update the number of
10882   * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10883   * always get a correct value.
10884   */
10885  void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10886  			      const u64 add_bytes,
10887  			      const u64 del_bytes)
10888  {
10889  	if (add_bytes == del_bytes)
10890  		return;
10891  
10892  	spin_lock(&inode->lock);
10893  	if (del_bytes > 0)
10894  		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10895  	if (add_bytes > 0)
10896  		inode_add_bytes(&inode->vfs_inode, add_bytes);
10897  	spin_unlock(&inode->lock);
10898  }
10899  
10900  /*
10901   * Verify that there are no ordered extents for a given file range.
10902   *
10903   * @inode:   The target inode.
10904   * @start:   Start offset of the file range, should be sector size aligned.
10905   * @end:     End offset (inclusive) of the file range, its value +1 should be
10906   *           sector size aligned.
10907   *
10908   * This should typically be used for cases where we locked an inode's VFS lock in
10909   * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10910   * we have flushed all delalloc in the range, we have waited for all ordered
10911   * extents in the range to complete and finally we have locked the file range in
10912   * the inode's io_tree.
10913   */
10914  void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10915  {
10916  	struct btrfs_root *root = inode->root;
10917  	struct btrfs_ordered_extent *ordered;
10918  
10919  	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10920  		return;
10921  
10922  	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10923  	if (ordered) {
10924  		btrfs_err(root->fs_info,
10925  "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10926  			  start, end, btrfs_ino(inode), root->root_key.objectid,
10927  			  ordered->file_offset,
10928  			  ordered->file_offset + ordered->num_bytes - 1);
10929  		btrfs_put_ordered_extent(ordered);
10930  	}
10931  
10932  	ASSERT(ordered == NULL);
10933  }
10934  
10935  static const struct inode_operations btrfs_dir_inode_operations = {
10936  	.getattr	= btrfs_getattr,
10937  	.lookup		= btrfs_lookup,
10938  	.create		= btrfs_create,
10939  	.unlink		= btrfs_unlink,
10940  	.link		= btrfs_link,
10941  	.mkdir		= btrfs_mkdir,
10942  	.rmdir		= btrfs_rmdir,
10943  	.rename		= btrfs_rename2,
10944  	.symlink	= btrfs_symlink,
10945  	.setattr	= btrfs_setattr,
10946  	.mknod		= btrfs_mknod,
10947  	.listxattr	= btrfs_listxattr,
10948  	.permission	= btrfs_permission,
10949  	.get_inode_acl	= btrfs_get_acl,
10950  	.set_acl	= btrfs_set_acl,
10951  	.update_time	= btrfs_update_time,
10952  	.tmpfile        = btrfs_tmpfile,
10953  	.fileattr_get	= btrfs_fileattr_get,
10954  	.fileattr_set	= btrfs_fileattr_set,
10955  };
10956  
10957  static const struct file_operations btrfs_dir_file_operations = {
10958  	.llseek		= btrfs_dir_llseek,
10959  	.read		= generic_read_dir,
10960  	.iterate_shared	= btrfs_real_readdir,
10961  	.open		= btrfs_opendir,
10962  	.unlocked_ioctl	= btrfs_ioctl,
10963  #ifdef CONFIG_COMPAT
10964  	.compat_ioctl	= btrfs_compat_ioctl,
10965  #endif
10966  	.release        = btrfs_release_file,
10967  	.fsync		= btrfs_sync_file,
10968  };
10969  
10970  /*
10971   * btrfs doesn't support the bmap operation because swapfiles
10972   * use bmap to make a mapping of extents in the file.  They assume
10973   * these extents won't change over the life of the file and they
10974   * use the bmap result to do IO directly to the drive.
10975   *
10976   * the btrfs bmap call would return logical addresses that aren't
10977   * suitable for IO and they also will change frequently as COW
10978   * operations happen.  So, swapfile + btrfs == corruption.
10979   *
10980   * For now we're avoiding this by dropping bmap.
10981   */
10982  static const struct address_space_operations btrfs_aops = {
10983  	.read_folio	= btrfs_read_folio,
10984  	.writepages	= btrfs_writepages,
10985  	.readahead	= btrfs_readahead,
10986  	.invalidate_folio = btrfs_invalidate_folio,
10987  	.release_folio	= btrfs_release_folio,
10988  	.migrate_folio	= btrfs_migrate_folio,
10989  	.dirty_folio	= filemap_dirty_folio,
10990  	.error_remove_page = generic_error_remove_page,
10991  	.swap_activate	= btrfs_swap_activate,
10992  	.swap_deactivate = btrfs_swap_deactivate,
10993  };
10994  
10995  static const struct inode_operations btrfs_file_inode_operations = {
10996  	.getattr	= btrfs_getattr,
10997  	.setattr	= btrfs_setattr,
10998  	.listxattr      = btrfs_listxattr,
10999  	.permission	= btrfs_permission,
11000  	.fiemap		= btrfs_fiemap,
11001  	.get_inode_acl	= btrfs_get_acl,
11002  	.set_acl	= btrfs_set_acl,
11003  	.update_time	= btrfs_update_time,
11004  	.fileattr_get	= btrfs_fileattr_get,
11005  	.fileattr_set	= btrfs_fileattr_set,
11006  };
11007  static const struct inode_operations btrfs_special_inode_operations = {
11008  	.getattr	= btrfs_getattr,
11009  	.setattr	= btrfs_setattr,
11010  	.permission	= btrfs_permission,
11011  	.listxattr	= btrfs_listxattr,
11012  	.get_inode_acl	= btrfs_get_acl,
11013  	.set_acl	= btrfs_set_acl,
11014  	.update_time	= btrfs_update_time,
11015  };
11016  static const struct inode_operations btrfs_symlink_inode_operations = {
11017  	.get_link	= page_get_link,
11018  	.getattr	= btrfs_getattr,
11019  	.setattr	= btrfs_setattr,
11020  	.permission	= btrfs_permission,
11021  	.listxattr	= btrfs_listxattr,
11022  	.update_time	= btrfs_update_time,
11023  };
11024  
11025  const struct dentry_operations btrfs_dentry_operations = {
11026  	.d_delete	= btrfs_dentry_delete,
11027  };
11028