xref: /openbmc/linux/fs/btrfs/inode.c (revision 5a4c98323b01d52382575a7a4d6bf7bf5f326047)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5 
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <linux/time.h>
15 #include <linux/init.h>
16 #include <linux/string.h>
17 #include <linux/backing-dev.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/xattr.h>
21 #include <linux/posix_acl.h>
22 #include <linux/falloc.h>
23 #include <linux/slab.h>
24 #include <linux/ratelimit.h>
25 #include <linux/btrfs.h>
26 #include <linux/blkdev.h>
27 #include <linux/posix_acl_xattr.h>
28 #include <linux/uio.h>
29 #include <linux/magic.h>
30 #include <linux/iversion.h>
31 #include <linux/swap.h>
32 #include <linux/migrate.h>
33 #include <linux/sched/mm.h>
34 #include <linux/iomap.h>
35 #include <asm/unaligned.h>
36 #include <linux/fsverity.h>
37 #include "misc.h"
38 #include "ctree.h"
39 #include "disk-io.h"
40 #include "transaction.h"
41 #include "btrfs_inode.h"
42 #include "print-tree.h"
43 #include "ordered-data.h"
44 #include "xattr.h"
45 #include "tree-log.h"
46 #include "bio.h"
47 #include "compression.h"
48 #include "locking.h"
49 #include "free-space-cache.h"
50 #include "props.h"
51 #include "qgroup.h"
52 #include "delalloc-space.h"
53 #include "block-group.h"
54 #include "space-info.h"
55 #include "zoned.h"
56 #include "subpage.h"
57 #include "inode-item.h"
58 #include "fs.h"
59 #include "accessors.h"
60 #include "extent-tree.h"
61 #include "root-tree.h"
62 #include "defrag.h"
63 #include "dir-item.h"
64 #include "file-item.h"
65 #include "uuid-tree.h"
66 #include "ioctl.h"
67 #include "file.h"
68 #include "acl.h"
69 #include "relocation.h"
70 #include "verity.h"
71 #include "super.h"
72 #include "orphan.h"
73 #include "backref.h"
74 
75 struct btrfs_iget_args {
76 	u64 ino;
77 	struct btrfs_root *root;
78 };
79 
80 struct btrfs_dio_data {
81 	ssize_t submitted;
82 	struct extent_changeset *data_reserved;
83 	struct btrfs_ordered_extent *ordered;
84 	bool data_space_reserved;
85 	bool nocow_done;
86 };
87 
88 struct btrfs_dio_private {
89 	/* Range of I/O */
90 	u64 file_offset;
91 	u32 bytes;
92 
93 	/* This must be last */
94 	struct btrfs_bio bbio;
95 };
96 
97 static struct bio_set btrfs_dio_bioset;
98 
99 struct btrfs_rename_ctx {
100 	/* Output field. Stores the index number of the old directory entry. */
101 	u64 index;
102 };
103 
104 /*
105  * Used by data_reloc_print_warning_inode() to pass needed info for filename
106  * resolution and output of error message.
107  */
108 struct data_reloc_warn {
109 	struct btrfs_path path;
110 	struct btrfs_fs_info *fs_info;
111 	u64 extent_item_size;
112 	u64 logical;
113 	int mirror_num;
114 };
115 
116 static const struct inode_operations btrfs_dir_inode_operations;
117 static const struct inode_operations btrfs_symlink_inode_operations;
118 static const struct inode_operations btrfs_special_inode_operations;
119 static const struct inode_operations btrfs_file_inode_operations;
120 static const struct address_space_operations btrfs_aops;
121 static const struct file_operations btrfs_dir_file_operations;
122 
123 static struct kmem_cache *btrfs_inode_cachep;
124 
125 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
126 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
127 
128 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
129 				     struct page *locked_page, u64 start,
130 				     u64 end, struct writeback_control *wbc,
131 				     bool pages_dirty);
132 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
133 				       u64 len, u64 orig_start, u64 block_start,
134 				       u64 block_len, u64 orig_block_len,
135 				       u64 ram_bytes, int compress_type,
136 				       int type);
137 
data_reloc_print_warning_inode(u64 inum,u64 offset,u64 num_bytes,u64 root,void * warn_ctx)138 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
139 					  u64 root, void *warn_ctx)
140 {
141 	struct data_reloc_warn *warn = warn_ctx;
142 	struct btrfs_fs_info *fs_info = warn->fs_info;
143 	struct extent_buffer *eb;
144 	struct btrfs_inode_item *inode_item;
145 	struct inode_fs_paths *ipath = NULL;
146 	struct btrfs_root *local_root;
147 	struct btrfs_key key;
148 	unsigned int nofs_flag;
149 	u32 nlink;
150 	int ret;
151 
152 	local_root = btrfs_get_fs_root(fs_info, root, true);
153 	if (IS_ERR(local_root)) {
154 		ret = PTR_ERR(local_root);
155 		goto err;
156 	}
157 
158 	/* This makes the path point to (inum INODE_ITEM ioff). */
159 	key.objectid = inum;
160 	key.type = BTRFS_INODE_ITEM_KEY;
161 	key.offset = 0;
162 
163 	ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
164 	if (ret) {
165 		btrfs_put_root(local_root);
166 		btrfs_release_path(&warn->path);
167 		goto err;
168 	}
169 
170 	eb = warn->path.nodes[0];
171 	inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
172 	nlink = btrfs_inode_nlink(eb, inode_item);
173 	btrfs_release_path(&warn->path);
174 
175 	nofs_flag = memalloc_nofs_save();
176 	ipath = init_ipath(4096, local_root, &warn->path);
177 	memalloc_nofs_restore(nofs_flag);
178 	if (IS_ERR(ipath)) {
179 		btrfs_put_root(local_root);
180 		ret = PTR_ERR(ipath);
181 		ipath = NULL;
182 		/*
183 		 * -ENOMEM, not a critical error, just output an generic error
184 		 * without filename.
185 		 */
186 		btrfs_warn(fs_info,
187 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
188 			   warn->logical, warn->mirror_num, root, inum, offset);
189 		return ret;
190 	}
191 	ret = paths_from_inode(inum, ipath);
192 	if (ret < 0)
193 		goto err;
194 
195 	/*
196 	 * We deliberately ignore the bit ipath might have been too small to
197 	 * hold all of the paths here
198 	 */
199 	for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
200 		btrfs_warn(fs_info,
201 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
202 			   warn->logical, warn->mirror_num, root, inum, offset,
203 			   fs_info->sectorsize, nlink,
204 			   (char *)(unsigned long)ipath->fspath->val[i]);
205 	}
206 
207 	btrfs_put_root(local_root);
208 	free_ipath(ipath);
209 	return 0;
210 
211 err:
212 	btrfs_warn(fs_info,
213 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
214 		   warn->logical, warn->mirror_num, root, inum, offset, ret);
215 
216 	free_ipath(ipath);
217 	return ret;
218 }
219 
220 /*
221  * Do extra user-friendly error output (e.g. lookup all the affected files).
222  *
223  * Return true if we succeeded doing the backref lookup.
224  * Return false if such lookup failed, and has to fallback to the old error message.
225  */
print_data_reloc_error(const struct btrfs_inode * inode,u64 file_off,const u8 * csum,const u8 * csum_expected,int mirror_num)226 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
227 				   const u8 *csum, const u8 *csum_expected,
228 				   int mirror_num)
229 {
230 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
231 	struct btrfs_path path = { 0 };
232 	struct btrfs_key found_key = { 0 };
233 	struct extent_buffer *eb;
234 	struct btrfs_extent_item *ei;
235 	const u32 csum_size = fs_info->csum_size;
236 	u64 logical;
237 	u64 flags;
238 	u32 item_size;
239 	int ret;
240 
241 	mutex_lock(&fs_info->reloc_mutex);
242 	logical = btrfs_get_reloc_bg_bytenr(fs_info);
243 	mutex_unlock(&fs_info->reloc_mutex);
244 
245 	if (logical == U64_MAX) {
246 		btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
247 		btrfs_warn_rl(fs_info,
248 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
249 			inode->root->root_key.objectid, btrfs_ino(inode), file_off,
250 			CSUM_FMT_VALUE(csum_size, csum),
251 			CSUM_FMT_VALUE(csum_size, csum_expected),
252 			mirror_num);
253 		return;
254 	}
255 
256 	logical += file_off;
257 	btrfs_warn_rl(fs_info,
258 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
259 			inode->root->root_key.objectid,
260 			btrfs_ino(inode), file_off, logical,
261 			CSUM_FMT_VALUE(csum_size, csum),
262 			CSUM_FMT_VALUE(csum_size, csum_expected),
263 			mirror_num);
264 
265 	ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
266 	if (ret < 0) {
267 		btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
268 			     logical, ret);
269 		return;
270 	}
271 	eb = path.nodes[0];
272 	ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
273 	item_size = btrfs_item_size(eb, path.slots[0]);
274 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
275 		unsigned long ptr = 0;
276 		u64 ref_root;
277 		u8 ref_level;
278 
279 		while (true) {
280 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
281 						      item_size, &ref_root,
282 						      &ref_level);
283 			if (ret < 0) {
284 				btrfs_warn_rl(fs_info,
285 				"failed to resolve tree backref for logical %llu: %d",
286 					      logical, ret);
287 				break;
288 			}
289 			if (ret > 0)
290 				break;
291 
292 			btrfs_warn_rl(fs_info,
293 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
294 				logical, mirror_num,
295 				(ref_level ? "node" : "leaf"),
296 				ref_level, ref_root);
297 		}
298 		btrfs_release_path(&path);
299 	} else {
300 		struct btrfs_backref_walk_ctx ctx = { 0 };
301 		struct data_reloc_warn reloc_warn = { 0 };
302 
303 		btrfs_release_path(&path);
304 
305 		ctx.bytenr = found_key.objectid;
306 		ctx.extent_item_pos = logical - found_key.objectid;
307 		ctx.fs_info = fs_info;
308 
309 		reloc_warn.logical = logical;
310 		reloc_warn.extent_item_size = found_key.offset;
311 		reloc_warn.mirror_num = mirror_num;
312 		reloc_warn.fs_info = fs_info;
313 
314 		iterate_extent_inodes(&ctx, true,
315 				      data_reloc_print_warning_inode, &reloc_warn);
316 	}
317 }
318 
btrfs_print_data_csum_error(struct btrfs_inode * inode,u64 logical_start,u8 * csum,u8 * csum_expected,int mirror_num)319 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
320 		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
321 {
322 	struct btrfs_root *root = inode->root;
323 	const u32 csum_size = root->fs_info->csum_size;
324 
325 	/* For data reloc tree, it's better to do a backref lookup instead. */
326 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
327 		return print_data_reloc_error(inode, logical_start, csum,
328 					      csum_expected, mirror_num);
329 
330 	/* Output without objectid, which is more meaningful */
331 	if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) {
332 		btrfs_warn_rl(root->fs_info,
333 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
334 			root->root_key.objectid, btrfs_ino(inode),
335 			logical_start,
336 			CSUM_FMT_VALUE(csum_size, csum),
337 			CSUM_FMT_VALUE(csum_size, csum_expected),
338 			mirror_num);
339 	} else {
340 		btrfs_warn_rl(root->fs_info,
341 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
342 			root->root_key.objectid, btrfs_ino(inode),
343 			logical_start,
344 			CSUM_FMT_VALUE(csum_size, csum),
345 			CSUM_FMT_VALUE(csum_size, csum_expected),
346 			mirror_num);
347 	}
348 }
349 
350 /*
351  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
352  *
353  * ilock_flags can have the following bit set:
354  *
355  * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
356  * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
357  *		     return -EAGAIN
358  * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
359  */
btrfs_inode_lock(struct btrfs_inode * inode,unsigned int ilock_flags)360 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
361 {
362 	if (ilock_flags & BTRFS_ILOCK_SHARED) {
363 		if (ilock_flags & BTRFS_ILOCK_TRY) {
364 			if (!inode_trylock_shared(&inode->vfs_inode))
365 				return -EAGAIN;
366 			else
367 				return 0;
368 		}
369 		inode_lock_shared(&inode->vfs_inode);
370 	} else {
371 		if (ilock_flags & BTRFS_ILOCK_TRY) {
372 			if (!inode_trylock(&inode->vfs_inode))
373 				return -EAGAIN;
374 			else
375 				return 0;
376 		}
377 		inode_lock(&inode->vfs_inode);
378 	}
379 	if (ilock_flags & BTRFS_ILOCK_MMAP)
380 		down_write(&inode->i_mmap_lock);
381 	return 0;
382 }
383 
384 /*
385  * btrfs_inode_unlock - unock inode i_rwsem
386  *
387  * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
388  * to decide whether the lock acquired is shared or exclusive.
389  */
btrfs_inode_unlock(struct btrfs_inode * inode,unsigned int ilock_flags)390 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
391 {
392 	if (ilock_flags & BTRFS_ILOCK_MMAP)
393 		up_write(&inode->i_mmap_lock);
394 	if (ilock_flags & BTRFS_ILOCK_SHARED)
395 		inode_unlock_shared(&inode->vfs_inode);
396 	else
397 		inode_unlock(&inode->vfs_inode);
398 }
399 
400 /*
401  * Cleanup all submitted ordered extents in specified range to handle errors
402  * from the btrfs_run_delalloc_range() callback.
403  *
404  * NOTE: caller must ensure that when an error happens, it can not call
405  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
406  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
407  * to be released, which we want to happen only when finishing the ordered
408  * extent (btrfs_finish_ordered_io()).
409  */
btrfs_cleanup_ordered_extents(struct btrfs_inode * inode,struct page * locked_page,u64 offset,u64 bytes)410 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
411 						 struct page *locked_page,
412 						 u64 offset, u64 bytes)
413 {
414 	unsigned long index = offset >> PAGE_SHIFT;
415 	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
416 	u64 page_start = 0, page_end = 0;
417 	struct page *page;
418 
419 	if (locked_page) {
420 		page_start = page_offset(locked_page);
421 		page_end = page_start + PAGE_SIZE - 1;
422 	}
423 
424 	while (index <= end_index) {
425 		/*
426 		 * For locked page, we will call btrfs_mark_ordered_io_finished
427 		 * through btrfs_mark_ordered_io_finished() on it
428 		 * in run_delalloc_range() for the error handling, which will
429 		 * clear page Ordered and run the ordered extent accounting.
430 		 *
431 		 * Here we can't just clear the Ordered bit, or
432 		 * btrfs_mark_ordered_io_finished() would skip the accounting
433 		 * for the page range, and the ordered extent will never finish.
434 		 */
435 		if (locked_page && index == (page_start >> PAGE_SHIFT)) {
436 			index++;
437 			continue;
438 		}
439 		page = find_get_page(inode->vfs_inode.i_mapping, index);
440 		index++;
441 		if (!page)
442 			continue;
443 
444 		/*
445 		 * Here we just clear all Ordered bits for every page in the
446 		 * range, then btrfs_mark_ordered_io_finished() will handle
447 		 * the ordered extent accounting for the range.
448 		 */
449 		btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
450 					       offset, bytes);
451 		put_page(page);
452 	}
453 
454 	if (locked_page) {
455 		/* The locked page covers the full range, nothing needs to be done */
456 		if (bytes + offset <= page_start + PAGE_SIZE)
457 			return;
458 		/*
459 		 * In case this page belongs to the delalloc range being
460 		 * instantiated then skip it, since the first page of a range is
461 		 * going to be properly cleaned up by the caller of
462 		 * run_delalloc_range
463 		 */
464 		if (page_start >= offset && page_end <= (offset + bytes - 1)) {
465 			bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
466 			offset = page_offset(locked_page) + PAGE_SIZE;
467 		}
468 	}
469 
470 	return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
471 }
472 
473 static int btrfs_dirty_inode(struct btrfs_inode *inode);
474 
btrfs_init_inode_security(struct btrfs_trans_handle * trans,struct btrfs_new_inode_args * args)475 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
476 				     struct btrfs_new_inode_args *args)
477 {
478 	int err;
479 
480 	if (args->default_acl) {
481 		err = __btrfs_set_acl(trans, args->inode, args->default_acl,
482 				      ACL_TYPE_DEFAULT);
483 		if (err)
484 			return err;
485 	}
486 	if (args->acl) {
487 		err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
488 		if (err)
489 			return err;
490 	}
491 	if (!args->default_acl && !args->acl)
492 		cache_no_acl(args->inode);
493 	return btrfs_xattr_security_init(trans, args->inode, args->dir,
494 					 &args->dentry->d_name);
495 }
496 
497 /*
498  * this does all the hard work for inserting an inline extent into
499  * the btree.  The caller should have done a btrfs_drop_extents so that
500  * no overlapping inline items exist in the btree
501  */
insert_inline_extent(struct btrfs_trans_handle * trans,struct btrfs_path * path,struct btrfs_inode * inode,bool extent_inserted,size_t size,size_t compressed_size,int compress_type,struct page ** compressed_pages,bool update_i_size)502 static int insert_inline_extent(struct btrfs_trans_handle *trans,
503 				struct btrfs_path *path,
504 				struct btrfs_inode *inode, bool extent_inserted,
505 				size_t size, size_t compressed_size,
506 				int compress_type,
507 				struct page **compressed_pages,
508 				bool update_i_size)
509 {
510 	struct btrfs_root *root = inode->root;
511 	struct extent_buffer *leaf;
512 	struct page *page = NULL;
513 	char *kaddr;
514 	unsigned long ptr;
515 	struct btrfs_file_extent_item *ei;
516 	int ret;
517 	size_t cur_size = size;
518 	u64 i_size;
519 
520 	ASSERT((compressed_size > 0 && compressed_pages) ||
521 	       (compressed_size == 0 && !compressed_pages));
522 
523 	if (compressed_size && compressed_pages)
524 		cur_size = compressed_size;
525 
526 	if (!extent_inserted) {
527 		struct btrfs_key key;
528 		size_t datasize;
529 
530 		key.objectid = btrfs_ino(inode);
531 		key.offset = 0;
532 		key.type = BTRFS_EXTENT_DATA_KEY;
533 
534 		datasize = btrfs_file_extent_calc_inline_size(cur_size);
535 		ret = btrfs_insert_empty_item(trans, root, path, &key,
536 					      datasize);
537 		if (ret)
538 			goto fail;
539 	}
540 	leaf = path->nodes[0];
541 	ei = btrfs_item_ptr(leaf, path->slots[0],
542 			    struct btrfs_file_extent_item);
543 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
544 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
545 	btrfs_set_file_extent_encryption(leaf, ei, 0);
546 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
547 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
548 	ptr = btrfs_file_extent_inline_start(ei);
549 
550 	if (compress_type != BTRFS_COMPRESS_NONE) {
551 		struct page *cpage;
552 		int i = 0;
553 		while (compressed_size > 0) {
554 			cpage = compressed_pages[i];
555 			cur_size = min_t(unsigned long, compressed_size,
556 				       PAGE_SIZE);
557 
558 			kaddr = kmap_local_page(cpage);
559 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
560 			kunmap_local(kaddr);
561 
562 			i++;
563 			ptr += cur_size;
564 			compressed_size -= cur_size;
565 		}
566 		btrfs_set_file_extent_compression(leaf, ei,
567 						  compress_type);
568 	} else {
569 		page = find_get_page(inode->vfs_inode.i_mapping, 0);
570 		btrfs_set_file_extent_compression(leaf, ei, 0);
571 		kaddr = kmap_local_page(page);
572 		write_extent_buffer(leaf, kaddr, ptr, size);
573 		kunmap_local(kaddr);
574 		put_page(page);
575 	}
576 	btrfs_mark_buffer_dirty(trans, leaf);
577 	btrfs_release_path(path);
578 
579 	/*
580 	 * We align size to sectorsize for inline extents just for simplicity
581 	 * sake.
582 	 */
583 	ret = btrfs_inode_set_file_extent_range(inode, 0,
584 					ALIGN(size, root->fs_info->sectorsize));
585 	if (ret)
586 		goto fail;
587 
588 	/*
589 	 * We're an inline extent, so nobody can extend the file past i_size
590 	 * without locking a page we already have locked.
591 	 *
592 	 * We must do any i_size and inode updates before we unlock the pages.
593 	 * Otherwise we could end up racing with unlink.
594 	 */
595 	i_size = i_size_read(&inode->vfs_inode);
596 	if (update_i_size && size > i_size) {
597 		i_size_write(&inode->vfs_inode, size);
598 		i_size = size;
599 	}
600 	inode->disk_i_size = i_size;
601 
602 fail:
603 	return ret;
604 }
605 
606 
607 /*
608  * conditionally insert an inline extent into the file.  This
609  * does the checks required to make sure the data is small enough
610  * to fit as an inline extent.
611  */
cow_file_range_inline(struct btrfs_inode * inode,u64 size,size_t compressed_size,int compress_type,struct page ** compressed_pages,bool update_i_size)612 static noinline int cow_file_range_inline(struct btrfs_inode *inode, u64 size,
613 					  size_t compressed_size,
614 					  int compress_type,
615 					  struct page **compressed_pages,
616 					  bool update_i_size)
617 {
618 	struct btrfs_drop_extents_args drop_args = { 0 };
619 	struct btrfs_root *root = inode->root;
620 	struct btrfs_fs_info *fs_info = root->fs_info;
621 	struct btrfs_trans_handle *trans;
622 	u64 data_len = (compressed_size ?: size);
623 	int ret;
624 	struct btrfs_path *path;
625 
626 	/*
627 	 * We can create an inline extent if it ends at or beyond the current
628 	 * i_size, is no larger than a sector (decompressed), and the (possibly
629 	 * compressed) data fits in a leaf and the configured maximum inline
630 	 * size.
631 	 */
632 	if (size < i_size_read(&inode->vfs_inode) ||
633 	    size > fs_info->sectorsize ||
634 	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
635 	    data_len > fs_info->max_inline)
636 		return 1;
637 
638 	path = btrfs_alloc_path();
639 	if (!path)
640 		return -ENOMEM;
641 
642 	trans = btrfs_join_transaction(root);
643 	if (IS_ERR(trans)) {
644 		btrfs_free_path(path);
645 		return PTR_ERR(trans);
646 	}
647 	trans->block_rsv = &inode->block_rsv;
648 
649 	drop_args.path = path;
650 	drop_args.start = 0;
651 	drop_args.end = fs_info->sectorsize;
652 	drop_args.drop_cache = true;
653 	drop_args.replace_extent = true;
654 	drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
655 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
656 	if (ret) {
657 		btrfs_abort_transaction(trans, ret);
658 		goto out;
659 	}
660 
661 	ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
662 				   size, compressed_size, compress_type,
663 				   compressed_pages, update_i_size);
664 	if (ret && ret != -ENOSPC) {
665 		btrfs_abort_transaction(trans, ret);
666 		goto out;
667 	} else if (ret == -ENOSPC) {
668 		ret = 1;
669 		goto out;
670 	}
671 
672 	btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
673 	ret = btrfs_update_inode(trans, root, inode);
674 	if (ret && ret != -ENOSPC) {
675 		btrfs_abort_transaction(trans, ret);
676 		goto out;
677 	} else if (ret == -ENOSPC) {
678 		ret = 1;
679 		goto out;
680 	}
681 
682 	btrfs_set_inode_full_sync(inode);
683 out:
684 	/*
685 	 * Don't forget to free the reserved space, as for inlined extent
686 	 * it won't count as data extent, free them directly here.
687 	 * And at reserve time, it's always aligned to page size, so
688 	 * just free one page here.
689 	 */
690 	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL);
691 	btrfs_free_path(path);
692 	btrfs_end_transaction(trans);
693 	return ret;
694 }
695 
696 struct async_extent {
697 	u64 start;
698 	u64 ram_size;
699 	u64 compressed_size;
700 	struct page **pages;
701 	unsigned long nr_pages;
702 	int compress_type;
703 	struct list_head list;
704 };
705 
706 struct async_chunk {
707 	struct btrfs_inode *inode;
708 	struct page *locked_page;
709 	u64 start;
710 	u64 end;
711 	blk_opf_t write_flags;
712 	struct list_head extents;
713 	struct cgroup_subsys_state *blkcg_css;
714 	struct btrfs_work work;
715 	struct async_cow *async_cow;
716 };
717 
718 struct async_cow {
719 	atomic_t num_chunks;
720 	struct async_chunk chunks[];
721 };
722 
add_async_extent(struct async_chunk * cow,u64 start,u64 ram_size,u64 compressed_size,struct page ** pages,unsigned long nr_pages,int compress_type)723 static noinline int add_async_extent(struct async_chunk *cow,
724 				     u64 start, u64 ram_size,
725 				     u64 compressed_size,
726 				     struct page **pages,
727 				     unsigned long nr_pages,
728 				     int compress_type)
729 {
730 	struct async_extent *async_extent;
731 
732 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
733 	if (!async_extent)
734 		return -ENOMEM;
735 	async_extent->start = start;
736 	async_extent->ram_size = ram_size;
737 	async_extent->compressed_size = compressed_size;
738 	async_extent->pages = pages;
739 	async_extent->nr_pages = nr_pages;
740 	async_extent->compress_type = compress_type;
741 	list_add_tail(&async_extent->list, &cow->extents);
742 	return 0;
743 }
744 
745 /*
746  * Check if the inode needs to be submitted to compression, based on mount
747  * options, defragmentation, properties or heuristics.
748  */
inode_need_compress(struct btrfs_inode * inode,u64 start,u64 end)749 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
750 				      u64 end)
751 {
752 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
753 
754 	if (!btrfs_inode_can_compress(inode)) {
755 		WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
756 			KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
757 			btrfs_ino(inode));
758 		return 0;
759 	}
760 	/*
761 	 * Special check for subpage.
762 	 *
763 	 * We lock the full page then run each delalloc range in the page, thus
764 	 * for the following case, we will hit some subpage specific corner case:
765 	 *
766 	 * 0		32K		64K
767 	 * |	|///////|	|///////|
768 	 *		\- A		\- B
769 	 *
770 	 * In above case, both range A and range B will try to unlock the full
771 	 * page [0, 64K), causing the one finished later will have page
772 	 * unlocked already, triggering various page lock requirement BUG_ON()s.
773 	 *
774 	 * So here we add an artificial limit that subpage compression can only
775 	 * if the range is fully page aligned.
776 	 *
777 	 * In theory we only need to ensure the first page is fully covered, but
778 	 * the tailing partial page will be locked until the full compression
779 	 * finishes, delaying the write of other range.
780 	 *
781 	 * TODO: Make btrfs_run_delalloc_range() to lock all delalloc range
782 	 * first to prevent any submitted async extent to unlock the full page.
783 	 * By this, we can ensure for subpage case that only the last async_cow
784 	 * will unlock the full page.
785 	 */
786 	if (fs_info->sectorsize < PAGE_SIZE) {
787 		if (!PAGE_ALIGNED(start) ||
788 		    !PAGE_ALIGNED(end + 1))
789 			return 0;
790 	}
791 
792 	/* force compress */
793 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
794 		return 1;
795 	/* defrag ioctl */
796 	if (inode->defrag_compress)
797 		return 1;
798 	/* bad compression ratios */
799 	if (inode->flags & BTRFS_INODE_NOCOMPRESS)
800 		return 0;
801 	if (btrfs_test_opt(fs_info, COMPRESS) ||
802 	    inode->flags & BTRFS_INODE_COMPRESS ||
803 	    inode->prop_compress)
804 		return btrfs_compress_heuristic(&inode->vfs_inode, start, end);
805 	return 0;
806 }
807 
inode_should_defrag(struct btrfs_inode * inode,u64 start,u64 end,u64 num_bytes,u32 small_write)808 static inline void inode_should_defrag(struct btrfs_inode *inode,
809 		u64 start, u64 end, u64 num_bytes, u32 small_write)
810 {
811 	/* If this is a small write inside eof, kick off a defrag */
812 	if (num_bytes < small_write &&
813 	    (start > 0 || end + 1 < inode->disk_i_size))
814 		btrfs_add_inode_defrag(NULL, inode, small_write);
815 }
816 
817 /*
818  * Work queue call back to started compression on a file and pages.
819  *
820  * This is done inside an ordered work queue, and the compression is spread
821  * across many cpus.  The actual IO submission is step two, and the ordered work
822  * queue takes care of making sure that happens in the same order things were
823  * put onto the queue by writepages and friends.
824  *
825  * If this code finds it can't get good compression, it puts an entry onto the
826  * work queue to write the uncompressed bytes.  This makes sure that both
827  * compressed inodes and uncompressed inodes are written in the same order that
828  * the flusher thread sent them down.
829  */
compress_file_range(struct btrfs_work * work)830 static void compress_file_range(struct btrfs_work *work)
831 {
832 	struct async_chunk *async_chunk =
833 		container_of(work, struct async_chunk, work);
834 	struct btrfs_inode *inode = async_chunk->inode;
835 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
836 	struct address_space *mapping = inode->vfs_inode.i_mapping;
837 	u64 blocksize = fs_info->sectorsize;
838 	u64 start = async_chunk->start;
839 	u64 end = async_chunk->end;
840 	u64 actual_end;
841 	u64 i_size;
842 	int ret = 0;
843 	struct page **pages;
844 	unsigned long nr_pages;
845 	unsigned long total_compressed = 0;
846 	unsigned long total_in = 0;
847 	unsigned int poff;
848 	int i;
849 	int compress_type = fs_info->compress_type;
850 
851 	inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
852 
853 	/*
854 	 * We need to call clear_page_dirty_for_io on each page in the range.
855 	 * Otherwise applications with the file mmap'd can wander in and change
856 	 * the page contents while we are compressing them.
857 	 */
858 	extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end);
859 
860 	/*
861 	 * We need to save i_size before now because it could change in between
862 	 * us evaluating the size and assigning it.  This is because we lock and
863 	 * unlock the page in truncate and fallocate, and then modify the i_size
864 	 * later on.
865 	 *
866 	 * The barriers are to emulate READ_ONCE, remove that once i_size_read
867 	 * does that for us.
868 	 */
869 	barrier();
870 	i_size = i_size_read(&inode->vfs_inode);
871 	barrier();
872 	actual_end = min_t(u64, i_size, end + 1);
873 again:
874 	pages = NULL;
875 	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
876 	nr_pages = min_t(unsigned long, nr_pages, BTRFS_MAX_COMPRESSED_PAGES);
877 
878 	/*
879 	 * we don't want to send crud past the end of i_size through
880 	 * compression, that's just a waste of CPU time.  So, if the
881 	 * end of the file is before the start of our current
882 	 * requested range of bytes, we bail out to the uncompressed
883 	 * cleanup code that can deal with all of this.
884 	 *
885 	 * It isn't really the fastest way to fix things, but this is a
886 	 * very uncommon corner.
887 	 */
888 	if (actual_end <= start)
889 		goto cleanup_and_bail_uncompressed;
890 
891 	total_compressed = actual_end - start;
892 
893 	/*
894 	 * Skip compression for a small file range(<=blocksize) that
895 	 * isn't an inline extent, since it doesn't save disk space at all.
896 	 */
897 	if (total_compressed <= blocksize &&
898 	   (start > 0 || end + 1 < inode->disk_i_size))
899 		goto cleanup_and_bail_uncompressed;
900 
901 	/*
902 	 * For subpage case, we require full page alignment for the sector
903 	 * aligned range.
904 	 * Thus we must also check against @actual_end, not just @end.
905 	 */
906 	if (blocksize < PAGE_SIZE) {
907 		if (!PAGE_ALIGNED(start) ||
908 		    !PAGE_ALIGNED(round_up(actual_end, blocksize)))
909 			goto cleanup_and_bail_uncompressed;
910 	}
911 
912 	total_compressed = min_t(unsigned long, total_compressed,
913 			BTRFS_MAX_UNCOMPRESSED);
914 	total_in = 0;
915 	ret = 0;
916 
917 	/*
918 	 * We do compression for mount -o compress and when the inode has not
919 	 * been flagged as NOCOMPRESS.  This flag can change at any time if we
920 	 * discover bad compression ratios.
921 	 */
922 	if (!inode_need_compress(inode, start, end))
923 		goto cleanup_and_bail_uncompressed;
924 
925 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
926 	if (!pages) {
927 		/*
928 		 * Memory allocation failure is not a fatal error, we can fall
929 		 * back to uncompressed code.
930 		 */
931 		goto cleanup_and_bail_uncompressed;
932 	}
933 
934 	if (inode->defrag_compress)
935 		compress_type = inode->defrag_compress;
936 	else if (inode->prop_compress)
937 		compress_type = inode->prop_compress;
938 
939 	/* Compression level is applied here. */
940 	ret = btrfs_compress_pages(compress_type | (fs_info->compress_level << 4),
941 				   mapping, start, pages, &nr_pages, &total_in,
942 				   &total_compressed);
943 	if (ret)
944 		goto mark_incompressible;
945 
946 	/*
947 	 * Zero the tail end of the last page, as we might be sending it down
948 	 * to disk.
949 	 */
950 	poff = offset_in_page(total_compressed);
951 	if (poff)
952 		memzero_page(pages[nr_pages - 1], poff, PAGE_SIZE - poff);
953 
954 	/*
955 	 * Try to create an inline extent.
956 	 *
957 	 * If we didn't compress the entire range, try to create an uncompressed
958 	 * inline extent, else a compressed one.
959 	 *
960 	 * Check cow_file_range() for why we don't even try to create inline
961 	 * extent for the subpage case.
962 	 */
963 	if (start == 0 && fs_info->sectorsize == PAGE_SIZE) {
964 		if (total_in < actual_end) {
965 			ret = cow_file_range_inline(inode, actual_end, 0,
966 						    BTRFS_COMPRESS_NONE, NULL,
967 						    false);
968 		} else {
969 			ret = cow_file_range_inline(inode, actual_end,
970 						    total_compressed,
971 						    compress_type, pages,
972 						    false);
973 		}
974 		if (ret <= 0) {
975 			unsigned long clear_flags = EXTENT_DELALLOC |
976 				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
977 				EXTENT_DO_ACCOUNTING;
978 
979 			if (ret < 0)
980 				mapping_set_error(mapping, -EIO);
981 
982 			/*
983 			 * inline extent creation worked or returned error,
984 			 * we don't need to create any more async work items.
985 			 * Unlock and free up our temp pages.
986 			 *
987 			 * We use DO_ACCOUNTING here because we need the
988 			 * delalloc_release_metadata to be done _after_ we drop
989 			 * our outstanding extent for clearing delalloc for this
990 			 * range.
991 			 */
992 			extent_clear_unlock_delalloc(inode, start, end,
993 						     NULL,
994 						     clear_flags,
995 						     PAGE_UNLOCK |
996 						     PAGE_START_WRITEBACK |
997 						     PAGE_END_WRITEBACK);
998 			goto free_pages;
999 		}
1000 	}
1001 
1002 	/*
1003 	 * We aren't doing an inline extent. Round the compressed size up to a
1004 	 * block size boundary so the allocator does sane things.
1005 	 */
1006 	total_compressed = ALIGN(total_compressed, blocksize);
1007 
1008 	/*
1009 	 * One last check to make sure the compression is really a win, compare
1010 	 * the page count read with the blocks on disk, compression must free at
1011 	 * least one sector.
1012 	 */
1013 	total_in = round_up(total_in, fs_info->sectorsize);
1014 	if (total_compressed + blocksize > total_in)
1015 		goto mark_incompressible;
1016 
1017 	/*
1018 	 * The async work queues will take care of doing actual allocation on
1019 	 * disk for these compressed pages, and will submit the bios.
1020 	 */
1021 	ret = add_async_extent(async_chunk, start, total_in, total_compressed, pages,
1022 			       nr_pages, compress_type);
1023 	BUG_ON(ret);
1024 	if (start + total_in < end) {
1025 		start += total_in;
1026 		cond_resched();
1027 		goto again;
1028 	}
1029 	return;
1030 
1031 mark_incompressible:
1032 	if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1033 		inode->flags |= BTRFS_INODE_NOCOMPRESS;
1034 cleanup_and_bail_uncompressed:
1035 	ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1036 			       BTRFS_COMPRESS_NONE);
1037 	BUG_ON(ret);
1038 free_pages:
1039 	if (pages) {
1040 		for (i = 0; i < nr_pages; i++) {
1041 			WARN_ON(pages[i]->mapping);
1042 			put_page(pages[i]);
1043 		}
1044 		kfree(pages);
1045 	}
1046 }
1047 
free_async_extent_pages(struct async_extent * async_extent)1048 static void free_async_extent_pages(struct async_extent *async_extent)
1049 {
1050 	int i;
1051 
1052 	if (!async_extent->pages)
1053 		return;
1054 
1055 	for (i = 0; i < async_extent->nr_pages; i++) {
1056 		WARN_ON(async_extent->pages[i]->mapping);
1057 		put_page(async_extent->pages[i]);
1058 	}
1059 	kfree(async_extent->pages);
1060 	async_extent->nr_pages = 0;
1061 	async_extent->pages = NULL;
1062 }
1063 
submit_uncompressed_range(struct btrfs_inode * inode,struct async_extent * async_extent,struct page * locked_page)1064 static void submit_uncompressed_range(struct btrfs_inode *inode,
1065 				      struct async_extent *async_extent,
1066 				      struct page *locked_page)
1067 {
1068 	u64 start = async_extent->start;
1069 	u64 end = async_extent->start + async_extent->ram_size - 1;
1070 	int ret;
1071 	struct writeback_control wbc = {
1072 		.sync_mode		= WB_SYNC_ALL,
1073 		.range_start		= start,
1074 		.range_end		= end,
1075 		.no_cgroup_owner	= 1,
1076 	};
1077 
1078 	wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1079 	ret = run_delalloc_cow(inode, locked_page, start, end, &wbc, false);
1080 	wbc_detach_inode(&wbc);
1081 	if (ret < 0) {
1082 		btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
1083 		if (locked_page) {
1084 			const u64 page_start = page_offset(locked_page);
1085 
1086 			set_page_writeback(locked_page);
1087 			end_page_writeback(locked_page);
1088 			btrfs_mark_ordered_io_finished(inode, locked_page,
1089 						       page_start, PAGE_SIZE,
1090 						       !ret);
1091 			mapping_set_error(locked_page->mapping, ret);
1092 			unlock_page(locked_page);
1093 		}
1094 	}
1095 }
1096 
submit_one_async_extent(struct async_chunk * async_chunk,struct async_extent * async_extent,u64 * alloc_hint)1097 static void submit_one_async_extent(struct async_chunk *async_chunk,
1098 				    struct async_extent *async_extent,
1099 				    u64 *alloc_hint)
1100 {
1101 	struct btrfs_inode *inode = async_chunk->inode;
1102 	struct extent_io_tree *io_tree = &inode->io_tree;
1103 	struct btrfs_root *root = inode->root;
1104 	struct btrfs_fs_info *fs_info = root->fs_info;
1105 	struct btrfs_ordered_extent *ordered;
1106 	struct btrfs_key ins;
1107 	struct page *locked_page = NULL;
1108 	struct extent_map *em;
1109 	int ret = 0;
1110 	u64 start = async_extent->start;
1111 	u64 end = async_extent->start + async_extent->ram_size - 1;
1112 
1113 	if (async_chunk->blkcg_css)
1114 		kthread_associate_blkcg(async_chunk->blkcg_css);
1115 
1116 	/*
1117 	 * If async_chunk->locked_page is in the async_extent range, we need to
1118 	 * handle it.
1119 	 */
1120 	if (async_chunk->locked_page) {
1121 		u64 locked_page_start = page_offset(async_chunk->locked_page);
1122 		u64 locked_page_end = locked_page_start + PAGE_SIZE - 1;
1123 
1124 		if (!(start >= locked_page_end || end <= locked_page_start))
1125 			locked_page = async_chunk->locked_page;
1126 	}
1127 	lock_extent(io_tree, start, end, NULL);
1128 
1129 	if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1130 		submit_uncompressed_range(inode, async_extent, locked_page);
1131 		goto done;
1132 	}
1133 
1134 	ret = btrfs_reserve_extent(root, async_extent->ram_size,
1135 				   async_extent->compressed_size,
1136 				   async_extent->compressed_size,
1137 				   0, *alloc_hint, &ins, 1, 1);
1138 	if (ret) {
1139 		/*
1140 		 * We can't reserve contiguous space for the compressed size.
1141 		 * Unlikely, but it's possible that we could have enough
1142 		 * non-contiguous space for the uncompressed size instead.  So
1143 		 * fall back to uncompressed.
1144 		 */
1145 		submit_uncompressed_range(inode, async_extent, locked_page);
1146 		goto done;
1147 	}
1148 
1149 	/* Here we're doing allocation and writeback of the compressed pages */
1150 	em = create_io_em(inode, start,
1151 			  async_extent->ram_size,	/* len */
1152 			  start,			/* orig_start */
1153 			  ins.objectid,			/* block_start */
1154 			  ins.offset,			/* block_len */
1155 			  ins.offset,			/* orig_block_len */
1156 			  async_extent->ram_size,	/* ram_bytes */
1157 			  async_extent->compress_type,
1158 			  BTRFS_ORDERED_COMPRESSED);
1159 	if (IS_ERR(em)) {
1160 		ret = PTR_ERR(em);
1161 		goto out_free_reserve;
1162 	}
1163 	free_extent_map(em);
1164 
1165 	ordered = btrfs_alloc_ordered_extent(inode, start,	/* file_offset */
1166 				       async_extent->ram_size,	/* num_bytes */
1167 				       async_extent->ram_size,	/* ram_bytes */
1168 				       ins.objectid,		/* disk_bytenr */
1169 				       ins.offset,		/* disk_num_bytes */
1170 				       0,			/* offset */
1171 				       1 << BTRFS_ORDERED_COMPRESSED,
1172 				       async_extent->compress_type);
1173 	if (IS_ERR(ordered)) {
1174 		btrfs_drop_extent_map_range(inode, start, end, false);
1175 		ret = PTR_ERR(ordered);
1176 		goto out_free_reserve;
1177 	}
1178 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1179 
1180 	/* Clear dirty, set writeback and unlock the pages. */
1181 	extent_clear_unlock_delalloc(inode, start, end,
1182 			NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
1183 			PAGE_UNLOCK | PAGE_START_WRITEBACK);
1184 	btrfs_submit_compressed_write(ordered,
1185 			    async_extent->pages,	/* compressed_pages */
1186 			    async_extent->nr_pages,
1187 			    async_chunk->write_flags, true);
1188 	*alloc_hint = ins.objectid + ins.offset;
1189 done:
1190 	if (async_chunk->blkcg_css)
1191 		kthread_associate_blkcg(NULL);
1192 	kfree(async_extent);
1193 	return;
1194 
1195 out_free_reserve:
1196 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1197 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1198 	mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1199 	extent_clear_unlock_delalloc(inode, start, end,
1200 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
1201 				     EXTENT_DELALLOC_NEW |
1202 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1203 				     PAGE_UNLOCK | PAGE_START_WRITEBACK |
1204 				     PAGE_END_WRITEBACK);
1205 	free_async_extent_pages(async_extent);
1206 	if (async_chunk->blkcg_css)
1207 		kthread_associate_blkcg(NULL);
1208 	btrfs_debug(fs_info,
1209 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1210 		    root->root_key.objectid, btrfs_ino(inode), start,
1211 		    async_extent->ram_size, ret);
1212 	kfree(async_extent);
1213 }
1214 
get_extent_allocation_hint(struct btrfs_inode * inode,u64 start,u64 num_bytes)1215 static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1216 				      u64 num_bytes)
1217 {
1218 	struct extent_map_tree *em_tree = &inode->extent_tree;
1219 	struct extent_map *em;
1220 	u64 alloc_hint = 0;
1221 
1222 	read_lock(&em_tree->lock);
1223 	em = search_extent_mapping(em_tree, start, num_bytes);
1224 	if (em) {
1225 		/*
1226 		 * if block start isn't an actual block number then find the
1227 		 * first block in this inode and use that as a hint.  If that
1228 		 * block is also bogus then just don't worry about it.
1229 		 */
1230 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1231 			free_extent_map(em);
1232 			em = search_extent_mapping(em_tree, 0, 0);
1233 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
1234 				alloc_hint = em->block_start;
1235 			if (em)
1236 				free_extent_map(em);
1237 		} else {
1238 			alloc_hint = em->block_start;
1239 			free_extent_map(em);
1240 		}
1241 	}
1242 	read_unlock(&em_tree->lock);
1243 
1244 	return alloc_hint;
1245 }
1246 
1247 /*
1248  * when extent_io.c finds a delayed allocation range in the file,
1249  * the call backs end up in this code.  The basic idea is to
1250  * allocate extents on disk for the range, and create ordered data structs
1251  * in ram to track those extents.
1252  *
1253  * locked_page is the page that writepage had locked already.  We use
1254  * it to make sure we don't do extra locks or unlocks.
1255  *
1256  * When this function fails, it unlocks all pages except @locked_page.
1257  *
1258  * When this function successfully creates an inline extent, it returns 1 and
1259  * unlocks all pages including locked_page and starts I/O on them.
1260  * (In reality inline extents are limited to a single page, so locked_page is
1261  * the only page handled anyway).
1262  *
1263  * When this function succeed and creates a normal extent, the page locking
1264  * status depends on the passed in flags:
1265  *
1266  * - If @keep_locked is set, all pages are kept locked.
1267  * - Else all pages except for @locked_page are unlocked.
1268  *
1269  * When a failure happens in the second or later iteration of the
1270  * while-loop, the ordered extents created in previous iterations are kept
1271  * intact. So, the caller must clean them up by calling
1272  * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for
1273  * example.
1274  */
cow_file_range(struct btrfs_inode * inode,struct page * locked_page,u64 start,u64 end,u64 * done_offset,bool keep_locked,bool no_inline)1275 static noinline int cow_file_range(struct btrfs_inode *inode,
1276 				   struct page *locked_page, u64 start, u64 end,
1277 				   u64 *done_offset,
1278 				   bool keep_locked, bool no_inline)
1279 {
1280 	struct btrfs_root *root = inode->root;
1281 	struct btrfs_fs_info *fs_info = root->fs_info;
1282 	u64 alloc_hint = 0;
1283 	u64 orig_start = start;
1284 	u64 num_bytes;
1285 	unsigned long ram_size;
1286 	u64 cur_alloc_size = 0;
1287 	u64 min_alloc_size;
1288 	u64 blocksize = fs_info->sectorsize;
1289 	struct btrfs_key ins;
1290 	struct extent_map *em;
1291 	unsigned clear_bits;
1292 	unsigned long page_ops;
1293 	bool extent_reserved = false;
1294 	int ret = 0;
1295 
1296 	if (btrfs_is_free_space_inode(inode)) {
1297 		ret = -EINVAL;
1298 		goto out_unlock;
1299 	}
1300 
1301 	num_bytes = ALIGN(end - start + 1, blocksize);
1302 	num_bytes = max(blocksize,  num_bytes);
1303 	ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1304 
1305 	inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1306 
1307 	/*
1308 	 * Due to the page size limit, for subpage we can only trigger the
1309 	 * writeback for the dirty sectors of page, that means data writeback
1310 	 * is doing more writeback than what we want.
1311 	 *
1312 	 * This is especially unexpected for some call sites like fallocate,
1313 	 * where we only increase i_size after everything is done.
1314 	 * This means we can trigger inline extent even if we didn't want to.
1315 	 * So here we skip inline extent creation completely.
1316 	 */
1317 	if (start == 0 && fs_info->sectorsize == PAGE_SIZE && !no_inline) {
1318 		u64 actual_end = min_t(u64, i_size_read(&inode->vfs_inode),
1319 				       end + 1);
1320 
1321 		/* lets try to make an inline extent */
1322 		ret = cow_file_range_inline(inode, actual_end, 0,
1323 					    BTRFS_COMPRESS_NONE, NULL, false);
1324 		if (ret == 0) {
1325 			/*
1326 			 * We use DO_ACCOUNTING here because we need the
1327 			 * delalloc_release_metadata to be run _after_ we drop
1328 			 * our outstanding extent for clearing delalloc for this
1329 			 * range.
1330 			 */
1331 			extent_clear_unlock_delalloc(inode, start, end,
1332 				     locked_page,
1333 				     EXTENT_LOCKED | EXTENT_DELALLOC |
1334 				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
1335 				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1336 				     PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
1337 			/*
1338 			 * locked_page is locked by the caller of
1339 			 * writepage_delalloc(), not locked by
1340 			 * __process_pages_contig().
1341 			 *
1342 			 * We can't let __process_pages_contig() to unlock it,
1343 			 * as it doesn't have any subpage::writers recorded.
1344 			 *
1345 			 * Here we manually unlock the page, since the caller
1346 			 * can't determine if it's an inline extent or a
1347 			 * compressed extent.
1348 			 */
1349 			unlock_page(locked_page);
1350 			ret = 1;
1351 			goto done;
1352 		} else if (ret < 0) {
1353 			goto out_unlock;
1354 		}
1355 	}
1356 
1357 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
1358 
1359 	/*
1360 	 * Relocation relies on the relocated extents to have exactly the same
1361 	 * size as the original extents. Normally writeback for relocation data
1362 	 * extents follows a NOCOW path because relocation preallocates the
1363 	 * extents. However, due to an operation such as scrub turning a block
1364 	 * group to RO mode, it may fallback to COW mode, so we must make sure
1365 	 * an extent allocated during COW has exactly the requested size and can
1366 	 * not be split into smaller extents, otherwise relocation breaks and
1367 	 * fails during the stage where it updates the bytenr of file extent
1368 	 * items.
1369 	 */
1370 	if (btrfs_is_data_reloc_root(root))
1371 		min_alloc_size = num_bytes;
1372 	else
1373 		min_alloc_size = fs_info->sectorsize;
1374 
1375 	while (num_bytes > 0) {
1376 		struct btrfs_ordered_extent *ordered;
1377 
1378 		cur_alloc_size = num_bytes;
1379 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1380 					   min_alloc_size, 0, alloc_hint,
1381 					   &ins, 1, 1);
1382 		if (ret == -EAGAIN) {
1383 			/*
1384 			 * btrfs_reserve_extent only returns -EAGAIN for zoned
1385 			 * file systems, which is an indication that there are
1386 			 * no active zones to allocate from at the moment.
1387 			 *
1388 			 * If this is the first loop iteration, wait for at
1389 			 * least one zone to finish before retrying the
1390 			 * allocation.  Otherwise ask the caller to write out
1391 			 * the already allocated blocks before coming back to
1392 			 * us, or return -ENOSPC if it can't handle retries.
1393 			 */
1394 			ASSERT(btrfs_is_zoned(fs_info));
1395 			if (start == orig_start) {
1396 				wait_on_bit_io(&inode->root->fs_info->flags,
1397 					       BTRFS_FS_NEED_ZONE_FINISH,
1398 					       TASK_UNINTERRUPTIBLE);
1399 				continue;
1400 			}
1401 			if (done_offset) {
1402 				*done_offset = start - 1;
1403 				return 0;
1404 			}
1405 			ret = -ENOSPC;
1406 		}
1407 		if (ret < 0)
1408 			goto out_unlock;
1409 		cur_alloc_size = ins.offset;
1410 		extent_reserved = true;
1411 
1412 		ram_size = ins.offset;
1413 		em = create_io_em(inode, start, ins.offset, /* len */
1414 				  start, /* orig_start */
1415 				  ins.objectid, /* block_start */
1416 				  ins.offset, /* block_len */
1417 				  ins.offset, /* orig_block_len */
1418 				  ram_size, /* ram_bytes */
1419 				  BTRFS_COMPRESS_NONE, /* compress_type */
1420 				  BTRFS_ORDERED_REGULAR /* type */);
1421 		if (IS_ERR(em)) {
1422 			ret = PTR_ERR(em);
1423 			goto out_reserve;
1424 		}
1425 		free_extent_map(em);
1426 
1427 		ordered = btrfs_alloc_ordered_extent(inode, start, ram_size,
1428 					ram_size, ins.objectid, cur_alloc_size,
1429 					0, 1 << BTRFS_ORDERED_REGULAR,
1430 					BTRFS_COMPRESS_NONE);
1431 		if (IS_ERR(ordered)) {
1432 			ret = PTR_ERR(ordered);
1433 			goto out_drop_extent_cache;
1434 		}
1435 
1436 		if (btrfs_is_data_reloc_root(root)) {
1437 			ret = btrfs_reloc_clone_csums(ordered);
1438 
1439 			/*
1440 			 * Only drop cache here, and process as normal.
1441 			 *
1442 			 * We must not allow extent_clear_unlock_delalloc()
1443 			 * at out_unlock label to free meta of this ordered
1444 			 * extent, as its meta should be freed by
1445 			 * btrfs_finish_ordered_io().
1446 			 *
1447 			 * So we must continue until @start is increased to
1448 			 * skip current ordered extent.
1449 			 */
1450 			if (ret)
1451 				btrfs_drop_extent_map_range(inode, start,
1452 							    start + ram_size - 1,
1453 							    false);
1454 		}
1455 		btrfs_put_ordered_extent(ordered);
1456 
1457 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1458 
1459 		/*
1460 		 * We're not doing compressed IO, don't unlock the first page
1461 		 * (which the caller expects to stay locked), don't clear any
1462 		 * dirty bits and don't set any writeback bits
1463 		 *
1464 		 * Do set the Ordered (Private2) bit so we know this page was
1465 		 * properly setup for writepage.
1466 		 */
1467 		page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1468 		page_ops |= PAGE_SET_ORDERED;
1469 
1470 		extent_clear_unlock_delalloc(inode, start, start + ram_size - 1,
1471 					     locked_page,
1472 					     EXTENT_LOCKED | EXTENT_DELALLOC,
1473 					     page_ops);
1474 		if (num_bytes < cur_alloc_size)
1475 			num_bytes = 0;
1476 		else
1477 			num_bytes -= cur_alloc_size;
1478 		alloc_hint = ins.objectid + ins.offset;
1479 		start += cur_alloc_size;
1480 		extent_reserved = false;
1481 
1482 		/*
1483 		 * btrfs_reloc_clone_csums() error, since start is increased
1484 		 * extent_clear_unlock_delalloc() at out_unlock label won't
1485 		 * free metadata of current ordered extent, we're OK to exit.
1486 		 */
1487 		if (ret)
1488 			goto out_unlock;
1489 	}
1490 done:
1491 	if (done_offset)
1492 		*done_offset = end;
1493 	return ret;
1494 
1495 out_drop_extent_cache:
1496 	btrfs_drop_extent_map_range(inode, start, start + ram_size - 1, false);
1497 out_reserve:
1498 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1499 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1500 out_unlock:
1501 	/*
1502 	 * Now, we have three regions to clean up:
1503 	 *
1504 	 * |-------(1)----|---(2)---|-------------(3)----------|
1505 	 * `- orig_start  `- start  `- start + cur_alloc_size  `- end
1506 	 *
1507 	 * We process each region below.
1508 	 */
1509 
1510 	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1511 		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1512 	page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1513 
1514 	/*
1515 	 * For the range (1). We have already instantiated the ordered extents
1516 	 * for this region. They are cleaned up by
1517 	 * btrfs_cleanup_ordered_extents() in e.g,
1518 	 * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
1519 	 * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
1520 	 * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
1521 	 * function.
1522 	 *
1523 	 * However, in case of @keep_locked, we still need to unlock the pages
1524 	 * (except @locked_page) to ensure all the pages are unlocked.
1525 	 */
1526 	if (keep_locked && orig_start < start) {
1527 		if (!locked_page)
1528 			mapping_set_error(inode->vfs_inode.i_mapping, ret);
1529 		extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1530 					     locked_page, 0, page_ops);
1531 	}
1532 
1533 	/*
1534 	 * For the range (2). If we reserved an extent for our delalloc range
1535 	 * (or a subrange) and failed to create the respective ordered extent,
1536 	 * then it means that when we reserved the extent we decremented the
1537 	 * extent's size from the data space_info's bytes_may_use counter and
1538 	 * incremented the space_info's bytes_reserved counter by the same
1539 	 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1540 	 * to decrement again the data space_info's bytes_may_use counter,
1541 	 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1542 	 */
1543 	if (extent_reserved) {
1544 		extent_clear_unlock_delalloc(inode, start,
1545 					     start + cur_alloc_size - 1,
1546 					     locked_page,
1547 					     clear_bits,
1548 					     page_ops);
1549 		btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1550 		start += cur_alloc_size;
1551 	}
1552 
1553 	/*
1554 	 * For the range (3). We never touched the region. In addition to the
1555 	 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1556 	 * space_info's bytes_may_use counter, reserved in
1557 	 * btrfs_check_data_free_space().
1558 	 */
1559 	if (start < end) {
1560 		clear_bits |= EXTENT_CLEAR_DATA_RESV;
1561 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1562 					     clear_bits, page_ops);
1563 		btrfs_qgroup_free_data(inode, NULL, start, end - start + 1, NULL);
1564 	}
1565 	return ret;
1566 }
1567 
1568 /*
1569  * Phase two of compressed writeback.  This is the ordered portion of the code,
1570  * which only gets called in the order the work was queued.  We walk all the
1571  * async extents created by compress_file_range and send them down to the disk.
1572  */
submit_compressed_extents(struct btrfs_work * work)1573 static noinline void submit_compressed_extents(struct btrfs_work *work)
1574 {
1575 	struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1576 						     work);
1577 	struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1578 	struct async_extent *async_extent;
1579 	unsigned long nr_pages;
1580 	u64 alloc_hint = 0;
1581 
1582 	nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1583 		PAGE_SHIFT;
1584 
1585 	while (!list_empty(&async_chunk->extents)) {
1586 		async_extent = list_entry(async_chunk->extents.next,
1587 					  struct async_extent, list);
1588 		list_del(&async_extent->list);
1589 		submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1590 	}
1591 
1592 	/* atomic_sub_return implies a barrier */
1593 	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1594 	    5 * SZ_1M)
1595 		cond_wake_up_nomb(&fs_info->async_submit_wait);
1596 }
1597 
async_cow_free(struct btrfs_work * work)1598 static noinline void async_cow_free(struct btrfs_work *work)
1599 {
1600 	struct async_chunk *async_chunk;
1601 	struct async_cow *async_cow;
1602 
1603 	async_chunk = container_of(work, struct async_chunk, work);
1604 	btrfs_add_delayed_iput(async_chunk->inode);
1605 	if (async_chunk->blkcg_css)
1606 		css_put(async_chunk->blkcg_css);
1607 
1608 	async_cow = async_chunk->async_cow;
1609 	if (atomic_dec_and_test(&async_cow->num_chunks))
1610 		kvfree(async_cow);
1611 }
1612 
run_delalloc_compressed(struct btrfs_inode * inode,struct page * locked_page,u64 start,u64 end,struct writeback_control * wbc)1613 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1614 				    struct page *locked_page, u64 start,
1615 				    u64 end, struct writeback_control *wbc)
1616 {
1617 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1618 	struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1619 	struct async_cow *ctx;
1620 	struct async_chunk *async_chunk;
1621 	unsigned long nr_pages;
1622 	u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1623 	int i;
1624 	unsigned nofs_flag;
1625 	const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1626 
1627 	nofs_flag = memalloc_nofs_save();
1628 	ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1629 	memalloc_nofs_restore(nofs_flag);
1630 	if (!ctx)
1631 		return false;
1632 
1633 	unlock_extent(&inode->io_tree, start, end, NULL);
1634 	set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1635 
1636 	async_chunk = ctx->chunks;
1637 	atomic_set(&ctx->num_chunks, num_chunks);
1638 
1639 	for (i = 0; i < num_chunks; i++) {
1640 		u64 cur_end = min(end, start + SZ_512K - 1);
1641 
1642 		/*
1643 		 * igrab is called higher up in the call chain, take only the
1644 		 * lightweight reference for the callback lifetime
1645 		 */
1646 		ihold(&inode->vfs_inode);
1647 		async_chunk[i].async_cow = ctx;
1648 		async_chunk[i].inode = inode;
1649 		async_chunk[i].start = start;
1650 		async_chunk[i].end = cur_end;
1651 		async_chunk[i].write_flags = write_flags;
1652 		INIT_LIST_HEAD(&async_chunk[i].extents);
1653 
1654 		/*
1655 		 * The locked_page comes all the way from writepage and its
1656 		 * the original page we were actually given.  As we spread
1657 		 * this large delalloc region across multiple async_chunk
1658 		 * structs, only the first struct needs a pointer to locked_page
1659 		 *
1660 		 * This way we don't need racey decisions about who is supposed
1661 		 * to unlock it.
1662 		 */
1663 		if (locked_page) {
1664 			/*
1665 			 * Depending on the compressibility, the pages might or
1666 			 * might not go through async.  We want all of them to
1667 			 * be accounted against wbc once.  Let's do it here
1668 			 * before the paths diverge.  wbc accounting is used
1669 			 * only for foreign writeback detection and doesn't
1670 			 * need full accuracy.  Just account the whole thing
1671 			 * against the first page.
1672 			 */
1673 			wbc_account_cgroup_owner(wbc, locked_page,
1674 						 cur_end - start);
1675 			async_chunk[i].locked_page = locked_page;
1676 			locked_page = NULL;
1677 		} else {
1678 			async_chunk[i].locked_page = NULL;
1679 		}
1680 
1681 		if (blkcg_css != blkcg_root_css) {
1682 			css_get(blkcg_css);
1683 			async_chunk[i].blkcg_css = blkcg_css;
1684 			async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1685 		} else {
1686 			async_chunk[i].blkcg_css = NULL;
1687 		}
1688 
1689 		btrfs_init_work(&async_chunk[i].work, compress_file_range,
1690 				submit_compressed_extents, async_cow_free);
1691 
1692 		nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1693 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1694 
1695 		btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1696 
1697 		start = cur_end + 1;
1698 	}
1699 	return true;
1700 }
1701 
1702 /*
1703  * Run the delalloc range from start to end, and write back any dirty pages
1704  * covered by the range.
1705  */
run_delalloc_cow(struct btrfs_inode * inode,struct page * locked_page,u64 start,u64 end,struct writeback_control * wbc,bool pages_dirty)1706 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1707 				     struct page *locked_page, u64 start,
1708 				     u64 end, struct writeback_control *wbc,
1709 				     bool pages_dirty)
1710 {
1711 	u64 done_offset = end;
1712 	int ret;
1713 
1714 	while (start <= end) {
1715 		ret = cow_file_range(inode, locked_page, start, end, &done_offset,
1716 				     true, false);
1717 		if (ret)
1718 			return ret;
1719 		extent_write_locked_range(&inode->vfs_inode, locked_page, start,
1720 					  done_offset, wbc, pages_dirty);
1721 		start = done_offset + 1;
1722 	}
1723 
1724 	return 1;
1725 }
1726 
csum_exist_in_range(struct btrfs_fs_info * fs_info,u64 bytenr,u64 num_bytes,bool nowait)1727 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1728 					u64 bytenr, u64 num_bytes, bool nowait)
1729 {
1730 	struct btrfs_root *csum_root = btrfs_csum_root(fs_info, bytenr);
1731 	struct btrfs_ordered_sum *sums;
1732 	int ret;
1733 	LIST_HEAD(list);
1734 
1735 	ret = btrfs_lookup_csums_list(csum_root, bytenr, bytenr + num_bytes - 1,
1736 				      &list, 0, nowait);
1737 	if (ret == 0 && list_empty(&list))
1738 		return 0;
1739 
1740 	while (!list_empty(&list)) {
1741 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1742 		list_del(&sums->list);
1743 		kfree(sums);
1744 	}
1745 	if (ret < 0)
1746 		return ret;
1747 	return 1;
1748 }
1749 
fallback_to_cow(struct btrfs_inode * inode,struct page * locked_page,const u64 start,const u64 end)1750 static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
1751 			   const u64 start, const u64 end)
1752 {
1753 	const bool is_space_ino = btrfs_is_free_space_inode(inode);
1754 	const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1755 	const u64 range_bytes = end + 1 - start;
1756 	struct extent_io_tree *io_tree = &inode->io_tree;
1757 	u64 range_start = start;
1758 	u64 count;
1759 	int ret;
1760 
1761 	/*
1762 	 * If EXTENT_NORESERVE is set it means that when the buffered write was
1763 	 * made we had not enough available data space and therefore we did not
1764 	 * reserve data space for it, since we though we could do NOCOW for the
1765 	 * respective file range (either there is prealloc extent or the inode
1766 	 * has the NOCOW bit set).
1767 	 *
1768 	 * However when we need to fallback to COW mode (because for example the
1769 	 * block group for the corresponding extent was turned to RO mode by a
1770 	 * scrub or relocation) we need to do the following:
1771 	 *
1772 	 * 1) We increment the bytes_may_use counter of the data space info.
1773 	 *    If COW succeeds, it allocates a new data extent and after doing
1774 	 *    that it decrements the space info's bytes_may_use counter and
1775 	 *    increments its bytes_reserved counter by the same amount (we do
1776 	 *    this at btrfs_add_reserved_bytes()). So we need to increment the
1777 	 *    bytes_may_use counter to compensate (when space is reserved at
1778 	 *    buffered write time, the bytes_may_use counter is incremented);
1779 	 *
1780 	 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1781 	 *    that if the COW path fails for any reason, it decrements (through
1782 	 *    extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1783 	 *    data space info, which we incremented in the step above.
1784 	 *
1785 	 * If we need to fallback to cow and the inode corresponds to a free
1786 	 * space cache inode or an inode of the data relocation tree, we must
1787 	 * also increment bytes_may_use of the data space_info for the same
1788 	 * reason. Space caches and relocated data extents always get a prealloc
1789 	 * extent for them, however scrub or balance may have set the block
1790 	 * group that contains that extent to RO mode and therefore force COW
1791 	 * when starting writeback.
1792 	 */
1793 	count = count_range_bits(io_tree, &range_start, end, range_bytes,
1794 				 EXTENT_NORESERVE, 0, NULL);
1795 	if (count > 0 || is_space_ino || is_reloc_ino) {
1796 		u64 bytes = count;
1797 		struct btrfs_fs_info *fs_info = inode->root->fs_info;
1798 		struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1799 
1800 		if (is_space_ino || is_reloc_ino)
1801 			bytes = range_bytes;
1802 
1803 		spin_lock(&sinfo->lock);
1804 		btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
1805 		spin_unlock(&sinfo->lock);
1806 
1807 		if (count > 0)
1808 			clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1809 					 NULL);
1810 	}
1811 
1812 	/*
1813 	 * Don't try to create inline extents, as a mix of inline extent that
1814 	 * is written out and unlocked directly and a normal NOCOW extent
1815 	 * doesn't work.
1816 	 */
1817 	ret = cow_file_range(inode, locked_page, start, end, NULL, false, true);
1818 	ASSERT(ret != 1);
1819 	return ret;
1820 }
1821 
1822 struct can_nocow_file_extent_args {
1823 	/* Input fields. */
1824 
1825 	/* Start file offset of the range we want to NOCOW. */
1826 	u64 start;
1827 	/* End file offset (inclusive) of the range we want to NOCOW. */
1828 	u64 end;
1829 	bool writeback_path;
1830 	bool strict;
1831 	/*
1832 	 * Free the path passed to can_nocow_file_extent() once it's not needed
1833 	 * anymore.
1834 	 */
1835 	bool free_path;
1836 
1837 	/* Output fields. Only set when can_nocow_file_extent() returns 1. */
1838 
1839 	u64 disk_bytenr;
1840 	u64 disk_num_bytes;
1841 	u64 extent_offset;
1842 	/* Number of bytes that can be written to in NOCOW mode. */
1843 	u64 num_bytes;
1844 };
1845 
1846 /*
1847  * Check if we can NOCOW the file extent that the path points to.
1848  * This function may return with the path released, so the caller should check
1849  * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1850  *
1851  * Returns: < 0 on error
1852  *            0 if we can not NOCOW
1853  *            1 if we can NOCOW
1854  */
can_nocow_file_extent(struct btrfs_path * path,struct btrfs_key * key,struct btrfs_inode * inode,struct can_nocow_file_extent_args * args)1855 static int can_nocow_file_extent(struct btrfs_path *path,
1856 				 struct btrfs_key *key,
1857 				 struct btrfs_inode *inode,
1858 				 struct can_nocow_file_extent_args *args)
1859 {
1860 	const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1861 	struct extent_buffer *leaf = path->nodes[0];
1862 	struct btrfs_root *root = inode->root;
1863 	struct btrfs_file_extent_item *fi;
1864 	u64 extent_end;
1865 	u8 extent_type;
1866 	int can_nocow = 0;
1867 	int ret = 0;
1868 	bool nowait = path->nowait;
1869 
1870 	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1871 	extent_type = btrfs_file_extent_type(leaf, fi);
1872 
1873 	if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1874 		goto out;
1875 
1876 	/* Can't access these fields unless we know it's not an inline extent. */
1877 	args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1878 	args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1879 	args->extent_offset = btrfs_file_extent_offset(leaf, fi);
1880 
1881 	if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1882 	    extent_type == BTRFS_FILE_EXTENT_REG)
1883 		goto out;
1884 
1885 	/*
1886 	 * If the extent was created before the generation where the last snapshot
1887 	 * for its subvolume was created, then this implies the extent is shared,
1888 	 * hence we must COW.
1889 	 */
1890 	if (!args->strict &&
1891 	    btrfs_file_extent_generation(leaf, fi) <=
1892 	    btrfs_root_last_snapshot(&root->root_item))
1893 		goto out;
1894 
1895 	/* An explicit hole, must COW. */
1896 	if (args->disk_bytenr == 0)
1897 		goto out;
1898 
1899 	/* Compressed/encrypted/encoded extents must be COWed. */
1900 	if (btrfs_file_extent_compression(leaf, fi) ||
1901 	    btrfs_file_extent_encryption(leaf, fi) ||
1902 	    btrfs_file_extent_other_encoding(leaf, fi))
1903 		goto out;
1904 
1905 	extent_end = btrfs_file_extent_end(path);
1906 
1907 	/*
1908 	 * The following checks can be expensive, as they need to take other
1909 	 * locks and do btree or rbtree searches, so release the path to avoid
1910 	 * blocking other tasks for too long.
1911 	 */
1912 	btrfs_release_path(path);
1913 
1914 	ret = btrfs_cross_ref_exist(root, btrfs_ino(inode),
1915 				    key->offset - args->extent_offset,
1916 				    args->disk_bytenr, args->strict, path);
1917 	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1918 	if (ret != 0)
1919 		goto out;
1920 
1921 	if (args->free_path) {
1922 		/*
1923 		 * We don't need the path anymore, plus through the
1924 		 * csum_exist_in_range() call below we will end up allocating
1925 		 * another path. So free the path to avoid unnecessary extra
1926 		 * memory usage.
1927 		 */
1928 		btrfs_free_path(path);
1929 		path = NULL;
1930 	}
1931 
1932 	/* If there are pending snapshots for this root, we must COW. */
1933 	if (args->writeback_path && !is_freespace_inode &&
1934 	    atomic_read(&root->snapshot_force_cow))
1935 		goto out;
1936 
1937 	args->disk_bytenr += args->extent_offset;
1938 	args->disk_bytenr += args->start - key->offset;
1939 	args->num_bytes = min(args->end + 1, extent_end) - args->start;
1940 
1941 	/*
1942 	 * Force COW if csums exist in the range. This ensures that csums for a
1943 	 * given extent are either valid or do not exist.
1944 	 */
1945 	ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes,
1946 				  nowait);
1947 	WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1948 	if (ret != 0)
1949 		goto out;
1950 
1951 	can_nocow = 1;
1952  out:
1953 	if (args->free_path && path)
1954 		btrfs_free_path(path);
1955 
1956 	return ret < 0 ? ret : can_nocow;
1957 }
1958 
1959 /*
1960  * when nowcow writeback call back.  This checks for snapshots or COW copies
1961  * of the extents that exist in the file, and COWs the file as required.
1962  *
1963  * If no cow copies or snapshots exist, we write directly to the existing
1964  * blocks on disk
1965  */
run_delalloc_nocow(struct btrfs_inode * inode,struct page * locked_page,const u64 start,const u64 end)1966 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
1967 				       struct page *locked_page,
1968 				       const u64 start, const u64 end)
1969 {
1970 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
1971 	struct btrfs_root *root = inode->root;
1972 	struct btrfs_path *path;
1973 	u64 cow_start = (u64)-1;
1974 	u64 cur_offset = start;
1975 	int ret;
1976 	bool check_prev = true;
1977 	u64 ino = btrfs_ino(inode);
1978 	struct can_nocow_file_extent_args nocow_args = { 0 };
1979 
1980 	/*
1981 	 * Normally on a zoned device we're only doing COW writes, but in case
1982 	 * of relocation on a zoned filesystem serializes I/O so that we're only
1983 	 * writing sequentially and can end up here as well.
1984 	 */
1985 	ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
1986 
1987 	path = btrfs_alloc_path();
1988 	if (!path) {
1989 		ret = -ENOMEM;
1990 		goto error;
1991 	}
1992 
1993 	nocow_args.end = end;
1994 	nocow_args.writeback_path = true;
1995 
1996 	while (1) {
1997 		struct btrfs_block_group *nocow_bg = NULL;
1998 		struct btrfs_ordered_extent *ordered;
1999 		struct btrfs_key found_key;
2000 		struct btrfs_file_extent_item *fi;
2001 		struct extent_buffer *leaf;
2002 		u64 extent_end;
2003 		u64 ram_bytes;
2004 		u64 nocow_end;
2005 		int extent_type;
2006 		bool is_prealloc;
2007 
2008 		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2009 					       cur_offset, 0);
2010 		if (ret < 0)
2011 			goto error;
2012 
2013 		/*
2014 		 * If there is no extent for our range when doing the initial
2015 		 * search, then go back to the previous slot as it will be the
2016 		 * one containing the search offset
2017 		 */
2018 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
2019 			leaf = path->nodes[0];
2020 			btrfs_item_key_to_cpu(leaf, &found_key,
2021 					      path->slots[0] - 1);
2022 			if (found_key.objectid == ino &&
2023 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
2024 				path->slots[0]--;
2025 		}
2026 		check_prev = false;
2027 next_slot:
2028 		/* Go to next leaf if we have exhausted the current one */
2029 		leaf = path->nodes[0];
2030 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2031 			ret = btrfs_next_leaf(root, path);
2032 			if (ret < 0)
2033 				goto error;
2034 			if (ret > 0)
2035 				break;
2036 			leaf = path->nodes[0];
2037 		}
2038 
2039 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2040 
2041 		/* Didn't find anything for our INO */
2042 		if (found_key.objectid > ino)
2043 			break;
2044 		/*
2045 		 * Keep searching until we find an EXTENT_ITEM or there are no
2046 		 * more extents for this inode
2047 		 */
2048 		if (WARN_ON_ONCE(found_key.objectid < ino) ||
2049 		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
2050 			path->slots[0]++;
2051 			goto next_slot;
2052 		}
2053 
2054 		/* Found key is not EXTENT_DATA_KEY or starts after req range */
2055 		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2056 		    found_key.offset > end)
2057 			break;
2058 
2059 		/*
2060 		 * If the found extent starts after requested offset, then
2061 		 * adjust cur_offset to be right before this extent begins.
2062 		 */
2063 		if (found_key.offset > cur_offset) {
2064 			if (cow_start == (u64)-1)
2065 				cow_start = cur_offset;
2066 			cur_offset = found_key.offset;
2067 			goto next_slot;
2068 		}
2069 
2070 		/*
2071 		 * Found extent which begins before our range and potentially
2072 		 * intersect it
2073 		 */
2074 		fi = btrfs_item_ptr(leaf, path->slots[0],
2075 				    struct btrfs_file_extent_item);
2076 		extent_type = btrfs_file_extent_type(leaf, fi);
2077 		/* If this is triggered then we have a memory corruption. */
2078 		ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2079 		if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2080 			ret = -EUCLEAN;
2081 			goto error;
2082 		}
2083 		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
2084 		extent_end = btrfs_file_extent_end(path);
2085 
2086 		/*
2087 		 * If the extent we got ends before our current offset, skip to
2088 		 * the next extent.
2089 		 */
2090 		if (extent_end <= cur_offset) {
2091 			path->slots[0]++;
2092 			goto next_slot;
2093 		}
2094 
2095 		nocow_args.start = cur_offset;
2096 		ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2097 		if (ret < 0)
2098 			goto error;
2099 		if (ret == 0)
2100 			goto must_cow;
2101 
2102 		ret = 0;
2103 		nocow_bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr);
2104 		if (!nocow_bg) {
2105 must_cow:
2106 			/*
2107 			 * If we can't perform NOCOW writeback for the range,
2108 			 * then record the beginning of the range that needs to
2109 			 * be COWed.  It will be written out before the next
2110 			 * NOCOW range if we find one, or when exiting this
2111 			 * loop.
2112 			 */
2113 			if (cow_start == (u64)-1)
2114 				cow_start = cur_offset;
2115 			cur_offset = extent_end;
2116 			if (cur_offset > end)
2117 				break;
2118 			if (!path->nodes[0])
2119 				continue;
2120 			path->slots[0]++;
2121 			goto next_slot;
2122 		}
2123 
2124 		/*
2125 		 * COW range from cow_start to found_key.offset - 1. As the key
2126 		 * will contain the beginning of the first extent that can be
2127 		 * NOCOW, following one which needs to be COW'ed
2128 		 */
2129 		if (cow_start != (u64)-1) {
2130 			ret = fallback_to_cow(inode, locked_page,
2131 					      cow_start, found_key.offset - 1);
2132 			cow_start = (u64)-1;
2133 			if (ret) {
2134 				btrfs_dec_nocow_writers(nocow_bg);
2135 				goto error;
2136 			}
2137 		}
2138 
2139 		nocow_end = cur_offset + nocow_args.num_bytes - 1;
2140 		is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC;
2141 		if (is_prealloc) {
2142 			u64 orig_start = found_key.offset - nocow_args.extent_offset;
2143 			struct extent_map *em;
2144 
2145 			em = create_io_em(inode, cur_offset, nocow_args.num_bytes,
2146 					  orig_start,
2147 					  nocow_args.disk_bytenr, /* block_start */
2148 					  nocow_args.num_bytes, /* block_len */
2149 					  nocow_args.disk_num_bytes, /* orig_block_len */
2150 					  ram_bytes, BTRFS_COMPRESS_NONE,
2151 					  BTRFS_ORDERED_PREALLOC);
2152 			if (IS_ERR(em)) {
2153 				btrfs_dec_nocow_writers(nocow_bg);
2154 				ret = PTR_ERR(em);
2155 				goto error;
2156 			}
2157 			free_extent_map(em);
2158 		}
2159 
2160 		ordered = btrfs_alloc_ordered_extent(inode, cur_offset,
2161 				nocow_args.num_bytes, nocow_args.num_bytes,
2162 				nocow_args.disk_bytenr, nocow_args.num_bytes, 0,
2163 				is_prealloc
2164 				? (1 << BTRFS_ORDERED_PREALLOC)
2165 				: (1 << BTRFS_ORDERED_NOCOW),
2166 				BTRFS_COMPRESS_NONE);
2167 		btrfs_dec_nocow_writers(nocow_bg);
2168 		if (IS_ERR(ordered)) {
2169 			if (is_prealloc) {
2170 				btrfs_drop_extent_map_range(inode, cur_offset,
2171 							    nocow_end, false);
2172 			}
2173 			ret = PTR_ERR(ordered);
2174 			goto error;
2175 		}
2176 
2177 		if (btrfs_is_data_reloc_root(root))
2178 			/*
2179 			 * Error handled later, as we must prevent
2180 			 * extent_clear_unlock_delalloc() in error handler
2181 			 * from freeing metadata of created ordered extent.
2182 			 */
2183 			ret = btrfs_reloc_clone_csums(ordered);
2184 		btrfs_put_ordered_extent(ordered);
2185 
2186 		extent_clear_unlock_delalloc(inode, cur_offset, nocow_end,
2187 					     locked_page, EXTENT_LOCKED |
2188 					     EXTENT_DELALLOC |
2189 					     EXTENT_CLEAR_DATA_RESV,
2190 					     PAGE_UNLOCK | PAGE_SET_ORDERED);
2191 
2192 		cur_offset = extent_end;
2193 
2194 		/*
2195 		 * btrfs_reloc_clone_csums() error, now we're OK to call error
2196 		 * handler, as metadata for created ordered extent will only
2197 		 * be freed by btrfs_finish_ordered_io().
2198 		 */
2199 		if (ret)
2200 			goto error;
2201 		if (cur_offset > end)
2202 			break;
2203 	}
2204 	btrfs_release_path(path);
2205 
2206 	if (cur_offset <= end && cow_start == (u64)-1)
2207 		cow_start = cur_offset;
2208 
2209 	if (cow_start != (u64)-1) {
2210 		cur_offset = end;
2211 		ret = fallback_to_cow(inode, locked_page, cow_start, end);
2212 		cow_start = (u64)-1;
2213 		if (ret)
2214 			goto error;
2215 	}
2216 
2217 	btrfs_free_path(path);
2218 	return 0;
2219 
2220 error:
2221 	/*
2222 	 * If an error happened while a COW region is outstanding, cur_offset
2223 	 * needs to be reset to cow_start to ensure the COW region is unlocked
2224 	 * as well.
2225 	 */
2226 	if (cow_start != (u64)-1)
2227 		cur_offset = cow_start;
2228 	if (cur_offset < end) {
2229 		extent_clear_unlock_delalloc(inode, cur_offset, end,
2230 					     locked_page, EXTENT_LOCKED |
2231 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
2232 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2233 					     PAGE_START_WRITEBACK |
2234 					     PAGE_END_WRITEBACK);
2235 		btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
2236 	}
2237 	btrfs_free_path(path);
2238 	return ret;
2239 }
2240 
should_nocow(struct btrfs_inode * inode,u64 start,u64 end)2241 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2242 {
2243 	if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2244 		if (inode->defrag_bytes &&
2245 		    test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
2246 				   0, NULL))
2247 			return false;
2248 		return true;
2249 	}
2250 	return false;
2251 }
2252 
2253 /*
2254  * Function to process delayed allocation (create CoW) for ranges which are
2255  * being touched for the first time.
2256  */
btrfs_run_delalloc_range(struct btrfs_inode * inode,struct page * locked_page,u64 start,u64 end,struct writeback_control * wbc)2257 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
2258 			     u64 start, u64 end, struct writeback_control *wbc)
2259 {
2260 	const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2261 	int ret;
2262 
2263 	/*
2264 	 * The range must cover part of the @locked_page, or a return of 1
2265 	 * can confuse the caller.
2266 	 */
2267 	ASSERT(!(end <= page_offset(locked_page) ||
2268 		 start >= page_offset(locked_page) + PAGE_SIZE));
2269 
2270 	if (should_nocow(inode, start, end)) {
2271 		ret = run_delalloc_nocow(inode, locked_page, start, end);
2272 		goto out;
2273 	}
2274 
2275 	if (btrfs_inode_can_compress(inode) &&
2276 	    inode_need_compress(inode, start, end) &&
2277 	    run_delalloc_compressed(inode, locked_page, start, end, wbc))
2278 		return 1;
2279 
2280 	if (zoned)
2281 		ret = run_delalloc_cow(inode, locked_page, start, end, wbc,
2282 				       true);
2283 	else
2284 		ret = cow_file_range(inode, locked_page, start, end, NULL,
2285 				     false, false);
2286 
2287 out:
2288 	if (ret < 0)
2289 		btrfs_cleanup_ordered_extents(inode, locked_page, start,
2290 					      end - start + 1);
2291 	return ret;
2292 }
2293 
btrfs_split_delalloc_extent(struct btrfs_inode * inode,struct extent_state * orig,u64 split)2294 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2295 				 struct extent_state *orig, u64 split)
2296 {
2297 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2298 	u64 size;
2299 
2300 	/* not delalloc, ignore it */
2301 	if (!(orig->state & EXTENT_DELALLOC))
2302 		return;
2303 
2304 	size = orig->end - orig->start + 1;
2305 	if (size > fs_info->max_extent_size) {
2306 		u32 num_extents;
2307 		u64 new_size;
2308 
2309 		/*
2310 		 * See the explanation in btrfs_merge_delalloc_extent, the same
2311 		 * applies here, just in reverse.
2312 		 */
2313 		new_size = orig->end - split + 1;
2314 		num_extents = count_max_extents(fs_info, new_size);
2315 		new_size = split - orig->start;
2316 		num_extents += count_max_extents(fs_info, new_size);
2317 		if (count_max_extents(fs_info, size) >= num_extents)
2318 			return;
2319 	}
2320 
2321 	spin_lock(&inode->lock);
2322 	btrfs_mod_outstanding_extents(inode, 1);
2323 	spin_unlock(&inode->lock);
2324 }
2325 
2326 /*
2327  * Handle merged delayed allocation extents so we can keep track of new extents
2328  * that are just merged onto old extents, such as when we are doing sequential
2329  * writes, so we can properly account for the metadata space we'll need.
2330  */
btrfs_merge_delalloc_extent(struct btrfs_inode * inode,struct extent_state * new,struct extent_state * other)2331 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2332 				 struct extent_state *other)
2333 {
2334 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2335 	u64 new_size, old_size;
2336 	u32 num_extents;
2337 
2338 	/* not delalloc, ignore it */
2339 	if (!(other->state & EXTENT_DELALLOC))
2340 		return;
2341 
2342 	if (new->start > other->start)
2343 		new_size = new->end - other->start + 1;
2344 	else
2345 		new_size = other->end - new->start + 1;
2346 
2347 	/* we're not bigger than the max, unreserve the space and go */
2348 	if (new_size <= fs_info->max_extent_size) {
2349 		spin_lock(&inode->lock);
2350 		btrfs_mod_outstanding_extents(inode, -1);
2351 		spin_unlock(&inode->lock);
2352 		return;
2353 	}
2354 
2355 	/*
2356 	 * We have to add up either side to figure out how many extents were
2357 	 * accounted for before we merged into one big extent.  If the number of
2358 	 * extents we accounted for is <= the amount we need for the new range
2359 	 * then we can return, otherwise drop.  Think of it like this
2360 	 *
2361 	 * [ 4k][MAX_SIZE]
2362 	 *
2363 	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2364 	 * need 2 outstanding extents, on one side we have 1 and the other side
2365 	 * we have 1 so they are == and we can return.  But in this case
2366 	 *
2367 	 * [MAX_SIZE+4k][MAX_SIZE+4k]
2368 	 *
2369 	 * Each range on their own accounts for 2 extents, but merged together
2370 	 * they are only 3 extents worth of accounting, so we need to drop in
2371 	 * this case.
2372 	 */
2373 	old_size = other->end - other->start + 1;
2374 	num_extents = count_max_extents(fs_info, old_size);
2375 	old_size = new->end - new->start + 1;
2376 	num_extents += count_max_extents(fs_info, old_size);
2377 	if (count_max_extents(fs_info, new_size) >= num_extents)
2378 		return;
2379 
2380 	spin_lock(&inode->lock);
2381 	btrfs_mod_outstanding_extents(inode, -1);
2382 	spin_unlock(&inode->lock);
2383 }
2384 
btrfs_add_delalloc_inodes(struct btrfs_root * root,struct btrfs_inode * inode)2385 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
2386 				      struct btrfs_inode *inode)
2387 {
2388 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2389 
2390 	spin_lock(&root->delalloc_lock);
2391 	if (list_empty(&inode->delalloc_inodes)) {
2392 		list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2393 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST, &inode->runtime_flags);
2394 		root->nr_delalloc_inodes++;
2395 		if (root->nr_delalloc_inodes == 1) {
2396 			spin_lock(&fs_info->delalloc_root_lock);
2397 			BUG_ON(!list_empty(&root->delalloc_root));
2398 			list_add_tail(&root->delalloc_root,
2399 				      &fs_info->delalloc_roots);
2400 			spin_unlock(&fs_info->delalloc_root_lock);
2401 		}
2402 	}
2403 	spin_unlock(&root->delalloc_lock);
2404 }
2405 
__btrfs_del_delalloc_inode(struct btrfs_root * root,struct btrfs_inode * inode)2406 void __btrfs_del_delalloc_inode(struct btrfs_root *root,
2407 				struct btrfs_inode *inode)
2408 {
2409 	struct btrfs_fs_info *fs_info = root->fs_info;
2410 
2411 	if (!list_empty(&inode->delalloc_inodes)) {
2412 		list_del_init(&inode->delalloc_inodes);
2413 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2414 			  &inode->runtime_flags);
2415 		root->nr_delalloc_inodes--;
2416 		if (!root->nr_delalloc_inodes) {
2417 			ASSERT(list_empty(&root->delalloc_inodes));
2418 			spin_lock(&fs_info->delalloc_root_lock);
2419 			BUG_ON(list_empty(&root->delalloc_root));
2420 			list_del_init(&root->delalloc_root);
2421 			spin_unlock(&fs_info->delalloc_root_lock);
2422 		}
2423 	}
2424 }
2425 
btrfs_del_delalloc_inode(struct btrfs_root * root,struct btrfs_inode * inode)2426 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
2427 				     struct btrfs_inode *inode)
2428 {
2429 	spin_lock(&root->delalloc_lock);
2430 	__btrfs_del_delalloc_inode(root, inode);
2431 	spin_unlock(&root->delalloc_lock);
2432 }
2433 
2434 /*
2435  * Properly track delayed allocation bytes in the inode and to maintain the
2436  * list of inodes that have pending delalloc work to be done.
2437  */
btrfs_set_delalloc_extent(struct btrfs_inode * inode,struct extent_state * state,u32 bits)2438 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2439 			       u32 bits)
2440 {
2441 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2442 
2443 	if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2444 		WARN_ON(1);
2445 	/*
2446 	 * set_bit and clear bit hooks normally require _irqsave/restore
2447 	 * but in this case, we are only testing for the DELALLOC
2448 	 * bit, which is only set or cleared with irqs on
2449 	 */
2450 	if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2451 		struct btrfs_root *root = inode->root;
2452 		u64 len = state->end + 1 - state->start;
2453 		u32 num_extents = count_max_extents(fs_info, len);
2454 		bool do_list = !btrfs_is_free_space_inode(inode);
2455 
2456 		spin_lock(&inode->lock);
2457 		btrfs_mod_outstanding_extents(inode, num_extents);
2458 		spin_unlock(&inode->lock);
2459 
2460 		/* For sanity tests */
2461 		if (btrfs_is_testing(fs_info))
2462 			return;
2463 
2464 		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2465 					 fs_info->delalloc_batch);
2466 		spin_lock(&inode->lock);
2467 		inode->delalloc_bytes += len;
2468 		if (bits & EXTENT_DEFRAG)
2469 			inode->defrag_bytes += len;
2470 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2471 					 &inode->runtime_flags))
2472 			btrfs_add_delalloc_inodes(root, inode);
2473 		spin_unlock(&inode->lock);
2474 	}
2475 
2476 	if (!(state->state & EXTENT_DELALLOC_NEW) &&
2477 	    (bits & EXTENT_DELALLOC_NEW)) {
2478 		spin_lock(&inode->lock);
2479 		inode->new_delalloc_bytes += state->end + 1 - state->start;
2480 		spin_unlock(&inode->lock);
2481 	}
2482 }
2483 
2484 /*
2485  * Once a range is no longer delalloc this function ensures that proper
2486  * accounting happens.
2487  */
btrfs_clear_delalloc_extent(struct btrfs_inode * inode,struct extent_state * state,u32 bits)2488 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2489 				 struct extent_state *state, u32 bits)
2490 {
2491 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2492 	u64 len = state->end + 1 - state->start;
2493 	u32 num_extents = count_max_extents(fs_info, len);
2494 
2495 	if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2496 		spin_lock(&inode->lock);
2497 		inode->defrag_bytes -= len;
2498 		spin_unlock(&inode->lock);
2499 	}
2500 
2501 	/*
2502 	 * set_bit and clear bit hooks normally require _irqsave/restore
2503 	 * but in this case, we are only testing for the DELALLOC
2504 	 * bit, which is only set or cleared with irqs on
2505 	 */
2506 	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2507 		struct btrfs_root *root = inode->root;
2508 		bool do_list = !btrfs_is_free_space_inode(inode);
2509 
2510 		spin_lock(&inode->lock);
2511 		btrfs_mod_outstanding_extents(inode, -num_extents);
2512 		spin_unlock(&inode->lock);
2513 
2514 		/*
2515 		 * We don't reserve metadata space for space cache inodes so we
2516 		 * don't need to call delalloc_release_metadata if there is an
2517 		 * error.
2518 		 */
2519 		if (bits & EXTENT_CLEAR_META_RESV &&
2520 		    root != fs_info->tree_root)
2521 			btrfs_delalloc_release_metadata(inode, len, true);
2522 
2523 		/* For sanity tests. */
2524 		if (btrfs_is_testing(fs_info))
2525 			return;
2526 
2527 		if (!btrfs_is_data_reloc_root(root) &&
2528 		    do_list && !(state->state & EXTENT_NORESERVE) &&
2529 		    (bits & EXTENT_CLEAR_DATA_RESV))
2530 			btrfs_free_reserved_data_space_noquota(fs_info, len);
2531 
2532 		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2533 					 fs_info->delalloc_batch);
2534 		spin_lock(&inode->lock);
2535 		inode->delalloc_bytes -= len;
2536 		if (do_list && inode->delalloc_bytes == 0 &&
2537 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
2538 					&inode->runtime_flags))
2539 			btrfs_del_delalloc_inode(root, inode);
2540 		spin_unlock(&inode->lock);
2541 	}
2542 
2543 	if ((state->state & EXTENT_DELALLOC_NEW) &&
2544 	    (bits & EXTENT_DELALLOC_NEW)) {
2545 		spin_lock(&inode->lock);
2546 		ASSERT(inode->new_delalloc_bytes >= len);
2547 		inode->new_delalloc_bytes -= len;
2548 		if (bits & EXTENT_ADD_INODE_BYTES)
2549 			inode_add_bytes(&inode->vfs_inode, len);
2550 		spin_unlock(&inode->lock);
2551 	}
2552 }
2553 
btrfs_extract_ordered_extent(struct btrfs_bio * bbio,struct btrfs_ordered_extent * ordered)2554 static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio,
2555 					struct btrfs_ordered_extent *ordered)
2556 {
2557 	u64 start = (u64)bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT;
2558 	u64 len = bbio->bio.bi_iter.bi_size;
2559 	struct btrfs_ordered_extent *new;
2560 	int ret;
2561 
2562 	/* Must always be called for the beginning of an ordered extent. */
2563 	if (WARN_ON_ONCE(start != ordered->disk_bytenr))
2564 		return -EINVAL;
2565 
2566 	/* No need to split if the ordered extent covers the entire bio. */
2567 	if (ordered->disk_num_bytes == len) {
2568 		refcount_inc(&ordered->refs);
2569 		bbio->ordered = ordered;
2570 		return 0;
2571 	}
2572 
2573 	/*
2574 	 * Don't split the extent_map for NOCOW extents, as we're writing into
2575 	 * a pre-existing one.
2576 	 */
2577 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
2578 		ret = split_extent_map(bbio->inode, bbio->file_offset,
2579 				       ordered->num_bytes, len,
2580 				       ordered->disk_bytenr);
2581 		if (ret)
2582 			return ret;
2583 	}
2584 
2585 	new = btrfs_split_ordered_extent(ordered, len);
2586 	if (IS_ERR(new))
2587 		return PTR_ERR(new);
2588 	bbio->ordered = new;
2589 	return 0;
2590 }
2591 
2592 /*
2593  * given a list of ordered sums record them in the inode.  This happens
2594  * at IO completion time based on sums calculated at bio submission time.
2595  */
add_pending_csums(struct btrfs_trans_handle * trans,struct list_head * list)2596 static int add_pending_csums(struct btrfs_trans_handle *trans,
2597 			     struct list_head *list)
2598 {
2599 	struct btrfs_ordered_sum *sum;
2600 	struct btrfs_root *csum_root = NULL;
2601 	int ret;
2602 
2603 	list_for_each_entry(sum, list, list) {
2604 		trans->adding_csums = true;
2605 		if (!csum_root)
2606 			csum_root = btrfs_csum_root(trans->fs_info,
2607 						    sum->logical);
2608 		ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2609 		trans->adding_csums = false;
2610 		if (ret)
2611 			return ret;
2612 	}
2613 	return 0;
2614 }
2615 
btrfs_find_new_delalloc_bytes(struct btrfs_inode * inode,const u64 start,const u64 len,struct extent_state ** cached_state)2616 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2617 					 const u64 start,
2618 					 const u64 len,
2619 					 struct extent_state **cached_state)
2620 {
2621 	u64 search_start = start;
2622 	const u64 end = start + len - 1;
2623 
2624 	while (search_start < end) {
2625 		const u64 search_len = end - search_start + 1;
2626 		struct extent_map *em;
2627 		u64 em_len;
2628 		int ret = 0;
2629 
2630 		em = btrfs_get_extent(inode, NULL, 0, search_start, search_len);
2631 		if (IS_ERR(em))
2632 			return PTR_ERR(em);
2633 
2634 		if (em->block_start != EXTENT_MAP_HOLE)
2635 			goto next;
2636 
2637 		em_len = em->len;
2638 		if (em->start < search_start)
2639 			em_len -= search_start - em->start;
2640 		if (em_len > search_len)
2641 			em_len = search_len;
2642 
2643 		ret = set_extent_bit(&inode->io_tree, search_start,
2644 				     search_start + em_len - 1,
2645 				     EXTENT_DELALLOC_NEW, cached_state);
2646 next:
2647 		search_start = extent_map_end(em);
2648 		free_extent_map(em);
2649 		if (ret)
2650 			return ret;
2651 	}
2652 	return 0;
2653 }
2654 
btrfs_set_extent_delalloc(struct btrfs_inode * inode,u64 start,u64 end,unsigned int extra_bits,struct extent_state ** cached_state)2655 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2656 			      unsigned int extra_bits,
2657 			      struct extent_state **cached_state)
2658 {
2659 	WARN_ON(PAGE_ALIGNED(end));
2660 
2661 	if (start >= i_size_read(&inode->vfs_inode) &&
2662 	    !(inode->flags & BTRFS_INODE_PREALLOC)) {
2663 		/*
2664 		 * There can't be any extents following eof in this case so just
2665 		 * set the delalloc new bit for the range directly.
2666 		 */
2667 		extra_bits |= EXTENT_DELALLOC_NEW;
2668 	} else {
2669 		int ret;
2670 
2671 		ret = btrfs_find_new_delalloc_bytes(inode, start,
2672 						    end + 1 - start,
2673 						    cached_state);
2674 		if (ret)
2675 			return ret;
2676 	}
2677 
2678 	return set_extent_bit(&inode->io_tree, start, end,
2679 			      EXTENT_DELALLOC | extra_bits, cached_state);
2680 }
2681 
2682 /* see btrfs_writepage_start_hook for details on why this is required */
2683 struct btrfs_writepage_fixup {
2684 	struct page *page;
2685 	struct btrfs_inode *inode;
2686 	struct btrfs_work work;
2687 };
2688 
btrfs_writepage_fixup_worker(struct btrfs_work * work)2689 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2690 {
2691 	struct btrfs_writepage_fixup *fixup =
2692 		container_of(work, struct btrfs_writepage_fixup, work);
2693 	struct btrfs_ordered_extent *ordered;
2694 	struct extent_state *cached_state = NULL;
2695 	struct extent_changeset *data_reserved = NULL;
2696 	struct page *page = fixup->page;
2697 	struct btrfs_inode *inode = fixup->inode;
2698 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
2699 	u64 page_start = page_offset(page);
2700 	u64 page_end = page_offset(page) + PAGE_SIZE - 1;
2701 	int ret = 0;
2702 	bool free_delalloc_space = true;
2703 
2704 	/*
2705 	 * This is similar to page_mkwrite, we need to reserve the space before
2706 	 * we take the page lock.
2707 	 */
2708 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2709 					   PAGE_SIZE);
2710 again:
2711 	lock_page(page);
2712 
2713 	/*
2714 	 * Before we queued this fixup, we took a reference on the page.
2715 	 * page->mapping may go NULL, but it shouldn't be moved to a different
2716 	 * address space.
2717 	 */
2718 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2719 		/*
2720 		 * Unfortunately this is a little tricky, either
2721 		 *
2722 		 * 1) We got here and our page had already been dealt with and
2723 		 *    we reserved our space, thus ret == 0, so we need to just
2724 		 *    drop our space reservation and bail.  This can happen the
2725 		 *    first time we come into the fixup worker, or could happen
2726 		 *    while waiting for the ordered extent.
2727 		 * 2) Our page was already dealt with, but we happened to get an
2728 		 *    ENOSPC above from the btrfs_delalloc_reserve_space.  In
2729 		 *    this case we obviously don't have anything to release, but
2730 		 *    because the page was already dealt with we don't want to
2731 		 *    mark the page with an error, so make sure we're resetting
2732 		 *    ret to 0.  This is why we have this check _before_ the ret
2733 		 *    check, because we do not want to have a surprise ENOSPC
2734 		 *    when the page was already properly dealt with.
2735 		 */
2736 		if (!ret) {
2737 			btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2738 			btrfs_delalloc_release_space(inode, data_reserved,
2739 						     page_start, PAGE_SIZE,
2740 						     true);
2741 		}
2742 		ret = 0;
2743 		goto out_page;
2744 	}
2745 
2746 	/*
2747 	 * We can't mess with the page state unless it is locked, so now that
2748 	 * it is locked bail if we failed to make our space reservation.
2749 	 */
2750 	if (ret)
2751 		goto out_page;
2752 
2753 	lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2754 
2755 	/* already ordered? We're done */
2756 	if (PageOrdered(page))
2757 		goto out_reserved;
2758 
2759 	ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2760 	if (ordered) {
2761 		unlock_extent(&inode->io_tree, page_start, page_end,
2762 			      &cached_state);
2763 		unlock_page(page);
2764 		btrfs_start_ordered_extent(ordered);
2765 		btrfs_put_ordered_extent(ordered);
2766 		goto again;
2767 	}
2768 
2769 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2770 					&cached_state);
2771 	if (ret)
2772 		goto out_reserved;
2773 
2774 	/*
2775 	 * Everything went as planned, we're now the owner of a dirty page with
2776 	 * delayed allocation bits set and space reserved for our COW
2777 	 * destination.
2778 	 *
2779 	 * The page was dirty when we started, nothing should have cleaned it.
2780 	 */
2781 	BUG_ON(!PageDirty(page));
2782 	free_delalloc_space = false;
2783 out_reserved:
2784 	btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2785 	if (free_delalloc_space)
2786 		btrfs_delalloc_release_space(inode, data_reserved, page_start,
2787 					     PAGE_SIZE, true);
2788 	unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2789 out_page:
2790 	if (ret) {
2791 		/*
2792 		 * We hit ENOSPC or other errors.  Update the mapping and page
2793 		 * to reflect the errors and clean the page.
2794 		 */
2795 		mapping_set_error(page->mapping, ret);
2796 		btrfs_mark_ordered_io_finished(inode, page, page_start,
2797 					       PAGE_SIZE, !ret);
2798 		clear_page_dirty_for_io(page);
2799 	}
2800 	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
2801 	unlock_page(page);
2802 	put_page(page);
2803 	kfree(fixup);
2804 	extent_changeset_free(data_reserved);
2805 	/*
2806 	 * As a precaution, do a delayed iput in case it would be the last iput
2807 	 * that could need flushing space. Recursing back to fixup worker would
2808 	 * deadlock.
2809 	 */
2810 	btrfs_add_delayed_iput(inode);
2811 }
2812 
2813 /*
2814  * There are a few paths in the higher layers of the kernel that directly
2815  * set the page dirty bit without asking the filesystem if it is a
2816  * good idea.  This causes problems because we want to make sure COW
2817  * properly happens and the data=ordered rules are followed.
2818  *
2819  * In our case any range that doesn't have the ORDERED bit set
2820  * hasn't been properly setup for IO.  We kick off an async process
2821  * to fix it up.  The async helper will wait for ordered extents, set
2822  * the delalloc bit and make it safe to write the page.
2823  */
btrfs_writepage_cow_fixup(struct page * page)2824 int btrfs_writepage_cow_fixup(struct page *page)
2825 {
2826 	struct inode *inode = page->mapping->host;
2827 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2828 	struct btrfs_writepage_fixup *fixup;
2829 
2830 	/* This page has ordered extent covering it already */
2831 	if (PageOrdered(page))
2832 		return 0;
2833 
2834 	/*
2835 	 * PageChecked is set below when we create a fixup worker for this page,
2836 	 * don't try to create another one if we're already PageChecked()
2837 	 *
2838 	 * The extent_io writepage code will redirty the page if we send back
2839 	 * EAGAIN.
2840 	 */
2841 	if (PageChecked(page))
2842 		return -EAGAIN;
2843 
2844 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2845 	if (!fixup)
2846 		return -EAGAIN;
2847 
2848 	/*
2849 	 * We are already holding a reference to this inode from
2850 	 * write_cache_pages.  We need to hold it because the space reservation
2851 	 * takes place outside of the page lock, and we can't trust
2852 	 * page->mapping outside of the page lock.
2853 	 */
2854 	ihold(inode);
2855 	btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE);
2856 	get_page(page);
2857 	btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL);
2858 	fixup->page = page;
2859 	fixup->inode = BTRFS_I(inode);
2860 	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2861 
2862 	return -EAGAIN;
2863 }
2864 
insert_reserved_file_extent(struct btrfs_trans_handle * trans,struct btrfs_inode * inode,u64 file_pos,struct btrfs_file_extent_item * stack_fi,const bool update_inode_bytes,u64 qgroup_reserved)2865 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2866 				       struct btrfs_inode *inode, u64 file_pos,
2867 				       struct btrfs_file_extent_item *stack_fi,
2868 				       const bool update_inode_bytes,
2869 				       u64 qgroup_reserved)
2870 {
2871 	struct btrfs_root *root = inode->root;
2872 	const u64 sectorsize = root->fs_info->sectorsize;
2873 	struct btrfs_path *path;
2874 	struct extent_buffer *leaf;
2875 	struct btrfs_key ins;
2876 	u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2877 	u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2878 	u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2879 	u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2880 	u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2881 	struct btrfs_drop_extents_args drop_args = { 0 };
2882 	int ret;
2883 
2884 	path = btrfs_alloc_path();
2885 	if (!path)
2886 		return -ENOMEM;
2887 
2888 	/*
2889 	 * we may be replacing one extent in the tree with another.
2890 	 * The new extent is pinned in the extent map, and we don't want
2891 	 * to drop it from the cache until it is completely in the btree.
2892 	 *
2893 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2894 	 * the caller is expected to unpin it and allow it to be merged
2895 	 * with the others.
2896 	 */
2897 	drop_args.path = path;
2898 	drop_args.start = file_pos;
2899 	drop_args.end = file_pos + num_bytes;
2900 	drop_args.replace_extent = true;
2901 	drop_args.extent_item_size = sizeof(*stack_fi);
2902 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2903 	if (ret)
2904 		goto out;
2905 
2906 	if (!drop_args.extent_inserted) {
2907 		ins.objectid = btrfs_ino(inode);
2908 		ins.offset = file_pos;
2909 		ins.type = BTRFS_EXTENT_DATA_KEY;
2910 
2911 		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2912 					      sizeof(*stack_fi));
2913 		if (ret)
2914 			goto out;
2915 	}
2916 	leaf = path->nodes[0];
2917 	btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2918 	write_extent_buffer(leaf, stack_fi,
2919 			btrfs_item_ptr_offset(leaf, path->slots[0]),
2920 			sizeof(struct btrfs_file_extent_item));
2921 
2922 	btrfs_mark_buffer_dirty(trans, leaf);
2923 	btrfs_release_path(path);
2924 
2925 	/*
2926 	 * If we dropped an inline extent here, we know the range where it is
2927 	 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2928 	 * number of bytes only for that range containing the inline extent.
2929 	 * The remaining of the range will be processed when clearning the
2930 	 * EXTENT_DELALLOC_BIT bit through the ordered extent completion.
2931 	 */
2932 	if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2933 		u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2934 
2935 		inline_size = drop_args.bytes_found - inline_size;
2936 		btrfs_update_inode_bytes(inode, sectorsize, inline_size);
2937 		drop_args.bytes_found -= inline_size;
2938 		num_bytes -= sectorsize;
2939 	}
2940 
2941 	if (update_inode_bytes)
2942 		btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
2943 
2944 	ins.objectid = disk_bytenr;
2945 	ins.offset = disk_num_bytes;
2946 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2947 
2948 	ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
2949 	if (ret)
2950 		goto out;
2951 
2952 	ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
2953 					       file_pos - offset,
2954 					       qgroup_reserved, &ins);
2955 out:
2956 	btrfs_free_path(path);
2957 
2958 	return ret;
2959 }
2960 
btrfs_release_delalloc_bytes(struct btrfs_fs_info * fs_info,u64 start,u64 len)2961 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2962 					 u64 start, u64 len)
2963 {
2964 	struct btrfs_block_group *cache;
2965 
2966 	cache = btrfs_lookup_block_group(fs_info, start);
2967 	ASSERT(cache);
2968 
2969 	spin_lock(&cache->lock);
2970 	cache->delalloc_bytes -= len;
2971 	spin_unlock(&cache->lock);
2972 
2973 	btrfs_put_block_group(cache);
2974 }
2975 
insert_ordered_extent_file_extent(struct btrfs_trans_handle * trans,struct btrfs_ordered_extent * oe)2976 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
2977 					     struct btrfs_ordered_extent *oe)
2978 {
2979 	struct btrfs_file_extent_item stack_fi;
2980 	bool update_inode_bytes;
2981 	u64 num_bytes = oe->num_bytes;
2982 	u64 ram_bytes = oe->ram_bytes;
2983 
2984 	memset(&stack_fi, 0, sizeof(stack_fi));
2985 	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
2986 	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
2987 	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
2988 						   oe->disk_num_bytes);
2989 	btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
2990 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
2991 		num_bytes = oe->truncated_len;
2992 		ram_bytes = num_bytes;
2993 	}
2994 	btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
2995 	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
2996 	btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
2997 	/* Encryption and other encoding is reserved and all 0 */
2998 
2999 	/*
3000 	 * For delalloc, when completing an ordered extent we update the inode's
3001 	 * bytes when clearing the range in the inode's io tree, so pass false
3002 	 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3003 	 * except if the ordered extent was truncated.
3004 	 */
3005 	update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3006 			     test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3007 			     test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3008 
3009 	return insert_reserved_file_extent(trans, BTRFS_I(oe->inode),
3010 					   oe->file_offset, &stack_fi,
3011 					   update_inode_bytes, oe->qgroup_rsv);
3012 }
3013 
3014 /*
3015  * As ordered data IO finishes, this gets called so we can finish
3016  * an ordered extent if the range of bytes in the file it covers are
3017  * fully written.
3018  */
btrfs_finish_one_ordered(struct btrfs_ordered_extent * ordered_extent)3019 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3020 {
3021 	struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
3022 	struct btrfs_root *root = inode->root;
3023 	struct btrfs_fs_info *fs_info = root->fs_info;
3024 	struct btrfs_trans_handle *trans = NULL;
3025 	struct extent_io_tree *io_tree = &inode->io_tree;
3026 	struct extent_state *cached_state = NULL;
3027 	u64 start, end;
3028 	int compress_type = 0;
3029 	int ret = 0;
3030 	u64 logical_len = ordered_extent->num_bytes;
3031 	bool freespace_inode;
3032 	bool truncated = false;
3033 	bool clear_reserved_extent = true;
3034 	unsigned int clear_bits = EXTENT_DEFRAG;
3035 
3036 	start = ordered_extent->file_offset;
3037 	end = start + ordered_extent->num_bytes - 1;
3038 
3039 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3040 	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3041 	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3042 	    !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3043 		clear_bits |= EXTENT_DELALLOC_NEW;
3044 
3045 	freespace_inode = btrfs_is_free_space_inode(inode);
3046 	if (!freespace_inode)
3047 		btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3048 
3049 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3050 		ret = -EIO;
3051 		goto out;
3052 	}
3053 
3054 	if (btrfs_is_zoned(fs_info))
3055 		btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3056 					ordered_extent->disk_num_bytes);
3057 
3058 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3059 		truncated = true;
3060 		logical_len = ordered_extent->truncated_len;
3061 		/* Truncated the entire extent, don't bother adding */
3062 		if (!logical_len)
3063 			goto out;
3064 	}
3065 
3066 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3067 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
3068 
3069 		btrfs_inode_safe_disk_i_size_write(inode, 0);
3070 		if (freespace_inode)
3071 			trans = btrfs_join_transaction_spacecache(root);
3072 		else
3073 			trans = btrfs_join_transaction(root);
3074 		if (IS_ERR(trans)) {
3075 			ret = PTR_ERR(trans);
3076 			trans = NULL;
3077 			goto out;
3078 		}
3079 		trans->block_rsv = &inode->block_rsv;
3080 		ret = btrfs_update_inode_fallback(trans, root, inode);
3081 		if (ret) /* -ENOMEM or corruption */
3082 			btrfs_abort_transaction(trans, ret);
3083 		goto out;
3084 	}
3085 
3086 	clear_bits |= EXTENT_LOCKED;
3087 	lock_extent(io_tree, start, end, &cached_state);
3088 
3089 	if (freespace_inode)
3090 		trans = btrfs_join_transaction_spacecache(root);
3091 	else
3092 		trans = btrfs_join_transaction(root);
3093 	if (IS_ERR(trans)) {
3094 		ret = PTR_ERR(trans);
3095 		trans = NULL;
3096 		goto out;
3097 	}
3098 
3099 	trans->block_rsv = &inode->block_rsv;
3100 
3101 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3102 		compress_type = ordered_extent->compress_type;
3103 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3104 		BUG_ON(compress_type);
3105 		ret = btrfs_mark_extent_written(trans, inode,
3106 						ordered_extent->file_offset,
3107 						ordered_extent->file_offset +
3108 						logical_len);
3109 		btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3110 						  ordered_extent->disk_num_bytes);
3111 	} else {
3112 		BUG_ON(root == fs_info->tree_root);
3113 		ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3114 		if (!ret) {
3115 			clear_reserved_extent = false;
3116 			btrfs_release_delalloc_bytes(fs_info,
3117 						ordered_extent->disk_bytenr,
3118 						ordered_extent->disk_num_bytes);
3119 		}
3120 	}
3121 	unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset,
3122 			   ordered_extent->num_bytes, trans->transid);
3123 	if (ret < 0) {
3124 		btrfs_abort_transaction(trans, ret);
3125 		goto out;
3126 	}
3127 
3128 	ret = add_pending_csums(trans, &ordered_extent->list);
3129 	if (ret) {
3130 		btrfs_abort_transaction(trans, ret);
3131 		goto out;
3132 	}
3133 
3134 	/*
3135 	 * If this is a new delalloc range, clear its new delalloc flag to
3136 	 * update the inode's number of bytes. This needs to be done first
3137 	 * before updating the inode item.
3138 	 */
3139 	if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3140 	    !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3141 		clear_extent_bit(&inode->io_tree, start, end,
3142 				 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3143 				 &cached_state);
3144 
3145 	btrfs_inode_safe_disk_i_size_write(inode, 0);
3146 	ret = btrfs_update_inode_fallback(trans, root, inode);
3147 	if (ret) { /* -ENOMEM or corruption */
3148 		btrfs_abort_transaction(trans, ret);
3149 		goto out;
3150 	}
3151 	ret = 0;
3152 out:
3153 	clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3154 			 &cached_state);
3155 
3156 	if (trans)
3157 		btrfs_end_transaction(trans);
3158 
3159 	if (ret || truncated) {
3160 		u64 unwritten_start = start;
3161 
3162 		/*
3163 		 * If we failed to finish this ordered extent for any reason we
3164 		 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3165 		 * extent, and mark the inode with the error if it wasn't
3166 		 * already set.  Any error during writeback would have already
3167 		 * set the mapping error, so we need to set it if we're the ones
3168 		 * marking this ordered extent as failed.
3169 		 */
3170 		if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
3171 					     &ordered_extent->flags))
3172 			mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
3173 
3174 		if (truncated)
3175 			unwritten_start += logical_len;
3176 		clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3177 
3178 		/*
3179 		 * Drop extent maps for the part of the extent we didn't write.
3180 		 *
3181 		 * We have an exception here for the free_space_inode, this is
3182 		 * because when we do btrfs_get_extent() on the free space inode
3183 		 * we will search the commit root.  If this is a new block group
3184 		 * we won't find anything, and we will trip over the assert in
3185 		 * writepage where we do ASSERT(em->block_start !=
3186 		 * EXTENT_MAP_HOLE).
3187 		 *
3188 		 * Theoretically we could also skip this for any NOCOW extent as
3189 		 * we don't mess with the extent map tree in the NOCOW case, but
3190 		 * for now simply skip this if we are the free space inode.
3191 		 */
3192 		if (!btrfs_is_free_space_inode(inode))
3193 			btrfs_drop_extent_map_range(inode, unwritten_start,
3194 						    end, false);
3195 
3196 		/*
3197 		 * If the ordered extent had an IOERR or something else went
3198 		 * wrong we need to return the space for this ordered extent
3199 		 * back to the allocator.  We only free the extent in the
3200 		 * truncated case if we didn't write out the extent at all.
3201 		 *
3202 		 * If we made it past insert_reserved_file_extent before we
3203 		 * errored out then we don't need to do this as the accounting
3204 		 * has already been done.
3205 		 */
3206 		if ((ret || !logical_len) &&
3207 		    clear_reserved_extent &&
3208 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3209 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3210 			/*
3211 			 * Discard the range before returning it back to the
3212 			 * free space pool
3213 			 */
3214 			if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3215 				btrfs_discard_extent(fs_info,
3216 						ordered_extent->disk_bytenr,
3217 						ordered_extent->disk_num_bytes,
3218 						NULL);
3219 			btrfs_free_reserved_extent(fs_info,
3220 					ordered_extent->disk_bytenr,
3221 					ordered_extent->disk_num_bytes, 1);
3222 			/*
3223 			 * Actually free the qgroup rsv which was released when
3224 			 * the ordered extent was created.
3225 			 */
3226 			btrfs_qgroup_free_refroot(fs_info, inode->root->root_key.objectid,
3227 						  ordered_extent->qgroup_rsv,
3228 						  BTRFS_QGROUP_RSV_DATA);
3229 		}
3230 	}
3231 
3232 	/*
3233 	 * This needs to be done to make sure anybody waiting knows we are done
3234 	 * updating everything for this ordered extent.
3235 	 */
3236 	btrfs_remove_ordered_extent(inode, ordered_extent);
3237 
3238 	/* once for us */
3239 	btrfs_put_ordered_extent(ordered_extent);
3240 	/* once for the tree */
3241 	btrfs_put_ordered_extent(ordered_extent);
3242 
3243 	return ret;
3244 }
3245 
btrfs_finish_ordered_io(struct btrfs_ordered_extent * ordered)3246 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3247 {
3248 	if (btrfs_is_zoned(btrfs_sb(ordered->inode->i_sb)) &&
3249 	    !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))
3250 		btrfs_finish_ordered_zoned(ordered);
3251 	return btrfs_finish_one_ordered(ordered);
3252 }
3253 
3254 /*
3255  * Verify the checksum for a single sector without any extra action that depend
3256  * on the type of I/O.
3257  */
btrfs_check_sector_csum(struct btrfs_fs_info * fs_info,struct page * page,u32 pgoff,u8 * csum,const u8 * const csum_expected)3258 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3259 			    u32 pgoff, u8 *csum, const u8 * const csum_expected)
3260 {
3261 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3262 	char *kaddr;
3263 
3264 	ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3265 
3266 	shash->tfm = fs_info->csum_shash;
3267 
3268 	kaddr = kmap_local_page(page) + pgoff;
3269 	crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3270 	kunmap_local(kaddr);
3271 
3272 	if (memcmp(csum, csum_expected, fs_info->csum_size))
3273 		return -EIO;
3274 	return 0;
3275 }
3276 
3277 /*
3278  * Verify the checksum of a single data sector.
3279  *
3280  * @bbio:	btrfs_io_bio which contains the csum
3281  * @dev:	device the sector is on
3282  * @bio_offset:	offset to the beginning of the bio (in bytes)
3283  * @bv:		bio_vec to check
3284  *
3285  * Check if the checksum on a data block is valid.  When a checksum mismatch is
3286  * detected, report the error and fill the corrupted range with zero.
3287  *
3288  * Return %true if the sector is ok or had no checksum to start with, else %false.
3289  */
btrfs_data_csum_ok(struct btrfs_bio * bbio,struct btrfs_device * dev,u32 bio_offset,struct bio_vec * bv)3290 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3291 			u32 bio_offset, struct bio_vec *bv)
3292 {
3293 	struct btrfs_inode *inode = bbio->inode;
3294 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3295 	u64 file_offset = bbio->file_offset + bio_offset;
3296 	u64 end = file_offset + bv->bv_len - 1;
3297 	u8 *csum_expected;
3298 	u8 csum[BTRFS_CSUM_SIZE];
3299 
3300 	ASSERT(bv->bv_len == fs_info->sectorsize);
3301 
3302 	if (!bbio->csum)
3303 		return true;
3304 
3305 	if (btrfs_is_data_reloc_root(inode->root) &&
3306 	    test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3307 			   1, NULL)) {
3308 		/* Skip the range without csum for data reloc inode */
3309 		clear_extent_bits(&inode->io_tree, file_offset, end,
3310 				  EXTENT_NODATASUM);
3311 		return true;
3312 	}
3313 
3314 	csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3315 				fs_info->csum_size;
3316 	if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3317 				    csum_expected))
3318 		goto zeroit;
3319 	return true;
3320 
3321 zeroit:
3322 	btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3323 				    bbio->mirror_num);
3324 	if (dev)
3325 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3326 	memzero_bvec(bv);
3327 	return false;
3328 }
3329 
3330 /*
3331  * btrfs_add_delayed_iput - perform a delayed iput on @inode
3332  *
3333  * @inode: The inode we want to perform iput on
3334  *
3335  * This function uses the generic vfs_inode::i_count to track whether we should
3336  * just decrement it (in case it's > 1) or if this is the last iput then link
3337  * the inode to the delayed iput machinery. Delayed iputs are processed at
3338  * transaction commit time/superblock commit/cleaner kthread.
3339  */
btrfs_add_delayed_iput(struct btrfs_inode * inode)3340 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3341 {
3342 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
3343 	unsigned long flags;
3344 
3345 	if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3346 		return;
3347 
3348 	atomic_inc(&fs_info->nr_delayed_iputs);
3349 	/*
3350 	 * Need to be irq safe here because we can be called from either an irq
3351 	 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3352 	 * context.
3353 	 */
3354 	spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3355 	ASSERT(list_empty(&inode->delayed_iput));
3356 	list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3357 	spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3358 	if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3359 		wake_up_process(fs_info->cleaner_kthread);
3360 }
3361 
run_delayed_iput_locked(struct btrfs_fs_info * fs_info,struct btrfs_inode * inode)3362 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3363 				    struct btrfs_inode *inode)
3364 {
3365 	list_del_init(&inode->delayed_iput);
3366 	spin_unlock_irq(&fs_info->delayed_iput_lock);
3367 	iput(&inode->vfs_inode);
3368 	if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3369 		wake_up(&fs_info->delayed_iputs_wait);
3370 	spin_lock_irq(&fs_info->delayed_iput_lock);
3371 }
3372 
btrfs_run_delayed_iput(struct btrfs_fs_info * fs_info,struct btrfs_inode * inode)3373 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3374 				   struct btrfs_inode *inode)
3375 {
3376 	if (!list_empty(&inode->delayed_iput)) {
3377 		spin_lock_irq(&fs_info->delayed_iput_lock);
3378 		if (!list_empty(&inode->delayed_iput))
3379 			run_delayed_iput_locked(fs_info, inode);
3380 		spin_unlock_irq(&fs_info->delayed_iput_lock);
3381 	}
3382 }
3383 
btrfs_run_delayed_iputs(struct btrfs_fs_info * fs_info)3384 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3385 {
3386 	/*
3387 	 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3388 	 * calls btrfs_add_delayed_iput() and that needs to lock
3389 	 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3390 	 * prevent a deadlock.
3391 	 */
3392 	spin_lock_irq(&fs_info->delayed_iput_lock);
3393 	while (!list_empty(&fs_info->delayed_iputs)) {
3394 		struct btrfs_inode *inode;
3395 
3396 		inode = list_first_entry(&fs_info->delayed_iputs,
3397 				struct btrfs_inode, delayed_iput);
3398 		run_delayed_iput_locked(fs_info, inode);
3399 		if (need_resched()) {
3400 			spin_unlock_irq(&fs_info->delayed_iput_lock);
3401 			cond_resched();
3402 			spin_lock_irq(&fs_info->delayed_iput_lock);
3403 		}
3404 	}
3405 	spin_unlock_irq(&fs_info->delayed_iput_lock);
3406 }
3407 
3408 /*
3409  * Wait for flushing all delayed iputs
3410  *
3411  * @fs_info:  the filesystem
3412  *
3413  * This will wait on any delayed iputs that are currently running with KILLABLE
3414  * set.  Once they are all done running we will return, unless we are killed in
3415  * which case we return EINTR. This helps in user operations like fallocate etc
3416  * that might get blocked on the iputs.
3417  *
3418  * Return EINTR if we were killed, 0 if nothing's pending
3419  */
btrfs_wait_on_delayed_iputs(struct btrfs_fs_info * fs_info)3420 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3421 {
3422 	int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3423 			atomic_read(&fs_info->nr_delayed_iputs) == 0);
3424 	if (ret)
3425 		return -EINTR;
3426 	return 0;
3427 }
3428 
3429 /*
3430  * This creates an orphan entry for the given inode in case something goes wrong
3431  * in the middle of an unlink.
3432  */
btrfs_orphan_add(struct btrfs_trans_handle * trans,struct btrfs_inode * inode)3433 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3434 		     struct btrfs_inode *inode)
3435 {
3436 	int ret;
3437 
3438 	ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3439 	if (ret && ret != -EEXIST) {
3440 		btrfs_abort_transaction(trans, ret);
3441 		return ret;
3442 	}
3443 
3444 	return 0;
3445 }
3446 
3447 /*
3448  * We have done the delete so we can go ahead and remove the orphan item for
3449  * this particular inode.
3450  */
btrfs_orphan_del(struct btrfs_trans_handle * trans,struct btrfs_inode * inode)3451 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3452 			    struct btrfs_inode *inode)
3453 {
3454 	return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3455 }
3456 
3457 /*
3458  * this cleans up any orphans that may be left on the list from the last use
3459  * of this root.
3460  */
btrfs_orphan_cleanup(struct btrfs_root * root)3461 int btrfs_orphan_cleanup(struct btrfs_root *root)
3462 {
3463 	struct btrfs_fs_info *fs_info = root->fs_info;
3464 	struct btrfs_path *path;
3465 	struct extent_buffer *leaf;
3466 	struct btrfs_key key, found_key;
3467 	struct btrfs_trans_handle *trans;
3468 	struct inode *inode;
3469 	u64 last_objectid = 0;
3470 	int ret = 0, nr_unlink = 0;
3471 
3472 	if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3473 		return 0;
3474 
3475 	path = btrfs_alloc_path();
3476 	if (!path) {
3477 		ret = -ENOMEM;
3478 		goto out;
3479 	}
3480 	path->reada = READA_BACK;
3481 
3482 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3483 	key.type = BTRFS_ORPHAN_ITEM_KEY;
3484 	key.offset = (u64)-1;
3485 
3486 	while (1) {
3487 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3488 		if (ret < 0)
3489 			goto out;
3490 
3491 		/*
3492 		 * if ret == 0 means we found what we were searching for, which
3493 		 * is weird, but possible, so only screw with path if we didn't
3494 		 * find the key and see if we have stuff that matches
3495 		 */
3496 		if (ret > 0) {
3497 			ret = 0;
3498 			if (path->slots[0] == 0)
3499 				break;
3500 			path->slots[0]--;
3501 		}
3502 
3503 		/* pull out the item */
3504 		leaf = path->nodes[0];
3505 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3506 
3507 		/* make sure the item matches what we want */
3508 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3509 			break;
3510 		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3511 			break;
3512 
3513 		/* release the path since we're done with it */
3514 		btrfs_release_path(path);
3515 
3516 		/*
3517 		 * this is where we are basically btrfs_lookup, without the
3518 		 * crossing root thing.  we store the inode number in the
3519 		 * offset of the orphan item.
3520 		 */
3521 
3522 		if (found_key.offset == last_objectid) {
3523 			/*
3524 			 * We found the same inode as before. This means we were
3525 			 * not able to remove its items via eviction triggered
3526 			 * by an iput(). A transaction abort may have happened,
3527 			 * due to -ENOSPC for example, so try to grab the error
3528 			 * that lead to a transaction abort, if any.
3529 			 */
3530 			btrfs_err(fs_info,
3531 				  "Error removing orphan entry, stopping orphan cleanup");
3532 			ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3533 			goto out;
3534 		}
3535 
3536 		last_objectid = found_key.offset;
3537 
3538 		found_key.objectid = found_key.offset;
3539 		found_key.type = BTRFS_INODE_ITEM_KEY;
3540 		found_key.offset = 0;
3541 		inode = btrfs_iget(fs_info->sb, last_objectid, root);
3542 		if (IS_ERR(inode)) {
3543 			ret = PTR_ERR(inode);
3544 			inode = NULL;
3545 			if (ret != -ENOENT)
3546 				goto out;
3547 		}
3548 
3549 		if (!inode && root == fs_info->tree_root) {
3550 			struct btrfs_root *dead_root;
3551 			int is_dead_root = 0;
3552 
3553 			/*
3554 			 * This is an orphan in the tree root. Currently these
3555 			 * could come from 2 sources:
3556 			 *  a) a root (snapshot/subvolume) deletion in progress
3557 			 *  b) a free space cache inode
3558 			 * We need to distinguish those two, as the orphan item
3559 			 * for a root must not get deleted before the deletion
3560 			 * of the snapshot/subvolume's tree completes.
3561 			 *
3562 			 * btrfs_find_orphan_roots() ran before us, which has
3563 			 * found all deleted roots and loaded them into
3564 			 * fs_info->fs_roots_radix. So here we can find if an
3565 			 * orphan item corresponds to a deleted root by looking
3566 			 * up the root from that radix tree.
3567 			 */
3568 
3569 			spin_lock(&fs_info->fs_roots_radix_lock);
3570 			dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3571 							 (unsigned long)found_key.objectid);
3572 			if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3573 				is_dead_root = 1;
3574 			spin_unlock(&fs_info->fs_roots_radix_lock);
3575 
3576 			if (is_dead_root) {
3577 				/* prevent this orphan from being found again */
3578 				key.offset = found_key.objectid - 1;
3579 				continue;
3580 			}
3581 
3582 		}
3583 
3584 		/*
3585 		 * If we have an inode with links, there are a couple of
3586 		 * possibilities:
3587 		 *
3588 		 * 1. We were halfway through creating fsverity metadata for the
3589 		 * file. In that case, the orphan item represents incomplete
3590 		 * fsverity metadata which must be cleaned up with
3591 		 * btrfs_drop_verity_items and deleting the orphan item.
3592 
3593 		 * 2. Old kernels (before v3.12) used to create an
3594 		 * orphan item for truncate indicating that there were possibly
3595 		 * extent items past i_size that needed to be deleted. In v3.12,
3596 		 * truncate was changed to update i_size in sync with the extent
3597 		 * items, but the (useless) orphan item was still created. Since
3598 		 * v4.18, we don't create the orphan item for truncate at all.
3599 		 *
3600 		 * So, this item could mean that we need to do a truncate, but
3601 		 * only if this filesystem was last used on a pre-v3.12 kernel
3602 		 * and was not cleanly unmounted. The odds of that are quite
3603 		 * slim, and it's a pain to do the truncate now, so just delete
3604 		 * the orphan item.
3605 		 *
3606 		 * It's also possible that this orphan item was supposed to be
3607 		 * deleted but wasn't. The inode number may have been reused,
3608 		 * but either way, we can delete the orphan item.
3609 		 */
3610 		if (!inode || inode->i_nlink) {
3611 			if (inode) {
3612 				ret = btrfs_drop_verity_items(BTRFS_I(inode));
3613 				iput(inode);
3614 				inode = NULL;
3615 				if (ret)
3616 					goto out;
3617 			}
3618 			trans = btrfs_start_transaction(root, 1);
3619 			if (IS_ERR(trans)) {
3620 				ret = PTR_ERR(trans);
3621 				goto out;
3622 			}
3623 			btrfs_debug(fs_info, "auto deleting %Lu",
3624 				    found_key.objectid);
3625 			ret = btrfs_del_orphan_item(trans, root,
3626 						    found_key.objectid);
3627 			btrfs_end_transaction(trans);
3628 			if (ret)
3629 				goto out;
3630 			continue;
3631 		}
3632 
3633 		nr_unlink++;
3634 
3635 		/* this will do delete_inode and everything for us */
3636 		iput(inode);
3637 	}
3638 	/* release the path since we're done with it */
3639 	btrfs_release_path(path);
3640 
3641 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3642 		trans = btrfs_join_transaction(root);
3643 		if (!IS_ERR(trans))
3644 			btrfs_end_transaction(trans);
3645 	}
3646 
3647 	if (nr_unlink)
3648 		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3649 
3650 out:
3651 	if (ret)
3652 		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3653 	btrfs_free_path(path);
3654 	return ret;
3655 }
3656 
3657 /*
3658  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3659  * don't find any xattrs, we know there can't be any acls.
3660  *
3661  * slot is the slot the inode is in, objectid is the objectid of the inode
3662  */
acls_after_inode_item(struct extent_buffer * leaf,int slot,u64 objectid,int * first_xattr_slot)3663 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3664 					  int slot, u64 objectid,
3665 					  int *first_xattr_slot)
3666 {
3667 	u32 nritems = btrfs_header_nritems(leaf);
3668 	struct btrfs_key found_key;
3669 	static u64 xattr_access = 0;
3670 	static u64 xattr_default = 0;
3671 	int scanned = 0;
3672 
3673 	if (!xattr_access) {
3674 		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3675 					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3676 		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3677 					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3678 	}
3679 
3680 	slot++;
3681 	*first_xattr_slot = -1;
3682 	while (slot < nritems) {
3683 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3684 
3685 		/* we found a different objectid, there must not be acls */
3686 		if (found_key.objectid != objectid)
3687 			return 0;
3688 
3689 		/* we found an xattr, assume we've got an acl */
3690 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3691 			if (*first_xattr_slot == -1)
3692 				*first_xattr_slot = slot;
3693 			if (found_key.offset == xattr_access ||
3694 			    found_key.offset == xattr_default)
3695 				return 1;
3696 		}
3697 
3698 		/*
3699 		 * we found a key greater than an xattr key, there can't
3700 		 * be any acls later on
3701 		 */
3702 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3703 			return 0;
3704 
3705 		slot++;
3706 		scanned++;
3707 
3708 		/*
3709 		 * it goes inode, inode backrefs, xattrs, extents,
3710 		 * so if there are a ton of hard links to an inode there can
3711 		 * be a lot of backrefs.  Don't waste time searching too hard,
3712 		 * this is just an optimization
3713 		 */
3714 		if (scanned >= 8)
3715 			break;
3716 	}
3717 	/* we hit the end of the leaf before we found an xattr or
3718 	 * something larger than an xattr.  We have to assume the inode
3719 	 * has acls
3720 	 */
3721 	if (*first_xattr_slot == -1)
3722 		*first_xattr_slot = slot;
3723 	return 1;
3724 }
3725 
3726 /*
3727  * read an inode from the btree into the in-memory inode
3728  */
btrfs_read_locked_inode(struct inode * inode,struct btrfs_path * in_path)3729 static int btrfs_read_locked_inode(struct inode *inode,
3730 				   struct btrfs_path *in_path)
3731 {
3732 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3733 	struct btrfs_path *path = in_path;
3734 	struct extent_buffer *leaf;
3735 	struct btrfs_inode_item *inode_item;
3736 	struct btrfs_root *root = BTRFS_I(inode)->root;
3737 	struct btrfs_key location;
3738 	unsigned long ptr;
3739 	int maybe_acls;
3740 	u32 rdev;
3741 	int ret;
3742 	bool filled = false;
3743 	int first_xattr_slot;
3744 
3745 	ret = btrfs_fill_inode(inode, &rdev);
3746 	if (!ret)
3747 		filled = true;
3748 
3749 	if (!path) {
3750 		path = btrfs_alloc_path();
3751 		if (!path)
3752 			return -ENOMEM;
3753 	}
3754 
3755 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3756 
3757 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3758 	if (ret) {
3759 		if (path != in_path)
3760 			btrfs_free_path(path);
3761 		return ret;
3762 	}
3763 
3764 	leaf = path->nodes[0];
3765 
3766 	if (filled)
3767 		goto cache_index;
3768 
3769 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3770 				    struct btrfs_inode_item);
3771 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3772 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3773 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3774 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3775 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3776 	btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0,
3777 			round_up(i_size_read(inode), fs_info->sectorsize));
3778 
3779 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3780 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3781 
3782 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3783 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3784 
3785 	inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3786 			btrfs_timespec_nsec(leaf, &inode_item->ctime));
3787 
3788 	BTRFS_I(inode)->i_otime.tv_sec =
3789 		btrfs_timespec_sec(leaf, &inode_item->otime);
3790 	BTRFS_I(inode)->i_otime.tv_nsec =
3791 		btrfs_timespec_nsec(leaf, &inode_item->otime);
3792 
3793 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3794 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3795 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3796 
3797 	inode_set_iversion_queried(inode,
3798 				   btrfs_inode_sequence(leaf, inode_item));
3799 	inode->i_generation = BTRFS_I(inode)->generation;
3800 	inode->i_rdev = 0;
3801 	rdev = btrfs_inode_rdev(leaf, inode_item);
3802 
3803 	BTRFS_I(inode)->index_cnt = (u64)-1;
3804 	btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3805 				&BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags);
3806 
3807 cache_index:
3808 	/*
3809 	 * If we were modified in the current generation and evicted from memory
3810 	 * and then re-read we need to do a full sync since we don't have any
3811 	 * idea about which extents were modified before we were evicted from
3812 	 * cache.
3813 	 *
3814 	 * This is required for both inode re-read from disk and delayed inode
3815 	 * in delayed_nodes_tree.
3816 	 */
3817 	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3818 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3819 			&BTRFS_I(inode)->runtime_flags);
3820 
3821 	/*
3822 	 * We don't persist the id of the transaction where an unlink operation
3823 	 * against the inode was last made. So here we assume the inode might
3824 	 * have been evicted, and therefore the exact value of last_unlink_trans
3825 	 * lost, and set it to last_trans to avoid metadata inconsistencies
3826 	 * between the inode and its parent if the inode is fsync'ed and the log
3827 	 * replayed. For example, in the scenario:
3828 	 *
3829 	 * touch mydir/foo
3830 	 * ln mydir/foo mydir/bar
3831 	 * sync
3832 	 * unlink mydir/bar
3833 	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3834 	 * xfs_io -c fsync mydir/foo
3835 	 * <power failure>
3836 	 * mount fs, triggers fsync log replay
3837 	 *
3838 	 * We must make sure that when we fsync our inode foo we also log its
3839 	 * parent inode, otherwise after log replay the parent still has the
3840 	 * dentry with the "bar" name but our inode foo has a link count of 1
3841 	 * and doesn't have an inode ref with the name "bar" anymore.
3842 	 *
3843 	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3844 	 * but it guarantees correctness at the expense of occasional full
3845 	 * transaction commits on fsync if our inode is a directory, or if our
3846 	 * inode is not a directory, logging its parent unnecessarily.
3847 	 */
3848 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3849 
3850 	/*
3851 	 * Same logic as for last_unlink_trans. We don't persist the generation
3852 	 * of the last transaction where this inode was used for a reflink
3853 	 * operation, so after eviction and reloading the inode we must be
3854 	 * pessimistic and assume the last transaction that modified the inode.
3855 	 */
3856 	BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans;
3857 
3858 	path->slots[0]++;
3859 	if (inode->i_nlink != 1 ||
3860 	    path->slots[0] >= btrfs_header_nritems(leaf))
3861 		goto cache_acl;
3862 
3863 	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3864 	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3865 		goto cache_acl;
3866 
3867 	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3868 	if (location.type == BTRFS_INODE_REF_KEY) {
3869 		struct btrfs_inode_ref *ref;
3870 
3871 		ref = (struct btrfs_inode_ref *)ptr;
3872 		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3873 	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3874 		struct btrfs_inode_extref *extref;
3875 
3876 		extref = (struct btrfs_inode_extref *)ptr;
3877 		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3878 								     extref);
3879 	}
3880 cache_acl:
3881 	/*
3882 	 * try to precache a NULL acl entry for files that don't have
3883 	 * any xattrs or acls
3884 	 */
3885 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3886 			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3887 	if (first_xattr_slot != -1) {
3888 		path->slots[0] = first_xattr_slot;
3889 		ret = btrfs_load_inode_props(inode, path);
3890 		if (ret)
3891 			btrfs_err(fs_info,
3892 				  "error loading props for ino %llu (root %llu): %d",
3893 				  btrfs_ino(BTRFS_I(inode)),
3894 				  root->root_key.objectid, ret);
3895 	}
3896 	if (path != in_path)
3897 		btrfs_free_path(path);
3898 
3899 	if (!maybe_acls)
3900 		cache_no_acl(inode);
3901 
3902 	switch (inode->i_mode & S_IFMT) {
3903 	case S_IFREG:
3904 		inode->i_mapping->a_ops = &btrfs_aops;
3905 		inode->i_fop = &btrfs_file_operations;
3906 		inode->i_op = &btrfs_file_inode_operations;
3907 		break;
3908 	case S_IFDIR:
3909 		inode->i_fop = &btrfs_dir_file_operations;
3910 		inode->i_op = &btrfs_dir_inode_operations;
3911 		break;
3912 	case S_IFLNK:
3913 		inode->i_op = &btrfs_symlink_inode_operations;
3914 		inode_nohighmem(inode);
3915 		inode->i_mapping->a_ops = &btrfs_aops;
3916 		break;
3917 	default:
3918 		inode->i_op = &btrfs_special_inode_operations;
3919 		init_special_inode(inode, inode->i_mode, rdev);
3920 		break;
3921 	}
3922 
3923 	btrfs_sync_inode_flags_to_i_flags(inode);
3924 	return 0;
3925 }
3926 
3927 /*
3928  * given a leaf and an inode, copy the inode fields into the leaf
3929  */
fill_inode_item(struct btrfs_trans_handle * trans,struct extent_buffer * leaf,struct btrfs_inode_item * item,struct inode * inode)3930 static void fill_inode_item(struct btrfs_trans_handle *trans,
3931 			    struct extent_buffer *leaf,
3932 			    struct btrfs_inode_item *item,
3933 			    struct inode *inode)
3934 {
3935 	struct btrfs_map_token token;
3936 	u64 flags;
3937 
3938 	btrfs_init_map_token(&token, leaf);
3939 
3940 	btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
3941 	btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
3942 	btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
3943 	btrfs_set_token_inode_mode(&token, item, inode->i_mode);
3944 	btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
3945 
3946 	btrfs_set_token_timespec_sec(&token, &item->atime,
3947 				     inode->i_atime.tv_sec);
3948 	btrfs_set_token_timespec_nsec(&token, &item->atime,
3949 				      inode->i_atime.tv_nsec);
3950 
3951 	btrfs_set_token_timespec_sec(&token, &item->mtime,
3952 				     inode->i_mtime.tv_sec);
3953 	btrfs_set_token_timespec_nsec(&token, &item->mtime,
3954 				      inode->i_mtime.tv_nsec);
3955 
3956 	btrfs_set_token_timespec_sec(&token, &item->ctime,
3957 				     inode_get_ctime(inode).tv_sec);
3958 	btrfs_set_token_timespec_nsec(&token, &item->ctime,
3959 				      inode_get_ctime(inode).tv_nsec);
3960 
3961 	btrfs_set_token_timespec_sec(&token, &item->otime,
3962 				     BTRFS_I(inode)->i_otime.tv_sec);
3963 	btrfs_set_token_timespec_nsec(&token, &item->otime,
3964 				      BTRFS_I(inode)->i_otime.tv_nsec);
3965 
3966 	btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
3967 	btrfs_set_token_inode_generation(&token, item,
3968 					 BTRFS_I(inode)->generation);
3969 	btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
3970 	btrfs_set_token_inode_transid(&token, item, trans->transid);
3971 	btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
3972 	flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
3973 					  BTRFS_I(inode)->ro_flags);
3974 	btrfs_set_token_inode_flags(&token, item, flags);
3975 	btrfs_set_token_inode_block_group(&token, item, 0);
3976 }
3977 
3978 /*
3979  * copy everything in the in-memory inode into the btree.
3980  */
btrfs_update_inode_item(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode)3981 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3982 				struct btrfs_root *root,
3983 				struct btrfs_inode *inode)
3984 {
3985 	struct btrfs_inode_item *inode_item;
3986 	struct btrfs_path *path;
3987 	struct extent_buffer *leaf;
3988 	int ret;
3989 
3990 	path = btrfs_alloc_path();
3991 	if (!path)
3992 		return -ENOMEM;
3993 
3994 	ret = btrfs_lookup_inode(trans, root, path, &inode->location, 1);
3995 	if (ret) {
3996 		if (ret > 0)
3997 			ret = -ENOENT;
3998 		goto failed;
3999 	}
4000 
4001 	leaf = path->nodes[0];
4002 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
4003 				    struct btrfs_inode_item);
4004 
4005 	fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4006 	btrfs_mark_buffer_dirty(trans, leaf);
4007 	btrfs_set_inode_last_trans(trans, inode);
4008 	ret = 0;
4009 failed:
4010 	btrfs_free_path(path);
4011 	return ret;
4012 }
4013 
4014 /*
4015  * copy everything in the in-memory inode into the btree.
4016  */
btrfs_update_inode(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode)4017 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4018 				struct btrfs_root *root,
4019 				struct btrfs_inode *inode)
4020 {
4021 	struct btrfs_fs_info *fs_info = root->fs_info;
4022 	int ret;
4023 
4024 	/*
4025 	 * If the inode is a free space inode, we can deadlock during commit
4026 	 * if we put it into the delayed code.
4027 	 *
4028 	 * The data relocation inode should also be directly updated
4029 	 * without delay
4030 	 */
4031 	if (!btrfs_is_free_space_inode(inode)
4032 	    && !btrfs_is_data_reloc_root(root)
4033 	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4034 		btrfs_update_root_times(trans, root);
4035 
4036 		ret = btrfs_delayed_update_inode(trans, root, inode);
4037 		if (!ret)
4038 			btrfs_set_inode_last_trans(trans, inode);
4039 		return ret;
4040 	}
4041 
4042 	return btrfs_update_inode_item(trans, root, inode);
4043 }
4044 
btrfs_update_inode_fallback(struct btrfs_trans_handle * trans,struct btrfs_root * root,struct btrfs_inode * inode)4045 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4046 				struct btrfs_root *root, struct btrfs_inode *inode)
4047 {
4048 	int ret;
4049 
4050 	ret = btrfs_update_inode(trans, root, inode);
4051 	if (ret == -ENOSPC)
4052 		return btrfs_update_inode_item(trans, root, inode);
4053 	return ret;
4054 }
4055 
4056 /*
4057  * unlink helper that gets used here in inode.c and in the tree logging
4058  * recovery code.  It remove a link in a directory with a given name, and
4059  * also drops the back refs in the inode to the directory
4060  */
__btrfs_unlink_inode(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,const struct fscrypt_str * name,struct btrfs_rename_ctx * rename_ctx)4061 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4062 				struct btrfs_inode *dir,
4063 				struct btrfs_inode *inode,
4064 				const struct fscrypt_str *name,
4065 				struct btrfs_rename_ctx *rename_ctx)
4066 {
4067 	struct btrfs_root *root = dir->root;
4068 	struct btrfs_fs_info *fs_info = root->fs_info;
4069 	struct btrfs_path *path;
4070 	int ret = 0;
4071 	struct btrfs_dir_item *di;
4072 	u64 index;
4073 	u64 ino = btrfs_ino(inode);
4074 	u64 dir_ino = btrfs_ino(dir);
4075 
4076 	path = btrfs_alloc_path();
4077 	if (!path) {
4078 		ret = -ENOMEM;
4079 		goto out;
4080 	}
4081 
4082 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4083 	if (IS_ERR_OR_NULL(di)) {
4084 		ret = di ? PTR_ERR(di) : -ENOENT;
4085 		goto err;
4086 	}
4087 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4088 	if (ret)
4089 		goto err;
4090 	btrfs_release_path(path);
4091 
4092 	/*
4093 	 * If we don't have dir index, we have to get it by looking up
4094 	 * the inode ref, since we get the inode ref, remove it directly,
4095 	 * it is unnecessary to do delayed deletion.
4096 	 *
4097 	 * But if we have dir index, needn't search inode ref to get it.
4098 	 * Since the inode ref is close to the inode item, it is better
4099 	 * that we delay to delete it, and just do this deletion when
4100 	 * we update the inode item.
4101 	 */
4102 	if (inode->dir_index) {
4103 		ret = btrfs_delayed_delete_inode_ref(inode);
4104 		if (!ret) {
4105 			index = inode->dir_index;
4106 			goto skip_backref;
4107 		}
4108 	}
4109 
4110 	ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4111 	if (ret) {
4112 		btrfs_info(fs_info,
4113 			"failed to delete reference to %.*s, inode %llu parent %llu",
4114 			name->len, name->name, ino, dir_ino);
4115 		btrfs_abort_transaction(trans, ret);
4116 		goto err;
4117 	}
4118 skip_backref:
4119 	if (rename_ctx)
4120 		rename_ctx->index = index;
4121 
4122 	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4123 	if (ret) {
4124 		btrfs_abort_transaction(trans, ret);
4125 		goto err;
4126 	}
4127 
4128 	/*
4129 	 * If we are in a rename context, we don't need to update anything in the
4130 	 * log. That will be done later during the rename by btrfs_log_new_name().
4131 	 * Besides that, doing it here would only cause extra unnecessary btree
4132 	 * operations on the log tree, increasing latency for applications.
4133 	 */
4134 	if (!rename_ctx) {
4135 		btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4136 		btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4137 	}
4138 
4139 	/*
4140 	 * If we have a pending delayed iput we could end up with the final iput
4141 	 * being run in btrfs-cleaner context.  If we have enough of these built
4142 	 * up we can end up burning a lot of time in btrfs-cleaner without any
4143 	 * way to throttle the unlinks.  Since we're currently holding a ref on
4144 	 * the inode we can run the delayed iput here without any issues as the
4145 	 * final iput won't be done until after we drop the ref we're currently
4146 	 * holding.
4147 	 */
4148 	btrfs_run_delayed_iput(fs_info, inode);
4149 err:
4150 	btrfs_free_path(path);
4151 	if (ret)
4152 		goto out;
4153 
4154 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4155 	inode_inc_iversion(&inode->vfs_inode);
4156 	inode_set_ctime_current(&inode->vfs_inode);
4157 	inode_inc_iversion(&dir->vfs_inode);
4158 	inode_set_ctime_current(&inode->vfs_inode);
4159 	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4160 	ret = btrfs_update_inode(trans, root, dir);
4161 out:
4162 	return ret;
4163 }
4164 
btrfs_unlink_inode(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct btrfs_inode * inode,const struct fscrypt_str * name)4165 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4166 		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4167 		       const struct fscrypt_str *name)
4168 {
4169 	int ret;
4170 
4171 	ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4172 	if (!ret) {
4173 		drop_nlink(&inode->vfs_inode);
4174 		ret = btrfs_update_inode(trans, inode->root, inode);
4175 	}
4176 	return ret;
4177 }
4178 
4179 /*
4180  * helper to start transaction for unlink and rmdir.
4181  *
4182  * unlink and rmdir are special in btrfs, they do not always free space, so
4183  * if we cannot make our reservations the normal way try and see if there is
4184  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4185  * allow the unlink to occur.
4186  */
__unlink_start_trans(struct btrfs_inode * dir)4187 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4188 {
4189 	struct btrfs_root *root = dir->root;
4190 
4191 	return btrfs_start_transaction_fallback_global_rsv(root,
4192 						   BTRFS_UNLINK_METADATA_UNITS);
4193 }
4194 
btrfs_unlink(struct inode * dir,struct dentry * dentry)4195 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4196 {
4197 	struct btrfs_trans_handle *trans;
4198 	struct inode *inode = d_inode(dentry);
4199 	int ret;
4200 	struct fscrypt_name fname;
4201 
4202 	ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4203 	if (ret)
4204 		return ret;
4205 
4206 	/* This needs to handle no-key deletions later on */
4207 
4208 	trans = __unlink_start_trans(BTRFS_I(dir));
4209 	if (IS_ERR(trans)) {
4210 		ret = PTR_ERR(trans);
4211 		goto fscrypt_free;
4212 	}
4213 
4214 	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4215 				false);
4216 
4217 	ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4218 				 &fname.disk_name);
4219 	if (ret)
4220 		goto end_trans;
4221 
4222 	if (inode->i_nlink == 0) {
4223 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4224 		if (ret)
4225 			goto end_trans;
4226 	}
4227 
4228 end_trans:
4229 	btrfs_end_transaction(trans);
4230 	btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4231 fscrypt_free:
4232 	fscrypt_free_filename(&fname);
4233 	return ret;
4234 }
4235 
btrfs_unlink_subvol(struct btrfs_trans_handle * trans,struct btrfs_inode * dir,struct dentry * dentry)4236 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4237 			       struct btrfs_inode *dir, struct dentry *dentry)
4238 {
4239 	struct btrfs_root *root = dir->root;
4240 	struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4241 	struct btrfs_path *path;
4242 	struct extent_buffer *leaf;
4243 	struct btrfs_dir_item *di;
4244 	struct btrfs_key key;
4245 	u64 index;
4246 	int ret;
4247 	u64 objectid;
4248 	u64 dir_ino = btrfs_ino(dir);
4249 	struct fscrypt_name fname;
4250 
4251 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4252 	if (ret)
4253 		return ret;
4254 
4255 	/* This needs to handle no-key deletions later on */
4256 
4257 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4258 		objectid = inode->root->root_key.objectid;
4259 	} else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4260 		objectid = inode->location.objectid;
4261 	} else {
4262 		WARN_ON(1);
4263 		fscrypt_free_filename(&fname);
4264 		return -EINVAL;
4265 	}
4266 
4267 	path = btrfs_alloc_path();
4268 	if (!path) {
4269 		ret = -ENOMEM;
4270 		goto out;
4271 	}
4272 
4273 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4274 				   &fname.disk_name, -1);
4275 	if (IS_ERR_OR_NULL(di)) {
4276 		ret = di ? PTR_ERR(di) : -ENOENT;
4277 		goto out;
4278 	}
4279 
4280 	leaf = path->nodes[0];
4281 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4282 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4283 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4284 	if (ret) {
4285 		btrfs_abort_transaction(trans, ret);
4286 		goto out;
4287 	}
4288 	btrfs_release_path(path);
4289 
4290 	/*
4291 	 * This is a placeholder inode for a subvolume we didn't have a
4292 	 * reference to at the time of the snapshot creation.  In the meantime
4293 	 * we could have renamed the real subvol link into our snapshot, so
4294 	 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4295 	 * Instead simply lookup the dir_index_item for this entry so we can
4296 	 * remove it.  Otherwise we know we have a ref to the root and we can
4297 	 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4298 	 */
4299 	if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4300 		di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4301 		if (IS_ERR(di)) {
4302 			ret = PTR_ERR(di);
4303 			btrfs_abort_transaction(trans, ret);
4304 			goto out;
4305 		}
4306 
4307 		leaf = path->nodes[0];
4308 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4309 		index = key.offset;
4310 		btrfs_release_path(path);
4311 	} else {
4312 		ret = btrfs_del_root_ref(trans, objectid,
4313 					 root->root_key.objectid, dir_ino,
4314 					 &index, &fname.disk_name);
4315 		if (ret) {
4316 			btrfs_abort_transaction(trans, ret);
4317 			goto out;
4318 		}
4319 	}
4320 
4321 	ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4322 	if (ret) {
4323 		btrfs_abort_transaction(trans, ret);
4324 		goto out;
4325 	}
4326 
4327 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4328 	inode_inc_iversion(&dir->vfs_inode);
4329 	dir->vfs_inode.i_mtime = inode_set_ctime_current(&dir->vfs_inode);
4330 	ret = btrfs_update_inode_fallback(trans, root, dir);
4331 	if (ret)
4332 		btrfs_abort_transaction(trans, ret);
4333 out:
4334 	btrfs_free_path(path);
4335 	fscrypt_free_filename(&fname);
4336 	return ret;
4337 }
4338 
4339 /*
4340  * Helper to check if the subvolume references other subvolumes or if it's
4341  * default.
4342  */
may_destroy_subvol(struct btrfs_root * root)4343 static noinline int may_destroy_subvol(struct btrfs_root *root)
4344 {
4345 	struct btrfs_fs_info *fs_info = root->fs_info;
4346 	struct btrfs_path *path;
4347 	struct btrfs_dir_item *di;
4348 	struct btrfs_key key;
4349 	struct fscrypt_str name = FSTR_INIT("default", 7);
4350 	u64 dir_id;
4351 	int ret;
4352 
4353 	path = btrfs_alloc_path();
4354 	if (!path)
4355 		return -ENOMEM;
4356 
4357 	/* Make sure this root isn't set as the default subvol */
4358 	dir_id = btrfs_super_root_dir(fs_info->super_copy);
4359 	di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4360 				   dir_id, &name, 0);
4361 	if (di && !IS_ERR(di)) {
4362 		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4363 		if (key.objectid == root->root_key.objectid) {
4364 			ret = -EPERM;
4365 			btrfs_err(fs_info,
4366 				  "deleting default subvolume %llu is not allowed",
4367 				  key.objectid);
4368 			goto out;
4369 		}
4370 		btrfs_release_path(path);
4371 	}
4372 
4373 	key.objectid = root->root_key.objectid;
4374 	key.type = BTRFS_ROOT_REF_KEY;
4375 	key.offset = (u64)-1;
4376 
4377 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4378 	if (ret < 0)
4379 		goto out;
4380 	if (ret == 0) {
4381 		/*
4382 		 * Key with offset -1 found, there would have to exist a root
4383 		 * with such id, but this is out of valid range.
4384 		 */
4385 		ret = -EUCLEAN;
4386 		goto out;
4387 	}
4388 
4389 	ret = 0;
4390 	if (path->slots[0] > 0) {
4391 		path->slots[0]--;
4392 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4393 		if (key.objectid == root->root_key.objectid &&
4394 		    key.type == BTRFS_ROOT_REF_KEY)
4395 			ret = -ENOTEMPTY;
4396 	}
4397 out:
4398 	btrfs_free_path(path);
4399 	return ret;
4400 }
4401 
4402 /* Delete all dentries for inodes belonging to the root */
btrfs_prune_dentries(struct btrfs_root * root)4403 static void btrfs_prune_dentries(struct btrfs_root *root)
4404 {
4405 	struct btrfs_fs_info *fs_info = root->fs_info;
4406 	struct rb_node *node;
4407 	struct rb_node *prev;
4408 	struct btrfs_inode *entry;
4409 	struct inode *inode;
4410 	u64 objectid = 0;
4411 
4412 	if (!BTRFS_FS_ERROR(fs_info))
4413 		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4414 
4415 	spin_lock(&root->inode_lock);
4416 again:
4417 	node = root->inode_tree.rb_node;
4418 	prev = NULL;
4419 	while (node) {
4420 		prev = node;
4421 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4422 
4423 		if (objectid < btrfs_ino(entry))
4424 			node = node->rb_left;
4425 		else if (objectid > btrfs_ino(entry))
4426 			node = node->rb_right;
4427 		else
4428 			break;
4429 	}
4430 	if (!node) {
4431 		while (prev) {
4432 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4433 			if (objectid <= btrfs_ino(entry)) {
4434 				node = prev;
4435 				break;
4436 			}
4437 			prev = rb_next(prev);
4438 		}
4439 	}
4440 	while (node) {
4441 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4442 		objectid = btrfs_ino(entry) + 1;
4443 		inode = igrab(&entry->vfs_inode);
4444 		if (inode) {
4445 			spin_unlock(&root->inode_lock);
4446 			if (atomic_read(&inode->i_count) > 1)
4447 				d_prune_aliases(inode);
4448 			/*
4449 			 * btrfs_drop_inode will have it removed from the inode
4450 			 * cache when its usage count hits zero.
4451 			 */
4452 			iput(inode);
4453 			cond_resched();
4454 			spin_lock(&root->inode_lock);
4455 			goto again;
4456 		}
4457 
4458 		if (cond_resched_lock(&root->inode_lock))
4459 			goto again;
4460 
4461 		node = rb_next(node);
4462 	}
4463 	spin_unlock(&root->inode_lock);
4464 }
4465 
btrfs_delete_subvolume(struct btrfs_inode * dir,struct dentry * dentry)4466 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4467 {
4468 	struct btrfs_fs_info *fs_info = btrfs_sb(dentry->d_sb);
4469 	struct btrfs_root *root = dir->root;
4470 	struct inode *inode = d_inode(dentry);
4471 	struct btrfs_root *dest = BTRFS_I(inode)->root;
4472 	struct btrfs_trans_handle *trans;
4473 	struct btrfs_block_rsv block_rsv;
4474 	u64 root_flags;
4475 	u64 qgroup_reserved = 0;
4476 	int ret;
4477 
4478 	down_write(&fs_info->subvol_sem);
4479 
4480 	/*
4481 	 * Don't allow to delete a subvolume with send in progress. This is
4482 	 * inside the inode lock so the error handling that has to drop the bit
4483 	 * again is not run concurrently.
4484 	 */
4485 	spin_lock(&dest->root_item_lock);
4486 	if (dest->send_in_progress) {
4487 		spin_unlock(&dest->root_item_lock);
4488 		btrfs_warn(fs_info,
4489 			   "attempt to delete subvolume %llu during send",
4490 			   dest->root_key.objectid);
4491 		ret = -EPERM;
4492 		goto out_up_write;
4493 	}
4494 	if (atomic_read(&dest->nr_swapfiles)) {
4495 		spin_unlock(&dest->root_item_lock);
4496 		btrfs_warn(fs_info,
4497 			   "attempt to delete subvolume %llu with active swapfile",
4498 			   root->root_key.objectid);
4499 		ret = -EPERM;
4500 		goto out_up_write;
4501 	}
4502 	root_flags = btrfs_root_flags(&dest->root_item);
4503 	btrfs_set_root_flags(&dest->root_item,
4504 			     root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4505 	spin_unlock(&dest->root_item_lock);
4506 
4507 	ret = may_destroy_subvol(dest);
4508 	if (ret)
4509 		goto out_undead;
4510 
4511 	btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4512 	/*
4513 	 * One for dir inode,
4514 	 * two for dir entries,
4515 	 * two for root ref/backref.
4516 	 */
4517 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4518 	if (ret)
4519 		goto out_undead;
4520 	qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4521 
4522 	trans = btrfs_start_transaction(root, 0);
4523 	if (IS_ERR(trans)) {
4524 		ret = PTR_ERR(trans);
4525 		goto out_release;
4526 	}
4527 	ret = btrfs_record_root_in_trans(trans, root);
4528 	if (ret) {
4529 		btrfs_abort_transaction(trans, ret);
4530 		goto out_end_trans;
4531 	}
4532 	btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4533 	qgroup_reserved = 0;
4534 	trans->block_rsv = &block_rsv;
4535 	trans->bytes_reserved = block_rsv.size;
4536 
4537 	btrfs_record_snapshot_destroy(trans, dir);
4538 
4539 	ret = btrfs_unlink_subvol(trans, dir, dentry);
4540 	if (ret) {
4541 		btrfs_abort_transaction(trans, ret);
4542 		goto out_end_trans;
4543 	}
4544 
4545 	ret = btrfs_record_root_in_trans(trans, dest);
4546 	if (ret) {
4547 		btrfs_abort_transaction(trans, ret);
4548 		goto out_end_trans;
4549 	}
4550 
4551 	memset(&dest->root_item.drop_progress, 0,
4552 		sizeof(dest->root_item.drop_progress));
4553 	btrfs_set_root_drop_level(&dest->root_item, 0);
4554 	btrfs_set_root_refs(&dest->root_item, 0);
4555 
4556 	if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4557 		ret = btrfs_insert_orphan_item(trans,
4558 					fs_info->tree_root,
4559 					dest->root_key.objectid);
4560 		if (ret) {
4561 			btrfs_abort_transaction(trans, ret);
4562 			goto out_end_trans;
4563 		}
4564 	}
4565 
4566 	ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4567 				  BTRFS_UUID_KEY_SUBVOL,
4568 				  dest->root_key.objectid);
4569 	if (ret && ret != -ENOENT) {
4570 		btrfs_abort_transaction(trans, ret);
4571 		goto out_end_trans;
4572 	}
4573 	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4574 		ret = btrfs_uuid_tree_remove(trans,
4575 					  dest->root_item.received_uuid,
4576 					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4577 					  dest->root_key.objectid);
4578 		if (ret && ret != -ENOENT) {
4579 			btrfs_abort_transaction(trans, ret);
4580 			goto out_end_trans;
4581 		}
4582 	}
4583 
4584 	free_anon_bdev(dest->anon_dev);
4585 	dest->anon_dev = 0;
4586 out_end_trans:
4587 	trans->block_rsv = NULL;
4588 	trans->bytes_reserved = 0;
4589 	ret = btrfs_end_transaction(trans);
4590 	inode->i_flags |= S_DEAD;
4591 out_release:
4592 	btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4593 	if (qgroup_reserved)
4594 		btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4595 out_undead:
4596 	if (ret) {
4597 		spin_lock(&dest->root_item_lock);
4598 		root_flags = btrfs_root_flags(&dest->root_item);
4599 		btrfs_set_root_flags(&dest->root_item,
4600 				root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4601 		spin_unlock(&dest->root_item_lock);
4602 	}
4603 out_up_write:
4604 	up_write(&fs_info->subvol_sem);
4605 	if (!ret) {
4606 		d_invalidate(dentry);
4607 		btrfs_prune_dentries(dest);
4608 		ASSERT(dest->send_in_progress == 0);
4609 	}
4610 
4611 	return ret;
4612 }
4613 
btrfs_rmdir(struct inode * dir,struct dentry * dentry)4614 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4615 {
4616 	struct inode *inode = d_inode(dentry);
4617 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4618 	int err = 0;
4619 	struct btrfs_trans_handle *trans;
4620 	u64 last_unlink_trans;
4621 	struct fscrypt_name fname;
4622 
4623 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4624 		return -ENOTEMPTY;
4625 	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4626 		if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4627 			btrfs_err(fs_info,
4628 			"extent tree v2 doesn't support snapshot deletion yet");
4629 			return -EOPNOTSUPP;
4630 		}
4631 		return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4632 	}
4633 
4634 	err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4635 	if (err)
4636 		return err;
4637 
4638 	/* This needs to handle no-key deletions later on */
4639 
4640 	trans = __unlink_start_trans(BTRFS_I(dir));
4641 	if (IS_ERR(trans)) {
4642 		err = PTR_ERR(trans);
4643 		goto out_notrans;
4644 	}
4645 
4646 	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4647 		err = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4648 		goto out;
4649 	}
4650 
4651 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4652 	if (err)
4653 		goto out;
4654 
4655 	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4656 
4657 	/* now the directory is empty */
4658 	err = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4659 				 &fname.disk_name);
4660 	if (!err) {
4661 		btrfs_i_size_write(BTRFS_I(inode), 0);
4662 		/*
4663 		 * Propagate the last_unlink_trans value of the deleted dir to
4664 		 * its parent directory. This is to prevent an unrecoverable
4665 		 * log tree in the case we do something like this:
4666 		 * 1) create dir foo
4667 		 * 2) create snapshot under dir foo
4668 		 * 3) delete the snapshot
4669 		 * 4) rmdir foo
4670 		 * 5) mkdir foo
4671 		 * 6) fsync foo or some file inside foo
4672 		 */
4673 		if (last_unlink_trans >= trans->transid)
4674 			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4675 	}
4676 out:
4677 	btrfs_end_transaction(trans);
4678 out_notrans:
4679 	btrfs_btree_balance_dirty(fs_info);
4680 	fscrypt_free_filename(&fname);
4681 
4682 	return err;
4683 }
4684 
4685 /*
4686  * btrfs_truncate_block - read, zero a chunk and write a block
4687  * @inode - inode that we're zeroing
4688  * @from - the offset to start zeroing
4689  * @len - the length to zero, 0 to zero the entire range respective to the
4690  *	offset
4691  * @front - zero up to the offset instead of from the offset on
4692  *
4693  * This will find the block for the "from" offset and cow the block and zero the
4694  * part we want to zero.  This is used with truncate and hole punching.
4695  */
btrfs_truncate_block(struct btrfs_inode * inode,loff_t from,loff_t len,int front)4696 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4697 			 int front)
4698 {
4699 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
4700 	struct address_space *mapping = inode->vfs_inode.i_mapping;
4701 	struct extent_io_tree *io_tree = &inode->io_tree;
4702 	struct btrfs_ordered_extent *ordered;
4703 	struct extent_state *cached_state = NULL;
4704 	struct extent_changeset *data_reserved = NULL;
4705 	bool only_release_metadata = false;
4706 	u32 blocksize = fs_info->sectorsize;
4707 	pgoff_t index = from >> PAGE_SHIFT;
4708 	unsigned offset = from & (blocksize - 1);
4709 	struct page *page;
4710 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4711 	size_t write_bytes = blocksize;
4712 	int ret = 0;
4713 	u64 block_start;
4714 	u64 block_end;
4715 
4716 	if (IS_ALIGNED(offset, blocksize) &&
4717 	    (!len || IS_ALIGNED(len, blocksize)))
4718 		goto out;
4719 
4720 	block_start = round_down(from, blocksize);
4721 	block_end = block_start + blocksize - 1;
4722 
4723 	ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4724 					  blocksize, false);
4725 	if (ret < 0) {
4726 		if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4727 			/* For nocow case, no need to reserve data space */
4728 			only_release_metadata = true;
4729 		} else {
4730 			goto out;
4731 		}
4732 	}
4733 	ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4734 	if (ret < 0) {
4735 		if (!only_release_metadata)
4736 			btrfs_free_reserved_data_space(inode, data_reserved,
4737 						       block_start, blocksize);
4738 		goto out;
4739 	}
4740 again:
4741 	page = find_or_create_page(mapping, index, mask);
4742 	if (!page) {
4743 		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4744 					     blocksize, true);
4745 		btrfs_delalloc_release_extents(inode, blocksize);
4746 		ret = -ENOMEM;
4747 		goto out;
4748 	}
4749 
4750 	if (!PageUptodate(page)) {
4751 		ret = btrfs_read_folio(NULL, page_folio(page));
4752 		lock_page(page);
4753 		if (page->mapping != mapping) {
4754 			unlock_page(page);
4755 			put_page(page);
4756 			goto again;
4757 		}
4758 		if (!PageUptodate(page)) {
4759 			ret = -EIO;
4760 			goto out_unlock;
4761 		}
4762 	}
4763 
4764 	/*
4765 	 * We unlock the page after the io is completed and then re-lock it
4766 	 * above.  release_folio() could have come in between that and cleared
4767 	 * PagePrivate(), but left the page in the mapping.  Set the page mapped
4768 	 * here to make sure it's properly set for the subpage stuff.
4769 	 */
4770 	ret = set_page_extent_mapped(page);
4771 	if (ret < 0)
4772 		goto out_unlock;
4773 
4774 	wait_on_page_writeback(page);
4775 
4776 	lock_extent(io_tree, block_start, block_end, &cached_state);
4777 
4778 	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4779 	if (ordered) {
4780 		unlock_extent(io_tree, block_start, block_end, &cached_state);
4781 		unlock_page(page);
4782 		put_page(page);
4783 		btrfs_start_ordered_extent(ordered);
4784 		btrfs_put_ordered_extent(ordered);
4785 		goto again;
4786 	}
4787 
4788 	clear_extent_bit(&inode->io_tree, block_start, block_end,
4789 			 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4790 			 &cached_state);
4791 
4792 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4793 					&cached_state);
4794 	if (ret) {
4795 		unlock_extent(io_tree, block_start, block_end, &cached_state);
4796 		goto out_unlock;
4797 	}
4798 
4799 	if (offset != blocksize) {
4800 		if (!len)
4801 			len = blocksize - offset;
4802 		if (front)
4803 			memzero_page(page, (block_start - page_offset(page)),
4804 				     offset);
4805 		else
4806 			memzero_page(page, (block_start - page_offset(page)) + offset,
4807 				     len);
4808 	}
4809 	btrfs_page_clear_checked(fs_info, page, block_start,
4810 				 block_end + 1 - block_start);
4811 	btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start);
4812 	unlock_extent(io_tree, block_start, block_end, &cached_state);
4813 
4814 	if (only_release_metadata)
4815 		set_extent_bit(&inode->io_tree, block_start, block_end,
4816 			       EXTENT_NORESERVE, NULL);
4817 
4818 out_unlock:
4819 	if (ret) {
4820 		if (only_release_metadata)
4821 			btrfs_delalloc_release_metadata(inode, blocksize, true);
4822 		else
4823 			btrfs_delalloc_release_space(inode, data_reserved,
4824 					block_start, blocksize, true);
4825 	}
4826 	btrfs_delalloc_release_extents(inode, blocksize);
4827 	unlock_page(page);
4828 	put_page(page);
4829 out:
4830 	if (only_release_metadata)
4831 		btrfs_check_nocow_unlock(inode);
4832 	extent_changeset_free(data_reserved);
4833 	return ret;
4834 }
4835 
maybe_insert_hole(struct btrfs_root * root,struct btrfs_inode * inode,u64 offset,u64 len)4836 static int maybe_insert_hole(struct btrfs_root *root, struct btrfs_inode *inode,
4837 			     u64 offset, u64 len)
4838 {
4839 	struct btrfs_fs_info *fs_info = root->fs_info;
4840 	struct btrfs_trans_handle *trans;
4841 	struct btrfs_drop_extents_args drop_args = { 0 };
4842 	int ret;
4843 
4844 	/*
4845 	 * If NO_HOLES is enabled, we don't need to do anything.
4846 	 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4847 	 * or btrfs_update_inode() will be called, which guarantee that the next
4848 	 * fsync will know this inode was changed and needs to be logged.
4849 	 */
4850 	if (btrfs_fs_incompat(fs_info, NO_HOLES))
4851 		return 0;
4852 
4853 	/*
4854 	 * 1 - for the one we're dropping
4855 	 * 1 - for the one we're adding
4856 	 * 1 - for updating the inode.
4857 	 */
4858 	trans = btrfs_start_transaction(root, 3);
4859 	if (IS_ERR(trans))
4860 		return PTR_ERR(trans);
4861 
4862 	drop_args.start = offset;
4863 	drop_args.end = offset + len;
4864 	drop_args.drop_cache = true;
4865 
4866 	ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4867 	if (ret) {
4868 		btrfs_abort_transaction(trans, ret);
4869 		btrfs_end_transaction(trans);
4870 		return ret;
4871 	}
4872 
4873 	ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4874 	if (ret) {
4875 		btrfs_abort_transaction(trans, ret);
4876 	} else {
4877 		btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4878 		btrfs_update_inode(trans, root, inode);
4879 	}
4880 	btrfs_end_transaction(trans);
4881 	return ret;
4882 }
4883 
4884 /*
4885  * This function puts in dummy file extents for the area we're creating a hole
4886  * for.  So if we are truncating this file to a larger size we need to insert
4887  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4888  * the range between oldsize and size
4889  */
btrfs_cont_expand(struct btrfs_inode * inode,loff_t oldsize,loff_t size)4890 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4891 {
4892 	struct btrfs_root *root = inode->root;
4893 	struct btrfs_fs_info *fs_info = root->fs_info;
4894 	struct extent_io_tree *io_tree = &inode->io_tree;
4895 	struct extent_map *em = NULL;
4896 	struct extent_state *cached_state = NULL;
4897 	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4898 	u64 block_end = ALIGN(size, fs_info->sectorsize);
4899 	u64 last_byte;
4900 	u64 cur_offset;
4901 	u64 hole_size;
4902 	int err = 0;
4903 
4904 	/*
4905 	 * If our size started in the middle of a block we need to zero out the
4906 	 * rest of the block before we expand the i_size, otherwise we could
4907 	 * expose stale data.
4908 	 */
4909 	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4910 	if (err)
4911 		return err;
4912 
4913 	if (size <= hole_start)
4914 		return 0;
4915 
4916 	btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
4917 					   &cached_state);
4918 	cur_offset = hole_start;
4919 	while (1) {
4920 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4921 				      block_end - cur_offset);
4922 		if (IS_ERR(em)) {
4923 			err = PTR_ERR(em);
4924 			em = NULL;
4925 			break;
4926 		}
4927 		last_byte = min(extent_map_end(em), block_end);
4928 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4929 		hole_size = last_byte - cur_offset;
4930 
4931 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4932 			struct extent_map *hole_em;
4933 
4934 			err = maybe_insert_hole(root, inode, cur_offset,
4935 						hole_size);
4936 			if (err)
4937 				break;
4938 
4939 			err = btrfs_inode_set_file_extent_range(inode,
4940 							cur_offset, hole_size);
4941 			if (err)
4942 				break;
4943 
4944 			hole_em = alloc_extent_map();
4945 			if (!hole_em) {
4946 				btrfs_drop_extent_map_range(inode, cur_offset,
4947 						    cur_offset + hole_size - 1,
4948 						    false);
4949 				btrfs_set_inode_full_sync(inode);
4950 				goto next;
4951 			}
4952 			hole_em->start = cur_offset;
4953 			hole_em->len = hole_size;
4954 			hole_em->orig_start = cur_offset;
4955 
4956 			hole_em->block_start = EXTENT_MAP_HOLE;
4957 			hole_em->block_len = 0;
4958 			hole_em->orig_block_len = 0;
4959 			hole_em->ram_bytes = hole_size;
4960 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4961 			hole_em->generation = fs_info->generation;
4962 
4963 			err = btrfs_replace_extent_map_range(inode, hole_em, true);
4964 			free_extent_map(hole_em);
4965 		} else {
4966 			err = btrfs_inode_set_file_extent_range(inode,
4967 							cur_offset, hole_size);
4968 			if (err)
4969 				break;
4970 		}
4971 next:
4972 		free_extent_map(em);
4973 		em = NULL;
4974 		cur_offset = last_byte;
4975 		if (cur_offset >= block_end)
4976 			break;
4977 	}
4978 	free_extent_map(em);
4979 	unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
4980 	return err;
4981 }
4982 
btrfs_setsize(struct inode * inode,struct iattr * attr)4983 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4984 {
4985 	struct btrfs_root *root = BTRFS_I(inode)->root;
4986 	struct btrfs_trans_handle *trans;
4987 	loff_t oldsize = i_size_read(inode);
4988 	loff_t newsize = attr->ia_size;
4989 	int mask = attr->ia_valid;
4990 	int ret;
4991 
4992 	/*
4993 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4994 	 * special case where we need to update the times despite not having
4995 	 * these flags set.  For all other operations the VFS set these flags
4996 	 * explicitly if it wants a timestamp update.
4997 	 */
4998 	if (newsize != oldsize) {
4999 		inode_inc_iversion(inode);
5000 		if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5001 			inode->i_mtime = inode_set_ctime_current(inode);
5002 		}
5003 	}
5004 
5005 	if (newsize > oldsize) {
5006 		/*
5007 		 * Don't do an expanding truncate while snapshotting is ongoing.
5008 		 * This is to ensure the snapshot captures a fully consistent
5009 		 * state of this file - if the snapshot captures this expanding
5010 		 * truncation, it must capture all writes that happened before
5011 		 * this truncation.
5012 		 */
5013 		btrfs_drew_write_lock(&root->snapshot_lock);
5014 		ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5015 		if (ret) {
5016 			btrfs_drew_write_unlock(&root->snapshot_lock);
5017 			return ret;
5018 		}
5019 
5020 		trans = btrfs_start_transaction(root, 1);
5021 		if (IS_ERR(trans)) {
5022 			btrfs_drew_write_unlock(&root->snapshot_lock);
5023 			return PTR_ERR(trans);
5024 		}
5025 
5026 		i_size_write(inode, newsize);
5027 		btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5028 		pagecache_isize_extended(inode, oldsize, newsize);
5029 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
5030 		btrfs_drew_write_unlock(&root->snapshot_lock);
5031 		btrfs_end_transaction(trans);
5032 	} else {
5033 		struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5034 
5035 		if (btrfs_is_zoned(fs_info)) {
5036 			ret = btrfs_wait_ordered_range(inode,
5037 					ALIGN(newsize, fs_info->sectorsize),
5038 					(u64)-1);
5039 			if (ret)
5040 				return ret;
5041 		}
5042 
5043 		/*
5044 		 * We're truncating a file that used to have good data down to
5045 		 * zero. Make sure any new writes to the file get on disk
5046 		 * on close.
5047 		 */
5048 		if (newsize == 0)
5049 			set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5050 				&BTRFS_I(inode)->runtime_flags);
5051 
5052 		truncate_setsize(inode, newsize);
5053 
5054 		inode_dio_wait(inode);
5055 
5056 		ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5057 		if (ret && inode->i_nlink) {
5058 			int err;
5059 
5060 			/*
5061 			 * Truncate failed, so fix up the in-memory size. We
5062 			 * adjusted disk_i_size down as we removed extents, so
5063 			 * wait for disk_i_size to be stable and then update the
5064 			 * in-memory size to match.
5065 			 */
5066 			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5067 			if (err)
5068 				return err;
5069 			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5070 		}
5071 	}
5072 
5073 	return ret;
5074 }
5075 
btrfs_setattr(struct mnt_idmap * idmap,struct dentry * dentry,struct iattr * attr)5076 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5077 			 struct iattr *attr)
5078 {
5079 	struct inode *inode = d_inode(dentry);
5080 	struct btrfs_root *root = BTRFS_I(inode)->root;
5081 	int err;
5082 
5083 	if (btrfs_root_readonly(root))
5084 		return -EROFS;
5085 
5086 	err = setattr_prepare(idmap, dentry, attr);
5087 	if (err)
5088 		return err;
5089 
5090 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5091 		err = btrfs_setsize(inode, attr);
5092 		if (err)
5093 			return err;
5094 	}
5095 
5096 	if (attr->ia_valid) {
5097 		setattr_copy(idmap, inode, attr);
5098 		inode_inc_iversion(inode);
5099 		err = btrfs_dirty_inode(BTRFS_I(inode));
5100 
5101 		if (!err && attr->ia_valid & ATTR_MODE)
5102 			err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5103 	}
5104 
5105 	return err;
5106 }
5107 
5108 /*
5109  * While truncating the inode pages during eviction, we get the VFS
5110  * calling btrfs_invalidate_folio() against each folio of the inode. This
5111  * is slow because the calls to btrfs_invalidate_folio() result in a
5112  * huge amount of calls to lock_extent() and clear_extent_bit(),
5113  * which keep merging and splitting extent_state structures over and over,
5114  * wasting lots of time.
5115  *
5116  * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5117  * skip all those expensive operations on a per folio basis and do only
5118  * the ordered io finishing, while we release here the extent_map and
5119  * extent_state structures, without the excessive merging and splitting.
5120  */
evict_inode_truncate_pages(struct inode * inode)5121 static void evict_inode_truncate_pages(struct inode *inode)
5122 {
5123 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5124 	struct rb_node *node;
5125 
5126 	ASSERT(inode->i_state & I_FREEING);
5127 	truncate_inode_pages_final(&inode->i_data);
5128 
5129 	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5130 
5131 	/*
5132 	 * Keep looping until we have no more ranges in the io tree.
5133 	 * We can have ongoing bios started by readahead that have
5134 	 * their endio callback (extent_io.c:end_bio_extent_readpage)
5135 	 * still in progress (unlocked the pages in the bio but did not yet
5136 	 * unlocked the ranges in the io tree). Therefore this means some
5137 	 * ranges can still be locked and eviction started because before
5138 	 * submitting those bios, which are executed by a separate task (work
5139 	 * queue kthread), inode references (inode->i_count) were not taken
5140 	 * (which would be dropped in the end io callback of each bio).
5141 	 * Therefore here we effectively end up waiting for those bios and
5142 	 * anyone else holding locked ranges without having bumped the inode's
5143 	 * reference count - if we don't do it, when they access the inode's
5144 	 * io_tree to unlock a range it may be too late, leading to an
5145 	 * use-after-free issue.
5146 	 */
5147 	spin_lock(&io_tree->lock);
5148 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5149 		struct extent_state *state;
5150 		struct extent_state *cached_state = NULL;
5151 		u64 start;
5152 		u64 end;
5153 		unsigned state_flags;
5154 
5155 		node = rb_first(&io_tree->state);
5156 		state = rb_entry(node, struct extent_state, rb_node);
5157 		start = state->start;
5158 		end = state->end;
5159 		state_flags = state->state;
5160 		spin_unlock(&io_tree->lock);
5161 
5162 		lock_extent(io_tree, start, end, &cached_state);
5163 
5164 		/*
5165 		 * If still has DELALLOC flag, the extent didn't reach disk,
5166 		 * and its reserved space won't be freed by delayed_ref.
5167 		 * So we need to free its reserved space here.
5168 		 * (Refer to comment in btrfs_invalidate_folio, case 2)
5169 		 *
5170 		 * Note, end is the bytenr of last byte, so we need + 1 here.
5171 		 */
5172 		if (state_flags & EXTENT_DELALLOC)
5173 			btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5174 					       end - start + 1, NULL);
5175 
5176 		clear_extent_bit(io_tree, start, end,
5177 				 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5178 				 &cached_state);
5179 
5180 		cond_resched();
5181 		spin_lock(&io_tree->lock);
5182 	}
5183 	spin_unlock(&io_tree->lock);
5184 }
5185 
evict_refill_and_join(struct btrfs_root * root,struct btrfs_block_rsv * rsv)5186 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5187 							struct btrfs_block_rsv *rsv)
5188 {
5189 	struct btrfs_fs_info *fs_info = root->fs_info;
5190 	struct btrfs_trans_handle *trans;
5191 	u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5192 	int ret;
5193 
5194 	/*
5195 	 * Eviction should be taking place at some place safe because of our
5196 	 * delayed iputs.  However the normal flushing code will run delayed
5197 	 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5198 	 *
5199 	 * We reserve the delayed_refs_extra here again because we can't use
5200 	 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5201 	 * above.  We reserve our extra bit here because we generate a ton of
5202 	 * delayed refs activity by truncating.
5203 	 *
5204 	 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5205 	 * if we fail to make this reservation we can re-try without the
5206 	 * delayed_refs_extra so we can make some forward progress.
5207 	 */
5208 	ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5209 				     BTRFS_RESERVE_FLUSH_EVICT);
5210 	if (ret) {
5211 		ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5212 					     BTRFS_RESERVE_FLUSH_EVICT);
5213 		if (ret) {
5214 			btrfs_warn(fs_info,
5215 				   "could not allocate space for delete; will truncate on mount");
5216 			return ERR_PTR(-ENOSPC);
5217 		}
5218 		delayed_refs_extra = 0;
5219 	}
5220 
5221 	trans = btrfs_join_transaction(root);
5222 	if (IS_ERR(trans))
5223 		return trans;
5224 
5225 	if (delayed_refs_extra) {
5226 		trans->block_rsv = &fs_info->trans_block_rsv;
5227 		trans->bytes_reserved = delayed_refs_extra;
5228 		btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5229 					delayed_refs_extra, true);
5230 	}
5231 	return trans;
5232 }
5233 
btrfs_evict_inode(struct inode * inode)5234 void btrfs_evict_inode(struct inode *inode)
5235 {
5236 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5237 	struct btrfs_trans_handle *trans;
5238 	struct btrfs_root *root = BTRFS_I(inode)->root;
5239 	struct btrfs_block_rsv *rsv = NULL;
5240 	int ret;
5241 
5242 	trace_btrfs_inode_evict(inode);
5243 
5244 	if (!root) {
5245 		fsverity_cleanup_inode(inode);
5246 		clear_inode(inode);
5247 		return;
5248 	}
5249 
5250 	evict_inode_truncate_pages(inode);
5251 
5252 	if (inode->i_nlink &&
5253 	    ((btrfs_root_refs(&root->root_item) != 0 &&
5254 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5255 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5256 		goto out;
5257 
5258 	if (is_bad_inode(inode))
5259 		goto out;
5260 
5261 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5262 		goto out;
5263 
5264 	if (inode->i_nlink > 0) {
5265 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5266 		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5267 		goto out;
5268 	}
5269 
5270 	/*
5271 	 * This makes sure the inode item in tree is uptodate and the space for
5272 	 * the inode update is released.
5273 	 */
5274 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5275 	if (ret)
5276 		goto out;
5277 
5278 	/*
5279 	 * This drops any pending insert or delete operations we have for this
5280 	 * inode.  We could have a delayed dir index deletion queued up, but
5281 	 * we're removing the inode completely so that'll be taken care of in
5282 	 * the truncate.
5283 	 */
5284 	btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5285 
5286 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5287 	if (!rsv)
5288 		goto out;
5289 	rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5290 	rsv->failfast = true;
5291 
5292 	btrfs_i_size_write(BTRFS_I(inode), 0);
5293 
5294 	while (1) {
5295 		struct btrfs_truncate_control control = {
5296 			.inode = BTRFS_I(inode),
5297 			.ino = btrfs_ino(BTRFS_I(inode)),
5298 			.new_size = 0,
5299 			.min_type = 0,
5300 		};
5301 
5302 		trans = evict_refill_and_join(root, rsv);
5303 		if (IS_ERR(trans))
5304 			goto out;
5305 
5306 		trans->block_rsv = rsv;
5307 
5308 		ret = btrfs_truncate_inode_items(trans, root, &control);
5309 		trans->block_rsv = &fs_info->trans_block_rsv;
5310 		btrfs_end_transaction(trans);
5311 		/*
5312 		 * We have not added new delayed items for our inode after we
5313 		 * have flushed its delayed items, so no need to throttle on
5314 		 * delayed items. However we have modified extent buffers.
5315 		 */
5316 		btrfs_btree_balance_dirty_nodelay(fs_info);
5317 		if (ret && ret != -ENOSPC && ret != -EAGAIN)
5318 			goto out;
5319 		else if (!ret)
5320 			break;
5321 	}
5322 
5323 	/*
5324 	 * Errors here aren't a big deal, it just means we leave orphan items in
5325 	 * the tree. They will be cleaned up on the next mount. If the inode
5326 	 * number gets reused, cleanup deletes the orphan item without doing
5327 	 * anything, and unlink reuses the existing orphan item.
5328 	 *
5329 	 * If it turns out that we are dropping too many of these, we might want
5330 	 * to add a mechanism for retrying these after a commit.
5331 	 */
5332 	trans = evict_refill_and_join(root, rsv);
5333 	if (!IS_ERR(trans)) {
5334 		trans->block_rsv = rsv;
5335 		btrfs_orphan_del(trans, BTRFS_I(inode));
5336 		trans->block_rsv = &fs_info->trans_block_rsv;
5337 		btrfs_end_transaction(trans);
5338 	}
5339 
5340 out:
5341 	btrfs_free_block_rsv(fs_info, rsv);
5342 	/*
5343 	 * If we didn't successfully delete, the orphan item will still be in
5344 	 * the tree and we'll retry on the next mount. Again, we might also want
5345 	 * to retry these periodically in the future.
5346 	 */
5347 	btrfs_remove_delayed_node(BTRFS_I(inode));
5348 	fsverity_cleanup_inode(inode);
5349 	clear_inode(inode);
5350 }
5351 
5352 /*
5353  * Return the key found in the dir entry in the location pointer, fill @type
5354  * with BTRFS_FT_*, and return 0.
5355  *
5356  * If no dir entries were found, returns -ENOENT.
5357  * If found a corrupted location in dir entry, returns -EUCLEAN.
5358  */
btrfs_inode_by_name(struct btrfs_inode * dir,struct dentry * dentry,struct btrfs_key * location,u8 * type)5359 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5360 			       struct btrfs_key *location, u8 *type)
5361 {
5362 	struct btrfs_dir_item *di;
5363 	struct btrfs_path *path;
5364 	struct btrfs_root *root = dir->root;
5365 	int ret = 0;
5366 	struct fscrypt_name fname;
5367 
5368 	path = btrfs_alloc_path();
5369 	if (!path)
5370 		return -ENOMEM;
5371 
5372 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5373 	if (ret < 0)
5374 		goto out;
5375 	/*
5376 	 * fscrypt_setup_filename() should never return a positive value, but
5377 	 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5378 	 */
5379 	ASSERT(ret == 0);
5380 
5381 	/* This needs to handle no-key deletions later on */
5382 
5383 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5384 				   &fname.disk_name, 0);
5385 	if (IS_ERR_OR_NULL(di)) {
5386 		ret = di ? PTR_ERR(di) : -ENOENT;
5387 		goto out;
5388 	}
5389 
5390 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5391 	if (location->type != BTRFS_INODE_ITEM_KEY &&
5392 	    location->type != BTRFS_ROOT_ITEM_KEY) {
5393 		ret = -EUCLEAN;
5394 		btrfs_warn(root->fs_info,
5395 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5396 			   __func__, fname.disk_name.name, btrfs_ino(dir),
5397 			   location->objectid, location->type, location->offset);
5398 	}
5399 	if (!ret)
5400 		*type = btrfs_dir_ftype(path->nodes[0], di);
5401 out:
5402 	fscrypt_free_filename(&fname);
5403 	btrfs_free_path(path);
5404 	return ret;
5405 }
5406 
5407 /*
5408  * when we hit a tree root in a directory, the btrfs part of the inode
5409  * needs to be changed to reflect the root directory of the tree root.  This
5410  * is kind of like crossing a mount point.
5411  */
fixup_tree_root_location(struct btrfs_fs_info * fs_info,struct btrfs_inode * dir,struct dentry * dentry,struct btrfs_key * location,struct btrfs_root ** sub_root)5412 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5413 				    struct btrfs_inode *dir,
5414 				    struct dentry *dentry,
5415 				    struct btrfs_key *location,
5416 				    struct btrfs_root **sub_root)
5417 {
5418 	struct btrfs_path *path;
5419 	struct btrfs_root *new_root;
5420 	struct btrfs_root_ref *ref;
5421 	struct extent_buffer *leaf;
5422 	struct btrfs_key key;
5423 	int ret;
5424 	int err = 0;
5425 	struct fscrypt_name fname;
5426 
5427 	ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5428 	if (ret)
5429 		return ret;
5430 
5431 	path = btrfs_alloc_path();
5432 	if (!path) {
5433 		err = -ENOMEM;
5434 		goto out;
5435 	}
5436 
5437 	err = -ENOENT;
5438 	key.objectid = dir->root->root_key.objectid;
5439 	key.type = BTRFS_ROOT_REF_KEY;
5440 	key.offset = location->objectid;
5441 
5442 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5443 	if (ret) {
5444 		if (ret < 0)
5445 			err = ret;
5446 		goto out;
5447 	}
5448 
5449 	leaf = path->nodes[0];
5450 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5451 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5452 	    btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5453 		goto out;
5454 
5455 	ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5456 				   (unsigned long)(ref + 1), fname.disk_name.len);
5457 	if (ret)
5458 		goto out;
5459 
5460 	btrfs_release_path(path);
5461 
5462 	new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5463 	if (IS_ERR(new_root)) {
5464 		err = PTR_ERR(new_root);
5465 		goto out;
5466 	}
5467 
5468 	*sub_root = new_root;
5469 	location->objectid = btrfs_root_dirid(&new_root->root_item);
5470 	location->type = BTRFS_INODE_ITEM_KEY;
5471 	location->offset = 0;
5472 	err = 0;
5473 out:
5474 	btrfs_free_path(path);
5475 	fscrypt_free_filename(&fname);
5476 	return err;
5477 }
5478 
inode_tree_add(struct btrfs_inode * inode)5479 static void inode_tree_add(struct btrfs_inode *inode)
5480 {
5481 	struct btrfs_root *root = inode->root;
5482 	struct btrfs_inode *entry;
5483 	struct rb_node **p;
5484 	struct rb_node *parent;
5485 	struct rb_node *new = &inode->rb_node;
5486 	u64 ino = btrfs_ino(inode);
5487 
5488 	if (inode_unhashed(&inode->vfs_inode))
5489 		return;
5490 	parent = NULL;
5491 	spin_lock(&root->inode_lock);
5492 	p = &root->inode_tree.rb_node;
5493 	while (*p) {
5494 		parent = *p;
5495 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5496 
5497 		if (ino < btrfs_ino(entry))
5498 			p = &parent->rb_left;
5499 		else if (ino > btrfs_ino(entry))
5500 			p = &parent->rb_right;
5501 		else {
5502 			WARN_ON(!(entry->vfs_inode.i_state &
5503 				  (I_WILL_FREE | I_FREEING)));
5504 			rb_replace_node(parent, new, &root->inode_tree);
5505 			RB_CLEAR_NODE(parent);
5506 			spin_unlock(&root->inode_lock);
5507 			return;
5508 		}
5509 	}
5510 	rb_link_node(new, parent, p);
5511 	rb_insert_color(new, &root->inode_tree);
5512 	spin_unlock(&root->inode_lock);
5513 }
5514 
inode_tree_del(struct btrfs_inode * inode)5515 static void inode_tree_del(struct btrfs_inode *inode)
5516 {
5517 	struct btrfs_root *root = inode->root;
5518 	int empty = 0;
5519 
5520 	spin_lock(&root->inode_lock);
5521 	if (!RB_EMPTY_NODE(&inode->rb_node)) {
5522 		rb_erase(&inode->rb_node, &root->inode_tree);
5523 		RB_CLEAR_NODE(&inode->rb_node);
5524 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5525 	}
5526 	spin_unlock(&root->inode_lock);
5527 
5528 	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5529 		spin_lock(&root->inode_lock);
5530 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5531 		spin_unlock(&root->inode_lock);
5532 		if (empty)
5533 			btrfs_add_dead_root(root);
5534 	}
5535 }
5536 
5537 
btrfs_init_locked_inode(struct inode * inode,void * p)5538 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5539 {
5540 	struct btrfs_iget_args *args = p;
5541 
5542 	inode->i_ino = args->ino;
5543 	BTRFS_I(inode)->location.objectid = args->ino;
5544 	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
5545 	BTRFS_I(inode)->location.offset = 0;
5546 	BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5547 	BUG_ON(args->root && !BTRFS_I(inode)->root);
5548 
5549 	if (args->root && args->root == args->root->fs_info->tree_root &&
5550 	    args->ino != BTRFS_BTREE_INODE_OBJECTID)
5551 		set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5552 			&BTRFS_I(inode)->runtime_flags);
5553 	return 0;
5554 }
5555 
btrfs_find_actor(struct inode * inode,void * opaque)5556 static int btrfs_find_actor(struct inode *inode, void *opaque)
5557 {
5558 	struct btrfs_iget_args *args = opaque;
5559 
5560 	return args->ino == BTRFS_I(inode)->location.objectid &&
5561 		args->root == BTRFS_I(inode)->root;
5562 }
5563 
btrfs_iget_locked(struct super_block * s,u64 ino,struct btrfs_root * root)5564 static struct inode *btrfs_iget_locked(struct super_block *s, u64 ino,
5565 				       struct btrfs_root *root)
5566 {
5567 	struct inode *inode;
5568 	struct btrfs_iget_args args;
5569 	unsigned long hashval = btrfs_inode_hash(ino, root);
5570 
5571 	args.ino = ino;
5572 	args.root = root;
5573 
5574 	inode = iget5_locked(s, hashval, btrfs_find_actor,
5575 			     btrfs_init_locked_inode,
5576 			     (void *)&args);
5577 	return inode;
5578 }
5579 
5580 /*
5581  * Get an inode object given its inode number and corresponding root.
5582  * Path can be preallocated to prevent recursing back to iget through
5583  * allocator. NULL is also valid but may require an additional allocation
5584  * later.
5585  */
btrfs_iget_path(struct super_block * s,u64 ino,struct btrfs_root * root,struct btrfs_path * path)5586 struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
5587 			      struct btrfs_root *root, struct btrfs_path *path)
5588 {
5589 	struct inode *inode;
5590 
5591 	inode = btrfs_iget_locked(s, ino, root);
5592 	if (!inode)
5593 		return ERR_PTR(-ENOMEM);
5594 
5595 	if (inode->i_state & I_NEW) {
5596 		int ret;
5597 
5598 		ret = btrfs_read_locked_inode(inode, path);
5599 		if (!ret) {
5600 			inode_tree_add(BTRFS_I(inode));
5601 			unlock_new_inode(inode);
5602 		} else {
5603 			iget_failed(inode);
5604 			/*
5605 			 * ret > 0 can come from btrfs_search_slot called by
5606 			 * btrfs_read_locked_inode, this means the inode item
5607 			 * was not found.
5608 			 */
5609 			if (ret > 0)
5610 				ret = -ENOENT;
5611 			inode = ERR_PTR(ret);
5612 		}
5613 	}
5614 
5615 	return inode;
5616 }
5617 
btrfs_iget(struct super_block * s,u64 ino,struct btrfs_root * root)5618 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root)
5619 {
5620 	return btrfs_iget_path(s, ino, root, NULL);
5621 }
5622 
new_simple_dir(struct inode * dir,struct btrfs_key * key,struct btrfs_root * root)5623 static struct inode *new_simple_dir(struct inode *dir,
5624 				    struct btrfs_key *key,
5625 				    struct btrfs_root *root)
5626 {
5627 	struct inode *inode = new_inode(dir->i_sb);
5628 
5629 	if (!inode)
5630 		return ERR_PTR(-ENOMEM);
5631 
5632 	BTRFS_I(inode)->root = btrfs_grab_root(root);
5633 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5634 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5635 
5636 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5637 	/*
5638 	 * We only need lookup, the rest is read-only and there's no inode
5639 	 * associated with the dentry
5640 	 */
5641 	inode->i_op = &simple_dir_inode_operations;
5642 	inode->i_opflags &= ~IOP_XATTR;
5643 	inode->i_fop = &simple_dir_operations;
5644 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5645 	inode->i_mtime = inode_set_ctime_current(inode);
5646 	inode->i_atime = dir->i_atime;
5647 	BTRFS_I(inode)->i_otime = inode->i_mtime;
5648 	inode->i_uid = dir->i_uid;
5649 	inode->i_gid = dir->i_gid;
5650 
5651 	return inode;
5652 }
5653 
5654 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5655 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5656 static_assert(BTRFS_FT_DIR == FT_DIR);
5657 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5658 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5659 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5660 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5661 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5662 
btrfs_inode_type(struct inode * inode)5663 static inline u8 btrfs_inode_type(struct inode *inode)
5664 {
5665 	return fs_umode_to_ftype(inode->i_mode);
5666 }
5667 
btrfs_lookup_dentry(struct inode * dir,struct dentry * dentry)5668 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5669 {
5670 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5671 	struct inode *inode;
5672 	struct btrfs_root *root = BTRFS_I(dir)->root;
5673 	struct btrfs_root *sub_root = root;
5674 	struct btrfs_key location = { 0 };
5675 	u8 di_type = 0;
5676 	int ret = 0;
5677 
5678 	if (dentry->d_name.len > BTRFS_NAME_LEN)
5679 		return ERR_PTR(-ENAMETOOLONG);
5680 
5681 	ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5682 	if (ret < 0)
5683 		return ERR_PTR(ret);
5684 
5685 	if (location.type == BTRFS_INODE_ITEM_KEY) {
5686 		inode = btrfs_iget(dir->i_sb, location.objectid, root);
5687 		if (IS_ERR(inode))
5688 			return inode;
5689 
5690 		/* Do extra check against inode mode with di_type */
5691 		if (btrfs_inode_type(inode) != di_type) {
5692 			btrfs_crit(fs_info,
5693 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5694 				  inode->i_mode, btrfs_inode_type(inode),
5695 				  di_type);
5696 			iput(inode);
5697 			return ERR_PTR(-EUCLEAN);
5698 		}
5699 		return inode;
5700 	}
5701 
5702 	ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5703 				       &location, &sub_root);
5704 	if (ret < 0) {
5705 		if (ret != -ENOENT)
5706 			inode = ERR_PTR(ret);
5707 		else
5708 			inode = new_simple_dir(dir, &location, root);
5709 	} else {
5710 		inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
5711 		btrfs_put_root(sub_root);
5712 
5713 		if (IS_ERR(inode))
5714 			return inode;
5715 
5716 		down_read(&fs_info->cleanup_work_sem);
5717 		if (!sb_rdonly(inode->i_sb))
5718 			ret = btrfs_orphan_cleanup(sub_root);
5719 		up_read(&fs_info->cleanup_work_sem);
5720 		if (ret) {
5721 			iput(inode);
5722 			inode = ERR_PTR(ret);
5723 		}
5724 	}
5725 
5726 	return inode;
5727 }
5728 
btrfs_dentry_delete(const struct dentry * dentry)5729 static int btrfs_dentry_delete(const struct dentry *dentry)
5730 {
5731 	struct btrfs_root *root;
5732 	struct inode *inode = d_inode(dentry);
5733 
5734 	if (!inode && !IS_ROOT(dentry))
5735 		inode = d_inode(dentry->d_parent);
5736 
5737 	if (inode) {
5738 		root = BTRFS_I(inode)->root;
5739 		if (btrfs_root_refs(&root->root_item) == 0)
5740 			return 1;
5741 
5742 		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5743 			return 1;
5744 	}
5745 	return 0;
5746 }
5747 
btrfs_lookup(struct inode * dir,struct dentry * dentry,unsigned int flags)5748 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5749 				   unsigned int flags)
5750 {
5751 	struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5752 
5753 	if (inode == ERR_PTR(-ENOENT))
5754 		inode = NULL;
5755 	return d_splice_alias(inode, dentry);
5756 }
5757 
5758 /*
5759  * Find the highest existing sequence number in a directory and then set the
5760  * in-memory index_cnt variable to the first free sequence number.
5761  */
btrfs_set_inode_index_count(struct btrfs_inode * inode)5762 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5763 {
5764 	struct btrfs_root *root = inode->root;
5765 	struct btrfs_key key, found_key;
5766 	struct btrfs_path *path;
5767 	struct extent_buffer *leaf;
5768 	int ret;
5769 
5770 	key.objectid = btrfs_ino(inode);
5771 	key.type = BTRFS_DIR_INDEX_KEY;
5772 	key.offset = (u64)-1;
5773 
5774 	path = btrfs_alloc_path();
5775 	if (!path)
5776 		return -ENOMEM;
5777 
5778 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5779 	if (ret < 0)
5780 		goto out;
5781 	/* FIXME: we should be able to handle this */
5782 	if (ret == 0)
5783 		goto out;
5784 	ret = 0;
5785 
5786 	if (path->slots[0] == 0) {
5787 		inode->index_cnt = BTRFS_DIR_START_INDEX;
5788 		goto out;
5789 	}
5790 
5791 	path->slots[0]--;
5792 
5793 	leaf = path->nodes[0];
5794 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5795 
5796 	if (found_key.objectid != btrfs_ino(inode) ||
5797 	    found_key.type != BTRFS_DIR_INDEX_KEY) {
5798 		inode->index_cnt = BTRFS_DIR_START_INDEX;
5799 		goto out;
5800 	}
5801 
5802 	inode->index_cnt = found_key.offset + 1;
5803 out:
5804 	btrfs_free_path(path);
5805 	return ret;
5806 }
5807 
btrfs_get_dir_last_index(struct btrfs_inode * dir,u64 * index)5808 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5809 {
5810 	int ret = 0;
5811 
5812 	btrfs_inode_lock(dir, 0);
5813 	if (dir->index_cnt == (u64)-1) {
5814 		ret = btrfs_inode_delayed_dir_index_count(dir);
5815 		if (ret) {
5816 			ret = btrfs_set_inode_index_count(dir);
5817 			if (ret)
5818 				goto out;
5819 		}
5820 	}
5821 
5822 	/* index_cnt is the index number of next new entry, so decrement it. */
5823 	*index = dir->index_cnt - 1;
5824 out:
5825 	btrfs_inode_unlock(dir, 0);
5826 
5827 	return ret;
5828 }
5829 
5830 /*
5831  * All this infrastructure exists because dir_emit can fault, and we are holding
5832  * the tree lock when doing readdir.  For now just allocate a buffer and copy
5833  * our information into that, and then dir_emit from the buffer.  This is
5834  * similar to what NFS does, only we don't keep the buffer around in pagecache
5835  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5836  * copy_to_user_inatomic so we don't have to worry about page faulting under the
5837  * tree lock.
5838  */
btrfs_opendir(struct inode * inode,struct file * file)5839 static int btrfs_opendir(struct inode *inode, struct file *file)
5840 {
5841 	struct btrfs_file_private *private;
5842 	u64 last_index;
5843 	int ret;
5844 
5845 	ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5846 	if (ret)
5847 		return ret;
5848 
5849 	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5850 	if (!private)
5851 		return -ENOMEM;
5852 	private->last_index = last_index;
5853 	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5854 	if (!private->filldir_buf) {
5855 		kfree(private);
5856 		return -ENOMEM;
5857 	}
5858 	file->private_data = private;
5859 	return 0;
5860 }
5861 
btrfs_dir_llseek(struct file * file,loff_t offset,int whence)5862 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5863 {
5864 	struct btrfs_file_private *private = file->private_data;
5865 	int ret;
5866 
5867 	ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5868 				       &private->last_index);
5869 	if (ret)
5870 		return ret;
5871 
5872 	return generic_file_llseek(file, offset, whence);
5873 }
5874 
5875 struct dir_entry {
5876 	u64 ino;
5877 	u64 offset;
5878 	unsigned type;
5879 	int name_len;
5880 };
5881 
btrfs_filldir(void * addr,int entries,struct dir_context * ctx)5882 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5883 {
5884 	while (entries--) {
5885 		struct dir_entry *entry = addr;
5886 		char *name = (char *)(entry + 1);
5887 
5888 		ctx->pos = get_unaligned(&entry->offset);
5889 		if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5890 					 get_unaligned(&entry->ino),
5891 					 get_unaligned(&entry->type)))
5892 			return 1;
5893 		addr += sizeof(struct dir_entry) +
5894 			get_unaligned(&entry->name_len);
5895 		ctx->pos++;
5896 	}
5897 	return 0;
5898 }
5899 
btrfs_real_readdir(struct file * file,struct dir_context * ctx)5900 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5901 {
5902 	struct inode *inode = file_inode(file);
5903 	struct btrfs_root *root = BTRFS_I(inode)->root;
5904 	struct btrfs_file_private *private = file->private_data;
5905 	struct btrfs_dir_item *di;
5906 	struct btrfs_key key;
5907 	struct btrfs_key found_key;
5908 	struct btrfs_path *path;
5909 	void *addr;
5910 	LIST_HEAD(ins_list);
5911 	LIST_HEAD(del_list);
5912 	int ret;
5913 	char *name_ptr;
5914 	int name_len;
5915 	int entries = 0;
5916 	int total_len = 0;
5917 	bool put = false;
5918 	struct btrfs_key location;
5919 
5920 	if (!dir_emit_dots(file, ctx))
5921 		return 0;
5922 
5923 	path = btrfs_alloc_path();
5924 	if (!path)
5925 		return -ENOMEM;
5926 
5927 	addr = private->filldir_buf;
5928 	path->reada = READA_FORWARD;
5929 
5930 	put = btrfs_readdir_get_delayed_items(inode, private->last_index,
5931 					      &ins_list, &del_list);
5932 
5933 again:
5934 	key.type = BTRFS_DIR_INDEX_KEY;
5935 	key.offset = ctx->pos;
5936 	key.objectid = btrfs_ino(BTRFS_I(inode));
5937 
5938 	btrfs_for_each_slot(root, &key, &found_key, path, ret) {
5939 		struct dir_entry *entry;
5940 		struct extent_buffer *leaf = path->nodes[0];
5941 		u8 ftype;
5942 
5943 		if (found_key.objectid != key.objectid)
5944 			break;
5945 		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5946 			break;
5947 		if (found_key.offset < ctx->pos)
5948 			continue;
5949 		if (found_key.offset > private->last_index)
5950 			break;
5951 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5952 			continue;
5953 		di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
5954 		name_len = btrfs_dir_name_len(leaf, di);
5955 		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5956 		    PAGE_SIZE) {
5957 			btrfs_release_path(path);
5958 			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5959 			if (ret)
5960 				goto nopos;
5961 			addr = private->filldir_buf;
5962 			entries = 0;
5963 			total_len = 0;
5964 			goto again;
5965 		}
5966 
5967 		ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
5968 		entry = addr;
5969 		name_ptr = (char *)(entry + 1);
5970 		read_extent_buffer(leaf, name_ptr,
5971 				   (unsigned long)(di + 1), name_len);
5972 		put_unaligned(name_len, &entry->name_len);
5973 		put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
5974 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5975 		put_unaligned(location.objectid, &entry->ino);
5976 		put_unaligned(found_key.offset, &entry->offset);
5977 		entries++;
5978 		addr += sizeof(struct dir_entry) + name_len;
5979 		total_len += sizeof(struct dir_entry) + name_len;
5980 	}
5981 	/* Catch error encountered during iteration */
5982 	if (ret < 0)
5983 		goto err;
5984 
5985 	btrfs_release_path(path);
5986 
5987 	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5988 	if (ret)
5989 		goto nopos;
5990 
5991 	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5992 	if (ret)
5993 		goto nopos;
5994 
5995 	/*
5996 	 * Stop new entries from being returned after we return the last
5997 	 * entry.
5998 	 *
5999 	 * New directory entries are assigned a strictly increasing
6000 	 * offset.  This means that new entries created during readdir
6001 	 * are *guaranteed* to be seen in the future by that readdir.
6002 	 * This has broken buggy programs which operate on names as
6003 	 * they're returned by readdir.  Until we re-use freed offsets
6004 	 * we have this hack to stop new entries from being returned
6005 	 * under the assumption that they'll never reach this huge
6006 	 * offset.
6007 	 *
6008 	 * This is being careful not to overflow 32bit loff_t unless the
6009 	 * last entry requires it because doing so has broken 32bit apps
6010 	 * in the past.
6011 	 */
6012 	if (ctx->pos >= INT_MAX)
6013 		ctx->pos = LLONG_MAX;
6014 	else
6015 		ctx->pos = INT_MAX;
6016 nopos:
6017 	ret = 0;
6018 err:
6019 	if (put)
6020 		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6021 	btrfs_free_path(path);
6022 	return ret;
6023 }
6024 
6025 /*
6026  * This is somewhat expensive, updating the tree every time the
6027  * inode changes.  But, it is most likely to find the inode in cache.
6028  * FIXME, needs more benchmarking...there are no reasons other than performance
6029  * to keep or drop this code.
6030  */
btrfs_dirty_inode(struct btrfs_inode * inode)6031 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6032 {
6033 	struct btrfs_root *root = inode->root;
6034 	struct btrfs_fs_info *fs_info = root->fs_info;
6035 	struct btrfs_trans_handle *trans;
6036 	int ret;
6037 
6038 	if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6039 		return 0;
6040 
6041 	trans = btrfs_join_transaction(root);
6042 	if (IS_ERR(trans))
6043 		return PTR_ERR(trans);
6044 
6045 	ret = btrfs_update_inode(trans, root, inode);
6046 	if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
6047 		/* whoops, lets try again with the full transaction */
6048 		btrfs_end_transaction(trans);
6049 		trans = btrfs_start_transaction(root, 1);
6050 		if (IS_ERR(trans))
6051 			return PTR_ERR(trans);
6052 
6053 		ret = btrfs_update_inode(trans, root, inode);
6054 	}
6055 	btrfs_end_transaction(trans);
6056 	if (inode->delayed_node)
6057 		btrfs_balance_delayed_items(fs_info);
6058 
6059 	return ret;
6060 }
6061 
6062 /*
6063  * This is a copy of file_update_time.  We need this so we can return error on
6064  * ENOSPC for updating the inode in the case of file write and mmap writes.
6065  */
btrfs_update_time(struct inode * inode,int flags)6066 static int btrfs_update_time(struct inode *inode, int flags)
6067 {
6068 	struct btrfs_root *root = BTRFS_I(inode)->root;
6069 	bool dirty = flags & ~S_VERSION;
6070 
6071 	if (btrfs_root_readonly(root))
6072 		return -EROFS;
6073 
6074 	dirty = inode_update_timestamps(inode, flags);
6075 	return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6076 }
6077 
6078 /*
6079  * helper to find a free sequence number in a given directory.  This current
6080  * code is very simple, later versions will do smarter things in the btree
6081  */
btrfs_set_inode_index(struct btrfs_inode * dir,u64 * index)6082 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6083 {
6084 	int ret = 0;
6085 
6086 	if (dir->index_cnt == (u64)-1) {
6087 		ret = btrfs_inode_delayed_dir_index_count(dir);
6088 		if (ret) {
6089 			ret = btrfs_set_inode_index_count(dir);
6090 			if (ret)
6091 				return ret;
6092 		}
6093 	}
6094 
6095 	*index = dir->index_cnt;
6096 	dir->index_cnt++;
6097 
6098 	return ret;
6099 }
6100 
btrfs_insert_inode_locked(struct inode * inode)6101 static int btrfs_insert_inode_locked(struct inode *inode)
6102 {
6103 	struct btrfs_iget_args args;
6104 
6105 	args.ino = BTRFS_I(inode)->location.objectid;
6106 	args.root = BTRFS_I(inode)->root;
6107 
6108 	return insert_inode_locked4(inode,
6109 		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6110 		   btrfs_find_actor, &args);
6111 }
6112 
btrfs_new_inode_prepare(struct btrfs_new_inode_args * args,unsigned int * trans_num_items)6113 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6114 			    unsigned int *trans_num_items)
6115 {
6116 	struct inode *dir = args->dir;
6117 	struct inode *inode = args->inode;
6118 	int ret;
6119 
6120 	if (!args->orphan) {
6121 		ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6122 					     &args->fname);
6123 		if (ret)
6124 			return ret;
6125 	}
6126 
6127 	ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6128 	if (ret) {
6129 		fscrypt_free_filename(&args->fname);
6130 		return ret;
6131 	}
6132 
6133 	/* 1 to add inode item */
6134 	*trans_num_items = 1;
6135 	/* 1 to add compression property */
6136 	if (BTRFS_I(dir)->prop_compress)
6137 		(*trans_num_items)++;
6138 	/* 1 to add default ACL xattr */
6139 	if (args->default_acl)
6140 		(*trans_num_items)++;
6141 	/* 1 to add access ACL xattr */
6142 	if (args->acl)
6143 		(*trans_num_items)++;
6144 #ifdef CONFIG_SECURITY
6145 	/* 1 to add LSM xattr */
6146 	if (dir->i_security)
6147 		(*trans_num_items)++;
6148 #endif
6149 	if (args->orphan) {
6150 		/* 1 to add orphan item */
6151 		(*trans_num_items)++;
6152 	} else {
6153 		/*
6154 		 * 1 to add dir item
6155 		 * 1 to add dir index
6156 		 * 1 to update parent inode item
6157 		 *
6158 		 * No need for 1 unit for the inode ref item because it is
6159 		 * inserted in a batch together with the inode item at
6160 		 * btrfs_create_new_inode().
6161 		 */
6162 		*trans_num_items += 3;
6163 	}
6164 	return 0;
6165 }
6166 
btrfs_new_inode_args_destroy(struct btrfs_new_inode_args * args)6167 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6168 {
6169 	posix_acl_release(args->acl);
6170 	posix_acl_release(args->default_acl);
6171 	fscrypt_free_filename(&args->fname);
6172 }
6173 
6174 /*
6175  * Inherit flags from the parent inode.
6176  *
6177  * Currently only the compression flags and the cow flags are inherited.
6178  */
btrfs_inherit_iflags(struct btrfs_inode * inode,struct btrfs_inode * dir)6179 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6180 {
6181 	unsigned int flags;
6182 
6183 	flags = dir->flags;
6184 
6185 	if (flags & BTRFS_INODE_NOCOMPRESS) {
6186 		inode->flags &= ~BTRFS_INODE_COMPRESS;
6187 		inode->flags |= BTRFS_INODE_NOCOMPRESS;
6188 	} else if (flags & BTRFS_INODE_COMPRESS) {
6189 		inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6190 		inode->flags |= BTRFS_INODE_COMPRESS;
6191 	}
6192 
6193 	if (flags & BTRFS_INODE_NODATACOW) {
6194 		inode->flags |= BTRFS_INODE_NODATACOW;
6195 		if (S_ISREG(inode->vfs_inode.i_mode))
6196 			inode->flags |= BTRFS_INODE_NODATASUM;
6197 	}
6198 
6199 	btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode);
6200 }
6201 
btrfs_create_new_inode(struct btrfs_trans_handle * trans,struct btrfs_new_inode_args * args)6202 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6203 			   struct btrfs_new_inode_args *args)
6204 {
6205 	struct inode *dir = args->dir;
6206 	struct inode *inode = args->inode;
6207 	const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6208 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6209 	struct btrfs_root *root;
6210 	struct btrfs_inode_item *inode_item;
6211 	struct btrfs_key *location;
6212 	struct btrfs_path *path;
6213 	u64 objectid;
6214 	struct btrfs_inode_ref *ref;
6215 	struct btrfs_key key[2];
6216 	u32 sizes[2];
6217 	struct btrfs_item_batch batch;
6218 	unsigned long ptr;
6219 	int ret;
6220 
6221 	path = btrfs_alloc_path();
6222 	if (!path)
6223 		return -ENOMEM;
6224 
6225 	if (!args->subvol)
6226 		BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6227 	root = BTRFS_I(inode)->root;
6228 
6229 	ret = btrfs_get_free_objectid(root, &objectid);
6230 	if (ret)
6231 		goto out;
6232 	inode->i_ino = objectid;
6233 
6234 	if (args->orphan) {
6235 		/*
6236 		 * O_TMPFILE, set link count to 0, so that after this point, we
6237 		 * fill in an inode item with the correct link count.
6238 		 */
6239 		set_nlink(inode, 0);
6240 	} else {
6241 		trace_btrfs_inode_request(dir);
6242 
6243 		ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6244 		if (ret)
6245 			goto out;
6246 	}
6247 	/* index_cnt is ignored for everything but a dir. */
6248 	BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6249 	BTRFS_I(inode)->generation = trans->transid;
6250 	inode->i_generation = BTRFS_I(inode)->generation;
6251 
6252 	/*
6253 	 * Subvolumes don't inherit flags from their parent directory.
6254 	 * Originally this was probably by accident, but we probably can't
6255 	 * change it now without compatibility issues.
6256 	 */
6257 	if (!args->subvol)
6258 		btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6259 
6260 	if (S_ISREG(inode->i_mode)) {
6261 		if (btrfs_test_opt(fs_info, NODATASUM))
6262 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6263 		if (btrfs_test_opt(fs_info, NODATACOW))
6264 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6265 				BTRFS_INODE_NODATASUM;
6266 	}
6267 
6268 	location = &BTRFS_I(inode)->location;
6269 	location->objectid = objectid;
6270 	location->offset = 0;
6271 	location->type = BTRFS_INODE_ITEM_KEY;
6272 
6273 	ret = btrfs_insert_inode_locked(inode);
6274 	if (ret < 0) {
6275 		if (!args->orphan)
6276 			BTRFS_I(dir)->index_cnt--;
6277 		goto out;
6278 	}
6279 
6280 	/*
6281 	 * We could have gotten an inode number from somebody who was fsynced
6282 	 * and then removed in this same transaction, so let's just set full
6283 	 * sync since it will be a full sync anyway and this will blow away the
6284 	 * old info in the log.
6285 	 */
6286 	btrfs_set_inode_full_sync(BTRFS_I(inode));
6287 
6288 	key[0].objectid = objectid;
6289 	key[0].type = BTRFS_INODE_ITEM_KEY;
6290 	key[0].offset = 0;
6291 
6292 	sizes[0] = sizeof(struct btrfs_inode_item);
6293 
6294 	if (!args->orphan) {
6295 		/*
6296 		 * Start new inodes with an inode_ref. This is slightly more
6297 		 * efficient for small numbers of hard links since they will
6298 		 * be packed into one item. Extended refs will kick in if we
6299 		 * add more hard links than can fit in the ref item.
6300 		 */
6301 		key[1].objectid = objectid;
6302 		key[1].type = BTRFS_INODE_REF_KEY;
6303 		if (args->subvol) {
6304 			key[1].offset = objectid;
6305 			sizes[1] = 2 + sizeof(*ref);
6306 		} else {
6307 			key[1].offset = btrfs_ino(BTRFS_I(dir));
6308 			sizes[1] = name->len + sizeof(*ref);
6309 		}
6310 	}
6311 
6312 	batch.keys = &key[0];
6313 	batch.data_sizes = &sizes[0];
6314 	batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6315 	batch.nr = args->orphan ? 1 : 2;
6316 	ret = btrfs_insert_empty_items(trans, root, path, &batch);
6317 	if (ret != 0) {
6318 		btrfs_abort_transaction(trans, ret);
6319 		goto discard;
6320 	}
6321 
6322 	inode->i_mtime = inode_set_ctime_current(inode);
6323 	inode->i_atime = inode->i_mtime;
6324 	BTRFS_I(inode)->i_otime = inode->i_mtime;
6325 
6326 	/*
6327 	 * We're going to fill the inode item now, so at this point the inode
6328 	 * must be fully initialized.
6329 	 */
6330 
6331 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6332 				  struct btrfs_inode_item);
6333 	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6334 			     sizeof(*inode_item));
6335 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6336 
6337 	if (!args->orphan) {
6338 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6339 				     struct btrfs_inode_ref);
6340 		ptr = (unsigned long)(ref + 1);
6341 		if (args->subvol) {
6342 			btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6343 			btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6344 			write_extent_buffer(path->nodes[0], "..", ptr, 2);
6345 		} else {
6346 			btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6347 						     name->len);
6348 			btrfs_set_inode_ref_index(path->nodes[0], ref,
6349 						  BTRFS_I(inode)->dir_index);
6350 			write_extent_buffer(path->nodes[0], name->name, ptr,
6351 					    name->len);
6352 		}
6353 	}
6354 
6355 	btrfs_mark_buffer_dirty(trans, path->nodes[0]);
6356 	/*
6357 	 * We don't need the path anymore, plus inheriting properties, adding
6358 	 * ACLs, security xattrs, orphan item or adding the link, will result in
6359 	 * allocating yet another path. So just free our path.
6360 	 */
6361 	btrfs_free_path(path);
6362 	path = NULL;
6363 
6364 	if (args->subvol) {
6365 		struct inode *parent;
6366 
6367 		/*
6368 		 * Subvolumes inherit properties from their parent subvolume,
6369 		 * not the directory they were created in.
6370 		 */
6371 		parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID,
6372 				    BTRFS_I(dir)->root);
6373 		if (IS_ERR(parent)) {
6374 			ret = PTR_ERR(parent);
6375 		} else {
6376 			ret = btrfs_inode_inherit_props(trans, inode, parent);
6377 			iput(parent);
6378 		}
6379 	} else {
6380 		ret = btrfs_inode_inherit_props(trans, inode, dir);
6381 	}
6382 	if (ret) {
6383 		btrfs_err(fs_info,
6384 			  "error inheriting props for ino %llu (root %llu): %d",
6385 			  btrfs_ino(BTRFS_I(inode)), root->root_key.objectid,
6386 			  ret);
6387 	}
6388 
6389 	/*
6390 	 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6391 	 * probably a bug.
6392 	 */
6393 	if (!args->subvol) {
6394 		ret = btrfs_init_inode_security(trans, args);
6395 		if (ret) {
6396 			btrfs_abort_transaction(trans, ret);
6397 			goto discard;
6398 		}
6399 	}
6400 
6401 	inode_tree_add(BTRFS_I(inode));
6402 
6403 	trace_btrfs_inode_new(inode);
6404 	btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6405 
6406 	btrfs_update_root_times(trans, root);
6407 
6408 	if (args->orphan) {
6409 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6410 	} else {
6411 		ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6412 				     0, BTRFS_I(inode)->dir_index);
6413 	}
6414 	if (ret) {
6415 		btrfs_abort_transaction(trans, ret);
6416 		goto discard;
6417 	}
6418 
6419 	return 0;
6420 
6421 discard:
6422 	/*
6423 	 * discard_new_inode() calls iput(), but the caller owns the reference
6424 	 * to the inode.
6425 	 */
6426 	ihold(inode);
6427 	discard_new_inode(inode);
6428 out:
6429 	btrfs_free_path(path);
6430 	return ret;
6431 }
6432 
6433 /*
6434  * utility function to add 'inode' into 'parent_inode' with
6435  * a give name and a given sequence number.
6436  * if 'add_backref' is true, also insert a backref from the
6437  * inode to the parent directory.
6438  */
btrfs_add_link(struct btrfs_trans_handle * trans,struct btrfs_inode * parent_inode,struct btrfs_inode * inode,const struct fscrypt_str * name,int add_backref,u64 index)6439 int btrfs_add_link(struct btrfs_trans_handle *trans,
6440 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6441 		   const struct fscrypt_str *name, int add_backref, u64 index)
6442 {
6443 	int ret = 0;
6444 	struct btrfs_key key;
6445 	struct btrfs_root *root = parent_inode->root;
6446 	u64 ino = btrfs_ino(inode);
6447 	u64 parent_ino = btrfs_ino(parent_inode);
6448 
6449 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6450 		memcpy(&key, &inode->root->root_key, sizeof(key));
6451 	} else {
6452 		key.objectid = ino;
6453 		key.type = BTRFS_INODE_ITEM_KEY;
6454 		key.offset = 0;
6455 	}
6456 
6457 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6458 		ret = btrfs_add_root_ref(trans, key.objectid,
6459 					 root->root_key.objectid, parent_ino,
6460 					 index, name);
6461 	} else if (add_backref) {
6462 		ret = btrfs_insert_inode_ref(trans, root, name,
6463 					     ino, parent_ino, index);
6464 	}
6465 
6466 	/* Nothing to clean up yet */
6467 	if (ret)
6468 		return ret;
6469 
6470 	ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6471 				    btrfs_inode_type(&inode->vfs_inode), index);
6472 	if (ret == -EEXIST || ret == -EOVERFLOW)
6473 		goto fail_dir_item;
6474 	else if (ret) {
6475 		btrfs_abort_transaction(trans, ret);
6476 		return ret;
6477 	}
6478 
6479 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6480 			   name->len * 2);
6481 	inode_inc_iversion(&parent_inode->vfs_inode);
6482 	/*
6483 	 * If we are replaying a log tree, we do not want to update the mtime
6484 	 * and ctime of the parent directory with the current time, since the
6485 	 * log replay procedure is responsible for setting them to their correct
6486 	 * values (the ones it had when the fsync was done).
6487 	 */
6488 	if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6489 		parent_inode->vfs_inode.i_mtime =
6490 			inode_set_ctime_current(&parent_inode->vfs_inode);
6491 
6492 	ret = btrfs_update_inode(trans, root, parent_inode);
6493 	if (ret)
6494 		btrfs_abort_transaction(trans, ret);
6495 	return ret;
6496 
6497 fail_dir_item:
6498 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6499 		u64 local_index;
6500 		int err;
6501 		err = btrfs_del_root_ref(trans, key.objectid,
6502 					 root->root_key.objectid, parent_ino,
6503 					 &local_index, name);
6504 		if (err)
6505 			btrfs_abort_transaction(trans, err);
6506 	} else if (add_backref) {
6507 		u64 local_index;
6508 		int err;
6509 
6510 		err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6511 					  &local_index);
6512 		if (err)
6513 			btrfs_abort_transaction(trans, err);
6514 	}
6515 
6516 	/* Return the original error code */
6517 	return ret;
6518 }
6519 
btrfs_create_common(struct inode * dir,struct dentry * dentry,struct inode * inode)6520 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6521 			       struct inode *inode)
6522 {
6523 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6524 	struct btrfs_root *root = BTRFS_I(dir)->root;
6525 	struct btrfs_new_inode_args new_inode_args = {
6526 		.dir = dir,
6527 		.dentry = dentry,
6528 		.inode = inode,
6529 	};
6530 	unsigned int trans_num_items;
6531 	struct btrfs_trans_handle *trans;
6532 	int err;
6533 
6534 	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6535 	if (err)
6536 		goto out_inode;
6537 
6538 	trans = btrfs_start_transaction(root, trans_num_items);
6539 	if (IS_ERR(trans)) {
6540 		err = PTR_ERR(trans);
6541 		goto out_new_inode_args;
6542 	}
6543 
6544 	err = btrfs_create_new_inode(trans, &new_inode_args);
6545 	if (!err)
6546 		d_instantiate_new(dentry, inode);
6547 
6548 	btrfs_end_transaction(trans);
6549 	btrfs_btree_balance_dirty(fs_info);
6550 out_new_inode_args:
6551 	btrfs_new_inode_args_destroy(&new_inode_args);
6552 out_inode:
6553 	if (err)
6554 		iput(inode);
6555 	return err;
6556 }
6557 
btrfs_mknod(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,umode_t mode,dev_t rdev)6558 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6559 		       struct dentry *dentry, umode_t mode, dev_t rdev)
6560 {
6561 	struct inode *inode;
6562 
6563 	inode = new_inode(dir->i_sb);
6564 	if (!inode)
6565 		return -ENOMEM;
6566 	inode_init_owner(idmap, inode, dir, mode);
6567 	inode->i_op = &btrfs_special_inode_operations;
6568 	init_special_inode(inode, inode->i_mode, rdev);
6569 	return btrfs_create_common(dir, dentry, inode);
6570 }
6571 
btrfs_create(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,umode_t mode,bool excl)6572 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6573 			struct dentry *dentry, umode_t mode, bool excl)
6574 {
6575 	struct inode *inode;
6576 
6577 	inode = new_inode(dir->i_sb);
6578 	if (!inode)
6579 		return -ENOMEM;
6580 	inode_init_owner(idmap, inode, dir, mode);
6581 	inode->i_fop = &btrfs_file_operations;
6582 	inode->i_op = &btrfs_file_inode_operations;
6583 	inode->i_mapping->a_ops = &btrfs_aops;
6584 	return btrfs_create_common(dir, dentry, inode);
6585 }
6586 
btrfs_link(struct dentry * old_dentry,struct inode * dir,struct dentry * dentry)6587 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6588 		      struct dentry *dentry)
6589 {
6590 	struct btrfs_trans_handle *trans = NULL;
6591 	struct btrfs_root *root = BTRFS_I(dir)->root;
6592 	struct inode *inode = d_inode(old_dentry);
6593 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6594 	struct fscrypt_name fname;
6595 	u64 index;
6596 	int err;
6597 	int drop_inode = 0;
6598 
6599 	/* do not allow sys_link's with other subvols of the same device */
6600 	if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
6601 		return -EXDEV;
6602 
6603 	if (inode->i_nlink >= BTRFS_LINK_MAX)
6604 		return -EMLINK;
6605 
6606 	err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6607 	if (err)
6608 		goto fail;
6609 
6610 	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6611 	if (err)
6612 		goto fail;
6613 
6614 	/*
6615 	 * 2 items for inode and inode ref
6616 	 * 2 items for dir items
6617 	 * 1 item for parent inode
6618 	 * 1 item for orphan item deletion if O_TMPFILE
6619 	 */
6620 	trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6621 	if (IS_ERR(trans)) {
6622 		err = PTR_ERR(trans);
6623 		trans = NULL;
6624 		goto fail;
6625 	}
6626 
6627 	/* There are several dir indexes for this inode, clear the cache. */
6628 	BTRFS_I(inode)->dir_index = 0ULL;
6629 	inc_nlink(inode);
6630 	inode_inc_iversion(inode);
6631 	inode_set_ctime_current(inode);
6632 	ihold(inode);
6633 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6634 
6635 	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6636 			     &fname.disk_name, 1, index);
6637 
6638 	if (err) {
6639 		drop_inode = 1;
6640 	} else {
6641 		struct dentry *parent = dentry->d_parent;
6642 
6643 		err = btrfs_update_inode(trans, root, BTRFS_I(inode));
6644 		if (err)
6645 			goto fail;
6646 		if (inode->i_nlink == 1) {
6647 			/*
6648 			 * If new hard link count is 1, it's a file created
6649 			 * with open(2) O_TMPFILE flag.
6650 			 */
6651 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6652 			if (err)
6653 				goto fail;
6654 		}
6655 		d_instantiate(dentry, inode);
6656 		btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6657 	}
6658 
6659 fail:
6660 	fscrypt_free_filename(&fname);
6661 	if (trans)
6662 		btrfs_end_transaction(trans);
6663 	if (drop_inode) {
6664 		inode_dec_link_count(inode);
6665 		iput(inode);
6666 	}
6667 	btrfs_btree_balance_dirty(fs_info);
6668 	return err;
6669 }
6670 
btrfs_mkdir(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,umode_t mode)6671 static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6672 		       struct dentry *dentry, umode_t mode)
6673 {
6674 	struct inode *inode;
6675 
6676 	inode = new_inode(dir->i_sb);
6677 	if (!inode)
6678 		return -ENOMEM;
6679 	inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6680 	inode->i_op = &btrfs_dir_inode_operations;
6681 	inode->i_fop = &btrfs_dir_file_operations;
6682 	return btrfs_create_common(dir, dentry, inode);
6683 }
6684 
uncompress_inline(struct btrfs_path * path,struct page * page,struct btrfs_file_extent_item * item)6685 static noinline int uncompress_inline(struct btrfs_path *path,
6686 				      struct page *page,
6687 				      struct btrfs_file_extent_item *item)
6688 {
6689 	int ret;
6690 	struct extent_buffer *leaf = path->nodes[0];
6691 	char *tmp;
6692 	size_t max_size;
6693 	unsigned long inline_size;
6694 	unsigned long ptr;
6695 	int compress_type;
6696 
6697 	compress_type = btrfs_file_extent_compression(leaf, item);
6698 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6699 	inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6700 	tmp = kmalloc(inline_size, GFP_NOFS);
6701 	if (!tmp)
6702 		return -ENOMEM;
6703 	ptr = btrfs_file_extent_inline_start(item);
6704 
6705 	read_extent_buffer(leaf, tmp, ptr, inline_size);
6706 
6707 	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6708 	ret = btrfs_decompress(compress_type, tmp, page, 0, inline_size, max_size);
6709 
6710 	/*
6711 	 * decompression code contains a memset to fill in any space between the end
6712 	 * of the uncompressed data and the end of max_size in case the decompressed
6713 	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6714 	 * the end of an inline extent and the beginning of the next block, so we
6715 	 * cover that region here.
6716 	 */
6717 
6718 	if (max_size < PAGE_SIZE)
6719 		memzero_page(page, max_size, PAGE_SIZE - max_size);
6720 	kfree(tmp);
6721 	return ret;
6722 }
6723 
read_inline_extent(struct btrfs_inode * inode,struct btrfs_path * path,struct page * page)6724 static int read_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path,
6725 			      struct page *page)
6726 {
6727 	struct btrfs_file_extent_item *fi;
6728 	void *kaddr;
6729 	size_t copy_size;
6730 
6731 	if (!page || PageUptodate(page))
6732 		return 0;
6733 
6734 	ASSERT(page_offset(page) == 0);
6735 
6736 	fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6737 			    struct btrfs_file_extent_item);
6738 	if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6739 		return uncompress_inline(path, page, fi);
6740 
6741 	copy_size = min_t(u64, PAGE_SIZE,
6742 			  btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6743 	kaddr = kmap_local_page(page);
6744 	read_extent_buffer(path->nodes[0], kaddr,
6745 			   btrfs_file_extent_inline_start(fi), copy_size);
6746 	kunmap_local(kaddr);
6747 	if (copy_size < PAGE_SIZE)
6748 		memzero_page(page, copy_size, PAGE_SIZE - copy_size);
6749 	return 0;
6750 }
6751 
6752 /*
6753  * Lookup the first extent overlapping a range in a file.
6754  *
6755  * @inode:	file to search in
6756  * @page:	page to read extent data into if the extent is inline
6757  * @pg_offset:	offset into @page to copy to
6758  * @start:	file offset
6759  * @len:	length of range starting at @start
6760  *
6761  * Return the first &struct extent_map which overlaps the given range, reading
6762  * it from the B-tree and caching it if necessary. Note that there may be more
6763  * extents which overlap the given range after the returned extent_map.
6764  *
6765  * If @page is not NULL and the extent is inline, this also reads the extent
6766  * data directly into the page and marks the extent up to date in the io_tree.
6767  *
6768  * Return: ERR_PTR on error, non-NULL extent_map on success.
6769  */
btrfs_get_extent(struct btrfs_inode * inode,struct page * page,size_t pg_offset,u64 start,u64 len)6770 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6771 				    struct page *page, size_t pg_offset,
6772 				    u64 start, u64 len)
6773 {
6774 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
6775 	int ret = 0;
6776 	u64 extent_start = 0;
6777 	u64 extent_end = 0;
6778 	u64 objectid = btrfs_ino(inode);
6779 	int extent_type = -1;
6780 	struct btrfs_path *path = NULL;
6781 	struct btrfs_root *root = inode->root;
6782 	struct btrfs_file_extent_item *item;
6783 	struct extent_buffer *leaf;
6784 	struct btrfs_key found_key;
6785 	struct extent_map *em = NULL;
6786 	struct extent_map_tree *em_tree = &inode->extent_tree;
6787 
6788 	read_lock(&em_tree->lock);
6789 	em = lookup_extent_mapping(em_tree, start, len);
6790 	read_unlock(&em_tree->lock);
6791 
6792 	if (em) {
6793 		if (em->start > start || em->start + em->len <= start)
6794 			free_extent_map(em);
6795 		else if (em->block_start == EXTENT_MAP_INLINE && page)
6796 			free_extent_map(em);
6797 		else
6798 			goto out;
6799 	}
6800 	em = alloc_extent_map();
6801 	if (!em) {
6802 		ret = -ENOMEM;
6803 		goto out;
6804 	}
6805 	em->start = EXTENT_MAP_HOLE;
6806 	em->orig_start = EXTENT_MAP_HOLE;
6807 	em->len = (u64)-1;
6808 	em->block_len = (u64)-1;
6809 
6810 	path = btrfs_alloc_path();
6811 	if (!path) {
6812 		ret = -ENOMEM;
6813 		goto out;
6814 	}
6815 
6816 	/* Chances are we'll be called again, so go ahead and do readahead */
6817 	path->reada = READA_FORWARD;
6818 
6819 	/*
6820 	 * The same explanation in load_free_space_cache applies here as well,
6821 	 * we only read when we're loading the free space cache, and at that
6822 	 * point the commit_root has everything we need.
6823 	 */
6824 	if (btrfs_is_free_space_inode(inode)) {
6825 		path->search_commit_root = 1;
6826 		path->skip_locking = 1;
6827 	}
6828 
6829 	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6830 	if (ret < 0) {
6831 		goto out;
6832 	} else if (ret > 0) {
6833 		if (path->slots[0] == 0)
6834 			goto not_found;
6835 		path->slots[0]--;
6836 		ret = 0;
6837 	}
6838 
6839 	leaf = path->nodes[0];
6840 	item = btrfs_item_ptr(leaf, path->slots[0],
6841 			      struct btrfs_file_extent_item);
6842 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6843 	if (found_key.objectid != objectid ||
6844 	    found_key.type != BTRFS_EXTENT_DATA_KEY) {
6845 		/*
6846 		 * If we backup past the first extent we want to move forward
6847 		 * and see if there is an extent in front of us, otherwise we'll
6848 		 * say there is a hole for our whole search range which can
6849 		 * cause problems.
6850 		 */
6851 		extent_end = start;
6852 		goto next;
6853 	}
6854 
6855 	extent_type = btrfs_file_extent_type(leaf, item);
6856 	extent_start = found_key.offset;
6857 	extent_end = btrfs_file_extent_end(path);
6858 	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6859 	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6860 		/* Only regular file could have regular/prealloc extent */
6861 		if (!S_ISREG(inode->vfs_inode.i_mode)) {
6862 			ret = -EUCLEAN;
6863 			btrfs_crit(fs_info,
6864 		"regular/prealloc extent found for non-regular inode %llu",
6865 				   btrfs_ino(inode));
6866 			goto out;
6867 		}
6868 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6869 						       extent_start);
6870 	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6871 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6872 						      path->slots[0],
6873 						      extent_start);
6874 	}
6875 next:
6876 	if (start >= extent_end) {
6877 		path->slots[0]++;
6878 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6879 			ret = btrfs_next_leaf(root, path);
6880 			if (ret < 0)
6881 				goto out;
6882 			else if (ret > 0)
6883 				goto not_found;
6884 
6885 			leaf = path->nodes[0];
6886 		}
6887 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6888 		if (found_key.objectid != objectid ||
6889 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6890 			goto not_found;
6891 		if (start + len <= found_key.offset)
6892 			goto not_found;
6893 		if (start > found_key.offset)
6894 			goto next;
6895 
6896 		/* New extent overlaps with existing one */
6897 		em->start = start;
6898 		em->orig_start = start;
6899 		em->len = found_key.offset - start;
6900 		em->block_start = EXTENT_MAP_HOLE;
6901 		goto insert;
6902 	}
6903 
6904 	btrfs_extent_item_to_extent_map(inode, path, item, em);
6905 
6906 	if (extent_type == BTRFS_FILE_EXTENT_REG ||
6907 	    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6908 		goto insert;
6909 	} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6910 		/*
6911 		 * Inline extent can only exist at file offset 0. This is
6912 		 * ensured by tree-checker and inline extent creation path.
6913 		 * Thus all members representing file offsets should be zero.
6914 		 */
6915 		ASSERT(pg_offset == 0);
6916 		ASSERT(extent_start == 0);
6917 		ASSERT(em->start == 0);
6918 
6919 		/*
6920 		 * btrfs_extent_item_to_extent_map() should have properly
6921 		 * initialized em members already.
6922 		 *
6923 		 * Other members are not utilized for inline extents.
6924 		 */
6925 		ASSERT(em->block_start == EXTENT_MAP_INLINE);
6926 		ASSERT(em->len == fs_info->sectorsize);
6927 
6928 		ret = read_inline_extent(inode, path, page);
6929 		if (ret < 0)
6930 			goto out;
6931 		goto insert;
6932 	}
6933 not_found:
6934 	em->start = start;
6935 	em->orig_start = start;
6936 	em->len = len;
6937 	em->block_start = EXTENT_MAP_HOLE;
6938 insert:
6939 	ret = 0;
6940 	btrfs_release_path(path);
6941 	if (em->start > start || extent_map_end(em) <= start) {
6942 		btrfs_err(fs_info,
6943 			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
6944 			  em->start, em->len, start, len);
6945 		ret = -EIO;
6946 		goto out;
6947 	}
6948 
6949 	write_lock(&em_tree->lock);
6950 	ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
6951 	write_unlock(&em_tree->lock);
6952 out:
6953 	btrfs_free_path(path);
6954 
6955 	trace_btrfs_get_extent(root, inode, em);
6956 
6957 	if (ret) {
6958 		free_extent_map(em);
6959 		return ERR_PTR(ret);
6960 	}
6961 	return em;
6962 }
6963 
btrfs_create_dio_extent(struct btrfs_inode * inode,struct btrfs_dio_data * dio_data,const u64 start,const u64 len,const u64 orig_start,const u64 block_start,const u64 block_len,const u64 orig_block_len,const u64 ram_bytes,const int type)6964 static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode,
6965 						  struct btrfs_dio_data *dio_data,
6966 						  const u64 start,
6967 						  const u64 len,
6968 						  const u64 orig_start,
6969 						  const u64 block_start,
6970 						  const u64 block_len,
6971 						  const u64 orig_block_len,
6972 						  const u64 ram_bytes,
6973 						  const int type)
6974 {
6975 	struct extent_map *em = NULL;
6976 	struct btrfs_ordered_extent *ordered;
6977 
6978 	if (type != BTRFS_ORDERED_NOCOW) {
6979 		em = create_io_em(inode, start, len, orig_start, block_start,
6980 				  block_len, orig_block_len, ram_bytes,
6981 				  BTRFS_COMPRESS_NONE, /* compress_type */
6982 				  type);
6983 		if (IS_ERR(em))
6984 			goto out;
6985 	}
6986 	ordered = btrfs_alloc_ordered_extent(inode, start, len, len,
6987 					     block_start, block_len, 0,
6988 					     (1 << type) |
6989 					     (1 << BTRFS_ORDERED_DIRECT),
6990 					     BTRFS_COMPRESS_NONE);
6991 	if (IS_ERR(ordered)) {
6992 		if (em) {
6993 			free_extent_map(em);
6994 			btrfs_drop_extent_map_range(inode, start,
6995 						    start + len - 1, false);
6996 		}
6997 		em = ERR_CAST(ordered);
6998 	} else {
6999 		ASSERT(!dio_data->ordered);
7000 		dio_data->ordered = ordered;
7001 	}
7002  out:
7003 
7004 	return em;
7005 }
7006 
btrfs_new_extent_direct(struct btrfs_inode * inode,struct btrfs_dio_data * dio_data,u64 start,u64 len)7007 static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
7008 						  struct btrfs_dio_data *dio_data,
7009 						  u64 start, u64 len)
7010 {
7011 	struct btrfs_root *root = inode->root;
7012 	struct btrfs_fs_info *fs_info = root->fs_info;
7013 	struct extent_map *em;
7014 	struct btrfs_key ins;
7015 	u64 alloc_hint;
7016 	int ret;
7017 
7018 	alloc_hint = get_extent_allocation_hint(inode, start, len);
7019 again:
7020 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7021 				   0, alloc_hint, &ins, 1, 1);
7022 	if (ret == -EAGAIN) {
7023 		ASSERT(btrfs_is_zoned(fs_info));
7024 		wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH,
7025 			       TASK_UNINTERRUPTIBLE);
7026 		goto again;
7027 	}
7028 	if (ret)
7029 		return ERR_PTR(ret);
7030 
7031 	em = btrfs_create_dio_extent(inode, dio_data, start, ins.offset, start,
7032 				     ins.objectid, ins.offset, ins.offset,
7033 				     ins.offset, BTRFS_ORDERED_REGULAR);
7034 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7035 	if (IS_ERR(em))
7036 		btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset,
7037 					   1);
7038 
7039 	return em;
7040 }
7041 
btrfs_extent_readonly(struct btrfs_fs_info * fs_info,u64 bytenr)7042 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7043 {
7044 	struct btrfs_block_group *block_group;
7045 	bool readonly = false;
7046 
7047 	block_group = btrfs_lookup_block_group(fs_info, bytenr);
7048 	if (!block_group || block_group->ro)
7049 		readonly = true;
7050 	if (block_group)
7051 		btrfs_put_block_group(block_group);
7052 	return readonly;
7053 }
7054 
7055 /*
7056  * Check if we can do nocow write into the range [@offset, @offset + @len)
7057  *
7058  * @offset:	File offset
7059  * @len:	The length to write, will be updated to the nocow writeable
7060  *		range
7061  * @orig_start:	(optional) Return the original file offset of the file extent
7062  * @orig_len:	(optional) Return the original on-disk length of the file extent
7063  * @ram_bytes:	(optional) Return the ram_bytes of the file extent
7064  * @strict:	if true, omit optimizations that might force us into unnecessary
7065  *		cow. e.g., don't trust generation number.
7066  *
7067  * Return:
7068  * >0	and update @len if we can do nocow write
7069  *  0	if we can't do nocow write
7070  * <0	if error happened
7071  *
7072  * NOTE: This only checks the file extents, caller is responsible to wait for
7073  *	 any ordered extents.
7074  */
can_nocow_extent(struct inode * inode,u64 offset,u64 * len,u64 * orig_start,u64 * orig_block_len,u64 * ram_bytes,bool nowait,bool strict)7075 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7076 			      u64 *orig_start, u64 *orig_block_len,
7077 			      u64 *ram_bytes, bool nowait, bool strict)
7078 {
7079 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7080 	struct can_nocow_file_extent_args nocow_args = { 0 };
7081 	struct btrfs_path *path;
7082 	int ret;
7083 	struct extent_buffer *leaf;
7084 	struct btrfs_root *root = BTRFS_I(inode)->root;
7085 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7086 	struct btrfs_file_extent_item *fi;
7087 	struct btrfs_key key;
7088 	int found_type;
7089 
7090 	path = btrfs_alloc_path();
7091 	if (!path)
7092 		return -ENOMEM;
7093 	path->nowait = nowait;
7094 
7095 	ret = btrfs_lookup_file_extent(NULL, root, path,
7096 			btrfs_ino(BTRFS_I(inode)), offset, 0);
7097 	if (ret < 0)
7098 		goto out;
7099 
7100 	if (ret == 1) {
7101 		if (path->slots[0] == 0) {
7102 			/* can't find the item, must cow */
7103 			ret = 0;
7104 			goto out;
7105 		}
7106 		path->slots[0]--;
7107 	}
7108 	ret = 0;
7109 	leaf = path->nodes[0];
7110 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7111 	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7112 	    key.type != BTRFS_EXTENT_DATA_KEY) {
7113 		/* not our file or wrong item type, must cow */
7114 		goto out;
7115 	}
7116 
7117 	if (key.offset > offset) {
7118 		/* Wrong offset, must cow */
7119 		goto out;
7120 	}
7121 
7122 	if (btrfs_file_extent_end(path) <= offset)
7123 		goto out;
7124 
7125 	fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7126 	found_type = btrfs_file_extent_type(leaf, fi);
7127 	if (ram_bytes)
7128 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7129 
7130 	nocow_args.start = offset;
7131 	nocow_args.end = offset + *len - 1;
7132 	nocow_args.strict = strict;
7133 	nocow_args.free_path = true;
7134 
7135 	ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args);
7136 	/* can_nocow_file_extent() has freed the path. */
7137 	path = NULL;
7138 
7139 	if (ret != 1) {
7140 		/* Treat errors as not being able to NOCOW. */
7141 		ret = 0;
7142 		goto out;
7143 	}
7144 
7145 	ret = 0;
7146 	if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr))
7147 		goto out;
7148 
7149 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7150 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7151 		u64 range_end;
7152 
7153 		range_end = round_up(offset + nocow_args.num_bytes,
7154 				     root->fs_info->sectorsize) - 1;
7155 		ret = test_range_bit(io_tree, offset, range_end,
7156 				     EXTENT_DELALLOC, 0, NULL);
7157 		if (ret) {
7158 			ret = -EAGAIN;
7159 			goto out;
7160 		}
7161 	}
7162 
7163 	if (orig_start)
7164 		*orig_start = key.offset - nocow_args.extent_offset;
7165 	if (orig_block_len)
7166 		*orig_block_len = nocow_args.disk_num_bytes;
7167 
7168 	*len = nocow_args.num_bytes;
7169 	ret = 1;
7170 out:
7171 	btrfs_free_path(path);
7172 	return ret;
7173 }
7174 
lock_extent_direct(struct inode * inode,u64 lockstart,u64 lockend,struct extent_state ** cached_state,unsigned int iomap_flags)7175 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7176 			      struct extent_state **cached_state,
7177 			      unsigned int iomap_flags)
7178 {
7179 	const bool writing = (iomap_flags & IOMAP_WRITE);
7180 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7181 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7182 	struct btrfs_ordered_extent *ordered;
7183 	int ret = 0;
7184 
7185 	while (1) {
7186 		if (nowait) {
7187 			if (!try_lock_extent(io_tree, lockstart, lockend,
7188 					     cached_state))
7189 				return -EAGAIN;
7190 		} else {
7191 			lock_extent(io_tree, lockstart, lockend, cached_state);
7192 		}
7193 		/*
7194 		 * We're concerned with the entire range that we're going to be
7195 		 * doing DIO to, so we need to make sure there's no ordered
7196 		 * extents in this range.
7197 		 */
7198 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7199 						     lockend - lockstart + 1);
7200 
7201 		/*
7202 		 * We need to make sure there are no buffered pages in this
7203 		 * range either, we could have raced between the invalidate in
7204 		 * generic_file_direct_write and locking the extent.  The
7205 		 * invalidate needs to happen so that reads after a write do not
7206 		 * get stale data.
7207 		 */
7208 		if (!ordered &&
7209 		    (!writing || !filemap_range_has_page(inode->i_mapping,
7210 							 lockstart, lockend)))
7211 			break;
7212 
7213 		unlock_extent(io_tree, lockstart, lockend, cached_state);
7214 
7215 		if (ordered) {
7216 			if (nowait) {
7217 				btrfs_put_ordered_extent(ordered);
7218 				ret = -EAGAIN;
7219 				break;
7220 			}
7221 			/*
7222 			 * If we are doing a DIO read and the ordered extent we
7223 			 * found is for a buffered write, we can not wait for it
7224 			 * to complete and retry, because if we do so we can
7225 			 * deadlock with concurrent buffered writes on page
7226 			 * locks. This happens only if our DIO read covers more
7227 			 * than one extent map, if at this point has already
7228 			 * created an ordered extent for a previous extent map
7229 			 * and locked its range in the inode's io tree, and a
7230 			 * concurrent write against that previous extent map's
7231 			 * range and this range started (we unlock the ranges
7232 			 * in the io tree only when the bios complete and
7233 			 * buffered writes always lock pages before attempting
7234 			 * to lock range in the io tree).
7235 			 */
7236 			if (writing ||
7237 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7238 				btrfs_start_ordered_extent(ordered);
7239 			else
7240 				ret = nowait ? -EAGAIN : -ENOTBLK;
7241 			btrfs_put_ordered_extent(ordered);
7242 		} else {
7243 			/*
7244 			 * We could trigger writeback for this range (and wait
7245 			 * for it to complete) and then invalidate the pages for
7246 			 * this range (through invalidate_inode_pages2_range()),
7247 			 * but that can lead us to a deadlock with a concurrent
7248 			 * call to readahead (a buffered read or a defrag call
7249 			 * triggered a readahead) on a page lock due to an
7250 			 * ordered dio extent we created before but did not have
7251 			 * yet a corresponding bio submitted (whence it can not
7252 			 * complete), which makes readahead wait for that
7253 			 * ordered extent to complete while holding a lock on
7254 			 * that page.
7255 			 */
7256 			ret = nowait ? -EAGAIN : -ENOTBLK;
7257 		}
7258 
7259 		if (ret)
7260 			break;
7261 
7262 		cond_resched();
7263 	}
7264 
7265 	return ret;
7266 }
7267 
7268 /* The callers of this must take lock_extent() */
create_io_em(struct btrfs_inode * inode,u64 start,u64 len,u64 orig_start,u64 block_start,u64 block_len,u64 orig_block_len,u64 ram_bytes,int compress_type,int type)7269 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
7270 				       u64 len, u64 orig_start, u64 block_start,
7271 				       u64 block_len, u64 orig_block_len,
7272 				       u64 ram_bytes, int compress_type,
7273 				       int type)
7274 {
7275 	struct extent_map *em;
7276 	int ret;
7277 
7278 	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7279 	       type == BTRFS_ORDERED_COMPRESSED ||
7280 	       type == BTRFS_ORDERED_NOCOW ||
7281 	       type == BTRFS_ORDERED_REGULAR);
7282 
7283 	em = alloc_extent_map();
7284 	if (!em)
7285 		return ERR_PTR(-ENOMEM);
7286 
7287 	em->start = start;
7288 	em->orig_start = orig_start;
7289 	em->len = len;
7290 	em->block_len = block_len;
7291 	em->block_start = block_start;
7292 	em->orig_block_len = orig_block_len;
7293 	em->ram_bytes = ram_bytes;
7294 	em->generation = -1;
7295 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7296 	if (type == BTRFS_ORDERED_PREALLOC) {
7297 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7298 	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7299 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7300 		em->compress_type = compress_type;
7301 	}
7302 
7303 	ret = btrfs_replace_extent_map_range(inode, em, true);
7304 	if (ret) {
7305 		free_extent_map(em);
7306 		return ERR_PTR(ret);
7307 	}
7308 
7309 	/* em got 2 refs now, callers needs to do free_extent_map once. */
7310 	return em;
7311 }
7312 
7313 
btrfs_get_blocks_direct_write(struct extent_map ** map,struct inode * inode,struct btrfs_dio_data * dio_data,u64 start,u64 * lenp,unsigned int iomap_flags)7314 static int btrfs_get_blocks_direct_write(struct extent_map **map,
7315 					 struct inode *inode,
7316 					 struct btrfs_dio_data *dio_data,
7317 					 u64 start, u64 *lenp,
7318 					 unsigned int iomap_flags)
7319 {
7320 	const bool nowait = (iomap_flags & IOMAP_NOWAIT);
7321 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7322 	struct extent_map *em = *map;
7323 	int type;
7324 	u64 block_start, orig_start, orig_block_len, ram_bytes;
7325 	struct btrfs_block_group *bg;
7326 	bool can_nocow = false;
7327 	bool space_reserved = false;
7328 	u64 len = *lenp;
7329 	u64 prev_len;
7330 	int ret = 0;
7331 
7332 	/*
7333 	 * We don't allocate a new extent in the following cases
7334 	 *
7335 	 * 1) The inode is marked as NODATACOW. In this case we'll just use the
7336 	 * existing extent.
7337 	 * 2) The extent is marked as PREALLOC. We're good to go here and can
7338 	 * just use the extent.
7339 	 *
7340 	 */
7341 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7342 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7343 	     em->block_start != EXTENT_MAP_HOLE)) {
7344 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7345 			type = BTRFS_ORDERED_PREALLOC;
7346 		else
7347 			type = BTRFS_ORDERED_NOCOW;
7348 		len = min(len, em->len - (start - em->start));
7349 		block_start = em->block_start + (start - em->start);
7350 
7351 		if (can_nocow_extent(inode, start, &len, &orig_start,
7352 				     &orig_block_len, &ram_bytes, false, false) == 1) {
7353 			bg = btrfs_inc_nocow_writers(fs_info, block_start);
7354 			if (bg)
7355 				can_nocow = true;
7356 		}
7357 	}
7358 
7359 	prev_len = len;
7360 	if (can_nocow) {
7361 		struct extent_map *em2;
7362 
7363 		/* We can NOCOW, so only need to reserve metadata space. */
7364 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7365 						      nowait);
7366 		if (ret < 0) {
7367 			/* Our caller expects us to free the input extent map. */
7368 			free_extent_map(em);
7369 			*map = NULL;
7370 			btrfs_dec_nocow_writers(bg);
7371 			if (nowait && (ret == -ENOSPC || ret == -EDQUOT))
7372 				ret = -EAGAIN;
7373 			goto out;
7374 		}
7375 		space_reserved = true;
7376 
7377 		em2 = btrfs_create_dio_extent(BTRFS_I(inode), dio_data, start, len,
7378 					      orig_start, block_start,
7379 					      len, orig_block_len,
7380 					      ram_bytes, type);
7381 		btrfs_dec_nocow_writers(bg);
7382 		if (type == BTRFS_ORDERED_PREALLOC) {
7383 			free_extent_map(em);
7384 			*map = em2;
7385 			em = em2;
7386 		}
7387 
7388 		if (IS_ERR(em2)) {
7389 			ret = PTR_ERR(em2);
7390 			goto out;
7391 		}
7392 
7393 		dio_data->nocow_done = true;
7394 	} else {
7395 		/* Our caller expects us to free the input extent map. */
7396 		free_extent_map(em);
7397 		*map = NULL;
7398 
7399 		if (nowait) {
7400 			ret = -EAGAIN;
7401 			goto out;
7402 		}
7403 
7404 		/*
7405 		 * If we could not allocate data space before locking the file
7406 		 * range and we can't do a NOCOW write, then we have to fail.
7407 		 */
7408 		if (!dio_data->data_space_reserved) {
7409 			ret = -ENOSPC;
7410 			goto out;
7411 		}
7412 
7413 		/*
7414 		 * We have to COW and we have already reserved data space before,
7415 		 * so now we reserve only metadata.
7416 		 */
7417 		ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len,
7418 						      false);
7419 		if (ret < 0)
7420 			goto out;
7421 		space_reserved = true;
7422 
7423 		em = btrfs_new_extent_direct(BTRFS_I(inode), dio_data, start, len);
7424 		if (IS_ERR(em)) {
7425 			ret = PTR_ERR(em);
7426 			goto out;
7427 		}
7428 		*map = em;
7429 		len = min(len, em->len - (start - em->start));
7430 		if (len < prev_len)
7431 			btrfs_delalloc_release_metadata(BTRFS_I(inode),
7432 							prev_len - len, true);
7433 	}
7434 
7435 	/*
7436 	 * We have created our ordered extent, so we can now release our reservation
7437 	 * for an outstanding extent.
7438 	 */
7439 	btrfs_delalloc_release_extents(BTRFS_I(inode), prev_len);
7440 
7441 	/*
7442 	 * Need to update the i_size under the extent lock so buffered
7443 	 * readers will get the updated i_size when we unlock.
7444 	 */
7445 	if (start + len > i_size_read(inode))
7446 		i_size_write(inode, start + len);
7447 out:
7448 	if (ret && space_reserved) {
7449 		btrfs_delalloc_release_extents(BTRFS_I(inode), len);
7450 		btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true);
7451 	}
7452 	*lenp = len;
7453 	return ret;
7454 }
7455 
btrfs_dio_iomap_begin(struct inode * inode,loff_t start,loff_t length,unsigned int flags,struct iomap * iomap,struct iomap * srcmap)7456 static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
7457 		loff_t length, unsigned int flags, struct iomap *iomap,
7458 		struct iomap *srcmap)
7459 {
7460 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7461 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7462 	struct extent_map *em;
7463 	struct extent_state *cached_state = NULL;
7464 	struct btrfs_dio_data *dio_data = iter->private;
7465 	u64 lockstart, lockend;
7466 	const bool write = !!(flags & IOMAP_WRITE);
7467 	int ret = 0;
7468 	u64 len = length;
7469 	const u64 data_alloc_len = length;
7470 	bool unlock_extents = false;
7471 
7472 	/*
7473 	 * We could potentially fault if we have a buffer > PAGE_SIZE, and if
7474 	 * we're NOWAIT we may submit a bio for a partial range and return
7475 	 * EIOCBQUEUED, which would result in an errant short read.
7476 	 *
7477 	 * The best way to handle this would be to allow for partial completions
7478 	 * of iocb's, so we could submit the partial bio, return and fault in
7479 	 * the rest of the pages, and then submit the io for the rest of the
7480 	 * range.  However we don't have that currently, so simply return
7481 	 * -EAGAIN at this point so that the normal path is used.
7482 	 */
7483 	if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
7484 		return -EAGAIN;
7485 
7486 	/*
7487 	 * Cap the size of reads to that usually seen in buffered I/O as we need
7488 	 * to allocate a contiguous array for the checksums.
7489 	 */
7490 	if (!write)
7491 		len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
7492 
7493 	lockstart = start;
7494 	lockend = start + len - 1;
7495 
7496 	/*
7497 	 * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't
7498 	 * enough if we've written compressed pages to this area, so we need to
7499 	 * flush the dirty pages again to make absolutely sure that any
7500 	 * outstanding dirty pages are on disk - the first flush only starts
7501 	 * compression on the data, while keeping the pages locked, so by the
7502 	 * time the second flush returns we know bios for the compressed pages
7503 	 * were submitted and finished, and the pages no longer under writeback.
7504 	 *
7505 	 * If we have a NOWAIT request and we have any pages in the range that
7506 	 * are locked, likely due to compression still in progress, we don't want
7507 	 * to block on page locks. We also don't want to block on pages marked as
7508 	 * dirty or under writeback (same as for the non-compression case).
7509 	 * iomap_dio_rw() did the same check, but after that and before we got
7510 	 * here, mmap'ed writes may have happened or buffered reads started
7511 	 * (readpage() and readahead(), which lock pages), as we haven't locked
7512 	 * the file range yet.
7513 	 */
7514 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
7515 		     &BTRFS_I(inode)->runtime_flags)) {
7516 		if (flags & IOMAP_NOWAIT) {
7517 			if (filemap_range_needs_writeback(inode->i_mapping,
7518 							  lockstart, lockend))
7519 				return -EAGAIN;
7520 		} else {
7521 			ret = filemap_fdatawrite_range(inode->i_mapping, start,
7522 						       start + length - 1);
7523 			if (ret)
7524 				return ret;
7525 		}
7526 	}
7527 
7528 	memset(dio_data, 0, sizeof(*dio_data));
7529 
7530 	/*
7531 	 * We always try to allocate data space and must do it before locking
7532 	 * the file range, to avoid deadlocks with concurrent writes to the same
7533 	 * range if the range has several extents and the writes don't expand the
7534 	 * current i_size (the inode lock is taken in shared mode). If we fail to
7535 	 * allocate data space here we continue and later, after locking the
7536 	 * file range, we fail with ENOSPC only if we figure out we can not do a
7537 	 * NOCOW write.
7538 	 */
7539 	if (write && !(flags & IOMAP_NOWAIT)) {
7540 		ret = btrfs_check_data_free_space(BTRFS_I(inode),
7541 						  &dio_data->data_reserved,
7542 						  start, data_alloc_len, false);
7543 		if (!ret)
7544 			dio_data->data_space_reserved = true;
7545 		else if (ret && !(BTRFS_I(inode)->flags &
7546 				  (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
7547 			goto err;
7548 	}
7549 
7550 	/*
7551 	 * If this errors out it's because we couldn't invalidate pagecache for
7552 	 * this range and we need to fallback to buffered IO, or we are doing a
7553 	 * NOWAIT read/write and we need to block.
7554 	 */
7555 	ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags);
7556 	if (ret < 0)
7557 		goto err;
7558 
7559 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
7560 	if (IS_ERR(em)) {
7561 		ret = PTR_ERR(em);
7562 		goto unlock_err;
7563 	}
7564 
7565 	/*
7566 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7567 	 * io.  INLINE is special, and we could probably kludge it in here, but
7568 	 * it's still buffered so for safety lets just fall back to the generic
7569 	 * buffered path.
7570 	 *
7571 	 * For COMPRESSED we _have_ to read the entire extent in so we can
7572 	 * decompress it, so there will be buffering required no matter what we
7573 	 * do, so go ahead and fallback to buffered.
7574 	 *
7575 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7576 	 * to buffered IO.  Don't blame me, this is the price we pay for using
7577 	 * the generic code.
7578 	 */
7579 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7580 	    em->block_start == EXTENT_MAP_INLINE) {
7581 		free_extent_map(em);
7582 		/*
7583 		 * If we are in a NOWAIT context, return -EAGAIN in order to
7584 		 * fallback to buffered IO. This is not only because we can
7585 		 * block with buffered IO (no support for NOWAIT semantics at
7586 		 * the moment) but also to avoid returning short reads to user
7587 		 * space - this happens if we were able to read some data from
7588 		 * previous non-compressed extents and then when we fallback to
7589 		 * buffered IO, at btrfs_file_read_iter() by calling
7590 		 * filemap_read(), we fail to fault in pages for the read buffer,
7591 		 * in which case filemap_read() returns a short read (the number
7592 		 * of bytes previously read is > 0, so it does not return -EFAULT).
7593 		 */
7594 		ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
7595 		goto unlock_err;
7596 	}
7597 
7598 	len = min(len, em->len - (start - em->start));
7599 
7600 	/*
7601 	 * If we have a NOWAIT request and the range contains multiple extents
7602 	 * (or a mix of extents and holes), then we return -EAGAIN to make the
7603 	 * caller fallback to a context where it can do a blocking (without
7604 	 * NOWAIT) request. This way we avoid doing partial IO and returning
7605 	 * success to the caller, which is not optimal for writes and for reads
7606 	 * it can result in unexpected behaviour for an application.
7607 	 *
7608 	 * When doing a read, because we use IOMAP_DIO_PARTIAL when calling
7609 	 * iomap_dio_rw(), we can end up returning less data then what the caller
7610 	 * asked for, resulting in an unexpected, and incorrect, short read.
7611 	 * That is, the caller asked to read N bytes and we return less than that,
7612 	 * which is wrong unless we are crossing EOF. This happens if we get a
7613 	 * page fault error when trying to fault in pages for the buffer that is
7614 	 * associated to the struct iov_iter passed to iomap_dio_rw(), and we
7615 	 * have previously submitted bios for other extents in the range, in
7616 	 * which case iomap_dio_rw() may return us EIOCBQUEUED if not all of
7617 	 * those bios have completed by the time we get the page fault error,
7618 	 * which we return back to our caller - we should only return EIOCBQUEUED
7619 	 * after we have submitted bios for all the extents in the range.
7620 	 */
7621 	if ((flags & IOMAP_NOWAIT) && len < length) {
7622 		free_extent_map(em);
7623 		ret = -EAGAIN;
7624 		goto unlock_err;
7625 	}
7626 
7627 	if (write) {
7628 		ret = btrfs_get_blocks_direct_write(&em, inode, dio_data,
7629 						    start, &len, flags);
7630 		if (ret < 0)
7631 			goto unlock_err;
7632 		unlock_extents = true;
7633 		/* Recalc len in case the new em is smaller than requested */
7634 		len = min(len, em->len - (start - em->start));
7635 		if (dio_data->data_space_reserved) {
7636 			u64 release_offset;
7637 			u64 release_len = 0;
7638 
7639 			if (dio_data->nocow_done) {
7640 				release_offset = start;
7641 				release_len = data_alloc_len;
7642 			} else if (len < data_alloc_len) {
7643 				release_offset = start + len;
7644 				release_len = data_alloc_len - len;
7645 			}
7646 
7647 			if (release_len > 0)
7648 				btrfs_free_reserved_data_space(BTRFS_I(inode),
7649 							       dio_data->data_reserved,
7650 							       release_offset,
7651 							       release_len);
7652 		}
7653 	} else {
7654 		/*
7655 		 * We need to unlock only the end area that we aren't using.
7656 		 * The rest is going to be unlocked by the endio routine.
7657 		 */
7658 		lockstart = start + len;
7659 		if (lockstart < lockend)
7660 			unlock_extents = true;
7661 	}
7662 
7663 	if (unlock_extents)
7664 		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7665 			      &cached_state);
7666 	else
7667 		free_extent_state(cached_state);
7668 
7669 	/*
7670 	 * Translate extent map information to iomap.
7671 	 * We trim the extents (and move the addr) even though iomap code does
7672 	 * that, since we have locked only the parts we are performing I/O in.
7673 	 */
7674 	if ((em->block_start == EXTENT_MAP_HOLE) ||
7675 	    (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) {
7676 		iomap->addr = IOMAP_NULL_ADDR;
7677 		iomap->type = IOMAP_HOLE;
7678 	} else {
7679 		iomap->addr = em->block_start + (start - em->start);
7680 		iomap->type = IOMAP_MAPPED;
7681 	}
7682 	iomap->offset = start;
7683 	iomap->bdev = fs_info->fs_devices->latest_dev->bdev;
7684 	iomap->length = len;
7685 	free_extent_map(em);
7686 
7687 	return 0;
7688 
7689 unlock_err:
7690 	unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7691 		      &cached_state);
7692 err:
7693 	if (dio_data->data_space_reserved) {
7694 		btrfs_free_reserved_data_space(BTRFS_I(inode),
7695 					       dio_data->data_reserved,
7696 					       start, data_alloc_len);
7697 		extent_changeset_free(dio_data->data_reserved);
7698 	}
7699 
7700 	return ret;
7701 }
7702 
btrfs_dio_iomap_end(struct inode * inode,loff_t pos,loff_t length,ssize_t written,unsigned int flags,struct iomap * iomap)7703 static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
7704 		ssize_t written, unsigned int flags, struct iomap *iomap)
7705 {
7706 	struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap);
7707 	struct btrfs_dio_data *dio_data = iter->private;
7708 	size_t submitted = dio_data->submitted;
7709 	const bool write = !!(flags & IOMAP_WRITE);
7710 	int ret = 0;
7711 
7712 	if (!write && (iomap->type == IOMAP_HOLE)) {
7713 		/* If reading from a hole, unlock and return */
7714 		unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1,
7715 			      NULL);
7716 		return 0;
7717 	}
7718 
7719 	if (submitted < length) {
7720 		pos += submitted;
7721 		length -= submitted;
7722 		if (write)
7723 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7724 						    pos, length, false);
7725 		else
7726 			unlock_extent(&BTRFS_I(inode)->io_tree, pos,
7727 				      pos + length - 1, NULL);
7728 		ret = -ENOTBLK;
7729 	}
7730 	if (write) {
7731 		btrfs_put_ordered_extent(dio_data->ordered);
7732 		dio_data->ordered = NULL;
7733 	}
7734 
7735 	if (write)
7736 		extent_changeset_free(dio_data->data_reserved);
7737 	return ret;
7738 }
7739 
btrfs_dio_end_io(struct btrfs_bio * bbio)7740 static void btrfs_dio_end_io(struct btrfs_bio *bbio)
7741 {
7742 	struct btrfs_dio_private *dip =
7743 		container_of(bbio, struct btrfs_dio_private, bbio);
7744 	struct btrfs_inode *inode = bbio->inode;
7745 	struct bio *bio = &bbio->bio;
7746 
7747 	if (bio->bi_status) {
7748 		btrfs_warn(inode->root->fs_info,
7749 		"direct IO failed ino %llu op 0x%0x offset %#llx len %u err no %d",
7750 			   btrfs_ino(inode), bio->bi_opf,
7751 			   dip->file_offset, dip->bytes, bio->bi_status);
7752 	}
7753 
7754 	if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
7755 		btrfs_finish_ordered_extent(bbio->ordered, NULL,
7756 					    dip->file_offset, dip->bytes,
7757 					    !bio->bi_status);
7758 	} else {
7759 		unlock_extent(&inode->io_tree, dip->file_offset,
7760 			      dip->file_offset + dip->bytes - 1, NULL);
7761 	}
7762 
7763 	bbio->bio.bi_private = bbio->private;
7764 	iomap_dio_bio_end_io(bio);
7765 }
7766 
btrfs_dio_submit_io(const struct iomap_iter * iter,struct bio * bio,loff_t file_offset)7767 static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
7768 				loff_t file_offset)
7769 {
7770 	struct btrfs_bio *bbio = btrfs_bio(bio);
7771 	struct btrfs_dio_private *dip =
7772 		container_of(bbio, struct btrfs_dio_private, bbio);
7773 	struct btrfs_dio_data *dio_data = iter->private;
7774 
7775 	btrfs_bio_init(bbio, BTRFS_I(iter->inode)->root->fs_info,
7776 		       btrfs_dio_end_io, bio->bi_private);
7777 	bbio->inode = BTRFS_I(iter->inode);
7778 	bbio->file_offset = file_offset;
7779 
7780 	dip->file_offset = file_offset;
7781 	dip->bytes = bio->bi_iter.bi_size;
7782 
7783 	dio_data->submitted += bio->bi_iter.bi_size;
7784 
7785 	/*
7786 	 * Check if we are doing a partial write.  If we are, we need to split
7787 	 * the ordered extent to match the submitted bio.  Hang on to the
7788 	 * remaining unfinishable ordered_extent in dio_data so that it can be
7789 	 * cancelled in iomap_end to avoid a deadlock wherein faulting the
7790 	 * remaining pages is blocked on the outstanding ordered extent.
7791 	 */
7792 	if (iter->flags & IOMAP_WRITE) {
7793 		int ret;
7794 
7795 		ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered);
7796 		if (ret) {
7797 			btrfs_finish_ordered_extent(dio_data->ordered, NULL,
7798 						    file_offset, dip->bytes,
7799 						    !ret);
7800 			bio->bi_status = errno_to_blk_status(ret);
7801 			iomap_dio_bio_end_io(bio);
7802 			return;
7803 		}
7804 	}
7805 
7806 	btrfs_submit_bio(bbio, 0);
7807 }
7808 
7809 static const struct iomap_ops btrfs_dio_iomap_ops = {
7810 	.iomap_begin            = btrfs_dio_iomap_begin,
7811 	.iomap_end              = btrfs_dio_iomap_end,
7812 };
7813 
7814 static const struct iomap_dio_ops btrfs_dio_ops = {
7815 	.submit_io		= btrfs_dio_submit_io,
7816 	.bio_set		= &btrfs_dio_bioset,
7817 };
7818 
btrfs_dio_read(struct kiocb * iocb,struct iov_iter * iter,size_t done_before)7819 ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter, size_t done_before)
7820 {
7821 	struct btrfs_dio_data data = { 0 };
7822 
7823 	return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7824 			    IOMAP_DIO_PARTIAL, &data, done_before);
7825 }
7826 
btrfs_dio_write(struct kiocb * iocb,struct iov_iter * iter,size_t done_before)7827 struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
7828 				  size_t done_before)
7829 {
7830 	struct btrfs_dio_data data = { 0 };
7831 
7832 	return __iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
7833 			    IOMAP_DIO_PARTIAL, &data, done_before);
7834 }
7835 
btrfs_fiemap(struct inode * inode,struct fiemap_extent_info * fieinfo,u64 start,u64 len)7836 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7837 			u64 start, u64 len)
7838 {
7839 	struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
7840 	int	ret;
7841 
7842 	ret = fiemap_prep(inode, fieinfo, start, &len, 0);
7843 	if (ret)
7844 		return ret;
7845 
7846 	/*
7847 	 * fiemap_prep() called filemap_write_and_wait() for the whole possible
7848 	 * file range (0 to LLONG_MAX), but that is not enough if we have
7849 	 * compression enabled. The first filemap_fdatawrite_range() only kicks
7850 	 * in the compression of data (in an async thread) and will return
7851 	 * before the compression is done and writeback is started. A second
7852 	 * filemap_fdatawrite_range() is needed to wait for the compression to
7853 	 * complete and writeback to start. We also need to wait for ordered
7854 	 * extents to complete, because our fiemap implementation uses mainly
7855 	 * file extent items to list the extents, searching for extent maps
7856 	 * only for file ranges with holes or prealloc extents to figure out
7857 	 * if we have delalloc in those ranges.
7858 	 */
7859 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7860 		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7861 		if (ret)
7862 			return ret;
7863 	}
7864 
7865 	btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED);
7866 
7867 	/*
7868 	 * We did an initial flush to avoid holding the inode's lock while
7869 	 * triggering writeback and waiting for the completion of IO and ordered
7870 	 * extents. Now after we locked the inode we do it again, because it's
7871 	 * possible a new write may have happened in between those two steps.
7872 	 */
7873 	if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) {
7874 		ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX);
7875 		if (ret) {
7876 			btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7877 			return ret;
7878 		}
7879 	}
7880 
7881 	ret = extent_fiemap(btrfs_inode, fieinfo, start, len);
7882 	btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED);
7883 
7884 	return ret;
7885 }
7886 
btrfs_writepages(struct address_space * mapping,struct writeback_control * wbc)7887 static int btrfs_writepages(struct address_space *mapping,
7888 			    struct writeback_control *wbc)
7889 {
7890 	return extent_writepages(mapping, wbc);
7891 }
7892 
btrfs_readahead(struct readahead_control * rac)7893 static void btrfs_readahead(struct readahead_control *rac)
7894 {
7895 	extent_readahead(rac);
7896 }
7897 
7898 /*
7899  * For release_folio() and invalidate_folio() we have a race window where
7900  * folio_end_writeback() is called but the subpage spinlock is not yet released.
7901  * If we continue to release/invalidate the page, we could cause use-after-free
7902  * for subpage spinlock.  So this function is to spin and wait for subpage
7903  * spinlock.
7904  */
wait_subpage_spinlock(struct page * page)7905 static void wait_subpage_spinlock(struct page *page)
7906 {
7907 	struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
7908 	struct btrfs_subpage *subpage;
7909 
7910 	if (!btrfs_is_subpage(fs_info, page))
7911 		return;
7912 
7913 	ASSERT(PagePrivate(page) && page->private);
7914 	subpage = (struct btrfs_subpage *)page->private;
7915 
7916 	/*
7917 	 * This may look insane as we just acquire the spinlock and release it,
7918 	 * without doing anything.  But we just want to make sure no one is
7919 	 * still holding the subpage spinlock.
7920 	 * And since the page is not dirty nor writeback, and we have page
7921 	 * locked, the only possible way to hold a spinlock is from the endio
7922 	 * function to clear page writeback.
7923 	 *
7924 	 * Here we just acquire the spinlock so that all existing callers
7925 	 * should exit and we're safe to release/invalidate the page.
7926 	 */
7927 	spin_lock_irq(&subpage->lock);
7928 	spin_unlock_irq(&subpage->lock);
7929 }
7930 
__btrfs_release_folio(struct folio * folio,gfp_t gfp_flags)7931 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7932 {
7933 	int ret = try_release_extent_mapping(&folio->page, gfp_flags);
7934 
7935 	if (ret == 1) {
7936 		wait_subpage_spinlock(&folio->page);
7937 		clear_page_extent_mapped(&folio->page);
7938 	}
7939 	return ret;
7940 }
7941 
btrfs_release_folio(struct folio * folio,gfp_t gfp_flags)7942 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7943 {
7944 	if (folio_test_writeback(folio) || folio_test_dirty(folio))
7945 		return false;
7946 	return __btrfs_release_folio(folio, gfp_flags);
7947 }
7948 
7949 #ifdef CONFIG_MIGRATION
btrfs_migrate_folio(struct address_space * mapping,struct folio * dst,struct folio * src,enum migrate_mode mode)7950 static int btrfs_migrate_folio(struct address_space *mapping,
7951 			     struct folio *dst, struct folio *src,
7952 			     enum migrate_mode mode)
7953 {
7954 	int ret = filemap_migrate_folio(mapping, dst, src, mode);
7955 
7956 	if (ret != MIGRATEPAGE_SUCCESS)
7957 		return ret;
7958 
7959 	if (folio_test_ordered(src)) {
7960 		folio_clear_ordered(src);
7961 		folio_set_ordered(dst);
7962 	}
7963 
7964 	return MIGRATEPAGE_SUCCESS;
7965 }
7966 #else
7967 #define btrfs_migrate_folio NULL
7968 #endif
7969 
btrfs_invalidate_folio(struct folio * folio,size_t offset,size_t length)7970 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7971 				 size_t length)
7972 {
7973 	struct btrfs_inode *inode = BTRFS_I(folio->mapping->host);
7974 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
7975 	struct extent_io_tree *tree = &inode->io_tree;
7976 	struct extent_state *cached_state = NULL;
7977 	u64 page_start = folio_pos(folio);
7978 	u64 page_end = page_start + folio_size(folio) - 1;
7979 	u64 cur;
7980 	int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7981 
7982 	/*
7983 	 * We have folio locked so no new ordered extent can be created on this
7984 	 * page, nor bio can be submitted for this folio.
7985 	 *
7986 	 * But already submitted bio can still be finished on this folio.
7987 	 * Furthermore, endio function won't skip folio which has Ordered
7988 	 * (Private2) already cleared, so it's possible for endio and
7989 	 * invalidate_folio to do the same ordered extent accounting twice
7990 	 * on one folio.
7991 	 *
7992 	 * So here we wait for any submitted bios to finish, so that we won't
7993 	 * do double ordered extent accounting on the same folio.
7994 	 */
7995 	folio_wait_writeback(folio);
7996 	wait_subpage_spinlock(&folio->page);
7997 
7998 	/*
7999 	 * For subpage case, we have call sites like
8000 	 * btrfs_punch_hole_lock_range() which passes range not aligned to
8001 	 * sectorsize.
8002 	 * If the range doesn't cover the full folio, we don't need to and
8003 	 * shouldn't clear page extent mapped, as folio->private can still
8004 	 * record subpage dirty bits for other part of the range.
8005 	 *
8006 	 * For cases that invalidate the full folio even the range doesn't
8007 	 * cover the full folio, like invalidating the last folio, we're
8008 	 * still safe to wait for ordered extent to finish.
8009 	 */
8010 	if (!(offset == 0 && length == folio_size(folio))) {
8011 		btrfs_release_folio(folio, GFP_NOFS);
8012 		return;
8013 	}
8014 
8015 	if (!inode_evicting)
8016 		lock_extent(tree, page_start, page_end, &cached_state);
8017 
8018 	cur = page_start;
8019 	while (cur < page_end) {
8020 		struct btrfs_ordered_extent *ordered;
8021 		u64 range_end;
8022 		u32 range_len;
8023 		u32 extra_flags = 0;
8024 
8025 		ordered = btrfs_lookup_first_ordered_range(inode, cur,
8026 							   page_end + 1 - cur);
8027 		if (!ordered) {
8028 			range_end = page_end;
8029 			/*
8030 			 * No ordered extent covering this range, we are safe
8031 			 * to delete all extent states in the range.
8032 			 */
8033 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8034 			goto next;
8035 		}
8036 		if (ordered->file_offset > cur) {
8037 			/*
8038 			 * There is a range between [cur, oe->file_offset) not
8039 			 * covered by any ordered extent.
8040 			 * We are safe to delete all extent states, and handle
8041 			 * the ordered extent in the next iteration.
8042 			 */
8043 			range_end = ordered->file_offset - 1;
8044 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8045 			goto next;
8046 		}
8047 
8048 		range_end = min(ordered->file_offset + ordered->num_bytes - 1,
8049 				page_end);
8050 		ASSERT(range_end + 1 - cur < U32_MAX);
8051 		range_len = range_end + 1 - cur;
8052 		if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) {
8053 			/*
8054 			 * If Ordered (Private2) is cleared, it means endio has
8055 			 * already been executed for the range.
8056 			 * We can't delete the extent states as
8057 			 * btrfs_finish_ordered_io() may still use some of them.
8058 			 */
8059 			goto next;
8060 		}
8061 		btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len);
8062 
8063 		/*
8064 		 * IO on this page will never be started, so we need to account
8065 		 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
8066 		 * here, must leave that up for the ordered extent completion.
8067 		 *
8068 		 * This will also unlock the range for incoming
8069 		 * btrfs_finish_ordered_io().
8070 		 */
8071 		if (!inode_evicting)
8072 			clear_extent_bit(tree, cur, range_end,
8073 					 EXTENT_DELALLOC |
8074 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8075 					 EXTENT_DEFRAG, &cached_state);
8076 
8077 		spin_lock_irq(&inode->ordered_tree.lock);
8078 		set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8079 		ordered->truncated_len = min(ordered->truncated_len,
8080 					     cur - ordered->file_offset);
8081 		spin_unlock_irq(&inode->ordered_tree.lock);
8082 
8083 		/*
8084 		 * If the ordered extent has finished, we're safe to delete all
8085 		 * the extent states of the range, otherwise
8086 		 * btrfs_finish_ordered_io() will get executed by endio for
8087 		 * other pages, so we can't delete extent states.
8088 		 */
8089 		if (btrfs_dec_test_ordered_pending(inode, &ordered,
8090 						   cur, range_end + 1 - cur)) {
8091 			btrfs_finish_ordered_io(ordered);
8092 			/*
8093 			 * The ordered extent has finished, now we're again
8094 			 * safe to delete all extent states of the range.
8095 			 */
8096 			extra_flags = EXTENT_CLEAR_ALL_BITS;
8097 		}
8098 next:
8099 		if (ordered)
8100 			btrfs_put_ordered_extent(ordered);
8101 		/*
8102 		 * Qgroup reserved space handler
8103 		 * Sector(s) here will be either:
8104 		 *
8105 		 * 1) Already written to disk or bio already finished
8106 		 *    Then its QGROUP_RESERVED bit in io_tree is already cleared.
8107 		 *    Qgroup will be handled by its qgroup_record then.
8108 		 *    btrfs_qgroup_free_data() call will do nothing here.
8109 		 *
8110 		 * 2) Not written to disk yet
8111 		 *    Then btrfs_qgroup_free_data() call will clear the
8112 		 *    QGROUP_RESERVED bit of its io_tree, and free the qgroup
8113 		 *    reserved data space.
8114 		 *    Since the IO will never happen for this page.
8115 		 */
8116 		btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
8117 		if (!inode_evicting) {
8118 			clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
8119 				 EXTENT_DELALLOC | EXTENT_UPTODATE |
8120 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
8121 				 extra_flags, &cached_state);
8122 		}
8123 		cur = range_end + 1;
8124 	}
8125 	/*
8126 	 * We have iterated through all ordered extents of the page, the page
8127 	 * should not have Ordered (Private2) anymore, or the above iteration
8128 	 * did something wrong.
8129 	 */
8130 	ASSERT(!folio_test_ordered(folio));
8131 	btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio));
8132 	if (!inode_evicting)
8133 		__btrfs_release_folio(folio, GFP_NOFS);
8134 	clear_page_extent_mapped(&folio->page);
8135 }
8136 
8137 /*
8138  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8139  * called from a page fault handler when a page is first dirtied. Hence we must
8140  * be careful to check for EOF conditions here. We set the page up correctly
8141  * for a written page which means we get ENOSPC checking when writing into
8142  * holes and correct delalloc and unwritten extent mapping on filesystems that
8143  * support these features.
8144  *
8145  * We are not allowed to take the i_mutex here so we have to play games to
8146  * protect against truncate races as the page could now be beyond EOF.  Because
8147  * truncate_setsize() writes the inode size before removing pages, once we have
8148  * the page lock we can determine safely if the page is beyond EOF. If it is not
8149  * beyond EOF, then the page is guaranteed safe against truncation until we
8150  * unlock the page.
8151  */
btrfs_page_mkwrite(struct vm_fault * vmf)8152 vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
8153 {
8154 	struct page *page = vmf->page;
8155 	struct inode *inode = file_inode(vmf->vma->vm_file);
8156 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8157 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8158 	struct btrfs_ordered_extent *ordered;
8159 	struct extent_state *cached_state = NULL;
8160 	struct extent_changeset *data_reserved = NULL;
8161 	unsigned long zero_start;
8162 	loff_t size;
8163 	vm_fault_t ret;
8164 	int ret2;
8165 	int reserved = 0;
8166 	u64 reserved_space;
8167 	u64 page_start;
8168 	u64 page_end;
8169 	u64 end;
8170 
8171 	reserved_space = PAGE_SIZE;
8172 
8173 	sb_start_pagefault(inode->i_sb);
8174 	page_start = page_offset(page);
8175 	page_end = page_start + PAGE_SIZE - 1;
8176 	end = page_end;
8177 
8178 	/*
8179 	 * Reserving delalloc space after obtaining the page lock can lead to
8180 	 * deadlock. For example, if a dirty page is locked by this function
8181 	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8182 	 * dirty page write out, then the btrfs_writepages() function could
8183 	 * end up waiting indefinitely to get a lock on the page currently
8184 	 * being processed by btrfs_page_mkwrite() function.
8185 	 */
8186 	ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved,
8187 					    page_start, reserved_space);
8188 	if (!ret2) {
8189 		ret2 = file_update_time(vmf->vma->vm_file);
8190 		reserved = 1;
8191 	}
8192 	if (ret2) {
8193 		ret = vmf_error(ret2);
8194 		if (reserved)
8195 			goto out;
8196 		goto out_noreserve;
8197 	}
8198 
8199 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8200 again:
8201 	down_read(&BTRFS_I(inode)->i_mmap_lock);
8202 	lock_page(page);
8203 	size = i_size_read(inode);
8204 
8205 	if ((page->mapping != inode->i_mapping) ||
8206 	    (page_start >= size)) {
8207 		/* page got truncated out from underneath us */
8208 		goto out_unlock;
8209 	}
8210 	wait_on_page_writeback(page);
8211 
8212 	lock_extent(io_tree, page_start, page_end, &cached_state);
8213 	ret2 = set_page_extent_mapped(page);
8214 	if (ret2 < 0) {
8215 		ret = vmf_error(ret2);
8216 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8217 		goto out_unlock;
8218 	}
8219 
8220 	/*
8221 	 * we can't set the delalloc bits if there are pending ordered
8222 	 * extents.  Drop our locks and wait for them to finish
8223 	 */
8224 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
8225 			PAGE_SIZE);
8226 	if (ordered) {
8227 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8228 		unlock_page(page);
8229 		up_read(&BTRFS_I(inode)->i_mmap_lock);
8230 		btrfs_start_ordered_extent(ordered);
8231 		btrfs_put_ordered_extent(ordered);
8232 		goto again;
8233 	}
8234 
8235 	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
8236 		reserved_space = round_up(size - page_start,
8237 					  fs_info->sectorsize);
8238 		if (reserved_space < PAGE_SIZE) {
8239 			end = page_start + reserved_space - 1;
8240 			btrfs_delalloc_release_space(BTRFS_I(inode),
8241 					data_reserved, page_start,
8242 					PAGE_SIZE - reserved_space, true);
8243 		}
8244 	}
8245 
8246 	/*
8247 	 * page_mkwrite gets called when the page is firstly dirtied after it's
8248 	 * faulted in, but write(2) could also dirty a page and set delalloc
8249 	 * bits, thus in this case for space account reason, we still need to
8250 	 * clear any delalloc bits within this page range since we have to
8251 	 * reserve data&meta space before lock_page() (see above comments).
8252 	 */
8253 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
8254 			  EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
8255 			  EXTENT_DEFRAG, &cached_state);
8256 
8257 	ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0,
8258 					&cached_state);
8259 	if (ret2) {
8260 		unlock_extent(io_tree, page_start, page_end, &cached_state);
8261 		ret = VM_FAULT_SIGBUS;
8262 		goto out_unlock;
8263 	}
8264 
8265 	/* page is wholly or partially inside EOF */
8266 	if (page_start + PAGE_SIZE > size)
8267 		zero_start = offset_in_page(size);
8268 	else
8269 		zero_start = PAGE_SIZE;
8270 
8271 	if (zero_start != PAGE_SIZE)
8272 		memzero_page(page, zero_start, PAGE_SIZE - zero_start);
8273 
8274 	btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
8275 	btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
8276 	btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
8277 
8278 	btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
8279 
8280 	unlock_extent(io_tree, page_start, page_end, &cached_state);
8281 	up_read(&BTRFS_I(inode)->i_mmap_lock);
8282 
8283 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8284 	sb_end_pagefault(inode->i_sb);
8285 	extent_changeset_free(data_reserved);
8286 	return VM_FAULT_LOCKED;
8287 
8288 out_unlock:
8289 	unlock_page(page);
8290 	up_read(&BTRFS_I(inode)->i_mmap_lock);
8291 out:
8292 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
8293 	btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
8294 				     reserved_space, (ret != 0));
8295 out_noreserve:
8296 	sb_end_pagefault(inode->i_sb);
8297 	extent_changeset_free(data_reserved);
8298 	return ret;
8299 }
8300 
btrfs_truncate(struct btrfs_inode * inode,bool skip_writeback)8301 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
8302 {
8303 	struct btrfs_truncate_control control = {
8304 		.inode = inode,
8305 		.ino = btrfs_ino(inode),
8306 		.min_type = BTRFS_EXTENT_DATA_KEY,
8307 		.clear_extent_range = true,
8308 	};
8309 	struct btrfs_root *root = inode->root;
8310 	struct btrfs_fs_info *fs_info = root->fs_info;
8311 	struct btrfs_block_rsv *rsv;
8312 	int ret;
8313 	struct btrfs_trans_handle *trans;
8314 	u64 mask = fs_info->sectorsize - 1;
8315 	const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
8316 
8317 	if (!skip_writeback) {
8318 		ret = btrfs_wait_ordered_range(&inode->vfs_inode,
8319 					       inode->vfs_inode.i_size & (~mask),
8320 					       (u64)-1);
8321 		if (ret)
8322 			return ret;
8323 	}
8324 
8325 	/*
8326 	 * Yes ladies and gentlemen, this is indeed ugly.  We have a couple of
8327 	 * things going on here:
8328 	 *
8329 	 * 1) We need to reserve space to update our inode.
8330 	 *
8331 	 * 2) We need to have something to cache all the space that is going to
8332 	 * be free'd up by the truncate operation, but also have some slack
8333 	 * space reserved in case it uses space during the truncate (thank you
8334 	 * very much snapshotting).
8335 	 *
8336 	 * And we need these to be separate.  The fact is we can use a lot of
8337 	 * space doing the truncate, and we have no earthly idea how much space
8338 	 * we will use, so we need the truncate reservation to be separate so it
8339 	 * doesn't end up using space reserved for updating the inode.  We also
8340 	 * need to be able to stop the transaction and start a new one, which
8341 	 * means we need to be able to update the inode several times, and we
8342 	 * have no idea of knowing how many times that will be, so we can't just
8343 	 * reserve 1 item for the entirety of the operation, so that has to be
8344 	 * done separately as well.
8345 	 *
8346 	 * So that leaves us with
8347 	 *
8348 	 * 1) rsv - for the truncate reservation, which we will steal from the
8349 	 * transaction reservation.
8350 	 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
8351 	 * updating the inode.
8352 	 */
8353 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
8354 	if (!rsv)
8355 		return -ENOMEM;
8356 	rsv->size = min_size;
8357 	rsv->failfast = true;
8358 
8359 	/*
8360 	 * 1 for the truncate slack space
8361 	 * 1 for updating the inode.
8362 	 */
8363 	trans = btrfs_start_transaction(root, 2);
8364 	if (IS_ERR(trans)) {
8365 		ret = PTR_ERR(trans);
8366 		goto out;
8367 	}
8368 
8369 	/* Migrate the slack space for the truncate to our reserve */
8370 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
8371 				      min_size, false);
8372 	/*
8373 	 * We have reserved 2 metadata units when we started the transaction and
8374 	 * min_size matches 1 unit, so this should never fail, but if it does,
8375 	 * it's not critical we just fail truncation.
8376 	 */
8377 	if (WARN_ON(ret)) {
8378 		btrfs_end_transaction(trans);
8379 		goto out;
8380 	}
8381 
8382 	trans->block_rsv = rsv;
8383 
8384 	while (1) {
8385 		struct extent_state *cached_state = NULL;
8386 		const u64 new_size = inode->vfs_inode.i_size;
8387 		const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
8388 
8389 		control.new_size = new_size;
8390 		lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8391 		/*
8392 		 * We want to drop from the next block forward in case this new
8393 		 * size is not block aligned since we will be keeping the last
8394 		 * block of the extent just the way it is.
8395 		 */
8396 		btrfs_drop_extent_map_range(inode,
8397 					    ALIGN(new_size, fs_info->sectorsize),
8398 					    (u64)-1, false);
8399 
8400 		ret = btrfs_truncate_inode_items(trans, root, &control);
8401 
8402 		inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
8403 		btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
8404 
8405 		unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
8406 
8407 		trans->block_rsv = &fs_info->trans_block_rsv;
8408 		if (ret != -ENOSPC && ret != -EAGAIN)
8409 			break;
8410 
8411 		ret = btrfs_update_inode(trans, root, inode);
8412 		if (ret)
8413 			break;
8414 
8415 		btrfs_end_transaction(trans);
8416 		btrfs_btree_balance_dirty(fs_info);
8417 
8418 		trans = btrfs_start_transaction(root, 2);
8419 		if (IS_ERR(trans)) {
8420 			ret = PTR_ERR(trans);
8421 			trans = NULL;
8422 			break;
8423 		}
8424 
8425 		btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
8426 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
8427 					      rsv, min_size, false);
8428 		/*
8429 		 * We have reserved 2 metadata units when we started the
8430 		 * transaction and min_size matches 1 unit, so this should never
8431 		 * fail, but if it does, it's not critical we just fail truncation.
8432 		 */
8433 		if (WARN_ON(ret))
8434 			break;
8435 
8436 		trans->block_rsv = rsv;
8437 	}
8438 
8439 	/*
8440 	 * We can't call btrfs_truncate_block inside a trans handle as we could
8441 	 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
8442 	 * know we've truncated everything except the last little bit, and can
8443 	 * do btrfs_truncate_block and then update the disk_i_size.
8444 	 */
8445 	if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
8446 		btrfs_end_transaction(trans);
8447 		btrfs_btree_balance_dirty(fs_info);
8448 
8449 		ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
8450 		if (ret)
8451 			goto out;
8452 		trans = btrfs_start_transaction(root, 1);
8453 		if (IS_ERR(trans)) {
8454 			ret = PTR_ERR(trans);
8455 			goto out;
8456 		}
8457 		btrfs_inode_safe_disk_i_size_write(inode, 0);
8458 	}
8459 
8460 	if (trans) {
8461 		int ret2;
8462 
8463 		trans->block_rsv = &fs_info->trans_block_rsv;
8464 		ret2 = btrfs_update_inode(trans, root, inode);
8465 		if (ret2 && !ret)
8466 			ret = ret2;
8467 
8468 		ret2 = btrfs_end_transaction(trans);
8469 		if (ret2 && !ret)
8470 			ret = ret2;
8471 		btrfs_btree_balance_dirty(fs_info);
8472 	}
8473 out:
8474 	btrfs_free_block_rsv(fs_info, rsv);
8475 	/*
8476 	 * So if we truncate and then write and fsync we normally would just
8477 	 * write the extents that changed, which is a problem if we need to
8478 	 * first truncate that entire inode.  So set this flag so we write out
8479 	 * all of the extents in the inode to the sync log so we're completely
8480 	 * safe.
8481 	 *
8482 	 * If no extents were dropped or trimmed we don't need to force the next
8483 	 * fsync to truncate all the inode's items from the log and re-log them
8484 	 * all. This means the truncate operation did not change the file size,
8485 	 * or changed it to a smaller size but there was only an implicit hole
8486 	 * between the old i_size and the new i_size, and there were no prealloc
8487 	 * extents beyond i_size to drop.
8488 	 */
8489 	if (control.extents_found > 0)
8490 		btrfs_set_inode_full_sync(inode);
8491 
8492 	return ret;
8493 }
8494 
btrfs_new_subvol_inode(struct mnt_idmap * idmap,struct inode * dir)8495 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
8496 				     struct inode *dir)
8497 {
8498 	struct inode *inode;
8499 
8500 	inode = new_inode(dir->i_sb);
8501 	if (inode) {
8502 		/*
8503 		 * Subvolumes don't inherit the sgid bit or the parent's gid if
8504 		 * the parent's sgid bit is set. This is probably a bug.
8505 		 */
8506 		inode_init_owner(idmap, inode, NULL,
8507 				 S_IFDIR | (~current_umask() & S_IRWXUGO));
8508 		inode->i_op = &btrfs_dir_inode_operations;
8509 		inode->i_fop = &btrfs_dir_file_operations;
8510 	}
8511 	return inode;
8512 }
8513 
btrfs_alloc_inode(struct super_block * sb)8514 struct inode *btrfs_alloc_inode(struct super_block *sb)
8515 {
8516 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
8517 	struct btrfs_inode *ei;
8518 	struct inode *inode;
8519 
8520 	ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
8521 	if (!ei)
8522 		return NULL;
8523 
8524 	ei->root = NULL;
8525 	ei->generation = 0;
8526 	ei->last_trans = 0;
8527 	ei->last_sub_trans = 0;
8528 	ei->logged_trans = 0;
8529 	ei->delalloc_bytes = 0;
8530 	ei->new_delalloc_bytes = 0;
8531 	ei->defrag_bytes = 0;
8532 	ei->disk_i_size = 0;
8533 	ei->flags = 0;
8534 	ei->ro_flags = 0;
8535 	ei->csum_bytes = 0;
8536 	ei->index_cnt = (u64)-1;
8537 	ei->dir_index = 0;
8538 	ei->last_unlink_trans = 0;
8539 	ei->last_reflink_trans = 0;
8540 	ei->last_log_commit = 0;
8541 
8542 	spin_lock_init(&ei->lock);
8543 	ei->outstanding_extents = 0;
8544 	if (sb->s_magic != BTRFS_TEST_MAGIC)
8545 		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
8546 					      BTRFS_BLOCK_RSV_DELALLOC);
8547 	ei->runtime_flags = 0;
8548 	ei->prop_compress = BTRFS_COMPRESS_NONE;
8549 	ei->defrag_compress = BTRFS_COMPRESS_NONE;
8550 
8551 	ei->delayed_node = NULL;
8552 
8553 	ei->i_otime.tv_sec = 0;
8554 	ei->i_otime.tv_nsec = 0;
8555 
8556 	inode = &ei->vfs_inode;
8557 	extent_map_tree_init(&ei->extent_tree);
8558 	extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
8559 	ei->io_tree.inode = ei;
8560 	extent_io_tree_init(fs_info, &ei->file_extent_tree,
8561 			    IO_TREE_INODE_FILE_EXTENT);
8562 	mutex_init(&ei->log_mutex);
8563 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
8564 	INIT_LIST_HEAD(&ei->delalloc_inodes);
8565 	INIT_LIST_HEAD(&ei->delayed_iput);
8566 	RB_CLEAR_NODE(&ei->rb_node);
8567 	init_rwsem(&ei->i_mmap_lock);
8568 
8569 	return inode;
8570 }
8571 
8572 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
btrfs_test_destroy_inode(struct inode * inode)8573 void btrfs_test_destroy_inode(struct inode *inode)
8574 {
8575 	btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
8576 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8577 }
8578 #endif
8579 
btrfs_free_inode(struct inode * inode)8580 void btrfs_free_inode(struct inode *inode)
8581 {
8582 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
8583 }
8584 
btrfs_destroy_inode(struct inode * vfs_inode)8585 void btrfs_destroy_inode(struct inode *vfs_inode)
8586 {
8587 	struct btrfs_ordered_extent *ordered;
8588 	struct btrfs_inode *inode = BTRFS_I(vfs_inode);
8589 	struct btrfs_root *root = inode->root;
8590 	bool freespace_inode;
8591 
8592 	WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
8593 	WARN_ON(vfs_inode->i_data.nrpages);
8594 	WARN_ON(inode->block_rsv.reserved);
8595 	WARN_ON(inode->block_rsv.size);
8596 	WARN_ON(inode->outstanding_extents);
8597 	if (!S_ISDIR(vfs_inode->i_mode)) {
8598 		WARN_ON(inode->delalloc_bytes);
8599 		WARN_ON(inode->new_delalloc_bytes);
8600 	}
8601 	WARN_ON(inode->csum_bytes);
8602 	WARN_ON(inode->defrag_bytes);
8603 
8604 	/*
8605 	 * This can happen where we create an inode, but somebody else also
8606 	 * created the same inode and we need to destroy the one we already
8607 	 * created.
8608 	 */
8609 	if (!root)
8610 		return;
8611 
8612 	/*
8613 	 * If this is a free space inode do not take the ordered extents lockdep
8614 	 * map.
8615 	 */
8616 	freespace_inode = btrfs_is_free_space_inode(inode);
8617 
8618 	while (1) {
8619 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
8620 		if (!ordered)
8621 			break;
8622 		else {
8623 			btrfs_err(root->fs_info,
8624 				  "found ordered extent %llu %llu on inode cleanup",
8625 				  ordered->file_offset, ordered->num_bytes);
8626 
8627 			if (!freespace_inode)
8628 				btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
8629 
8630 			btrfs_remove_ordered_extent(inode, ordered);
8631 			btrfs_put_ordered_extent(ordered);
8632 			btrfs_put_ordered_extent(ordered);
8633 		}
8634 	}
8635 	btrfs_qgroup_check_reserved_leak(inode);
8636 	inode_tree_del(inode);
8637 	btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
8638 	btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
8639 	btrfs_put_root(inode->root);
8640 }
8641 
btrfs_drop_inode(struct inode * inode)8642 int btrfs_drop_inode(struct inode *inode)
8643 {
8644 	struct btrfs_root *root = BTRFS_I(inode)->root;
8645 
8646 	if (root == NULL)
8647 		return 1;
8648 
8649 	/* the snap/subvol tree is on deleting */
8650 	if (btrfs_root_refs(&root->root_item) == 0)
8651 		return 1;
8652 	else
8653 		return generic_drop_inode(inode);
8654 }
8655 
init_once(void * foo)8656 static void init_once(void *foo)
8657 {
8658 	struct btrfs_inode *ei = foo;
8659 
8660 	inode_init_once(&ei->vfs_inode);
8661 }
8662 
btrfs_destroy_cachep(void)8663 void __cold btrfs_destroy_cachep(void)
8664 {
8665 	/*
8666 	 * Make sure all delayed rcu free inodes are flushed before we
8667 	 * destroy cache.
8668 	 */
8669 	rcu_barrier();
8670 	bioset_exit(&btrfs_dio_bioset);
8671 	kmem_cache_destroy(btrfs_inode_cachep);
8672 }
8673 
btrfs_init_cachep(void)8674 int __init btrfs_init_cachep(void)
8675 {
8676 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8677 			sizeof(struct btrfs_inode), 0,
8678 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
8679 			init_once);
8680 	if (!btrfs_inode_cachep)
8681 		goto fail;
8682 
8683 	if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE,
8684 			offsetof(struct btrfs_dio_private, bbio.bio),
8685 			BIOSET_NEED_BVECS))
8686 		goto fail;
8687 
8688 	return 0;
8689 fail:
8690 	btrfs_destroy_cachep();
8691 	return -ENOMEM;
8692 }
8693 
btrfs_getattr(struct mnt_idmap * idmap,const struct path * path,struct kstat * stat,u32 request_mask,unsigned int flags)8694 static int btrfs_getattr(struct mnt_idmap *idmap,
8695 			 const struct path *path, struct kstat *stat,
8696 			 u32 request_mask, unsigned int flags)
8697 {
8698 	u64 delalloc_bytes;
8699 	u64 inode_bytes;
8700 	struct inode *inode = d_inode(path->dentry);
8701 	u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
8702 	u32 bi_flags = BTRFS_I(inode)->flags;
8703 	u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8704 
8705 	stat->result_mask |= STATX_BTIME;
8706 	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
8707 	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
8708 	if (bi_flags & BTRFS_INODE_APPEND)
8709 		stat->attributes |= STATX_ATTR_APPEND;
8710 	if (bi_flags & BTRFS_INODE_COMPRESS)
8711 		stat->attributes |= STATX_ATTR_COMPRESSED;
8712 	if (bi_flags & BTRFS_INODE_IMMUTABLE)
8713 		stat->attributes |= STATX_ATTR_IMMUTABLE;
8714 	if (bi_flags & BTRFS_INODE_NODUMP)
8715 		stat->attributes |= STATX_ATTR_NODUMP;
8716 	if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8717 		stat->attributes |= STATX_ATTR_VERITY;
8718 
8719 	stat->attributes_mask |= (STATX_ATTR_APPEND |
8720 				  STATX_ATTR_COMPRESSED |
8721 				  STATX_ATTR_IMMUTABLE |
8722 				  STATX_ATTR_NODUMP);
8723 
8724 	generic_fillattr(idmap, request_mask, inode, stat);
8725 	stat->dev = BTRFS_I(inode)->root->anon_dev;
8726 
8727 	spin_lock(&BTRFS_I(inode)->lock);
8728 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8729 	inode_bytes = inode_get_bytes(inode);
8730 	spin_unlock(&BTRFS_I(inode)->lock);
8731 	stat->blocks = (ALIGN(inode_bytes, blocksize) +
8732 			ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8733 	return 0;
8734 }
8735 
btrfs_rename_exchange(struct inode * old_dir,struct dentry * old_dentry,struct inode * new_dir,struct dentry * new_dentry)8736 static int btrfs_rename_exchange(struct inode *old_dir,
8737 			      struct dentry *old_dentry,
8738 			      struct inode *new_dir,
8739 			      struct dentry *new_dentry)
8740 {
8741 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8742 	struct btrfs_trans_handle *trans;
8743 	unsigned int trans_num_items;
8744 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
8745 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8746 	struct inode *new_inode = new_dentry->d_inode;
8747 	struct inode *old_inode = old_dentry->d_inode;
8748 	struct btrfs_rename_ctx old_rename_ctx;
8749 	struct btrfs_rename_ctx new_rename_ctx;
8750 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8751 	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8752 	u64 old_idx = 0;
8753 	u64 new_idx = 0;
8754 	int ret;
8755 	int ret2;
8756 	bool need_abort = false;
8757 	struct fscrypt_name old_fname, new_fname;
8758 	struct fscrypt_str *old_name, *new_name;
8759 
8760 	/*
8761 	 * For non-subvolumes allow exchange only within one subvolume, in the
8762 	 * same inode namespace. Two subvolumes (represented as directory) can
8763 	 * be exchanged as they're a logical link and have a fixed inode number.
8764 	 */
8765 	if (root != dest &&
8766 	    (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8767 	     new_ino != BTRFS_FIRST_FREE_OBJECTID))
8768 		return -EXDEV;
8769 
8770 	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8771 	if (ret)
8772 		return ret;
8773 
8774 	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8775 	if (ret) {
8776 		fscrypt_free_filename(&old_fname);
8777 		return ret;
8778 	}
8779 
8780 	old_name = &old_fname.disk_name;
8781 	new_name = &new_fname.disk_name;
8782 
8783 	/* close the race window with snapshot create/destroy ioctl */
8784 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8785 	    new_ino == BTRFS_FIRST_FREE_OBJECTID)
8786 		down_read(&fs_info->subvol_sem);
8787 
8788 	/*
8789 	 * For each inode:
8790 	 * 1 to remove old dir item
8791 	 * 1 to remove old dir index
8792 	 * 1 to add new dir item
8793 	 * 1 to add new dir index
8794 	 * 1 to update parent inode
8795 	 *
8796 	 * If the parents are the same, we only need to account for one
8797 	 */
8798 	trans_num_items = (old_dir == new_dir ? 9 : 10);
8799 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8800 		/*
8801 		 * 1 to remove old root ref
8802 		 * 1 to remove old root backref
8803 		 * 1 to add new root ref
8804 		 * 1 to add new root backref
8805 		 */
8806 		trans_num_items += 4;
8807 	} else {
8808 		/*
8809 		 * 1 to update inode item
8810 		 * 1 to remove old inode ref
8811 		 * 1 to add new inode ref
8812 		 */
8813 		trans_num_items += 3;
8814 	}
8815 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8816 		trans_num_items += 4;
8817 	else
8818 		trans_num_items += 3;
8819 	trans = btrfs_start_transaction(root, trans_num_items);
8820 	if (IS_ERR(trans)) {
8821 		ret = PTR_ERR(trans);
8822 		goto out_notrans;
8823 	}
8824 
8825 	if (dest != root) {
8826 		ret = btrfs_record_root_in_trans(trans, dest);
8827 		if (ret)
8828 			goto out_fail;
8829 	}
8830 
8831 	/*
8832 	 * We need to find a free sequence number both in the source and
8833 	 * in the destination directory for the exchange.
8834 	 */
8835 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8836 	if (ret)
8837 		goto out_fail;
8838 	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8839 	if (ret)
8840 		goto out_fail;
8841 
8842 	BTRFS_I(old_inode)->dir_index = 0ULL;
8843 	BTRFS_I(new_inode)->dir_index = 0ULL;
8844 
8845 	/* Reference for the source. */
8846 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8847 		/* force full log commit if subvolume involved. */
8848 		btrfs_set_log_full_commit(trans);
8849 	} else {
8850 		ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8851 					     btrfs_ino(BTRFS_I(new_dir)),
8852 					     old_idx);
8853 		if (ret)
8854 			goto out_fail;
8855 		need_abort = true;
8856 	}
8857 
8858 	/* And now for the dest. */
8859 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8860 		/* force full log commit if subvolume involved. */
8861 		btrfs_set_log_full_commit(trans);
8862 	} else {
8863 		ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8864 					     btrfs_ino(BTRFS_I(old_dir)),
8865 					     new_idx);
8866 		if (ret) {
8867 			if (need_abort)
8868 				btrfs_abort_transaction(trans, ret);
8869 			goto out_fail;
8870 		}
8871 	}
8872 
8873 	/* Update inode version and ctime/mtime. */
8874 	inode_inc_iversion(old_dir);
8875 	inode_inc_iversion(new_dir);
8876 	inode_inc_iversion(old_inode);
8877 	inode_inc_iversion(new_inode);
8878 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8879 
8880 	if (old_dentry->d_parent != new_dentry->d_parent) {
8881 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8882 					BTRFS_I(old_inode), true);
8883 		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8884 					BTRFS_I(new_inode), true);
8885 	}
8886 
8887 	/* src is a subvolume */
8888 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8889 		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8890 	} else { /* src is an inode */
8891 		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8892 					   BTRFS_I(old_dentry->d_inode),
8893 					   old_name, &old_rename_ctx);
8894 		if (!ret)
8895 			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
8896 	}
8897 	if (ret) {
8898 		btrfs_abort_transaction(trans, ret);
8899 		goto out_fail;
8900 	}
8901 
8902 	/* dest is a subvolume */
8903 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8904 		ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8905 	} else { /* dest is an inode */
8906 		ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8907 					   BTRFS_I(new_dentry->d_inode),
8908 					   new_name, &new_rename_ctx);
8909 		if (!ret)
8910 			ret = btrfs_update_inode(trans, dest, BTRFS_I(new_inode));
8911 	}
8912 	if (ret) {
8913 		btrfs_abort_transaction(trans, ret);
8914 		goto out_fail;
8915 	}
8916 
8917 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8918 			     new_name, 0, old_idx);
8919 	if (ret) {
8920 		btrfs_abort_transaction(trans, ret);
8921 		goto out_fail;
8922 	}
8923 
8924 	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8925 			     old_name, 0, new_idx);
8926 	if (ret) {
8927 		btrfs_abort_transaction(trans, ret);
8928 		goto out_fail;
8929 	}
8930 
8931 	if (old_inode->i_nlink == 1)
8932 		BTRFS_I(old_inode)->dir_index = old_idx;
8933 	if (new_inode->i_nlink == 1)
8934 		BTRFS_I(new_inode)->dir_index = new_idx;
8935 
8936 	/*
8937 	 * Now pin the logs of the roots. We do it to ensure that no other task
8938 	 * can sync the logs while we are in progress with the rename, because
8939 	 * that could result in an inconsistency in case any of the inodes that
8940 	 * are part of this rename operation were logged before.
8941 	 */
8942 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8943 		btrfs_pin_log_trans(root);
8944 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8945 		btrfs_pin_log_trans(dest);
8946 
8947 	/* Do the log updates for all inodes. */
8948 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8949 		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8950 				   old_rename_ctx.index, new_dentry->d_parent);
8951 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8952 		btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8953 				   new_rename_ctx.index, old_dentry->d_parent);
8954 
8955 	/* Now unpin the logs. */
8956 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8957 		btrfs_end_log_trans(root);
8958 	if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8959 		btrfs_end_log_trans(dest);
8960 out_fail:
8961 	ret2 = btrfs_end_transaction(trans);
8962 	ret = ret ? ret : ret2;
8963 out_notrans:
8964 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8965 	    old_ino == BTRFS_FIRST_FREE_OBJECTID)
8966 		up_read(&fs_info->subvol_sem);
8967 
8968 	fscrypt_free_filename(&new_fname);
8969 	fscrypt_free_filename(&old_fname);
8970 	return ret;
8971 }
8972 
new_whiteout_inode(struct mnt_idmap * idmap,struct inode * dir)8973 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8974 					struct inode *dir)
8975 {
8976 	struct inode *inode;
8977 
8978 	inode = new_inode(dir->i_sb);
8979 	if (inode) {
8980 		inode_init_owner(idmap, inode, dir,
8981 				 S_IFCHR | WHITEOUT_MODE);
8982 		inode->i_op = &btrfs_special_inode_operations;
8983 		init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8984 	}
8985 	return inode;
8986 }
8987 
btrfs_rename(struct mnt_idmap * idmap,struct inode * old_dir,struct dentry * old_dentry,struct inode * new_dir,struct dentry * new_dentry,unsigned int flags)8988 static int btrfs_rename(struct mnt_idmap *idmap,
8989 			struct inode *old_dir, struct dentry *old_dentry,
8990 			struct inode *new_dir, struct dentry *new_dentry,
8991 			unsigned int flags)
8992 {
8993 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
8994 	struct btrfs_new_inode_args whiteout_args = {
8995 		.dir = old_dir,
8996 		.dentry = old_dentry,
8997 	};
8998 	struct btrfs_trans_handle *trans;
8999 	unsigned int trans_num_items;
9000 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9001 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9002 	struct inode *new_inode = d_inode(new_dentry);
9003 	struct inode *old_inode = d_inode(old_dentry);
9004 	struct btrfs_rename_ctx rename_ctx;
9005 	u64 index = 0;
9006 	int ret;
9007 	int ret2;
9008 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9009 	struct fscrypt_name old_fname, new_fname;
9010 
9011 	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9012 		return -EPERM;
9013 
9014 	/* we only allow rename subvolume link between subvolumes */
9015 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9016 		return -EXDEV;
9017 
9018 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9019 	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9020 		return -ENOTEMPTY;
9021 
9022 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9023 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9024 		return -ENOTEMPTY;
9025 
9026 	ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
9027 	if (ret)
9028 		return ret;
9029 
9030 	ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
9031 	if (ret) {
9032 		fscrypt_free_filename(&old_fname);
9033 		return ret;
9034 	}
9035 
9036 	/* check for collisions, even if the  name isn't there */
9037 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
9038 	if (ret) {
9039 		if (ret == -EEXIST) {
9040 			/* we shouldn't get
9041 			 * eexist without a new_inode */
9042 			if (WARN_ON(!new_inode)) {
9043 				goto out_fscrypt_names;
9044 			}
9045 		} else {
9046 			/* maybe -EOVERFLOW */
9047 			goto out_fscrypt_names;
9048 		}
9049 	}
9050 	ret = 0;
9051 
9052 	/*
9053 	 * we're using rename to replace one file with another.  Start IO on it
9054 	 * now so  we don't add too much work to the end of the transaction
9055 	 */
9056 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9057 		filemap_flush(old_inode->i_mapping);
9058 
9059 	if (flags & RENAME_WHITEOUT) {
9060 		whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
9061 		if (!whiteout_args.inode) {
9062 			ret = -ENOMEM;
9063 			goto out_fscrypt_names;
9064 		}
9065 		ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
9066 		if (ret)
9067 			goto out_whiteout_inode;
9068 	} else {
9069 		/* 1 to update the old parent inode. */
9070 		trans_num_items = 1;
9071 	}
9072 
9073 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9074 		/* Close the race window with snapshot create/destroy ioctl */
9075 		down_read(&fs_info->subvol_sem);
9076 		/*
9077 		 * 1 to remove old root ref
9078 		 * 1 to remove old root backref
9079 		 * 1 to add new root ref
9080 		 * 1 to add new root backref
9081 		 */
9082 		trans_num_items += 4;
9083 	} else {
9084 		/*
9085 		 * 1 to update inode
9086 		 * 1 to remove old inode ref
9087 		 * 1 to add new inode ref
9088 		 */
9089 		trans_num_items += 3;
9090 	}
9091 	/*
9092 	 * 1 to remove old dir item
9093 	 * 1 to remove old dir index
9094 	 * 1 to add new dir item
9095 	 * 1 to add new dir index
9096 	 */
9097 	trans_num_items += 4;
9098 	/* 1 to update new parent inode if it's not the same as the old parent */
9099 	if (new_dir != old_dir)
9100 		trans_num_items++;
9101 	if (new_inode) {
9102 		/*
9103 		 * 1 to update inode
9104 		 * 1 to remove inode ref
9105 		 * 1 to remove dir item
9106 		 * 1 to remove dir index
9107 		 * 1 to possibly add orphan item
9108 		 */
9109 		trans_num_items += 5;
9110 	}
9111 	trans = btrfs_start_transaction(root, trans_num_items);
9112 	if (IS_ERR(trans)) {
9113 		ret = PTR_ERR(trans);
9114 		goto out_notrans;
9115 	}
9116 
9117 	if (dest != root) {
9118 		ret = btrfs_record_root_in_trans(trans, dest);
9119 		if (ret)
9120 			goto out_fail;
9121 	}
9122 
9123 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9124 	if (ret)
9125 		goto out_fail;
9126 
9127 	BTRFS_I(old_inode)->dir_index = 0ULL;
9128 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9129 		/* force full log commit if subvolume involved. */
9130 		btrfs_set_log_full_commit(trans);
9131 	} else {
9132 		ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
9133 					     old_ino, btrfs_ino(BTRFS_I(new_dir)),
9134 					     index);
9135 		if (ret)
9136 			goto out_fail;
9137 	}
9138 
9139 	inode_inc_iversion(old_dir);
9140 	inode_inc_iversion(new_dir);
9141 	inode_inc_iversion(old_inode);
9142 	simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
9143 
9144 	if (old_dentry->d_parent != new_dentry->d_parent)
9145 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9146 					BTRFS_I(old_inode), true);
9147 
9148 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9149 		ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
9150 	} else {
9151 		ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
9152 					   BTRFS_I(d_inode(old_dentry)),
9153 					   &old_fname.disk_name, &rename_ctx);
9154 		if (!ret)
9155 			ret = btrfs_update_inode(trans, root, BTRFS_I(old_inode));
9156 	}
9157 	if (ret) {
9158 		btrfs_abort_transaction(trans, ret);
9159 		goto out_fail;
9160 	}
9161 
9162 	if (new_inode) {
9163 		inode_inc_iversion(new_inode);
9164 		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9165 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9166 			ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
9167 			BUG_ON(new_inode->i_nlink == 0);
9168 		} else {
9169 			ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
9170 						 BTRFS_I(d_inode(new_dentry)),
9171 						 &new_fname.disk_name);
9172 		}
9173 		if (!ret && new_inode->i_nlink == 0)
9174 			ret = btrfs_orphan_add(trans,
9175 					BTRFS_I(d_inode(new_dentry)));
9176 		if (ret) {
9177 			btrfs_abort_transaction(trans, ret);
9178 			goto out_fail;
9179 		}
9180 	}
9181 
9182 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9183 			     &new_fname.disk_name, 0, index);
9184 	if (ret) {
9185 		btrfs_abort_transaction(trans, ret);
9186 		goto out_fail;
9187 	}
9188 
9189 	if (old_inode->i_nlink == 1)
9190 		BTRFS_I(old_inode)->dir_index = index;
9191 
9192 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
9193 		btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
9194 				   rename_ctx.index, new_dentry->d_parent);
9195 
9196 	if (flags & RENAME_WHITEOUT) {
9197 		ret = btrfs_create_new_inode(trans, &whiteout_args);
9198 		if (ret) {
9199 			btrfs_abort_transaction(trans, ret);
9200 			goto out_fail;
9201 		} else {
9202 			unlock_new_inode(whiteout_args.inode);
9203 			iput(whiteout_args.inode);
9204 			whiteout_args.inode = NULL;
9205 		}
9206 	}
9207 out_fail:
9208 	ret2 = btrfs_end_transaction(trans);
9209 	ret = ret ? ret : ret2;
9210 out_notrans:
9211 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9212 		up_read(&fs_info->subvol_sem);
9213 	if (flags & RENAME_WHITEOUT)
9214 		btrfs_new_inode_args_destroy(&whiteout_args);
9215 out_whiteout_inode:
9216 	if (flags & RENAME_WHITEOUT)
9217 		iput(whiteout_args.inode);
9218 out_fscrypt_names:
9219 	fscrypt_free_filename(&old_fname);
9220 	fscrypt_free_filename(&new_fname);
9221 	return ret;
9222 }
9223 
btrfs_rename2(struct mnt_idmap * idmap,struct inode * old_dir,struct dentry * old_dentry,struct inode * new_dir,struct dentry * new_dentry,unsigned int flags)9224 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
9225 			 struct dentry *old_dentry, struct inode *new_dir,
9226 			 struct dentry *new_dentry, unsigned int flags)
9227 {
9228 	int ret;
9229 
9230 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
9231 		return -EINVAL;
9232 
9233 	if (flags & RENAME_EXCHANGE)
9234 		ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
9235 					    new_dentry);
9236 	else
9237 		ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
9238 				   new_dentry, flags);
9239 
9240 	btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
9241 
9242 	return ret;
9243 }
9244 
9245 struct btrfs_delalloc_work {
9246 	struct inode *inode;
9247 	struct completion completion;
9248 	struct list_head list;
9249 	struct btrfs_work work;
9250 };
9251 
btrfs_run_delalloc_work(struct btrfs_work * work)9252 static void btrfs_run_delalloc_work(struct btrfs_work *work)
9253 {
9254 	struct btrfs_delalloc_work *delalloc_work;
9255 	struct inode *inode;
9256 
9257 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
9258 				     work);
9259 	inode = delalloc_work->inode;
9260 	filemap_flush(inode->i_mapping);
9261 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
9262 				&BTRFS_I(inode)->runtime_flags))
9263 		filemap_flush(inode->i_mapping);
9264 
9265 	iput(inode);
9266 	complete(&delalloc_work->completion);
9267 }
9268 
btrfs_alloc_delalloc_work(struct inode * inode)9269 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
9270 {
9271 	struct btrfs_delalloc_work *work;
9272 
9273 	work = kmalloc(sizeof(*work), GFP_NOFS);
9274 	if (!work)
9275 		return NULL;
9276 
9277 	init_completion(&work->completion);
9278 	INIT_LIST_HEAD(&work->list);
9279 	work->inode = inode;
9280 	btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL);
9281 
9282 	return work;
9283 }
9284 
9285 /*
9286  * some fairly slow code that needs optimization. This walks the list
9287  * of all the inodes with pending delalloc and forces them to disk.
9288  */
start_delalloc_inodes(struct btrfs_root * root,struct writeback_control * wbc,bool snapshot,bool in_reclaim_context)9289 static int start_delalloc_inodes(struct btrfs_root *root,
9290 				 struct writeback_control *wbc, bool snapshot,
9291 				 bool in_reclaim_context)
9292 {
9293 	struct btrfs_inode *binode;
9294 	struct inode *inode;
9295 	struct btrfs_delalloc_work *work, *next;
9296 	LIST_HEAD(works);
9297 	LIST_HEAD(splice);
9298 	int ret = 0;
9299 	bool full_flush = wbc->nr_to_write == LONG_MAX;
9300 
9301 	mutex_lock(&root->delalloc_mutex);
9302 	spin_lock(&root->delalloc_lock);
9303 	list_splice_init(&root->delalloc_inodes, &splice);
9304 	while (!list_empty(&splice)) {
9305 		binode = list_entry(splice.next, struct btrfs_inode,
9306 				    delalloc_inodes);
9307 
9308 		list_move_tail(&binode->delalloc_inodes,
9309 			       &root->delalloc_inodes);
9310 
9311 		if (in_reclaim_context &&
9312 		    test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags))
9313 			continue;
9314 
9315 		inode = igrab(&binode->vfs_inode);
9316 		if (!inode) {
9317 			cond_resched_lock(&root->delalloc_lock);
9318 			continue;
9319 		}
9320 		spin_unlock(&root->delalloc_lock);
9321 
9322 		if (snapshot)
9323 			set_bit(BTRFS_INODE_SNAPSHOT_FLUSH,
9324 				&binode->runtime_flags);
9325 		if (full_flush) {
9326 			work = btrfs_alloc_delalloc_work(inode);
9327 			if (!work) {
9328 				iput(inode);
9329 				ret = -ENOMEM;
9330 				goto out;
9331 			}
9332 			list_add_tail(&work->list, &works);
9333 			btrfs_queue_work(root->fs_info->flush_workers,
9334 					 &work->work);
9335 		} else {
9336 			ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc);
9337 			btrfs_add_delayed_iput(BTRFS_I(inode));
9338 			if (ret || wbc->nr_to_write <= 0)
9339 				goto out;
9340 		}
9341 		cond_resched();
9342 		spin_lock(&root->delalloc_lock);
9343 	}
9344 	spin_unlock(&root->delalloc_lock);
9345 
9346 out:
9347 	list_for_each_entry_safe(work, next, &works, list) {
9348 		list_del_init(&work->list);
9349 		wait_for_completion(&work->completion);
9350 		kfree(work);
9351 	}
9352 
9353 	if (!list_empty(&splice)) {
9354 		spin_lock(&root->delalloc_lock);
9355 		list_splice_tail(&splice, &root->delalloc_inodes);
9356 		spin_unlock(&root->delalloc_lock);
9357 	}
9358 	mutex_unlock(&root->delalloc_mutex);
9359 	return ret;
9360 }
9361 
btrfs_start_delalloc_snapshot(struct btrfs_root * root,bool in_reclaim_context)9362 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
9363 {
9364 	struct writeback_control wbc = {
9365 		.nr_to_write = LONG_MAX,
9366 		.sync_mode = WB_SYNC_NONE,
9367 		.range_start = 0,
9368 		.range_end = LLONG_MAX,
9369 	};
9370 	struct btrfs_fs_info *fs_info = root->fs_info;
9371 
9372 	if (BTRFS_FS_ERROR(fs_info))
9373 		return -EROFS;
9374 
9375 	return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
9376 }
9377 
btrfs_start_delalloc_roots(struct btrfs_fs_info * fs_info,long nr,bool in_reclaim_context)9378 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
9379 			       bool in_reclaim_context)
9380 {
9381 	struct writeback_control wbc = {
9382 		.nr_to_write = nr,
9383 		.sync_mode = WB_SYNC_NONE,
9384 		.range_start = 0,
9385 		.range_end = LLONG_MAX,
9386 	};
9387 	struct btrfs_root *root;
9388 	LIST_HEAD(splice);
9389 	int ret;
9390 
9391 	if (BTRFS_FS_ERROR(fs_info))
9392 		return -EROFS;
9393 
9394 	mutex_lock(&fs_info->delalloc_root_mutex);
9395 	spin_lock(&fs_info->delalloc_root_lock);
9396 	list_splice_init(&fs_info->delalloc_roots, &splice);
9397 	while (!list_empty(&splice)) {
9398 		/*
9399 		 * Reset nr_to_write here so we know that we're doing a full
9400 		 * flush.
9401 		 */
9402 		if (nr == LONG_MAX)
9403 			wbc.nr_to_write = LONG_MAX;
9404 
9405 		root = list_first_entry(&splice, struct btrfs_root,
9406 					delalloc_root);
9407 		root = btrfs_grab_root(root);
9408 		BUG_ON(!root);
9409 		list_move_tail(&root->delalloc_root,
9410 			       &fs_info->delalloc_roots);
9411 		spin_unlock(&fs_info->delalloc_root_lock);
9412 
9413 		ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
9414 		btrfs_put_root(root);
9415 		if (ret < 0 || wbc.nr_to_write <= 0)
9416 			goto out;
9417 		spin_lock(&fs_info->delalloc_root_lock);
9418 	}
9419 	spin_unlock(&fs_info->delalloc_root_lock);
9420 
9421 	ret = 0;
9422 out:
9423 	if (!list_empty(&splice)) {
9424 		spin_lock(&fs_info->delalloc_root_lock);
9425 		list_splice_tail(&splice, &fs_info->delalloc_roots);
9426 		spin_unlock(&fs_info->delalloc_root_lock);
9427 	}
9428 	mutex_unlock(&fs_info->delalloc_root_mutex);
9429 	return ret;
9430 }
9431 
btrfs_symlink(struct mnt_idmap * idmap,struct inode * dir,struct dentry * dentry,const char * symname)9432 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
9433 			 struct dentry *dentry, const char *symname)
9434 {
9435 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9436 	struct btrfs_trans_handle *trans;
9437 	struct btrfs_root *root = BTRFS_I(dir)->root;
9438 	struct btrfs_path *path;
9439 	struct btrfs_key key;
9440 	struct inode *inode;
9441 	struct btrfs_new_inode_args new_inode_args = {
9442 		.dir = dir,
9443 		.dentry = dentry,
9444 	};
9445 	unsigned int trans_num_items;
9446 	int err;
9447 	int name_len;
9448 	int datasize;
9449 	unsigned long ptr;
9450 	struct btrfs_file_extent_item *ei;
9451 	struct extent_buffer *leaf;
9452 
9453 	name_len = strlen(symname);
9454 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
9455 		return -ENAMETOOLONG;
9456 
9457 	inode = new_inode(dir->i_sb);
9458 	if (!inode)
9459 		return -ENOMEM;
9460 	inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
9461 	inode->i_op = &btrfs_symlink_inode_operations;
9462 	inode_nohighmem(inode);
9463 	inode->i_mapping->a_ops = &btrfs_aops;
9464 	btrfs_i_size_write(BTRFS_I(inode), name_len);
9465 	inode_set_bytes(inode, name_len);
9466 
9467 	new_inode_args.inode = inode;
9468 	err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9469 	if (err)
9470 		goto out_inode;
9471 	/* 1 additional item for the inline extent */
9472 	trans_num_items++;
9473 
9474 	trans = btrfs_start_transaction(root, trans_num_items);
9475 	if (IS_ERR(trans)) {
9476 		err = PTR_ERR(trans);
9477 		goto out_new_inode_args;
9478 	}
9479 
9480 	err = btrfs_create_new_inode(trans, &new_inode_args);
9481 	if (err)
9482 		goto out;
9483 
9484 	path = btrfs_alloc_path();
9485 	if (!path) {
9486 		err = -ENOMEM;
9487 		btrfs_abort_transaction(trans, err);
9488 		discard_new_inode(inode);
9489 		inode = NULL;
9490 		goto out;
9491 	}
9492 	key.objectid = btrfs_ino(BTRFS_I(inode));
9493 	key.offset = 0;
9494 	key.type = BTRFS_EXTENT_DATA_KEY;
9495 	datasize = btrfs_file_extent_calc_inline_size(name_len);
9496 	err = btrfs_insert_empty_item(trans, root, path, &key,
9497 				      datasize);
9498 	if (err) {
9499 		btrfs_abort_transaction(trans, err);
9500 		btrfs_free_path(path);
9501 		discard_new_inode(inode);
9502 		inode = NULL;
9503 		goto out;
9504 	}
9505 	leaf = path->nodes[0];
9506 	ei = btrfs_item_ptr(leaf, path->slots[0],
9507 			    struct btrfs_file_extent_item);
9508 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
9509 	btrfs_set_file_extent_type(leaf, ei,
9510 				   BTRFS_FILE_EXTENT_INLINE);
9511 	btrfs_set_file_extent_encryption(leaf, ei, 0);
9512 	btrfs_set_file_extent_compression(leaf, ei, 0);
9513 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
9514 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
9515 
9516 	ptr = btrfs_file_extent_inline_start(ei);
9517 	write_extent_buffer(leaf, symname, ptr, name_len);
9518 	btrfs_mark_buffer_dirty(trans, leaf);
9519 	btrfs_free_path(path);
9520 
9521 	d_instantiate_new(dentry, inode);
9522 	err = 0;
9523 out:
9524 	btrfs_end_transaction(trans);
9525 	btrfs_btree_balance_dirty(fs_info);
9526 out_new_inode_args:
9527 	btrfs_new_inode_args_destroy(&new_inode_args);
9528 out_inode:
9529 	if (err)
9530 		iput(inode);
9531 	return err;
9532 }
9533 
insert_prealloc_file_extent(struct btrfs_trans_handle * trans_in,struct btrfs_inode * inode,struct btrfs_key * ins,u64 file_offset)9534 static struct btrfs_trans_handle *insert_prealloc_file_extent(
9535 				       struct btrfs_trans_handle *trans_in,
9536 				       struct btrfs_inode *inode,
9537 				       struct btrfs_key *ins,
9538 				       u64 file_offset)
9539 {
9540 	struct btrfs_file_extent_item stack_fi;
9541 	struct btrfs_replace_extent_info extent_info;
9542 	struct btrfs_trans_handle *trans = trans_in;
9543 	struct btrfs_path *path;
9544 	u64 start = ins->objectid;
9545 	u64 len = ins->offset;
9546 	u64 qgroup_released = 0;
9547 	int ret;
9548 
9549 	memset(&stack_fi, 0, sizeof(stack_fi));
9550 
9551 	btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
9552 	btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
9553 	btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
9554 	btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
9555 	btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
9556 	btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
9557 	/* Encryption and other encoding is reserved and all 0 */
9558 
9559 	ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
9560 	if (ret < 0)
9561 		return ERR_PTR(ret);
9562 
9563 	if (trans) {
9564 		ret = insert_reserved_file_extent(trans, inode,
9565 						  file_offset, &stack_fi,
9566 						  true, qgroup_released);
9567 		if (ret)
9568 			goto free_qgroup;
9569 		return trans;
9570 	}
9571 
9572 	extent_info.disk_offset = start;
9573 	extent_info.disk_len = len;
9574 	extent_info.data_offset = 0;
9575 	extent_info.data_len = len;
9576 	extent_info.file_offset = file_offset;
9577 	extent_info.extent_buf = (char *)&stack_fi;
9578 	extent_info.is_new_extent = true;
9579 	extent_info.update_times = true;
9580 	extent_info.qgroup_reserved = qgroup_released;
9581 	extent_info.insertions = 0;
9582 
9583 	path = btrfs_alloc_path();
9584 	if (!path) {
9585 		ret = -ENOMEM;
9586 		goto free_qgroup;
9587 	}
9588 
9589 	ret = btrfs_replace_file_extents(inode, path, file_offset,
9590 				     file_offset + len - 1, &extent_info,
9591 				     &trans);
9592 	btrfs_free_path(path);
9593 	if (ret)
9594 		goto free_qgroup;
9595 	return trans;
9596 
9597 free_qgroup:
9598 	/*
9599 	 * We have released qgroup data range at the beginning of the function,
9600 	 * and normally qgroup_released bytes will be freed when committing
9601 	 * transaction.
9602 	 * But if we error out early, we have to free what we have released
9603 	 * or we leak qgroup data reservation.
9604 	 */
9605 	btrfs_qgroup_free_refroot(inode->root->fs_info,
9606 			inode->root->root_key.objectid, qgroup_released,
9607 			BTRFS_QGROUP_RSV_DATA);
9608 	return ERR_PTR(ret);
9609 }
9610 
__btrfs_prealloc_file_range(struct inode * inode,int mode,u64 start,u64 num_bytes,u64 min_size,loff_t actual_len,u64 * alloc_hint,struct btrfs_trans_handle * trans)9611 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9612 				       u64 start, u64 num_bytes, u64 min_size,
9613 				       loff_t actual_len, u64 *alloc_hint,
9614 				       struct btrfs_trans_handle *trans)
9615 {
9616 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9617 	struct extent_map *em;
9618 	struct btrfs_root *root = BTRFS_I(inode)->root;
9619 	struct btrfs_key ins;
9620 	u64 cur_offset = start;
9621 	u64 clear_offset = start;
9622 	u64 i_size;
9623 	u64 cur_bytes;
9624 	u64 last_alloc = (u64)-1;
9625 	int ret = 0;
9626 	bool own_trans = true;
9627 	u64 end = start + num_bytes - 1;
9628 
9629 	if (trans)
9630 		own_trans = false;
9631 	while (num_bytes > 0) {
9632 		cur_bytes = min_t(u64, num_bytes, SZ_256M);
9633 		cur_bytes = max(cur_bytes, min_size);
9634 		/*
9635 		 * If we are severely fragmented we could end up with really
9636 		 * small allocations, so if the allocator is returning small
9637 		 * chunks lets make its job easier by only searching for those
9638 		 * sized chunks.
9639 		 */
9640 		cur_bytes = min(cur_bytes, last_alloc);
9641 		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9642 				min_size, 0, *alloc_hint, &ins, 1, 0);
9643 		if (ret)
9644 			break;
9645 
9646 		/*
9647 		 * We've reserved this space, and thus converted it from
9648 		 * ->bytes_may_use to ->bytes_reserved.  Any error that happens
9649 		 * from here on out we will only need to clear our reservation
9650 		 * for the remaining unreserved area, so advance our
9651 		 * clear_offset by our extent size.
9652 		 */
9653 		clear_offset += ins.offset;
9654 
9655 		last_alloc = ins.offset;
9656 		trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9657 						    &ins, cur_offset);
9658 		/*
9659 		 * Now that we inserted the prealloc extent we can finally
9660 		 * decrement the number of reservations in the block group.
9661 		 * If we did it before, we could race with relocation and have
9662 		 * relocation miss the reserved extent, making it fail later.
9663 		 */
9664 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9665 		if (IS_ERR(trans)) {
9666 			ret = PTR_ERR(trans);
9667 			btrfs_free_reserved_extent(fs_info, ins.objectid,
9668 						   ins.offset, 0);
9669 			break;
9670 		}
9671 
9672 		em = alloc_extent_map();
9673 		if (!em) {
9674 			btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9675 					    cur_offset + ins.offset - 1, false);
9676 			btrfs_set_inode_full_sync(BTRFS_I(inode));
9677 			goto next;
9678 		}
9679 
9680 		em->start = cur_offset;
9681 		em->orig_start = cur_offset;
9682 		em->len = ins.offset;
9683 		em->block_start = ins.objectid;
9684 		em->block_len = ins.offset;
9685 		em->orig_block_len = ins.offset;
9686 		em->ram_bytes = ins.offset;
9687 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
9688 		em->generation = trans->transid;
9689 
9690 		ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9691 		free_extent_map(em);
9692 next:
9693 		num_bytes -= ins.offset;
9694 		cur_offset += ins.offset;
9695 		*alloc_hint = ins.objectid + ins.offset;
9696 
9697 		inode_inc_iversion(inode);
9698 		inode_set_ctime_current(inode);
9699 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9700 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9701 		    (actual_len > inode->i_size) &&
9702 		    (cur_offset > inode->i_size)) {
9703 			if (cur_offset > actual_len)
9704 				i_size = actual_len;
9705 			else
9706 				i_size = cur_offset;
9707 			i_size_write(inode, i_size);
9708 			btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9709 		}
9710 
9711 		ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
9712 
9713 		if (ret) {
9714 			btrfs_abort_transaction(trans, ret);
9715 			if (own_trans)
9716 				btrfs_end_transaction(trans);
9717 			break;
9718 		}
9719 
9720 		if (own_trans) {
9721 			btrfs_end_transaction(trans);
9722 			trans = NULL;
9723 		}
9724 	}
9725 	if (clear_offset < end)
9726 		btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9727 			end - clear_offset + 1);
9728 	return ret;
9729 }
9730 
btrfs_prealloc_file_range(struct inode * inode,int mode,u64 start,u64 num_bytes,u64 min_size,loff_t actual_len,u64 * alloc_hint)9731 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9732 			      u64 start, u64 num_bytes, u64 min_size,
9733 			      loff_t actual_len, u64 *alloc_hint)
9734 {
9735 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9736 					   min_size, actual_len, alloc_hint,
9737 					   NULL);
9738 }
9739 
btrfs_prealloc_file_range_trans(struct inode * inode,struct btrfs_trans_handle * trans,int mode,u64 start,u64 num_bytes,u64 min_size,loff_t actual_len,u64 * alloc_hint)9740 int btrfs_prealloc_file_range_trans(struct inode *inode,
9741 				    struct btrfs_trans_handle *trans, int mode,
9742 				    u64 start, u64 num_bytes, u64 min_size,
9743 				    loff_t actual_len, u64 *alloc_hint)
9744 {
9745 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9746 					   min_size, actual_len, alloc_hint, trans);
9747 }
9748 
btrfs_permission(struct mnt_idmap * idmap,struct inode * inode,int mask)9749 static int btrfs_permission(struct mnt_idmap *idmap,
9750 			    struct inode *inode, int mask)
9751 {
9752 	struct btrfs_root *root = BTRFS_I(inode)->root;
9753 	umode_t mode = inode->i_mode;
9754 
9755 	if (mask & MAY_WRITE &&
9756 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9757 		if (btrfs_root_readonly(root))
9758 			return -EROFS;
9759 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9760 			return -EACCES;
9761 	}
9762 	return generic_permission(idmap, inode, mask);
9763 }
9764 
btrfs_tmpfile(struct mnt_idmap * idmap,struct inode * dir,struct file * file,umode_t mode)9765 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9766 			 struct file *file, umode_t mode)
9767 {
9768 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
9769 	struct btrfs_trans_handle *trans;
9770 	struct btrfs_root *root = BTRFS_I(dir)->root;
9771 	struct inode *inode;
9772 	struct btrfs_new_inode_args new_inode_args = {
9773 		.dir = dir,
9774 		.dentry = file->f_path.dentry,
9775 		.orphan = true,
9776 	};
9777 	unsigned int trans_num_items;
9778 	int ret;
9779 
9780 	inode = new_inode(dir->i_sb);
9781 	if (!inode)
9782 		return -ENOMEM;
9783 	inode_init_owner(idmap, inode, dir, mode);
9784 	inode->i_fop = &btrfs_file_operations;
9785 	inode->i_op = &btrfs_file_inode_operations;
9786 	inode->i_mapping->a_ops = &btrfs_aops;
9787 
9788 	new_inode_args.inode = inode;
9789 	ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9790 	if (ret)
9791 		goto out_inode;
9792 
9793 	trans = btrfs_start_transaction(root, trans_num_items);
9794 	if (IS_ERR(trans)) {
9795 		ret = PTR_ERR(trans);
9796 		goto out_new_inode_args;
9797 	}
9798 
9799 	ret = btrfs_create_new_inode(trans, &new_inode_args);
9800 
9801 	/*
9802 	 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9803 	 * set it to 1 because d_tmpfile() will issue a warning if the count is
9804 	 * 0, through:
9805 	 *
9806 	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9807 	 */
9808 	set_nlink(inode, 1);
9809 
9810 	if (!ret) {
9811 		d_tmpfile(file, inode);
9812 		unlock_new_inode(inode);
9813 		mark_inode_dirty(inode);
9814 	}
9815 
9816 	btrfs_end_transaction(trans);
9817 	btrfs_btree_balance_dirty(fs_info);
9818 out_new_inode_args:
9819 	btrfs_new_inode_args_destroy(&new_inode_args);
9820 out_inode:
9821 	if (ret)
9822 		iput(inode);
9823 	return finish_open_simple(file, ret);
9824 }
9825 
btrfs_set_range_writeback(struct btrfs_inode * inode,u64 start,u64 end)9826 void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
9827 {
9828 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9829 	unsigned long index = start >> PAGE_SHIFT;
9830 	unsigned long end_index = end >> PAGE_SHIFT;
9831 	struct page *page;
9832 	u32 len;
9833 
9834 	ASSERT(end + 1 - start <= U32_MAX);
9835 	len = end + 1 - start;
9836 	while (index <= end_index) {
9837 		page = find_get_page(inode->vfs_inode.i_mapping, index);
9838 		ASSERT(page); /* Pages should be in the extent_io_tree */
9839 
9840 		btrfs_page_set_writeback(fs_info, page, start, len);
9841 		put_page(page);
9842 		index++;
9843 	}
9844 }
9845 
btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info * fs_info,int compress_type)9846 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9847 					     int compress_type)
9848 {
9849 	switch (compress_type) {
9850 	case BTRFS_COMPRESS_NONE:
9851 		return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9852 	case BTRFS_COMPRESS_ZLIB:
9853 		return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9854 	case BTRFS_COMPRESS_LZO:
9855 		/*
9856 		 * The LZO format depends on the sector size. 64K is the maximum
9857 		 * sector size that we support.
9858 		 */
9859 		if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9860 			return -EINVAL;
9861 		return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9862 		       (fs_info->sectorsize_bits - 12);
9863 	case BTRFS_COMPRESS_ZSTD:
9864 		return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9865 	default:
9866 		return -EUCLEAN;
9867 	}
9868 }
9869 
btrfs_encoded_read_inline(struct kiocb * iocb,struct iov_iter * iter,u64 start,u64 lockend,struct extent_state ** cached_state,u64 extent_start,size_t count,struct btrfs_ioctl_encoded_io_args * encoded,bool * unlocked)9870 static ssize_t btrfs_encoded_read_inline(
9871 				struct kiocb *iocb,
9872 				struct iov_iter *iter, u64 start,
9873 				u64 lockend,
9874 				struct extent_state **cached_state,
9875 				u64 extent_start, size_t count,
9876 				struct btrfs_ioctl_encoded_io_args *encoded,
9877 				bool *unlocked)
9878 {
9879 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9880 	struct btrfs_root *root = inode->root;
9881 	struct btrfs_fs_info *fs_info = root->fs_info;
9882 	struct extent_io_tree *io_tree = &inode->io_tree;
9883 	struct btrfs_path *path;
9884 	struct extent_buffer *leaf;
9885 	struct btrfs_file_extent_item *item;
9886 	u64 ram_bytes;
9887 	unsigned long ptr;
9888 	void *tmp;
9889 	ssize_t ret;
9890 
9891 	path = btrfs_alloc_path();
9892 	if (!path) {
9893 		ret = -ENOMEM;
9894 		goto out;
9895 	}
9896 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9897 				       extent_start, 0);
9898 	if (ret) {
9899 		if (ret > 0) {
9900 			/* The extent item disappeared? */
9901 			ret = -EIO;
9902 		}
9903 		goto out;
9904 	}
9905 	leaf = path->nodes[0];
9906 	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9907 
9908 	ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9909 	ptr = btrfs_file_extent_inline_start(item);
9910 
9911 	encoded->len = min_t(u64, extent_start + ram_bytes,
9912 			     inode->vfs_inode.i_size) - iocb->ki_pos;
9913 	ret = btrfs_encoded_io_compression_from_extent(fs_info,
9914 				 btrfs_file_extent_compression(leaf, item));
9915 	if (ret < 0)
9916 		goto out;
9917 	encoded->compression = ret;
9918 	if (encoded->compression) {
9919 		size_t inline_size;
9920 
9921 		inline_size = btrfs_file_extent_inline_item_len(leaf,
9922 								path->slots[0]);
9923 		if (inline_size > count) {
9924 			ret = -ENOBUFS;
9925 			goto out;
9926 		}
9927 		count = inline_size;
9928 		encoded->unencoded_len = ram_bytes;
9929 		encoded->unencoded_offset = iocb->ki_pos - extent_start;
9930 	} else {
9931 		count = min_t(u64, count, encoded->len);
9932 		encoded->len = count;
9933 		encoded->unencoded_len = count;
9934 		ptr += iocb->ki_pos - extent_start;
9935 	}
9936 
9937 	tmp = kmalloc(count, GFP_NOFS);
9938 	if (!tmp) {
9939 		ret = -ENOMEM;
9940 		goto out;
9941 	}
9942 	read_extent_buffer(leaf, tmp, ptr, count);
9943 	btrfs_release_path(path);
9944 	unlock_extent(io_tree, start, lockend, cached_state);
9945 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9946 	*unlocked = true;
9947 
9948 	ret = copy_to_iter(tmp, count, iter);
9949 	if (ret != count)
9950 		ret = -EFAULT;
9951 	kfree(tmp);
9952 out:
9953 	btrfs_free_path(path);
9954 	return ret;
9955 }
9956 
9957 struct btrfs_encoded_read_private {
9958 	wait_queue_head_t wait;
9959 	atomic_t pending;
9960 	blk_status_t status;
9961 };
9962 
btrfs_encoded_read_endio(struct btrfs_bio * bbio)9963 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9964 {
9965 	struct btrfs_encoded_read_private *priv = bbio->private;
9966 
9967 	if (bbio->bio.bi_status) {
9968 		/*
9969 		 * The memory barrier implied by the atomic_dec_return() here
9970 		 * pairs with the memory barrier implied by the
9971 		 * atomic_dec_return() or io_wait_event() in
9972 		 * btrfs_encoded_read_regular_fill_pages() to ensure that this
9973 		 * write is observed before the load of status in
9974 		 * btrfs_encoded_read_regular_fill_pages().
9975 		 */
9976 		WRITE_ONCE(priv->status, bbio->bio.bi_status);
9977 	}
9978 	if (atomic_dec_and_test(&priv->pending))
9979 		wake_up(&priv->wait);
9980 	bio_put(&bbio->bio);
9981 }
9982 
btrfs_encoded_read_regular_fill_pages(struct btrfs_inode * inode,u64 file_offset,u64 disk_bytenr,u64 disk_io_size,struct page ** pages)9983 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9984 					  u64 file_offset, u64 disk_bytenr,
9985 					  u64 disk_io_size, struct page **pages)
9986 {
9987 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
9988 	struct btrfs_encoded_read_private priv = {
9989 		.pending = ATOMIC_INIT(1),
9990 	};
9991 	unsigned long i = 0;
9992 	struct btrfs_bio *bbio;
9993 
9994 	init_waitqueue_head(&priv.wait);
9995 
9996 	bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9997 			       btrfs_encoded_read_endio, &priv);
9998 	bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9999 	bbio->inode = inode;
10000 
10001 	do {
10002 		size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
10003 
10004 		if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
10005 			atomic_inc(&priv.pending);
10006 			btrfs_submit_bio(bbio, 0);
10007 
10008 			bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
10009 					       btrfs_encoded_read_endio, &priv);
10010 			bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
10011 			bbio->inode = inode;
10012 			continue;
10013 		}
10014 
10015 		i++;
10016 		disk_bytenr += bytes;
10017 		disk_io_size -= bytes;
10018 	} while (disk_io_size);
10019 
10020 	atomic_inc(&priv.pending);
10021 	btrfs_submit_bio(bbio, 0);
10022 
10023 	if (atomic_dec_return(&priv.pending))
10024 		io_wait_event(priv.wait, !atomic_read(&priv.pending));
10025 	/* See btrfs_encoded_read_endio() for ordering. */
10026 	return blk_status_to_errno(READ_ONCE(priv.status));
10027 }
10028 
btrfs_encoded_read_regular(struct kiocb * iocb,struct iov_iter * iter,u64 start,u64 lockend,struct extent_state ** cached_state,u64 disk_bytenr,u64 disk_io_size,size_t count,bool compressed,bool * unlocked)10029 static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb,
10030 					  struct iov_iter *iter,
10031 					  u64 start, u64 lockend,
10032 					  struct extent_state **cached_state,
10033 					  u64 disk_bytenr, u64 disk_io_size,
10034 					  size_t count, bool compressed,
10035 					  bool *unlocked)
10036 {
10037 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10038 	struct extent_io_tree *io_tree = &inode->io_tree;
10039 	struct page **pages;
10040 	unsigned long nr_pages, i;
10041 	u64 cur;
10042 	size_t page_offset;
10043 	ssize_t ret;
10044 
10045 	nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
10046 	pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
10047 	if (!pages)
10048 		return -ENOMEM;
10049 	ret = btrfs_alloc_page_array(nr_pages, pages);
10050 	if (ret) {
10051 		ret = -ENOMEM;
10052 		goto out;
10053 		}
10054 
10055 	ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr,
10056 						    disk_io_size, pages);
10057 	if (ret)
10058 		goto out;
10059 
10060 	unlock_extent(io_tree, start, lockend, cached_state);
10061 	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10062 	*unlocked = true;
10063 
10064 	if (compressed) {
10065 		i = 0;
10066 		page_offset = 0;
10067 	} else {
10068 		i = (iocb->ki_pos - start) >> PAGE_SHIFT;
10069 		page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
10070 	}
10071 	cur = 0;
10072 	while (cur < count) {
10073 		size_t bytes = min_t(size_t, count - cur,
10074 				     PAGE_SIZE - page_offset);
10075 
10076 		if (copy_page_to_iter(pages[i], page_offset, bytes,
10077 				      iter) != bytes) {
10078 			ret = -EFAULT;
10079 			goto out;
10080 		}
10081 		i++;
10082 		cur += bytes;
10083 		page_offset = 0;
10084 	}
10085 	ret = count;
10086 out:
10087 	for (i = 0; i < nr_pages; i++) {
10088 		if (pages[i])
10089 			__free_page(pages[i]);
10090 	}
10091 	kfree(pages);
10092 	return ret;
10093 }
10094 
btrfs_encoded_read(struct kiocb * iocb,struct iov_iter * iter,struct btrfs_ioctl_encoded_io_args * encoded)10095 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
10096 			   struct btrfs_ioctl_encoded_io_args *encoded)
10097 {
10098 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10099 	struct btrfs_fs_info *fs_info = inode->root->fs_info;
10100 	struct extent_io_tree *io_tree = &inode->io_tree;
10101 	ssize_t ret;
10102 	size_t count = iov_iter_count(iter);
10103 	u64 start, lockend, disk_bytenr, disk_io_size;
10104 	struct extent_state *cached_state = NULL;
10105 	struct extent_map *em;
10106 	bool unlocked = false;
10107 
10108 	file_accessed(iocb->ki_filp);
10109 
10110 	btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
10111 
10112 	if (iocb->ki_pos >= inode->vfs_inode.i_size) {
10113 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10114 		return 0;
10115 	}
10116 	start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
10117 	/*
10118 	 * We don't know how long the extent containing iocb->ki_pos is, but if
10119 	 * it's compressed we know that it won't be longer than this.
10120 	 */
10121 	lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
10122 
10123 	for (;;) {
10124 		struct btrfs_ordered_extent *ordered;
10125 
10126 		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start,
10127 					       lockend - start + 1);
10128 		if (ret)
10129 			goto out_unlock_inode;
10130 		lock_extent(io_tree, start, lockend, &cached_state);
10131 		ordered = btrfs_lookup_ordered_range(inode, start,
10132 						     lockend - start + 1);
10133 		if (!ordered)
10134 			break;
10135 		btrfs_put_ordered_extent(ordered);
10136 		unlock_extent(io_tree, start, lockend, &cached_state);
10137 		cond_resched();
10138 	}
10139 
10140 	em = btrfs_get_extent(inode, NULL, 0, start, lockend - start + 1);
10141 	if (IS_ERR(em)) {
10142 		ret = PTR_ERR(em);
10143 		goto out_unlock_extent;
10144 	}
10145 
10146 	if (em->block_start == EXTENT_MAP_INLINE) {
10147 		u64 extent_start = em->start;
10148 
10149 		/*
10150 		 * For inline extents we get everything we need out of the
10151 		 * extent item.
10152 		 */
10153 		free_extent_map(em);
10154 		em = NULL;
10155 		ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
10156 						&cached_state, extent_start,
10157 						count, encoded, &unlocked);
10158 		goto out;
10159 	}
10160 
10161 	/*
10162 	 * We only want to return up to EOF even if the extent extends beyond
10163 	 * that.
10164 	 */
10165 	encoded->len = min_t(u64, extent_map_end(em),
10166 			     inode->vfs_inode.i_size) - iocb->ki_pos;
10167 	if (em->block_start == EXTENT_MAP_HOLE ||
10168 	    test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
10169 		disk_bytenr = EXTENT_MAP_HOLE;
10170 		count = min_t(u64, count, encoded->len);
10171 		encoded->len = count;
10172 		encoded->unencoded_len = count;
10173 	} else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10174 		disk_bytenr = em->block_start;
10175 		/*
10176 		 * Bail if the buffer isn't large enough to return the whole
10177 		 * compressed extent.
10178 		 */
10179 		if (em->block_len > count) {
10180 			ret = -ENOBUFS;
10181 			goto out_em;
10182 		}
10183 		disk_io_size = em->block_len;
10184 		count = em->block_len;
10185 		encoded->unencoded_len = em->ram_bytes;
10186 		encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
10187 		ret = btrfs_encoded_io_compression_from_extent(fs_info,
10188 							     em->compress_type);
10189 		if (ret < 0)
10190 			goto out_em;
10191 		encoded->compression = ret;
10192 	} else {
10193 		disk_bytenr = em->block_start + (start - em->start);
10194 		if (encoded->len > count)
10195 			encoded->len = count;
10196 		/*
10197 		 * Don't read beyond what we locked. This also limits the page
10198 		 * allocations that we'll do.
10199 		 */
10200 		disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
10201 		count = start + disk_io_size - iocb->ki_pos;
10202 		encoded->len = count;
10203 		encoded->unencoded_len = count;
10204 		disk_io_size = ALIGN(disk_io_size, fs_info->sectorsize);
10205 	}
10206 	free_extent_map(em);
10207 	em = NULL;
10208 
10209 	if (disk_bytenr == EXTENT_MAP_HOLE) {
10210 		unlock_extent(io_tree, start, lockend, &cached_state);
10211 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10212 		unlocked = true;
10213 		ret = iov_iter_zero(count, iter);
10214 		if (ret != count)
10215 			ret = -EFAULT;
10216 	} else {
10217 		ret = btrfs_encoded_read_regular(iocb, iter, start, lockend,
10218 						 &cached_state, disk_bytenr,
10219 						 disk_io_size, count,
10220 						 encoded->compression,
10221 						 &unlocked);
10222 	}
10223 
10224 out:
10225 	if (ret >= 0)
10226 		iocb->ki_pos += encoded->len;
10227 out_em:
10228 	free_extent_map(em);
10229 out_unlock_extent:
10230 	if (!unlocked)
10231 		unlock_extent(io_tree, start, lockend, &cached_state);
10232 out_unlock_inode:
10233 	if (!unlocked)
10234 		btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
10235 	return ret;
10236 }
10237 
btrfs_do_encoded_write(struct kiocb * iocb,struct iov_iter * from,const struct btrfs_ioctl_encoded_io_args * encoded)10238 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
10239 			       const struct btrfs_ioctl_encoded_io_args *encoded)
10240 {
10241 	struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
10242 	struct btrfs_root *root = inode->root;
10243 	struct btrfs_fs_info *fs_info = root->fs_info;
10244 	struct extent_io_tree *io_tree = &inode->io_tree;
10245 	struct extent_changeset *data_reserved = NULL;
10246 	struct extent_state *cached_state = NULL;
10247 	struct btrfs_ordered_extent *ordered;
10248 	int compression;
10249 	size_t orig_count;
10250 	u64 start, end;
10251 	u64 num_bytes, ram_bytes, disk_num_bytes;
10252 	unsigned long nr_pages, i;
10253 	struct page **pages;
10254 	struct btrfs_key ins;
10255 	bool extent_reserved = false;
10256 	struct extent_map *em;
10257 	ssize_t ret;
10258 
10259 	switch (encoded->compression) {
10260 	case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
10261 		compression = BTRFS_COMPRESS_ZLIB;
10262 		break;
10263 	case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
10264 		compression = BTRFS_COMPRESS_ZSTD;
10265 		break;
10266 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
10267 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
10268 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
10269 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
10270 	case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
10271 		/* The sector size must match for LZO. */
10272 		if (encoded->compression -
10273 		    BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
10274 		    fs_info->sectorsize_bits)
10275 			return -EINVAL;
10276 		compression = BTRFS_COMPRESS_LZO;
10277 		break;
10278 	default:
10279 		return -EINVAL;
10280 	}
10281 	if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
10282 		return -EINVAL;
10283 
10284 	/*
10285 	 * Compressed extents should always have checksums, so error out if we
10286 	 * have a NOCOW file or inode was created while mounted with NODATASUM.
10287 	 */
10288 	if (inode->flags & BTRFS_INODE_NODATASUM)
10289 		return -EINVAL;
10290 
10291 	orig_count = iov_iter_count(from);
10292 
10293 	/* The extent size must be sane. */
10294 	if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
10295 	    orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
10296 		return -EINVAL;
10297 
10298 	/*
10299 	 * The compressed data must be smaller than the decompressed data.
10300 	 *
10301 	 * It's of course possible for data to compress to larger or the same
10302 	 * size, but the buffered I/O path falls back to no compression for such
10303 	 * data, and we don't want to break any assumptions by creating these
10304 	 * extents.
10305 	 *
10306 	 * Note that this is less strict than the current check we have that the
10307 	 * compressed data must be at least one sector smaller than the
10308 	 * decompressed data. We only want to enforce the weaker requirement
10309 	 * from old kernels that it is at least one byte smaller.
10310 	 */
10311 	if (orig_count >= encoded->unencoded_len)
10312 		return -EINVAL;
10313 
10314 	/* The extent must start on a sector boundary. */
10315 	start = iocb->ki_pos;
10316 	if (!IS_ALIGNED(start, fs_info->sectorsize))
10317 		return -EINVAL;
10318 
10319 	/*
10320 	 * The extent must end on a sector boundary. However, we allow a write
10321 	 * which ends at or extends i_size to have an unaligned length; we round
10322 	 * up the extent size and set i_size to the unaligned end.
10323 	 */
10324 	if (start + encoded->len < inode->vfs_inode.i_size &&
10325 	    !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
10326 		return -EINVAL;
10327 
10328 	/* Finally, the offset in the unencoded data must be sector-aligned. */
10329 	if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
10330 		return -EINVAL;
10331 
10332 	num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
10333 	ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
10334 	end = start + num_bytes - 1;
10335 
10336 	/*
10337 	 * If the extent cannot be inline, the compressed data on disk must be
10338 	 * sector-aligned. For convenience, we extend it with zeroes if it
10339 	 * isn't.
10340 	 */
10341 	disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
10342 	nr_pages = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
10343 	pages = kvcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL_ACCOUNT);
10344 	if (!pages)
10345 		return -ENOMEM;
10346 	for (i = 0; i < nr_pages; i++) {
10347 		size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
10348 		char *kaddr;
10349 
10350 		pages[i] = alloc_page(GFP_KERNEL_ACCOUNT);
10351 		if (!pages[i]) {
10352 			ret = -ENOMEM;
10353 			goto out_pages;
10354 		}
10355 		kaddr = kmap_local_page(pages[i]);
10356 		if (copy_from_iter(kaddr, bytes, from) != bytes) {
10357 			kunmap_local(kaddr);
10358 			ret = -EFAULT;
10359 			goto out_pages;
10360 		}
10361 		if (bytes < PAGE_SIZE)
10362 			memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
10363 		kunmap_local(kaddr);
10364 	}
10365 
10366 	for (;;) {
10367 		struct btrfs_ordered_extent *ordered;
10368 
10369 		ret = btrfs_wait_ordered_range(&inode->vfs_inode, start, num_bytes);
10370 		if (ret)
10371 			goto out_pages;
10372 		ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
10373 						    start >> PAGE_SHIFT,
10374 						    end >> PAGE_SHIFT);
10375 		if (ret)
10376 			goto out_pages;
10377 		lock_extent(io_tree, start, end, &cached_state);
10378 		ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
10379 		if (!ordered &&
10380 		    !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
10381 			break;
10382 		if (ordered)
10383 			btrfs_put_ordered_extent(ordered);
10384 		unlock_extent(io_tree, start, end, &cached_state);
10385 		cond_resched();
10386 	}
10387 
10388 	/*
10389 	 * We don't use the higher-level delalloc space functions because our
10390 	 * num_bytes and disk_num_bytes are different.
10391 	 */
10392 	ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
10393 	if (ret)
10394 		goto out_unlock;
10395 	ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
10396 	if (ret)
10397 		goto out_free_data_space;
10398 	ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
10399 					      false);
10400 	if (ret)
10401 		goto out_qgroup_free_data;
10402 
10403 	/* Try an inline extent first. */
10404 	if (start == 0 && encoded->unencoded_len == encoded->len &&
10405 	    encoded->unencoded_offset == 0) {
10406 		ret = cow_file_range_inline(inode, encoded->len, orig_count,
10407 					    compression, pages, true);
10408 		if (ret <= 0) {
10409 			if (ret == 0)
10410 				ret = orig_count;
10411 			goto out_delalloc_release;
10412 		}
10413 	}
10414 
10415 	ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
10416 				   disk_num_bytes, 0, 0, &ins, 1, 1);
10417 	if (ret)
10418 		goto out_delalloc_release;
10419 	extent_reserved = true;
10420 
10421 	em = create_io_em(inode, start, num_bytes,
10422 			  start - encoded->unencoded_offset, ins.objectid,
10423 			  ins.offset, ins.offset, ram_bytes, compression,
10424 			  BTRFS_ORDERED_COMPRESSED);
10425 	if (IS_ERR(em)) {
10426 		ret = PTR_ERR(em);
10427 		goto out_free_reserved;
10428 	}
10429 	free_extent_map(em);
10430 
10431 	ordered = btrfs_alloc_ordered_extent(inode, start, num_bytes, ram_bytes,
10432 				       ins.objectid, ins.offset,
10433 				       encoded->unencoded_offset,
10434 				       (1 << BTRFS_ORDERED_ENCODED) |
10435 				       (1 << BTRFS_ORDERED_COMPRESSED),
10436 				       compression);
10437 	if (IS_ERR(ordered)) {
10438 		btrfs_drop_extent_map_range(inode, start, end, false);
10439 		ret = PTR_ERR(ordered);
10440 		goto out_free_reserved;
10441 	}
10442 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10443 
10444 	if (start + encoded->len > inode->vfs_inode.i_size)
10445 		i_size_write(&inode->vfs_inode, start + encoded->len);
10446 
10447 	unlock_extent(io_tree, start, end, &cached_state);
10448 
10449 	btrfs_delalloc_release_extents(inode, num_bytes);
10450 
10451 	btrfs_submit_compressed_write(ordered, pages, nr_pages, 0, false);
10452 	ret = orig_count;
10453 	goto out;
10454 
10455 out_free_reserved:
10456 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10457 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
10458 out_delalloc_release:
10459 	btrfs_delalloc_release_extents(inode, num_bytes);
10460 	btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
10461 out_qgroup_free_data:
10462 	if (ret < 0)
10463 		btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
10464 out_free_data_space:
10465 	/*
10466 	 * If btrfs_reserve_extent() succeeded, then we already decremented
10467 	 * bytes_may_use.
10468 	 */
10469 	if (!extent_reserved)
10470 		btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
10471 out_unlock:
10472 	unlock_extent(io_tree, start, end, &cached_state);
10473 out_pages:
10474 	for (i = 0; i < nr_pages; i++) {
10475 		if (pages[i])
10476 			__free_page(pages[i]);
10477 	}
10478 	kvfree(pages);
10479 out:
10480 	if (ret >= 0)
10481 		iocb->ki_pos += encoded->len;
10482 	return ret;
10483 }
10484 
10485 #ifdef CONFIG_SWAP
10486 /*
10487  * Add an entry indicating a block group or device which is pinned by a
10488  * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
10489  * negative errno on failure.
10490  */
btrfs_add_swapfile_pin(struct inode * inode,void * ptr,bool is_block_group)10491 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
10492 				  bool is_block_group)
10493 {
10494 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10495 	struct btrfs_swapfile_pin *sp, *entry;
10496 	struct rb_node **p;
10497 	struct rb_node *parent = NULL;
10498 
10499 	sp = kmalloc(sizeof(*sp), GFP_NOFS);
10500 	if (!sp)
10501 		return -ENOMEM;
10502 	sp->ptr = ptr;
10503 	sp->inode = inode;
10504 	sp->is_block_group = is_block_group;
10505 	sp->bg_extent_count = 1;
10506 
10507 	spin_lock(&fs_info->swapfile_pins_lock);
10508 	p = &fs_info->swapfile_pins.rb_node;
10509 	while (*p) {
10510 		parent = *p;
10511 		entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
10512 		if (sp->ptr < entry->ptr ||
10513 		    (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
10514 			p = &(*p)->rb_left;
10515 		} else if (sp->ptr > entry->ptr ||
10516 			   (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
10517 			p = &(*p)->rb_right;
10518 		} else {
10519 			if (is_block_group)
10520 				entry->bg_extent_count++;
10521 			spin_unlock(&fs_info->swapfile_pins_lock);
10522 			kfree(sp);
10523 			return 1;
10524 		}
10525 	}
10526 	rb_link_node(&sp->node, parent, p);
10527 	rb_insert_color(&sp->node, &fs_info->swapfile_pins);
10528 	spin_unlock(&fs_info->swapfile_pins_lock);
10529 	return 0;
10530 }
10531 
10532 /* Free all of the entries pinned by this swapfile. */
btrfs_free_swapfile_pins(struct inode * inode)10533 static void btrfs_free_swapfile_pins(struct inode *inode)
10534 {
10535 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10536 	struct btrfs_swapfile_pin *sp;
10537 	struct rb_node *node, *next;
10538 
10539 	spin_lock(&fs_info->swapfile_pins_lock);
10540 	node = rb_first(&fs_info->swapfile_pins);
10541 	while (node) {
10542 		next = rb_next(node);
10543 		sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10544 		if (sp->inode == inode) {
10545 			rb_erase(&sp->node, &fs_info->swapfile_pins);
10546 			if (sp->is_block_group) {
10547 				btrfs_dec_block_group_swap_extents(sp->ptr,
10548 							   sp->bg_extent_count);
10549 				btrfs_put_block_group(sp->ptr);
10550 			}
10551 			kfree(sp);
10552 		}
10553 		node = next;
10554 	}
10555 	spin_unlock(&fs_info->swapfile_pins_lock);
10556 }
10557 
10558 struct btrfs_swap_info {
10559 	u64 start;
10560 	u64 block_start;
10561 	u64 block_len;
10562 	u64 lowest_ppage;
10563 	u64 highest_ppage;
10564 	unsigned long nr_pages;
10565 	int nr_extents;
10566 };
10567 
btrfs_add_swap_extent(struct swap_info_struct * sis,struct btrfs_swap_info * bsi)10568 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10569 				 struct btrfs_swap_info *bsi)
10570 {
10571 	unsigned long nr_pages;
10572 	unsigned long max_pages;
10573 	u64 first_ppage, first_ppage_reported, next_ppage;
10574 	int ret;
10575 
10576 	/*
10577 	 * Our swapfile may have had its size extended after the swap header was
10578 	 * written. In that case activating the swapfile should not go beyond
10579 	 * the max size set in the swap header.
10580 	 */
10581 	if (bsi->nr_pages >= sis->max)
10582 		return 0;
10583 
10584 	max_pages = sis->max - bsi->nr_pages;
10585 	first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10586 	next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10587 
10588 	if (first_ppage >= next_ppage)
10589 		return 0;
10590 	nr_pages = next_ppage - first_ppage;
10591 	nr_pages = min(nr_pages, max_pages);
10592 
10593 	first_ppage_reported = first_ppage;
10594 	if (bsi->start == 0)
10595 		first_ppage_reported++;
10596 	if (bsi->lowest_ppage > first_ppage_reported)
10597 		bsi->lowest_ppage = first_ppage_reported;
10598 	if (bsi->highest_ppage < (next_ppage - 1))
10599 		bsi->highest_ppage = next_ppage - 1;
10600 
10601 	ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10602 	if (ret < 0)
10603 		return ret;
10604 	bsi->nr_extents += ret;
10605 	bsi->nr_pages += nr_pages;
10606 	return 0;
10607 }
10608 
btrfs_swap_deactivate(struct file * file)10609 static void btrfs_swap_deactivate(struct file *file)
10610 {
10611 	struct inode *inode = file_inode(file);
10612 
10613 	btrfs_free_swapfile_pins(inode);
10614 	atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10615 }
10616 
btrfs_swap_activate(struct swap_info_struct * sis,struct file * file,sector_t * span)10617 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10618 			       sector_t *span)
10619 {
10620 	struct inode *inode = file_inode(file);
10621 	struct btrfs_root *root = BTRFS_I(inode)->root;
10622 	struct btrfs_fs_info *fs_info = root->fs_info;
10623 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10624 	struct extent_state *cached_state = NULL;
10625 	struct extent_map *em = NULL;
10626 	struct btrfs_device *device = NULL;
10627 	struct btrfs_swap_info bsi = {
10628 		.lowest_ppage = (sector_t)-1ULL,
10629 	};
10630 	int ret = 0;
10631 	u64 isize;
10632 	u64 start;
10633 
10634 	/*
10635 	 * If the swap file was just created, make sure delalloc is done. If the
10636 	 * file changes again after this, the user is doing something stupid and
10637 	 * we don't really care.
10638 	 */
10639 	ret = btrfs_wait_ordered_range(inode, 0, (u64)-1);
10640 	if (ret)
10641 		return ret;
10642 
10643 	/*
10644 	 * The inode is locked, so these flags won't change after we check them.
10645 	 */
10646 	if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10647 		btrfs_warn(fs_info, "swapfile must not be compressed");
10648 		return -EINVAL;
10649 	}
10650 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10651 		btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10652 		return -EINVAL;
10653 	}
10654 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10655 		btrfs_warn(fs_info, "swapfile must not be checksummed");
10656 		return -EINVAL;
10657 	}
10658 
10659 	/*
10660 	 * Balance or device remove/replace/resize can move stuff around from
10661 	 * under us. The exclop protection makes sure they aren't running/won't
10662 	 * run concurrently while we are mapping the swap extents, and
10663 	 * fs_info->swapfile_pins prevents them from running while the swap
10664 	 * file is active and moving the extents. Note that this also prevents
10665 	 * a concurrent device add which isn't actually necessary, but it's not
10666 	 * really worth the trouble to allow it.
10667 	 */
10668 	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10669 		btrfs_warn(fs_info,
10670 	   "cannot activate swapfile while exclusive operation is running");
10671 		return -EBUSY;
10672 	}
10673 
10674 	/*
10675 	 * Prevent snapshot creation while we are activating the swap file.
10676 	 * We do not want to race with snapshot creation. If snapshot creation
10677 	 * already started before we bumped nr_swapfiles from 0 to 1 and
10678 	 * completes before the first write into the swap file after it is
10679 	 * activated, than that write would fallback to COW.
10680 	 */
10681 	if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10682 		btrfs_exclop_finish(fs_info);
10683 		btrfs_warn(fs_info,
10684 	   "cannot activate swapfile because snapshot creation is in progress");
10685 		return -EINVAL;
10686 	}
10687 	/*
10688 	 * Snapshots can create extents which require COW even if NODATACOW is
10689 	 * set. We use this counter to prevent snapshots. We must increment it
10690 	 * before walking the extents because we don't want a concurrent
10691 	 * snapshot to run after we've already checked the extents.
10692 	 *
10693 	 * It is possible that subvolume is marked for deletion but still not
10694 	 * removed yet. To prevent this race, we check the root status before
10695 	 * activating the swapfile.
10696 	 */
10697 	spin_lock(&root->root_item_lock);
10698 	if (btrfs_root_dead(root)) {
10699 		spin_unlock(&root->root_item_lock);
10700 
10701 		btrfs_drew_write_unlock(&root->snapshot_lock);
10702 		btrfs_exclop_finish(fs_info);
10703 		btrfs_warn(fs_info,
10704 		"cannot activate swapfile because subvolume %llu is being deleted",
10705 			root->root_key.objectid);
10706 		return -EPERM;
10707 	}
10708 	atomic_inc(&root->nr_swapfiles);
10709 	spin_unlock(&root->root_item_lock);
10710 
10711 	isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10712 
10713 	lock_extent(io_tree, 0, isize - 1, &cached_state);
10714 	start = 0;
10715 	while (start < isize) {
10716 		u64 logical_block_start, physical_block_start;
10717 		struct btrfs_block_group *bg;
10718 		u64 len = isize - start;
10719 
10720 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len);
10721 		if (IS_ERR(em)) {
10722 			ret = PTR_ERR(em);
10723 			goto out;
10724 		}
10725 
10726 		if (em->block_start == EXTENT_MAP_HOLE) {
10727 			btrfs_warn(fs_info, "swapfile must not have holes");
10728 			ret = -EINVAL;
10729 			goto out;
10730 		}
10731 		if (em->block_start == EXTENT_MAP_INLINE) {
10732 			/*
10733 			 * It's unlikely we'll ever actually find ourselves
10734 			 * here, as a file small enough to fit inline won't be
10735 			 * big enough to store more than the swap header, but in
10736 			 * case something changes in the future, let's catch it
10737 			 * here rather than later.
10738 			 */
10739 			btrfs_warn(fs_info, "swapfile must not be inline");
10740 			ret = -EINVAL;
10741 			goto out;
10742 		}
10743 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
10744 			btrfs_warn(fs_info, "swapfile must not be compressed");
10745 			ret = -EINVAL;
10746 			goto out;
10747 		}
10748 
10749 		logical_block_start = em->block_start + (start - em->start);
10750 		len = min(len, em->len - (start - em->start));
10751 		free_extent_map(em);
10752 		em = NULL;
10753 
10754 		ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, false, true);
10755 		if (ret < 0) {
10756 			goto out;
10757 		} else if (ret) {
10758 			ret = 0;
10759 		} else {
10760 			btrfs_warn(fs_info,
10761 				   "swapfile must not be copy-on-write");
10762 			ret = -EINVAL;
10763 			goto out;
10764 		}
10765 
10766 		em = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10767 		if (IS_ERR(em)) {
10768 			ret = PTR_ERR(em);
10769 			goto out;
10770 		}
10771 
10772 		if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10773 			btrfs_warn(fs_info,
10774 				   "swapfile must have single data profile");
10775 			ret = -EINVAL;
10776 			goto out;
10777 		}
10778 
10779 		if (device == NULL) {
10780 			device = em->map_lookup->stripes[0].dev;
10781 			ret = btrfs_add_swapfile_pin(inode, device, false);
10782 			if (ret == 1)
10783 				ret = 0;
10784 			else if (ret)
10785 				goto out;
10786 		} else if (device != em->map_lookup->stripes[0].dev) {
10787 			btrfs_warn(fs_info, "swapfile must be on one device");
10788 			ret = -EINVAL;
10789 			goto out;
10790 		}
10791 
10792 		physical_block_start = (em->map_lookup->stripes[0].physical +
10793 					(logical_block_start - em->start));
10794 		len = min(len, em->len - (logical_block_start - em->start));
10795 		free_extent_map(em);
10796 		em = NULL;
10797 
10798 		bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10799 		if (!bg) {
10800 			btrfs_warn(fs_info,
10801 			   "could not find block group containing swapfile");
10802 			ret = -EINVAL;
10803 			goto out;
10804 		}
10805 
10806 		if (!btrfs_inc_block_group_swap_extents(bg)) {
10807 			btrfs_warn(fs_info,
10808 			   "block group for swapfile at %llu is read-only%s",
10809 			   bg->start,
10810 			   atomic_read(&fs_info->scrubs_running) ?
10811 				       " (scrub running)" : "");
10812 			btrfs_put_block_group(bg);
10813 			ret = -EINVAL;
10814 			goto out;
10815 		}
10816 
10817 		ret = btrfs_add_swapfile_pin(inode, bg, true);
10818 		if (ret) {
10819 			btrfs_put_block_group(bg);
10820 			if (ret == 1)
10821 				ret = 0;
10822 			else
10823 				goto out;
10824 		}
10825 
10826 		if (bsi.block_len &&
10827 		    bsi.block_start + bsi.block_len == physical_block_start) {
10828 			bsi.block_len += len;
10829 		} else {
10830 			if (bsi.block_len) {
10831 				ret = btrfs_add_swap_extent(sis, &bsi);
10832 				if (ret)
10833 					goto out;
10834 			}
10835 			bsi.start = start;
10836 			bsi.block_start = physical_block_start;
10837 			bsi.block_len = len;
10838 		}
10839 
10840 		start += len;
10841 
10842 		cond_resched();
10843 	}
10844 
10845 	if (bsi.block_len)
10846 		ret = btrfs_add_swap_extent(sis, &bsi);
10847 
10848 out:
10849 	if (!IS_ERR_OR_NULL(em))
10850 		free_extent_map(em);
10851 
10852 	unlock_extent(io_tree, 0, isize - 1, &cached_state);
10853 
10854 	if (ret)
10855 		btrfs_swap_deactivate(file);
10856 
10857 	btrfs_drew_write_unlock(&root->snapshot_lock);
10858 
10859 	btrfs_exclop_finish(fs_info);
10860 
10861 	if (ret)
10862 		return ret;
10863 
10864 	if (device)
10865 		sis->bdev = device->bdev;
10866 	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10867 	sis->max = bsi.nr_pages;
10868 	sis->pages = bsi.nr_pages - 1;
10869 	sis->highest_bit = bsi.nr_pages - 1;
10870 	return bsi.nr_extents;
10871 }
10872 #else
btrfs_swap_deactivate(struct file * file)10873 static void btrfs_swap_deactivate(struct file *file)
10874 {
10875 }
10876 
btrfs_swap_activate(struct swap_info_struct * sis,struct file * file,sector_t * span)10877 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10878 			       sector_t *span)
10879 {
10880 	return -EOPNOTSUPP;
10881 }
10882 #endif
10883 
10884 /*
10885  * Update the number of bytes used in the VFS' inode. When we replace extents in
10886  * a range (clone, dedupe, fallocate's zero range), we must update the number of
10887  * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10888  * always get a correct value.
10889  */
btrfs_update_inode_bytes(struct btrfs_inode * inode,const u64 add_bytes,const u64 del_bytes)10890 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10891 			      const u64 add_bytes,
10892 			      const u64 del_bytes)
10893 {
10894 	if (add_bytes == del_bytes)
10895 		return;
10896 
10897 	spin_lock(&inode->lock);
10898 	if (del_bytes > 0)
10899 		inode_sub_bytes(&inode->vfs_inode, del_bytes);
10900 	if (add_bytes > 0)
10901 		inode_add_bytes(&inode->vfs_inode, add_bytes);
10902 	spin_unlock(&inode->lock);
10903 }
10904 
10905 /*
10906  * Verify that there are no ordered extents for a given file range.
10907  *
10908  * @inode:   The target inode.
10909  * @start:   Start offset of the file range, should be sector size aligned.
10910  * @end:     End offset (inclusive) of the file range, its value +1 should be
10911  *           sector size aligned.
10912  *
10913  * This should typically be used for cases where we locked an inode's VFS lock in
10914  * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10915  * we have flushed all delalloc in the range, we have waited for all ordered
10916  * extents in the range to complete and finally we have locked the file range in
10917  * the inode's io_tree.
10918  */
btrfs_assert_inode_range_clean(struct btrfs_inode * inode,u64 start,u64 end)10919 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10920 {
10921 	struct btrfs_root *root = inode->root;
10922 	struct btrfs_ordered_extent *ordered;
10923 
10924 	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10925 		return;
10926 
10927 	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10928 	if (ordered) {
10929 		btrfs_err(root->fs_info,
10930 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10931 			  start, end, btrfs_ino(inode), root->root_key.objectid,
10932 			  ordered->file_offset,
10933 			  ordered->file_offset + ordered->num_bytes - 1);
10934 		btrfs_put_ordered_extent(ordered);
10935 	}
10936 
10937 	ASSERT(ordered == NULL);
10938 }
10939 
10940 static const struct inode_operations btrfs_dir_inode_operations = {
10941 	.getattr	= btrfs_getattr,
10942 	.lookup		= btrfs_lookup,
10943 	.create		= btrfs_create,
10944 	.unlink		= btrfs_unlink,
10945 	.link		= btrfs_link,
10946 	.mkdir		= btrfs_mkdir,
10947 	.rmdir		= btrfs_rmdir,
10948 	.rename		= btrfs_rename2,
10949 	.symlink	= btrfs_symlink,
10950 	.setattr	= btrfs_setattr,
10951 	.mknod		= btrfs_mknod,
10952 	.listxattr	= btrfs_listxattr,
10953 	.permission	= btrfs_permission,
10954 	.get_inode_acl	= btrfs_get_acl,
10955 	.set_acl	= btrfs_set_acl,
10956 	.update_time	= btrfs_update_time,
10957 	.tmpfile        = btrfs_tmpfile,
10958 	.fileattr_get	= btrfs_fileattr_get,
10959 	.fileattr_set	= btrfs_fileattr_set,
10960 };
10961 
10962 static const struct file_operations btrfs_dir_file_operations = {
10963 	.llseek		= btrfs_dir_llseek,
10964 	.read		= generic_read_dir,
10965 	.iterate_shared	= btrfs_real_readdir,
10966 	.open		= btrfs_opendir,
10967 	.unlocked_ioctl	= btrfs_ioctl,
10968 #ifdef CONFIG_COMPAT
10969 	.compat_ioctl	= btrfs_compat_ioctl,
10970 #endif
10971 	.release        = btrfs_release_file,
10972 	.fsync		= btrfs_sync_file,
10973 };
10974 
10975 /*
10976  * btrfs doesn't support the bmap operation because swapfiles
10977  * use bmap to make a mapping of extents in the file.  They assume
10978  * these extents won't change over the life of the file and they
10979  * use the bmap result to do IO directly to the drive.
10980  *
10981  * the btrfs bmap call would return logical addresses that aren't
10982  * suitable for IO and they also will change frequently as COW
10983  * operations happen.  So, swapfile + btrfs == corruption.
10984  *
10985  * For now we're avoiding this by dropping bmap.
10986  */
10987 static const struct address_space_operations btrfs_aops = {
10988 	.read_folio	= btrfs_read_folio,
10989 	.writepages	= btrfs_writepages,
10990 	.readahead	= btrfs_readahead,
10991 	.invalidate_folio = btrfs_invalidate_folio,
10992 	.release_folio	= btrfs_release_folio,
10993 	.migrate_folio	= btrfs_migrate_folio,
10994 	.dirty_folio	= filemap_dirty_folio,
10995 	.error_remove_page = generic_error_remove_page,
10996 	.swap_activate	= btrfs_swap_activate,
10997 	.swap_deactivate = btrfs_swap_deactivate,
10998 };
10999 
11000 static const struct inode_operations btrfs_file_inode_operations = {
11001 	.getattr	= btrfs_getattr,
11002 	.setattr	= btrfs_setattr,
11003 	.listxattr      = btrfs_listxattr,
11004 	.permission	= btrfs_permission,
11005 	.fiemap		= btrfs_fiemap,
11006 	.get_inode_acl	= btrfs_get_acl,
11007 	.set_acl	= btrfs_set_acl,
11008 	.update_time	= btrfs_update_time,
11009 	.fileattr_get	= btrfs_fileattr_get,
11010 	.fileattr_set	= btrfs_fileattr_set,
11011 };
11012 static const struct inode_operations btrfs_special_inode_operations = {
11013 	.getattr	= btrfs_getattr,
11014 	.setattr	= btrfs_setattr,
11015 	.permission	= btrfs_permission,
11016 	.listxattr	= btrfs_listxattr,
11017 	.get_inode_acl	= btrfs_get_acl,
11018 	.set_acl	= btrfs_set_acl,
11019 	.update_time	= btrfs_update_time,
11020 };
11021 static const struct inode_operations btrfs_symlink_inode_operations = {
11022 	.get_link	= page_get_link,
11023 	.getattr	= btrfs_getattr,
11024 	.setattr	= btrfs_setattr,
11025 	.permission	= btrfs_permission,
11026 	.listxattr	= btrfs_listxattr,
11027 	.update_time	= btrfs_update_time,
11028 };
11029 
11030 const struct dentry_operations btrfs_dentry_operations = {
11031 	.d_delete	= btrfs_dentry_delete,
11032 };
11033