/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/compat.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include <linux/uio.h>
#include <linux/magic.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"
#include "props.h"
#include "qgroup.h"
#include "dedupe.h"

struct btrfs_iget_args {
	struct btrfs_key *location;
	struct btrfs_root *root;
};

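/*
 * Bookkeeping for a single direct IO submission: how much space is still
 * reserved, which part of the ordered extent range has not been submitted
 * yet, and whether this DIO overwrites existing blocks.
 */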
struct btrfs_dio_data {
	u64 reserve;
	u64 unsubmitted_oe_range_start;
	u64 unsubmitted_oe_range_end;
	int overwrite;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

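/*
 * Map the file type bits of i_mode (the S_IFMT field) to the BTRFS_FT_*
 * values stored in directory items on disk, indexed as
 * btrfs_type_by_mode[(mode & S_IFMT) >> S_SHIFT].
 */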
#define S_SHIFT 12
static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash);
static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
				       u64 orig_start, u64 block_start,
				       u64 block_len, u64 orig_block_len,
				       u64 ram_bytes, int compress_type,
				       int type);

static void __endio_write_update_ordered(struct inode *inode,
					 const u64 offset, const u64 bytes,
					 const bool uptodate);

/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the fill_delalloc() callback.
 *
 * NOTE: when an error happens, the caller must not call
 * extent_clear_unlock_delalloc() to clear both the EXTENT_DO_ACCOUNTING
 * and EXTENT_DELALLOC bits simultaneously, because that would release the
 * reserved metadata, which we want to happen only when finishing the
 * ordered extent (btrfs_finish_ordered_io()). Also note that the caller of
 * the fill_delalloc() callback already does proper cleanup for the first
 * page of the range, that is, it invokes the callback
 * writepage_end_io_hook() for the range of the first page.
 */
static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
						 const u64 offset,
						 const u64 bytes)
{
	unsigned long index = offset >> PAGE_SHIFT;
	unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
	struct page *page;

	while (index <= end_index) {
		page = find_get_page(inode->i_mapping, index);
		index++;
		if (!page)
			continue;
		ClearPagePrivate2(page);
		put_page(page);
	}
	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
					    bytes - PAGE_SIZE, false);
}

static int btrfs_dirty_inode(struct inode *inode);

#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_inode_set_ops(struct inode *inode)
{
	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
}
#endif

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * This does all the hard work of inserting an inline extent into the
 * btree.  The caller should have called btrfs_drop_extents() so that no
 * overlapping inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path, int extent_inserted,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int ret;
	size_t cur_size = size;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	inode_add_bytes(inode, size);

	if (!extent_inserted) {
		struct btrfs_key key;
		size_t datasize;

		key.objectid = btrfs_ino(BTRFS_I(inode));
		key.offset = start;
		key.type = BTRFS_EXTENT_DATA_KEY;

		datasize = btrfs_file_extent_calc_inline_size(cur_size);
		path->leave_spinning = 1;
		ret = btrfs_insert_empty_item(trans, root, path, &key,
					      datasize);
		if (ret)
			goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		put_page(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_release_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

fail:
	return ret;
}


/*
 * Conditionally insert an inline extent into the file.  This does the
 * checks required to make sure the data is small enough to fit as an
 * inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
	u64 data_len = inline_len;
	int ret;
	struct btrfs_path *path;
	int extent_inserted = 0;
	u32 extent_item_size;

	if (compressed_size)
		data_len = compressed_size;

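	/*
	 * An extent can only be inlined if it starts at file offset 0, ends
	 * at or beyond EOF, fits within a single sector, stays under both
	 * BTRFS_MAX_INLINE_DATA_SIZE and the max_inline mount option, and
	 * (when uncompressed) does not end exactly on a sector boundary.
	 */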
	if (start > 0 ||
	    actual_end > fs_info->sectorsize ||
	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
	    (!compressed_size &&
	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > fs_info->max_inline) {
		return 1;
	}

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans)) {
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}
	trans->block_rsv = &BTRFS_I(inode)->block_rsv;

	if (compressed_size && compressed_pages)
		extent_item_size = btrfs_file_extent_calc_inline_size(
		   compressed_size);
	else
		extent_item_size = btrfs_file_extent_calc_inline_size(
		    inline_len);

	ret = __btrfs_drop_extents(trans, root, inode, path,
				   start, aligned_end, NULL,
				   1, 1, extent_item_size, &extent_inserted);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, path, extent_inserted,
				   root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
out:
	/*
	 * Don't forget to free the reserved space: an inlined extent does
	 * not count as a data extent, so free the reservation directly
	 * here.  At reserve time the space is always aligned to the page
	 * size, so just free one page here.
	 */
	btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE);
	btrfs_free_path(path);
	btrfs_end_transaction(trans);
	return ret;
}

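/*
 * One unit of work produced by the compression phase: a file range
 * together with its (optionally compressed) pages, queued until the
 * ordered submission phase writes it out.
 */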
struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

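/*
 * Describes one delalloc range handed to the async compression workers;
 * compress_file_range() fills ->extents and async_cow_submit() drains it.
 */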
struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	unsigned int write_flags;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

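/*
 * Decide whether a delalloc range should go through the compression path.
 * The force-compress mount option and the defrag ioctl always win, a
 * per-inode NOCOMPRESS flag (set after bad compression ratios) always
 * loses, and otherwise the compression heuristic gets the final say.
 */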
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);

	/* force compress */
	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
		return 1;
	/* defrag ioctl */
	if (BTRFS_I(inode)->defrag_compress)
		return 1;
	/* bad compression ratios */
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
		return 0;
	if (btrfs_test_opt(fs_info, COMPRESS) ||
	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
	    BTRFS_I(inode)->prop_compress)
		return btrfs_compress_heuristic(inode, start, end);
	return 0;
}

static inline void inode_should_defrag(struct btrfs_inode *inode,
		u64 start, u64 end, u64 num_bytes, u64 small_write)
{
	/* If this is a small write inside eof, kick off a defrag */
	if (num_bytes < small_write &&
	    (start > 0 || end + 1 < inode->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline void compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 blocksize = fs_info->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	int i;
	int will_compress;
	int compress_type = fs_info->compress_type;
	int redirty = 0;

	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
			SZ_16K);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
	nr_pages = min_t(unsigned long, nr_pages,
			BTRFS_MAX_COMPRESSED / PAGE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/*
	 * Skip compression for a small file range (<= blocksize) that isn't
	 * an inline extent, since it doesn't save disk space at all.
	 */
	if (total_compressed <= blocksize &&
	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		goto cleanup_and_bail_uncompressed;

	total_compressed = min_t(unsigned long, total_compressed,
			BTRFS_MAX_UNCOMPRESSED);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (inode_need_compress(inode, start, end)) {
		WARN_ON(pages);
		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->defrag_compress)
			compress_type = BTRFS_I(inode)->defrag_compress;
		else if (BTRFS_I(inode)->prop_compress)
			compress_type = BTRFS_I(inode)->prop_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 *
		 * Note that the remaining part is redirtied, the start pointer
		 * has moved, the end is the original one.
		 */
		if (!redirty) {
			extent_range_clear_dirty_for_io(inode, start, end);
			redirty = 1;
		}

		/* Compression level is applied here and only here */
		ret = btrfs_compress_pages(
			compress_type | (fs_info->compress_level << 4),
					   inode->i_mapping, start,
					   pages,
					   &nr_pages,
					   &total_in,
					   &total_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_SIZE - 1);
			struct page *page = pages[nr_pages - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* let's try to make an inline extent */
		if (ret || total_in < actual_end) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
					    0, BTRFS_COMPRESS_NONE, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				EXTENT_DO_ACCOUNTING;
			unsigned long page_error_op;

			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 *
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be done _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, clear_flags,
						     PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     page_error_op |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * We aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things.
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * One last check to make sure the compression is really a
		 * win: compare the bytes read with the blocks needed on
		 * disk; compression must free at least one sector's worth.
		 */
		total_in = ALIGN(total_in, PAGE_SIZE);
		if (total_compressed + blocksize <= total_in) {
			*num_added += 1;

			/*
			 * The async work queues will take care of doing actual
			 * allocation on disk for these compressed pages, and
			 * will submit them to the elevator.
			 */
			add_async_extent(async_cow, start, total_in,
					total_compressed, pages, nr_pages,
					compress_type);

			if (start + total_in < end) {
				start += total_in;
				pages = NULL;
				cond_resched();
				goto again;
			}
			return;
		}
	}
	if (pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages; i++) {
			WARN_ON(pages[i]->mapping);
			put_page(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->prop_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
cleanup_and_bail_uncompressed:
	/*
	 * No compression, but we still need to write the pages in the file
	 * we've been given so far.  redirty the locked page if it corresponds
	 * to our extent and set things up for the async work queue to run
	 * cow_file_range to do the normal delalloc dance.
	 */
	if (page_offset(locked_page) >= start &&
	    page_offset(locked_page) <= end)
		__set_page_dirty_nobuffers(locked_page);
		/* unlocked later on in the async handlers */

	if (redirty)
		extent_range_redirty_for_io(inode, start, end);
	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
			 BTRFS_COMPRESS_NONE);
	*num_added += 1;

	return;

free_pages_out:
	for (i = 0; i < nr_pages; i++) {
		WARN_ON(pages[i]->mapping);
		put_page(pages[i]);
	}
	kfree(pages);
}

static void free_async_extent_pages(struct async_extent *async_extent)
{
	int i;

	if (!async_extent->pages)
		return;

	for (i = 0; i < async_extent->nr_pages; i++) {
		WARN_ON(async_extent->pages[i]->mapping);
		put_page(async_extent->pages[i]);
	}
	kfree(async_extent->pages);
	async_extent->nr_pages = 0;
	async_extent->pages = NULL;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline void submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_io_tree *io_tree;
	int ret = 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0,
					     NULL);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(inode,
						  async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root, async_extent->ram_size,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1, 1);
		if (ret) {
			free_async_extent_pages(async_extent);

			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);

				/*
				 * We need to redirty the pages if we decide
				 * to fall back to uncompressed IO; otherwise
				 * we will not submit these pages down to the
				 * lower layers.
				 */
				extent_range_redirty_for_io(inode,
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1);

				goto retry;
			}
			goto out_free;
		}
		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		em = create_io_em(inode, async_extent->start,
				  async_extent->ram_size, /* len */
				  async_extent->start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  async_extent->ram_size, /* ram_bytes */
				  async_extent->compress_type,
				  BTRFS_ORDERED_COMPRESSED);
		if (IS_ERR(em))
			/* ret value is not necessary due to void function */
			goto out_free_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret) {
			btrfs_drop_extent_cache(BTRFS_I(inode),
						async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
			goto out_free_reserve;
		}
		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		if (btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages,
				    async_cow->write_flags)) {
			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
			struct page *p = async_extent->pages[0];
			const u64 start = async_extent->start;
			const u64 end = start + async_extent->ram_size - 1;

			p->mapping = inode->i_mapping;
			tree->ops->writepage_end_io_hook(p, start, end,
							 NULL, 0);
			p->mapping = NULL;
			extent_clear_unlock_delalloc(inode, start, end, end,
						     NULL, 0,
						     PAGE_END_WRITEBACK |
						     PAGE_SET_ERROR);
			free_async_extent_pages(async_extent);
		}
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		cond_resched();
	}
	return;
out_free_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
				     PAGE_SET_ERROR);
	free_async_extent_pages(async_extent);
	kfree(async_extent);
	goto again;
}

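/*
 * Look up the extent map covering the range being allocated and use its
 * block start as a hint for the allocator, so consecutive writes to a
 * file tend to land near each other on disk.
 */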
static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * When extent_io.c finds a delayed allocation range in the file, the
 * callbacks end up in this code.  The basic idea is to allocate extents on
 * disk for the range and create ordered data structs in RAM to track those
 * extents.
 *
 * locked_page is the page that writepage had locked already.  We use it to
 * make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with IO
 * when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, u64 delalloc_end,
				   int *page_started, unsigned long *nr_written,
				   int unlock, struct btrfs_dedupe_hash *hash)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size = 0;
	u64 blocksize = fs_info->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	unsigned clear_bits;
	unsigned long page_ops;
	bool extent_reserved = false;
	int ret = 0;

	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		goto out_unlock;
	}

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	disk_num_bytes = num_bytes;

	inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);

	if (start == 0) {
		/* let's try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0,
					BTRFS_COMPRESS_NONE, NULL);
		if (ret == 0) {
			/*
			 * We use DO_ACCOUNTING here because we need the
			 * delalloc_release_metadata to be run _after_ we drop
			 * our outstanding extent for clearing delalloc for this
			 * range.
			 */
			extent_clear_unlock_delalloc(inode, start, end,
				     delalloc_end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DELALLOC_NEW | EXTENT_DEFRAG |
				     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);
			*nr_written = *nr_written +
			     (end - start + PAGE_SIZE) / PAGE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(BTRFS_I(inode), start,
			start + num_bytes - 1, 0);

	while (disk_num_bytes > 0) {
		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
					   fs_info->sectorsize, 0, alloc_hint,
					   &ins, 1, 1);
		if (ret < 0)
			goto out_unlock;
		cur_alloc_size = ins.offset;
		extent_reserved = true;

		ram_size = ins.offset;
		em = create_io_em(inode, start, ins.offset, /* len */
				  start, /* orig_start */
				  ins.objectid, /* block_start */
				  ins.offset, /* block_len */
				  ins.offset, /* orig_block_len */
				  ram_size, /* ram_bytes */
				  BTRFS_COMPRESS_NONE, /* compress_type */
				  BTRFS_ORDERED_REGULAR /* type */);
		if (IS_ERR(em))
			goto out_reserve;
		free_extent_map(em);

		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_drop_extent_cache;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			/*
			 * Only drop cache here, and process as normal.
			 *
			 * We must not allow extent_clear_unlock_delalloc()
			 * at out_unlock label to free meta of this ordered
			 * extent, as its meta should be freed by
			 * btrfs_finish_ordered_io().
			 *
			 * So we must continue until @start is increased to
			 * skip current ordered extent.
			 */
			if (ret)
				btrfs_drop_extent_cache(BTRFS_I(inode), start,
						start + ram_size - 1, 0);
		}

		btrfs_dec_block_group_reservations(fs_info, ins.objectid);

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		page_ops = unlock ? PAGE_UNLOCK : 0;
		page_ops |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1,
					     delalloc_end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     page_ops);
		if (disk_num_bytes < cur_alloc_size)
			disk_num_bytes = 0;
		else
			disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
		extent_reserved = false;

		/*
		 * btrfs_reloc_clone_csums() error: since start has been
		 * increased, extent_clear_unlock_delalloc() at the out_unlock
		 * label won't free the metadata of the current ordered
		 * extent, so we're OK to exit.
		 */
		if (ret)
			goto out_unlock;
	}
out:
	return ret;

out_drop_extent_cache:
	btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
out_reserve:
	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
out_unlock:
	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
	page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
		PAGE_END_WRITEBACK;
	/*
	 * If we reserved an extent for our delalloc range (or a subrange) and
	 * failed to create the respective ordered extent, then it means that
	 * when we reserved the extent we decremented the extent's size from
	 * the data space_info's bytes_may_use counter and incremented the
	 * space_info's bytes_reserved counter by the same amount. We must make
	 * sure extent_clear_unlock_delalloc() does not try to decrement again
	 * the data space_info's bytes_may_use counter, therefore we do not pass
	 * it the flag EXTENT_CLEAR_DATA_RESV.
	 */
	if (extent_reserved) {
		extent_clear_unlock_delalloc(inode, start,
					     start + cur_alloc_size,
					     start + cur_alloc_size,
					     locked_page,
					     clear_bits,
					     page_ops);
		start += cur_alloc_size;
		if (start >= end)
			goto out;
	}
	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
				     locked_page,
				     clear_bits | EXTENT_CLEAR_DATA_RESV,
				     page_ops);
	goto out;
}

/*
 * Work queue callback to start compression on a file range and its pages.
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * Work queue callback to submit previously compressed pages.
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct btrfs_fs_info *fs_info;
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	fs_info = root->fs_info;
	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
		PAGE_SHIFT;

	/*
	 * atomic_sub_return implies a barrier for waitqueue_active
	 */
	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
	    5 * SZ_1M &&
	    waitqueue_active(&fs_info->async_submit_wait))
		wake_up(&fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

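/*
 * Kick off the delalloc range as asynchronous work items: the range is
 * split into chunks of at most 512K (or handed over whole when the inode
 * is flagged nocompress), each holding an inode reference and queued on
 * the delalloc workqueue.
 */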
static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written,
				unsigned int write_flags)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;
		async_cow->write_flags = write_flags;

		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
			cur_end = end;
		else
			cur_end = min(end, start + SZ_512K - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		btrfs_init_work(&async_cow->work,
				btrfs_delalloc_helper,
				async_cow_start, async_cow_submit,
				async_cow_free);

		nr_pages = (cur_end - start + PAGE_SIZE) >>
			PAGE_SHIFT;
		atomic_add(nr_pages, &fs_info->async_delalloc_pages);

		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

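/*
 * Check whether any checksums exist for the given byte range; used to
 * decide whether a nocow write is safe.  Any csums found are freed before
 * returning.
 */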
static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * Called for the nocow writeback path.  This checks for snapshots or COW
 * copies of the extents that exist in the file, and COWs the file as
 * required.
 *
 * If no COW copies or snapshots exist, we write directly to the existing
 * blocks on disk.
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	struct extent_map *em;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 disk_num_bytes;
	u64 ram_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(BTRFS_I(inode));

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, end,
					     locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));

	cow_start = (u64)-1;
	cur_offset = start;
	while (1) {
		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
					       cur_offset, 0);
		if (ret < 0)
			goto error;
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				goto error;
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino)
			break;
		if (WARN_ON_ONCE(found_key.objectid < ino) ||
		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
			path->slots[0]++;
			goto next_slot;
		}
		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(fs_info, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * If there are pending snapshots for this root, we
			 * fall back to the common COW path.
			 */
			if (!nolock) {
				err = btrfs_start_write_no_snapshotting(root);
				if (!err)
					goto out_check;
			}
			/*
			 * Force COW if csums exist in the range.  This
			 * ensures that the csums for a given extent are
			 * either valid or do not exist.
			 */
			if (csum_exist_in_range(fs_info, disk_bytenr,
						num_bytes)) {
				if (!nolock)
					btrfs_end_write_no_snapshotting(root);
				goto out_check;
			}
			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
				if (!nolock)
					btrfs_end_write_no_snapshotting(root);
				goto out_check;
			}
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf,
						     path->slots[0], fi);
			extent_end = ALIGN(extent_end,
					   fs_info->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			if (!nolock && nocow)
				btrfs_end_write_no_snapshotting(root);
			if (nocow)
				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     end, page_started, nr_written, 1,
					     NULL);
			if (ret) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshotting(root);
				if (nocow)
					btrfs_dec_nocow_writers(fs_info,
								disk_bytenr);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			u64 orig_start = found_key.offset - extent_offset;

			em = create_io_em(inode, cur_offset, num_bytes,
					  orig_start,
					  disk_bytenr, /* block_start */
					  num_bytes, /* block_len */
					  disk_num_bytes, /* orig_block_len */
					  ram_bytes, BTRFS_COMPRESS_NONE,
					  BTRFS_ORDERED_PREALLOC);
			if (IS_ERR(em)) {
				if (!nolock && nocow)
					btrfs_end_write_no_snapshotting(root);
				if (nocow)
					btrfs_dec_nocow_writers(fs_info,
								disk_bytenr);
				ret = PTR_ERR(em);
				goto error;
			}
			free_extent_map(em);
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		if (nocow)
			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID)
			/*
			 * Errors are handled later, as we must prevent
			 * extent_clear_unlock_delalloc() in the error handler
			 * from freeing the metadata of the created ordered
			 * extent.
			 */
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC |
					     EXTENT_CLEAR_DATA_RESV,
					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);

		if (!nolock && nocow)
			btrfs_end_write_no_snapshotting(root);
		cur_offset = extent_end;

		/*
		 * On btrfs_reloc_clone_csums() error we're now OK to call the
		 * error handler, as the metadata for the created ordered
		 * extent will only be freed by btrfs_finish_ordered_io().
		 */
		if (ret)
			goto error;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end, end,
				     page_started, nr_written, 1, NULL);
		if (ret)
			goto error;
	}

error:
	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}

static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
{

	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
		return 0;

	/*
	 * @defrag_bytes is a hint value with no spinlock held here; if it is
	 * not zero, it means the file is being defragged.  Force COW if the
	 * given extent needs to be defragged.
	 */
	if (BTRFS_I(inode)->defrag_bytes &&
	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
			   EXTENT_DEFRAG, 0, NULL))
		return 1;

	return 0;
}

/*
 * extent_io.c callback to do delayed allocation processing.
 */
static int run_delalloc_range(void *private_data, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written,
			      struct writeback_control *wbc)
{
	struct inode *inode = private_data;
	int ret;
	int force_cow = need_force_cow(inode, start, end);
	unsigned int write_flags = wbc_to_write_flags(wbc);

	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!inode_need_compress(inode, start, end)) {
		ret = cow_file_range(inode, locked_page, start, end, end,
				      page_started, nr_written, 1, NULL);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written,
					   write_flags);
	}
	if (ret)
		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
	return ret;
}

static void btrfs_split_extent_hook(void *private_data,
				    struct extent_state *orig, u64 split)
{
	struct inode *inode = private_data;
	u64 size;

	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	size = orig->end - orig->start + 1;
	if (size > BTRFS_MAX_EXTENT_SIZE) {
		u32 num_extents;
		u64 new_size;

		/*
		 * See the explanation in btrfs_merge_extent_hook, the same
		 * applies here, just in reverse.
		 */
		new_size = orig->end - split + 1;
		num_extents = count_max_extents(new_size);
		new_size = split - orig->start;
		num_extents += count_max_extents(new_size);
		if (count_max_extents(size) >= num_extents)
			return;
	}

	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(void *private_data,
				    struct extent_state *new,
				    struct extent_state *other)
{
	struct inode *inode = private_data;
	u64 new_size, old_size;
	u32 num_extents;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	if (new->start > other->start)
		new_size = new->end - other->start + 1;
	else
		new_size = other->end - new->start + 1;

	/* we're not bigger than the max, unreserve the space and go */
	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
		spin_lock(&BTRFS_I(inode)->lock);
		btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
		spin_unlock(&BTRFS_I(inode)->lock);
		return;
	}

	/*
	 * We have to add up either side to figure out how many extents were
	 * accounted for before we merged into one big extent.  If the number of
	 * extents we accounted for is <= the amount we need for the new range
	 * then we can return, otherwise drop.  Think of it like this
	 *
	 * [ 4k][MAX_SIZE]
	 *
	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
	 * need 2 outstanding extents, on one side we have 1 and the other side
	 * we have 1 so they are == and we can return.  But in this case
	 *
	 * [MAX_SIZE+4k][MAX_SIZE+4k]
	 *
	 * Each range on their own accounts for 2 extents, but merged together
	 * they are only 3 extents worth of accounting, so we need to drop in
	 * this case.
	 */
	old_size = other->end - other->start + 1;
	num_extents = count_max_extents(old_size);
	old_size = new->end - new->start + 1;
	num_extents += count_max_extents(old_size);
	if (count_max_extents(new_size) >= num_extents)
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
	spin_unlock(&BTRFS_I(inode)->lock);
}

1711 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1712 				      struct inode *inode)
1713 {
1714 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1715 
1716 	spin_lock(&root->delalloc_lock);
1717 	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1718 		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1719 			      &root->delalloc_inodes);
1720 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1721 			&BTRFS_I(inode)->runtime_flags);
1722 		root->nr_delalloc_inodes++;
1723 		if (root->nr_delalloc_inodes == 1) {
1724 			spin_lock(&fs_info->delalloc_root_lock);
1725 			BUG_ON(!list_empty(&root->delalloc_root));
1726 			list_add_tail(&root->delalloc_root,
1727 				      &fs_info->delalloc_roots);
1728 			spin_unlock(&fs_info->delalloc_root_lock);
1729 		}
1730 	}
1731 	spin_unlock(&root->delalloc_lock);
1732 }
1733 
1734 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1735 				     struct btrfs_inode *inode)
1736 {
1737 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1738 
1739 	spin_lock(&root->delalloc_lock);
1740 	if (!list_empty(&inode->delalloc_inodes)) {
1741 		list_del_init(&inode->delalloc_inodes);
1742 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1743 			  &inode->runtime_flags);
1744 		root->nr_delalloc_inodes--;
1745 		if (!root->nr_delalloc_inodes) {
1746 			spin_lock(&fs_info->delalloc_root_lock);
1747 			BUG_ON(list_empty(&root->delalloc_root));
1748 			list_del_init(&root->delalloc_root);
1749 			spin_unlock(&fs_info->delalloc_root_lock);
1750 		}
1751 	}
1752 	spin_unlock(&root->delalloc_lock);
1753 }
1754 
1755 /*
1756  * extent_io.c set_bit_hook, used to track delayed allocation
1757  * bytes in this file, and to maintain the list of inodes that
1758  * have pending delalloc work to be done.
1759  */
1760 static void btrfs_set_bit_hook(void *private_data,
1761 			       struct extent_state *state, unsigned *bits)
1762 {
1763 	struct inode *inode = private_data;
1764 
1765 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1766 
1767 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1768 		WARN_ON(1);
1769 	/*
1770 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1771 	 * but in this case, we are only testing for the DELALLOC
1772 	 * bit, which is only set or cleared with irqs on
1773 	 */
1774 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1775 		struct btrfs_root *root = BTRFS_I(inode)->root;
1776 		u64 len = state->end + 1 - state->start;
1777 		u32 num_extents = count_max_extents(len);
1778 		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1779 
1780 		spin_lock(&BTRFS_I(inode)->lock);
1781 		btrfs_mod_outstanding_extents(BTRFS_I(inode), num_extents);
1782 		spin_unlock(&BTRFS_I(inode)->lock);
1783 
1784 		/* For sanity tests */
1785 		if (btrfs_is_testing(fs_info))
1786 			return;
1787 
1788 		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
1789 					 fs_info->delalloc_batch);
1790 		spin_lock(&BTRFS_I(inode)->lock);
1791 		BTRFS_I(inode)->delalloc_bytes += len;
1792 		if (*bits & EXTENT_DEFRAG)
1793 			BTRFS_I(inode)->defrag_bytes += len;
1794 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1795 					 &BTRFS_I(inode)->runtime_flags))
1796 			btrfs_add_delalloc_inodes(root, inode);
1797 		spin_unlock(&BTRFS_I(inode)->lock);
1798 	}
1799 
1800 	if (!(state->state & EXTENT_DELALLOC_NEW) &&
1801 	    (*bits & EXTENT_DELALLOC_NEW)) {
1802 		spin_lock(&BTRFS_I(inode)->lock);
1803 		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1804 			state->start;
1805 		spin_unlock(&BTRFS_I(inode)->lock);
1806 	}
1807 }
1808 
1809 /*
1810  * extent_io.c clear_bit_hook, see set_bit_hook for why
1811  */
1812 static void btrfs_clear_bit_hook(void *private_data,
1813 				 struct extent_state *state,
1814 				 unsigned *bits)
1815 {
1816 	struct btrfs_inode *inode = BTRFS_I((struct inode *)private_data);
1817 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1818 	u64 len = state->end + 1 - state->start;
1819 	u32 num_extents = count_max_extents(len);
1820 
1821 	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
1822 		spin_lock(&inode->lock);
1823 		inode->defrag_bytes -= len;
1824 		spin_unlock(&inode->lock);
1825 	}
1826 
1827 	/*
1828 	 * set_bit and clear bit hooks normally require _irqsave/restore
1829 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1830 	 * bit, which is only set or cleared with irqs on
1831 	 */
1832 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1833 		struct btrfs_root *root = inode->root;
1834 		bool do_list = !btrfs_is_free_space_inode(inode);
1835 
1836 		spin_lock(&inode->lock);
1837 		btrfs_mod_outstanding_extents(inode, -num_extents);
1838 		spin_unlock(&inode->lock);
1839 
1840 		/*
1841 		 * We don't reserve metadata space for space cache inodes so we
1842 		 * don't need to call delalloc_release_metadata if there is an
1843 		 * error.
1844 		 */
1845 		if (*bits & EXTENT_CLEAR_META_RESV &&
1846 		    root != fs_info->tree_root)
1847 			btrfs_delalloc_release_metadata(inode, len);
1848 
1849 		/* For sanity tests. */
1850 		if (btrfs_is_testing(fs_info))
1851 			return;
1852 
1853 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1854 		    do_list && !(state->state & EXTENT_NORESERVE) &&
1855 		    (*bits & EXTENT_CLEAR_DATA_RESV))
1856 			btrfs_free_reserved_data_space_noquota(
1857 					&inode->vfs_inode,
1858 					state->start, len);
1859 
1860 		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
1861 					 fs_info->delalloc_batch);
1862 		spin_lock(&inode->lock);
1863 		inode->delalloc_bytes -= len;
1864 		if (do_list && inode->delalloc_bytes == 0 &&
1865 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1866 					&inode->runtime_flags))
1867 			btrfs_del_delalloc_inode(root, inode);
1868 		spin_unlock(&inode->lock);
1869 	}
1870 
1871 	if ((state->state & EXTENT_DELALLOC_NEW) &&
1872 	    (*bits & EXTENT_DELALLOC_NEW)) {
1873 		spin_lock(&inode->lock);
1874 		ASSERT(inode->new_delalloc_bytes >= len);
1875 		inode->new_delalloc_bytes -= len;
1876 		spin_unlock(&inode->lock);
1877 	}
1878 }
1879 
1880 /*
1881  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1882  * we don't create bios that span stripes or chunks
1883  *
1884  * return 1 if the page cannot be merged into the bio
1885  * return 0 if the page can be merged into the bio
1886  * return a negative error code otherwise
1887  */
1888 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1889 			 size_t size, struct bio *bio,
1890 			 unsigned long bio_flags)
1891 {
1892 	struct inode *inode = page->mapping->host;
1893 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1894 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1895 	u64 length = 0;
1896 	u64 map_length;
1897 	int ret;
1898 
1899 	if (bio_flags & EXTENT_BIO_COMPRESSED)
1900 		return 0;
1901 
1902 	length = bio->bi_iter.bi_size;
1903 	map_length = length;
1904 	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1905 			      NULL, 0);
1906 	if (ret < 0)
1907 		return ret;
1908 	if (map_length < length + size)
1909 		return 1;
1910 	return 0;
1911 }
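
/*
 * A concrete example of the check above, assuming a 64k stripe: a bio
 * that starts at logical 60k already holds 4k, so length = 4k and
 * btrfs_map_block() reports map_length = 4k of contiguous space before
 * the stripe boundary.  Merging another 4k page would need
 * map_length >= length + size = 8k, so we return 1 and the caller
 * starts a new bio rather than crossing the stripe.
 */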
1912 
1913 /*
1914  * in order to insert checksums into the metadata in large chunks,
1915  * we wait until bio submission time.  All the pages in the bio are
1916  * checksummed and sums are attached onto the ordered extent record.
1917  *
1918  * At IO completion time the csums attached on the ordered extent record
1919  * are inserted into the btree.
1920  */
1921 static blk_status_t __btrfs_submit_bio_start(void *private_data, struct bio *bio,
1922 				    int mirror_num, unsigned long bio_flags,
1923 				    u64 bio_offset)
1924 {
1925 	struct inode *inode = private_data;
1926 	blk_status_t ret = 0;
1927 
1928 	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1929 	BUG_ON(ret); /* -ENOMEM */
1930 	return 0;
1931 }
1932 
1933 /*
1934  * in order to insert checksums into the metadata in large chunks,
1935  * we wait until bio submission time.  All the pages in the bio are
1936  * checksummed and sums are attached onto the ordered extent record.
1937  *
1938  * At IO completion time the csums attached on the ordered extent record
1939  * are inserted into the btree.
1940  */
1941 static blk_status_t __btrfs_submit_bio_done(void *private_data, struct bio *bio,
1942 			  int mirror_num, unsigned long bio_flags,
1943 			  u64 bio_offset)
1944 {
1945 	struct inode *inode = private_data;
1946 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1947 	blk_status_t ret;
1948 
1949 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1950 	if (ret) {
1951 		bio->bi_status = ret;
1952 		bio_endio(bio);
1953 	}
1954 	return ret;
1955 }
1956 
1957 /*
1958  * extent_io.c submission hook. This does the right thing for csum calculation
1959  * on write, or reading the csums from the tree before a read.
1960  *
1961  * Rules about async/sync submit,
1962  * a) read:				sync submit
1963  *
1964  * b) write without checksum:		sync submit
1965  *
1966  * c) write with checksum:
1967  *    c-1) if bio is issued by fsync:	sync submit
1968  *         (sync_writers != 0)
1969  *
1970  *    c-2) if root is reloc root:	sync submit
1971  *         (only in case of buffered IO)
1972  *
1973  *    c-3) otherwise:			async submit
1974  */
1975 static blk_status_t btrfs_submit_bio_hook(void *private_data, struct bio *bio,
1976 				 int mirror_num, unsigned long bio_flags,
1977 				 u64 bio_offset)
1978 {
1979 	struct inode *inode = private_data;
1980 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1981 	struct btrfs_root *root = BTRFS_I(inode)->root;
1982 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1983 	blk_status_t ret = 0;
1984 	int skip_sum;
1985 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1986 
1987 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1988 
1989 	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1990 		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1991 
1992 	if (bio_op(bio) != REQ_OP_WRITE) {
1993 		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1994 		if (ret)
1995 			goto out;
1996 
1997 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1998 			ret = btrfs_submit_compressed_read(inode, bio,
1999 							   mirror_num,
2000 							   bio_flags);
2001 			goto out;
2002 		} else if (!skip_sum) {
2003 			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
2004 			if (ret)
2005 				goto out;
2006 		}
2007 		goto mapit;
2008 	} else if (async && !skip_sum) {
2009 		/* csum items have already been cloned */
2010 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2011 			goto mapit;
2012 		/* we're doing a write, do the async checksumming */
2013 		ret = btrfs_wq_submit_bio(fs_info, bio, mirror_num, bio_flags,
2014 					  bio_offset, inode,
2015 					  __btrfs_submit_bio_start,
2016 					  __btrfs_submit_bio_done);
2017 		goto out;
2018 	} else if (!skip_sum) {
2019 		ret = btrfs_csum_one_bio(inode, bio, 0, 0);
2020 		if (ret)
2021 			goto out;
2022 	}
2023 
2024 mapit:
2025 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
2026 
2027 out:
2028 	if (ret) {
2029 		bio->bi_status = ret;
2030 		bio_endio(bio);
2031 	}
2032 	return ret;
2033 }
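
/*
 * Mapping the rules above onto the branches: reads hook up the end_io
 * workqueue and (for csummed data) look up the csums before mapping.
 * Checksummed writes with no sync writers are handed to
 * btrfs_wq_submit_bio(), which runs __btrfs_submit_bio_start() to csum
 * the pages and __btrfs_submit_bio_done() to map the bio once csumming
 * completes; reloc-root writes skip this because their csum items were
 * already cloned.  Everything else csums inline (if needed) and maps
 * synchronously.
 */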
2034 
2035 /*
2036  * given a list of ordered sums, record them in the inode.  This happens
2037  * at IO completion time based on sums calculated at bio submission time.
2038  */
2039 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2040 			     struct inode *inode, struct list_head *list)
2041 {
2042 	struct btrfs_ordered_sum *sum;
2043 
2044 	list_for_each_entry(sum, list, list) {
2045 		trans->adding_csums = true;
2046 		btrfs_csum_file_blocks(trans,
2047 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
2048 		trans->adding_csums = false;
2049 	}
2050 	return 0;
2051 }
2052 
2053 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2054 			      unsigned int extra_bits,
2055 			      struct extent_state **cached_state, int dedupe)
2056 {
2057 	WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2058 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2059 				   extra_bits, cached_state);
2060 }
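
/*
 * The WARN_ON above catches callers passing an exclusive end: ranges in
 * the io_tree are inclusive, so a page-aligned range must end at
 * n * PAGE_SIZE - 1 (e.g. start = 0, end = 4095 for the first page),
 * never at a multiple of PAGE_SIZE.
 */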
2061 
2062 /* see btrfs_writepage_start_hook for details on why this is required */
2063 struct btrfs_writepage_fixup {
2064 	struct page *page;
2065 	struct btrfs_work work;
2066 };
2067 
2068 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2069 {
2070 	struct btrfs_writepage_fixup *fixup;
2071 	struct btrfs_ordered_extent *ordered;
2072 	struct extent_state *cached_state = NULL;
2073 	struct extent_changeset *data_reserved = NULL;
2074 	struct page *page;
2075 	struct inode *inode;
2076 	u64 page_start;
2077 	u64 page_end;
2078 	int ret;
2079 
2080 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
2081 	page = fixup->page;
2082 again:
2083 	lock_page(page);
2084 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2085 		ClearPageChecked(page);
2086 		goto out_page;
2087 	}
2088 
2089 	inode = page->mapping->host;
2090 	page_start = page_offset(page);
2091 	page_end = page_offset(page) + PAGE_SIZE - 1;
2092 
2093 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2094 			 &cached_state);
2095 
2096 	/* already ordered? We're done */
2097 	if (PagePrivate2(page))
2098 		goto out;
2099 
2100 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2101 					PAGE_SIZE);
2102 	if (ordered) {
2103 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2104 				     page_end, &cached_state);
2105 		unlock_page(page);
2106 		btrfs_start_ordered_extent(inode, ordered, 1);
2107 		btrfs_put_ordered_extent(ordered);
2108 		goto again;
2109 	}
2110 
2111 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2112 					   PAGE_SIZE);
2113 	if (ret) {
2114 		mapping_set_error(page->mapping, ret);
2115 		end_extent_writepage(page, ret, page_start, page_end);
2116 		ClearPageChecked(page);
2117 		goto out;
2118 	}
2119 
2120 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2121 					&cached_state, 0);
2122 	if (ret) {
2123 		mapping_set_error(page->mapping, ret);
2124 		end_extent_writepage(page, ret, page_start, page_end);
2125 		ClearPageChecked(page);
2126 		goto out;
2127 	}
2128 
2129 	ClearPageChecked(page);
2130 	set_page_dirty(page);
2131 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
2132 out:
2133 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2134 			     &cached_state);
2135 out_page:
2136 	unlock_page(page);
2137 	put_page(page);
2138 	kfree(fixup);
2139 	extent_changeset_free(data_reserved);
2140 }
2141 
2142 /*
2143  * There are a few paths in the higher layers of the kernel that directly
2144  * set the page dirty bit without asking the filesystem if it is a
2145  * good idea.  This causes problems because we want to make sure COW
2146  * properly happens and the data=ordered rules are followed.
2147  *
2148  * In our case any range that doesn't have the ORDERED bit set
2149  * hasn't been properly setup for IO.  We kick off an async process
2150  * hasn't been properly set up for IO.  We kick off an async process
2151  * the delalloc bit and make it safe to write the page.
2152  */
2153 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2154 {
2155 	struct inode *inode = page->mapping->host;
2156 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2157 	struct btrfs_writepage_fixup *fixup;
2158 
2159 	/* this page is properly in the ordered list */
2160 	if (TestClearPagePrivate2(page))
2161 		return 0;
2162 
2163 	if (PageChecked(page))
2164 		return -EAGAIN;
2165 
2166 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2167 	if (!fixup)
2168 		return -EAGAIN;
2169 
2170 	SetPageChecked(page);
2171 	get_page(page);
2172 	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2173 			btrfs_writepage_fixup_worker, NULL, NULL);
2174 	fixup->page = page;
2175 	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2176 	return -EBUSY;
2177 }
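
/*
 * Return values above, as the caller in extent_io.c treats them at this
 * revision: 0 means the page was already covered by an ordered extent
 * and writeback can proceed; -EBUSY means a fixup worker now holds a
 * reference and will requeue the page, so the caller counts it as
 * skipped; -EAGAIN (a fixup is already pending, or no memory for the
 * fixup struct) makes the caller redirty the page and try again later.
 */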
2178 
2179 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2180 				       struct inode *inode, u64 file_pos,
2181 				       u64 disk_bytenr, u64 disk_num_bytes,
2182 				       u64 num_bytes, u64 ram_bytes,
2183 				       u8 compression, u8 encryption,
2184 				       u16 other_encoding, int extent_type)
2185 {
2186 	struct btrfs_root *root = BTRFS_I(inode)->root;
2187 	struct btrfs_file_extent_item *fi;
2188 	struct btrfs_path *path;
2189 	struct extent_buffer *leaf;
2190 	struct btrfs_key ins;
2191 	u64 qg_released;
2192 	int extent_inserted = 0;
2193 	int ret;
2194 
2195 	path = btrfs_alloc_path();
2196 	if (!path)
2197 		return -ENOMEM;
2198 
2199 	/*
2200 	 * we may be replacing one extent in the tree with another.
2201 	 * The new extent is pinned in the extent map, and we don't want
2202 	 * to drop it from the cache until it is completely in the btree.
2203 	 *
2204 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2205 	 * The caller is expected to unpin it and allow it to be merged
2206 	 * with the others.
2207 	 */
2208 	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2209 				   file_pos + num_bytes, NULL, 0,
2210 				   1, sizeof(*fi), &extent_inserted);
2211 	if (ret)
2212 		goto out;
2213 
2214 	if (!extent_inserted) {
2215 		ins.objectid = btrfs_ino(BTRFS_I(inode));
2216 		ins.offset = file_pos;
2217 		ins.type = BTRFS_EXTENT_DATA_KEY;
2218 
2219 		path->leave_spinning = 1;
2220 		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2221 					      sizeof(*fi));
2222 		if (ret)
2223 			goto out;
2224 	}
2225 	leaf = path->nodes[0];
2226 	fi = btrfs_item_ptr(leaf, path->slots[0],
2227 			    struct btrfs_file_extent_item);
2228 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2229 	btrfs_set_file_extent_type(leaf, fi, extent_type);
2230 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2231 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2232 	btrfs_set_file_extent_offset(leaf, fi, 0);
2233 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2234 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2235 	btrfs_set_file_extent_compression(leaf, fi, compression);
2236 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
2237 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2238 
2239 	btrfs_mark_buffer_dirty(leaf);
2240 	btrfs_release_path(path);
2241 
2242 	inode_add_bytes(inode, num_bytes);
2243 
2244 	ins.objectid = disk_bytenr;
2245 	ins.offset = disk_num_bytes;
2246 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2247 
2248 	/*
2249 	 * Release the reserved range from the inode dirty range map, as it
2250 	 * has already been moved into the delayed_ref_head
2251 	 */
2252 	ret = btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2253 	if (ret < 0)
2254 		goto out;
2255 	qg_released = ret;
2256 	ret = btrfs_alloc_reserved_file_extent(trans, root,
2257 					       btrfs_ino(BTRFS_I(inode)),
2258 					       file_pos, qg_released, &ins);
2259 out:
2260 	btrfs_free_path(path);
2261 
2262 	return ret;
2263 }
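
/*
 * Field semantics in the item written above: disk_bytenr/disk_num_bytes
 * describe the extent as it sits on disk (the compressed size for
 * compressed extents), num_bytes is the number of file bytes the item
 * covers, and ram_bytes is the uncompressed length of the whole extent.
 * offset is 0 here because the item always refers to the start of a
 * freshly allocated extent.
 */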
2264 
2265 /* snapshot-aware defrag */
2266 struct sa_defrag_extent_backref {
2267 	struct rb_node node;
2268 	struct old_sa_defrag_extent *old;
2269 	u64 root_id;
2270 	u64 inum;
2271 	u64 file_pos;
2272 	u64 extent_offset;
2273 	u64 num_bytes;
2274 	u64 generation;
2275 };
2276 
2277 struct old_sa_defrag_extent {
2278 	struct list_head list;
2279 	struct new_sa_defrag_extent *new;
2280 
2281 	u64 extent_offset;
2282 	u64 bytenr;
2283 	u64 offset;
2284 	u64 len;
2285 	int count;
2286 };
2287 
2288 struct new_sa_defrag_extent {
2289 	struct rb_root root;
2290 	struct list_head head;
2291 	struct btrfs_path *path;
2292 	struct inode *inode;
2293 	u64 file_pos;
2294 	u64 len;
2295 	u64 bytenr;
2296 	u64 disk_len;
2297 	u8 compress_type;
2298 };
2299 
2300 static int backref_comp(struct sa_defrag_extent_backref *b1,
2301 			struct sa_defrag_extent_backref *b2)
2302 {
2303 	if (b1->root_id < b2->root_id)
2304 		return -1;
2305 	else if (b1->root_id > b2->root_id)
2306 		return 1;
2307 
2308 	if (b1->inum < b2->inum)
2309 		return -1;
2310 	else if (b1->inum > b2->inum)
2311 		return 1;
2312 
2313 	if (b1->file_pos < b2->file_pos)
2314 		return -1;
2315 	else if (b1->file_pos > b2->file_pos)
2316 		return 1;
2317 
2318 	/*
2319 	 * [------------------------------] ===> (a range of space)
2320 	 *     |<--->|   |<---->| =============> (fs/file tree A)
2321 	 * |<---------------------------->| ===> (fs/file tree B)
2322 	 *
2323 	 * A range of space can refer to two file extents in one tree while
2324 	 * referring to only one file extent in another tree.
2325 	 *
2326 	 * So we may process a disk offset more than once (two extents in A)
2327 	 * and land on the same extent (one extent in B), then insert two
2328 	 * identical backrefs (both referring to the extent in B).
2329 	 */
2330 	return 0;
2331 }
2332 
2333 static void backref_insert(struct rb_root *root,
2334 			   struct sa_defrag_extent_backref *backref)
2335 {
2336 	struct rb_node **p = &root->rb_node;
2337 	struct rb_node *parent = NULL;
2338 	struct sa_defrag_extent_backref *entry;
2339 	int ret;
2340 
2341 	while (*p) {
2342 		parent = *p;
2343 		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2344 
2345 		ret = backref_comp(backref, entry);
2346 		if (ret < 0)
2347 			p = &(*p)->rb_left;
2348 		else
2349 			p = &(*p)->rb_right;
2350 	}
2351 
2352 	rb_link_node(&backref->node, parent, p);
2353 	rb_insert_color(&backref->node, root);
2354 }
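
/*
 * Note that backref_comp() can legitimately return 0 (see the diagram
 * above) and there is no equality branch here: ties go to the right, so
 * duplicate backrefs are kept in the tree in insertion order rather
 * than being rejected or overwritten.
 */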
2355 
2356 /*
2357  * Note the backref might have changed, and in this case we just return 0.
2358  */
2359 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2360 				       void *ctx)
2361 {
2362 	struct btrfs_file_extent_item *extent;
2363 	struct old_sa_defrag_extent *old = ctx;
2364 	struct new_sa_defrag_extent *new = old->new;
2365 	struct btrfs_path *path = new->path;
2366 	struct btrfs_key key;
2367 	struct btrfs_root *root;
2368 	struct sa_defrag_extent_backref *backref;
2369 	struct extent_buffer *leaf;
2370 	struct inode *inode = new->inode;
2371 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2372 	int slot;
2373 	int ret;
2374 	u64 extent_offset;
2375 	u64 num_bytes;
2376 
2377 	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2378 	    inum == btrfs_ino(BTRFS_I(inode)))
2379 		return 0;
2380 
2381 	key.objectid = root_id;
2382 	key.type = BTRFS_ROOT_ITEM_KEY;
2383 	key.offset = (u64)-1;
2384 
2385 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2386 	if (IS_ERR(root)) {
2387 		if (PTR_ERR(root) == -ENOENT)
2388 			return 0;
2389 		WARN_ON(1);
2390 		btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2391 			 inum, offset, root_id);
2392 		return PTR_ERR(root);
2393 	}
2394 
2395 	key.objectid = inum;
2396 	key.type = BTRFS_EXTENT_DATA_KEY;
2397 	if (offset > (u64)-1 << 32)
2398 		key.offset = 0;
2399 	else
2400 		key.offset = offset;
2401 
2402 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2403 	if (WARN_ON(ret < 0))
2404 		return ret;
2405 	ret = 0;
2406 
2407 	while (1) {
2408 		cond_resched();
2409 
2410 		leaf = path->nodes[0];
2411 		slot = path->slots[0];
2412 
2413 		if (slot >= btrfs_header_nritems(leaf)) {
2414 			ret = btrfs_next_leaf(root, path);
2415 			if (ret < 0) {
2416 				goto out;
2417 			} else if (ret > 0) {
2418 				ret = 0;
2419 				goto out;
2420 			}
2421 			continue;
2422 		}
2423 
2424 		path->slots[0]++;
2425 
2426 		btrfs_item_key_to_cpu(leaf, &key, slot);
2427 
2428 		if (key.objectid > inum)
2429 			goto out;
2430 
2431 		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2432 			continue;
2433 
2434 		extent = btrfs_item_ptr(leaf, slot,
2435 					struct btrfs_file_extent_item);
2436 
2437 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2438 			continue;
2439 
2440 		/*
2441 		 * 'offset' refers to the exact key.offset,
2442 		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2443 		 * (key.offset - extent_offset).
2444 		 */
2445 		if (key.offset != offset)
2446 			continue;
2447 
2448 		extent_offset = btrfs_file_extent_offset(leaf, extent);
2449 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2450 
2451 		if (extent_offset >= old->extent_offset + old->offset +
2452 		    old->len || extent_offset + num_bytes <=
2453 		    old->extent_offset + old->offset)
2454 			continue;
2455 		break;
2456 	}
2457 
2458 	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2459 	if (!backref) {
2460 		ret = -ENOMEM;
2461 		goto out;
2462 	}
2463 
2464 	backref->root_id = root_id;
2465 	backref->inum = inum;
2466 	backref->file_pos = offset;
2467 	backref->num_bytes = num_bytes;
2468 	backref->extent_offset = extent_offset;
2469 	backref->generation = btrfs_file_extent_generation(leaf, extent);
2470 	backref->old = old;
2471 	backref_insert(&new->root, backref);
2472 	old->count++;
2473 out:
2474 	btrfs_release_path(path);
2475 	WARN_ON(ret);
2476 	return ret;
2477 }
2478 
2479 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2480 				   struct new_sa_defrag_extent *new)
2481 {
2482 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2483 	struct old_sa_defrag_extent *old, *tmp;
2484 	int ret;
2485 
2486 	new->path = path;
2487 
2488 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2489 		ret = iterate_inodes_from_logical(old->bytenr +
2490 						  old->extent_offset, fs_info,
2491 						  path, record_one_backref,
2492 						  old, false);
2493 		if (ret < 0 && ret != -ENOENT)
2494 			return false;
2495 
2496 		/* no backref to be processed for this extent */
2497 		if (!old->count) {
2498 			list_del(&old->list);
2499 			kfree(old);
2500 		}
2501 	}
2502 
2503 	if (list_empty(&new->head))
2504 		return false;
2505 
2506 	return true;
2507 }
2508 
2509 static int relink_is_mergable(struct extent_buffer *leaf,
2510 			      struct btrfs_file_extent_item *fi,
2511 			      struct new_sa_defrag_extent *new)
2512 {
2513 	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2514 		return 0;
2515 
2516 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2517 		return 0;
2518 
2519 	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2520 		return 0;
2521 
2522 	if (btrfs_file_extent_encryption(leaf, fi) ||
2523 	    btrfs_file_extent_other_encoding(leaf, fi))
2524 		return 0;
2525 
2526 	return 1;
2527 }
2528 
2529 /*
2530  * Note the backref might have changed, and in this case we just return 0.
2531  */
2532 static noinline int relink_extent_backref(struct btrfs_path *path,
2533 				 struct sa_defrag_extent_backref *prev,
2534 				 struct sa_defrag_extent_backref *backref)
2535 {
2536 	struct btrfs_file_extent_item *extent;
2537 	struct btrfs_file_extent_item *item;
2538 	struct btrfs_ordered_extent *ordered;
2539 	struct btrfs_trans_handle *trans;
2540 	struct btrfs_root *root;
2541 	struct btrfs_key key;
2542 	struct extent_buffer *leaf;
2543 	struct old_sa_defrag_extent *old = backref->old;
2544 	struct new_sa_defrag_extent *new = old->new;
2545 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2546 	struct inode *inode;
2547 	struct extent_state *cached = NULL;
2548 	int ret = 0;
2549 	u64 start;
2550 	u64 len;
2551 	u64 lock_start;
2552 	u64 lock_end;
2553 	bool merge = false;
2554 	int index;
2555 
2556 	if (prev && prev->root_id == backref->root_id &&
2557 	    prev->inum == backref->inum &&
2558 	    prev->file_pos + prev->num_bytes == backref->file_pos)
2559 		merge = true;
2560 
2561 	/* step 1: get root */
2562 	key.objectid = backref->root_id;
2563 	key.type = BTRFS_ROOT_ITEM_KEY;
2564 	key.offset = (u64)-1;
2565 
2566 	index = srcu_read_lock(&fs_info->subvol_srcu);
2567 
2568 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2569 	if (IS_ERR(root)) {
2570 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2571 		if (PTR_ERR(root) == -ENOENT)
2572 			return 0;
2573 		return PTR_ERR(root);
2574 	}
2575 
2576 	if (btrfs_root_readonly(root)) {
2577 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2578 		return 0;
2579 	}
2580 
2581 	/* step 2: get inode */
2582 	key.objectid = backref->inum;
2583 	key.type = BTRFS_INODE_ITEM_KEY;
2584 	key.offset = 0;
2585 
2586 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2587 	if (IS_ERR(inode)) {
2588 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2589 		return 0;
2590 	}
2591 
2592 	srcu_read_unlock(&fs_info->subvol_srcu, index);
2593 
2594 	/* step 3: relink backref */
2595 	lock_start = backref->file_pos;
2596 	lock_end = backref->file_pos + backref->num_bytes - 1;
2597 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2598 			 &cached);
2599 
2600 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2601 	if (ordered) {
2602 		btrfs_put_ordered_extent(ordered);
2603 		goto out_unlock;
2604 	}
2605 
2606 	trans = btrfs_join_transaction(root);
2607 	if (IS_ERR(trans)) {
2608 		ret = PTR_ERR(trans);
2609 		goto out_unlock;
2610 	}
2611 
2612 	key.objectid = backref->inum;
2613 	key.type = BTRFS_EXTENT_DATA_KEY;
2614 	key.offset = backref->file_pos;
2615 
2616 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2617 	if (ret < 0) {
2618 		goto out_free_path;
2619 	} else if (ret > 0) {
2620 		ret = 0;
2621 		goto out_free_path;
2622 	}
2623 
2624 	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2625 				struct btrfs_file_extent_item);
2626 
2627 	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2628 	    backref->generation)
2629 		goto out_free_path;
2630 
2631 	btrfs_release_path(path);
2632 
2633 	start = backref->file_pos;
2634 	if (backref->extent_offset < old->extent_offset + old->offset)
2635 		start += old->extent_offset + old->offset -
2636 			 backref->extent_offset;
2637 
2638 	len = min(backref->extent_offset + backref->num_bytes,
2639 		  old->extent_offset + old->offset + old->len);
2640 	len -= max(backref->extent_offset, old->extent_offset + old->offset);
2641 
2642 	ret = btrfs_drop_extents(trans, root, inode, start,
2643 				 start + len, 1);
2644 	if (ret)
2645 		goto out_free_path;
2646 again:
2647 	key.objectid = btrfs_ino(BTRFS_I(inode));
2648 	key.type = BTRFS_EXTENT_DATA_KEY;
2649 	key.offset = start;
2650 
2651 	path->leave_spinning = 1;
2652 	if (merge) {
2653 		struct btrfs_file_extent_item *fi;
2654 		u64 extent_len;
2655 		struct btrfs_key found_key;
2656 
2657 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2658 		if (ret < 0)
2659 			goto out_free_path;
2660 
2661 		path->slots[0]--;
2662 		leaf = path->nodes[0];
2663 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2664 
2665 		fi = btrfs_item_ptr(leaf, path->slots[0],
2666 				    struct btrfs_file_extent_item);
2667 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2668 
2669 		if (extent_len + found_key.offset == start &&
2670 		    relink_is_mergable(leaf, fi, new)) {
2671 			btrfs_set_file_extent_num_bytes(leaf, fi,
2672 							extent_len + len);
2673 			btrfs_mark_buffer_dirty(leaf);
2674 			inode_add_bytes(inode, len);
2675 
2676 			ret = 1;
2677 			goto out_free_path;
2678 		} else {
2679 			merge = false;
2680 			btrfs_release_path(path);
2681 			goto again;
2682 		}
2683 	}
2684 
2685 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2686 					sizeof(*extent));
2687 	if (ret) {
2688 		btrfs_abort_transaction(trans, ret);
2689 		goto out_free_path;
2690 	}
2691 
2692 	leaf = path->nodes[0];
2693 	item = btrfs_item_ptr(leaf, path->slots[0],
2694 				struct btrfs_file_extent_item);
2695 	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2696 	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2697 	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2698 	btrfs_set_file_extent_num_bytes(leaf, item, len);
2699 	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2700 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2701 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2702 	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2703 	btrfs_set_file_extent_encryption(leaf, item, 0);
2704 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2705 
2706 	btrfs_mark_buffer_dirty(leaf);
2707 	inode_add_bytes(inode, len);
2708 	btrfs_release_path(path);
2709 
2710 	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2711 			new->disk_len, 0,
2712 			backref->root_id, backref->inum,
2713 			new->file_pos);	/* start - extent_offset */
2714 	if (ret) {
2715 		btrfs_abort_transaction(trans, ret);
2716 		goto out_free_path;
2717 	}
2718 
2719 	ret = 1;
2720 out_free_path:
2721 	btrfs_release_path(path);
2722 	path->leave_spinning = 0;
2723 	btrfs_end_transaction(trans);
2724 out_unlock:
2725 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2726 			     &cached);
2727 	iput(inode);
2728 	return ret;
2729 }
2730 
2731 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2732 {
2733 	struct old_sa_defrag_extent *old, *tmp;
2734 
2735 	if (!new)
2736 		return;
2737 
2738 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2739 		kfree(old);
2740 	}
2741 	kfree(new);
2742 }
2743 
2744 static void relink_file_extents(struct new_sa_defrag_extent *new)
2745 {
2746 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2747 	struct btrfs_path *path;
2748 	struct sa_defrag_extent_backref *backref;
2749 	struct sa_defrag_extent_backref *prev = NULL;
2750 	struct inode *inode;
2751 	struct btrfs_root *root;
2752 	struct rb_node *node;
2753 	int ret;
2754 
2755 	inode = new->inode;
2756 	root = BTRFS_I(inode)->root;
2757 
2758 	path = btrfs_alloc_path();
2759 	if (!path)
2760 		return;
2761 
2762 	if (!record_extent_backrefs(path, new)) {
2763 		btrfs_free_path(path);
2764 		goto out;
2765 	}
2766 	btrfs_release_path(path);
2767 
2768 	while (1) {
2769 		node = rb_first(&new->root);
2770 		if (!node)
2771 			break;
2772 		rb_erase(node, &new->root);
2773 
2774 		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2775 
2776 		ret = relink_extent_backref(path, prev, backref);
2777 		WARN_ON(ret < 0);
2778 
2779 		kfree(prev);
2780 
2781 		if (ret == 1)
2782 			prev = backref;
2783 		else
2784 			prev = NULL;
2785 		cond_resched();
2786 	}
2787 	kfree(prev);
2788 
2789 	btrfs_free_path(path);
2790 out:
2791 	free_sa_defrag_extent(new);
2792 
2793 	atomic_dec(&fs_info->defrag_running);
2794 	wake_up(&fs_info->transaction_wait);
2795 }
2796 
2797 static struct new_sa_defrag_extent *
2798 record_old_file_extents(struct inode *inode,
2799 			struct btrfs_ordered_extent *ordered)
2800 {
2801 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2802 	struct btrfs_root *root = BTRFS_I(inode)->root;
2803 	struct btrfs_path *path;
2804 	struct btrfs_key key;
2805 	struct old_sa_defrag_extent *old;
2806 	struct new_sa_defrag_extent *new;
2807 	int ret;
2808 
2809 	new = kmalloc(sizeof(*new), GFP_NOFS);
2810 	if (!new)
2811 		return NULL;
2812 
2813 	new->inode = inode;
2814 	new->file_pos = ordered->file_offset;
2815 	new->len = ordered->len;
2816 	new->bytenr = ordered->start;
2817 	new->disk_len = ordered->disk_len;
2818 	new->compress_type = ordered->compress_type;
2819 	new->root = RB_ROOT;
2820 	INIT_LIST_HEAD(&new->head);
2821 
2822 	path = btrfs_alloc_path();
2823 	if (!path)
2824 		goto out_kfree;
2825 
2826 	key.objectid = btrfs_ino(BTRFS_I(inode));
2827 	key.type = BTRFS_EXTENT_DATA_KEY;
2828 	key.offset = new->file_pos;
2829 
2830 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2831 	if (ret < 0)
2832 		goto out_free_path;
2833 	if (ret > 0 && path->slots[0] > 0)
2834 		path->slots[0]--;
2835 
2836 	/* find out all the old extents for the file range */
2837 	while (1) {
2838 		struct btrfs_file_extent_item *extent;
2839 		struct extent_buffer *l;
2840 		int slot;
2841 		u64 num_bytes;
2842 		u64 offset;
2843 		u64 end;
2844 		u64 disk_bytenr;
2845 		u64 extent_offset;
2846 
2847 		l = path->nodes[0];
2848 		slot = path->slots[0];
2849 
2850 		if (slot >= btrfs_header_nritems(l)) {
2851 			ret = btrfs_next_leaf(root, path);
2852 			if (ret < 0)
2853 				goto out_free_path;
2854 			else if (ret > 0)
2855 				break;
2856 			continue;
2857 		}
2858 
2859 		btrfs_item_key_to_cpu(l, &key, slot);
2860 
2861 		if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2862 			break;
2863 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2864 			break;
2865 		if (key.offset >= new->file_pos + new->len)
2866 			break;
2867 
2868 		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2869 
2870 		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2871 		if (key.offset + num_bytes < new->file_pos)
2872 			goto next;
2873 
2874 		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2875 		if (!disk_bytenr)
2876 			goto next;
2877 
2878 		extent_offset = btrfs_file_extent_offset(l, extent);
2879 
2880 		old = kmalloc(sizeof(*old), GFP_NOFS);
2881 		if (!old)
2882 			goto out_free_path;
2883 
2884 		offset = max(new->file_pos, key.offset);
2885 		end = min(new->file_pos + new->len, key.offset + num_bytes);
2886 
2887 		old->bytenr = disk_bytenr;
2888 		old->extent_offset = extent_offset;
2889 		old->offset = offset - key.offset;
2890 		old->len = end - offset;
2891 		old->new = new;
2892 		old->count = 0;
2893 		list_add_tail(&old->list, &new->head);
2894 next:
2895 		path->slots[0]++;
2896 		cond_resched();
2897 	}
2898 
2899 	btrfs_free_path(path);
2900 	atomic_inc(&fs_info->defrag_running);
2901 
2902 	return new;
2903 
2904 out_free_path:
2905 	btrfs_free_path(path);
2906 out_kfree:
2907 	free_sa_defrag_extent(new);
2908 	return NULL;
2909 }
2910 
2911 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2912 					 u64 start, u64 len)
2913 {
2914 	struct btrfs_block_group_cache *cache;
2915 
2916 	cache = btrfs_lookup_block_group(fs_info, start);
2917 	ASSERT(cache);
2918 
2919 	spin_lock(&cache->lock);
2920 	cache->delalloc_bytes -= len;
2921 	spin_unlock(&cache->lock);
2922 
2923 	btrfs_put_block_group(cache);
2924 }
2925 
2926 /* as ordered data IO finishes, this gets called so we can finish
2927  * an ordered extent if the range of bytes in the file it covers is
2928  * fully written.
2929  */
2930 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2931 {
2932 	struct inode *inode = ordered_extent->inode;
2933 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2934 	struct btrfs_root *root = BTRFS_I(inode)->root;
2935 	struct btrfs_trans_handle *trans = NULL;
2936 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2937 	struct extent_state *cached_state = NULL;
2938 	struct new_sa_defrag_extent *new = NULL;
2939 	int compress_type = 0;
2940 	int ret = 0;
2941 	u64 logical_len = ordered_extent->len;
2942 	bool nolock;
2943 	bool truncated = false;
2944 	bool range_locked = false;
2945 	bool clear_new_delalloc_bytes = false;
2946 
2947 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2948 	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2949 	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2950 		clear_new_delalloc_bytes = true;
2951 
2952 	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2953 
2954 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2955 		ret = -EIO;
2956 		goto out;
2957 	}
2958 
2959 	btrfs_free_io_failure_record(BTRFS_I(inode),
2960 			ordered_extent->file_offset,
2961 			ordered_extent->file_offset +
2962 			ordered_extent->len - 1);
2963 
2964 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2965 		truncated = true;
2966 		logical_len = ordered_extent->truncated_len;
2967 		/* Truncated the entire extent, don't bother adding */
2968 		if (!logical_len)
2969 			goto out;
2970 	}
2971 
2972 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2973 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2974 
2975 		/*
2976 		 * For the mwrite (mmap + memset to write) case, we still
2977 		 * reserve space for the NOCOW range.
2978 		 * As NOCOW won't cause a new delayed ref, just free the space.
2979 		 */
2980 		btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
2981 				       ordered_extent->len);
2982 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2983 		if (nolock)
2984 			trans = btrfs_join_transaction_nolock(root);
2985 		else
2986 			trans = btrfs_join_transaction(root);
2987 		if (IS_ERR(trans)) {
2988 			ret = PTR_ERR(trans);
2989 			trans = NULL;
2990 			goto out;
2991 		}
2992 		trans->block_rsv = &BTRFS_I(inode)->block_rsv;
2993 		ret = btrfs_update_inode_fallback(trans, root, inode);
2994 		if (ret) /* -ENOMEM or corruption */
2995 			btrfs_abort_transaction(trans, ret);
2996 		goto out;
2997 	}
2998 
2999 	range_locked = true;
3000 	lock_extent_bits(io_tree, ordered_extent->file_offset,
3001 			 ordered_extent->file_offset + ordered_extent->len - 1,
3002 			 &cached_state);
3003 
3004 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
3005 			ordered_extent->file_offset + ordered_extent->len - 1,
3006 			EXTENT_DEFRAG, 0, cached_state);
3007 	if (ret) {
3008 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
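		/*
		 * Snapshot-aware defrag is deliberately disabled: the "0 &&"
		 * below short-circuits the generation check, so
		 * record_old_file_extents() is never called at this revision.
		 */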
3009 		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
3010 			/* the inode is shared */
3011 			new = record_old_file_extents(inode, ordered_extent);
3012 
3013 		clear_extent_bit(io_tree, ordered_extent->file_offset,
3014 			ordered_extent->file_offset + ordered_extent->len - 1,
3015 			EXTENT_DEFRAG, 0, 0, &cached_state);
3016 	}
3017 
3018 	if (nolock)
3019 		trans = btrfs_join_transaction_nolock(root);
3020 	else
3021 		trans = btrfs_join_transaction(root);
3022 	if (IS_ERR(trans)) {
3023 		ret = PTR_ERR(trans);
3024 		trans = NULL;
3025 		goto out;
3026 	}
3027 
3028 	trans->block_rsv = &BTRFS_I(inode)->block_rsv;
3029 
3030 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3031 		compress_type = ordered_extent->compress_type;
3032 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3033 		BUG_ON(compress_type);
3034 		btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset,
3035 				       ordered_extent->len);
3036 		ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
3037 						ordered_extent->file_offset,
3038 						ordered_extent->file_offset +
3039 						logical_len);
3040 	} else {
3041 		BUG_ON(root == fs_info->tree_root);
3042 		ret = insert_reserved_file_extent(trans, inode,
3043 						ordered_extent->file_offset,
3044 						ordered_extent->start,
3045 						ordered_extent->disk_len,
3046 						logical_len, logical_len,
3047 						compress_type, 0, 0,
3048 						BTRFS_FILE_EXTENT_REG);
3049 		if (!ret)
3050 			btrfs_release_delalloc_bytes(fs_info,
3051 						     ordered_extent->start,
3052 						     ordered_extent->disk_len);
3053 	}
3054 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3055 			   ordered_extent->file_offset, ordered_extent->len,
3056 			   trans->transid);
3057 	if (ret < 0) {
3058 		btrfs_abort_transaction(trans, ret);
3059 		goto out;
3060 	}
3061 
3062 	add_pending_csums(trans, inode, &ordered_extent->list);
3063 
3064 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3065 	ret = btrfs_update_inode_fallback(trans, root, inode);
3066 	if (ret) { /* -ENOMEM or corruption */
3067 		btrfs_abort_transaction(trans, ret);
3068 		goto out;
3069 	}
3070 	ret = 0;
3071 out:
3072 	if (range_locked || clear_new_delalloc_bytes) {
3073 		unsigned int clear_bits = 0;
3074 
3075 		if (range_locked)
3076 			clear_bits |= EXTENT_LOCKED;
3077 		if (clear_new_delalloc_bytes)
3078 			clear_bits |= EXTENT_DELALLOC_NEW;
3079 		clear_extent_bit(&BTRFS_I(inode)->io_tree,
3080 				 ordered_extent->file_offset,
3081 				 ordered_extent->file_offset +
3082 				 ordered_extent->len - 1,
3083 				 clear_bits,
3084 				 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3085 				 0, &cached_state);
3086 	}
3087 
3088 	if (trans)
3089 		btrfs_end_transaction(trans);
3090 
3091 	if (ret || truncated) {
3092 		u64 start, end;
3093 
3094 		if (truncated)
3095 			start = ordered_extent->file_offset + logical_len;
3096 		else
3097 			start = ordered_extent->file_offset;
3098 		end = ordered_extent->file_offset + ordered_extent->len - 1;
3099 		clear_extent_uptodate(io_tree, start, end, NULL);
3100 
3101 		/* Drop the cache for the part of the extent we didn't write. */
3102 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3103 
3104 		/*
3105 		 * If the ordered extent had an IOERR or something else went
3106 		 * wrong we need to return the space for this ordered extent
3107 		 * back to the allocator.  We only free the extent in the
3108 		 * truncated case if we didn't write out the extent at all.
3109 		 */
3110 		if ((ret || !logical_len) &&
3111 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3112 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3113 			btrfs_free_reserved_extent(fs_info,
3114 						   ordered_extent->start,
3115 						   ordered_extent->disk_len, 1);
3116 	}
3117 
3118 
3119 	/*
3120 	 * This needs to be done to make sure anybody waiting knows we are done
3121 	 * updating everything for this ordered extent.
3122 	 */
3123 	btrfs_remove_ordered_extent(inode, ordered_extent);
3124 
3125 	/* for snapshot-aware defrag */
3126 	if (new) {
3127 		if (ret) {
3128 			free_sa_defrag_extent(new);
3129 			atomic_dec(&fs_info->defrag_running);
3130 		} else {
3131 			relink_file_extents(new);
3132 		}
3133 	}
3134 
3135 	/* once for us */
3136 	btrfs_put_ordered_extent(ordered_extent);
3137 	/* once for the tree */
3138 	btrfs_put_ordered_extent(ordered_extent);
3139 
3140 	return ret;
3141 }
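
/*
 * Paths through btrfs_finish_ordered_io() at a glance: IOERR and
 * fully-truncated extents bail out early; NOCOW extents only need
 * qgroup space freed and the inode item updated, since the data already
 * lives on disk; prealloc extents are flipped to written in place;
 * everything else gets a fresh file extent item.  The last two paths
 * then record the pending csums and update i_size before the cleanup
 * at out:.
 */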
3142 
3143 static void finish_ordered_fn(struct btrfs_work *work)
3144 {
3145 	struct btrfs_ordered_extent *ordered_extent;
3146 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3147 	btrfs_finish_ordered_io(ordered_extent);
3148 }
3149 
3150 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3151 				struct extent_state *state, int uptodate)
3152 {
3153 	struct inode *inode = page->mapping->host;
3154 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3155 	struct btrfs_ordered_extent *ordered_extent = NULL;
3156 	struct btrfs_workqueue *wq;
3157 	btrfs_work_func_t func;
3158 
3159 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3160 
3161 	ClearPagePrivate2(page);
3162 	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3163 					    end - start + 1, uptodate))
3164 		return;
3165 
3166 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3167 		wq = fs_info->endio_freespace_worker;
3168 		func = btrfs_freespace_write_helper;
3169 	} else {
3170 		wq = fs_info->endio_write_workers;
3171 		func = btrfs_endio_write_helper;
3172 	}
3173 
3174 	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3175 			NULL);
3176 	btrfs_queue_work(wq, &ordered_extent->work);
3177 }
3178 
3179 static int __readpage_endio_check(struct inode *inode,
3180 				  struct btrfs_io_bio *io_bio,
3181 				  int icsum, struct page *page,
3182 				  int pgoff, u64 start, size_t len)
3183 {
3184 	char *kaddr;
3185 	u32 csum_expected;
3186 	u32 csum = ~(u32)0;
3187 
3188 	csum_expected = *(((u32 *)io_bio->csum) + icsum);
3189 
3190 	kaddr = kmap_atomic(page);
3191 	csum = btrfs_csum_data(kaddr + pgoff, csum, len);
3192 	btrfs_csum_final(csum, (u8 *)&csum);
3193 	if (csum != csum_expected)
3194 		goto zeroit;
3195 
3196 	kunmap_atomic(kaddr);
3197 	return 0;
3198 zeroit:
3199 	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3200 				    io_bio->mirror_num);
3201 	memset(kaddr + pgoff, 1, len);
3202 	flush_dcache_page(page);
3203 	kunmap_atomic(kaddr);
3204 	return -EIO;
3205 }
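
/*
 * At this revision btrfs supports a single checksum algorithm, crc32c,
 * which is what btrfs_csum_data()/btrfs_csum_final() compute above.  On
 * a mismatch the page range is overwritten with 0x01 bytes before
 * returning -EIO, so stale or bogus disk contents are never handed to
 * the reader while the caller tries other mirrors.
 */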
3206 
3207 /*
3208  * when reads are done, we need to check csums to verify the data is correct.
3209  * If there's a match, we allow the bio to finish.  If not, the code in
3210  * extent_io.c will try to find good copies for us.
3211  */
3212 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3213 				      u64 phy_offset, struct page *page,
3214 				      u64 start, u64 end, int mirror)
3215 {
3216 	size_t offset = start - page_offset(page);
3217 	struct inode *inode = page->mapping->host;
3218 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3219 	struct btrfs_root *root = BTRFS_I(inode)->root;
3220 
3221 	if (PageChecked(page)) {
3222 		ClearPageChecked(page);
3223 		return 0;
3224 	}
3225 
3226 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3227 		return 0;
3228 
3229 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3230 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3231 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3232 		return 0;
3233 	}
3234 
3235 	phy_offset >>= inode->i_sb->s_blocksize_bits;
3236 	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3237 				      start, (size_t)(end - start + 1));
3238 }
3239 
3240 void btrfs_add_delayed_iput(struct inode *inode)
3241 {
3242 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3243 	struct btrfs_inode *binode = BTRFS_I(inode);
3244 
3245 	if (atomic_add_unless(&inode->i_count, -1, 1))
3246 		return;
3247 
3248 	spin_lock(&fs_info->delayed_iput_lock);
3249 	if (binode->delayed_iput_count == 0) {
3250 		ASSERT(list_empty(&binode->delayed_iput));
3251 		list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3252 	} else {
3253 		binode->delayed_iput_count++;
3254 	}
3255 	spin_unlock(&fs_info->delayed_iput_lock);
3256 }
3257 
3258 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3259 {
3260 
3261 	spin_lock(&fs_info->delayed_iput_lock);
3262 	while (!list_empty(&fs_info->delayed_iputs)) {
3263 		struct btrfs_inode *inode;
3264 
3265 		inode = list_first_entry(&fs_info->delayed_iputs,
3266 				struct btrfs_inode, delayed_iput);
3267 		if (inode->delayed_iput_count) {
3268 			inode->delayed_iput_count--;
3269 			list_move_tail(&inode->delayed_iput,
3270 					&fs_info->delayed_iputs);
3271 		} else {
3272 			list_del_init(&inode->delayed_iput);
3273 		}
3274 		spin_unlock(&fs_info->delayed_iput_lock);
3275 		iput(&inode->vfs_inode);
3276 		spin_lock(&fs_info->delayed_iput_lock);
3277 	}
3278 	spin_unlock(&fs_info->delayed_iput_lock);
3279 }
3280 
3281 /*
3282  * This is called at transaction commit time. If there are no orphan
3283  * files in the subvolume, it removes the orphan item and frees the
3284  * block_rsv structure.
3285  */
3286 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3287 			      struct btrfs_root *root)
3288 {
3289 	struct btrfs_fs_info *fs_info = root->fs_info;
3290 	struct btrfs_block_rsv *block_rsv;
3291 	int ret;
3292 
3293 	if (atomic_read(&root->orphan_inodes) ||
3294 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3295 		return;
3296 
3297 	spin_lock(&root->orphan_lock);
3298 	if (atomic_read(&root->orphan_inodes)) {
3299 		spin_unlock(&root->orphan_lock);
3300 		return;
3301 	}
3302 
3303 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3304 		spin_unlock(&root->orphan_lock);
3305 		return;
3306 	}
3307 
3308 	block_rsv = root->orphan_block_rsv;
3309 	root->orphan_block_rsv = NULL;
3310 	spin_unlock(&root->orphan_lock);
3311 
3312 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3313 	    btrfs_root_refs(&root->root_item) > 0) {
3314 		ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
3315 					    root->root_key.objectid);
3316 		if (ret)
3317 			btrfs_abort_transaction(trans, ret);
3318 		else
3319 			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3320 				  &root->state);
3321 	}
3322 
3323 	if (block_rsv) {
3324 		WARN_ON(block_rsv->size > 0);
3325 		btrfs_free_block_rsv(fs_info, block_rsv);
3326 	}
3327 }
3328 
3329 /*
3330  * This creates an orphan entry for the given inode in case something goes
3331  * wrong in the middle of an unlink/truncate.
3332  *
3333  * NOTE: the caller of this function should reserve 5 units of metadata
3334  *	 for this function.
3335  */
3336 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3337 		struct btrfs_inode *inode)
3338 {
3339 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3340 	struct btrfs_root *root = inode->root;
3341 	struct btrfs_block_rsv *block_rsv = NULL;
3342 	int reserve = 0;
3343 	int insert = 0;
3344 	int ret;
3345 
3346 	if (!root->orphan_block_rsv) {
3347 		block_rsv = btrfs_alloc_block_rsv(fs_info,
3348 						  BTRFS_BLOCK_RSV_TEMP);
3349 		if (!block_rsv)
3350 			return -ENOMEM;
3351 	}
3352 
3353 	spin_lock(&root->orphan_lock);
3354 	if (!root->orphan_block_rsv) {
3355 		root->orphan_block_rsv = block_rsv;
3356 	} else if (block_rsv) {
3357 		btrfs_free_block_rsv(fs_info, block_rsv);
3358 		block_rsv = NULL;
3359 	}
3360 
3361 	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3362 			      &inode->runtime_flags)) {
3363 #if 0
3364 		/*
3365 		 * For proper ENOSPC handling, we should do orphan
3366 		 * cleanup when mounting. But this introduces a backward
3367 		 * compatibility issue.
3368 		 */
3369 		if (!xchg(&root->orphan_item_inserted, 1))
3370 			insert = 2;
3371 		else
3372 			insert = 1;
3373 #endif
3374 		insert = 1;
3375 		atomic_inc(&root->orphan_inodes);
3376 	}
3377 
3378 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3379 			      &inode->runtime_flags))
3380 		reserve = 1;
3381 	spin_unlock(&root->orphan_lock);
3382 
3383 	/* grab metadata reservation from transaction handle */
3384 	if (reserve) {
3385 		ret = btrfs_orphan_reserve_metadata(trans, inode);
3386 		ASSERT(!ret);
3387 		if (ret) {
3388 			atomic_dec(&root->orphan_inodes);
3389 			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3390 				  &inode->runtime_flags);
3391 			if (insert)
3392 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3393 					  &inode->runtime_flags);
3394 			return ret;
3395 		}
3396 	}
3397 
3398 	/* insert an orphan item to track this unlinked/truncated file */
3399 	if (insert >= 1) {
3400 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3401 		if (ret) {
3402 			atomic_dec(&root->orphan_inodes);
3403 			if (reserve) {
3404 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3405 					  &inode->runtime_flags);
3406 				btrfs_orphan_release_metadata(inode);
3407 			}
3408 			if (ret != -EEXIST) {
3409 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3410 					  &inode->runtime_flags);
3411 				btrfs_abort_transaction(trans, ret);
3412 				return ret;
3413 			}
3414 		}
3415 		ret = 0;
3416 	}
3417 
3418 	/* insert an orphan item to track subvolume contains orphan files */
3419 	/* insert an orphan item to track that the subvolume contains orphan files */
3420 		ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
3421 					       root->root_key.objectid);
3422 		if (ret && ret != -EEXIST) {
3423 			btrfs_abort_transaction(trans, ret);
3424 			return ret;
3425 		}
3426 	}
3427 	return 0;
3428 }
3429 
3430 /*
3431  * We have done the truncate/delete so we can go ahead and remove the orphan
3432  * item for this particular inode.
3433  */
3434 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3435 			    struct btrfs_inode *inode)
3436 {
3437 	struct btrfs_root *root = inode->root;
3438 	int delete_item = 0;
3439 	int release_rsv = 0;
3440 	int ret = 0;
3441 
3442 	spin_lock(&root->orphan_lock);
3443 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3444 			       &inode->runtime_flags))
3445 		delete_item = 1;
3446 
3447 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3448 			       &inode->runtime_flags))
3449 		release_rsv = 1;
3450 	spin_unlock(&root->orphan_lock);
3451 
3452 	if (delete_item) {
3453 		atomic_dec(&root->orphan_inodes);
3454 		if (trans)
3455 			ret = btrfs_del_orphan_item(trans, root,
3456 						    btrfs_ino(inode));
3457 	}
3458 
3459 	if (release_rsv)
3460 		btrfs_orphan_release_metadata(inode);
3461 
3462 	return ret;
3463 }
3464 
3465 /*
3466  * this cleans up any orphans that may be left on the list from the last use
3467  * of this root.
3468  */
3469 int btrfs_orphan_cleanup(struct btrfs_root *root)
3470 {
3471 	struct btrfs_fs_info *fs_info = root->fs_info;
3472 	struct btrfs_path *path;
3473 	struct extent_buffer *leaf;
3474 	struct btrfs_key key, found_key;
3475 	struct btrfs_trans_handle *trans;
3476 	struct inode *inode;
3477 	u64 last_objectid = 0;
3478 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3479 
3480 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3481 		return 0;
3482 
3483 	path = btrfs_alloc_path();
3484 	if (!path) {
3485 		ret = -ENOMEM;
3486 		goto out;
3487 	}
3488 	path->reada = READA_BACK;
3489 
3490 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3491 	key.type = BTRFS_ORPHAN_ITEM_KEY;
3492 	key.offset = (u64)-1;
3493 
3494 	while (1) {
3495 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3496 		if (ret < 0)
3497 			goto out;
3498 
3499 		/*
3500 		 * ret == 0 means we found exactly what we were searching for,
3501 		 * which is weird, but possible.  Only adjust the path if we
3502 		 * didn't find the key, then check whether the previous item matches
3503 		 */
3504 		if (ret > 0) {
3505 			ret = 0;
3506 			if (path->slots[0] == 0)
3507 				break;
3508 			path->slots[0]--;
3509 		}
3510 
3511 		/* pull out the item */
3512 		leaf = path->nodes[0];
3513 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3514 
3515 		/* make sure the item matches what we want */
3516 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3517 			break;
3518 		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3519 			break;
3520 
3521 		/* release the path since we're done with it */
3522 		btrfs_release_path(path);
3523 
3524 		/*
3525 		 * this is basically btrfs_lookup, without crossing into another
3526 		 * root.  we store the inode number in the
3527 		 * offset of the orphan item.
3528 		 */
3529 
3530 		if (found_key.offset == last_objectid) {
3531 			btrfs_err(fs_info,
3532 				  "Error removing orphan entry, stopping orphan cleanup");
3533 			ret = -EINVAL;
3534 			goto out;
3535 		}
3536 
3537 		last_objectid = found_key.offset;
3538 
3539 		found_key.objectid = found_key.offset;
3540 		found_key.type = BTRFS_INODE_ITEM_KEY;
3541 		found_key.offset = 0;
3542 		inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3543 		ret = PTR_ERR_OR_ZERO(inode);
3544 		if (ret && ret != -ENOENT)
3545 			goto out;
3546 
3547 		if (ret == -ENOENT && root == fs_info->tree_root) {
3548 			struct btrfs_root *dead_root;
3549 			struct btrfs_fs_info *fs_info = root->fs_info;
3550 			int is_dead_root = 0;
3551 
3552 			/*
3553 			 * this is an orphan in the tree root. Currently these
3554 			 * could come from 2 sources:
3555 			 *  a) a snapshot deletion in progress
3556 			 *  b) a free space cache inode
3557 			 * We need to distinguish those two, as the snapshot
3558 			 * orphan must not get deleted.
3559 			 * find_dead_roots already ran before us, so if this
3560 			 * is a snapshot deletion, we should find the root
3561 			 * in the dead_roots list
3562 			 */
3563 			spin_lock(&fs_info->trans_lock);
3564 			list_for_each_entry(dead_root, &fs_info->dead_roots,
3565 					    root_list) {
3566 				if (dead_root->root_key.objectid ==
3567 				    found_key.objectid) {
3568 					is_dead_root = 1;
3569 					break;
3570 				}
3571 			}
3572 			spin_unlock(&fs_info->trans_lock);
3573 			if (is_dead_root) {
3574 				/* prevent this orphan from being found again */
3575 				key.offset = found_key.objectid - 1;
3576 				continue;
3577 			}
3578 		}
3579 		/*
3580 		 * Inode is already gone but the orphan item is still there,
3581 		 * kill the orphan item.
3582 		 */
3583 		if (ret == -ENOENT) {
3584 			trans = btrfs_start_transaction(root, 1);
3585 			if (IS_ERR(trans)) {
3586 				ret = PTR_ERR(trans);
3587 				goto out;
3588 			}
3589 			btrfs_debug(fs_info, "auto deleting %Lu",
3590 				    found_key.objectid);
3591 			ret = btrfs_del_orphan_item(trans, root,
3592 						    found_key.objectid);
3593 			btrfs_end_transaction(trans);
3594 			if (ret)
3595 				goto out;
3596 			continue;
3597 		}
3598 
3599 		/*
3600 		 * add this inode to the orphan list so btrfs_orphan_del does
3601 		 * the proper thing when we hit it
3602 		 */
3603 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3604 			&BTRFS_I(inode)->runtime_flags);
3605 		atomic_inc(&root->orphan_inodes);
3606 
3607 		/* if we have links, this was a truncate, let's do that */
3608 		if (inode->i_nlink) {
3609 			if (WARN_ON(!S_ISREG(inode->i_mode))) {
3610 				iput(inode);
3611 				continue;
3612 			}
3613 			nr_truncate++;
3614 
3615 			/* 1 for the orphan item deletion. */
3616 			trans = btrfs_start_transaction(root, 1);
3617 			if (IS_ERR(trans)) {
3618 				iput(inode);
3619 				ret = PTR_ERR(trans);
3620 				goto out;
3621 			}
3622 			ret = btrfs_orphan_add(trans, BTRFS_I(inode));
3623 			btrfs_end_transaction(trans);
3624 			if (ret) {
3625 				iput(inode);
3626 				goto out;
3627 			}
3628 
3629 			ret = btrfs_truncate(inode);
3630 			if (ret)
3631 				btrfs_orphan_del(NULL, BTRFS_I(inode));
3632 		} else {
3633 			nr_unlink++;
3634 		}
3635 
3636 		/* this will do delete_inode and everything for us */
3637 		iput(inode);
3638 		if (ret)
3639 			goto out;
3640 	}
3641 	/* release the path since we're done with it */
3642 	btrfs_release_path(path);
3643 
3644 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3645 
3646 	if (root->orphan_block_rsv)
3647 		btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
3648 					(u64)-1);
3649 
3650 	if (root->orphan_block_rsv ||
3651 	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3652 		trans = btrfs_join_transaction(root);
3653 		if (!IS_ERR(trans))
3654 			btrfs_end_transaction(trans);
3655 	}
3656 
3657 	if (nr_unlink)
3658 		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3659 	if (nr_truncate)
3660 		btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
3661 
3662 out:
3663 	if (ret)
3664 		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3665 	btrfs_free_path(path);
3666 	return ret;
3667 }
3668 
3669 /*
3670  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3671  * don't find any xattrs, we know there can't be any acls.
3672  *
3673  * slot is the slot the inode is in, objectid is the objectid of the inode
3674  */
3675 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3676 					  int slot, u64 objectid,
3677 					  int *first_xattr_slot)
3678 {
3679 	u32 nritems = btrfs_header_nritems(leaf);
3680 	struct btrfs_key found_key;
3681 	static u64 xattr_access = 0;
3682 	static u64 xattr_default = 0;
3683 	int scanned = 0;
3684 
3685 	if (!xattr_access) {
3686 		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3687 					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3688 		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3689 					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3690 	}
3691 
3692 	slot++;
3693 	*first_xattr_slot = -1;
3694 	while (slot < nritems) {
3695 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3696 
3697 		/* we found a different objectid, there must not be acls */
3698 		if (found_key.objectid != objectid)
3699 			return 0;
3700 
3701 		/* we found an xattr, assume we've got an acl */
3702 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3703 			if (*first_xattr_slot == -1)
3704 				*first_xattr_slot = slot;
3705 			if (found_key.offset == xattr_access ||
3706 			    found_key.offset == xattr_default)
3707 				return 1;
3708 		}
3709 
3710 		/*
3711 		 * we found a key greater than an xattr key, there can't
3712 		 * be any acls later on
3713 		 */
3714 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3715 			return 0;
3716 
3717 		slot++;
3718 		scanned++;
3719 
3720 		/*
3721 		 * it goes inode, inode backrefs, xattrs, extents,
3722 		 * so if there are a ton of hard links to an inode there can
3723 		 * be a lot of backrefs.  Don't waste time searching too hard,
3724 		 * this is just an optimization
3725 		 */
3726 		if (scanned >= 8)
3727 			break;
3728 	}
3729 	/* we hit the end of the leaf before we found an xattr or
3730 	 * something larger than an xattr.  We have to assume the inode
3731 	 * has acls
3732 	 */
3733 	if (*first_xattr_slot == -1)
3734 		*first_xattr_slot = slot;
3735 	return 1;
3736 }
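
/*
 * For reference, the items of a single inode sort in key order within
 * the tree, roughly:
 *
 *	(ino, INODE_ITEM,  0)
 *	(ino, INODE_REF,   parent_ino)		one per hard link
 *	(ino, XATTR_ITEM,  name_hash)
 *	(ino, EXTENT_DATA, file_offset)
 *
 * so scanning forward from the inode item hits any xattrs before the
 * first file extent, which is what the helper above relies on.
 */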
3737 
3738 /*
3739  * read an inode from the btree into the in-memory inode
3740  */
3741 static int btrfs_read_locked_inode(struct inode *inode)
3742 {
3743 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3744 	struct btrfs_path *path;
3745 	struct extent_buffer *leaf;
3746 	struct btrfs_inode_item *inode_item;
3747 	struct btrfs_root *root = BTRFS_I(inode)->root;
3748 	struct btrfs_key location;
3749 	unsigned long ptr;
3750 	int maybe_acls;
3751 	u32 rdev;
3752 	int ret;
3753 	bool filled = false;
3754 	int first_xattr_slot;
3755 
3756 	ret = btrfs_fill_inode(inode, &rdev);
3757 	if (!ret)
3758 		filled = true;
3759 
3760 	path = btrfs_alloc_path();
3761 	if (!path) {
3762 		ret = -ENOMEM;
3763 		goto make_bad;
3764 	}
3765 
3766 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3767 
3768 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3769 	if (ret) {
3770 		if (ret > 0)
3771 			ret = -ENOENT;
3772 		goto make_bad;
3773 	}
3774 
3775 	leaf = path->nodes[0];
3776 
3777 	if (filled)
3778 		goto cache_index;
3779 
3780 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3781 				    struct btrfs_inode_item);
3782 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3783 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3784 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3785 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3786 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3787 
3788 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3789 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3790 
3791 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3792 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3793 
3794 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3795 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3796 
3797 	BTRFS_I(inode)->i_otime.tv_sec =
3798 		btrfs_timespec_sec(leaf, &inode_item->otime);
3799 	BTRFS_I(inode)->i_otime.tv_nsec =
3800 		btrfs_timespec_nsec(leaf, &inode_item->otime);
3801 
3802 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3803 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3804 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3805 
3806 	inode_set_iversion_queried(inode,
3807 				   btrfs_inode_sequence(leaf, inode_item));
3808 	inode->i_generation = BTRFS_I(inode)->generation;
3809 	inode->i_rdev = 0;
3810 	rdev = btrfs_inode_rdev(leaf, inode_item);
3811 
3812 	BTRFS_I(inode)->index_cnt = (u64)-1;
3813 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3814 
3815 cache_index:
3816 	/*
3817 	 * If we were modified in the current generation and evicted from memory
3818 	 * and then re-read we need to do a full sync since we don't have any
3819 	 * idea about which extents were modified before we were evicted from
3820 	 * cache.
3821 	 *
3822 	 * This is required for both inode re-read from disk and delayed inode
3823 	 * in delayed_nodes_tree.
3824 	 */
3825 	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3826 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3827 			&BTRFS_I(inode)->runtime_flags);
3828 
3829 	/*
3830 	 * We don't persist the id of the transaction where an unlink operation
3831 	 * against the inode was last made. So here we assume the inode might
3832 	 * have been evicted, and therefore the exact value of last_unlink_trans
3833 	 * lost, and set it to last_trans to avoid metadata inconsistencies
3834 	 * between the inode and its parent if the inode is fsync'ed and the log
3835 	 * replayed. For example, in the scenario:
3836 	 *
3837 	 * touch mydir/foo
3838 	 * ln mydir/foo mydir/bar
3839 	 * sync
3840 	 * unlink mydir/bar
3841 	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3842 	 * xfs_io -c fsync mydir/foo
3843 	 * <power failure>
3844 	 * mount fs, triggers fsync log replay
3845 	 *
3846 	 * We must make sure that when we fsync our inode foo we also log its
3847 	 * parent inode, otherwise after log replay the parent still has the
3848 	 * dentry with the "bar" name but our inode foo has a link count of 1
3849 	 * and doesn't have an inode ref with the name "bar" anymore.
3850 	 *
3851 	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3852 	 * but it guarantees correctness at the expense of occasional full
3853 	 * transaction commits on fsync if our inode is a directory, or if our
3854 	 * inode is not a directory, logging its parent unnecessarily.
3855 	 */
3856 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3857 
3858 	path->slots[0]++;
3859 	if (inode->i_nlink != 1 ||
3860 	    path->slots[0] >= btrfs_header_nritems(leaf))
3861 		goto cache_acl;
3862 
3863 	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3864 	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3865 		goto cache_acl;
3866 
3867 	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3868 	if (location.type == BTRFS_INODE_REF_KEY) {
3869 		struct btrfs_inode_ref *ref;
3870 
3871 		ref = (struct btrfs_inode_ref *)ptr;
3872 		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3873 	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3874 		struct btrfs_inode_extref *extref;
3875 
3876 		extref = (struct btrfs_inode_extref *)ptr;
3877 		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3878 								     extref);
3879 	}
3880 cache_acl:
3881 	/*
3882 	 * try to precache a NULL acl entry for files that don't have
3883 	 * any xattrs or acls
3884 	 */
3885 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3886 			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3887 	if (first_xattr_slot != -1) {
3888 		path->slots[0] = first_xattr_slot;
3889 		ret = btrfs_load_inode_props(inode, path);
3890 		if (ret)
3891 			btrfs_err(fs_info,
3892 				  "error loading props for ino %llu (root %llu): %d",
3893 				  btrfs_ino(BTRFS_I(inode)),
3894 				  root->root_key.objectid, ret);
3895 	}
3896 	btrfs_free_path(path);
3897 
3898 	if (!maybe_acls)
3899 		cache_no_acl(inode);
3900 
3901 	switch (inode->i_mode & S_IFMT) {
3902 	case S_IFREG:
3903 		inode->i_mapping->a_ops = &btrfs_aops;
3904 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3905 		inode->i_fop = &btrfs_file_operations;
3906 		inode->i_op = &btrfs_file_inode_operations;
3907 		break;
3908 	case S_IFDIR:
3909 		inode->i_fop = &btrfs_dir_file_operations;
3910 		inode->i_op = &btrfs_dir_inode_operations;
3911 		break;
3912 	case S_IFLNK:
3913 		inode->i_op = &btrfs_symlink_inode_operations;
3914 		inode_nohighmem(inode);
3915 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3916 		break;
3917 	default:
3918 		inode->i_op = &btrfs_special_inode_operations;
3919 		init_special_inode(inode, inode->i_mode, rdev);
3920 		break;
3921 	}
3922 
3923 	btrfs_update_iflags(inode);
3924 	return 0;
3925 
3926 make_bad:
3927 	btrfs_free_path(path);
3928 	make_bad_inode(inode);
3929 	return ret;
3930 }
3931 
3932 /*
3933  * given a leaf and an inode, copy the inode fields into the leaf
3934  */
3935 static void fill_inode_item(struct btrfs_trans_handle *trans,
3936 			    struct extent_buffer *leaf,
3937 			    struct btrfs_inode_item *item,
3938 			    struct inode *inode)
3939 {
3940 	struct btrfs_map_token token;
3941 
3942 	btrfs_init_map_token(&token);
3943 
3944 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3945 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3946 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3947 				   &token);
3948 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3949 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3950 
3951 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3952 				     inode->i_atime.tv_sec, &token);
3953 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3954 				      inode->i_atime.tv_nsec, &token);
3955 
3956 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3957 				     inode->i_mtime.tv_sec, &token);
3958 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3959 				      inode->i_mtime.tv_nsec, &token);
3960 
3961 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3962 				     inode->i_ctime.tv_sec, &token);
3963 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3964 				      inode->i_ctime.tv_nsec, &token);
3965 
3966 	btrfs_set_token_timespec_sec(leaf, &item->otime,
3967 				     BTRFS_I(inode)->i_otime.tv_sec, &token);
3968 	btrfs_set_token_timespec_nsec(leaf, &item->otime,
3969 				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
3970 
3971 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3972 				     &token);
3973 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3974 					 &token);
3975 	btrfs_set_token_inode_sequence(leaf, item, inode_peek_iversion(inode),
3976 				       &token);
3977 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3978 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3979 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3980 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3981 }
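
/*
 * A note on the token pattern above: btrfs_init_map_token() lets the
 * btrfs_set_token_* helpers cache the mapping of the last leaf page
 * they touched, so a burst of field updates against the same item
 * avoids redundant address translation.  The shape of the pattern is
 * simply:
 *
 *	struct btrfs_map_token token;
 *
 *	btrfs_init_map_token(&token);
 *	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
 *	... more btrfs_set_token_* calls against the same leaf ...
 */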
3982 
3983 /*
3984  * copy everything in the in-memory inode into the btree.
3985  */
3986 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3987 				struct btrfs_root *root, struct inode *inode)
3988 {
3989 	struct btrfs_inode_item *inode_item;
3990 	struct btrfs_path *path;
3991 	struct extent_buffer *leaf;
3992 	int ret;
3993 
3994 	path = btrfs_alloc_path();
3995 	if (!path)
3996 		return -ENOMEM;
3997 
3998 	path->leave_spinning = 1;
3999 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
4000 				 1);
4001 	if (ret) {
4002 		if (ret > 0)
4003 			ret = -ENOENT;
4004 		goto failed;
4005 	}
4006 
4007 	leaf = path->nodes[0];
4008 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
4009 				    struct btrfs_inode_item);
4010 
4011 	fill_inode_item(trans, leaf, inode_item, inode);
4012 	btrfs_mark_buffer_dirty(leaf);
4013 	btrfs_set_inode_last_trans(trans, inode);
4014 	ret = 0;
4015 failed:
4016 	btrfs_free_path(path);
4017 	return ret;
4018 }
4019 
4020 /*
4021  * copy everything in the in-memory inode into the btree.
4022  */
4023 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
4024 				struct btrfs_root *root, struct inode *inode)
4025 {
4026 	struct btrfs_fs_info *fs_info = root->fs_info;
4027 	int ret;
4028 
4029 	/*
4030 	 * If the inode is a free space inode, we can deadlock during commit
4031 	 * if we put it into the delayed code.
4032 	 *
4033 	 * The data relocation inode should also be directly updated
4034 	 * without delay
4035 	 */
4036 	if (!btrfs_is_free_space_inode(BTRFS_I(inode))
4037 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
4038 	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4039 		btrfs_update_root_times(trans, root);
4040 
4041 		ret = btrfs_delayed_update_inode(trans, root, inode);
4042 		if (!ret)
4043 			btrfs_set_inode_last_trans(trans, inode);
4044 		return ret;
4045 	}
4046 
4047 	return btrfs_update_inode_item(trans, root, inode);
4048 }
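
/*
 * Recap of the two paths above (for orientation):
 *
 *	free space inode, data reloc root, or log recovery in progress
 *		-> btrfs_update_inode_item(): write the item synchronously
 *	everything else
 *		-> btrfs_delayed_update_inode(): batch the update through
 *		   the delayed inode code
 *
 * btrfs_update_inode_fallback() below additionally retries the
 * synchronous path if the delayed update fails with -ENOSPC.
 */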
4049 
4050 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4051 					 struct btrfs_root *root,
4052 					 struct inode *inode)
4053 {
4054 	int ret;
4055 
4056 	ret = btrfs_update_inode(trans, root, inode);
4057 	if (ret == -ENOSPC)
4058 		return btrfs_update_inode_item(trans, root, inode);
4059 	return ret;
4060 }
4061 
4062 /*
4063  * unlink helper that gets used here in inode.c and in the tree logging
4064  * recovery code.  It removes a link in a directory with a given name, and
4065  * also drops the back refs in the inode to the directory
4066  */
4067 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4068 				struct btrfs_root *root,
4069 				struct btrfs_inode *dir,
4070 				struct btrfs_inode *inode,
4071 				const char *name, int name_len)
4072 {
4073 	struct btrfs_fs_info *fs_info = root->fs_info;
4074 	struct btrfs_path *path;
4075 	int ret = 0;
4076 	struct extent_buffer *leaf;
4077 	struct btrfs_dir_item *di;
4078 	struct btrfs_key key;
4079 	u64 index;
4080 	u64 ino = btrfs_ino(inode);
4081 	u64 dir_ino = btrfs_ino(dir);
4082 
4083 	path = btrfs_alloc_path();
4084 	if (!path) {
4085 		ret = -ENOMEM;
4086 		goto out;
4087 	}
4088 
4089 	path->leave_spinning = 1;
4090 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4091 				    name, name_len, -1);
4092 	if (IS_ERR(di)) {
4093 		ret = PTR_ERR(di);
4094 		goto err;
4095 	}
4096 	if (!di) {
4097 		ret = -ENOENT;
4098 		goto err;
4099 	}
4100 	leaf = path->nodes[0];
4101 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4102 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4103 	if (ret)
4104 		goto err;
4105 	btrfs_release_path(path);
4106 
4107 	/*
4108 	 * If we don't have the dir index cached, we have to get it by
4109 	 * looking up the inode ref; since we then hold the inode ref
4110 	 * anyway, we remove it directly, making a delayed deletion
4111 	 * unnecessary.
4112 	 *
4113 	 * But if we do have the dir index, we need not search for the inode
4114 	 * ref.  Since the inode ref is close to the inode item, it is better
4115 	 * to delay its deletion and do it when we update the inode item.
4116 	 */
4117 	if (inode->dir_index) {
4118 		ret = btrfs_delayed_delete_inode_ref(inode);
4119 		if (!ret) {
4120 			index = inode->dir_index;
4121 			goto skip_backref;
4122 		}
4123 	}
4124 
4125 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4126 				  dir_ino, &index);
4127 	if (ret) {
4128 		btrfs_info(fs_info,
4129 			"failed to delete reference to %.*s, inode %llu parent %llu",
4130 			name_len, name, ino, dir_ino);
4131 		btrfs_abort_transaction(trans, ret);
4132 		goto err;
4133 	}
4134 skip_backref:
4135 	ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
4136 	if (ret) {
4137 		btrfs_abort_transaction(trans, ret);
4138 		goto err;
4139 	}
4140 
4141 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4142 			dir_ino);
4143 	if (ret != 0 && ret != -ENOENT) {
4144 		btrfs_abort_transaction(trans, ret);
4145 		goto err;
4146 	}
4147 
4148 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4149 			index);
4150 	if (ret == -ENOENT)
4151 		ret = 0;
4152 	else if (ret)
4153 		btrfs_abort_transaction(trans, ret);
4154 err:
4155 	btrfs_free_path(path);
4156 	if (ret)
4157 		goto out;
4158 
4159 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4160 	inode_inc_iversion(&inode->vfs_inode);
4161 	inode_inc_iversion(&dir->vfs_inode);
4162 	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4163 		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4164 	ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
4165 out:
4166 	return ret;
4167 }
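
/*
 * Recap of what the helper above removes for the link "name" in "dir"
 * (mirroring the steps in the function):
 *
 *	dir item	(dir_ino, DIR_ITEM,  hash(name))	removed here
 *	dir index	(dir_ino, DIR_INDEX, index)		via delayed code
 *	inode ref	(ino, INODE_REF, dir_ino)		here or delayed
 *	log tree entries for the inode ref and the dir entries
 *
 * dir's i_size then shrinks by name_len * 2 because each name is
 * accounted once for the dir item and once for the dir index.
 */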
4168 
4169 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4170 		       struct btrfs_root *root,
4171 		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4172 		       const char *name, int name_len)
4173 {
4174 	int ret;
4175 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4176 	if (!ret) {
4177 		drop_nlink(&inode->vfs_inode);
4178 		ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
4179 	}
4180 	return ret;
4181 }
4182 
4183 /*
4184  * helper to start transaction for unlink and rmdir.
4185  *
4186  * unlink and rmdir are special in btrfs: they do not always free space, so
4187  * if we cannot make our reservations the normal way, see if there is
4188  * plenty of slack room in the global reserve to migrate from; otherwise we
4189  * cannot allow the unlink to occur.
4190  */
4191 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4192 {
4193 	struct btrfs_root *root = BTRFS_I(dir)->root;
4194 
4195 	/*
4196 	 * 1 for the possible orphan item
4197 	 * 1 for the dir item
4198 	 * 1 for the dir index
4199 	 * 1 for the inode ref
4200 	 * 1 for the inode
4201 	 */
4202 	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4203 }
4204 
4205 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4206 {
4207 	struct btrfs_root *root = BTRFS_I(dir)->root;
4208 	struct btrfs_trans_handle *trans;
4209 	struct inode *inode = d_inode(dentry);
4210 	int ret;
4211 
4212 	trans = __unlink_start_trans(dir);
4213 	if (IS_ERR(trans))
4214 		return PTR_ERR(trans);
4215 
4216 	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4217 			0);
4218 
4219 	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4220 			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4221 			dentry->d_name.len);
4222 	if (ret)
4223 		goto out;
4224 
4225 	if (inode->i_nlink == 0) {
4226 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4227 		if (ret)
4228 			goto out;
4229 	}
4230 
4231 out:
4232 	btrfs_end_transaction(trans);
4233 	btrfs_btree_balance_dirty(root->fs_info);
4234 	return ret;
4235 }
4236 
4237 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4238 			struct btrfs_root *root,
4239 			struct inode *dir, u64 objectid,
4240 			const char *name, int name_len)
4241 {
4242 	struct btrfs_fs_info *fs_info = root->fs_info;
4243 	struct btrfs_path *path;
4244 	struct extent_buffer *leaf;
4245 	struct btrfs_dir_item *di;
4246 	struct btrfs_key key;
4247 	u64 index;
4248 	int ret;
4249 	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4250 
4251 	path = btrfs_alloc_path();
4252 	if (!path)
4253 		return -ENOMEM;
4254 
4255 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4256 				   name, name_len, -1);
4257 	if (IS_ERR_OR_NULL(di)) {
4258 		if (!di)
4259 			ret = -ENOENT;
4260 		else
4261 			ret = PTR_ERR(di);
4262 		goto out;
4263 	}
4264 
4265 	leaf = path->nodes[0];
4266 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4267 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4268 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4269 	if (ret) {
4270 		btrfs_abort_transaction(trans, ret);
4271 		goto out;
4272 	}
4273 	btrfs_release_path(path);
4274 
4275 	ret = btrfs_del_root_ref(trans, fs_info, objectid,
4276 				 root->root_key.objectid, dir_ino,
4277 				 &index, name, name_len);
4278 	if (ret < 0) {
4279 		if (ret != -ENOENT) {
4280 			btrfs_abort_transaction(trans, ret);
4281 			goto out;
4282 		}
4283 		di = btrfs_search_dir_index_item(root, path, dir_ino,
4284 						 name, name_len);
4285 		if (IS_ERR_OR_NULL(di)) {
4286 			if (!di)
4287 				ret = -ENOENT;
4288 			else
4289 				ret = PTR_ERR(di);
4290 			btrfs_abort_transaction(trans, ret);
4291 			goto out;
4292 		}
4293 
4294 		leaf = path->nodes[0];
4295 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4296 		btrfs_release_path(path);
4297 		index = key.offset;
4298 	}
4299 	btrfs_release_path(path);
4300 
4301 	ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index);
4302 	if (ret) {
4303 		btrfs_abort_transaction(trans, ret);
4304 		goto out;
4305 	}
4306 
4307 	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4308 	inode_inc_iversion(dir);
4309 	dir->i_mtime = dir->i_ctime = current_time(dir);
4310 	ret = btrfs_update_inode_fallback(trans, root, dir);
4311 	if (ret)
4312 		btrfs_abort_transaction(trans, ret);
4313 out:
4314 	btrfs_free_path(path);
4315 	return ret;
4316 }
4317 
4318 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4319 {
4320 	struct inode *inode = d_inode(dentry);
4321 	int err = 0;
4322 	struct btrfs_root *root = BTRFS_I(dir)->root;
4323 	struct btrfs_trans_handle *trans;
4324 	u64 last_unlink_trans;
4325 
4326 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4327 		return -ENOTEMPTY;
4328 	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4329 		return -EPERM;
4330 
4331 	trans = __unlink_start_trans(dir);
4332 	if (IS_ERR(trans))
4333 		return PTR_ERR(trans);
4334 
4335 	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4336 		err = btrfs_unlink_subvol(trans, root, dir,
4337 					  BTRFS_I(inode)->location.objectid,
4338 					  dentry->d_name.name,
4339 					  dentry->d_name.len);
4340 		goto out;
4341 	}
4342 
4343 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4344 	if (err)
4345 		goto out;
4346 
4347 	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4348 
4349 	/* now the directory is empty */
4350 	err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4351 			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4352 			dentry->d_name.len);
4353 	if (!err) {
4354 		btrfs_i_size_write(BTRFS_I(inode), 0);
4355 		/*
4356 		 * Propagate the last_unlink_trans value of the deleted dir to
4357 		 * its parent directory. This is to prevent an unrecoverable
4358 		 * log tree in the case we do something like this:
4359 		 * 1) create dir foo
4360 		 * 2) create snapshot under dir foo
4361 		 * 3) delete the snapshot
4362 		 * 4) rmdir foo
4363 		 * 5) mkdir foo
4364 		 * 6) fsync foo or some file inside foo
4365 		 */
4366 		if (last_unlink_trans >= trans->transid)
4367 			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4368 	}
4369 out:
4370 	btrfs_end_transaction(trans);
4371 	btrfs_btree_balance_dirty(root->fs_info);
4372 
4373 	return err;
4374 }
4375 
4376 static int truncate_space_check(struct btrfs_trans_handle *trans,
4377 				struct btrfs_root *root,
4378 				u64 bytes_deleted)
4379 {
4380 	struct btrfs_fs_info *fs_info = root->fs_info;
4381 	int ret;
4382 
4383 	/*
4384 	 * This is only used to apply pressure to the enospc system; we don't
4385 	 * intend to use this reservation at all.
4386 	 */
4387 	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
4388 	bytes_deleted *= fs_info->nodesize;
4389 	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
4390 				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4391 	if (!ret) {
4392 		trace_btrfs_space_reservation(fs_info, "transaction",
4393 					      trans->transid,
4394 					      bytes_deleted, 1);
4395 		trans->bytes_reserved += bytes_deleted;
4396 	}
4397 	return ret;
4398 
4399 }
4400 
4401 /*
4402  * Return this if we need to call truncate_block for the last bit of the
4403  * truncate.
4404  */
4405 #define NEED_TRUNCATE_BLOCK 1
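
/*
 * A caller is then expected to finish the job roughly as follows (a
 * sketch of the pattern; in practice this is handled by btrfs_truncate()):
 *
 *	ret = btrfs_truncate_inode_items(trans, root, inode, new_size,
 *					 BTRFS_EXTENT_DATA_KEY);
 *	if (ret == NEED_TRUNCATE_BLOCK)
 *		ret = btrfs_truncate_block(inode, new_size, 0, 0);
 */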
4406 
4407 /*
4408  * this can truncate away extent items, csum items and directory items.
4409  * It starts at a high offset and removes keys until it can't find
4410  * any higher than new_size
4411  *
4412  * csum items that cross the new i_size are truncated to the new size
4413  * as well.
4414  *
4415  * min_type is the minimum key type to truncate down to.  If set to 0, this
4416  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4417  */
4418 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4419 			       struct btrfs_root *root,
4420 			       struct inode *inode,
4421 			       u64 new_size, u32 min_type)
4422 {
4423 	struct btrfs_fs_info *fs_info = root->fs_info;
4424 	struct btrfs_path *path;
4425 	struct extent_buffer *leaf;
4426 	struct btrfs_file_extent_item *fi;
4427 	struct btrfs_key key;
4428 	struct btrfs_key found_key;
4429 	u64 extent_start = 0;
4430 	u64 extent_num_bytes = 0;
4431 	u64 extent_offset = 0;
4432 	u64 item_end = 0;
4433 	u64 last_size = new_size;
4434 	u32 found_type = (u8)-1;
4435 	int found_extent;
4436 	int del_item;
4437 	int pending_del_nr = 0;
4438 	int pending_del_slot = 0;
4439 	int extent_type = -1;
4440 	int ret;
4441 	int err = 0;
4442 	u64 ino = btrfs_ino(BTRFS_I(inode));
4443 	u64 bytes_deleted = 0;
4444 	bool be_nice = false;
4445 	bool should_throttle = false;
4446 	bool should_end = false;
4447 
4448 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4449 
4450 	/*
4451 	 * for non-free space inodes and ref cows, we want to back off from
4452 	 * time to time
4453 	 */
4454 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4455 	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4456 		be_nice = true;
4457 
4458 	path = btrfs_alloc_path();
4459 	if (!path)
4460 		return -ENOMEM;
4461 	path->reada = READA_BACK;
4462 
4463 	/*
4464 	 * We want to drop from the next block forward in case this new size is
4465 	 * not block aligned since we will be keeping the last block of the
4466 	 * extent just the way it is.
4467 	 */
4468 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4469 	    root == fs_info->tree_root)
4470 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
4471 					fs_info->sectorsize),
4472 					(u64)-1, 0);
4473 
4474 	/*
4475 	 * This function is also used to drop the items in the log tree before
4476 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4477 	 * it is used to drop the logged items. So we shouldn't kill the delayed
4478 	 * items.
4479 	 */
4480 	if (min_type == 0 && root == BTRFS_I(inode)->root)
4481 		btrfs_kill_delayed_inode_items(BTRFS_I(inode));
4482 
4483 	key.objectid = ino;
4484 	key.offset = (u64)-1;
4485 	key.type = (u8)-1;
4486 
4487 search_again:
4488 	/*
4489 	 * with a 16K leaf size and 128MB extents, you can actually queue
4490 	 * up a huge file in a single leaf.  Most of the time that
4491 	 * bytes_deleted is > 0, it will be huge by the time we get here
4492 	 */
4493 	if (be_nice && bytes_deleted > SZ_32M) {
4494 		if (btrfs_should_end_transaction(trans)) {
4495 			err = -EAGAIN;
4496 			goto error;
4497 		}
4498 	}
4499 
4500 
4501 	path->leave_spinning = 1;
4502 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4503 	if (ret < 0) {
4504 		err = ret;
4505 		goto out;
4506 	}
4507 
4508 	if (ret > 0) {
4509 		/* there are no items in the tree for us to truncate, we're
4510 		 * done
4511 		 */
4512 		if (path->slots[0] == 0)
4513 			goto out;
4514 		path->slots[0]--;
4515 	}
4516 
4517 	while (1) {
4518 		fi = NULL;
4519 		leaf = path->nodes[0];
4520 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4521 		found_type = found_key.type;
4522 
4523 		if (found_key.objectid != ino)
4524 			break;
4525 
4526 		if (found_type < min_type)
4527 			break;
4528 
4529 		item_end = found_key.offset;
4530 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
4531 			fi = btrfs_item_ptr(leaf, path->slots[0],
4532 					    struct btrfs_file_extent_item);
4533 			extent_type = btrfs_file_extent_type(leaf, fi);
4534 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4535 				item_end +=
4536 				    btrfs_file_extent_num_bytes(leaf, fi);
4537 
4538 				trace_btrfs_truncate_show_fi_regular(
4539 					BTRFS_I(inode), leaf, fi,
4540 					found_key.offset);
4541 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4542 				item_end += btrfs_file_extent_inline_len(leaf,
4543 							 path->slots[0], fi);
4544 
4545 				trace_btrfs_truncate_show_fi_inline(
4546 					BTRFS_I(inode), leaf, fi, path->slots[0],
4547 					found_key.offset);
4548 			}
4549 			item_end--;
4550 		}
4551 		if (found_type > min_type) {
4552 			del_item = 1;
4553 		} else {
4554 			if (item_end < new_size)
4555 				break;
4556 			if (found_key.offset >= new_size)
4557 				del_item = 1;
4558 			else
4559 				del_item = 0;
4560 		}
4561 		found_extent = 0;
4562 		/* FIXME, shrink the extent if the ref count is only 1 */
4563 		if (found_type != BTRFS_EXTENT_DATA_KEY)
4564 			goto delete;
4565 
4566 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4567 			u64 num_dec;
4568 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4569 			if (!del_item) {
4570 				u64 orig_num_bytes =
4571 					btrfs_file_extent_num_bytes(leaf, fi);
4572 				extent_num_bytes = ALIGN(new_size -
4573 						found_key.offset,
4574 						fs_info->sectorsize);
4575 				btrfs_set_file_extent_num_bytes(leaf, fi,
4576 							 extent_num_bytes);
4577 				num_dec = (orig_num_bytes -
4578 					   extent_num_bytes);
4579 				if (test_bit(BTRFS_ROOT_REF_COWS,
4580 					     &root->state) &&
4581 				    extent_start != 0)
4582 					inode_sub_bytes(inode, num_dec);
4583 				btrfs_mark_buffer_dirty(leaf);
4584 			} else {
4585 				extent_num_bytes =
4586 					btrfs_file_extent_disk_num_bytes(leaf,
4587 									 fi);
4588 				extent_offset = found_key.offset -
4589 					btrfs_file_extent_offset(leaf, fi);
4590 
4591 				/* FIXME blocksize != 4096 */
4592 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4593 				if (extent_start != 0) {
4594 					found_extent = 1;
4595 					if (test_bit(BTRFS_ROOT_REF_COWS,
4596 						     &root->state))
4597 						inode_sub_bytes(inode, num_dec);
4598 				}
4599 			}
4600 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4601 			/*
4602 			 * we can't truncate inline items that have had
4603 			 * special encodings
4604 			 */
4605 			if (!del_item &&
4606 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
4607 			    btrfs_file_extent_other_encoding(leaf, fi) == 0 &&
4608 			    btrfs_file_extent_compression(leaf, fi) == 0) {
4609 				u32 size = (u32)(new_size - found_key.offset);
4610 
4611 				btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4612 				size = btrfs_file_extent_calc_inline_size(size);
4613 				btrfs_truncate_item(root->fs_info, path, size, 1);
4614 			} else if (!del_item) {
4615 				/*
4616 				 * We have to bail so the last_size is set to
4617 				 * just before this extent.
4618 				 */
4619 				err = NEED_TRUNCATE_BLOCK;
4620 				break;
4621 			}
4622 
4623 			if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4624 				inode_sub_bytes(inode, item_end + 1 - new_size);
4625 		}
4626 delete:
4627 		if (del_item)
4628 			last_size = found_key.offset;
4629 		else
4630 			last_size = new_size;
4631 		if (del_item) {
4632 			if (!pending_del_nr) {
4633 				/* no pending yet, add ourselves */
4634 				pending_del_slot = path->slots[0];
4635 				pending_del_nr = 1;
4636 			} else if (pending_del_nr &&
4637 				   path->slots[0] + 1 == pending_del_slot) {
4638 				/* hop on the pending chunk */
4639 				pending_del_nr++;
4640 				pending_del_slot = path->slots[0];
4641 			} else {
4642 				BUG();
4643 			}
4644 		} else {
4645 			break;
4646 		}
4647 		should_throttle = false;
4648 
4649 		if (found_extent &&
4650 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4651 		     root == fs_info->tree_root)) {
4652 			btrfs_set_path_blocking(path);
4653 			bytes_deleted += extent_num_bytes;
4654 			ret = btrfs_free_extent(trans, root, extent_start,
4655 						extent_num_bytes, 0,
4656 						btrfs_header_owner(leaf),
4657 						ino, extent_offset);
4658 			BUG_ON(ret);
4659 			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
4660 				btrfs_async_run_delayed_refs(fs_info,
4661 					trans->delayed_ref_updates * 2,
4662 					trans->transid, 0);
4663 			if (be_nice) {
4664 				if (truncate_space_check(trans, root,
4665 							 extent_num_bytes)) {
4666 					should_end = true;
4667 				}
4668 				if (btrfs_should_throttle_delayed_refs(trans,
4669 								       fs_info))
4670 					should_throttle = true;
4671 			}
4672 		}
4673 
4674 		if (found_type == BTRFS_INODE_ITEM_KEY)
4675 			break;
4676 
4677 		if (path->slots[0] == 0 ||
4678 		    path->slots[0] != pending_del_slot ||
4679 		    should_throttle || should_end) {
4680 			if (pending_del_nr) {
4681 				ret = btrfs_del_items(trans, root, path,
4682 						pending_del_slot,
4683 						pending_del_nr);
4684 				if (ret) {
4685 					btrfs_abort_transaction(trans, ret);
4686 					goto error;
4687 				}
4688 				pending_del_nr = 0;
4689 			}
4690 			btrfs_release_path(path);
4691 			if (should_throttle) {
4692 				unsigned long updates = trans->delayed_ref_updates;
4693 				if (updates) {
4694 					trans->delayed_ref_updates = 0;
4695 					ret = btrfs_run_delayed_refs(trans,
4696 								   fs_info,
4697 								   updates * 2);
4698 					if (ret && !err)
4699 						err = ret;
4700 				}
4701 			}
4702 			/*
4703 			 * if we failed to refill our space rsv, bail out
4704 			 * and let the transaction restart
4705 			 */
4706 			if (should_end) {
4707 				err = -EAGAIN;
4708 				goto error;
4709 			}
4710 			goto search_again;
4711 		} else {
4712 			path->slots[0]--;
4713 		}
4714 	}
4715 out:
4716 	if (pending_del_nr) {
4717 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4718 				      pending_del_nr);
4719 		if (ret)
4720 			btrfs_abort_transaction(trans, ret);
4721 	}
4722 error:
4723 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4724 		ASSERT(last_size >= new_size);
4725 		if (!err && last_size > new_size)
4726 			last_size = new_size;
4727 		btrfs_ordered_update_i_size(inode, last_size, NULL);
4728 	}
4729 
4730 	btrfs_free_path(path);
4731 
4732 	if (be_nice && bytes_deleted > SZ_32M) {
4733 		unsigned long updates = trans->delayed_ref_updates;
4734 		if (updates) {
4735 			trans->delayed_ref_updates = 0;
4736 			ret = btrfs_run_delayed_refs(trans, fs_info,
4737 						     updates * 2);
4738 			if (ret && !err)
4739 				err = ret;
4740 		}
4741 	}
4742 	return err;
4743 }
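
/*
 * Note on the deletion batching above: because the loop walks the leaf
 * backwards, doomed items accumulate in adjacent slots,
 *
 *	slots:	... [s] [s+1] ... [s+n-1]
 *		     ^
 *		     pending_del_slot, with pending_del_nr == n
 *
 * and are then freed with a single call:
 *
 *	btrfs_del_items(trans, root, path, pending_del_slot,
 *			pending_del_nr);
 *
 * instead of one tree modification per item.
 */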
4744 
4745 /*
4746  * btrfs_truncate_block - read, zero a chunk and write a block
4747  * @inode - inode that we're zeroing
4748  * @from - the offset to start zeroing
4749  * @len - the length to zero, 0 to zero the entire range relative to the
4750  *	offset
4751  * @front - zero up to the offset instead of from the offset on
4752  *
4753  * This will find the block for the "from" offset, cow the block, and zero
4754  * the part we want to zero.  This is used with truncate and hole punching.
4755  */
4756 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4757 			int front)
4758 {
4759 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4760 	struct address_space *mapping = inode->i_mapping;
4761 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4762 	struct btrfs_ordered_extent *ordered;
4763 	struct extent_state *cached_state = NULL;
4764 	struct extent_changeset *data_reserved = NULL;
4765 	char *kaddr;
4766 	u32 blocksize = fs_info->sectorsize;
4767 	pgoff_t index = from >> PAGE_SHIFT;
4768 	unsigned offset = from & (blocksize - 1);
4769 	struct page *page;
4770 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4771 	int ret = 0;
4772 	u64 block_start;
4773 	u64 block_end;
4774 
4775 	if (IS_ALIGNED(offset, blocksize) &&
4776 	    (!len || IS_ALIGNED(len, blocksize)))
4777 		goto out;
4778 
4779 	block_start = round_down(from, blocksize);
4780 	block_end = block_start + blocksize - 1;
4781 
4782 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
4783 					   block_start, blocksize);
4784 	if (ret)
4785 		goto out;
4786 
4787 again:
4788 	page = find_or_create_page(mapping, index, mask);
4789 	if (!page) {
4790 		btrfs_delalloc_release_space(inode, data_reserved,
4791 					     block_start, blocksize);
4792 		btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4793 		ret = -ENOMEM;
4794 		goto out;
4795 	}
4796 
4797 	if (!PageUptodate(page)) {
4798 		ret = btrfs_readpage(NULL, page);
4799 		lock_page(page);
4800 		if (page->mapping != mapping) {
4801 			unlock_page(page);
4802 			put_page(page);
4803 			goto again;
4804 		}
4805 		if (!PageUptodate(page)) {
4806 			ret = -EIO;
4807 			goto out_unlock;
4808 		}
4809 	}
4810 	wait_on_page_writeback(page);
4811 
4812 	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4813 	set_page_extent_mapped(page);
4814 
4815 	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4816 	if (ordered) {
4817 		unlock_extent_cached(io_tree, block_start, block_end,
4818 				     &cached_state);
4819 		unlock_page(page);
4820 		put_page(page);
4821 		btrfs_start_ordered_extent(inode, ordered, 1);
4822 		btrfs_put_ordered_extent(ordered);
4823 		goto again;
4824 	}
4825 
4826 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4827 			  EXTENT_DIRTY | EXTENT_DELALLOC |
4828 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4829 			  0, 0, &cached_state);
4830 
4831 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4832 					&cached_state, 0);
4833 	if (ret) {
4834 		unlock_extent_cached(io_tree, block_start, block_end,
4835 				     &cached_state);
4836 		goto out_unlock;
4837 	}
4838 
4839 	if (offset != blocksize) {
4840 		if (!len)
4841 			len = blocksize - offset;
4842 		kaddr = kmap(page);
4843 		if (front)
4844 			memset(kaddr + (block_start - page_offset(page)),
4845 				0, offset);
4846 		else
4847 			memset(kaddr + (block_start - page_offset(page)) +  offset,
4848 				0, len);
4849 		flush_dcache_page(page);
4850 		kunmap(page);
4851 	}
4852 	ClearPageChecked(page);
4853 	set_page_dirty(page);
4854 	unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
4855 
4856 out_unlock:
4857 	if (ret)
4858 		btrfs_delalloc_release_space(inode, data_reserved, block_start,
4859 					     blocksize);
4860 	btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
4861 	unlock_page(page);
4862 	put_page(page);
4863 out:
4864 	extent_changeset_free(data_reserved);
4865 	return ret;
4866 }
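
/*
 * Usage sketch (values are illustrative): zero the tail of the block
 * containing the old EOF before growing a file, as btrfs_cont_expand()
 * does below, or zero the head of the block that ends a hole punch:
 *
 *	err = btrfs_truncate_block(inode, oldsize, 0, 0);	from offset
 *								to block end
 *	err = btrfs_truncate_block(inode, offset + len, 0, 1);	from block
 *								start up to
 *								the offset
 */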
4867 
4868 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4869 			     u64 offset, u64 len)
4870 {
4871 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4872 	struct btrfs_trans_handle *trans;
4873 	int ret;
4874 
4875 	/*
4876 	 * Still need to make sure the inode looks like it's been updated so
4877 	 * that any holes get logged if we fsync.
4878 	 */
4879 	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
4880 		BTRFS_I(inode)->last_trans = fs_info->generation;
4881 		BTRFS_I(inode)->last_sub_trans = root->log_transid;
4882 		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4883 		return 0;
4884 	}
4885 
4886 	/*
4887 	 * 1 - for the one we're dropping
4888 	 * 1 - for the one we're adding
4889 	 * 1 - for updating the inode.
4890 	 */
4891 	trans = btrfs_start_transaction(root, 3);
4892 	if (IS_ERR(trans))
4893 		return PTR_ERR(trans);
4894 
4895 	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4896 	if (ret) {
4897 		btrfs_abort_transaction(trans, ret);
4898 		btrfs_end_transaction(trans);
4899 		return ret;
4900 	}
4901 
4902 	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
4903 			offset, 0, 0, len, 0, len, 0, 0, 0);
4904 	if (ret)
4905 		btrfs_abort_transaction(trans, ret);
4906 	else
4907 		btrfs_update_inode(trans, root, inode);
4908 	btrfs_end_transaction(trans);
4909 	return ret;
4910 }
4911 
4912 /*
4913  * This function puts in dummy file extents for the area we're creating a hole
4914  * for.  So if we are truncating this file to a larger size we need to insert
4915  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
4916  * the range between oldsize and size
4917  */
4918 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4919 {
4920 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4921 	struct btrfs_root *root = BTRFS_I(inode)->root;
4922 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4923 	struct extent_map *em = NULL;
4924 	struct extent_state *cached_state = NULL;
4925 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4926 	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4927 	u64 block_end = ALIGN(size, fs_info->sectorsize);
4928 	u64 last_byte;
4929 	u64 cur_offset;
4930 	u64 hole_size;
4931 	int err = 0;
4932 
4933 	/*
4934 	 * If our size started in the middle of a block we need to zero out the
4935 	 * rest of the block before we expand the i_size, otherwise we could
4936 	 * expose stale data.
4937 	 */
4938 	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4939 	if (err)
4940 		return err;
4941 
4942 	if (size <= hole_start)
4943 		return 0;
4944 
4945 	while (1) {
4946 		struct btrfs_ordered_extent *ordered;
4947 
4948 		lock_extent_bits(io_tree, hole_start, block_end - 1,
4949 				 &cached_state);
4950 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
4951 						     block_end - hole_start);
4952 		if (!ordered)
4953 			break;
4954 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4955 				     &cached_state);
4956 		btrfs_start_ordered_extent(inode, ordered, 1);
4957 		btrfs_put_ordered_extent(ordered);
4958 	}
4959 
4960 	cur_offset = hole_start;
4961 	while (1) {
4962 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
4963 				block_end - cur_offset, 0);
4964 		if (IS_ERR(em)) {
4965 			err = PTR_ERR(em);
4966 			em = NULL;
4967 			break;
4968 		}
4969 		last_byte = min(extent_map_end(em), block_end);
4970 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4971 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4972 			struct extent_map *hole_em;
4973 			hole_size = last_byte - cur_offset;
4974 
4975 			err = maybe_insert_hole(root, inode, cur_offset,
4976 						hole_size);
4977 			if (err)
4978 				break;
4979 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
4980 						cur_offset + hole_size - 1, 0);
4981 			hole_em = alloc_extent_map();
4982 			if (!hole_em) {
4983 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4984 					&BTRFS_I(inode)->runtime_flags);
4985 				goto next;
4986 			}
4987 			hole_em->start = cur_offset;
4988 			hole_em->len = hole_size;
4989 			hole_em->orig_start = cur_offset;
4990 
4991 			hole_em->block_start = EXTENT_MAP_HOLE;
4992 			hole_em->block_len = 0;
4993 			hole_em->orig_block_len = 0;
4994 			hole_em->ram_bytes = hole_size;
4995 			hole_em->bdev = fs_info->fs_devices->latest_bdev;
4996 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4997 			hole_em->generation = fs_info->generation;
4998 
4999 			while (1) {
5000 				write_lock(&em_tree->lock);
5001 				err = add_extent_mapping(em_tree, hole_em, 1);
5002 				write_unlock(&em_tree->lock);
5003 				if (err != -EEXIST)
5004 					break;
5005 				btrfs_drop_extent_cache(BTRFS_I(inode),
5006 							cur_offset,
5007 							cur_offset +
5008 							hole_size - 1, 0);
5009 			}
5010 			free_extent_map(hole_em);
5011 		}
5012 next:
5013 		free_extent_map(em);
5014 		em = NULL;
5015 		cur_offset = last_byte;
5016 		if (cur_offset >= block_end)
5017 			break;
5018 	}
5019 	free_extent_map(em);
5020 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state);
5021 	return err;
5022 }
5023 
5024 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5025 {
5026 	struct btrfs_root *root = BTRFS_I(inode)->root;
5027 	struct btrfs_trans_handle *trans;
5028 	loff_t oldsize = i_size_read(inode);
5029 	loff_t newsize = attr->ia_size;
5030 	int mask = attr->ia_valid;
5031 	int ret;
5032 
5033 	/*
5034 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5035 	 * special case where we need to update the times despite not having
5036 	 * these flags set.  For all other operations the VFS set these flags
5037 	 * explicitly if it wants a timestamp update.
5038 	 */
5039 	if (newsize != oldsize) {
5040 		inode_inc_iversion(inode);
5041 		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5042 			inode->i_ctime = inode->i_mtime =
5043 				current_time(inode);
5044 	}
5045 
5046 	if (newsize > oldsize) {
5047 		/*
5048 		 * Don't do an expanding truncate while snapshotting is ongoing.
5049 		 * This is to ensure the snapshot captures a fully consistent
5050 		 * state of this file - if the snapshot captures this expanding
5051 		 * truncation, it must capture all writes that happened before
5052 		 * this truncation.
5053 		 */
5054 		btrfs_wait_for_snapshot_creation(root);
5055 		ret = btrfs_cont_expand(inode, oldsize, newsize);
5056 		if (ret) {
5057 			btrfs_end_write_no_snapshotting(root);
5058 			return ret;
5059 		}
5060 
5061 		trans = btrfs_start_transaction(root, 1);
5062 		if (IS_ERR(trans)) {
5063 			btrfs_end_write_no_snapshotting(root);
5064 			return PTR_ERR(trans);
5065 		}
5066 
5067 		i_size_write(inode, newsize);
5068 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5069 		pagecache_isize_extended(inode, oldsize, newsize);
5070 		ret = btrfs_update_inode(trans, root, inode);
5071 		btrfs_end_write_no_snapshotting(root);
5072 		btrfs_end_transaction(trans);
5073 	} else {
5074 
5075 		/*
5076 		 * We're truncating a file that used to have good data down to
5077 		 * zero. Make sure it gets into the ordered flush list so that
5078 		 * any new writes get down to disk quickly.
5079 		 */
5080 		if (newsize == 0)
5081 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5082 				&BTRFS_I(inode)->runtime_flags);
5083 
5084 		/*
5085 		 * 1 for the orphan item we're going to add
5086 		 * 1 for the orphan item deletion.
5087 		 */
5088 		trans = btrfs_start_transaction(root, 2);
5089 		if (IS_ERR(trans))
5090 			return PTR_ERR(trans);
5091 
5092 		/*
5093 		 * We need to do this in case we fail at _any_ point during the
5094 		 * actual truncate.  Once we do the truncate_setsize we could
5095 		 * invalidate pages which forces any outstanding ordered io to
5096 		 * be instantly completed which will give us extents that need
5097 		 * to be truncated.  If we fail to get an orphan inode down we
5098 		 * could have left over extents that were never meant to live,
5099 		 * so we need to guarantee from this point on that everything
5100 		 * will be consistent.
5101 		 */
5102 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
5103 		btrfs_end_transaction(trans);
5104 		if (ret)
5105 			return ret;
5106 
5107 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
5108 		truncate_setsize(inode, newsize);
5109 
5110 		/* Disable nonlocked read DIO to avoid the endless truncate */
5111 		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
5112 		inode_dio_wait(inode);
5113 		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
5114 
5115 		ret = btrfs_truncate(inode);
5116 		if (ret && inode->i_nlink) {
5117 			int err;
5118 
5119 			/* To get a stable disk_i_size */
5120 			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5121 			if (err) {
5122 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5123 				return err;
5124 			}
5125 
5126 			/*
5127 			 * failed to truncate, disk_i_size is only adjusted down
5128 			 * as we remove extents, so it should represent the true
5129 			 * size of the inode, so reset the in-memory size and
5130 			 * delete our orphan entry.
5131 			 */
5132 			trans = btrfs_join_transaction(root);
5133 			if (IS_ERR(trans)) {
5134 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5135 				return ret;
5136 			}
5137 			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5138 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
5139 			if (err)
5140 				btrfs_abort_transaction(trans, err);
5141 			btrfs_end_transaction(trans);
5142 		}
5143 	}
5144 
5145 	return ret;
5146 }
5147 
5148 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5149 {
5150 	struct inode *inode = d_inode(dentry);
5151 	struct btrfs_root *root = BTRFS_I(inode)->root;
5152 	int err;
5153 
5154 	if (btrfs_root_readonly(root))
5155 		return -EROFS;
5156 
5157 	err = setattr_prepare(dentry, attr);
5158 	if (err)
5159 		return err;
5160 
5161 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5162 		err = btrfs_setsize(inode, attr);
5163 		if (err)
5164 			return err;
5165 	}
5166 
5167 	if (attr->ia_valid) {
5168 		setattr_copy(inode, attr);
5169 		inode_inc_iversion(inode);
5170 		err = btrfs_dirty_inode(inode);
5171 
5172 		if (!err && attr->ia_valid & ATTR_MODE)
5173 			err = posix_acl_chmod(inode, inode->i_mode);
5174 	}
5175 
5176 	return err;
5177 }
5178 
5179 /*
5180  * While truncating the inode pages during eviction, we get the VFS calling
5181  * btrfs_invalidatepage() against each page of the inode. This is slow because
5182  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5183  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5184  * extent_state structures over and over, wasting lots of time.
5185  *
5186  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5187  * those expensive operations on a per page basis and do only the ordered io
5188  * finishing, while we release here the extent_map and extent_state structures,
5189  * without the excessive merging and splitting.
5190  */
5191 static void evict_inode_truncate_pages(struct inode *inode)
5192 {
5193 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5194 	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5195 	struct rb_node *node;
5196 
5197 	ASSERT(inode->i_state & I_FREEING);
5198 	truncate_inode_pages_final(&inode->i_data);
5199 
5200 	write_lock(&map_tree->lock);
5201 	while (!RB_EMPTY_ROOT(&map_tree->map)) {
5202 		struct extent_map *em;
5203 
5204 		node = rb_first(&map_tree->map);
5205 		em = rb_entry(node, struct extent_map, rb_node);
5206 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5207 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5208 		remove_extent_mapping(map_tree, em);
5209 		free_extent_map(em);
5210 		if (need_resched()) {
5211 			write_unlock(&map_tree->lock);
5212 			cond_resched();
5213 			write_lock(&map_tree->lock);
5214 		}
5215 	}
5216 	write_unlock(&map_tree->lock);
5217 
5218 	/*
5219 	 * Keep looping until we have no more ranges in the io tree.
5220 	 * We can have ongoing bios started by readpages (called from readahead)
5221 	 * whose endio callback (extent_io.c:end_bio_extent_readpage) is still
5222 	 * in progress (it has unlocked the pages in the bio but has not yet
5223 	 * unlocked the ranges in the io tree). So some ranges can still be
5224 	 * locked while eviction is already underway, because those bios are
5225 	 * executed by a separate task (a work queue kthread) and no inode
5226 	 * references (inode->i_count) were taken before submitting them
5227 	 * (references that would otherwise be dropped in each bio's end io
5228 	 * callback). Therefore here we effectively end up waiting for those
5229 	 * bios and anyone else holding locked ranges without having bumped
5230 	 * the inode's reference count - if we don't, by the time they access
5231 	 * the inode's io_tree to unlock a range it may be too late, leading
5232 	 * to a use-after-free issue.
5233 	 */
5234 	spin_lock(&io_tree->lock);
5235 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5236 		struct extent_state *state;
5237 		struct extent_state *cached_state = NULL;
5238 		u64 start;
5239 		u64 end;
5240 
5241 		node = rb_first(&io_tree->state);
5242 		state = rb_entry(node, struct extent_state, rb_node);
5243 		start = state->start;
5244 		end = state->end;
5245 		spin_unlock(&io_tree->lock);
5246 
5247 		lock_extent_bits(io_tree, start, end, &cached_state);
5248 
5249 		/*
5250 		 * If the range still has the DELALLOC flag, the extent never
5251 		 * reached disk and its reserved space won't be freed by a
5252 		 * delayed ref, so we need to free that space here.
5253 		 * (Refer to the comment in btrfs_invalidatepage, case 2.)
5254 		 *
5255 		 * Note: end is the offset of the last byte, hence the + 1.
5256 		 */
5257 		if (state->state & EXTENT_DELALLOC)
5258 			btrfs_qgroup_free_data(inode, NULL, start, end - start + 1);
5259 
5260 		clear_extent_bit(io_tree, start, end,
5261 				 EXTENT_LOCKED | EXTENT_DIRTY |
5262 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5263 				 EXTENT_DEFRAG, 1, 1, &cached_state);
5264 
5265 		cond_resched();
5266 		spin_lock(&io_tree->lock);
5267 	}
5268 	spin_unlock(&io_tree->lock);
5269 }
5270 
5271 void btrfs_evict_inode(struct inode *inode)
5272 {
5273 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5274 	struct btrfs_trans_handle *trans;
5275 	struct btrfs_root *root = BTRFS_I(inode)->root;
5276 	struct btrfs_block_rsv *rsv, *global_rsv;
5277 	int steal_from_global = 0;
5278 	u64 min_size;
5279 	int ret;
5280 
5281 	trace_btrfs_inode_evict(inode);
5282 
5283 	if (!root) {
5284 		kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5285 		return;
5286 	}
5287 
5288 	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
5289 
5290 	evict_inode_truncate_pages(inode);
5291 
5292 	if (inode->i_nlink &&
5293 	    ((btrfs_root_refs(&root->root_item) != 0 &&
5294 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5295 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5296 		goto no_delete;
5297 
5298 	if (is_bad_inode(inode)) {
5299 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5300 		goto no_delete;
5301 	}
5302 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5303 	if (!special_file(inode->i_mode))
5304 		btrfs_wait_ordered_range(inode, 0, (u64)-1);
5305 
5306 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5307 
5308 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
5309 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5310 				 &BTRFS_I(inode)->runtime_flags));
5311 		goto no_delete;
5312 	}
5313 
5314 	if (inode->i_nlink > 0) {
5315 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5316 		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5317 		goto no_delete;
5318 	}
5319 
5320 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5321 	if (ret) {
5322 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5323 		goto no_delete;
5324 	}
5325 
5326 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5327 	if (!rsv) {
5328 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5329 		goto no_delete;
5330 	}
5331 	rsv->size = min_size;
5332 	rsv->failfast = 1;
5333 	global_rsv = &fs_info->global_block_rsv;
5334 
5335 	btrfs_i_size_write(BTRFS_I(inode), 0);
5336 
5337 	/*
5338 	 * This is a bit simpler than btrfs_truncate since we've already
5339 	 * reserved our space for our orphan item in the unlink, so we just
5340 	 * need to reserve some slack space in case we add bytes to and
5341 	 * update the inode item when doing the truncate.
5342 	 */
5343 	while (1) {
5344 		ret = btrfs_block_rsv_refill(root, rsv, min_size,
5345 					     BTRFS_RESERVE_FLUSH_LIMIT);
5346 
5347 		/*
5348 		 * Try to steal from the global reserve: we will likely not
5349 		 * need that space anyway, and we want to try as hard as
5350 		 * possible to get this delete to work.
5351 		 */
5352 		if (ret)
5353 			steal_from_global++;
5354 		else
5355 			steal_from_global = 0;
5356 		ret = 0;
5357 
5358 		/*
5359 		 * steal_from_global == 0: we reserved stuff, hooray!
5360 		 * steal_from_global == 1: we didn't reserve stuff, boo!
5361 		 * steal_from_global == 2: we've committed, still not a lot of
5362 		 * room but maybe we'll have room in the global reserve this
5363 		 * time.
5364 		 * steal_from_global == 3: abandon all hope!
5365 		 */
5366 		if (steal_from_global > 2) {
5367 			btrfs_warn(fs_info,
5368 				   "Could not get space for a delete, will truncate on mount %d",
5369 				   ret);
5370 			btrfs_orphan_del(NULL, BTRFS_I(inode));
5371 			btrfs_free_block_rsv(fs_info, rsv);
5372 			goto no_delete;
5373 		}
5374 
5375 		trans = btrfs_join_transaction(root);
5376 		if (IS_ERR(trans)) {
5377 			btrfs_orphan_del(NULL, BTRFS_I(inode));
5378 			btrfs_free_block_rsv(fs_info, rsv);
5379 			goto no_delete;
5380 		}
5381 
5382 		/*
5383 		 * We can't just steal from the global reserve, we need to make
5384 		 * sure there is room to do it, if not we need to commit and try
5385 		 * again.
5386 		 */
5387 		if (steal_from_global) {
5388 			if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
5389 				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5390 							      min_size, 0);
5391 			else
5392 				ret = -ENOSPC;
5393 		}
5394 
5395 		/*
5396 		 * We couldn't steal from the global reserve because too much
5397 		 * pending work has built up; commit the transaction and try
5398 		 * again.
5399 		 */
5400 		if (ret) {
5401 			ret = btrfs_commit_transaction(trans);
5402 			if (ret) {
5403 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5404 				btrfs_free_block_rsv(fs_info, rsv);
5405 				goto no_delete;
5406 			}
5407 			continue;
5408 		} else {
5409 			steal_from_global = 0;
5410 		}
5411 
5412 		trans->block_rsv = rsv;
5413 
5414 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5415 		if (ret != -ENOSPC && ret != -EAGAIN)
5416 			break;
5417 
5418 		trans->block_rsv = &fs_info->trans_block_rsv;
5419 		btrfs_end_transaction(trans);
5420 		trans = NULL;
5421 		btrfs_btree_balance_dirty(fs_info);
5422 	}
5423 
5424 	btrfs_free_block_rsv(fs_info, rsv);
5425 
5426 	/*
5427 	 * Errors here aren't a big deal; they just mean we leave orphan
5428 	 * items in the tree, and those will be cleaned up on the next mount.
5429 	 */
5430 	if (ret == 0) {
5431 		trans->block_rsv = root->orphan_block_rsv;
5432 		btrfs_orphan_del(trans, BTRFS_I(inode));
5433 	} else {
5434 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5435 	}
5436 
5437 	trans->block_rsv = &fs_info->trans_block_rsv;
5438 	if (!(root == fs_info->tree_root ||
5439 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5440 		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
5441 
5442 	btrfs_end_transaction(trans);
5443 	btrfs_btree_balance_dirty(fs_info);
5444 no_delete:
5445 	btrfs_remove_delayed_node(BTRFS_I(inode));
5446 	clear_inode(inode);
5447 }
5448 
5449 /*
5450  * This returns, in the location pointer, the key found in the dir entry.
5451  * If no dir entry was found, location->objectid is set to 0.
5452  */
5453 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5454 			       struct btrfs_key *location)
5455 {
5456 	const char *name = dentry->d_name.name;
5457 	int namelen = dentry->d_name.len;
5458 	struct btrfs_dir_item *di;
5459 	struct btrfs_path *path;
5460 	struct btrfs_root *root = BTRFS_I(dir)->root;
5461 	int ret = 0;
5462 
5463 	path = btrfs_alloc_path();
5464 	if (!path)
5465 		return -ENOMEM;
5466 
5467 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5468 			name, namelen, 0);
5469 	if (IS_ERR(di))
5470 		ret = PTR_ERR(di);
5471 
5472 	if (IS_ERR_OR_NULL(di))
5473 		goto out_err;
5474 
5475 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5476 	if (location->type != BTRFS_INODE_ITEM_KEY &&
5477 	    location->type != BTRFS_ROOT_ITEM_KEY) {
5478 		btrfs_warn(root->fs_info,
5479 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5480 			   __func__, name, btrfs_ino(BTRFS_I(dir)),
5481 			   location->objectid, location->type, location->offset);
5482 		goto out_err;
5483 	}
5484 out:
5485 	btrfs_free_path(path);
5486 	return ret;
5487 out_err:
5488 	location->objectid = 0;
5489 	goto out;
5490 }
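
/*
 * Note: the location returned above is either a BTRFS_INODE_ITEM_KEY (a
 * plain inode in this root) or a BTRFS_ROOT_ITEM_KEY (the root of a
 * subvolume); btrfs_lookup_dentry() handles the latter by calling
 * fixup_tree_root_location() below.
 */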
5491 
5492 /*
5493  * when we hit a tree root in a directory, the btrfs part of the inode
5494  * needs to be changed to reflect the root directory of the tree root.  This
5495  * is kind of like crossing a mount point.
5496  */
5497 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5498 				    struct inode *dir,
5499 				    struct dentry *dentry,
5500 				    struct btrfs_key *location,
5501 				    struct btrfs_root **sub_root)
5502 {
5503 	struct btrfs_path *path;
5504 	struct btrfs_root *new_root;
5505 	struct btrfs_root_ref *ref;
5506 	struct extent_buffer *leaf;
5507 	struct btrfs_key key;
5508 	int ret;
5509 	int err = 0;
5510 
5511 	path = btrfs_alloc_path();
5512 	if (!path) {
5513 		err = -ENOMEM;
5514 		goto out;
5515 	}
5516 
5517 	err = -ENOENT;
5518 	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5519 	key.type = BTRFS_ROOT_REF_KEY;
5520 	key.offset = location->objectid;
5521 
5522 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5523 	if (ret) {
5524 		if (ret < 0)
5525 			err = ret;
5526 		goto out;
5527 	}
5528 
5529 	leaf = path->nodes[0];
5530 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5531 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5532 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5533 		goto out;
5534 
5535 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5536 				   (unsigned long)(ref + 1),
5537 				   dentry->d_name.len);
5538 	if (ret)
5539 		goto out;
5540 
5541 	btrfs_release_path(path);
5542 
5543 	new_root = btrfs_read_fs_root_no_name(fs_info, location);
5544 	if (IS_ERR(new_root)) {
5545 		err = PTR_ERR(new_root);
5546 		goto out;
5547 	}
5548 
5549 	*sub_root = new_root;
5550 	location->objectid = btrfs_root_dirid(&new_root->root_item);
5551 	location->type = BTRFS_INODE_ITEM_KEY;
5552 	location->offset = 0;
5553 	err = 0;
5554 out:
5555 	btrfs_free_path(path);
5556 	return err;
5557 }
5558 
5559 static void inode_tree_add(struct inode *inode)
5560 {
5561 	struct btrfs_root *root = BTRFS_I(inode)->root;
5562 	struct btrfs_inode *entry;
5563 	struct rb_node **p;
5564 	struct rb_node *parent;
5565 	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5566 	u64 ino = btrfs_ino(BTRFS_I(inode));
5567 
5568 	if (inode_unhashed(inode))
5569 		return;
5570 	parent = NULL;
5571 	spin_lock(&root->inode_lock);
5572 	p = &root->inode_tree.rb_node;
5573 	while (*p) {
5574 		parent = *p;
5575 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5576 
5577 		if (ino < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5578 			p = &parent->rb_left;
5579 		else if (ino > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5580 			p = &parent->rb_right;
5581 		else {
5582 			WARN_ON(!(entry->vfs_inode.i_state &
5583 				  (I_WILL_FREE | I_FREEING)));
5584 			rb_replace_node(parent, new, &root->inode_tree);
5585 			RB_CLEAR_NODE(parent);
5586 			spin_unlock(&root->inode_lock);
5587 			return;
5588 		}
5589 	}
5590 	rb_link_node(new, parent, p);
5591 	rb_insert_color(new, &root->inode_tree);
5592 	spin_unlock(&root->inode_lock);
5593 }
5594 
5595 static void inode_tree_del(struct inode *inode)
5596 {
5597 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5598 	struct btrfs_root *root = BTRFS_I(inode)->root;
5599 	int empty = 0;
5600 
5601 	spin_lock(&root->inode_lock);
5602 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5603 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5604 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5605 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5606 	}
5607 	spin_unlock(&root->inode_lock);
5608 
5609 	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5610 		synchronize_srcu(&fs_info->subvol_srcu);
5611 		spin_lock(&root->inode_lock);
5612 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5613 		spin_unlock(&root->inode_lock);
5614 		if (empty)
5615 			btrfs_add_dead_root(root);
5616 	}
5617 }
5618 
5619 void btrfs_invalidate_inodes(struct btrfs_root *root)
5620 {
5621 	struct btrfs_fs_info *fs_info = root->fs_info;
5622 	struct rb_node *node;
5623 	struct rb_node *prev;
5624 	struct btrfs_inode *entry;
5625 	struct inode *inode;
5626 	u64 objectid = 0;
5627 
5628 	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
5629 		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5630 
5631 	spin_lock(&root->inode_lock);
5632 again:
5633 	node = root->inode_tree.rb_node;
5634 	prev = NULL;
5635 	while (node) {
5636 		prev = node;
5637 		entry = rb_entry(node, struct btrfs_inode, rb_node);
5638 
5639 		if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5640 			node = node->rb_left;
5641 		else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5642 			node = node->rb_right;
5643 		else
5644 			break;
5645 	}
5646 	if (!node) {
5647 		while (prev) {
5648 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
5649 			if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
5650 				node = prev;
5651 				break;
5652 			}
5653 			prev = rb_next(prev);
5654 		}
5655 	}
5656 	while (node) {
5657 		entry = rb_entry(node, struct btrfs_inode, rb_node);
5658 		objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
5659 		inode = igrab(&entry->vfs_inode);
5660 		if (inode) {
5661 			spin_unlock(&root->inode_lock);
5662 			if (atomic_read(&inode->i_count) > 1)
5663 				d_prune_aliases(inode);
5664 			/*
5665 			 * btrfs_drop_inode will have it removed from
5666 			 * the inode cache when its usage count
5667 			 * hits zero.
5668 			 */
5669 			iput(inode);
5670 			cond_resched();
5671 			spin_lock(&root->inode_lock);
5672 			goto again;
5673 		}
5674 
5675 		if (cond_resched_lock(&root->inode_lock))
5676 			goto again;
5677 
5678 		node = rb_next(node);
5679 	}
5680 	spin_unlock(&root->inode_lock);
5681 }
5682 
5683 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5684 {
5685 	struct btrfs_iget_args *args = p;
5686 	inode->i_ino = args->location->objectid;
5687 	memcpy(&BTRFS_I(inode)->location, args->location,
5688 	       sizeof(*args->location));
5689 	BTRFS_I(inode)->root = args->root;
5690 	return 0;
5691 }
5692 
5693 static int btrfs_find_actor(struct inode *inode, void *opaque)
5694 {
5695 	struct btrfs_iget_args *args = opaque;
5696 	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5697 		args->root == BTRFS_I(inode)->root;
5698 }
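
/*
 * iget5_locked() below keys the inode hash table by btrfs_inode_hash(),
 * so btrfs_find_actor() above must confirm that both the objectid and
 * the root match before a cached inode is reused; the same objectid can
 * exist in many subvolumes.
 */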
5699 
5700 static struct inode *btrfs_iget_locked(struct super_block *s,
5701 				       struct btrfs_key *location,
5702 				       struct btrfs_root *root)
5703 {
5704 	struct inode *inode;
5705 	struct btrfs_iget_args args;
5706 	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5707 
5708 	args.location = location;
5709 	args.root = root;
5710 
5711 	inode = iget5_locked(s, hashval, btrfs_find_actor,
5712 			     btrfs_init_locked_inode,
5713 			     (void *)&args);
5714 	return inode;
5715 }
5716 
5717 /* Get an inode object given its location and corresponding root.
5718  * Sets *new to 1 if the inode had to be read in from disk.
5719  */
5720 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5721 			 struct btrfs_root *root, int *new)
5722 {
5723 	struct inode *inode;
5724 
5725 	inode = btrfs_iget_locked(s, location, root);
5726 	if (!inode)
5727 		return ERR_PTR(-ENOMEM);
5728 
5729 	if (inode->i_state & I_NEW) {
5730 		int ret;
5731 
5732 		ret = btrfs_read_locked_inode(inode);
5733 		if (!is_bad_inode(inode)) {
5734 			inode_tree_add(inode);
5735 			unlock_new_inode(inode);
5736 			if (new)
5737 				*new = 1;
5738 		} else {
5739 			unlock_new_inode(inode);
5740 			iput(inode);
5741 			ASSERT(ret < 0);
5742 			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5743 		}
5744 	}
5745 
5746 	return inode;
5747 }
5748 
5749 static struct inode *new_simple_dir(struct super_block *s,
5750 				    struct btrfs_key *key,
5751 				    struct btrfs_root *root)
5752 {
5753 	struct inode *inode = new_inode(s);
5754 
5755 	if (!inode)
5756 		return ERR_PTR(-ENOMEM);
5757 
5758 	BTRFS_I(inode)->root = root;
5759 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5760 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5761 
5762 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5763 	inode->i_op = &btrfs_dir_ro_inode_operations;
5764 	inode->i_opflags &= ~IOP_XATTR;
5765 	inode->i_fop = &simple_dir_operations;
5766 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5767 	inode->i_mtime = current_time(inode);
5768 	inode->i_atime = inode->i_mtime;
5769 	inode->i_ctime = inode->i_mtime;
5770 	BTRFS_I(inode)->i_otime = inode->i_mtime;
5771 
5772 	return inode;
5773 }
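
/*
 * new_simple_dir() gives btrfs_lookup_dentry() a stub to return when a
 * subvolume root cannot be resolved (fixup_tree_root_location() returned
 * -ENOENT): an empty in-memory directory with read-only inode ops, so
 * the lookup still succeeds instead of erroring out.
 */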
5774 
5775 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5776 {
5777 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5778 	struct inode *inode;
5779 	struct btrfs_root *root = BTRFS_I(dir)->root;
5780 	struct btrfs_root *sub_root = root;
5781 	struct btrfs_key location;
5782 	int index;
5783 	int ret = 0;
5784 
5785 	if (dentry->d_name.len > BTRFS_NAME_LEN)
5786 		return ERR_PTR(-ENAMETOOLONG);
5787 
5788 	ret = btrfs_inode_by_name(dir, dentry, &location);
5789 	if (ret < 0)
5790 		return ERR_PTR(ret);
5791 
5792 	if (location.objectid == 0)
5793 		return ERR_PTR(-ENOENT);
5794 
5795 	if (location.type == BTRFS_INODE_ITEM_KEY) {
5796 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5797 		return inode;
5798 	}
5799 
5800 	index = srcu_read_lock(&fs_info->subvol_srcu);
5801 	ret = fixup_tree_root_location(fs_info, dir, dentry,
5802 				       &location, &sub_root);
5803 	if (ret < 0) {
5804 		if (ret != -ENOENT)
5805 			inode = ERR_PTR(ret);
5806 		else
5807 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
5808 	} else {
5809 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5810 	}
5811 	srcu_read_unlock(&fs_info->subvol_srcu, index);
5812 
5813 	if (!IS_ERR(inode) && root != sub_root) {
5814 		down_read(&fs_info->cleanup_work_sem);
5815 		if (!sb_rdonly(inode->i_sb))
5816 			ret = btrfs_orphan_cleanup(sub_root);
5817 		up_read(&fs_info->cleanup_work_sem);
5818 		if (ret) {
5819 			iput(inode);
5820 			inode = ERR_PTR(ret);
5821 		}
5822 	}
5823 
5824 	return inode;
5825 }
5826 
5827 static int btrfs_dentry_delete(const struct dentry *dentry)
5828 {
5829 	struct btrfs_root *root;
5830 	struct inode *inode = d_inode(dentry);
5831 
5832 	if (!inode && !IS_ROOT(dentry))
5833 		inode = d_inode(dentry->d_parent);
5834 
5835 	if (inode) {
5836 		root = BTRFS_I(inode)->root;
5837 		if (btrfs_root_refs(&root->root_item) == 0)
5838 			return 1;
5839 
5840 		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5841 			return 1;
5842 	}
5843 	return 0;
5844 }
5845 
5846 static void btrfs_dentry_release(struct dentry *dentry)
5847 {
5848 	kfree(dentry->d_fsdata);
5849 }
5850 
5851 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5852 				   unsigned int flags)
5853 {
5854 	struct inode *inode;
5855 
5856 	inode = btrfs_lookup_dentry(dir, dentry);
5857 	if (IS_ERR(inode)) {
5858 		if (PTR_ERR(inode) == -ENOENT)
5859 			inode = NULL;
5860 		else
5861 			return ERR_CAST(inode);
5862 	}
5863 
5864 	return d_splice_alias(inode, dentry);
5865 }
5866 
5867 unsigned char btrfs_filetype_table[] = {
5868 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5869 };
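
/*
 * btrfs_filetype_table is indexed by the BTRFS_FT_* value stored in a
 * dir item, e.g. btrfs_filetype_table[BTRFS_FT_DIR] == DT_DIR; readdir
 * uses it below to fill in the dirent type for each entry.
 */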
5870 
5871 /*
5872  * All this infrastructure exists because dir_emit can fault, and we are holding
5873  * the tree lock when doing readdir.  For now just allocate a buffer and copy
5874  * our information into that, and then dir_emit from the buffer.  This is
5875  * similar to what NFS does, only we don't keep the buffer around in pagecache
5876  * because I'm afraid I'll mess that up.  Long term we need to make filldir do
5877  * copy_to_user_inatomic so we don't have to worry about page faulting under the
5878  * tree lock.
5879  */
5880 static int btrfs_opendir(struct inode *inode, struct file *file)
5881 {
5882 	struct btrfs_file_private *private;
5883 
5884 	private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5885 	if (!private)
5886 		return -ENOMEM;
5887 	private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5888 	if (!private->filldir_buf) {
5889 		kfree(private);
5890 		return -ENOMEM;
5891 	}
5892 	file->private_data = private;
5893 	return 0;
5894 }
5895 
5896 struct dir_entry {
5897 	u64 ino;
5898 	u64 offset;
5899 	unsigned type;
5900 	int name_len;
5901 };
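
/*
 * Entries are packed back to back into private->filldir_buf, each
 * struct dir_entry immediately followed by its (not NUL-terminated)
 * name bytes, for example:
 *
 *   [dir_entry name_len=3]["foo"][dir_entry name_len=5]["hello"]...
 *
 * which is why both the writer in btrfs_real_readdir() and the reader
 * in btrfs_filldir() advance by sizeof(struct dir_entry) + name_len.
 */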
5902 
5903 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5904 {
5905 	while (entries--) {
5906 		struct dir_entry *entry = addr;
5907 		char *name = (char *)(entry + 1);
5908 
5909 		ctx->pos = entry->offset;
5910 		if (!dir_emit(ctx, name, entry->name_len, entry->ino,
5911 			      entry->type))
5912 			return 1;
5913 		addr += sizeof(struct dir_entry) + entry->name_len;
5914 		ctx->pos++;
5915 	}
5916 	return 0;
5917 }
5918 
5919 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5920 {
5921 	struct inode *inode = file_inode(file);
5922 	struct btrfs_root *root = BTRFS_I(inode)->root;
5923 	struct btrfs_file_private *private = file->private_data;
5924 	struct btrfs_dir_item *di;
5925 	struct btrfs_key key;
5926 	struct btrfs_key found_key;
5927 	struct btrfs_path *path;
5928 	void *addr;
5929 	struct list_head ins_list;
5930 	struct list_head del_list;
5931 	int ret;
5932 	struct extent_buffer *leaf;
5933 	int slot;
5934 	char *name_ptr;
5935 	int name_len;
5936 	int entries = 0;
5937 	int total_len = 0;
5938 	bool put = false;
5939 	struct btrfs_key location;
5940 
5941 	if (!dir_emit_dots(file, ctx))
5942 		return 0;
5943 
5944 	path = btrfs_alloc_path();
5945 	if (!path)
5946 		return -ENOMEM;
5947 
5948 	addr = private->filldir_buf;
5949 	path->reada = READA_FORWARD;
5950 
5951 	INIT_LIST_HEAD(&ins_list);
5952 	INIT_LIST_HEAD(&del_list);
5953 	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5954 
5955 again:
5956 	key.type = BTRFS_DIR_INDEX_KEY;
5957 	key.offset = ctx->pos;
5958 	key.objectid = btrfs_ino(BTRFS_I(inode));
5959 
5960 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5961 	if (ret < 0)
5962 		goto err;
5963 
5964 	while (1) {
5965 		struct dir_entry *entry;
5966 
5967 		leaf = path->nodes[0];
5968 		slot = path->slots[0];
5969 		if (slot >= btrfs_header_nritems(leaf)) {
5970 			ret = btrfs_next_leaf(root, path);
5971 			if (ret < 0)
5972 				goto err;
5973 			else if (ret > 0)
5974 				break;
5975 			continue;
5976 		}
5977 
5978 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5979 
5980 		if (found_key.objectid != key.objectid)
5981 			break;
5982 		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5983 			break;
5984 		if (found_key.offset < ctx->pos)
5985 			goto next;
5986 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5987 			goto next;
5988 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5989 		name_len = btrfs_dir_name_len(leaf, di);
5990 		if ((total_len + sizeof(struct dir_entry) + name_len) >=
5991 		    PAGE_SIZE) {
5992 			btrfs_release_path(path);
5993 			ret = btrfs_filldir(private->filldir_buf, entries, ctx);
5994 			if (ret)
5995 				goto nopos;
5996 			addr = private->filldir_buf;
5997 			entries = 0;
5998 			total_len = 0;
5999 			goto again;
6000 		}
6001 
6002 		entry = addr;
6003 		entry->name_len = name_len;
6004 		name_ptr = (char *)(entry + 1);
6005 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
6006 				   name_len);
6007 		entry->type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
6008 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
6009 		entry->ino = location.objectid;
6010 		entry->offset = found_key.offset;
6011 		entries++;
6012 		addr += sizeof(struct dir_entry) + name_len;
6013 		total_len += sizeof(struct dir_entry) + name_len;
6014 next:
6015 		path->slots[0]++;
6016 	}
6017 	btrfs_release_path(path);
6018 
6019 	ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6020 	if (ret)
6021 		goto nopos;
6022 
6023 	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6024 	if (ret)
6025 		goto nopos;
6026 
6027 	/*
6028 	 * Stop new entries from being returned after we return the last
6029 	 * entry.
6030 	 *
6031 	 * New directory entries are assigned a strictly increasing
6032 	 * offset.  This means that new entries created during readdir
6033 	 * are *guaranteed* to be seen in the future by that readdir.
6034 	 * This has broken buggy programs which operate on names as
6035 	 * they're returned by readdir.  Until we re-use freed offsets
6036 	 * we have this hack to stop new entries from being returned
6037 	 * under the assumption that they'll never reach this huge
6038 	 * offset.
6039 	 *
6040 	 * This is being careful not to overflow 32bit loff_t unless the
6041 	 * last entry requires it because doing so has broken 32bit apps
6042 	 * in the past.
6043 	 */
6044 	if (ctx->pos >= INT_MAX)
6045 		ctx->pos = LLONG_MAX;
6046 	else
6047 		ctx->pos = INT_MAX;
6048 nopos:
6049 	ret = 0;
6050 err:
6051 	if (put)
6052 		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
6053 	btrfs_free_path(path);
6054 	return ret;
6055 }
6056 
6057 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6058 {
6059 	struct btrfs_root *root = BTRFS_I(inode)->root;
6060 	struct btrfs_trans_handle *trans;
6061 	int ret = 0;
6062 	bool nolock = false;
6063 
6064 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6065 		return 0;
6066 
6067 	if (btrfs_fs_closing(root->fs_info) &&
6068 			btrfs_is_free_space_inode(BTRFS_I(inode)))
6069 		nolock = true;
6070 
6071 	if (wbc->sync_mode == WB_SYNC_ALL) {
6072 		if (nolock)
6073 			trans = btrfs_join_transaction_nolock(root);
6074 		else
6075 			trans = btrfs_join_transaction(root);
6076 		if (IS_ERR(trans))
6077 			return PTR_ERR(trans);
6078 		ret = btrfs_commit_transaction(trans);
6079 	}
6080 	return ret;
6081 }
6082 
6083 /*
6084  * This is somewhat expensive, as it updates the tree every time the
6085  * inode changes.  But the inode is most likely to be found in cache.
6086  * FIXME: needs more benchmarking; there is no reason other than performance
6087  * to keep or drop this code.
6088  */
6089 static int btrfs_dirty_inode(struct inode *inode)
6090 {
6091 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6092 	struct btrfs_root *root = BTRFS_I(inode)->root;
6093 	struct btrfs_trans_handle *trans;
6094 	int ret;
6095 
6096 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6097 		return 0;
6098 
6099 	trans = btrfs_join_transaction(root);
6100 	if (IS_ERR(trans))
6101 		return PTR_ERR(trans);
6102 
6103 	ret = btrfs_update_inode(trans, root, inode);
6104 	if (ret == -ENOSPC) {
6105 		/* Whoops, let's retry with a full transaction that reserves space */
6106 		btrfs_end_transaction(trans);
6107 		trans = btrfs_start_transaction(root, 1);
6108 		if (IS_ERR(trans))
6109 			return PTR_ERR(trans);
6110 
6111 		ret = btrfs_update_inode(trans, root, inode);
6112 	}
6113 	btrfs_end_transaction(trans);
6114 	if (BTRFS_I(inode)->delayed_node)
6115 		btrfs_balance_delayed_items(fs_info);
6116 
6117 	return ret;
6118 }
6119 
6120 /*
6121  * This is a copy of file_update_time.  We need it so we can return an error
6122  * on ENOSPC when updating the inode for file writes and mmap writes.
6123  */
6124 static int btrfs_update_time(struct inode *inode, struct timespec *now,
6125 			     int flags)
6126 {
6127 	struct btrfs_root *root = BTRFS_I(inode)->root;
6128 	bool dirty = flags & ~S_VERSION;
6129 
6130 	if (btrfs_root_readonly(root))
6131 		return -EROFS;
6132 
6133 	if (flags & S_VERSION)
6134 		dirty |= inode_maybe_inc_iversion(inode, dirty);
6135 	if (flags & S_CTIME)
6136 		inode->i_ctime = *now;
6137 	if (flags & S_MTIME)
6138 		inode->i_mtime = *now;
6139 	if (flags & S_ATIME)
6140 		inode->i_atime = *now;
6141 	return dirty ? btrfs_dirty_inode(inode) : 0;
6142 }
6143 
6144 /*
6145  * Find the highest existing sequence number in a directory
6146  * and then set the in-memory index_cnt variable to the
6147  * first free sequence number.
6148  */
6149 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6150 {
6151 	struct btrfs_root *root = inode->root;
6152 	struct btrfs_key key, found_key;
6153 	struct btrfs_path *path;
6154 	struct extent_buffer *leaf;
6155 	int ret;
6156 
6157 	key.objectid = btrfs_ino(inode);
6158 	key.type = BTRFS_DIR_INDEX_KEY;
6159 	key.offset = (u64)-1;
6160 
6161 	path = btrfs_alloc_path();
6162 	if (!path)
6163 		return -ENOMEM;
6164 
6165 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6166 	if (ret < 0)
6167 		goto out;
6168 	/* FIXME: we should be able to handle this */
6169 	if (ret == 0)
6170 		goto out;
6171 	ret = 0;
6172 
6173 	/*
6174 	 * MAGIC NUMBER EXPLANATION:
6175 	 * We search a directory based on f_pos, and '.' and '..' occupy
6176 	 * f_pos 0 and 1 respectively, so every other entry has to start
6177 	 * at 2.
6178 	 */
6179 	if (path->slots[0] == 0) {
6180 		inode->index_cnt = 2;
6181 		goto out;
6182 	}
6183 
6184 	path->slots[0]--;
6185 
6186 	leaf = path->nodes[0];
6187 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6188 
6189 	if (found_key.objectid != btrfs_ino(inode) ||
6190 	    found_key.type != BTRFS_DIR_INDEX_KEY) {
6191 		inode->index_cnt = 2;
6192 		goto out;
6193 	}
6194 
6195 	inode->index_cnt = found_key.offset + 1;
6196 out:
6197 	btrfs_free_path(path);
6198 	return ret;
6199 }
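
/*
 * Example: if the highest existing DIR_INDEX key in a directory has
 * offset 57, index_cnt becomes 58 and that's the index the next new
 * entry gets; an empty directory starts at 2 because of '.' and '..'.
 */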
6200 
6201 /*
6202  * Helper to find a free sequence number in a given directory.  The current
6203  * code is very simple; later versions will do smarter things in the btree.
6204  */
6205 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6206 {
6207 	int ret = 0;
6208 
6209 	if (dir->index_cnt == (u64)-1) {
6210 		ret = btrfs_inode_delayed_dir_index_count(dir);
6211 		if (ret) {
6212 			ret = btrfs_set_inode_index_count(dir);
6213 			if (ret)
6214 				return ret;
6215 		}
6216 	}
6217 
6218 	*index = dir->index_cnt;
6219 	dir->index_cnt++;
6220 
6221 	return ret;
6222 }
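
/*
 * A rough sketch of how the creation paths below use this:
 *
 *	btrfs_set_inode_index(BTRFS_I(dir), &index);
 *	...
 *	btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
 *		       name, name_len, add_backref, index);
 *
 * so every new directory entry gets a DIR_INDEX offset strictly greater
 * than all existing ones.
 */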
6223 
6224 static int btrfs_insert_inode_locked(struct inode *inode)
6225 {
6226 	struct btrfs_iget_args args;
6227 	args.location = &BTRFS_I(inode)->location;
6228 	args.root = BTRFS_I(inode)->root;
6229 
6230 	return insert_inode_locked4(inode,
6231 		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6232 		   btrfs_find_actor, &args);
6233 }
6234 
6235 /*
6236  * Inherit flags from the parent inode.
6237  *
6238  * Currently only the compression flags and the cow flags are inherited.
6239  */
6240 static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
6241 {
6242 	unsigned int flags;
6243 
6244 	if (!dir)
6245 		return;
6246 
6247 	flags = BTRFS_I(dir)->flags;
6248 
6249 	if (flags & BTRFS_INODE_NOCOMPRESS) {
6250 		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
6251 		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
6252 	} else if (flags & BTRFS_INODE_COMPRESS) {
6253 		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
6254 		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
6255 	}
6256 
6257 	if (flags & BTRFS_INODE_NODATACOW) {
6258 		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
6259 		if (S_ISREG(inode->i_mode))
6260 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6261 	}
6262 
6263 	btrfs_update_iflags(inode);
6264 }
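
/*
 * For example, a regular file created in a directory that has
 * BTRFS_INODE_NODATACOW set (chattr +C) starts out with both NODATACOW
 * and NODATASUM, since nodatacow data is overwritten in place and
 * cannot keep checksums valid.
 */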
6265 
6266 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6267 				     struct btrfs_root *root,
6268 				     struct inode *dir,
6269 				     const char *name, int name_len,
6270 				     u64 ref_objectid, u64 objectid,
6271 				     umode_t mode, u64 *index)
6272 {
6273 	struct btrfs_fs_info *fs_info = root->fs_info;
6274 	struct inode *inode;
6275 	struct btrfs_inode_item *inode_item;
6276 	struct btrfs_key *location;
6277 	struct btrfs_path *path;
6278 	struct btrfs_inode_ref *ref;
6279 	struct btrfs_key key[2];
6280 	u32 sizes[2];
6281 	int nitems = name ? 2 : 1;
6282 	unsigned long ptr;
6283 	int ret;
6284 
6285 	path = btrfs_alloc_path();
6286 	if (!path)
6287 		return ERR_PTR(-ENOMEM);
6288 
6289 	inode = new_inode(fs_info->sb);
6290 	if (!inode) {
6291 		btrfs_free_path(path);
6292 		return ERR_PTR(-ENOMEM);
6293 	}
6294 
6295 	/*
6296 	 * For O_TMPFILE there is no name, so set the link count to 0; the
6297 	 * inode item we fill in below then carries the correct link count.
6298 	 */
6299 	if (!name)
6300 		set_nlink(inode, 0);
6301 
6302 	/*
6303 	 * we have to initialize this early, so we can reclaim the inode
6304 	 * number if we fail afterwards in this function.
6305 	 */
6306 	inode->i_ino = objectid;
6307 
6308 	if (dir && name) {
6309 		trace_btrfs_inode_request(dir);
6310 
6311 		ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6312 		if (ret) {
6313 			btrfs_free_path(path);
6314 			iput(inode);
6315 			return ERR_PTR(ret);
6316 		}
6317 	} else if (dir) {
6318 		*index = 0;
6319 	}
6320 	/*
6321 	 * index_cnt is ignored for everything but a dir;
6322 	 * btrfs_set_inode_index_count() has an explanation for the magic
6323 	 * number.
6324 	 */
6325 	BTRFS_I(inode)->index_cnt = 2;
6326 	BTRFS_I(inode)->dir_index = *index;
6327 	BTRFS_I(inode)->root = root;
6328 	BTRFS_I(inode)->generation = trans->transid;
6329 	inode->i_generation = BTRFS_I(inode)->generation;
6330 
6331 	/*
6332 	 * We could have gotten an inode number from somebody who was fsynced
6333 	 * and then removed in this same transaction, so let's just set full
6334 	 * sync since it will be a full sync anyway and this will blow away the
6335 	 * old info in the log.
6336 	 */
6337 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6338 
6339 	key[0].objectid = objectid;
6340 	key[0].type = BTRFS_INODE_ITEM_KEY;
6341 	key[0].offset = 0;
6342 
6343 	sizes[0] = sizeof(struct btrfs_inode_item);
6344 
6345 	if (name) {
6346 		/*
6347 		 * Start new inodes with an inode_ref. This is slightly more
6348 		 * efficient for small numbers of hard links since they will
6349 		 * be packed into one item. Extended refs will kick in if we
6350 		 * add more hard links than can fit in the ref item.
6351 		 */
6352 		key[1].objectid = objectid;
6353 		key[1].type = BTRFS_INODE_REF_KEY;
6354 		key[1].offset = ref_objectid;
6355 
6356 		sizes[1] = name_len + sizeof(*ref);
6357 	}
6358 
6359 	location = &BTRFS_I(inode)->location;
6360 	location->objectid = objectid;
6361 	location->offset = 0;
6362 	location->type = BTRFS_INODE_ITEM_KEY;
6363 
6364 	ret = btrfs_insert_inode_locked(inode);
6365 	if (ret < 0)
6366 		goto fail;
6367 
6368 	path->leave_spinning = 1;
6369 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6370 	if (ret != 0)
6371 		goto fail_unlock;
6372 
6373 	inode_init_owner(inode, dir, mode);
6374 	inode_set_bytes(inode, 0);
6375 
6376 	inode->i_mtime = current_time(inode);
6377 	inode->i_atime = inode->i_mtime;
6378 	inode->i_ctime = inode->i_mtime;
6379 	BTRFS_I(inode)->i_otime = inode->i_mtime;
6380 
6381 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6382 				  struct btrfs_inode_item);
6383 	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6384 			     sizeof(*inode_item));
6385 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6386 
6387 	if (name) {
6388 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6389 				     struct btrfs_inode_ref);
6390 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6391 		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6392 		ptr = (unsigned long)(ref + 1);
6393 		write_extent_buffer(path->nodes[0], name, ptr, name_len);
6394 	}
6395 
6396 	btrfs_mark_buffer_dirty(path->nodes[0]);
6397 	btrfs_free_path(path);
6398 
6399 	btrfs_inherit_iflags(inode, dir);
6400 
6401 	if (S_ISREG(mode)) {
6402 		if (btrfs_test_opt(fs_info, NODATASUM))
6403 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6404 		if (btrfs_test_opt(fs_info, NODATACOW))
6405 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6406 				BTRFS_INODE_NODATASUM;
6407 	}
6408 
6409 	inode_tree_add(inode);
6410 
6411 	trace_btrfs_inode_new(inode);
6412 	btrfs_set_inode_last_trans(trans, inode);
6413 
6414 	btrfs_update_root_times(trans, root);
6415 
6416 	ret = btrfs_inode_inherit_props(trans, inode, dir);
6417 	if (ret)
6418 		btrfs_err(fs_info,
6419 			  "error inheriting props for ino %llu (root %llu): %d",
6420 			btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6421 
6422 	return inode;
6423 
6424 fail_unlock:
6425 	unlock_new_inode(inode);
6426 fail:
6427 	if (dir && name)
6428 		BTRFS_I(dir)->index_cnt--;
6429 	btrfs_free_path(path);
6430 	iput(inode);
6431 	return ERR_PTR(ret);
6432 }
6433 
6434 static inline u8 btrfs_inode_type(struct inode *inode)
6435 {
6436 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
6437 }
6438 
6439 /*
6440  * Utility function to add 'inode' into 'parent_inode' with
6441  * a given name and a given sequence number.
6442  * If 'add_backref' is true, also insert a backref from the
6443  * inode to the parent directory.
6444  */
6445 int btrfs_add_link(struct btrfs_trans_handle *trans,
6446 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6447 		   const char *name, int name_len, int add_backref, u64 index)
6448 {
6449 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6450 	int ret = 0;
6451 	struct btrfs_key key;
6452 	struct btrfs_root *root = parent_inode->root;
6453 	u64 ino = btrfs_ino(inode);
6454 	u64 parent_ino = btrfs_ino(parent_inode);
6455 
6456 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6457 		memcpy(&key, &inode->root->root_key, sizeof(key));
6458 	} else {
6459 		key.objectid = ino;
6460 		key.type = BTRFS_INODE_ITEM_KEY;
6461 		key.offset = 0;
6462 	}
6463 
6464 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6465 		ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
6466 					 root->root_key.objectid, parent_ino,
6467 					 index, name, name_len);
6468 	} else if (add_backref) {
6469 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6470 					     parent_ino, index);
6471 	}
6472 
6473 	/* Nothing to clean up yet */
6474 	if (ret)
6475 		return ret;
6476 
6477 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
6478 				    parent_inode, &key,
6479 				    btrfs_inode_type(&inode->vfs_inode), index);
6480 	if (ret == -EEXIST || ret == -EOVERFLOW)
6481 		goto fail_dir_item;
6482 	else if (ret) {
6483 		btrfs_abort_transaction(trans, ret);
6484 		return ret;
6485 	}
6486 
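	/*
	 * Each name is accounted for twice in the directory's i_size, once
	 * for the DIR_ITEM and once for the DIR_INDEX item, hence the
	 * name_len * 2 below.
	 */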
6487 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6488 			   name_len * 2);
6489 	inode_inc_iversion(&parent_inode->vfs_inode);
6490 	parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
6491 		current_time(&parent_inode->vfs_inode);
6492 	ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
6493 	if (ret)
6494 		btrfs_abort_transaction(trans, ret);
6495 	return ret;
6496 
6497 fail_dir_item:
6498 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6499 		u64 local_index;
6500 		int err;
6501 		err = btrfs_del_root_ref(trans, fs_info, key.objectid,
6502 					 root->root_key.objectid, parent_ino,
6503 					 &local_index, name, name_len);
6504 
6505 	} else if (add_backref) {
6506 		u64 local_index;
6507 		int err;
6508 
6509 		err = btrfs_del_inode_ref(trans, root, name, name_len,
6510 					  ino, parent_ino, &local_index);
6511 	}
6512 	return ret;
6513 }
6514 
6515 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6516 			    struct btrfs_inode *dir, struct dentry *dentry,
6517 			    struct btrfs_inode *inode, int backref, u64 index)
6518 {
6519 	int err = btrfs_add_link(trans, dir, inode,
6520 				 dentry->d_name.name, dentry->d_name.len,
6521 				 backref, index);
6522 	if (err > 0)
6523 		err = -EEXIST;
6524 	return err;
6525 }
6526 
6527 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6528 			umode_t mode, dev_t rdev)
6529 {
6530 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6531 	struct btrfs_trans_handle *trans;
6532 	struct btrfs_root *root = BTRFS_I(dir)->root;
6533 	struct inode *inode = NULL;
6534 	int err;
6535 	int drop_inode = 0;
6536 	u64 objectid;
6537 	u64 index = 0;
6538 
6539 	/*
6540 	 * 2 for inode item and ref
6541 	 * 2 for dir items
6542 	 * 1 for xattr if selinux is on
6543 	 */
6544 	trans = btrfs_start_transaction(root, 5);
6545 	if (IS_ERR(trans))
6546 		return PTR_ERR(trans);
6547 
6548 	err = btrfs_find_free_ino(root, &objectid);
6549 	if (err)
6550 		goto out_unlock;
6551 
6552 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6553 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6554 			mode, &index);
6555 	if (IS_ERR(inode)) {
6556 		err = PTR_ERR(inode);
6557 		goto out_unlock;
6558 	}
6559 
6560 	/*
6561 	 * If the active LSM wants to access the inode during
6562 	 * d_instantiate it needs these. Smack checks to see
6563 	 * if the filesystem supports xattrs by looking at the
6564 	 * ops vector.
6565 	 */
6566 	inode->i_op = &btrfs_special_inode_operations;
6567 	init_special_inode(inode, inode->i_mode, rdev);
6568 
6569 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6570 	if (err)
6571 		goto out_unlock_inode;
6572 
6573 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6574 			0, index);
6575 	if (err) {
6576 		goto out_unlock_inode;
6577 	} else {
6578 		btrfs_update_inode(trans, root, inode);
6579 		unlock_new_inode(inode);
6580 		d_instantiate(dentry, inode);
6581 	}
6582 
6583 out_unlock:
6584 	btrfs_end_transaction(trans);
6585 	btrfs_btree_balance_dirty(fs_info);
6586 	if (drop_inode) {
6587 		inode_dec_link_count(inode);
6588 		iput(inode);
6589 	}
6590 	return err;
6591 
6592 out_unlock_inode:
6593 	drop_inode = 1;
6594 	unlock_new_inode(inode);
6595 	goto out_unlock;
6596 
6597 }
6598 
6599 static int btrfs_create(struct inode *dir, struct dentry *dentry,
6600 			umode_t mode, bool excl)
6601 {
6602 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6603 	struct btrfs_trans_handle *trans;
6604 	struct btrfs_root *root = BTRFS_I(dir)->root;
6605 	struct inode *inode = NULL;
6606 	int drop_inode_on_err = 0;
6607 	int err;
6608 	u64 objectid;
6609 	u64 index = 0;
6610 
6611 	/*
6612 	 * 2 for inode item and ref
6613 	 * 2 for dir items
6614 	 * 1 for xattr if selinux is on
6615 	 */
6616 	trans = btrfs_start_transaction(root, 5);
6617 	if (IS_ERR(trans))
6618 		return PTR_ERR(trans);
6619 
6620 	err = btrfs_find_free_ino(root, &objectid);
6621 	if (err)
6622 		goto out_unlock;
6623 
6624 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6625 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6626 			mode, &index);
6627 	if (IS_ERR(inode)) {
6628 		err = PTR_ERR(inode);
6629 		goto out_unlock;
6630 	}
6631 	drop_inode_on_err = 1;
6632 	/*
6633 	 * If the active LSM wants to access the inode during
6634 	 * d_instantiate it needs these. Smack checks to see
6635 	 * if the filesystem supports xattrs by looking at the
6636 	 * ops vector.
6637 	 */
6638 	inode->i_fop = &btrfs_file_operations;
6639 	inode->i_op = &btrfs_file_inode_operations;
6640 	inode->i_mapping->a_ops = &btrfs_aops;
6641 
6642 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6643 	if (err)
6644 		goto out_unlock_inode;
6645 
6646 	err = btrfs_update_inode(trans, root, inode);
6647 	if (err)
6648 		goto out_unlock_inode;
6649 
6650 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6651 			0, index);
6652 	if (err)
6653 		goto out_unlock_inode;
6654 
6655 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6656 	unlock_new_inode(inode);
6657 	d_instantiate(dentry, inode);
6658 
6659 out_unlock:
6660 	btrfs_end_transaction(trans);
6661 	if (err && drop_inode_on_err) {
6662 		inode_dec_link_count(inode);
6663 		iput(inode);
6664 	}
6665 	btrfs_btree_balance_dirty(fs_info);
6666 	return err;
6667 
6668 out_unlock_inode:
6669 	unlock_new_inode(inode);
6670 	goto out_unlock;
6671 
6672 }
6673 
6674 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6675 		      struct dentry *dentry)
6676 {
6677 	struct btrfs_trans_handle *trans = NULL;
6678 	struct btrfs_root *root = BTRFS_I(dir)->root;
6679 	struct inode *inode = d_inode(old_dentry);
6680 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6681 	u64 index;
6682 	int err;
6683 	int drop_inode = 0;
6684 
6685 	/* Do not allow sys_link()s across subvolumes of the same device */
6686 	if (root->objectid != BTRFS_I(inode)->root->objectid)
6687 		return -EXDEV;
6688 
6689 	if (inode->i_nlink >= BTRFS_LINK_MAX)
6690 		return -EMLINK;
6691 
6692 	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6693 	if (err)
6694 		goto fail;
6695 
6696 	/*
6697 	 * 2 items for inode and inode ref
6698 	 * 2 items for dir items
6699 	 * 1 item for parent inode
6700 	 */
6701 	trans = btrfs_start_transaction(root, 5);
6702 	if (IS_ERR(trans)) {
6703 		err = PTR_ERR(trans);
6704 		trans = NULL;
6705 		goto fail;
6706 	}
6707 
6708 	/* There are several dir indexes for this inode, clear the cache. */
6709 	BTRFS_I(inode)->dir_index = 0ULL;
6710 	inc_nlink(inode);
6711 	inode_inc_iversion(inode);
6712 	inode->i_ctime = current_time(inode);
6713 	ihold(inode);
6714 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6715 
6716 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6717 			1, index);
6718 
6719 	if (err) {
6720 		drop_inode = 1;
6721 	} else {
6722 		struct dentry *parent = dentry->d_parent;
6723 		err = btrfs_update_inode(trans, root, inode);
6724 		if (err)
6725 			goto fail;
6726 		if (inode->i_nlink == 1) {
6727 			/*
6728 			 * If new hard link count is 1, it's a file created
6729 			 * with open(2) O_TMPFILE flag.
6730 			 */
6731 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6732 			if (err)
6733 				goto fail;
6734 		}
6735 		d_instantiate(dentry, inode);
6736 		btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
6737 	}
6738 
6739 fail:
6740 	if (trans)
6741 		btrfs_end_transaction(trans);
6742 	if (drop_inode) {
6743 		inode_dec_link_count(inode);
6744 		iput(inode);
6745 	}
6746 	btrfs_btree_balance_dirty(fs_info);
6747 	return err;
6748 }
6749 
6750 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6751 {
6752 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6753 	struct inode *inode = NULL;
6754 	struct btrfs_trans_handle *trans;
6755 	struct btrfs_root *root = BTRFS_I(dir)->root;
6756 	int err = 0;
6757 	int drop_on_err = 0;
6758 	u64 objectid = 0;
6759 	u64 index = 0;
6760 
6761 	/*
6762 	 * 2 items for inode and ref
6763 	 * 2 items for dir items
6764 	 * 1 for xattr if selinux is on
6765 	 */
6766 	trans = btrfs_start_transaction(root, 5);
6767 	if (IS_ERR(trans))
6768 		return PTR_ERR(trans);
6769 
6770 	err = btrfs_find_free_ino(root, &objectid);
6771 	if (err)
6772 		goto out_fail;
6773 
6774 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6775 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6776 			S_IFDIR | mode, &index);
6777 	if (IS_ERR(inode)) {
6778 		err = PTR_ERR(inode);
6779 		goto out_fail;
6780 	}
6781 
6782 	drop_on_err = 1;
6783 	/* these must be set before we unlock the inode */
6784 	inode->i_op = &btrfs_dir_inode_operations;
6785 	inode->i_fop = &btrfs_dir_file_operations;
6786 
6787 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6788 	if (err)
6789 		goto out_fail_inode;
6790 
6791 	btrfs_i_size_write(BTRFS_I(inode), 0);
6792 	err = btrfs_update_inode(trans, root, inode);
6793 	if (err)
6794 		goto out_fail_inode;
6795 
6796 	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6797 			dentry->d_name.name,
6798 			dentry->d_name.len, 0, index);
6799 	if (err)
6800 		goto out_fail_inode;
6801 
6802 	d_instantiate(dentry, inode);
6803 	/*
6804 	 * mkdir is special.  We're unlocking after we call d_instantiate
6805 	 * to avoid a race with nfsd calling d_instantiate.
6806 	 */
6807 	unlock_new_inode(inode);
6808 	drop_on_err = 0;
6809 
6810 out_fail:
6811 	btrfs_end_transaction(trans);
6812 	if (drop_on_err) {
6813 		inode_dec_link_count(inode);
6814 		iput(inode);
6815 	}
6816 	btrfs_btree_balance_dirty(fs_info);
6817 	return err;
6818 
6819 out_fail_inode:
6820 	unlock_new_inode(inode);
6821 	goto out_fail;
6822 }
6823 
6824 static noinline int uncompress_inline(struct btrfs_path *path,
6825 				      struct page *page,
6826 				      size_t pg_offset, u64 extent_offset,
6827 				      struct btrfs_file_extent_item *item)
6828 {
6829 	int ret;
6830 	struct extent_buffer *leaf = path->nodes[0];
6831 	char *tmp;
6832 	size_t max_size;
6833 	unsigned long inline_size;
6834 	unsigned long ptr;
6835 	int compress_type;
6836 
6837 	WARN_ON(pg_offset != 0);
6838 	compress_type = btrfs_file_extent_compression(leaf, item);
6839 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6840 	inline_size = btrfs_file_extent_inline_item_len(leaf,
6841 					btrfs_item_nr(path->slots[0]));
6842 	tmp = kmalloc(inline_size, GFP_NOFS);
6843 	if (!tmp)
6844 		return -ENOMEM;
6845 	ptr = btrfs_file_extent_inline_start(item);
6846 
6847 	read_extent_buffer(leaf, tmp, ptr, inline_size);
6848 
6849 	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6850 	ret = btrfs_decompress(compress_type, tmp, page,
6851 			       extent_offset, inline_size, max_size);
6852 
6853 	/*
6854 	 * The decompression code contains a memset to fill in any space between the end
6855 	 * of the uncompressed data and the end of max_size in case the decompressed
6856 	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6857 	 * the end of an inline extent and the beginning of the next block, so we
6858 	 * cover that region here.
6859 	 */
6860 
6861 	if (max_size + pg_offset < PAGE_SIZE) {
6862 		char *map = kmap(page);
6863 		memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
6864 		kunmap(page);
6865 	}
6866 	kfree(tmp);
6867 	return ret;
6868 }
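
/*
 * Inline extents hold ram_bytes of (possibly compressed) file data
 * directly in the leaf and only ever describe the start of a small
 * file, which is why pg_offset is expected to be 0 above and the data
 * is decompressed straight into the first page.
 */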
6869 
6870 /*
6871  * A bit scary: this maps a logical file offset to extents on disk.  The
6872  * ugly parts come from merging extents from the disk with the in-ram
6873  * representation.  This gets more complex because of the data=ordered code,
6874  * where the in-ram extents might be locked pending data=ordered completion.
6875  *
6876  * This also copies inline extents directly into the page.
6877  */
6878 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6879 				    struct page *page, size_t pg_offset,
6880 				    u64 start, u64 len, int create)
6882 {
6883 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6884 	int ret;
6885 	int err = 0;
6886 	u64 extent_start = 0;
6887 	u64 extent_end = 0;
6888 	u64 objectid = btrfs_ino(inode);
6889 	u32 found_type;
6890 	struct btrfs_path *path = NULL;
6891 	struct btrfs_root *root = inode->root;
6892 	struct btrfs_file_extent_item *item;
6893 	struct extent_buffer *leaf;
6894 	struct btrfs_key found_key;
6895 	struct extent_map *em = NULL;
6896 	struct extent_map_tree *em_tree = &inode->extent_tree;
6897 	struct extent_io_tree *io_tree = &inode->io_tree;
6898 	const bool new_inline = !page || create;
6899 
6900 	read_lock(&em_tree->lock);
6901 	em = lookup_extent_mapping(em_tree, start, len);
6902 	if (em)
6903 		em->bdev = fs_info->fs_devices->latest_bdev;
6904 	read_unlock(&em_tree->lock);
6905 
6906 	if (em) {
6907 		if (em->start > start || em->start + em->len <= start)
6908 			free_extent_map(em);
6909 		else if (em->block_start == EXTENT_MAP_INLINE && page)
6910 			free_extent_map(em);
6911 		else
6912 			goto out;
6913 	}
6914 	em = alloc_extent_map();
6915 	if (!em) {
6916 		err = -ENOMEM;
6917 		goto out;
6918 	}
6919 	em->bdev = fs_info->fs_devices->latest_bdev;
6920 	em->start = EXTENT_MAP_HOLE;
6921 	em->orig_start = EXTENT_MAP_HOLE;
6922 	em->len = (u64)-1;
6923 	em->block_len = (u64)-1;
6924 
6925 	if (!path) {
6926 		path = btrfs_alloc_path();
6927 		if (!path) {
6928 			err = -ENOMEM;
6929 			goto out;
6930 		}
6931 		/*
6932 		 * Chances are we'll be called again, so go ahead and do
6933 		 * readahead
6934 		 */
6935 		path->reada = READA_FORWARD;
6936 	}
6937 
6938 	ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6939 	if (ret < 0) {
6940 		err = ret;
6941 		goto out;
6942 	}
6943 
6944 	if (ret != 0) {
6945 		if (path->slots[0] == 0)
6946 			goto not_found;
6947 		path->slots[0]--;
6948 	}
6949 
6950 	leaf = path->nodes[0];
6951 	item = btrfs_item_ptr(leaf, path->slots[0],
6952 			      struct btrfs_file_extent_item);
6953 	/* are we inside the extent that was found? */
6954 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6955 	found_type = found_key.type;
6956 	if (found_key.objectid != objectid ||
6957 	    found_type != BTRFS_EXTENT_DATA_KEY) {
6958 		/*
6959 		 * If we backup past the first extent we want to move forward
6960 		 * and see if there is an extent in front of us, otherwise we'll
6961 		 * say there is a hole for our whole search range which can
6962 		 * cause problems.
6963 		 */
6964 		extent_end = start;
6965 		goto next;
6966 	}
6967 
6968 	found_type = btrfs_file_extent_type(leaf, item);
6969 	extent_start = found_key.offset;
6970 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6971 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6972 		extent_end = extent_start +
6973 		       btrfs_file_extent_num_bytes(leaf, item);
6974 
6975 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6976 						       extent_start);
6977 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6978 		size_t size;
6979 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6980 		extent_end = ALIGN(extent_start + size,
6981 				   fs_info->sectorsize);
6982 
6983 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6984 						      path->slots[0],
6985 						      extent_start);
6986 	}
6987 next:
6988 	if (start >= extent_end) {
6989 		path->slots[0]++;
6990 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6991 			ret = btrfs_next_leaf(root, path);
6992 			if (ret < 0) {
6993 				err = ret;
6994 				goto out;
6995 			}
6996 			if (ret > 0)
6997 				goto not_found;
6998 			leaf = path->nodes[0];
6999 		}
7000 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7001 		if (found_key.objectid != objectid ||
7002 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
7003 			goto not_found;
7004 		if (start + len <= found_key.offset)
7005 			goto not_found;
7006 		if (start > found_key.offset)
7007 			goto next;
7008 		em->start = start;
7009 		em->orig_start = start;
7010 		em->len = found_key.offset - start;
7011 		goto not_found_em;
7012 	}
7013 
7014 	btrfs_extent_item_to_extent_map(inode, path, item,
7015 			new_inline, em);
7016 
7017 	if (found_type == BTRFS_FILE_EXTENT_REG ||
7018 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7019 		goto insert;
7020 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
7021 		unsigned long ptr;
7022 		char *map;
7023 		size_t size;
7024 		size_t extent_offset;
7025 		size_t copy_size;
7026 
7027 		if (new_inline)
7028 			goto out;
7029 
7030 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
7031 		extent_offset = page_offset(page) + pg_offset - extent_start;
7032 		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
7033 				  size - extent_offset);
7034 		em->start = extent_start + extent_offset;
7035 		em->len = ALIGN(copy_size, fs_info->sectorsize);
7036 		em->orig_block_len = em->len;
7037 		em->orig_start = em->start;
7038 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
7039 		if (!PageUptodate(page)) {
7040 			if (btrfs_file_extent_compression(leaf, item) !=
7041 			    BTRFS_COMPRESS_NONE) {
7042 				ret = uncompress_inline(path, page, pg_offset,
7043 							extent_offset, item);
7044 				if (ret) {
7045 					err = ret;
7046 					goto out;
7047 				}
7048 			} else {
7049 				map = kmap(page);
7050 				read_extent_buffer(leaf, map + pg_offset, ptr,
7051 						   copy_size);
7052 				if (pg_offset + copy_size < PAGE_SIZE) {
7053 					memset(map + pg_offset + copy_size, 0,
7054 					       PAGE_SIZE - pg_offset -
7055 					       copy_size);
7056 				}
7057 				kunmap(page);
7058 			}
7059 			flush_dcache_page(page);
7060 		}
7061 		set_extent_uptodate(io_tree, em->start,
7062 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
7063 		goto insert;
7064 	}
7065 not_found:
7066 	em->start = start;
7067 	em->orig_start = start;
7068 	em->len = len;
7069 not_found_em:
7070 	em->block_start = EXTENT_MAP_HOLE;
7071 insert:
7072 	btrfs_release_path(path);
7073 	if (em->start > start || extent_map_end(em) <= start) {
7074 		btrfs_err(fs_info,
7075 			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
7076 			  em->start, em->len, start, len);
7077 		err = -EIO;
7078 		goto out;
7079 	}
7080 
7081 	err = 0;
7082 	write_lock(&em_tree->lock);
7083 	err = btrfs_add_extent_mapping(em_tree, &em, start, len);
7084 	write_unlock(&em_tree->lock);
7085 out:
7086 
7087 	trace_btrfs_get_extent(root, inode, em);
7088 
7089 	btrfs_free_path(path);
7090 	if (err) {
7091 		free_extent_map(em);
7092 		return ERR_PTR(err);
7093 	}
7094 	BUG_ON(!em); /* Error is always set */
7095 	return em;
7096 }
7097 
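/*
 * Fiemap variant of btrfs_get_extent().  A plain hole or prealloc extent
 * may hide delalloc bytes that are not on disk yet, so after the regular
 * lookup we scan the io tree for delalloc in the range and, if any is
 * found, hand back an extent map that describes either the delalloc
 * range itself or the part of the hole in front of it.
 */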
7098 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7099 		struct page *page,
7100 		size_t pg_offset, u64 start, u64 len,
7101 		int create)
7102 {
7103 	struct extent_map *em;
7104 	struct extent_map *hole_em = NULL;
7105 	u64 range_start = start;
7106 	u64 end;
7107 	u64 found;
7108 	u64 found_end;
7109 	int err = 0;
7110 
7111 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
7112 	if (IS_ERR(em))
7113 		return em;
7114 	/*
7115 	 * If our em maps to:
7116 	 * - a hole or
7117 	 * - a pre-alloc extent,
7118 	 * there might actually be delalloc bytes behind it.
7119 	 */
7120 	if (em->block_start != EXTENT_MAP_HOLE &&
7121 	    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7122 		return em;
7123 	else
7124 		hole_em = em;
7125 
7126 	/* check to see if we've wrapped (len == -1 or similar) */
7127 	end = start + len;
7128 	if (end < start)
7129 		end = (u64)-1;
7130 	else
7131 		end -= 1;
7132 
7133 	em = NULL;
7134 
7135 	/* ok, we didn't find anything, let's look for delalloc */
7136 	found = count_range_bits(&inode->io_tree, &range_start,
7137 				 end, len, EXTENT_DELALLOC, 1);
7138 	found_end = range_start + found;
7139 	if (found_end < range_start)
7140 		found_end = (u64)-1;
7141 
7142 	/*
7143 	 * we didn't find anything useful, return
7144 	 * the original results from get_extent()
7145 	 */
7146 	if (range_start > end || found_end <= start) {
7147 		em = hole_em;
7148 		hole_em = NULL;
7149 		goto out;
7150 	}
7151 
7152 	/* adjust the range_start to make sure it doesn't
7153 	 * go backwards from the start they passed in
7154 	 */
7155 	range_start = max(start, range_start);
7156 	found = found_end - range_start;
7157 
7158 	if (found > 0) {
7159 		u64 hole_start = start;
7160 		u64 hole_len = len;
7161 
7162 		em = alloc_extent_map();
7163 		if (!em) {
7164 			err = -ENOMEM;
7165 			goto out;
7166 		}
7167 		/*
7168 		 * when btrfs_get_extent can't find anything it
7169 		 * returns one huge hole
7170 		 *
7171 		 * make sure what it found really fits our range, and
7172 		 * adjust to make sure it is based on the start from
7173 		 * the caller
7174 		 */
7175 		if (hole_em) {
7176 			u64 calc_end = extent_map_end(hole_em);
7177 
7178 			if (calc_end <= start || (hole_em->start > end)) {
7179 				free_extent_map(hole_em);
7180 				hole_em = NULL;
7181 			} else {
7182 				hole_start = max(hole_em->start, start);
7183 				hole_len = calc_end - hole_start;
7184 			}
7185 		}
7186 		em->bdev = NULL;
7187 		if (hole_em && range_start > hole_start) {
7188 			/* our hole starts before our delalloc, so we
7189 			 * have to return just the parts of the hole
7190 			 * that go until the delalloc starts
7191 			 */
7192 			em->len = min(hole_len,
7193 				      range_start - hole_start);
7194 			em->start = hole_start;
7195 			em->orig_start = hole_start;
7196 			/*
7197 			 * don't adjust block start at all,
7198 			 * it is fixed at EXTENT_MAP_HOLE
7199 			 */
7200 			em->block_start = hole_em->block_start;
7201 			em->block_len = hole_len;
7202 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7203 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7204 		} else {
7205 			em->start = range_start;
7206 			em->len = found;
7207 			em->orig_start = range_start;
7208 			em->block_start = EXTENT_MAP_DELALLOC;
7209 			em->block_len = found;
7210 		}
7211 	} else {
7212 		return hole_em;
7213 	}
7214 out:
7215 
7216 	free_extent_map(hole_em);
7217 	if (err) {
7218 		free_extent_map(em);
7219 		return ERR_PTR(err);
7220 	}
7221 	return em;
7222 }
7223 
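/*
 * Set up the extent map (except for NOCOW writes, which keep the
 * existing mapping) and the ordered extent covering a direct IO write.
 * If adding the ordered extent fails, the new extent map is dropped
 * again and an ERR_PTR is returned.
 */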
7224 static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7225 						  const u64 start,
7226 						  const u64 len,
7227 						  const u64 orig_start,
7228 						  const u64 block_start,
7229 						  const u64 block_len,
7230 						  const u64 orig_block_len,
7231 						  const u64 ram_bytes,
7232 						  const int type)
7233 {
7234 	struct extent_map *em = NULL;
7235 	int ret;
7236 
7237 	if (type != BTRFS_ORDERED_NOCOW) {
7238 		em = create_io_em(inode, start, len, orig_start,
7239 				  block_start, block_len, orig_block_len,
7240 				  ram_bytes,
7241 				  BTRFS_COMPRESS_NONE, /* compress_type */
7242 				  type);
7243 		if (IS_ERR(em))
7244 			goto out;
7245 	}
7246 	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7247 					   len, block_len, type);
7248 	if (ret) {
7249 		if (em) {
7250 			free_extent_map(em);
7251 			btrfs_drop_extent_cache(BTRFS_I(inode), start,
7252 						start + len - 1, 0);
7253 		}
7254 		em = ERR_PTR(ret);
7255 	}
7256  out:
7257 
7258 	return em;
7259 }
7260 
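/*
 * COW case of a direct IO write: reserve a new data extent for the
 * range and set up the matching extent map and ordered extent.  The
 * reservation is released again if that fails.
 */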
7261 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7262 						  u64 start, u64 len)
7263 {
7264 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7265 	struct btrfs_root *root = BTRFS_I(inode)->root;
7266 	struct extent_map *em;
7267 	struct btrfs_key ins;
7268 	u64 alloc_hint;
7269 	int ret;
7270 
7271 	alloc_hint = get_extent_allocation_hint(inode, start, len);
7272 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7273 				   0, alloc_hint, &ins, 1, 1);
7274 	if (ret)
7275 		return ERR_PTR(ret);
7276 
7277 	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7278 				     ins.objectid, ins.offset, ins.offset,
7279 				     ins.offset, BTRFS_ORDERED_REGULAR);
7280 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7281 	if (IS_ERR(em))
7282 		btrfs_free_reserved_extent(fs_info, ins.objectid,
7283 					   ins.offset, 1);
7284 
7285 	return em;
7286 }
7287 
7288 /*
7289  * returns 1 when the nocow is safe, < 0 on error, 0 if the
7290  * block must be cow'd
7291  */
7292 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7293 			      u64 *orig_start, u64 *orig_block_len,
7294 			      u64 *ram_bytes)
7295 {
7296 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7297 	struct btrfs_path *path;
7298 	int ret;
7299 	struct extent_buffer *leaf;
7300 	struct btrfs_root *root = BTRFS_I(inode)->root;
7301 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7302 	struct btrfs_file_extent_item *fi;
7303 	struct btrfs_key key;
7304 	u64 disk_bytenr;
7305 	u64 backref_offset;
7306 	u64 extent_end;
7307 	u64 num_bytes;
7308 	int slot;
7309 	int found_type;
7310 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7311 
7312 	path = btrfs_alloc_path();
7313 	if (!path)
7314 		return -ENOMEM;
7315 
7316 	ret = btrfs_lookup_file_extent(NULL, root, path,
7317 			btrfs_ino(BTRFS_I(inode)), offset, 0);
7318 	if (ret < 0)
7319 		goto out;
7320 
7321 	slot = path->slots[0];
7322 	if (ret == 1) {
7323 		if (slot == 0) {
7324 			/* can't find the item, must cow */
7325 			ret = 0;
7326 			goto out;
7327 		}
7328 		slot--;
7329 	}
7330 	ret = 0;
7331 	leaf = path->nodes[0];
7332 	btrfs_item_key_to_cpu(leaf, &key, slot);
7333 	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7334 	    key.type != BTRFS_EXTENT_DATA_KEY) {
7335 		/* not our file or wrong item type, must cow */
7336 		goto out;
7337 	}
7338 
7339 	if (key.offset > offset) {
7340 		/* Wrong offset, must cow */
7341 		goto out;
7342 	}
7343 
7344 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7345 	found_type = btrfs_file_extent_type(leaf, fi);
7346 	if (found_type != BTRFS_FILE_EXTENT_REG &&
7347 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7348 		/* not a regular extent, must cow */
7349 		goto out;
7350 	}
7351 
7352 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7353 		goto out;
7354 
7355 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7356 	if (extent_end <= offset)
7357 		goto out;
7358 
7359 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7360 	if (disk_bytenr == 0)
7361 		goto out;
7362 
7363 	if (btrfs_file_extent_compression(leaf, fi) ||
7364 	    btrfs_file_extent_encryption(leaf, fi) ||
7365 	    btrfs_file_extent_other_encoding(leaf, fi))
7366 		goto out;
7367 
7368 	backref_offset = btrfs_file_extent_offset(leaf, fi);
7369 
7370 	if (orig_start) {
7371 		*orig_start = key.offset - backref_offset;
7372 		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7373 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7374 	}
7375 
7376 	if (btrfs_extent_readonly(fs_info, disk_bytenr))
7377 		goto out;
7378 
7379 	num_bytes = min(offset + *len, extent_end) - offset;
7380 	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7381 		u64 range_end;
7382 
7383 		range_end = round_up(offset + num_bytes,
7384 				     root->fs_info->sectorsize) - 1;
7385 		ret = test_range_bit(io_tree, offset, range_end,
7386 				     EXTENT_DELALLOC, 0, NULL);
7387 		if (ret) {
7388 			ret = -EAGAIN;
7389 			goto out;
7390 		}
7391 	}
7392 
7393 	btrfs_release_path(path);
7394 
7395 	/*
7396 	 * look for other files referencing this extent; if we
7397 	 * find any we must cow
7398 	 */
7399 
7400 	ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7401 				    key.offset - backref_offset, disk_bytenr);
7402 	if (ret) {
7403 		ret = 0;
7404 		goto out;
7405 	}
7406 
7407 	/*
7408 	 * adjust disk_bytenr and num_bytes to cover just the bytes
7409 	 * in this extent we are about to write.  If there
7410 	 * are any csums in that range we have to cow in order
7411 	 * to keep the csums correct
7412 	 */
7413 	disk_bytenr += backref_offset;
7414 	disk_bytenr += offset - key.offset;
7415 	if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7416 		goto out;
7417 	/*
7418 	 * all of the above checks have passed, so it is safe to overwrite
7419 	 * this extent without cow
7420 	 */
7421 	*len = num_bytes;
7422 	ret = 1;
7423 out:
7424 	btrfs_free_path(path);
7425 	return ret;
7426 }
7427 
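/*
 * Lockless (RCU) check for the presence of any page cache page in the
 * byte range [start, end].  Both offsets are inclusive.
 */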
7428 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7429 {
7430 	struct radix_tree_root *root = &inode->i_mapping->page_tree;
7431 	bool found = false;
7432 	void **pagep = NULL;
7433 	struct page *page = NULL;
7434 	unsigned long start_idx;
7435 	unsigned long end_idx;
7436 
7437 	start_idx = start >> PAGE_SHIFT;
7438 
7439 	/*
7440 	 * end is the last byte in the last page.  end == start is legal
7441 	 */
7442 	end_idx = end >> PAGE_SHIFT;
7443 
7444 	rcu_read_lock();
7445 
7446 	/* Most of the code in this while loop is lifted from
7447 	 * find_get_page.  It's been modified to begin searching from a
7448 	 * page and return just the first page found in that range.  If the
7449 	 * found idx is less than or equal to the end idx then we know that
7450 	 * a page exists.  If no pages are found or if those pages are
7451 	 * outside of the range then we're fine (yay!) */
7452 	while (page == NULL &&
7453 	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
7454 		page = radix_tree_deref_slot(pagep);
7455 		if (unlikely(!page))
7456 			break;
7457 
7458 		if (radix_tree_exception(page)) {
7459 			if (radix_tree_deref_retry(page)) {
7460 				page = NULL;
7461 				continue;
7462 			}
7463 			/*
7464 			 * Otherwise, shmem/tmpfs must be storing a swap entry
7465 			 * here as an exceptional entry: so return it without
7466 			 * attempting to raise page count.
7467 			 */
7468 			page = NULL;
7469 			break; /* TODO: Is this relevant for this use case? */
7470 		}
7471 
7472 		if (!page_cache_get_speculative(page)) {
7473 			page = NULL;
7474 			continue;
7475 		}
7476 
7477 		/*
7478 		 * Has the page moved?
7479 		 * This is part of the lockless pagecache protocol. See
7480 		 * include/linux/pagemap.h for details.
7481 		 */
7482 		if (unlikely(page != *pagep)) {
7483 			put_page(page);
7484 			page = NULL;
7485 		}
7486 	}
7487 
7488 	if (page) {
7489 		if (page->index <= end_idx)
7490 			found = true;
7491 		put_page(page);
7492 	}
7493 
7494 	rcu_read_unlock();
7495 	return found;
7496 }
7497 
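/*
 * Lock the extent range for direct IO and make sure it is free of
 * ordered extents and, for writes, of buffered pages.  Returns -ENOTBLK
 * (fall back to buffered IO) when waiting for an ordered extent could
 * deadlock.
 */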
7498 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7499 			      struct extent_state **cached_state, int writing)
7500 {
7501 	struct btrfs_ordered_extent *ordered;
7502 	int ret = 0;
7503 
7504 	while (1) {
7505 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7506 				 cached_state);
7507 		/*
7508 		 * We're concerned with the entire range that we're going to be
7509 		 * doing DIO to, so we need to make sure there are no ordered
7510 		 * extents in this range.
7511 		 */
7512 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7513 						     lockend - lockstart + 1);
7514 
7515 		/*
7516 		 * We need to make sure there are no buffered pages in this
7517 		 * range either; we could have raced between the invalidate in
7518 		 * generic_file_direct_write and locking the extent.  The
7519 		 * invalidate needs to happen so that reads after a write do not
7520 		 * get stale data.
7521 		 */
7522 		if (!ordered &&
7523 		    (!writing ||
7524 		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
7525 			break;
7526 
7527 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7528 				     cached_state);
7529 
7530 		if (ordered) {
7531 			/*
7532 			 * If we are doing a DIO read and the ordered extent we
7533 			 * found is for a buffered write, we cannot wait for it
7534 			 * to complete and retry, because if we do so we can
7535 			 * deadlock with concurrent buffered writes on page
7536 			 * locks. This happens only if our DIO read covers more
7537 			 * than one extent map and, at this point, it has already
7538 			 * created an ordered extent for a previous extent map
7539 			 * and locked its range in the inode's io tree, and a
7540 			 * concurrent write against that previous extent map's
7541 			 * range has started (we unlock the ranges in the io
7542 			 * tree only when the bios complete and buffered writes
7543 			 * always lock pages before attempting to lock a range
7544 			 * in the io tree).
7545 			 */
7546 			if (writing ||
7547 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7548 				btrfs_start_ordered_extent(inode, ordered, 1);
7549 			else
7550 				ret = -ENOTBLK;
7551 			btrfs_put_ordered_extent(ordered);
7552 		} else {
7553 			/*
7554 			 * We could trigger writeback for this range (and wait
7555 			 * for it to complete) and then invalidate the pages for
7556 			 * this range (through invalidate_inode_pages2_range()),
7557 			 * but that can lead us to a deadlock with a concurrent
7558 			 * call to readpages() (a buffered read or a defrag call
7559 			 * triggered a readahead) on a page lock due to an
7560 			 * ordered dio extent we created before but did not have
7561 			 * yet a corresponding bio submitted (hence it cannot
7562 			 * complete), which makes readpages() wait for that
7563 			 * ordered extent to complete while holding a lock on
7564 			 * that page.
7565 			 */
7566 			ret = -ENOTBLK;
7567 		}
7568 
7569 		if (ret)
7570 			break;
7571 
7572 		cond_resched();
7573 	}
7574 
7575 	return ret;
7576 }
7577 
7578 /* The callers of this must take lock_extent() */
7579 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7580 				       u64 orig_start, u64 block_start,
7581 				       u64 block_len, u64 orig_block_len,
7582 				       u64 ram_bytes, int compress_type,
7583 				       int type)
7584 {
7585 	struct extent_map_tree *em_tree;
7586 	struct extent_map *em;
7587 	struct btrfs_root *root = BTRFS_I(inode)->root;
7588 	int ret;
7589 
7590 	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7591 	       type == BTRFS_ORDERED_COMPRESSED ||
7592 	       type == BTRFS_ORDERED_NOCOW ||
7593 	       type == BTRFS_ORDERED_REGULAR);
7594 
7595 	em_tree = &BTRFS_I(inode)->extent_tree;
7596 	em = alloc_extent_map();
7597 	if (!em)
7598 		return ERR_PTR(-ENOMEM);
7599 
7600 	em->start = start;
7601 	em->orig_start = orig_start;
7602 	em->len = len;
7603 	em->block_len = block_len;
7604 	em->block_start = block_start;
7605 	em->bdev = root->fs_info->fs_devices->latest_bdev;
7606 	em->orig_block_len = orig_block_len;
7607 	em->ram_bytes = ram_bytes;
7608 	em->generation = -1;
7609 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7610 	if (type == BTRFS_ORDERED_PREALLOC) {
7611 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7612 	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7613 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7614 		em->compress_type = compress_type;
7615 	}
7616 
7617 	do {
7618 		btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7619 				em->start + em->len - 1, 0);
7620 		write_lock(&em_tree->lock);
7621 		ret = add_extent_mapping(em_tree, em, 1);
7622 		write_unlock(&em_tree->lock);
7623 		/*
7624 		 * The caller has taken lock_extent(), so who else could race
7625 		 * with us to add this em?
7626 		 */
7627 	} while (ret == -EEXIST);
7628 
7629 	if (ret) {
7630 		free_extent_map(em);
7631 		return ERR_PTR(ret);
7632 	}
7633 
7634 	/* em got 2 refs now, callers needs to do free_extent_map once. */
7635 	return em;
7636 }
7637 
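/*
 * get_block callback for __blockdev_direct_IO().  Maps the requested
 * file range to an extent, for writes either reusing an existing
 * NOCOW/PREALLOC extent or allocating (COWing) a new one, and fills in
 * the buffer_head with the result.
 */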
7638 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7639 				   struct buffer_head *bh_result, int create)
7640 {
7641 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7642 	struct extent_map *em;
7643 	struct extent_state *cached_state = NULL;
7644 	struct btrfs_dio_data *dio_data = NULL;
7645 	u64 start = iblock << inode->i_blkbits;
7646 	u64 lockstart, lockend;
7647 	u64 len = bh_result->b_size;
7648 	int unlock_bits = EXTENT_LOCKED;
7649 	int ret = 0;
7650 
7651 	if (create)
7652 		unlock_bits |= EXTENT_DIRTY;
7653 	else
7654 		len = min_t(u64, len, fs_info->sectorsize);
7655 
7656 	lockstart = start;
7657 	lockend = start + len - 1;
7658 
7659 	if (current->journal_info) {
7660 		/*
7661 		 * Need to pull our outstanding extents and set journal_info
7662 		 * to NULL so that anything that needs to check if there's a
7663 		 * transaction doesn't get confused.
7664 		 */
7665 		dio_data = current->journal_info;
7666 		current->journal_info = NULL;
7667 	}
7668 
7669 	/*
7670 	 * If this errors out it's because we couldn't invalidate pagecache for
7671 	 * this range and we need to fallback to buffered.
7672 	 */
7673 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7674 			       create)) {
7675 		ret = -ENOTBLK;
7676 		goto err;
7677 	}
7678 
7679 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7680 	if (IS_ERR(em)) {
7681 		ret = PTR_ERR(em);
7682 		goto unlock_err;
7683 	}
7684 
7685 	/*
7686 	 * Ok, for INLINE and COMPRESSED extents we need to fall back on
7687 	 * buffered io.  INLINE is special, and we could probably kludge it in
7688 	 * here, but it's still buffered so for safety let's just fall back to
7689 	 * the generic buffered path.
7690 	 *
7691 	 * For COMPRESSED we _have_ to read the entire extent in so we can
7692 	 * decompress it, so there will be buffering required no matter what we
7693 	 * do, so go ahead and fallback to buffered.
7694 	 *
7695 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7696 	 * to buffered IO.  Don't blame me, this is the price we pay for using
7697 	 * the generic code.
7698 	 */
7699 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7700 	    em->block_start == EXTENT_MAP_INLINE) {
7701 		free_extent_map(em);
7702 		ret = -ENOTBLK;
7703 		goto unlock_err;
7704 	}
7705 
7706 	/* Just a good old fashioned hole, return */
7707 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7708 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7709 		free_extent_map(em);
7710 		goto unlock_err;
7711 	}
7712 
7713 	/*
7714 	 * We don't allocate a new extent in the following cases
7715 	 *
7716 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7717 	 * existing extent.
7718 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
7719 	 * just use the extent.
7720 	 *
7721 	 */
7722 	if (!create) {
7723 		len = min(len, em->len - (start - em->start));
7724 		lockstart = start + len;
7725 		goto unlock;
7726 	}
7727 
7728 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7729 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7730 	     em->block_start != EXTENT_MAP_HOLE)) {
7731 		int type;
7732 		u64 block_start, orig_start, orig_block_len, ram_bytes;
7733 
7734 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7735 			type = BTRFS_ORDERED_PREALLOC;
7736 		else
7737 			type = BTRFS_ORDERED_NOCOW;
7738 		len = min(len, em->len - (start - em->start));
7739 		block_start = em->block_start + (start - em->start);
7740 
7741 		if (can_nocow_extent(inode, start, &len, &orig_start,
7742 				     &orig_block_len, &ram_bytes) == 1 &&
7743 		    btrfs_inc_nocow_writers(fs_info, block_start)) {
7744 			struct extent_map *em2;
7745 
7746 			em2 = btrfs_create_dio_extent(inode, start, len,
7747 						      orig_start, block_start,
7748 						      len, orig_block_len,
7749 						      ram_bytes, type);
7750 			btrfs_dec_nocow_writers(fs_info, block_start);
7751 			if (type == BTRFS_ORDERED_PREALLOC) {
7752 				free_extent_map(em);
7753 				em = em2;
7754 			}
7755 			if (em2 && IS_ERR(em2)) {
7756 				ret = PTR_ERR(em2);
7757 				goto unlock_err;
7758 			}
7759 			/*
7760 			 * For an inode marked NODATACOW or an extent marked
7761 			 * PREALLOC, use the existing or preallocated extent, so
7762 			 * we do not need to adjust btrfs_space_info's bytes_may_use.
7763 			 */
7764 			btrfs_free_reserved_data_space_noquota(inode,
7765 					start, len);
7766 			goto unlock;
7767 		}
7768 	}
7769 
7770 	/*
7771 	 * this will cow the extent; reset the len in case we changed
7772 	 * it above
7773 	 */
7774 	len = bh_result->b_size;
7775 	free_extent_map(em);
7776 	em = btrfs_new_extent_direct(inode, start, len);
7777 	if (IS_ERR(em)) {
7778 		ret = PTR_ERR(em);
7779 		goto unlock_err;
7780 	}
7781 	len = min(len, em->len - (start - em->start));
7782 unlock:
7783 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7784 		inode->i_blkbits;
7785 	bh_result->b_size = len;
7786 	bh_result->b_bdev = em->bdev;
7787 	set_buffer_mapped(bh_result);
7788 	if (create) {
7789 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7790 			set_buffer_new(bh_result);
7791 
7792 		/*
7793 		 * Need to update the i_size under the extent lock so buffered
7794 		 * readers will get the updated i_size when we unlock.
7795 		 */
7796 		if (!dio_data->overwrite && start + len > i_size_read(inode))
7797 			i_size_write(inode, start + len);
7798 
7799 		WARN_ON(dio_data->reserve < len);
7800 		dio_data->reserve -= len;
7801 		dio_data->unsubmitted_oe_range_end = start + len;
7802 		current->journal_info = dio_data;
7803 	}
7804 
7805 	/*
7806 	 * In the case of write we need to clear and unlock the entire range;
7807 	 * in the case of read we need to unlock only the end area that we
7808 	 * aren't using if there is any left over space.
7809 	 */
7810 	if (lockstart < lockend) {
7811 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7812 				 lockend, unlock_bits, 1, 0,
7813 				 &cached_state);
7814 	} else {
7815 		free_extent_state(cached_state);
7816 	}
7817 
7818 	free_extent_map(em);
7819 
7820 	return 0;
7821 
7822 unlock_err:
7823 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7824 			 unlock_bits, 1, 0, &cached_state);
7825 err:
7826 	if (dio_data)
7827 		current->journal_info = dio_data;
7828 	return ret;
7829 }
7830 
7831 static inline blk_status_t submit_dio_repair_bio(struct inode *inode,
7832 						 struct bio *bio,
7833 						 int mirror_num)
7834 {
7835 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7836 	blk_status_t ret;
7837 
7838 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7839 
7840 	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7841 	if (ret)
7842 		return ret;
7843 
7844 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7845 
7846 	return ret;
7847 }
7848 
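/*
 * Decide whether a failed DIO read can be repaired from another mirror.
 * Advances failrec->this_mirror to the next candidate and returns 1 if
 * a retry makes sense, 0 if not (single copy or all mirrors tried).
 */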
7849 static int btrfs_check_dio_repairable(struct inode *inode,
7850 				      struct bio *failed_bio,
7851 				      struct io_failure_record *failrec,
7852 				      int failed_mirror)
7853 {
7854 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7855 	int num_copies;
7856 
7857 	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7858 	if (num_copies == 1) {
7859 		/*
7860 		 * we only have a single copy of the data, so don't bother with
7861 		 * all the retry and error correction code that follows. No
7862 		 * matter what the error is, it is very likely to persist.
7863 		 */
7864 		btrfs_debug(fs_info,
7865 			"Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7866 			num_copies, failrec->this_mirror, failed_mirror);
7867 		return 0;
7868 	}
7869 
7870 	failrec->failed_mirror = failed_mirror;
7871 	failrec->this_mirror++;
7872 	if (failrec->this_mirror == failed_mirror)
7873 		failrec->this_mirror++;
7874 
7875 	if (failrec->this_mirror > num_copies) {
7876 		btrfs_debug(fs_info,
7877 			"Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7878 			num_copies, failrec->this_mirror, failed_mirror);
7879 		return 0;
7880 	}
7881 
7882 	return 1;
7883 }
7884 
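/*
 * Build and submit a repair bio for a failed part of a DIO read,
 * directed at the next mirror that has not been tried yet.
 */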
7885 static blk_status_t dio_read_error(struct inode *inode, struct bio *failed_bio,
7886 				   struct page *page, unsigned int pgoff,
7887 				   u64 start, u64 end, int failed_mirror,
7888 				   bio_end_io_t *repair_endio, void *repair_arg)
7889 {
7890 	struct io_failure_record *failrec;
7891 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7892 	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
7893 	struct bio *bio;
7894 	int isector;
7895 	unsigned int read_mode = 0;
7896 	int segs;
7897 	int ret;
7898 	blk_status_t status;
7899 	struct bio_vec bvec;
7900 
7901 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
7902 
7903 	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7904 	if (ret)
7905 		return errno_to_blk_status(ret);
7906 
7907 	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7908 					 failed_mirror);
7909 	if (!ret) {
7910 		free_io_failure(failure_tree, io_tree, failrec);
7911 		return BLK_STS_IOERR;
7912 	}
7913 
7914 	segs = bio_segments(failed_bio);
7915 	bio_get_first_bvec(failed_bio, &bvec);
7916 	if (segs > 1 ||
7917 	    (bvec.bv_len > btrfs_inode_sectorsize(inode)))
7918 		read_mode |= REQ_FAILFAST_DEV;
7919 
7920 	isector = start - btrfs_io_bio(failed_bio)->logical;
7921 	isector >>= inode->i_sb->s_blocksize_bits;
7922 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
7923 				pgoff, isector, repair_endio, repair_arg);
7924 	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
7925 
7926 	btrfs_debug(BTRFS_I(inode)->root->fs_info,
7927 		    "repair DIO read error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d",
7928 		    read_mode, failrec->this_mirror, failrec->in_validation);
7929 
7930 	status = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
7931 	if (status) {
7932 		free_io_failure(failure_tree, io_tree, failrec);
7933 		bio_put(bio);
7934 	}
7935 
7936 	return status;
7937 }
7938 
7939 struct btrfs_retry_complete {
7940 	struct completion done;
7941 	struct inode *inode;
7942 	u64 start;
7943 	int uptodate;
7944 };
7945 
7946 static void btrfs_retry_endio_nocsum(struct bio *bio)
7947 {
7948 	struct btrfs_retry_complete *done = bio->bi_private;
7949 	struct inode *inode = done->inode;
7950 	struct bio_vec *bvec;
7951 	struct extent_io_tree *io_tree, *failure_tree;
7952 	int i;
7953 
7954 	if (bio->bi_status)
7955 		goto end;
7956 
7957 	ASSERT(bio->bi_vcnt == 1);
7958 	io_tree = &BTRFS_I(inode)->io_tree;
7959 	failure_tree = &BTRFS_I(inode)->io_failure_tree;
7960 	ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(inode));
7961 
7962 	done->uptodate = 1;
7963 	ASSERT(!bio_flagged(bio, BIO_CLONED));
7964 	bio_for_each_segment_all(bvec, bio, i)
7965 		clean_io_failure(BTRFS_I(inode)->root->fs_info, failure_tree,
7966 				 io_tree, done->start, bvec->bv_page,
7967 				 btrfs_ino(BTRFS_I(inode)), 0);
7968 end:
7969 	complete(&done->done);
7970 	bio_put(bio);
7971 }
7972 
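/*
 * Retry path for DIO reads on nodatasum inodes: resubmit each sector of
 * the failed bio, switching mirrors until it reads back successfully,
 * and wait synchronously for every repair bio.
 */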
7973 static blk_status_t __btrfs_correct_data_nocsum(struct inode *inode,
7974 						struct btrfs_io_bio *io_bio)
7975 {
7976 	struct btrfs_fs_info *fs_info;
7977 	struct bio_vec bvec;
7978 	struct bvec_iter iter;
7979 	struct btrfs_retry_complete done;
7980 	u64 start;
7981 	unsigned int pgoff;
7982 	u32 sectorsize;
7983 	int nr_sectors;
7984 	blk_status_t ret;
7985 	blk_status_t err = BLK_STS_OK;
7986 
7987 	fs_info = BTRFS_I(inode)->root->fs_info;
7988 	sectorsize = fs_info->sectorsize;
7989 
7990 	start = io_bio->logical;
7991 	done.inode = inode;
7992 	io_bio->bio.bi_iter = io_bio->iter;
7993 
7994 	bio_for_each_segment(bvec, &io_bio->bio, iter) {
7995 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
7996 		pgoff = bvec.bv_offset;
7997 
7998 next_block_or_try_again:
7999 		done.uptodate = 0;
8000 		done.start = start;
8001 		init_completion(&done.done);
8002 
8003 		ret = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8004 				pgoff, start, start + sectorsize - 1,
8005 				io_bio->mirror_num,
8006 				btrfs_retry_endio_nocsum, &done);
8007 		if (ret) {
8008 			err = ret;
8009 			goto next;
8010 		}
8011 
8012 		wait_for_completion_io(&done.done);
8013 
8014 		if (!done.uptodate) {
8015 			/* We might have another mirror, so try again */
8016 			goto next_block_or_try_again;
8017 		}
8018 
8019 next:
8020 		start += sectorsize;
8021 
8022 		nr_sectors--;
8023 		if (nr_sectors) {
8024 			pgoff += sectorsize;
8025 			ASSERT(pgoff < PAGE_SIZE);
8026 			goto next_block_or_try_again;
8027 		}
8028 	}
8029 
8030 	return err;
8031 }
8032 
8033 static void btrfs_retry_endio(struct bio *bio)
8034 {
8035 	struct btrfs_retry_complete *done = bio->bi_private;
8036 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8037 	struct extent_io_tree *io_tree, *failure_tree;
8038 	struct inode *inode = done->inode;
8039 	struct bio_vec *bvec;
8040 	int uptodate;
8041 	int ret;
8042 	int i;
8043 
8044 	if (bio->bi_status)
8045 		goto end;
8046 
8047 	uptodate = 1;
8048 
8049 	ASSERT(bio->bi_vcnt == 1);
8050 	ASSERT(bio_first_bvec_all(bio)->bv_len == btrfs_inode_sectorsize(done->inode));
8051 
8052 	io_tree = &BTRFS_I(inode)->io_tree;
8053 	failure_tree = &BTRFS_I(inode)->io_failure_tree;
8054 
8055 	ASSERT(!bio_flagged(bio, BIO_CLONED));
8056 	bio_for_each_segment_all(bvec, bio, i) {
8057 		ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page,
8058 					     bvec->bv_offset, done->start,
8059 					     bvec->bv_len);
8060 		if (!ret)
8061 			clean_io_failure(BTRFS_I(inode)->root->fs_info,
8062 					 failure_tree, io_tree, done->start,
8063 					 bvec->bv_page,
8064 					 btrfs_ino(BTRFS_I(inode)),
8065 					 bvec->bv_offset);
8066 		else
8067 			uptodate = 0;
8068 	}
8069 
8070 	done->uptodate = uptodate;
8071 end:
8072 	complete(&done->done);
8073 	bio_put(bio);
8074 }
8075 
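/*
 * Checksum verification for a completed DIO read: check each sector and
 * resubmit the ones that fail to other mirrors, retrying until they
 * verify or we run out of mirrors.
 */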
8076 static blk_status_t __btrfs_subio_endio_read(struct inode *inode,
8077 		struct btrfs_io_bio *io_bio, blk_status_t err)
8078 {
8079 	struct btrfs_fs_info *fs_info;
8080 	struct bio_vec bvec;
8081 	struct bvec_iter iter;
8082 	struct btrfs_retry_complete done;
8083 	u64 start;
8084 	u64 offset = 0;
8085 	u32 sectorsize;
8086 	int nr_sectors;
8087 	unsigned int pgoff;
8088 	int csum_pos;
8089 	bool uptodate = (err == 0);
8090 	int ret;
8091 	blk_status_t status;
8092 
8093 	fs_info = BTRFS_I(inode)->root->fs_info;
8094 	sectorsize = fs_info->sectorsize;
8095 
8096 	err = BLK_STS_OK;
8097 	start = io_bio->logical;
8098 	done.inode = inode;
8099 	io_bio->bio.bi_iter = io_bio->iter;
8100 
8101 	bio_for_each_segment(bvec, &io_bio->bio, iter) {
8102 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
8103 
8104 		pgoff = bvec.bv_offset;
8105 next_block:
8106 		if (uptodate) {
8107 			csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8108 			ret = __readpage_endio_check(inode, io_bio, csum_pos,
8109 					bvec.bv_page, pgoff, start, sectorsize);
8110 			if (likely(!ret))
8111 				goto next;
8112 		}
8113 try_again:
8114 		done.uptodate = 0;
8115 		done.start = start;
8116 		init_completion(&done.done);
8117 
8118 		status = dio_read_error(inode, &io_bio->bio, bvec.bv_page,
8119 					pgoff, start, start + sectorsize - 1,
8120 					io_bio->mirror_num, btrfs_retry_endio,
8121 					&done);
8122 		if (status) {
8123 			err = status;
8124 			goto next;
8125 		}
8126 
8127 		wait_for_completion_io(&done.done);
8128 
8129 		if (!done.uptodate) {
8130 			/* We might have another mirror, so try again */
8131 			goto try_again;
8132 		}
8133 next:
8134 		offset += sectorsize;
8135 		start += sectorsize;
8136 
8137 		ASSERT(nr_sectors);
8138 
8139 		nr_sectors--;
8140 		if (nr_sectors) {
8141 			pgoff += sectorsize;
8142 			ASSERT(pgoff < PAGE_SIZE);
8143 			goto next_block;
8144 		}
8145 	}
8146 
8147 	return err;
8148 }
8149 
8150 static blk_status_t btrfs_subio_endio_read(struct inode *inode,
8151 		struct btrfs_io_bio *io_bio, blk_status_t err)
8152 {
8153 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8154 
8155 	if (skip_csum) {
8156 		if (unlikely(err))
8157 			return __btrfs_correct_data_nocsum(inode, io_bio);
8158 		else
8159 			return BLK_STS_OK;
8160 	} else {
8161 		return __btrfs_subio_endio_read(inode, io_bio, err);
8162 	}
8163 }
8164 
8165 static void btrfs_endio_direct_read(struct bio *bio)
8166 {
8167 	struct btrfs_dio_private *dip = bio->bi_private;
8168 	struct inode *inode = dip->inode;
8169 	struct bio *dio_bio;
8170 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8171 	blk_status_t err = bio->bi_status;
8172 
8173 	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8174 		err = btrfs_subio_endio_read(inode, io_bio, err);
8175 
8176 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8177 		      dip->logical_offset + dip->bytes - 1);
8178 	dio_bio = dip->dio_bio;
8179 
8180 	kfree(dip);
8181 
8182 	dio_bio->bi_status = err;
8183 	dio_end_io(dio_bio);
8184 
8185 	if (io_bio->end_io)
8186 		io_bio->end_io(io_bio, blk_status_to_errno(err));
8187 	bio_put(bio);
8188 }
8189 
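/*
 * Completion handling for a DIO write range: walk all ordered extents
 * covered by [offset, offset + bytes) and queue the finish_ordered_fn
 * work for each of them, since one bio may span several ordered
 * extents.
 */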
8190 static void __endio_write_update_ordered(struct inode *inode,
8191 					 const u64 offset, const u64 bytes,
8192 					 const bool uptodate)
8193 {
8194 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8195 	struct btrfs_ordered_extent *ordered = NULL;
8196 	struct btrfs_workqueue *wq;
8197 	btrfs_work_func_t func;
8198 	u64 ordered_offset = offset;
8199 	u64 ordered_bytes = bytes;
8200 	u64 last_offset;
8201 	int ret;
8202 
8203 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
8204 		wq = fs_info->endio_freespace_worker;
8205 		func = btrfs_freespace_write_helper;
8206 	} else {
8207 		wq = fs_info->endio_write_workers;
8208 		func = btrfs_endio_write_helper;
8209 	}
8210 
8211 again:
8212 	last_offset = ordered_offset;
8213 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
8214 						   &ordered_offset,
8215 						   ordered_bytes,
8216 						   uptodate);
8217 	if (!ret)
8218 		goto out_test;
8219 
8220 	btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
8221 	btrfs_queue_work(wq, &ordered->work);
8222 out_test:
8223 	/*
8224 	 * If btrfs_dec_test_first_ordered_pending() does not find any
8225 	 * ordered extent in the range, we can exit.
8226 	 */
8227 	if (ordered_offset == last_offset)
8228 		return;
8229 	/*
8230 	 * our bio might span multiple ordered extents.  If we haven't
8231 	 * completed the accounting for the whole dio, go back and try again
8232 	 */
8233 	if (ordered_offset < offset + bytes) {
8234 		ordered_bytes = offset + bytes - ordered_offset;
8235 		ordered = NULL;
8236 		goto again;
8237 	}
8238 }
8239 
8240 static void btrfs_endio_direct_write(struct bio *bio)
8241 {
8242 	struct btrfs_dio_private *dip = bio->bi_private;
8243 	struct bio *dio_bio = dip->dio_bio;
8244 
8245 	__endio_write_update_ordered(dip->inode, dip->logical_offset,
8246 				     dip->bytes, !bio->bi_status);
8247 
8248 	kfree(dip);
8249 
8250 	dio_bio->bi_status = bio->bi_status;
8251 	dio_end_io(dio_bio);
8252 	bio_put(bio);
8253 }
8254 
8255 static blk_status_t __btrfs_submit_bio_start_direct_io(void *private_data,
8256 				    struct bio *bio, int mirror_num,
8257 				    unsigned long bio_flags, u64 offset)
8258 {
8259 	struct inode *inode = private_data;
8260 	blk_status_t ret;
8261 	ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8262 	BUG_ON(ret); /* -ENOMEM */
8263 	return 0;
8264 }
8265 
8266 static void btrfs_end_dio_bio(struct bio *bio)
8267 {
8268 	struct btrfs_dio_private *dip = bio->bi_private;
8269 	blk_status_t err = bio->bi_status;
8270 
8271 	if (err)
8272 		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8273 			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
8274 			   btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
8275 			   bio->bi_opf,
8276 			   (unsigned long long)bio->bi_iter.bi_sector,
8277 			   bio->bi_iter.bi_size, err);
8278 
8279 	if (dip->subio_endio)
8280 		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8281 
8282 	if (err) {
8283 		dip->errors = 1;
8284 
8285 		/*
8286 		 * Before the atomic variable goes to zero, we must make sure
8287 		 * dip->errors is perceived to be set.
8288 		 */
8289 		smp_mb__before_atomic();
8290 	}
8291 
8292 	/* if there are more bios still pending for this dio, just exit */
8293 	if (!atomic_dec_and_test(&dip->pending_bios))
8294 		goto out;
8295 
8296 	if (dip->errors) {
8297 		bio_io_error(dip->orig_bio);
8298 	} else {
8299 		dip->dio_bio->bi_status = BLK_STS_OK;
8300 		bio_endio(dip->orig_bio);
8301 	}
8302 out:
8303 	bio_put(bio);
8304 }
8305 
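/*
 * Look up the csums for the whole original dio bio once, when its first
 * split bio is submitted, and make every later split bio point at the
 * right offset into that csum array.
 */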
8306 static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8307 						 struct btrfs_dio_private *dip,
8308 						 struct bio *bio,
8309 						 u64 file_offset)
8310 {
8311 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8312 	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8313 	blk_status_t ret;
8314 
8315 	/*
8316 	 * We load all the csum data we need when we submit
8317 	 * the first bio to reduce the csum tree search and
8318 	 * contention.
8319 	 */
8320 	if (dip->logical_offset == file_offset) {
8321 		ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8322 						file_offset);
8323 		if (ret)
8324 			return ret;
8325 	}
8326 
8327 	if (bio == dip->orig_bio)
8328 		return 0;
8329 
8330 	file_offset -= dip->logical_offset;
8331 	file_offset >>= inode->i_sb->s_blocksize_bits;
8332 	io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8333 
8334 	return 0;
8335 }
8336 
8337 static inline blk_status_t
8338 __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, u64 file_offset,
8339 		       int async_submit)
8340 {
8341 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8342 	struct btrfs_dio_private *dip = bio->bi_private;
8343 	bool write = bio_op(bio) == REQ_OP_WRITE;
8344 	blk_status_t ret;
8345 
8346 	/* Check btrfs_submit_bio_hook() for rules about async submit. */
8347 	if (async_submit)
8348 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8349 
8350 	if (!write) {
8351 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8352 		if (ret)
8353 			goto err;
8354 	}
8355 
8356 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
8357 		goto map;
8358 
8359 	if (write && async_submit) {
8360 		ret = btrfs_wq_submit_bio(fs_info, bio, 0, 0,
8361 					  file_offset, inode,
8362 					  __btrfs_submit_bio_start_direct_io,
8363 					  __btrfs_submit_bio_done);
8364 		goto err;
8365 	} else if (write) {
8366 		/*
8367 		 * If we aren't doing async submit, calculate the csum of the
8368 		 * bio now.
8369 		 */
8370 		ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
8371 		if (ret)
8372 			goto err;
8373 	} else {
8374 		ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8375 						     file_offset);
8376 		if (ret)
8377 			goto err;
8378 	}
8379 map:
8380 	ret = btrfs_map_bio(fs_info, bio, 0, 0);
8381 err:
8382 	return ret;
8383 }
8384 
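/*
 * Split the original dio bio at the stripe boundaries reported by
 * btrfs_map_block() and submit the pieces.  Outstanding bios are
 * tracked via dip->pending_bios and the first error marks the whole
 * dip as failed.
 */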
8385 static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
8386 {
8387 	struct inode *inode = dip->inode;
8388 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8389 	struct bio *bio;
8390 	struct bio *orig_bio = dip->orig_bio;
8391 	u64 start_sector = orig_bio->bi_iter.bi_sector;
8392 	u64 file_offset = dip->logical_offset;
8393 	u64 map_length;
8394 	int async_submit = 0;
8395 	u64 submit_len;
8396 	int clone_offset = 0;
8397 	int clone_len;
8398 	int ret;
8399 	blk_status_t status;
8400 
8401 	map_length = orig_bio->bi_iter.bi_size;
8402 	submit_len = map_length;
8403 	ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8404 			      &map_length, NULL, 0);
8405 	if (ret)
8406 		return -EIO;
8407 
8408 	if (map_length >= submit_len) {
8409 		bio = orig_bio;
8410 		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8411 		goto submit;
8412 	}
8413 
8414 	/* async crcs make it difficult to collect full stripe writes. */
8415 	if (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8416 		async_submit = 0;
8417 	else
8418 		async_submit = 1;
8419 
8420 	/* bio split */
8421 	ASSERT(map_length <= INT_MAX);
8422 	atomic_inc(&dip->pending_bios);
8423 	do {
8424 		clone_len = min_t(int, submit_len, map_length);
8425 
8426 		/*
8427 		 * This will never fail as it's passing GFP_NOFS and
8428 		 * the allocation is backed by btrfs_bioset.
8429 		 */
8430 		bio = btrfs_bio_clone_partial(orig_bio, clone_offset,
8431 					      clone_len);
8432 		bio->bi_private = dip;
8433 		bio->bi_end_io = btrfs_end_dio_bio;
8434 		btrfs_io_bio(bio)->logical = file_offset;
8435 
8436 		ASSERT(submit_len >= clone_len);
8437 		submit_len -= clone_len;
8438 		if (submit_len == 0)
8439 			break;
8440 
8441 		/*
8442 		 * Increase the count before we submit the bio so we know
8443 		 * the end IO handler won't happen before we increase the
8444 		 * count. Otherwise, the dip might get freed before we're
8445 		 * done setting it up.
8446 		 */
8447 		atomic_inc(&dip->pending_bios);
8448 
8449 		status = __btrfs_submit_dio_bio(bio, inode, file_offset,
8450 						async_submit);
8451 		if (status) {
8452 			bio_put(bio);
8453 			atomic_dec(&dip->pending_bios);
8454 			goto out_err;
8455 		}
8456 
8457 		clone_offset += clone_len;
8458 		start_sector += clone_len >> 9;
8459 		file_offset += clone_len;
8460 
8461 		map_length = submit_len;
8462 		ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8463 				      start_sector << 9, &map_length, NULL, 0);
8464 		if (ret)
8465 			goto out_err;
8466 	} while (submit_len > 0);
8467 
8468 submit:
8469 	status = __btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
8470 	if (!status)
8471 		return 0;
8472 
8473 	bio_put(bio);
8474 out_err:
8475 	dip->errors = 1;
8476 	/*
8477 	 * Before the atomic variable goes to zero, we must
8478 	 * make sure dip->errors is perceived to be set.
8479 	 */
8480 	smp_mb__before_atomic();
8481 	if (atomic_dec_and_test(&dip->pending_bios))
8482 		bio_io_error(dip->orig_bio);
8483 
8484 	/* bio_end_io() will handle error, so we needn't return it */
8485 	return 0;
8486 }
8487 
8488 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8489 				loff_t file_offset)
8490 {
8491 	struct btrfs_dio_private *dip = NULL;
8492 	struct bio *bio = NULL;
8493 	struct btrfs_io_bio *io_bio;
8494 	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8495 	int ret = 0;
8496 
8497 	bio = btrfs_bio_clone(dio_bio);
8498 
8499 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
8500 	if (!dip) {
8501 		ret = -ENOMEM;
8502 		goto free_ordered;
8503 	}
8504 
8505 	dip->private = dio_bio->bi_private;
8506 	dip->inode = inode;
8507 	dip->logical_offset = file_offset;
8508 	dip->bytes = dio_bio->bi_iter.bi_size;
8509 	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8510 	bio->bi_private = dip;
8511 	dip->orig_bio = bio;
8512 	dip->dio_bio = dio_bio;
8513 	atomic_set(&dip->pending_bios, 0);
8514 	io_bio = btrfs_io_bio(bio);
8515 	io_bio->logical = file_offset;
8516 
8517 	if (write) {
8518 		bio->bi_end_io = btrfs_endio_direct_write;
8519 	} else {
8520 		bio->bi_end_io = btrfs_endio_direct_read;
8521 		dip->subio_endio = btrfs_subio_endio_read;
8522 	}
8523 
8524 	/*
8525 	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
8526 	 * even if we fail to submit a bio, because in that case we do the
8527 	 * corresponding error handling below and it must not be done a second
8528 	 * time by btrfs_direct_IO().
8529 	 */
8530 	if (write) {
8531 		struct btrfs_dio_data *dio_data = current->journal_info;
8532 
8533 		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8534 			dip->bytes;
8535 		dio_data->unsubmitted_oe_range_start =
8536 			dio_data->unsubmitted_oe_range_end;
8537 	}
8538 
8539 	ret = btrfs_submit_direct_hook(dip);
8540 	if (!ret)
8541 		return;
8542 
8543 	if (io_bio->end_io)
8544 		io_bio->end_io(io_bio, ret);
8545 
8546 free_ordered:
8547 	/*
8548 	 * If we arrived here it means we either failed to submit the dip,
8549 	 * failed to clone the dio_bio or failed to allocate the dip. If we
8550 	 * cloned the dio_bio and allocated the dip, we can just call
8551 	 * bio_endio against our io_bio so that we get proper resource
8552 	 * cleanup if we fail to submit the dip; otherwise we must do the
8553 	 * same as btrfs_endio_direct_[write|read] because we can't call these
8554 	 * callbacks - they require an allocated dip and a clone of dio_bio.
8555 	 */
8556 	if (bio && dip) {
8557 		bio_io_error(bio);
8558 		/*
8559 		 * The end io callbacks free our dip, do the final put on bio
8560 		 * and all the cleanup and final put for dio_bio (through
8561 		 * dio_end_io()).
8562 		 */
8563 		dip = NULL;
8564 		bio = NULL;
8565 	} else {
8566 		if (write)
8567 			__endio_write_update_ordered(inode,
8568 						file_offset,
8569 						dio_bio->bi_iter.bi_size,
8570 						false);
8571 		else
8572 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8573 			      file_offset + dio_bio->bi_iter.bi_size - 1);
8574 
8575 		dio_bio->bi_status = BLK_STS_IOERR;
8576 		/*
8577 		 * Releases and cleans up our dio_bio; no need for bio_put()
8578 		 * or bio_endio()/bio_io_error() against dio_bio.
8579 		 */
8580 		dio_end_io(dio_bio);
8581 	}
8582 	if (bio)
8583 		bio_put(bio);
8584 	kfree(dip);
8585 }
8586 
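/*
 * Validate a direct IO request: the file offset and the memory segments
 * must be aligned to the sector size, and reads must not contain
 * duplicate iov_base pointers (those would trigger csum errors when
 * reading back).  Returns 0 if DIO can go ahead, -EINVAL otherwise.
 */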
8587 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
8588 			       const struct iov_iter *iter, loff_t offset)
8589 {
8590 	int seg;
8591 	int i;
8592 	unsigned int blocksize_mask = fs_info->sectorsize - 1;
8593 	ssize_t retval = -EINVAL;
8594 
8595 	if (offset & blocksize_mask)
8596 		goto out;
8597 
8598 	if (iov_iter_alignment(iter) & blocksize_mask)
8599 		goto out;
8600 
8601 	/* If this is a write we don't need to check anymore */
8602 	if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8603 		return 0;
8604 	/*
8605 	 * Check to make sure we don't have duplicate iov_base's in this
8606 	 * iovec; if we do, return -EINVAL, otherwise we'd get csum
8607 	 * errors when reading back.
8608 	 */
8609 	for (seg = 0; seg < iter->nr_segs; seg++) {
8610 		for (i = seg + 1; i < iter->nr_segs; i++) {
8611 			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8612 				goto out;
8613 		}
8614 	}
8615 	retval = 0;
8616 out:
8617 	return retval;
8618 }
8619 
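/*
 * Direct IO entry point.  Flushes any outstanding async/compressed
 * dirty pages, reserves data space for writes (stashing the accounting
 * in current->journal_info), runs __blockdev_direct_IO() and on failure
 * releases leftover reservations and unsubmitted ordered extents.
 */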
8620 static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8621 {
8622 	struct file *file = iocb->ki_filp;
8623 	struct inode *inode = file->f_mapping->host;
8624 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8625 	struct btrfs_dio_data dio_data = { 0 };
8626 	struct extent_changeset *data_reserved = NULL;
8627 	loff_t offset = iocb->ki_pos;
8628 	size_t count = 0;
8629 	int flags = 0;
8630 	bool wakeup = true;
8631 	bool relock = false;
8632 	ssize_t ret;
8633 
8634 	if (check_direct_IO(fs_info, iter, offset))
8635 		return 0;
8636 
8637 	inode_dio_begin(inode);
8638 
8639 	/*
8640 	 * The generic stuff only does filemap_write_and_wait_range, which
8641 	 * isn't enough if we've written compressed pages to this area, so
8642 	 * we need to flush the dirty pages again to make absolutely sure
8643 	 * that any outstanding dirty pages are on disk.
8644 	 */
8645 	count = iov_iter_count(iter);
8646 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8647 		     &BTRFS_I(inode)->runtime_flags))
8648 		filemap_fdatawrite_range(inode->i_mapping, offset,
8649 					 offset + count - 1);
8650 
8651 	if (iov_iter_rw(iter) == WRITE) {
8652 		/*
8653 		 * If the write DIO is beyond the EOF, we need to update
8654 		 * the isize, but it is protected by i_mutex. So we cannot
8655 		 * unlock the i_mutex in this case.
8656 		 */
8657 		if (offset + count <= inode->i_size) {
8658 			dio_data.overwrite = 1;
8659 			inode_unlock(inode);
8660 			relock = true;
8661 		} else if (iocb->ki_flags & IOCB_NOWAIT) {
8662 			ret = -EAGAIN;
8663 			goto out;
8664 		}
8665 		ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
8666 						   offset, count);
8667 		if (ret)
8668 			goto out;
8669 
8670 		/*
8671 		 * We need to know how many extents we reserved so that we can
8672 		 * do the accounting properly if we go over the number we
8673 		 * originally calculated.  Abuse current->journal_info for this.
8674 		 */
8675 		dio_data.reserve = round_up(count,
8676 					    fs_info->sectorsize);
8677 		dio_data.unsubmitted_oe_range_start = (u64)offset;
8678 		dio_data.unsubmitted_oe_range_end = (u64)offset;
8679 		current->journal_info = &dio_data;
8680 		down_read(&BTRFS_I(inode)->dio_sem);
8681 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8682 				     &BTRFS_I(inode)->runtime_flags)) {
8683 		inode_dio_end(inode);
8684 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
8685 		wakeup = false;
8686 	}
8687 
8688 	ret = __blockdev_direct_IO(iocb, inode,
8689 				   fs_info->fs_devices->latest_bdev,
8690 				   iter, btrfs_get_blocks_direct, NULL,
8691 				   btrfs_submit_direct, flags);
8692 	if (iov_iter_rw(iter) == WRITE) {
8693 		up_read(&BTRFS_I(inode)->dio_sem);
8694 		current->journal_info = NULL;
8695 		if (ret < 0 && ret != -EIOCBQUEUED) {
8696 			if (dio_data.reserve)
8697 				btrfs_delalloc_release_space(inode, data_reserved,
8698 					offset, dio_data.reserve);
8699 			/*
8700 			 * On error we might have left some ordered extents
8701 			 * without submitting corresponding bios for them, so
8702 			 * clean them up to avoid other tasks getting them
8703 			 * and waiting for them to complete forever.
8704 			 */
8705 			if (dio_data.unsubmitted_oe_range_start <
8706 			    dio_data.unsubmitted_oe_range_end)
8707 				__endio_write_update_ordered(inode,
8708 					dio_data.unsubmitted_oe_range_start,
8709 					dio_data.unsubmitted_oe_range_end -
8710 					dio_data.unsubmitted_oe_range_start,
8711 					false);
8712 		} else if (ret >= 0 && (size_t)ret < count)
8713 			btrfs_delalloc_release_space(inode, data_reserved,
8714 					offset, count - (size_t)ret);
8715 		btrfs_delalloc_release_extents(BTRFS_I(inode), count);
8716 	}
8717 out:
8718 	if (wakeup)
8719 		inode_dio_end(inode);
8720 	if (relock)
8721 		inode_lock(inode);
8722 
8723 	extent_changeset_free(data_reserved);
8724 	return ret;
8725 }
8726 
8727 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
8728 
8729 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8730 		__u64 start, __u64 len)
8731 {
8732 	int	ret;
8733 
8734 	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8735 	if (ret)
8736 		return ret;
8737 
8738 	return extent_fiemap(inode, fieinfo, start, len);
8739 }
8740 
8741 int btrfs_readpage(struct file *file, struct page *page)
8742 {
8743 	struct extent_io_tree *tree;
8744 	tree = &BTRFS_I(page->mapping->host)->io_tree;
8745 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8746 }
8747 
8748 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8749 {
8750 	struct inode *inode = page->mapping->host;
8751 	int ret;
8752 
8753 	if (current->flags & PF_MEMALLOC) {
8754 		redirty_page_for_writepage(wbc, page);
8755 		unlock_page(page);
8756 		return 0;
8757 	}
8758 
8759 	/*
8760 	 * If we are under memory pressure we will call this directly from the
8761 	 * VM, we need to make sure we have the inode referenced for the ordered
8762 	 * extent.  If not just return like we didn't do anything.
8763 	 */
8764 	if (!igrab(inode)) {
8765 		redirty_page_for_writepage(wbc, page);
8766 		return AOP_WRITEPAGE_ACTIVATE;
8767 	}
8768 	ret = extent_write_full_page(page, wbc);
8769 	btrfs_add_delayed_iput(inode);
8770 	return ret;
8771 }
8772 
8773 static int btrfs_writepages(struct address_space *mapping,
8774 			    struct writeback_control *wbc)
8775 {
8776 	struct extent_io_tree *tree;
8777 
8778 	tree = &BTRFS_I(mapping->host)->io_tree;
8779 	return extent_writepages(tree, mapping, wbc);
8780 }
8781 
8782 static int
8783 btrfs_readpages(struct file *file, struct address_space *mapping,
8784 		struct list_head *pages, unsigned nr_pages)
8785 {
8786 	struct extent_io_tree *tree;
8787 	tree = &BTRFS_I(mapping->host)->io_tree;
8788 	return extent_readpages(tree, mapping, pages, nr_pages);
8789 }
8790 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8791 {
8792 	struct extent_io_tree *tree;
8793 	struct extent_map_tree *map;
8794 	int ret;
8795 
8796 	tree = &BTRFS_I(page->mapping->host)->io_tree;
8797 	map = &BTRFS_I(page->mapping->host)->extent_tree;
8798 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8799 	if (ret == 1) {
8800 		ClearPagePrivate(page);
8801 		set_page_private(page, 0);
8802 		put_page(page);
8803 	}
8804 	return ret;
8805 }
8806 
8807 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8808 {
8809 	if (PageWriteback(page) || PageDirty(page))
8810 		return 0;
8811 	return __btrfs_releasepage(page, gfp_flags);
8812 }
8813 
8814 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8815 				 unsigned int length)
8816 {
8817 	struct inode *inode = page->mapping->host;
8818 	struct extent_io_tree *tree;
8819 	struct btrfs_ordered_extent *ordered;
8820 	struct extent_state *cached_state = NULL;
8821 	u64 page_start = page_offset(page);
8822 	u64 page_end = page_start + PAGE_SIZE - 1;
8823 	u64 start;
8824 	u64 end;
8825 	int inode_evicting = inode->i_state & I_FREEING;
8826 
8827 	/*
8828 	 * we have the page locked, so new writeback can't start,
8829 	 * and the dirty bit won't be cleared while we are here.
8830 	 *
8831 	 * Wait for IO on this page so that we can safely clear
8832 	 * the PagePrivate2 bit and do ordered accounting
8833 	 */
8834 	wait_on_page_writeback(page);
8835 
8836 	tree = &BTRFS_I(inode)->io_tree;
8837 	if (offset) {
8838 		btrfs_releasepage(page, GFP_NOFS);
8839 		return;
8840 	}
8841 
8842 	if (!inode_evicting)
8843 		lock_extent_bits(tree, page_start, page_end, &cached_state);
8844 again:
8845 	start = page_start;
8846 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8847 					page_end - start + 1);
8848 	if (ordered) {
8849 		end = min(page_end, ordered->file_offset + ordered->len - 1);
8850 		/*
8851 		 * IO on this page will never be started, so we need
8852 		 * to account for any ordered extents now
8853 		 */
8854 		if (!inode_evicting)
8855 			clear_extent_bit(tree, start, end,
8856 					 EXTENT_DIRTY | EXTENT_DELALLOC |
8857 					 EXTENT_DELALLOC_NEW |
8858 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8859 					 EXTENT_DEFRAG, 1, 0, &cached_state);
8860 		/*
8861 		 * whoever cleared the private bit is responsible
8862 		 * for the finish_ordered_io
8863 		 */
8864 		if (TestClearPagePrivate2(page)) {
8865 			struct btrfs_ordered_inode_tree *tree;
8866 			u64 new_len;
8867 
8868 			tree = &BTRFS_I(inode)->ordered_tree;
8869 
8870 			spin_lock_irq(&tree->lock);
8871 			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8872 			new_len = start - ordered->file_offset;
8873 			if (new_len < ordered->truncated_len)
8874 				ordered->truncated_len = new_len;
8875 			spin_unlock_irq(&tree->lock);
8876 
8877 			if (btrfs_dec_test_ordered_pending(inode, &ordered,
8878 							   start,
8879 							   end - start + 1, 1))
8880 				btrfs_finish_ordered_io(ordered);
8881 		}
8882 		btrfs_put_ordered_extent(ordered);
8883 		if (!inode_evicting) {
8884 			cached_state = NULL;
8885 			lock_extent_bits(tree, start, end,
8886 					 &cached_state);
8887 		}
8888 
8889 		start = end + 1;
8890 		if (start < page_end)
8891 			goto again;
8892 	}
8893 
8894 	/*
8895 	 * Qgroup reserved space handler
8896 	 * Page here will be either
8897 	 * 1) Already written to disk
8898 	 *    In this case, its reserved space is released from data rsv map
8899 	 *    and will be freed by delayed_ref handler finally.
8900 	 *    So even if we call qgroup_free_data(), it won't decrease the
8901 	 *    reserved space.
8902 	 * 2) Not written to disk
8903 	 *    This means the reserved space should be freed here. However,
8904 	 *    if a truncate invalidates the page (by clearing PageDirty)
8905 	 *    and the page is accounted for while allocating the extent
8906 	 *    in btrfs_check_data_free_space(), we let the delayed_ref
8907 	 *    handler free the entire extent.
8908 	 */
8909 	if (PageDirty(page))
8910 		btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
8911 	if (!inode_evicting) {
8912 		clear_extent_bit(tree, page_start, page_end,
8913 				 EXTENT_LOCKED | EXTENT_DIRTY |
8914 				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
8915 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
8916 				 &cached_state);
8917 
8918 		__btrfs_releasepage(page, GFP_NOFS);
8919 	}
8920 
8921 	ClearPageChecked(page);
8922 	if (PagePrivate(page)) {
8923 		ClearPagePrivate(page);
8924 		set_page_private(page, 0);
8925 		put_page(page);
8926 	}
8927 }
8928 
8929 /*
8930  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
8931  * called from a page fault handler when a page is first dirtied. Hence we must
8932  * be careful to check for EOF conditions here. We set the page up correctly
8933  * for a written page which means we get ENOSPC checking when writing into
8934  * holes and correct delalloc and unwritten extent mapping on filesystems that
8935  * support these features.
8936  *
8937  * We are not allowed to take the i_mutex here so we have to play games to
8938  * protect against truncate races as the page could now be beyond EOF.  Because
8939  * vmtruncate() writes the inode size before removing pages, once we have the
8940  * page lock we can determine safely if the page is beyond EOF. If it is not
8941  * beyond EOF, then the page is guaranteed safe against truncation until we
8942  * unlock the page.
8943  */
8944 int btrfs_page_mkwrite(struct vm_fault *vmf)
8945 {
8946 	struct page *page = vmf->page;
8947 	struct inode *inode = file_inode(vmf->vma->vm_file);
8948 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8949 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
8950 	struct btrfs_ordered_extent *ordered;
8951 	struct extent_state *cached_state = NULL;
8952 	struct extent_changeset *data_reserved = NULL;
8953 	char *kaddr;
8954 	unsigned long zero_start;
8955 	loff_t size;
8956 	int ret;
8957 	int reserved = 0;
8958 	u64 reserved_space;
8959 	u64 page_start;
8960 	u64 page_end;
8961 	u64 end;
8962 
8963 	reserved_space = PAGE_SIZE;
8964 
8965 	sb_start_pagefault(inode->i_sb);
8966 	page_start = page_offset(page);
8967 	page_end = page_start + PAGE_SIZE - 1;
8968 	end = page_end;
8969 
8970 	/*
8971 	 * Reserving delalloc space after obtaining the page lock can lead to
8972 	 * deadlock. For example, if a dirty page is locked by this function
8973 	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
8974 	 * dirty page write out, then the btrfs_writepage() function could
8975 	 * end up waiting indefinitely to get a lock on the page currently
8976 	 * being processed by btrfs_page_mkwrite() function.
8977 	 */
8978 	ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
8979 					   reserved_space);
8980 	if (!ret) {
8981 		ret = file_update_time(vmf->vma->vm_file);
8982 		reserved = 1;
8983 	}
8984 	if (ret) {
8985 		if (ret == -ENOMEM)
8986 			ret = VM_FAULT_OOM;
8987 		else /* -ENOSPC, -EIO, etc */
8988 			ret = VM_FAULT_SIGBUS;
8989 		if (reserved)
8990 			goto out;
8991 		goto out_noreserve;
8992 	}
8993 
8994 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
8995 again:
8996 	lock_page(page);
8997 	size = i_size_read(inode);
8998 
8999 	if ((page->mapping != inode->i_mapping) ||
9000 	    (page_start >= size)) {
9001 		/* page got truncated out from underneath us */
9002 		goto out_unlock;
9003 	}
9004 	wait_on_page_writeback(page);
9005 
9006 	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
9007 	set_page_extent_mapped(page);
9008 
9009 	/*
9010 	 * we can't set the delalloc bits if there are pending ordered
9011 	 * extents.  Drop our locks and wait for them to finish
9012 	 */
9013 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
9014 			PAGE_SIZE);
9015 	if (ordered) {
9016 		unlock_extent_cached(io_tree, page_start, page_end,
9017 				     &cached_state);
9018 		unlock_page(page);
9019 		btrfs_start_ordered_extent(inode, ordered, 1);
9020 		btrfs_put_ordered_extent(ordered);
9021 		goto again;
9022 	}
9023 
9024 	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
9025 		reserved_space = round_up(size - page_start,
9026 					  fs_info->sectorsize);
9027 		if (reserved_space < PAGE_SIZE) {
9028 			end = page_start + reserved_space - 1;
9029 			btrfs_delalloc_release_space(inode, data_reserved,
9030 					page_start, PAGE_SIZE - reserved_space);
9031 		}
9032 	}
9033 
9034 	/*
9035 	 * page_mkwrite gets called when the page is first dirtied after it's
9036 	 * faulted in, but write(2) could also dirty a page and set delalloc
9037 	 * bits. In that case, for space accounting reasons, we still need to
9038 	 * clear any delalloc bits within this page range, since we had to
9039 	 * reserve data and metadata space before lock_page() (see above).
9040 	 */
9041 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9042 			  EXTENT_DIRTY | EXTENT_DELALLOC |
9043 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9044 			  0, 0, &cached_state);
9045 
9046 	ret = btrfs_set_extent_delalloc(inode, page_start, end, 0,
9047 					&cached_state, 0);
9048 	if (ret) {
9049 		unlock_extent_cached(io_tree, page_start, page_end,
9050 				     &cached_state);
9051 		ret = VM_FAULT_SIGBUS;
9052 		goto out_unlock;
9053 	}
9054 	ret = 0;
9055 
9056 	/* page is wholly or partially inside EOF */
9057 	if (page_start + PAGE_SIZE > size)
9058 		zero_start = size & ~PAGE_MASK;
9059 	else
9060 		zero_start = PAGE_SIZE;
9061 
9062 	if (zero_start != PAGE_SIZE) {
9063 		kaddr = kmap(page);
9064 		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
9065 		flush_dcache_page(page);
9066 		kunmap(page);
9067 	}
9068 	ClearPageChecked(page);
9069 	set_page_dirty(page);
9070 	SetPageUptodate(page);
9071 
9072 	BTRFS_I(inode)->last_trans = fs_info->generation;
9073 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9074 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
9075 
9076 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
9077 
9078 out_unlock:
9079 	if (!ret) {
9080 		btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9081 		sb_end_pagefault(inode->i_sb);
9082 		extent_changeset_free(data_reserved);
9083 		return VM_FAULT_LOCKED;
9084 	}
9085 	unlock_page(page);
9086 out:
9087 	btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
9088 	btrfs_delalloc_release_space(inode, data_reserved, page_start,
9089 				     reserved_space);
9090 out_noreserve:
9091 	sb_end_pagefault(inode->i_sb);
9092 	extent_changeset_free(data_reserved);
9093 	return ret;
9094 }
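
/*
 * Editor's note (illustrative, not part of the original file): the fault
 * path above is what services the first store into a shared file mapping
 * on btrfs.  A minimal userspace sketch that exercises it; the file name
 * is hypothetical and error handling is abbreviated.
 */
#if 0	/* example only -- compile as a standalone userspace program */
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ftruncate(fd, 4096) < 0)
		return 1;
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
		       fd, 0);
	if (p == MAP_FAILED)
		return 1;
	/*
	 * The first store faults the page writable; on btrfs that lands in
	 * btrfs_page_mkwrite(), which reserves delalloc space *before*
	 * taking the page lock, for the deadlock reason documented above.
	 */
	memcpy(p, "hello", 5);
	munmap(p, 4096);
	close(fd);
	return 0;
}
#endif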
9095 
9096 static int btrfs_truncate(struct inode *inode)
9097 {
9098 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9099 	struct btrfs_root *root = BTRFS_I(inode)->root;
9100 	struct btrfs_block_rsv *rsv;
9101 	int ret = 0;
9102 	int err = 0;
9103 	struct btrfs_trans_handle *trans;
9104 	u64 mask = fs_info->sectorsize - 1;
9105 	u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
9106 
9107 	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9108 				       (u64)-1);
9109 	if (ret)
9110 		return ret;
9111 
9112 	/*
9113 	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
9114 	 * 3 things going on here
9115 	 *
9116 	 * 1) We need to reserve space for our orphan item and the space to
9117 	 * delete our orphan item.  Lord knows we don't want to have a dangling
9118 	 * orphan item because we didn't reserve space to remove it.
9119 	 *
9120 	 * 2) We need to reserve space to update our inode.
9121 	 *
9122 	 * 3) We need to have something to cache all the space that is going to
9123 	 * be freed up by the truncate operation, but also have some slack
9124 	 * space reserved in case it uses space during the truncate (thank you
9125 	 * very much snapshotting).
9126 	 *
9127 	 * And we need these to all be separate.  The fact is we can use a lot of
9128 	 * space doing the truncate, and we have no earthly idea how much space
9129 	 * we will use, so we need the truncate reservation to be separate so it
9130 	 * doesn't end up using space reserved for updating the inode or
9131 	 * removing the orphan item.  We also need to be able to stop the
9132 	 * transaction and start a new one, which means we need to be able to
9133 	 * update the inode several times, and we have no way of knowing how
9134 	 * many times that will be, so we can't just reserve 1 item for the
9135 	 * entirety of the operation, so that has to be done separately as well.
9136 	 * Then there is the orphan item, which does indeed need to be held on
9137 	 * to for the whole operation, and we need nobody to touch this reserved
9138 	 * space except the orphan code.
9139 	 *
9140 	 * So that leaves us with
9141 	 *
9142 	 * 1) root->orphan_block_rsv - for the orphan deletion.
9143 	 * 2) rsv - for the truncate reservation, which we will steal from the
9144 	 * transaction reservation.
9145 	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
9146 	 * updating the inode.
9147 	 */
9148 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
9149 	if (!rsv)
9150 		return -ENOMEM;
9151 	rsv->size = min_size;
9152 	rsv->failfast = 1;
9153 
9154 	/*
9155 	 * 1 for the truncate slack space
9156 	 * 1 for updating the inode.
9157 	 */
9158 	trans = btrfs_start_transaction(root, 2);
9159 	if (IS_ERR(trans)) {
9160 		err = PTR_ERR(trans);
9161 		goto out;
9162 	}
9163 
9164 	/* Migrate the slack space for the truncate to our reserve */
9165 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9166 				      min_size, 0);
9167 	BUG_ON(ret);
9168 
9169 	/*
9170 	 * So if we truncate and then write and fsync we normally would just
9171 	 * write the extents that changed, which is a problem if we need to
9172 	 * first truncate that entire inode.  So set this flag so we write out
9173 	 * all of the extents in the inode to the sync log so we're completely
9174 	 * safe.
9175 	 */
9176 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9177 	trans->block_rsv = rsv;
9178 
9179 	while (1) {
9180 		ret = btrfs_truncate_inode_items(trans, root, inode,
9181 						 inode->i_size,
9182 						 BTRFS_EXTENT_DATA_KEY);
9183 		trans->block_rsv = &fs_info->trans_block_rsv;
9184 		if (ret != -ENOSPC && ret != -EAGAIN) {
9185 			err = ret;
9186 			break;
9187 		}
9188 
9189 		ret = btrfs_update_inode(trans, root, inode);
9190 		if (ret) {
9191 			err = ret;
9192 			break;
9193 		}
9194 
9195 		btrfs_end_transaction(trans);
9196 		btrfs_btree_balance_dirty(fs_info);
9197 
9198 		trans = btrfs_start_transaction(root, 2);
9199 		if (IS_ERR(trans)) {
9200 			ret = err = PTR_ERR(trans);
9201 			trans = NULL;
9202 			break;
9203 		}
9204 
9205 		btrfs_block_rsv_release(fs_info, rsv, -1);
9206 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9207 					      rsv, min_size, 0);
9208 		BUG_ON(ret);	/* shouldn't happen */
9209 		trans->block_rsv = rsv;
9210 	}
9211 
9212 	/*
9213 	 * We can't call btrfs_truncate_block inside a trans handle as we could
9214 	 * deadlock with freeze, if we got NEED_TRUNCATE_BLOCK then we know
9215 	 * we've truncated everything except the last little bit, and can do
9216 	 * btrfs_truncate_block and then update the disk_i_size.
9217 	 */
9218 	if (ret == NEED_TRUNCATE_BLOCK) {
9219 		btrfs_end_transaction(trans);
9220 		btrfs_btree_balance_dirty(fs_info);
9221 
9222 		ret = btrfs_truncate_block(inode, inode->i_size, 0, 0);
9223 		if (ret)
9224 			goto out;
9225 		trans = btrfs_start_transaction(root, 1);
9226 		if (IS_ERR(trans)) {
9227 			ret = PTR_ERR(trans);
9228 			goto out;
9229 		}
9230 		btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
9231 	}
9232 
9233 	if (ret == 0 && inode->i_nlink > 0) {
9234 		trans->block_rsv = root->orphan_block_rsv;
9235 		ret = btrfs_orphan_del(trans, BTRFS_I(inode));
9236 		if (ret)
9237 			err = ret;
9238 	}
9239 
9240 	if (trans) {
9241 		trans->block_rsv = &fs_info->trans_block_rsv;
9242 		ret = btrfs_update_inode(trans, root, inode);
9243 		if (ret && !err)
9244 			err = ret;
9245 
9246 		ret = btrfs_end_transaction(trans);
9247 		btrfs_btree_balance_dirty(fs_info);
9248 	}
9249 out:
9250 	btrfs_free_block_rsv(fs_info, rsv);
9251 
9252 	if (ret && !err)
9253 		err = ret;
9254 
9255 	return err;
9256 }
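
/*
 * Editor's note (illustrative, not part of the original file): the
 * BTRFS_INODE_NEEDS_FULL_SYNC flag set in btrfs_truncate() means a
 * following fsync logs the whole inode rather than only changed extents.
 * A userspace sketch of that sequence; the path is hypothetical and a
 * shrinking truncate is assumed, so btrfs_setsize() takes this path.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("bigfile", O_RDWR);

	if (fd < 0)
		return 1;
	if (ftruncate(fd, 4096) < 0)	/* shrink: reaches btrfs_truncate() */
		return 1;
	fsync(fd);			/* forced to a full inode sync */
	close(fd);
	return 0;
}
#endif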
9257 
9258 /*
9259  * create a new subvolume directory/inode (helper for the ioctl).
9260  */
9261 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9262 			     struct btrfs_root *new_root,
9263 			     struct btrfs_root *parent_root,
9264 			     u64 new_dirid)
9265 {
9266 	struct inode *inode;
9267 	int err;
9268 	u64 index = 0;
9269 
9270 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9271 				new_dirid, new_dirid,
9272 				S_IFDIR | (~current_umask() & S_IRWXUGO),
9273 				&index);
9274 	if (IS_ERR(inode))
9275 		return PTR_ERR(inode);
9276 	inode->i_op = &btrfs_dir_inode_operations;
9277 	inode->i_fop = &btrfs_dir_file_operations;
9278 
9279 	set_nlink(inode, 1);
9280 	btrfs_i_size_write(BTRFS_I(inode), 0);
9281 	unlock_new_inode(inode);
9282 
9283 	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9284 	if (err)
9285 		btrfs_err(new_root->fs_info,
9286 			  "error inheriting subvolume %llu properties: %d",
9287 			  new_root->root_key.objectid, err);
9288 
9289 	err = btrfs_update_inode(trans, new_root, inode);
9290 
9291 	iput(inode);
9292 	return err;
9293 }
9294 
9295 struct inode *btrfs_alloc_inode(struct super_block *sb)
9296 {
9297 	struct btrfs_fs_info *fs_info = btrfs_sb(sb);
9298 	struct btrfs_inode *ei;
9299 	struct inode *inode;
9300 
9301 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_KERNEL);
9302 	if (!ei)
9303 		return NULL;
9304 
9305 	ei->root = NULL;
9306 	ei->generation = 0;
9307 	ei->last_trans = 0;
9308 	ei->last_sub_trans = 0;
9309 	ei->logged_trans = 0;
9310 	ei->delalloc_bytes = 0;
9311 	ei->new_delalloc_bytes = 0;
9312 	ei->defrag_bytes = 0;
9313 	ei->disk_i_size = 0;
9314 	ei->flags = 0;
9315 	ei->csum_bytes = 0;
9316 	ei->index_cnt = (u64)-1;
9317 	ei->dir_index = 0;
9318 	ei->last_unlink_trans = 0;
9319 	ei->last_log_commit = 0;
9320 	ei->delayed_iput_count = 0;
9321 
9322 	spin_lock_init(&ei->lock);
9323 	ei->outstanding_extents = 0;
9324 	if (sb->s_magic != BTRFS_TEST_MAGIC)
9325 		btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
9326 					      BTRFS_BLOCK_RSV_DELALLOC);
9327 	ei->runtime_flags = 0;
9328 	ei->prop_compress = BTRFS_COMPRESS_NONE;
9329 	ei->defrag_compress = BTRFS_COMPRESS_NONE;
9330 
9331 	ei->delayed_node = NULL;
9332 
9333 	ei->i_otime.tv_sec = 0;
9334 	ei->i_otime.tv_nsec = 0;
9335 
9336 	inode = &ei->vfs_inode;
9337 	extent_map_tree_init(&ei->extent_tree);
9338 	extent_io_tree_init(&ei->io_tree, inode);
9339 	extent_io_tree_init(&ei->io_failure_tree, inode);
9340 	ei->io_tree.track_uptodate = 1;
9341 	ei->io_failure_tree.track_uptodate = 1;
9342 	atomic_set(&ei->sync_writers, 0);
9343 	mutex_init(&ei->log_mutex);
9344 	mutex_init(&ei->delalloc_mutex);
9345 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9346 	INIT_LIST_HEAD(&ei->delalloc_inodes);
9347 	INIT_LIST_HEAD(&ei->delayed_iput);
9348 	RB_CLEAR_NODE(&ei->rb_node);
9349 	init_rwsem(&ei->dio_sem);
9350 
9351 	return inode;
9352 }
9353 
9354 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9355 void btrfs_test_destroy_inode(struct inode *inode)
9356 {
9357 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9358 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9359 }
9360 #endif
9361 
9362 static void btrfs_i_callback(struct rcu_head *head)
9363 {
9364 	struct inode *inode = container_of(head, struct inode, i_rcu);
9365 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9366 }
9367 
9368 void btrfs_destroy_inode(struct inode *inode)
9369 {
9370 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9371 	struct btrfs_ordered_extent *ordered;
9372 	struct btrfs_root *root = BTRFS_I(inode)->root;
9373 
9374 	WARN_ON(!hlist_empty(&inode->i_dentry));
9375 	WARN_ON(inode->i_data.nrpages);
9376 	WARN_ON(BTRFS_I(inode)->block_rsv.reserved);
9377 	WARN_ON(BTRFS_I(inode)->block_rsv.size);
9378 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
9379 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9380 	WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9381 	WARN_ON(BTRFS_I(inode)->csum_bytes);
9382 	WARN_ON(BTRFS_I(inode)->defrag_bytes);
9383 
9384 	/*
9385 	 * This can happen when we create an inode, but somebody else also
9386 	 * created the same inode and we need to destroy the one we already
9387 	 * created.
9388 	 */
9389 	if (!root)
9390 		goto free;
9391 
9392 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
9393 		     &BTRFS_I(inode)->runtime_flags)) {
9394 		btrfs_info(fs_info, "inode %llu still on the orphan list",
9395 			   btrfs_ino(BTRFS_I(inode)));
9396 		atomic_dec(&root->orphan_inodes);
9397 	}
9398 
9399 	while (1) {
9400 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9401 		if (!ordered)
9402 			break;
9403 		btrfs_err(fs_info,
9404 			  "found ordered extent %llu %llu on inode cleanup",
9405 			  ordered->file_offset, ordered->len);
9406 		btrfs_remove_ordered_extent(inode, ordered);
9407 		/* one put for our lookup ref, one to drop the initial ref */
9408 		btrfs_put_ordered_extent(ordered);
9409 		btrfs_put_ordered_extent(ordered);
9410 	}
9412 	btrfs_qgroup_check_reserved_leak(inode);
9413 	inode_tree_del(inode);
9414 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9415 free:
9416 	call_rcu(&inode->i_rcu, btrfs_i_callback);
9417 }
9418 
9419 int btrfs_drop_inode(struct inode *inode)
9420 {
9421 	struct btrfs_root *root = BTRFS_I(inode)->root;
9422 
9423 	if (root == NULL)
9424 		return 1;
9425 
9426 	/* the snapshot/subvolume tree is being deleted */
9427 	if (btrfs_root_refs(&root->root_item) == 0)
9428 		return 1;
9429 	else
9430 		return generic_drop_inode(inode);
9431 }
9432 
9433 static void init_once(void *foo)
9434 {
9435 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9436 
9437 	inode_init_once(&ei->vfs_inode);
9438 }
9439 
9440 void btrfs_destroy_cachep(void)
9441 {
9442 	/*
9443 	 * Make sure all delayed rcu free inodes are flushed before we
9444 	 * destroy cache.
9445 	 */
9446 	rcu_barrier();
9447 	kmem_cache_destroy(btrfs_inode_cachep);
9448 	kmem_cache_destroy(btrfs_trans_handle_cachep);
9449 	kmem_cache_destroy(btrfs_path_cachep);
9450 	kmem_cache_destroy(btrfs_free_space_cachep);
9451 }
9452 
9453 int __init btrfs_init_cachep(void)
9454 {
9455 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9456 			sizeof(struct btrfs_inode), 0,
9457 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9458 			init_once);
9459 	if (!btrfs_inode_cachep)
9460 		goto fail;
9461 
9462 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9463 			sizeof(struct btrfs_trans_handle), 0,
9464 			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9465 	if (!btrfs_trans_handle_cachep)
9466 		goto fail;
9467 
9468 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
9469 			sizeof(struct btrfs_path), 0,
9470 			SLAB_MEM_SPREAD, NULL);
9471 	if (!btrfs_path_cachep)
9472 		goto fail;
9473 
9474 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9475 			sizeof(struct btrfs_free_space), 0,
9476 			SLAB_MEM_SPREAD, NULL);
9477 	if (!btrfs_free_space_cachep)
9478 		goto fail;
9479 
9480 	return 0;
9481 fail:
9482 	btrfs_destroy_cachep();
9483 	return -ENOMEM;
9484 }
9485 
9486 static int btrfs_getattr(const struct path *path, struct kstat *stat,
9487 			 u32 request_mask, unsigned int flags)
9488 {
9489 	u64 delalloc_bytes;
9490 	struct inode *inode = d_inode(path->dentry);
9491 	u32 blocksize = inode->i_sb->s_blocksize;
9492 	u32 bi_flags = BTRFS_I(inode)->flags;
9493 
9494 	stat->result_mask |= STATX_BTIME;
9495 	stat->btime.tv_sec = BTRFS_I(inode)->i_otime.tv_sec;
9496 	stat->btime.tv_nsec = BTRFS_I(inode)->i_otime.tv_nsec;
9497 	if (bi_flags & BTRFS_INODE_APPEND)
9498 		stat->attributes |= STATX_ATTR_APPEND;
9499 	if (bi_flags & BTRFS_INODE_COMPRESS)
9500 		stat->attributes |= STATX_ATTR_COMPRESSED;
9501 	if (bi_flags & BTRFS_INODE_IMMUTABLE)
9502 		stat->attributes |= STATX_ATTR_IMMUTABLE;
9503 	if (bi_flags & BTRFS_INODE_NODUMP)
9504 		stat->attributes |= STATX_ATTR_NODUMP;
9505 
9506 	stat->attributes_mask |= (STATX_ATTR_APPEND |
9507 				  STATX_ATTR_COMPRESSED |
9508 				  STATX_ATTR_IMMUTABLE |
9509 				  STATX_ATTR_NODUMP);
9510 
9511 	generic_fillattr(inode, stat);
9512 	stat->dev = BTRFS_I(inode)->root->anon_dev;
9513 
9514 	spin_lock(&BTRFS_I(inode)->lock);
9515 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9516 	spin_unlock(&BTRFS_I(inode)->lock);
9517 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9518 			ALIGN(delalloc_bytes, blocksize)) >> 9;
9519 	return 0;
9520 }
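
/*
 * Editor's note (illustrative, not part of the original file): the btime
 * and attribute bits filled in above are visible to userspace via
 * statx(2) (glibc >= 2.28 assumed for the wrapper).  Note st_blocks also
 * folds in not-yet-written delalloc bytes, per the computation above.
 * The file name is hypothetical.
 */
#if 0	/* example only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct statx stx;

	if (statx(AT_FDCWD, "somefile", 0,
		  STATX_BASIC_STATS | STATX_BTIME, &stx) < 0)
		return 1;
	if (stx.stx_mask & STATX_BTIME)
		printf("created: %lld\n", (long long)stx.stx_btime.tv_sec);
	if (stx.stx_attributes & STATX_ATTR_COMPRESSED)
		printf("compressed\n");
	printf("blocks: %llu\n", (unsigned long long)stx.stx_blocks);
	return 0;
}
#endif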
9521 
9522 static int btrfs_rename_exchange(struct inode *old_dir,
9523 			      struct dentry *old_dentry,
9524 			      struct inode *new_dir,
9525 			      struct dentry *new_dentry)
9526 {
9527 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9528 	struct btrfs_trans_handle *trans;
9529 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9530 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9531 	struct inode *new_inode = new_dentry->d_inode;
9532 	struct inode *old_inode = old_dentry->d_inode;
9533 	struct timespec ctime = current_time(old_inode);
9534 	struct dentry *parent;
9535 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9536 	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9537 	u64 old_idx = 0;
9538 	u64 new_idx = 0;
9539 	u64 root_objectid;
9540 	int ret;
9541 	bool root_log_pinned = false;
9542 	bool dest_log_pinned = false;
9543 
9544 	/* we only allow rename subvolume link between subvolumes */
9545 	/* cross-subvolume renames are only allowed for subvolume links */
9546 		return -EXDEV;
9547 
9548 	/* close the race window with snapshot create/destroy ioctl */
9549 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9550 		down_read(&fs_info->subvol_sem);
9551 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9552 		down_read(&fs_info->subvol_sem);
9553 
9554 	/*
9555 	 * We want to reserve the absolute worst case amount of items.  So if
9556 	 * both inodes are subvols and we need to unlink them then that would
9557 	 * require 4 item modifications, but if they are both normal inodes it
9558 	 * would require 5 item modifications, so we'll assume they are normal
9559 	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9560 	 * should cover the worst case number of items we'll modify.
9561 	 */
9562 	trans = btrfs_start_transaction(root, 12);
9563 	if (IS_ERR(trans)) {
9564 		ret = PTR_ERR(trans);
9565 		goto out_notrans;
9566 	}
9567 
9568 	/*
9569 	 * We need to find a free sequence number both in the source and
9570 	 * in the destination directory for the exchange.
9571 	 */
9572 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9573 	if (ret)
9574 		goto out_fail;
9575 	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9576 	if (ret)
9577 		goto out_fail;
9578 
9579 	BTRFS_I(old_inode)->dir_index = 0ULL;
9580 	BTRFS_I(new_inode)->dir_index = 0ULL;
9581 
9582 	/* Reference for the source. */
9583 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9584 		/* force full log commit if subvolume involved. */
9585 		btrfs_set_log_full_commit(fs_info, trans);
9586 	} else {
9587 		btrfs_pin_log_trans(root);
9588 		root_log_pinned = true;
9589 		ret = btrfs_insert_inode_ref(trans, dest,
9590 					     new_dentry->d_name.name,
9591 					     new_dentry->d_name.len,
9592 					     old_ino,
9593 					     btrfs_ino(BTRFS_I(new_dir)),
9594 					     old_idx);
9595 		if (ret)
9596 			goto out_fail;
9597 	}
9598 
9599 	/* And now for the dest. */
9600 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9601 		/* force full log commit if subvolume involved. */
9602 		btrfs_set_log_full_commit(fs_info, trans);
9603 	} else {
9604 		btrfs_pin_log_trans(dest);
9605 		dest_log_pinned = true;
9606 		ret = btrfs_insert_inode_ref(trans, root,
9607 					     old_dentry->d_name.name,
9608 					     old_dentry->d_name.len,
9609 					     new_ino,
9610 					     btrfs_ino(BTRFS_I(old_dir)),
9611 					     new_idx);
9612 		if (ret)
9613 			goto out_fail;
9614 	}
9615 
9616 	/* Update inode version and ctime/mtime. */
9617 	inode_inc_iversion(old_dir);
9618 	inode_inc_iversion(new_dir);
9619 	inode_inc_iversion(old_inode);
9620 	inode_inc_iversion(new_inode);
9621 	old_dir->i_ctime = old_dir->i_mtime = ctime;
9622 	new_dir->i_ctime = new_dir->i_mtime = ctime;
9623 	old_inode->i_ctime = ctime;
9624 	new_inode->i_ctime = ctime;
9625 
9626 	if (old_dentry->d_parent != new_dentry->d_parent) {
9627 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9628 				BTRFS_I(old_inode), 1);
9629 		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
9630 				BTRFS_I(new_inode), 1);
9631 	}
9632 
9633 	/* src is a subvolume */
9634 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9635 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9636 		ret = btrfs_unlink_subvol(trans, root, old_dir,
9637 					  root_objectid,
9638 					  old_dentry->d_name.name,
9639 					  old_dentry->d_name.len);
9640 	} else { /* src is an inode */
9641 		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9642 					   BTRFS_I(old_dentry->d_inode),
9643 					   old_dentry->d_name.name,
9644 					   old_dentry->d_name.len);
9645 		if (!ret)
9646 			ret = btrfs_update_inode(trans, root, old_inode);
9647 	}
9648 	if (ret) {
9649 		btrfs_abort_transaction(trans, ret);
9650 		goto out_fail;
9651 	}
9652 
9653 	/* dest is a subvolume */
9654 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9655 		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9656 		ret = btrfs_unlink_subvol(trans, dest, new_dir,
9657 					  root_objectid,
9658 					  new_dentry->d_name.name,
9659 					  new_dentry->d_name.len);
9660 	} else { /* dest is an inode */
9661 		ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9662 					   BTRFS_I(new_dentry->d_inode),
9663 					   new_dentry->d_name.name,
9664 					   new_dentry->d_name.len);
9665 		if (!ret)
9666 			ret = btrfs_update_inode(trans, dest, new_inode);
9667 	}
9668 	if (ret) {
9669 		btrfs_abort_transaction(trans, ret);
9670 		goto out_fail;
9671 	}
9672 
9673 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9674 			     new_dentry->d_name.name,
9675 			     new_dentry->d_name.len, 0, old_idx);
9676 	if (ret) {
9677 		btrfs_abort_transaction(trans, ret);
9678 		goto out_fail;
9679 	}
9680 
9681 	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9682 			     old_dentry->d_name.name,
9683 			     old_dentry->d_name.len, 0, new_idx);
9684 	if (ret) {
9685 		btrfs_abort_transaction(trans, ret);
9686 		goto out_fail;
9687 	}
9688 
9689 	if (old_inode->i_nlink == 1)
9690 		BTRFS_I(old_inode)->dir_index = old_idx;
9691 	if (new_inode->i_nlink == 1)
9692 		BTRFS_I(new_inode)->dir_index = new_idx;
9693 
9694 	if (root_log_pinned) {
9695 		parent = new_dentry->d_parent;
9696 		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9697 				parent);
9698 		btrfs_end_log_trans(root);
9699 		root_log_pinned = false;
9700 	}
9701 	if (dest_log_pinned) {
9702 		parent = old_dentry->d_parent;
9703 		btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9704 				parent);
9705 		btrfs_end_log_trans(dest);
9706 		dest_log_pinned = false;
9707 	}
9708 out_fail:
9709 	/*
9710 	 * If we have pinned a log and an error happened, we unpin tasks
9711 	 * trying to sync the log and force them to fall back to a transaction
9712 	 * commit if the log currently contains any of the inodes involved in
9713 	 * this rename operation (to ensure we do not persist a log with an
9714 	 * inconsistent state for any of these inodes, which would lead to
9715 	 * inconsistencies when replayed). If the transaction was aborted, the
9716 	 * abort reason is propagated to userspace when attempting to commit
9717 	 * the transaction. If the log does not contain any of these inodes, we
9718 	 * allow the tasks to sync it.
9719 	 */
9720 	if (ret && (root_log_pinned || dest_log_pinned)) {
9721 		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9722 		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9723 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9724 		    (new_inode &&
9725 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9726 			btrfs_set_log_full_commit(fs_info, trans);
9727 
9728 		if (root_log_pinned) {
9729 			btrfs_end_log_trans(root);
9730 			root_log_pinned = false;
9731 		}
9732 		if (dest_log_pinned) {
9733 			btrfs_end_log_trans(dest);
9734 			dest_log_pinned = false;
9735 		}
9736 	}
9737 	ret = btrfs_end_transaction(trans);
9738 out_notrans:
9739 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9740 		up_read(&fs_info->subvol_sem);
9741 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9742 		up_read(&fs_info->subvol_sem);
9743 
9744 	return ret;
9745 }
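
/*
 * Editor's note (illustrative, not part of the original file):
 * btrfs_rename_exchange() backs renameat2(2) with RENAME_EXCHANGE, which
 * atomically swaps two existing names.  The sketch assumes glibc >= 2.28
 * for the wrapper; the names "a" and "b" are hypothetical.
 */
#if 0	/* example only */
#define _GNU_SOURCE
#include <fcntl.h>	/* AT_FDCWD */
#include <stdio.h>	/* renameat2(), RENAME_EXCHANGE */

int main(void)
{
	/* both names must already exist; they are swapped atomically */
	if (renameat2(AT_FDCWD, "a", AT_FDCWD, "b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}
#endif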
9746 
9747 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9748 				     struct btrfs_root *root,
9749 				     struct inode *dir,
9750 				     struct dentry *dentry)
9751 {
9752 	int ret;
9753 	struct inode *inode;
9754 	u64 objectid;
9755 	u64 index;
9756 
9757 	ret = btrfs_find_free_ino(root, &objectid);
9758 	if (ret)
9759 		return ret;
9760 
9761 	inode = btrfs_new_inode(trans, root, dir,
9762 				dentry->d_name.name,
9763 				dentry->d_name.len,
9764 				btrfs_ino(BTRFS_I(dir)),
9765 				objectid,
9766 				S_IFCHR | WHITEOUT_MODE,
9767 				&index);
9768 
9769 	if (IS_ERR(inode)) {
9770 		ret = PTR_ERR(inode);
9771 		return ret;
9772 	}
9773 
9774 	inode->i_op = &btrfs_special_inode_operations;
9775 	init_special_inode(inode, inode->i_mode,
9776 		WHITEOUT_DEV);
9777 
9778 	ret = btrfs_init_inode_security(trans, inode, dir,
9779 				&dentry->d_name);
9780 	if (ret)
9781 		goto out;
9782 
9783 	ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9784 				BTRFS_I(inode), 0, index);
9785 	if (ret)
9786 		goto out;
9787 
9788 	ret = btrfs_update_inode(trans, root, inode);
9789 out:
9790 	unlock_new_inode(inode);
9791 	if (ret)
9792 		inode_dec_link_count(inode);
9793 	iput(inode);
9794 
9795 	return ret;
9796 }
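
/*
 * Editor's note (illustrative, not part of the original file): the
 * whiteout helper above serves renameat2(2) with RENAME_WHITEOUT, used
 * chiefly by overlayfs: the source name is replaced with a 0:0 character
 * device, as the S_IFCHR | WHITEOUT_MODE inode above shows.  Requires
 * CAP_MKNOD; the paths are hypothetical.
 */
#if 0	/* example only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	/* "old" moves to "new"; a whiteout inode is left at "old" */
	if (renameat2(AT_FDCWD, "old", AT_FDCWD, "new",
		      RENAME_WHITEOUT) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}
#endif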
9797 
9798 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9799 			   struct inode *new_dir, struct dentry *new_dentry,
9800 			   unsigned int flags)
9801 {
9802 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9803 	struct btrfs_trans_handle *trans;
9804 	unsigned int trans_num_items;
9805 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9806 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9807 	struct inode *new_inode = d_inode(new_dentry);
9808 	struct inode *old_inode = d_inode(old_dentry);
9809 	u64 index = 0;
9810 	u64 root_objectid;
9811 	int ret;
9812 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9813 	bool log_pinned = false;
9814 
9815 	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9816 		return -EPERM;
9817 
9818 	/* we only allow rename subvolume link between subvolumes */
9819 	/* cross-subvolume renames are only allowed for subvolume links */
9820 		return -EXDEV;
9821 
9822 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9823 	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9824 		return -ENOTEMPTY;
9825 
9826 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9827 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9828 		return -ENOTEMPTY;
9829 
9830 
9831 	/* check for collisions, even if the name isn't there */
9832 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9833 			     new_dentry->d_name.name,
9834 			     new_dentry->d_name.len);
9835 
9836 	if (ret) {
9837 		if (ret == -EEXIST) {
9838 			/* we shouldn't get -EEXIST without a new_inode */
9839 			if (WARN_ON(!new_inode))
9840 				return ret;
9843 		} else {
9844 			/* maybe -EOVERFLOW */
9845 			return ret;
9846 		}
9847 	}
9848 	ret = 0;
9849 
9850 	/*
9851 	 * we're using rename to replace one file with another.  Start IO on it
9852 	 * now so we don't add too much work to the end of the transaction
9853 	 */
9854 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9855 		filemap_flush(old_inode->i_mapping);
9856 
9857 	/* close the race window with snapshot create/destroy ioctl */
9858 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9859 		down_read(&fs_info->subvol_sem);
9860 	/*
9861 	 * We want to reserve the absolute worst case amount of items.  So if
9862 	 * both inodes are subvols and we need to unlink them then that would
9863 	 * require 4 item modifications, but if they are both normal inodes it
9864 	 * would require 5 item modifications, so we'll assume they are normal
9865 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9866 	 * should cover the worst case number of items we'll modify.
9867 	 * If our rename has the whiteout flag, we need 5 more units for the
9868 	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9869 	 * when selinux is enabled).
9870 	 */
9871 	trans_num_items = 11;
9872 	if (flags & RENAME_WHITEOUT)
9873 		trans_num_items += 5;
9874 	trans = btrfs_start_transaction(root, trans_num_items);
9875 	if (IS_ERR(trans)) {
9876 		ret = PTR_ERR(trans);
9877 		goto out_notrans;
9878 	}
9879 
9880 	if (dest != root)
9881 		btrfs_record_root_in_trans(trans, dest);
9882 
9883 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9884 	if (ret)
9885 		goto out_fail;
9886 
9887 	BTRFS_I(old_inode)->dir_index = 0ULL;
9888 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9889 		/* force full log commit if subvolume involved. */
9890 		btrfs_set_log_full_commit(fs_info, trans);
9891 	} else {
9892 		btrfs_pin_log_trans(root);
9893 		log_pinned = true;
9894 		ret = btrfs_insert_inode_ref(trans, dest,
9895 					     new_dentry->d_name.name,
9896 					     new_dentry->d_name.len,
9897 					     old_ino,
9898 					     btrfs_ino(BTRFS_I(new_dir)), index);
9899 		if (ret)
9900 			goto out_fail;
9901 	}
9902 
9903 	inode_inc_iversion(old_dir);
9904 	inode_inc_iversion(new_dir);
9905 	inode_inc_iversion(old_inode);
9906 	old_dir->i_ctime = old_dir->i_mtime =
9907 	new_dir->i_ctime = new_dir->i_mtime =
9908 	old_inode->i_ctime = current_time(old_dir);
9909 
9910 	if (old_dentry->d_parent != new_dentry->d_parent)
9911 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9912 				BTRFS_I(old_inode), 1);
9913 
9914 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9915 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9916 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
9917 					old_dentry->d_name.name,
9918 					old_dentry->d_name.len);
9919 	} else {
9920 		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9921 					BTRFS_I(d_inode(old_dentry)),
9922 					old_dentry->d_name.name,
9923 					old_dentry->d_name.len);
9924 		if (!ret)
9925 			ret = btrfs_update_inode(trans, root, old_inode);
9926 	}
9927 	if (ret) {
9928 		btrfs_abort_transaction(trans, ret);
9929 		goto out_fail;
9930 	}
9931 
9932 	if (new_inode) {
9933 		inode_inc_iversion(new_inode);
9934 		new_inode->i_ctime = current_time(new_inode);
9935 		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9936 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9937 			root_objectid = BTRFS_I(new_inode)->location.objectid;
9938 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
9939 						root_objectid,
9940 						new_dentry->d_name.name,
9941 						new_dentry->d_name.len);
9942 			BUG_ON(new_inode->i_nlink == 0);
9943 		} else {
9944 			ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9945 						 BTRFS_I(d_inode(new_dentry)),
9946 						 new_dentry->d_name.name,
9947 						 new_dentry->d_name.len);
9948 		}
9949 		if (!ret && new_inode->i_nlink == 0)
9950 			ret = btrfs_orphan_add(trans,
9951 					BTRFS_I(d_inode(new_dentry)));
9952 		if (ret) {
9953 			btrfs_abort_transaction(trans, ret);
9954 			goto out_fail;
9955 		}
9956 	}
9957 
9958 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9959 			     new_dentry->d_name.name,
9960 			     new_dentry->d_name.len, 0, index);
9961 	if (ret) {
9962 		btrfs_abort_transaction(trans, ret);
9963 		goto out_fail;
9964 	}
9965 
9966 	if (old_inode->i_nlink == 1)
9967 		BTRFS_I(old_inode)->dir_index = index;
9968 
9969 	if (log_pinned) {
9970 		struct dentry *parent = new_dentry->d_parent;
9971 
9972 		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9973 				parent);
9974 		btrfs_end_log_trans(root);
9975 		log_pinned = false;
9976 	}
9977 
9978 	if (flags & RENAME_WHITEOUT) {
9979 		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
9980 						old_dentry);
9981 
9982 		if (ret) {
9983 			btrfs_abort_transaction(trans, ret);
9984 			goto out_fail;
9985 		}
9986 	}
9987 out_fail:
9988 	/*
9989 	 * If we have pinned the log and an error happened, we unpin tasks
9990 	 * trying to sync the log and force them to fall back to a transaction
9991 	 * commit if the log currently contains any of the inodes involved in
9992 	 * this rename operation (to ensure we do not persist a log with an
9993 	 * inconsistent state for any of these inodes, which would lead to
9994 	 * inconsistencies when replayed). If the transaction was aborted, the
9995 	 * abort reason is propagated to userspace when attempting to commit
9996 	 * the transaction. If the log does not contain any of these inodes, we
9997 	 * allow the tasks to sync it.
9998 	 */
9999 	if (ret && log_pinned) {
10000 		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
10001 		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
10002 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
10003 		    (new_inode &&
10004 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
10005 			btrfs_set_log_full_commit(fs_info, trans);
10006 
10007 		btrfs_end_log_trans(root);
10008 		log_pinned = false;
10009 	}
10010 	btrfs_end_transaction(trans);
10011 out_notrans:
10012 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
10013 		up_read(&fs_info->subvol_sem);
10014 
10015 	return ret;
10016 }
10017 
10018 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
10019 			 struct inode *new_dir, struct dentry *new_dentry,
10020 			 unsigned int flags)
10021 {
10022 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
10023 		return -EINVAL;
10024 
10025 	if (flags & RENAME_EXCHANGE)
10026 		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
10027 					  new_dentry);
10028 
10029 	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
10030 }
10031 
10032 static void btrfs_run_delalloc_work(struct btrfs_work *work)
10033 {
10034 	struct btrfs_delalloc_work *delalloc_work;
10035 	struct inode *inode;
10036 
10037 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
10038 				     work);
10039 	inode = delalloc_work->inode;
10040 	filemap_flush(inode->i_mapping);
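	/* async (compressed) writeback may leave work behind the first flush */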
10041 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
10042 				&BTRFS_I(inode)->runtime_flags))
10043 		filemap_flush(inode->i_mapping);
10044 
10045 	if (delalloc_work->delay_iput)
10046 		btrfs_add_delayed_iput(inode);
10047 	else
10048 		iput(inode);
10049 	complete(&delalloc_work->completion);
10050 }
10051 
10052 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
10053 						    int delay_iput)
10054 {
10055 	struct btrfs_delalloc_work *work;
10056 
10057 	work = kmalloc(sizeof(*work), GFP_NOFS);
10058 	if (!work)
10059 		return NULL;
10060 
10061 	init_completion(&work->completion);
10062 	INIT_LIST_HEAD(&work->list);
10063 	work->inode = inode;
10064 	work->delay_iput = delay_iput;
10065 	WARN_ON_ONCE(!inode);
10066 	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10067 			btrfs_run_delalloc_work, NULL, NULL);
10068 
10069 	return work;
10070 }
10071 
10072 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
10073 {
10074 	wait_for_completion(&work->completion);
10075 	kfree(work);
10076 }
10077 
10078 /*
10079  * Some fairly slow code that needs optimization. This walks the list
10080  * of all the inodes with pending delalloc and forces them to disk.
10081  */
10082 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
10083 				   int nr)
10084 {
10085 	struct btrfs_inode *binode;
10086 	struct inode *inode;
10087 	struct btrfs_delalloc_work *work, *next;
10088 	struct list_head works;
10089 	struct list_head splice;
10090 	int ret = 0;
10091 
10092 	INIT_LIST_HEAD(&works);
10093 	INIT_LIST_HEAD(&splice);
10094 
10095 	mutex_lock(&root->delalloc_mutex);
10096 	spin_lock(&root->delalloc_lock);
10097 	list_splice_init(&root->delalloc_inodes, &splice);
10098 	while (!list_empty(&splice)) {
10099 		binode = list_entry(splice.next, struct btrfs_inode,
10100 				    delalloc_inodes);
10101 
10102 		list_move_tail(&binode->delalloc_inodes,
10103 			       &root->delalloc_inodes);
10104 		inode = igrab(&binode->vfs_inode);
10105 		if (!inode) {
10106 			cond_resched_lock(&root->delalloc_lock);
10107 			continue;
10108 		}
10109 		spin_unlock(&root->delalloc_lock);
10110 
10111 		work = btrfs_alloc_delalloc_work(inode, delay_iput);
10112 		if (!work) {
10113 			if (delay_iput)
10114 				btrfs_add_delayed_iput(inode);
10115 			else
10116 				iput(inode);
10117 			ret = -ENOMEM;
10118 			goto out;
10119 		}
10120 		list_add_tail(&work->list, &works);
10121 		btrfs_queue_work(root->fs_info->flush_workers,
10122 				 &work->work);
10123 		ret++;
10124 		if (nr != -1 && ret >= nr)
10125 			goto out;
10126 		cond_resched();
10127 		spin_lock(&root->delalloc_lock);
10128 	}
10129 	spin_unlock(&root->delalloc_lock);
10130 
10131 out:
10132 	list_for_each_entry_safe(work, next, &works, list) {
10133 		list_del_init(&work->list);
10134 		btrfs_wait_and_free_delalloc_work(work);
10135 	}
10136 
10137 	if (!list_empty_careful(&splice)) {
10138 		spin_lock(&root->delalloc_lock);
10139 		list_splice_tail(&splice, &root->delalloc_inodes);
10140 		spin_unlock(&root->delalloc_lock);
10141 	}
10142 	mutex_unlock(&root->delalloc_mutex);
10143 	return ret;
10144 }
10145 
10146 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
10147 {
10148 	struct btrfs_fs_info *fs_info = root->fs_info;
10149 	int ret;
10150 
10151 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10152 		return -EROFS;
10153 
10154 	ret = __start_delalloc_inodes(root, delay_iput, -1);
10155 	if (ret > 0)
10156 		ret = 0;
10157 	return ret;
10158 }
10159 
10160 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
10161 			       int nr)
10162 {
10163 	struct btrfs_root *root;
10164 	struct list_head splice;
10165 	int ret;
10166 
10167 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10168 		return -EROFS;
10169 
10170 	INIT_LIST_HEAD(&splice);
10171 
10172 	mutex_lock(&fs_info->delalloc_root_mutex);
10173 	spin_lock(&fs_info->delalloc_root_lock);
10174 	list_splice_init(&fs_info->delalloc_roots, &splice);
10175 	while (!list_empty(&splice) && nr) {
10176 		root = list_first_entry(&splice, struct btrfs_root,
10177 					delalloc_root);
10178 		root = btrfs_grab_fs_root(root);
10179 		BUG_ON(!root);
10180 		list_move_tail(&root->delalloc_root,
10181 			       &fs_info->delalloc_roots);
10182 		spin_unlock(&fs_info->delalloc_root_lock);
10183 
10184 		ret = __start_delalloc_inodes(root, delay_iput, nr);
10185 		btrfs_put_fs_root(root);
10186 		if (ret < 0)
10187 			goto out;
10188 
10189 		if (nr != -1) {
10190 			nr -= ret;
10191 			WARN_ON(nr < 0);
10192 		}
10193 		spin_lock(&fs_info->delalloc_root_lock);
10194 	}
10195 	spin_unlock(&fs_info->delalloc_root_lock);
10196 
10197 	ret = 0;
10198 out:
10199 	if (!list_empty_careful(&splice)) {
10200 		spin_lock(&fs_info->delalloc_root_lock);
10201 		list_splice_tail(&splice, &fs_info->delalloc_roots);
10202 		spin_unlock(&fs_info->delalloc_root_lock);
10203 	}
10204 	mutex_unlock(&fs_info->delalloc_root_mutex);
10205 	return ret;
10206 }
10207 
10208 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
10209 			 const char *symname)
10210 {
10211 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10212 	struct btrfs_trans_handle *trans;
10213 	struct btrfs_root *root = BTRFS_I(dir)->root;
10214 	struct btrfs_path *path;
10215 	struct btrfs_key key;
10216 	struct inode *inode = NULL;
10217 	int err;
10218 	int drop_inode = 0;
10219 	u64 objectid;
10220 	u64 index = 0;
10221 	int name_len;
10222 	int datasize;
10223 	unsigned long ptr;
10224 	struct btrfs_file_extent_item *ei;
10225 	struct extent_buffer *leaf;
10226 
10227 	name_len = strlen(symname);
10228 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
10229 		return -ENAMETOOLONG;
10230 
10231 	/*
10232 	 * 2 items for inode item and ref
10233 	 * 2 items for dir items
10234 	 * 1 item for updating parent inode item
10235 	 * 1 item for the inline extent item
10236 	 * 1 item for xattr if selinux is on
10237 	 */
10238 	trans = btrfs_start_transaction(root, 7);
10239 	if (IS_ERR(trans))
10240 		return PTR_ERR(trans);
10241 
10242 	err = btrfs_find_free_ino(root, &objectid);
10243 	if (err)
10244 		goto out_unlock;
10245 
10246 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
10247 				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
10248 				objectid, S_IFLNK|S_IRWXUGO, &index);
10249 	if (IS_ERR(inode)) {
10250 		err = PTR_ERR(inode);
10251 		goto out_unlock;
10252 	}
10253 
10254 	/*
10255 	 * If the active LSM wants to access the inode during
10256 	 * d_instantiate it needs these. Smack checks to see
10257 	 * if the filesystem supports xattrs by looking at the
10258 	 * ops vector.
10259 	 */
10260 	inode->i_fop = &btrfs_file_operations;
10261 	inode->i_op = &btrfs_file_inode_operations;
10262 	inode->i_mapping->a_ops = &btrfs_aops;
10263 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10264 
10265 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10266 	if (err)
10267 		goto out_unlock_inode;
10268 
10269 	path = btrfs_alloc_path();
10270 	if (!path) {
10271 		err = -ENOMEM;
10272 		goto out_unlock_inode;
10273 	}
10274 	key.objectid = btrfs_ino(BTRFS_I(inode));
10275 	key.offset = 0;
10276 	key.type = BTRFS_EXTENT_DATA_KEY;
10277 	datasize = btrfs_file_extent_calc_inline_size(name_len);
10278 	err = btrfs_insert_empty_item(trans, root, path, &key,
10279 				      datasize);
10280 	if (err) {
10281 		btrfs_free_path(path);
10282 		goto out_unlock_inode;
10283 	}
10284 	leaf = path->nodes[0];
10285 	ei = btrfs_item_ptr(leaf, path->slots[0],
10286 			    struct btrfs_file_extent_item);
10287 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10288 	btrfs_set_file_extent_type(leaf, ei,
10289 				   BTRFS_FILE_EXTENT_INLINE);
10290 	btrfs_set_file_extent_encryption(leaf, ei, 0);
10291 	btrfs_set_file_extent_compression(leaf, ei, 0);
10292 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10293 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10294 
10295 	ptr = btrfs_file_extent_inline_start(ei);
10296 	write_extent_buffer(leaf, symname, ptr, name_len);
10297 	btrfs_mark_buffer_dirty(leaf);
10298 	btrfs_free_path(path);
10299 
10300 	inode->i_op = &btrfs_symlink_inode_operations;
10301 	inode_nohighmem(inode);
10302 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
10303 	inode_set_bytes(inode, name_len);
10304 	btrfs_i_size_write(BTRFS_I(inode), name_len);
10305 	err = btrfs_update_inode(trans, root, inode);
10306 	/*
10307 	 * Last step: add directory indexes for our symlink inode. Doing this
10308 	 * last avoids extra cleanup of these indexes if an error happens
10309 	 * elsewhere above.
10310 	 */
10311 	if (!err)
10312 		err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
10313 				BTRFS_I(inode), 0, index);
10314 	if (err) {
10315 		drop_inode = 1;
10316 		goto out_unlock_inode;
10317 	}
10318 
10319 	unlock_new_inode(inode);
10320 	d_instantiate(dentry, inode);
10321 
10322 out_unlock:
10323 	btrfs_end_transaction(trans);
10324 	if (drop_inode) {
10325 		inode_dec_link_count(inode);
10326 		iput(inode);
10327 	}
10328 	btrfs_btree_balance_dirty(fs_info);
10329 	return err;
10330 
10331 out_unlock_inode:
10332 	drop_inode = 1;
10333 	unlock_new_inode(inode);
10334 	goto out_unlock;
10335 }
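
/*
 * Editor's note (illustrative, not part of the original file): as the
 * ENAMETOOLONG check above shows, a btrfs symlink target is stored as a
 * single inline extent, so its length is capped by
 * BTRFS_MAX_INLINE_DATA_SIZE() for the filesystem.  A userspace sketch
 * with hypothetical names:
 */
#if 0	/* example only */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;

	if (symlink("target-path", "linkname") < 0)
		return 1;	/* a very long target gives ENAMETOOLONG */
	n = readlink("linkname", buf, sizeof(buf) - 1);
	if (n < 0)
		return 1;
	buf[n] = '\0';
	printf("linkname -> %s\n", buf);
	return 0;
}
#endif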
10336 
10337 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10338 				       u64 start, u64 num_bytes, u64 min_size,
10339 				       loff_t actual_len, u64 *alloc_hint,
10340 				       struct btrfs_trans_handle *trans)
10341 {
10342 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
10343 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10344 	struct extent_map *em;
10345 	struct btrfs_root *root = BTRFS_I(inode)->root;
10346 	struct btrfs_key ins;
10347 	u64 cur_offset = start;
10348 	u64 i_size;
10349 	u64 cur_bytes;
10350 	u64 last_alloc = (u64)-1;
10351 	int ret = 0;
10352 	bool own_trans = true;
10353 	u64 end = start + num_bytes - 1;
10354 
10355 	if (trans)
10356 		own_trans = false;
10357 	while (num_bytes > 0) {
10358 		if (own_trans) {
10359 			trans = btrfs_start_transaction(root, 3);
10360 			if (IS_ERR(trans)) {
10361 				ret = PTR_ERR(trans);
10362 				break;
10363 			}
10364 		}
10365 
10366 		cur_bytes = min_t(u64, num_bytes, SZ_256M);
10367 		cur_bytes = max(cur_bytes, min_size);
10368 		/*
10369 		 * If we are severely fragmented we could end up with really
10370 		 * small allocations, so if the allocator is returning small
10371 		 * chunks, let's make its job easier by only searching for
10372 		 * chunks of that size.
10373 		 */
10374 		cur_bytes = min(cur_bytes, last_alloc);
10375 		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10376 				min_size, 0, *alloc_hint, &ins, 1, 0);
10377 		if (ret) {
10378 			if (own_trans)
10379 				btrfs_end_transaction(trans);
10380 			break;
10381 		}
10382 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10383 
10384 		last_alloc = ins.offset;
10385 		ret = insert_reserved_file_extent(trans, inode,
10386 						  cur_offset, ins.objectid,
10387 						  ins.offset, ins.offset,
10388 						  ins.offset, 0, 0, 0,
10389 						  BTRFS_FILE_EXTENT_PREALLOC);
10390 		if (ret) {
10391 			btrfs_free_reserved_extent(fs_info, ins.objectid,
10392 						   ins.offset, 0);
10393 			btrfs_abort_transaction(trans, ret);
10394 			if (own_trans)
10395 				btrfs_end_transaction(trans);
10396 			break;
10397 		}
10398 
10399 		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10400 					cur_offset + ins.offset - 1, 0);
10401 
10402 		em = alloc_extent_map();
10403 		if (!em) {
10404 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10405 				&BTRFS_I(inode)->runtime_flags);
10406 			goto next;
10407 		}
10408 
10409 		em->start = cur_offset;
10410 		em->orig_start = cur_offset;
10411 		em->len = ins.offset;
10412 		em->block_start = ins.objectid;
10413 		em->block_len = ins.offset;
10414 		em->orig_block_len = ins.offset;
10415 		em->ram_bytes = ins.offset;
10416 		em->bdev = fs_info->fs_devices->latest_bdev;
10417 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10418 		em->generation = trans->transid;
10419 
10420 		while (1) {
10421 			write_lock(&em_tree->lock);
10422 			ret = add_extent_mapping(em_tree, em, 1);
10423 			write_unlock(&em_tree->lock);
10424 			if (ret != -EEXIST)
10425 				break;
10426 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10427 						cur_offset + ins.offset - 1,
10428 						0);
10429 		}
10430 		free_extent_map(em);
10431 next:
10432 		num_bytes -= ins.offset;
10433 		cur_offset += ins.offset;
10434 		*alloc_hint = ins.objectid + ins.offset;
10435 
10436 		inode_inc_iversion(inode);
10437 		inode->i_ctime = current_time(inode);
10438 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
10439 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10440 		    (actual_len > inode->i_size) &&
10441 		    (cur_offset > inode->i_size)) {
10442 			if (cur_offset > actual_len)
10443 				i_size = actual_len;
10444 			else
10445 				i_size = cur_offset;
10446 			i_size_write(inode, i_size);
10447 			btrfs_ordered_update_i_size(inode, i_size, NULL);
10448 		}
10449 
10450 		ret = btrfs_update_inode(trans, root, inode);
10451 
10452 		if (ret) {
10453 			btrfs_abort_transaction(trans, ret);
10454 			if (own_trans)
10455 				btrfs_end_transaction(trans);
10456 			break;
10457 		}
10458 
10459 		if (own_trans)
10460 			btrfs_end_transaction(trans);
10461 	}
10462 	if (cur_offset < end)
10463 		btrfs_free_reserved_data_space(inode, NULL, cur_offset,
10464 			end - cur_offset + 1);
10465 	return ret;
10466 }
10467 
10468 int btrfs_prealloc_file_range(struct inode *inode, int mode,
10469 			      u64 start, u64 num_bytes, u64 min_size,
10470 			      loff_t actual_len, u64 *alloc_hint)
10471 {
10472 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10473 					   min_size, actual_len, alloc_hint,
10474 					   NULL);
10475 }
10476 
10477 int btrfs_prealloc_file_range_trans(struct inode *inode,
10478 				    struct btrfs_trans_handle *trans, int mode,
10479 				    u64 start, u64 num_bytes, u64 min_size,
10480 				    loff_t actual_len, u64 *alloc_hint)
10481 {
10482 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10483 					   min_size, actual_len, alloc_hint, trans);
10484 }
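
/*
 * Editor's note (illustrative, not part of the original file): the
 * preallocation helpers above back fallocate(2).  With FALLOC_FL_KEEP_SIZE
 * the PREALLOC extents are created without moving i_size, matching the
 * mode check in the allocation loop above.  Hypothetical file name;
 * glibc exposes fallocate() with _GNU_SOURCE.
 */
#if 0	/* example only */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("prealloc-file", O_RDWR | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	/* reserve 16 MiB of preallocated extents, leaving i_size alone */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
		return 1;
	close(fd);
	return 0;
}
#endif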
10485 
10486 static int btrfs_set_page_dirty(struct page *page)
10487 {
10488 	return __set_page_dirty_nobuffers(page);
10489 }
10490 
10491 static int btrfs_permission(struct inode *inode, int mask)
10492 {
10493 	struct btrfs_root *root = BTRFS_I(inode)->root;
10494 	umode_t mode = inode->i_mode;
10495 
10496 	if (mask & MAY_WRITE &&
10497 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10498 		if (btrfs_root_readonly(root))
10499 			return -EROFS;
10500 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10501 			return -EACCES;
10502 	}
10503 	return generic_permission(inode, mask);
10504 }
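
/*
 * Editor's note (illustrative, not part of the original file): the extra
 * checks above mean a write attempt inside a read-only subvolume or
 * snapshot fails with EROFS even when the classic mode bits would allow
 * it.  Sketch assuming "snap" is a hypothetical read-only snapshot:
 */
#if 0	/* example only */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>

int main(void)
{
	if (open("snap/file", O_WRONLY) < 0 && errno == EROFS)
		fprintf(stderr, "subvolume is read-only\n");
	return 0;
}
#endif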
10505 
10506 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
10507 {
10508 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10509 	struct btrfs_trans_handle *trans;
10510 	struct btrfs_root *root = BTRFS_I(dir)->root;
10511 	struct inode *inode = NULL;
10512 	u64 objectid;
10513 	u64 index;
10514 	int ret = 0;
10515 
10516 	/*
10517 	 * 5 units required for adding orphan entry
10518 	 */
10519 	trans = btrfs_start_transaction(root, 5);
10520 	if (IS_ERR(trans))
10521 		return PTR_ERR(trans);
10522 
10523 	ret = btrfs_find_free_ino(root, &objectid);
10524 	if (ret)
10525 		goto out;
10526 
10527 	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
10528 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10529 	if (IS_ERR(inode)) {
10530 		ret = PTR_ERR(inode);
10531 		inode = NULL;
10532 		goto out;
10533 	}
10534 
10535 	inode->i_fop = &btrfs_file_operations;
10536 	inode->i_op = &btrfs_file_inode_operations;
10537 
10538 	inode->i_mapping->a_ops = &btrfs_aops;
10539 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10540 
10541 	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10542 	if (ret)
10543 		goto out_inode;
10544 
10545 	ret = btrfs_update_inode(trans, root, inode);
10546 	if (ret)
10547 		goto out_inode;
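	/*
	 * The tmpfile has no directory entry, so add an orphan item now;
	 * if we crash before it is linked or fully deleted, the orphan
	 * cleanup code will reclaim it on the next mount.
	 */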
10548 	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10549 	if (ret)
10550 		goto out_inode;
10551 
10552 	/*
10553 	 * The number of links was set to 0 in btrfs_new_inode(); here we set
10554 	 * it to 1 because d_tmpfile() would issue a warning if the count were
10555 	 * still 0, via:
10556 	 *
10557 	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10558 	 */
10559 	set_nlink(inode, 1);
10560 	unlock_new_inode(inode);
10561 	d_tmpfile(dentry, inode);
10562 	mark_inode_dirty(inode);
10563 
10564 out:
10565 	btrfs_end_transaction(trans);
10566 	if (ret)
10567 		iput(inode);
10568 	btrfs_btree_balance_dirty(fs_info);
10569 	return ret;
10570 
10571 out_inode:
10572 	unlock_new_inode(inode);
10573 	goto out;
10575 }
10576 
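/*
 * Data read repair does not happen here: returning -EAGAIN tells the
 * generic read completion path (end_bio_extent_readpage()) to fall back
 * to bio_readpage_error(), which resubmits the read, possibly against
 * a different mirror.
 */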
10577 __attribute__((const))
10578 static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10579 {
10580 	return -EAGAIN;
10581 }
10582 
10583 static struct btrfs_fs_info *iotree_fs_info(void *private_data)
10584 {
10585 	struct inode *inode = private_data;
10586 	return btrfs_sb(inode->i_sb);
10587 }
10588 
10589 static void btrfs_check_extent_io_range(void *private_data, const char *caller,
10590 					u64 start, u64 end)
10591 {
10592 	struct inode *inode = private_data;
10593 	u64 isize;
10594 
10595 	isize = i_size_read(inode);
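	/*
	 * Ranges in the io tree are inclusive, so a sane end offset is
	 * odd (one byte short of a boundary, e.g. 4095).  An even end
	 * that isn't i_size - 1 usually means a caller passed a length
	 * where an inclusive end was expected, so log it.
	 */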
10596 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
10597 		btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
10598 		    "%s: ino %llu isize %llu odd range [%llu,%llu]",
10599 			caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
10600 	}
10601 }
10602 
10603 void btrfs_set_range_writeback(void *private_data, u64 start, u64 end)
10604 {
10605 	struct inode *inode = private_data;
10606 	unsigned long index = start >> PAGE_SHIFT;
10607 	unsigned long end_index = end >> PAGE_SHIFT;
10608 	struct page *page;
10609 
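	/* Tag every page backing [start, end] as under writeback. */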
10610 	while (index <= end_index) {
10611 		page = find_get_page(inode->i_mapping, index);
10612 		ASSERT(page); /* Pages should be in the extent_io_tree */
10613 		set_page_writeback(page);
10614 		put_page(page);
10615 		index++;
10616 	}
10617 }
10618 
10619 static const struct inode_operations btrfs_dir_inode_operations = {
10620 	.getattr	= btrfs_getattr,
10621 	.lookup		= btrfs_lookup,
10622 	.create		= btrfs_create,
10623 	.unlink		= btrfs_unlink,
10624 	.link		= btrfs_link,
10625 	.mkdir		= btrfs_mkdir,
10626 	.rmdir		= btrfs_rmdir,
10627 	.rename		= btrfs_rename2,
10628 	.symlink	= btrfs_symlink,
10629 	.setattr	= btrfs_setattr,
10630 	.mknod		= btrfs_mknod,
10631 	.listxattr	= btrfs_listxattr,
10632 	.permission	= btrfs_permission,
10633 	.get_acl	= btrfs_get_acl,
10634 	.set_acl	= btrfs_set_acl,
10635 	.update_time	= btrfs_update_time,
10636 	.tmpfile        = btrfs_tmpfile,
10637 };
10638 static const struct inode_operations btrfs_dir_ro_inode_operations = {
10639 	.lookup		= btrfs_lookup,
10640 	.permission	= btrfs_permission,
10641 	.update_time	= btrfs_update_time,
10642 };
10643 
10644 static const struct file_operations btrfs_dir_file_operations = {
10645 	.llseek		= generic_file_llseek,
10646 	.read		= generic_read_dir,
10647 	.iterate_shared	= btrfs_real_readdir,
10648 	.open		= btrfs_opendir,
10649 	.unlocked_ioctl	= btrfs_ioctl,
10650 #ifdef CONFIG_COMPAT
10651 	.compat_ioctl	= btrfs_compat_ioctl,
10652 #endif
10653 	.release        = btrfs_release_file,
10654 	.fsync		= btrfs_sync_file,
10655 };
10656 
10657 static const struct extent_io_ops btrfs_extent_io_ops = {
10658 	/* mandatory callbacks */
10659 	.submit_bio_hook = btrfs_submit_bio_hook,
10660 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
10661 	.merge_bio_hook = btrfs_merge_bio_hook,
10662 	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10663 	.tree_fs_info = iotree_fs_info,
10664 	.set_range_writeback = btrfs_set_range_writeback,
10665 
10666 	/* optional callbacks */
10667 	.fill_delalloc = run_delalloc_range,
10668 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
10669 	.writepage_start_hook = btrfs_writepage_start_hook,
10670 	.set_bit_hook = btrfs_set_bit_hook,
10671 	.clear_bit_hook = btrfs_clear_bit_hook,
10672 	.merge_extent_hook = btrfs_merge_extent_hook,
10673 	.split_extent_hook = btrfs_split_extent_hook,
10674 	.check_extent_io_range = btrfs_check_extent_io_range,
10675 };
10676 
10677 /*
10678  * btrfs doesn't support the bmap operation because swapfiles
10679  * use bmap to make a mapping of extents in the file.  They assume
10680  * these extents won't change over the life of the file, and they
10681  * use the bmap result to do IO directly to the drive.
10682  *
10683  * The btrfs bmap call would return logical addresses that aren't
10684  * suitable for IO, and they will also change frequently as COW
10685  * operations happen.  So, swapfile + btrfs == corruption.
10686  *
10687  * For now we avoid this by not implementing bmap.
10688  */
10689 static const struct address_space_operations btrfs_aops = {
10690 	.readpage	= btrfs_readpage,
10691 	.writepage	= btrfs_writepage,
10692 	.writepages	= btrfs_writepages,
10693 	.readpages	= btrfs_readpages,
10694 	.direct_IO	= btrfs_direct_IO,
10695 	.invalidatepage = btrfs_invalidatepage,
10696 	.releasepage	= btrfs_releasepage,
10697 	.set_page_dirty	= btrfs_set_page_dirty,
10698 	.error_remove_page = generic_error_remove_page,
10699 };
10700 
10701 static const struct address_space_operations btrfs_symlink_aops = {
10702 	.readpage	= btrfs_readpage,
10703 	.writepage	= btrfs_writepage,
10704 	.invalidatepage = btrfs_invalidatepage,
10705 	.releasepage	= btrfs_releasepage,
10706 };
10707 
10708 static const struct inode_operations btrfs_file_inode_operations = {
10709 	.getattr	= btrfs_getattr,
10710 	.setattr	= btrfs_setattr,
10711 	.listxattr      = btrfs_listxattr,
10712 	.permission	= btrfs_permission,
10713 	.fiemap		= btrfs_fiemap,
10714 	.get_acl	= btrfs_get_acl,
10715 	.set_acl	= btrfs_set_acl,
10716 	.update_time	= btrfs_update_time,
10717 };
10718 static const struct inode_operations btrfs_special_inode_operations = {
10719 	.getattr	= btrfs_getattr,
10720 	.setattr	= btrfs_setattr,
10721 	.permission	= btrfs_permission,
10722 	.listxattr	= btrfs_listxattr,
10723 	.get_acl	= btrfs_get_acl,
10724 	.set_acl	= btrfs_set_acl,
10725 	.update_time	= btrfs_update_time,
10726 };
10727 static const struct inode_operations btrfs_symlink_inode_operations = {
10728 	.get_link	= page_get_link,
10729 	.getattr	= btrfs_getattr,
10730 	.setattr	= btrfs_setattr,
10731 	.permission	= btrfs_permission,
10732 	.listxattr	= btrfs_listxattr,
10733 	.update_time	= btrfs_update_time,
10734 };
10735 
10736 const struct dentry_operations btrfs_dentry_operations = {
10737 	.d_delete	= btrfs_dentry_delete,
10738 	.d_release	= btrfs_dentry_release,
10739 };
10740