xref: /openbmc/linux/fs/btrfs/inode.c (revision efe4a1ac)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/compat.h>
34 #include <linux/bit_spinlock.h>
35 #include <linux/xattr.h>
36 #include <linux/posix_acl.h>
37 #include <linux/falloc.h>
38 #include <linux/slab.h>
39 #include <linux/ratelimit.h>
40 #include <linux/mount.h>
41 #include <linux/btrfs.h>
42 #include <linux/blkdev.h>
43 #include <linux/posix_acl_xattr.h>
44 #include <linux/uio.h>
45 #include "ctree.h"
46 #include "disk-io.h"
47 #include "transaction.h"
48 #include "btrfs_inode.h"
49 #include "print-tree.h"
50 #include "ordered-data.h"
51 #include "xattr.h"
52 #include "tree-log.h"
53 #include "volumes.h"
54 #include "compression.h"
55 #include "locking.h"
56 #include "free-space-cache.h"
57 #include "inode-map.h"
58 #include "backref.h"
59 #include "hash.h"
60 #include "props.h"
61 #include "qgroup.h"
62 #include "dedupe.h"
63 
64 struct btrfs_iget_args {
65 	struct btrfs_key *location;
66 	struct btrfs_root *root;
67 };
68 
69 struct btrfs_dio_data {
70 	u64 outstanding_extents;
71 	u64 reserve;
72 	u64 unsubmitted_oe_range_start;
73 	u64 unsubmitted_oe_range_end;
74 	int overwrite;
75 };
76 
77 static const struct inode_operations btrfs_dir_inode_operations;
78 static const struct inode_operations btrfs_symlink_inode_operations;
79 static const struct inode_operations btrfs_dir_ro_inode_operations;
80 static const struct inode_operations btrfs_special_inode_operations;
81 static const struct inode_operations btrfs_file_inode_operations;
82 static const struct address_space_operations btrfs_aops;
83 static const struct address_space_operations btrfs_symlink_aops;
84 static const struct file_operations btrfs_dir_file_operations;
85 static const struct extent_io_ops btrfs_extent_io_ops;
86 
87 static struct kmem_cache *btrfs_inode_cachep;
88 struct kmem_cache *btrfs_trans_handle_cachep;
89 struct kmem_cache *btrfs_transaction_cachep;
90 struct kmem_cache *btrfs_path_cachep;
91 struct kmem_cache *btrfs_free_space_cachep;
92 
93 #define S_SHIFT 12
94 static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
95 	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
96 	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
97 	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
98 	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
99 	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
100 	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
101 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
102 };
103 
104 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
105 static int btrfs_truncate(struct inode *inode);
106 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
107 static noinline int cow_file_range(struct inode *inode,
108 				   struct page *locked_page,
109 				   u64 start, u64 end, u64 delalloc_end,
110 				   int *page_started, unsigned long *nr_written,
111 				   int unlock, struct btrfs_dedupe_hash *hash);
112 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
113 				       u64 orig_start, u64 block_start,
114 				       u64 block_len, u64 orig_block_len,
115 				       u64 ram_bytes, int compress_type,
116 				       int type);
117 
118 static void __endio_write_update_ordered(struct inode *inode,
119 					 const u64 offset, const u64 bytes,
120 					 const bool uptodate);
121 
122 /*
123  * Cleanup all submitted ordered extents in specified range to handle errors
124  * from the fill_dellaloc() callback.
125  *
126  * NOTE: caller must ensure that when an error happens, it can not call
127  * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
128  * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
129  * to be released, which we want to happen only when finishing the ordered
130  * extent (btrfs_finish_ordered_io()). Also note that the caller of the
131  * fill_delalloc() callback already does proper cleanup for the first page of
132  * the range, that is, it invokes the callback writepage_end_io_hook() for the
133  * range of the first page.
134  */
135 static inline void btrfs_cleanup_ordered_extents(struct inode *inode,
136 						 const u64 offset,
137 						 const u64 bytes)
138 {
139 	return __endio_write_update_ordered(inode, offset + PAGE_SIZE,
140 					    bytes - PAGE_SIZE, false);
141 }
142 
143 static int btrfs_dirty_inode(struct inode *inode);
144 
145 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
146 void btrfs_test_inode_set_ops(struct inode *inode)
147 {
148 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
149 }
150 #endif
151 
152 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
153 				     struct inode *inode,  struct inode *dir,
154 				     const struct qstr *qstr)
155 {
156 	int err;
157 
158 	err = btrfs_init_acl(trans, inode, dir);
159 	if (!err)
160 		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
161 	return err;
162 }
163 
164 /*
165  * this does all the hard work for inserting an inline extent into
166  * the btree.  The caller should have done a btrfs_drop_extents so that
167  * no overlapping inline items exist in the btree
168  */
169 static int insert_inline_extent(struct btrfs_trans_handle *trans,
170 				struct btrfs_path *path, int extent_inserted,
171 				struct btrfs_root *root, struct inode *inode,
172 				u64 start, size_t size, size_t compressed_size,
173 				int compress_type,
174 				struct page **compressed_pages)
175 {
176 	struct extent_buffer *leaf;
177 	struct page *page = NULL;
178 	char *kaddr;
179 	unsigned long ptr;
180 	struct btrfs_file_extent_item *ei;
181 	int err = 0;
182 	int ret;
183 	size_t cur_size = size;
184 	unsigned long offset;
185 
186 	if (compressed_size && compressed_pages)
187 		cur_size = compressed_size;
188 
189 	inode_add_bytes(inode, size);
190 
191 	if (!extent_inserted) {
192 		struct btrfs_key key;
193 		size_t datasize;
194 
195 		key.objectid = btrfs_ino(BTRFS_I(inode));
196 		key.offset = start;
197 		key.type = BTRFS_EXTENT_DATA_KEY;
198 
199 		datasize = btrfs_file_extent_calc_inline_size(cur_size);
200 		path->leave_spinning = 1;
201 		ret = btrfs_insert_empty_item(trans, root, path, &key,
202 					      datasize);
203 		if (ret) {
204 			err = ret;
205 			goto fail;
206 		}
207 	}
208 	leaf = path->nodes[0];
209 	ei = btrfs_item_ptr(leaf, path->slots[0],
210 			    struct btrfs_file_extent_item);
211 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
212 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
213 	btrfs_set_file_extent_encryption(leaf, ei, 0);
214 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
215 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
216 	ptr = btrfs_file_extent_inline_start(ei);
217 
218 	if (compress_type != BTRFS_COMPRESS_NONE) {
219 		struct page *cpage;
220 		int i = 0;
221 		while (compressed_size > 0) {
222 			cpage = compressed_pages[i];
223 			cur_size = min_t(unsigned long, compressed_size,
224 				       PAGE_SIZE);
225 
226 			kaddr = kmap_atomic(cpage);
227 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
228 			kunmap_atomic(kaddr);
229 
230 			i++;
231 			ptr += cur_size;
232 			compressed_size -= cur_size;
233 		}
234 		btrfs_set_file_extent_compression(leaf, ei,
235 						  compress_type);
236 	} else {
237 		page = find_get_page(inode->i_mapping,
238 				     start >> PAGE_SHIFT);
239 		btrfs_set_file_extent_compression(leaf, ei, 0);
240 		kaddr = kmap_atomic(page);
241 		offset = start & (PAGE_SIZE - 1);
242 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
243 		kunmap_atomic(kaddr);
244 		put_page(page);
245 	}
246 	btrfs_mark_buffer_dirty(leaf);
247 	btrfs_release_path(path);
248 
249 	/*
250 	 * we're an inline extent, so nobody can
251 	 * extend the file past i_size without locking
252 	 * a page we already have locked.
253 	 *
254 	 * We must do any isize and inode updates
255 	 * before we unlock the pages.  Otherwise we
256 	 * could end up racing with unlink.
257 	 */
258 	BTRFS_I(inode)->disk_i_size = inode->i_size;
259 	ret = btrfs_update_inode(trans, root, inode);
260 
261 	return ret;
262 fail:
263 	return err;
264 }
265 
266 
267 /*
268  * conditionally insert an inline extent into the file.  This
269  * does the checks required to make sure the data is small enough
270  * to fit as an inline extent.
271  */
272 static noinline int cow_file_range_inline(struct btrfs_root *root,
273 					  struct inode *inode, u64 start,
274 					  u64 end, size_t compressed_size,
275 					  int compress_type,
276 					  struct page **compressed_pages)
277 {
278 	struct btrfs_fs_info *fs_info = root->fs_info;
279 	struct btrfs_trans_handle *trans;
280 	u64 isize = i_size_read(inode);
281 	u64 actual_end = min(end + 1, isize);
282 	u64 inline_len = actual_end - start;
283 	u64 aligned_end = ALIGN(end, fs_info->sectorsize);
284 	u64 data_len = inline_len;
285 	int ret;
286 	struct btrfs_path *path;
287 	int extent_inserted = 0;
288 	u32 extent_item_size;
289 
290 	if (compressed_size)
291 		data_len = compressed_size;
292 
293 	if (start > 0 ||
294 	    actual_end > fs_info->sectorsize ||
295 	    data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
296 	    (!compressed_size &&
297 	    (actual_end & (fs_info->sectorsize - 1)) == 0) ||
298 	    end + 1 < isize ||
299 	    data_len > fs_info->max_inline) {
300 		return 1;
301 	}
302 
303 	path = btrfs_alloc_path();
304 	if (!path)
305 		return -ENOMEM;
306 
307 	trans = btrfs_join_transaction(root);
308 	if (IS_ERR(trans)) {
309 		btrfs_free_path(path);
310 		return PTR_ERR(trans);
311 	}
312 	trans->block_rsv = &fs_info->delalloc_block_rsv;
313 
314 	if (compressed_size && compressed_pages)
315 		extent_item_size = btrfs_file_extent_calc_inline_size(
316 		   compressed_size);
317 	else
318 		extent_item_size = btrfs_file_extent_calc_inline_size(
319 		    inline_len);
320 
321 	ret = __btrfs_drop_extents(trans, root, inode, path,
322 				   start, aligned_end, NULL,
323 				   1, 1, extent_item_size, &extent_inserted);
324 	if (ret) {
325 		btrfs_abort_transaction(trans, ret);
326 		goto out;
327 	}
328 
329 	if (isize > actual_end)
330 		inline_len = min_t(u64, isize, actual_end);
331 	ret = insert_inline_extent(trans, path, extent_inserted,
332 				   root, inode, start,
333 				   inline_len, compressed_size,
334 				   compress_type, compressed_pages);
335 	if (ret && ret != -ENOSPC) {
336 		btrfs_abort_transaction(trans, ret);
337 		goto out;
338 	} else if (ret == -ENOSPC) {
339 		ret = 1;
340 		goto out;
341 	}
342 
343 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
344 	btrfs_delalloc_release_metadata(BTRFS_I(inode), end + 1 - start);
345 	btrfs_drop_extent_cache(BTRFS_I(inode), start, aligned_end - 1, 0);
346 out:
347 	/*
348 	 * Don't forget to free the reserved space, as for inlined extent
349 	 * it won't count as data extent, free them directly here.
350 	 * And at reserve time, it's always aligned to page size, so
351 	 * just free one page here.
352 	 */
353 	btrfs_qgroup_free_data(inode, 0, PAGE_SIZE);
354 	btrfs_free_path(path);
355 	btrfs_end_transaction(trans);
356 	return ret;
357 }
358 
359 struct async_extent {
360 	u64 start;
361 	u64 ram_size;
362 	u64 compressed_size;
363 	struct page **pages;
364 	unsigned long nr_pages;
365 	int compress_type;
366 	struct list_head list;
367 };
368 
369 struct async_cow {
370 	struct inode *inode;
371 	struct btrfs_root *root;
372 	struct page *locked_page;
373 	u64 start;
374 	u64 end;
375 	struct list_head extents;
376 	struct btrfs_work work;
377 };
378 
379 static noinline int add_async_extent(struct async_cow *cow,
380 				     u64 start, u64 ram_size,
381 				     u64 compressed_size,
382 				     struct page **pages,
383 				     unsigned long nr_pages,
384 				     int compress_type)
385 {
386 	struct async_extent *async_extent;
387 
388 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
389 	BUG_ON(!async_extent); /* -ENOMEM */
390 	async_extent->start = start;
391 	async_extent->ram_size = ram_size;
392 	async_extent->compressed_size = compressed_size;
393 	async_extent->pages = pages;
394 	async_extent->nr_pages = nr_pages;
395 	async_extent->compress_type = compress_type;
396 	list_add_tail(&async_extent->list, &cow->extents);
397 	return 0;
398 }
399 
400 static inline int inode_need_compress(struct inode *inode)
401 {
402 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
403 
404 	/* force compress */
405 	if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
406 		return 1;
407 	/* bad compression ratios */
408 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
409 		return 0;
410 	if (btrfs_test_opt(fs_info, COMPRESS) ||
411 	    BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS ||
412 	    BTRFS_I(inode)->force_compress)
413 		return 1;
414 	return 0;
415 }
416 
417 static inline void inode_should_defrag(struct btrfs_inode *inode,
418 		u64 start, u64 end, u64 num_bytes, u64 small_write)
419 {
420 	/* If this is a small write inside eof, kick off a defrag */
421 	if (num_bytes < small_write &&
422 	    (start > 0 || end + 1 < inode->disk_i_size))
423 		btrfs_add_inode_defrag(NULL, inode);
424 }
425 
426 /*
427  * we create compressed extents in two phases.  The first
428  * phase compresses a range of pages that have already been
429  * locked (both pages and state bits are locked).
430  *
431  * This is done inside an ordered work queue, and the compression
432  * is spread across many cpus.  The actual IO submission is step
433  * two, and the ordered work queue takes care of making sure that
434  * happens in the same order things were put onto the queue by
435  * writepages and friends.
436  *
437  * If this code finds it can't get good compression, it puts an
438  * entry onto the work queue to write the uncompressed bytes.  This
439  * makes sure that both compressed inodes and uncompressed inodes
440  * are written in the same order that the flusher thread sent them
441  * down.
442  */
443 static noinline void compress_file_range(struct inode *inode,
444 					struct page *locked_page,
445 					u64 start, u64 end,
446 					struct async_cow *async_cow,
447 					int *num_added)
448 {
449 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
450 	struct btrfs_root *root = BTRFS_I(inode)->root;
451 	u64 num_bytes;
452 	u64 blocksize = fs_info->sectorsize;
453 	u64 actual_end;
454 	u64 isize = i_size_read(inode);
455 	int ret = 0;
456 	struct page **pages = NULL;
457 	unsigned long nr_pages;
458 	unsigned long total_compressed = 0;
459 	unsigned long total_in = 0;
460 	int i;
461 	int will_compress;
462 	int compress_type = fs_info->compress_type;
463 	int redirty = 0;
464 
465 	inode_should_defrag(BTRFS_I(inode), start, end, end - start + 1,
466 			SZ_16K);
467 
468 	actual_end = min_t(u64, isize, end + 1);
469 again:
470 	will_compress = 0;
471 	nr_pages = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
472 	BUILD_BUG_ON((BTRFS_MAX_COMPRESSED % PAGE_SIZE) != 0);
473 	nr_pages = min_t(unsigned long, nr_pages,
474 			BTRFS_MAX_COMPRESSED / PAGE_SIZE);
475 
476 	/*
477 	 * we don't want to send crud past the end of i_size through
478 	 * compression, that's just a waste of CPU time.  So, if the
479 	 * end of the file is before the start of our current
480 	 * requested range of bytes, we bail out to the uncompressed
481 	 * cleanup code that can deal with all of this.
482 	 *
483 	 * It isn't really the fastest way to fix things, but this is a
484 	 * very uncommon corner.
485 	 */
486 	if (actual_end <= start)
487 		goto cleanup_and_bail_uncompressed;
488 
489 	total_compressed = actual_end - start;
490 
491 	/*
492 	 * skip compression for a small file range(<=blocksize) that
493 	 * isn't an inline extent, since it doesn't save disk space at all.
494 	 */
495 	if (total_compressed <= blocksize &&
496 	   (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
497 		goto cleanup_and_bail_uncompressed;
498 
499 	total_compressed = min_t(unsigned long, total_compressed,
500 			BTRFS_MAX_UNCOMPRESSED);
501 	num_bytes = ALIGN(end - start + 1, blocksize);
502 	num_bytes = max(blocksize,  num_bytes);
503 	total_in = 0;
504 	ret = 0;
505 
506 	/*
507 	 * we do compression for mount -o compress and when the
508 	 * inode has not been flagged as nocompress.  This flag can
509 	 * change at any time if we discover bad compression ratios.
510 	 */
511 	if (inode_need_compress(inode)) {
512 		WARN_ON(pages);
513 		pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
514 		if (!pages) {
515 			/* just bail out to the uncompressed code */
516 			goto cont;
517 		}
518 
519 		if (BTRFS_I(inode)->force_compress)
520 			compress_type = BTRFS_I(inode)->force_compress;
521 
522 		/*
523 		 * we need to call clear_page_dirty_for_io on each
524 		 * page in the range.  Otherwise applications with the file
525 		 * mmap'd can wander in and change the page contents while
526 		 * we are compressing them.
527 		 *
528 		 * If the compression fails for any reason, we set the pages
529 		 * dirty again later on.
530 		 */
531 		extent_range_clear_dirty_for_io(inode, start, end);
532 		redirty = 1;
533 		ret = btrfs_compress_pages(compress_type,
534 					   inode->i_mapping, start,
535 					   pages,
536 					   &nr_pages,
537 					   &total_in,
538 					   &total_compressed);
539 
540 		if (!ret) {
541 			unsigned long offset = total_compressed &
542 				(PAGE_SIZE - 1);
543 			struct page *page = pages[nr_pages - 1];
544 			char *kaddr;
545 
546 			/* zero the tail end of the last page, we might be
547 			 * sending it down to disk
548 			 */
549 			if (offset) {
550 				kaddr = kmap_atomic(page);
551 				memset(kaddr + offset, 0,
552 				       PAGE_SIZE - offset);
553 				kunmap_atomic(kaddr);
554 			}
555 			will_compress = 1;
556 		}
557 	}
558 cont:
559 	if (start == 0) {
560 		/* lets try to make an inline extent */
561 		if (ret || total_in < (actual_end - start)) {
562 			/* we didn't compress the entire range, try
563 			 * to make an uncompressed inline extent.
564 			 */
565 			ret = cow_file_range_inline(root, inode, start, end,
566 					    0, BTRFS_COMPRESS_NONE, NULL);
567 		} else {
568 			/* try making a compressed inline extent */
569 			ret = cow_file_range_inline(root, inode, start, end,
570 						    total_compressed,
571 						    compress_type, pages);
572 		}
573 		if (ret <= 0) {
574 			unsigned long clear_flags = EXTENT_DELALLOC |
575 				EXTENT_DELALLOC_NEW | EXTENT_DEFRAG;
576 			unsigned long page_error_op;
577 
578 			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
579 			page_error_op = ret < 0 ? PAGE_SET_ERROR : 0;
580 
581 			/*
582 			 * inline extent creation worked or returned error,
583 			 * we don't need to create any more async work items.
584 			 * Unlock and free up our temp pages.
585 			 */
586 			extent_clear_unlock_delalloc(inode, start, end, end,
587 						     NULL, clear_flags,
588 						     PAGE_UNLOCK |
589 						     PAGE_CLEAR_DIRTY |
590 						     PAGE_SET_WRITEBACK |
591 						     page_error_op |
592 						     PAGE_END_WRITEBACK);
593 			if (ret == 0)
594 				btrfs_free_reserved_data_space_noquota(inode,
595 							       start,
596 							       end - start + 1);
597 			goto free_pages_out;
598 		}
599 	}
600 
601 	if (will_compress) {
602 		/*
603 		 * we aren't doing an inline extent round the compressed size
604 		 * up to a block size boundary so the allocator does sane
605 		 * things
606 		 */
607 		total_compressed = ALIGN(total_compressed, blocksize);
608 
609 		/*
610 		 * one last check to make sure the compression is really a
611 		 * win, compare the page count read with the blocks on disk
612 		 */
613 		total_in = ALIGN(total_in, PAGE_SIZE);
614 		if (total_compressed >= total_in) {
615 			will_compress = 0;
616 		} else {
617 			num_bytes = total_in;
618 			*num_added += 1;
619 
620 			/*
621 			 * The async work queues will take care of doing actual
622 			 * allocation on disk for these compressed pages, and
623 			 * will submit them to the elevator.
624 			 */
625 			add_async_extent(async_cow, start, num_bytes,
626 					total_compressed, pages, nr_pages,
627 					compress_type);
628 
629 			if (start + num_bytes < end) {
630 				start += num_bytes;
631 				pages = NULL;
632 				cond_resched();
633 				goto again;
634 			}
635 			return;
636 		}
637 	}
638 	if (pages) {
639 		/*
640 		 * the compression code ran but failed to make things smaller,
641 		 * free any pages it allocated and our page pointer array
642 		 */
643 		for (i = 0; i < nr_pages; i++) {
644 			WARN_ON(pages[i]->mapping);
645 			put_page(pages[i]);
646 		}
647 		kfree(pages);
648 		pages = NULL;
649 		total_compressed = 0;
650 		nr_pages = 0;
651 
652 		/* flag the file so we don't compress in the future */
653 		if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) &&
654 		    !(BTRFS_I(inode)->force_compress)) {
655 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
656 		}
657 	}
658 cleanup_and_bail_uncompressed:
659 	/*
660 	 * No compression, but we still need to write the pages in the file
661 	 * we've been given so far.  redirty the locked page if it corresponds
662 	 * to our extent and set things up for the async work queue to run
663 	 * cow_file_range to do the normal delalloc dance.
664 	 */
665 	if (page_offset(locked_page) >= start &&
666 	    page_offset(locked_page) <= end)
667 		__set_page_dirty_nobuffers(locked_page);
668 		/* unlocked later on in the async handlers */
669 
670 	if (redirty)
671 		extent_range_redirty_for_io(inode, start, end);
672 	add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0,
673 			 BTRFS_COMPRESS_NONE);
674 	*num_added += 1;
675 
676 	return;
677 
678 free_pages_out:
679 	for (i = 0; i < nr_pages; i++) {
680 		WARN_ON(pages[i]->mapping);
681 		put_page(pages[i]);
682 	}
683 	kfree(pages);
684 }
685 
686 static void free_async_extent_pages(struct async_extent *async_extent)
687 {
688 	int i;
689 
690 	if (!async_extent->pages)
691 		return;
692 
693 	for (i = 0; i < async_extent->nr_pages; i++) {
694 		WARN_ON(async_extent->pages[i]->mapping);
695 		put_page(async_extent->pages[i]);
696 	}
697 	kfree(async_extent->pages);
698 	async_extent->nr_pages = 0;
699 	async_extent->pages = NULL;
700 }
701 
702 /*
703  * phase two of compressed writeback.  This is the ordered portion
704  * of the code, which only gets called in the order the work was
705  * queued.  We walk all the async extents created by compress_file_range
706  * and send them down to the disk.
707  */
708 static noinline void submit_compressed_extents(struct inode *inode,
709 					      struct async_cow *async_cow)
710 {
711 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
712 	struct async_extent *async_extent;
713 	u64 alloc_hint = 0;
714 	struct btrfs_key ins;
715 	struct extent_map *em;
716 	struct btrfs_root *root = BTRFS_I(inode)->root;
717 	struct extent_io_tree *io_tree;
718 	int ret = 0;
719 
720 again:
721 	while (!list_empty(&async_cow->extents)) {
722 		async_extent = list_entry(async_cow->extents.next,
723 					  struct async_extent, list);
724 		list_del(&async_extent->list);
725 
726 		io_tree = &BTRFS_I(inode)->io_tree;
727 
728 retry:
729 		/* did the compression code fall back to uncompressed IO? */
730 		if (!async_extent->pages) {
731 			int page_started = 0;
732 			unsigned long nr_written = 0;
733 
734 			lock_extent(io_tree, async_extent->start,
735 					 async_extent->start +
736 					 async_extent->ram_size - 1);
737 
738 			/* allocate blocks */
739 			ret = cow_file_range(inode, async_cow->locked_page,
740 					     async_extent->start,
741 					     async_extent->start +
742 					     async_extent->ram_size - 1,
743 					     async_extent->start +
744 					     async_extent->ram_size - 1,
745 					     &page_started, &nr_written, 0,
746 					     NULL);
747 
748 			/* JDM XXX */
749 
750 			/*
751 			 * if page_started, cow_file_range inserted an
752 			 * inline extent and took care of all the unlocking
753 			 * and IO for us.  Otherwise, we need to submit
754 			 * all those pages down to the drive.
755 			 */
756 			if (!page_started && !ret)
757 				extent_write_locked_range(io_tree,
758 						  inode, async_extent->start,
759 						  async_extent->start +
760 						  async_extent->ram_size - 1,
761 						  btrfs_get_extent,
762 						  WB_SYNC_ALL);
763 			else if (ret)
764 				unlock_page(async_cow->locked_page);
765 			kfree(async_extent);
766 			cond_resched();
767 			continue;
768 		}
769 
770 		lock_extent(io_tree, async_extent->start,
771 			    async_extent->start + async_extent->ram_size - 1);
772 
773 		ret = btrfs_reserve_extent(root, async_extent->ram_size,
774 					   async_extent->compressed_size,
775 					   async_extent->compressed_size,
776 					   0, alloc_hint, &ins, 1, 1);
777 		if (ret) {
778 			free_async_extent_pages(async_extent);
779 
780 			if (ret == -ENOSPC) {
781 				unlock_extent(io_tree, async_extent->start,
782 					      async_extent->start +
783 					      async_extent->ram_size - 1);
784 
785 				/*
786 				 * we need to redirty the pages if we decide to
787 				 * fallback to uncompressed IO, otherwise we
788 				 * will not submit these pages down to lower
789 				 * layers.
790 				 */
791 				extent_range_redirty_for_io(inode,
792 						async_extent->start,
793 						async_extent->start +
794 						async_extent->ram_size - 1);
795 
796 				goto retry;
797 			}
798 			goto out_free;
799 		}
800 		/*
801 		 * here we're doing allocation and writeback of the
802 		 * compressed pages
803 		 */
804 		em = create_io_em(inode, async_extent->start,
805 				  async_extent->ram_size, /* len */
806 				  async_extent->start, /* orig_start */
807 				  ins.objectid, /* block_start */
808 				  ins.offset, /* block_len */
809 				  ins.offset, /* orig_block_len */
810 				  async_extent->ram_size, /* ram_bytes */
811 				  async_extent->compress_type,
812 				  BTRFS_ORDERED_COMPRESSED);
813 		if (IS_ERR(em))
814 			/* ret value is not necessary due to void function */
815 			goto out_free_reserve;
816 		free_extent_map(em);
817 
818 		ret = btrfs_add_ordered_extent_compress(inode,
819 						async_extent->start,
820 						ins.objectid,
821 						async_extent->ram_size,
822 						ins.offset,
823 						BTRFS_ORDERED_COMPRESSED,
824 						async_extent->compress_type);
825 		if (ret) {
826 			btrfs_drop_extent_cache(BTRFS_I(inode),
827 						async_extent->start,
828 						async_extent->start +
829 						async_extent->ram_size - 1, 0);
830 			goto out_free_reserve;
831 		}
832 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
833 
834 		/*
835 		 * clear dirty, set writeback and unlock the pages.
836 		 */
837 		extent_clear_unlock_delalloc(inode, async_extent->start,
838 				async_extent->start +
839 				async_extent->ram_size - 1,
840 				async_extent->start +
841 				async_extent->ram_size - 1,
842 				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
843 				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
844 				PAGE_SET_WRITEBACK);
845 		ret = btrfs_submit_compressed_write(inode,
846 				    async_extent->start,
847 				    async_extent->ram_size,
848 				    ins.objectid,
849 				    ins.offset, async_extent->pages,
850 				    async_extent->nr_pages);
851 		if (ret) {
852 			struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
853 			struct page *p = async_extent->pages[0];
854 			const u64 start = async_extent->start;
855 			const u64 end = start + async_extent->ram_size - 1;
856 
857 			p->mapping = inode->i_mapping;
858 			tree->ops->writepage_end_io_hook(p, start, end,
859 							 NULL, 0);
860 			p->mapping = NULL;
861 			extent_clear_unlock_delalloc(inode, start, end, end,
862 						     NULL, 0,
863 						     PAGE_END_WRITEBACK |
864 						     PAGE_SET_ERROR);
865 			free_async_extent_pages(async_extent);
866 		}
867 		alloc_hint = ins.objectid + ins.offset;
868 		kfree(async_extent);
869 		cond_resched();
870 	}
871 	return;
872 out_free_reserve:
873 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
874 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
875 out_free:
876 	extent_clear_unlock_delalloc(inode, async_extent->start,
877 				     async_extent->start +
878 				     async_extent->ram_size - 1,
879 				     async_extent->start +
880 				     async_extent->ram_size - 1,
881 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
882 				     EXTENT_DELALLOC_NEW |
883 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
884 				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
885 				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK |
886 				     PAGE_SET_ERROR);
887 	free_async_extent_pages(async_extent);
888 	kfree(async_extent);
889 	goto again;
890 }
891 
892 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
893 				      u64 num_bytes)
894 {
895 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
896 	struct extent_map *em;
897 	u64 alloc_hint = 0;
898 
899 	read_lock(&em_tree->lock);
900 	em = search_extent_mapping(em_tree, start, num_bytes);
901 	if (em) {
902 		/*
903 		 * if block start isn't an actual block number then find the
904 		 * first block in this inode and use that as a hint.  If that
905 		 * block is also bogus then just don't worry about it.
906 		 */
907 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
908 			free_extent_map(em);
909 			em = search_extent_mapping(em_tree, 0, 0);
910 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
911 				alloc_hint = em->block_start;
912 			if (em)
913 				free_extent_map(em);
914 		} else {
915 			alloc_hint = em->block_start;
916 			free_extent_map(em);
917 		}
918 	}
919 	read_unlock(&em_tree->lock);
920 
921 	return alloc_hint;
922 }
923 
924 /*
925  * when extent_io.c finds a delayed allocation range in the file,
926  * the call backs end up in this code.  The basic idea is to
927  * allocate extents on disk for the range, and create ordered data structs
928  * in ram to track those extents.
929  *
930  * locked_page is the page that writepage had locked already.  We use
931  * it to make sure we don't do extra locks or unlocks.
932  *
933  * *page_started is set to one if we unlock locked_page and do everything
934  * required to start IO on it.  It may be clean and already done with
935  * IO when we return.
936  */
937 static noinline int cow_file_range(struct inode *inode,
938 				   struct page *locked_page,
939 				   u64 start, u64 end, u64 delalloc_end,
940 				   int *page_started, unsigned long *nr_written,
941 				   int unlock, struct btrfs_dedupe_hash *hash)
942 {
943 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
944 	struct btrfs_root *root = BTRFS_I(inode)->root;
945 	u64 alloc_hint = 0;
946 	u64 num_bytes;
947 	unsigned long ram_size;
948 	u64 disk_num_bytes;
949 	u64 cur_alloc_size = 0;
950 	u64 blocksize = fs_info->sectorsize;
951 	struct btrfs_key ins;
952 	struct extent_map *em;
953 	unsigned clear_bits;
954 	unsigned long page_ops;
955 	bool extent_reserved = false;
956 	int ret = 0;
957 
958 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
959 		WARN_ON_ONCE(1);
960 		ret = -EINVAL;
961 		goto out_unlock;
962 	}
963 
964 	num_bytes = ALIGN(end - start + 1, blocksize);
965 	num_bytes = max(blocksize,  num_bytes);
966 	disk_num_bytes = num_bytes;
967 
968 	inode_should_defrag(BTRFS_I(inode), start, end, num_bytes, SZ_64K);
969 
970 	if (start == 0) {
971 		/* lets try to make an inline extent */
972 		ret = cow_file_range_inline(root, inode, start, end, 0,
973 					BTRFS_COMPRESS_NONE, NULL);
974 		if (ret == 0) {
975 			extent_clear_unlock_delalloc(inode, start, end,
976 				     delalloc_end, NULL,
977 				     EXTENT_LOCKED | EXTENT_DELALLOC |
978 				     EXTENT_DELALLOC_NEW |
979 				     EXTENT_DEFRAG, PAGE_UNLOCK |
980 				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
981 				     PAGE_END_WRITEBACK);
982 			btrfs_free_reserved_data_space_noquota(inode, start,
983 						end - start + 1);
984 			*nr_written = *nr_written +
985 			     (end - start + PAGE_SIZE) / PAGE_SIZE;
986 			*page_started = 1;
987 			goto out;
988 		} else if (ret < 0) {
989 			goto out_unlock;
990 		}
991 	}
992 
993 	BUG_ON(disk_num_bytes >
994 	       btrfs_super_total_bytes(fs_info->super_copy));
995 
996 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
997 	btrfs_drop_extent_cache(BTRFS_I(inode), start,
998 			start + num_bytes - 1, 0);
999 
1000 	while (disk_num_bytes > 0) {
1001 		cur_alloc_size = disk_num_bytes;
1002 		ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
1003 					   fs_info->sectorsize, 0, alloc_hint,
1004 					   &ins, 1, 1);
1005 		if (ret < 0)
1006 			goto out_unlock;
1007 		cur_alloc_size = ins.offset;
1008 		extent_reserved = true;
1009 
1010 		ram_size = ins.offset;
1011 		em = create_io_em(inode, start, ins.offset, /* len */
1012 				  start, /* orig_start */
1013 				  ins.objectid, /* block_start */
1014 				  ins.offset, /* block_len */
1015 				  ins.offset, /* orig_block_len */
1016 				  ram_size, /* ram_bytes */
1017 				  BTRFS_COMPRESS_NONE, /* compress_type */
1018 				  BTRFS_ORDERED_REGULAR /* type */);
1019 		if (IS_ERR(em))
1020 			goto out_reserve;
1021 		free_extent_map(em);
1022 
1023 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
1024 					       ram_size, cur_alloc_size, 0);
1025 		if (ret)
1026 			goto out_drop_extent_cache;
1027 
1028 		if (root->root_key.objectid ==
1029 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1030 			ret = btrfs_reloc_clone_csums(inode, start,
1031 						      cur_alloc_size);
1032 			/*
1033 			 * Only drop cache here, and process as normal.
1034 			 *
1035 			 * We must not allow extent_clear_unlock_delalloc()
1036 			 * at out_unlock label to free meta of this ordered
1037 			 * extent, as its meta should be freed by
1038 			 * btrfs_finish_ordered_io().
1039 			 *
1040 			 * So we must continue until @start is increased to
1041 			 * skip current ordered extent.
1042 			 */
1043 			if (ret)
1044 				btrfs_drop_extent_cache(BTRFS_I(inode), start,
1045 						start + ram_size - 1, 0);
1046 		}
1047 
1048 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1049 
1050 		/* we're not doing compressed IO, don't unlock the first
1051 		 * page (which the caller expects to stay locked), don't
1052 		 * clear any dirty bits and don't set any writeback bits
1053 		 *
1054 		 * Do set the Private2 bit so we know this page was properly
1055 		 * setup for writepage
1056 		 */
1057 		page_ops = unlock ? PAGE_UNLOCK : 0;
1058 		page_ops |= PAGE_SET_PRIVATE2;
1059 
1060 		extent_clear_unlock_delalloc(inode, start,
1061 					     start + ram_size - 1,
1062 					     delalloc_end, locked_page,
1063 					     EXTENT_LOCKED | EXTENT_DELALLOC,
1064 					     page_ops);
1065 		if (disk_num_bytes < cur_alloc_size)
1066 			disk_num_bytes = 0;
1067 		else
1068 			disk_num_bytes -= cur_alloc_size;
1069 		num_bytes -= cur_alloc_size;
1070 		alloc_hint = ins.objectid + ins.offset;
1071 		start += cur_alloc_size;
1072 		extent_reserved = false;
1073 
1074 		/*
1075 		 * btrfs_reloc_clone_csums() error, since start is increased
1076 		 * extent_clear_unlock_delalloc() at out_unlock label won't
1077 		 * free metadata of current ordered extent, we're OK to exit.
1078 		 */
1079 		if (ret)
1080 			goto out_unlock;
1081 	}
1082 out:
1083 	return ret;
1084 
1085 out_drop_extent_cache:
1086 	btrfs_drop_extent_cache(BTRFS_I(inode), start, start + ram_size - 1, 0);
1087 out_reserve:
1088 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1089 	btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1090 out_unlock:
1091 	clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1092 		EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1093 	page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
1094 		PAGE_END_WRITEBACK;
1095 	/*
1096 	 * If we reserved an extent for our delalloc range (or a subrange) and
1097 	 * failed to create the respective ordered extent, then it means that
1098 	 * when we reserved the extent we decremented the extent's size from
1099 	 * the data space_info's bytes_may_use counter and incremented the
1100 	 * space_info's bytes_reserved counter by the same amount. We must make
1101 	 * sure extent_clear_unlock_delalloc() does not try to decrement again
1102 	 * the data space_info's bytes_may_use counter, therefore we do not pass
1103 	 * it the flag EXTENT_CLEAR_DATA_RESV.
1104 	 */
1105 	if (extent_reserved) {
1106 		extent_clear_unlock_delalloc(inode, start,
1107 					     start + cur_alloc_size,
1108 					     start + cur_alloc_size,
1109 					     locked_page,
1110 					     clear_bits,
1111 					     page_ops);
1112 		start += cur_alloc_size;
1113 		if (start >= end)
1114 			goto out;
1115 	}
1116 	extent_clear_unlock_delalloc(inode, start, end, delalloc_end,
1117 				     locked_page,
1118 				     clear_bits | EXTENT_CLEAR_DATA_RESV,
1119 				     page_ops);
1120 	goto out;
1121 }
1122 
1123 /*
1124  * work queue call back to started compression on a file and pages
1125  */
1126 static noinline void async_cow_start(struct btrfs_work *work)
1127 {
1128 	struct async_cow *async_cow;
1129 	int num_added = 0;
1130 	async_cow = container_of(work, struct async_cow, work);
1131 
1132 	compress_file_range(async_cow->inode, async_cow->locked_page,
1133 			    async_cow->start, async_cow->end, async_cow,
1134 			    &num_added);
1135 	if (num_added == 0) {
1136 		btrfs_add_delayed_iput(async_cow->inode);
1137 		async_cow->inode = NULL;
1138 	}
1139 }
1140 
1141 /*
1142  * work queue call back to submit previously compressed pages
1143  */
1144 static noinline void async_cow_submit(struct btrfs_work *work)
1145 {
1146 	struct btrfs_fs_info *fs_info;
1147 	struct async_cow *async_cow;
1148 	struct btrfs_root *root;
1149 	unsigned long nr_pages;
1150 
1151 	async_cow = container_of(work, struct async_cow, work);
1152 
1153 	root = async_cow->root;
1154 	fs_info = root->fs_info;
1155 	nr_pages = (async_cow->end - async_cow->start + PAGE_SIZE) >>
1156 		PAGE_SHIFT;
1157 
1158 	/*
1159 	 * atomic_sub_return implies a barrier for waitqueue_active
1160 	 */
1161 	if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1162 	    5 * SZ_1M &&
1163 	    waitqueue_active(&fs_info->async_submit_wait))
1164 		wake_up(&fs_info->async_submit_wait);
1165 
1166 	if (async_cow->inode)
1167 		submit_compressed_extents(async_cow->inode, async_cow);
1168 }
1169 
1170 static noinline void async_cow_free(struct btrfs_work *work)
1171 {
1172 	struct async_cow *async_cow;
1173 	async_cow = container_of(work, struct async_cow, work);
1174 	if (async_cow->inode)
1175 		btrfs_add_delayed_iput(async_cow->inode);
1176 	kfree(async_cow);
1177 }
1178 
1179 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1180 				u64 start, u64 end, int *page_started,
1181 				unsigned long *nr_written)
1182 {
1183 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1184 	struct async_cow *async_cow;
1185 	struct btrfs_root *root = BTRFS_I(inode)->root;
1186 	unsigned long nr_pages;
1187 	u64 cur_end;
1188 
1189 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1190 			 1, 0, NULL, GFP_NOFS);
1191 	while (start < end) {
1192 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1193 		BUG_ON(!async_cow); /* -ENOMEM */
1194 		async_cow->inode = igrab(inode);
1195 		async_cow->root = root;
1196 		async_cow->locked_page = locked_page;
1197 		async_cow->start = start;
1198 
1199 		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS &&
1200 		    !btrfs_test_opt(fs_info, FORCE_COMPRESS))
1201 			cur_end = end;
1202 		else
1203 			cur_end = min(end, start + SZ_512K - 1);
1204 
1205 		async_cow->end = cur_end;
1206 		INIT_LIST_HEAD(&async_cow->extents);
1207 
1208 		btrfs_init_work(&async_cow->work,
1209 				btrfs_delalloc_helper,
1210 				async_cow_start, async_cow_submit,
1211 				async_cow_free);
1212 
1213 		nr_pages = (cur_end - start + PAGE_SIZE) >>
1214 			PAGE_SHIFT;
1215 		atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1216 
1217 		btrfs_queue_work(fs_info->delalloc_workers, &async_cow->work);
1218 
1219 		while (atomic_read(&fs_info->async_submit_draining) &&
1220 		       atomic_read(&fs_info->async_delalloc_pages)) {
1221 			wait_event(fs_info->async_submit_wait,
1222 				   (atomic_read(&fs_info->async_delalloc_pages) ==
1223 				    0));
1224 		}
1225 
1226 		*nr_written += nr_pages;
1227 		start = cur_end + 1;
1228 	}
1229 	*page_started = 1;
1230 	return 0;
1231 }
1232 
1233 static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
1234 					u64 bytenr, u64 num_bytes)
1235 {
1236 	int ret;
1237 	struct btrfs_ordered_sum *sums;
1238 	LIST_HEAD(list);
1239 
1240 	ret = btrfs_lookup_csums_range(fs_info->csum_root, bytenr,
1241 				       bytenr + num_bytes - 1, &list, 0);
1242 	if (ret == 0 && list_empty(&list))
1243 		return 0;
1244 
1245 	while (!list_empty(&list)) {
1246 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1247 		list_del(&sums->list);
1248 		kfree(sums);
1249 	}
1250 	return 1;
1251 }
1252 
1253 /*
1254  * when nowcow writeback call back.  This checks for snapshots or COW copies
1255  * of the extents that exist in the file, and COWs the file as required.
1256  *
1257  * If no cow copies or snapshots exist, we write directly to the existing
1258  * blocks on disk
1259  */
1260 static noinline int run_delalloc_nocow(struct inode *inode,
1261 				       struct page *locked_page,
1262 			      u64 start, u64 end, int *page_started, int force,
1263 			      unsigned long *nr_written)
1264 {
1265 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1266 	struct btrfs_root *root = BTRFS_I(inode)->root;
1267 	struct extent_buffer *leaf;
1268 	struct btrfs_path *path;
1269 	struct btrfs_file_extent_item *fi;
1270 	struct btrfs_key found_key;
1271 	struct extent_map *em;
1272 	u64 cow_start;
1273 	u64 cur_offset;
1274 	u64 extent_end;
1275 	u64 extent_offset;
1276 	u64 disk_bytenr;
1277 	u64 num_bytes;
1278 	u64 disk_num_bytes;
1279 	u64 ram_bytes;
1280 	int extent_type;
1281 	int ret, err;
1282 	int type;
1283 	int nocow;
1284 	int check_prev = 1;
1285 	bool nolock;
1286 	u64 ino = btrfs_ino(BTRFS_I(inode));
1287 
1288 	path = btrfs_alloc_path();
1289 	if (!path) {
1290 		extent_clear_unlock_delalloc(inode, start, end, end,
1291 					     locked_page,
1292 					     EXTENT_LOCKED | EXTENT_DELALLOC |
1293 					     EXTENT_DO_ACCOUNTING |
1294 					     EXTENT_DEFRAG, PAGE_UNLOCK |
1295 					     PAGE_CLEAR_DIRTY |
1296 					     PAGE_SET_WRITEBACK |
1297 					     PAGE_END_WRITEBACK);
1298 		return -ENOMEM;
1299 	}
1300 
1301 	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
1302 
1303 	cow_start = (u64)-1;
1304 	cur_offset = start;
1305 	while (1) {
1306 		ret = btrfs_lookup_file_extent(NULL, root, path, ino,
1307 					       cur_offset, 0);
1308 		if (ret < 0)
1309 			goto error;
1310 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1311 			leaf = path->nodes[0];
1312 			btrfs_item_key_to_cpu(leaf, &found_key,
1313 					      path->slots[0] - 1);
1314 			if (found_key.objectid == ino &&
1315 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1316 				path->slots[0]--;
1317 		}
1318 		check_prev = 0;
1319 next_slot:
1320 		leaf = path->nodes[0];
1321 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1322 			ret = btrfs_next_leaf(root, path);
1323 			if (ret < 0)
1324 				goto error;
1325 			if (ret > 0)
1326 				break;
1327 			leaf = path->nodes[0];
1328 		}
1329 
1330 		nocow = 0;
1331 		disk_bytenr = 0;
1332 		num_bytes = 0;
1333 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1334 
1335 		if (found_key.objectid > ino)
1336 			break;
1337 		if (WARN_ON_ONCE(found_key.objectid < ino) ||
1338 		    found_key.type < BTRFS_EXTENT_DATA_KEY) {
1339 			path->slots[0]++;
1340 			goto next_slot;
1341 		}
1342 		if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
1343 		    found_key.offset > end)
1344 			break;
1345 
1346 		if (found_key.offset > cur_offset) {
1347 			extent_end = found_key.offset;
1348 			extent_type = 0;
1349 			goto out_check;
1350 		}
1351 
1352 		fi = btrfs_item_ptr(leaf, path->slots[0],
1353 				    struct btrfs_file_extent_item);
1354 		extent_type = btrfs_file_extent_type(leaf, fi);
1355 
1356 		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1357 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1358 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1359 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1360 			extent_offset = btrfs_file_extent_offset(leaf, fi);
1361 			extent_end = found_key.offset +
1362 				btrfs_file_extent_num_bytes(leaf, fi);
1363 			disk_num_bytes =
1364 				btrfs_file_extent_disk_num_bytes(leaf, fi);
1365 			if (extent_end <= start) {
1366 				path->slots[0]++;
1367 				goto next_slot;
1368 			}
1369 			if (disk_bytenr == 0)
1370 				goto out_check;
1371 			if (btrfs_file_extent_compression(leaf, fi) ||
1372 			    btrfs_file_extent_encryption(leaf, fi) ||
1373 			    btrfs_file_extent_other_encoding(leaf, fi))
1374 				goto out_check;
1375 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1376 				goto out_check;
1377 			if (btrfs_extent_readonly(fs_info, disk_bytenr))
1378 				goto out_check;
1379 			if (btrfs_cross_ref_exist(root, ino,
1380 						  found_key.offset -
1381 						  extent_offset, disk_bytenr))
1382 				goto out_check;
1383 			disk_bytenr += extent_offset;
1384 			disk_bytenr += cur_offset - found_key.offset;
1385 			num_bytes = min(end + 1, extent_end) - cur_offset;
1386 			/*
1387 			 * if there are pending snapshots for this root,
1388 			 * we fall into common COW way.
1389 			 */
1390 			if (!nolock) {
1391 				err = btrfs_start_write_no_snapshoting(root);
1392 				if (!err)
1393 					goto out_check;
1394 			}
1395 			/*
1396 			 * force cow if csum exists in the range.
1397 			 * this ensure that csum for a given extent are
1398 			 * either valid or do not exist.
1399 			 */
1400 			if (csum_exist_in_range(fs_info, disk_bytenr,
1401 						num_bytes)) {
1402 				if (!nolock)
1403 					btrfs_end_write_no_snapshoting(root);
1404 				goto out_check;
1405 			}
1406 			if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
1407 				if (!nolock)
1408 					btrfs_end_write_no_snapshoting(root);
1409 				goto out_check;
1410 			}
1411 			nocow = 1;
1412 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1413 			extent_end = found_key.offset +
1414 				btrfs_file_extent_inline_len(leaf,
1415 						     path->slots[0], fi);
1416 			extent_end = ALIGN(extent_end,
1417 					   fs_info->sectorsize);
1418 		} else {
1419 			BUG_ON(1);
1420 		}
1421 out_check:
1422 		if (extent_end <= start) {
1423 			path->slots[0]++;
1424 			if (!nolock && nocow)
1425 				btrfs_end_write_no_snapshoting(root);
1426 			if (nocow)
1427 				btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1428 			goto next_slot;
1429 		}
1430 		if (!nocow) {
1431 			if (cow_start == (u64)-1)
1432 				cow_start = cur_offset;
1433 			cur_offset = extent_end;
1434 			if (cur_offset > end)
1435 				break;
1436 			path->slots[0]++;
1437 			goto next_slot;
1438 		}
1439 
1440 		btrfs_release_path(path);
1441 		if (cow_start != (u64)-1) {
1442 			ret = cow_file_range(inode, locked_page,
1443 					     cow_start, found_key.offset - 1,
1444 					     end, page_started, nr_written, 1,
1445 					     NULL);
1446 			if (ret) {
1447 				if (!nolock && nocow)
1448 					btrfs_end_write_no_snapshoting(root);
1449 				if (nocow)
1450 					btrfs_dec_nocow_writers(fs_info,
1451 								disk_bytenr);
1452 				goto error;
1453 			}
1454 			cow_start = (u64)-1;
1455 		}
1456 
1457 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1458 			u64 orig_start = found_key.offset - extent_offset;
1459 
1460 			em = create_io_em(inode, cur_offset, num_bytes,
1461 					  orig_start,
1462 					  disk_bytenr, /* block_start */
1463 					  num_bytes, /* block_len */
1464 					  disk_num_bytes, /* orig_block_len */
1465 					  ram_bytes, BTRFS_COMPRESS_NONE,
1466 					  BTRFS_ORDERED_PREALLOC);
1467 			if (IS_ERR(em)) {
1468 				if (!nolock && nocow)
1469 					btrfs_end_write_no_snapshoting(root);
1470 				if (nocow)
1471 					btrfs_dec_nocow_writers(fs_info,
1472 								disk_bytenr);
1473 				ret = PTR_ERR(em);
1474 				goto error;
1475 			}
1476 			free_extent_map(em);
1477 		}
1478 
1479 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1480 			type = BTRFS_ORDERED_PREALLOC;
1481 		} else {
1482 			type = BTRFS_ORDERED_NOCOW;
1483 		}
1484 
1485 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1486 					       num_bytes, num_bytes, type);
1487 		if (nocow)
1488 			btrfs_dec_nocow_writers(fs_info, disk_bytenr);
1489 		BUG_ON(ret); /* -ENOMEM */
1490 
1491 		if (root->root_key.objectid ==
1492 		    BTRFS_DATA_RELOC_TREE_OBJECTID)
1493 			/*
1494 			 * Error handled later, as we must prevent
1495 			 * extent_clear_unlock_delalloc() in error handler
1496 			 * from freeing metadata of created ordered extent.
1497 			 */
1498 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1499 						      num_bytes);
1500 
1501 		extent_clear_unlock_delalloc(inode, cur_offset,
1502 					     cur_offset + num_bytes - 1, end,
1503 					     locked_page, EXTENT_LOCKED |
1504 					     EXTENT_DELALLOC |
1505 					     EXTENT_CLEAR_DATA_RESV,
1506 					     PAGE_UNLOCK | PAGE_SET_PRIVATE2);
1507 
1508 		if (!nolock && nocow)
1509 			btrfs_end_write_no_snapshoting(root);
1510 		cur_offset = extent_end;
1511 
1512 		/*
1513 		 * btrfs_reloc_clone_csums() error, now we're OK to call error
1514 		 * handler, as metadata for created ordered extent will only
1515 		 * be freed by btrfs_finish_ordered_io().
1516 		 */
1517 		if (ret)
1518 			goto error;
1519 		if (cur_offset > end)
1520 			break;
1521 	}
1522 	btrfs_release_path(path);
1523 
1524 	if (cur_offset <= end && cow_start == (u64)-1) {
1525 		cow_start = cur_offset;
1526 		cur_offset = end;
1527 	}
1528 
1529 	if (cow_start != (u64)-1) {
1530 		ret = cow_file_range(inode, locked_page, cow_start, end, end,
1531 				     page_started, nr_written, 1, NULL);
1532 		if (ret)
1533 			goto error;
1534 	}
1535 
1536 error:
1537 	if (ret && cur_offset < end)
1538 		extent_clear_unlock_delalloc(inode, cur_offset, end, end,
1539 					     locked_page, EXTENT_LOCKED |
1540 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1541 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1542 					     PAGE_CLEAR_DIRTY |
1543 					     PAGE_SET_WRITEBACK |
1544 					     PAGE_END_WRITEBACK);
1545 	btrfs_free_path(path);
1546 	return ret;
1547 }
1548 
1549 static inline int need_force_cow(struct inode *inode, u64 start, u64 end)
1550 {
1551 
1552 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
1553 	    !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC))
1554 		return 0;
1555 
1556 	/*
1557 	 * @defrag_bytes is a hint value, no spinlock held here,
1558 	 * if is not zero, it means the file is defragging.
1559 	 * Force cow if given extent needs to be defragged.
1560 	 */
1561 	if (BTRFS_I(inode)->defrag_bytes &&
1562 	    test_range_bit(&BTRFS_I(inode)->io_tree, start, end,
1563 			   EXTENT_DEFRAG, 0, NULL))
1564 		return 1;
1565 
1566 	return 0;
1567 }
1568 
1569 /*
1570  * extent_io.c call back to do delayed allocation processing
1571  */
1572 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1573 			      u64 start, u64 end, int *page_started,
1574 			      unsigned long *nr_written)
1575 {
1576 	int ret;
1577 	int force_cow = need_force_cow(inode, start, end);
1578 
1579 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) {
1580 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1581 					 page_started, 1, nr_written);
1582 	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
1583 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1584 					 page_started, 0, nr_written);
1585 	} else if (!inode_need_compress(inode)) {
1586 		ret = cow_file_range(inode, locked_page, start, end, end,
1587 				      page_started, nr_written, 1, NULL);
1588 	} else {
1589 		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1590 			&BTRFS_I(inode)->runtime_flags);
1591 		ret = cow_file_range_async(inode, locked_page, start, end,
1592 					   page_started, nr_written);
1593 	}
1594 	if (ret)
1595 		btrfs_cleanup_ordered_extents(inode, start, end - start + 1);
1596 	return ret;
1597 }
1598 
1599 static void btrfs_split_extent_hook(struct inode *inode,
1600 				    struct extent_state *orig, u64 split)
1601 {
1602 	u64 size;
1603 
1604 	/* not delalloc, ignore it */
1605 	if (!(orig->state & EXTENT_DELALLOC))
1606 		return;
1607 
1608 	size = orig->end - orig->start + 1;
1609 	if (size > BTRFS_MAX_EXTENT_SIZE) {
1610 		u32 num_extents;
1611 		u64 new_size;
1612 
1613 		/*
1614 		 * See the explanation in btrfs_merge_extent_hook, the same
1615 		 * applies here, just in reverse.
1616 		 */
1617 		new_size = orig->end - split + 1;
1618 		num_extents = count_max_extents(new_size);
1619 		new_size = split - orig->start;
1620 		num_extents += count_max_extents(new_size);
1621 		if (count_max_extents(size) >= num_extents)
1622 			return;
1623 	}
1624 
1625 	spin_lock(&BTRFS_I(inode)->lock);
1626 	BTRFS_I(inode)->outstanding_extents++;
1627 	spin_unlock(&BTRFS_I(inode)->lock);
1628 }
1629 
1630 /*
1631  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1632  * extents so we can keep track of new extents that are just merged onto old
1633  * extents, such as when we are doing sequential writes, so we can properly
1634  * account for the metadata space we'll need.
1635  */
1636 static void btrfs_merge_extent_hook(struct inode *inode,
1637 				    struct extent_state *new,
1638 				    struct extent_state *other)
1639 {
1640 	u64 new_size, old_size;
1641 	u32 num_extents;
1642 
1643 	/* not delalloc, ignore it */
1644 	if (!(other->state & EXTENT_DELALLOC))
1645 		return;
1646 
1647 	if (new->start > other->start)
1648 		new_size = new->end - other->start + 1;
1649 	else
1650 		new_size = other->end - new->start + 1;
1651 
1652 	/* we're not bigger than the max, unreserve the space and go */
1653 	if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1654 		spin_lock(&BTRFS_I(inode)->lock);
1655 		BTRFS_I(inode)->outstanding_extents--;
1656 		spin_unlock(&BTRFS_I(inode)->lock);
1657 		return;
1658 	}
1659 
1660 	/*
1661 	 * We have to add up either side to figure out how many extents were
1662 	 * accounted for before we merged into one big extent.  If the number of
1663 	 * extents we accounted for is <= the amount we need for the new range
1664 	 * then we can return, otherwise drop.  Think of it like this
1665 	 *
1666 	 * [ 4k][MAX_SIZE]
1667 	 *
1668 	 * So we've grown the extent by a MAX_SIZE extent, this would mean we
1669 	 * need 2 outstanding extents, on one side we have 1 and the other side
1670 	 * we have 1 so they are == and we can return.  But in this case
1671 	 *
1672 	 * [MAX_SIZE+4k][MAX_SIZE+4k]
1673 	 *
1674 	 * Each range on their own accounts for 2 extents, but merged together
1675 	 * they are only 3 extents worth of accounting, so we need to drop in
1676 	 * this case.
1677 	 */
1678 	old_size = other->end - other->start + 1;
1679 	num_extents = count_max_extents(old_size);
1680 	old_size = new->end - new->start + 1;
1681 	num_extents += count_max_extents(old_size);
1682 	if (count_max_extents(new_size) >= num_extents)
1683 		return;
1684 
1685 	spin_lock(&BTRFS_I(inode)->lock);
1686 	BTRFS_I(inode)->outstanding_extents--;
1687 	spin_unlock(&BTRFS_I(inode)->lock);
1688 }
1689 
1690 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1691 				      struct inode *inode)
1692 {
1693 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1694 
1695 	spin_lock(&root->delalloc_lock);
1696 	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1697 		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1698 			      &root->delalloc_inodes);
1699 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1700 			&BTRFS_I(inode)->runtime_flags);
1701 		root->nr_delalloc_inodes++;
1702 		if (root->nr_delalloc_inodes == 1) {
1703 			spin_lock(&fs_info->delalloc_root_lock);
1704 			BUG_ON(!list_empty(&root->delalloc_root));
1705 			list_add_tail(&root->delalloc_root,
1706 				      &fs_info->delalloc_roots);
1707 			spin_unlock(&fs_info->delalloc_root_lock);
1708 		}
1709 	}
1710 	spin_unlock(&root->delalloc_lock);
1711 }
1712 
1713 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1714 				     struct btrfs_inode *inode)
1715 {
1716 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1717 
1718 	spin_lock(&root->delalloc_lock);
1719 	if (!list_empty(&inode->delalloc_inodes)) {
1720 		list_del_init(&inode->delalloc_inodes);
1721 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1722 			  &inode->runtime_flags);
1723 		root->nr_delalloc_inodes--;
1724 		if (!root->nr_delalloc_inodes) {
1725 			spin_lock(&fs_info->delalloc_root_lock);
1726 			BUG_ON(list_empty(&root->delalloc_root));
1727 			list_del_init(&root->delalloc_root);
1728 			spin_unlock(&fs_info->delalloc_root_lock);
1729 		}
1730 	}
1731 	spin_unlock(&root->delalloc_lock);
1732 }
1733 
1734 /*
1735  * extent_io.c set_bit_hook, used to track delayed allocation
1736  * bytes in this file, and to maintain the list of inodes that
1737  * have pending delalloc work to be done.
1738  */
1739 static void btrfs_set_bit_hook(struct inode *inode,
1740 			       struct extent_state *state, unsigned *bits)
1741 {
1742 
1743 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1744 
1745 	if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
1746 		WARN_ON(1);
1747 	/*
1748 	 * set_bit and clear bit hooks normally require _irqsave/restore
1749 	 * but in this case, we are only testing for the DELALLOC
1750 	 * bit, which is only set or cleared with irqs on
1751 	 */
1752 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1753 		struct btrfs_root *root = BTRFS_I(inode)->root;
1754 		u64 len = state->end + 1 - state->start;
1755 		bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
1756 
1757 		if (*bits & EXTENT_FIRST_DELALLOC) {
1758 			*bits &= ~EXTENT_FIRST_DELALLOC;
1759 		} else {
1760 			spin_lock(&BTRFS_I(inode)->lock);
1761 			BTRFS_I(inode)->outstanding_extents++;
1762 			spin_unlock(&BTRFS_I(inode)->lock);
1763 		}
1764 
1765 		/* For sanity tests */
1766 		if (btrfs_is_testing(fs_info))
1767 			return;
1768 
1769 		__percpu_counter_add(&fs_info->delalloc_bytes, len,
1770 				     fs_info->delalloc_batch);
1771 		spin_lock(&BTRFS_I(inode)->lock);
1772 		BTRFS_I(inode)->delalloc_bytes += len;
1773 		if (*bits & EXTENT_DEFRAG)
1774 			BTRFS_I(inode)->defrag_bytes += len;
1775 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1776 					 &BTRFS_I(inode)->runtime_flags))
1777 			btrfs_add_delalloc_inodes(root, inode);
1778 		spin_unlock(&BTRFS_I(inode)->lock);
1779 	}
1780 
1781 	if (!(state->state & EXTENT_DELALLOC_NEW) &&
1782 	    (*bits & EXTENT_DELALLOC_NEW)) {
1783 		spin_lock(&BTRFS_I(inode)->lock);
1784 		BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
1785 			state->start;
1786 		spin_unlock(&BTRFS_I(inode)->lock);
1787 	}
1788 }
1789 
1790 /*
1791  * extent_io.c clear_bit_hook, see set_bit_hook for why
1792  */
1793 static void btrfs_clear_bit_hook(struct btrfs_inode *inode,
1794 				 struct extent_state *state,
1795 				 unsigned *bits)
1796 {
1797 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
1798 	u64 len = state->end + 1 - state->start;
1799 	u32 num_extents = count_max_extents(len);
1800 
1801 	spin_lock(&inode->lock);
1802 	if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
1803 		inode->defrag_bytes -= len;
1804 	spin_unlock(&inode->lock);
1805 
1806 	/*
1807 	 * set_bit and clear bit hooks normally require _irqsave/restore
1808 	 * but in this case, we are only testing for the DELALLOC
1809 	 * bit, which is only set or cleared with irqs on
1810 	 */
1811 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1812 		struct btrfs_root *root = inode->root;
1813 		bool do_list = !btrfs_is_free_space_inode(inode);
1814 
1815 		if (*bits & EXTENT_FIRST_DELALLOC) {
1816 			*bits &= ~EXTENT_FIRST_DELALLOC;
1817 		} else if (!(*bits & EXTENT_CLEAR_META_RESV)) {
1818 			spin_lock(&inode->lock);
1819 			inode->outstanding_extents -= num_extents;
1820 			spin_unlock(&inode->lock);
1821 		}
1822 
1823 		/*
1824 		 * We don't reserve metadata space for space cache inodes so we
1825 		 * don't need to call dellalloc_release_metadata if there is an
1826 		 * error.
1827 		 */
1828 		if (*bits & EXTENT_CLEAR_META_RESV &&
1829 		    root != fs_info->tree_root)
1830 			btrfs_delalloc_release_metadata(inode, len);
1831 
1832 		/* For sanity tests. */
1833 		if (btrfs_is_testing(fs_info))
1834 			return;
1835 
1836 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID &&
1837 		    do_list && !(state->state & EXTENT_NORESERVE) &&
1838 		    (*bits & EXTENT_CLEAR_DATA_RESV))
1839 			btrfs_free_reserved_data_space_noquota(
1840 					&inode->vfs_inode,
1841 					state->start, len);
1842 
1843 		__percpu_counter_add(&fs_info->delalloc_bytes, -len,
1844 				     fs_info->delalloc_batch);
1845 		spin_lock(&inode->lock);
1846 		inode->delalloc_bytes -= len;
1847 		if (do_list && inode->delalloc_bytes == 0 &&
1848 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1849 					&inode->runtime_flags))
1850 			btrfs_del_delalloc_inode(root, inode);
1851 		spin_unlock(&inode->lock);
1852 	}
1853 
1854 	if ((state->state & EXTENT_DELALLOC_NEW) &&
1855 	    (*bits & EXTENT_DELALLOC_NEW)) {
1856 		spin_lock(&inode->lock);
1857 		ASSERT(inode->new_delalloc_bytes >= len);
1858 		inode->new_delalloc_bytes -= len;
1859 		spin_unlock(&inode->lock);
1860 	}
1861 }
1862 
1863 /*
1864  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1865  * we don't create bios that span stripes or chunks
1866  *
1867  * return 1 if page cannot be merged to bio
1868  * return 0 if page can be merged to bio
1869  * return error otherwise
1870  */
1871 int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1872 			 size_t size, struct bio *bio,
1873 			 unsigned long bio_flags)
1874 {
1875 	struct inode *inode = page->mapping->host;
1876 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1877 	u64 logical = (u64)bio->bi_iter.bi_sector << 9;
1878 	u64 length = 0;
1879 	u64 map_length;
1880 	int ret;
1881 
1882 	if (bio_flags & EXTENT_BIO_COMPRESSED)
1883 		return 0;
1884 
1885 	length = bio->bi_iter.bi_size;
1886 	map_length = length;
1887 	ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
1888 			      NULL, 0);
1889 	if (ret < 0)
1890 		return ret;
1891 	if (map_length < length + size)
1892 		return 1;
1893 	return 0;
1894 }
1895 
1896 /*
1897  * in order to insert checksums into the metadata in large chunks,
1898  * we wait until bio submission time.   All the pages in the bio are
1899  * checksummed and sums are attached onto the ordered extent record.
1900  *
1901  * At IO completion time the cums attached on the ordered extent record
1902  * are inserted into the btree
1903  */
1904 static int __btrfs_submit_bio_start(struct inode *inode, struct bio *bio,
1905 				    int mirror_num, unsigned long bio_flags,
1906 				    u64 bio_offset)
1907 {
1908 	int ret = 0;
1909 
1910 	ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1911 	BUG_ON(ret); /* -ENOMEM */
1912 	return 0;
1913 }
1914 
1915 /*
1916  * in order to insert checksums into the metadata in large chunks,
1917  * we wait until bio submission time.   All the pages in the bio are
1918  * checksummed and sums are attached onto the ordered extent record.
1919  *
1920  * At IO completion time the cums attached on the ordered extent record
1921  * are inserted into the btree
1922  */
1923 static int __btrfs_submit_bio_done(struct inode *inode, struct bio *bio,
1924 			  int mirror_num, unsigned long bio_flags,
1925 			  u64 bio_offset)
1926 {
1927 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1928 	int ret;
1929 
1930 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 1);
1931 	if (ret) {
1932 		bio->bi_error = ret;
1933 		bio_endio(bio);
1934 	}
1935 	return ret;
1936 }
1937 
1938 /*
1939  * extent_io.c submission hook. This does the right thing for csum calculation
1940  * on write, or reading the csums from the tree before a read
1941  */
1942 static int btrfs_submit_bio_hook(struct inode *inode, struct bio *bio,
1943 			  int mirror_num, unsigned long bio_flags,
1944 			  u64 bio_offset)
1945 {
1946 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1947 	struct btrfs_root *root = BTRFS_I(inode)->root;
1948 	enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
1949 	int ret = 0;
1950 	int skip_sum;
1951 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1952 
1953 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1954 
1955 	if (btrfs_is_free_space_inode(BTRFS_I(inode)))
1956 		metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
1957 
1958 	if (bio_op(bio) != REQ_OP_WRITE) {
1959 		ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
1960 		if (ret)
1961 			goto out;
1962 
1963 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1964 			ret = btrfs_submit_compressed_read(inode, bio,
1965 							   mirror_num,
1966 							   bio_flags);
1967 			goto out;
1968 		} else if (!skip_sum) {
1969 			ret = btrfs_lookup_bio_sums(inode, bio, NULL);
1970 			if (ret)
1971 				goto out;
1972 		}
1973 		goto mapit;
1974 	} else if (async && !skip_sum) {
1975 		/* csum items have already been cloned */
1976 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1977 			goto mapit;
1978 		/* we're doing a write, do the async checksumming */
1979 		ret = btrfs_wq_submit_bio(fs_info, inode, bio, mirror_num,
1980 					  bio_flags, bio_offset,
1981 					  __btrfs_submit_bio_start,
1982 					  __btrfs_submit_bio_done);
1983 		goto out;
1984 	} else if (!skip_sum) {
1985 		ret = btrfs_csum_one_bio(inode, bio, 0, 0);
1986 		if (ret)
1987 			goto out;
1988 	}
1989 
1990 mapit:
1991 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
1992 
1993 out:
1994 	if (ret < 0) {
1995 		bio->bi_error = ret;
1996 		bio_endio(bio);
1997 	}
1998 	return ret;
1999 }
2000 
2001 /*
2002  * given a list of ordered sums record them in the inode.  This happens
2003  * at IO completion time based on sums calculated at bio submission time.
2004  */
2005 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
2006 			     struct inode *inode, struct list_head *list)
2007 {
2008 	struct btrfs_ordered_sum *sum;
2009 
2010 	list_for_each_entry(sum, list, list) {
2011 		trans->adding_csums = 1;
2012 		btrfs_csum_file_blocks(trans,
2013 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
2014 		trans->adding_csums = 0;
2015 	}
2016 	return 0;
2017 }
2018 
2019 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2020 			      struct extent_state **cached_state, int dedupe)
2021 {
2022 	WARN_ON((end & (PAGE_SIZE - 1)) == 0);
2023 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
2024 				   cached_state);
2025 }
2026 
2027 /* see btrfs_writepage_start_hook for details on why this is required */
2028 struct btrfs_writepage_fixup {
2029 	struct page *page;
2030 	struct btrfs_work work;
2031 };
2032 
2033 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2034 {
2035 	struct btrfs_writepage_fixup *fixup;
2036 	struct btrfs_ordered_extent *ordered;
2037 	struct extent_state *cached_state = NULL;
2038 	struct page *page;
2039 	struct inode *inode;
2040 	u64 page_start;
2041 	u64 page_end;
2042 	int ret;
2043 
2044 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
2045 	page = fixup->page;
2046 again:
2047 	lock_page(page);
2048 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
2049 		ClearPageChecked(page);
2050 		goto out_page;
2051 	}
2052 
2053 	inode = page->mapping->host;
2054 	page_start = page_offset(page);
2055 	page_end = page_offset(page) + PAGE_SIZE - 1;
2056 
2057 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
2058 			 &cached_state);
2059 
2060 	/* already ordered? We're done */
2061 	if (PagePrivate2(page))
2062 		goto out;
2063 
2064 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
2065 					PAGE_SIZE);
2066 	if (ordered) {
2067 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
2068 				     page_end, &cached_state, GFP_NOFS);
2069 		unlock_page(page);
2070 		btrfs_start_ordered_extent(inode, ordered, 1);
2071 		btrfs_put_ordered_extent(ordered);
2072 		goto again;
2073 	}
2074 
2075 	ret = btrfs_delalloc_reserve_space(inode, page_start,
2076 					   PAGE_SIZE);
2077 	if (ret) {
2078 		mapping_set_error(page->mapping, ret);
2079 		end_extent_writepage(page, ret, page_start, page_end);
2080 		ClearPageChecked(page);
2081 		goto out;
2082 	 }
2083 
2084 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state,
2085 				  0);
2086 	ClearPageChecked(page);
2087 	set_page_dirty(page);
2088 out:
2089 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
2090 			     &cached_state, GFP_NOFS);
2091 out_page:
2092 	unlock_page(page);
2093 	put_page(page);
2094 	kfree(fixup);
2095 }
2096 
2097 /*
2098  * There are a few paths in the higher layers of the kernel that directly
2099  * set the page dirty bit without asking the filesystem if it is a
2100  * good idea.  This causes problems because we want to make sure COW
2101  * properly happens and the data=ordered rules are followed.
2102  *
2103  * In our case any range that doesn't have the ORDERED bit set
2104  * hasn't been properly setup for IO.  We kick off an async process
2105  * to fix it up.  The async helper will wait for ordered extents, set
2106  * the delalloc bit and make it safe to write the page.
2107  */
2108 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
2109 {
2110 	struct inode *inode = page->mapping->host;
2111 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2112 	struct btrfs_writepage_fixup *fixup;
2113 
2114 	/* this page is properly in the ordered list */
2115 	if (TestClearPagePrivate2(page))
2116 		return 0;
2117 
2118 	if (PageChecked(page))
2119 		return -EAGAIN;
2120 
2121 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2122 	if (!fixup)
2123 		return -EAGAIN;
2124 
2125 	SetPageChecked(page);
2126 	get_page(page);
2127 	btrfs_init_work(&fixup->work, btrfs_fixup_helper,
2128 			btrfs_writepage_fixup_worker, NULL, NULL);
2129 	fixup->page = page;
2130 	btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2131 	return -EBUSY;
2132 }
2133 
2134 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2135 				       struct inode *inode, u64 file_pos,
2136 				       u64 disk_bytenr, u64 disk_num_bytes,
2137 				       u64 num_bytes, u64 ram_bytes,
2138 				       u8 compression, u8 encryption,
2139 				       u16 other_encoding, int extent_type)
2140 {
2141 	struct btrfs_root *root = BTRFS_I(inode)->root;
2142 	struct btrfs_file_extent_item *fi;
2143 	struct btrfs_path *path;
2144 	struct extent_buffer *leaf;
2145 	struct btrfs_key ins;
2146 	int extent_inserted = 0;
2147 	int ret;
2148 
2149 	path = btrfs_alloc_path();
2150 	if (!path)
2151 		return -ENOMEM;
2152 
2153 	/*
2154 	 * we may be replacing one extent in the tree with another.
2155 	 * The new extent is pinned in the extent map, and we don't want
2156 	 * to drop it from the cache until it is completely in the btree.
2157 	 *
2158 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
2159 	 * the caller is expected to unpin it and allow it to be merged
2160 	 * with the others.
2161 	 */
2162 	ret = __btrfs_drop_extents(trans, root, inode, path, file_pos,
2163 				   file_pos + num_bytes, NULL, 0,
2164 				   1, sizeof(*fi), &extent_inserted);
2165 	if (ret)
2166 		goto out;
2167 
2168 	if (!extent_inserted) {
2169 		ins.objectid = btrfs_ino(BTRFS_I(inode));
2170 		ins.offset = file_pos;
2171 		ins.type = BTRFS_EXTENT_DATA_KEY;
2172 
2173 		path->leave_spinning = 1;
2174 		ret = btrfs_insert_empty_item(trans, root, path, &ins,
2175 					      sizeof(*fi));
2176 		if (ret)
2177 			goto out;
2178 	}
2179 	leaf = path->nodes[0];
2180 	fi = btrfs_item_ptr(leaf, path->slots[0],
2181 			    struct btrfs_file_extent_item);
2182 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2183 	btrfs_set_file_extent_type(leaf, fi, extent_type);
2184 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
2185 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
2186 	btrfs_set_file_extent_offset(leaf, fi, 0);
2187 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2188 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
2189 	btrfs_set_file_extent_compression(leaf, fi, compression);
2190 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
2191 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
2192 
2193 	btrfs_mark_buffer_dirty(leaf);
2194 	btrfs_release_path(path);
2195 
2196 	inode_add_bytes(inode, num_bytes);
2197 
2198 	ins.objectid = disk_bytenr;
2199 	ins.offset = disk_num_bytes;
2200 	ins.type = BTRFS_EXTENT_ITEM_KEY;
2201 	ret = btrfs_alloc_reserved_file_extent(trans, root->root_key.objectid,
2202 			btrfs_ino(BTRFS_I(inode)), file_pos, ram_bytes, &ins);
2203 	/*
2204 	 * Release the reserved range from inode dirty range map, as it is
2205 	 * already moved into delayed_ref_head
2206 	 */
2207 	btrfs_qgroup_release_data(inode, file_pos, ram_bytes);
2208 out:
2209 	btrfs_free_path(path);
2210 
2211 	return ret;
2212 }
2213 
2214 /* snapshot-aware defrag */
2215 struct sa_defrag_extent_backref {
2216 	struct rb_node node;
2217 	struct old_sa_defrag_extent *old;
2218 	u64 root_id;
2219 	u64 inum;
2220 	u64 file_pos;
2221 	u64 extent_offset;
2222 	u64 num_bytes;
2223 	u64 generation;
2224 };
2225 
2226 struct old_sa_defrag_extent {
2227 	struct list_head list;
2228 	struct new_sa_defrag_extent *new;
2229 
2230 	u64 extent_offset;
2231 	u64 bytenr;
2232 	u64 offset;
2233 	u64 len;
2234 	int count;
2235 };
2236 
2237 struct new_sa_defrag_extent {
2238 	struct rb_root root;
2239 	struct list_head head;
2240 	struct btrfs_path *path;
2241 	struct inode *inode;
2242 	u64 file_pos;
2243 	u64 len;
2244 	u64 bytenr;
2245 	u64 disk_len;
2246 	u8 compress_type;
2247 };
2248 
2249 static int backref_comp(struct sa_defrag_extent_backref *b1,
2250 			struct sa_defrag_extent_backref *b2)
2251 {
2252 	if (b1->root_id < b2->root_id)
2253 		return -1;
2254 	else if (b1->root_id > b2->root_id)
2255 		return 1;
2256 
2257 	if (b1->inum < b2->inum)
2258 		return -1;
2259 	else if (b1->inum > b2->inum)
2260 		return 1;
2261 
2262 	if (b1->file_pos < b2->file_pos)
2263 		return -1;
2264 	else if (b1->file_pos > b2->file_pos)
2265 		return 1;
2266 
2267 	/*
2268 	 * [------------------------------] ===> (a range of space)
2269 	 *     |<--->|   |<---->| =============> (fs/file tree A)
2270 	 * |<---------------------------->| ===> (fs/file tree B)
2271 	 *
2272 	 * A range of space can refer to two file extents in one tree while
2273 	 * refer to only one file extent in another tree.
2274 	 *
2275 	 * So we may process a disk offset more than one time(two extents in A)
2276 	 * and locate at the same extent(one extent in B), then insert two same
2277 	 * backrefs(both refer to the extent in B).
2278 	 */
2279 	return 0;
2280 }
2281 
2282 static void backref_insert(struct rb_root *root,
2283 			   struct sa_defrag_extent_backref *backref)
2284 {
2285 	struct rb_node **p = &root->rb_node;
2286 	struct rb_node *parent = NULL;
2287 	struct sa_defrag_extent_backref *entry;
2288 	int ret;
2289 
2290 	while (*p) {
2291 		parent = *p;
2292 		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
2293 
2294 		ret = backref_comp(backref, entry);
2295 		if (ret < 0)
2296 			p = &(*p)->rb_left;
2297 		else
2298 			p = &(*p)->rb_right;
2299 	}
2300 
2301 	rb_link_node(&backref->node, parent, p);
2302 	rb_insert_color(&backref->node, root);
2303 }
2304 
2305 /*
2306  * Note the backref might has changed, and in this case we just return 0.
2307  */
2308 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2309 				       void *ctx)
2310 {
2311 	struct btrfs_file_extent_item *extent;
2312 	struct old_sa_defrag_extent *old = ctx;
2313 	struct new_sa_defrag_extent *new = old->new;
2314 	struct btrfs_path *path = new->path;
2315 	struct btrfs_key key;
2316 	struct btrfs_root *root;
2317 	struct sa_defrag_extent_backref *backref;
2318 	struct extent_buffer *leaf;
2319 	struct inode *inode = new->inode;
2320 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2321 	int slot;
2322 	int ret;
2323 	u64 extent_offset;
2324 	u64 num_bytes;
2325 
2326 	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2327 	    inum == btrfs_ino(BTRFS_I(inode)))
2328 		return 0;
2329 
2330 	key.objectid = root_id;
2331 	key.type = BTRFS_ROOT_ITEM_KEY;
2332 	key.offset = (u64)-1;
2333 
2334 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2335 	if (IS_ERR(root)) {
2336 		if (PTR_ERR(root) == -ENOENT)
2337 			return 0;
2338 		WARN_ON(1);
2339 		btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu",
2340 			 inum, offset, root_id);
2341 		return PTR_ERR(root);
2342 	}
2343 
2344 	key.objectid = inum;
2345 	key.type = BTRFS_EXTENT_DATA_KEY;
2346 	if (offset > (u64)-1 << 32)
2347 		key.offset = 0;
2348 	else
2349 		key.offset = offset;
2350 
2351 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2352 	if (WARN_ON(ret < 0))
2353 		return ret;
2354 	ret = 0;
2355 
2356 	while (1) {
2357 		cond_resched();
2358 
2359 		leaf = path->nodes[0];
2360 		slot = path->slots[0];
2361 
2362 		if (slot >= btrfs_header_nritems(leaf)) {
2363 			ret = btrfs_next_leaf(root, path);
2364 			if (ret < 0) {
2365 				goto out;
2366 			} else if (ret > 0) {
2367 				ret = 0;
2368 				goto out;
2369 			}
2370 			continue;
2371 		}
2372 
2373 		path->slots[0]++;
2374 
2375 		btrfs_item_key_to_cpu(leaf, &key, slot);
2376 
2377 		if (key.objectid > inum)
2378 			goto out;
2379 
2380 		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2381 			continue;
2382 
2383 		extent = btrfs_item_ptr(leaf, slot,
2384 					struct btrfs_file_extent_item);
2385 
2386 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2387 			continue;
2388 
2389 		/*
2390 		 * 'offset' refers to the exact key.offset,
2391 		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2392 		 * (key.offset - extent_offset).
2393 		 */
2394 		if (key.offset != offset)
2395 			continue;
2396 
2397 		extent_offset = btrfs_file_extent_offset(leaf, extent);
2398 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2399 
2400 		if (extent_offset >= old->extent_offset + old->offset +
2401 		    old->len || extent_offset + num_bytes <=
2402 		    old->extent_offset + old->offset)
2403 			continue;
2404 		break;
2405 	}
2406 
2407 	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2408 	if (!backref) {
2409 		ret = -ENOENT;
2410 		goto out;
2411 	}
2412 
2413 	backref->root_id = root_id;
2414 	backref->inum = inum;
2415 	backref->file_pos = offset;
2416 	backref->num_bytes = num_bytes;
2417 	backref->extent_offset = extent_offset;
2418 	backref->generation = btrfs_file_extent_generation(leaf, extent);
2419 	backref->old = old;
2420 	backref_insert(&new->root, backref);
2421 	old->count++;
2422 out:
2423 	btrfs_release_path(path);
2424 	WARN_ON(ret);
2425 	return ret;
2426 }
2427 
2428 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2429 				   struct new_sa_defrag_extent *new)
2430 {
2431 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2432 	struct old_sa_defrag_extent *old, *tmp;
2433 	int ret;
2434 
2435 	new->path = path;
2436 
2437 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2438 		ret = iterate_inodes_from_logical(old->bytenr +
2439 						  old->extent_offset, fs_info,
2440 						  path, record_one_backref,
2441 						  old);
2442 		if (ret < 0 && ret != -ENOENT)
2443 			return false;
2444 
2445 		/* no backref to be processed for this extent */
2446 		if (!old->count) {
2447 			list_del(&old->list);
2448 			kfree(old);
2449 		}
2450 	}
2451 
2452 	if (list_empty(&new->head))
2453 		return false;
2454 
2455 	return true;
2456 }
2457 
2458 static int relink_is_mergable(struct extent_buffer *leaf,
2459 			      struct btrfs_file_extent_item *fi,
2460 			      struct new_sa_defrag_extent *new)
2461 {
2462 	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2463 		return 0;
2464 
2465 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2466 		return 0;
2467 
2468 	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2469 		return 0;
2470 
2471 	if (btrfs_file_extent_encryption(leaf, fi) ||
2472 	    btrfs_file_extent_other_encoding(leaf, fi))
2473 		return 0;
2474 
2475 	return 1;
2476 }
2477 
2478 /*
2479  * Note the backref might has changed, and in this case we just return 0.
2480  */
2481 static noinline int relink_extent_backref(struct btrfs_path *path,
2482 				 struct sa_defrag_extent_backref *prev,
2483 				 struct sa_defrag_extent_backref *backref)
2484 {
2485 	struct btrfs_file_extent_item *extent;
2486 	struct btrfs_file_extent_item *item;
2487 	struct btrfs_ordered_extent *ordered;
2488 	struct btrfs_trans_handle *trans;
2489 	struct btrfs_root *root;
2490 	struct btrfs_key key;
2491 	struct extent_buffer *leaf;
2492 	struct old_sa_defrag_extent *old = backref->old;
2493 	struct new_sa_defrag_extent *new = old->new;
2494 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2495 	struct inode *inode;
2496 	struct extent_state *cached = NULL;
2497 	int ret = 0;
2498 	u64 start;
2499 	u64 len;
2500 	u64 lock_start;
2501 	u64 lock_end;
2502 	bool merge = false;
2503 	int index;
2504 
2505 	if (prev && prev->root_id == backref->root_id &&
2506 	    prev->inum == backref->inum &&
2507 	    prev->file_pos + prev->num_bytes == backref->file_pos)
2508 		merge = true;
2509 
2510 	/* step 1: get root */
2511 	key.objectid = backref->root_id;
2512 	key.type = BTRFS_ROOT_ITEM_KEY;
2513 	key.offset = (u64)-1;
2514 
2515 	index = srcu_read_lock(&fs_info->subvol_srcu);
2516 
2517 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2518 	if (IS_ERR(root)) {
2519 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2520 		if (PTR_ERR(root) == -ENOENT)
2521 			return 0;
2522 		return PTR_ERR(root);
2523 	}
2524 
2525 	if (btrfs_root_readonly(root)) {
2526 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2527 		return 0;
2528 	}
2529 
2530 	/* step 2: get inode */
2531 	key.objectid = backref->inum;
2532 	key.type = BTRFS_INODE_ITEM_KEY;
2533 	key.offset = 0;
2534 
2535 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2536 	if (IS_ERR(inode)) {
2537 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2538 		return 0;
2539 	}
2540 
2541 	srcu_read_unlock(&fs_info->subvol_srcu, index);
2542 
2543 	/* step 3: relink backref */
2544 	lock_start = backref->file_pos;
2545 	lock_end = backref->file_pos + backref->num_bytes - 1;
2546 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2547 			 &cached);
2548 
2549 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2550 	if (ordered) {
2551 		btrfs_put_ordered_extent(ordered);
2552 		goto out_unlock;
2553 	}
2554 
2555 	trans = btrfs_join_transaction(root);
2556 	if (IS_ERR(trans)) {
2557 		ret = PTR_ERR(trans);
2558 		goto out_unlock;
2559 	}
2560 
2561 	key.objectid = backref->inum;
2562 	key.type = BTRFS_EXTENT_DATA_KEY;
2563 	key.offset = backref->file_pos;
2564 
2565 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2566 	if (ret < 0) {
2567 		goto out_free_path;
2568 	} else if (ret > 0) {
2569 		ret = 0;
2570 		goto out_free_path;
2571 	}
2572 
2573 	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2574 				struct btrfs_file_extent_item);
2575 
2576 	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2577 	    backref->generation)
2578 		goto out_free_path;
2579 
2580 	btrfs_release_path(path);
2581 
2582 	start = backref->file_pos;
2583 	if (backref->extent_offset < old->extent_offset + old->offset)
2584 		start += old->extent_offset + old->offset -
2585 			 backref->extent_offset;
2586 
2587 	len = min(backref->extent_offset + backref->num_bytes,
2588 		  old->extent_offset + old->offset + old->len);
2589 	len -= max(backref->extent_offset, old->extent_offset + old->offset);
2590 
2591 	ret = btrfs_drop_extents(trans, root, inode, start,
2592 				 start + len, 1);
2593 	if (ret)
2594 		goto out_free_path;
2595 again:
2596 	key.objectid = btrfs_ino(BTRFS_I(inode));
2597 	key.type = BTRFS_EXTENT_DATA_KEY;
2598 	key.offset = start;
2599 
2600 	path->leave_spinning = 1;
2601 	if (merge) {
2602 		struct btrfs_file_extent_item *fi;
2603 		u64 extent_len;
2604 		struct btrfs_key found_key;
2605 
2606 		ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2607 		if (ret < 0)
2608 			goto out_free_path;
2609 
2610 		path->slots[0]--;
2611 		leaf = path->nodes[0];
2612 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2613 
2614 		fi = btrfs_item_ptr(leaf, path->slots[0],
2615 				    struct btrfs_file_extent_item);
2616 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2617 
2618 		if (extent_len + found_key.offset == start &&
2619 		    relink_is_mergable(leaf, fi, new)) {
2620 			btrfs_set_file_extent_num_bytes(leaf, fi,
2621 							extent_len + len);
2622 			btrfs_mark_buffer_dirty(leaf);
2623 			inode_add_bytes(inode, len);
2624 
2625 			ret = 1;
2626 			goto out_free_path;
2627 		} else {
2628 			merge = false;
2629 			btrfs_release_path(path);
2630 			goto again;
2631 		}
2632 	}
2633 
2634 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2635 					sizeof(*extent));
2636 	if (ret) {
2637 		btrfs_abort_transaction(trans, ret);
2638 		goto out_free_path;
2639 	}
2640 
2641 	leaf = path->nodes[0];
2642 	item = btrfs_item_ptr(leaf, path->slots[0],
2643 				struct btrfs_file_extent_item);
2644 	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2645 	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2646 	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2647 	btrfs_set_file_extent_num_bytes(leaf, item, len);
2648 	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2649 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2650 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2651 	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2652 	btrfs_set_file_extent_encryption(leaf, item, 0);
2653 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2654 
2655 	btrfs_mark_buffer_dirty(leaf);
2656 	inode_add_bytes(inode, len);
2657 	btrfs_release_path(path);
2658 
2659 	ret = btrfs_inc_extent_ref(trans, fs_info, new->bytenr,
2660 			new->disk_len, 0,
2661 			backref->root_id, backref->inum,
2662 			new->file_pos);	/* start - extent_offset */
2663 	if (ret) {
2664 		btrfs_abort_transaction(trans, ret);
2665 		goto out_free_path;
2666 	}
2667 
2668 	ret = 1;
2669 out_free_path:
2670 	btrfs_release_path(path);
2671 	path->leave_spinning = 0;
2672 	btrfs_end_transaction(trans);
2673 out_unlock:
2674 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2675 			     &cached, GFP_NOFS);
2676 	iput(inode);
2677 	return ret;
2678 }
2679 
2680 static void free_sa_defrag_extent(struct new_sa_defrag_extent *new)
2681 {
2682 	struct old_sa_defrag_extent *old, *tmp;
2683 
2684 	if (!new)
2685 		return;
2686 
2687 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2688 		kfree(old);
2689 	}
2690 	kfree(new);
2691 }
2692 
2693 static void relink_file_extents(struct new_sa_defrag_extent *new)
2694 {
2695 	struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb);
2696 	struct btrfs_path *path;
2697 	struct sa_defrag_extent_backref *backref;
2698 	struct sa_defrag_extent_backref *prev = NULL;
2699 	struct inode *inode;
2700 	struct btrfs_root *root;
2701 	struct rb_node *node;
2702 	int ret;
2703 
2704 	inode = new->inode;
2705 	root = BTRFS_I(inode)->root;
2706 
2707 	path = btrfs_alloc_path();
2708 	if (!path)
2709 		return;
2710 
2711 	if (!record_extent_backrefs(path, new)) {
2712 		btrfs_free_path(path);
2713 		goto out;
2714 	}
2715 	btrfs_release_path(path);
2716 
2717 	while (1) {
2718 		node = rb_first(&new->root);
2719 		if (!node)
2720 			break;
2721 		rb_erase(node, &new->root);
2722 
2723 		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2724 
2725 		ret = relink_extent_backref(path, prev, backref);
2726 		WARN_ON(ret < 0);
2727 
2728 		kfree(prev);
2729 
2730 		if (ret == 1)
2731 			prev = backref;
2732 		else
2733 			prev = NULL;
2734 		cond_resched();
2735 	}
2736 	kfree(prev);
2737 
2738 	btrfs_free_path(path);
2739 out:
2740 	free_sa_defrag_extent(new);
2741 
2742 	atomic_dec(&fs_info->defrag_running);
2743 	wake_up(&fs_info->transaction_wait);
2744 }
2745 
2746 static struct new_sa_defrag_extent *
2747 record_old_file_extents(struct inode *inode,
2748 			struct btrfs_ordered_extent *ordered)
2749 {
2750 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2751 	struct btrfs_root *root = BTRFS_I(inode)->root;
2752 	struct btrfs_path *path;
2753 	struct btrfs_key key;
2754 	struct old_sa_defrag_extent *old;
2755 	struct new_sa_defrag_extent *new;
2756 	int ret;
2757 
2758 	new = kmalloc(sizeof(*new), GFP_NOFS);
2759 	if (!new)
2760 		return NULL;
2761 
2762 	new->inode = inode;
2763 	new->file_pos = ordered->file_offset;
2764 	new->len = ordered->len;
2765 	new->bytenr = ordered->start;
2766 	new->disk_len = ordered->disk_len;
2767 	new->compress_type = ordered->compress_type;
2768 	new->root = RB_ROOT;
2769 	INIT_LIST_HEAD(&new->head);
2770 
2771 	path = btrfs_alloc_path();
2772 	if (!path)
2773 		goto out_kfree;
2774 
2775 	key.objectid = btrfs_ino(BTRFS_I(inode));
2776 	key.type = BTRFS_EXTENT_DATA_KEY;
2777 	key.offset = new->file_pos;
2778 
2779 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2780 	if (ret < 0)
2781 		goto out_free_path;
2782 	if (ret > 0 && path->slots[0] > 0)
2783 		path->slots[0]--;
2784 
2785 	/* find out all the old extents for the file range */
2786 	while (1) {
2787 		struct btrfs_file_extent_item *extent;
2788 		struct extent_buffer *l;
2789 		int slot;
2790 		u64 num_bytes;
2791 		u64 offset;
2792 		u64 end;
2793 		u64 disk_bytenr;
2794 		u64 extent_offset;
2795 
2796 		l = path->nodes[0];
2797 		slot = path->slots[0];
2798 
2799 		if (slot >= btrfs_header_nritems(l)) {
2800 			ret = btrfs_next_leaf(root, path);
2801 			if (ret < 0)
2802 				goto out_free_path;
2803 			else if (ret > 0)
2804 				break;
2805 			continue;
2806 		}
2807 
2808 		btrfs_item_key_to_cpu(l, &key, slot);
2809 
2810 		if (key.objectid != btrfs_ino(BTRFS_I(inode)))
2811 			break;
2812 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2813 			break;
2814 		if (key.offset >= new->file_pos + new->len)
2815 			break;
2816 
2817 		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2818 
2819 		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2820 		if (key.offset + num_bytes < new->file_pos)
2821 			goto next;
2822 
2823 		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2824 		if (!disk_bytenr)
2825 			goto next;
2826 
2827 		extent_offset = btrfs_file_extent_offset(l, extent);
2828 
2829 		old = kmalloc(sizeof(*old), GFP_NOFS);
2830 		if (!old)
2831 			goto out_free_path;
2832 
2833 		offset = max(new->file_pos, key.offset);
2834 		end = min(new->file_pos + new->len, key.offset + num_bytes);
2835 
2836 		old->bytenr = disk_bytenr;
2837 		old->extent_offset = extent_offset;
2838 		old->offset = offset - key.offset;
2839 		old->len = end - offset;
2840 		old->new = new;
2841 		old->count = 0;
2842 		list_add_tail(&old->list, &new->head);
2843 next:
2844 		path->slots[0]++;
2845 		cond_resched();
2846 	}
2847 
2848 	btrfs_free_path(path);
2849 	atomic_inc(&fs_info->defrag_running);
2850 
2851 	return new;
2852 
2853 out_free_path:
2854 	btrfs_free_path(path);
2855 out_kfree:
2856 	free_sa_defrag_extent(new);
2857 	return NULL;
2858 }
2859 
2860 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
2861 					 u64 start, u64 len)
2862 {
2863 	struct btrfs_block_group_cache *cache;
2864 
2865 	cache = btrfs_lookup_block_group(fs_info, start);
2866 	ASSERT(cache);
2867 
2868 	spin_lock(&cache->lock);
2869 	cache->delalloc_bytes -= len;
2870 	spin_unlock(&cache->lock);
2871 
2872 	btrfs_put_block_group(cache);
2873 }
2874 
2875 /* as ordered data IO finishes, this gets called so we can finish
2876  * an ordered extent if the range of bytes in the file it covers are
2877  * fully written.
2878  */
2879 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2880 {
2881 	struct inode *inode = ordered_extent->inode;
2882 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2883 	struct btrfs_root *root = BTRFS_I(inode)->root;
2884 	struct btrfs_trans_handle *trans = NULL;
2885 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2886 	struct extent_state *cached_state = NULL;
2887 	struct new_sa_defrag_extent *new = NULL;
2888 	int compress_type = 0;
2889 	int ret = 0;
2890 	u64 logical_len = ordered_extent->len;
2891 	bool nolock;
2892 	bool truncated = false;
2893 	bool range_locked = false;
2894 	bool clear_new_delalloc_bytes = false;
2895 
2896 	if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2897 	    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
2898 	    !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags))
2899 		clear_new_delalloc_bytes = true;
2900 
2901 	nolock = btrfs_is_free_space_inode(BTRFS_I(inode));
2902 
2903 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2904 		ret = -EIO;
2905 		goto out;
2906 	}
2907 
2908 	btrfs_free_io_failure_record(BTRFS_I(inode),
2909 			ordered_extent->file_offset,
2910 			ordered_extent->file_offset +
2911 			ordered_extent->len - 1);
2912 
2913 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2914 		truncated = true;
2915 		logical_len = ordered_extent->truncated_len;
2916 		/* Truncated the entire extent, don't bother adding */
2917 		if (!logical_len)
2918 			goto out;
2919 	}
2920 
2921 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2922 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2923 
2924 		/*
2925 		 * For mwrite(mmap + memset to write) case, we still reserve
2926 		 * space for NOCOW range.
2927 		 * As NOCOW won't cause a new delayed ref, just free the space
2928 		 */
2929 		btrfs_qgroup_free_data(inode, ordered_extent->file_offset,
2930 				       ordered_extent->len);
2931 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2932 		if (nolock)
2933 			trans = btrfs_join_transaction_nolock(root);
2934 		else
2935 			trans = btrfs_join_transaction(root);
2936 		if (IS_ERR(trans)) {
2937 			ret = PTR_ERR(trans);
2938 			trans = NULL;
2939 			goto out;
2940 		}
2941 		trans->block_rsv = &fs_info->delalloc_block_rsv;
2942 		ret = btrfs_update_inode_fallback(trans, root, inode);
2943 		if (ret) /* -ENOMEM or corruption */
2944 			btrfs_abort_transaction(trans, ret);
2945 		goto out;
2946 	}
2947 
2948 	range_locked = true;
2949 	lock_extent_bits(io_tree, ordered_extent->file_offset,
2950 			 ordered_extent->file_offset + ordered_extent->len - 1,
2951 			 &cached_state);
2952 
2953 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
2954 			ordered_extent->file_offset + ordered_extent->len - 1,
2955 			EXTENT_DEFRAG, 0, cached_state);
2956 	if (ret) {
2957 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2958 		if (0 && last_snapshot >= BTRFS_I(inode)->generation)
2959 			/* the inode is shared */
2960 			new = record_old_file_extents(inode, ordered_extent);
2961 
2962 		clear_extent_bit(io_tree, ordered_extent->file_offset,
2963 			ordered_extent->file_offset + ordered_extent->len - 1,
2964 			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2965 	}
2966 
2967 	if (nolock)
2968 		trans = btrfs_join_transaction_nolock(root);
2969 	else
2970 		trans = btrfs_join_transaction(root);
2971 	if (IS_ERR(trans)) {
2972 		ret = PTR_ERR(trans);
2973 		trans = NULL;
2974 		goto out;
2975 	}
2976 
2977 	trans->block_rsv = &fs_info->delalloc_block_rsv;
2978 
2979 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2980 		compress_type = ordered_extent->compress_type;
2981 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2982 		BUG_ON(compress_type);
2983 		ret = btrfs_mark_extent_written(trans, BTRFS_I(inode),
2984 						ordered_extent->file_offset,
2985 						ordered_extent->file_offset +
2986 						logical_len);
2987 	} else {
2988 		BUG_ON(root == fs_info->tree_root);
2989 		ret = insert_reserved_file_extent(trans, inode,
2990 						ordered_extent->file_offset,
2991 						ordered_extent->start,
2992 						ordered_extent->disk_len,
2993 						logical_len, logical_len,
2994 						compress_type, 0, 0,
2995 						BTRFS_FILE_EXTENT_REG);
2996 		if (!ret)
2997 			btrfs_release_delalloc_bytes(fs_info,
2998 						     ordered_extent->start,
2999 						     ordered_extent->disk_len);
3000 	}
3001 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
3002 			   ordered_extent->file_offset, ordered_extent->len,
3003 			   trans->transid);
3004 	if (ret < 0) {
3005 		btrfs_abort_transaction(trans, ret);
3006 		goto out;
3007 	}
3008 
3009 	add_pending_csums(trans, inode, &ordered_extent->list);
3010 
3011 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
3012 	ret = btrfs_update_inode_fallback(trans, root, inode);
3013 	if (ret) { /* -ENOMEM or corruption */
3014 		btrfs_abort_transaction(trans, ret);
3015 		goto out;
3016 	}
3017 	ret = 0;
3018 out:
3019 	if (range_locked || clear_new_delalloc_bytes) {
3020 		unsigned int clear_bits = 0;
3021 
3022 		if (range_locked)
3023 			clear_bits |= EXTENT_LOCKED;
3024 		if (clear_new_delalloc_bytes)
3025 			clear_bits |= EXTENT_DELALLOC_NEW;
3026 		clear_extent_bit(&BTRFS_I(inode)->io_tree,
3027 				 ordered_extent->file_offset,
3028 				 ordered_extent->file_offset +
3029 				 ordered_extent->len - 1,
3030 				 clear_bits,
3031 				 (clear_bits & EXTENT_LOCKED) ? 1 : 0,
3032 				 0, &cached_state, GFP_NOFS);
3033 	}
3034 
3035 	if (root != fs_info->tree_root)
3036 		btrfs_delalloc_release_metadata(BTRFS_I(inode),
3037 				ordered_extent->len);
3038 	if (trans)
3039 		btrfs_end_transaction(trans);
3040 
3041 	if (ret || truncated) {
3042 		u64 start, end;
3043 
3044 		if (truncated)
3045 			start = ordered_extent->file_offset + logical_len;
3046 		else
3047 			start = ordered_extent->file_offset;
3048 		end = ordered_extent->file_offset + ordered_extent->len - 1;
3049 		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
3050 
3051 		/* Drop the cache for the part of the extent we didn't write. */
3052 		btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0);
3053 
3054 		/*
3055 		 * If the ordered extent had an IOERR or something else went
3056 		 * wrong we need to return the space for this ordered extent
3057 		 * back to the allocator.  We only free the extent in the
3058 		 * truncated case if we didn't write out the extent at all.
3059 		 */
3060 		if ((ret || !logical_len) &&
3061 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3062 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
3063 			btrfs_free_reserved_extent(fs_info,
3064 						   ordered_extent->start,
3065 						   ordered_extent->disk_len, 1);
3066 	}
3067 
3068 
3069 	/*
3070 	 * This needs to be done to make sure anybody waiting knows we are done
3071 	 * updating everything for this ordered extent.
3072 	 */
3073 	btrfs_remove_ordered_extent(inode, ordered_extent);
3074 
3075 	/* for snapshot-aware defrag */
3076 	if (new) {
3077 		if (ret) {
3078 			free_sa_defrag_extent(new);
3079 			atomic_dec(&fs_info->defrag_running);
3080 		} else {
3081 			relink_file_extents(new);
3082 		}
3083 	}
3084 
3085 	/* once for us */
3086 	btrfs_put_ordered_extent(ordered_extent);
3087 	/* once for the tree */
3088 	btrfs_put_ordered_extent(ordered_extent);
3089 
3090 	return ret;
3091 }
3092 
3093 static void finish_ordered_fn(struct btrfs_work *work)
3094 {
3095 	struct btrfs_ordered_extent *ordered_extent;
3096 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
3097 	btrfs_finish_ordered_io(ordered_extent);
3098 }
3099 
3100 static void btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
3101 				struct extent_state *state, int uptodate)
3102 {
3103 	struct inode *inode = page->mapping->host;
3104 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3105 	struct btrfs_ordered_extent *ordered_extent = NULL;
3106 	struct btrfs_workqueue *wq;
3107 	btrfs_work_func_t func;
3108 
3109 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
3110 
3111 	ClearPagePrivate2(page);
3112 	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
3113 					    end - start + 1, uptodate))
3114 		return;
3115 
3116 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
3117 		wq = fs_info->endio_freespace_worker;
3118 		func = btrfs_freespace_write_helper;
3119 	} else {
3120 		wq = fs_info->endio_write_workers;
3121 		func = btrfs_endio_write_helper;
3122 	}
3123 
3124 	btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
3125 			NULL);
3126 	btrfs_queue_work(wq, &ordered_extent->work);
3127 }
3128 
3129 static int __readpage_endio_check(struct inode *inode,
3130 				  struct btrfs_io_bio *io_bio,
3131 				  int icsum, struct page *page,
3132 				  int pgoff, u64 start, size_t len)
3133 {
3134 	char *kaddr;
3135 	u32 csum_expected;
3136 	u32 csum = ~(u32)0;
3137 
3138 	csum_expected = *(((u32 *)io_bio->csum) + icsum);
3139 
3140 	kaddr = kmap_atomic(page);
3141 	csum = btrfs_csum_data(kaddr + pgoff, csum,  len);
3142 	btrfs_csum_final(csum, (u8 *)&csum);
3143 	if (csum != csum_expected)
3144 		goto zeroit;
3145 
3146 	kunmap_atomic(kaddr);
3147 	return 0;
3148 zeroit:
3149 	btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
3150 				    io_bio->mirror_num);
3151 	memset(kaddr + pgoff, 1, len);
3152 	flush_dcache_page(page);
3153 	kunmap_atomic(kaddr);
3154 	if (csum_expected == 0)
3155 		return 0;
3156 	return -EIO;
3157 }
3158 
3159 /*
3160  * when reads are done, we need to check csums to verify the data is correct
3161  * if there's a match, we allow the bio to finish.  If not, the code in
3162  * extent_io.c will try to find good copies for us.
3163  */
3164 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
3165 				      u64 phy_offset, struct page *page,
3166 				      u64 start, u64 end, int mirror)
3167 {
3168 	size_t offset = start - page_offset(page);
3169 	struct inode *inode = page->mapping->host;
3170 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3171 	struct btrfs_root *root = BTRFS_I(inode)->root;
3172 
3173 	if (PageChecked(page)) {
3174 		ClearPageChecked(page);
3175 		return 0;
3176 	}
3177 
3178 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
3179 		return 0;
3180 
3181 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
3182 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
3183 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM);
3184 		return 0;
3185 	}
3186 
3187 	phy_offset >>= inode->i_sb->s_blocksize_bits;
3188 	return __readpage_endio_check(inode, io_bio, phy_offset, page, offset,
3189 				      start, (size_t)(end - start + 1));
3190 }
3191 
3192 void btrfs_add_delayed_iput(struct inode *inode)
3193 {
3194 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3195 	struct btrfs_inode *binode = BTRFS_I(inode);
3196 
3197 	if (atomic_add_unless(&inode->i_count, -1, 1))
3198 		return;
3199 
3200 	spin_lock(&fs_info->delayed_iput_lock);
3201 	if (binode->delayed_iput_count == 0) {
3202 		ASSERT(list_empty(&binode->delayed_iput));
3203 		list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
3204 	} else {
3205 		binode->delayed_iput_count++;
3206 	}
3207 	spin_unlock(&fs_info->delayed_iput_lock);
3208 }
3209 
3210 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3211 {
3212 
3213 	spin_lock(&fs_info->delayed_iput_lock);
3214 	while (!list_empty(&fs_info->delayed_iputs)) {
3215 		struct btrfs_inode *inode;
3216 
3217 		inode = list_first_entry(&fs_info->delayed_iputs,
3218 				struct btrfs_inode, delayed_iput);
3219 		if (inode->delayed_iput_count) {
3220 			inode->delayed_iput_count--;
3221 			list_move_tail(&inode->delayed_iput,
3222 					&fs_info->delayed_iputs);
3223 		} else {
3224 			list_del_init(&inode->delayed_iput);
3225 		}
3226 		spin_unlock(&fs_info->delayed_iput_lock);
3227 		iput(&inode->vfs_inode);
3228 		spin_lock(&fs_info->delayed_iput_lock);
3229 	}
3230 	spin_unlock(&fs_info->delayed_iput_lock);
3231 }
3232 
3233 /*
3234  * This is called in transaction commit time. If there are no orphan
3235  * files in the subvolume, it removes orphan item and frees block_rsv
3236  * structure.
3237  */
3238 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
3239 			      struct btrfs_root *root)
3240 {
3241 	struct btrfs_fs_info *fs_info = root->fs_info;
3242 	struct btrfs_block_rsv *block_rsv;
3243 	int ret;
3244 
3245 	if (atomic_read(&root->orphan_inodes) ||
3246 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
3247 		return;
3248 
3249 	spin_lock(&root->orphan_lock);
3250 	if (atomic_read(&root->orphan_inodes)) {
3251 		spin_unlock(&root->orphan_lock);
3252 		return;
3253 	}
3254 
3255 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
3256 		spin_unlock(&root->orphan_lock);
3257 		return;
3258 	}
3259 
3260 	block_rsv = root->orphan_block_rsv;
3261 	root->orphan_block_rsv = NULL;
3262 	spin_unlock(&root->orphan_lock);
3263 
3264 	if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state) &&
3265 	    btrfs_root_refs(&root->root_item) > 0) {
3266 		ret = btrfs_del_orphan_item(trans, fs_info->tree_root,
3267 					    root->root_key.objectid);
3268 		if (ret)
3269 			btrfs_abort_transaction(trans, ret);
3270 		else
3271 			clear_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
3272 				  &root->state);
3273 	}
3274 
3275 	if (block_rsv) {
3276 		WARN_ON(block_rsv->size > 0);
3277 		btrfs_free_block_rsv(fs_info, block_rsv);
3278 	}
3279 }
3280 
3281 /*
3282  * This creates an orphan entry for the given inode in case something goes
3283  * wrong in the middle of an unlink/truncate.
3284  *
3285  * NOTE: caller of this function should reserve 5 units of metadata for
3286  *	 this function.
3287  */
3288 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3289 		struct btrfs_inode *inode)
3290 {
3291 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
3292 	struct btrfs_root *root = inode->root;
3293 	struct btrfs_block_rsv *block_rsv = NULL;
3294 	int reserve = 0;
3295 	int insert = 0;
3296 	int ret;
3297 
3298 	if (!root->orphan_block_rsv) {
3299 		block_rsv = btrfs_alloc_block_rsv(fs_info,
3300 						  BTRFS_BLOCK_RSV_TEMP);
3301 		if (!block_rsv)
3302 			return -ENOMEM;
3303 	}
3304 
3305 	spin_lock(&root->orphan_lock);
3306 	if (!root->orphan_block_rsv) {
3307 		root->orphan_block_rsv = block_rsv;
3308 	} else if (block_rsv) {
3309 		btrfs_free_block_rsv(fs_info, block_rsv);
3310 		block_rsv = NULL;
3311 	}
3312 
3313 	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3314 			      &inode->runtime_flags)) {
3315 #if 0
3316 		/*
3317 		 * For proper ENOSPC handling, we should do orphan
3318 		 * cleanup when mounting. But this introduces backward
3319 		 * compatibility issue.
3320 		 */
3321 		if (!xchg(&root->orphan_item_inserted, 1))
3322 			insert = 2;
3323 		else
3324 			insert = 1;
3325 #endif
3326 		insert = 1;
3327 		atomic_inc(&root->orphan_inodes);
3328 	}
3329 
3330 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3331 			      &inode->runtime_flags))
3332 		reserve = 1;
3333 	spin_unlock(&root->orphan_lock);
3334 
3335 	/* grab metadata reservation from transaction handle */
3336 	if (reserve) {
3337 		ret = btrfs_orphan_reserve_metadata(trans, inode);
3338 		ASSERT(!ret);
3339 		if (ret) {
3340 			atomic_dec(&root->orphan_inodes);
3341 			clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3342 				  &inode->runtime_flags);
3343 			if (insert)
3344 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3345 					  &inode->runtime_flags);
3346 			return ret;
3347 		}
3348 	}
3349 
3350 	/* insert an orphan item to track this unlinked/truncated file */
3351 	if (insert >= 1) {
3352 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
3353 		if (ret) {
3354 			atomic_dec(&root->orphan_inodes);
3355 			if (reserve) {
3356 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3357 					  &inode->runtime_flags);
3358 				btrfs_orphan_release_metadata(inode);
3359 			}
3360 			if (ret != -EEXIST) {
3361 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3362 					  &inode->runtime_flags);
3363 				btrfs_abort_transaction(trans, ret);
3364 				return ret;
3365 			}
3366 		}
3367 		ret = 0;
3368 	}
3369 
3370 	/* insert an orphan item to track subvolume contains orphan files */
3371 	if (insert >= 2) {
3372 		ret = btrfs_insert_orphan_item(trans, fs_info->tree_root,
3373 					       root->root_key.objectid);
3374 		if (ret && ret != -EEXIST) {
3375 			btrfs_abort_transaction(trans, ret);
3376 			return ret;
3377 		}
3378 	}
3379 	return 0;
3380 }
3381 
3382 /*
3383  * We have done the truncate/delete so we can go ahead and remove the orphan
3384  * item for this particular inode.
3385  */
3386 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3387 			    struct btrfs_inode *inode)
3388 {
3389 	struct btrfs_root *root = inode->root;
3390 	int delete_item = 0;
3391 	int release_rsv = 0;
3392 	int ret = 0;
3393 
3394 	spin_lock(&root->orphan_lock);
3395 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3396 			       &inode->runtime_flags))
3397 		delete_item = 1;
3398 
3399 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3400 			       &inode->runtime_flags))
3401 		release_rsv = 1;
3402 	spin_unlock(&root->orphan_lock);
3403 
3404 	if (delete_item) {
3405 		atomic_dec(&root->orphan_inodes);
3406 		if (trans)
3407 			ret = btrfs_del_orphan_item(trans, root,
3408 						    btrfs_ino(inode));
3409 	}
3410 
3411 	if (release_rsv)
3412 		btrfs_orphan_release_metadata(inode);
3413 
3414 	return ret;
3415 }
3416 
3417 /*
3418  * this cleans up any orphans that may be left on the list from the last use
3419  * of this root.
3420  */
3421 int btrfs_orphan_cleanup(struct btrfs_root *root)
3422 {
3423 	struct btrfs_fs_info *fs_info = root->fs_info;
3424 	struct btrfs_path *path;
3425 	struct extent_buffer *leaf;
3426 	struct btrfs_key key, found_key;
3427 	struct btrfs_trans_handle *trans;
3428 	struct inode *inode;
3429 	u64 last_objectid = 0;
3430 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3431 
3432 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3433 		return 0;
3434 
3435 	path = btrfs_alloc_path();
3436 	if (!path) {
3437 		ret = -ENOMEM;
3438 		goto out;
3439 	}
3440 	path->reada = READA_BACK;
3441 
3442 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3443 	key.type = BTRFS_ORPHAN_ITEM_KEY;
3444 	key.offset = (u64)-1;
3445 
3446 	while (1) {
3447 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3448 		if (ret < 0)
3449 			goto out;
3450 
3451 		/*
3452 		 * if ret == 0 means we found what we were searching for, which
3453 		 * is weird, but possible, so only screw with path if we didn't
3454 		 * find the key and see if we have stuff that matches
3455 		 */
3456 		if (ret > 0) {
3457 			ret = 0;
3458 			if (path->slots[0] == 0)
3459 				break;
3460 			path->slots[0]--;
3461 		}
3462 
3463 		/* pull out the item */
3464 		leaf = path->nodes[0];
3465 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3466 
3467 		/* make sure the item matches what we want */
3468 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3469 			break;
3470 		if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3471 			break;
3472 
3473 		/* release the path since we're done with it */
3474 		btrfs_release_path(path);
3475 
3476 		/*
3477 		 * this is where we are basically btrfs_lookup, without the
3478 		 * crossing root thing.  we store the inode number in the
3479 		 * offset of the orphan item.
3480 		 */
3481 
3482 		if (found_key.offset == last_objectid) {
3483 			btrfs_err(fs_info,
3484 				  "Error removing orphan entry, stopping orphan cleanup");
3485 			ret = -EINVAL;
3486 			goto out;
3487 		}
3488 
3489 		last_objectid = found_key.offset;
3490 
3491 		found_key.objectid = found_key.offset;
3492 		found_key.type = BTRFS_INODE_ITEM_KEY;
3493 		found_key.offset = 0;
3494 		inode = btrfs_iget(fs_info->sb, &found_key, root, NULL);
3495 		ret = PTR_ERR_OR_ZERO(inode);
3496 		if (ret && ret != -ENOENT)
3497 			goto out;
3498 
3499 		if (ret == -ENOENT && root == fs_info->tree_root) {
3500 			struct btrfs_root *dead_root;
3501 			struct btrfs_fs_info *fs_info = root->fs_info;
3502 			int is_dead_root = 0;
3503 
3504 			/*
3505 			 * this is an orphan in the tree root. Currently these
3506 			 * could come from 2 sources:
3507 			 *  a) a snapshot deletion in progress
3508 			 *  b) a free space cache inode
3509 			 * We need to distinguish those two, as the snapshot
3510 			 * orphan must not get deleted.
3511 			 * find_dead_roots already ran before us, so if this
3512 			 * is a snapshot deletion, we should find the root
3513 			 * in the dead_roots list
3514 			 */
3515 			spin_lock(&fs_info->trans_lock);
3516 			list_for_each_entry(dead_root, &fs_info->dead_roots,
3517 					    root_list) {
3518 				if (dead_root->root_key.objectid ==
3519 				    found_key.objectid) {
3520 					is_dead_root = 1;
3521 					break;
3522 				}
3523 			}
3524 			spin_unlock(&fs_info->trans_lock);
3525 			if (is_dead_root) {
3526 				/* prevent this orphan from being found again */
3527 				key.offset = found_key.objectid - 1;
3528 				continue;
3529 			}
3530 		}
3531 		/*
3532 		 * Inode is already gone but the orphan item is still there,
3533 		 * kill the orphan item.
3534 		 */
3535 		if (ret == -ENOENT) {
3536 			trans = btrfs_start_transaction(root, 1);
3537 			if (IS_ERR(trans)) {
3538 				ret = PTR_ERR(trans);
3539 				goto out;
3540 			}
3541 			btrfs_debug(fs_info, "auto deleting %Lu",
3542 				    found_key.objectid);
3543 			ret = btrfs_del_orphan_item(trans, root,
3544 						    found_key.objectid);
3545 			btrfs_end_transaction(trans);
3546 			if (ret)
3547 				goto out;
3548 			continue;
3549 		}
3550 
3551 		/*
3552 		 * add this inode to the orphan list so btrfs_orphan_del does
3553 		 * the proper thing when we hit it
3554 		 */
3555 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3556 			&BTRFS_I(inode)->runtime_flags);
3557 		atomic_inc(&root->orphan_inodes);
3558 
3559 		/* if we have links, this was a truncate, lets do that */
3560 		if (inode->i_nlink) {
3561 			if (WARN_ON(!S_ISREG(inode->i_mode))) {
3562 				iput(inode);
3563 				continue;
3564 			}
3565 			nr_truncate++;
3566 
3567 			/* 1 for the orphan item deletion. */
3568 			trans = btrfs_start_transaction(root, 1);
3569 			if (IS_ERR(trans)) {
3570 				iput(inode);
3571 				ret = PTR_ERR(trans);
3572 				goto out;
3573 			}
3574 			ret = btrfs_orphan_add(trans, BTRFS_I(inode));
3575 			btrfs_end_transaction(trans);
3576 			if (ret) {
3577 				iput(inode);
3578 				goto out;
3579 			}
3580 
3581 			ret = btrfs_truncate(inode);
3582 			if (ret)
3583 				btrfs_orphan_del(NULL, BTRFS_I(inode));
3584 		} else {
3585 			nr_unlink++;
3586 		}
3587 
3588 		/* this will do delete_inode and everything for us */
3589 		iput(inode);
3590 		if (ret)
3591 			goto out;
3592 	}
3593 	/* release the path since we're done with it */
3594 	btrfs_release_path(path);
3595 
3596 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3597 
3598 	if (root->orphan_block_rsv)
3599 		btrfs_block_rsv_release(fs_info, root->orphan_block_rsv,
3600 					(u64)-1);
3601 
3602 	if (root->orphan_block_rsv ||
3603 	    test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3604 		trans = btrfs_join_transaction(root);
3605 		if (!IS_ERR(trans))
3606 			btrfs_end_transaction(trans);
3607 	}
3608 
3609 	if (nr_unlink)
3610 		btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3611 	if (nr_truncate)
3612 		btrfs_debug(fs_info, "truncated %d orphans", nr_truncate);
3613 
3614 out:
3615 	if (ret)
3616 		btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3617 	btrfs_free_path(path);
3618 	return ret;
3619 }
3620 
3621 /*
3622  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3623  * don't find any xattrs, we know there can't be any acls.
3624  *
3625  * slot is the slot the inode is in, objectid is the objectid of the inode
3626  */
3627 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3628 					  int slot, u64 objectid,
3629 					  int *first_xattr_slot)
3630 {
3631 	u32 nritems = btrfs_header_nritems(leaf);
3632 	struct btrfs_key found_key;
3633 	static u64 xattr_access = 0;
3634 	static u64 xattr_default = 0;
3635 	int scanned = 0;
3636 
3637 	if (!xattr_access) {
3638 		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3639 					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3640 		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3641 					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3642 	}
3643 
3644 	slot++;
3645 	*first_xattr_slot = -1;
3646 	while (slot < nritems) {
3647 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3648 
3649 		/* we found a different objectid, there must not be acls */
3650 		if (found_key.objectid != objectid)
3651 			return 0;
3652 
3653 		/* we found an xattr, assume we've got an acl */
3654 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3655 			if (*first_xattr_slot == -1)
3656 				*first_xattr_slot = slot;
3657 			if (found_key.offset == xattr_access ||
3658 			    found_key.offset == xattr_default)
3659 				return 1;
3660 		}
3661 
3662 		/*
3663 		 * we found a key greater than an xattr key, there can't
3664 		 * be any acls later on
3665 		 */
3666 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3667 			return 0;
3668 
3669 		slot++;
3670 		scanned++;
3671 
3672 		/*
3673 		 * it goes inode, inode backrefs, xattrs, extents,
3674 		 * so if there are a ton of hard links to an inode there can
3675 		 * be a lot of backrefs.  Don't waste time searching too hard,
3676 		 * this is just an optimization
3677 		 */
3678 		if (scanned >= 8)
3679 			break;
3680 	}
3681 	/* we hit the end of the leaf before we found an xattr or
3682 	 * something larger than an xattr.  We have to assume the inode
3683 	 * has acls
3684 	 */
3685 	if (*first_xattr_slot == -1)
3686 		*first_xattr_slot = slot;
3687 	return 1;
3688 }
3689 
3690 /*
3691  * read an inode from the btree into the in-memory inode
3692  */
3693 static int btrfs_read_locked_inode(struct inode *inode)
3694 {
3695 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3696 	struct btrfs_path *path;
3697 	struct extent_buffer *leaf;
3698 	struct btrfs_inode_item *inode_item;
3699 	struct btrfs_root *root = BTRFS_I(inode)->root;
3700 	struct btrfs_key location;
3701 	unsigned long ptr;
3702 	int maybe_acls;
3703 	u32 rdev;
3704 	int ret;
3705 	bool filled = false;
3706 	int first_xattr_slot;
3707 
3708 	ret = btrfs_fill_inode(inode, &rdev);
3709 	if (!ret)
3710 		filled = true;
3711 
3712 	path = btrfs_alloc_path();
3713 	if (!path) {
3714 		ret = -ENOMEM;
3715 		goto make_bad;
3716 	}
3717 
3718 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3719 
3720 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3721 	if (ret) {
3722 		if (ret > 0)
3723 			ret = -ENOENT;
3724 		goto make_bad;
3725 	}
3726 
3727 	leaf = path->nodes[0];
3728 
3729 	if (filled)
3730 		goto cache_index;
3731 
3732 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3733 				    struct btrfs_inode_item);
3734 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3735 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3736 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3737 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3738 	btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item));
3739 
3740 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3741 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3742 
3743 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3744 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3745 
3746 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3747 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3748 
3749 	BTRFS_I(inode)->i_otime.tv_sec =
3750 		btrfs_timespec_sec(leaf, &inode_item->otime);
3751 	BTRFS_I(inode)->i_otime.tv_nsec =
3752 		btrfs_timespec_nsec(leaf, &inode_item->otime);
3753 
3754 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3755 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3756 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3757 
3758 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3759 	inode->i_generation = BTRFS_I(inode)->generation;
3760 	inode->i_rdev = 0;
3761 	rdev = btrfs_inode_rdev(leaf, inode_item);
3762 
3763 	BTRFS_I(inode)->index_cnt = (u64)-1;
3764 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3765 
3766 cache_index:
3767 	/*
3768 	 * If we were modified in the current generation and evicted from memory
3769 	 * and then re-read we need to do a full sync since we don't have any
3770 	 * idea about which extents were modified before we were evicted from
3771 	 * cache.
3772 	 *
3773 	 * This is required for both inode re-read from disk and delayed inode
3774 	 * in delayed_nodes_tree.
3775 	 */
3776 	if (BTRFS_I(inode)->last_trans == fs_info->generation)
3777 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3778 			&BTRFS_I(inode)->runtime_flags);
3779 
3780 	/*
3781 	 * We don't persist the id of the transaction where an unlink operation
3782 	 * against the inode was last made. So here we assume the inode might
3783 	 * have been evicted, and therefore the exact value of last_unlink_trans
3784 	 * lost, and set it to last_trans to avoid metadata inconsistencies
3785 	 * between the inode and its parent if the inode is fsync'ed and the log
3786 	 * replayed. For example, in the scenario:
3787 	 *
3788 	 * touch mydir/foo
3789 	 * ln mydir/foo mydir/bar
3790 	 * sync
3791 	 * unlink mydir/bar
3792 	 * echo 2 > /proc/sys/vm/drop_caches   # evicts inode
3793 	 * xfs_io -c fsync mydir/foo
3794 	 * <power failure>
3795 	 * mount fs, triggers fsync log replay
3796 	 *
3797 	 * We must make sure that when we fsync our inode foo we also log its
3798 	 * parent inode, otherwise after log replay the parent still has the
3799 	 * dentry with the "bar" name but our inode foo has a link count of 1
3800 	 * and doesn't have an inode ref with the name "bar" anymore.
3801 	 *
3802 	 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3803 	 * but it guarantees correctness at the expense of occasional full
3804 	 * transaction commits on fsync if our inode is a directory, or if our
3805 	 * inode is not a directory, logging its parent unnecessarily.
3806 	 */
3807 	BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans;
3808 
3809 	path->slots[0]++;
3810 	if (inode->i_nlink != 1 ||
3811 	    path->slots[0] >= btrfs_header_nritems(leaf))
3812 		goto cache_acl;
3813 
3814 	btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
3815 	if (location.objectid != btrfs_ino(BTRFS_I(inode)))
3816 		goto cache_acl;
3817 
3818 	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
3819 	if (location.type == BTRFS_INODE_REF_KEY) {
3820 		struct btrfs_inode_ref *ref;
3821 
3822 		ref = (struct btrfs_inode_ref *)ptr;
3823 		BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref);
3824 	} else if (location.type == BTRFS_INODE_EXTREF_KEY) {
3825 		struct btrfs_inode_extref *extref;
3826 
3827 		extref = (struct btrfs_inode_extref *)ptr;
3828 		BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf,
3829 								     extref);
3830 	}
3831 cache_acl:
3832 	/*
3833 	 * try to precache a NULL acl entry for files that don't have
3834 	 * any xattrs or acls
3835 	 */
3836 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3837 			btrfs_ino(BTRFS_I(inode)), &first_xattr_slot);
3838 	if (first_xattr_slot != -1) {
3839 		path->slots[0] = first_xattr_slot;
3840 		ret = btrfs_load_inode_props(inode, path);
3841 		if (ret)
3842 			btrfs_err(fs_info,
3843 				  "error loading props for ino %llu (root %llu): %d",
3844 				  btrfs_ino(BTRFS_I(inode)),
3845 				  root->root_key.objectid, ret);
3846 	}
3847 	btrfs_free_path(path);
3848 
3849 	if (!maybe_acls)
3850 		cache_no_acl(inode);
3851 
3852 	switch (inode->i_mode & S_IFMT) {
3853 	case S_IFREG:
3854 		inode->i_mapping->a_ops = &btrfs_aops;
3855 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3856 		inode->i_fop = &btrfs_file_operations;
3857 		inode->i_op = &btrfs_file_inode_operations;
3858 		break;
3859 	case S_IFDIR:
3860 		inode->i_fop = &btrfs_dir_file_operations;
3861 		inode->i_op = &btrfs_dir_inode_operations;
3862 		break;
3863 	case S_IFLNK:
3864 		inode->i_op = &btrfs_symlink_inode_operations;
3865 		inode_nohighmem(inode);
3866 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3867 		break;
3868 	default:
3869 		inode->i_op = &btrfs_special_inode_operations;
3870 		init_special_inode(inode, inode->i_mode, rdev);
3871 		break;
3872 	}
3873 
3874 	btrfs_update_iflags(inode);
3875 	return 0;
3876 
3877 make_bad:
3878 	btrfs_free_path(path);
3879 	make_bad_inode(inode);
3880 	return ret;
3881 }
3882 
3883 /*
3884  * given a leaf and an inode, copy the inode fields into the leaf
3885  */
3886 static void fill_inode_item(struct btrfs_trans_handle *trans,
3887 			    struct extent_buffer *leaf,
3888 			    struct btrfs_inode_item *item,
3889 			    struct inode *inode)
3890 {
3891 	struct btrfs_map_token token;
3892 
3893 	btrfs_init_map_token(&token);
3894 
3895 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3896 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3897 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3898 				   &token);
3899 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3900 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3901 
3902 	btrfs_set_token_timespec_sec(leaf, &item->atime,
3903 				     inode->i_atime.tv_sec, &token);
3904 	btrfs_set_token_timespec_nsec(leaf, &item->atime,
3905 				      inode->i_atime.tv_nsec, &token);
3906 
3907 	btrfs_set_token_timespec_sec(leaf, &item->mtime,
3908 				     inode->i_mtime.tv_sec, &token);
3909 	btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3910 				      inode->i_mtime.tv_nsec, &token);
3911 
3912 	btrfs_set_token_timespec_sec(leaf, &item->ctime,
3913 				     inode->i_ctime.tv_sec, &token);
3914 	btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3915 				      inode->i_ctime.tv_nsec, &token);
3916 
3917 	btrfs_set_token_timespec_sec(leaf, &item->otime,
3918 				     BTRFS_I(inode)->i_otime.tv_sec, &token);
3919 	btrfs_set_token_timespec_nsec(leaf, &item->otime,
3920 				      BTRFS_I(inode)->i_otime.tv_nsec, &token);
3921 
3922 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3923 				     &token);
3924 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3925 					 &token);
3926 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3927 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3928 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3929 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3930 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3931 }
3932 
3933 /*
3934  * copy everything in the in-memory inode into the btree.
3935  */
3936 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3937 				struct btrfs_root *root, struct inode *inode)
3938 {
3939 	struct btrfs_inode_item *inode_item;
3940 	struct btrfs_path *path;
3941 	struct extent_buffer *leaf;
3942 	int ret;
3943 
3944 	path = btrfs_alloc_path();
3945 	if (!path)
3946 		return -ENOMEM;
3947 
3948 	path->leave_spinning = 1;
3949 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3950 				 1);
3951 	if (ret) {
3952 		if (ret > 0)
3953 			ret = -ENOENT;
3954 		goto failed;
3955 	}
3956 
3957 	leaf = path->nodes[0];
3958 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3959 				    struct btrfs_inode_item);
3960 
3961 	fill_inode_item(trans, leaf, inode_item, inode);
3962 	btrfs_mark_buffer_dirty(leaf);
3963 	btrfs_set_inode_last_trans(trans, inode);
3964 	ret = 0;
3965 failed:
3966 	btrfs_free_path(path);
3967 	return ret;
3968 }
3969 
3970 /*
3971  * copy everything in the in-memory inode into the btree.
3972  */
3973 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3974 				struct btrfs_root *root, struct inode *inode)
3975 {
3976 	struct btrfs_fs_info *fs_info = root->fs_info;
3977 	int ret;
3978 
3979 	/*
3980 	 * If the inode is a free space inode, we can deadlock during commit
3981 	 * if we put it into the delayed code.
3982 	 *
3983 	 * The data relocation inode should also be directly updated
3984 	 * without delay
3985 	 */
3986 	if (!btrfs_is_free_space_inode(BTRFS_I(inode))
3987 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
3988 	    && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
3989 		btrfs_update_root_times(trans, root);
3990 
3991 		ret = btrfs_delayed_update_inode(trans, root, inode);
3992 		if (!ret)
3993 			btrfs_set_inode_last_trans(trans, inode);
3994 		return ret;
3995 	}
3996 
3997 	return btrfs_update_inode_item(trans, root, inode);
3998 }
3999 
4000 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4001 					 struct btrfs_root *root,
4002 					 struct inode *inode)
4003 {
4004 	int ret;
4005 
4006 	ret = btrfs_update_inode(trans, root, inode);
4007 	if (ret == -ENOSPC)
4008 		return btrfs_update_inode_item(trans, root, inode);
4009 	return ret;
4010 }
4011 
4012 /*
4013  * unlink helper that gets used here in inode.c and in the tree logging
4014  * recovery code.  It remove a link in a directory with a given name, and
4015  * also drops the back refs in the inode to the directory
4016  */
4017 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4018 				struct btrfs_root *root,
4019 				struct btrfs_inode *dir,
4020 				struct btrfs_inode *inode,
4021 				const char *name, int name_len)
4022 {
4023 	struct btrfs_fs_info *fs_info = root->fs_info;
4024 	struct btrfs_path *path;
4025 	int ret = 0;
4026 	struct extent_buffer *leaf;
4027 	struct btrfs_dir_item *di;
4028 	struct btrfs_key key;
4029 	u64 index;
4030 	u64 ino = btrfs_ino(inode);
4031 	u64 dir_ino = btrfs_ino(dir);
4032 
4033 	path = btrfs_alloc_path();
4034 	if (!path) {
4035 		ret = -ENOMEM;
4036 		goto out;
4037 	}
4038 
4039 	path->leave_spinning = 1;
4040 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4041 				    name, name_len, -1);
4042 	if (IS_ERR(di)) {
4043 		ret = PTR_ERR(di);
4044 		goto err;
4045 	}
4046 	if (!di) {
4047 		ret = -ENOENT;
4048 		goto err;
4049 	}
4050 	leaf = path->nodes[0];
4051 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4052 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4053 	if (ret)
4054 		goto err;
4055 	btrfs_release_path(path);
4056 
4057 	/*
4058 	 * If we don't have dir index, we have to get it by looking up
4059 	 * the inode ref, since we get the inode ref, remove it directly,
4060 	 * it is unnecessary to do delayed deletion.
4061 	 *
4062 	 * But if we have dir index, needn't search inode ref to get it.
4063 	 * Since the inode ref is close to the inode item, it is better
4064 	 * that we delay to delete it, and just do this deletion when
4065 	 * we update the inode item.
4066 	 */
4067 	if (inode->dir_index) {
4068 		ret = btrfs_delayed_delete_inode_ref(inode);
4069 		if (!ret) {
4070 			index = inode->dir_index;
4071 			goto skip_backref;
4072 		}
4073 	}
4074 
4075 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
4076 				  dir_ino, &index);
4077 	if (ret) {
4078 		btrfs_info(fs_info,
4079 			"failed to delete reference to %.*s, inode %llu parent %llu",
4080 			name_len, name, ino, dir_ino);
4081 		btrfs_abort_transaction(trans, ret);
4082 		goto err;
4083 	}
4084 skip_backref:
4085 	ret = btrfs_delete_delayed_dir_index(trans, fs_info, dir, index);
4086 	if (ret) {
4087 		btrfs_abort_transaction(trans, ret);
4088 		goto err;
4089 	}
4090 
4091 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode,
4092 			dir_ino);
4093 	if (ret != 0 && ret != -ENOENT) {
4094 		btrfs_abort_transaction(trans, ret);
4095 		goto err;
4096 	}
4097 
4098 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir,
4099 			index);
4100 	if (ret == -ENOENT)
4101 		ret = 0;
4102 	else if (ret)
4103 		btrfs_abort_transaction(trans, ret);
4104 err:
4105 	btrfs_free_path(path);
4106 	if (ret)
4107 		goto out;
4108 
4109 	btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
4110 	inode_inc_iversion(&inode->vfs_inode);
4111 	inode_inc_iversion(&dir->vfs_inode);
4112 	inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
4113 		dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
4114 	ret = btrfs_update_inode(trans, root, &dir->vfs_inode);
4115 out:
4116 	return ret;
4117 }
4118 
4119 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4120 		       struct btrfs_root *root,
4121 		       struct btrfs_inode *dir, struct btrfs_inode *inode,
4122 		       const char *name, int name_len)
4123 {
4124 	int ret;
4125 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
4126 	if (!ret) {
4127 		drop_nlink(&inode->vfs_inode);
4128 		ret = btrfs_update_inode(trans, root, &inode->vfs_inode);
4129 	}
4130 	return ret;
4131 }
4132 
4133 /*
4134  * helper to start transaction for unlink and rmdir.
4135  *
4136  * unlink and rmdir are special in btrfs, they do not always free space, so
4137  * if we cannot make our reservations the normal way try and see if there is
4138  * plenty of slack room in the global reserve to migrate, otherwise we cannot
4139  * allow the unlink to occur.
4140  */
4141 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
4142 {
4143 	struct btrfs_root *root = BTRFS_I(dir)->root;
4144 
4145 	/*
4146 	 * 1 for the possible orphan item
4147 	 * 1 for the dir item
4148 	 * 1 for the dir index
4149 	 * 1 for the inode ref
4150 	 * 1 for the inode
4151 	 */
4152 	return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
4153 }
4154 
4155 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4156 {
4157 	struct btrfs_root *root = BTRFS_I(dir)->root;
4158 	struct btrfs_trans_handle *trans;
4159 	struct inode *inode = d_inode(dentry);
4160 	int ret;
4161 
4162 	trans = __unlink_start_trans(dir);
4163 	if (IS_ERR(trans))
4164 		return PTR_ERR(trans);
4165 
4166 	btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4167 			0);
4168 
4169 	ret = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4170 			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4171 			dentry->d_name.len);
4172 	if (ret)
4173 		goto out;
4174 
4175 	if (inode->i_nlink == 0) {
4176 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4177 		if (ret)
4178 			goto out;
4179 	}
4180 
4181 out:
4182 	btrfs_end_transaction(trans);
4183 	btrfs_btree_balance_dirty(root->fs_info);
4184 	return ret;
4185 }
4186 
4187 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4188 			struct btrfs_root *root,
4189 			struct inode *dir, u64 objectid,
4190 			const char *name, int name_len)
4191 {
4192 	struct btrfs_fs_info *fs_info = root->fs_info;
4193 	struct btrfs_path *path;
4194 	struct extent_buffer *leaf;
4195 	struct btrfs_dir_item *di;
4196 	struct btrfs_key key;
4197 	u64 index;
4198 	int ret;
4199 	u64 dir_ino = btrfs_ino(BTRFS_I(dir));
4200 
4201 	path = btrfs_alloc_path();
4202 	if (!path)
4203 		return -ENOMEM;
4204 
4205 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4206 				   name, name_len, -1);
4207 	if (IS_ERR_OR_NULL(di)) {
4208 		if (!di)
4209 			ret = -ENOENT;
4210 		else
4211 			ret = PTR_ERR(di);
4212 		goto out;
4213 	}
4214 
4215 	leaf = path->nodes[0];
4216 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
4217 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4218 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
4219 	if (ret) {
4220 		btrfs_abort_transaction(trans, ret);
4221 		goto out;
4222 	}
4223 	btrfs_release_path(path);
4224 
4225 	ret = btrfs_del_root_ref(trans, fs_info, objectid,
4226 				 root->root_key.objectid, dir_ino,
4227 				 &index, name, name_len);
4228 	if (ret < 0) {
4229 		if (ret != -ENOENT) {
4230 			btrfs_abort_transaction(trans, ret);
4231 			goto out;
4232 		}
4233 		di = btrfs_search_dir_index_item(root, path, dir_ino,
4234 						 name, name_len);
4235 		if (IS_ERR_OR_NULL(di)) {
4236 			if (!di)
4237 				ret = -ENOENT;
4238 			else
4239 				ret = PTR_ERR(di);
4240 			btrfs_abort_transaction(trans, ret);
4241 			goto out;
4242 		}
4243 
4244 		leaf = path->nodes[0];
4245 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4246 		btrfs_release_path(path);
4247 		index = key.offset;
4248 	}
4249 	btrfs_release_path(path);
4250 
4251 	ret = btrfs_delete_delayed_dir_index(trans, fs_info, BTRFS_I(dir), index);
4252 	if (ret) {
4253 		btrfs_abort_transaction(trans, ret);
4254 		goto out;
4255 	}
4256 
4257 	btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
4258 	inode_inc_iversion(dir);
4259 	dir->i_mtime = dir->i_ctime = current_time(dir);
4260 	ret = btrfs_update_inode_fallback(trans, root, dir);
4261 	if (ret)
4262 		btrfs_abort_transaction(trans, ret);
4263 out:
4264 	btrfs_free_path(path);
4265 	return ret;
4266 }
4267 
4268 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4269 {
4270 	struct inode *inode = d_inode(dentry);
4271 	int err = 0;
4272 	struct btrfs_root *root = BTRFS_I(dir)->root;
4273 	struct btrfs_trans_handle *trans;
4274 	u64 last_unlink_trans;
4275 
4276 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4277 		return -ENOTEMPTY;
4278 	if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID)
4279 		return -EPERM;
4280 
4281 	trans = __unlink_start_trans(dir);
4282 	if (IS_ERR(trans))
4283 		return PTR_ERR(trans);
4284 
4285 	if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4286 		err = btrfs_unlink_subvol(trans, root, dir,
4287 					  BTRFS_I(inode)->location.objectid,
4288 					  dentry->d_name.name,
4289 					  dentry->d_name.len);
4290 		goto out;
4291 	}
4292 
4293 	err = btrfs_orphan_add(trans, BTRFS_I(inode));
4294 	if (err)
4295 		goto out;
4296 
4297 	last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4298 
4299 	/* now the directory is empty */
4300 	err = btrfs_unlink_inode(trans, root, BTRFS_I(dir),
4301 			BTRFS_I(d_inode(dentry)), dentry->d_name.name,
4302 			dentry->d_name.len);
4303 	if (!err) {
4304 		btrfs_i_size_write(BTRFS_I(inode), 0);
4305 		/*
4306 		 * Propagate the last_unlink_trans value of the deleted dir to
4307 		 * its parent directory. This is to prevent an unrecoverable
4308 		 * log tree in the case we do something like this:
4309 		 * 1) create dir foo
4310 		 * 2) create snapshot under dir foo
4311 		 * 3) delete the snapshot
4312 		 * 4) rmdir foo
4313 		 * 5) mkdir foo
4314 		 * 6) fsync foo or some file inside foo
4315 		 */
4316 		if (last_unlink_trans >= trans->transid)
4317 			BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4318 	}
4319 out:
4320 	btrfs_end_transaction(trans);
4321 	btrfs_btree_balance_dirty(root->fs_info);
4322 
4323 	return err;
4324 }
4325 
4326 static int truncate_space_check(struct btrfs_trans_handle *trans,
4327 				struct btrfs_root *root,
4328 				u64 bytes_deleted)
4329 {
4330 	struct btrfs_fs_info *fs_info = root->fs_info;
4331 	int ret;
4332 
4333 	/*
4334 	 * This is only used to apply pressure to the enospc system, we don't
4335 	 * intend to use this reservation at all.
4336 	 */
4337 	bytes_deleted = btrfs_csum_bytes_to_leaves(fs_info, bytes_deleted);
4338 	bytes_deleted *= fs_info->nodesize;
4339 	ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
4340 				  bytes_deleted, BTRFS_RESERVE_NO_FLUSH);
4341 	if (!ret) {
4342 		trace_btrfs_space_reservation(fs_info, "transaction",
4343 					      trans->transid,
4344 					      bytes_deleted, 1);
4345 		trans->bytes_reserved += bytes_deleted;
4346 	}
4347 	return ret;
4348 
4349 }
4350 
4351 static int truncate_inline_extent(struct inode *inode,
4352 				  struct btrfs_path *path,
4353 				  struct btrfs_key *found_key,
4354 				  const u64 item_end,
4355 				  const u64 new_size)
4356 {
4357 	struct extent_buffer *leaf = path->nodes[0];
4358 	int slot = path->slots[0];
4359 	struct btrfs_file_extent_item *fi;
4360 	u32 size = (u32)(new_size - found_key->offset);
4361 	struct btrfs_root *root = BTRFS_I(inode)->root;
4362 
4363 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
4364 
4365 	if (btrfs_file_extent_compression(leaf, fi) != BTRFS_COMPRESS_NONE) {
4366 		loff_t offset = new_size;
4367 		loff_t page_end = ALIGN(offset, PAGE_SIZE);
4368 
4369 		/*
4370 		 * Zero out the remaining of the last page of our inline extent,
4371 		 * instead of directly truncating our inline extent here - that
4372 		 * would be much more complex (decompressing all the data, then
4373 		 * compressing the truncated data, which might be bigger than
4374 		 * the size of the inline extent, resize the extent, etc).
4375 		 * We release the path because to get the page we might need to
4376 		 * read the extent item from disk (data not in the page cache).
4377 		 */
4378 		btrfs_release_path(path);
4379 		return btrfs_truncate_block(inode, offset, page_end - offset,
4380 					0);
4381 	}
4382 
4383 	btrfs_set_file_extent_ram_bytes(leaf, fi, size);
4384 	size = btrfs_file_extent_calc_inline_size(size);
4385 	btrfs_truncate_item(root->fs_info, path, size, 1);
4386 
4387 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4388 		inode_sub_bytes(inode, item_end + 1 - new_size);
4389 
4390 	return 0;
4391 }
4392 
4393 /*
4394  * this can truncate away extent items, csum items and directory items.
4395  * It starts at a high offset and removes keys until it can't find
4396  * any higher than new_size
4397  *
4398  * csum items that cross the new i_size are truncated to the new size
4399  * as well.
4400  *
4401  * min_type is the minimum key type to truncate down to.  If set to 0, this
4402  * will kill all the items on this inode, including the INODE_ITEM_KEY.
4403  */
4404 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
4405 			       struct btrfs_root *root,
4406 			       struct inode *inode,
4407 			       u64 new_size, u32 min_type)
4408 {
4409 	struct btrfs_fs_info *fs_info = root->fs_info;
4410 	struct btrfs_path *path;
4411 	struct extent_buffer *leaf;
4412 	struct btrfs_file_extent_item *fi;
4413 	struct btrfs_key key;
4414 	struct btrfs_key found_key;
4415 	u64 extent_start = 0;
4416 	u64 extent_num_bytes = 0;
4417 	u64 extent_offset = 0;
4418 	u64 item_end = 0;
4419 	u64 last_size = new_size;
4420 	u32 found_type = (u8)-1;
4421 	int found_extent;
4422 	int del_item;
4423 	int pending_del_nr = 0;
4424 	int pending_del_slot = 0;
4425 	int extent_type = -1;
4426 	int ret;
4427 	int err = 0;
4428 	u64 ino = btrfs_ino(BTRFS_I(inode));
4429 	u64 bytes_deleted = 0;
4430 	bool be_nice = 0;
4431 	bool should_throttle = 0;
4432 	bool should_end = 0;
4433 
4434 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
4435 
4436 	/*
4437 	 * for non-free space inodes and ref cows, we want to back off from
4438 	 * time to time
4439 	 */
4440 	if (!btrfs_is_free_space_inode(BTRFS_I(inode)) &&
4441 	    test_bit(BTRFS_ROOT_REF_COWS, &root->state))
4442 		be_nice = 1;
4443 
4444 	path = btrfs_alloc_path();
4445 	if (!path)
4446 		return -ENOMEM;
4447 	path->reada = READA_BACK;
4448 
4449 	/*
4450 	 * We want to drop from the next block forward in case this new size is
4451 	 * not block aligned since we will be keeping the last block of the
4452 	 * extent just the way it is.
4453 	 */
4454 	if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4455 	    root == fs_info->tree_root)
4456 		btrfs_drop_extent_cache(BTRFS_I(inode), ALIGN(new_size,
4457 					fs_info->sectorsize),
4458 					(u64)-1, 0);
4459 
4460 	/*
4461 	 * This function is also used to drop the items in the log tree before
4462 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
4463 	 * it is used to drop the loged items. So we shouldn't kill the delayed
4464 	 * items.
4465 	 */
4466 	if (min_type == 0 && root == BTRFS_I(inode)->root)
4467 		btrfs_kill_delayed_inode_items(BTRFS_I(inode));
4468 
4469 	key.objectid = ino;
4470 	key.offset = (u64)-1;
4471 	key.type = (u8)-1;
4472 
4473 search_again:
4474 	/*
4475 	 * with a 16K leaf size and 128MB extents, you can actually queue
4476 	 * up a huge file in a single leaf.  Most of the time that
4477 	 * bytes_deleted is > 0, it will be huge by the time we get here
4478 	 */
4479 	if (be_nice && bytes_deleted > SZ_32M) {
4480 		if (btrfs_should_end_transaction(trans)) {
4481 			err = -EAGAIN;
4482 			goto error;
4483 		}
4484 	}
4485 
4486 
4487 	path->leave_spinning = 1;
4488 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
4489 	if (ret < 0) {
4490 		err = ret;
4491 		goto out;
4492 	}
4493 
4494 	if (ret > 0) {
4495 		/* there are no items in the tree for us to truncate, we're
4496 		 * done
4497 		 */
4498 		if (path->slots[0] == 0)
4499 			goto out;
4500 		path->slots[0]--;
4501 	}
4502 
4503 	while (1) {
4504 		fi = NULL;
4505 		leaf = path->nodes[0];
4506 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4507 		found_type = found_key.type;
4508 
4509 		if (found_key.objectid != ino)
4510 			break;
4511 
4512 		if (found_type < min_type)
4513 			break;
4514 
4515 		item_end = found_key.offset;
4516 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
4517 			fi = btrfs_item_ptr(leaf, path->slots[0],
4518 					    struct btrfs_file_extent_item);
4519 			extent_type = btrfs_file_extent_type(leaf, fi);
4520 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4521 				item_end +=
4522 				    btrfs_file_extent_num_bytes(leaf, fi);
4523 
4524 				trace_btrfs_truncate_show_fi_regular(
4525 					BTRFS_I(inode), leaf, fi,
4526 					found_key.offset);
4527 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4528 				item_end += btrfs_file_extent_inline_len(leaf,
4529 							 path->slots[0], fi);
4530 
4531 				trace_btrfs_truncate_show_fi_inline(
4532 					BTRFS_I(inode), leaf, fi, path->slots[0],
4533 					found_key.offset);
4534 			}
4535 			item_end--;
4536 		}
4537 		if (found_type > min_type) {
4538 			del_item = 1;
4539 		} else {
4540 			if (item_end < new_size)
4541 				break;
4542 			if (found_key.offset >= new_size)
4543 				del_item = 1;
4544 			else
4545 				del_item = 0;
4546 		}
4547 		found_extent = 0;
4548 		/* FIXME, shrink the extent if the ref count is only 1 */
4549 		if (found_type != BTRFS_EXTENT_DATA_KEY)
4550 			goto delete;
4551 
4552 		if (del_item)
4553 			last_size = found_key.offset;
4554 		else
4555 			last_size = new_size;
4556 
4557 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
4558 			u64 num_dec;
4559 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
4560 			if (!del_item) {
4561 				u64 orig_num_bytes =
4562 					btrfs_file_extent_num_bytes(leaf, fi);
4563 				extent_num_bytes = ALIGN(new_size -
4564 						found_key.offset,
4565 						fs_info->sectorsize);
4566 				btrfs_set_file_extent_num_bytes(leaf, fi,
4567 							 extent_num_bytes);
4568 				num_dec = (orig_num_bytes -
4569 					   extent_num_bytes);
4570 				if (test_bit(BTRFS_ROOT_REF_COWS,
4571 					     &root->state) &&
4572 				    extent_start != 0)
4573 					inode_sub_bytes(inode, num_dec);
4574 				btrfs_mark_buffer_dirty(leaf);
4575 			} else {
4576 				extent_num_bytes =
4577 					btrfs_file_extent_disk_num_bytes(leaf,
4578 									 fi);
4579 				extent_offset = found_key.offset -
4580 					btrfs_file_extent_offset(leaf, fi);
4581 
4582 				/* FIXME blocksize != 4096 */
4583 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
4584 				if (extent_start != 0) {
4585 					found_extent = 1;
4586 					if (test_bit(BTRFS_ROOT_REF_COWS,
4587 						     &root->state))
4588 						inode_sub_bytes(inode, num_dec);
4589 				}
4590 			}
4591 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
4592 			/*
4593 			 * we can't truncate inline items that have had
4594 			 * special encodings
4595 			 */
4596 			if (!del_item &&
4597 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
4598 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4599 
4600 				/*
4601 				 * Need to release path in order to truncate a
4602 				 * compressed extent. So delete any accumulated
4603 				 * extent items so far.
4604 				 */
4605 				if (btrfs_file_extent_compression(leaf, fi) !=
4606 				    BTRFS_COMPRESS_NONE && pending_del_nr) {
4607 					err = btrfs_del_items(trans, root, path,
4608 							      pending_del_slot,
4609 							      pending_del_nr);
4610 					if (err) {
4611 						btrfs_abort_transaction(trans,
4612 									err);
4613 						goto error;
4614 					}
4615 					pending_del_nr = 0;
4616 				}
4617 
4618 				err = truncate_inline_extent(inode, path,
4619 							     &found_key,
4620 							     item_end,
4621 							     new_size);
4622 				if (err) {
4623 					btrfs_abort_transaction(trans, err);
4624 					goto error;
4625 				}
4626 			} else if (test_bit(BTRFS_ROOT_REF_COWS,
4627 					    &root->state)) {
4628 				inode_sub_bytes(inode, item_end + 1 - new_size);
4629 			}
4630 		}
4631 delete:
4632 		if (del_item) {
4633 			if (!pending_del_nr) {
4634 				/* no pending yet, add ourselves */
4635 				pending_del_slot = path->slots[0];
4636 				pending_del_nr = 1;
4637 			} else if (pending_del_nr &&
4638 				   path->slots[0] + 1 == pending_del_slot) {
4639 				/* hop on the pending chunk */
4640 				pending_del_nr++;
4641 				pending_del_slot = path->slots[0];
4642 			} else {
4643 				BUG();
4644 			}
4645 		} else {
4646 			break;
4647 		}
4648 		should_throttle = 0;
4649 
4650 		if (found_extent &&
4651 		    (test_bit(BTRFS_ROOT_REF_COWS, &root->state) ||
4652 		     root == fs_info->tree_root)) {
4653 			btrfs_set_path_blocking(path);
4654 			bytes_deleted += extent_num_bytes;
4655 			ret = btrfs_free_extent(trans, fs_info, extent_start,
4656 						extent_num_bytes, 0,
4657 						btrfs_header_owner(leaf),
4658 						ino, extent_offset);
4659 			BUG_ON(ret);
4660 			if (btrfs_should_throttle_delayed_refs(trans, fs_info))
4661 				btrfs_async_run_delayed_refs(fs_info,
4662 					trans->delayed_ref_updates * 2,
4663 					trans->transid, 0);
4664 			if (be_nice) {
4665 				if (truncate_space_check(trans, root,
4666 							 extent_num_bytes)) {
4667 					should_end = 1;
4668 				}
4669 				if (btrfs_should_throttle_delayed_refs(trans,
4670 								       fs_info))
4671 					should_throttle = 1;
4672 			}
4673 		}
4674 
4675 		if (found_type == BTRFS_INODE_ITEM_KEY)
4676 			break;
4677 
4678 		if (path->slots[0] == 0 ||
4679 		    path->slots[0] != pending_del_slot ||
4680 		    should_throttle || should_end) {
4681 			if (pending_del_nr) {
4682 				ret = btrfs_del_items(trans, root, path,
4683 						pending_del_slot,
4684 						pending_del_nr);
4685 				if (ret) {
4686 					btrfs_abort_transaction(trans, ret);
4687 					goto error;
4688 				}
4689 				pending_del_nr = 0;
4690 			}
4691 			btrfs_release_path(path);
4692 			if (should_throttle) {
4693 				unsigned long updates = trans->delayed_ref_updates;
4694 				if (updates) {
4695 					trans->delayed_ref_updates = 0;
4696 					ret = btrfs_run_delayed_refs(trans,
4697 								   fs_info,
4698 								   updates * 2);
4699 					if (ret && !err)
4700 						err = ret;
4701 				}
4702 			}
4703 			/*
4704 			 * if we failed to refill our space rsv, bail out
4705 			 * and let the transaction restart
4706 			 */
4707 			if (should_end) {
4708 				err = -EAGAIN;
4709 				goto error;
4710 			}
4711 			goto search_again;
4712 		} else {
4713 			path->slots[0]--;
4714 		}
4715 	}
4716 out:
4717 	if (pending_del_nr) {
4718 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4719 				      pending_del_nr);
4720 		if (ret)
4721 			btrfs_abort_transaction(trans, ret);
4722 	}
4723 error:
4724 	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4725 		ASSERT(last_size >= new_size);
4726 		if (!err && last_size > new_size)
4727 			last_size = new_size;
4728 		btrfs_ordered_update_i_size(inode, last_size, NULL);
4729 	}
4730 
4731 	btrfs_free_path(path);
4732 
4733 	if (be_nice && bytes_deleted > SZ_32M) {
4734 		unsigned long updates = trans->delayed_ref_updates;
4735 		if (updates) {
4736 			trans->delayed_ref_updates = 0;
4737 			ret = btrfs_run_delayed_refs(trans, fs_info,
4738 						     updates * 2);
4739 			if (ret && !err)
4740 				err = ret;
4741 		}
4742 	}
4743 	return err;
4744 }
4745 
4746 /*
4747  * btrfs_truncate_block - read, zero a chunk and write a block
4748  * @inode - inode that we're zeroing
4749  * @from - the offset to start zeroing
4750  * @len - the length to zero, 0 to zero the entire range respective to the
4751  *	offset
4752  * @front - zero up to the offset instead of from the offset on
4753  *
4754  * This will find the block for the "from" offset and cow the block and zero the
4755  * part we want to zero.  This is used with truncate and hole punching.
4756  */
4757 int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
4758 			int front)
4759 {
4760 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4761 	struct address_space *mapping = inode->i_mapping;
4762 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4763 	struct btrfs_ordered_extent *ordered;
4764 	struct extent_state *cached_state = NULL;
4765 	char *kaddr;
4766 	u32 blocksize = fs_info->sectorsize;
4767 	pgoff_t index = from >> PAGE_SHIFT;
4768 	unsigned offset = from & (blocksize - 1);
4769 	struct page *page;
4770 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4771 	int ret = 0;
4772 	u64 block_start;
4773 	u64 block_end;
4774 
4775 	if ((offset & (blocksize - 1)) == 0 &&
4776 	    (!len || ((len & (blocksize - 1)) == 0)))
4777 		goto out;
4778 
4779 	ret = btrfs_delalloc_reserve_space(inode,
4780 			round_down(from, blocksize), blocksize);
4781 	if (ret)
4782 		goto out;
4783 
4784 again:
4785 	page = find_or_create_page(mapping, index, mask);
4786 	if (!page) {
4787 		btrfs_delalloc_release_space(inode,
4788 				round_down(from, blocksize),
4789 				blocksize);
4790 		ret = -ENOMEM;
4791 		goto out;
4792 	}
4793 
4794 	block_start = round_down(from, blocksize);
4795 	block_end = block_start + blocksize - 1;
4796 
4797 	if (!PageUptodate(page)) {
4798 		ret = btrfs_readpage(NULL, page);
4799 		lock_page(page);
4800 		if (page->mapping != mapping) {
4801 			unlock_page(page);
4802 			put_page(page);
4803 			goto again;
4804 		}
4805 		if (!PageUptodate(page)) {
4806 			ret = -EIO;
4807 			goto out_unlock;
4808 		}
4809 	}
4810 	wait_on_page_writeback(page);
4811 
4812 	lock_extent_bits(io_tree, block_start, block_end, &cached_state);
4813 	set_page_extent_mapped(page);
4814 
4815 	ordered = btrfs_lookup_ordered_extent(inode, block_start);
4816 	if (ordered) {
4817 		unlock_extent_cached(io_tree, block_start, block_end,
4818 				     &cached_state, GFP_NOFS);
4819 		unlock_page(page);
4820 		put_page(page);
4821 		btrfs_start_ordered_extent(inode, ordered, 1);
4822 		btrfs_put_ordered_extent(ordered);
4823 		goto again;
4824 	}
4825 
4826 	clear_extent_bit(&BTRFS_I(inode)->io_tree, block_start, block_end,
4827 			  EXTENT_DIRTY | EXTENT_DELALLOC |
4828 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4829 			  0, 0, &cached_state, GFP_NOFS);
4830 
4831 	ret = btrfs_set_extent_delalloc(inode, block_start, block_end,
4832 					&cached_state, 0);
4833 	if (ret) {
4834 		unlock_extent_cached(io_tree, block_start, block_end,
4835 				     &cached_state, GFP_NOFS);
4836 		goto out_unlock;
4837 	}
4838 
4839 	if (offset != blocksize) {
4840 		if (!len)
4841 			len = blocksize - offset;
4842 		kaddr = kmap(page);
4843 		if (front)
4844 			memset(kaddr + (block_start - page_offset(page)),
4845 				0, offset);
4846 		else
4847 			memset(kaddr + (block_start - page_offset(page)) +  offset,
4848 				0, len);
4849 		flush_dcache_page(page);
4850 		kunmap(page);
4851 	}
4852 	ClearPageChecked(page);
4853 	set_page_dirty(page);
4854 	unlock_extent_cached(io_tree, block_start, block_end, &cached_state,
4855 			     GFP_NOFS);
4856 
4857 out_unlock:
4858 	if (ret)
4859 		btrfs_delalloc_release_space(inode, block_start,
4860 					     blocksize);
4861 	unlock_page(page);
4862 	put_page(page);
4863 out:
4864 	return ret;
4865 }
4866 
4867 static int maybe_insert_hole(struct btrfs_root *root, struct inode *inode,
4868 			     u64 offset, u64 len)
4869 {
4870 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4871 	struct btrfs_trans_handle *trans;
4872 	int ret;
4873 
4874 	/*
4875 	 * Still need to make sure the inode looks like it's been updated so
4876 	 * that any holes get logged if we fsync.
4877 	 */
4878 	if (btrfs_fs_incompat(fs_info, NO_HOLES)) {
4879 		BTRFS_I(inode)->last_trans = fs_info->generation;
4880 		BTRFS_I(inode)->last_sub_trans = root->log_transid;
4881 		BTRFS_I(inode)->last_log_commit = root->last_log_commit;
4882 		return 0;
4883 	}
4884 
4885 	/*
4886 	 * 1 - for the one we're dropping
4887 	 * 1 - for the one we're adding
4888 	 * 1 - for updating the inode.
4889 	 */
4890 	trans = btrfs_start_transaction(root, 3);
4891 	if (IS_ERR(trans))
4892 		return PTR_ERR(trans);
4893 
4894 	ret = btrfs_drop_extents(trans, root, inode, offset, offset + len, 1);
4895 	if (ret) {
4896 		btrfs_abort_transaction(trans, ret);
4897 		btrfs_end_transaction(trans);
4898 		return ret;
4899 	}
4900 
4901 	ret = btrfs_insert_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)),
4902 			offset, 0, 0, len, 0, len, 0, 0, 0);
4903 	if (ret)
4904 		btrfs_abort_transaction(trans, ret);
4905 	else
4906 		btrfs_update_inode(trans, root, inode);
4907 	btrfs_end_transaction(trans);
4908 	return ret;
4909 }
4910 
4911 /*
4912  * This function puts in dummy file extents for the area we're creating a hole
4913  * for.  So if we are truncating this file to a larger size we need to insert
4914  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4915  * the range between oldsize and size
4916  */
4917 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4918 {
4919 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4920 	struct btrfs_root *root = BTRFS_I(inode)->root;
4921 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4922 	struct extent_map *em = NULL;
4923 	struct extent_state *cached_state = NULL;
4924 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4925 	u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
4926 	u64 block_end = ALIGN(size, fs_info->sectorsize);
4927 	u64 last_byte;
4928 	u64 cur_offset;
4929 	u64 hole_size;
4930 	int err = 0;
4931 
4932 	/*
4933 	 * If our size started in the middle of a block we need to zero out the
4934 	 * rest of the block before we expand the i_size, otherwise we could
4935 	 * expose stale data.
4936 	 */
4937 	err = btrfs_truncate_block(inode, oldsize, 0, 0);
4938 	if (err)
4939 		return err;
4940 
4941 	if (size <= hole_start)
4942 		return 0;
4943 
4944 	while (1) {
4945 		struct btrfs_ordered_extent *ordered;
4946 
4947 		lock_extent_bits(io_tree, hole_start, block_end - 1,
4948 				 &cached_state);
4949 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), hole_start,
4950 						     block_end - hole_start);
4951 		if (!ordered)
4952 			break;
4953 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4954 				     &cached_state, GFP_NOFS);
4955 		btrfs_start_ordered_extent(inode, ordered, 1);
4956 		btrfs_put_ordered_extent(ordered);
4957 	}
4958 
4959 	cur_offset = hole_start;
4960 	while (1) {
4961 		em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
4962 				block_end - cur_offset, 0);
4963 		if (IS_ERR(em)) {
4964 			err = PTR_ERR(em);
4965 			em = NULL;
4966 			break;
4967 		}
4968 		last_byte = min(extent_map_end(em), block_end);
4969 		last_byte = ALIGN(last_byte, fs_info->sectorsize);
4970 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4971 			struct extent_map *hole_em;
4972 			hole_size = last_byte - cur_offset;
4973 
4974 			err = maybe_insert_hole(root, inode, cur_offset,
4975 						hole_size);
4976 			if (err)
4977 				break;
4978 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
4979 						cur_offset + hole_size - 1, 0);
4980 			hole_em = alloc_extent_map();
4981 			if (!hole_em) {
4982 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4983 					&BTRFS_I(inode)->runtime_flags);
4984 				goto next;
4985 			}
4986 			hole_em->start = cur_offset;
4987 			hole_em->len = hole_size;
4988 			hole_em->orig_start = cur_offset;
4989 
4990 			hole_em->block_start = EXTENT_MAP_HOLE;
4991 			hole_em->block_len = 0;
4992 			hole_em->orig_block_len = 0;
4993 			hole_em->ram_bytes = hole_size;
4994 			hole_em->bdev = fs_info->fs_devices->latest_bdev;
4995 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4996 			hole_em->generation = fs_info->generation;
4997 
4998 			while (1) {
4999 				write_lock(&em_tree->lock);
5000 				err = add_extent_mapping(em_tree, hole_em, 1);
5001 				write_unlock(&em_tree->lock);
5002 				if (err != -EEXIST)
5003 					break;
5004 				btrfs_drop_extent_cache(BTRFS_I(inode),
5005 							cur_offset,
5006 							cur_offset +
5007 							hole_size - 1, 0);
5008 			}
5009 			free_extent_map(hole_em);
5010 		}
5011 next:
5012 		free_extent_map(em);
5013 		em = NULL;
5014 		cur_offset = last_byte;
5015 		if (cur_offset >= block_end)
5016 			break;
5017 	}
5018 	free_extent_map(em);
5019 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
5020 			     GFP_NOFS);
5021 	return err;
5022 }
5023 
5024 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5025 {
5026 	struct btrfs_root *root = BTRFS_I(inode)->root;
5027 	struct btrfs_trans_handle *trans;
5028 	loff_t oldsize = i_size_read(inode);
5029 	loff_t newsize = attr->ia_size;
5030 	int mask = attr->ia_valid;
5031 	int ret;
5032 
5033 	/*
5034 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5035 	 * special case where we need to update the times despite not having
5036 	 * these flags set.  For all other operations the VFS set these flags
5037 	 * explicitly if it wants a timestamp update.
5038 	 */
5039 	if (newsize != oldsize) {
5040 		inode_inc_iversion(inode);
5041 		if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
5042 			inode->i_ctime = inode->i_mtime =
5043 				current_time(inode);
5044 	}
5045 
5046 	if (newsize > oldsize) {
5047 		/*
5048 		 * Don't do an expanding truncate while snapshoting is ongoing.
5049 		 * This is to ensure the snapshot captures a fully consistent
5050 		 * state of this file - if the snapshot captures this expanding
5051 		 * truncation, it must capture all writes that happened before
5052 		 * this truncation.
5053 		 */
5054 		btrfs_wait_for_snapshot_creation(root);
5055 		ret = btrfs_cont_expand(inode, oldsize, newsize);
5056 		if (ret) {
5057 			btrfs_end_write_no_snapshoting(root);
5058 			return ret;
5059 		}
5060 
5061 		trans = btrfs_start_transaction(root, 1);
5062 		if (IS_ERR(trans)) {
5063 			btrfs_end_write_no_snapshoting(root);
5064 			return PTR_ERR(trans);
5065 		}
5066 
5067 		i_size_write(inode, newsize);
5068 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
5069 		pagecache_isize_extended(inode, oldsize, newsize);
5070 		ret = btrfs_update_inode(trans, root, inode);
5071 		btrfs_end_write_no_snapshoting(root);
5072 		btrfs_end_transaction(trans);
5073 	} else {
5074 
5075 		/*
5076 		 * We're truncating a file that used to have good data down to
5077 		 * zero. Make sure it gets into the ordered flush list so that
5078 		 * any new writes get down to disk quickly.
5079 		 */
5080 		if (newsize == 0)
5081 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
5082 				&BTRFS_I(inode)->runtime_flags);
5083 
5084 		/*
5085 		 * 1 for the orphan item we're going to add
5086 		 * 1 for the orphan item deletion.
5087 		 */
5088 		trans = btrfs_start_transaction(root, 2);
5089 		if (IS_ERR(trans))
5090 			return PTR_ERR(trans);
5091 
5092 		/*
5093 		 * We need to do this in case we fail at _any_ point during the
5094 		 * actual truncate.  Once we do the truncate_setsize we could
5095 		 * invalidate pages which forces any outstanding ordered io to
5096 		 * be instantly completed which will give us extents that need
5097 		 * to be truncated.  If we fail to get an orphan inode down we
5098 		 * could have left over extents that were never meant to live,
5099 		 * so we need to guarantee from this point on that everything
5100 		 * will be consistent.
5101 		 */
5102 		ret = btrfs_orphan_add(trans, BTRFS_I(inode));
5103 		btrfs_end_transaction(trans);
5104 		if (ret)
5105 			return ret;
5106 
5107 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
5108 		truncate_setsize(inode, newsize);
5109 
5110 		/* Disable nonlocked read DIO to avoid the end less truncate */
5111 		btrfs_inode_block_unlocked_dio(BTRFS_I(inode));
5112 		inode_dio_wait(inode);
5113 		btrfs_inode_resume_unlocked_dio(BTRFS_I(inode));
5114 
5115 		ret = btrfs_truncate(inode);
5116 		if (ret && inode->i_nlink) {
5117 			int err;
5118 
5119 			/* To get a stable disk_i_size */
5120 			err = btrfs_wait_ordered_range(inode, 0, (u64)-1);
5121 			if (err) {
5122 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5123 				return err;
5124 			}
5125 
5126 			/*
5127 			 * failed to truncate, disk_i_size is only adjusted down
5128 			 * as we remove extents, so it should represent the true
5129 			 * size of the inode, so reset the in memory size and
5130 			 * delete our orphan entry.
5131 			 */
5132 			trans = btrfs_join_transaction(root);
5133 			if (IS_ERR(trans)) {
5134 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5135 				return ret;
5136 			}
5137 			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5138 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
5139 			if (err)
5140 				btrfs_abort_transaction(trans, err);
5141 			btrfs_end_transaction(trans);
5142 		}
5143 	}
5144 
5145 	return ret;
5146 }
5147 
5148 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
5149 {
5150 	struct inode *inode = d_inode(dentry);
5151 	struct btrfs_root *root = BTRFS_I(inode)->root;
5152 	int err;
5153 
5154 	if (btrfs_root_readonly(root))
5155 		return -EROFS;
5156 
5157 	err = setattr_prepare(dentry, attr);
5158 	if (err)
5159 		return err;
5160 
5161 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5162 		err = btrfs_setsize(inode, attr);
5163 		if (err)
5164 			return err;
5165 	}
5166 
5167 	if (attr->ia_valid) {
5168 		setattr_copy(inode, attr);
5169 		inode_inc_iversion(inode);
5170 		err = btrfs_dirty_inode(inode);
5171 
5172 		if (!err && attr->ia_valid & ATTR_MODE)
5173 			err = posix_acl_chmod(inode, inode->i_mode);
5174 	}
5175 
5176 	return err;
5177 }
5178 
5179 /*
5180  * While truncating the inode pages during eviction, we get the VFS calling
5181  * btrfs_invalidatepage() against each page of the inode. This is slow because
5182  * the calls to btrfs_invalidatepage() result in a huge amount of calls to
5183  * lock_extent_bits() and clear_extent_bit(), which keep merging and splitting
5184  * extent_state structures over and over, wasting lots of time.
5185  *
5186  * Therefore if the inode is being evicted, let btrfs_invalidatepage() skip all
5187  * those expensive operations on a per page basis and do only the ordered io
5188  * finishing, while we release here the extent_map and extent_state structures,
5189  * without the excessive merging and splitting.
5190  */
5191 static void evict_inode_truncate_pages(struct inode *inode)
5192 {
5193 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5194 	struct extent_map_tree *map_tree = &BTRFS_I(inode)->extent_tree;
5195 	struct rb_node *node;
5196 
5197 	ASSERT(inode->i_state & I_FREEING);
5198 	truncate_inode_pages_final(&inode->i_data);
5199 
5200 	write_lock(&map_tree->lock);
5201 	while (!RB_EMPTY_ROOT(&map_tree->map)) {
5202 		struct extent_map *em;
5203 
5204 		node = rb_first(&map_tree->map);
5205 		em = rb_entry(node, struct extent_map, rb_node);
5206 		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
5207 		clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
5208 		remove_extent_mapping(map_tree, em);
5209 		free_extent_map(em);
5210 		if (need_resched()) {
5211 			write_unlock(&map_tree->lock);
5212 			cond_resched();
5213 			write_lock(&map_tree->lock);
5214 		}
5215 	}
5216 	write_unlock(&map_tree->lock);
5217 
5218 	/*
5219 	 * Keep looping until we have no more ranges in the io tree.
5220 	 * We can have ongoing bios started by readpages (called from readahead)
5221 	 * that have their endio callback (extent_io.c:end_bio_extent_readpage)
5222 	 * still in progress (unlocked the pages in the bio but did not yet
5223 	 * unlocked the ranges in the io tree). Therefore this means some
5224 	 * ranges can still be locked and eviction started because before
5225 	 * submitting those bios, which are executed by a separate task (work
5226 	 * queue kthread), inode references (inode->i_count) were not taken
5227 	 * (which would be dropped in the end io callback of each bio).
5228 	 * Therefore here we effectively end up waiting for those bios and
5229 	 * anyone else holding locked ranges without having bumped the inode's
5230 	 * reference count - if we don't do it, when they access the inode's
5231 	 * io_tree to unlock a range it may be too late, leading to an
5232 	 * use-after-free issue.
5233 	 */
5234 	spin_lock(&io_tree->lock);
5235 	while (!RB_EMPTY_ROOT(&io_tree->state)) {
5236 		struct extent_state *state;
5237 		struct extent_state *cached_state = NULL;
5238 		u64 start;
5239 		u64 end;
5240 
5241 		node = rb_first(&io_tree->state);
5242 		state = rb_entry(node, struct extent_state, rb_node);
5243 		start = state->start;
5244 		end = state->end;
5245 		spin_unlock(&io_tree->lock);
5246 
5247 		lock_extent_bits(io_tree, start, end, &cached_state);
5248 
5249 		/*
5250 		 * If still has DELALLOC flag, the extent didn't reach disk,
5251 		 * and its reserved space won't be freed by delayed_ref.
5252 		 * So we need to free its reserved space here.
5253 		 * (Refer to comment in btrfs_invalidatepage, case 2)
5254 		 *
5255 		 * Note, end is the bytenr of last byte, so we need + 1 here.
5256 		 */
5257 		if (state->state & EXTENT_DELALLOC)
5258 			btrfs_qgroup_free_data(inode, start, end - start + 1);
5259 
5260 		clear_extent_bit(io_tree, start, end,
5261 				 EXTENT_LOCKED | EXTENT_DIRTY |
5262 				 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
5263 				 EXTENT_DEFRAG, 1, 1,
5264 				 &cached_state, GFP_NOFS);
5265 
5266 		cond_resched();
5267 		spin_lock(&io_tree->lock);
5268 	}
5269 	spin_unlock(&io_tree->lock);
5270 }
5271 
5272 void btrfs_evict_inode(struct inode *inode)
5273 {
5274 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5275 	struct btrfs_trans_handle *trans;
5276 	struct btrfs_root *root = BTRFS_I(inode)->root;
5277 	struct btrfs_block_rsv *rsv, *global_rsv;
5278 	int steal_from_global = 0;
5279 	u64 min_size;
5280 	int ret;
5281 
5282 	trace_btrfs_inode_evict(inode);
5283 
5284 	if (!root) {
5285 		kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
5286 		return;
5287 	}
5288 
5289 	min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
5290 
5291 	evict_inode_truncate_pages(inode);
5292 
5293 	if (inode->i_nlink &&
5294 	    ((btrfs_root_refs(&root->root_item) != 0 &&
5295 	      root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID) ||
5296 	     btrfs_is_free_space_inode(BTRFS_I(inode))))
5297 		goto no_delete;
5298 
5299 	if (is_bad_inode(inode)) {
5300 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5301 		goto no_delete;
5302 	}
5303 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
5304 	if (!special_file(inode->i_mode))
5305 		btrfs_wait_ordered_range(inode, 0, (u64)-1);
5306 
5307 	btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
5308 
5309 	if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
5310 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
5311 				 &BTRFS_I(inode)->runtime_flags));
5312 		goto no_delete;
5313 	}
5314 
5315 	if (inode->i_nlink > 0) {
5316 		BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5317 		       root->root_key.objectid != BTRFS_ROOT_TREE_OBJECTID);
5318 		goto no_delete;
5319 	}
5320 
5321 	ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5322 	if (ret) {
5323 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5324 		goto no_delete;
5325 	}
5326 
5327 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5328 	if (!rsv) {
5329 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5330 		goto no_delete;
5331 	}
5332 	rsv->size = min_size;
5333 	rsv->failfast = 1;
5334 	global_rsv = &fs_info->global_block_rsv;
5335 
5336 	btrfs_i_size_write(BTRFS_I(inode), 0);
5337 
5338 	/*
5339 	 * This is a bit simpler than btrfs_truncate since we've already
5340 	 * reserved our space for our orphan item in the unlink, so we just
5341 	 * need to reserve some slack space in case we add bytes and update
5342 	 * inode item when doing the truncate.
5343 	 */
5344 	while (1) {
5345 		ret = btrfs_block_rsv_refill(root, rsv, min_size,
5346 					     BTRFS_RESERVE_FLUSH_LIMIT);
5347 
5348 		/*
5349 		 * Try and steal from the global reserve since we will
5350 		 * likely not use this space anyway, we want to try as
5351 		 * hard as possible to get this to work.
5352 		 */
5353 		if (ret)
5354 			steal_from_global++;
5355 		else
5356 			steal_from_global = 0;
5357 		ret = 0;
5358 
5359 		/*
5360 		 * steal_from_global == 0: we reserved stuff, hooray!
5361 		 * steal_from_global == 1: we didn't reserve stuff, boo!
5362 		 * steal_from_global == 2: we've committed, still not a lot of
5363 		 * room but maybe we'll have room in the global reserve this
5364 		 * time.
5365 		 * steal_from_global == 3: abandon all hope!
5366 		 */
5367 		if (steal_from_global > 2) {
5368 			btrfs_warn(fs_info,
5369 				   "Could not get space for a delete, will truncate on mount %d",
5370 				   ret);
5371 			btrfs_orphan_del(NULL, BTRFS_I(inode));
5372 			btrfs_free_block_rsv(fs_info, rsv);
5373 			goto no_delete;
5374 		}
5375 
5376 		trans = btrfs_join_transaction(root);
5377 		if (IS_ERR(trans)) {
5378 			btrfs_orphan_del(NULL, BTRFS_I(inode));
5379 			btrfs_free_block_rsv(fs_info, rsv);
5380 			goto no_delete;
5381 		}
5382 
5383 		/*
5384 		 * We can't just steal from the global reserve, we need to make
5385 		 * sure there is room to do it, if not we need to commit and try
5386 		 * again.
5387 		 */
5388 		if (steal_from_global) {
5389 			if (!btrfs_check_space_for_delayed_refs(trans, fs_info))
5390 				ret = btrfs_block_rsv_migrate(global_rsv, rsv,
5391 							      min_size, 0);
5392 			else
5393 				ret = -ENOSPC;
5394 		}
5395 
5396 		/*
5397 		 * Couldn't steal from the global reserve, we have too much
5398 		 * pending stuff built up, commit the transaction and try it
5399 		 * again.
5400 		 */
5401 		if (ret) {
5402 			ret = btrfs_commit_transaction(trans);
5403 			if (ret) {
5404 				btrfs_orphan_del(NULL, BTRFS_I(inode));
5405 				btrfs_free_block_rsv(fs_info, rsv);
5406 				goto no_delete;
5407 			}
5408 			continue;
5409 		} else {
5410 			steal_from_global = 0;
5411 		}
5412 
5413 		trans->block_rsv = rsv;
5414 
5415 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
5416 		if (ret != -ENOSPC && ret != -EAGAIN)
5417 			break;
5418 
5419 		trans->block_rsv = &fs_info->trans_block_rsv;
5420 		btrfs_end_transaction(trans);
5421 		trans = NULL;
5422 		btrfs_btree_balance_dirty(fs_info);
5423 	}
5424 
5425 	btrfs_free_block_rsv(fs_info, rsv);
5426 
5427 	/*
5428 	 * Errors here aren't a big deal, it just means we leave orphan items
5429 	 * in the tree.  They will be cleaned up on the next mount.
5430 	 */
5431 	if (ret == 0) {
5432 		trans->block_rsv = root->orphan_block_rsv;
5433 		btrfs_orphan_del(trans, BTRFS_I(inode));
5434 	} else {
5435 		btrfs_orphan_del(NULL, BTRFS_I(inode));
5436 	}
5437 
5438 	trans->block_rsv = &fs_info->trans_block_rsv;
5439 	if (!(root == fs_info->tree_root ||
5440 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
5441 		btrfs_return_ino(root, btrfs_ino(BTRFS_I(inode)));
5442 
5443 	btrfs_end_transaction(trans);
5444 	btrfs_btree_balance_dirty(fs_info);
5445 no_delete:
5446 	btrfs_remove_delayed_node(BTRFS_I(inode));
5447 	clear_inode(inode);
5448 }
5449 
5450 /*
5451  * this returns the key found in the dir entry in the location pointer.
5452  * If no dir entries were found, location->objectid is 0.
5453  */
5454 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
5455 			       struct btrfs_key *location)
5456 {
5457 	const char *name = dentry->d_name.name;
5458 	int namelen = dentry->d_name.len;
5459 	struct btrfs_dir_item *di;
5460 	struct btrfs_path *path;
5461 	struct btrfs_root *root = BTRFS_I(dir)->root;
5462 	int ret = 0;
5463 
5464 	path = btrfs_alloc_path();
5465 	if (!path)
5466 		return -ENOMEM;
5467 
5468 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
5469 			name, namelen, 0);
5470 	if (IS_ERR(di))
5471 		ret = PTR_ERR(di);
5472 
5473 	if (IS_ERR_OR_NULL(di))
5474 		goto out_err;
5475 
5476 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5477 out:
5478 	btrfs_free_path(path);
5479 	return ret;
5480 out_err:
5481 	location->objectid = 0;
5482 	goto out;
5483 }
5484 
5485 /*
5486  * when we hit a tree root in a directory, the btrfs part of the inode
5487  * needs to be changed to reflect the root directory of the tree root.  This
5488  * is kind of like crossing a mount point.
5489  */
5490 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5491 				    struct inode *dir,
5492 				    struct dentry *dentry,
5493 				    struct btrfs_key *location,
5494 				    struct btrfs_root **sub_root)
5495 {
5496 	struct btrfs_path *path;
5497 	struct btrfs_root *new_root;
5498 	struct btrfs_root_ref *ref;
5499 	struct extent_buffer *leaf;
5500 	struct btrfs_key key;
5501 	int ret;
5502 	int err = 0;
5503 
5504 	path = btrfs_alloc_path();
5505 	if (!path) {
5506 		err = -ENOMEM;
5507 		goto out;
5508 	}
5509 
5510 	err = -ENOENT;
5511 	key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5512 	key.type = BTRFS_ROOT_REF_KEY;
5513 	key.offset = location->objectid;
5514 
5515 	ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5516 	if (ret) {
5517 		if (ret < 0)
5518 			err = ret;
5519 		goto out;
5520 	}
5521 
5522 	leaf = path->nodes[0];
5523 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5524 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(BTRFS_I(dir)) ||
5525 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
5526 		goto out;
5527 
5528 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
5529 				   (unsigned long)(ref + 1),
5530 				   dentry->d_name.len);
5531 	if (ret)
5532 		goto out;
5533 
5534 	btrfs_release_path(path);
5535 
5536 	new_root = btrfs_read_fs_root_no_name(fs_info, location);
5537 	if (IS_ERR(new_root)) {
5538 		err = PTR_ERR(new_root);
5539 		goto out;
5540 	}
5541 
5542 	*sub_root = new_root;
5543 	location->objectid = btrfs_root_dirid(&new_root->root_item);
5544 	location->type = BTRFS_INODE_ITEM_KEY;
5545 	location->offset = 0;
5546 	err = 0;
5547 out:
5548 	btrfs_free_path(path);
5549 	return err;
5550 }
5551 
5552 static void inode_tree_add(struct inode *inode)
5553 {
5554 	struct btrfs_root *root = BTRFS_I(inode)->root;
5555 	struct btrfs_inode *entry;
5556 	struct rb_node **p;
5557 	struct rb_node *parent;
5558 	struct rb_node *new = &BTRFS_I(inode)->rb_node;
5559 	u64 ino = btrfs_ino(BTRFS_I(inode));
5560 
5561 	if (inode_unhashed(inode))
5562 		return;
5563 	parent = NULL;
5564 	spin_lock(&root->inode_lock);
5565 	p = &root->inode_tree.rb_node;
5566 	while (*p) {
5567 		parent = *p;
5568 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
5569 
5570 		if (ino < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5571 			p = &parent->rb_left;
5572 		else if (ino > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5573 			p = &parent->rb_right;
5574 		else {
5575 			WARN_ON(!(entry->vfs_inode.i_state &
5576 				  (I_WILL_FREE | I_FREEING)));
5577 			rb_replace_node(parent, new, &root->inode_tree);
5578 			RB_CLEAR_NODE(parent);
5579 			spin_unlock(&root->inode_lock);
5580 			return;
5581 		}
5582 	}
5583 	rb_link_node(new, parent, p);
5584 	rb_insert_color(new, &root->inode_tree);
5585 	spin_unlock(&root->inode_lock);
5586 }
5587 
5588 static void inode_tree_del(struct inode *inode)
5589 {
5590 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5591 	struct btrfs_root *root = BTRFS_I(inode)->root;
5592 	int empty = 0;
5593 
5594 	spin_lock(&root->inode_lock);
5595 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
5596 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
5597 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
5598 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5599 	}
5600 	spin_unlock(&root->inode_lock);
5601 
5602 	if (empty && btrfs_root_refs(&root->root_item) == 0) {
5603 		synchronize_srcu(&fs_info->subvol_srcu);
5604 		spin_lock(&root->inode_lock);
5605 		empty = RB_EMPTY_ROOT(&root->inode_tree);
5606 		spin_unlock(&root->inode_lock);
5607 		if (empty)
5608 			btrfs_add_dead_root(root);
5609 	}
5610 }
5611 
5612 void btrfs_invalidate_inodes(struct btrfs_root *root)
5613 {
5614 	struct btrfs_fs_info *fs_info = root->fs_info;
5615 	struct rb_node *node;
5616 	struct rb_node *prev;
5617 	struct btrfs_inode *entry;
5618 	struct inode *inode;
5619 	u64 objectid = 0;
5620 
5621 	if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
5622 		WARN_ON(btrfs_root_refs(&root->root_item) != 0);
5623 
5624 	spin_lock(&root->inode_lock);
5625 again:
5626 	node = root->inode_tree.rb_node;
5627 	prev = NULL;
5628 	while (node) {
5629 		prev = node;
5630 		entry = rb_entry(node, struct btrfs_inode, rb_node);
5631 
5632 		if (objectid < btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5633 			node = node->rb_left;
5634 		else if (objectid > btrfs_ino(BTRFS_I(&entry->vfs_inode)))
5635 			node = node->rb_right;
5636 		else
5637 			break;
5638 	}
5639 	if (!node) {
5640 		while (prev) {
5641 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
5642 			if (objectid <= btrfs_ino(BTRFS_I(&entry->vfs_inode))) {
5643 				node = prev;
5644 				break;
5645 			}
5646 			prev = rb_next(prev);
5647 		}
5648 	}
5649 	while (node) {
5650 		entry = rb_entry(node, struct btrfs_inode, rb_node);
5651 		objectid = btrfs_ino(BTRFS_I(&entry->vfs_inode)) + 1;
5652 		inode = igrab(&entry->vfs_inode);
5653 		if (inode) {
5654 			spin_unlock(&root->inode_lock);
5655 			if (atomic_read(&inode->i_count) > 1)
5656 				d_prune_aliases(inode);
5657 			/*
5658 			 * btrfs_drop_inode will have it removed from
5659 			 * the inode cache when its usage count
5660 			 * hits zero.
5661 			 */
5662 			iput(inode);
5663 			cond_resched();
5664 			spin_lock(&root->inode_lock);
5665 			goto again;
5666 		}
5667 
5668 		if (cond_resched_lock(&root->inode_lock))
5669 			goto again;
5670 
5671 		node = rb_next(node);
5672 	}
5673 	spin_unlock(&root->inode_lock);
5674 }
5675 
5676 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5677 {
5678 	struct btrfs_iget_args *args = p;
5679 	inode->i_ino = args->location->objectid;
5680 	memcpy(&BTRFS_I(inode)->location, args->location,
5681 	       sizeof(*args->location));
5682 	BTRFS_I(inode)->root = args->root;
5683 	return 0;
5684 }
5685 
5686 static int btrfs_find_actor(struct inode *inode, void *opaque)
5687 {
5688 	struct btrfs_iget_args *args = opaque;
5689 	return args->location->objectid == BTRFS_I(inode)->location.objectid &&
5690 		args->root == BTRFS_I(inode)->root;
5691 }
5692 
5693 static struct inode *btrfs_iget_locked(struct super_block *s,
5694 				       struct btrfs_key *location,
5695 				       struct btrfs_root *root)
5696 {
5697 	struct inode *inode;
5698 	struct btrfs_iget_args args;
5699 	unsigned long hashval = btrfs_inode_hash(location->objectid, root);
5700 
5701 	args.location = location;
5702 	args.root = root;
5703 
5704 	inode = iget5_locked(s, hashval, btrfs_find_actor,
5705 			     btrfs_init_locked_inode,
5706 			     (void *)&args);
5707 	return inode;
5708 }
5709 
5710 /* Get an inode object given its location and corresponding root.
5711  * Returns in *is_new if the inode was read from disk
5712  */
5713 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
5714 			 struct btrfs_root *root, int *new)
5715 {
5716 	struct inode *inode;
5717 
5718 	inode = btrfs_iget_locked(s, location, root);
5719 	if (!inode)
5720 		return ERR_PTR(-ENOMEM);
5721 
5722 	if (inode->i_state & I_NEW) {
5723 		int ret;
5724 
5725 		ret = btrfs_read_locked_inode(inode);
5726 		if (!is_bad_inode(inode)) {
5727 			inode_tree_add(inode);
5728 			unlock_new_inode(inode);
5729 			if (new)
5730 				*new = 1;
5731 		} else {
5732 			unlock_new_inode(inode);
5733 			iput(inode);
5734 			ASSERT(ret < 0);
5735 			inode = ERR_PTR(ret < 0 ? ret : -ESTALE);
5736 		}
5737 	}
5738 
5739 	return inode;
5740 }
5741 
5742 static struct inode *new_simple_dir(struct super_block *s,
5743 				    struct btrfs_key *key,
5744 				    struct btrfs_root *root)
5745 {
5746 	struct inode *inode = new_inode(s);
5747 
5748 	if (!inode)
5749 		return ERR_PTR(-ENOMEM);
5750 
5751 	BTRFS_I(inode)->root = root;
5752 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
5753 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
5754 
5755 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
5756 	inode->i_op = &btrfs_dir_ro_inode_operations;
5757 	inode->i_opflags &= ~IOP_XATTR;
5758 	inode->i_fop = &simple_dir_operations;
5759 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5760 	inode->i_mtime = current_time(inode);
5761 	inode->i_atime = inode->i_mtime;
5762 	inode->i_ctime = inode->i_mtime;
5763 	BTRFS_I(inode)->i_otime = inode->i_mtime;
5764 
5765 	return inode;
5766 }
5767 
5768 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5769 {
5770 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
5771 	struct inode *inode;
5772 	struct btrfs_root *root = BTRFS_I(dir)->root;
5773 	struct btrfs_root *sub_root = root;
5774 	struct btrfs_key location;
5775 	int index;
5776 	int ret = 0;
5777 
5778 	if (dentry->d_name.len > BTRFS_NAME_LEN)
5779 		return ERR_PTR(-ENAMETOOLONG);
5780 
5781 	ret = btrfs_inode_by_name(dir, dentry, &location);
5782 	if (ret < 0)
5783 		return ERR_PTR(ret);
5784 
5785 	if (location.objectid == 0)
5786 		return ERR_PTR(-ENOENT);
5787 
5788 	if (location.type == BTRFS_INODE_ITEM_KEY) {
5789 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
5790 		return inode;
5791 	}
5792 
5793 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
5794 
5795 	index = srcu_read_lock(&fs_info->subvol_srcu);
5796 	ret = fixup_tree_root_location(fs_info, dir, dentry,
5797 				       &location, &sub_root);
5798 	if (ret < 0) {
5799 		if (ret != -ENOENT)
5800 			inode = ERR_PTR(ret);
5801 		else
5802 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
5803 	} else {
5804 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
5805 	}
5806 	srcu_read_unlock(&fs_info->subvol_srcu, index);
5807 
5808 	if (!IS_ERR(inode) && root != sub_root) {
5809 		down_read(&fs_info->cleanup_work_sem);
5810 		if (!(inode->i_sb->s_flags & MS_RDONLY))
5811 			ret = btrfs_orphan_cleanup(sub_root);
5812 		up_read(&fs_info->cleanup_work_sem);
5813 		if (ret) {
5814 			iput(inode);
5815 			inode = ERR_PTR(ret);
5816 		}
5817 	}
5818 
5819 	return inode;
5820 }
5821 
5822 static int btrfs_dentry_delete(const struct dentry *dentry)
5823 {
5824 	struct btrfs_root *root;
5825 	struct inode *inode = d_inode(dentry);
5826 
5827 	if (!inode && !IS_ROOT(dentry))
5828 		inode = d_inode(dentry->d_parent);
5829 
5830 	if (inode) {
5831 		root = BTRFS_I(inode)->root;
5832 		if (btrfs_root_refs(&root->root_item) == 0)
5833 			return 1;
5834 
5835 		if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5836 			return 1;
5837 	}
5838 	return 0;
5839 }
5840 
5841 static void btrfs_dentry_release(struct dentry *dentry)
5842 {
5843 	kfree(dentry->d_fsdata);
5844 }
5845 
5846 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5847 				   unsigned int flags)
5848 {
5849 	struct inode *inode;
5850 
5851 	inode = btrfs_lookup_dentry(dir, dentry);
5852 	if (IS_ERR(inode)) {
5853 		if (PTR_ERR(inode) == -ENOENT)
5854 			inode = NULL;
5855 		else
5856 			return ERR_CAST(inode);
5857 	}
5858 
5859 	return d_splice_alias(inode, dentry);
5860 }
5861 
5862 unsigned char btrfs_filetype_table[] = {
5863 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
5864 };
5865 
5866 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5867 {
5868 	struct inode *inode = file_inode(file);
5869 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
5870 	struct btrfs_root *root = BTRFS_I(inode)->root;
5871 	struct btrfs_item *item;
5872 	struct btrfs_dir_item *di;
5873 	struct btrfs_key key;
5874 	struct btrfs_key found_key;
5875 	struct btrfs_path *path;
5876 	struct list_head ins_list;
5877 	struct list_head del_list;
5878 	int ret;
5879 	struct extent_buffer *leaf;
5880 	int slot;
5881 	unsigned char d_type;
5882 	int over = 0;
5883 	char tmp_name[32];
5884 	char *name_ptr;
5885 	int name_len;
5886 	bool put = false;
5887 	struct btrfs_key location;
5888 
5889 	if (!dir_emit_dots(file, ctx))
5890 		return 0;
5891 
5892 	path = btrfs_alloc_path();
5893 	if (!path)
5894 		return -ENOMEM;
5895 
5896 	path->reada = READA_FORWARD;
5897 
5898 	INIT_LIST_HEAD(&ins_list);
5899 	INIT_LIST_HEAD(&del_list);
5900 	put = btrfs_readdir_get_delayed_items(inode, &ins_list, &del_list);
5901 
5902 	key.type = BTRFS_DIR_INDEX_KEY;
5903 	key.offset = ctx->pos;
5904 	key.objectid = btrfs_ino(BTRFS_I(inode));
5905 
5906 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5907 	if (ret < 0)
5908 		goto err;
5909 
5910 	while (1) {
5911 		leaf = path->nodes[0];
5912 		slot = path->slots[0];
5913 		if (slot >= btrfs_header_nritems(leaf)) {
5914 			ret = btrfs_next_leaf(root, path);
5915 			if (ret < 0)
5916 				goto err;
5917 			else if (ret > 0)
5918 				break;
5919 			continue;
5920 		}
5921 
5922 		item = btrfs_item_nr(slot);
5923 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5924 
5925 		if (found_key.objectid != key.objectid)
5926 			break;
5927 		if (found_key.type != BTRFS_DIR_INDEX_KEY)
5928 			break;
5929 		if (found_key.offset < ctx->pos)
5930 			goto next;
5931 		if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
5932 			goto next;
5933 
5934 		ctx->pos = found_key.offset;
5935 
5936 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5937 		if (verify_dir_item(fs_info, leaf, di))
5938 			goto next;
5939 
5940 		name_len = btrfs_dir_name_len(leaf, di);
5941 		if (name_len <= sizeof(tmp_name)) {
5942 			name_ptr = tmp_name;
5943 		} else {
5944 			name_ptr = kmalloc(name_len, GFP_KERNEL);
5945 			if (!name_ptr) {
5946 				ret = -ENOMEM;
5947 				goto err;
5948 			}
5949 		}
5950 		read_extent_buffer(leaf, name_ptr, (unsigned long)(di + 1),
5951 				   name_len);
5952 
5953 		d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5954 		btrfs_dir_item_key_to_cpu(leaf, di, &location);
5955 
5956 		over = !dir_emit(ctx, name_ptr, name_len, location.objectid,
5957 				 d_type);
5958 
5959 		if (name_ptr != tmp_name)
5960 			kfree(name_ptr);
5961 
5962 		if (over)
5963 			goto nopos;
5964 		ctx->pos++;
5965 next:
5966 		path->slots[0]++;
5967 	}
5968 
5969 	ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5970 	if (ret)
5971 		goto nopos;
5972 
5973 	/*
5974 	 * Stop new entries from being returned after we return the last
5975 	 * entry.
5976 	 *
5977 	 * New directory entries are assigned a strictly increasing
5978 	 * offset.  This means that new entries created during readdir
5979 	 * are *guaranteed* to be seen in the future by that readdir.
5980 	 * This has broken buggy programs which operate on names as
5981 	 * they're returned by readdir.  Until we re-use freed offsets
5982 	 * we have this hack to stop new entries from being returned
5983 	 * under the assumption that they'll never reach this huge
5984 	 * offset.
5985 	 *
5986 	 * This is being careful not to overflow 32bit loff_t unless the
5987 	 * last entry requires it because doing so has broken 32bit apps
5988 	 * in the past.
5989 	 */
5990 	if (ctx->pos >= INT_MAX)
5991 		ctx->pos = LLONG_MAX;
5992 	else
5993 		ctx->pos = INT_MAX;
5994 nopos:
5995 	ret = 0;
5996 err:
5997 	if (put)
5998 		btrfs_readdir_put_delayed_items(inode, &ins_list, &del_list);
5999 	btrfs_free_path(path);
6000 	return ret;
6001 }
6002 
6003 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
6004 {
6005 	struct btrfs_root *root = BTRFS_I(inode)->root;
6006 	struct btrfs_trans_handle *trans;
6007 	int ret = 0;
6008 	bool nolock = false;
6009 
6010 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6011 		return 0;
6012 
6013 	if (btrfs_fs_closing(root->fs_info) &&
6014 			btrfs_is_free_space_inode(BTRFS_I(inode)))
6015 		nolock = true;
6016 
6017 	if (wbc->sync_mode == WB_SYNC_ALL) {
6018 		if (nolock)
6019 			trans = btrfs_join_transaction_nolock(root);
6020 		else
6021 			trans = btrfs_join_transaction(root);
6022 		if (IS_ERR(trans))
6023 			return PTR_ERR(trans);
6024 		ret = btrfs_commit_transaction(trans);
6025 	}
6026 	return ret;
6027 }
6028 
6029 /*
6030  * This is somewhat expensive, updating the tree every time the
6031  * inode changes.  But, it is most likely to find the inode in cache.
6032  * FIXME, needs more benchmarking...there are no reasons other than performance
6033  * to keep or drop this code.
6034  */
6035 static int btrfs_dirty_inode(struct inode *inode)
6036 {
6037 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6038 	struct btrfs_root *root = BTRFS_I(inode)->root;
6039 	struct btrfs_trans_handle *trans;
6040 	int ret;
6041 
6042 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
6043 		return 0;
6044 
6045 	trans = btrfs_join_transaction(root);
6046 	if (IS_ERR(trans))
6047 		return PTR_ERR(trans);
6048 
6049 	ret = btrfs_update_inode(trans, root, inode);
6050 	if (ret && ret == -ENOSPC) {
6051 		/* whoops, lets try again with the full transaction */
6052 		btrfs_end_transaction(trans);
6053 		trans = btrfs_start_transaction(root, 1);
6054 		if (IS_ERR(trans))
6055 			return PTR_ERR(trans);
6056 
6057 		ret = btrfs_update_inode(trans, root, inode);
6058 	}
6059 	btrfs_end_transaction(trans);
6060 	if (BTRFS_I(inode)->delayed_node)
6061 		btrfs_balance_delayed_items(fs_info);
6062 
6063 	return ret;
6064 }
6065 
6066 /*
6067  * This is a copy of file_update_time.  We need this so we can return error on
6068  * ENOSPC for updating the inode in the case of file write and mmap writes.
6069  */
6070 static int btrfs_update_time(struct inode *inode, struct timespec *now,
6071 			     int flags)
6072 {
6073 	struct btrfs_root *root = BTRFS_I(inode)->root;
6074 
6075 	if (btrfs_root_readonly(root))
6076 		return -EROFS;
6077 
6078 	if (flags & S_VERSION)
6079 		inode_inc_iversion(inode);
6080 	if (flags & S_CTIME)
6081 		inode->i_ctime = *now;
6082 	if (flags & S_MTIME)
6083 		inode->i_mtime = *now;
6084 	if (flags & S_ATIME)
6085 		inode->i_atime = *now;
6086 	return btrfs_dirty_inode(inode);
6087 }
6088 
6089 /*
6090  * find the highest existing sequence number in a directory
6091  * and then set the in-memory index_cnt variable to reflect
6092  * free sequence numbers
6093  */
6094 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6095 {
6096 	struct btrfs_root *root = inode->root;
6097 	struct btrfs_key key, found_key;
6098 	struct btrfs_path *path;
6099 	struct extent_buffer *leaf;
6100 	int ret;
6101 
6102 	key.objectid = btrfs_ino(inode);
6103 	key.type = BTRFS_DIR_INDEX_KEY;
6104 	key.offset = (u64)-1;
6105 
6106 	path = btrfs_alloc_path();
6107 	if (!path)
6108 		return -ENOMEM;
6109 
6110 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6111 	if (ret < 0)
6112 		goto out;
6113 	/* FIXME: we should be able to handle this */
6114 	if (ret == 0)
6115 		goto out;
6116 	ret = 0;
6117 
6118 	/*
6119 	 * MAGIC NUMBER EXPLANATION:
6120 	 * since we search a directory based on f_pos we have to start at 2
6121 	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
6122 	 * else has to start at 2
6123 	 */
6124 	if (path->slots[0] == 0) {
6125 		inode->index_cnt = 2;
6126 		goto out;
6127 	}
6128 
6129 	path->slots[0]--;
6130 
6131 	leaf = path->nodes[0];
6132 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6133 
6134 	if (found_key.objectid != btrfs_ino(inode) ||
6135 	    found_key.type != BTRFS_DIR_INDEX_KEY) {
6136 		inode->index_cnt = 2;
6137 		goto out;
6138 	}
6139 
6140 	inode->index_cnt = found_key.offset + 1;
6141 out:
6142 	btrfs_free_path(path);
6143 	return ret;
6144 }
6145 
6146 /*
6147  * helper to find a free sequence number in a given directory.  This current
6148  * code is very simple, later versions will do smarter things in the btree
6149  */
6150 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6151 {
6152 	int ret = 0;
6153 
6154 	if (dir->index_cnt == (u64)-1) {
6155 		ret = btrfs_inode_delayed_dir_index_count(dir);
6156 		if (ret) {
6157 			ret = btrfs_set_inode_index_count(dir);
6158 			if (ret)
6159 				return ret;
6160 		}
6161 	}
6162 
6163 	*index = dir->index_cnt;
6164 	dir->index_cnt++;
6165 
6166 	return ret;
6167 }
6168 
6169 static int btrfs_insert_inode_locked(struct inode *inode)
6170 {
6171 	struct btrfs_iget_args args;
6172 	args.location = &BTRFS_I(inode)->location;
6173 	args.root = BTRFS_I(inode)->root;
6174 
6175 	return insert_inode_locked4(inode,
6176 		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6177 		   btrfs_find_actor, &args);
6178 }
6179 
6180 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
6181 				     struct btrfs_root *root,
6182 				     struct inode *dir,
6183 				     const char *name, int name_len,
6184 				     u64 ref_objectid, u64 objectid,
6185 				     umode_t mode, u64 *index)
6186 {
6187 	struct btrfs_fs_info *fs_info = root->fs_info;
6188 	struct inode *inode;
6189 	struct btrfs_inode_item *inode_item;
6190 	struct btrfs_key *location;
6191 	struct btrfs_path *path;
6192 	struct btrfs_inode_ref *ref;
6193 	struct btrfs_key key[2];
6194 	u32 sizes[2];
6195 	int nitems = name ? 2 : 1;
6196 	unsigned long ptr;
6197 	int ret;
6198 
6199 	path = btrfs_alloc_path();
6200 	if (!path)
6201 		return ERR_PTR(-ENOMEM);
6202 
6203 	inode = new_inode(fs_info->sb);
6204 	if (!inode) {
6205 		btrfs_free_path(path);
6206 		return ERR_PTR(-ENOMEM);
6207 	}
6208 
6209 	/*
6210 	 * O_TMPFILE, set link count to 0, so that after this point,
6211 	 * we fill in an inode item with the correct link count.
6212 	 */
6213 	if (!name)
6214 		set_nlink(inode, 0);
6215 
6216 	/*
6217 	 * we have to initialize this early, so we can reclaim the inode
6218 	 * number if we fail afterwards in this function.
6219 	 */
6220 	inode->i_ino = objectid;
6221 
6222 	if (dir && name) {
6223 		trace_btrfs_inode_request(dir);
6224 
6225 		ret = btrfs_set_inode_index(BTRFS_I(dir), index);
6226 		if (ret) {
6227 			btrfs_free_path(path);
6228 			iput(inode);
6229 			return ERR_PTR(ret);
6230 		}
6231 	} else if (dir) {
6232 		*index = 0;
6233 	}
6234 	/*
6235 	 * index_cnt is ignored for everything but a dir,
6236 	 * btrfs_get_inode_index_count has an explanation for the magic
6237 	 * number
6238 	 */
6239 	BTRFS_I(inode)->index_cnt = 2;
6240 	BTRFS_I(inode)->dir_index = *index;
6241 	BTRFS_I(inode)->root = root;
6242 	BTRFS_I(inode)->generation = trans->transid;
6243 	inode->i_generation = BTRFS_I(inode)->generation;
6244 
6245 	/*
6246 	 * We could have gotten an inode number from somebody who was fsynced
6247 	 * and then removed in this same transaction, so let's just set full
6248 	 * sync since it will be a full sync anyway and this will blow away the
6249 	 * old info in the log.
6250 	 */
6251 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
6252 
6253 	key[0].objectid = objectid;
6254 	key[0].type = BTRFS_INODE_ITEM_KEY;
6255 	key[0].offset = 0;
6256 
6257 	sizes[0] = sizeof(struct btrfs_inode_item);
6258 
6259 	if (name) {
6260 		/*
6261 		 * Start new inodes with an inode_ref. This is slightly more
6262 		 * efficient for small numbers of hard links since they will
6263 		 * be packed into one item. Extended refs will kick in if we
6264 		 * add more hard links than can fit in the ref item.
6265 		 */
6266 		key[1].objectid = objectid;
6267 		key[1].type = BTRFS_INODE_REF_KEY;
6268 		key[1].offset = ref_objectid;
6269 
6270 		sizes[1] = name_len + sizeof(*ref);
6271 	}
6272 
6273 	location = &BTRFS_I(inode)->location;
6274 	location->objectid = objectid;
6275 	location->offset = 0;
6276 	location->type = BTRFS_INODE_ITEM_KEY;
6277 
6278 	ret = btrfs_insert_inode_locked(inode);
6279 	if (ret < 0)
6280 		goto fail;
6281 
6282 	path->leave_spinning = 1;
6283 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
6284 	if (ret != 0)
6285 		goto fail_unlock;
6286 
6287 	inode_init_owner(inode, dir, mode);
6288 	inode_set_bytes(inode, 0);
6289 
6290 	inode->i_mtime = current_time(inode);
6291 	inode->i_atime = inode->i_mtime;
6292 	inode->i_ctime = inode->i_mtime;
6293 	BTRFS_I(inode)->i_otime = inode->i_mtime;
6294 
6295 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6296 				  struct btrfs_inode_item);
6297 	memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6298 			     sizeof(*inode_item));
6299 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
6300 
6301 	if (name) {
6302 		ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6303 				     struct btrfs_inode_ref);
6304 		btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
6305 		btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
6306 		ptr = (unsigned long)(ref + 1);
6307 		write_extent_buffer(path->nodes[0], name, ptr, name_len);
6308 	}
6309 
6310 	btrfs_mark_buffer_dirty(path->nodes[0]);
6311 	btrfs_free_path(path);
6312 
6313 	btrfs_inherit_iflags(inode, dir);
6314 
6315 	if (S_ISREG(mode)) {
6316 		if (btrfs_test_opt(fs_info, NODATASUM))
6317 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6318 		if (btrfs_test_opt(fs_info, NODATACOW))
6319 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6320 				BTRFS_INODE_NODATASUM;
6321 	}
6322 
6323 	inode_tree_add(inode);
6324 
6325 	trace_btrfs_inode_new(inode);
6326 	btrfs_set_inode_last_trans(trans, inode);
6327 
6328 	btrfs_update_root_times(trans, root);
6329 
6330 	ret = btrfs_inode_inherit_props(trans, inode, dir);
6331 	if (ret)
6332 		btrfs_err(fs_info,
6333 			  "error inheriting props for ino %llu (root %llu): %d",
6334 			btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret);
6335 
6336 	return inode;
6337 
6338 fail_unlock:
6339 	unlock_new_inode(inode);
6340 fail:
6341 	if (dir && name)
6342 		BTRFS_I(dir)->index_cnt--;
6343 	btrfs_free_path(path);
6344 	iput(inode);
6345 	return ERR_PTR(ret);
6346 }
6347 
6348 static inline u8 btrfs_inode_type(struct inode *inode)
6349 {
6350 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
6351 }
6352 
6353 /*
6354  * utility function to add 'inode' into 'parent_inode' with
6355  * a give name and a given sequence number.
6356  * if 'add_backref' is true, also insert a backref from the
6357  * inode to the parent directory.
6358  */
6359 int btrfs_add_link(struct btrfs_trans_handle *trans,
6360 		   struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6361 		   const char *name, int name_len, int add_backref, u64 index)
6362 {
6363 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6364 	int ret = 0;
6365 	struct btrfs_key key;
6366 	struct btrfs_root *root = parent_inode->root;
6367 	u64 ino = btrfs_ino(inode);
6368 	u64 parent_ino = btrfs_ino(parent_inode);
6369 
6370 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6371 		memcpy(&key, &inode->root->root_key, sizeof(key));
6372 	} else {
6373 		key.objectid = ino;
6374 		key.type = BTRFS_INODE_ITEM_KEY;
6375 		key.offset = 0;
6376 	}
6377 
6378 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6379 		ret = btrfs_add_root_ref(trans, fs_info, key.objectid,
6380 					 root->root_key.objectid, parent_ino,
6381 					 index, name, name_len);
6382 	} else if (add_backref) {
6383 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
6384 					     parent_ino, index);
6385 	}
6386 
6387 	/* Nothing to clean up yet */
6388 	if (ret)
6389 		return ret;
6390 
6391 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
6392 				    parent_inode, &key,
6393 				    btrfs_inode_type(&inode->vfs_inode), index);
6394 	if (ret == -EEXIST || ret == -EOVERFLOW)
6395 		goto fail_dir_item;
6396 	else if (ret) {
6397 		btrfs_abort_transaction(trans, ret);
6398 		return ret;
6399 	}
6400 
6401 	btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6402 			   name_len * 2);
6403 	inode_inc_iversion(&parent_inode->vfs_inode);
6404 	parent_inode->vfs_inode.i_mtime = parent_inode->vfs_inode.i_ctime =
6405 		current_time(&parent_inode->vfs_inode);
6406 	ret = btrfs_update_inode(trans, root, &parent_inode->vfs_inode);
6407 	if (ret)
6408 		btrfs_abort_transaction(trans, ret);
6409 	return ret;
6410 
6411 fail_dir_item:
6412 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6413 		u64 local_index;
6414 		int err;
6415 		err = btrfs_del_root_ref(trans, fs_info, key.objectid,
6416 					 root->root_key.objectid, parent_ino,
6417 					 &local_index, name, name_len);
6418 
6419 	} else if (add_backref) {
6420 		u64 local_index;
6421 		int err;
6422 
6423 		err = btrfs_del_inode_ref(trans, root, name, name_len,
6424 					  ino, parent_ino, &local_index);
6425 	}
6426 	return ret;
6427 }
6428 
6429 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
6430 			    struct btrfs_inode *dir, struct dentry *dentry,
6431 			    struct btrfs_inode *inode, int backref, u64 index)
6432 {
6433 	int err = btrfs_add_link(trans, dir, inode,
6434 				 dentry->d_name.name, dentry->d_name.len,
6435 				 backref, index);
6436 	if (err > 0)
6437 		err = -EEXIST;
6438 	return err;
6439 }
6440 
6441 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
6442 			umode_t mode, dev_t rdev)
6443 {
6444 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6445 	struct btrfs_trans_handle *trans;
6446 	struct btrfs_root *root = BTRFS_I(dir)->root;
6447 	struct inode *inode = NULL;
6448 	int err;
6449 	int drop_inode = 0;
6450 	u64 objectid;
6451 	u64 index = 0;
6452 
6453 	/*
6454 	 * 2 for inode item and ref
6455 	 * 2 for dir items
6456 	 * 1 for xattr if selinux is on
6457 	 */
6458 	trans = btrfs_start_transaction(root, 5);
6459 	if (IS_ERR(trans))
6460 		return PTR_ERR(trans);
6461 
6462 	err = btrfs_find_free_ino(root, &objectid);
6463 	if (err)
6464 		goto out_unlock;
6465 
6466 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6467 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6468 			mode, &index);
6469 	if (IS_ERR(inode)) {
6470 		err = PTR_ERR(inode);
6471 		goto out_unlock;
6472 	}
6473 
6474 	/*
6475 	* If the active LSM wants to access the inode during
6476 	* d_instantiate it needs these. Smack checks to see
6477 	* if the filesystem supports xattrs by looking at the
6478 	* ops vector.
6479 	*/
6480 	inode->i_op = &btrfs_special_inode_operations;
6481 	init_special_inode(inode, inode->i_mode, rdev);
6482 
6483 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6484 	if (err)
6485 		goto out_unlock_inode;
6486 
6487 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6488 			0, index);
6489 	if (err) {
6490 		goto out_unlock_inode;
6491 	} else {
6492 		btrfs_update_inode(trans, root, inode);
6493 		unlock_new_inode(inode);
6494 		d_instantiate(dentry, inode);
6495 	}
6496 
6497 out_unlock:
6498 	btrfs_end_transaction(trans);
6499 	btrfs_balance_delayed_items(fs_info);
6500 	btrfs_btree_balance_dirty(fs_info);
6501 	if (drop_inode) {
6502 		inode_dec_link_count(inode);
6503 		iput(inode);
6504 	}
6505 	return err;
6506 
6507 out_unlock_inode:
6508 	drop_inode = 1;
6509 	unlock_new_inode(inode);
6510 	goto out_unlock;
6511 
6512 }
6513 
6514 static int btrfs_create(struct inode *dir, struct dentry *dentry,
6515 			umode_t mode, bool excl)
6516 {
6517 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6518 	struct btrfs_trans_handle *trans;
6519 	struct btrfs_root *root = BTRFS_I(dir)->root;
6520 	struct inode *inode = NULL;
6521 	int drop_inode_on_err = 0;
6522 	int err;
6523 	u64 objectid;
6524 	u64 index = 0;
6525 
6526 	/*
6527 	 * 2 for inode item and ref
6528 	 * 2 for dir items
6529 	 * 1 for xattr if selinux is on
6530 	 */
6531 	trans = btrfs_start_transaction(root, 5);
6532 	if (IS_ERR(trans))
6533 		return PTR_ERR(trans);
6534 
6535 	err = btrfs_find_free_ino(root, &objectid);
6536 	if (err)
6537 		goto out_unlock;
6538 
6539 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6540 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6541 			mode, &index);
6542 	if (IS_ERR(inode)) {
6543 		err = PTR_ERR(inode);
6544 		goto out_unlock;
6545 	}
6546 	drop_inode_on_err = 1;
6547 	/*
6548 	* If the active LSM wants to access the inode during
6549 	* d_instantiate it needs these. Smack checks to see
6550 	* if the filesystem supports xattrs by looking at the
6551 	* ops vector.
6552 	*/
6553 	inode->i_fop = &btrfs_file_operations;
6554 	inode->i_op = &btrfs_file_inode_operations;
6555 	inode->i_mapping->a_ops = &btrfs_aops;
6556 
6557 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6558 	if (err)
6559 		goto out_unlock_inode;
6560 
6561 	err = btrfs_update_inode(trans, root, inode);
6562 	if (err)
6563 		goto out_unlock_inode;
6564 
6565 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6566 			0, index);
6567 	if (err)
6568 		goto out_unlock_inode;
6569 
6570 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6571 	unlock_new_inode(inode);
6572 	d_instantiate(dentry, inode);
6573 
6574 out_unlock:
6575 	btrfs_end_transaction(trans);
6576 	if (err && drop_inode_on_err) {
6577 		inode_dec_link_count(inode);
6578 		iput(inode);
6579 	}
6580 	btrfs_balance_delayed_items(fs_info);
6581 	btrfs_btree_balance_dirty(fs_info);
6582 	return err;
6583 
6584 out_unlock_inode:
6585 	unlock_new_inode(inode);
6586 	goto out_unlock;
6587 
6588 }
6589 
6590 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6591 		      struct dentry *dentry)
6592 {
6593 	struct btrfs_trans_handle *trans = NULL;
6594 	struct btrfs_root *root = BTRFS_I(dir)->root;
6595 	struct inode *inode = d_inode(old_dentry);
6596 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
6597 	u64 index;
6598 	int err;
6599 	int drop_inode = 0;
6600 
6601 	/* do not allow sys_link's with other subvols of the same device */
6602 	if (root->objectid != BTRFS_I(inode)->root->objectid)
6603 		return -EXDEV;
6604 
6605 	if (inode->i_nlink >= BTRFS_LINK_MAX)
6606 		return -EMLINK;
6607 
6608 	err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6609 	if (err)
6610 		goto fail;
6611 
6612 	/*
6613 	 * 2 items for inode and inode ref
6614 	 * 2 items for dir items
6615 	 * 1 item for parent inode
6616 	 */
6617 	trans = btrfs_start_transaction(root, 5);
6618 	if (IS_ERR(trans)) {
6619 		err = PTR_ERR(trans);
6620 		trans = NULL;
6621 		goto fail;
6622 	}
6623 
6624 	/* There are several dir indexes for this inode, clear the cache. */
6625 	BTRFS_I(inode)->dir_index = 0ULL;
6626 	inc_nlink(inode);
6627 	inode_inc_iversion(inode);
6628 	inode->i_ctime = current_time(inode);
6629 	ihold(inode);
6630 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6631 
6632 	err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode),
6633 			1, index);
6634 
6635 	if (err) {
6636 		drop_inode = 1;
6637 	} else {
6638 		struct dentry *parent = dentry->d_parent;
6639 		err = btrfs_update_inode(trans, root, inode);
6640 		if (err)
6641 			goto fail;
6642 		if (inode->i_nlink == 1) {
6643 			/*
6644 			 * If new hard link count is 1, it's a file created
6645 			 * with open(2) O_TMPFILE flag.
6646 			 */
6647 			err = btrfs_orphan_del(trans, BTRFS_I(inode));
6648 			if (err)
6649 				goto fail;
6650 		}
6651 		d_instantiate(dentry, inode);
6652 		btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
6653 	}
6654 
6655 	btrfs_balance_delayed_items(fs_info);
6656 fail:
6657 	if (trans)
6658 		btrfs_end_transaction(trans);
6659 	if (drop_inode) {
6660 		inode_dec_link_count(inode);
6661 		iput(inode);
6662 	}
6663 	btrfs_btree_balance_dirty(fs_info);
6664 	return err;
6665 }
6666 
6667 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6668 {
6669 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
6670 	struct inode *inode = NULL;
6671 	struct btrfs_trans_handle *trans;
6672 	struct btrfs_root *root = BTRFS_I(dir)->root;
6673 	int err = 0;
6674 	int drop_on_err = 0;
6675 	u64 objectid = 0;
6676 	u64 index = 0;
6677 
6678 	/*
6679 	 * 2 items for inode and ref
6680 	 * 2 items for dir items
6681 	 * 1 for xattr if selinux is on
6682 	 */
6683 	trans = btrfs_start_transaction(root, 5);
6684 	if (IS_ERR(trans))
6685 		return PTR_ERR(trans);
6686 
6687 	err = btrfs_find_free_ino(root, &objectid);
6688 	if (err)
6689 		goto out_fail;
6690 
6691 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6692 			dentry->d_name.len, btrfs_ino(BTRFS_I(dir)), objectid,
6693 			S_IFDIR | mode, &index);
6694 	if (IS_ERR(inode)) {
6695 		err = PTR_ERR(inode);
6696 		goto out_fail;
6697 	}
6698 
6699 	drop_on_err = 1;
6700 	/* these must be set before we unlock the inode */
6701 	inode->i_op = &btrfs_dir_inode_operations;
6702 	inode->i_fop = &btrfs_dir_file_operations;
6703 
6704 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6705 	if (err)
6706 		goto out_fail_inode;
6707 
6708 	btrfs_i_size_write(BTRFS_I(inode), 0);
6709 	err = btrfs_update_inode(trans, root, inode);
6710 	if (err)
6711 		goto out_fail_inode;
6712 
6713 	err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6714 			dentry->d_name.name,
6715 			dentry->d_name.len, 0, index);
6716 	if (err)
6717 		goto out_fail_inode;
6718 
6719 	d_instantiate(dentry, inode);
6720 	/*
6721 	 * mkdir is special.  We're unlocking after we call d_instantiate
6722 	 * to avoid a race with nfsd calling d_instantiate.
6723 	 */
6724 	unlock_new_inode(inode);
6725 	drop_on_err = 0;
6726 
6727 out_fail:
6728 	btrfs_end_transaction(trans);
6729 	if (drop_on_err) {
6730 		inode_dec_link_count(inode);
6731 		iput(inode);
6732 	}
6733 	btrfs_balance_delayed_items(fs_info);
6734 	btrfs_btree_balance_dirty(fs_info);
6735 	return err;
6736 
6737 out_fail_inode:
6738 	unlock_new_inode(inode);
6739 	goto out_fail;
6740 }
6741 
6742 /* Find next extent map of a given extent map, caller needs to ensure locks */
6743 static struct extent_map *next_extent_map(struct extent_map *em)
6744 {
6745 	struct rb_node *next;
6746 
6747 	next = rb_next(&em->rb_node);
6748 	if (!next)
6749 		return NULL;
6750 	return container_of(next, struct extent_map, rb_node);
6751 }
6752 
6753 static struct extent_map *prev_extent_map(struct extent_map *em)
6754 {
6755 	struct rb_node *prev;
6756 
6757 	prev = rb_prev(&em->rb_node);
6758 	if (!prev)
6759 		return NULL;
6760 	return container_of(prev, struct extent_map, rb_node);
6761 }
6762 
6763 /* helper for btfs_get_extent.  Given an existing extent in the tree,
6764  * the existing extent is the nearest extent to map_start,
6765  * and an extent that you want to insert, deal with overlap and insert
6766  * the best fitted new extent into the tree.
6767  */
6768 static int merge_extent_mapping(struct extent_map_tree *em_tree,
6769 				struct extent_map *existing,
6770 				struct extent_map *em,
6771 				u64 map_start)
6772 {
6773 	struct extent_map *prev;
6774 	struct extent_map *next;
6775 	u64 start;
6776 	u64 end;
6777 	u64 start_diff;
6778 
6779 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
6780 
6781 	if (existing->start > map_start) {
6782 		next = existing;
6783 		prev = prev_extent_map(next);
6784 	} else {
6785 		prev = existing;
6786 		next = next_extent_map(prev);
6787 	}
6788 
6789 	start = prev ? extent_map_end(prev) : em->start;
6790 	start = max_t(u64, start, em->start);
6791 	end = next ? next->start : extent_map_end(em);
6792 	end = min_t(u64, end, extent_map_end(em));
6793 	start_diff = start - em->start;
6794 	em->start = start;
6795 	em->len = end - start;
6796 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
6797 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
6798 		em->block_start += start_diff;
6799 		em->block_len -= start_diff;
6800 	}
6801 	return add_extent_mapping(em_tree, em, 0);
6802 }
6803 
6804 static noinline int uncompress_inline(struct btrfs_path *path,
6805 				      struct page *page,
6806 				      size_t pg_offset, u64 extent_offset,
6807 				      struct btrfs_file_extent_item *item)
6808 {
6809 	int ret;
6810 	struct extent_buffer *leaf = path->nodes[0];
6811 	char *tmp;
6812 	size_t max_size;
6813 	unsigned long inline_size;
6814 	unsigned long ptr;
6815 	int compress_type;
6816 
6817 	WARN_ON(pg_offset != 0);
6818 	compress_type = btrfs_file_extent_compression(leaf, item);
6819 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
6820 	inline_size = btrfs_file_extent_inline_item_len(leaf,
6821 					btrfs_item_nr(path->slots[0]));
6822 	tmp = kmalloc(inline_size, GFP_NOFS);
6823 	if (!tmp)
6824 		return -ENOMEM;
6825 	ptr = btrfs_file_extent_inline_start(item);
6826 
6827 	read_extent_buffer(leaf, tmp, ptr, inline_size);
6828 
6829 	max_size = min_t(unsigned long, PAGE_SIZE, max_size);
6830 	ret = btrfs_decompress(compress_type, tmp, page,
6831 			       extent_offset, inline_size, max_size);
6832 
6833 	/*
6834 	 * decompression code contains a memset to fill in any space between the end
6835 	 * of the uncompressed data and the end of max_size in case the decompressed
6836 	 * data ends up shorter than ram_bytes.  That doesn't cover the hole between
6837 	 * the end of an inline extent and the beginning of the next block, so we
6838 	 * cover that region here.
6839 	 */
6840 
6841 	if (max_size + pg_offset < PAGE_SIZE) {
6842 		char *map = kmap(page);
6843 		memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
6844 		kunmap(page);
6845 	}
6846 	kfree(tmp);
6847 	return ret;
6848 }
6849 
6850 /*
6851  * a bit scary, this does extent mapping from logical file offset to the disk.
6852  * the ugly parts come from merging extents from the disk with the in-ram
6853  * representation.  This gets more complex because of the data=ordered code,
6854  * where the in-ram extents might be locked pending data=ordered completion.
6855  *
6856  * This also copies inline extents directly into the page.
6857  */
6858 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6859 		struct page *page,
6860 	    size_t pg_offset, u64 start, u64 len,
6861 		int create)
6862 {
6863 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
6864 	int ret;
6865 	int err = 0;
6866 	u64 extent_start = 0;
6867 	u64 extent_end = 0;
6868 	u64 objectid = btrfs_ino(inode);
6869 	u32 found_type;
6870 	struct btrfs_path *path = NULL;
6871 	struct btrfs_root *root = inode->root;
6872 	struct btrfs_file_extent_item *item;
6873 	struct extent_buffer *leaf;
6874 	struct btrfs_key found_key;
6875 	struct extent_map *em = NULL;
6876 	struct extent_map_tree *em_tree = &inode->extent_tree;
6877 	struct extent_io_tree *io_tree = &inode->io_tree;
6878 	struct btrfs_trans_handle *trans = NULL;
6879 	const bool new_inline = !page || create;
6880 
6881 again:
6882 	read_lock(&em_tree->lock);
6883 	em = lookup_extent_mapping(em_tree, start, len);
6884 	if (em)
6885 		em->bdev = fs_info->fs_devices->latest_bdev;
6886 	read_unlock(&em_tree->lock);
6887 
6888 	if (em) {
6889 		if (em->start > start || em->start + em->len <= start)
6890 			free_extent_map(em);
6891 		else if (em->block_start == EXTENT_MAP_INLINE && page)
6892 			free_extent_map(em);
6893 		else
6894 			goto out;
6895 	}
6896 	em = alloc_extent_map();
6897 	if (!em) {
6898 		err = -ENOMEM;
6899 		goto out;
6900 	}
6901 	em->bdev = fs_info->fs_devices->latest_bdev;
6902 	em->start = EXTENT_MAP_HOLE;
6903 	em->orig_start = EXTENT_MAP_HOLE;
6904 	em->len = (u64)-1;
6905 	em->block_len = (u64)-1;
6906 
6907 	if (!path) {
6908 		path = btrfs_alloc_path();
6909 		if (!path) {
6910 			err = -ENOMEM;
6911 			goto out;
6912 		}
6913 		/*
6914 		 * Chances are we'll be called again, so go ahead and do
6915 		 * readahead
6916 		 */
6917 		path->reada = READA_FORWARD;
6918 	}
6919 
6920 	ret = btrfs_lookup_file_extent(trans, root, path,
6921 				       objectid, start, trans != NULL);
6922 	if (ret < 0) {
6923 		err = ret;
6924 		goto out;
6925 	}
6926 
6927 	if (ret != 0) {
6928 		if (path->slots[0] == 0)
6929 			goto not_found;
6930 		path->slots[0]--;
6931 	}
6932 
6933 	leaf = path->nodes[0];
6934 	item = btrfs_item_ptr(leaf, path->slots[0],
6935 			      struct btrfs_file_extent_item);
6936 	/* are we inside the extent that was found? */
6937 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6938 	found_type = found_key.type;
6939 	if (found_key.objectid != objectid ||
6940 	    found_type != BTRFS_EXTENT_DATA_KEY) {
6941 		/*
6942 		 * If we backup past the first extent we want to move forward
6943 		 * and see if there is an extent in front of us, otherwise we'll
6944 		 * say there is a hole for our whole search range which can
6945 		 * cause problems.
6946 		 */
6947 		extent_end = start;
6948 		goto next;
6949 	}
6950 
6951 	found_type = btrfs_file_extent_type(leaf, item);
6952 	extent_start = found_key.offset;
6953 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6954 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6955 		extent_end = extent_start +
6956 		       btrfs_file_extent_num_bytes(leaf, item);
6957 
6958 		trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6959 						       extent_start);
6960 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6961 		size_t size;
6962 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
6963 		extent_end = ALIGN(extent_start + size,
6964 				   fs_info->sectorsize);
6965 
6966 		trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6967 						      path->slots[0],
6968 						      extent_start);
6969 	}
6970 next:
6971 	if (start >= extent_end) {
6972 		path->slots[0]++;
6973 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6974 			ret = btrfs_next_leaf(root, path);
6975 			if (ret < 0) {
6976 				err = ret;
6977 				goto out;
6978 			}
6979 			if (ret > 0)
6980 				goto not_found;
6981 			leaf = path->nodes[0];
6982 		}
6983 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6984 		if (found_key.objectid != objectid ||
6985 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6986 			goto not_found;
6987 		if (start + len <= found_key.offset)
6988 			goto not_found;
6989 		if (start > found_key.offset)
6990 			goto next;
6991 		em->start = start;
6992 		em->orig_start = start;
6993 		em->len = found_key.offset - start;
6994 		goto not_found_em;
6995 	}
6996 
6997 	btrfs_extent_item_to_extent_map(inode, path, item,
6998 			new_inline, em);
6999 
7000 	if (found_type == BTRFS_FILE_EXTENT_REG ||
7001 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7002 		goto insert;
7003 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
7004 		unsigned long ptr;
7005 		char *map;
7006 		size_t size;
7007 		size_t extent_offset;
7008 		size_t copy_size;
7009 
7010 		if (new_inline)
7011 			goto out;
7012 
7013 		size = btrfs_file_extent_inline_len(leaf, path->slots[0], item);
7014 		extent_offset = page_offset(page) + pg_offset - extent_start;
7015 		copy_size = min_t(u64, PAGE_SIZE - pg_offset,
7016 				  size - extent_offset);
7017 		em->start = extent_start + extent_offset;
7018 		em->len = ALIGN(copy_size, fs_info->sectorsize);
7019 		em->orig_block_len = em->len;
7020 		em->orig_start = em->start;
7021 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
7022 		if (create == 0 && !PageUptodate(page)) {
7023 			if (btrfs_file_extent_compression(leaf, item) !=
7024 			    BTRFS_COMPRESS_NONE) {
7025 				ret = uncompress_inline(path, page, pg_offset,
7026 							extent_offset, item);
7027 				if (ret) {
7028 					err = ret;
7029 					goto out;
7030 				}
7031 			} else {
7032 				map = kmap(page);
7033 				read_extent_buffer(leaf, map + pg_offset, ptr,
7034 						   copy_size);
7035 				if (pg_offset + copy_size < PAGE_SIZE) {
7036 					memset(map + pg_offset + copy_size, 0,
7037 					       PAGE_SIZE - pg_offset -
7038 					       copy_size);
7039 				}
7040 				kunmap(page);
7041 			}
7042 			flush_dcache_page(page);
7043 		} else if (create && PageUptodate(page)) {
7044 			BUG();
7045 			if (!trans) {
7046 				kunmap(page);
7047 				free_extent_map(em);
7048 				em = NULL;
7049 
7050 				btrfs_release_path(path);
7051 				trans = btrfs_join_transaction(root);
7052 
7053 				if (IS_ERR(trans))
7054 					return ERR_CAST(trans);
7055 				goto again;
7056 			}
7057 			map = kmap(page);
7058 			write_extent_buffer(leaf, map + pg_offset, ptr,
7059 					    copy_size);
7060 			kunmap(page);
7061 			btrfs_mark_buffer_dirty(leaf);
7062 		}
7063 		set_extent_uptodate(io_tree, em->start,
7064 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
7065 		goto insert;
7066 	}
7067 not_found:
7068 	em->start = start;
7069 	em->orig_start = start;
7070 	em->len = len;
7071 not_found_em:
7072 	em->block_start = EXTENT_MAP_HOLE;
7073 	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
7074 insert:
7075 	btrfs_release_path(path);
7076 	if (em->start > start || extent_map_end(em) <= start) {
7077 		btrfs_err(fs_info,
7078 			  "bad extent! em: [%llu %llu] passed [%llu %llu]",
7079 			  em->start, em->len, start, len);
7080 		err = -EIO;
7081 		goto out;
7082 	}
7083 
7084 	err = 0;
7085 	write_lock(&em_tree->lock);
7086 	ret = add_extent_mapping(em_tree, em, 0);
7087 	/* it is possible that someone inserted the extent into the tree
7088 	 * while we had the lock dropped.  It is also possible that
7089 	 * an overlapping map exists in the tree
7090 	 */
7091 	if (ret == -EEXIST) {
7092 		struct extent_map *existing;
7093 
7094 		ret = 0;
7095 
7096 		existing = search_extent_mapping(em_tree, start, len);
7097 		/*
7098 		 * existing will always be non-NULL, since there must be
7099 		 * extent causing the -EEXIST.
7100 		 */
7101 		if (existing->start == em->start &&
7102 		    extent_map_end(existing) >= extent_map_end(em) &&
7103 		    em->block_start == existing->block_start) {
7104 			/*
7105 			 * The existing extent map already encompasses the
7106 			 * entire extent map we tried to add.
7107 			 */
7108 			free_extent_map(em);
7109 			em = existing;
7110 			err = 0;
7111 
7112 		} else if (start >= extent_map_end(existing) ||
7113 		    start <= existing->start) {
7114 			/*
7115 			 * The existing extent map is the one nearest to
7116 			 * the [start, start + len) range which overlaps
7117 			 */
7118 			err = merge_extent_mapping(em_tree, existing,
7119 						   em, start);
7120 			free_extent_map(existing);
7121 			if (err) {
7122 				free_extent_map(em);
7123 				em = NULL;
7124 			}
7125 		} else {
7126 			free_extent_map(em);
7127 			em = existing;
7128 			err = 0;
7129 		}
7130 	}
7131 	write_unlock(&em_tree->lock);
7132 out:
7133 
7134 	trace_btrfs_get_extent(root, inode, em);
7135 
7136 	btrfs_free_path(path);
7137 	if (trans) {
7138 		ret = btrfs_end_transaction(trans);
7139 		if (!err)
7140 			err = ret;
7141 	}
7142 	if (err) {
7143 		free_extent_map(em);
7144 		return ERR_PTR(err);
7145 	}
7146 	BUG_ON(!em); /* Error is always set */
7147 	return em;
7148 }
7149 
7150 struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
7151 		struct page *page,
7152 		size_t pg_offset, u64 start, u64 len,
7153 		int create)
7154 {
7155 	struct extent_map *em;
7156 	struct extent_map *hole_em = NULL;
7157 	u64 range_start = start;
7158 	u64 end;
7159 	u64 found;
7160 	u64 found_end;
7161 	int err = 0;
7162 
7163 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
7164 	if (IS_ERR(em))
7165 		return em;
7166 	/*
7167 	 * If our em maps to:
7168 	 * - a hole or
7169 	 * - a pre-alloc extent,
7170 	 * there might actually be delalloc bytes behind it.
7171 	 */
7172 	if (em->block_start != EXTENT_MAP_HOLE &&
7173 	    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7174 		return em;
7175 	else
7176 		hole_em = em;
7177 
7178 	/* check to see if we've wrapped (len == -1 or similar) */
7179 	end = start + len;
7180 	if (end < start)
7181 		end = (u64)-1;
7182 	else
7183 		end -= 1;
7184 
7185 	em = NULL;
7186 
7187 	/* ok, we didn't find anything, lets look for delalloc */
7188 	found = count_range_bits(&inode->io_tree, &range_start,
7189 				 end, len, EXTENT_DELALLOC, 1);
7190 	found_end = range_start + found;
7191 	if (found_end < range_start)
7192 		found_end = (u64)-1;
7193 
7194 	/*
7195 	 * we didn't find anything useful, return
7196 	 * the original results from get_extent()
7197 	 */
7198 	if (range_start > end || found_end <= start) {
7199 		em = hole_em;
7200 		hole_em = NULL;
7201 		goto out;
7202 	}
7203 
7204 	/* adjust the range_start to make sure it doesn't
7205 	 * go backwards from the start they passed in
7206 	 */
7207 	range_start = max(start, range_start);
7208 	found = found_end - range_start;
7209 
7210 	if (found > 0) {
7211 		u64 hole_start = start;
7212 		u64 hole_len = len;
7213 
7214 		em = alloc_extent_map();
7215 		if (!em) {
7216 			err = -ENOMEM;
7217 			goto out;
7218 		}
7219 		/*
7220 		 * when btrfs_get_extent can't find anything it
7221 		 * returns one huge hole
7222 		 *
7223 		 * make sure what it found really fits our range, and
7224 		 * adjust to make sure it is based on the start from
7225 		 * the caller
7226 		 */
7227 		if (hole_em) {
7228 			u64 calc_end = extent_map_end(hole_em);
7229 
7230 			if (calc_end <= start || (hole_em->start > end)) {
7231 				free_extent_map(hole_em);
7232 				hole_em = NULL;
7233 			} else {
7234 				hole_start = max(hole_em->start, start);
7235 				hole_len = calc_end - hole_start;
7236 			}
7237 		}
7238 		em->bdev = NULL;
7239 		if (hole_em && range_start > hole_start) {
7240 			/* our hole starts before our delalloc, so we
7241 			 * have to return just the parts of the hole
7242 			 * that go until  the delalloc starts
7243 			 */
7244 			em->len = min(hole_len,
7245 				      range_start - hole_start);
7246 			em->start = hole_start;
7247 			em->orig_start = hole_start;
7248 			/*
7249 			 * don't adjust block start at all,
7250 			 * it is fixed at EXTENT_MAP_HOLE
7251 			 */
7252 			em->block_start = hole_em->block_start;
7253 			em->block_len = hole_len;
7254 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
7255 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
7256 		} else {
7257 			em->start = range_start;
7258 			em->len = found;
7259 			em->orig_start = range_start;
7260 			em->block_start = EXTENT_MAP_DELALLOC;
7261 			em->block_len = found;
7262 		}
7263 	} else if (hole_em) {
7264 		return hole_em;
7265 	}
7266 out:
7267 
7268 	free_extent_map(hole_em);
7269 	if (err) {
7270 		free_extent_map(em);
7271 		return ERR_PTR(err);
7272 	}
7273 	return em;
7274 }
7275 
7276 static struct extent_map *btrfs_create_dio_extent(struct inode *inode,
7277 						  const u64 start,
7278 						  const u64 len,
7279 						  const u64 orig_start,
7280 						  const u64 block_start,
7281 						  const u64 block_len,
7282 						  const u64 orig_block_len,
7283 						  const u64 ram_bytes,
7284 						  const int type)
7285 {
7286 	struct extent_map *em = NULL;
7287 	int ret;
7288 
7289 	if (type != BTRFS_ORDERED_NOCOW) {
7290 		em = create_io_em(inode, start, len, orig_start,
7291 				  block_start, block_len, orig_block_len,
7292 				  ram_bytes,
7293 				  BTRFS_COMPRESS_NONE, /* compress_type */
7294 				  type);
7295 		if (IS_ERR(em))
7296 			goto out;
7297 	}
7298 	ret = btrfs_add_ordered_extent_dio(inode, start, block_start,
7299 					   len, block_len, type);
7300 	if (ret) {
7301 		if (em) {
7302 			free_extent_map(em);
7303 			btrfs_drop_extent_cache(BTRFS_I(inode), start,
7304 						start + len - 1, 0);
7305 		}
7306 		em = ERR_PTR(ret);
7307 	}
7308  out:
7309 
7310 	return em;
7311 }
7312 
7313 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
7314 						  u64 start, u64 len)
7315 {
7316 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7317 	struct btrfs_root *root = BTRFS_I(inode)->root;
7318 	struct extent_map *em;
7319 	struct btrfs_key ins;
7320 	u64 alloc_hint;
7321 	int ret;
7322 
7323 	alloc_hint = get_extent_allocation_hint(inode, start, len);
7324 	ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize,
7325 				   0, alloc_hint, &ins, 1, 1);
7326 	if (ret)
7327 		return ERR_PTR(ret);
7328 
7329 	em = btrfs_create_dio_extent(inode, start, ins.offset, start,
7330 				     ins.objectid, ins.offset, ins.offset,
7331 				     ins.offset, BTRFS_ORDERED_REGULAR);
7332 	btrfs_dec_block_group_reservations(fs_info, ins.objectid);
7333 	if (IS_ERR(em))
7334 		btrfs_free_reserved_extent(fs_info, ins.objectid,
7335 					   ins.offset, 1);
7336 
7337 	return em;
7338 }
7339 
7340 /*
7341  * returns 1 when the nocow is safe, < 1 on error, 0 if the
7342  * block must be cow'd
7343  */
7344 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
7345 			      u64 *orig_start, u64 *orig_block_len,
7346 			      u64 *ram_bytes)
7347 {
7348 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7349 	struct btrfs_path *path;
7350 	int ret;
7351 	struct extent_buffer *leaf;
7352 	struct btrfs_root *root = BTRFS_I(inode)->root;
7353 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7354 	struct btrfs_file_extent_item *fi;
7355 	struct btrfs_key key;
7356 	u64 disk_bytenr;
7357 	u64 backref_offset;
7358 	u64 extent_end;
7359 	u64 num_bytes;
7360 	int slot;
7361 	int found_type;
7362 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
7363 
7364 	path = btrfs_alloc_path();
7365 	if (!path)
7366 		return -ENOMEM;
7367 
7368 	ret = btrfs_lookup_file_extent(NULL, root, path,
7369 			btrfs_ino(BTRFS_I(inode)), offset, 0);
7370 	if (ret < 0)
7371 		goto out;
7372 
7373 	slot = path->slots[0];
7374 	if (ret == 1) {
7375 		if (slot == 0) {
7376 			/* can't find the item, must cow */
7377 			ret = 0;
7378 			goto out;
7379 		}
7380 		slot--;
7381 	}
7382 	ret = 0;
7383 	leaf = path->nodes[0];
7384 	btrfs_item_key_to_cpu(leaf, &key, slot);
7385 	if (key.objectid != btrfs_ino(BTRFS_I(inode)) ||
7386 	    key.type != BTRFS_EXTENT_DATA_KEY) {
7387 		/* not our file or wrong item type, must cow */
7388 		goto out;
7389 	}
7390 
7391 	if (key.offset > offset) {
7392 		/* Wrong offset, must cow */
7393 		goto out;
7394 	}
7395 
7396 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
7397 	found_type = btrfs_file_extent_type(leaf, fi);
7398 	if (found_type != BTRFS_FILE_EXTENT_REG &&
7399 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
7400 		/* not a regular extent, must cow */
7401 		goto out;
7402 	}
7403 
7404 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
7405 		goto out;
7406 
7407 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
7408 	if (extent_end <= offset)
7409 		goto out;
7410 
7411 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7412 	if (disk_bytenr == 0)
7413 		goto out;
7414 
7415 	if (btrfs_file_extent_compression(leaf, fi) ||
7416 	    btrfs_file_extent_encryption(leaf, fi) ||
7417 	    btrfs_file_extent_other_encoding(leaf, fi))
7418 		goto out;
7419 
7420 	backref_offset = btrfs_file_extent_offset(leaf, fi);
7421 
7422 	if (orig_start) {
7423 		*orig_start = key.offset - backref_offset;
7424 		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
7425 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
7426 	}
7427 
7428 	if (btrfs_extent_readonly(fs_info, disk_bytenr))
7429 		goto out;
7430 
7431 	num_bytes = min(offset + *len, extent_end) - offset;
7432 	if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7433 		u64 range_end;
7434 
7435 		range_end = round_up(offset + num_bytes,
7436 				     root->fs_info->sectorsize) - 1;
7437 		ret = test_range_bit(io_tree, offset, range_end,
7438 				     EXTENT_DELALLOC, 0, NULL);
7439 		if (ret) {
7440 			ret = -EAGAIN;
7441 			goto out;
7442 		}
7443 	}
7444 
7445 	btrfs_release_path(path);
7446 
7447 	/*
7448 	 * look for other files referencing this extent, if we
7449 	 * find any we must cow
7450 	 */
7451 
7452 	ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
7453 				    key.offset - backref_offset, disk_bytenr);
7454 	if (ret) {
7455 		ret = 0;
7456 		goto out;
7457 	}
7458 
7459 	/*
7460 	 * adjust disk_bytenr and num_bytes to cover just the bytes
7461 	 * in this extent we are about to write.  If there
7462 	 * are any csums in that range we have to cow in order
7463 	 * to keep the csums correct
7464 	 */
7465 	disk_bytenr += backref_offset;
7466 	disk_bytenr += offset - key.offset;
7467 	if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes))
7468 		goto out;
7469 	/*
7470 	 * all of the above have passed, it is safe to overwrite this extent
7471 	 * without cow
7472 	 */
7473 	*len = num_bytes;
7474 	ret = 1;
7475 out:
7476 	btrfs_free_path(path);
7477 	return ret;
7478 }
7479 
7480 bool btrfs_page_exists_in_range(struct inode *inode, loff_t start, loff_t end)
7481 {
7482 	struct radix_tree_root *root = &inode->i_mapping->page_tree;
7483 	int found = false;
7484 	void **pagep = NULL;
7485 	struct page *page = NULL;
7486 	unsigned long start_idx;
7487 	unsigned long end_idx;
7488 
7489 	start_idx = start >> PAGE_SHIFT;
7490 
7491 	/*
7492 	 * end is the last byte in the last page.  end == start is legal
7493 	 */
7494 	end_idx = end >> PAGE_SHIFT;
7495 
7496 	rcu_read_lock();
7497 
7498 	/* Most of the code in this while loop is lifted from
7499 	 * find_get_page.  It's been modified to begin searching from a
7500 	 * page and return just the first page found in that range.  If the
7501 	 * found idx is less than or equal to the end idx then we know that
7502 	 * a page exists.  If no pages are found or if those pages are
7503 	 * outside of the range then we're fine (yay!) */
7504 	while (page == NULL &&
7505 	       radix_tree_gang_lookup_slot(root, &pagep, NULL, start_idx, 1)) {
7506 		page = radix_tree_deref_slot(pagep);
7507 		if (unlikely(!page))
7508 			break;
7509 
7510 		if (radix_tree_exception(page)) {
7511 			if (radix_tree_deref_retry(page)) {
7512 				page = NULL;
7513 				continue;
7514 			}
7515 			/*
7516 			 * Otherwise, shmem/tmpfs must be storing a swap entry
7517 			 * here as an exceptional entry: so return it without
7518 			 * attempting to raise page count.
7519 			 */
7520 			page = NULL;
7521 			break; /* TODO: Is this relevant for this use case? */
7522 		}
7523 
7524 		if (!page_cache_get_speculative(page)) {
7525 			page = NULL;
7526 			continue;
7527 		}
7528 
7529 		/*
7530 		 * Has the page moved?
7531 		 * This is part of the lockless pagecache protocol. See
7532 		 * include/linux/pagemap.h for details.
7533 		 */
7534 		if (unlikely(page != *pagep)) {
7535 			put_page(page);
7536 			page = NULL;
7537 		}
7538 	}
7539 
7540 	if (page) {
7541 		if (page->index <= end_idx)
7542 			found = true;
7543 		put_page(page);
7544 	}
7545 
7546 	rcu_read_unlock();
7547 	return found;
7548 }
7549 
7550 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
7551 			      struct extent_state **cached_state, int writing)
7552 {
7553 	struct btrfs_ordered_extent *ordered;
7554 	int ret = 0;
7555 
7556 	while (1) {
7557 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7558 				 cached_state);
7559 		/*
7560 		 * We're concerned with the entire range that we're going to be
7561 		 * doing DIO to, so we need to make sure there's no ordered
7562 		 * extents in this range.
7563 		 */
7564 		ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), lockstart,
7565 						     lockend - lockstart + 1);
7566 
7567 		/*
7568 		 * We need to make sure there are no buffered pages in this
7569 		 * range either, we could have raced between the invalidate in
7570 		 * generic_file_direct_write and locking the extent.  The
7571 		 * invalidate needs to happen so that reads after a write do not
7572 		 * get stale data.
7573 		 */
7574 		if (!ordered &&
7575 		    (!writing ||
7576 		     !btrfs_page_exists_in_range(inode, lockstart, lockend)))
7577 			break;
7578 
7579 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7580 				     cached_state, GFP_NOFS);
7581 
7582 		if (ordered) {
7583 			/*
7584 			 * If we are doing a DIO read and the ordered extent we
7585 			 * found is for a buffered write, we can not wait for it
7586 			 * to complete and retry, because if we do so we can
7587 			 * deadlock with concurrent buffered writes on page
7588 			 * locks. This happens only if our DIO read covers more
7589 			 * than one extent map, if at this point has already
7590 			 * created an ordered extent for a previous extent map
7591 			 * and locked its range in the inode's io tree, and a
7592 			 * concurrent write against that previous extent map's
7593 			 * range and this range started (we unlock the ranges
7594 			 * in the io tree only when the bios complete and
7595 			 * buffered writes always lock pages before attempting
7596 			 * to lock range in the io tree).
7597 			 */
7598 			if (writing ||
7599 			    test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags))
7600 				btrfs_start_ordered_extent(inode, ordered, 1);
7601 			else
7602 				ret = -ENOTBLK;
7603 			btrfs_put_ordered_extent(ordered);
7604 		} else {
7605 			/*
7606 			 * We could trigger writeback for this range (and wait
7607 			 * for it to complete) and then invalidate the pages for
7608 			 * this range (through invalidate_inode_pages2_range()),
7609 			 * but that can lead us to a deadlock with a concurrent
7610 			 * call to readpages() (a buffered read or a defrag call
7611 			 * triggered a readahead) on a page lock due to an
7612 			 * ordered dio extent we created before but did not have
7613 			 * yet a corresponding bio submitted (whence it can not
7614 			 * complete), which makes readpages() wait for that
7615 			 * ordered extent to complete while holding a lock on
7616 			 * that page.
7617 			 */
7618 			ret = -ENOTBLK;
7619 		}
7620 
7621 		if (ret)
7622 			break;
7623 
7624 		cond_resched();
7625 	}
7626 
7627 	return ret;
7628 }
7629 
7630 /* The callers of this must take lock_extent() */
7631 static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len,
7632 				       u64 orig_start, u64 block_start,
7633 				       u64 block_len, u64 orig_block_len,
7634 				       u64 ram_bytes, int compress_type,
7635 				       int type)
7636 {
7637 	struct extent_map_tree *em_tree;
7638 	struct extent_map *em;
7639 	struct btrfs_root *root = BTRFS_I(inode)->root;
7640 	int ret;
7641 
7642 	ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7643 	       type == BTRFS_ORDERED_COMPRESSED ||
7644 	       type == BTRFS_ORDERED_NOCOW ||
7645 	       type == BTRFS_ORDERED_REGULAR);
7646 
7647 	em_tree = &BTRFS_I(inode)->extent_tree;
7648 	em = alloc_extent_map();
7649 	if (!em)
7650 		return ERR_PTR(-ENOMEM);
7651 
7652 	em->start = start;
7653 	em->orig_start = orig_start;
7654 	em->len = len;
7655 	em->block_len = block_len;
7656 	em->block_start = block_start;
7657 	em->bdev = root->fs_info->fs_devices->latest_bdev;
7658 	em->orig_block_len = orig_block_len;
7659 	em->ram_bytes = ram_bytes;
7660 	em->generation = -1;
7661 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
7662 	if (type == BTRFS_ORDERED_PREALLOC) {
7663 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
7664 	} else if (type == BTRFS_ORDERED_COMPRESSED) {
7665 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
7666 		em->compress_type = compress_type;
7667 	}
7668 
7669 	do {
7670 		btrfs_drop_extent_cache(BTRFS_I(inode), em->start,
7671 				em->start + em->len - 1, 0);
7672 		write_lock(&em_tree->lock);
7673 		ret = add_extent_mapping(em_tree, em, 1);
7674 		write_unlock(&em_tree->lock);
7675 		/*
7676 		 * The caller has taken lock_extent(), who could race with us
7677 		 * to add em?
7678 		 */
7679 	} while (ret == -EEXIST);
7680 
7681 	if (ret) {
7682 		free_extent_map(em);
7683 		return ERR_PTR(ret);
7684 	}
7685 
7686 	/* em got 2 refs now, callers needs to do free_extent_map once. */
7687 	return em;
7688 }
7689 
7690 static void adjust_dio_outstanding_extents(struct inode *inode,
7691 					   struct btrfs_dio_data *dio_data,
7692 					   const u64 len)
7693 {
7694 	unsigned num_extents = count_max_extents(len);
7695 
7696 	/*
7697 	 * If we have an outstanding_extents count still set then we're
7698 	 * within our reservation, otherwise we need to adjust our inode
7699 	 * counter appropriately.
7700 	 */
7701 	if (dio_data->outstanding_extents >= num_extents) {
7702 		dio_data->outstanding_extents -= num_extents;
7703 	} else {
7704 		/*
7705 		 * If dio write length has been split due to no large enough
7706 		 * contiguous space, we need to compensate our inode counter
7707 		 * appropriately.
7708 		 */
7709 		u64 num_needed = num_extents - dio_data->outstanding_extents;
7710 
7711 		spin_lock(&BTRFS_I(inode)->lock);
7712 		BTRFS_I(inode)->outstanding_extents += num_needed;
7713 		spin_unlock(&BTRFS_I(inode)->lock);
7714 	}
7715 }
7716 
7717 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7718 				   struct buffer_head *bh_result, int create)
7719 {
7720 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7721 	struct extent_map *em;
7722 	struct extent_state *cached_state = NULL;
7723 	struct btrfs_dio_data *dio_data = NULL;
7724 	u64 start = iblock << inode->i_blkbits;
7725 	u64 lockstart, lockend;
7726 	u64 len = bh_result->b_size;
7727 	int unlock_bits = EXTENT_LOCKED;
7728 	int ret = 0;
7729 
7730 	if (create)
7731 		unlock_bits |= EXTENT_DIRTY;
7732 	else
7733 		len = min_t(u64, len, fs_info->sectorsize);
7734 
7735 	lockstart = start;
7736 	lockend = start + len - 1;
7737 
7738 	if (current->journal_info) {
7739 		/*
7740 		 * Need to pull our outstanding extents and set journal_info to NULL so
7741 		 * that anything that needs to check if there's a transaction doesn't get
7742 		 * confused.
7743 		 */
7744 		dio_data = current->journal_info;
7745 		current->journal_info = NULL;
7746 	}
7747 
7748 	/*
7749 	 * If this errors out it's because we couldn't invalidate pagecache for
7750 	 * this range and we need to fallback to buffered.
7751 	 */
7752 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state,
7753 			       create)) {
7754 		ret = -ENOTBLK;
7755 		goto err;
7756 	}
7757 
7758 	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0);
7759 	if (IS_ERR(em)) {
7760 		ret = PTR_ERR(em);
7761 		goto unlock_err;
7762 	}
7763 
7764 	/*
7765 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
7766 	 * io.  INLINE is special, and we could probably kludge it in here, but
7767 	 * it's still buffered so for safety lets just fall back to the generic
7768 	 * buffered path.
7769 	 *
7770 	 * For COMPRESSED we _have_ to read the entire extent in so we can
7771 	 * decompress it, so there will be buffering required no matter what we
7772 	 * do, so go ahead and fallback to buffered.
7773 	 *
7774 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
7775 	 * to buffered IO.  Don't blame me, this is the price we pay for using
7776 	 * the generic code.
7777 	 */
7778 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
7779 	    em->block_start == EXTENT_MAP_INLINE) {
7780 		free_extent_map(em);
7781 		ret = -ENOTBLK;
7782 		goto unlock_err;
7783 	}
7784 
7785 	/* Just a good old fashioned hole, return */
7786 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
7787 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
7788 		free_extent_map(em);
7789 		goto unlock_err;
7790 	}
7791 
7792 	/*
7793 	 * We don't allocate a new extent in the following cases
7794 	 *
7795 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
7796 	 * existing extent.
7797 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
7798 	 * just use the extent.
7799 	 *
7800 	 */
7801 	if (!create) {
7802 		len = min(len, em->len - (start - em->start));
7803 		lockstart = start + len;
7804 		goto unlock;
7805 	}
7806 
7807 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
7808 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
7809 	     em->block_start != EXTENT_MAP_HOLE)) {
7810 		int type;
7811 		u64 block_start, orig_start, orig_block_len, ram_bytes;
7812 
7813 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7814 			type = BTRFS_ORDERED_PREALLOC;
7815 		else
7816 			type = BTRFS_ORDERED_NOCOW;
7817 		len = min(len, em->len - (start - em->start));
7818 		block_start = em->block_start + (start - em->start);
7819 
7820 		if (can_nocow_extent(inode, start, &len, &orig_start,
7821 				     &orig_block_len, &ram_bytes) == 1 &&
7822 		    btrfs_inc_nocow_writers(fs_info, block_start)) {
7823 			struct extent_map *em2;
7824 
7825 			em2 = btrfs_create_dio_extent(inode, start, len,
7826 						      orig_start, block_start,
7827 						      len, orig_block_len,
7828 						      ram_bytes, type);
7829 			btrfs_dec_nocow_writers(fs_info, block_start);
7830 			if (type == BTRFS_ORDERED_PREALLOC) {
7831 				free_extent_map(em);
7832 				em = em2;
7833 			}
7834 			if (em2 && IS_ERR(em2)) {
7835 				ret = PTR_ERR(em2);
7836 				goto unlock_err;
7837 			}
7838 			/*
7839 			 * For inode marked NODATACOW or extent marked PREALLOC,
7840 			 * use the existing or preallocated extent, so does not
7841 			 * need to adjust btrfs_space_info's bytes_may_use.
7842 			 */
7843 			btrfs_free_reserved_data_space_noquota(inode,
7844 					start, len);
7845 			goto unlock;
7846 		}
7847 	}
7848 
7849 	/*
7850 	 * this will cow the extent, reset the len in case we changed
7851 	 * it above
7852 	 */
7853 	len = bh_result->b_size;
7854 	free_extent_map(em);
7855 	em = btrfs_new_extent_direct(inode, start, len);
7856 	if (IS_ERR(em)) {
7857 		ret = PTR_ERR(em);
7858 		goto unlock_err;
7859 	}
7860 	len = min(len, em->len - (start - em->start));
7861 unlock:
7862 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
7863 		inode->i_blkbits;
7864 	bh_result->b_size = len;
7865 	bh_result->b_bdev = em->bdev;
7866 	set_buffer_mapped(bh_result);
7867 	if (create) {
7868 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
7869 			set_buffer_new(bh_result);
7870 
7871 		/*
7872 		 * Need to update the i_size under the extent lock so buffered
7873 		 * readers will get the updated i_size when we unlock.
7874 		 */
7875 		if (!dio_data->overwrite && start + len > i_size_read(inode))
7876 			i_size_write(inode, start + len);
7877 
7878 		adjust_dio_outstanding_extents(inode, dio_data, len);
7879 		WARN_ON(dio_data->reserve < len);
7880 		dio_data->reserve -= len;
7881 		dio_data->unsubmitted_oe_range_end = start + len;
7882 		current->journal_info = dio_data;
7883 	}
7884 
7885 	/*
7886 	 * In the case of write we need to clear and unlock the entire range,
7887 	 * in the case of read we need to unlock only the end area that we
7888 	 * aren't using if there is any left over space.
7889 	 */
7890 	if (lockstart < lockend) {
7891 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
7892 				 lockend, unlock_bits, 1, 0,
7893 				 &cached_state, GFP_NOFS);
7894 	} else {
7895 		free_extent_state(cached_state);
7896 	}
7897 
7898 	free_extent_map(em);
7899 
7900 	return 0;
7901 
7902 unlock_err:
7903 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
7904 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
7905 err:
7906 	if (dio_data)
7907 		current->journal_info = dio_data;
7908 	/*
7909 	 * Compensate the delalloc release we do in btrfs_direct_IO() when we
7910 	 * write less data then expected, so that we don't underflow our inode's
7911 	 * outstanding extents counter.
7912 	 */
7913 	if (create && dio_data)
7914 		adjust_dio_outstanding_extents(inode, dio_data, len);
7915 
7916 	return ret;
7917 }
7918 
7919 static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio,
7920 					int mirror_num)
7921 {
7922 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7923 	int ret;
7924 
7925 	BUG_ON(bio_op(bio) == REQ_OP_WRITE);
7926 
7927 	bio_get(bio);
7928 
7929 	ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DIO_REPAIR);
7930 	if (ret)
7931 		goto err;
7932 
7933 	ret = btrfs_map_bio(fs_info, bio, mirror_num, 0);
7934 err:
7935 	bio_put(bio);
7936 	return ret;
7937 }
7938 
7939 static int btrfs_check_dio_repairable(struct inode *inode,
7940 				      struct bio *failed_bio,
7941 				      struct io_failure_record *failrec,
7942 				      int failed_mirror)
7943 {
7944 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
7945 	int num_copies;
7946 
7947 	num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len);
7948 	if (num_copies == 1) {
7949 		/*
7950 		 * we only have a single copy of the data, so don't bother with
7951 		 * all the retry and error correction code that follows. no
7952 		 * matter what the error is, it is very likely to persist.
7953 		 */
7954 		btrfs_debug(fs_info,
7955 			"Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d",
7956 			num_copies, failrec->this_mirror, failed_mirror);
7957 		return 0;
7958 	}
7959 
7960 	failrec->failed_mirror = failed_mirror;
7961 	failrec->this_mirror++;
7962 	if (failrec->this_mirror == failed_mirror)
7963 		failrec->this_mirror++;
7964 
7965 	if (failrec->this_mirror > num_copies) {
7966 		btrfs_debug(fs_info,
7967 			"Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d",
7968 			num_copies, failrec->this_mirror, failed_mirror);
7969 		return 0;
7970 	}
7971 
7972 	return 1;
7973 }
7974 
7975 static int dio_read_error(struct inode *inode, struct bio *failed_bio,
7976 			struct page *page, unsigned int pgoff,
7977 			u64 start, u64 end, int failed_mirror,
7978 			bio_end_io_t *repair_endio, void *repair_arg)
7979 {
7980 	struct io_failure_record *failrec;
7981 	struct bio *bio;
7982 	int isector;
7983 	int read_mode = 0;
7984 	int ret;
7985 
7986 	BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE);
7987 
7988 	ret = btrfs_get_io_failure_record(inode, start, end, &failrec);
7989 	if (ret)
7990 		return ret;
7991 
7992 	ret = btrfs_check_dio_repairable(inode, failed_bio, failrec,
7993 					 failed_mirror);
7994 	if (!ret) {
7995 		free_io_failure(BTRFS_I(inode), failrec);
7996 		return -EIO;
7997 	}
7998 
7999 	if ((failed_bio->bi_vcnt > 1)
8000 		|| (failed_bio->bi_io_vec->bv_len
8001 			> btrfs_inode_sectorsize(inode)))
8002 		read_mode |= REQ_FAILFAST_DEV;
8003 
8004 	isector = start - btrfs_io_bio(failed_bio)->logical;
8005 	isector >>= inode->i_sb->s_blocksize_bits;
8006 	bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page,
8007 				pgoff, isector, repair_endio, repair_arg);
8008 	if (!bio) {
8009 		free_io_failure(BTRFS_I(inode), failrec);
8010 		return -EIO;
8011 	}
8012 	bio_set_op_attrs(bio, REQ_OP_READ, read_mode);
8013 
8014 	btrfs_debug(BTRFS_I(inode)->root->fs_info,
8015 		    "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n",
8016 		    read_mode, failrec->this_mirror, failrec->in_validation);
8017 
8018 	ret = submit_dio_repair_bio(inode, bio, failrec->this_mirror);
8019 	if (ret) {
8020 		free_io_failure(BTRFS_I(inode), failrec);
8021 		bio_put(bio);
8022 	}
8023 
8024 	return ret;
8025 }
8026 
8027 struct btrfs_retry_complete {
8028 	struct completion done;
8029 	struct inode *inode;
8030 	u64 start;
8031 	int uptodate;
8032 };
8033 
8034 static void btrfs_retry_endio_nocsum(struct bio *bio)
8035 {
8036 	struct btrfs_retry_complete *done = bio->bi_private;
8037 	struct bio_vec *bvec;
8038 	int i;
8039 
8040 	if (bio->bi_error)
8041 		goto end;
8042 
8043 	ASSERT(bio->bi_vcnt == 1);
8044 	ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
8045 
8046 	done->uptodate = 1;
8047 	bio_for_each_segment_all(bvec, bio, i)
8048 		clean_io_failure(BTRFS_I(done->inode), done->start,
8049 				 bvec->bv_page, 0);
8050 end:
8051 	complete(&done->done);
8052 	bio_put(bio);
8053 }
8054 
8055 static int __btrfs_correct_data_nocsum(struct inode *inode,
8056 				       struct btrfs_io_bio *io_bio)
8057 {
8058 	struct btrfs_fs_info *fs_info;
8059 	struct bio_vec *bvec;
8060 	struct btrfs_retry_complete done;
8061 	u64 start;
8062 	unsigned int pgoff;
8063 	u32 sectorsize;
8064 	int nr_sectors;
8065 	int i;
8066 	int ret;
8067 
8068 	fs_info = BTRFS_I(inode)->root->fs_info;
8069 	sectorsize = fs_info->sectorsize;
8070 
8071 	start = io_bio->logical;
8072 	done.inode = inode;
8073 
8074 	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
8075 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8076 		pgoff = bvec->bv_offset;
8077 
8078 next_block_or_try_again:
8079 		done.uptodate = 0;
8080 		done.start = start;
8081 		init_completion(&done.done);
8082 
8083 		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
8084 				pgoff, start, start + sectorsize - 1,
8085 				io_bio->mirror_num,
8086 				btrfs_retry_endio_nocsum, &done);
8087 		if (ret)
8088 			return ret;
8089 
8090 		wait_for_completion(&done.done);
8091 
8092 		if (!done.uptodate) {
8093 			/* We might have another mirror, so try again */
8094 			goto next_block_or_try_again;
8095 		}
8096 
8097 		start += sectorsize;
8098 
8099 		nr_sectors--;
8100 		if (nr_sectors) {
8101 			pgoff += sectorsize;
8102 			ASSERT(pgoff < PAGE_SIZE);
8103 			goto next_block_or_try_again;
8104 		}
8105 	}
8106 
8107 	return 0;
8108 }
8109 
8110 static void btrfs_retry_endio(struct bio *bio)
8111 {
8112 	struct btrfs_retry_complete *done = bio->bi_private;
8113 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8114 	struct bio_vec *bvec;
8115 	int uptodate;
8116 	int ret;
8117 	int i;
8118 
8119 	if (bio->bi_error)
8120 		goto end;
8121 
8122 	uptodate = 1;
8123 
8124 	ASSERT(bio->bi_vcnt == 1);
8125 	ASSERT(bio->bi_io_vec->bv_len == btrfs_inode_sectorsize(done->inode));
8126 
8127 	bio_for_each_segment_all(bvec, bio, i) {
8128 		ret = __readpage_endio_check(done->inode, io_bio, i,
8129 					bvec->bv_page, bvec->bv_offset,
8130 					done->start, bvec->bv_len);
8131 		if (!ret)
8132 			clean_io_failure(BTRFS_I(done->inode), done->start,
8133 					bvec->bv_page, bvec->bv_offset);
8134 		else
8135 			uptodate = 0;
8136 	}
8137 
8138 	done->uptodate = uptodate;
8139 end:
8140 	complete(&done->done);
8141 	bio_put(bio);
8142 }
8143 
8144 static int __btrfs_subio_endio_read(struct inode *inode,
8145 				    struct btrfs_io_bio *io_bio, int err)
8146 {
8147 	struct btrfs_fs_info *fs_info;
8148 	struct bio_vec *bvec;
8149 	struct btrfs_retry_complete done;
8150 	u64 start;
8151 	u64 offset = 0;
8152 	u32 sectorsize;
8153 	int nr_sectors;
8154 	unsigned int pgoff;
8155 	int csum_pos;
8156 	int i;
8157 	int ret;
8158 
8159 	fs_info = BTRFS_I(inode)->root->fs_info;
8160 	sectorsize = fs_info->sectorsize;
8161 
8162 	err = 0;
8163 	start = io_bio->logical;
8164 	done.inode = inode;
8165 
8166 	bio_for_each_segment_all(bvec, &io_bio->bio, i) {
8167 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8168 
8169 		pgoff = bvec->bv_offset;
8170 next_block:
8171 		csum_pos = BTRFS_BYTES_TO_BLKS(fs_info, offset);
8172 		ret = __readpage_endio_check(inode, io_bio, csum_pos,
8173 					bvec->bv_page, pgoff, start,
8174 					sectorsize);
8175 		if (likely(!ret))
8176 			goto next;
8177 try_again:
8178 		done.uptodate = 0;
8179 		done.start = start;
8180 		init_completion(&done.done);
8181 
8182 		ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page,
8183 				pgoff, start, start + sectorsize - 1,
8184 				io_bio->mirror_num,
8185 				btrfs_retry_endio, &done);
8186 		if (ret) {
8187 			err = ret;
8188 			goto next;
8189 		}
8190 
8191 		wait_for_completion(&done.done);
8192 
8193 		if (!done.uptodate) {
8194 			/* We might have another mirror, so try again */
8195 			goto try_again;
8196 		}
8197 next:
8198 		offset += sectorsize;
8199 		start += sectorsize;
8200 
8201 		ASSERT(nr_sectors);
8202 
8203 		nr_sectors--;
8204 		if (nr_sectors) {
8205 			pgoff += sectorsize;
8206 			ASSERT(pgoff < PAGE_SIZE);
8207 			goto next_block;
8208 		}
8209 	}
8210 
8211 	return err;
8212 }
8213 
8214 static int btrfs_subio_endio_read(struct inode *inode,
8215 				  struct btrfs_io_bio *io_bio, int err)
8216 {
8217 	bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8218 
8219 	if (skip_csum) {
8220 		if (unlikely(err))
8221 			return __btrfs_correct_data_nocsum(inode, io_bio);
8222 		else
8223 			return 0;
8224 	} else {
8225 		return __btrfs_subio_endio_read(inode, io_bio, err);
8226 	}
8227 }
8228 
8229 static void btrfs_endio_direct_read(struct bio *bio)
8230 {
8231 	struct btrfs_dio_private *dip = bio->bi_private;
8232 	struct inode *inode = dip->inode;
8233 	struct bio *dio_bio;
8234 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8235 	int err = bio->bi_error;
8236 
8237 	if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED)
8238 		err = btrfs_subio_endio_read(inode, io_bio, err);
8239 
8240 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
8241 		      dip->logical_offset + dip->bytes - 1);
8242 	dio_bio = dip->dio_bio;
8243 
8244 	kfree(dip);
8245 
8246 	dio_bio->bi_error = bio->bi_error;
8247 	dio_end_io(dio_bio, bio->bi_error);
8248 
8249 	if (io_bio->end_io)
8250 		io_bio->end_io(io_bio, err);
8251 	bio_put(bio);
8252 }
8253 
8254 static void __endio_write_update_ordered(struct inode *inode,
8255 					 const u64 offset, const u64 bytes,
8256 					 const bool uptodate)
8257 {
8258 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8259 	struct btrfs_ordered_extent *ordered = NULL;
8260 	struct btrfs_workqueue *wq;
8261 	btrfs_work_func_t func;
8262 	u64 ordered_offset = offset;
8263 	u64 ordered_bytes = bytes;
8264 	int ret;
8265 
8266 	if (btrfs_is_free_space_inode(BTRFS_I(inode))) {
8267 		wq = fs_info->endio_freespace_worker;
8268 		func = btrfs_freespace_write_helper;
8269 	} else {
8270 		wq = fs_info->endio_write_workers;
8271 		func = btrfs_endio_write_helper;
8272 	}
8273 
8274 again:
8275 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
8276 						   &ordered_offset,
8277 						   ordered_bytes,
8278 						   uptodate);
8279 	if (!ret)
8280 		goto out_test;
8281 
8282 	btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL);
8283 	btrfs_queue_work(wq, &ordered->work);
8284 out_test:
8285 	/*
8286 	 * our bio might span multiple ordered extents.  If we haven't
8287 	 * completed the accounting for the whole dio, go back and try again
8288 	 */
8289 	if (ordered_offset < offset + bytes) {
8290 		ordered_bytes = offset + bytes - ordered_offset;
8291 		ordered = NULL;
8292 		goto again;
8293 	}
8294 }
8295 
8296 static void btrfs_endio_direct_write(struct bio *bio)
8297 {
8298 	struct btrfs_dio_private *dip = bio->bi_private;
8299 	struct bio *dio_bio = dip->dio_bio;
8300 
8301 	__endio_write_update_ordered(dip->inode, dip->logical_offset,
8302 				     dip->bytes, !bio->bi_error);
8303 
8304 	kfree(dip);
8305 
8306 	dio_bio->bi_error = bio->bi_error;
8307 	dio_end_io(dio_bio, bio->bi_error);
8308 	bio_put(bio);
8309 }
8310 
8311 static int __btrfs_submit_bio_start_direct_io(struct inode *inode,
8312 				    struct bio *bio, int mirror_num,
8313 				    unsigned long bio_flags, u64 offset)
8314 {
8315 	int ret;
8316 	ret = btrfs_csum_one_bio(inode, bio, offset, 1);
8317 	BUG_ON(ret); /* -ENOMEM */
8318 	return 0;
8319 }
8320 
8321 static void btrfs_end_dio_bio(struct bio *bio)
8322 {
8323 	struct btrfs_dio_private *dip = bio->bi_private;
8324 	int err = bio->bi_error;
8325 
8326 	if (err)
8327 		btrfs_warn(BTRFS_I(dip->inode)->root->fs_info,
8328 			   "direct IO failed ino %llu rw %d,%u sector %#Lx len %u err no %d",
8329 			   btrfs_ino(BTRFS_I(dip->inode)), bio_op(bio),
8330 			   bio->bi_opf,
8331 			   (unsigned long long)bio->bi_iter.bi_sector,
8332 			   bio->bi_iter.bi_size, err);
8333 
8334 	if (dip->subio_endio)
8335 		err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err);
8336 
8337 	if (err) {
8338 		dip->errors = 1;
8339 
8340 		/*
8341 		 * before atomic variable goto zero, we must make sure
8342 		 * dip->errors is perceived to be set.
8343 		 */
8344 		smp_mb__before_atomic();
8345 	}
8346 
8347 	/* if there are more bios still pending for this dio, just exit */
8348 	if (!atomic_dec_and_test(&dip->pending_bios))
8349 		goto out;
8350 
8351 	if (dip->errors) {
8352 		bio_io_error(dip->orig_bio);
8353 	} else {
8354 		dip->dio_bio->bi_error = 0;
8355 		bio_endio(dip->orig_bio);
8356 	}
8357 out:
8358 	bio_put(bio);
8359 }
8360 
8361 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
8362 				       u64 first_sector, gfp_t gfp_flags)
8363 {
8364 	struct bio *bio;
8365 	bio = btrfs_bio_alloc(bdev, first_sector, BIO_MAX_PAGES, gfp_flags);
8366 	if (bio)
8367 		bio_associate_current(bio);
8368 	return bio;
8369 }
8370 
8371 static inline int btrfs_lookup_and_bind_dio_csum(struct inode *inode,
8372 						 struct btrfs_dio_private *dip,
8373 						 struct bio *bio,
8374 						 u64 file_offset)
8375 {
8376 	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
8377 	struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio);
8378 	int ret;
8379 
8380 	/*
8381 	 * We load all the csum data we need when we submit
8382 	 * the first bio to reduce the csum tree search and
8383 	 * contention.
8384 	 */
8385 	if (dip->logical_offset == file_offset) {
8386 		ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio,
8387 						file_offset);
8388 		if (ret)
8389 			return ret;
8390 	}
8391 
8392 	if (bio == dip->orig_bio)
8393 		return 0;
8394 
8395 	file_offset -= dip->logical_offset;
8396 	file_offset >>= inode->i_sb->s_blocksize_bits;
8397 	io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset);
8398 
8399 	return 0;
8400 }
8401 
8402 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
8403 					 u64 file_offset, int skip_sum,
8404 					 int async_submit)
8405 {
8406 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8407 	struct btrfs_dio_private *dip = bio->bi_private;
8408 	bool write = bio_op(bio) == REQ_OP_WRITE;
8409 	int ret;
8410 
8411 	if (async_submit)
8412 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
8413 
8414 	bio_get(bio);
8415 
8416 	if (!write) {
8417 		ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
8418 		if (ret)
8419 			goto err;
8420 	}
8421 
8422 	if (skip_sum)
8423 		goto map;
8424 
8425 	if (write && async_submit) {
8426 		ret = btrfs_wq_submit_bio(fs_info, inode, bio, 0, 0,
8427 					  file_offset,
8428 					  __btrfs_submit_bio_start_direct_io,
8429 					  __btrfs_submit_bio_done);
8430 		goto err;
8431 	} else if (write) {
8432 		/*
8433 		 * If we aren't doing async submit, calculate the csum of the
8434 		 * bio now.
8435 		 */
8436 		ret = btrfs_csum_one_bio(inode, bio, file_offset, 1);
8437 		if (ret)
8438 			goto err;
8439 	} else {
8440 		ret = btrfs_lookup_and_bind_dio_csum(inode, dip, bio,
8441 						     file_offset);
8442 		if (ret)
8443 			goto err;
8444 	}
8445 map:
8446 	ret = btrfs_map_bio(fs_info, bio, 0, async_submit);
8447 err:
8448 	bio_put(bio);
8449 	return ret;
8450 }
8451 
8452 static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip,
8453 				    int skip_sum)
8454 {
8455 	struct inode *inode = dip->inode;
8456 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8457 	struct btrfs_root *root = BTRFS_I(inode)->root;
8458 	struct bio *bio;
8459 	struct bio *orig_bio = dip->orig_bio;
8460 	struct bio_vec *bvec;
8461 	u64 start_sector = orig_bio->bi_iter.bi_sector;
8462 	u64 file_offset = dip->logical_offset;
8463 	u64 submit_len = 0;
8464 	u64 map_length;
8465 	u32 blocksize = fs_info->sectorsize;
8466 	int async_submit = 0;
8467 	int nr_sectors;
8468 	int ret;
8469 	int i, j;
8470 
8471 	map_length = orig_bio->bi_iter.bi_size;
8472 	ret = btrfs_map_block(fs_info, btrfs_op(orig_bio), start_sector << 9,
8473 			      &map_length, NULL, 0);
8474 	if (ret)
8475 		return -EIO;
8476 
8477 	if (map_length >= orig_bio->bi_iter.bi_size) {
8478 		bio = orig_bio;
8479 		dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED;
8480 		goto submit;
8481 	}
8482 
8483 	/* async crcs make it difficult to collect full stripe writes. */
8484 	if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
8485 		async_submit = 0;
8486 	else
8487 		async_submit = 1;
8488 
8489 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
8490 	if (!bio)
8491 		return -ENOMEM;
8492 
8493 	bio->bi_opf = orig_bio->bi_opf;
8494 	bio->bi_private = dip;
8495 	bio->bi_end_io = btrfs_end_dio_bio;
8496 	btrfs_io_bio(bio)->logical = file_offset;
8497 	atomic_inc(&dip->pending_bios);
8498 
8499 	bio_for_each_segment_all(bvec, orig_bio, j) {
8500 		nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec->bv_len);
8501 		i = 0;
8502 next_block:
8503 		if (unlikely(map_length < submit_len + blocksize ||
8504 		    bio_add_page(bio, bvec->bv_page, blocksize,
8505 			    bvec->bv_offset + (i * blocksize)) < blocksize)) {
8506 			/*
8507 			 * inc the count before we submit the bio so
8508 			 * we know the end IO handler won't happen before
8509 			 * we inc the count. Otherwise, the dip might get freed
8510 			 * before we're done setting it up
8511 			 */
8512 			atomic_inc(&dip->pending_bios);
8513 			ret = __btrfs_submit_dio_bio(bio, inode,
8514 						     file_offset, skip_sum,
8515 						     async_submit);
8516 			if (ret) {
8517 				bio_put(bio);
8518 				atomic_dec(&dip->pending_bios);
8519 				goto out_err;
8520 			}
8521 
8522 			start_sector += submit_len >> 9;
8523 			file_offset += submit_len;
8524 
8525 			submit_len = 0;
8526 
8527 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
8528 						  start_sector, GFP_NOFS);
8529 			if (!bio)
8530 				goto out_err;
8531 			bio->bi_opf = orig_bio->bi_opf;
8532 			bio->bi_private = dip;
8533 			bio->bi_end_io = btrfs_end_dio_bio;
8534 			btrfs_io_bio(bio)->logical = file_offset;
8535 
8536 			map_length = orig_bio->bi_iter.bi_size;
8537 			ret = btrfs_map_block(fs_info, btrfs_op(orig_bio),
8538 					      start_sector << 9,
8539 					      &map_length, NULL, 0);
8540 			if (ret) {
8541 				bio_put(bio);
8542 				goto out_err;
8543 			}
8544 
8545 			goto next_block;
8546 		} else {
8547 			submit_len += blocksize;
8548 			if (--nr_sectors) {
8549 				i++;
8550 				goto next_block;
8551 			}
8552 		}
8553 	}
8554 
8555 submit:
8556 	ret = __btrfs_submit_dio_bio(bio, inode, file_offset, skip_sum,
8557 				     async_submit);
8558 	if (!ret)
8559 		return 0;
8560 
8561 	bio_put(bio);
8562 out_err:
8563 	dip->errors = 1;
8564 	/*
8565 	 * before atomic variable goto zero, we must
8566 	 * make sure dip->errors is perceived to be set.
8567 	 */
8568 	smp_mb__before_atomic();
8569 	if (atomic_dec_and_test(&dip->pending_bios))
8570 		bio_io_error(dip->orig_bio);
8571 
8572 	/* bio_end_io() will handle error, so we needn't return it */
8573 	return 0;
8574 }
8575 
8576 static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
8577 				loff_t file_offset)
8578 {
8579 	struct btrfs_dio_private *dip = NULL;
8580 	struct bio *io_bio = NULL;
8581 	struct btrfs_io_bio *btrfs_bio;
8582 	int skip_sum;
8583 	bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
8584 	int ret = 0;
8585 
8586 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
8587 
8588 	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
8589 	if (!io_bio) {
8590 		ret = -ENOMEM;
8591 		goto free_ordered;
8592 	}
8593 
8594 	dip = kzalloc(sizeof(*dip), GFP_NOFS);
8595 	if (!dip) {
8596 		ret = -ENOMEM;
8597 		goto free_ordered;
8598 	}
8599 
8600 	dip->private = dio_bio->bi_private;
8601 	dip->inode = inode;
8602 	dip->logical_offset = file_offset;
8603 	dip->bytes = dio_bio->bi_iter.bi_size;
8604 	dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
8605 	io_bio->bi_private = dip;
8606 	dip->orig_bio = io_bio;
8607 	dip->dio_bio = dio_bio;
8608 	atomic_set(&dip->pending_bios, 0);
8609 	btrfs_bio = btrfs_io_bio(io_bio);
8610 	btrfs_bio->logical = file_offset;
8611 
8612 	if (write) {
8613 		io_bio->bi_end_io = btrfs_endio_direct_write;
8614 	} else {
8615 		io_bio->bi_end_io = btrfs_endio_direct_read;
8616 		dip->subio_endio = btrfs_subio_endio_read;
8617 	}
8618 
8619 	/*
8620 	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
8621 	 * even if we fail to submit a bio, because in such case we do the
8622 	 * corresponding error handling below and it must not be done a second
8623 	 * time by btrfs_direct_IO().
8624 	 */
8625 	if (write) {
8626 		struct btrfs_dio_data *dio_data = current->journal_info;
8627 
8628 		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
8629 			dip->bytes;
8630 		dio_data->unsubmitted_oe_range_start =
8631 			dio_data->unsubmitted_oe_range_end;
8632 	}
8633 
8634 	ret = btrfs_submit_direct_hook(dip, skip_sum);
8635 	if (!ret)
8636 		return;
8637 
8638 	if (btrfs_bio->end_io)
8639 		btrfs_bio->end_io(btrfs_bio, ret);
8640 
8641 free_ordered:
8642 	/*
8643 	 * If we arrived here it means either we failed to submit the dip
8644 	 * or we either failed to clone the dio_bio or failed to allocate the
8645 	 * dip. If we cloned the dio_bio and allocated the dip, we can just
8646 	 * call bio_endio against our io_bio so that we get proper resource
8647 	 * cleanup if we fail to submit the dip, otherwise, we must do the
8648 	 * same as btrfs_endio_direct_[write|read] because we can't call these
8649 	 * callbacks - they require an allocated dip and a clone of dio_bio.
8650 	 */
8651 	if (io_bio && dip) {
8652 		io_bio->bi_error = -EIO;
8653 		bio_endio(io_bio);
8654 		/*
8655 		 * The end io callbacks free our dip, do the final put on io_bio
8656 		 * and all the cleanup and final put for dio_bio (through
8657 		 * dio_end_io()).
8658 		 */
8659 		dip = NULL;
8660 		io_bio = NULL;
8661 	} else {
8662 		if (write)
8663 			__endio_write_update_ordered(inode,
8664 						file_offset,
8665 						dio_bio->bi_iter.bi_size,
8666 						false);
8667 		else
8668 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
8669 			      file_offset + dio_bio->bi_iter.bi_size - 1);
8670 
8671 		dio_bio->bi_error = -EIO;
8672 		/*
8673 		 * Releases and cleans up our dio_bio, no need to bio_put()
8674 		 * nor bio_endio()/bio_io_error() against dio_bio.
8675 		 */
8676 		dio_end_io(dio_bio, ret);
8677 	}
8678 	if (io_bio)
8679 		bio_put(io_bio);
8680 	kfree(dip);
8681 }
8682 
8683 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
8684 			       struct kiocb *iocb,
8685 			       const struct iov_iter *iter, loff_t offset)
8686 {
8687 	int seg;
8688 	int i;
8689 	unsigned int blocksize_mask = fs_info->sectorsize - 1;
8690 	ssize_t retval = -EINVAL;
8691 
8692 	if (offset & blocksize_mask)
8693 		goto out;
8694 
8695 	if (iov_iter_alignment(iter) & blocksize_mask)
8696 		goto out;
8697 
8698 	/* If this is a write we don't need to check anymore */
8699 	if (iov_iter_rw(iter) != READ || !iter_is_iovec(iter))
8700 		return 0;
8701 	/*
8702 	 * Check to make sure we don't have duplicate iov_base's in this
8703 	 * iovec, if so return EINVAL, otherwise we'll get csum errors
8704 	 * when reading back.
8705 	 */
8706 	for (seg = 0; seg < iter->nr_segs; seg++) {
8707 		for (i = seg + 1; i < iter->nr_segs; i++) {
8708 			if (iter->iov[seg].iov_base == iter->iov[i].iov_base)
8709 				goto out;
8710 		}
8711 	}
8712 	retval = 0;
8713 out:
8714 	return retval;
8715 }
8716 
8717 static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
8718 {
8719 	struct file *file = iocb->ki_filp;
8720 	struct inode *inode = file->f_mapping->host;
8721 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
8722 	struct btrfs_dio_data dio_data = { 0 };
8723 	loff_t offset = iocb->ki_pos;
8724 	size_t count = 0;
8725 	int flags = 0;
8726 	bool wakeup = true;
8727 	bool relock = false;
8728 	ssize_t ret;
8729 
8730 	if (check_direct_IO(fs_info, iocb, iter, offset))
8731 		return 0;
8732 
8733 	inode_dio_begin(inode);
8734 	smp_mb__after_atomic();
8735 
8736 	/*
8737 	 * The generic stuff only does filemap_write_and_wait_range, which
8738 	 * isn't enough if we've written compressed pages to this area, so
8739 	 * we need to flush the dirty pages again to make absolutely sure
8740 	 * that any outstanding dirty pages are on disk.
8741 	 */
8742 	count = iov_iter_count(iter);
8743 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8744 		     &BTRFS_I(inode)->runtime_flags))
8745 		filemap_fdatawrite_range(inode->i_mapping, offset,
8746 					 offset + count - 1);
8747 
8748 	if (iov_iter_rw(iter) == WRITE) {
8749 		/*
8750 		 * If the write DIO is beyond the EOF, we need update
8751 		 * the isize, but it is protected by i_mutex. So we can
8752 		 * not unlock the i_mutex at this case.
8753 		 */
8754 		if (offset + count <= inode->i_size) {
8755 			dio_data.overwrite = 1;
8756 			inode_unlock(inode);
8757 			relock = true;
8758 		}
8759 		ret = btrfs_delalloc_reserve_space(inode, offset, count);
8760 		if (ret)
8761 			goto out;
8762 		dio_data.outstanding_extents = count_max_extents(count);
8763 
8764 		/*
8765 		 * We need to know how many extents we reserved so that we can
8766 		 * do the accounting properly if we go over the number we
8767 		 * originally calculated.  Abuse current->journal_info for this.
8768 		 */
8769 		dio_data.reserve = round_up(count,
8770 					    fs_info->sectorsize);
8771 		dio_data.unsubmitted_oe_range_start = (u64)offset;
8772 		dio_data.unsubmitted_oe_range_end = (u64)offset;
8773 		current->journal_info = &dio_data;
8774 		down_read(&BTRFS_I(inode)->dio_sem);
8775 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
8776 				     &BTRFS_I(inode)->runtime_flags)) {
8777 		inode_dio_end(inode);
8778 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
8779 		wakeup = false;
8780 	}
8781 
8782 	ret = __blockdev_direct_IO(iocb, inode,
8783 				   fs_info->fs_devices->latest_bdev,
8784 				   iter, btrfs_get_blocks_direct, NULL,
8785 				   btrfs_submit_direct, flags);
8786 	if (iov_iter_rw(iter) == WRITE) {
8787 		up_read(&BTRFS_I(inode)->dio_sem);
8788 		current->journal_info = NULL;
8789 		if (ret < 0 && ret != -EIOCBQUEUED) {
8790 			if (dio_data.reserve)
8791 				btrfs_delalloc_release_space(inode, offset,
8792 							     dio_data.reserve);
8793 			/*
8794 			 * On error we might have left some ordered extents
8795 			 * without submitting corresponding bios for them, so
8796 			 * cleanup them up to avoid other tasks getting them
8797 			 * and waiting for them to complete forever.
8798 			 */
8799 			if (dio_data.unsubmitted_oe_range_start <
8800 			    dio_data.unsubmitted_oe_range_end)
8801 				__endio_write_update_ordered(inode,
8802 					dio_data.unsubmitted_oe_range_start,
8803 					dio_data.unsubmitted_oe_range_end -
8804 					dio_data.unsubmitted_oe_range_start,
8805 					false);
8806 		} else if (ret >= 0 && (size_t)ret < count)
8807 			btrfs_delalloc_release_space(inode, offset,
8808 						     count - (size_t)ret);
8809 	}
8810 out:
8811 	if (wakeup)
8812 		inode_dio_end(inode);
8813 	if (relock)
8814 		inode_lock(inode);
8815 
8816 	return ret;
8817 }
8818 
8819 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
8820 
8821 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
8822 		__u64 start, __u64 len)
8823 {
8824 	int	ret;
8825 
8826 	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
8827 	if (ret)
8828 		return ret;
8829 
8830 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
8831 }
8832 
8833 int btrfs_readpage(struct file *file, struct page *page)
8834 {
8835 	struct extent_io_tree *tree;
8836 	tree = &BTRFS_I(page->mapping->host)->io_tree;
8837 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
8838 }
8839 
8840 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
8841 {
8842 	struct extent_io_tree *tree;
8843 	struct inode *inode = page->mapping->host;
8844 	int ret;
8845 
8846 	if (current->flags & PF_MEMALLOC) {
8847 		redirty_page_for_writepage(wbc, page);
8848 		unlock_page(page);
8849 		return 0;
8850 	}
8851 
8852 	/*
8853 	 * If we are under memory pressure we will call this directly from the
8854 	 * VM, we need to make sure we have the inode referenced for the ordered
8855 	 * extent.  If not just return like we didn't do anything.
8856 	 */
8857 	if (!igrab(inode)) {
8858 		redirty_page_for_writepage(wbc, page);
8859 		return AOP_WRITEPAGE_ACTIVATE;
8860 	}
8861 	tree = &BTRFS_I(page->mapping->host)->io_tree;
8862 	ret = extent_write_full_page(tree, page, btrfs_get_extent, wbc);
8863 	btrfs_add_delayed_iput(inode);
8864 	return ret;
8865 }
8866 
8867 static int btrfs_writepages(struct address_space *mapping,
8868 			    struct writeback_control *wbc)
8869 {
8870 	struct extent_io_tree *tree;
8871 
8872 	tree = &BTRFS_I(mapping->host)->io_tree;
8873 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
8874 }
8875 
8876 static int
8877 btrfs_readpages(struct file *file, struct address_space *mapping,
8878 		struct list_head *pages, unsigned nr_pages)
8879 {
8880 	struct extent_io_tree *tree;
8881 	tree = &BTRFS_I(mapping->host)->io_tree;
8882 	return extent_readpages(tree, mapping, pages, nr_pages,
8883 				btrfs_get_extent);
8884 }
8885 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8886 {
8887 	struct extent_io_tree *tree;
8888 	struct extent_map_tree *map;
8889 	int ret;
8890 
8891 	tree = &BTRFS_I(page->mapping->host)->io_tree;
8892 	map = &BTRFS_I(page->mapping->host)->extent_tree;
8893 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
8894 	if (ret == 1) {
8895 		ClearPagePrivate(page);
8896 		set_page_private(page, 0);
8897 		put_page(page);
8898 	}
8899 	return ret;
8900 }
8901 
8902 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
8903 {
8904 	if (PageWriteback(page) || PageDirty(page))
8905 		return 0;
8906 	return __btrfs_releasepage(page, gfp_flags);
8907 }
8908 
8909 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
8910 				 unsigned int length)
8911 {
8912 	struct inode *inode = page->mapping->host;
8913 	struct extent_io_tree *tree;
8914 	struct btrfs_ordered_extent *ordered;
8915 	struct extent_state *cached_state = NULL;
8916 	u64 page_start = page_offset(page);
8917 	u64 page_end = page_start + PAGE_SIZE - 1;
8918 	u64 start;
8919 	u64 end;
8920 	int inode_evicting = inode->i_state & I_FREEING;
8921 
8922 	/*
8923 	 * we have the page locked, so new writeback can't start,
8924 	 * and the dirty bit won't be cleared while we are here.
8925 	 *
8926 	 * Wait for IO on this page so that we can safely clear
8927 	 * the PagePrivate2 bit and do ordered accounting
8928 	 */
8929 	wait_on_page_writeback(page);
8930 
8931 	tree = &BTRFS_I(inode)->io_tree;
8932 	if (offset) {
8933 		btrfs_releasepage(page, GFP_NOFS);
8934 		return;
8935 	}
8936 
8937 	if (!inode_evicting)
8938 		lock_extent_bits(tree, page_start, page_end, &cached_state);
8939 again:
8940 	start = page_start;
8941 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start,
8942 					page_end - start + 1);
8943 	if (ordered) {
8944 		end = min(page_end, ordered->file_offset + ordered->len - 1);
8945 		/*
8946 		 * IO on this page will never be started, so we need
8947 		 * to account for any ordered extents now
8948 		 */
8949 		if (!inode_evicting)
8950 			clear_extent_bit(tree, start, end,
8951 					 EXTENT_DIRTY | EXTENT_DELALLOC |
8952 					 EXTENT_DELALLOC_NEW |
8953 					 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
8954 					 EXTENT_DEFRAG, 1, 0, &cached_state,
8955 					 GFP_NOFS);
8956 		/*
8957 		 * whoever cleared the private bit is responsible
8958 		 * for the finish_ordered_io
8959 		 */
8960 		if (TestClearPagePrivate2(page)) {
8961 			struct btrfs_ordered_inode_tree *tree;
8962 			u64 new_len;
8963 
8964 			tree = &BTRFS_I(inode)->ordered_tree;
8965 
8966 			spin_lock_irq(&tree->lock);
8967 			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
8968 			new_len = start - ordered->file_offset;
8969 			if (new_len < ordered->truncated_len)
8970 				ordered->truncated_len = new_len;
8971 			spin_unlock_irq(&tree->lock);
8972 
8973 			if (btrfs_dec_test_ordered_pending(inode, &ordered,
8974 							   start,
8975 							   end - start + 1, 1))
8976 				btrfs_finish_ordered_io(ordered);
8977 		}
8978 		btrfs_put_ordered_extent(ordered);
8979 		if (!inode_evicting) {
8980 			cached_state = NULL;
8981 			lock_extent_bits(tree, start, end,
8982 					 &cached_state);
8983 		}
8984 
8985 		start = end + 1;
8986 		if (start < page_end)
8987 			goto again;
8988 	}
8989 
8990 	/*
8991 	 * Qgroup reserved space handler
8992 	 * Page here will be either
8993 	 * 1) Already written to disk
8994 	 *    In this case, its reserved space is released from data rsv map
8995 	 *    and will be freed by delayed_ref handler finally.
8996 	 *    So even we call qgroup_free_data(), it won't decrease reserved
8997 	 *    space.
8998 	 * 2) Not written to disk
8999 	 *    This means the reserved space should be freed here. However,
9000 	 *    if a truncate invalidates the page (by clearing PageDirty)
9001 	 *    and the page is accounted for while allocating extent
9002 	 *    in btrfs_check_data_free_space() we let delayed_ref to
9003 	 *    free the entire extent.
9004 	 */
9005 	if (PageDirty(page))
9006 		btrfs_qgroup_free_data(inode, page_start, PAGE_SIZE);
9007 	if (!inode_evicting) {
9008 		clear_extent_bit(tree, page_start, page_end,
9009 				 EXTENT_LOCKED | EXTENT_DIRTY |
9010 				 EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
9011 				 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
9012 				 &cached_state, GFP_NOFS);
9013 
9014 		__btrfs_releasepage(page, GFP_NOFS);
9015 	}
9016 
9017 	ClearPageChecked(page);
9018 	if (PagePrivate(page)) {
9019 		ClearPagePrivate(page);
9020 		set_page_private(page, 0);
9021 		put_page(page);
9022 	}
9023 }
9024 
9025 /*
9026  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
9027  * called from a page fault handler when a page is first dirtied. Hence we must
9028  * be careful to check for EOF conditions here. We set the page up correctly
9029  * for a written page which means we get ENOSPC checking when writing into
9030  * holes and correct delalloc and unwritten extent mapping on filesystems that
9031  * support these features.
9032  *
9033  * We are not allowed to take the i_mutex here so we have to play games to
9034  * protect against truncate races as the page could now be beyond EOF.  Because
9035  * vmtruncate() writes the inode size before removing pages, once we have the
9036  * page lock we can determine safely if the page is beyond EOF. If it is not
9037  * beyond EOF, then the page is guaranteed safe against truncation until we
9038  * unlock the page.
9039  */
9040 int btrfs_page_mkwrite(struct vm_fault *vmf)
9041 {
9042 	struct page *page = vmf->page;
9043 	struct inode *inode = file_inode(vmf->vma->vm_file);
9044 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9045 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9046 	struct btrfs_ordered_extent *ordered;
9047 	struct extent_state *cached_state = NULL;
9048 	char *kaddr;
9049 	unsigned long zero_start;
9050 	loff_t size;
9051 	int ret;
9052 	int reserved = 0;
9053 	u64 reserved_space;
9054 	u64 page_start;
9055 	u64 page_end;
9056 	u64 end;
9057 
9058 	reserved_space = PAGE_SIZE;
9059 
9060 	sb_start_pagefault(inode->i_sb);
9061 	page_start = page_offset(page);
9062 	page_end = page_start + PAGE_SIZE - 1;
9063 	end = page_end;
9064 
9065 	/*
9066 	 * Reserving delalloc space after obtaining the page lock can lead to
9067 	 * deadlock. For example, if a dirty page is locked by this function
9068 	 * and the call to btrfs_delalloc_reserve_space() ends up triggering
9069 	 * dirty page write out, then the btrfs_writepage() function could
9070 	 * end up waiting indefinitely to get a lock on the page currently
9071 	 * being processed by btrfs_page_mkwrite() function.
9072 	 */
9073 	ret = btrfs_delalloc_reserve_space(inode, page_start,
9074 					   reserved_space);
9075 	if (!ret) {
9076 		ret = file_update_time(vmf->vma->vm_file);
9077 		reserved = 1;
9078 	}
9079 	if (ret) {
9080 		if (ret == -ENOMEM)
9081 			ret = VM_FAULT_OOM;
9082 		else /* -ENOSPC, -EIO, etc */
9083 			ret = VM_FAULT_SIGBUS;
9084 		if (reserved)
9085 			goto out;
9086 		goto out_noreserve;
9087 	}
9088 
9089 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
9090 again:
9091 	lock_page(page);
9092 	size = i_size_read(inode);
9093 
9094 	if ((page->mapping != inode->i_mapping) ||
9095 	    (page_start >= size)) {
9096 		/* page got truncated out from underneath us */
9097 		goto out_unlock;
9098 	}
9099 	wait_on_page_writeback(page);
9100 
9101 	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
9102 	set_page_extent_mapped(page);
9103 
9104 	/*
9105 	 * we can't set the delalloc bits if there are pending ordered
9106 	 * extents.  Drop our locks and wait for them to finish
9107 	 */
9108 	ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start,
9109 			PAGE_SIZE);
9110 	if (ordered) {
9111 		unlock_extent_cached(io_tree, page_start, page_end,
9112 				     &cached_state, GFP_NOFS);
9113 		unlock_page(page);
9114 		btrfs_start_ordered_extent(inode, ordered, 1);
9115 		btrfs_put_ordered_extent(ordered);
9116 		goto again;
9117 	}
9118 
9119 	if (page->index == ((size - 1) >> PAGE_SHIFT)) {
9120 		reserved_space = round_up(size - page_start,
9121 					  fs_info->sectorsize);
9122 		if (reserved_space < PAGE_SIZE) {
9123 			end = page_start + reserved_space - 1;
9124 			spin_lock(&BTRFS_I(inode)->lock);
9125 			BTRFS_I(inode)->outstanding_extents++;
9126 			spin_unlock(&BTRFS_I(inode)->lock);
9127 			btrfs_delalloc_release_space(inode, page_start,
9128 						PAGE_SIZE - reserved_space);
9129 		}
9130 	}
9131 
9132 	/*
9133 	 * page_mkwrite gets called when the page is firstly dirtied after it's
9134 	 * faulted in, but write(2) could also dirty a page and set delalloc
9135 	 * bits, thus in this case for space account reason, we still need to
9136 	 * clear any delalloc bits within this page range since we have to
9137 	 * reserve data&meta space before lock_page() (see above comments).
9138 	 */
9139 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end,
9140 			  EXTENT_DIRTY | EXTENT_DELALLOC |
9141 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
9142 			  0, 0, &cached_state, GFP_NOFS);
9143 
9144 	ret = btrfs_set_extent_delalloc(inode, page_start, end,
9145 					&cached_state, 0);
9146 	if (ret) {
9147 		unlock_extent_cached(io_tree, page_start, page_end,
9148 				     &cached_state, GFP_NOFS);
9149 		ret = VM_FAULT_SIGBUS;
9150 		goto out_unlock;
9151 	}
9152 	ret = 0;
9153 
9154 	/* page is wholly or partially inside EOF */
9155 	if (page_start + PAGE_SIZE > size)
9156 		zero_start = size & ~PAGE_MASK;
9157 	else
9158 		zero_start = PAGE_SIZE;
9159 
9160 	if (zero_start != PAGE_SIZE) {
9161 		kaddr = kmap(page);
9162 		memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
9163 		flush_dcache_page(page);
9164 		kunmap(page);
9165 	}
9166 	ClearPageChecked(page);
9167 	set_page_dirty(page);
9168 	SetPageUptodate(page);
9169 
9170 	BTRFS_I(inode)->last_trans = fs_info->generation;
9171 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
9172 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
9173 
9174 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
9175 
9176 out_unlock:
9177 	if (!ret) {
9178 		sb_end_pagefault(inode->i_sb);
9179 		return VM_FAULT_LOCKED;
9180 	}
9181 	unlock_page(page);
9182 out:
9183 	btrfs_delalloc_release_space(inode, page_start, reserved_space);
9184 out_noreserve:
9185 	sb_end_pagefault(inode->i_sb);
9186 	return ret;
9187 }
9188 
9189 static int btrfs_truncate(struct inode *inode)
9190 {
9191 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9192 	struct btrfs_root *root = BTRFS_I(inode)->root;
9193 	struct btrfs_block_rsv *rsv;
9194 	int ret = 0;
9195 	int err = 0;
9196 	struct btrfs_trans_handle *trans;
9197 	u64 mask = fs_info->sectorsize - 1;
9198 	u64 min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
9199 
9200 	ret = btrfs_wait_ordered_range(inode, inode->i_size & (~mask),
9201 				       (u64)-1);
9202 	if (ret)
9203 		return ret;
9204 
9205 	/*
9206 	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
9207 	 * 3 things going on here
9208 	 *
9209 	 * 1) We need to reserve space for our orphan item and the space to
9210 	 * delete our orphan item.  Lord knows we don't want to have a dangling
9211 	 * orphan item because we didn't reserve space to remove it.
9212 	 *
9213 	 * 2) We need to reserve space to update our inode.
9214 	 *
9215 	 * 3) We need to have something to cache all the space that is going to
9216 	 * be free'd up by the truncate operation, but also have some slack
9217 	 * space reserved in case it uses space during the truncate (thank you
9218 	 * very much snapshotting).
9219 	 *
9220 	 * And we need these to all be separate.  The fact is we can use a lot of
9221 	 * space doing the truncate, and we have no earthly idea how much space
9222 	 * we will use, so we need the truncate reservation to be separate so it
9223 	 * doesn't end up using space reserved for updating the inode or
9224 	 * removing the orphan item.  We also need to be able to stop the
9225 	 * transaction and start a new one, which means we need to be able to
9226 	 * update the inode several times, and we have no idea of knowing how
9227 	 * many times that will be, so we can't just reserve 1 item for the
9228 	 * entirety of the operation, so that has to be done separately as well.
9229 	 * Then there is the orphan item, which does indeed need to be held on
9230 	 * to for the whole operation, and we need nobody to touch this reserved
9231 	 * space except the orphan code.
9232 	 *
9233 	 * So that leaves us with
9234 	 *
9235 	 * 1) root->orphan_block_rsv - for the orphan deletion.
9236 	 * 2) rsv - for the truncate reservation, which we will steal from the
9237 	 * transaction reservation.
9238 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
9239 	 * updating the inode.
9240 	 */
9241 	rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
9242 	if (!rsv)
9243 		return -ENOMEM;
9244 	rsv->size = min_size;
9245 	rsv->failfast = 1;
9246 
9247 	/*
9248 	 * 1 for the truncate slack space
9249 	 * 1 for updating the inode.
9250 	 */
9251 	trans = btrfs_start_transaction(root, 2);
9252 	if (IS_ERR(trans)) {
9253 		err = PTR_ERR(trans);
9254 		goto out;
9255 	}
9256 
9257 	/* Migrate the slack space for the truncate to our reserve */
9258 	ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
9259 				      min_size, 0);
9260 	BUG_ON(ret);
9261 
9262 	/*
9263 	 * So if we truncate and then write and fsync we normally would just
9264 	 * write the extents that changed, which is a problem if we need to
9265 	 * first truncate that entire inode.  So set this flag so we write out
9266 	 * all of the extents in the inode to the sync log so we're completely
9267 	 * safe.
9268 	 */
9269 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
9270 	trans->block_rsv = rsv;
9271 
9272 	while (1) {
9273 		ret = btrfs_truncate_inode_items(trans, root, inode,
9274 						 inode->i_size,
9275 						 BTRFS_EXTENT_DATA_KEY);
9276 		if (ret != -ENOSPC && ret != -EAGAIN) {
9277 			err = ret;
9278 			break;
9279 		}
9280 
9281 		trans->block_rsv = &fs_info->trans_block_rsv;
9282 		ret = btrfs_update_inode(trans, root, inode);
9283 		if (ret) {
9284 			err = ret;
9285 			break;
9286 		}
9287 
9288 		btrfs_end_transaction(trans);
9289 		btrfs_btree_balance_dirty(fs_info);
9290 
9291 		trans = btrfs_start_transaction(root, 2);
9292 		if (IS_ERR(trans)) {
9293 			ret = err = PTR_ERR(trans);
9294 			trans = NULL;
9295 			break;
9296 		}
9297 
9298 		btrfs_block_rsv_release(fs_info, rsv, -1);
9299 		ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
9300 					      rsv, min_size, 0);
9301 		BUG_ON(ret);	/* shouldn't happen */
9302 		trans->block_rsv = rsv;
9303 	}
9304 
9305 	if (ret == 0 && inode->i_nlink > 0) {
9306 		trans->block_rsv = root->orphan_block_rsv;
9307 		ret = btrfs_orphan_del(trans, BTRFS_I(inode));
9308 		if (ret)
9309 			err = ret;
9310 	}
9311 
9312 	if (trans) {
9313 		trans->block_rsv = &fs_info->trans_block_rsv;
9314 		ret = btrfs_update_inode(trans, root, inode);
9315 		if (ret && !err)
9316 			err = ret;
9317 
9318 		ret = btrfs_end_transaction(trans);
9319 		btrfs_btree_balance_dirty(fs_info);
9320 	}
9321 out:
9322 	btrfs_free_block_rsv(fs_info, rsv);
9323 
9324 	if (ret && !err)
9325 		err = ret;
9326 
9327 	return err;
9328 }
9329 
9330 /*
9331  * create a new subvolume directory/inode (helper for the ioctl).
9332  */
9333 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
9334 			     struct btrfs_root *new_root,
9335 			     struct btrfs_root *parent_root,
9336 			     u64 new_dirid)
9337 {
9338 	struct inode *inode;
9339 	int err;
9340 	u64 index = 0;
9341 
9342 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
9343 				new_dirid, new_dirid,
9344 				S_IFDIR | (~current_umask() & S_IRWXUGO),
9345 				&index);
9346 	if (IS_ERR(inode))
9347 		return PTR_ERR(inode);
9348 	inode->i_op = &btrfs_dir_inode_operations;
9349 	inode->i_fop = &btrfs_dir_file_operations;
9350 
9351 	set_nlink(inode, 1);
9352 	btrfs_i_size_write(BTRFS_I(inode), 0);
9353 	unlock_new_inode(inode);
9354 
9355 	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
9356 	if (err)
9357 		btrfs_err(new_root->fs_info,
9358 			  "error inheriting subvolume %llu properties: %d",
9359 			  new_root->root_key.objectid, err);
9360 
9361 	err = btrfs_update_inode(trans, new_root, inode);
9362 
9363 	iput(inode);
9364 	return err;
9365 }
9366 
9367 struct inode *btrfs_alloc_inode(struct super_block *sb)
9368 {
9369 	struct btrfs_inode *ei;
9370 	struct inode *inode;
9371 
9372 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
9373 	if (!ei)
9374 		return NULL;
9375 
9376 	ei->root = NULL;
9377 	ei->generation = 0;
9378 	ei->last_trans = 0;
9379 	ei->last_sub_trans = 0;
9380 	ei->logged_trans = 0;
9381 	ei->delalloc_bytes = 0;
9382 	ei->new_delalloc_bytes = 0;
9383 	ei->defrag_bytes = 0;
9384 	ei->disk_i_size = 0;
9385 	ei->flags = 0;
9386 	ei->csum_bytes = 0;
9387 	ei->index_cnt = (u64)-1;
9388 	ei->dir_index = 0;
9389 	ei->last_unlink_trans = 0;
9390 	ei->last_log_commit = 0;
9391 	ei->delayed_iput_count = 0;
9392 
9393 	spin_lock_init(&ei->lock);
9394 	ei->outstanding_extents = 0;
9395 	ei->reserved_extents = 0;
9396 
9397 	ei->runtime_flags = 0;
9398 	ei->force_compress = BTRFS_COMPRESS_NONE;
9399 
9400 	ei->delayed_node = NULL;
9401 
9402 	ei->i_otime.tv_sec = 0;
9403 	ei->i_otime.tv_nsec = 0;
9404 
9405 	inode = &ei->vfs_inode;
9406 	extent_map_tree_init(&ei->extent_tree);
9407 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
9408 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
9409 	ei->io_tree.track_uptodate = 1;
9410 	ei->io_failure_tree.track_uptodate = 1;
9411 	atomic_set(&ei->sync_writers, 0);
9412 	mutex_init(&ei->log_mutex);
9413 	mutex_init(&ei->delalloc_mutex);
9414 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
9415 	INIT_LIST_HEAD(&ei->delalloc_inodes);
9416 	INIT_LIST_HEAD(&ei->delayed_iput);
9417 	RB_CLEAR_NODE(&ei->rb_node);
9418 	init_rwsem(&ei->dio_sem);
9419 
9420 	return inode;
9421 }
9422 
9423 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
9424 void btrfs_test_destroy_inode(struct inode *inode)
9425 {
9426 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9427 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9428 }
9429 #endif
9430 
9431 static void btrfs_i_callback(struct rcu_head *head)
9432 {
9433 	struct inode *inode = container_of(head, struct inode, i_rcu);
9434 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
9435 }
9436 
9437 void btrfs_destroy_inode(struct inode *inode)
9438 {
9439 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
9440 	struct btrfs_ordered_extent *ordered;
9441 	struct btrfs_root *root = BTRFS_I(inode)->root;
9442 
9443 	WARN_ON(!hlist_empty(&inode->i_dentry));
9444 	WARN_ON(inode->i_data.nrpages);
9445 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
9446 	WARN_ON(BTRFS_I(inode)->reserved_extents);
9447 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
9448 	WARN_ON(BTRFS_I(inode)->new_delalloc_bytes);
9449 	WARN_ON(BTRFS_I(inode)->csum_bytes);
9450 	WARN_ON(BTRFS_I(inode)->defrag_bytes);
9451 
9452 	/*
9453 	 * This can happen where we create an inode, but somebody else also
9454 	 * created the same inode and we need to destroy the one we already
9455 	 * created.
9456 	 */
9457 	if (!root)
9458 		goto free;
9459 
9460 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
9461 		     &BTRFS_I(inode)->runtime_flags)) {
9462 		btrfs_info(fs_info, "inode %llu still on the orphan list",
9463 			   btrfs_ino(BTRFS_I(inode)));
9464 		atomic_dec(&root->orphan_inodes);
9465 	}
9466 
9467 	while (1) {
9468 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
9469 		if (!ordered)
9470 			break;
9471 		else {
9472 			btrfs_err(fs_info,
9473 				  "found ordered extent %llu %llu on inode cleanup",
9474 				  ordered->file_offset, ordered->len);
9475 			btrfs_remove_ordered_extent(inode, ordered);
9476 			btrfs_put_ordered_extent(ordered);
9477 			btrfs_put_ordered_extent(ordered);
9478 		}
9479 	}
9480 	btrfs_qgroup_check_reserved_leak(inode);
9481 	inode_tree_del(inode);
9482 	btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
9483 free:
9484 	call_rcu(&inode->i_rcu, btrfs_i_callback);
9485 }
9486 
9487 int btrfs_drop_inode(struct inode *inode)
9488 {
9489 	struct btrfs_root *root = BTRFS_I(inode)->root;
9490 
9491 	if (root == NULL)
9492 		return 1;
9493 
9494 	/* the snap/subvol tree is on deleting */
9495 	if (btrfs_root_refs(&root->root_item) == 0)
9496 		return 1;
9497 	else
9498 		return generic_drop_inode(inode);
9499 }
9500 
9501 static void init_once(void *foo)
9502 {
9503 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
9504 
9505 	inode_init_once(&ei->vfs_inode);
9506 }
9507 
9508 void btrfs_destroy_cachep(void)
9509 {
9510 	/*
9511 	 * Make sure all delayed rcu free inodes are flushed before we
9512 	 * destroy cache.
9513 	 */
9514 	rcu_barrier();
9515 	kmem_cache_destroy(btrfs_inode_cachep);
9516 	kmem_cache_destroy(btrfs_trans_handle_cachep);
9517 	kmem_cache_destroy(btrfs_transaction_cachep);
9518 	kmem_cache_destroy(btrfs_path_cachep);
9519 	kmem_cache_destroy(btrfs_free_space_cachep);
9520 }
9521 
9522 int btrfs_init_cachep(void)
9523 {
9524 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
9525 			sizeof(struct btrfs_inode), 0,
9526 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
9527 			init_once);
9528 	if (!btrfs_inode_cachep)
9529 		goto fail;
9530 
9531 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
9532 			sizeof(struct btrfs_trans_handle), 0,
9533 			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9534 	if (!btrfs_trans_handle_cachep)
9535 		goto fail;
9536 
9537 	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
9538 			sizeof(struct btrfs_transaction), 0,
9539 			SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
9540 	if (!btrfs_transaction_cachep)
9541 		goto fail;
9542 
9543 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
9544 			sizeof(struct btrfs_path), 0,
9545 			SLAB_MEM_SPREAD, NULL);
9546 	if (!btrfs_path_cachep)
9547 		goto fail;
9548 
9549 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
9550 			sizeof(struct btrfs_free_space), 0,
9551 			SLAB_MEM_SPREAD, NULL);
9552 	if (!btrfs_free_space_cachep)
9553 		goto fail;
9554 
9555 	return 0;
9556 fail:
9557 	btrfs_destroy_cachep();
9558 	return -ENOMEM;
9559 }
9560 
9561 static int btrfs_getattr(const struct path *path, struct kstat *stat,
9562 			 u32 request_mask, unsigned int flags)
9563 {
9564 	u64 delalloc_bytes;
9565 	struct inode *inode = d_inode(path->dentry);
9566 	u32 blocksize = inode->i_sb->s_blocksize;
9567 
9568 	generic_fillattr(inode, stat);
9569 	stat->dev = BTRFS_I(inode)->root->anon_dev;
9570 
9571 	spin_lock(&BTRFS_I(inode)->lock);
9572 	delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
9573 	spin_unlock(&BTRFS_I(inode)->lock);
9574 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
9575 			ALIGN(delalloc_bytes, blocksize)) >> 9;
9576 	return 0;
9577 }
9578 
9579 static int btrfs_rename_exchange(struct inode *old_dir,
9580 			      struct dentry *old_dentry,
9581 			      struct inode *new_dir,
9582 			      struct dentry *new_dentry)
9583 {
9584 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9585 	struct btrfs_trans_handle *trans;
9586 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9587 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9588 	struct inode *new_inode = new_dentry->d_inode;
9589 	struct inode *old_inode = old_dentry->d_inode;
9590 	struct timespec ctime = current_time(old_inode);
9591 	struct dentry *parent;
9592 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9593 	u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
9594 	u64 old_idx = 0;
9595 	u64 new_idx = 0;
9596 	u64 root_objectid;
9597 	int ret;
9598 	bool root_log_pinned = false;
9599 	bool dest_log_pinned = false;
9600 
9601 	/* we only allow rename subvolume link between subvolumes */
9602 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9603 		return -EXDEV;
9604 
9605 	/* close the race window with snapshot create/destroy ioctl */
9606 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9607 		down_read(&fs_info->subvol_sem);
9608 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9609 		down_read(&fs_info->subvol_sem);
9610 
9611 	/*
9612 	 * We want to reserve the absolute worst case amount of items.  So if
9613 	 * both inodes are subvols and we need to unlink them then that would
9614 	 * require 4 item modifications, but if they are both normal inodes it
9615 	 * would require 5 item modifications, so we'll assume their normal
9616 	 * inodes.  So 5 * 2 is 10, plus 2 for the new links, so 12 total items
9617 	 * should cover the worst case number of items we'll modify.
9618 	 */
9619 	trans = btrfs_start_transaction(root, 12);
9620 	if (IS_ERR(trans)) {
9621 		ret = PTR_ERR(trans);
9622 		goto out_notrans;
9623 	}
9624 
9625 	/*
9626 	 * We need to find a free sequence number both in the source and
9627 	 * in the destination directory for the exchange.
9628 	 */
9629 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
9630 	if (ret)
9631 		goto out_fail;
9632 	ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
9633 	if (ret)
9634 		goto out_fail;
9635 
9636 	BTRFS_I(old_inode)->dir_index = 0ULL;
9637 	BTRFS_I(new_inode)->dir_index = 0ULL;
9638 
9639 	/* Reference for the source. */
9640 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9641 		/* force full log commit if subvolume involved. */
9642 		btrfs_set_log_full_commit(fs_info, trans);
9643 	} else {
9644 		btrfs_pin_log_trans(root);
9645 		root_log_pinned = true;
9646 		ret = btrfs_insert_inode_ref(trans, dest,
9647 					     new_dentry->d_name.name,
9648 					     new_dentry->d_name.len,
9649 					     old_ino,
9650 					     btrfs_ino(BTRFS_I(new_dir)),
9651 					     old_idx);
9652 		if (ret)
9653 			goto out_fail;
9654 	}
9655 
9656 	/* And now for the dest. */
9657 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9658 		/* force full log commit if subvolume involved. */
9659 		btrfs_set_log_full_commit(fs_info, trans);
9660 	} else {
9661 		btrfs_pin_log_trans(dest);
9662 		dest_log_pinned = true;
9663 		ret = btrfs_insert_inode_ref(trans, root,
9664 					     old_dentry->d_name.name,
9665 					     old_dentry->d_name.len,
9666 					     new_ino,
9667 					     btrfs_ino(BTRFS_I(old_dir)),
9668 					     new_idx);
9669 		if (ret)
9670 			goto out_fail;
9671 	}
9672 
9673 	/* Update inode version and ctime/mtime. */
9674 	inode_inc_iversion(old_dir);
9675 	inode_inc_iversion(new_dir);
9676 	inode_inc_iversion(old_inode);
9677 	inode_inc_iversion(new_inode);
9678 	old_dir->i_ctime = old_dir->i_mtime = ctime;
9679 	new_dir->i_ctime = new_dir->i_mtime = ctime;
9680 	old_inode->i_ctime = ctime;
9681 	new_inode->i_ctime = ctime;
9682 
9683 	if (old_dentry->d_parent != new_dentry->d_parent) {
9684 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9685 				BTRFS_I(old_inode), 1);
9686 		btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
9687 				BTRFS_I(new_inode), 1);
9688 	}
9689 
9690 	/* src is a subvolume */
9691 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
9692 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9693 		ret = btrfs_unlink_subvol(trans, root, old_dir,
9694 					  root_objectid,
9695 					  old_dentry->d_name.name,
9696 					  old_dentry->d_name.len);
9697 	} else { /* src is an inode */
9698 		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9699 					   BTRFS_I(old_dentry->d_inode),
9700 					   old_dentry->d_name.name,
9701 					   old_dentry->d_name.len);
9702 		if (!ret)
9703 			ret = btrfs_update_inode(trans, root, old_inode);
9704 	}
9705 	if (ret) {
9706 		btrfs_abort_transaction(trans, ret);
9707 		goto out_fail;
9708 	}
9709 
9710 	/* dest is a subvolume */
9711 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
9712 		root_objectid = BTRFS_I(new_inode)->root->root_key.objectid;
9713 		ret = btrfs_unlink_subvol(trans, dest, new_dir,
9714 					  root_objectid,
9715 					  new_dentry->d_name.name,
9716 					  new_dentry->d_name.len);
9717 	} else { /* dest is an inode */
9718 		ret = __btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
9719 					   BTRFS_I(new_dentry->d_inode),
9720 					   new_dentry->d_name.name,
9721 					   new_dentry->d_name.len);
9722 		if (!ret)
9723 			ret = btrfs_update_inode(trans, dest, new_inode);
9724 	}
9725 	if (ret) {
9726 		btrfs_abort_transaction(trans, ret);
9727 		goto out_fail;
9728 	}
9729 
9730 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
9731 			     new_dentry->d_name.name,
9732 			     new_dentry->d_name.len, 0, old_idx);
9733 	if (ret) {
9734 		btrfs_abort_transaction(trans, ret);
9735 		goto out_fail;
9736 	}
9737 
9738 	ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
9739 			     old_dentry->d_name.name,
9740 			     old_dentry->d_name.len, 0, new_idx);
9741 	if (ret) {
9742 		btrfs_abort_transaction(trans, ret);
9743 		goto out_fail;
9744 	}
9745 
9746 	if (old_inode->i_nlink == 1)
9747 		BTRFS_I(old_inode)->dir_index = old_idx;
9748 	if (new_inode->i_nlink == 1)
9749 		BTRFS_I(new_inode)->dir_index = new_idx;
9750 
9751 	if (root_log_pinned) {
9752 		parent = new_dentry->d_parent;
9753 		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
9754 				parent);
9755 		btrfs_end_log_trans(root);
9756 		root_log_pinned = false;
9757 	}
9758 	if (dest_log_pinned) {
9759 		parent = old_dentry->d_parent;
9760 		btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
9761 				parent);
9762 		btrfs_end_log_trans(dest);
9763 		dest_log_pinned = false;
9764 	}
9765 out_fail:
9766 	/*
9767 	 * If we have pinned a log and an error happened, we unpin tasks
9768 	 * trying to sync the log and force them to fallback to a transaction
9769 	 * commit if the log currently contains any of the inodes involved in
9770 	 * this rename operation (to ensure we do not persist a log with an
9771 	 * inconsistent state for any of these inodes or leading to any
9772 	 * inconsistencies when replayed). If the transaction was aborted, the
9773 	 * abortion reason is propagated to userspace when attempting to commit
9774 	 * the transaction. If the log does not contain any of these inodes, we
9775 	 * allow the tasks to sync it.
9776 	 */
9777 	if (ret && (root_log_pinned || dest_log_pinned)) {
9778 		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
9779 		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
9780 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
9781 		    (new_inode &&
9782 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
9783 			btrfs_set_log_full_commit(fs_info, trans);
9784 
9785 		if (root_log_pinned) {
9786 			btrfs_end_log_trans(root);
9787 			root_log_pinned = false;
9788 		}
9789 		if (dest_log_pinned) {
9790 			btrfs_end_log_trans(dest);
9791 			dest_log_pinned = false;
9792 		}
9793 	}
9794 	ret = btrfs_end_transaction(trans);
9795 out_notrans:
9796 	if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
9797 		up_read(&fs_info->subvol_sem);
9798 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9799 		up_read(&fs_info->subvol_sem);
9800 
9801 	return ret;
9802 }
9803 
9804 static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans,
9805 				     struct btrfs_root *root,
9806 				     struct inode *dir,
9807 				     struct dentry *dentry)
9808 {
9809 	int ret;
9810 	struct inode *inode;
9811 	u64 objectid;
9812 	u64 index;
9813 
9814 	ret = btrfs_find_free_ino(root, &objectid);
9815 	if (ret)
9816 		return ret;
9817 
9818 	inode = btrfs_new_inode(trans, root, dir,
9819 				dentry->d_name.name,
9820 				dentry->d_name.len,
9821 				btrfs_ino(BTRFS_I(dir)),
9822 				objectid,
9823 				S_IFCHR | WHITEOUT_MODE,
9824 				&index);
9825 
9826 	if (IS_ERR(inode)) {
9827 		ret = PTR_ERR(inode);
9828 		return ret;
9829 	}
9830 
9831 	inode->i_op = &btrfs_special_inode_operations;
9832 	init_special_inode(inode, inode->i_mode,
9833 		WHITEOUT_DEV);
9834 
9835 	ret = btrfs_init_inode_security(trans, inode, dir,
9836 				&dentry->d_name);
9837 	if (ret)
9838 		goto out;
9839 
9840 	ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
9841 				BTRFS_I(inode), 0, index);
9842 	if (ret)
9843 		goto out;
9844 
9845 	ret = btrfs_update_inode(trans, root, inode);
9846 out:
9847 	unlock_new_inode(inode);
9848 	if (ret)
9849 		inode_dec_link_count(inode);
9850 	iput(inode);
9851 
9852 	return ret;
9853 }
9854 
9855 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
9856 			   struct inode *new_dir, struct dentry *new_dentry,
9857 			   unsigned int flags)
9858 {
9859 	struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb);
9860 	struct btrfs_trans_handle *trans;
9861 	unsigned int trans_num_items;
9862 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
9863 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
9864 	struct inode *new_inode = d_inode(new_dentry);
9865 	struct inode *old_inode = d_inode(old_dentry);
9866 	u64 index = 0;
9867 	u64 root_objectid;
9868 	int ret;
9869 	u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
9870 	bool log_pinned = false;
9871 
9872 	if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
9873 		return -EPERM;
9874 
9875 	/* we only allow rename subvolume link between subvolumes */
9876 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
9877 		return -EXDEV;
9878 
9879 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
9880 	    (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
9881 		return -ENOTEMPTY;
9882 
9883 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
9884 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
9885 		return -ENOTEMPTY;
9886 
9887 
9888 	/* check for collisions, even if the  name isn't there */
9889 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
9890 			     new_dentry->d_name.name,
9891 			     new_dentry->d_name.len);
9892 
9893 	if (ret) {
9894 		if (ret == -EEXIST) {
9895 			/* we shouldn't get
9896 			 * eexist without a new_inode */
9897 			if (WARN_ON(!new_inode)) {
9898 				return ret;
9899 			}
9900 		} else {
9901 			/* maybe -EOVERFLOW */
9902 			return ret;
9903 		}
9904 	}
9905 	ret = 0;
9906 
9907 	/*
9908 	 * we're using rename to replace one file with another.  Start IO on it
9909 	 * now so  we don't add too much work to the end of the transaction
9910 	 */
9911 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
9912 		filemap_flush(old_inode->i_mapping);
9913 
9914 	/* close the racy window with snapshot create/destroy ioctl */
9915 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
9916 		down_read(&fs_info->subvol_sem);
9917 	/*
9918 	 * We want to reserve the absolute worst case amount of items.  So if
9919 	 * both inodes are subvols and we need to unlink them then that would
9920 	 * require 4 item modifications, but if they are both normal inodes it
9921 	 * would require 5 item modifications, so we'll assume they are normal
9922 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
9923 	 * should cover the worst case number of items we'll modify.
9924 	 * If our rename has the whiteout flag, we need more 5 units for the
9925 	 * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item
9926 	 * when selinux is enabled).
9927 	 */
9928 	trans_num_items = 11;
9929 	if (flags & RENAME_WHITEOUT)
9930 		trans_num_items += 5;
9931 	trans = btrfs_start_transaction(root, trans_num_items);
9932 	if (IS_ERR(trans)) {
9933 		ret = PTR_ERR(trans);
9934 		goto out_notrans;
9935 	}
9936 
9937 	if (dest != root)
9938 		btrfs_record_root_in_trans(trans, dest);
9939 
9940 	ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
9941 	if (ret)
9942 		goto out_fail;
9943 
9944 	BTRFS_I(old_inode)->dir_index = 0ULL;
9945 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9946 		/* force full log commit if subvolume involved. */
9947 		btrfs_set_log_full_commit(fs_info, trans);
9948 	} else {
9949 		btrfs_pin_log_trans(root);
9950 		log_pinned = true;
9951 		ret = btrfs_insert_inode_ref(trans, dest,
9952 					     new_dentry->d_name.name,
9953 					     new_dentry->d_name.len,
9954 					     old_ino,
9955 					     btrfs_ino(BTRFS_I(new_dir)), index);
9956 		if (ret)
9957 			goto out_fail;
9958 	}
9959 
9960 	inode_inc_iversion(old_dir);
9961 	inode_inc_iversion(new_dir);
9962 	inode_inc_iversion(old_inode);
9963 	old_dir->i_ctime = old_dir->i_mtime =
9964 	new_dir->i_ctime = new_dir->i_mtime =
9965 	old_inode->i_ctime = current_time(old_dir);
9966 
9967 	if (old_dentry->d_parent != new_dentry->d_parent)
9968 		btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
9969 				BTRFS_I(old_inode), 1);
9970 
9971 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
9972 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
9973 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
9974 					old_dentry->d_name.name,
9975 					old_dentry->d_name.len);
9976 	} else {
9977 		ret = __btrfs_unlink_inode(trans, root, BTRFS_I(old_dir),
9978 					BTRFS_I(d_inode(old_dentry)),
9979 					old_dentry->d_name.name,
9980 					old_dentry->d_name.len);
9981 		if (!ret)
9982 			ret = btrfs_update_inode(trans, root, old_inode);
9983 	}
9984 	if (ret) {
9985 		btrfs_abort_transaction(trans, ret);
9986 		goto out_fail;
9987 	}
9988 
9989 	if (new_inode) {
9990 		inode_inc_iversion(new_inode);
9991 		new_inode->i_ctime = current_time(new_inode);
9992 		if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
9993 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
9994 			root_objectid = BTRFS_I(new_inode)->location.objectid;
9995 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
9996 						root_objectid,
9997 						new_dentry->d_name.name,
9998 						new_dentry->d_name.len);
9999 			BUG_ON(new_inode->i_nlink == 0);
10000 		} else {
10001 			ret = btrfs_unlink_inode(trans, dest, BTRFS_I(new_dir),
10002 						 BTRFS_I(d_inode(new_dentry)),
10003 						 new_dentry->d_name.name,
10004 						 new_dentry->d_name.len);
10005 		}
10006 		if (!ret && new_inode->i_nlink == 0)
10007 			ret = btrfs_orphan_add(trans,
10008 					BTRFS_I(d_inode(new_dentry)));
10009 		if (ret) {
10010 			btrfs_abort_transaction(trans, ret);
10011 			goto out_fail;
10012 		}
10013 	}
10014 
10015 	ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
10016 			     new_dentry->d_name.name,
10017 			     new_dentry->d_name.len, 0, index);
10018 	if (ret) {
10019 		btrfs_abort_transaction(trans, ret);
10020 		goto out_fail;
10021 	}
10022 
10023 	if (old_inode->i_nlink == 1)
10024 		BTRFS_I(old_inode)->dir_index = index;
10025 
10026 	if (log_pinned) {
10027 		struct dentry *parent = new_dentry->d_parent;
10028 
10029 		btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
10030 				parent);
10031 		btrfs_end_log_trans(root);
10032 		log_pinned = false;
10033 	}
10034 
10035 	if (flags & RENAME_WHITEOUT) {
10036 		ret = btrfs_whiteout_for_rename(trans, root, old_dir,
10037 						old_dentry);
10038 
10039 		if (ret) {
10040 			btrfs_abort_transaction(trans, ret);
10041 			goto out_fail;
10042 		}
10043 	}
10044 out_fail:
10045 	/*
10046 	 * If we have pinned the log and an error happened, we unpin tasks
10047 	 * trying to sync the log and force them to fallback to a transaction
10048 	 * commit if the log currently contains any of the inodes involved in
10049 	 * this rename operation (to ensure we do not persist a log with an
10050 	 * inconsistent state for any of these inodes or leading to any
10051 	 * inconsistencies when replayed). If the transaction was aborted, the
10052 	 * abortion reason is propagated to userspace when attempting to commit
10053 	 * the transaction. If the log does not contain any of these inodes, we
10054 	 * allow the tasks to sync it.
10055 	 */
10056 	if (ret && log_pinned) {
10057 		if (btrfs_inode_in_log(BTRFS_I(old_dir), fs_info->generation) ||
10058 		    btrfs_inode_in_log(BTRFS_I(new_dir), fs_info->generation) ||
10059 		    btrfs_inode_in_log(BTRFS_I(old_inode), fs_info->generation) ||
10060 		    (new_inode &&
10061 		     btrfs_inode_in_log(BTRFS_I(new_inode), fs_info->generation)))
10062 			btrfs_set_log_full_commit(fs_info, trans);
10063 
10064 		btrfs_end_log_trans(root);
10065 		log_pinned = false;
10066 	}
10067 	btrfs_end_transaction(trans);
10068 out_notrans:
10069 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
10070 		up_read(&fs_info->subvol_sem);
10071 
10072 	return ret;
10073 }
10074 
10075 static int btrfs_rename2(struct inode *old_dir, struct dentry *old_dentry,
10076 			 struct inode *new_dir, struct dentry *new_dentry,
10077 			 unsigned int flags)
10078 {
10079 	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
10080 		return -EINVAL;
10081 
10082 	if (flags & RENAME_EXCHANGE)
10083 		return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
10084 					  new_dentry);
10085 
10086 	return btrfs_rename(old_dir, old_dentry, new_dir, new_dentry, flags);
10087 }
10088 
10089 static void btrfs_run_delalloc_work(struct btrfs_work *work)
10090 {
10091 	struct btrfs_delalloc_work *delalloc_work;
10092 	struct inode *inode;
10093 
10094 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
10095 				     work);
10096 	inode = delalloc_work->inode;
10097 	filemap_flush(inode->i_mapping);
10098 	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
10099 				&BTRFS_I(inode)->runtime_flags))
10100 		filemap_flush(inode->i_mapping);
10101 
10102 	if (delalloc_work->delay_iput)
10103 		btrfs_add_delayed_iput(inode);
10104 	else
10105 		iput(inode);
10106 	complete(&delalloc_work->completion);
10107 }
10108 
10109 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
10110 						    int delay_iput)
10111 {
10112 	struct btrfs_delalloc_work *work;
10113 
10114 	work = kmalloc(sizeof(*work), GFP_NOFS);
10115 	if (!work)
10116 		return NULL;
10117 
10118 	init_completion(&work->completion);
10119 	INIT_LIST_HEAD(&work->list);
10120 	work->inode = inode;
10121 	work->delay_iput = delay_iput;
10122 	WARN_ON_ONCE(!inode);
10123 	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
10124 			btrfs_run_delalloc_work, NULL, NULL);
10125 
10126 	return work;
10127 }
10128 
10129 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
10130 {
10131 	wait_for_completion(&work->completion);
10132 	kfree(work);
10133 }
10134 
10135 /*
10136  * some fairly slow code that needs optimization. This walks the list
10137  * of all the inodes with pending delalloc and forces them to disk.
10138  */
10139 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
10140 				   int nr)
10141 {
10142 	struct btrfs_inode *binode;
10143 	struct inode *inode;
10144 	struct btrfs_delalloc_work *work, *next;
10145 	struct list_head works;
10146 	struct list_head splice;
10147 	int ret = 0;
10148 
10149 	INIT_LIST_HEAD(&works);
10150 	INIT_LIST_HEAD(&splice);
10151 
10152 	mutex_lock(&root->delalloc_mutex);
10153 	spin_lock(&root->delalloc_lock);
10154 	list_splice_init(&root->delalloc_inodes, &splice);
10155 	while (!list_empty(&splice)) {
10156 		binode = list_entry(splice.next, struct btrfs_inode,
10157 				    delalloc_inodes);
10158 
10159 		list_move_tail(&binode->delalloc_inodes,
10160 			       &root->delalloc_inodes);
10161 		inode = igrab(&binode->vfs_inode);
10162 		if (!inode) {
10163 			cond_resched_lock(&root->delalloc_lock);
10164 			continue;
10165 		}
10166 		spin_unlock(&root->delalloc_lock);
10167 
10168 		work = btrfs_alloc_delalloc_work(inode, delay_iput);
10169 		if (!work) {
10170 			if (delay_iput)
10171 				btrfs_add_delayed_iput(inode);
10172 			else
10173 				iput(inode);
10174 			ret = -ENOMEM;
10175 			goto out;
10176 		}
10177 		list_add_tail(&work->list, &works);
10178 		btrfs_queue_work(root->fs_info->flush_workers,
10179 				 &work->work);
10180 		ret++;
10181 		if (nr != -1 && ret >= nr)
10182 			goto out;
10183 		cond_resched();
10184 		spin_lock(&root->delalloc_lock);
10185 	}
10186 	spin_unlock(&root->delalloc_lock);
10187 
10188 out:
10189 	list_for_each_entry_safe(work, next, &works, list) {
10190 		list_del_init(&work->list);
10191 		btrfs_wait_and_free_delalloc_work(work);
10192 	}
10193 
10194 	if (!list_empty_careful(&splice)) {
10195 		spin_lock(&root->delalloc_lock);
10196 		list_splice_tail(&splice, &root->delalloc_inodes);
10197 		spin_unlock(&root->delalloc_lock);
10198 	}
10199 	mutex_unlock(&root->delalloc_mutex);
10200 	return ret;
10201 }
10202 
10203 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
10204 {
10205 	struct btrfs_fs_info *fs_info = root->fs_info;
10206 	int ret;
10207 
10208 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10209 		return -EROFS;
10210 
10211 	ret = __start_delalloc_inodes(root, delay_iput, -1);
10212 	if (ret > 0)
10213 		ret = 0;
10214 	/*
10215 	 * the filemap_flush will queue IO into the worker threads, but
10216 	 * we have to make sure the IO is actually started and that
10217 	 * ordered extents get created before we return
10218 	 */
10219 	atomic_inc(&fs_info->async_submit_draining);
10220 	while (atomic_read(&fs_info->nr_async_submits) ||
10221 	       atomic_read(&fs_info->async_delalloc_pages)) {
10222 		wait_event(fs_info->async_submit_wait,
10223 			   (atomic_read(&fs_info->nr_async_submits) == 0 &&
10224 			    atomic_read(&fs_info->async_delalloc_pages) == 0));
10225 	}
10226 	atomic_dec(&fs_info->async_submit_draining);
10227 	return ret;
10228 }
10229 
10230 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int delay_iput,
10231 			       int nr)
10232 {
10233 	struct btrfs_root *root;
10234 	struct list_head splice;
10235 	int ret;
10236 
10237 	if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
10238 		return -EROFS;
10239 
10240 	INIT_LIST_HEAD(&splice);
10241 
10242 	mutex_lock(&fs_info->delalloc_root_mutex);
10243 	spin_lock(&fs_info->delalloc_root_lock);
10244 	list_splice_init(&fs_info->delalloc_roots, &splice);
10245 	while (!list_empty(&splice) && nr) {
10246 		root = list_first_entry(&splice, struct btrfs_root,
10247 					delalloc_root);
10248 		root = btrfs_grab_fs_root(root);
10249 		BUG_ON(!root);
10250 		list_move_tail(&root->delalloc_root,
10251 			       &fs_info->delalloc_roots);
10252 		spin_unlock(&fs_info->delalloc_root_lock);
10253 
10254 		ret = __start_delalloc_inodes(root, delay_iput, nr);
10255 		btrfs_put_fs_root(root);
10256 		if (ret < 0)
10257 			goto out;
10258 
10259 		if (nr != -1) {
10260 			nr -= ret;
10261 			WARN_ON(nr < 0);
10262 		}
10263 		spin_lock(&fs_info->delalloc_root_lock);
10264 	}
10265 	spin_unlock(&fs_info->delalloc_root_lock);
10266 
10267 	ret = 0;
10268 	atomic_inc(&fs_info->async_submit_draining);
10269 	while (atomic_read(&fs_info->nr_async_submits) ||
10270 	      atomic_read(&fs_info->async_delalloc_pages)) {
10271 		wait_event(fs_info->async_submit_wait,
10272 		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
10273 		    atomic_read(&fs_info->async_delalloc_pages) == 0));
10274 	}
10275 	atomic_dec(&fs_info->async_submit_draining);
10276 out:
10277 	if (!list_empty_careful(&splice)) {
10278 		spin_lock(&fs_info->delalloc_root_lock);
10279 		list_splice_tail(&splice, &fs_info->delalloc_roots);
10280 		spin_unlock(&fs_info->delalloc_root_lock);
10281 	}
10282 	mutex_unlock(&fs_info->delalloc_root_mutex);
10283 	return ret;
10284 }
10285 
10286 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
10287 			 const char *symname)
10288 {
10289 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10290 	struct btrfs_trans_handle *trans;
10291 	struct btrfs_root *root = BTRFS_I(dir)->root;
10292 	struct btrfs_path *path;
10293 	struct btrfs_key key;
10294 	struct inode *inode = NULL;
10295 	int err;
10296 	int drop_inode = 0;
10297 	u64 objectid;
10298 	u64 index = 0;
10299 	int name_len;
10300 	int datasize;
10301 	unsigned long ptr;
10302 	struct btrfs_file_extent_item *ei;
10303 	struct extent_buffer *leaf;
10304 
10305 	name_len = strlen(symname);
10306 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
10307 		return -ENAMETOOLONG;
10308 
10309 	/*
10310 	 * 2 items for inode item and ref
10311 	 * 2 items for dir items
10312 	 * 1 item for updating parent inode item
10313 	 * 1 item for the inline extent item
10314 	 * 1 item for xattr if selinux is on
10315 	 */
10316 	trans = btrfs_start_transaction(root, 7);
10317 	if (IS_ERR(trans))
10318 		return PTR_ERR(trans);
10319 
10320 	err = btrfs_find_free_ino(root, &objectid);
10321 	if (err)
10322 		goto out_unlock;
10323 
10324 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
10325 				dentry->d_name.len, btrfs_ino(BTRFS_I(dir)),
10326 				objectid, S_IFLNK|S_IRWXUGO, &index);
10327 	if (IS_ERR(inode)) {
10328 		err = PTR_ERR(inode);
10329 		goto out_unlock;
10330 	}
10331 
10332 	/*
10333 	* If the active LSM wants to access the inode during
10334 	* d_instantiate it needs these. Smack checks to see
10335 	* if the filesystem supports xattrs by looking at the
10336 	* ops vector.
10337 	*/
10338 	inode->i_fop = &btrfs_file_operations;
10339 	inode->i_op = &btrfs_file_inode_operations;
10340 	inode->i_mapping->a_ops = &btrfs_aops;
10341 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10342 
10343 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
10344 	if (err)
10345 		goto out_unlock_inode;
10346 
10347 	path = btrfs_alloc_path();
10348 	if (!path) {
10349 		err = -ENOMEM;
10350 		goto out_unlock_inode;
10351 	}
10352 	key.objectid = btrfs_ino(BTRFS_I(inode));
10353 	key.offset = 0;
10354 	key.type = BTRFS_EXTENT_DATA_KEY;
10355 	datasize = btrfs_file_extent_calc_inline_size(name_len);
10356 	err = btrfs_insert_empty_item(trans, root, path, &key,
10357 				      datasize);
10358 	if (err) {
10359 		btrfs_free_path(path);
10360 		goto out_unlock_inode;
10361 	}
10362 	leaf = path->nodes[0];
10363 	ei = btrfs_item_ptr(leaf, path->slots[0],
10364 			    struct btrfs_file_extent_item);
10365 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
10366 	btrfs_set_file_extent_type(leaf, ei,
10367 				   BTRFS_FILE_EXTENT_INLINE);
10368 	btrfs_set_file_extent_encryption(leaf, ei, 0);
10369 	btrfs_set_file_extent_compression(leaf, ei, 0);
10370 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
10371 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
10372 
10373 	ptr = btrfs_file_extent_inline_start(ei);
10374 	write_extent_buffer(leaf, symname, ptr, name_len);
10375 	btrfs_mark_buffer_dirty(leaf);
10376 	btrfs_free_path(path);
10377 
10378 	inode->i_op = &btrfs_symlink_inode_operations;
10379 	inode_nohighmem(inode);
10380 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
10381 	inode_set_bytes(inode, name_len);
10382 	btrfs_i_size_write(BTRFS_I(inode), name_len);
10383 	err = btrfs_update_inode(trans, root, inode);
10384 	/*
10385 	 * Last step, add directory indexes for our symlink inode. This is the
10386 	 * last step to avoid extra cleanup of these indexes if an error happens
10387 	 * elsewhere above.
10388 	 */
10389 	if (!err)
10390 		err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry,
10391 				BTRFS_I(inode), 0, index);
10392 	if (err) {
10393 		drop_inode = 1;
10394 		goto out_unlock_inode;
10395 	}
10396 
10397 	unlock_new_inode(inode);
10398 	d_instantiate(dentry, inode);
10399 
10400 out_unlock:
10401 	btrfs_end_transaction(trans);
10402 	if (drop_inode) {
10403 		inode_dec_link_count(inode);
10404 		iput(inode);
10405 	}
10406 	btrfs_btree_balance_dirty(fs_info);
10407 	return err;
10408 
10409 out_unlock_inode:
10410 	drop_inode = 1;
10411 	unlock_new_inode(inode);
10412 	goto out_unlock;
10413 }
10414 
10415 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
10416 				       u64 start, u64 num_bytes, u64 min_size,
10417 				       loff_t actual_len, u64 *alloc_hint,
10418 				       struct btrfs_trans_handle *trans)
10419 {
10420 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
10421 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
10422 	struct extent_map *em;
10423 	struct btrfs_root *root = BTRFS_I(inode)->root;
10424 	struct btrfs_key ins;
10425 	u64 cur_offset = start;
10426 	u64 i_size;
10427 	u64 cur_bytes;
10428 	u64 last_alloc = (u64)-1;
10429 	int ret = 0;
10430 	bool own_trans = true;
10431 	u64 end = start + num_bytes - 1;
10432 
10433 	if (trans)
10434 		own_trans = false;
10435 	while (num_bytes > 0) {
10436 		if (own_trans) {
10437 			trans = btrfs_start_transaction(root, 3);
10438 			if (IS_ERR(trans)) {
10439 				ret = PTR_ERR(trans);
10440 				break;
10441 			}
10442 		}
10443 
10444 		cur_bytes = min_t(u64, num_bytes, SZ_256M);
10445 		cur_bytes = max(cur_bytes, min_size);
10446 		/*
10447 		 * If we are severely fragmented we could end up with really
10448 		 * small allocations, so if the allocator is returning small
10449 		 * chunks lets make its job easier by only searching for those
10450 		 * sized chunks.
10451 		 */
10452 		cur_bytes = min(cur_bytes, last_alloc);
10453 		ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
10454 				min_size, 0, *alloc_hint, &ins, 1, 0);
10455 		if (ret) {
10456 			if (own_trans)
10457 				btrfs_end_transaction(trans);
10458 			break;
10459 		}
10460 		btrfs_dec_block_group_reservations(fs_info, ins.objectid);
10461 
10462 		last_alloc = ins.offset;
10463 		ret = insert_reserved_file_extent(trans, inode,
10464 						  cur_offset, ins.objectid,
10465 						  ins.offset, ins.offset,
10466 						  ins.offset, 0, 0, 0,
10467 						  BTRFS_FILE_EXTENT_PREALLOC);
10468 		if (ret) {
10469 			btrfs_free_reserved_extent(fs_info, ins.objectid,
10470 						   ins.offset, 0);
10471 			btrfs_abort_transaction(trans, ret);
10472 			if (own_trans)
10473 				btrfs_end_transaction(trans);
10474 			break;
10475 		}
10476 
10477 		btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10478 					cur_offset + ins.offset -1, 0);
10479 
10480 		em = alloc_extent_map();
10481 		if (!em) {
10482 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
10483 				&BTRFS_I(inode)->runtime_flags);
10484 			goto next;
10485 		}
10486 
10487 		em->start = cur_offset;
10488 		em->orig_start = cur_offset;
10489 		em->len = ins.offset;
10490 		em->block_start = ins.objectid;
10491 		em->block_len = ins.offset;
10492 		em->orig_block_len = ins.offset;
10493 		em->ram_bytes = ins.offset;
10494 		em->bdev = fs_info->fs_devices->latest_bdev;
10495 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
10496 		em->generation = trans->transid;
10497 
10498 		while (1) {
10499 			write_lock(&em_tree->lock);
10500 			ret = add_extent_mapping(em_tree, em, 1);
10501 			write_unlock(&em_tree->lock);
10502 			if (ret != -EEXIST)
10503 				break;
10504 			btrfs_drop_extent_cache(BTRFS_I(inode), cur_offset,
10505 						cur_offset + ins.offset - 1,
10506 						0);
10507 		}
10508 		free_extent_map(em);
10509 next:
10510 		num_bytes -= ins.offset;
10511 		cur_offset += ins.offset;
10512 		*alloc_hint = ins.objectid + ins.offset;
10513 
10514 		inode_inc_iversion(inode);
10515 		inode->i_ctime = current_time(inode);
10516 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
10517 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
10518 		    (actual_len > inode->i_size) &&
10519 		    (cur_offset > inode->i_size)) {
10520 			if (cur_offset > actual_len)
10521 				i_size = actual_len;
10522 			else
10523 				i_size = cur_offset;
10524 			i_size_write(inode, i_size);
10525 			btrfs_ordered_update_i_size(inode, i_size, NULL);
10526 		}
10527 
10528 		ret = btrfs_update_inode(trans, root, inode);
10529 
10530 		if (ret) {
10531 			btrfs_abort_transaction(trans, ret);
10532 			if (own_trans)
10533 				btrfs_end_transaction(trans);
10534 			break;
10535 		}
10536 
10537 		if (own_trans)
10538 			btrfs_end_transaction(trans);
10539 	}
10540 	if (cur_offset < end)
10541 		btrfs_free_reserved_data_space(inode, cur_offset,
10542 			end - cur_offset + 1);
10543 	return ret;
10544 }
10545 
10546 int btrfs_prealloc_file_range(struct inode *inode, int mode,
10547 			      u64 start, u64 num_bytes, u64 min_size,
10548 			      loff_t actual_len, u64 *alloc_hint)
10549 {
10550 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10551 					   min_size, actual_len, alloc_hint,
10552 					   NULL);
10553 }
10554 
10555 int btrfs_prealloc_file_range_trans(struct inode *inode,
10556 				    struct btrfs_trans_handle *trans, int mode,
10557 				    u64 start, u64 num_bytes, u64 min_size,
10558 				    loff_t actual_len, u64 *alloc_hint)
10559 {
10560 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
10561 					   min_size, actual_len, alloc_hint, trans);
10562 }
10563 
10564 static int btrfs_set_page_dirty(struct page *page)
10565 {
10566 	return __set_page_dirty_nobuffers(page);
10567 }
10568 
10569 static int btrfs_permission(struct inode *inode, int mask)
10570 {
10571 	struct btrfs_root *root = BTRFS_I(inode)->root;
10572 	umode_t mode = inode->i_mode;
10573 
10574 	if (mask & MAY_WRITE &&
10575 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
10576 		if (btrfs_root_readonly(root))
10577 			return -EROFS;
10578 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
10579 			return -EACCES;
10580 	}
10581 	return generic_permission(inode, mask);
10582 }
10583 
10584 static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
10585 {
10586 	struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
10587 	struct btrfs_trans_handle *trans;
10588 	struct btrfs_root *root = BTRFS_I(dir)->root;
10589 	struct inode *inode = NULL;
10590 	u64 objectid;
10591 	u64 index;
10592 	int ret = 0;
10593 
10594 	/*
10595 	 * 5 units required for adding orphan entry
10596 	 */
10597 	trans = btrfs_start_transaction(root, 5);
10598 	if (IS_ERR(trans))
10599 		return PTR_ERR(trans);
10600 
10601 	ret = btrfs_find_free_ino(root, &objectid);
10602 	if (ret)
10603 		goto out;
10604 
10605 	inode = btrfs_new_inode(trans, root, dir, NULL, 0,
10606 			btrfs_ino(BTRFS_I(dir)), objectid, mode, &index);
10607 	if (IS_ERR(inode)) {
10608 		ret = PTR_ERR(inode);
10609 		inode = NULL;
10610 		goto out;
10611 	}
10612 
10613 	inode->i_fop = &btrfs_file_operations;
10614 	inode->i_op = &btrfs_file_inode_operations;
10615 
10616 	inode->i_mapping->a_ops = &btrfs_aops;
10617 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
10618 
10619 	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
10620 	if (ret)
10621 		goto out_inode;
10622 
10623 	ret = btrfs_update_inode(trans, root, inode);
10624 	if (ret)
10625 		goto out_inode;
10626 	ret = btrfs_orphan_add(trans, BTRFS_I(inode));
10627 	if (ret)
10628 		goto out_inode;
10629 
10630 	/*
10631 	 * We set number of links to 0 in btrfs_new_inode(), and here we set
10632 	 * it to 1 because d_tmpfile() will issue a warning if the count is 0,
10633 	 * through:
10634 	 *
10635 	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
10636 	 */
10637 	set_nlink(inode, 1);
10638 	unlock_new_inode(inode);
10639 	d_tmpfile(dentry, inode);
10640 	mark_inode_dirty(inode);
10641 
10642 out:
10643 	btrfs_end_transaction(trans);
10644 	if (ret)
10645 		iput(inode);
10646 	btrfs_balance_delayed_items(fs_info);
10647 	btrfs_btree_balance_dirty(fs_info);
10648 	return ret;
10649 
10650 out_inode:
10651 	unlock_new_inode(inode);
10652 	goto out;
10653 
10654 }
10655 
10656 __attribute__((const))
10657 static int btrfs_readpage_io_failed_hook(struct page *page, int failed_mirror)
10658 {
10659 	return -EAGAIN;
10660 }
10661 
10662 static const struct inode_operations btrfs_dir_inode_operations = {
10663 	.getattr	= btrfs_getattr,
10664 	.lookup		= btrfs_lookup,
10665 	.create		= btrfs_create,
10666 	.unlink		= btrfs_unlink,
10667 	.link		= btrfs_link,
10668 	.mkdir		= btrfs_mkdir,
10669 	.rmdir		= btrfs_rmdir,
10670 	.rename		= btrfs_rename2,
10671 	.symlink	= btrfs_symlink,
10672 	.setattr	= btrfs_setattr,
10673 	.mknod		= btrfs_mknod,
10674 	.listxattr	= btrfs_listxattr,
10675 	.permission	= btrfs_permission,
10676 	.get_acl	= btrfs_get_acl,
10677 	.set_acl	= btrfs_set_acl,
10678 	.update_time	= btrfs_update_time,
10679 	.tmpfile        = btrfs_tmpfile,
10680 };
10681 static const struct inode_operations btrfs_dir_ro_inode_operations = {
10682 	.lookup		= btrfs_lookup,
10683 	.permission	= btrfs_permission,
10684 	.update_time	= btrfs_update_time,
10685 };
10686 
10687 static const struct file_operations btrfs_dir_file_operations = {
10688 	.llseek		= generic_file_llseek,
10689 	.read		= generic_read_dir,
10690 	.iterate_shared	= btrfs_real_readdir,
10691 	.unlocked_ioctl	= btrfs_ioctl,
10692 #ifdef CONFIG_COMPAT
10693 	.compat_ioctl	= btrfs_compat_ioctl,
10694 #endif
10695 	.release        = btrfs_release_file,
10696 	.fsync		= btrfs_sync_file,
10697 };
10698 
10699 static const struct extent_io_ops btrfs_extent_io_ops = {
10700 	/* mandatory callbacks */
10701 	.submit_bio_hook = btrfs_submit_bio_hook,
10702 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
10703 	.merge_bio_hook = btrfs_merge_bio_hook,
10704 	.readpage_io_failed_hook = btrfs_readpage_io_failed_hook,
10705 
10706 	/* optional callbacks */
10707 	.fill_delalloc = run_delalloc_range,
10708 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
10709 	.writepage_start_hook = btrfs_writepage_start_hook,
10710 	.set_bit_hook = btrfs_set_bit_hook,
10711 	.clear_bit_hook = btrfs_clear_bit_hook,
10712 	.merge_extent_hook = btrfs_merge_extent_hook,
10713 	.split_extent_hook = btrfs_split_extent_hook,
10714 };
10715 
10716 /*
10717  * btrfs doesn't support the bmap operation because swapfiles
10718  * use bmap to make a mapping of extents in the file.  They assume
10719  * these extents won't change over the life of the file and they
10720  * use the bmap result to do IO directly to the drive.
10721  *
10722  * the btrfs bmap call would return logical addresses that aren't
10723  * suitable for IO and they also will change frequently as COW
10724  * operations happen.  So, swapfile + btrfs == corruption.
10725  *
10726  * For now we're avoiding this by dropping bmap.
10727  */
10728 static const struct address_space_operations btrfs_aops = {
10729 	.readpage	= btrfs_readpage,
10730 	.writepage	= btrfs_writepage,
10731 	.writepages	= btrfs_writepages,
10732 	.readpages	= btrfs_readpages,
10733 	.direct_IO	= btrfs_direct_IO,
10734 	.invalidatepage = btrfs_invalidatepage,
10735 	.releasepage	= btrfs_releasepage,
10736 	.set_page_dirty	= btrfs_set_page_dirty,
10737 	.error_remove_page = generic_error_remove_page,
10738 };
10739 
10740 static const struct address_space_operations btrfs_symlink_aops = {
10741 	.readpage	= btrfs_readpage,
10742 	.writepage	= btrfs_writepage,
10743 	.invalidatepage = btrfs_invalidatepage,
10744 	.releasepage	= btrfs_releasepage,
10745 };
10746 
10747 static const struct inode_operations btrfs_file_inode_operations = {
10748 	.getattr	= btrfs_getattr,
10749 	.setattr	= btrfs_setattr,
10750 	.listxattr      = btrfs_listxattr,
10751 	.permission	= btrfs_permission,
10752 	.fiemap		= btrfs_fiemap,
10753 	.get_acl	= btrfs_get_acl,
10754 	.set_acl	= btrfs_set_acl,
10755 	.update_time	= btrfs_update_time,
10756 };
10757 static const struct inode_operations btrfs_special_inode_operations = {
10758 	.getattr	= btrfs_getattr,
10759 	.setattr	= btrfs_setattr,
10760 	.permission	= btrfs_permission,
10761 	.listxattr	= btrfs_listxattr,
10762 	.get_acl	= btrfs_get_acl,
10763 	.set_acl	= btrfs_set_acl,
10764 	.update_time	= btrfs_update_time,
10765 };
10766 static const struct inode_operations btrfs_symlink_inode_operations = {
10767 	.get_link	= page_get_link,
10768 	.getattr	= btrfs_getattr,
10769 	.setattr	= btrfs_setattr,
10770 	.permission	= btrfs_permission,
10771 	.listxattr	= btrfs_listxattr,
10772 	.update_time	= btrfs_update_time,
10773 };
10774 
10775 const struct dentry_operations btrfs_dentry_operations = {
10776 	.d_delete	= btrfs_dentry_delete,
10777 	.d_release	= btrfs_dentry_release,
10778 };
10779