xref: /openbmc/linux/fs/btrfs/inode.c (revision 00361589d2eebd90fca022148c763e40d3e90871)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/aio.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/xattr.h>
38 #include <linux/posix_acl.h>
39 #include <linux/falloc.h>
40 #include <linux/slab.h>
41 #include <linux/ratelimit.h>
42 #include <linux/mount.h>
43 #include <linux/btrfs.h>
44 #include <linux/blkdev.h>
45 #include <linux/posix_acl_xattr.h>
46 #include "compat.h"
47 #include "ctree.h"
48 #include "disk-io.h"
49 #include "transaction.h"
50 #include "btrfs_inode.h"
51 #include "print-tree.h"
52 #include "ordered-data.h"
53 #include "xattr.h"
54 #include "tree-log.h"
55 #include "volumes.h"
56 #include "compression.h"
57 #include "locking.h"
58 #include "free-space-cache.h"
59 #include "inode-map.h"
60 #include "backref.h"
61 #include "hash.h"
62 
63 struct btrfs_iget_args {
64 	u64 ino;
65 	struct btrfs_root *root;
66 };
67 
68 static const struct inode_operations btrfs_dir_inode_operations;
69 static const struct inode_operations btrfs_symlink_inode_operations;
70 static const struct inode_operations btrfs_dir_ro_inode_operations;
71 static const struct inode_operations btrfs_special_inode_operations;
72 static const struct inode_operations btrfs_file_inode_operations;
73 static const struct address_space_operations btrfs_aops;
74 static const struct address_space_operations btrfs_symlink_aops;
75 static const struct file_operations btrfs_dir_file_operations;
76 static struct extent_io_ops btrfs_extent_io_ops;
77 
78 static struct kmem_cache *btrfs_inode_cachep;
79 static struct kmem_cache *btrfs_delalloc_work_cachep;
80 struct kmem_cache *btrfs_trans_handle_cachep;
81 struct kmem_cache *btrfs_transaction_cachep;
82 struct kmem_cache *btrfs_path_cachep;
83 struct kmem_cache *btrfs_free_space_cachep;
84 
85 #define S_SHIFT 12
86 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
88 	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
89 	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
90 	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
91 	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
92 	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
93 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
94 };
95 
96 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
97 static int btrfs_truncate(struct inode *inode);
98 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
99 static noinline int cow_file_range(struct inode *inode,
100 				   struct page *locked_page,
101 				   u64 start, u64 end, int *page_started,
102 				   unsigned long *nr_written, int unlock);
103 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
104 					   u64 len, u64 orig_start,
105 					   u64 block_start, u64 block_len,
106 					   u64 orig_block_len, u64 ram_bytes,
107 					   int type);
108 
109 static int btrfs_dirty_inode(struct inode *inode);
110 
111 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
112 				     struct inode *inode,  struct inode *dir,
113 				     const struct qstr *qstr)
114 {
115 	int err;
116 
117 	err = btrfs_init_acl(trans, inode, dir);
118 	if (!err)
119 		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
120 	return err;
121 }
122 
123 /*
124  * this does all the hard work for inserting an inline extent into
125  * the btree.  The caller should have done a btrfs_drop_extents so that
126  * no overlapping inline items exist in the btree
127  */
128 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
129 				struct btrfs_root *root, struct inode *inode,
130 				u64 start, size_t size, size_t compressed_size,
131 				int compress_type,
132 				struct page **compressed_pages)
133 {
134 	struct btrfs_key key;
135 	struct btrfs_path *path;
136 	struct extent_buffer *leaf;
137 	struct page *page = NULL;
138 	char *kaddr;
139 	unsigned long ptr;
140 	struct btrfs_file_extent_item *ei;
141 	int err = 0;
142 	int ret;
143 	size_t cur_size = size;
144 	size_t datasize;
145 	unsigned long offset;
146 
147 	if (compressed_size && compressed_pages)
148 		cur_size = compressed_size;
149 
150 	path = btrfs_alloc_path();
151 	if (!path)
152 		return -ENOMEM;
153 
154 	path->leave_spinning = 1;
155 
156 	key.objectid = btrfs_ino(inode);
157 	key.offset = start;
158 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
159 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
160 
161 	inode_add_bytes(inode, size);
162 	ret = btrfs_insert_empty_item(trans, root, path, &key,
163 				      datasize);
164 	if (ret) {
165 		err = ret;
166 		goto fail;
167 	}
168 	leaf = path->nodes[0];
169 	ei = btrfs_item_ptr(leaf, path->slots[0],
170 			    struct btrfs_file_extent_item);
171 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
172 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
173 	btrfs_set_file_extent_encryption(leaf, ei, 0);
174 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
175 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
176 	ptr = btrfs_file_extent_inline_start(ei);
177 
178 	if (compress_type != BTRFS_COMPRESS_NONE) {
179 		struct page *cpage;
180 		int i = 0;
181 		while (compressed_size > 0) {
182 			cpage = compressed_pages[i];
183 			cur_size = min_t(unsigned long, compressed_size,
184 				       PAGE_CACHE_SIZE);
185 
186 			kaddr = kmap_atomic(cpage);
187 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
188 			kunmap_atomic(kaddr);
189 
190 			i++;
191 			ptr += cur_size;
192 			compressed_size -= cur_size;
193 		}
194 		btrfs_set_file_extent_compression(leaf, ei,
195 						  compress_type);
196 	} else {
197 		page = find_get_page(inode->i_mapping,
198 				     start >> PAGE_CACHE_SHIFT);
199 		btrfs_set_file_extent_compression(leaf, ei, 0);
200 		kaddr = kmap_atomic(page);
201 		offset = start & (PAGE_CACHE_SIZE - 1);
202 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
203 		kunmap_atomic(kaddr);
204 		page_cache_release(page);
205 	}
206 	btrfs_mark_buffer_dirty(leaf);
207 	btrfs_free_path(path);
208 
209 	/*
210 	 * we're an inline extent, so nobody can
211 	 * extend the file past i_size without locking
212 	 * a page we already have locked.
213 	 *
214 	 * We must do any isize and inode updates
215 	 * before we unlock the pages.  Otherwise we
216 	 * could end up racing with unlink.
217 	 */
218 	BTRFS_I(inode)->disk_i_size = inode->i_size;
219 	ret = btrfs_update_inode(trans, root, inode);
220 
221 	return ret;
222 fail:
223 	btrfs_free_path(path);
224 	return err;
225 }
226 
227 
228 /*
229  * conditionally insert an inline extent into the file.  This
230  * does the checks required to make sure the data is small enough
231  * to fit as an inline extent.
232  */
233 static noinline int cow_file_range_inline(struct btrfs_root *root,
234 					  struct inode *inode, u64 start,
235 					  u64 end, size_t compressed_size,
236 					  int compress_type,
237 					  struct page **compressed_pages)
238 {
239 	struct btrfs_trans_handle *trans;
240 	u64 isize = i_size_read(inode);
241 	u64 actual_end = min(end + 1, isize);
242 	u64 inline_len = actual_end - start;
243 	u64 aligned_end = ALIGN(end, root->sectorsize);
244 	u64 data_len = inline_len;
245 	int ret;
246 
247 	if (compressed_size)
248 		data_len = compressed_size;
249 
250 	if (start > 0 ||
251 	    actual_end >= PAGE_CACHE_SIZE ||
252 	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
253 	    (!compressed_size &&
254 	    (actual_end & (root->sectorsize - 1)) == 0) ||
255 	    end + 1 < isize ||
256 	    data_len > root->fs_info->max_inline) {
257 		return 1;
258 	}
259 
260 	trans = btrfs_join_transaction(root);
261 	if (IS_ERR(trans))
262 		return PTR_ERR(trans);
263 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
264 
265 	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
266 	if (ret) {
267 		btrfs_abort_transaction(trans, root, ret);
268 		goto out;
269 	}
270 
271 	if (isize > actual_end)
272 		inline_len = min_t(u64, isize, actual_end);
273 	ret = insert_inline_extent(trans, root, inode, start,
274 				   inline_len, compressed_size,
275 				   compress_type, compressed_pages);
276 	if (ret && ret != -ENOSPC) {
277 		btrfs_abort_transaction(trans, root, ret);
278 		goto out;
279 	} else if (ret == -ENOSPC) {
280 		ret = 1;
281 		goto out;
282 	}
283 
284 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
285 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
286 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
287 out:
288 	btrfs_end_transaction(trans, root);
289 	return ret;
290 }
291 
292 struct async_extent {
293 	u64 start;
294 	u64 ram_size;
295 	u64 compressed_size;
296 	struct page **pages;
297 	unsigned long nr_pages;
298 	int compress_type;
299 	struct list_head list;
300 };
301 
302 struct async_cow {
303 	struct inode *inode;
304 	struct btrfs_root *root;
305 	struct page *locked_page;
306 	u64 start;
307 	u64 end;
308 	struct list_head extents;
309 	struct btrfs_work work;
310 };
311 
312 static noinline int add_async_extent(struct async_cow *cow,
313 				     u64 start, u64 ram_size,
314 				     u64 compressed_size,
315 				     struct page **pages,
316 				     unsigned long nr_pages,
317 				     int compress_type)
318 {
319 	struct async_extent *async_extent;
320 
321 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
322 	BUG_ON(!async_extent); /* -ENOMEM */
323 	async_extent->start = start;
324 	async_extent->ram_size = ram_size;
325 	async_extent->compressed_size = compressed_size;
326 	async_extent->pages = pages;
327 	async_extent->nr_pages = nr_pages;
328 	async_extent->compress_type = compress_type;
329 	list_add_tail(&async_extent->list, &cow->extents);
330 	return 0;
331 }
332 
333 /*
334  * we create compressed extents in two phases.  The first
335  * phase compresses a range of pages that have already been
336  * locked (both pages and state bits are locked).
337  *
338  * This is done inside an ordered work queue, and the compression
339  * is spread across many cpus.  The actual IO submission is step
340  * two, and the ordered work queue takes care of making sure that
341  * happens in the same order things were put onto the queue by
342  * writepages and friends.
343  *
344  * If this code finds it can't get good compression, it puts an
345  * entry onto the work queue to write the uncompressed bytes.  This
346  * makes sure that both compressed inodes and uncompressed inodes
347  * are written in the same order that the flusher thread sent them
348  * down.
349  */
350 static noinline int compress_file_range(struct inode *inode,
351 					struct page *locked_page,
352 					u64 start, u64 end,
353 					struct async_cow *async_cow,
354 					int *num_added)
355 {
356 	struct btrfs_root *root = BTRFS_I(inode)->root;
357 	u64 num_bytes;
358 	u64 blocksize = root->sectorsize;
359 	u64 actual_end;
360 	u64 isize = i_size_read(inode);
361 	int ret = 0;
362 	struct page **pages = NULL;
363 	unsigned long nr_pages;
364 	unsigned long nr_pages_ret = 0;
365 	unsigned long total_compressed = 0;
366 	unsigned long total_in = 0;
367 	unsigned long max_compressed = 128 * 1024;
368 	unsigned long max_uncompressed = 128 * 1024;
369 	int i;
370 	int will_compress;
371 	int compress_type = root->fs_info->compress_type;
372 	int redirty = 0;
373 
374 	/* if this is a small write inside eof, kick off a defrag */
375 	if ((end - start + 1) < 16 * 1024 &&
376 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
377 		btrfs_add_inode_defrag(NULL, inode);
378 
379 	actual_end = min_t(u64, isize, end + 1);
380 again:
381 	will_compress = 0;
382 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
383 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
384 
385 	/*
386 	 * we don't want to send crud past the end of i_size through
387 	 * compression, that's just a waste of CPU time.  So, if the
388 	 * end of the file is before the start of our current
389 	 * requested range of bytes, we bail out to the uncompressed
390 	 * cleanup code that can deal with all of this.
391 	 *
392 	 * It isn't really the fastest way to fix things, but this is a
393 	 * very uncommon corner.
394 	 */
395 	if (actual_end <= start)
396 		goto cleanup_and_bail_uncompressed;
397 
398 	total_compressed = actual_end - start;
399 
400 	/* we want to make sure that amount of ram required to uncompress
401 	 * an extent is reasonable, so we limit the total size in ram
402 	 * of a compressed extent to 128k.  This is a crucial number
403 	 * because it also controls how easily we can spread reads across
404 	 * cpus for decompression.
405 	 *
406 	 * We also want to make sure the amount of IO required to do
407 	 * a random read is reasonably small, so we limit the size of
408 	 * a compressed extent to 128k.
409 	 */
410 	total_compressed = min(total_compressed, max_uncompressed);
411 	num_bytes = ALIGN(end - start + 1, blocksize);
412 	num_bytes = max(blocksize,  num_bytes);
413 	total_in = 0;
414 	ret = 0;
415 
416 	/*
417 	 * we do compression for mount -o compress and when the
418 	 * inode has not been flagged as nocompress.  This flag can
419 	 * change at any time if we discover bad compression ratios.
420 	 */
421 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
422 	    (btrfs_test_opt(root, COMPRESS) ||
423 	     (BTRFS_I(inode)->force_compress) ||
424 	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
425 		WARN_ON(pages);
426 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
427 		if (!pages) {
428 			/* just bail out to the uncompressed code */
429 			goto cont;
430 		}
431 
432 		if (BTRFS_I(inode)->force_compress)
433 			compress_type = BTRFS_I(inode)->force_compress;
434 
435 		/*
436 		 * we need to call clear_page_dirty_for_io on each
437 		 * page in the range.  Otherwise applications with the file
438 		 * mmap'd can wander in and change the page contents while
439 		 * we are compressing them.
440 		 *
441 		 * If the compression fails for any reason, we set the pages
442 		 * dirty again later on.
443 		 */
444 		extent_range_clear_dirty_for_io(inode, start, end);
445 		redirty = 1;
446 		ret = btrfs_compress_pages(compress_type,
447 					   inode->i_mapping, start,
448 					   total_compressed, pages,
449 					   nr_pages, &nr_pages_ret,
450 					   &total_in,
451 					   &total_compressed,
452 					   max_compressed);
453 
454 		if (!ret) {
455 			unsigned long offset = total_compressed &
456 				(PAGE_CACHE_SIZE - 1);
457 			struct page *page = pages[nr_pages_ret - 1];
458 			char *kaddr;
459 
460 			/* zero the tail end of the last page, we might be
461 			 * sending it down to disk
462 			 */
463 			if (offset) {
464 				kaddr = kmap_atomic(page);
465 				memset(kaddr + offset, 0,
466 				       PAGE_CACHE_SIZE - offset);
467 				kunmap_atomic(kaddr);
468 			}
469 			will_compress = 1;
470 		}
471 	}
472 cont:
473 	if (start == 0) {
474 		/* lets try to make an inline extent */
475 		if (ret || total_in < (actual_end - start)) {
476 			/* we didn't compress the entire range, try
477 			 * to make an uncompressed inline extent.
478 			 */
479 			ret = cow_file_range_inline(root, inode, start, end,
480 						    0, 0, NULL);
481 		} else {
482 			/* try making a compressed inline extent */
483 			ret = cow_file_range_inline(root, inode, start, end,
484 						    total_compressed,
485 						    compress_type, pages);
486 		}
487 		if (ret <= 0) {
488 			unsigned long clear_flags = EXTENT_DELALLOC |
489 				EXTENT_DEFRAG;
490 			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
491 
492 			/*
493 			 * inline extent creation worked or returned error,
494 			 * we don't need to create any more async work items.
495 			 * Unlock and free up our temp pages.
496 			 */
497 			extent_clear_unlock_delalloc(inode, start, end, NULL,
498 						     clear_flags, PAGE_UNLOCK |
499 						     PAGE_CLEAR_DIRTY |
500 						     PAGE_SET_WRITEBACK |
501 						     PAGE_END_WRITEBACK);
502 			goto free_pages_out;
503 		}
504 	}
505 
506 	if (will_compress) {
507 		/*
508 		 * we aren't doing an inline extent round the compressed size
509 		 * up to a block size boundary so the allocator does sane
510 		 * things
511 		 */
512 		total_compressed = ALIGN(total_compressed, blocksize);
513 
514 		/*
515 		 * one last check to make sure the compression is really a
516 		 * win, compare the page count read with the blocks on disk
517 		 */
518 		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
519 		if (total_compressed >= total_in) {
520 			will_compress = 0;
521 		} else {
522 			num_bytes = total_in;
523 		}
524 	}
525 	if (!will_compress && pages) {
526 		/*
527 		 * the compression code ran but failed to make things smaller,
528 		 * free any pages it allocated and our page pointer array
529 		 */
530 		for (i = 0; i < nr_pages_ret; i++) {
531 			WARN_ON(pages[i]->mapping);
532 			page_cache_release(pages[i]);
533 		}
534 		kfree(pages);
535 		pages = NULL;
536 		total_compressed = 0;
537 		nr_pages_ret = 0;
538 
539 		/* flag the file so we don't compress in the future */
540 		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
541 		    !(BTRFS_I(inode)->force_compress)) {
542 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
543 		}
544 	}
545 	if (will_compress) {
546 		*num_added += 1;
547 
548 		/* the async work queues will take care of doing actual
549 		 * allocation on disk for these compressed pages,
550 		 * and will submit them to the elevator.
551 		 */
552 		add_async_extent(async_cow, start, num_bytes,
553 				 total_compressed, pages, nr_pages_ret,
554 				 compress_type);
555 
556 		if (start + num_bytes < end) {
557 			start += num_bytes;
558 			pages = NULL;
559 			cond_resched();
560 			goto again;
561 		}
562 	} else {
563 cleanup_and_bail_uncompressed:
564 		/*
565 		 * No compression, but we still need to write the pages in
566 		 * the file we've been given so far.  redirty the locked
567 		 * page if it corresponds to our extent and set things up
568 		 * for the async work queue to run cow_file_range to do
569 		 * the normal delalloc dance
570 		 */
571 		if (page_offset(locked_page) >= start &&
572 		    page_offset(locked_page) <= end) {
573 			__set_page_dirty_nobuffers(locked_page);
574 			/* unlocked later on in the async handlers */
575 		}
576 		if (redirty)
577 			extent_range_redirty_for_io(inode, start, end);
578 		add_async_extent(async_cow, start, end - start + 1,
579 				 0, NULL, 0, BTRFS_COMPRESS_NONE);
580 		*num_added += 1;
581 	}
582 
583 out:
584 	return ret;
585 
586 free_pages_out:
587 	for (i = 0; i < nr_pages_ret; i++) {
588 		WARN_ON(pages[i]->mapping);
589 		page_cache_release(pages[i]);
590 	}
591 	kfree(pages);
592 
593 	goto out;
594 }
595 
596 /*
597  * phase two of compressed writeback.  This is the ordered portion
598  * of the code, which only gets called in the order the work was
599  * queued.  We walk all the async extents created by compress_file_range
600  * and send them down to the disk.
601  */
602 static noinline int submit_compressed_extents(struct inode *inode,
603 					      struct async_cow *async_cow)
604 {
605 	struct async_extent *async_extent;
606 	u64 alloc_hint = 0;
607 	struct btrfs_key ins;
608 	struct extent_map *em;
609 	struct btrfs_root *root = BTRFS_I(inode)->root;
610 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
611 	struct extent_io_tree *io_tree;
612 	int ret = 0;
613 
614 	if (list_empty(&async_cow->extents))
615 		return 0;
616 
617 again:
618 	while (!list_empty(&async_cow->extents)) {
619 		async_extent = list_entry(async_cow->extents.next,
620 					  struct async_extent, list);
621 		list_del(&async_extent->list);
622 
623 		io_tree = &BTRFS_I(inode)->io_tree;
624 
625 retry:
626 		/* did the compression code fall back to uncompressed IO? */
627 		if (!async_extent->pages) {
628 			int page_started = 0;
629 			unsigned long nr_written = 0;
630 
631 			lock_extent(io_tree, async_extent->start,
632 					 async_extent->start +
633 					 async_extent->ram_size - 1);
634 
635 			/* allocate blocks */
636 			ret = cow_file_range(inode, async_cow->locked_page,
637 					     async_extent->start,
638 					     async_extent->start +
639 					     async_extent->ram_size - 1,
640 					     &page_started, &nr_written, 0);
641 
642 			/* JDM XXX */
643 
644 			/*
645 			 * if page_started, cow_file_range inserted an
646 			 * inline extent and took care of all the unlocking
647 			 * and IO for us.  Otherwise, we need to submit
648 			 * all those pages down to the drive.
649 			 */
650 			if (!page_started && !ret)
651 				extent_write_locked_range(io_tree,
652 						  inode, async_extent->start,
653 						  async_extent->start +
654 						  async_extent->ram_size - 1,
655 						  btrfs_get_extent,
656 						  WB_SYNC_ALL);
657 			else if (ret)
658 				unlock_page(async_cow->locked_page);
659 			kfree(async_extent);
660 			cond_resched();
661 			continue;
662 		}
663 
664 		lock_extent(io_tree, async_extent->start,
665 			    async_extent->start + async_extent->ram_size - 1);
666 
667 		ret = btrfs_reserve_extent(root,
668 					   async_extent->compressed_size,
669 					   async_extent->compressed_size,
670 					   0, alloc_hint, &ins, 1);
671 		if (ret) {
672 			int i;
673 
674 			for (i = 0; i < async_extent->nr_pages; i++) {
675 				WARN_ON(async_extent->pages[i]->mapping);
676 				page_cache_release(async_extent->pages[i]);
677 			}
678 			kfree(async_extent->pages);
679 			async_extent->nr_pages = 0;
680 			async_extent->pages = NULL;
681 
682 			if (ret == -ENOSPC) {
683 				unlock_extent(io_tree, async_extent->start,
684 					      async_extent->start +
685 					      async_extent->ram_size - 1);
686 				goto retry;
687 			}
688 			goto out_free;
689 		}
690 
691 		/*
692 		 * here we're doing allocation and writeback of the
693 		 * compressed pages
694 		 */
695 		btrfs_drop_extent_cache(inode, async_extent->start,
696 					async_extent->start +
697 					async_extent->ram_size - 1, 0);
698 
699 		em = alloc_extent_map();
700 		if (!em) {
701 			ret = -ENOMEM;
702 			goto out_free_reserve;
703 		}
704 		em->start = async_extent->start;
705 		em->len = async_extent->ram_size;
706 		em->orig_start = em->start;
707 		em->mod_start = em->start;
708 		em->mod_len = em->len;
709 
710 		em->block_start = ins.objectid;
711 		em->block_len = ins.offset;
712 		em->orig_block_len = ins.offset;
713 		em->ram_bytes = async_extent->ram_size;
714 		em->bdev = root->fs_info->fs_devices->latest_bdev;
715 		em->compress_type = async_extent->compress_type;
716 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
717 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
718 		em->generation = -1;
719 
720 		while (1) {
721 			write_lock(&em_tree->lock);
722 			ret = add_extent_mapping(em_tree, em, 1);
723 			write_unlock(&em_tree->lock);
724 			if (ret != -EEXIST) {
725 				free_extent_map(em);
726 				break;
727 			}
728 			btrfs_drop_extent_cache(inode, async_extent->start,
729 						async_extent->start +
730 						async_extent->ram_size - 1, 0);
731 		}
732 
733 		if (ret)
734 			goto out_free_reserve;
735 
736 		ret = btrfs_add_ordered_extent_compress(inode,
737 						async_extent->start,
738 						ins.objectid,
739 						async_extent->ram_size,
740 						ins.offset,
741 						BTRFS_ORDERED_COMPRESSED,
742 						async_extent->compress_type);
743 		if (ret)
744 			goto out_free_reserve;
745 
746 		/*
747 		 * clear dirty, set writeback and unlock the pages.
748 		 */
749 		extent_clear_unlock_delalloc(inode, async_extent->start,
750 				async_extent->start +
751 				async_extent->ram_size - 1,
752 				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
753 				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
754 				PAGE_SET_WRITEBACK);
755 		ret = btrfs_submit_compressed_write(inode,
756 				    async_extent->start,
757 				    async_extent->ram_size,
758 				    ins.objectid,
759 				    ins.offset, async_extent->pages,
760 				    async_extent->nr_pages);
761 		alloc_hint = ins.objectid + ins.offset;
762 		kfree(async_extent);
763 		if (ret)
764 			goto out;
765 		cond_resched();
766 	}
767 	ret = 0;
768 out:
769 	return ret;
770 out_free_reserve:
771 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
772 out_free:
773 	extent_clear_unlock_delalloc(inode, async_extent->start,
774 				     async_extent->start +
775 				     async_extent->ram_size - 1,
776 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
777 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
778 				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
779 				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
780 	kfree(async_extent);
781 	goto again;
782 }
783 
784 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
785 				      u64 num_bytes)
786 {
787 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
788 	struct extent_map *em;
789 	u64 alloc_hint = 0;
790 
791 	read_lock(&em_tree->lock);
792 	em = search_extent_mapping(em_tree, start, num_bytes);
793 	if (em) {
794 		/*
795 		 * if block start isn't an actual block number then find the
796 		 * first block in this inode and use that as a hint.  If that
797 		 * block is also bogus then just don't worry about it.
798 		 */
799 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
800 			free_extent_map(em);
801 			em = search_extent_mapping(em_tree, 0, 0);
802 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
803 				alloc_hint = em->block_start;
804 			if (em)
805 				free_extent_map(em);
806 		} else {
807 			alloc_hint = em->block_start;
808 			free_extent_map(em);
809 		}
810 	}
811 	read_unlock(&em_tree->lock);
812 
813 	return alloc_hint;
814 }
815 
816 /*
817  * when extent_io.c finds a delayed allocation range in the file,
818  * the call backs end up in this code.  The basic idea is to
819  * allocate extents on disk for the range, and create ordered data structs
820  * in ram to track those extents.
821  *
822  * locked_page is the page that writepage had locked already.  We use
823  * it to make sure we don't do extra locks or unlocks.
824  *
825  * *page_started is set to one if we unlock locked_page and do everything
826  * required to start IO on it.  It may be clean and already done with
827  * IO when we return.
828  */
829 static noinline int cow_file_range(struct inode *inode,
830 				   struct page *locked_page,
831 				   u64 start, u64 end, int *page_started,
832 				   unsigned long *nr_written,
833 				   int unlock)
834 {
835 	struct btrfs_root *root = BTRFS_I(inode)->root;
836 	u64 alloc_hint = 0;
837 	u64 num_bytes;
838 	unsigned long ram_size;
839 	u64 disk_num_bytes;
840 	u64 cur_alloc_size;
841 	u64 blocksize = root->sectorsize;
842 	struct btrfs_key ins;
843 	struct extent_map *em;
844 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
845 	int ret = 0;
846 
847 	BUG_ON(btrfs_is_free_space_inode(inode));
848 
849 	num_bytes = ALIGN(end - start + 1, blocksize);
850 	num_bytes = max(blocksize,  num_bytes);
851 	disk_num_bytes = num_bytes;
852 
853 	/* if this is a small write inside eof, kick off defrag */
854 	if (num_bytes < 64 * 1024 &&
855 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
856 		btrfs_add_inode_defrag(NULL, inode);
857 
858 	if (start == 0) {
859 		/* lets try to make an inline extent */
860 		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
861 					    NULL);
862 		if (ret == 0) {
863 			extent_clear_unlock_delalloc(inode, start, end, NULL,
864 				     EXTENT_LOCKED | EXTENT_DELALLOC |
865 				     EXTENT_DEFRAG, PAGE_UNLOCK |
866 				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
867 				     PAGE_END_WRITEBACK);
868 
869 			*nr_written = *nr_written +
870 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
871 			*page_started = 1;
872 			goto out;
873 		} else if (ret < 0) {
874 			goto out_unlock;
875 		}
876 	}
877 
878 	BUG_ON(disk_num_bytes >
879 	       btrfs_super_total_bytes(root->fs_info->super_copy));
880 
881 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
882 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
883 
884 	while (disk_num_bytes > 0) {
885 		unsigned long op;
886 
887 		cur_alloc_size = disk_num_bytes;
888 		ret = btrfs_reserve_extent(root, cur_alloc_size,
889 					   root->sectorsize, 0, alloc_hint,
890 					   &ins, 1);
891 		if (ret < 0)
892 			goto out_unlock;
893 
894 		em = alloc_extent_map();
895 		if (!em) {
896 			ret = -ENOMEM;
897 			goto out_reserve;
898 		}
899 		em->start = start;
900 		em->orig_start = em->start;
901 		ram_size = ins.offset;
902 		em->len = ins.offset;
903 		em->mod_start = em->start;
904 		em->mod_len = em->len;
905 
906 		em->block_start = ins.objectid;
907 		em->block_len = ins.offset;
908 		em->orig_block_len = ins.offset;
909 		em->ram_bytes = ram_size;
910 		em->bdev = root->fs_info->fs_devices->latest_bdev;
911 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
912 		em->generation = -1;
913 
914 		while (1) {
915 			write_lock(&em_tree->lock);
916 			ret = add_extent_mapping(em_tree, em, 1);
917 			write_unlock(&em_tree->lock);
918 			if (ret != -EEXIST) {
919 				free_extent_map(em);
920 				break;
921 			}
922 			btrfs_drop_extent_cache(inode, start,
923 						start + ram_size - 1, 0);
924 		}
925 		if (ret)
926 			goto out_reserve;
927 
928 		cur_alloc_size = ins.offset;
929 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
930 					       ram_size, cur_alloc_size, 0);
931 		if (ret)
932 			goto out_reserve;
933 
934 		if (root->root_key.objectid ==
935 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
936 			ret = btrfs_reloc_clone_csums(inode, start,
937 						      cur_alloc_size);
938 			if (ret)
939 				goto out_reserve;
940 		}
941 
942 		if (disk_num_bytes < cur_alloc_size)
943 			break;
944 
945 		/* we're not doing compressed IO, don't unlock the first
946 		 * page (which the caller expects to stay locked), don't
947 		 * clear any dirty bits and don't set any writeback bits
948 		 *
949 		 * Do set the Private2 bit so we know this page was properly
950 		 * setup for writepage
951 		 */
952 		op = unlock ? PAGE_UNLOCK : 0;
953 		op |= PAGE_SET_PRIVATE2;
954 
955 		extent_clear_unlock_delalloc(inode, start,
956 					     start + ram_size - 1, locked_page,
957 					     EXTENT_LOCKED | EXTENT_DELALLOC,
958 					     op);
959 		disk_num_bytes -= cur_alloc_size;
960 		num_bytes -= cur_alloc_size;
961 		alloc_hint = ins.objectid + ins.offset;
962 		start += cur_alloc_size;
963 	}
964 out:
965 	return ret;
966 
967 out_reserve:
968 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
969 out_unlock:
970 	extent_clear_unlock_delalloc(inode, start, end, locked_page,
971 				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
972 				     EXTENT_DELALLOC | EXTENT_DEFRAG,
973 				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
974 				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
975 	goto out;
976 }
977 
978 /*
979  * work queue call back to started compression on a file and pages
980  */
981 static noinline void async_cow_start(struct btrfs_work *work)
982 {
983 	struct async_cow *async_cow;
984 	int num_added = 0;
985 	async_cow = container_of(work, struct async_cow, work);
986 
987 	compress_file_range(async_cow->inode, async_cow->locked_page,
988 			    async_cow->start, async_cow->end, async_cow,
989 			    &num_added);
990 	if (num_added == 0) {
991 		btrfs_add_delayed_iput(async_cow->inode);
992 		async_cow->inode = NULL;
993 	}
994 }
995 
996 /*
997  * work queue call back to submit previously compressed pages
998  */
999 static noinline void async_cow_submit(struct btrfs_work *work)
1000 {
1001 	struct async_cow *async_cow;
1002 	struct btrfs_root *root;
1003 	unsigned long nr_pages;
1004 
1005 	async_cow = container_of(work, struct async_cow, work);
1006 
1007 	root = async_cow->root;
1008 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1009 		PAGE_CACHE_SHIFT;
1010 
1011 	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1012 	    5 * 1024 * 1024 &&
1013 	    waitqueue_active(&root->fs_info->async_submit_wait))
1014 		wake_up(&root->fs_info->async_submit_wait);
1015 
1016 	if (async_cow->inode)
1017 		submit_compressed_extents(async_cow->inode, async_cow);
1018 }
1019 
1020 static noinline void async_cow_free(struct btrfs_work *work)
1021 {
1022 	struct async_cow *async_cow;
1023 	async_cow = container_of(work, struct async_cow, work);
1024 	if (async_cow->inode)
1025 		btrfs_add_delayed_iput(async_cow->inode);
1026 	kfree(async_cow);
1027 }
1028 
1029 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1030 				u64 start, u64 end, int *page_started,
1031 				unsigned long *nr_written)
1032 {
1033 	struct async_cow *async_cow;
1034 	struct btrfs_root *root = BTRFS_I(inode)->root;
1035 	unsigned long nr_pages;
1036 	u64 cur_end;
1037 	int limit = 10 * 1024 * 1024;
1038 
1039 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1040 			 1, 0, NULL, GFP_NOFS);
1041 	while (start < end) {
1042 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1043 		BUG_ON(!async_cow); /* -ENOMEM */
1044 		async_cow->inode = igrab(inode);
1045 		async_cow->root = root;
1046 		async_cow->locked_page = locked_page;
1047 		async_cow->start = start;
1048 
1049 		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
1050 			cur_end = end;
1051 		else
1052 			cur_end = min(end, start + 512 * 1024 - 1);
1053 
1054 		async_cow->end = cur_end;
1055 		INIT_LIST_HEAD(&async_cow->extents);
1056 
1057 		async_cow->work.func = async_cow_start;
1058 		async_cow->work.ordered_func = async_cow_submit;
1059 		async_cow->work.ordered_free = async_cow_free;
1060 		async_cow->work.flags = 0;
1061 
1062 		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1063 			PAGE_CACHE_SHIFT;
1064 		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1065 
1066 		btrfs_queue_worker(&root->fs_info->delalloc_workers,
1067 				   &async_cow->work);
1068 
1069 		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1070 			wait_event(root->fs_info->async_submit_wait,
1071 			   (atomic_read(&root->fs_info->async_delalloc_pages) <
1072 			    limit));
1073 		}
1074 
1075 		while (atomic_read(&root->fs_info->async_submit_draining) &&
1076 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
1077 			wait_event(root->fs_info->async_submit_wait,
1078 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
1079 			   0));
1080 		}
1081 
1082 		*nr_written += nr_pages;
1083 		start = cur_end + 1;
1084 	}
1085 	*page_started = 1;
1086 	return 0;
1087 }
1088 
1089 static noinline int csum_exist_in_range(struct btrfs_root *root,
1090 					u64 bytenr, u64 num_bytes)
1091 {
1092 	int ret;
1093 	struct btrfs_ordered_sum *sums;
1094 	LIST_HEAD(list);
1095 
1096 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1097 				       bytenr + num_bytes - 1, &list, 0);
1098 	if (ret == 0 && list_empty(&list))
1099 		return 0;
1100 
1101 	while (!list_empty(&list)) {
1102 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1103 		list_del(&sums->list);
1104 		kfree(sums);
1105 	}
1106 	return 1;
1107 }
1108 
1109 /*
1110  * when nowcow writeback call back.  This checks for snapshots or COW copies
1111  * of the extents that exist in the file, and COWs the file as required.
1112  *
1113  * If no cow copies or snapshots exist, we write directly to the existing
1114  * blocks on disk
1115  */
1116 static noinline int run_delalloc_nocow(struct inode *inode,
1117 				       struct page *locked_page,
1118 			      u64 start, u64 end, int *page_started, int force,
1119 			      unsigned long *nr_written)
1120 {
1121 	struct btrfs_root *root = BTRFS_I(inode)->root;
1122 	struct btrfs_trans_handle *trans;
1123 	struct extent_buffer *leaf;
1124 	struct btrfs_path *path;
1125 	struct btrfs_file_extent_item *fi;
1126 	struct btrfs_key found_key;
1127 	u64 cow_start;
1128 	u64 cur_offset;
1129 	u64 extent_end;
1130 	u64 extent_offset;
1131 	u64 disk_bytenr;
1132 	u64 num_bytes;
1133 	u64 disk_num_bytes;
1134 	u64 ram_bytes;
1135 	int extent_type;
1136 	int ret, err;
1137 	int type;
1138 	int nocow;
1139 	int check_prev = 1;
1140 	bool nolock;
1141 	u64 ino = btrfs_ino(inode);
1142 
1143 	path = btrfs_alloc_path();
1144 	if (!path) {
1145 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1146 					     EXTENT_LOCKED | EXTENT_DELALLOC |
1147 					     EXTENT_DO_ACCOUNTING |
1148 					     EXTENT_DEFRAG, PAGE_UNLOCK |
1149 					     PAGE_CLEAR_DIRTY |
1150 					     PAGE_SET_WRITEBACK |
1151 					     PAGE_END_WRITEBACK);
1152 		return -ENOMEM;
1153 	}
1154 
1155 	nolock = btrfs_is_free_space_inode(inode);
1156 
1157 	if (nolock)
1158 		trans = btrfs_join_transaction_nolock(root);
1159 	else
1160 		trans = btrfs_join_transaction(root);
1161 
1162 	if (IS_ERR(trans)) {
1163 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1164 					     EXTENT_LOCKED | EXTENT_DELALLOC |
1165 					     EXTENT_DO_ACCOUNTING |
1166 					     EXTENT_DEFRAG, PAGE_UNLOCK |
1167 					     PAGE_CLEAR_DIRTY |
1168 					     PAGE_SET_WRITEBACK |
1169 					     PAGE_END_WRITEBACK);
1170 		btrfs_free_path(path);
1171 		return PTR_ERR(trans);
1172 	}
1173 
1174 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1175 
1176 	cow_start = (u64)-1;
1177 	cur_offset = start;
1178 	while (1) {
1179 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
1180 					       cur_offset, 0);
1181 		if (ret < 0) {
1182 			btrfs_abort_transaction(trans, root, ret);
1183 			goto error;
1184 		}
1185 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1186 			leaf = path->nodes[0];
1187 			btrfs_item_key_to_cpu(leaf, &found_key,
1188 					      path->slots[0] - 1);
1189 			if (found_key.objectid == ino &&
1190 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1191 				path->slots[0]--;
1192 		}
1193 		check_prev = 0;
1194 next_slot:
1195 		leaf = path->nodes[0];
1196 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1197 			ret = btrfs_next_leaf(root, path);
1198 			if (ret < 0) {
1199 				btrfs_abort_transaction(trans, root, ret);
1200 				goto error;
1201 			}
1202 			if (ret > 0)
1203 				break;
1204 			leaf = path->nodes[0];
1205 		}
1206 
1207 		nocow = 0;
1208 		disk_bytenr = 0;
1209 		num_bytes = 0;
1210 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1211 
1212 		if (found_key.objectid > ino ||
1213 		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
1214 		    found_key.offset > end)
1215 			break;
1216 
1217 		if (found_key.offset > cur_offset) {
1218 			extent_end = found_key.offset;
1219 			extent_type = 0;
1220 			goto out_check;
1221 		}
1222 
1223 		fi = btrfs_item_ptr(leaf, path->slots[0],
1224 				    struct btrfs_file_extent_item);
1225 		extent_type = btrfs_file_extent_type(leaf, fi);
1226 
1227 		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1228 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1229 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1230 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1231 			extent_offset = btrfs_file_extent_offset(leaf, fi);
1232 			extent_end = found_key.offset +
1233 				btrfs_file_extent_num_bytes(leaf, fi);
1234 			disk_num_bytes =
1235 				btrfs_file_extent_disk_num_bytes(leaf, fi);
1236 			if (extent_end <= start) {
1237 				path->slots[0]++;
1238 				goto next_slot;
1239 			}
1240 			if (disk_bytenr == 0)
1241 				goto out_check;
1242 			if (btrfs_file_extent_compression(leaf, fi) ||
1243 			    btrfs_file_extent_encryption(leaf, fi) ||
1244 			    btrfs_file_extent_other_encoding(leaf, fi))
1245 				goto out_check;
1246 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1247 				goto out_check;
1248 			if (btrfs_extent_readonly(root, disk_bytenr))
1249 				goto out_check;
1250 			if (btrfs_cross_ref_exist(trans, root, ino,
1251 						  found_key.offset -
1252 						  extent_offset, disk_bytenr))
1253 				goto out_check;
1254 			disk_bytenr += extent_offset;
1255 			disk_bytenr += cur_offset - found_key.offset;
1256 			num_bytes = min(end + 1, extent_end) - cur_offset;
1257 			/*
1258 			 * force cow if csum exists in the range.
1259 			 * this ensure that csum for a given extent are
1260 			 * either valid or do not exist.
1261 			 */
1262 			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1263 				goto out_check;
1264 			nocow = 1;
1265 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1266 			extent_end = found_key.offset +
1267 				btrfs_file_extent_inline_len(leaf, fi);
1268 			extent_end = ALIGN(extent_end, root->sectorsize);
1269 		} else {
1270 			BUG_ON(1);
1271 		}
1272 out_check:
1273 		if (extent_end <= start) {
1274 			path->slots[0]++;
1275 			goto next_slot;
1276 		}
1277 		if (!nocow) {
1278 			if (cow_start == (u64)-1)
1279 				cow_start = cur_offset;
1280 			cur_offset = extent_end;
1281 			if (cur_offset > end)
1282 				break;
1283 			path->slots[0]++;
1284 			goto next_slot;
1285 		}
1286 
1287 		btrfs_release_path(path);
1288 		if (cow_start != (u64)-1) {
1289 			ret = cow_file_range(inode, locked_page,
1290 					     cow_start, found_key.offset - 1,
1291 					     page_started, nr_written, 1);
1292 			if (ret) {
1293 				btrfs_abort_transaction(trans, root, ret);
1294 				goto error;
1295 			}
1296 			cow_start = (u64)-1;
1297 		}
1298 
1299 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1300 			struct extent_map *em;
1301 			struct extent_map_tree *em_tree;
1302 			em_tree = &BTRFS_I(inode)->extent_tree;
1303 			em = alloc_extent_map();
1304 			BUG_ON(!em); /* -ENOMEM */
1305 			em->start = cur_offset;
1306 			em->orig_start = found_key.offset - extent_offset;
1307 			em->len = num_bytes;
1308 			em->block_len = num_bytes;
1309 			em->block_start = disk_bytenr;
1310 			em->orig_block_len = disk_num_bytes;
1311 			em->ram_bytes = ram_bytes;
1312 			em->bdev = root->fs_info->fs_devices->latest_bdev;
1313 			em->mod_start = em->start;
1314 			em->mod_len = em->len;
1315 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
1316 			set_bit(EXTENT_FLAG_FILLING, &em->flags);
1317 			em->generation = -1;
1318 			while (1) {
1319 				write_lock(&em_tree->lock);
1320 				ret = add_extent_mapping(em_tree, em, 1);
1321 				write_unlock(&em_tree->lock);
1322 				if (ret != -EEXIST) {
1323 					free_extent_map(em);
1324 					break;
1325 				}
1326 				btrfs_drop_extent_cache(inode, em->start,
1327 						em->start + em->len - 1, 0);
1328 			}
1329 			type = BTRFS_ORDERED_PREALLOC;
1330 		} else {
1331 			type = BTRFS_ORDERED_NOCOW;
1332 		}
1333 
1334 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1335 					       num_bytes, num_bytes, type);
1336 		BUG_ON(ret); /* -ENOMEM */
1337 
1338 		if (root->root_key.objectid ==
1339 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1340 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1341 						      num_bytes);
1342 			if (ret) {
1343 				btrfs_abort_transaction(trans, root, ret);
1344 				goto error;
1345 			}
1346 		}
1347 
1348 		extent_clear_unlock_delalloc(inode, cur_offset,
1349 					     cur_offset + num_bytes - 1,
1350 					     locked_page, EXTENT_LOCKED |
1351 					     EXTENT_DELALLOC, PAGE_UNLOCK |
1352 					     PAGE_SET_PRIVATE2);
1353 		cur_offset = extent_end;
1354 		if (cur_offset > end)
1355 			break;
1356 	}
1357 	btrfs_release_path(path);
1358 
1359 	if (cur_offset <= end && cow_start == (u64)-1) {
1360 		cow_start = cur_offset;
1361 		cur_offset = end;
1362 	}
1363 
1364 	if (cow_start != (u64)-1) {
1365 		ret = cow_file_range(inode, locked_page, cow_start, end,
1366 				     page_started, nr_written, 1);
1367 		if (ret) {
1368 			btrfs_abort_transaction(trans, root, ret);
1369 			goto error;
1370 		}
1371 	}
1372 
1373 error:
1374 	err = btrfs_end_transaction(trans, root);
1375 	if (!ret)
1376 		ret = err;
1377 
1378 	if (ret && cur_offset < end)
1379 		extent_clear_unlock_delalloc(inode, cur_offset, end,
1380 					     locked_page, EXTENT_LOCKED |
1381 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1382 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1383 					     PAGE_CLEAR_DIRTY |
1384 					     PAGE_SET_WRITEBACK |
1385 					     PAGE_END_WRITEBACK);
1386 	btrfs_free_path(path);
1387 	return ret;
1388 }
1389 
1390 /*
1391  * extent_io.c call back to do delayed allocation processing
1392  */
1393 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1394 			      u64 start, u64 end, int *page_started,
1395 			      unsigned long *nr_written)
1396 {
1397 	int ret;
1398 	struct btrfs_root *root = BTRFS_I(inode)->root;
1399 
1400 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1401 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1402 					 page_started, 1, nr_written);
1403 	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1404 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1405 					 page_started, 0, nr_written);
1406 	} else if (!btrfs_test_opt(root, COMPRESS) &&
1407 		   !(BTRFS_I(inode)->force_compress) &&
1408 		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1409 		ret = cow_file_range(inode, locked_page, start, end,
1410 				      page_started, nr_written, 1);
1411 	} else {
1412 		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1413 			&BTRFS_I(inode)->runtime_flags);
1414 		ret = cow_file_range_async(inode, locked_page, start, end,
1415 					   page_started, nr_written);
1416 	}
1417 	return ret;
1418 }
1419 
1420 static void btrfs_split_extent_hook(struct inode *inode,
1421 				    struct extent_state *orig, u64 split)
1422 {
1423 	/* not delalloc, ignore it */
1424 	if (!(orig->state & EXTENT_DELALLOC))
1425 		return;
1426 
1427 	spin_lock(&BTRFS_I(inode)->lock);
1428 	BTRFS_I(inode)->outstanding_extents++;
1429 	spin_unlock(&BTRFS_I(inode)->lock);
1430 }
1431 
1432 /*
1433  * extent_io.c merge_extent_hook, used to track merged delayed allocation
1434  * extents so we can keep track of new extents that are just merged onto old
1435  * extents, such as when we are doing sequential writes, so we can properly
1436  * account for the metadata space we'll need.
1437  */
1438 static void btrfs_merge_extent_hook(struct inode *inode,
1439 				    struct extent_state *new,
1440 				    struct extent_state *other)
1441 {
1442 	/* not delalloc, ignore it */
1443 	if (!(other->state & EXTENT_DELALLOC))
1444 		return;
1445 
1446 	spin_lock(&BTRFS_I(inode)->lock);
1447 	BTRFS_I(inode)->outstanding_extents--;
1448 	spin_unlock(&BTRFS_I(inode)->lock);
1449 }
1450 
1451 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1452 				      struct inode *inode)
1453 {
1454 	spin_lock(&root->delalloc_lock);
1455 	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1456 		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1457 			      &root->delalloc_inodes);
1458 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1459 			&BTRFS_I(inode)->runtime_flags);
1460 		root->nr_delalloc_inodes++;
1461 		if (root->nr_delalloc_inodes == 1) {
1462 			spin_lock(&root->fs_info->delalloc_root_lock);
1463 			BUG_ON(!list_empty(&root->delalloc_root));
1464 			list_add_tail(&root->delalloc_root,
1465 				      &root->fs_info->delalloc_roots);
1466 			spin_unlock(&root->fs_info->delalloc_root_lock);
1467 		}
1468 	}
1469 	spin_unlock(&root->delalloc_lock);
1470 }
1471 
1472 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1473 				     struct inode *inode)
1474 {
1475 	spin_lock(&root->delalloc_lock);
1476 	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1477 		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1478 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1479 			  &BTRFS_I(inode)->runtime_flags);
1480 		root->nr_delalloc_inodes--;
1481 		if (!root->nr_delalloc_inodes) {
1482 			spin_lock(&root->fs_info->delalloc_root_lock);
1483 			BUG_ON(list_empty(&root->delalloc_root));
1484 			list_del_init(&root->delalloc_root);
1485 			spin_unlock(&root->fs_info->delalloc_root_lock);
1486 		}
1487 	}
1488 	spin_unlock(&root->delalloc_lock);
1489 }
1490 
1491 /*
1492  * extent_io.c set_bit_hook, used to track delayed allocation
1493  * bytes in this file, and to maintain the list of inodes that
1494  * have pending delalloc work to be done.
1495  */
1496 static void btrfs_set_bit_hook(struct inode *inode,
1497 			       struct extent_state *state, unsigned long *bits)
1498 {
1499 
1500 	/*
1501 	 * set_bit and clear bit hooks normally require _irqsave/restore
1502 	 * but in this case, we are only testing for the DELALLOC
1503 	 * bit, which is only set or cleared with irqs on
1504 	 */
1505 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1506 		struct btrfs_root *root = BTRFS_I(inode)->root;
1507 		u64 len = state->end + 1 - state->start;
1508 		bool do_list = !btrfs_is_free_space_inode(inode);
1509 
1510 		if (*bits & EXTENT_FIRST_DELALLOC) {
1511 			*bits &= ~EXTENT_FIRST_DELALLOC;
1512 		} else {
1513 			spin_lock(&BTRFS_I(inode)->lock);
1514 			BTRFS_I(inode)->outstanding_extents++;
1515 			spin_unlock(&BTRFS_I(inode)->lock);
1516 		}
1517 
1518 		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1519 				     root->fs_info->delalloc_batch);
1520 		spin_lock(&BTRFS_I(inode)->lock);
1521 		BTRFS_I(inode)->delalloc_bytes += len;
1522 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1523 					 &BTRFS_I(inode)->runtime_flags))
1524 			btrfs_add_delalloc_inodes(root, inode);
1525 		spin_unlock(&BTRFS_I(inode)->lock);
1526 	}
1527 }
1528 
1529 /*
1530  * extent_io.c clear_bit_hook, see set_bit_hook for why
1531  */
1532 static void btrfs_clear_bit_hook(struct inode *inode,
1533 				 struct extent_state *state,
1534 				 unsigned long *bits)
1535 {
1536 	/*
1537 	 * set_bit and clear bit hooks normally require _irqsave/restore
1538 	 * but in this case, we are only testing for the DELALLOC
1539 	 * bit, which is only set or cleared with irqs on
1540 	 */
1541 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1542 		struct btrfs_root *root = BTRFS_I(inode)->root;
1543 		u64 len = state->end + 1 - state->start;
1544 		bool do_list = !btrfs_is_free_space_inode(inode);
1545 
1546 		if (*bits & EXTENT_FIRST_DELALLOC) {
1547 			*bits &= ~EXTENT_FIRST_DELALLOC;
1548 		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1549 			spin_lock(&BTRFS_I(inode)->lock);
1550 			BTRFS_I(inode)->outstanding_extents--;
1551 			spin_unlock(&BTRFS_I(inode)->lock);
1552 		}
1553 
1554 		if (*bits & EXTENT_DO_ACCOUNTING)
1555 			btrfs_delalloc_release_metadata(inode, len);
1556 
1557 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1558 		    && do_list && !(state->state & EXTENT_NORESERVE))
1559 			btrfs_free_reserved_data_space(inode, len);
1560 
1561 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1562 				     root->fs_info->delalloc_batch);
1563 		spin_lock(&BTRFS_I(inode)->lock);
1564 		BTRFS_I(inode)->delalloc_bytes -= len;
1565 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1566 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1567 			     &BTRFS_I(inode)->runtime_flags))
1568 			btrfs_del_delalloc_inode(root, inode);
1569 		spin_unlock(&BTRFS_I(inode)->lock);
1570 	}
1571 }
1572 
1573 /*
1574  * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
1575  * we don't create bios that span stripes or chunks
1576  */
1577 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1578 			 size_t size, struct bio *bio,
1579 			 unsigned long bio_flags)
1580 {
1581 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1582 	u64 logical = (u64)bio->bi_sector << 9;
1583 	u64 length = 0;
1584 	u64 map_length;
1585 	int ret;
1586 
1587 	if (bio_flags & EXTENT_BIO_COMPRESSED)
1588 		return 0;
1589 
1590 	length = bio->bi_size;
1591 	map_length = length;
1592 	ret = btrfs_map_block(root->fs_info, rw, logical,
1593 			      &map_length, NULL, 0);
1594 	/* Will always return 0 with map_multi == NULL */
1595 	BUG_ON(ret < 0);
1596 	if (map_length < length + size)
1597 		return 1;
1598 	return 0;
1599 }
1600 
1601 /*
1602  * in order to insert checksums into the metadata in large chunks,
1603  * we wait until bio submission time.   All the pages in the bio are
1604  * checksummed and sums are attached onto the ordered extent record.
1605  *
1606  * At IO completion time the cums attached on the ordered extent record
1607  * are inserted into the btree
1608  */
1609 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1610 				    struct bio *bio, int mirror_num,
1611 				    unsigned long bio_flags,
1612 				    u64 bio_offset)
1613 {
1614 	struct btrfs_root *root = BTRFS_I(inode)->root;
1615 	int ret = 0;
1616 
1617 	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1618 	BUG_ON(ret); /* -ENOMEM */
1619 	return 0;
1620 }
1621 
1622 /*
1623  * in order to insert checksums into the metadata in large chunks,
1624  * we wait until bio submission time.   All the pages in the bio are
1625  * checksummed and sums are attached onto the ordered extent record.
1626  *
1627  * At IO completion time the cums attached on the ordered extent record
1628  * are inserted into the btree
1629  */
1630 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1631 			  int mirror_num, unsigned long bio_flags,
1632 			  u64 bio_offset)
1633 {
1634 	struct btrfs_root *root = BTRFS_I(inode)->root;
1635 	int ret;
1636 
1637 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1638 	if (ret)
1639 		bio_endio(bio, ret);
1640 	return ret;
1641 }
1642 
1643 /*
1644  * extent_io.c submission hook. This does the right thing for csum calculation
1645  * on write, or reading the csums from the tree before a read
1646  */
1647 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1648 			  int mirror_num, unsigned long bio_flags,
1649 			  u64 bio_offset)
1650 {
1651 	struct btrfs_root *root = BTRFS_I(inode)->root;
1652 	int ret = 0;
1653 	int skip_sum;
1654 	int metadata = 0;
1655 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1656 
1657 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1658 
1659 	if (btrfs_is_free_space_inode(inode))
1660 		metadata = 2;
1661 
1662 	if (!(rw & REQ_WRITE)) {
1663 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1664 		if (ret)
1665 			goto out;
1666 
1667 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1668 			ret = btrfs_submit_compressed_read(inode, bio,
1669 							   mirror_num,
1670 							   bio_flags);
1671 			goto out;
1672 		} else if (!skip_sum) {
1673 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1674 			if (ret)
1675 				goto out;
1676 		}
1677 		goto mapit;
1678 	} else if (async && !skip_sum) {
1679 		/* csum items have already been cloned */
1680 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1681 			goto mapit;
1682 		/* we're doing a write, do the async checksumming */
1683 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1684 				   inode, rw, bio, mirror_num,
1685 				   bio_flags, bio_offset,
1686 				   __btrfs_submit_bio_start,
1687 				   __btrfs_submit_bio_done);
1688 		goto out;
1689 	} else if (!skip_sum) {
1690 		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1691 		if (ret)
1692 			goto out;
1693 	}
1694 
1695 mapit:
1696 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1697 
1698 out:
1699 	if (ret < 0)
1700 		bio_endio(bio, ret);
1701 	return ret;
1702 }
1703 
1704 /*
1705  * given a list of ordered sums record them in the inode.  This happens
1706  * at IO completion time based on sums calculated at bio submission time.
1707  */
1708 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1709 			     struct inode *inode, u64 file_offset,
1710 			     struct list_head *list)
1711 {
1712 	struct btrfs_ordered_sum *sum;
1713 
1714 	list_for_each_entry(sum, list, list) {
1715 		trans->adding_csums = 1;
1716 		btrfs_csum_file_blocks(trans,
1717 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1718 		trans->adding_csums = 0;
1719 	}
1720 	return 0;
1721 }
1722 
1723 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1724 			      struct extent_state **cached_state)
1725 {
1726 	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1727 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1728 				   cached_state, GFP_NOFS);
1729 }
1730 
1731 /* see btrfs_writepage_start_hook for details on why this is required */
1732 struct btrfs_writepage_fixup {
1733 	struct page *page;
1734 	struct btrfs_work work;
1735 };
1736 
1737 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1738 {
1739 	struct btrfs_writepage_fixup *fixup;
1740 	struct btrfs_ordered_extent *ordered;
1741 	struct extent_state *cached_state = NULL;
1742 	struct page *page;
1743 	struct inode *inode;
1744 	u64 page_start;
1745 	u64 page_end;
1746 	int ret;
1747 
1748 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
1749 	page = fixup->page;
1750 again:
1751 	lock_page(page);
1752 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1753 		ClearPageChecked(page);
1754 		goto out_page;
1755 	}
1756 
1757 	inode = page->mapping->host;
1758 	page_start = page_offset(page);
1759 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1760 
1761 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1762 			 &cached_state);
1763 
1764 	/* already ordered? We're done */
1765 	if (PagePrivate2(page))
1766 		goto out;
1767 
1768 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
1769 	if (ordered) {
1770 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1771 				     page_end, &cached_state, GFP_NOFS);
1772 		unlock_page(page);
1773 		btrfs_start_ordered_extent(inode, ordered, 1);
1774 		btrfs_put_ordered_extent(ordered);
1775 		goto again;
1776 	}
1777 
1778 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1779 	if (ret) {
1780 		mapping_set_error(page->mapping, ret);
1781 		end_extent_writepage(page, ret, page_start, page_end);
1782 		ClearPageChecked(page);
1783 		goto out;
1784 	 }
1785 
1786 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1787 	ClearPageChecked(page);
1788 	set_page_dirty(page);
1789 out:
1790 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1791 			     &cached_state, GFP_NOFS);
1792 out_page:
1793 	unlock_page(page);
1794 	page_cache_release(page);
1795 	kfree(fixup);
1796 }
1797 
1798 /*
1799  * There are a few paths in the higher layers of the kernel that directly
1800  * set the page dirty bit without asking the filesystem if it is a
1801  * good idea.  This causes problems because we want to make sure COW
1802  * properly happens and the data=ordered rules are followed.
1803  *
1804  * In our case any range that doesn't have the ORDERED bit set
1805  * hasn't been properly setup for IO.  We kick off an async process
1806  * to fix it up.  The async helper will wait for ordered extents, set
1807  * the delalloc bit and make it safe to write the page.
1808  */
1809 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1810 {
1811 	struct inode *inode = page->mapping->host;
1812 	struct btrfs_writepage_fixup *fixup;
1813 	struct btrfs_root *root = BTRFS_I(inode)->root;
1814 
1815 	/* this page is properly in the ordered list */
1816 	if (TestClearPagePrivate2(page))
1817 		return 0;
1818 
1819 	if (PageChecked(page))
1820 		return -EAGAIN;
1821 
1822 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1823 	if (!fixup)
1824 		return -EAGAIN;
1825 
1826 	SetPageChecked(page);
1827 	page_cache_get(page);
1828 	fixup->work.func = btrfs_writepage_fixup_worker;
1829 	fixup->page = page;
1830 	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1831 	return -EBUSY;
1832 }
1833 
1834 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1835 				       struct inode *inode, u64 file_pos,
1836 				       u64 disk_bytenr, u64 disk_num_bytes,
1837 				       u64 num_bytes, u64 ram_bytes,
1838 				       u8 compression, u8 encryption,
1839 				       u16 other_encoding, int extent_type)
1840 {
1841 	struct btrfs_root *root = BTRFS_I(inode)->root;
1842 	struct btrfs_file_extent_item *fi;
1843 	struct btrfs_path *path;
1844 	struct extent_buffer *leaf;
1845 	struct btrfs_key ins;
1846 	int ret;
1847 
1848 	path = btrfs_alloc_path();
1849 	if (!path)
1850 		return -ENOMEM;
1851 
1852 	path->leave_spinning = 1;
1853 
1854 	/*
1855 	 * we may be replacing one extent in the tree with another.
1856 	 * The new extent is pinned in the extent map, and we don't want
1857 	 * to drop it from the cache until it is completely in the btree.
1858 	 *
1859 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
1860 	 * the caller is expected to unpin it and allow it to be merged
1861 	 * with the others.
1862 	 */
1863 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
1864 				 file_pos + num_bytes, 0);
1865 	if (ret)
1866 		goto out;
1867 
1868 	ins.objectid = btrfs_ino(inode);
1869 	ins.offset = file_pos;
1870 	ins.type = BTRFS_EXTENT_DATA_KEY;
1871 	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1872 	if (ret)
1873 		goto out;
1874 	leaf = path->nodes[0];
1875 	fi = btrfs_item_ptr(leaf, path->slots[0],
1876 			    struct btrfs_file_extent_item);
1877 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1878 	btrfs_set_file_extent_type(leaf, fi, extent_type);
1879 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1880 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1881 	btrfs_set_file_extent_offset(leaf, fi, 0);
1882 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1883 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1884 	btrfs_set_file_extent_compression(leaf, fi, compression);
1885 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
1886 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1887 
1888 	btrfs_mark_buffer_dirty(leaf);
1889 	btrfs_release_path(path);
1890 
1891 	inode_add_bytes(inode, num_bytes);
1892 
1893 	ins.objectid = disk_bytenr;
1894 	ins.offset = disk_num_bytes;
1895 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1896 	ret = btrfs_alloc_reserved_file_extent(trans, root,
1897 					root->root_key.objectid,
1898 					btrfs_ino(inode), file_pos, &ins);
1899 out:
1900 	btrfs_free_path(path);
1901 
1902 	return ret;
1903 }
1904 
1905 /* snapshot-aware defrag */
1906 struct sa_defrag_extent_backref {
1907 	struct rb_node node;
1908 	struct old_sa_defrag_extent *old;
1909 	u64 root_id;
1910 	u64 inum;
1911 	u64 file_pos;
1912 	u64 extent_offset;
1913 	u64 num_bytes;
1914 	u64 generation;
1915 };
1916 
1917 struct old_sa_defrag_extent {
1918 	struct list_head list;
1919 	struct new_sa_defrag_extent *new;
1920 
1921 	u64 extent_offset;
1922 	u64 bytenr;
1923 	u64 offset;
1924 	u64 len;
1925 	int count;
1926 };
1927 
1928 struct new_sa_defrag_extent {
1929 	struct rb_root root;
1930 	struct list_head head;
1931 	struct btrfs_path *path;
1932 	struct inode *inode;
1933 	u64 file_pos;
1934 	u64 len;
1935 	u64 bytenr;
1936 	u64 disk_len;
1937 	u8 compress_type;
1938 };
1939 
1940 static int backref_comp(struct sa_defrag_extent_backref *b1,
1941 			struct sa_defrag_extent_backref *b2)
1942 {
1943 	if (b1->root_id < b2->root_id)
1944 		return -1;
1945 	else if (b1->root_id > b2->root_id)
1946 		return 1;
1947 
1948 	if (b1->inum < b2->inum)
1949 		return -1;
1950 	else if (b1->inum > b2->inum)
1951 		return 1;
1952 
1953 	if (b1->file_pos < b2->file_pos)
1954 		return -1;
1955 	else if (b1->file_pos > b2->file_pos)
1956 		return 1;
1957 
1958 	/*
1959 	 * [------------------------------] ===> (a range of space)
1960 	 *     |<--->|   |<---->| =============> (fs/file tree A)
1961 	 * |<---------------------------->| ===> (fs/file tree B)
1962 	 *
1963 	 * A range of space can refer to two file extents in one tree while
1964 	 * refer to only one file extent in another tree.
1965 	 *
1966 	 * So we may process a disk offset more than one time(two extents in A)
1967 	 * and locate at the same extent(one extent in B), then insert two same
1968 	 * backrefs(both refer to the extent in B).
1969 	 */
1970 	return 0;
1971 }
1972 
1973 static void backref_insert(struct rb_root *root,
1974 			   struct sa_defrag_extent_backref *backref)
1975 {
1976 	struct rb_node **p = &root->rb_node;
1977 	struct rb_node *parent = NULL;
1978 	struct sa_defrag_extent_backref *entry;
1979 	int ret;
1980 
1981 	while (*p) {
1982 		parent = *p;
1983 		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
1984 
1985 		ret = backref_comp(backref, entry);
1986 		if (ret < 0)
1987 			p = &(*p)->rb_left;
1988 		else
1989 			p = &(*p)->rb_right;
1990 	}
1991 
1992 	rb_link_node(&backref->node, parent, p);
1993 	rb_insert_color(&backref->node, root);
1994 }
1995 
1996 /*
1997  * Note the backref might has changed, and in this case we just return 0.
1998  */
1999 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2000 				       void *ctx)
2001 {
2002 	struct btrfs_file_extent_item *extent;
2003 	struct btrfs_fs_info *fs_info;
2004 	struct old_sa_defrag_extent *old = ctx;
2005 	struct new_sa_defrag_extent *new = old->new;
2006 	struct btrfs_path *path = new->path;
2007 	struct btrfs_key key;
2008 	struct btrfs_root *root;
2009 	struct sa_defrag_extent_backref *backref;
2010 	struct extent_buffer *leaf;
2011 	struct inode *inode = new->inode;
2012 	int slot;
2013 	int ret;
2014 	u64 extent_offset;
2015 	u64 num_bytes;
2016 
2017 	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2018 	    inum == btrfs_ino(inode))
2019 		return 0;
2020 
2021 	key.objectid = root_id;
2022 	key.type = BTRFS_ROOT_ITEM_KEY;
2023 	key.offset = (u64)-1;
2024 
2025 	fs_info = BTRFS_I(inode)->root->fs_info;
2026 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2027 	if (IS_ERR(root)) {
2028 		if (PTR_ERR(root) == -ENOENT)
2029 			return 0;
2030 		WARN_ON(1);
2031 		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2032 			 inum, offset, root_id);
2033 		return PTR_ERR(root);
2034 	}
2035 
2036 	key.objectid = inum;
2037 	key.type = BTRFS_EXTENT_DATA_KEY;
2038 	if (offset > (u64)-1 << 32)
2039 		key.offset = 0;
2040 	else
2041 		key.offset = offset;
2042 
2043 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2044 	if (ret < 0) {
2045 		WARN_ON(1);
2046 		return ret;
2047 	}
2048 	ret = 0;
2049 
2050 	while (1) {
2051 		cond_resched();
2052 
2053 		leaf = path->nodes[0];
2054 		slot = path->slots[0];
2055 
2056 		if (slot >= btrfs_header_nritems(leaf)) {
2057 			ret = btrfs_next_leaf(root, path);
2058 			if (ret < 0) {
2059 				goto out;
2060 			} else if (ret > 0) {
2061 				ret = 0;
2062 				goto out;
2063 			}
2064 			continue;
2065 		}
2066 
2067 		path->slots[0]++;
2068 
2069 		btrfs_item_key_to_cpu(leaf, &key, slot);
2070 
2071 		if (key.objectid > inum)
2072 			goto out;
2073 
2074 		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2075 			continue;
2076 
2077 		extent = btrfs_item_ptr(leaf, slot,
2078 					struct btrfs_file_extent_item);
2079 
2080 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2081 			continue;
2082 
2083 		/*
2084 		 * 'offset' refers to the exact key.offset,
2085 		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2086 		 * (key.offset - extent_offset).
2087 		 */
2088 		if (key.offset != offset)
2089 			continue;
2090 
2091 		extent_offset = btrfs_file_extent_offset(leaf, extent);
2092 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2093 
2094 		if (extent_offset >= old->extent_offset + old->offset +
2095 		    old->len || extent_offset + num_bytes <=
2096 		    old->extent_offset + old->offset)
2097 			continue;
2098 		break;
2099 	}
2100 
2101 	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2102 	if (!backref) {
2103 		ret = -ENOENT;
2104 		goto out;
2105 	}
2106 
2107 	backref->root_id = root_id;
2108 	backref->inum = inum;
2109 	backref->file_pos = offset;
2110 	backref->num_bytes = num_bytes;
2111 	backref->extent_offset = extent_offset;
2112 	backref->generation = btrfs_file_extent_generation(leaf, extent);
2113 	backref->old = old;
2114 	backref_insert(&new->root, backref);
2115 	old->count++;
2116 out:
2117 	btrfs_release_path(path);
2118 	WARN_ON(ret);
2119 	return ret;
2120 }
2121 
2122 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2123 				   struct new_sa_defrag_extent *new)
2124 {
2125 	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2126 	struct old_sa_defrag_extent *old, *tmp;
2127 	int ret;
2128 
2129 	new->path = path;
2130 
2131 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2132 		ret = iterate_inodes_from_logical(old->bytenr +
2133 						  old->extent_offset, fs_info,
2134 						  path, record_one_backref,
2135 						  old);
2136 		BUG_ON(ret < 0 && ret != -ENOENT);
2137 
2138 		/* no backref to be processed for this extent */
2139 		if (!old->count) {
2140 			list_del(&old->list);
2141 			kfree(old);
2142 		}
2143 	}
2144 
2145 	if (list_empty(&new->head))
2146 		return false;
2147 
2148 	return true;
2149 }
2150 
2151 static int relink_is_mergable(struct extent_buffer *leaf,
2152 			      struct btrfs_file_extent_item *fi,
2153 			      struct new_sa_defrag_extent *new)
2154 {
2155 	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2156 		return 0;
2157 
2158 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2159 		return 0;
2160 
2161 	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2162 		return 0;
2163 
2164 	if (btrfs_file_extent_encryption(leaf, fi) ||
2165 	    btrfs_file_extent_other_encoding(leaf, fi))
2166 		return 0;
2167 
2168 	return 1;
2169 }
2170 
2171 /*
2172  * Note the backref might has changed, and in this case we just return 0.
2173  */
2174 static noinline int relink_extent_backref(struct btrfs_path *path,
2175 				 struct sa_defrag_extent_backref *prev,
2176 				 struct sa_defrag_extent_backref *backref)
2177 {
2178 	struct btrfs_file_extent_item *extent;
2179 	struct btrfs_file_extent_item *item;
2180 	struct btrfs_ordered_extent *ordered;
2181 	struct btrfs_trans_handle *trans;
2182 	struct btrfs_fs_info *fs_info;
2183 	struct btrfs_root *root;
2184 	struct btrfs_key key;
2185 	struct extent_buffer *leaf;
2186 	struct old_sa_defrag_extent *old = backref->old;
2187 	struct new_sa_defrag_extent *new = old->new;
2188 	struct inode *src_inode = new->inode;
2189 	struct inode *inode;
2190 	struct extent_state *cached = NULL;
2191 	int ret = 0;
2192 	u64 start;
2193 	u64 len;
2194 	u64 lock_start;
2195 	u64 lock_end;
2196 	bool merge = false;
2197 	int index;
2198 
2199 	if (prev && prev->root_id == backref->root_id &&
2200 	    prev->inum == backref->inum &&
2201 	    prev->file_pos + prev->num_bytes == backref->file_pos)
2202 		merge = true;
2203 
2204 	/* step 1: get root */
2205 	key.objectid = backref->root_id;
2206 	key.type = BTRFS_ROOT_ITEM_KEY;
2207 	key.offset = (u64)-1;
2208 
2209 	fs_info = BTRFS_I(src_inode)->root->fs_info;
2210 	index = srcu_read_lock(&fs_info->subvol_srcu);
2211 
2212 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2213 	if (IS_ERR(root)) {
2214 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2215 		if (PTR_ERR(root) == -ENOENT)
2216 			return 0;
2217 		return PTR_ERR(root);
2218 	}
2219 
2220 	/* step 2: get inode */
2221 	key.objectid = backref->inum;
2222 	key.type = BTRFS_INODE_ITEM_KEY;
2223 	key.offset = 0;
2224 
2225 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2226 	if (IS_ERR(inode)) {
2227 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2228 		return 0;
2229 	}
2230 
2231 	srcu_read_unlock(&fs_info->subvol_srcu, index);
2232 
2233 	/* step 3: relink backref */
2234 	lock_start = backref->file_pos;
2235 	lock_end = backref->file_pos + backref->num_bytes - 1;
2236 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2237 			 0, &cached);
2238 
2239 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2240 	if (ordered) {
2241 		btrfs_put_ordered_extent(ordered);
2242 		goto out_unlock;
2243 	}
2244 
2245 	trans = btrfs_join_transaction(root);
2246 	if (IS_ERR(trans)) {
2247 		ret = PTR_ERR(trans);
2248 		goto out_unlock;
2249 	}
2250 
2251 	key.objectid = backref->inum;
2252 	key.type = BTRFS_EXTENT_DATA_KEY;
2253 	key.offset = backref->file_pos;
2254 
2255 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2256 	if (ret < 0) {
2257 		goto out_free_path;
2258 	} else if (ret > 0) {
2259 		ret = 0;
2260 		goto out_free_path;
2261 	}
2262 
2263 	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2264 				struct btrfs_file_extent_item);
2265 
2266 	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2267 	    backref->generation)
2268 		goto out_free_path;
2269 
2270 	btrfs_release_path(path);
2271 
2272 	start = backref->file_pos;
2273 	if (backref->extent_offset < old->extent_offset + old->offset)
2274 		start += old->extent_offset + old->offset -
2275 			 backref->extent_offset;
2276 
2277 	len = min(backref->extent_offset + backref->num_bytes,
2278 		  old->extent_offset + old->offset + old->len);
2279 	len -= max(backref->extent_offset, old->extent_offset + old->offset);
2280 
2281 	ret = btrfs_drop_extents(trans, root, inode, start,
2282 				 start + len, 1);
2283 	if (ret)
2284 		goto out_free_path;
2285 again:
2286 	key.objectid = btrfs_ino(inode);
2287 	key.type = BTRFS_EXTENT_DATA_KEY;
2288 	key.offset = start;
2289 
2290 	path->leave_spinning = 1;
2291 	if (merge) {
2292 		struct btrfs_file_extent_item *fi;
2293 		u64 extent_len;
2294 		struct btrfs_key found_key;
2295 
2296 		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2297 		if (ret < 0)
2298 			goto out_free_path;
2299 
2300 		path->slots[0]--;
2301 		leaf = path->nodes[0];
2302 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2303 
2304 		fi = btrfs_item_ptr(leaf, path->slots[0],
2305 				    struct btrfs_file_extent_item);
2306 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2307 
2308 		if (extent_len + found_key.offset == start &&
2309 		    relink_is_mergable(leaf, fi, new)) {
2310 			btrfs_set_file_extent_num_bytes(leaf, fi,
2311 							extent_len + len);
2312 			btrfs_mark_buffer_dirty(leaf);
2313 			inode_add_bytes(inode, len);
2314 
2315 			ret = 1;
2316 			goto out_free_path;
2317 		} else {
2318 			merge = false;
2319 			btrfs_release_path(path);
2320 			goto again;
2321 		}
2322 	}
2323 
2324 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2325 					sizeof(*extent));
2326 	if (ret) {
2327 		btrfs_abort_transaction(trans, root, ret);
2328 		goto out_free_path;
2329 	}
2330 
2331 	leaf = path->nodes[0];
2332 	item = btrfs_item_ptr(leaf, path->slots[0],
2333 				struct btrfs_file_extent_item);
2334 	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2335 	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2336 	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2337 	btrfs_set_file_extent_num_bytes(leaf, item, len);
2338 	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2339 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2340 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2341 	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2342 	btrfs_set_file_extent_encryption(leaf, item, 0);
2343 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2344 
2345 	btrfs_mark_buffer_dirty(leaf);
2346 	inode_add_bytes(inode, len);
2347 	btrfs_release_path(path);
2348 
2349 	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2350 			new->disk_len, 0,
2351 			backref->root_id, backref->inum,
2352 			new->file_pos, 0);	/* start - extent_offset */
2353 	if (ret) {
2354 		btrfs_abort_transaction(trans, root, ret);
2355 		goto out_free_path;
2356 	}
2357 
2358 	ret = 1;
2359 out_free_path:
2360 	btrfs_release_path(path);
2361 	path->leave_spinning = 0;
2362 	btrfs_end_transaction(trans, root);
2363 out_unlock:
2364 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2365 			     &cached, GFP_NOFS);
2366 	iput(inode);
2367 	return ret;
2368 }
2369 
2370 static void relink_file_extents(struct new_sa_defrag_extent *new)
2371 {
2372 	struct btrfs_path *path;
2373 	struct old_sa_defrag_extent *old, *tmp;
2374 	struct sa_defrag_extent_backref *backref;
2375 	struct sa_defrag_extent_backref *prev = NULL;
2376 	struct inode *inode;
2377 	struct btrfs_root *root;
2378 	struct rb_node *node;
2379 	int ret;
2380 
2381 	inode = new->inode;
2382 	root = BTRFS_I(inode)->root;
2383 
2384 	path = btrfs_alloc_path();
2385 	if (!path)
2386 		return;
2387 
2388 	if (!record_extent_backrefs(path, new)) {
2389 		btrfs_free_path(path);
2390 		goto out;
2391 	}
2392 	btrfs_release_path(path);
2393 
2394 	while (1) {
2395 		node = rb_first(&new->root);
2396 		if (!node)
2397 			break;
2398 		rb_erase(node, &new->root);
2399 
2400 		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2401 
2402 		ret = relink_extent_backref(path, prev, backref);
2403 		WARN_ON(ret < 0);
2404 
2405 		kfree(prev);
2406 
2407 		if (ret == 1)
2408 			prev = backref;
2409 		else
2410 			prev = NULL;
2411 		cond_resched();
2412 	}
2413 	kfree(prev);
2414 
2415 	btrfs_free_path(path);
2416 
2417 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2418 		list_del(&old->list);
2419 		kfree(old);
2420 	}
2421 out:
2422 	atomic_dec(&root->fs_info->defrag_running);
2423 	wake_up(&root->fs_info->transaction_wait);
2424 
2425 	kfree(new);
2426 }
2427 
2428 static struct new_sa_defrag_extent *
2429 record_old_file_extents(struct inode *inode,
2430 			struct btrfs_ordered_extent *ordered)
2431 {
2432 	struct btrfs_root *root = BTRFS_I(inode)->root;
2433 	struct btrfs_path *path;
2434 	struct btrfs_key key;
2435 	struct old_sa_defrag_extent *old, *tmp;
2436 	struct new_sa_defrag_extent *new;
2437 	int ret;
2438 
2439 	new = kmalloc(sizeof(*new), GFP_NOFS);
2440 	if (!new)
2441 		return NULL;
2442 
2443 	new->inode = inode;
2444 	new->file_pos = ordered->file_offset;
2445 	new->len = ordered->len;
2446 	new->bytenr = ordered->start;
2447 	new->disk_len = ordered->disk_len;
2448 	new->compress_type = ordered->compress_type;
2449 	new->root = RB_ROOT;
2450 	INIT_LIST_HEAD(&new->head);
2451 
2452 	path = btrfs_alloc_path();
2453 	if (!path)
2454 		goto out_kfree;
2455 
2456 	key.objectid = btrfs_ino(inode);
2457 	key.type = BTRFS_EXTENT_DATA_KEY;
2458 	key.offset = new->file_pos;
2459 
2460 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2461 	if (ret < 0)
2462 		goto out_free_path;
2463 	if (ret > 0 && path->slots[0] > 0)
2464 		path->slots[0]--;
2465 
2466 	/* find out all the old extents for the file range */
2467 	while (1) {
2468 		struct btrfs_file_extent_item *extent;
2469 		struct extent_buffer *l;
2470 		int slot;
2471 		u64 num_bytes;
2472 		u64 offset;
2473 		u64 end;
2474 		u64 disk_bytenr;
2475 		u64 extent_offset;
2476 
2477 		l = path->nodes[0];
2478 		slot = path->slots[0];
2479 
2480 		if (slot >= btrfs_header_nritems(l)) {
2481 			ret = btrfs_next_leaf(root, path);
2482 			if (ret < 0)
2483 				goto out_free_list;
2484 			else if (ret > 0)
2485 				break;
2486 			continue;
2487 		}
2488 
2489 		btrfs_item_key_to_cpu(l, &key, slot);
2490 
2491 		if (key.objectid != btrfs_ino(inode))
2492 			break;
2493 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2494 			break;
2495 		if (key.offset >= new->file_pos + new->len)
2496 			break;
2497 
2498 		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2499 
2500 		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2501 		if (key.offset + num_bytes < new->file_pos)
2502 			goto next;
2503 
2504 		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2505 		if (!disk_bytenr)
2506 			goto next;
2507 
2508 		extent_offset = btrfs_file_extent_offset(l, extent);
2509 
2510 		old = kmalloc(sizeof(*old), GFP_NOFS);
2511 		if (!old)
2512 			goto out_free_list;
2513 
2514 		offset = max(new->file_pos, key.offset);
2515 		end = min(new->file_pos + new->len, key.offset + num_bytes);
2516 
2517 		old->bytenr = disk_bytenr;
2518 		old->extent_offset = extent_offset;
2519 		old->offset = offset - key.offset;
2520 		old->len = end - offset;
2521 		old->new = new;
2522 		old->count = 0;
2523 		list_add_tail(&old->list, &new->head);
2524 next:
2525 		path->slots[0]++;
2526 		cond_resched();
2527 	}
2528 
2529 	btrfs_free_path(path);
2530 	atomic_inc(&root->fs_info->defrag_running);
2531 
2532 	return new;
2533 
2534 out_free_list:
2535 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2536 		list_del(&old->list);
2537 		kfree(old);
2538 	}
2539 out_free_path:
2540 	btrfs_free_path(path);
2541 out_kfree:
2542 	kfree(new);
2543 	return NULL;
2544 }
2545 
2546 /*
2547  * helper function for btrfs_finish_ordered_io, this
2548  * just reads in some of the csum leaves to prime them into ram
2549  * before we start the transaction.  It limits the amount of btree
2550  * reads required while inside the transaction.
2551  */
2552 /* as ordered data IO finishes, this gets called so we can finish
2553  * an ordered extent if the range of bytes in the file it covers are
2554  * fully written.
2555  */
2556 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2557 {
2558 	struct inode *inode = ordered_extent->inode;
2559 	struct btrfs_root *root = BTRFS_I(inode)->root;
2560 	struct btrfs_trans_handle *trans = NULL;
2561 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2562 	struct extent_state *cached_state = NULL;
2563 	struct new_sa_defrag_extent *new = NULL;
2564 	int compress_type = 0;
2565 	int ret;
2566 	bool nolock;
2567 
2568 	nolock = btrfs_is_free_space_inode(inode);
2569 
2570 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2571 		ret = -EIO;
2572 		goto out;
2573 	}
2574 
2575 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2576 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2577 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2578 		if (nolock)
2579 			trans = btrfs_join_transaction_nolock(root);
2580 		else
2581 			trans = btrfs_join_transaction(root);
2582 		if (IS_ERR(trans)) {
2583 			ret = PTR_ERR(trans);
2584 			trans = NULL;
2585 			goto out;
2586 		}
2587 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2588 		ret = btrfs_update_inode_fallback(trans, root, inode);
2589 		if (ret) /* -ENOMEM or corruption */
2590 			btrfs_abort_transaction(trans, root, ret);
2591 		goto out;
2592 	}
2593 
2594 	lock_extent_bits(io_tree, ordered_extent->file_offset,
2595 			 ordered_extent->file_offset + ordered_extent->len - 1,
2596 			 0, &cached_state);
2597 
2598 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
2599 			ordered_extent->file_offset + ordered_extent->len - 1,
2600 			EXTENT_DEFRAG, 1, cached_state);
2601 	if (ret) {
2602 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2603 		if (last_snapshot >= BTRFS_I(inode)->generation)
2604 			/* the inode is shared */
2605 			new = record_old_file_extents(inode, ordered_extent);
2606 
2607 		clear_extent_bit(io_tree, ordered_extent->file_offset,
2608 			ordered_extent->file_offset + ordered_extent->len - 1,
2609 			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2610 	}
2611 
2612 	if (nolock)
2613 		trans = btrfs_join_transaction_nolock(root);
2614 	else
2615 		trans = btrfs_join_transaction(root);
2616 	if (IS_ERR(trans)) {
2617 		ret = PTR_ERR(trans);
2618 		trans = NULL;
2619 		goto out_unlock;
2620 	}
2621 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2622 
2623 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2624 		compress_type = ordered_extent->compress_type;
2625 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2626 		BUG_ON(compress_type);
2627 		ret = btrfs_mark_extent_written(trans, inode,
2628 						ordered_extent->file_offset,
2629 						ordered_extent->file_offset +
2630 						ordered_extent->len);
2631 	} else {
2632 		BUG_ON(root == root->fs_info->tree_root);
2633 		ret = insert_reserved_file_extent(trans, inode,
2634 						ordered_extent->file_offset,
2635 						ordered_extent->start,
2636 						ordered_extent->disk_len,
2637 						ordered_extent->len,
2638 						ordered_extent->len,
2639 						compress_type, 0, 0,
2640 						BTRFS_FILE_EXTENT_REG);
2641 	}
2642 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2643 			   ordered_extent->file_offset, ordered_extent->len,
2644 			   trans->transid);
2645 	if (ret < 0) {
2646 		btrfs_abort_transaction(trans, root, ret);
2647 		goto out_unlock;
2648 	}
2649 
2650 	add_pending_csums(trans, inode, ordered_extent->file_offset,
2651 			  &ordered_extent->list);
2652 
2653 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2654 	ret = btrfs_update_inode_fallback(trans, root, inode);
2655 	if (ret) { /* -ENOMEM or corruption */
2656 		btrfs_abort_transaction(trans, root, ret);
2657 		goto out_unlock;
2658 	}
2659 	ret = 0;
2660 out_unlock:
2661 	unlock_extent_cached(io_tree, ordered_extent->file_offset,
2662 			     ordered_extent->file_offset +
2663 			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
2664 out:
2665 	if (root != root->fs_info->tree_root)
2666 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2667 	if (trans)
2668 		btrfs_end_transaction(trans, root);
2669 
2670 	if (ret) {
2671 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2672 				      ordered_extent->file_offset +
2673 				      ordered_extent->len - 1, NULL, GFP_NOFS);
2674 
2675 		/*
2676 		 * If the ordered extent had an IOERR or something else went
2677 		 * wrong we need to return the space for this ordered extent
2678 		 * back to the allocator.
2679 		 */
2680 		if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2681 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2682 			btrfs_free_reserved_extent(root, ordered_extent->start,
2683 						   ordered_extent->disk_len);
2684 	}
2685 
2686 
2687 	/*
2688 	 * This needs to be done to make sure anybody waiting knows we are done
2689 	 * updating everything for this ordered extent.
2690 	 */
2691 	btrfs_remove_ordered_extent(inode, ordered_extent);
2692 
2693 	/* for snapshot-aware defrag */
2694 	if (new)
2695 		relink_file_extents(new);
2696 
2697 	/* once for us */
2698 	btrfs_put_ordered_extent(ordered_extent);
2699 	/* once for the tree */
2700 	btrfs_put_ordered_extent(ordered_extent);
2701 
2702 	return ret;
2703 }
2704 
2705 static void finish_ordered_fn(struct btrfs_work *work)
2706 {
2707 	struct btrfs_ordered_extent *ordered_extent;
2708 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2709 	btrfs_finish_ordered_io(ordered_extent);
2710 }
2711 
2712 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2713 				struct extent_state *state, int uptodate)
2714 {
2715 	struct inode *inode = page->mapping->host;
2716 	struct btrfs_root *root = BTRFS_I(inode)->root;
2717 	struct btrfs_ordered_extent *ordered_extent = NULL;
2718 	struct btrfs_workers *workers;
2719 
2720 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2721 
2722 	ClearPagePrivate2(page);
2723 	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2724 					    end - start + 1, uptodate))
2725 		return 0;
2726 
2727 	ordered_extent->work.func = finish_ordered_fn;
2728 	ordered_extent->work.flags = 0;
2729 
2730 	if (btrfs_is_free_space_inode(inode))
2731 		workers = &root->fs_info->endio_freespace_worker;
2732 	else
2733 		workers = &root->fs_info->endio_write_workers;
2734 	btrfs_queue_worker(workers, &ordered_extent->work);
2735 
2736 	return 0;
2737 }
2738 
2739 /*
2740  * when reads are done, we need to check csums to verify the data is correct
2741  * if there's a match, we allow the bio to finish.  If not, the code in
2742  * extent_io.c will try to find good copies for us.
2743  */
2744 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2745 				      u64 phy_offset, struct page *page,
2746 				      u64 start, u64 end, int mirror)
2747 {
2748 	size_t offset = start - page_offset(page);
2749 	struct inode *inode = page->mapping->host;
2750 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2751 	char *kaddr;
2752 	struct btrfs_root *root = BTRFS_I(inode)->root;
2753 	u32 csum_expected;
2754 	u32 csum = ~(u32)0;
2755 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2756 	                              DEFAULT_RATELIMIT_BURST);
2757 
2758 	if (PageChecked(page)) {
2759 		ClearPageChecked(page);
2760 		goto good;
2761 	}
2762 
2763 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2764 		goto good;
2765 
2766 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2767 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2768 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2769 				  GFP_NOFS);
2770 		return 0;
2771 	}
2772 
2773 	phy_offset >>= inode->i_sb->s_blocksize_bits;
2774 	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2775 
2776 	kaddr = kmap_atomic(page);
2777 	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
2778 	btrfs_csum_final(csum, (char *)&csum);
2779 	if (csum != csum_expected)
2780 		goto zeroit;
2781 
2782 	kunmap_atomic(kaddr);
2783 good:
2784 	return 0;
2785 
2786 zeroit:
2787 	if (__ratelimit(&_rs))
2788 		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2789 			(unsigned long long)btrfs_ino(page->mapping->host),
2790 			(unsigned long long)start, csum, csum_expected);
2791 	memset(kaddr + offset, 1, end - start + 1);
2792 	flush_dcache_page(page);
2793 	kunmap_atomic(kaddr);
2794 	if (csum_expected == 0)
2795 		return 0;
2796 	return -EIO;
2797 }
2798 
2799 struct delayed_iput {
2800 	struct list_head list;
2801 	struct inode *inode;
2802 };
2803 
2804 /* JDM: If this is fs-wide, why can't we add a pointer to
2805  * btrfs_inode instead and avoid the allocation? */
2806 void btrfs_add_delayed_iput(struct inode *inode)
2807 {
2808 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2809 	struct delayed_iput *delayed;
2810 
2811 	if (atomic_add_unless(&inode->i_count, -1, 1))
2812 		return;
2813 
2814 	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2815 	delayed->inode = inode;
2816 
2817 	spin_lock(&fs_info->delayed_iput_lock);
2818 	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2819 	spin_unlock(&fs_info->delayed_iput_lock);
2820 }
2821 
2822 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2823 {
2824 	LIST_HEAD(list);
2825 	struct btrfs_fs_info *fs_info = root->fs_info;
2826 	struct delayed_iput *delayed;
2827 	int empty;
2828 
2829 	spin_lock(&fs_info->delayed_iput_lock);
2830 	empty = list_empty(&fs_info->delayed_iputs);
2831 	spin_unlock(&fs_info->delayed_iput_lock);
2832 	if (empty)
2833 		return;
2834 
2835 	spin_lock(&fs_info->delayed_iput_lock);
2836 	list_splice_init(&fs_info->delayed_iputs, &list);
2837 	spin_unlock(&fs_info->delayed_iput_lock);
2838 
2839 	while (!list_empty(&list)) {
2840 		delayed = list_entry(list.next, struct delayed_iput, list);
2841 		list_del(&delayed->list);
2842 		iput(delayed->inode);
2843 		kfree(delayed);
2844 	}
2845 }
2846 
2847 /*
2848  * This is called in transaction commit time. If there are no orphan
2849  * files in the subvolume, it removes orphan item and frees block_rsv
2850  * structure.
2851  */
2852 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2853 			      struct btrfs_root *root)
2854 {
2855 	struct btrfs_block_rsv *block_rsv;
2856 	int ret;
2857 
2858 	if (atomic_read(&root->orphan_inodes) ||
2859 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2860 		return;
2861 
2862 	spin_lock(&root->orphan_lock);
2863 	if (atomic_read(&root->orphan_inodes)) {
2864 		spin_unlock(&root->orphan_lock);
2865 		return;
2866 	}
2867 
2868 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2869 		spin_unlock(&root->orphan_lock);
2870 		return;
2871 	}
2872 
2873 	block_rsv = root->orphan_block_rsv;
2874 	root->orphan_block_rsv = NULL;
2875 	spin_unlock(&root->orphan_lock);
2876 
2877 	if (root->orphan_item_inserted &&
2878 	    btrfs_root_refs(&root->root_item) > 0) {
2879 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2880 					    root->root_key.objectid);
2881 		if (ret)
2882 			btrfs_abort_transaction(trans, root, ret);
2883 		else
2884 			root->orphan_item_inserted = 0;
2885 	}
2886 
2887 	if (block_rsv) {
2888 		WARN_ON(block_rsv->size > 0);
2889 		btrfs_free_block_rsv(root, block_rsv);
2890 	}
2891 }
2892 
2893 /*
2894  * This creates an orphan entry for the given inode in case something goes
2895  * wrong in the middle of an unlink/truncate.
2896  *
2897  * NOTE: caller of this function should reserve 5 units of metadata for
2898  *	 this function.
2899  */
2900 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2901 {
2902 	struct btrfs_root *root = BTRFS_I(inode)->root;
2903 	struct btrfs_block_rsv *block_rsv = NULL;
2904 	int reserve = 0;
2905 	int insert = 0;
2906 	int ret;
2907 
2908 	if (!root->orphan_block_rsv) {
2909 		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2910 		if (!block_rsv)
2911 			return -ENOMEM;
2912 	}
2913 
2914 	spin_lock(&root->orphan_lock);
2915 	if (!root->orphan_block_rsv) {
2916 		root->orphan_block_rsv = block_rsv;
2917 	} else if (block_rsv) {
2918 		btrfs_free_block_rsv(root, block_rsv);
2919 		block_rsv = NULL;
2920 	}
2921 
2922 	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2923 			      &BTRFS_I(inode)->runtime_flags)) {
2924 #if 0
2925 		/*
2926 		 * For proper ENOSPC handling, we should do orphan
2927 		 * cleanup when mounting. But this introduces backward
2928 		 * compatibility issue.
2929 		 */
2930 		if (!xchg(&root->orphan_item_inserted, 1))
2931 			insert = 2;
2932 		else
2933 			insert = 1;
2934 #endif
2935 		insert = 1;
2936 		atomic_inc(&root->orphan_inodes);
2937 	}
2938 
2939 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2940 			      &BTRFS_I(inode)->runtime_flags))
2941 		reserve = 1;
2942 	spin_unlock(&root->orphan_lock);
2943 
2944 	/* grab metadata reservation from transaction handle */
2945 	if (reserve) {
2946 		ret = btrfs_orphan_reserve_metadata(trans, inode);
2947 		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
2948 	}
2949 
2950 	/* insert an orphan item to track this unlinked/truncated file */
2951 	if (insert >= 1) {
2952 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2953 		if (ret) {
2954 			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2955 				  &BTRFS_I(inode)->runtime_flags);
2956 			if (reserve) {
2957 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2958 					  &BTRFS_I(inode)->runtime_flags);
2959 				btrfs_orphan_release_metadata(inode);
2960 			}
2961 			if (ret != -EEXIST) {
2962 				btrfs_abort_transaction(trans, root, ret);
2963 				return ret;
2964 			}
2965 		}
2966 		ret = 0;
2967 	}
2968 
2969 	/* insert an orphan item to track subvolume contains orphan files */
2970 	if (insert >= 2) {
2971 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2972 					       root->root_key.objectid);
2973 		if (ret && ret != -EEXIST) {
2974 			btrfs_abort_transaction(trans, root, ret);
2975 			return ret;
2976 		}
2977 	}
2978 	return 0;
2979 }
2980 
2981 /*
2982  * We have done the truncate/delete so we can go ahead and remove the orphan
2983  * item for this particular inode.
2984  */
2985 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
2986 			    struct inode *inode)
2987 {
2988 	struct btrfs_root *root = BTRFS_I(inode)->root;
2989 	int delete_item = 0;
2990 	int release_rsv = 0;
2991 	int ret = 0;
2992 
2993 	spin_lock(&root->orphan_lock);
2994 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2995 			       &BTRFS_I(inode)->runtime_flags))
2996 		delete_item = 1;
2997 
2998 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2999 			       &BTRFS_I(inode)->runtime_flags))
3000 		release_rsv = 1;
3001 	spin_unlock(&root->orphan_lock);
3002 
3003 	if (trans && delete_item)
3004 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
3005 
3006 	if (release_rsv) {
3007 		btrfs_orphan_release_metadata(inode);
3008 		atomic_dec(&root->orphan_inodes);
3009 	}
3010 
3011 	return ret;
3012 }
3013 
3014 /*
3015  * this cleans up any orphans that may be left on the list from the last use
3016  * of this root.
3017  */
3018 int btrfs_orphan_cleanup(struct btrfs_root *root)
3019 {
3020 	struct btrfs_path *path;
3021 	struct extent_buffer *leaf;
3022 	struct btrfs_key key, found_key;
3023 	struct btrfs_trans_handle *trans;
3024 	struct inode *inode;
3025 	u64 last_objectid = 0;
3026 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3027 
3028 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3029 		return 0;
3030 
3031 	path = btrfs_alloc_path();
3032 	if (!path) {
3033 		ret = -ENOMEM;
3034 		goto out;
3035 	}
3036 	path->reada = -1;
3037 
3038 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3039 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3040 	key.offset = (u64)-1;
3041 
3042 	while (1) {
3043 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3044 		if (ret < 0)
3045 			goto out;
3046 
3047 		/*
3048 		 * if ret == 0 means we found what we were searching for, which
3049 		 * is weird, but possible, so only screw with path if we didn't
3050 		 * find the key and see if we have stuff that matches
3051 		 */
3052 		if (ret > 0) {
3053 			ret = 0;
3054 			if (path->slots[0] == 0)
3055 				break;
3056 			path->slots[0]--;
3057 		}
3058 
3059 		/* pull out the item */
3060 		leaf = path->nodes[0];
3061 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3062 
3063 		/* make sure the item matches what we want */
3064 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3065 			break;
3066 		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3067 			break;
3068 
3069 		/* release the path since we're done with it */
3070 		btrfs_release_path(path);
3071 
3072 		/*
3073 		 * this is where we are basically btrfs_lookup, without the
3074 		 * crossing root thing.  we store the inode number in the
3075 		 * offset of the orphan item.
3076 		 */
3077 
3078 		if (found_key.offset == last_objectid) {
3079 			btrfs_err(root->fs_info,
3080 				"Error removing orphan entry, stopping orphan cleanup");
3081 			ret = -EINVAL;
3082 			goto out;
3083 		}
3084 
3085 		last_objectid = found_key.offset;
3086 
3087 		found_key.objectid = found_key.offset;
3088 		found_key.type = BTRFS_INODE_ITEM_KEY;
3089 		found_key.offset = 0;
3090 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3091 		ret = PTR_RET(inode);
3092 		if (ret && ret != -ESTALE)
3093 			goto out;
3094 
3095 		if (ret == -ESTALE && root == root->fs_info->tree_root) {
3096 			struct btrfs_root *dead_root;
3097 			struct btrfs_fs_info *fs_info = root->fs_info;
3098 			int is_dead_root = 0;
3099 
3100 			/*
3101 			 * this is an orphan in the tree root. Currently these
3102 			 * could come from 2 sources:
3103 			 *  a) a snapshot deletion in progress
3104 			 *  b) a free space cache inode
3105 			 * We need to distinguish those two, as the snapshot
3106 			 * orphan must not get deleted.
3107 			 * find_dead_roots already ran before us, so if this
3108 			 * is a snapshot deletion, we should find the root
3109 			 * in the dead_roots list
3110 			 */
3111 			spin_lock(&fs_info->trans_lock);
3112 			list_for_each_entry(dead_root, &fs_info->dead_roots,
3113 					    root_list) {
3114 				if (dead_root->root_key.objectid ==
3115 				    found_key.objectid) {
3116 					is_dead_root = 1;
3117 					break;
3118 				}
3119 			}
3120 			spin_unlock(&fs_info->trans_lock);
3121 			if (is_dead_root) {
3122 				/* prevent this orphan from being found again */
3123 				key.offset = found_key.objectid - 1;
3124 				continue;
3125 			}
3126 		}
3127 		/*
3128 		 * Inode is already gone but the orphan item is still there,
3129 		 * kill the orphan item.
3130 		 */
3131 		if (ret == -ESTALE) {
3132 			trans = btrfs_start_transaction(root, 1);
3133 			if (IS_ERR(trans)) {
3134 				ret = PTR_ERR(trans);
3135 				goto out;
3136 			}
3137 			btrfs_debug(root->fs_info, "auto deleting %Lu",
3138 				found_key.objectid);
3139 			ret = btrfs_del_orphan_item(trans, root,
3140 						    found_key.objectid);
3141 			btrfs_end_transaction(trans, root);
3142 			if (ret)
3143 				goto out;
3144 			continue;
3145 		}
3146 
3147 		/*
3148 		 * add this inode to the orphan list so btrfs_orphan_del does
3149 		 * the proper thing when we hit it
3150 		 */
3151 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3152 			&BTRFS_I(inode)->runtime_flags);
3153 		atomic_inc(&root->orphan_inodes);
3154 
3155 		/* if we have links, this was a truncate, lets do that */
3156 		if (inode->i_nlink) {
3157 			if (!S_ISREG(inode->i_mode)) {
3158 				WARN_ON(1);
3159 				iput(inode);
3160 				continue;
3161 			}
3162 			nr_truncate++;
3163 
3164 			/* 1 for the orphan item deletion. */
3165 			trans = btrfs_start_transaction(root, 1);
3166 			if (IS_ERR(trans)) {
3167 				iput(inode);
3168 				ret = PTR_ERR(trans);
3169 				goto out;
3170 			}
3171 			ret = btrfs_orphan_add(trans, inode);
3172 			btrfs_end_transaction(trans, root);
3173 			if (ret) {
3174 				iput(inode);
3175 				goto out;
3176 			}
3177 
3178 			ret = btrfs_truncate(inode);
3179 			if (ret)
3180 				btrfs_orphan_del(NULL, inode);
3181 		} else {
3182 			nr_unlink++;
3183 		}
3184 
3185 		/* this will do delete_inode and everything for us */
3186 		iput(inode);
3187 		if (ret)
3188 			goto out;
3189 	}
3190 	/* release the path since we're done with it */
3191 	btrfs_release_path(path);
3192 
3193 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3194 
3195 	if (root->orphan_block_rsv)
3196 		btrfs_block_rsv_release(root, root->orphan_block_rsv,
3197 					(u64)-1);
3198 
3199 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
3200 		trans = btrfs_join_transaction(root);
3201 		if (!IS_ERR(trans))
3202 			btrfs_end_transaction(trans, root);
3203 	}
3204 
3205 	if (nr_unlink)
3206 		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3207 	if (nr_truncate)
3208 		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3209 
3210 out:
3211 	if (ret)
3212 		btrfs_crit(root->fs_info,
3213 			"could not do orphan cleanup %d", ret);
3214 	btrfs_free_path(path);
3215 	return ret;
3216 }
3217 
3218 /*
3219  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3220  * don't find any xattrs, we know there can't be any acls.
3221  *
3222  * slot is the slot the inode is in, objectid is the objectid of the inode
3223  */
3224 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3225 					  int slot, u64 objectid)
3226 {
3227 	u32 nritems = btrfs_header_nritems(leaf);
3228 	struct btrfs_key found_key;
3229 	static u64 xattr_access = 0;
3230 	static u64 xattr_default = 0;
3231 	int scanned = 0;
3232 
3233 	if (!xattr_access) {
3234 		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3235 					strlen(POSIX_ACL_XATTR_ACCESS));
3236 		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3237 					strlen(POSIX_ACL_XATTR_DEFAULT));
3238 	}
3239 
3240 	slot++;
3241 	while (slot < nritems) {
3242 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3243 
3244 		/* we found a different objectid, there must not be acls */
3245 		if (found_key.objectid != objectid)
3246 			return 0;
3247 
3248 		/* we found an xattr, assume we've got an acl */
3249 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3250 			if (found_key.offset == xattr_access ||
3251 			    found_key.offset == xattr_default)
3252 				return 1;
3253 		}
3254 
3255 		/*
3256 		 * we found a key greater than an xattr key, there can't
3257 		 * be any acls later on
3258 		 */
3259 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3260 			return 0;
3261 
3262 		slot++;
3263 		scanned++;
3264 
3265 		/*
3266 		 * it goes inode, inode backrefs, xattrs, extents,
3267 		 * so if there are a ton of hard links to an inode there can
3268 		 * be a lot of backrefs.  Don't waste time searching too hard,
3269 		 * this is just an optimization
3270 		 */
3271 		if (scanned >= 8)
3272 			break;
3273 	}
3274 	/* we hit the end of the leaf before we found an xattr or
3275 	 * something larger than an xattr.  We have to assume the inode
3276 	 * has acls
3277 	 */
3278 	return 1;
3279 }
3280 
3281 /*
3282  * read an inode from the btree into the in-memory inode
3283  */
3284 static void btrfs_read_locked_inode(struct inode *inode)
3285 {
3286 	struct btrfs_path *path;
3287 	struct extent_buffer *leaf;
3288 	struct btrfs_inode_item *inode_item;
3289 	struct btrfs_timespec *tspec;
3290 	struct btrfs_root *root = BTRFS_I(inode)->root;
3291 	struct btrfs_key location;
3292 	int maybe_acls;
3293 	u32 rdev;
3294 	int ret;
3295 	bool filled = false;
3296 
3297 	ret = btrfs_fill_inode(inode, &rdev);
3298 	if (!ret)
3299 		filled = true;
3300 
3301 	path = btrfs_alloc_path();
3302 	if (!path)
3303 		goto make_bad;
3304 
3305 	path->leave_spinning = 1;
3306 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3307 
3308 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3309 	if (ret)
3310 		goto make_bad;
3311 
3312 	leaf = path->nodes[0];
3313 
3314 	if (filled)
3315 		goto cache_acl;
3316 
3317 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3318 				    struct btrfs_inode_item);
3319 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3320 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3321 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3322 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3323 	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3324 
3325 	tspec = btrfs_inode_atime(inode_item);
3326 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3327 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3328 
3329 	tspec = btrfs_inode_mtime(inode_item);
3330 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3331 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3332 
3333 	tspec = btrfs_inode_ctime(inode_item);
3334 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3335 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3336 
3337 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3338 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3339 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3340 
3341 	/*
3342 	 * If we were modified in the current generation and evicted from memory
3343 	 * and then re-read we need to do a full sync since we don't have any
3344 	 * idea about which extents were modified before we were evicted from
3345 	 * cache.
3346 	 */
3347 	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3348 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3349 			&BTRFS_I(inode)->runtime_flags);
3350 
3351 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3352 	inode->i_generation = BTRFS_I(inode)->generation;
3353 	inode->i_rdev = 0;
3354 	rdev = btrfs_inode_rdev(leaf, inode_item);
3355 
3356 	BTRFS_I(inode)->index_cnt = (u64)-1;
3357 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3358 cache_acl:
3359 	/*
3360 	 * try to precache a NULL acl entry for files that don't have
3361 	 * any xattrs or acls
3362 	 */
3363 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3364 					   btrfs_ino(inode));
3365 	if (!maybe_acls)
3366 		cache_no_acl(inode);
3367 
3368 	btrfs_free_path(path);
3369 
3370 	switch (inode->i_mode & S_IFMT) {
3371 	case S_IFREG:
3372 		inode->i_mapping->a_ops = &btrfs_aops;
3373 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3374 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3375 		inode->i_fop = &btrfs_file_operations;
3376 		inode->i_op = &btrfs_file_inode_operations;
3377 		break;
3378 	case S_IFDIR:
3379 		inode->i_fop = &btrfs_dir_file_operations;
3380 		if (root == root->fs_info->tree_root)
3381 			inode->i_op = &btrfs_dir_ro_inode_operations;
3382 		else
3383 			inode->i_op = &btrfs_dir_inode_operations;
3384 		break;
3385 	case S_IFLNK:
3386 		inode->i_op = &btrfs_symlink_inode_operations;
3387 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3388 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3389 		break;
3390 	default:
3391 		inode->i_op = &btrfs_special_inode_operations;
3392 		init_special_inode(inode, inode->i_mode, rdev);
3393 		break;
3394 	}
3395 
3396 	btrfs_update_iflags(inode);
3397 	return;
3398 
3399 make_bad:
3400 	btrfs_free_path(path);
3401 	make_bad_inode(inode);
3402 }
3403 
3404 /*
3405  * given a leaf and an inode, copy the inode fields into the leaf
3406  */
3407 static void fill_inode_item(struct btrfs_trans_handle *trans,
3408 			    struct extent_buffer *leaf,
3409 			    struct btrfs_inode_item *item,
3410 			    struct inode *inode)
3411 {
3412 	struct btrfs_map_token token;
3413 
3414 	btrfs_init_map_token(&token);
3415 
3416 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3417 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3418 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3419 				   &token);
3420 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3421 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3422 
3423 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3424 				     inode->i_atime.tv_sec, &token);
3425 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3426 				      inode->i_atime.tv_nsec, &token);
3427 
3428 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3429 				     inode->i_mtime.tv_sec, &token);
3430 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3431 				      inode->i_mtime.tv_nsec, &token);
3432 
3433 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3434 				     inode->i_ctime.tv_sec, &token);
3435 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3436 				      inode->i_ctime.tv_nsec, &token);
3437 
3438 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3439 				     &token);
3440 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3441 					 &token);
3442 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3443 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3444 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3445 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3446 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3447 }
3448 
3449 /*
3450  * copy everything in the in-memory inode into the btree.
3451  */
3452 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3453 				struct btrfs_root *root, struct inode *inode)
3454 {
3455 	struct btrfs_inode_item *inode_item;
3456 	struct btrfs_path *path;
3457 	struct extent_buffer *leaf;
3458 	int ret;
3459 
3460 	path = btrfs_alloc_path();
3461 	if (!path)
3462 		return -ENOMEM;
3463 
3464 	path->leave_spinning = 1;
3465 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3466 				 1);
3467 	if (ret) {
3468 		if (ret > 0)
3469 			ret = -ENOENT;
3470 		goto failed;
3471 	}
3472 
3473 	btrfs_unlock_up_safe(path, 1);
3474 	leaf = path->nodes[0];
3475 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3476 				    struct btrfs_inode_item);
3477 
3478 	fill_inode_item(trans, leaf, inode_item, inode);
3479 	btrfs_mark_buffer_dirty(leaf);
3480 	btrfs_set_inode_last_trans(trans, inode);
3481 	ret = 0;
3482 failed:
3483 	btrfs_free_path(path);
3484 	return ret;
3485 }
3486 
3487 /*
3488  * copy everything in the in-memory inode into the btree.
3489  */
3490 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3491 				struct btrfs_root *root, struct inode *inode)
3492 {
3493 	int ret;
3494 
3495 	/*
3496 	 * If the inode is a free space inode, we can deadlock during commit
3497 	 * if we put it into the delayed code.
3498 	 *
3499 	 * The data relocation inode should also be directly updated
3500 	 * without delay
3501 	 */
3502 	if (!btrfs_is_free_space_inode(inode)
3503 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3504 		btrfs_update_root_times(trans, root);
3505 
3506 		ret = btrfs_delayed_update_inode(trans, root, inode);
3507 		if (!ret)
3508 			btrfs_set_inode_last_trans(trans, inode);
3509 		return ret;
3510 	}
3511 
3512 	return btrfs_update_inode_item(trans, root, inode);
3513 }
3514 
3515 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3516 					 struct btrfs_root *root,
3517 					 struct inode *inode)
3518 {
3519 	int ret;
3520 
3521 	ret = btrfs_update_inode(trans, root, inode);
3522 	if (ret == -ENOSPC)
3523 		return btrfs_update_inode_item(trans, root, inode);
3524 	return ret;
3525 }
3526 
3527 /*
3528  * unlink helper that gets used here in inode.c and in the tree logging
3529  * recovery code.  It remove a link in a directory with a given name, and
3530  * also drops the back refs in the inode to the directory
3531  */
3532 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3533 				struct btrfs_root *root,
3534 				struct inode *dir, struct inode *inode,
3535 				const char *name, int name_len)
3536 {
3537 	struct btrfs_path *path;
3538 	int ret = 0;
3539 	struct extent_buffer *leaf;
3540 	struct btrfs_dir_item *di;
3541 	struct btrfs_key key;
3542 	u64 index;
3543 	u64 ino = btrfs_ino(inode);
3544 	u64 dir_ino = btrfs_ino(dir);
3545 
3546 	path = btrfs_alloc_path();
3547 	if (!path) {
3548 		ret = -ENOMEM;
3549 		goto out;
3550 	}
3551 
3552 	path->leave_spinning = 1;
3553 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3554 				    name, name_len, -1);
3555 	if (IS_ERR(di)) {
3556 		ret = PTR_ERR(di);
3557 		goto err;
3558 	}
3559 	if (!di) {
3560 		ret = -ENOENT;
3561 		goto err;
3562 	}
3563 	leaf = path->nodes[0];
3564 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3565 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3566 	if (ret)
3567 		goto err;
3568 	btrfs_release_path(path);
3569 
3570 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3571 				  dir_ino, &index);
3572 	if (ret) {
3573 		btrfs_info(root->fs_info,
3574 			"failed to delete reference to %.*s, inode %llu parent %llu",
3575 			name_len, name,
3576 			(unsigned long long)ino, (unsigned long long)dir_ino);
3577 		btrfs_abort_transaction(trans, root, ret);
3578 		goto err;
3579 	}
3580 
3581 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3582 	if (ret) {
3583 		btrfs_abort_transaction(trans, root, ret);
3584 		goto err;
3585 	}
3586 
3587 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3588 					 inode, dir_ino);
3589 	if (ret != 0 && ret != -ENOENT) {
3590 		btrfs_abort_transaction(trans, root, ret);
3591 		goto err;
3592 	}
3593 
3594 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3595 					   dir, index);
3596 	if (ret == -ENOENT)
3597 		ret = 0;
3598 	else if (ret)
3599 		btrfs_abort_transaction(trans, root, ret);
3600 err:
3601 	btrfs_free_path(path);
3602 	if (ret)
3603 		goto out;
3604 
3605 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3606 	inode_inc_iversion(inode);
3607 	inode_inc_iversion(dir);
3608 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3609 	ret = btrfs_update_inode(trans, root, dir);
3610 out:
3611 	return ret;
3612 }
3613 
3614 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3615 		       struct btrfs_root *root,
3616 		       struct inode *dir, struct inode *inode,
3617 		       const char *name, int name_len)
3618 {
3619 	int ret;
3620 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3621 	if (!ret) {
3622 		btrfs_drop_nlink(inode);
3623 		ret = btrfs_update_inode(trans, root, inode);
3624 	}
3625 	return ret;
3626 }
3627 
3628 /*
3629  * helper to start transaction for unlink and rmdir.
3630  *
3631  * unlink and rmdir are special in btrfs, they do not always free space, so
3632  * if we cannot make our reservations the normal way try and see if there is
3633  * plenty of slack room in the global reserve to migrate, otherwise we cannot
3634  * allow the unlink to occur.
3635  */
3636 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3637 {
3638 	struct btrfs_trans_handle *trans;
3639 	struct btrfs_root *root = BTRFS_I(dir)->root;
3640 	int ret;
3641 
3642 	/*
3643 	 * 1 for the possible orphan item
3644 	 * 1 for the dir item
3645 	 * 1 for the dir index
3646 	 * 1 for the inode ref
3647 	 * 1 for the inode
3648 	 */
3649 	trans = btrfs_start_transaction(root, 5);
3650 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3651 		return trans;
3652 
3653 	if (PTR_ERR(trans) == -ENOSPC) {
3654 		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3655 
3656 		trans = btrfs_start_transaction(root, 0);
3657 		if (IS_ERR(trans))
3658 			return trans;
3659 		ret = btrfs_cond_migrate_bytes(root->fs_info,
3660 					       &root->fs_info->trans_block_rsv,
3661 					       num_bytes, 5);
3662 		if (ret) {
3663 			btrfs_end_transaction(trans, root);
3664 			return ERR_PTR(ret);
3665 		}
3666 		trans->block_rsv = &root->fs_info->trans_block_rsv;
3667 		trans->bytes_reserved = num_bytes;
3668 	}
3669 	return trans;
3670 }
3671 
3672 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3673 {
3674 	struct btrfs_root *root = BTRFS_I(dir)->root;
3675 	struct btrfs_trans_handle *trans;
3676 	struct inode *inode = dentry->d_inode;
3677 	int ret;
3678 
3679 	trans = __unlink_start_trans(dir);
3680 	if (IS_ERR(trans))
3681 		return PTR_ERR(trans);
3682 
3683 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3684 
3685 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3686 				 dentry->d_name.name, dentry->d_name.len);
3687 	if (ret)
3688 		goto out;
3689 
3690 	if (inode->i_nlink == 0) {
3691 		ret = btrfs_orphan_add(trans, inode);
3692 		if (ret)
3693 			goto out;
3694 	}
3695 
3696 out:
3697 	btrfs_end_transaction(trans, root);
3698 	btrfs_btree_balance_dirty(root);
3699 	return ret;
3700 }
3701 
3702 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3703 			struct btrfs_root *root,
3704 			struct inode *dir, u64 objectid,
3705 			const char *name, int name_len)
3706 {
3707 	struct btrfs_path *path;
3708 	struct extent_buffer *leaf;
3709 	struct btrfs_dir_item *di;
3710 	struct btrfs_key key;
3711 	u64 index;
3712 	int ret;
3713 	u64 dir_ino = btrfs_ino(dir);
3714 
3715 	path = btrfs_alloc_path();
3716 	if (!path)
3717 		return -ENOMEM;
3718 
3719 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3720 				   name, name_len, -1);
3721 	if (IS_ERR_OR_NULL(di)) {
3722 		if (!di)
3723 			ret = -ENOENT;
3724 		else
3725 			ret = PTR_ERR(di);
3726 		goto out;
3727 	}
3728 
3729 	leaf = path->nodes[0];
3730 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3731 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3732 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3733 	if (ret) {
3734 		btrfs_abort_transaction(trans, root, ret);
3735 		goto out;
3736 	}
3737 	btrfs_release_path(path);
3738 
3739 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3740 				 objectid, root->root_key.objectid,
3741 				 dir_ino, &index, name, name_len);
3742 	if (ret < 0) {
3743 		if (ret != -ENOENT) {
3744 			btrfs_abort_transaction(trans, root, ret);
3745 			goto out;
3746 		}
3747 		di = btrfs_search_dir_index_item(root, path, dir_ino,
3748 						 name, name_len);
3749 		if (IS_ERR_OR_NULL(di)) {
3750 			if (!di)
3751 				ret = -ENOENT;
3752 			else
3753 				ret = PTR_ERR(di);
3754 			btrfs_abort_transaction(trans, root, ret);
3755 			goto out;
3756 		}
3757 
3758 		leaf = path->nodes[0];
3759 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3760 		btrfs_release_path(path);
3761 		index = key.offset;
3762 	}
3763 	btrfs_release_path(path);
3764 
3765 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3766 	if (ret) {
3767 		btrfs_abort_transaction(trans, root, ret);
3768 		goto out;
3769 	}
3770 
3771 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3772 	inode_inc_iversion(dir);
3773 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3774 	ret = btrfs_update_inode_fallback(trans, root, dir);
3775 	if (ret)
3776 		btrfs_abort_transaction(trans, root, ret);
3777 out:
3778 	btrfs_free_path(path);
3779 	return ret;
3780 }
3781 
3782 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3783 {
3784 	struct inode *inode = dentry->d_inode;
3785 	int err = 0;
3786 	struct btrfs_root *root = BTRFS_I(dir)->root;
3787 	struct btrfs_trans_handle *trans;
3788 
3789 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3790 		return -ENOTEMPTY;
3791 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3792 		return -EPERM;
3793 
3794 	trans = __unlink_start_trans(dir);
3795 	if (IS_ERR(trans))
3796 		return PTR_ERR(trans);
3797 
3798 	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3799 		err = btrfs_unlink_subvol(trans, root, dir,
3800 					  BTRFS_I(inode)->location.objectid,
3801 					  dentry->d_name.name,
3802 					  dentry->d_name.len);
3803 		goto out;
3804 	}
3805 
3806 	err = btrfs_orphan_add(trans, inode);
3807 	if (err)
3808 		goto out;
3809 
3810 	/* now the directory is empty */
3811 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3812 				 dentry->d_name.name, dentry->d_name.len);
3813 	if (!err)
3814 		btrfs_i_size_write(inode, 0);
3815 out:
3816 	btrfs_end_transaction(trans, root);
3817 	btrfs_btree_balance_dirty(root);
3818 
3819 	return err;
3820 }
3821 
3822 /*
3823  * this can truncate away extent items, csum items and directory items.
3824  * It starts at a high offset and removes keys until it can't find
3825  * any higher than new_size
3826  *
3827  * csum items that cross the new i_size are truncated to the new size
3828  * as well.
3829  *
3830  * min_type is the minimum key type to truncate down to.  If set to 0, this
3831  * will kill all the items on this inode, including the INODE_ITEM_KEY.
3832  */
3833 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3834 			       struct btrfs_root *root,
3835 			       struct inode *inode,
3836 			       u64 new_size, u32 min_type)
3837 {
3838 	struct btrfs_path *path;
3839 	struct extent_buffer *leaf;
3840 	struct btrfs_file_extent_item *fi;
3841 	struct btrfs_key key;
3842 	struct btrfs_key found_key;
3843 	u64 extent_start = 0;
3844 	u64 extent_num_bytes = 0;
3845 	u64 extent_offset = 0;
3846 	u64 item_end = 0;
3847 	u32 found_type = (u8)-1;
3848 	int found_extent;
3849 	int del_item;
3850 	int pending_del_nr = 0;
3851 	int pending_del_slot = 0;
3852 	int extent_type = -1;
3853 	int ret;
3854 	int err = 0;
3855 	u64 ino = btrfs_ino(inode);
3856 
3857 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3858 
3859 	path = btrfs_alloc_path();
3860 	if (!path)
3861 		return -ENOMEM;
3862 	path->reada = -1;
3863 
3864 	/*
3865 	 * We want to drop from the next block forward in case this new size is
3866 	 * not block aligned since we will be keeping the last block of the
3867 	 * extent just the way it is.
3868 	 */
3869 	if (root->ref_cows || root == root->fs_info->tree_root)
3870 		btrfs_drop_extent_cache(inode, ALIGN(new_size,
3871 					root->sectorsize), (u64)-1, 0);
3872 
3873 	/*
3874 	 * This function is also used to drop the items in the log tree before
3875 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3876 	 * it is used to drop the loged items. So we shouldn't kill the delayed
3877 	 * items.
3878 	 */
3879 	if (min_type == 0 && root == BTRFS_I(inode)->root)
3880 		btrfs_kill_delayed_inode_items(inode);
3881 
3882 	key.objectid = ino;
3883 	key.offset = (u64)-1;
3884 	key.type = (u8)-1;
3885 
3886 search_again:
3887 	path->leave_spinning = 1;
3888 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3889 	if (ret < 0) {
3890 		err = ret;
3891 		goto out;
3892 	}
3893 
3894 	if (ret > 0) {
3895 		/* there are no items in the tree for us to truncate, we're
3896 		 * done
3897 		 */
3898 		if (path->slots[0] == 0)
3899 			goto out;
3900 		path->slots[0]--;
3901 	}
3902 
3903 	while (1) {
3904 		fi = NULL;
3905 		leaf = path->nodes[0];
3906 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3907 		found_type = btrfs_key_type(&found_key);
3908 
3909 		if (found_key.objectid != ino)
3910 			break;
3911 
3912 		if (found_type < min_type)
3913 			break;
3914 
3915 		item_end = found_key.offset;
3916 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
3917 			fi = btrfs_item_ptr(leaf, path->slots[0],
3918 					    struct btrfs_file_extent_item);
3919 			extent_type = btrfs_file_extent_type(leaf, fi);
3920 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3921 				item_end +=
3922 				    btrfs_file_extent_num_bytes(leaf, fi);
3923 			} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3924 				item_end += btrfs_file_extent_inline_len(leaf,
3925 									 fi);
3926 			}
3927 			item_end--;
3928 		}
3929 		if (found_type > min_type) {
3930 			del_item = 1;
3931 		} else {
3932 			if (item_end < new_size)
3933 				break;
3934 			if (found_key.offset >= new_size)
3935 				del_item = 1;
3936 			else
3937 				del_item = 0;
3938 		}
3939 		found_extent = 0;
3940 		/* FIXME, shrink the extent if the ref count is only 1 */
3941 		if (found_type != BTRFS_EXTENT_DATA_KEY)
3942 			goto delete;
3943 
3944 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3945 			u64 num_dec;
3946 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3947 			if (!del_item) {
3948 				u64 orig_num_bytes =
3949 					btrfs_file_extent_num_bytes(leaf, fi);
3950 				extent_num_bytes = ALIGN(new_size -
3951 						found_key.offset,
3952 						root->sectorsize);
3953 				btrfs_set_file_extent_num_bytes(leaf, fi,
3954 							 extent_num_bytes);
3955 				num_dec = (orig_num_bytes -
3956 					   extent_num_bytes);
3957 				if (root->ref_cows && extent_start != 0)
3958 					inode_sub_bytes(inode, num_dec);
3959 				btrfs_mark_buffer_dirty(leaf);
3960 			} else {
3961 				extent_num_bytes =
3962 					btrfs_file_extent_disk_num_bytes(leaf,
3963 									 fi);
3964 				extent_offset = found_key.offset -
3965 					btrfs_file_extent_offset(leaf, fi);
3966 
3967 				/* FIXME blocksize != 4096 */
3968 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
3969 				if (extent_start != 0) {
3970 					found_extent = 1;
3971 					if (root->ref_cows)
3972 						inode_sub_bytes(inode, num_dec);
3973 				}
3974 			}
3975 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3976 			/*
3977 			 * we can't truncate inline items that have had
3978 			 * special encodings
3979 			 */
3980 			if (!del_item &&
3981 			    btrfs_file_extent_compression(leaf, fi) == 0 &&
3982 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
3983 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
3984 				u32 size = new_size - found_key.offset;
3985 
3986 				if (root->ref_cows) {
3987 					inode_sub_bytes(inode, item_end + 1 -
3988 							new_size);
3989 				}
3990 				size =
3991 				    btrfs_file_extent_calc_inline_size(size);
3992 				btrfs_truncate_item(root, path, size, 1);
3993 			} else if (root->ref_cows) {
3994 				inode_sub_bytes(inode, item_end + 1 -
3995 						found_key.offset);
3996 			}
3997 		}
3998 delete:
3999 		if (del_item) {
4000 			if (!pending_del_nr) {
4001 				/* no pending yet, add ourselves */
4002 				pending_del_slot = path->slots[0];
4003 				pending_del_nr = 1;
4004 			} else if (pending_del_nr &&
4005 				   path->slots[0] + 1 == pending_del_slot) {
4006 				/* hop on the pending chunk */
4007 				pending_del_nr++;
4008 				pending_del_slot = path->slots[0];
4009 			} else {
4010 				BUG();
4011 			}
4012 		} else {
4013 			break;
4014 		}
4015 		if (found_extent && (root->ref_cows ||
4016 				     root == root->fs_info->tree_root)) {
4017 			btrfs_set_path_blocking(path);
4018 			ret = btrfs_free_extent(trans, root, extent_start,
4019 						extent_num_bytes, 0,
4020 						btrfs_header_owner(leaf),
4021 						ino, extent_offset, 0);
4022 			BUG_ON(ret);
4023 		}
4024 
4025 		if (found_type == BTRFS_INODE_ITEM_KEY)
4026 			break;
4027 
4028 		if (path->slots[0] == 0 ||
4029 		    path->slots[0] != pending_del_slot) {
4030 			if (pending_del_nr) {
4031 				ret = btrfs_del_items(trans, root, path,
4032 						pending_del_slot,
4033 						pending_del_nr);
4034 				if (ret) {
4035 					btrfs_abort_transaction(trans,
4036 								root, ret);
4037 					goto error;
4038 				}
4039 				pending_del_nr = 0;
4040 			}
4041 			btrfs_release_path(path);
4042 			goto search_again;
4043 		} else {
4044 			path->slots[0]--;
4045 		}
4046 	}
4047 out:
4048 	if (pending_del_nr) {
4049 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4050 				      pending_del_nr);
4051 		if (ret)
4052 			btrfs_abort_transaction(trans, root, ret);
4053 	}
4054 error:
4055 	btrfs_free_path(path);
4056 	return err;
4057 }
4058 
4059 /*
4060  * btrfs_truncate_page - read, zero a chunk and write a page
4061  * @inode - inode that we're zeroing
4062  * @from - the offset to start zeroing
4063  * @len - the length to zero, 0 to zero the entire range respective to the
4064  *	offset
4065  * @front - zero up to the offset instead of from the offset on
4066  *
4067  * This will find the page for the "from" offset and cow the page and zero the
4068  * part we want to zero.  This is used with truncate and hole punching.
4069  */
4070 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4071 			int front)
4072 {
4073 	struct address_space *mapping = inode->i_mapping;
4074 	struct btrfs_root *root = BTRFS_I(inode)->root;
4075 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4076 	struct btrfs_ordered_extent *ordered;
4077 	struct extent_state *cached_state = NULL;
4078 	char *kaddr;
4079 	u32 blocksize = root->sectorsize;
4080 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
4081 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
4082 	struct page *page;
4083 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4084 	int ret = 0;
4085 	u64 page_start;
4086 	u64 page_end;
4087 
4088 	if ((offset & (blocksize - 1)) == 0 &&
4089 	    (!len || ((len & (blocksize - 1)) == 0)))
4090 		goto out;
4091 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4092 	if (ret)
4093 		goto out;
4094 
4095 again:
4096 	page = find_or_create_page(mapping, index, mask);
4097 	if (!page) {
4098 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4099 		ret = -ENOMEM;
4100 		goto out;
4101 	}
4102 
4103 	page_start = page_offset(page);
4104 	page_end = page_start + PAGE_CACHE_SIZE - 1;
4105 
4106 	if (!PageUptodate(page)) {
4107 		ret = btrfs_readpage(NULL, page);
4108 		lock_page(page);
4109 		if (page->mapping != mapping) {
4110 			unlock_page(page);
4111 			page_cache_release(page);
4112 			goto again;
4113 		}
4114 		if (!PageUptodate(page)) {
4115 			ret = -EIO;
4116 			goto out_unlock;
4117 		}
4118 	}
4119 	wait_on_page_writeback(page);
4120 
4121 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4122 	set_page_extent_mapped(page);
4123 
4124 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
4125 	if (ordered) {
4126 		unlock_extent_cached(io_tree, page_start, page_end,
4127 				     &cached_state, GFP_NOFS);
4128 		unlock_page(page);
4129 		page_cache_release(page);
4130 		btrfs_start_ordered_extent(inode, ordered, 1);
4131 		btrfs_put_ordered_extent(ordered);
4132 		goto again;
4133 	}
4134 
4135 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4136 			  EXTENT_DIRTY | EXTENT_DELALLOC |
4137 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4138 			  0, 0, &cached_state, GFP_NOFS);
4139 
4140 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4141 					&cached_state);
4142 	if (ret) {
4143 		unlock_extent_cached(io_tree, page_start, page_end,
4144 				     &cached_state, GFP_NOFS);
4145 		goto out_unlock;
4146 	}
4147 
4148 	if (offset != PAGE_CACHE_SIZE) {
4149 		if (!len)
4150 			len = PAGE_CACHE_SIZE - offset;
4151 		kaddr = kmap(page);
4152 		if (front)
4153 			memset(kaddr, 0, offset);
4154 		else
4155 			memset(kaddr + offset, 0, len);
4156 		flush_dcache_page(page);
4157 		kunmap(page);
4158 	}
4159 	ClearPageChecked(page);
4160 	set_page_dirty(page);
4161 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4162 			     GFP_NOFS);
4163 
4164 out_unlock:
4165 	if (ret)
4166 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4167 	unlock_page(page);
4168 	page_cache_release(page);
4169 out:
4170 	return ret;
4171 }
4172 
4173 /*
4174  * This function puts in dummy file extents for the area we're creating a hole
4175  * for.  So if we are truncating this file to a larger size we need to insert
4176  * these file extents so that btrfs_get_extent will return a EXTENT_MAP_HOLE for
4177  * the range between oldsize and size
4178  */
4179 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4180 {
4181 	struct btrfs_trans_handle *trans;
4182 	struct btrfs_root *root = BTRFS_I(inode)->root;
4183 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4184 	struct extent_map *em = NULL;
4185 	struct extent_state *cached_state = NULL;
4186 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4187 	u64 hole_start = ALIGN(oldsize, root->sectorsize);
4188 	u64 block_end = ALIGN(size, root->sectorsize);
4189 	u64 last_byte;
4190 	u64 cur_offset;
4191 	u64 hole_size;
4192 	int err = 0;
4193 
4194 	/*
4195 	 * If our size started in the middle of a page we need to zero out the
4196 	 * rest of the page before we expand the i_size, otherwise we could
4197 	 * expose stale data.
4198 	 */
4199 	err = btrfs_truncate_page(inode, oldsize, 0, 0);
4200 	if (err)
4201 		return err;
4202 
4203 	if (size <= hole_start)
4204 		return 0;
4205 
4206 	while (1) {
4207 		struct btrfs_ordered_extent *ordered;
4208 		btrfs_wait_ordered_range(inode, hole_start,
4209 					 block_end - hole_start);
4210 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4211 				 &cached_state);
4212 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
4213 		if (!ordered)
4214 			break;
4215 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4216 				     &cached_state, GFP_NOFS);
4217 		btrfs_put_ordered_extent(ordered);
4218 	}
4219 
4220 	cur_offset = hole_start;
4221 	while (1) {
4222 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4223 				block_end - cur_offset, 0);
4224 		if (IS_ERR(em)) {
4225 			err = PTR_ERR(em);
4226 			em = NULL;
4227 			break;
4228 		}
4229 		last_byte = min(extent_map_end(em), block_end);
4230 		last_byte = ALIGN(last_byte , root->sectorsize);
4231 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4232 			struct extent_map *hole_em;
4233 			hole_size = last_byte - cur_offset;
4234 
4235 			trans = btrfs_start_transaction(root, 3);
4236 			if (IS_ERR(trans)) {
4237 				err = PTR_ERR(trans);
4238 				break;
4239 			}
4240 
4241 			err = btrfs_drop_extents(trans, root, inode,
4242 						 cur_offset,
4243 						 cur_offset + hole_size, 1);
4244 			if (err) {
4245 				btrfs_abort_transaction(trans, root, err);
4246 				btrfs_end_transaction(trans, root);
4247 				break;
4248 			}
4249 
4250 			err = btrfs_insert_file_extent(trans, root,
4251 					btrfs_ino(inode), cur_offset, 0,
4252 					0, hole_size, 0, hole_size,
4253 					0, 0, 0);
4254 			if (err) {
4255 				btrfs_abort_transaction(trans, root, err);
4256 				btrfs_end_transaction(trans, root);
4257 				break;
4258 			}
4259 
4260 			btrfs_drop_extent_cache(inode, cur_offset,
4261 						cur_offset + hole_size - 1, 0);
4262 			hole_em = alloc_extent_map();
4263 			if (!hole_em) {
4264 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4265 					&BTRFS_I(inode)->runtime_flags);
4266 				goto next;
4267 			}
4268 			hole_em->start = cur_offset;
4269 			hole_em->len = hole_size;
4270 			hole_em->orig_start = cur_offset;
4271 
4272 			hole_em->block_start = EXTENT_MAP_HOLE;
4273 			hole_em->block_len = 0;
4274 			hole_em->orig_block_len = 0;
4275 			hole_em->ram_bytes = hole_size;
4276 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4277 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4278 			hole_em->generation = trans->transid;
4279 
4280 			while (1) {
4281 				write_lock(&em_tree->lock);
4282 				err = add_extent_mapping(em_tree, hole_em, 1);
4283 				write_unlock(&em_tree->lock);
4284 				if (err != -EEXIST)
4285 					break;
4286 				btrfs_drop_extent_cache(inode, cur_offset,
4287 							cur_offset +
4288 							hole_size - 1, 0);
4289 			}
4290 			free_extent_map(hole_em);
4291 next:
4292 			btrfs_update_inode(trans, root, inode);
4293 			btrfs_end_transaction(trans, root);
4294 		}
4295 		free_extent_map(em);
4296 		em = NULL;
4297 		cur_offset = last_byte;
4298 		if (cur_offset >= block_end)
4299 			break;
4300 	}
4301 
4302 	free_extent_map(em);
4303 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4304 			     GFP_NOFS);
4305 	return err;
4306 }
4307 
4308 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4309 {
4310 	struct btrfs_root *root = BTRFS_I(inode)->root;
4311 	struct btrfs_trans_handle *trans;
4312 	loff_t oldsize = i_size_read(inode);
4313 	loff_t newsize = attr->ia_size;
4314 	int mask = attr->ia_valid;
4315 	int ret;
4316 
4317 	/*
4318 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4319 	 * special case where we need to update the times despite not having
4320 	 * these flags set.  For all other operations the VFS set these flags
4321 	 * explicitly if it wants a timestamp update.
4322 	 */
4323 	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
4324 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
4325 
4326 	if (newsize > oldsize) {
4327 		truncate_pagecache(inode, oldsize, newsize);
4328 		ret = btrfs_cont_expand(inode, oldsize, newsize);
4329 		if (ret)
4330 			return ret;
4331 
4332 		trans = btrfs_start_transaction(root, 1);
4333 		if (IS_ERR(trans))
4334 			return PTR_ERR(trans);
4335 
4336 		i_size_write(inode, newsize);
4337 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4338 		ret = btrfs_update_inode(trans, root, inode);
4339 		btrfs_end_transaction(trans, root);
4340 	} else {
4341 
4342 		/*
4343 		 * We're truncating a file that used to have good data down to
4344 		 * zero. Make sure it gets into the ordered flush list so that
4345 		 * any new writes get down to disk quickly.
4346 		 */
4347 		if (newsize == 0)
4348 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4349 				&BTRFS_I(inode)->runtime_flags);
4350 
4351 		/*
4352 		 * 1 for the orphan item we're going to add
4353 		 * 1 for the orphan item deletion.
4354 		 */
4355 		trans = btrfs_start_transaction(root, 2);
4356 		if (IS_ERR(trans))
4357 			return PTR_ERR(trans);
4358 
4359 		/*
4360 		 * We need to do this in case we fail at _any_ point during the
4361 		 * actual truncate.  Once we do the truncate_setsize we could
4362 		 * invalidate pages which forces any outstanding ordered io to
4363 		 * be instantly completed which will give us extents that need
4364 		 * to be truncated.  If we fail to get an orphan inode down we
4365 		 * could have left over extents that were never meant to live,
4366 		 * so we need to garuntee from this point on that everything
4367 		 * will be consistent.
4368 		 */
4369 		ret = btrfs_orphan_add(trans, inode);
4370 		btrfs_end_transaction(trans, root);
4371 		if (ret)
4372 			return ret;
4373 
4374 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
4375 		truncate_setsize(inode, newsize);
4376 
4377 		/* Disable nonlocked read DIO to avoid the end less truncate */
4378 		btrfs_inode_block_unlocked_dio(inode);
4379 		inode_dio_wait(inode);
4380 		btrfs_inode_resume_unlocked_dio(inode);
4381 
4382 		ret = btrfs_truncate(inode);
4383 		if (ret && inode->i_nlink)
4384 			btrfs_orphan_del(NULL, inode);
4385 	}
4386 
4387 	return ret;
4388 }
4389 
4390 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4391 {
4392 	struct inode *inode = dentry->d_inode;
4393 	struct btrfs_root *root = BTRFS_I(inode)->root;
4394 	int err;
4395 
4396 	if (btrfs_root_readonly(root))
4397 		return -EROFS;
4398 
4399 	err = inode_change_ok(inode, attr);
4400 	if (err)
4401 		return err;
4402 
4403 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4404 		err = btrfs_setsize(inode, attr);
4405 		if (err)
4406 			return err;
4407 	}
4408 
4409 	if (attr->ia_valid) {
4410 		setattr_copy(inode, attr);
4411 		inode_inc_iversion(inode);
4412 		err = btrfs_dirty_inode(inode);
4413 
4414 		if (!err && attr->ia_valid & ATTR_MODE)
4415 			err = btrfs_acl_chmod(inode);
4416 	}
4417 
4418 	return err;
4419 }
4420 
4421 void btrfs_evict_inode(struct inode *inode)
4422 {
4423 	struct btrfs_trans_handle *trans;
4424 	struct btrfs_root *root = BTRFS_I(inode)->root;
4425 	struct btrfs_block_rsv *rsv, *global_rsv;
4426 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4427 	int ret;
4428 
4429 	trace_btrfs_inode_evict(inode);
4430 
4431 	truncate_inode_pages(&inode->i_data, 0);
4432 	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
4433 			       btrfs_is_free_space_inode(inode)))
4434 		goto no_delete;
4435 
4436 	if (is_bad_inode(inode)) {
4437 		btrfs_orphan_del(NULL, inode);
4438 		goto no_delete;
4439 	}
4440 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4441 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
4442 
4443 	if (root->fs_info->log_root_recovering) {
4444 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4445 				 &BTRFS_I(inode)->runtime_flags));
4446 		goto no_delete;
4447 	}
4448 
4449 	if (inode->i_nlink > 0) {
4450 		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
4451 		goto no_delete;
4452 	}
4453 
4454 	ret = btrfs_commit_inode_delayed_inode(inode);
4455 	if (ret) {
4456 		btrfs_orphan_del(NULL, inode);
4457 		goto no_delete;
4458 	}
4459 
4460 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4461 	if (!rsv) {
4462 		btrfs_orphan_del(NULL, inode);
4463 		goto no_delete;
4464 	}
4465 	rsv->size = min_size;
4466 	rsv->failfast = 1;
4467 	global_rsv = &root->fs_info->global_block_rsv;
4468 
4469 	btrfs_i_size_write(inode, 0);
4470 
4471 	/*
4472 	 * This is a bit simpler than btrfs_truncate since we've already
4473 	 * reserved our space for our orphan item in the unlink, so we just
4474 	 * need to reserve some slack space in case we add bytes and update
4475 	 * inode item when doing the truncate.
4476 	 */
4477 	while (1) {
4478 		ret = btrfs_block_rsv_refill(root, rsv, min_size,
4479 					     BTRFS_RESERVE_FLUSH_LIMIT);
4480 
4481 		/*
4482 		 * Try and steal from the global reserve since we will
4483 		 * likely not use this space anyway, we want to try as
4484 		 * hard as possible to get this to work.
4485 		 */
4486 		if (ret)
4487 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4488 
4489 		if (ret) {
4490 			btrfs_warn(root->fs_info,
4491 				"Could not get space for a delete, will truncate on mount %d",
4492 				ret);
4493 			btrfs_orphan_del(NULL, inode);
4494 			btrfs_free_block_rsv(root, rsv);
4495 			goto no_delete;
4496 		}
4497 
4498 		trans = btrfs_join_transaction(root);
4499 		if (IS_ERR(trans)) {
4500 			btrfs_orphan_del(NULL, inode);
4501 			btrfs_free_block_rsv(root, rsv);
4502 			goto no_delete;
4503 		}
4504 
4505 		trans->block_rsv = rsv;
4506 
4507 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4508 		if (ret != -ENOSPC)
4509 			break;
4510 
4511 		trans->block_rsv = &root->fs_info->trans_block_rsv;
4512 		btrfs_end_transaction(trans, root);
4513 		trans = NULL;
4514 		btrfs_btree_balance_dirty(root);
4515 	}
4516 
4517 	btrfs_free_block_rsv(root, rsv);
4518 
4519 	/*
4520 	 * Errors here aren't a big deal, it just means we leave orphan items
4521 	 * in the tree.  They will be cleaned up on the next mount.
4522 	 */
4523 	if (ret == 0) {
4524 		trans->block_rsv = root->orphan_block_rsv;
4525 		btrfs_orphan_del(trans, inode);
4526 	} else {
4527 		btrfs_orphan_del(NULL, inode);
4528 	}
4529 
4530 	trans->block_rsv = &root->fs_info->trans_block_rsv;
4531 	if (!(root == root->fs_info->tree_root ||
4532 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4533 		btrfs_return_ino(root, btrfs_ino(inode));
4534 
4535 	btrfs_end_transaction(trans, root);
4536 	btrfs_btree_balance_dirty(root);
4537 no_delete:
4538 	btrfs_remove_delayed_node(inode);
4539 	clear_inode(inode);
4540 	return;
4541 }
4542 
4543 /*
4544  * this returns the key found in the dir entry in the location pointer.
4545  * If no dir entries were found, location->objectid is 0.
4546  */
4547 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4548 			       struct btrfs_key *location)
4549 {
4550 	const char *name = dentry->d_name.name;
4551 	int namelen = dentry->d_name.len;
4552 	struct btrfs_dir_item *di;
4553 	struct btrfs_path *path;
4554 	struct btrfs_root *root = BTRFS_I(dir)->root;
4555 	int ret = 0;
4556 
4557 	path = btrfs_alloc_path();
4558 	if (!path)
4559 		return -ENOMEM;
4560 
4561 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4562 				    namelen, 0);
4563 	if (IS_ERR(di))
4564 		ret = PTR_ERR(di);
4565 
4566 	if (IS_ERR_OR_NULL(di))
4567 		goto out_err;
4568 
4569 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4570 out:
4571 	btrfs_free_path(path);
4572 	return ret;
4573 out_err:
4574 	location->objectid = 0;
4575 	goto out;
4576 }
4577 
4578 /*
4579  * when we hit a tree root in a directory, the btrfs part of the inode
4580  * needs to be changed to reflect the root directory of the tree root.  This
4581  * is kind of like crossing a mount point.
4582  */
4583 static int fixup_tree_root_location(struct btrfs_root *root,
4584 				    struct inode *dir,
4585 				    struct dentry *dentry,
4586 				    struct btrfs_key *location,
4587 				    struct btrfs_root **sub_root)
4588 {
4589 	struct btrfs_path *path;
4590 	struct btrfs_root *new_root;
4591 	struct btrfs_root_ref *ref;
4592 	struct extent_buffer *leaf;
4593 	int ret;
4594 	int err = 0;
4595 
4596 	path = btrfs_alloc_path();
4597 	if (!path) {
4598 		err = -ENOMEM;
4599 		goto out;
4600 	}
4601 
4602 	err = -ENOENT;
4603 	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
4604 				  BTRFS_I(dir)->root->root_key.objectid,
4605 				  location->objectid);
4606 	if (ret) {
4607 		if (ret < 0)
4608 			err = ret;
4609 		goto out;
4610 	}
4611 
4612 	leaf = path->nodes[0];
4613 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4614 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4615 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4616 		goto out;
4617 
4618 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4619 				   (unsigned long)(ref + 1),
4620 				   dentry->d_name.len);
4621 	if (ret)
4622 		goto out;
4623 
4624 	btrfs_release_path(path);
4625 
4626 	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4627 	if (IS_ERR(new_root)) {
4628 		err = PTR_ERR(new_root);
4629 		goto out;
4630 	}
4631 
4632 	*sub_root = new_root;
4633 	location->objectid = btrfs_root_dirid(&new_root->root_item);
4634 	location->type = BTRFS_INODE_ITEM_KEY;
4635 	location->offset = 0;
4636 	err = 0;
4637 out:
4638 	btrfs_free_path(path);
4639 	return err;
4640 }
4641 
4642 static void inode_tree_add(struct inode *inode)
4643 {
4644 	struct btrfs_root *root = BTRFS_I(inode)->root;
4645 	struct btrfs_inode *entry;
4646 	struct rb_node **p;
4647 	struct rb_node *parent;
4648 	u64 ino = btrfs_ino(inode);
4649 
4650 	if (inode_unhashed(inode))
4651 		return;
4652 again:
4653 	parent = NULL;
4654 	spin_lock(&root->inode_lock);
4655 	p = &root->inode_tree.rb_node;
4656 	while (*p) {
4657 		parent = *p;
4658 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
4659 
4660 		if (ino < btrfs_ino(&entry->vfs_inode))
4661 			p = &parent->rb_left;
4662 		else if (ino > btrfs_ino(&entry->vfs_inode))
4663 			p = &parent->rb_right;
4664 		else {
4665 			WARN_ON(!(entry->vfs_inode.i_state &
4666 				  (I_WILL_FREE | I_FREEING)));
4667 			rb_erase(parent, &root->inode_tree);
4668 			RB_CLEAR_NODE(parent);
4669 			spin_unlock(&root->inode_lock);
4670 			goto again;
4671 		}
4672 	}
4673 	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
4674 	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4675 	spin_unlock(&root->inode_lock);
4676 }
4677 
4678 static void inode_tree_del(struct inode *inode)
4679 {
4680 	struct btrfs_root *root = BTRFS_I(inode)->root;
4681 	int empty = 0;
4682 
4683 	spin_lock(&root->inode_lock);
4684 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4685 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4686 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4687 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4688 	}
4689 	spin_unlock(&root->inode_lock);
4690 
4691 	/*
4692 	 * Free space cache has inodes in the tree root, but the tree root has a
4693 	 * root_refs of 0, so this could end up dropping the tree root as a
4694 	 * snapshot, so we need the extra !root->fs_info->tree_root check to
4695 	 * make sure we don't drop it.
4696 	 */
4697 	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
4698 	    root != root->fs_info->tree_root) {
4699 		synchronize_srcu(&root->fs_info->subvol_srcu);
4700 		spin_lock(&root->inode_lock);
4701 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4702 		spin_unlock(&root->inode_lock);
4703 		if (empty)
4704 			btrfs_add_dead_root(root);
4705 	}
4706 }
4707 
4708 void btrfs_invalidate_inodes(struct btrfs_root *root)
4709 {
4710 	struct rb_node *node;
4711 	struct rb_node *prev;
4712 	struct btrfs_inode *entry;
4713 	struct inode *inode;
4714 	u64 objectid = 0;
4715 
4716 	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4717 
4718 	spin_lock(&root->inode_lock);
4719 again:
4720 	node = root->inode_tree.rb_node;
4721 	prev = NULL;
4722 	while (node) {
4723 		prev = node;
4724 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4725 
4726 		if (objectid < btrfs_ino(&entry->vfs_inode))
4727 			node = node->rb_left;
4728 		else if (objectid > btrfs_ino(&entry->vfs_inode))
4729 			node = node->rb_right;
4730 		else
4731 			break;
4732 	}
4733 	if (!node) {
4734 		while (prev) {
4735 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4736 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
4737 				node = prev;
4738 				break;
4739 			}
4740 			prev = rb_next(prev);
4741 		}
4742 	}
4743 	while (node) {
4744 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4745 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
4746 		inode = igrab(&entry->vfs_inode);
4747 		if (inode) {
4748 			spin_unlock(&root->inode_lock);
4749 			if (atomic_read(&inode->i_count) > 1)
4750 				d_prune_aliases(inode);
4751 			/*
4752 			 * btrfs_drop_inode will have it removed from
4753 			 * the inode cache when its usage count
4754 			 * hits zero.
4755 			 */
4756 			iput(inode);
4757 			cond_resched();
4758 			spin_lock(&root->inode_lock);
4759 			goto again;
4760 		}
4761 
4762 		if (cond_resched_lock(&root->inode_lock))
4763 			goto again;
4764 
4765 		node = rb_next(node);
4766 	}
4767 	spin_unlock(&root->inode_lock);
4768 }
4769 
4770 static int btrfs_init_locked_inode(struct inode *inode, void *p)
4771 {
4772 	struct btrfs_iget_args *args = p;
4773 	inode->i_ino = args->ino;
4774 	BTRFS_I(inode)->root = args->root;
4775 	return 0;
4776 }
4777 
4778 static int btrfs_find_actor(struct inode *inode, void *opaque)
4779 {
4780 	struct btrfs_iget_args *args = opaque;
4781 	return args->ino == btrfs_ino(inode) &&
4782 		args->root == BTRFS_I(inode)->root;
4783 }
4784 
4785 static struct inode *btrfs_iget_locked(struct super_block *s,
4786 				       u64 objectid,
4787 				       struct btrfs_root *root)
4788 {
4789 	struct inode *inode;
4790 	struct btrfs_iget_args args;
4791 	args.ino = objectid;
4792 	args.root = root;
4793 
4794 	inode = iget5_locked(s, objectid, btrfs_find_actor,
4795 			     btrfs_init_locked_inode,
4796 			     (void *)&args);
4797 	return inode;
4798 }
4799 
4800 /* Get an inode object given its location and corresponding root.
4801  * Returns in *is_new if the inode was read from disk
4802  */
4803 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4804 			 struct btrfs_root *root, int *new)
4805 {
4806 	struct inode *inode;
4807 
4808 	inode = btrfs_iget_locked(s, location->objectid, root);
4809 	if (!inode)
4810 		return ERR_PTR(-ENOMEM);
4811 
4812 	if (inode->i_state & I_NEW) {
4813 		BTRFS_I(inode)->root = root;
4814 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4815 		btrfs_read_locked_inode(inode);
4816 		if (!is_bad_inode(inode)) {
4817 			inode_tree_add(inode);
4818 			unlock_new_inode(inode);
4819 			if (new)
4820 				*new = 1;
4821 		} else {
4822 			unlock_new_inode(inode);
4823 			iput(inode);
4824 			inode = ERR_PTR(-ESTALE);
4825 		}
4826 	}
4827 
4828 	return inode;
4829 }
4830 
4831 static struct inode *new_simple_dir(struct super_block *s,
4832 				    struct btrfs_key *key,
4833 				    struct btrfs_root *root)
4834 {
4835 	struct inode *inode = new_inode(s);
4836 
4837 	if (!inode)
4838 		return ERR_PTR(-ENOMEM);
4839 
4840 	BTRFS_I(inode)->root = root;
4841 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4842 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4843 
4844 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4845 	inode->i_op = &btrfs_dir_ro_inode_operations;
4846 	inode->i_fop = &simple_dir_operations;
4847 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
4848 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4849 
4850 	return inode;
4851 }
4852 
4853 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4854 {
4855 	struct inode *inode;
4856 	struct btrfs_root *root = BTRFS_I(dir)->root;
4857 	struct btrfs_root *sub_root = root;
4858 	struct btrfs_key location;
4859 	int index;
4860 	int ret = 0;
4861 
4862 	if (dentry->d_name.len > BTRFS_NAME_LEN)
4863 		return ERR_PTR(-ENAMETOOLONG);
4864 
4865 	ret = btrfs_inode_by_name(dir, dentry, &location);
4866 	if (ret < 0)
4867 		return ERR_PTR(ret);
4868 
4869 	if (location.objectid == 0)
4870 		return NULL;
4871 
4872 	if (location.type == BTRFS_INODE_ITEM_KEY) {
4873 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
4874 		return inode;
4875 	}
4876 
4877 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
4878 
4879 	index = srcu_read_lock(&root->fs_info->subvol_srcu);
4880 	ret = fixup_tree_root_location(root, dir, dentry,
4881 				       &location, &sub_root);
4882 	if (ret < 0) {
4883 		if (ret != -ENOENT)
4884 			inode = ERR_PTR(ret);
4885 		else
4886 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
4887 	} else {
4888 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
4889 	}
4890 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4891 
4892 	if (!IS_ERR(inode) && root != sub_root) {
4893 		down_read(&root->fs_info->cleanup_work_sem);
4894 		if (!(inode->i_sb->s_flags & MS_RDONLY))
4895 			ret = btrfs_orphan_cleanup(sub_root);
4896 		up_read(&root->fs_info->cleanup_work_sem);
4897 		if (ret) {
4898 			iput(inode);
4899 			inode = ERR_PTR(ret);
4900 		}
4901 	}
4902 
4903 	return inode;
4904 }
4905 
4906 static int btrfs_dentry_delete(const struct dentry *dentry)
4907 {
4908 	struct btrfs_root *root;
4909 	struct inode *inode = dentry->d_inode;
4910 
4911 	if (!inode && !IS_ROOT(dentry))
4912 		inode = dentry->d_parent->d_inode;
4913 
4914 	if (inode) {
4915 		root = BTRFS_I(inode)->root;
4916 		if (btrfs_root_refs(&root->root_item) == 0)
4917 			return 1;
4918 
4919 		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4920 			return 1;
4921 	}
4922 	return 0;
4923 }
4924 
4925 static void btrfs_dentry_release(struct dentry *dentry)
4926 {
4927 	if (dentry->d_fsdata)
4928 		kfree(dentry->d_fsdata);
4929 }
4930 
4931 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4932 				   unsigned int flags)
4933 {
4934 	struct dentry *ret;
4935 
4936 	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4937 	return ret;
4938 }
4939 
4940 unsigned char btrfs_filetype_table[] = {
4941 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4942 };
4943 
4944 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
4945 {
4946 	struct inode *inode = file_inode(file);
4947 	struct btrfs_root *root = BTRFS_I(inode)->root;
4948 	struct btrfs_item *item;
4949 	struct btrfs_dir_item *di;
4950 	struct btrfs_key key;
4951 	struct btrfs_key found_key;
4952 	struct btrfs_path *path;
4953 	struct list_head ins_list;
4954 	struct list_head del_list;
4955 	int ret;
4956 	struct extent_buffer *leaf;
4957 	int slot;
4958 	unsigned char d_type;
4959 	int over = 0;
4960 	u32 di_cur;
4961 	u32 di_total;
4962 	u32 di_len;
4963 	int key_type = BTRFS_DIR_INDEX_KEY;
4964 	char tmp_name[32];
4965 	char *name_ptr;
4966 	int name_len;
4967 	int is_curr = 0;	/* ctx->pos points to the current index? */
4968 
4969 	/* FIXME, use a real flag for deciding about the key type */
4970 	if (root->fs_info->tree_root == root)
4971 		key_type = BTRFS_DIR_ITEM_KEY;
4972 
4973 	if (!dir_emit_dots(file, ctx))
4974 		return 0;
4975 
4976 	path = btrfs_alloc_path();
4977 	if (!path)
4978 		return -ENOMEM;
4979 
4980 	path->reada = 1;
4981 
4982 	if (key_type == BTRFS_DIR_INDEX_KEY) {
4983 		INIT_LIST_HEAD(&ins_list);
4984 		INIT_LIST_HEAD(&del_list);
4985 		btrfs_get_delayed_items(inode, &ins_list, &del_list);
4986 	}
4987 
4988 	btrfs_set_key_type(&key, key_type);
4989 	key.offset = ctx->pos;
4990 	key.objectid = btrfs_ino(inode);
4991 
4992 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4993 	if (ret < 0)
4994 		goto err;
4995 
4996 	while (1) {
4997 		leaf = path->nodes[0];
4998 		slot = path->slots[0];
4999 		if (slot >= btrfs_header_nritems(leaf)) {
5000 			ret = btrfs_next_leaf(root, path);
5001 			if (ret < 0)
5002 				goto err;
5003 			else if (ret > 0)
5004 				break;
5005 			continue;
5006 		}
5007 
5008 		item = btrfs_item_nr(leaf, slot);
5009 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5010 
5011 		if (found_key.objectid != key.objectid)
5012 			break;
5013 		if (btrfs_key_type(&found_key) != key_type)
5014 			break;
5015 		if (found_key.offset < ctx->pos)
5016 			goto next;
5017 		if (key_type == BTRFS_DIR_INDEX_KEY &&
5018 		    btrfs_should_delete_dir_index(&del_list,
5019 						  found_key.offset))
5020 			goto next;
5021 
5022 		ctx->pos = found_key.offset;
5023 		is_curr = 1;
5024 
5025 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5026 		di_cur = 0;
5027 		di_total = btrfs_item_size(leaf, item);
5028 
5029 		while (di_cur < di_total) {
5030 			struct btrfs_key location;
5031 
5032 			if (verify_dir_item(root, leaf, di))
5033 				break;
5034 
5035 			name_len = btrfs_dir_name_len(leaf, di);
5036 			if (name_len <= sizeof(tmp_name)) {
5037 				name_ptr = tmp_name;
5038 			} else {
5039 				name_ptr = kmalloc(name_len, GFP_NOFS);
5040 				if (!name_ptr) {
5041 					ret = -ENOMEM;
5042 					goto err;
5043 				}
5044 			}
5045 			read_extent_buffer(leaf, name_ptr,
5046 					   (unsigned long)(di + 1), name_len);
5047 
5048 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5049 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
5050 
5051 
5052 			/* is this a reference to our own snapshot? If so
5053 			 * skip it.
5054 			 *
5055 			 * In contrast to old kernels, we insert the snapshot's
5056 			 * dir item and dir index after it has been created, so
5057 			 * we won't find a reference to our own snapshot. We
5058 			 * still keep the following code for backward
5059 			 * compatibility.
5060 			 */
5061 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
5062 			    location.objectid == root->root_key.objectid) {
5063 				over = 0;
5064 				goto skip;
5065 			}
5066 			over = !dir_emit(ctx, name_ptr, name_len,
5067 				       location.objectid, d_type);
5068 
5069 skip:
5070 			if (name_ptr != tmp_name)
5071 				kfree(name_ptr);
5072 
5073 			if (over)
5074 				goto nopos;
5075 			di_len = btrfs_dir_name_len(leaf, di) +
5076 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5077 			di_cur += di_len;
5078 			di = (struct btrfs_dir_item *)((char *)di + di_len);
5079 		}
5080 next:
5081 		path->slots[0]++;
5082 	}
5083 
5084 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5085 		if (is_curr)
5086 			ctx->pos++;
5087 		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5088 		if (ret)
5089 			goto nopos;
5090 	}
5091 
5092 	/* Reached end of directory/root. Bump pos past the last item. */
5093 	ctx->pos++;
5094 
5095 	/*
5096 	 * Stop new entries from being returned after we return the last
5097 	 * entry.
5098 	 *
5099 	 * New directory entries are assigned a strictly increasing
5100 	 * offset.  This means that new entries created during readdir
5101 	 * are *guaranteed* to be seen in the future by that readdir.
5102 	 * This has broken buggy programs which operate on names as
5103 	 * they're returned by readdir.  Until we re-use freed offsets
5104 	 * we have this hack to stop new entries from being returned
5105 	 * under the assumption that they'll never reach this huge
5106 	 * offset.
5107 	 *
5108 	 * This is being careful not to overflow 32bit loff_t unless the
5109 	 * last entry requires it because doing so has broken 32bit apps
5110 	 * in the past.
5111 	 */
5112 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5113 		if (ctx->pos >= INT_MAX)
5114 			ctx->pos = LLONG_MAX;
5115 		else
5116 			ctx->pos = INT_MAX;
5117 	}
5118 nopos:
5119 	ret = 0;
5120 err:
5121 	if (key_type == BTRFS_DIR_INDEX_KEY)
5122 		btrfs_put_delayed_items(&ins_list, &del_list);
5123 	btrfs_free_path(path);
5124 	return ret;
5125 }
5126 
5127 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5128 {
5129 	struct btrfs_root *root = BTRFS_I(inode)->root;
5130 	struct btrfs_trans_handle *trans;
5131 	int ret = 0;
5132 	bool nolock = false;
5133 
5134 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5135 		return 0;
5136 
5137 	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5138 		nolock = true;
5139 
5140 	if (wbc->sync_mode == WB_SYNC_ALL) {
5141 		if (nolock)
5142 			trans = btrfs_join_transaction_nolock(root);
5143 		else
5144 			trans = btrfs_join_transaction(root);
5145 		if (IS_ERR(trans))
5146 			return PTR_ERR(trans);
5147 		ret = btrfs_commit_transaction(trans, root);
5148 	}
5149 	return ret;
5150 }
5151 
5152 /*
5153  * This is somewhat expensive, updating the tree every time the
5154  * inode changes.  But, it is most likely to find the inode in cache.
5155  * FIXME, needs more benchmarking...there are no reasons other than performance
5156  * to keep or drop this code.
5157  */
5158 static int btrfs_dirty_inode(struct inode *inode)
5159 {
5160 	struct btrfs_root *root = BTRFS_I(inode)->root;
5161 	struct btrfs_trans_handle *trans;
5162 	int ret;
5163 
5164 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5165 		return 0;
5166 
5167 	trans = btrfs_join_transaction(root);
5168 	if (IS_ERR(trans))
5169 		return PTR_ERR(trans);
5170 
5171 	ret = btrfs_update_inode(trans, root, inode);
5172 	if (ret && ret == -ENOSPC) {
5173 		/* whoops, lets try again with the full transaction */
5174 		btrfs_end_transaction(trans, root);
5175 		trans = btrfs_start_transaction(root, 1);
5176 		if (IS_ERR(trans))
5177 			return PTR_ERR(trans);
5178 
5179 		ret = btrfs_update_inode(trans, root, inode);
5180 	}
5181 	btrfs_end_transaction(trans, root);
5182 	if (BTRFS_I(inode)->delayed_node)
5183 		btrfs_balance_delayed_items(root);
5184 
5185 	return ret;
5186 }
5187 
5188 /*
5189  * This is a copy of file_update_time.  We need this so we can return error on
5190  * ENOSPC for updating the inode in the case of file write and mmap writes.
5191  */
5192 static int btrfs_update_time(struct inode *inode, struct timespec *now,
5193 			     int flags)
5194 {
5195 	struct btrfs_root *root = BTRFS_I(inode)->root;
5196 
5197 	if (btrfs_root_readonly(root))
5198 		return -EROFS;
5199 
5200 	if (flags & S_VERSION)
5201 		inode_inc_iversion(inode);
5202 	if (flags & S_CTIME)
5203 		inode->i_ctime = *now;
5204 	if (flags & S_MTIME)
5205 		inode->i_mtime = *now;
5206 	if (flags & S_ATIME)
5207 		inode->i_atime = *now;
5208 	return btrfs_dirty_inode(inode);
5209 }
5210 
5211 /*
5212  * find the highest existing sequence number in a directory
5213  * and then set the in-memory index_cnt variable to reflect
5214  * free sequence numbers
5215  */
5216 static int btrfs_set_inode_index_count(struct inode *inode)
5217 {
5218 	struct btrfs_root *root = BTRFS_I(inode)->root;
5219 	struct btrfs_key key, found_key;
5220 	struct btrfs_path *path;
5221 	struct extent_buffer *leaf;
5222 	int ret;
5223 
5224 	key.objectid = btrfs_ino(inode);
5225 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5226 	key.offset = (u64)-1;
5227 
5228 	path = btrfs_alloc_path();
5229 	if (!path)
5230 		return -ENOMEM;
5231 
5232 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5233 	if (ret < 0)
5234 		goto out;
5235 	/* FIXME: we should be able to handle this */
5236 	if (ret == 0)
5237 		goto out;
5238 	ret = 0;
5239 
5240 	/*
5241 	 * MAGIC NUMBER EXPLANATION:
5242 	 * since we search a directory based on f_pos we have to start at 2
5243 	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5244 	 * else has to start at 2
5245 	 */
5246 	if (path->slots[0] == 0) {
5247 		BTRFS_I(inode)->index_cnt = 2;
5248 		goto out;
5249 	}
5250 
5251 	path->slots[0]--;
5252 
5253 	leaf = path->nodes[0];
5254 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5255 
5256 	if (found_key.objectid != btrfs_ino(inode) ||
5257 	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5258 		BTRFS_I(inode)->index_cnt = 2;
5259 		goto out;
5260 	}
5261 
5262 	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5263 out:
5264 	btrfs_free_path(path);
5265 	return ret;
5266 }
5267 
5268 /*
5269  * helper to find a free sequence number in a given directory.  This current
5270  * code is very simple, later versions will do smarter things in the btree
5271  */
5272 int btrfs_set_inode_index(struct inode *dir, u64 *index)
5273 {
5274 	int ret = 0;
5275 
5276 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5277 		ret = btrfs_inode_delayed_dir_index_count(dir);
5278 		if (ret) {
5279 			ret = btrfs_set_inode_index_count(dir);
5280 			if (ret)
5281 				return ret;
5282 		}
5283 	}
5284 
5285 	*index = BTRFS_I(dir)->index_cnt;
5286 	BTRFS_I(dir)->index_cnt++;
5287 
5288 	return ret;
5289 }
5290 
5291 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5292 				     struct btrfs_root *root,
5293 				     struct inode *dir,
5294 				     const char *name, int name_len,
5295 				     u64 ref_objectid, u64 objectid,
5296 				     umode_t mode, u64 *index)
5297 {
5298 	struct inode *inode;
5299 	struct btrfs_inode_item *inode_item;
5300 	struct btrfs_key *location;
5301 	struct btrfs_path *path;
5302 	struct btrfs_inode_ref *ref;
5303 	struct btrfs_key key[2];
5304 	u32 sizes[2];
5305 	unsigned long ptr;
5306 	int ret;
5307 	int owner;
5308 
5309 	path = btrfs_alloc_path();
5310 	if (!path)
5311 		return ERR_PTR(-ENOMEM);
5312 
5313 	inode = new_inode(root->fs_info->sb);
5314 	if (!inode) {
5315 		btrfs_free_path(path);
5316 		return ERR_PTR(-ENOMEM);
5317 	}
5318 
5319 	/*
5320 	 * we have to initialize this early, so we can reclaim the inode
5321 	 * number if we fail afterwards in this function.
5322 	 */
5323 	inode->i_ino = objectid;
5324 
5325 	if (dir) {
5326 		trace_btrfs_inode_request(dir);
5327 
5328 		ret = btrfs_set_inode_index(dir, index);
5329 		if (ret) {
5330 			btrfs_free_path(path);
5331 			iput(inode);
5332 			return ERR_PTR(ret);
5333 		}
5334 	}
5335 	/*
5336 	 * index_cnt is ignored for everything but a dir,
5337 	 * btrfs_get_inode_index_count has an explanation for the magic
5338 	 * number
5339 	 */
5340 	BTRFS_I(inode)->index_cnt = 2;
5341 	BTRFS_I(inode)->root = root;
5342 	BTRFS_I(inode)->generation = trans->transid;
5343 	inode->i_generation = BTRFS_I(inode)->generation;
5344 
5345 	/*
5346 	 * We could have gotten an inode number from somebody who was fsynced
5347 	 * and then removed in this same transaction, so let's just set full
5348 	 * sync since it will be a full sync anyway and this will blow away the
5349 	 * old info in the log.
5350 	 */
5351 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5352 
5353 	if (S_ISDIR(mode))
5354 		owner = 0;
5355 	else
5356 		owner = 1;
5357 
5358 	key[0].objectid = objectid;
5359 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5360 	key[0].offset = 0;
5361 
5362 	/*
5363 	 * Start new inodes with an inode_ref. This is slightly more
5364 	 * efficient for small numbers of hard links since they will
5365 	 * be packed into one item. Extended refs will kick in if we
5366 	 * add more hard links than can fit in the ref item.
5367 	 */
5368 	key[1].objectid = objectid;
5369 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5370 	key[1].offset = ref_objectid;
5371 
5372 	sizes[0] = sizeof(struct btrfs_inode_item);
5373 	sizes[1] = name_len + sizeof(*ref);
5374 
5375 	path->leave_spinning = 1;
5376 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
5377 	if (ret != 0)
5378 		goto fail;
5379 
5380 	inode_init_owner(inode, dir, mode);
5381 	inode_set_bytes(inode, 0);
5382 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5383 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5384 				  struct btrfs_inode_item);
5385 	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5386 			     sizeof(*inode_item));
5387 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
5388 
5389 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5390 			     struct btrfs_inode_ref);
5391 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5392 	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5393 	ptr = (unsigned long)(ref + 1);
5394 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
5395 
5396 	btrfs_mark_buffer_dirty(path->nodes[0]);
5397 	btrfs_free_path(path);
5398 
5399 	location = &BTRFS_I(inode)->location;
5400 	location->objectid = objectid;
5401 	location->offset = 0;
5402 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5403 
5404 	btrfs_inherit_iflags(inode, dir);
5405 
5406 	if (S_ISREG(mode)) {
5407 		if (btrfs_test_opt(root, NODATASUM))
5408 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5409 		if (btrfs_test_opt(root, NODATACOW))
5410 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5411 				BTRFS_INODE_NODATASUM;
5412 	}
5413 
5414 	insert_inode_hash(inode);
5415 	inode_tree_add(inode);
5416 
5417 	trace_btrfs_inode_new(inode);
5418 	btrfs_set_inode_last_trans(trans, inode);
5419 
5420 	btrfs_update_root_times(trans, root);
5421 
5422 	return inode;
5423 fail:
5424 	if (dir)
5425 		BTRFS_I(dir)->index_cnt--;
5426 	btrfs_free_path(path);
5427 	iput(inode);
5428 	return ERR_PTR(ret);
5429 }
5430 
5431 static inline u8 btrfs_inode_type(struct inode *inode)
5432 {
5433 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5434 }
5435 
5436 /*
5437  * utility function to add 'inode' into 'parent_inode' with
5438  * a give name and a given sequence number.
5439  * if 'add_backref' is true, also insert a backref from the
5440  * inode to the parent directory.
5441  */
5442 int btrfs_add_link(struct btrfs_trans_handle *trans,
5443 		   struct inode *parent_inode, struct inode *inode,
5444 		   const char *name, int name_len, int add_backref, u64 index)
5445 {
5446 	int ret = 0;
5447 	struct btrfs_key key;
5448 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5449 	u64 ino = btrfs_ino(inode);
5450 	u64 parent_ino = btrfs_ino(parent_inode);
5451 
5452 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5453 		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5454 	} else {
5455 		key.objectid = ino;
5456 		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5457 		key.offset = 0;
5458 	}
5459 
5460 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5461 		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5462 					 key.objectid, root->root_key.objectid,
5463 					 parent_ino, index, name, name_len);
5464 	} else if (add_backref) {
5465 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5466 					     parent_ino, index);
5467 	}
5468 
5469 	/* Nothing to clean up yet */
5470 	if (ret)
5471 		return ret;
5472 
5473 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
5474 				    parent_inode, &key,
5475 				    btrfs_inode_type(inode), index);
5476 	if (ret == -EEXIST || ret == -EOVERFLOW)
5477 		goto fail_dir_item;
5478 	else if (ret) {
5479 		btrfs_abort_transaction(trans, root, ret);
5480 		return ret;
5481 	}
5482 
5483 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
5484 			   name_len * 2);
5485 	inode_inc_iversion(parent_inode);
5486 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5487 	ret = btrfs_update_inode(trans, root, parent_inode);
5488 	if (ret)
5489 		btrfs_abort_transaction(trans, root, ret);
5490 	return ret;
5491 
5492 fail_dir_item:
5493 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5494 		u64 local_index;
5495 		int err;
5496 		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5497 				 key.objectid, root->root_key.objectid,
5498 				 parent_ino, &local_index, name, name_len);
5499 
5500 	} else if (add_backref) {
5501 		u64 local_index;
5502 		int err;
5503 
5504 		err = btrfs_del_inode_ref(trans, root, name, name_len,
5505 					  ino, parent_ino, &local_index);
5506 	}
5507 	return ret;
5508 }
5509 
5510 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5511 			    struct inode *dir, struct dentry *dentry,
5512 			    struct inode *inode, int backref, u64 index)
5513 {
5514 	int err = btrfs_add_link(trans, dir, inode,
5515 				 dentry->d_name.name, dentry->d_name.len,
5516 				 backref, index);
5517 	if (err > 0)
5518 		err = -EEXIST;
5519 	return err;
5520 }
5521 
5522 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5523 			umode_t mode, dev_t rdev)
5524 {
5525 	struct btrfs_trans_handle *trans;
5526 	struct btrfs_root *root = BTRFS_I(dir)->root;
5527 	struct inode *inode = NULL;
5528 	int err;
5529 	int drop_inode = 0;
5530 	u64 objectid;
5531 	u64 index = 0;
5532 
5533 	if (!new_valid_dev(rdev))
5534 		return -EINVAL;
5535 
5536 	/*
5537 	 * 2 for inode item and ref
5538 	 * 2 for dir items
5539 	 * 1 for xattr if selinux is on
5540 	 */
5541 	trans = btrfs_start_transaction(root, 5);
5542 	if (IS_ERR(trans))
5543 		return PTR_ERR(trans);
5544 
5545 	err = btrfs_find_free_ino(root, &objectid);
5546 	if (err)
5547 		goto out_unlock;
5548 
5549 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5550 				dentry->d_name.len, btrfs_ino(dir), objectid,
5551 				mode, &index);
5552 	if (IS_ERR(inode)) {
5553 		err = PTR_ERR(inode);
5554 		goto out_unlock;
5555 	}
5556 
5557 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5558 	if (err) {
5559 		drop_inode = 1;
5560 		goto out_unlock;
5561 	}
5562 
5563 	/*
5564 	* If the active LSM wants to access the inode during
5565 	* d_instantiate it needs these. Smack checks to see
5566 	* if the filesystem supports xattrs by looking at the
5567 	* ops vector.
5568 	*/
5569 
5570 	inode->i_op = &btrfs_special_inode_operations;
5571 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5572 	if (err)
5573 		drop_inode = 1;
5574 	else {
5575 		init_special_inode(inode, inode->i_mode, rdev);
5576 		btrfs_update_inode(trans, root, inode);
5577 		d_instantiate(dentry, inode);
5578 	}
5579 out_unlock:
5580 	btrfs_end_transaction(trans, root);
5581 	btrfs_btree_balance_dirty(root);
5582 	if (drop_inode) {
5583 		inode_dec_link_count(inode);
5584 		iput(inode);
5585 	}
5586 	return err;
5587 }
5588 
5589 static int btrfs_create(struct inode *dir, struct dentry *dentry,
5590 			umode_t mode, bool excl)
5591 {
5592 	struct btrfs_trans_handle *trans;
5593 	struct btrfs_root *root = BTRFS_I(dir)->root;
5594 	struct inode *inode = NULL;
5595 	int drop_inode_on_err = 0;
5596 	int err;
5597 	u64 objectid;
5598 	u64 index = 0;
5599 
5600 	/*
5601 	 * 2 for inode item and ref
5602 	 * 2 for dir items
5603 	 * 1 for xattr if selinux is on
5604 	 */
5605 	trans = btrfs_start_transaction(root, 5);
5606 	if (IS_ERR(trans))
5607 		return PTR_ERR(trans);
5608 
5609 	err = btrfs_find_free_ino(root, &objectid);
5610 	if (err)
5611 		goto out_unlock;
5612 
5613 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5614 				dentry->d_name.len, btrfs_ino(dir), objectid,
5615 				mode, &index);
5616 	if (IS_ERR(inode)) {
5617 		err = PTR_ERR(inode);
5618 		goto out_unlock;
5619 	}
5620 	drop_inode_on_err = 1;
5621 
5622 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5623 	if (err)
5624 		goto out_unlock;
5625 
5626 	err = btrfs_update_inode(trans, root, inode);
5627 	if (err)
5628 		goto out_unlock;
5629 
5630 	/*
5631 	* If the active LSM wants to access the inode during
5632 	* d_instantiate it needs these. Smack checks to see
5633 	* if the filesystem supports xattrs by looking at the
5634 	* ops vector.
5635 	*/
5636 	inode->i_fop = &btrfs_file_operations;
5637 	inode->i_op = &btrfs_file_inode_operations;
5638 
5639 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5640 	if (err)
5641 		goto out_unlock;
5642 
5643 	inode->i_mapping->a_ops = &btrfs_aops;
5644 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5645 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5646 	d_instantiate(dentry, inode);
5647 
5648 out_unlock:
5649 	btrfs_end_transaction(trans, root);
5650 	if (err && drop_inode_on_err) {
5651 		inode_dec_link_count(inode);
5652 		iput(inode);
5653 	}
5654 	btrfs_btree_balance_dirty(root);
5655 	return err;
5656 }
5657 
5658 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5659 		      struct dentry *dentry)
5660 {
5661 	struct btrfs_trans_handle *trans;
5662 	struct btrfs_root *root = BTRFS_I(dir)->root;
5663 	struct inode *inode = old_dentry->d_inode;
5664 	u64 index;
5665 	int err;
5666 	int drop_inode = 0;
5667 
5668 	/* do not allow sys_link's with other subvols of the same device */
5669 	if (root->objectid != BTRFS_I(inode)->root->objectid)
5670 		return -EXDEV;
5671 
5672 	if (inode->i_nlink >= BTRFS_LINK_MAX)
5673 		return -EMLINK;
5674 
5675 	err = btrfs_set_inode_index(dir, &index);
5676 	if (err)
5677 		goto fail;
5678 
5679 	/*
5680 	 * 2 items for inode and inode ref
5681 	 * 2 items for dir items
5682 	 * 1 item for parent inode
5683 	 */
5684 	trans = btrfs_start_transaction(root, 5);
5685 	if (IS_ERR(trans)) {
5686 		err = PTR_ERR(trans);
5687 		goto fail;
5688 	}
5689 
5690 	btrfs_inc_nlink(inode);
5691 	inode_inc_iversion(inode);
5692 	inode->i_ctime = CURRENT_TIME;
5693 	ihold(inode);
5694 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5695 
5696 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5697 
5698 	if (err) {
5699 		drop_inode = 1;
5700 	} else {
5701 		struct dentry *parent = dentry->d_parent;
5702 		err = btrfs_update_inode(trans, root, inode);
5703 		if (err)
5704 			goto fail;
5705 		d_instantiate(dentry, inode);
5706 		btrfs_log_new_name(trans, inode, NULL, parent);
5707 	}
5708 
5709 	btrfs_end_transaction(trans, root);
5710 fail:
5711 	if (drop_inode) {
5712 		inode_dec_link_count(inode);
5713 		iput(inode);
5714 	}
5715 	btrfs_btree_balance_dirty(root);
5716 	return err;
5717 }
5718 
5719 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5720 {
5721 	struct inode *inode = NULL;
5722 	struct btrfs_trans_handle *trans;
5723 	struct btrfs_root *root = BTRFS_I(dir)->root;
5724 	int err = 0;
5725 	int drop_on_err = 0;
5726 	u64 objectid = 0;
5727 	u64 index = 0;
5728 
5729 	/*
5730 	 * 2 items for inode and ref
5731 	 * 2 items for dir items
5732 	 * 1 for xattr if selinux is on
5733 	 */
5734 	trans = btrfs_start_transaction(root, 5);
5735 	if (IS_ERR(trans))
5736 		return PTR_ERR(trans);
5737 
5738 	err = btrfs_find_free_ino(root, &objectid);
5739 	if (err)
5740 		goto out_fail;
5741 
5742 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5743 				dentry->d_name.len, btrfs_ino(dir), objectid,
5744 				S_IFDIR | mode, &index);
5745 	if (IS_ERR(inode)) {
5746 		err = PTR_ERR(inode);
5747 		goto out_fail;
5748 	}
5749 
5750 	drop_on_err = 1;
5751 
5752 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5753 	if (err)
5754 		goto out_fail;
5755 
5756 	inode->i_op = &btrfs_dir_inode_operations;
5757 	inode->i_fop = &btrfs_dir_file_operations;
5758 
5759 	btrfs_i_size_write(inode, 0);
5760 	err = btrfs_update_inode(trans, root, inode);
5761 	if (err)
5762 		goto out_fail;
5763 
5764 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
5765 			     dentry->d_name.len, 0, index);
5766 	if (err)
5767 		goto out_fail;
5768 
5769 	d_instantiate(dentry, inode);
5770 	drop_on_err = 0;
5771 
5772 out_fail:
5773 	btrfs_end_transaction(trans, root);
5774 	if (drop_on_err)
5775 		iput(inode);
5776 	btrfs_btree_balance_dirty(root);
5777 	return err;
5778 }
5779 
5780 /* helper for btfs_get_extent.  Given an existing extent in the tree,
5781  * and an extent that you want to insert, deal with overlap and insert
5782  * the new extent into the tree.
5783  */
5784 static int merge_extent_mapping(struct extent_map_tree *em_tree,
5785 				struct extent_map *existing,
5786 				struct extent_map *em,
5787 				u64 map_start, u64 map_len)
5788 {
5789 	u64 start_diff;
5790 
5791 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
5792 	start_diff = map_start - em->start;
5793 	em->start = map_start;
5794 	em->len = map_len;
5795 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
5796 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
5797 		em->block_start += start_diff;
5798 		em->block_len -= start_diff;
5799 	}
5800 	return add_extent_mapping(em_tree, em, 0);
5801 }
5802 
5803 static noinline int uncompress_inline(struct btrfs_path *path,
5804 				      struct inode *inode, struct page *page,
5805 				      size_t pg_offset, u64 extent_offset,
5806 				      struct btrfs_file_extent_item *item)
5807 {
5808 	int ret;
5809 	struct extent_buffer *leaf = path->nodes[0];
5810 	char *tmp;
5811 	size_t max_size;
5812 	unsigned long inline_size;
5813 	unsigned long ptr;
5814 	int compress_type;
5815 
5816 	WARN_ON(pg_offset != 0);
5817 	compress_type = btrfs_file_extent_compression(leaf, item);
5818 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
5819 	inline_size = btrfs_file_extent_inline_item_len(leaf,
5820 					btrfs_item_nr(leaf, path->slots[0]));
5821 	tmp = kmalloc(inline_size, GFP_NOFS);
5822 	if (!tmp)
5823 		return -ENOMEM;
5824 	ptr = btrfs_file_extent_inline_start(item);
5825 
5826 	read_extent_buffer(leaf, tmp, ptr, inline_size);
5827 
5828 	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
5829 	ret = btrfs_decompress(compress_type, tmp, page,
5830 			       extent_offset, inline_size, max_size);
5831 	if (ret) {
5832 		char *kaddr = kmap_atomic(page);
5833 		unsigned long copy_size = min_t(u64,
5834 				  PAGE_CACHE_SIZE - pg_offset,
5835 				  max_size - extent_offset);
5836 		memset(kaddr + pg_offset, 0, copy_size);
5837 		kunmap_atomic(kaddr);
5838 	}
5839 	kfree(tmp);
5840 	return 0;
5841 }
5842 
5843 /*
5844  * a bit scary, this does extent mapping from logical file offset to the disk.
5845  * the ugly parts come from merging extents from the disk with the in-ram
5846  * representation.  This gets more complex because of the data=ordered code,
5847  * where the in-ram extents might be locked pending data=ordered completion.
5848  *
5849  * This also copies inline extents directly into the page.
5850  */
5851 
5852 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
5853 				    size_t pg_offset, u64 start, u64 len,
5854 				    int create)
5855 {
5856 	int ret;
5857 	int err = 0;
5858 	u64 bytenr;
5859 	u64 extent_start = 0;
5860 	u64 extent_end = 0;
5861 	u64 objectid = btrfs_ino(inode);
5862 	u32 found_type;
5863 	struct btrfs_path *path = NULL;
5864 	struct btrfs_root *root = BTRFS_I(inode)->root;
5865 	struct btrfs_file_extent_item *item;
5866 	struct extent_buffer *leaf;
5867 	struct btrfs_key found_key;
5868 	struct extent_map *em = NULL;
5869 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5870 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5871 	struct btrfs_trans_handle *trans = NULL;
5872 	int compress_type;
5873 
5874 again:
5875 	read_lock(&em_tree->lock);
5876 	em = lookup_extent_mapping(em_tree, start, len);
5877 	if (em)
5878 		em->bdev = root->fs_info->fs_devices->latest_bdev;
5879 	read_unlock(&em_tree->lock);
5880 
5881 	if (em) {
5882 		if (em->start > start || em->start + em->len <= start)
5883 			free_extent_map(em);
5884 		else if (em->block_start == EXTENT_MAP_INLINE && page)
5885 			free_extent_map(em);
5886 		else
5887 			goto out;
5888 	}
5889 	em = alloc_extent_map();
5890 	if (!em) {
5891 		err = -ENOMEM;
5892 		goto out;
5893 	}
5894 	em->bdev = root->fs_info->fs_devices->latest_bdev;
5895 	em->start = EXTENT_MAP_HOLE;
5896 	em->orig_start = EXTENT_MAP_HOLE;
5897 	em->len = (u64)-1;
5898 	em->block_len = (u64)-1;
5899 
5900 	if (!path) {
5901 		path = btrfs_alloc_path();
5902 		if (!path) {
5903 			err = -ENOMEM;
5904 			goto out;
5905 		}
5906 		/*
5907 		 * Chances are we'll be called again, so go ahead and do
5908 		 * readahead
5909 		 */
5910 		path->reada = 1;
5911 	}
5912 
5913 	ret = btrfs_lookup_file_extent(trans, root, path,
5914 				       objectid, start, trans != NULL);
5915 	if (ret < 0) {
5916 		err = ret;
5917 		goto out;
5918 	}
5919 
5920 	if (ret != 0) {
5921 		if (path->slots[0] == 0)
5922 			goto not_found;
5923 		path->slots[0]--;
5924 	}
5925 
5926 	leaf = path->nodes[0];
5927 	item = btrfs_item_ptr(leaf, path->slots[0],
5928 			      struct btrfs_file_extent_item);
5929 	/* are we inside the extent that was found? */
5930 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5931 	found_type = btrfs_key_type(&found_key);
5932 	if (found_key.objectid != objectid ||
5933 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5934 		goto not_found;
5935 	}
5936 
5937 	found_type = btrfs_file_extent_type(leaf, item);
5938 	extent_start = found_key.offset;
5939 	compress_type = btrfs_file_extent_compression(leaf, item);
5940 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5941 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5942 		extent_end = extent_start +
5943 		       btrfs_file_extent_num_bytes(leaf, item);
5944 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5945 		size_t size;
5946 		size = btrfs_file_extent_inline_len(leaf, item);
5947 		extent_end = ALIGN(extent_start + size, root->sectorsize);
5948 	}
5949 
5950 	if (start >= extent_end) {
5951 		path->slots[0]++;
5952 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
5953 			ret = btrfs_next_leaf(root, path);
5954 			if (ret < 0) {
5955 				err = ret;
5956 				goto out;
5957 			}
5958 			if (ret > 0)
5959 				goto not_found;
5960 			leaf = path->nodes[0];
5961 		}
5962 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5963 		if (found_key.objectid != objectid ||
5964 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
5965 			goto not_found;
5966 		if (start + len <= found_key.offset)
5967 			goto not_found;
5968 		em->start = start;
5969 		em->orig_start = start;
5970 		em->len = found_key.offset - start;
5971 		goto not_found_em;
5972 	}
5973 
5974 	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
5975 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5976 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5977 		em->start = extent_start;
5978 		em->len = extent_end - extent_start;
5979 		em->orig_start = extent_start -
5980 				 btrfs_file_extent_offset(leaf, item);
5981 		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5982 								      item);
5983 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5984 		if (bytenr == 0) {
5985 			em->block_start = EXTENT_MAP_HOLE;
5986 			goto insert;
5987 		}
5988 		if (compress_type != BTRFS_COMPRESS_NONE) {
5989 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5990 			em->compress_type = compress_type;
5991 			em->block_start = bytenr;
5992 			em->block_len = em->orig_block_len;
5993 		} else {
5994 			bytenr += btrfs_file_extent_offset(leaf, item);
5995 			em->block_start = bytenr;
5996 			em->block_len = em->len;
5997 			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
5998 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5999 		}
6000 		goto insert;
6001 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6002 		unsigned long ptr;
6003 		char *map;
6004 		size_t size;
6005 		size_t extent_offset;
6006 		size_t copy_size;
6007 
6008 		em->block_start = EXTENT_MAP_INLINE;
6009 		if (!page || create) {
6010 			em->start = extent_start;
6011 			em->len = extent_end - extent_start;
6012 			goto out;
6013 		}
6014 
6015 		size = btrfs_file_extent_inline_len(leaf, item);
6016 		extent_offset = page_offset(page) + pg_offset - extent_start;
6017 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6018 				size - extent_offset);
6019 		em->start = extent_start + extent_offset;
6020 		em->len = ALIGN(copy_size, root->sectorsize);
6021 		em->orig_block_len = em->len;
6022 		em->orig_start = em->start;
6023 		if (compress_type) {
6024 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6025 			em->compress_type = compress_type;
6026 		}
6027 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6028 		if (create == 0 && !PageUptodate(page)) {
6029 			if (btrfs_file_extent_compression(leaf, item) !=
6030 			    BTRFS_COMPRESS_NONE) {
6031 				ret = uncompress_inline(path, inode, page,
6032 							pg_offset,
6033 							extent_offset, item);
6034 				BUG_ON(ret); /* -ENOMEM */
6035 			} else {
6036 				map = kmap(page);
6037 				read_extent_buffer(leaf, map + pg_offset, ptr,
6038 						   copy_size);
6039 				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6040 					memset(map + pg_offset + copy_size, 0,
6041 					       PAGE_CACHE_SIZE - pg_offset -
6042 					       copy_size);
6043 				}
6044 				kunmap(page);
6045 			}
6046 			flush_dcache_page(page);
6047 		} else if (create && PageUptodate(page)) {
6048 			BUG();
6049 			if (!trans) {
6050 				kunmap(page);
6051 				free_extent_map(em);
6052 				em = NULL;
6053 
6054 				btrfs_release_path(path);
6055 				trans = btrfs_join_transaction(root);
6056 
6057 				if (IS_ERR(trans))
6058 					return ERR_CAST(trans);
6059 				goto again;
6060 			}
6061 			map = kmap(page);
6062 			write_extent_buffer(leaf, map + pg_offset, ptr,
6063 					    copy_size);
6064 			kunmap(page);
6065 			btrfs_mark_buffer_dirty(leaf);
6066 		}
6067 		set_extent_uptodate(io_tree, em->start,
6068 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
6069 		goto insert;
6070 	} else {
6071 		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
6072 	}
6073 not_found:
6074 	em->start = start;
6075 	em->orig_start = start;
6076 	em->len = len;
6077 not_found_em:
6078 	em->block_start = EXTENT_MAP_HOLE;
6079 	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6080 insert:
6081 	btrfs_release_path(path);
6082 	if (em->start > start || extent_map_end(em) <= start) {
6083 		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6084 			(unsigned long long)em->start,
6085 			(unsigned long long)em->len,
6086 			(unsigned long long)start,
6087 			(unsigned long long)len);
6088 		err = -EIO;
6089 		goto out;
6090 	}
6091 
6092 	err = 0;
6093 	write_lock(&em_tree->lock);
6094 	ret = add_extent_mapping(em_tree, em, 0);
6095 	/* it is possible that someone inserted the extent into the tree
6096 	 * while we had the lock dropped.  It is also possible that
6097 	 * an overlapping map exists in the tree
6098 	 */
6099 	if (ret == -EEXIST) {
6100 		struct extent_map *existing;
6101 
6102 		ret = 0;
6103 
6104 		existing = lookup_extent_mapping(em_tree, start, len);
6105 		if (existing && (existing->start > start ||
6106 		    existing->start + existing->len <= start)) {
6107 			free_extent_map(existing);
6108 			existing = NULL;
6109 		}
6110 		if (!existing) {
6111 			existing = lookup_extent_mapping(em_tree, em->start,
6112 							 em->len);
6113 			if (existing) {
6114 				err = merge_extent_mapping(em_tree, existing,
6115 							   em, start,
6116 							   root->sectorsize);
6117 				free_extent_map(existing);
6118 				if (err) {
6119 					free_extent_map(em);
6120 					em = NULL;
6121 				}
6122 			} else {
6123 				err = -EIO;
6124 				free_extent_map(em);
6125 				em = NULL;
6126 			}
6127 		} else {
6128 			free_extent_map(em);
6129 			em = existing;
6130 			err = 0;
6131 		}
6132 	}
6133 	write_unlock(&em_tree->lock);
6134 out:
6135 
6136 	if (em)
6137 		trace_btrfs_get_extent(root, em);
6138 
6139 	if (path)
6140 		btrfs_free_path(path);
6141 	if (trans) {
6142 		ret = btrfs_end_transaction(trans, root);
6143 		if (!err)
6144 			err = ret;
6145 	}
6146 	if (err) {
6147 		free_extent_map(em);
6148 		return ERR_PTR(err);
6149 	}
6150 	BUG_ON(!em); /* Error is always set */
6151 	return em;
6152 }
6153 
6154 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6155 					   size_t pg_offset, u64 start, u64 len,
6156 					   int create)
6157 {
6158 	struct extent_map *em;
6159 	struct extent_map *hole_em = NULL;
6160 	u64 range_start = start;
6161 	u64 end;
6162 	u64 found;
6163 	u64 found_end;
6164 	int err = 0;
6165 
6166 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6167 	if (IS_ERR(em))
6168 		return em;
6169 	if (em) {
6170 		/*
6171 		 * if our em maps to
6172 		 * -  a hole or
6173 		 * -  a pre-alloc extent,
6174 		 * there might actually be delalloc bytes behind it.
6175 		 */
6176 		if (em->block_start != EXTENT_MAP_HOLE &&
6177 		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6178 			return em;
6179 		else
6180 			hole_em = em;
6181 	}
6182 
6183 	/* check to see if we've wrapped (len == -1 or similar) */
6184 	end = start + len;
6185 	if (end < start)
6186 		end = (u64)-1;
6187 	else
6188 		end -= 1;
6189 
6190 	em = NULL;
6191 
6192 	/* ok, we didn't find anything, lets look for delalloc */
6193 	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6194 				 end, len, EXTENT_DELALLOC, 1);
6195 	found_end = range_start + found;
6196 	if (found_end < range_start)
6197 		found_end = (u64)-1;
6198 
6199 	/*
6200 	 * we didn't find anything useful, return
6201 	 * the original results from get_extent()
6202 	 */
6203 	if (range_start > end || found_end <= start) {
6204 		em = hole_em;
6205 		hole_em = NULL;
6206 		goto out;
6207 	}
6208 
6209 	/* adjust the range_start to make sure it doesn't
6210 	 * go backwards from the start they passed in
6211 	 */
6212 	range_start = max(start,range_start);
6213 	found = found_end - range_start;
6214 
6215 	if (found > 0) {
6216 		u64 hole_start = start;
6217 		u64 hole_len = len;
6218 
6219 		em = alloc_extent_map();
6220 		if (!em) {
6221 			err = -ENOMEM;
6222 			goto out;
6223 		}
6224 		/*
6225 		 * when btrfs_get_extent can't find anything it
6226 		 * returns one huge hole
6227 		 *
6228 		 * make sure what it found really fits our range, and
6229 		 * adjust to make sure it is based on the start from
6230 		 * the caller
6231 		 */
6232 		if (hole_em) {
6233 			u64 calc_end = extent_map_end(hole_em);
6234 
6235 			if (calc_end <= start || (hole_em->start > end)) {
6236 				free_extent_map(hole_em);
6237 				hole_em = NULL;
6238 			} else {
6239 				hole_start = max(hole_em->start, start);
6240 				hole_len = calc_end - hole_start;
6241 			}
6242 		}
6243 		em->bdev = NULL;
6244 		if (hole_em && range_start > hole_start) {
6245 			/* our hole starts before our delalloc, so we
6246 			 * have to return just the parts of the hole
6247 			 * that go until  the delalloc starts
6248 			 */
6249 			em->len = min(hole_len,
6250 				      range_start - hole_start);
6251 			em->start = hole_start;
6252 			em->orig_start = hole_start;
6253 			/*
6254 			 * don't adjust block start at all,
6255 			 * it is fixed at EXTENT_MAP_HOLE
6256 			 */
6257 			em->block_start = hole_em->block_start;
6258 			em->block_len = hole_len;
6259 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6260 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6261 		} else {
6262 			em->start = range_start;
6263 			em->len = found;
6264 			em->orig_start = range_start;
6265 			em->block_start = EXTENT_MAP_DELALLOC;
6266 			em->block_len = found;
6267 		}
6268 	} else if (hole_em) {
6269 		return hole_em;
6270 	}
6271 out:
6272 
6273 	free_extent_map(hole_em);
6274 	if (err) {
6275 		free_extent_map(em);
6276 		return ERR_PTR(err);
6277 	}
6278 	return em;
6279 }
6280 
6281 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6282 						  u64 start, u64 len)
6283 {
6284 	struct btrfs_root *root = BTRFS_I(inode)->root;
6285 	struct extent_map *em;
6286 	struct btrfs_key ins;
6287 	u64 alloc_hint;
6288 	int ret;
6289 
6290 	alloc_hint = get_extent_allocation_hint(inode, start, len);
6291 	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6292 				   alloc_hint, &ins, 1);
6293 	if (ret)
6294 		return ERR_PTR(ret);
6295 
6296 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6297 			      ins.offset, ins.offset, ins.offset, 0);
6298 	if (IS_ERR(em)) {
6299 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6300 		return em;
6301 	}
6302 
6303 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6304 					   ins.offset, ins.offset, 0);
6305 	if (ret) {
6306 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6307 		free_extent_map(em);
6308 		return ERR_PTR(ret);
6309 	}
6310 
6311 	return em;
6312 }
6313 
6314 /*
6315  * returns 1 when the nocow is safe, < 1 on error, 0 if the
6316  * block must be cow'd
6317  */
6318 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6319 			      u64 *orig_start, u64 *orig_block_len,
6320 			      u64 *ram_bytes)
6321 {
6322 	struct btrfs_trans_handle *trans;
6323 	struct btrfs_path *path;
6324 	int ret;
6325 	struct extent_buffer *leaf;
6326 	struct btrfs_root *root = BTRFS_I(inode)->root;
6327 	struct btrfs_file_extent_item *fi;
6328 	struct btrfs_key key;
6329 	u64 disk_bytenr;
6330 	u64 backref_offset;
6331 	u64 extent_end;
6332 	u64 num_bytes;
6333 	int slot;
6334 	int found_type;
6335 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6336 	path = btrfs_alloc_path();
6337 	if (!path)
6338 		return -ENOMEM;
6339 
6340 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6341 				       offset, 0);
6342 	if (ret < 0)
6343 		goto out;
6344 
6345 	slot = path->slots[0];
6346 	if (ret == 1) {
6347 		if (slot == 0) {
6348 			/* can't find the item, must cow */
6349 			ret = 0;
6350 			goto out;
6351 		}
6352 		slot--;
6353 	}
6354 	ret = 0;
6355 	leaf = path->nodes[0];
6356 	btrfs_item_key_to_cpu(leaf, &key, slot);
6357 	if (key.objectid != btrfs_ino(inode) ||
6358 	    key.type != BTRFS_EXTENT_DATA_KEY) {
6359 		/* not our file or wrong item type, must cow */
6360 		goto out;
6361 	}
6362 
6363 	if (key.offset > offset) {
6364 		/* Wrong offset, must cow */
6365 		goto out;
6366 	}
6367 
6368 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6369 	found_type = btrfs_file_extent_type(leaf, fi);
6370 	if (found_type != BTRFS_FILE_EXTENT_REG &&
6371 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6372 		/* not a regular extent, must cow */
6373 		goto out;
6374 	}
6375 
6376 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6377 		goto out;
6378 
6379 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6380 	if (disk_bytenr == 0)
6381 		goto out;
6382 
6383 	if (btrfs_file_extent_compression(leaf, fi) ||
6384 	    btrfs_file_extent_encryption(leaf, fi) ||
6385 	    btrfs_file_extent_other_encoding(leaf, fi))
6386 		goto out;
6387 
6388 	backref_offset = btrfs_file_extent_offset(leaf, fi);
6389 
6390 	if (orig_start) {
6391 		*orig_start = key.offset - backref_offset;
6392 		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6393 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6394 	}
6395 
6396 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6397 
6398 	if (btrfs_extent_readonly(root, disk_bytenr))
6399 		goto out;
6400 
6401 	/*
6402 	 * look for other files referencing this extent, if we
6403 	 * find any we must cow
6404 	 */
6405 	trans = btrfs_join_transaction(root);
6406 	if (IS_ERR(trans)) {
6407 		ret = 0;
6408 		goto out;
6409 	}
6410 
6411 	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6412 				    key.offset - backref_offset, disk_bytenr);
6413 	btrfs_end_transaction(trans, root);
6414 	if (ret) {
6415 		ret = 0;
6416 		goto out;
6417 	}
6418 
6419 	/*
6420 	 * adjust disk_bytenr and num_bytes to cover just the bytes
6421 	 * in this extent we are about to write.  If there
6422 	 * are any csums in that range we have to cow in order
6423 	 * to keep the csums correct
6424 	 */
6425 	disk_bytenr += backref_offset;
6426 	disk_bytenr += offset - key.offset;
6427 	num_bytes = min(offset + *len, extent_end) - offset;
6428 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6429 				goto out;
6430 	/*
6431 	 * all of the above have passed, it is safe to overwrite this extent
6432 	 * without cow
6433 	 */
6434 	*len = num_bytes;
6435 	ret = 1;
6436 out:
6437 	btrfs_free_path(path);
6438 	return ret;
6439 }
6440 
6441 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6442 			      struct extent_state **cached_state, int writing)
6443 {
6444 	struct btrfs_ordered_extent *ordered;
6445 	int ret = 0;
6446 
6447 	while (1) {
6448 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6449 				 0, cached_state);
6450 		/*
6451 		 * We're concerned with the entire range that we're going to be
6452 		 * doing DIO to, so we need to make sure theres no ordered
6453 		 * extents in this range.
6454 		 */
6455 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
6456 						     lockend - lockstart + 1);
6457 
6458 		/*
6459 		 * We need to make sure there are no buffered pages in this
6460 		 * range either, we could have raced between the invalidate in
6461 		 * generic_file_direct_write and locking the extent.  The
6462 		 * invalidate needs to happen so that reads after a write do not
6463 		 * get stale data.
6464 		 */
6465 		if (!ordered && (!writing ||
6466 		    !test_range_bit(&BTRFS_I(inode)->io_tree,
6467 				    lockstart, lockend, EXTENT_UPTODATE, 0,
6468 				    *cached_state)))
6469 			break;
6470 
6471 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6472 				     cached_state, GFP_NOFS);
6473 
6474 		if (ordered) {
6475 			btrfs_start_ordered_extent(inode, ordered, 1);
6476 			btrfs_put_ordered_extent(ordered);
6477 		} else {
6478 			/* Screw you mmap */
6479 			ret = filemap_write_and_wait_range(inode->i_mapping,
6480 							   lockstart,
6481 							   lockend);
6482 			if (ret)
6483 				break;
6484 
6485 			/*
6486 			 * If we found a page that couldn't be invalidated just
6487 			 * fall back to buffered.
6488 			 */
6489 			ret = invalidate_inode_pages2_range(inode->i_mapping,
6490 					lockstart >> PAGE_CACHE_SHIFT,
6491 					lockend >> PAGE_CACHE_SHIFT);
6492 			if (ret)
6493 				break;
6494 		}
6495 
6496 		cond_resched();
6497 	}
6498 
6499 	return ret;
6500 }
6501 
6502 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6503 					   u64 len, u64 orig_start,
6504 					   u64 block_start, u64 block_len,
6505 					   u64 orig_block_len, u64 ram_bytes,
6506 					   int type)
6507 {
6508 	struct extent_map_tree *em_tree;
6509 	struct extent_map *em;
6510 	struct btrfs_root *root = BTRFS_I(inode)->root;
6511 	int ret;
6512 
6513 	em_tree = &BTRFS_I(inode)->extent_tree;
6514 	em = alloc_extent_map();
6515 	if (!em)
6516 		return ERR_PTR(-ENOMEM);
6517 
6518 	em->start = start;
6519 	em->orig_start = orig_start;
6520 	em->mod_start = start;
6521 	em->mod_len = len;
6522 	em->len = len;
6523 	em->block_len = block_len;
6524 	em->block_start = block_start;
6525 	em->bdev = root->fs_info->fs_devices->latest_bdev;
6526 	em->orig_block_len = orig_block_len;
6527 	em->ram_bytes = ram_bytes;
6528 	em->generation = -1;
6529 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
6530 	if (type == BTRFS_ORDERED_PREALLOC)
6531 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
6532 
6533 	do {
6534 		btrfs_drop_extent_cache(inode, em->start,
6535 				em->start + em->len - 1, 0);
6536 		write_lock(&em_tree->lock);
6537 		ret = add_extent_mapping(em_tree, em, 1);
6538 		write_unlock(&em_tree->lock);
6539 	} while (ret == -EEXIST);
6540 
6541 	if (ret) {
6542 		free_extent_map(em);
6543 		return ERR_PTR(ret);
6544 	}
6545 
6546 	return em;
6547 }
6548 
6549 
6550 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6551 				   struct buffer_head *bh_result, int create)
6552 {
6553 	struct extent_map *em;
6554 	struct btrfs_root *root = BTRFS_I(inode)->root;
6555 	struct extent_state *cached_state = NULL;
6556 	u64 start = iblock << inode->i_blkbits;
6557 	u64 lockstart, lockend;
6558 	u64 len = bh_result->b_size;
6559 	int unlock_bits = EXTENT_LOCKED;
6560 	int ret = 0;
6561 
6562 	if (create)
6563 		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6564 	else
6565 		len = min_t(u64, len, root->sectorsize);
6566 
6567 	lockstart = start;
6568 	lockend = start + len - 1;
6569 
6570 	/*
6571 	 * If this errors out it's because we couldn't invalidate pagecache for
6572 	 * this range and we need to fallback to buffered.
6573 	 */
6574 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6575 		return -ENOTBLK;
6576 
6577 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6578 	if (IS_ERR(em)) {
6579 		ret = PTR_ERR(em);
6580 		goto unlock_err;
6581 	}
6582 
6583 	/*
6584 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
6585 	 * io.  INLINE is special, and we could probably kludge it in here, but
6586 	 * it's still buffered so for safety lets just fall back to the generic
6587 	 * buffered path.
6588 	 *
6589 	 * For COMPRESSED we _have_ to read the entire extent in so we can
6590 	 * decompress it, so there will be buffering required no matter what we
6591 	 * do, so go ahead and fallback to buffered.
6592 	 *
6593 	 * We return -ENOTBLK because thats what makes DIO go ahead and go back
6594 	 * to buffered IO.  Don't blame me, this is the price we pay for using
6595 	 * the generic code.
6596 	 */
6597 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
6598 	    em->block_start == EXTENT_MAP_INLINE) {
6599 		free_extent_map(em);
6600 		ret = -ENOTBLK;
6601 		goto unlock_err;
6602 	}
6603 
6604 	/* Just a good old fashioned hole, return */
6605 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6606 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6607 		free_extent_map(em);
6608 		goto unlock_err;
6609 	}
6610 
6611 	/*
6612 	 * We don't allocate a new extent in the following cases
6613 	 *
6614 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
6615 	 * existing extent.
6616 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
6617 	 * just use the extent.
6618 	 *
6619 	 */
6620 	if (!create) {
6621 		len = min(len, em->len - (start - em->start));
6622 		lockstart = start + len;
6623 		goto unlock;
6624 	}
6625 
6626 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
6627 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
6628 	     em->block_start != EXTENT_MAP_HOLE)) {
6629 		int type;
6630 		int ret;
6631 		u64 block_start, orig_start, orig_block_len, ram_bytes;
6632 
6633 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6634 			type = BTRFS_ORDERED_PREALLOC;
6635 		else
6636 			type = BTRFS_ORDERED_NOCOW;
6637 		len = min(len, em->len - (start - em->start));
6638 		block_start = em->block_start + (start - em->start);
6639 
6640 		if (can_nocow_extent(inode, start, &len, &orig_start,
6641 				     &orig_block_len, &ram_bytes) == 1) {
6642 			if (type == BTRFS_ORDERED_PREALLOC) {
6643 				free_extent_map(em);
6644 				em = create_pinned_em(inode, start, len,
6645 						       orig_start,
6646 						       block_start, len,
6647 						       orig_block_len,
6648 						       ram_bytes, type);
6649 				if (IS_ERR(em))
6650 					goto unlock_err;
6651 			}
6652 
6653 			ret = btrfs_add_ordered_extent_dio(inode, start,
6654 					   block_start, len, len, type);
6655 			if (ret) {
6656 				free_extent_map(em);
6657 				goto unlock_err;
6658 			}
6659 			goto unlock;
6660 		}
6661 	}
6662 
6663 	/*
6664 	 * this will cow the extent, reset the len in case we changed
6665 	 * it above
6666 	 */
6667 	len = bh_result->b_size;
6668 	free_extent_map(em);
6669 	em = btrfs_new_extent_direct(inode, start, len);
6670 	if (IS_ERR(em)) {
6671 		ret = PTR_ERR(em);
6672 		goto unlock_err;
6673 	}
6674 	len = min(len, em->len - (start - em->start));
6675 unlock:
6676 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
6677 		inode->i_blkbits;
6678 	bh_result->b_size = len;
6679 	bh_result->b_bdev = em->bdev;
6680 	set_buffer_mapped(bh_result);
6681 	if (create) {
6682 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6683 			set_buffer_new(bh_result);
6684 
6685 		/*
6686 		 * Need to update the i_size under the extent lock so buffered
6687 		 * readers will get the updated i_size when we unlock.
6688 		 */
6689 		if (start + len > i_size_read(inode))
6690 			i_size_write(inode, start + len);
6691 
6692 		spin_lock(&BTRFS_I(inode)->lock);
6693 		BTRFS_I(inode)->outstanding_extents++;
6694 		spin_unlock(&BTRFS_I(inode)->lock);
6695 
6696 		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6697 				     lockstart + len - 1, EXTENT_DELALLOC, NULL,
6698 				     &cached_state, GFP_NOFS);
6699 		BUG_ON(ret);
6700 	}
6701 
6702 	/*
6703 	 * In the case of write we need to clear and unlock the entire range,
6704 	 * in the case of read we need to unlock only the end area that we
6705 	 * aren't using if there is any left over space.
6706 	 */
6707 	if (lockstart < lockend) {
6708 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6709 				 lockend, unlock_bits, 1, 0,
6710 				 &cached_state, GFP_NOFS);
6711 	} else {
6712 		free_extent_state(cached_state);
6713 	}
6714 
6715 	free_extent_map(em);
6716 
6717 	return 0;
6718 
6719 unlock_err:
6720 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6721 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6722 	return ret;
6723 }
6724 
6725 static void btrfs_endio_direct_read(struct bio *bio, int err)
6726 {
6727 	struct btrfs_dio_private *dip = bio->bi_private;
6728 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
6729 	struct bio_vec *bvec = bio->bi_io_vec;
6730 	struct inode *inode = dip->inode;
6731 	struct btrfs_root *root = BTRFS_I(inode)->root;
6732 	struct bio *dio_bio;
6733 	u32 *csums = (u32 *)dip->csum;
6734 	int index = 0;
6735 	u64 start;
6736 
6737 	start = dip->logical_offset;
6738 	do {
6739 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6740 			struct page *page = bvec->bv_page;
6741 			char *kaddr;
6742 			u32 csum = ~(u32)0;
6743 			unsigned long flags;
6744 
6745 			local_irq_save(flags);
6746 			kaddr = kmap_atomic(page);
6747 			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
6748 					       csum, bvec->bv_len);
6749 			btrfs_csum_final(csum, (char *)&csum);
6750 			kunmap_atomic(kaddr);
6751 			local_irq_restore(flags);
6752 
6753 			flush_dcache_page(bvec->bv_page);
6754 			if (csum != csums[index]) {
6755 				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
6756 					  (unsigned long long)btrfs_ino(inode),
6757 					  (unsigned long long)start,
6758 					  csum, csums[index]);
6759 				err = -EIO;
6760 			}
6761 		}
6762 
6763 		start += bvec->bv_len;
6764 		bvec++;
6765 		index++;
6766 	} while (bvec <= bvec_end);
6767 
6768 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6769 		      dip->logical_offset + dip->bytes - 1);
6770 	dio_bio = dip->dio_bio;
6771 
6772 	kfree(dip);
6773 
6774 	/* If we had a csum failure make sure to clear the uptodate flag */
6775 	if (err)
6776 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6777 	dio_end_io(dio_bio, err);
6778 	bio_put(bio);
6779 }
6780 
6781 static void btrfs_endio_direct_write(struct bio *bio, int err)
6782 {
6783 	struct btrfs_dio_private *dip = bio->bi_private;
6784 	struct inode *inode = dip->inode;
6785 	struct btrfs_root *root = BTRFS_I(inode)->root;
6786 	struct btrfs_ordered_extent *ordered = NULL;
6787 	u64 ordered_offset = dip->logical_offset;
6788 	u64 ordered_bytes = dip->bytes;
6789 	struct bio *dio_bio;
6790 	int ret;
6791 
6792 	if (err)
6793 		goto out_done;
6794 again:
6795 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
6796 						   &ordered_offset,
6797 						   ordered_bytes, !err);
6798 	if (!ret)
6799 		goto out_test;
6800 
6801 	ordered->work.func = finish_ordered_fn;
6802 	ordered->work.flags = 0;
6803 	btrfs_queue_worker(&root->fs_info->endio_write_workers,
6804 			   &ordered->work);
6805 out_test:
6806 	/*
6807 	 * our bio might span multiple ordered extents.  If we haven't
6808 	 * completed the accounting for the whole dio, go back and try again
6809 	 */
6810 	if (ordered_offset < dip->logical_offset + dip->bytes) {
6811 		ordered_bytes = dip->logical_offset + dip->bytes -
6812 			ordered_offset;
6813 		ordered = NULL;
6814 		goto again;
6815 	}
6816 out_done:
6817 	dio_bio = dip->dio_bio;
6818 
6819 	kfree(dip);
6820 
6821 	/* If we had an error make sure to clear the uptodate flag */
6822 	if (err)
6823 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6824 	dio_end_io(dio_bio, err);
6825 	bio_put(bio);
6826 }
6827 
6828 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
6829 				    struct bio *bio, int mirror_num,
6830 				    unsigned long bio_flags, u64 offset)
6831 {
6832 	int ret;
6833 	struct btrfs_root *root = BTRFS_I(inode)->root;
6834 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
6835 	BUG_ON(ret); /* -ENOMEM */
6836 	return 0;
6837 }
6838 
6839 static void btrfs_end_dio_bio(struct bio *bio, int err)
6840 {
6841 	struct btrfs_dio_private *dip = bio->bi_private;
6842 
6843 	if (err) {
6844 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
6845 		      "sector %#Lx len %u err no %d\n",
6846 		      (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
6847 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
6848 		dip->errors = 1;
6849 
6850 		/*
6851 		 * before atomic variable goto zero, we must make sure
6852 		 * dip->errors is perceived to be set.
6853 		 */
6854 		smp_mb__before_atomic_dec();
6855 	}
6856 
6857 	/* if there are more bios still pending for this dio, just exit */
6858 	if (!atomic_dec_and_test(&dip->pending_bios))
6859 		goto out;
6860 
6861 	if (dip->errors) {
6862 		bio_io_error(dip->orig_bio);
6863 	} else {
6864 		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
6865 		bio_endio(dip->orig_bio, 0);
6866 	}
6867 out:
6868 	bio_put(bio);
6869 }
6870 
6871 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
6872 				       u64 first_sector, gfp_t gfp_flags)
6873 {
6874 	int nr_vecs = bio_get_nr_vecs(bdev);
6875 	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
6876 }
6877 
6878 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6879 					 int rw, u64 file_offset, int skip_sum,
6880 					 int async_submit)
6881 {
6882 	struct btrfs_dio_private *dip = bio->bi_private;
6883 	int write = rw & REQ_WRITE;
6884 	struct btrfs_root *root = BTRFS_I(inode)->root;
6885 	int ret;
6886 
6887 	if (async_submit)
6888 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6889 
6890 	bio_get(bio);
6891 
6892 	if (!write) {
6893 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6894 		if (ret)
6895 			goto err;
6896 	}
6897 
6898 	if (skip_sum)
6899 		goto map;
6900 
6901 	if (write && async_submit) {
6902 		ret = btrfs_wq_submit_bio(root->fs_info,
6903 				   inode, rw, bio, 0, 0,
6904 				   file_offset,
6905 				   __btrfs_submit_bio_start_direct_io,
6906 				   __btrfs_submit_bio_done);
6907 		goto err;
6908 	} else if (write) {
6909 		/*
6910 		 * If we aren't doing async submit, calculate the csum of the
6911 		 * bio now.
6912 		 */
6913 		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6914 		if (ret)
6915 			goto err;
6916 	} else if (!skip_sum) {
6917 		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
6918 						file_offset);
6919 		if (ret)
6920 			goto err;
6921 	}
6922 
6923 map:
6924 	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
6925 err:
6926 	bio_put(bio);
6927 	return ret;
6928 }
6929 
6930 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6931 				    int skip_sum)
6932 {
6933 	struct inode *inode = dip->inode;
6934 	struct btrfs_root *root = BTRFS_I(inode)->root;
6935 	struct bio *bio;
6936 	struct bio *orig_bio = dip->orig_bio;
6937 	struct bio_vec *bvec = orig_bio->bi_io_vec;
6938 	u64 start_sector = orig_bio->bi_sector;
6939 	u64 file_offset = dip->logical_offset;
6940 	u64 submit_len = 0;
6941 	u64 map_length;
6942 	int nr_pages = 0;
6943 	int ret = 0;
6944 	int async_submit = 0;
6945 
6946 	map_length = orig_bio->bi_size;
6947 	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6948 			      &map_length, NULL, 0);
6949 	if (ret) {
6950 		bio_put(orig_bio);
6951 		return -EIO;
6952 	}
6953 
6954 	if (map_length >= orig_bio->bi_size) {
6955 		bio = orig_bio;
6956 		goto submit;
6957 	}
6958 
6959 	/* async crcs make it difficult to collect full stripe writes. */
6960 	if (btrfs_get_alloc_profile(root, 1) &
6961 	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
6962 		async_submit = 0;
6963 	else
6964 		async_submit = 1;
6965 
6966 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6967 	if (!bio)
6968 		return -ENOMEM;
6969 	bio->bi_private = dip;
6970 	bio->bi_end_io = btrfs_end_dio_bio;
6971 	atomic_inc(&dip->pending_bios);
6972 
6973 	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
6974 		if (unlikely(map_length < submit_len + bvec->bv_len ||
6975 		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
6976 				 bvec->bv_offset) < bvec->bv_len)) {
6977 			/*
6978 			 * inc the count before we submit the bio so
6979 			 * we know the end IO handler won't happen before
6980 			 * we inc the count. Otherwise, the dip might get freed
6981 			 * before we're done setting it up
6982 			 */
6983 			atomic_inc(&dip->pending_bios);
6984 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
6985 						     file_offset, skip_sum,
6986 						     async_submit);
6987 			if (ret) {
6988 				bio_put(bio);
6989 				atomic_dec(&dip->pending_bios);
6990 				goto out_err;
6991 			}
6992 
6993 			start_sector += submit_len >> 9;
6994 			file_offset += submit_len;
6995 
6996 			submit_len = 0;
6997 			nr_pages = 0;
6998 
6999 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7000 						  start_sector, GFP_NOFS);
7001 			if (!bio)
7002 				goto out_err;
7003 			bio->bi_private = dip;
7004 			bio->bi_end_io = btrfs_end_dio_bio;
7005 
7006 			map_length = orig_bio->bi_size;
7007 			ret = btrfs_map_block(root->fs_info, rw,
7008 					      start_sector << 9,
7009 					      &map_length, NULL, 0);
7010 			if (ret) {
7011 				bio_put(bio);
7012 				goto out_err;
7013 			}
7014 		} else {
7015 			submit_len += bvec->bv_len;
7016 			nr_pages ++;
7017 			bvec++;
7018 		}
7019 	}
7020 
7021 submit:
7022 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7023 				     async_submit);
7024 	if (!ret)
7025 		return 0;
7026 
7027 	bio_put(bio);
7028 out_err:
7029 	dip->errors = 1;
7030 	/*
7031 	 * before atomic variable goto zero, we must
7032 	 * make sure dip->errors is perceived to be set.
7033 	 */
7034 	smp_mb__before_atomic_dec();
7035 	if (atomic_dec_and_test(&dip->pending_bios))
7036 		bio_io_error(dip->orig_bio);
7037 
7038 	/* bio_end_io() will handle error, so we needn't return it */
7039 	return 0;
7040 }
7041 
7042 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7043 				struct inode *inode, loff_t file_offset)
7044 {
7045 	struct btrfs_root *root = BTRFS_I(inode)->root;
7046 	struct btrfs_dio_private *dip;
7047 	struct bio *io_bio;
7048 	int skip_sum;
7049 	int sum_len;
7050 	int write = rw & REQ_WRITE;
7051 	int ret = 0;
7052 	u16 csum_size;
7053 
7054 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7055 
7056 	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7057 	if (!io_bio) {
7058 		ret = -ENOMEM;
7059 		goto free_ordered;
7060 	}
7061 
7062 	if (!skip_sum && !write) {
7063 		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7064 		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
7065 		sum_len *= csum_size;
7066 	} else {
7067 		sum_len = 0;
7068 	}
7069 
7070 	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7071 	if (!dip) {
7072 		ret = -ENOMEM;
7073 		goto free_io_bio;
7074 	}
7075 
7076 	dip->private = dio_bio->bi_private;
7077 	dip->inode = inode;
7078 	dip->logical_offset = file_offset;
7079 	dip->bytes = dio_bio->bi_size;
7080 	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7081 	io_bio->bi_private = dip;
7082 	dip->errors = 0;
7083 	dip->orig_bio = io_bio;
7084 	dip->dio_bio = dio_bio;
7085 	atomic_set(&dip->pending_bios, 0);
7086 
7087 	if (write)
7088 		io_bio->bi_end_io = btrfs_endio_direct_write;
7089 	else
7090 		io_bio->bi_end_io = btrfs_endio_direct_read;
7091 
7092 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7093 	if (!ret)
7094 		return;
7095 
7096 free_io_bio:
7097 	bio_put(io_bio);
7098 
7099 free_ordered:
7100 	/*
7101 	 * If this is a write, we need to clean up the reserved space and kill
7102 	 * the ordered extent.
7103 	 */
7104 	if (write) {
7105 		struct btrfs_ordered_extent *ordered;
7106 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7107 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7108 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7109 			btrfs_free_reserved_extent(root, ordered->start,
7110 						   ordered->disk_len);
7111 		btrfs_put_ordered_extent(ordered);
7112 		btrfs_put_ordered_extent(ordered);
7113 	}
7114 	bio_endio(dio_bio, ret);
7115 }
7116 
7117 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7118 			const struct iovec *iov, loff_t offset,
7119 			unsigned long nr_segs)
7120 {
7121 	int seg;
7122 	int i;
7123 	size_t size;
7124 	unsigned long addr;
7125 	unsigned blocksize_mask = root->sectorsize - 1;
7126 	ssize_t retval = -EINVAL;
7127 	loff_t end = offset;
7128 
7129 	if (offset & blocksize_mask)
7130 		goto out;
7131 
7132 	/* Check the memory alignment.  Blocks cannot straddle pages */
7133 	for (seg = 0; seg < nr_segs; seg++) {
7134 		addr = (unsigned long)iov[seg].iov_base;
7135 		size = iov[seg].iov_len;
7136 		end += size;
7137 		if ((addr & blocksize_mask) || (size & blocksize_mask))
7138 			goto out;
7139 
7140 		/* If this is a write we don't need to check anymore */
7141 		if (rw & WRITE)
7142 			continue;
7143 
7144 		/*
7145 		 * Check to make sure we don't have duplicate iov_base's in this
7146 		 * iovec, if so return EINVAL, otherwise we'll get csum errors
7147 		 * when reading back.
7148 		 */
7149 		for (i = seg + 1; i < nr_segs; i++) {
7150 			if (iov[seg].iov_base == iov[i].iov_base)
7151 				goto out;
7152 		}
7153 	}
7154 	retval = 0;
7155 out:
7156 	return retval;
7157 }
7158 
7159 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7160 			const struct iovec *iov, loff_t offset,
7161 			unsigned long nr_segs)
7162 {
7163 	struct file *file = iocb->ki_filp;
7164 	struct inode *inode = file->f_mapping->host;
7165 	size_t count = 0;
7166 	int flags = 0;
7167 	bool wakeup = true;
7168 	bool relock = false;
7169 	ssize_t ret;
7170 
7171 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
7172 			    offset, nr_segs))
7173 		return 0;
7174 
7175 	atomic_inc(&inode->i_dio_count);
7176 	smp_mb__after_atomic_inc();
7177 
7178 	/*
7179 	 * The generic stuff only does filemap_write_and_wait_range, which isn't
7180 	 * enough if we've written compressed pages to this area, so we need to
7181 	 * call btrfs_wait_ordered_range to make absolutely sure that any
7182 	 * outstanding dirty pages are on disk.
7183 	 */
7184 	count = iov_length(iov, nr_segs);
7185 	btrfs_wait_ordered_range(inode, offset, count);
7186 
7187 	if (rw & WRITE) {
7188 		/*
7189 		 * If the write DIO is beyond the EOF, we need update
7190 		 * the isize, but it is protected by i_mutex. So we can
7191 		 * not unlock the i_mutex at this case.
7192 		 */
7193 		if (offset + count <= inode->i_size) {
7194 			mutex_unlock(&inode->i_mutex);
7195 			relock = true;
7196 		}
7197 		ret = btrfs_delalloc_reserve_space(inode, count);
7198 		if (ret)
7199 			goto out;
7200 	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7201 				     &BTRFS_I(inode)->runtime_flags))) {
7202 		inode_dio_done(inode);
7203 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
7204 		wakeup = false;
7205 	}
7206 
7207 	ret = __blockdev_direct_IO(rw, iocb, inode,
7208 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7209 			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7210 			btrfs_submit_direct, flags);
7211 	if (rw & WRITE) {
7212 		if (ret < 0 && ret != -EIOCBQUEUED)
7213 			btrfs_delalloc_release_space(inode, count);
7214 		else if (ret >= 0 && (size_t)ret < count)
7215 			btrfs_delalloc_release_space(inode,
7216 						     count - (size_t)ret);
7217 		else
7218 			btrfs_delalloc_release_metadata(inode, 0);
7219 	}
7220 out:
7221 	if (wakeup)
7222 		inode_dio_done(inode);
7223 	if (relock)
7224 		mutex_lock(&inode->i_mutex);
7225 
7226 	return ret;
7227 }
7228 
7229 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
7230 
7231 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7232 		__u64 start, __u64 len)
7233 {
7234 	int	ret;
7235 
7236 	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7237 	if (ret)
7238 		return ret;
7239 
7240 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7241 }
7242 
7243 int btrfs_readpage(struct file *file, struct page *page)
7244 {
7245 	struct extent_io_tree *tree;
7246 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7247 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7248 }
7249 
7250 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7251 {
7252 	struct extent_io_tree *tree;
7253 
7254 
7255 	if (current->flags & PF_MEMALLOC) {
7256 		redirty_page_for_writepage(wbc, page);
7257 		unlock_page(page);
7258 		return 0;
7259 	}
7260 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7261 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7262 }
7263 
7264 static int btrfs_writepages(struct address_space *mapping,
7265 			    struct writeback_control *wbc)
7266 {
7267 	struct extent_io_tree *tree;
7268 
7269 	tree = &BTRFS_I(mapping->host)->io_tree;
7270 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7271 }
7272 
7273 static int
7274 btrfs_readpages(struct file *file, struct address_space *mapping,
7275 		struct list_head *pages, unsigned nr_pages)
7276 {
7277 	struct extent_io_tree *tree;
7278 	tree = &BTRFS_I(mapping->host)->io_tree;
7279 	return extent_readpages(tree, mapping, pages, nr_pages,
7280 				btrfs_get_extent);
7281 }
7282 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7283 {
7284 	struct extent_io_tree *tree;
7285 	struct extent_map_tree *map;
7286 	int ret;
7287 
7288 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7289 	map = &BTRFS_I(page->mapping->host)->extent_tree;
7290 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7291 	if (ret == 1) {
7292 		ClearPagePrivate(page);
7293 		set_page_private(page, 0);
7294 		page_cache_release(page);
7295 	}
7296 	return ret;
7297 }
7298 
7299 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7300 {
7301 	if (PageWriteback(page) || PageDirty(page))
7302 		return 0;
7303 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7304 }
7305 
7306 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7307 				 unsigned int length)
7308 {
7309 	struct inode *inode = page->mapping->host;
7310 	struct extent_io_tree *tree;
7311 	struct btrfs_ordered_extent *ordered;
7312 	struct extent_state *cached_state = NULL;
7313 	u64 page_start = page_offset(page);
7314 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7315 
7316 	/*
7317 	 * we have the page locked, so new writeback can't start,
7318 	 * and the dirty bit won't be cleared while we are here.
7319 	 *
7320 	 * Wait for IO on this page so that we can safely clear
7321 	 * the PagePrivate2 bit and do ordered accounting
7322 	 */
7323 	wait_on_page_writeback(page);
7324 
7325 	tree = &BTRFS_I(inode)->io_tree;
7326 	if (offset) {
7327 		btrfs_releasepage(page, GFP_NOFS);
7328 		return;
7329 	}
7330 	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7331 	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
7332 	if (ordered) {
7333 		/*
7334 		 * IO on this page will never be started, so we need
7335 		 * to account for any ordered extents now
7336 		 */
7337 		clear_extent_bit(tree, page_start, page_end,
7338 				 EXTENT_DIRTY | EXTENT_DELALLOC |
7339 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7340 				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
7341 		/*
7342 		 * whoever cleared the private bit is responsible
7343 		 * for the finish_ordered_io
7344 		 */
7345 		if (TestClearPagePrivate2(page) &&
7346 		    btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
7347 						   PAGE_CACHE_SIZE, 1)) {
7348 			btrfs_finish_ordered_io(ordered);
7349 		}
7350 		btrfs_put_ordered_extent(ordered);
7351 		cached_state = NULL;
7352 		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7353 	}
7354 	clear_extent_bit(tree, page_start, page_end,
7355 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7356 		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7357 		 &cached_state, GFP_NOFS);
7358 	__btrfs_releasepage(page, GFP_NOFS);
7359 
7360 	ClearPageChecked(page);
7361 	if (PagePrivate(page)) {
7362 		ClearPagePrivate(page);
7363 		set_page_private(page, 0);
7364 		page_cache_release(page);
7365 	}
7366 }
7367 
7368 /*
7369  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7370  * called from a page fault handler when a page is first dirtied. Hence we must
7371  * be careful to check for EOF conditions here. We set the page up correctly
7372  * for a written page which means we get ENOSPC checking when writing into
7373  * holes and correct delalloc and unwritten extent mapping on filesystems that
7374  * support these features.
7375  *
7376  * We are not allowed to take the i_mutex here so we have to play games to
7377  * protect against truncate races as the page could now be beyond EOF.  Because
7378  * vmtruncate() writes the inode size before removing pages, once we have the
7379  * page lock we can determine safely if the page is beyond EOF. If it is not
7380  * beyond EOF, then the page is guaranteed safe against truncation until we
7381  * unlock the page.
7382  */
7383 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7384 {
7385 	struct page *page = vmf->page;
7386 	struct inode *inode = file_inode(vma->vm_file);
7387 	struct btrfs_root *root = BTRFS_I(inode)->root;
7388 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7389 	struct btrfs_ordered_extent *ordered;
7390 	struct extent_state *cached_state = NULL;
7391 	char *kaddr;
7392 	unsigned long zero_start;
7393 	loff_t size;
7394 	int ret;
7395 	int reserved = 0;
7396 	u64 page_start;
7397 	u64 page_end;
7398 
7399 	sb_start_pagefault(inode->i_sb);
7400 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7401 	if (!ret) {
7402 		ret = file_update_time(vma->vm_file);
7403 		reserved = 1;
7404 	}
7405 	if (ret) {
7406 		if (ret == -ENOMEM)
7407 			ret = VM_FAULT_OOM;
7408 		else /* -ENOSPC, -EIO, etc */
7409 			ret = VM_FAULT_SIGBUS;
7410 		if (reserved)
7411 			goto out;
7412 		goto out_noreserve;
7413 	}
7414 
7415 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7416 again:
7417 	lock_page(page);
7418 	size = i_size_read(inode);
7419 	page_start = page_offset(page);
7420 	page_end = page_start + PAGE_CACHE_SIZE - 1;
7421 
7422 	if ((page->mapping != inode->i_mapping) ||
7423 	    (page_start >= size)) {
7424 		/* page got truncated out from underneath us */
7425 		goto out_unlock;
7426 	}
7427 	wait_on_page_writeback(page);
7428 
7429 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7430 	set_page_extent_mapped(page);
7431 
7432 	/*
7433 	 * we can't set the delalloc bits if there are pending ordered
7434 	 * extents.  Drop our locks and wait for them to finish
7435 	 */
7436 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
7437 	if (ordered) {
7438 		unlock_extent_cached(io_tree, page_start, page_end,
7439 				     &cached_state, GFP_NOFS);
7440 		unlock_page(page);
7441 		btrfs_start_ordered_extent(inode, ordered, 1);
7442 		btrfs_put_ordered_extent(ordered);
7443 		goto again;
7444 	}
7445 
7446 	/*
7447 	 * XXX - page_mkwrite gets called every time the page is dirtied, even
7448 	 * if it was already dirty, so for space accounting reasons we need to
7449 	 * clear any delalloc bits for the range we are fixing to save.  There
7450 	 * is probably a better way to do this, but for now keep consistent with
7451 	 * prepare_pages in the normal write path.
7452 	 */
7453 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7454 			  EXTENT_DIRTY | EXTENT_DELALLOC |
7455 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7456 			  0, 0, &cached_state, GFP_NOFS);
7457 
7458 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7459 					&cached_state);
7460 	if (ret) {
7461 		unlock_extent_cached(io_tree, page_start, page_end,
7462 				     &cached_state, GFP_NOFS);
7463 		ret = VM_FAULT_SIGBUS;
7464 		goto out_unlock;
7465 	}
7466 	ret = 0;
7467 
7468 	/* page is wholly or partially inside EOF */
7469 	if (page_start + PAGE_CACHE_SIZE > size)
7470 		zero_start = size & ~PAGE_CACHE_MASK;
7471 	else
7472 		zero_start = PAGE_CACHE_SIZE;
7473 
7474 	if (zero_start != PAGE_CACHE_SIZE) {
7475 		kaddr = kmap(page);
7476 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7477 		flush_dcache_page(page);
7478 		kunmap(page);
7479 	}
7480 	ClearPageChecked(page);
7481 	set_page_dirty(page);
7482 	SetPageUptodate(page);
7483 
7484 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
7485 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7486 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7487 
7488 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7489 
7490 out_unlock:
7491 	if (!ret) {
7492 		sb_end_pagefault(inode->i_sb);
7493 		return VM_FAULT_LOCKED;
7494 	}
7495 	unlock_page(page);
7496 out:
7497 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7498 out_noreserve:
7499 	sb_end_pagefault(inode->i_sb);
7500 	return ret;
7501 }
7502 
7503 static int btrfs_truncate(struct inode *inode)
7504 {
7505 	struct btrfs_root *root = BTRFS_I(inode)->root;
7506 	struct btrfs_block_rsv *rsv;
7507 	int ret = 0;
7508 	int err = 0;
7509 	struct btrfs_trans_handle *trans;
7510 	u64 mask = root->sectorsize - 1;
7511 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7512 
7513 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7514 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7515 
7516 	/*
7517 	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
7518 	 * 3 things going on here
7519 	 *
7520 	 * 1) We need to reserve space for our orphan item and the space to
7521 	 * delete our orphan item.  Lord knows we don't want to have a dangling
7522 	 * orphan item because we didn't reserve space to remove it.
7523 	 *
7524 	 * 2) We need to reserve space to update our inode.
7525 	 *
7526 	 * 3) We need to have something to cache all the space that is going to
7527 	 * be free'd up by the truncate operation, but also have some slack
7528 	 * space reserved in case it uses space during the truncate (thank you
7529 	 * very much snapshotting).
7530 	 *
7531 	 * And we need these to all be seperate.  The fact is we can use alot of
7532 	 * space doing the truncate, and we have no earthly idea how much space
7533 	 * we will use, so we need the truncate reservation to be seperate so it
7534 	 * doesn't end up using space reserved for updating the inode or
7535 	 * removing the orphan item.  We also need to be able to stop the
7536 	 * transaction and start a new one, which means we need to be able to
7537 	 * update the inode several times, and we have no idea of knowing how
7538 	 * many times that will be, so we can't just reserve 1 item for the
7539 	 * entirety of the opration, so that has to be done seperately as well.
7540 	 * Then there is the orphan item, which does indeed need to be held on
7541 	 * to for the whole operation, and we need nobody to touch this reserved
7542 	 * space except the orphan code.
7543 	 *
7544 	 * So that leaves us with
7545 	 *
7546 	 * 1) root->orphan_block_rsv - for the orphan deletion.
7547 	 * 2) rsv - for the truncate reservation, which we will steal from the
7548 	 * transaction reservation.
7549 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
7550 	 * updating the inode.
7551 	 */
7552 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
7553 	if (!rsv)
7554 		return -ENOMEM;
7555 	rsv->size = min_size;
7556 	rsv->failfast = 1;
7557 
7558 	/*
7559 	 * 1 for the truncate slack space
7560 	 * 1 for updating the inode.
7561 	 */
7562 	trans = btrfs_start_transaction(root, 2);
7563 	if (IS_ERR(trans)) {
7564 		err = PTR_ERR(trans);
7565 		goto out;
7566 	}
7567 
7568 	/* Migrate the slack space for the truncate to our reserve */
7569 	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
7570 				      min_size);
7571 	BUG_ON(ret);
7572 
7573 	/*
7574 	 * setattr is responsible for setting the ordered_data_close flag,
7575 	 * but that is only tested during the last file release.  That
7576 	 * could happen well after the next commit, leaving a great big
7577 	 * window where new writes may get lost if someone chooses to write
7578 	 * to this file after truncating to zero
7579 	 *
7580 	 * The inode doesn't have any dirty data here, and so if we commit
7581 	 * this is a noop.  If someone immediately starts writing to the inode
7582 	 * it is very likely we'll catch some of their writes in this
7583 	 * transaction, and the commit will find this file on the ordered
7584 	 * data list with good things to send down.
7585 	 *
7586 	 * This is a best effort solution, there is still a window where
7587 	 * using truncate to replace the contents of the file will
7588 	 * end up with a zero length file after a crash.
7589 	 */
7590 	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7591 					   &BTRFS_I(inode)->runtime_flags))
7592 		btrfs_add_ordered_operation(trans, root, inode);
7593 
7594 	/*
7595 	 * So if we truncate and then write and fsync we normally would just
7596 	 * write the extents that changed, which is a problem if we need to
7597 	 * first truncate that entire inode.  So set this flag so we write out
7598 	 * all of the extents in the inode to the sync log so we're completely
7599 	 * safe.
7600 	 */
7601 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
7602 	trans->block_rsv = rsv;
7603 
7604 	while (1) {
7605 		ret = btrfs_truncate_inode_items(trans, root, inode,
7606 						 inode->i_size,
7607 						 BTRFS_EXTENT_DATA_KEY);
7608 		if (ret != -ENOSPC) {
7609 			err = ret;
7610 			break;
7611 		}
7612 
7613 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7614 		ret = btrfs_update_inode(trans, root, inode);
7615 		if (ret) {
7616 			err = ret;
7617 			break;
7618 		}
7619 
7620 		btrfs_end_transaction(trans, root);
7621 		btrfs_btree_balance_dirty(root);
7622 
7623 		trans = btrfs_start_transaction(root, 2);
7624 		if (IS_ERR(trans)) {
7625 			ret = err = PTR_ERR(trans);
7626 			trans = NULL;
7627 			break;
7628 		}
7629 
7630 		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7631 					      rsv, min_size);
7632 		BUG_ON(ret);	/* shouldn't happen */
7633 		trans->block_rsv = rsv;
7634 	}
7635 
7636 	if (ret == 0 && inode->i_nlink > 0) {
7637 		trans->block_rsv = root->orphan_block_rsv;
7638 		ret = btrfs_orphan_del(trans, inode);
7639 		if (ret)
7640 			err = ret;
7641 	}
7642 
7643 	if (trans) {
7644 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7645 		ret = btrfs_update_inode(trans, root, inode);
7646 		if (ret && !err)
7647 			err = ret;
7648 
7649 		ret = btrfs_end_transaction(trans, root);
7650 		btrfs_btree_balance_dirty(root);
7651 	}
7652 
7653 out:
7654 	btrfs_free_block_rsv(root, rsv);
7655 
7656 	if (ret && !err)
7657 		err = ret;
7658 
7659 	return err;
7660 }
7661 
7662 /*
7663  * create a new subvolume directory/inode (helper for the ioctl).
7664  */
7665 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7666 			     struct btrfs_root *new_root, u64 new_dirid)
7667 {
7668 	struct inode *inode;
7669 	int err;
7670 	u64 index = 0;
7671 
7672 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
7673 				new_dirid, new_dirid,
7674 				S_IFDIR | (~current_umask() & S_IRWXUGO),
7675 				&index);
7676 	if (IS_ERR(inode))
7677 		return PTR_ERR(inode);
7678 	inode->i_op = &btrfs_dir_inode_operations;
7679 	inode->i_fop = &btrfs_dir_file_operations;
7680 
7681 	set_nlink(inode, 1);
7682 	btrfs_i_size_write(inode, 0);
7683 
7684 	err = btrfs_update_inode(trans, new_root, inode);
7685 
7686 	iput(inode);
7687 	return err;
7688 }
7689 
7690 struct inode *btrfs_alloc_inode(struct super_block *sb)
7691 {
7692 	struct btrfs_inode *ei;
7693 	struct inode *inode;
7694 
7695 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
7696 	if (!ei)
7697 		return NULL;
7698 
7699 	ei->root = NULL;
7700 	ei->generation = 0;
7701 	ei->last_trans = 0;
7702 	ei->last_sub_trans = 0;
7703 	ei->logged_trans = 0;
7704 	ei->delalloc_bytes = 0;
7705 	ei->disk_i_size = 0;
7706 	ei->flags = 0;
7707 	ei->csum_bytes = 0;
7708 	ei->index_cnt = (u64)-1;
7709 	ei->last_unlink_trans = 0;
7710 	ei->last_log_commit = 0;
7711 
7712 	spin_lock_init(&ei->lock);
7713 	ei->outstanding_extents = 0;
7714 	ei->reserved_extents = 0;
7715 
7716 	ei->runtime_flags = 0;
7717 	ei->force_compress = BTRFS_COMPRESS_NONE;
7718 
7719 	ei->delayed_node = NULL;
7720 
7721 	inode = &ei->vfs_inode;
7722 	extent_map_tree_init(&ei->extent_tree);
7723 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
7724 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7725 	ei->io_tree.track_uptodate = 1;
7726 	ei->io_failure_tree.track_uptodate = 1;
7727 	atomic_set(&ei->sync_writers, 0);
7728 	mutex_init(&ei->log_mutex);
7729 	mutex_init(&ei->delalloc_mutex);
7730 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
7731 	INIT_LIST_HEAD(&ei->delalloc_inodes);
7732 	INIT_LIST_HEAD(&ei->ordered_operations);
7733 	RB_CLEAR_NODE(&ei->rb_node);
7734 
7735 	return inode;
7736 }
7737 
7738 static void btrfs_i_callback(struct rcu_head *head)
7739 {
7740 	struct inode *inode = container_of(head, struct inode, i_rcu);
7741 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7742 }
7743 
7744 void btrfs_destroy_inode(struct inode *inode)
7745 {
7746 	struct btrfs_ordered_extent *ordered;
7747 	struct btrfs_root *root = BTRFS_I(inode)->root;
7748 
7749 	WARN_ON(!hlist_empty(&inode->i_dentry));
7750 	WARN_ON(inode->i_data.nrpages);
7751 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
7752 	WARN_ON(BTRFS_I(inode)->reserved_extents);
7753 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
7754 	WARN_ON(BTRFS_I(inode)->csum_bytes);
7755 
7756 	/*
7757 	 * This can happen where we create an inode, but somebody else also
7758 	 * created the same inode and we need to destroy the one we already
7759 	 * created.
7760 	 */
7761 	if (!root)
7762 		goto free;
7763 
7764 	/*
7765 	 * Make sure we're properly removed from the ordered operation
7766 	 * lists.
7767 	 */
7768 	smp_mb();
7769 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7770 		spin_lock(&root->fs_info->ordered_root_lock);
7771 		list_del_init(&BTRFS_I(inode)->ordered_operations);
7772 		spin_unlock(&root->fs_info->ordered_root_lock);
7773 	}
7774 
7775 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
7776 		     &BTRFS_I(inode)->runtime_flags)) {
7777 		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
7778 			(unsigned long long)btrfs_ino(inode));
7779 		atomic_dec(&root->orphan_inodes);
7780 	}
7781 
7782 	while (1) {
7783 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7784 		if (!ordered)
7785 			break;
7786 		else {
7787 			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
7788 				(unsigned long long)ordered->file_offset,
7789 				(unsigned long long)ordered->len);
7790 			btrfs_remove_ordered_extent(inode, ordered);
7791 			btrfs_put_ordered_extent(ordered);
7792 			btrfs_put_ordered_extent(ordered);
7793 		}
7794 	}
7795 	inode_tree_del(inode);
7796 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
7797 free:
7798 	call_rcu(&inode->i_rcu, btrfs_i_callback);
7799 }
7800 
7801 int btrfs_drop_inode(struct inode *inode)
7802 {
7803 	struct btrfs_root *root = BTRFS_I(inode)->root;
7804 
7805 	if (root == NULL)
7806 		return 1;
7807 
7808 	/* the snap/subvol tree is on deleting */
7809 	if (btrfs_root_refs(&root->root_item) == 0 &&
7810 	    root != root->fs_info->tree_root)
7811 		return 1;
7812 	else
7813 		return generic_drop_inode(inode);
7814 }
7815 
7816 static void init_once(void *foo)
7817 {
7818 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
7819 
7820 	inode_init_once(&ei->vfs_inode);
7821 }
7822 
7823 void btrfs_destroy_cachep(void)
7824 {
7825 	/*
7826 	 * Make sure all delayed rcu free inodes are flushed before we
7827 	 * destroy cache.
7828 	 */
7829 	rcu_barrier();
7830 	if (btrfs_inode_cachep)
7831 		kmem_cache_destroy(btrfs_inode_cachep);
7832 	if (btrfs_trans_handle_cachep)
7833 		kmem_cache_destroy(btrfs_trans_handle_cachep);
7834 	if (btrfs_transaction_cachep)
7835 		kmem_cache_destroy(btrfs_transaction_cachep);
7836 	if (btrfs_path_cachep)
7837 		kmem_cache_destroy(btrfs_path_cachep);
7838 	if (btrfs_free_space_cachep)
7839 		kmem_cache_destroy(btrfs_free_space_cachep);
7840 	if (btrfs_delalloc_work_cachep)
7841 		kmem_cache_destroy(btrfs_delalloc_work_cachep);
7842 }
7843 
7844 int btrfs_init_cachep(void)
7845 {
7846 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7847 			sizeof(struct btrfs_inode), 0,
7848 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7849 	if (!btrfs_inode_cachep)
7850 		goto fail;
7851 
7852 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7853 			sizeof(struct btrfs_trans_handle), 0,
7854 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7855 	if (!btrfs_trans_handle_cachep)
7856 		goto fail;
7857 
7858 	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7859 			sizeof(struct btrfs_transaction), 0,
7860 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7861 	if (!btrfs_transaction_cachep)
7862 		goto fail;
7863 
7864 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
7865 			sizeof(struct btrfs_path), 0,
7866 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7867 	if (!btrfs_path_cachep)
7868 		goto fail;
7869 
7870 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7871 			sizeof(struct btrfs_free_space), 0,
7872 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7873 	if (!btrfs_free_space_cachep)
7874 		goto fail;
7875 
7876 	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7877 			sizeof(struct btrfs_delalloc_work), 0,
7878 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7879 			NULL);
7880 	if (!btrfs_delalloc_work_cachep)
7881 		goto fail;
7882 
7883 	return 0;
7884 fail:
7885 	btrfs_destroy_cachep();
7886 	return -ENOMEM;
7887 }
7888 
7889 static int btrfs_getattr(struct vfsmount *mnt,
7890 			 struct dentry *dentry, struct kstat *stat)
7891 {
7892 	u64 delalloc_bytes;
7893 	struct inode *inode = dentry->d_inode;
7894 	u32 blocksize = inode->i_sb->s_blocksize;
7895 
7896 	generic_fillattr(inode, stat);
7897 	stat->dev = BTRFS_I(inode)->root->anon_dev;
7898 	stat->blksize = PAGE_CACHE_SIZE;
7899 
7900 	spin_lock(&BTRFS_I(inode)->lock);
7901 	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
7902 	spin_unlock(&BTRFS_I(inode)->lock);
7903 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7904 			ALIGN(delalloc_bytes, blocksize)) >> 9;
7905 	return 0;
7906 }
7907 
7908 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7909 			   struct inode *new_dir, struct dentry *new_dentry)
7910 {
7911 	struct btrfs_trans_handle *trans;
7912 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
7913 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7914 	struct inode *new_inode = new_dentry->d_inode;
7915 	struct inode *old_inode = old_dentry->d_inode;
7916 	struct timespec ctime = CURRENT_TIME;
7917 	u64 index = 0;
7918 	u64 root_objectid;
7919 	int ret;
7920 	u64 old_ino = btrfs_ino(old_inode);
7921 
7922 	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
7923 		return -EPERM;
7924 
7925 	/* we only allow rename subvolume link between subvolumes */
7926 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
7927 		return -EXDEV;
7928 
7929 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
7930 	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
7931 		return -ENOTEMPTY;
7932 
7933 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
7934 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7935 		return -ENOTEMPTY;
7936 
7937 
7938 	/* check for collisions, even if the  name isn't there */
7939 	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7940 			     new_dentry->d_name.name,
7941 			     new_dentry->d_name.len);
7942 
7943 	if (ret) {
7944 		if (ret == -EEXIST) {
7945 			/* we shouldn't get
7946 			 * eexist without a new_inode */
7947 			if (!new_inode) {
7948 				WARN_ON(1);
7949 				return ret;
7950 			}
7951 		} else {
7952 			/* maybe -EOVERFLOW */
7953 			return ret;
7954 		}
7955 	}
7956 	ret = 0;
7957 
7958 	/*
7959 	 * we're using rename to replace one file with another.
7960 	 * and the replacement file is large.  Start IO on it now so
7961 	 * we don't add too much work to the end of the transaction
7962 	 */
7963 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
7964 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
7965 		filemap_flush(old_inode->i_mapping);
7966 
7967 	/* close the racy window with snapshot create/destroy ioctl */
7968 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7969 		down_read(&root->fs_info->subvol_sem);
7970 	/*
7971 	 * We want to reserve the absolute worst case amount of items.  So if
7972 	 * both inodes are subvols and we need to unlink them then that would
7973 	 * require 4 item modifications, but if they are both normal inodes it
7974 	 * would require 5 item modifications, so we'll assume their normal
7975 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
7976 	 * should cover the worst case number of items we'll modify.
7977 	 */
7978 	trans = btrfs_start_transaction(root, 11);
7979 	if (IS_ERR(trans)) {
7980                 ret = PTR_ERR(trans);
7981                 goto out_notrans;
7982         }
7983 
7984 	if (dest != root)
7985 		btrfs_record_root_in_trans(trans, dest);
7986 
7987 	ret = btrfs_set_inode_index(new_dir, &index);
7988 	if (ret)
7989 		goto out_fail;
7990 
7991 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7992 		/* force full log commit if subvolume involved. */
7993 		root->fs_info->last_trans_log_full_commit = trans->transid;
7994 	} else {
7995 		ret = btrfs_insert_inode_ref(trans, dest,
7996 					     new_dentry->d_name.name,
7997 					     new_dentry->d_name.len,
7998 					     old_ino,
7999 					     btrfs_ino(new_dir), index);
8000 		if (ret)
8001 			goto out_fail;
8002 		/*
8003 		 * this is an ugly little race, but the rename is required
8004 		 * to make sure that if we crash, the inode is either at the
8005 		 * old name or the new one.  pinning the log transaction lets
8006 		 * us make sure we don't allow a log commit to come in after
8007 		 * we unlink the name but before we add the new name back in.
8008 		 */
8009 		btrfs_pin_log_trans(root);
8010 	}
8011 	/*
8012 	 * make sure the inode gets flushed if it is replacing
8013 	 * something.
8014 	 */
8015 	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8016 		btrfs_add_ordered_operation(trans, root, old_inode);
8017 
8018 	inode_inc_iversion(old_dir);
8019 	inode_inc_iversion(new_dir);
8020 	inode_inc_iversion(old_inode);
8021 	old_dir->i_ctime = old_dir->i_mtime = ctime;
8022 	new_dir->i_ctime = new_dir->i_mtime = ctime;
8023 	old_inode->i_ctime = ctime;
8024 
8025 	if (old_dentry->d_parent != new_dentry->d_parent)
8026 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8027 
8028 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8029 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8030 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8031 					old_dentry->d_name.name,
8032 					old_dentry->d_name.len);
8033 	} else {
8034 		ret = __btrfs_unlink_inode(trans, root, old_dir,
8035 					old_dentry->d_inode,
8036 					old_dentry->d_name.name,
8037 					old_dentry->d_name.len);
8038 		if (!ret)
8039 			ret = btrfs_update_inode(trans, root, old_inode);
8040 	}
8041 	if (ret) {
8042 		btrfs_abort_transaction(trans, root, ret);
8043 		goto out_fail;
8044 	}
8045 
8046 	if (new_inode) {
8047 		inode_inc_iversion(new_inode);
8048 		new_inode->i_ctime = CURRENT_TIME;
8049 		if (unlikely(btrfs_ino(new_inode) ==
8050 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8051 			root_objectid = BTRFS_I(new_inode)->location.objectid;
8052 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
8053 						root_objectid,
8054 						new_dentry->d_name.name,
8055 						new_dentry->d_name.len);
8056 			BUG_ON(new_inode->i_nlink == 0);
8057 		} else {
8058 			ret = btrfs_unlink_inode(trans, dest, new_dir,
8059 						 new_dentry->d_inode,
8060 						 new_dentry->d_name.name,
8061 						 new_dentry->d_name.len);
8062 		}
8063 		if (!ret && new_inode->i_nlink == 0)
8064 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8065 		if (ret) {
8066 			btrfs_abort_transaction(trans, root, ret);
8067 			goto out_fail;
8068 		}
8069 	}
8070 
8071 	ret = btrfs_add_link(trans, new_dir, old_inode,
8072 			     new_dentry->d_name.name,
8073 			     new_dentry->d_name.len, 0, index);
8074 	if (ret) {
8075 		btrfs_abort_transaction(trans, root, ret);
8076 		goto out_fail;
8077 	}
8078 
8079 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8080 		struct dentry *parent = new_dentry->d_parent;
8081 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
8082 		btrfs_end_log_trans(root);
8083 	}
8084 out_fail:
8085 	btrfs_end_transaction(trans, root);
8086 out_notrans:
8087 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8088 		up_read(&root->fs_info->subvol_sem);
8089 
8090 	return ret;
8091 }
8092 
8093 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8094 {
8095 	struct btrfs_delalloc_work *delalloc_work;
8096 
8097 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
8098 				     work);
8099 	if (delalloc_work->wait)
8100 		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
8101 	else
8102 		filemap_flush(delalloc_work->inode->i_mapping);
8103 
8104 	if (delalloc_work->delay_iput)
8105 		btrfs_add_delayed_iput(delalloc_work->inode);
8106 	else
8107 		iput(delalloc_work->inode);
8108 	complete(&delalloc_work->completion);
8109 }
8110 
8111 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8112 						    int wait, int delay_iput)
8113 {
8114 	struct btrfs_delalloc_work *work;
8115 
8116 	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8117 	if (!work)
8118 		return NULL;
8119 
8120 	init_completion(&work->completion);
8121 	INIT_LIST_HEAD(&work->list);
8122 	work->inode = inode;
8123 	work->wait = wait;
8124 	work->delay_iput = delay_iput;
8125 	work->work.func = btrfs_run_delalloc_work;
8126 
8127 	return work;
8128 }
8129 
8130 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8131 {
8132 	wait_for_completion(&work->completion);
8133 	kmem_cache_free(btrfs_delalloc_work_cachep, work);
8134 }
8135 
8136 /*
8137  * some fairly slow code that needs optimization. This walks the list
8138  * of all the inodes with pending delalloc and forces them to disk.
8139  */
8140 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8141 {
8142 	struct btrfs_inode *binode;
8143 	struct inode *inode;
8144 	struct btrfs_delalloc_work *work, *next;
8145 	struct list_head works;
8146 	struct list_head splice;
8147 	int ret = 0;
8148 
8149 	INIT_LIST_HEAD(&works);
8150 	INIT_LIST_HEAD(&splice);
8151 
8152 	spin_lock(&root->delalloc_lock);
8153 	list_splice_init(&root->delalloc_inodes, &splice);
8154 	while (!list_empty(&splice)) {
8155 		binode = list_entry(splice.next, struct btrfs_inode,
8156 				    delalloc_inodes);
8157 
8158 		list_move_tail(&binode->delalloc_inodes,
8159 			       &root->delalloc_inodes);
8160 		inode = igrab(&binode->vfs_inode);
8161 		if (!inode) {
8162 			cond_resched_lock(&root->delalloc_lock);
8163 			continue;
8164 		}
8165 		spin_unlock(&root->delalloc_lock);
8166 
8167 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8168 		if (unlikely(!work)) {
8169 			ret = -ENOMEM;
8170 			goto out;
8171 		}
8172 		list_add_tail(&work->list, &works);
8173 		btrfs_queue_worker(&root->fs_info->flush_workers,
8174 				   &work->work);
8175 
8176 		cond_resched();
8177 		spin_lock(&root->delalloc_lock);
8178 	}
8179 	spin_unlock(&root->delalloc_lock);
8180 
8181 	list_for_each_entry_safe(work, next, &works, list) {
8182 		list_del_init(&work->list);
8183 		btrfs_wait_and_free_delalloc_work(work);
8184 	}
8185 	return 0;
8186 out:
8187 	list_for_each_entry_safe(work, next, &works, list) {
8188 		list_del_init(&work->list);
8189 		btrfs_wait_and_free_delalloc_work(work);
8190 	}
8191 
8192 	if (!list_empty_careful(&splice)) {
8193 		spin_lock(&root->delalloc_lock);
8194 		list_splice_tail(&splice, &root->delalloc_inodes);
8195 		spin_unlock(&root->delalloc_lock);
8196 	}
8197 	return ret;
8198 }
8199 
8200 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8201 {
8202 	int ret;
8203 
8204 	if (root->fs_info->sb->s_flags & MS_RDONLY)
8205 		return -EROFS;
8206 
8207 	ret = __start_delalloc_inodes(root, delay_iput);
8208 	/*
8209 	 * the filemap_flush will queue IO into the worker threads, but
8210 	 * we have to make sure the IO is actually started and that
8211 	 * ordered extents get created before we return
8212 	 */
8213 	atomic_inc(&root->fs_info->async_submit_draining);
8214 	while (atomic_read(&root->fs_info->nr_async_submits) ||
8215 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
8216 		wait_event(root->fs_info->async_submit_wait,
8217 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8218 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8219 	}
8220 	atomic_dec(&root->fs_info->async_submit_draining);
8221 	return ret;
8222 }
8223 
8224 int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8225 				    int delay_iput)
8226 {
8227 	struct btrfs_root *root;
8228 	struct list_head splice;
8229 	int ret;
8230 
8231 	if (fs_info->sb->s_flags & MS_RDONLY)
8232 		return -EROFS;
8233 
8234 	INIT_LIST_HEAD(&splice);
8235 
8236 	spin_lock(&fs_info->delalloc_root_lock);
8237 	list_splice_init(&fs_info->delalloc_roots, &splice);
8238 	while (!list_empty(&splice)) {
8239 		root = list_first_entry(&splice, struct btrfs_root,
8240 					delalloc_root);
8241 		root = btrfs_grab_fs_root(root);
8242 		BUG_ON(!root);
8243 		list_move_tail(&root->delalloc_root,
8244 			       &fs_info->delalloc_roots);
8245 		spin_unlock(&fs_info->delalloc_root_lock);
8246 
8247 		ret = __start_delalloc_inodes(root, delay_iput);
8248 		btrfs_put_fs_root(root);
8249 		if (ret)
8250 			goto out;
8251 
8252 		spin_lock(&fs_info->delalloc_root_lock);
8253 	}
8254 	spin_unlock(&fs_info->delalloc_root_lock);
8255 
8256 	atomic_inc(&fs_info->async_submit_draining);
8257 	while (atomic_read(&fs_info->nr_async_submits) ||
8258 	      atomic_read(&fs_info->async_delalloc_pages)) {
8259 		wait_event(fs_info->async_submit_wait,
8260 		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
8261 		    atomic_read(&fs_info->async_delalloc_pages) == 0));
8262 	}
8263 	atomic_dec(&fs_info->async_submit_draining);
8264 	return 0;
8265 out:
8266 	if (!list_empty_careful(&splice)) {
8267 		spin_lock(&fs_info->delalloc_root_lock);
8268 		list_splice_tail(&splice, &fs_info->delalloc_roots);
8269 		spin_unlock(&fs_info->delalloc_root_lock);
8270 	}
8271 	return ret;
8272 }
8273 
8274 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8275 			 const char *symname)
8276 {
8277 	struct btrfs_trans_handle *trans;
8278 	struct btrfs_root *root = BTRFS_I(dir)->root;
8279 	struct btrfs_path *path;
8280 	struct btrfs_key key;
8281 	struct inode *inode = NULL;
8282 	int err;
8283 	int drop_inode = 0;
8284 	u64 objectid;
8285 	u64 index = 0 ;
8286 	int name_len;
8287 	int datasize;
8288 	unsigned long ptr;
8289 	struct btrfs_file_extent_item *ei;
8290 	struct extent_buffer *leaf;
8291 
8292 	name_len = strlen(symname) + 1;
8293 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8294 		return -ENAMETOOLONG;
8295 
8296 	/*
8297 	 * 2 items for inode item and ref
8298 	 * 2 items for dir items
8299 	 * 1 item for xattr if selinux is on
8300 	 */
8301 	trans = btrfs_start_transaction(root, 5);
8302 	if (IS_ERR(trans))
8303 		return PTR_ERR(trans);
8304 
8305 	err = btrfs_find_free_ino(root, &objectid);
8306 	if (err)
8307 		goto out_unlock;
8308 
8309 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8310 				dentry->d_name.len, btrfs_ino(dir), objectid,
8311 				S_IFLNK|S_IRWXUGO, &index);
8312 	if (IS_ERR(inode)) {
8313 		err = PTR_ERR(inode);
8314 		goto out_unlock;
8315 	}
8316 
8317 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8318 	if (err) {
8319 		drop_inode = 1;
8320 		goto out_unlock;
8321 	}
8322 
8323 	/*
8324 	* If the active LSM wants to access the inode during
8325 	* d_instantiate it needs these. Smack checks to see
8326 	* if the filesystem supports xattrs by looking at the
8327 	* ops vector.
8328 	*/
8329 	inode->i_fop = &btrfs_file_operations;
8330 	inode->i_op = &btrfs_file_inode_operations;
8331 
8332 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8333 	if (err)
8334 		drop_inode = 1;
8335 	else {
8336 		inode->i_mapping->a_ops = &btrfs_aops;
8337 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8338 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8339 	}
8340 	if (drop_inode)
8341 		goto out_unlock;
8342 
8343 	path = btrfs_alloc_path();
8344 	if (!path) {
8345 		err = -ENOMEM;
8346 		drop_inode = 1;
8347 		goto out_unlock;
8348 	}
8349 	key.objectid = btrfs_ino(inode);
8350 	key.offset = 0;
8351 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8352 	datasize = btrfs_file_extent_calc_inline_size(name_len);
8353 	err = btrfs_insert_empty_item(trans, root, path, &key,
8354 				      datasize);
8355 	if (err) {
8356 		drop_inode = 1;
8357 		btrfs_free_path(path);
8358 		goto out_unlock;
8359 	}
8360 	leaf = path->nodes[0];
8361 	ei = btrfs_item_ptr(leaf, path->slots[0],
8362 			    struct btrfs_file_extent_item);
8363 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8364 	btrfs_set_file_extent_type(leaf, ei,
8365 				   BTRFS_FILE_EXTENT_INLINE);
8366 	btrfs_set_file_extent_encryption(leaf, ei, 0);
8367 	btrfs_set_file_extent_compression(leaf, ei, 0);
8368 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8369 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8370 
8371 	ptr = btrfs_file_extent_inline_start(ei);
8372 	write_extent_buffer(leaf, symname, ptr, name_len);
8373 	btrfs_mark_buffer_dirty(leaf);
8374 	btrfs_free_path(path);
8375 
8376 	inode->i_op = &btrfs_symlink_inode_operations;
8377 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
8378 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8379 	inode_set_bytes(inode, name_len);
8380 	btrfs_i_size_write(inode, name_len - 1);
8381 	err = btrfs_update_inode(trans, root, inode);
8382 	if (err)
8383 		drop_inode = 1;
8384 
8385 out_unlock:
8386 	if (!err)
8387 		d_instantiate(dentry, inode);
8388 	btrfs_end_transaction(trans, root);
8389 	if (drop_inode) {
8390 		inode_dec_link_count(inode);
8391 		iput(inode);
8392 	}
8393 	btrfs_btree_balance_dirty(root);
8394 	return err;
8395 }
8396 
8397 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8398 				       u64 start, u64 num_bytes, u64 min_size,
8399 				       loff_t actual_len, u64 *alloc_hint,
8400 				       struct btrfs_trans_handle *trans)
8401 {
8402 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8403 	struct extent_map *em;
8404 	struct btrfs_root *root = BTRFS_I(inode)->root;
8405 	struct btrfs_key ins;
8406 	u64 cur_offset = start;
8407 	u64 i_size;
8408 	u64 cur_bytes;
8409 	int ret = 0;
8410 	bool own_trans = true;
8411 
8412 	if (trans)
8413 		own_trans = false;
8414 	while (num_bytes > 0) {
8415 		if (own_trans) {
8416 			trans = btrfs_start_transaction(root, 3);
8417 			if (IS_ERR(trans)) {
8418 				ret = PTR_ERR(trans);
8419 				break;
8420 			}
8421 		}
8422 
8423 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8424 		cur_bytes = max(cur_bytes, min_size);
8425 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8426 					   *alloc_hint, &ins, 1);
8427 		if (ret) {
8428 			if (own_trans)
8429 				btrfs_end_transaction(trans, root);
8430 			break;
8431 		}
8432 
8433 		ret = insert_reserved_file_extent(trans, inode,
8434 						  cur_offset, ins.objectid,
8435 						  ins.offset, ins.offset,
8436 						  ins.offset, 0, 0, 0,
8437 						  BTRFS_FILE_EXTENT_PREALLOC);
8438 		if (ret) {
8439 			btrfs_abort_transaction(trans, root, ret);
8440 			if (own_trans)
8441 				btrfs_end_transaction(trans, root);
8442 			break;
8443 		}
8444 		btrfs_drop_extent_cache(inode, cur_offset,
8445 					cur_offset + ins.offset -1, 0);
8446 
8447 		em = alloc_extent_map();
8448 		if (!em) {
8449 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8450 				&BTRFS_I(inode)->runtime_flags);
8451 			goto next;
8452 		}
8453 
8454 		em->start = cur_offset;
8455 		em->orig_start = cur_offset;
8456 		em->len = ins.offset;
8457 		em->block_start = ins.objectid;
8458 		em->block_len = ins.offset;
8459 		em->orig_block_len = ins.offset;
8460 		em->ram_bytes = ins.offset;
8461 		em->bdev = root->fs_info->fs_devices->latest_bdev;
8462 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8463 		em->generation = trans->transid;
8464 
8465 		while (1) {
8466 			write_lock(&em_tree->lock);
8467 			ret = add_extent_mapping(em_tree, em, 1);
8468 			write_unlock(&em_tree->lock);
8469 			if (ret != -EEXIST)
8470 				break;
8471 			btrfs_drop_extent_cache(inode, cur_offset,
8472 						cur_offset + ins.offset - 1,
8473 						0);
8474 		}
8475 		free_extent_map(em);
8476 next:
8477 		num_bytes -= ins.offset;
8478 		cur_offset += ins.offset;
8479 		*alloc_hint = ins.objectid + ins.offset;
8480 
8481 		inode_inc_iversion(inode);
8482 		inode->i_ctime = CURRENT_TIME;
8483 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8484 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8485 		    (actual_len > inode->i_size) &&
8486 		    (cur_offset > inode->i_size)) {
8487 			if (cur_offset > actual_len)
8488 				i_size = actual_len;
8489 			else
8490 				i_size = cur_offset;
8491 			i_size_write(inode, i_size);
8492 			btrfs_ordered_update_i_size(inode, i_size, NULL);
8493 		}
8494 
8495 		ret = btrfs_update_inode(trans, root, inode);
8496 
8497 		if (ret) {
8498 			btrfs_abort_transaction(trans, root, ret);
8499 			if (own_trans)
8500 				btrfs_end_transaction(trans, root);
8501 			break;
8502 		}
8503 
8504 		if (own_trans)
8505 			btrfs_end_transaction(trans, root);
8506 	}
8507 	return ret;
8508 }
8509 
8510 int btrfs_prealloc_file_range(struct inode *inode, int mode,
8511 			      u64 start, u64 num_bytes, u64 min_size,
8512 			      loff_t actual_len, u64 *alloc_hint)
8513 {
8514 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8515 					   min_size, actual_len, alloc_hint,
8516 					   NULL);
8517 }
8518 
8519 int btrfs_prealloc_file_range_trans(struct inode *inode,
8520 				    struct btrfs_trans_handle *trans, int mode,
8521 				    u64 start, u64 num_bytes, u64 min_size,
8522 				    loff_t actual_len, u64 *alloc_hint)
8523 {
8524 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8525 					   min_size, actual_len, alloc_hint, trans);
8526 }
8527 
8528 static int btrfs_set_page_dirty(struct page *page)
8529 {
8530 	return __set_page_dirty_nobuffers(page);
8531 }
8532 
8533 static int btrfs_permission(struct inode *inode, int mask)
8534 {
8535 	struct btrfs_root *root = BTRFS_I(inode)->root;
8536 	umode_t mode = inode->i_mode;
8537 
8538 	if (mask & MAY_WRITE &&
8539 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
8540 		if (btrfs_root_readonly(root))
8541 			return -EROFS;
8542 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
8543 			return -EACCES;
8544 	}
8545 	return generic_permission(inode, mask);
8546 }
8547 
8548 static const struct inode_operations btrfs_dir_inode_operations = {
8549 	.getattr	= btrfs_getattr,
8550 	.lookup		= btrfs_lookup,
8551 	.create		= btrfs_create,
8552 	.unlink		= btrfs_unlink,
8553 	.link		= btrfs_link,
8554 	.mkdir		= btrfs_mkdir,
8555 	.rmdir		= btrfs_rmdir,
8556 	.rename		= btrfs_rename,
8557 	.symlink	= btrfs_symlink,
8558 	.setattr	= btrfs_setattr,
8559 	.mknod		= btrfs_mknod,
8560 	.setxattr	= btrfs_setxattr,
8561 	.getxattr	= btrfs_getxattr,
8562 	.listxattr	= btrfs_listxattr,
8563 	.removexattr	= btrfs_removexattr,
8564 	.permission	= btrfs_permission,
8565 	.get_acl	= btrfs_get_acl,
8566 };
8567 static const struct inode_operations btrfs_dir_ro_inode_operations = {
8568 	.lookup		= btrfs_lookup,
8569 	.permission	= btrfs_permission,
8570 	.get_acl	= btrfs_get_acl,
8571 };
8572 
8573 static const struct file_operations btrfs_dir_file_operations = {
8574 	.llseek		= generic_file_llseek,
8575 	.read		= generic_read_dir,
8576 	.iterate	= btrfs_real_readdir,
8577 	.unlocked_ioctl	= btrfs_ioctl,
8578 #ifdef CONFIG_COMPAT
8579 	.compat_ioctl	= btrfs_ioctl,
8580 #endif
8581 	.release        = btrfs_release_file,
8582 	.fsync		= btrfs_sync_file,
8583 };
8584 
8585 static struct extent_io_ops btrfs_extent_io_ops = {
8586 	.fill_delalloc = run_delalloc_range,
8587 	.submit_bio_hook = btrfs_submit_bio_hook,
8588 	.merge_bio_hook = btrfs_merge_bio_hook,
8589 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
8590 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
8591 	.writepage_start_hook = btrfs_writepage_start_hook,
8592 	.set_bit_hook = btrfs_set_bit_hook,
8593 	.clear_bit_hook = btrfs_clear_bit_hook,
8594 	.merge_extent_hook = btrfs_merge_extent_hook,
8595 	.split_extent_hook = btrfs_split_extent_hook,
8596 };
8597 
8598 /*
8599  * btrfs doesn't support the bmap operation because swapfiles
8600  * use bmap to make a mapping of extents in the file.  They assume
8601  * these extents won't change over the life of the file and they
8602  * use the bmap result to do IO directly to the drive.
8603  *
8604  * the btrfs bmap call would return logical addresses that aren't
8605  * suitable for IO and they also will change frequently as COW
8606  * operations happen.  So, swapfile + btrfs == corruption.
8607  *
8608  * For now we're avoiding this by dropping bmap.
8609  */
8610 static const struct address_space_operations btrfs_aops = {
8611 	.readpage	= btrfs_readpage,
8612 	.writepage	= btrfs_writepage,
8613 	.writepages	= btrfs_writepages,
8614 	.readpages	= btrfs_readpages,
8615 	.direct_IO	= btrfs_direct_IO,
8616 	.invalidatepage = btrfs_invalidatepage,
8617 	.releasepage	= btrfs_releasepage,
8618 	.set_page_dirty	= btrfs_set_page_dirty,
8619 	.error_remove_page = generic_error_remove_page,
8620 };
8621 
8622 static const struct address_space_operations btrfs_symlink_aops = {
8623 	.readpage	= btrfs_readpage,
8624 	.writepage	= btrfs_writepage,
8625 	.invalidatepage = btrfs_invalidatepage,
8626 	.releasepage	= btrfs_releasepage,
8627 };
8628 
8629 static const struct inode_operations btrfs_file_inode_operations = {
8630 	.getattr	= btrfs_getattr,
8631 	.setattr	= btrfs_setattr,
8632 	.setxattr	= btrfs_setxattr,
8633 	.getxattr	= btrfs_getxattr,
8634 	.listxattr      = btrfs_listxattr,
8635 	.removexattr	= btrfs_removexattr,
8636 	.permission	= btrfs_permission,
8637 	.fiemap		= btrfs_fiemap,
8638 	.get_acl	= btrfs_get_acl,
8639 	.update_time	= btrfs_update_time,
8640 };
8641 static const struct inode_operations btrfs_special_inode_operations = {
8642 	.getattr	= btrfs_getattr,
8643 	.setattr	= btrfs_setattr,
8644 	.permission	= btrfs_permission,
8645 	.setxattr	= btrfs_setxattr,
8646 	.getxattr	= btrfs_getxattr,
8647 	.listxattr	= btrfs_listxattr,
8648 	.removexattr	= btrfs_removexattr,
8649 	.get_acl	= btrfs_get_acl,
8650 	.update_time	= btrfs_update_time,
8651 };
8652 static const struct inode_operations btrfs_symlink_inode_operations = {
8653 	.readlink	= generic_readlink,
8654 	.follow_link	= page_follow_link_light,
8655 	.put_link	= page_put_link,
8656 	.getattr	= btrfs_getattr,
8657 	.setattr	= btrfs_setattr,
8658 	.permission	= btrfs_permission,
8659 	.setxattr	= btrfs_setxattr,
8660 	.getxattr	= btrfs_getxattr,
8661 	.listxattr	= btrfs_listxattr,
8662 	.removexattr	= btrfs_removexattr,
8663 	.get_acl	= btrfs_get_acl,
8664 	.update_time	= btrfs_update_time,
8665 };
8666 
8667 const struct dentry_operations btrfs_dentry_operations = {
8668 	.d_delete	= btrfs_dentry_delete,
8669 	.d_release	= btrfs_dentry_release,
8670 };
8671