/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/bio.h>
#include <linux/buffer_head.h>
#include <linux/file.h>
#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/aio.h>
#include <linux/bit_spinlock.h>
#include <linux/xattr.h>
#include <linux/posix_acl.h>
#include <linux/falloc.h>
#include <linux/slab.h>
#include <linux/ratelimit.h>
#include <linux/mount.h>
#include <linux/btrfs.h>
#include <linux/blkdev.h>
#include <linux/posix_acl_xattr.h>
#include "compat.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "print-tree.h"
#include "ordered-data.h"
#include "xattr.h"
#include "tree-log.h"
#include "volumes.h"
#include "compression.h"
#include "locking.h"
#include "free-space-cache.h"
#include "inode-map.h"
#include "backref.h"
#include "hash.h"

struct btrfs_iget_args {
	u64 ino;
	struct btrfs_root *root;
};

static const struct inode_operations btrfs_dir_inode_operations;
static const struct inode_operations btrfs_symlink_inode_operations;
static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static struct extent_io_ops btrfs_extent_io_ops;

static struct kmem_cache *btrfs_inode_cachep;
static struct kmem_cache *btrfs_delalloc_work_cachep;
struct kmem_cache *btrfs_trans_handle_cachep;
struct kmem_cache *btrfs_transaction_cachep;
struct kmem_cache *btrfs_path_cachep;
struct kmem_cache *btrfs_free_space_cachep;

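/*
 * The S_IFMT file type bits occupy bits 12-15 of i_mode, so shifting a
 * mode right by S_SHIFT turns each S_IF* value into a small array index
 * for the table below.  For example, S_IFREG is 0100000 (octal), and
 * 0100000 >> 12 == 8, so slot 8 maps to BTRFS_FT_REG_FILE.
 */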
#define S_SHIFT 12
static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
};

static int btrfs_setsize(struct inode *inode, struct iattr *attr);
static int btrfs_truncate(struct inode *inode);
static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written, int unlock);
static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
					   u64 len, u64 orig_start,
					   u64 block_start, u64 block_len,
					   u64 orig_block_len, u64 ram_bytes,
					   int type);

static int btrfs_dirty_inode(struct inode *inode);

static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct inode *inode,  struct inode *dir,
				     const struct qstr *qstr)
{
	int err;

	err = btrfs_init_acl(trans, inode, dir);
	if (!err)
		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
	return err;
}

/*
 * this does all the hard work for inserting an inline extent into
 * the btree.  The caller should have done a btrfs_drop_extents so that
 * no overlapping inline items exist in the btree
 */
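/*
 * Note the resulting item is the leading fields of a
 * btrfs_file_extent_item immediately followed by the (possibly
 * compressed) file data, which is why datasize below is computed with
 * btrfs_file_extent_calc_inline_size() rather than sizeof(*ei).
 */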
static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_root *root, struct inode *inode,
				u64 start, size_t size, size_t compressed_size,
				int compress_type,
				struct page **compressed_pages)
{
	struct btrfs_key key;
	struct btrfs_path *path;
	struct extent_buffer *leaf;
	struct page *page = NULL;
	char *kaddr;
	unsigned long ptr;
	struct btrfs_file_extent_item *ei;
	int err = 0;
	int ret;
	size_t cur_size = size;
	size_t datasize;
	unsigned long offset;

	if (compressed_size && compressed_pages)
		cur_size = compressed_size;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	path->leave_spinning = 1;

	key.objectid = btrfs_ino(inode);
	key.offset = start;
	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
	datasize = btrfs_file_extent_calc_inline_size(cur_size);

	inode_add_bytes(inode, size);
	ret = btrfs_insert_empty_item(trans, root, path, &key,
				      datasize);
	if (ret) {
		err = ret;
		goto fail;
	}
	leaf = path->nodes[0];
	ei = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
	btrfs_set_file_extent_encryption(leaf, ei, 0);
	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
	ptr = btrfs_file_extent_inline_start(ei);

	if (compress_type != BTRFS_COMPRESS_NONE) {
		struct page *cpage;
		int i = 0;
		while (compressed_size > 0) {
			cpage = compressed_pages[i];
			cur_size = min_t(unsigned long, compressed_size,
				       PAGE_CACHE_SIZE);

			kaddr = kmap_atomic(cpage);
			write_extent_buffer(leaf, kaddr, ptr, cur_size);
			kunmap_atomic(kaddr);

			i++;
			ptr += cur_size;
			compressed_size -= cur_size;
		}
		btrfs_set_file_extent_compression(leaf, ei,
						  compress_type);
	} else {
		page = find_get_page(inode->i_mapping,
				     start >> PAGE_CACHE_SHIFT);
		btrfs_set_file_extent_compression(leaf, ei, 0);
		kaddr = kmap_atomic(page);
		offset = start & (PAGE_CACHE_SIZE - 1);
		write_extent_buffer(leaf, kaddr + offset, ptr, size);
		kunmap_atomic(kaddr);
		page_cache_release(page);
	}
	btrfs_mark_buffer_dirty(leaf);
	btrfs_free_path(path);

	/*
	 * we're an inline extent, so nobody can
	 * extend the file past i_size without locking
	 * a page we already have locked.
	 *
	 * We must do any isize and inode updates
	 * before we unlock the pages.  Otherwise we
	 * could end up racing with unlink.
	 */
	BTRFS_I(inode)->disk_i_size = inode->i_size;
	ret = btrfs_update_inode(trans, root, inode);

	return ret;
fail:
	btrfs_free_path(path);
	return err;
}


/*
 * conditionally insert an inline extent into the file.  This
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
static noinline int cow_file_range_inline(struct btrfs_root *root,
					  struct inode *inode, u64 start,
					  u64 end, size_t compressed_size,
					  int compress_type,
					  struct page **compressed_pages)
{
	struct btrfs_trans_handle *trans;
	u64 isize = i_size_read(inode);
	u64 actual_end = min(end + 1, isize);
	u64 inline_len = actual_end - start;
	u64 aligned_end = ALIGN(end, root->sectorsize);
	u64 data_len = inline_len;
	int ret;

	if (compressed_size)
		data_len = compressed_size;

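	/*
	 * Refuse to inline (return 1 so the caller falls back to a regular
	 * extent) unless the data starts at offset 0, fits in one page,
	 * stays under both BTRFS_MAX_INLINE_DATA_SIZE() and the max_inline
	 * mount option, and covers the tail of the file; an uncompressed,
	 * sector-aligned tail gains nothing from inlining.
	 */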
	if (start > 0 ||
	    actual_end >= PAGE_CACHE_SIZE ||
	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
	    (!compressed_size &&
	    (actual_end & (root->sectorsize - 1)) == 0) ||
	    end + 1 < isize ||
	    data_len > root->fs_info->max_inline) {
		return 1;
	}

	trans = btrfs_join_transaction(root);
	if (IS_ERR(trans))
		return PTR_ERR(trans);
	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
	if (ret) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	}

	if (isize > actual_end)
		inline_len = min_t(u64, isize, actual_end);
	ret = insert_inline_extent(trans, root, inode, start,
				   inline_len, compressed_size,
				   compress_type, compressed_pages);
	if (ret && ret != -ENOSPC) {
		btrfs_abort_transaction(trans, root, ret);
		goto out;
	} else if (ret == -ENOSPC) {
		ret = 1;
		goto out;
	}

	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
	btrfs_delalloc_release_metadata(inode, end + 1 - start);
	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
out:
	btrfs_end_transaction(trans, root);
	return ret;
}

struct async_extent {
	u64 start;
	u64 ram_size;
	u64 compressed_size;
	struct page **pages;
	unsigned long nr_pages;
	int compress_type;
	struct list_head list;
};

struct async_cow {
	struct inode *inode;
	struct btrfs_root *root;
	struct page *locked_page;
	u64 start;
	u64 end;
	struct list_head extents;
	struct btrfs_work work;
};

static noinline int add_async_extent(struct async_cow *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct page **pages,
				     unsigned long nr_pages,
				     int compress_type)
{
	struct async_extent *async_extent;

	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
	BUG_ON(!async_extent); /* -ENOMEM */
	async_extent->start = start;
	async_extent->ram_size = ram_size;
	async_extent->compressed_size = compressed_size;
	async_extent->pages = pages;
	async_extent->nr_pages = nr_pages;
	async_extent->compress_type = compress_type;
	list_add_tail(&async_extent->list, &cow->extents);
	return 0;
}

/*
 * we create compressed extents in two phases.  The first
 * phase compresses a range of pages that have already been
 * locked (both pages and state bits are locked).
 *
 * This is done inside an ordered work queue, and the compression
 * is spread across many cpus.  The actual IO submission is step
 * two, and the ordered work queue takes care of making sure that
 * happens in the same order things were put onto the queue by
 * writepages and friends.
 *
 * If this code finds it can't get good compression, it puts an
 * entry onto the work queue to write the uncompressed bytes.  This
 * makes sure that both compressed inodes and uncompressed inodes
 * are written in the same order that the flusher thread sent them
 * down.
 */
static noinline int compress_file_range(struct inode *inode,
					struct page *locked_page,
					u64 start, u64 end,
					struct async_cow *async_cow,
					int *num_added)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 num_bytes;
	u64 blocksize = root->sectorsize;
	u64 actual_end;
	u64 isize = i_size_read(inode);
	int ret = 0;
	struct page **pages = NULL;
	unsigned long nr_pages;
	unsigned long nr_pages_ret = 0;
	unsigned long total_compressed = 0;
	unsigned long total_in = 0;
	unsigned long max_compressed = 128 * 1024;
	unsigned long max_uncompressed = 128 * 1024;
	int i;
	int will_compress;
	int compress_type = root->fs_info->compress_type;
	int redirty = 0;

	/* if this is a small write inside eof, kick off a defrag */
	if ((end - start + 1) < 16 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	actual_end = min_t(u64, isize, end + 1);
again:
	will_compress = 0;
	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);

	/*
	 * we don't want to send crud past the end of i_size through
	 * compression, that's just a waste of CPU time.  So, if the
	 * end of the file is before the start of our current
	 * requested range of bytes, we bail out to the uncompressed
	 * cleanup code that can deal with all of this.
	 *
	 * It isn't really the fastest way to fix things, but this is a
	 * very uncommon corner.
	 */
	if (actual_end <= start)
		goto cleanup_and_bail_uncompressed;

	total_compressed = actual_end - start;

	/* we want to make sure that the amount of ram required to uncompress
	 * an extent is reasonable, so we limit the total size in ram
	 * of a compressed extent to 128k.  This is a crucial number
	 * because it also controls how easily we can spread reads across
	 * cpus for decompression.
	 *
	 * We also want to make sure the amount of IO required to do
	 * a random read is reasonably small, so we limit the size of
	 * a compressed extent to 128k.
	 */
	total_compressed = min(total_compressed, max_uncompressed);
	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	total_in = 0;
	ret = 0;

	/*
	 * we do compression for mount -o compress and when the
	 * inode has not been flagged as nocompress.  This flag can
	 * change at any time if we discover bad compression ratios.
	 */
	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
	    (btrfs_test_opt(root, COMPRESS) ||
	     (BTRFS_I(inode)->force_compress) ||
	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
		WARN_ON(pages);
		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
		if (!pages) {
			/* just bail out to the uncompressed code */
			goto cont;
		}

		if (BTRFS_I(inode)->force_compress)
			compress_type = BTRFS_I(inode)->force_compress;

		/*
		 * we need to call clear_page_dirty_for_io on each
		 * page in the range.  Otherwise applications with the file
		 * mmap'd can wander in and change the page contents while
		 * we are compressing them.
		 *
		 * If the compression fails for any reason, we set the pages
		 * dirty again later on.
		 */
		extent_range_clear_dirty_for_io(inode, start, end);
		redirty = 1;
		ret = btrfs_compress_pages(compress_type,
					   inode->i_mapping, start,
					   total_compressed, pages,
					   nr_pages, &nr_pages_ret,
					   &total_in,
					   &total_compressed,
					   max_compressed);

		if (!ret) {
			unsigned long offset = total_compressed &
				(PAGE_CACHE_SIZE - 1);
			struct page *page = pages[nr_pages_ret - 1];
			char *kaddr;

			/* zero the tail end of the last page, we might be
			 * sending it down to disk
			 */
			if (offset) {
				kaddr = kmap_atomic(page);
				memset(kaddr + offset, 0,
				       PAGE_CACHE_SIZE - offset);
				kunmap_atomic(kaddr);
			}
			will_compress = 1;
		}
	}
cont:
	if (start == 0) {
		/* lets try to make an inline extent */
		if (ret || total_in < (actual_end - start)) {
			/* we didn't compress the entire range, try
			 * to make an uncompressed inline extent.
			 */
			ret = cow_file_range_inline(root, inode, start, end,
						    0, 0, NULL);
		} else {
			/* try making a compressed inline extent */
			ret = cow_file_range_inline(root, inode, start, end,
						    total_compressed,
						    compress_type, pages);
		}
		if (ret <= 0) {
			unsigned long clear_flags = EXTENT_DELALLOC |
				EXTENT_DEFRAG;
			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;

			/*
			 * inline extent creation worked or returned error,
			 * we don't need to create any more async work items.
			 * Unlock and free up our temp pages.
			 */
			extent_clear_unlock_delalloc(inode, start, end, NULL,
						     clear_flags, PAGE_UNLOCK |
						     PAGE_CLEAR_DIRTY |
						     PAGE_SET_WRITEBACK |
						     PAGE_END_WRITEBACK);
			goto free_pages_out;
		}
	}

	if (will_compress) {
		/*
		 * we aren't doing an inline extent, so round the compressed
		 * size up to a block size boundary so the allocator does
		 * sane things
		 */
		total_compressed = ALIGN(total_compressed, blocksize);

		/*
		 * one last check to make sure the compression is really a
		 * win, compare the page count read with the blocks on disk
		 */
		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
		if (total_compressed >= total_in) {
			will_compress = 0;
		} else {
			num_bytes = total_in;
		}
	}
	if (!will_compress && pages) {
		/*
		 * the compression code ran but failed to make things smaller,
		 * free any pages it allocated and our page pointer array
		 */
		for (i = 0; i < nr_pages_ret; i++) {
			WARN_ON(pages[i]->mapping);
			page_cache_release(pages[i]);
		}
		kfree(pages);
		pages = NULL;
		total_compressed = 0;
		nr_pages_ret = 0;

		/* flag the file so we don't compress in the future */
		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
		}
	}
	if (will_compress) {
		*num_added += 1;

		/* the async work queues will take care of doing actual
		 * allocation on disk for these compressed pages,
		 * and will submit them to the elevator.
		 */
		add_async_extent(async_cow, start, num_bytes,
				 total_compressed, pages, nr_pages_ret,
				 compress_type);

		if (start + num_bytes < end) {
			start += num_bytes;
			pages = NULL;
			cond_resched();
			goto again;
		}
	} else {
cleanup_and_bail_uncompressed:
		/*
		 * No compression, but we still need to write the pages in
		 * the file we've been given so far.  redirty the locked
		 * page if it corresponds to our extent and set things up
		 * for the async work queue to run cow_file_range to do
		 * the normal delalloc dance
		 */
		if (page_offset(locked_page) >= start &&
		    page_offset(locked_page) <= end) {
			__set_page_dirty_nobuffers(locked_page);
			/* unlocked later on in the async handlers */
		}
		if (redirty)
			extent_range_redirty_for_io(inode, start, end);
		add_async_extent(async_cow, start, end - start + 1,
				 0, NULL, 0, BTRFS_COMPRESS_NONE);
		*num_added += 1;
	}

out:
	return ret;

free_pages_out:
	for (i = 0; i < nr_pages_ret; i++) {
		WARN_ON(pages[i]->mapping);
		page_cache_release(pages[i]);
	}
	kfree(pages);

	goto out;
}

/*
 * phase two of compressed writeback.  This is the ordered portion
 * of the code, which only gets called in the order the work was
 * queued.  We walk all the async extents created by compress_file_range
 * and send them down to the disk.
 */
static noinline int submit_compressed_extents(struct inode *inode,
					      struct async_cow *async_cow)
{
	struct async_extent *async_extent;
	u64 alloc_hint = 0;
	struct btrfs_key ins;
	struct extent_map *em;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_io_tree *io_tree;
	int ret = 0;

	if (list_empty(&async_cow->extents))
		return 0;

again:
	while (!list_empty(&async_cow->extents)) {
		async_extent = list_entry(async_cow->extents.next,
					  struct async_extent, list);
		list_del(&async_extent->list);

		io_tree = &BTRFS_I(inode)->io_tree;

retry:
		/* did the compression code fall back to uncompressed IO? */
		if (!async_extent->pages) {
			int page_started = 0;
			unsigned long nr_written = 0;

			lock_extent(io_tree, async_extent->start,
					 async_extent->start +
					 async_extent->ram_size - 1);

			/* allocate blocks */
			ret = cow_file_range(inode, async_cow->locked_page,
					     async_extent->start,
					     async_extent->start +
					     async_extent->ram_size - 1,
					     &page_started, &nr_written, 0);

			/* JDM XXX */

			/*
			 * if page_started, cow_file_range inserted an
			 * inline extent and took care of all the unlocking
			 * and IO for us.  Otherwise, we need to submit
			 * all those pages down to the drive.
			 */
			if (!page_started && !ret)
				extent_write_locked_range(io_tree,
						  inode, async_extent->start,
						  async_extent->start +
						  async_extent->ram_size - 1,
						  btrfs_get_extent,
						  WB_SYNC_ALL);
			else if (ret)
				unlock_page(async_cow->locked_page);
			kfree(async_extent);
			cond_resched();
			continue;
		}

		lock_extent(io_tree, async_extent->start,
			    async_extent->start + async_extent->ram_size - 1);

		ret = btrfs_reserve_extent(root,
					   async_extent->compressed_size,
					   async_extent->compressed_size,
					   0, alloc_hint, &ins, 1);
		if (ret) {
			int i;

			for (i = 0; i < async_extent->nr_pages; i++) {
				WARN_ON(async_extent->pages[i]->mapping);
				page_cache_release(async_extent->pages[i]);
			}
			kfree(async_extent->pages);
			async_extent->nr_pages = 0;
			async_extent->pages = NULL;

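			/*
			 * ->pages is NULL now, so if the -ENOSPC retry below
			 * runs, it takes the uncompressed cow_file_range()
			 * path at the top of this loop instead.
			 */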
			if (ret == -ENOSPC) {
				unlock_extent(io_tree, async_extent->start,
					      async_extent->start +
					      async_extent->ram_size - 1);
				goto retry;
			}
			goto out_free;
		}

		/*
		 * here we're doing allocation and writeback of the
		 * compressed pages
		 */
		btrfs_drop_extent_cache(inode, async_extent->start,
					async_extent->start +
					async_extent->ram_size - 1, 0);

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_free_reserve;
		}
		em->start = async_extent->start;
		em->len = async_extent->ram_size;
		em->orig_start = em->start;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = async_extent->ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		em->compress_type = async_extent->compress_type;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, async_extent->start,
						async_extent->start +
						async_extent->ram_size - 1, 0);
		}

		if (ret)
			goto out_free_reserve;

		ret = btrfs_add_ordered_extent_compress(inode,
						async_extent->start,
						ins.objectid,
						async_extent->ram_size,
						ins.offset,
						BTRFS_ORDERED_COMPRESSED,
						async_extent->compress_type);
		if (ret)
			goto out_free_reserve;

		/*
		 * clear dirty, set writeback and unlock the pages.
		 */
		extent_clear_unlock_delalloc(inode, async_extent->start,
				async_extent->start +
				async_extent->ram_size - 1,
				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				PAGE_SET_WRITEBACK);
		ret = btrfs_submit_compressed_write(inode,
				    async_extent->start,
				    async_extent->ram_size,
				    ins.objectid,
				    ins.offset, async_extent->pages,
				    async_extent->nr_pages);
		alloc_hint = ins.objectid + ins.offset;
		kfree(async_extent);
		if (ret)
			goto out;
		cond_resched();
	}
	ret = 0;
out:
	return ret;
out_free_reserve:
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_free:
	extent_clear_unlock_delalloc(inode, async_extent->start,
				     async_extent->start +
				     async_extent->ram_size - 1,
				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
	kfree(async_extent);
	goto again;
}

static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
				      u64 num_bytes)
{
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	struct extent_map *em;
	u64 alloc_hint = 0;

	read_lock(&em_tree->lock);
	em = search_extent_mapping(em_tree, start, num_bytes);
	if (em) {
		/*
		 * if block start isn't an actual block number then find the
		 * first block in this inode and use that as a hint.  If that
		 * block is also bogus then just don't worry about it.
		 */
		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
			free_extent_map(em);
			em = search_extent_mapping(em_tree, 0, 0);
			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
				alloc_hint = em->block_start;
			if (em)
				free_extent_map(em);
		} else {
			alloc_hint = em->block_start;
			free_extent_map(em);
		}
	}
	read_unlock(&em_tree->lock);

	return alloc_hint;
}

/*
 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code.  The basic idea is to
 * allocate extents on disk for the range, and create ordered data structs
 * in ram to track those extents.
 *
 * locked_page is the page that writepage had locked already.  We use
 * it to make sure we don't do extra locks or unlocks.
 *
 * *page_started is set to one if we unlock locked_page and do everything
 * required to start IO on it.  It may be clean and already done with
 * IO when we return.
 */
static noinline int cow_file_range(struct inode *inode,
				   struct page *locked_page,
				   u64 start, u64 end, int *page_started,
				   unsigned long *nr_written,
				   int unlock)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	u64 alloc_hint = 0;
	u64 num_bytes;
	unsigned long ram_size;
	u64 disk_num_bytes;
	u64 cur_alloc_size;
	u64 blocksize = root->sectorsize;
	struct btrfs_key ins;
	struct extent_map *em;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	int ret = 0;

	BUG_ON(btrfs_is_free_space_inode(inode));

	num_bytes = ALIGN(end - start + 1, blocksize);
	num_bytes = max(blocksize,  num_bytes);
	disk_num_bytes = num_bytes;

	/* if this is a small write inside eof, kick off defrag */
	if (num_bytes < 64 * 1024 &&
	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
		btrfs_add_inode_defrag(NULL, inode);

	if (start == 0) {
		/* lets try to make an inline extent */
		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
					    NULL);
		if (ret == 0) {
			extent_clear_unlock_delalloc(inode, start, end, NULL,
				     EXTENT_LOCKED | EXTENT_DELALLOC |
				     EXTENT_DEFRAG, PAGE_UNLOCK |
				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
				     PAGE_END_WRITEBACK);

			*nr_written = *nr_written +
			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
			*page_started = 1;
			goto out;
		} else if (ret < 0) {
			goto out_unlock;
		}
	}

	BUG_ON(disk_num_bytes >
	       btrfs_super_total_bytes(root->fs_info->super_copy));

	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

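	/*
	 * The allocator may return less than we asked for, so each pass
	 * of this loop carves out one extent, inserts its extent map and
	 * ordered extent, and advances start until the whole range is
	 * covered.
	 */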
	while (disk_num_bytes > 0) {
		unsigned long op;

		cur_alloc_size = disk_num_bytes;
		ret = btrfs_reserve_extent(root, cur_alloc_size,
					   root->sectorsize, 0, alloc_hint,
					   &ins, 1);
		if (ret < 0)
			goto out_unlock;

		em = alloc_extent_map();
		if (!em) {
			ret = -ENOMEM;
			goto out_reserve;
		}
		em->start = start;
		em->orig_start = em->start;
		ram_size = ins.offset;
		em->len = ins.offset;
		em->mod_start = em->start;
		em->mod_len = em->len;

		em->block_start = ins.objectid;
		em->block_len = ins.offset;
		em->orig_block_len = ins.offset;
		em->ram_bytes = ram_size;
		em->bdev = root->fs_info->fs_devices->latest_bdev;
		set_bit(EXTENT_FLAG_PINNED, &em->flags);
		em->generation = -1;

		while (1) {
			write_lock(&em_tree->lock);
			ret = add_extent_mapping(em_tree, em, 1);
			write_unlock(&em_tree->lock);
			if (ret != -EEXIST) {
				free_extent_map(em);
				break;
			}
			btrfs_drop_extent_cache(inode, start,
						start + ram_size - 1, 0);
		}
		if (ret)
			goto out_reserve;

		cur_alloc_size = ins.offset;
		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
					       ram_size, cur_alloc_size, 0);
		if (ret)
			goto out_reserve;

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, start,
						      cur_alloc_size);
			if (ret)
				goto out_reserve;
		}

		if (disk_num_bytes < cur_alloc_size)
			break;

		/* we're not doing compressed IO, don't unlock the first
		 * page (which the caller expects to stay locked), don't
		 * clear any dirty bits and don't set any writeback bits
		 *
		 * Do set the Private2 bit so we know this page was properly
		 * setup for writepage
		 */
		op = unlock ? PAGE_UNLOCK : 0;
		op |= PAGE_SET_PRIVATE2;

		extent_clear_unlock_delalloc(inode, start,
					     start + ram_size - 1, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC,
					     op);
		disk_num_bytes -= cur_alloc_size;
		num_bytes -= cur_alloc_size;
		alloc_hint = ins.objectid + ins.offset;
		start += cur_alloc_size;
	}
out:
	return ret;

out_reserve:
	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
out_unlock:
	extent_clear_unlock_delalloc(inode, start, end, locked_page,
				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
				     EXTENT_DELALLOC | EXTENT_DEFRAG,
				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
	goto out;
}

/*
 * work queue callback to start compression on a file's pages
 */
static noinline void async_cow_start(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	int num_added = 0;
	async_cow = container_of(work, struct async_cow, work);

	compress_file_range(async_cow->inode, async_cow->locked_page,
			    async_cow->start, async_cow->end, async_cow,
			    &num_added);
	if (num_added == 0) {
		btrfs_add_delayed_iput(async_cow->inode);
		async_cow->inode = NULL;
	}
}

/*
 * work queue callback to submit previously compressed pages
 */
static noinline void async_cow_submit(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	struct btrfs_root *root;
	unsigned long nr_pages;

	async_cow = container_of(work, struct async_cow, work);

	root = async_cow->root;
	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
		PAGE_CACHE_SHIFT;

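	/*
	 * Drop our pages from the async accounting and wake anyone
	 * throttled in cow_file_range_async() once the backlog has
	 * drained enough.
	 */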
	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
	    5 * 1024 * 1024 &&
	    waitqueue_active(&root->fs_info->async_submit_wait))
		wake_up(&root->fs_info->async_submit_wait);

	if (async_cow->inode)
		submit_compressed_extents(async_cow->inode, async_cow);
}

static noinline void async_cow_free(struct btrfs_work *work)
{
	struct async_cow *async_cow;
	async_cow = container_of(work, struct async_cow, work);
	if (async_cow->inode)
		btrfs_add_delayed_iput(async_cow->inode);
	kfree(async_cow);
}

static int cow_file_range_async(struct inode *inode, struct page *locked_page,
				u64 start, u64 end, int *page_started,
				unsigned long *nr_written)
{
	struct async_cow *async_cow;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	unsigned long nr_pages;
	u64 cur_end;
	int limit = 10 * 1024 * 1024;

	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
			 1, 0, NULL, GFP_NOFS);
	while (start < end) {
		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
		BUG_ON(!async_cow); /* -ENOMEM */
		async_cow->inode = igrab(inode);
		async_cow->root = root;
		async_cow->locked_page = locked_page;
		async_cow->start = start;

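		/*
		 * cap each async unit at 512k of data so the compression
		 * work can be spread across cpus; inodes flagged nocompress
		 * take the whole range in a single unit.
		 */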
		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
			cur_end = end;
		else
			cur_end = min(end, start + 512 * 1024 - 1);

		async_cow->end = cur_end;
		INIT_LIST_HEAD(&async_cow->extents);

		async_cow->work.func = async_cow_start;
		async_cow->work.ordered_func = async_cow_submit;
		async_cow->work.ordered_free = async_cow_free;
		async_cow->work.flags = 0;

		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
			PAGE_CACHE_SHIFT;
		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);

		btrfs_queue_worker(&root->fs_info->delalloc_workers,
				   &async_cow->work);

		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
			wait_event(root->fs_info->async_submit_wait,
			   (atomic_read(&root->fs_info->async_delalloc_pages) <
			    limit));
		}

		while (atomic_read(&root->fs_info->async_submit_draining) &&
		      atomic_read(&root->fs_info->async_delalloc_pages)) {
			wait_event(root->fs_info->async_submit_wait,
			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
			   0));
		}

		*nr_written += nr_pages;
		start = cur_end + 1;
	}
	*page_started = 1;
	return 0;
}

static noinline int csum_exist_in_range(struct btrfs_root *root,
					u64 bytenr, u64 num_bytes)
{
	int ret;
	struct btrfs_ordered_sum *sums;
	LIST_HEAD(list);

	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
				       bytenr + num_bytes - 1, &list, 0);
	if (ret == 0 && list_empty(&list))
		return 0;

	while (!list_empty(&list)) {
		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
		list_del(&sums->list);
		kfree(sums);
	}
	return 1;
}

/*
 * called during nocow writeback.  This checks for snapshots or COW copies
 * of the extents that exist in the file, and COWs the file as required.
 *
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
static noinline int run_delalloc_nocow(struct inode *inode,
				       struct page *locked_page,
			      u64 start, u64 end, int *page_started, int force,
			      unsigned long *nr_written)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_trans_handle *trans;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key found_key;
	u64 cow_start;
	u64 cur_offset;
	u64 extent_end;
	u64 extent_offset;
	u64 disk_bytenr;
	u64 num_bytes;
	u64 disk_num_bytes;
	u64 ram_bytes;
	int extent_type;
	int ret, err;
	int type;
	int nocow;
	int check_prev = 1;
	bool nolock;
	u64 ino = btrfs_ino(inode);

	path = btrfs_alloc_path();
	if (!path) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		return -ENOMEM;
	}

	nolock = btrfs_is_free_space_inode(inode);

	if (nolock)
		trans = btrfs_join_transaction_nolock(root);
	else
		trans = btrfs_join_transaction(root);

	if (IS_ERR(trans)) {
		extent_clear_unlock_delalloc(inode, start, end, locked_page,
					     EXTENT_LOCKED | EXTENT_DELALLOC |
					     EXTENT_DO_ACCOUNTING |
					     EXTENT_DEFRAG, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
		btrfs_free_path(path);
		return PTR_ERR(trans);
	}

	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

	cow_start = (u64)-1;
	cur_offset = start;
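	/*
	 * Walk the file extent items covering [start, end].  Ranges that
	 * cannot be written in place accumulate from cow_start and are
	 * flushed through cow_file_range() whenever a nocow extent (or
	 * the end of the range) is reached.
	 */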
	while (1) {
		ret = btrfs_lookup_file_extent(trans, root, path, ino,
					       cur_offset, 0);
		if (ret < 0) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
		if (ret > 0 && path->slots[0] > 0 && check_prev) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &found_key,
					      path->slots[0] - 1);
			if (found_key.objectid == ino &&
			    found_key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		check_prev = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			if (ret > 0)
				break;
			leaf = path->nodes[0];
		}

		nocow = 0;
		disk_bytenr = 0;
		num_bytes = 0;
		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);

		if (found_key.objectid > ino ||
		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
		    found_key.offset > end)
			break;

		if (found_key.offset > cur_offset) {
			extent_end = found_key.offset;
			extent_type = 0;
			goto out_check;
		}

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = found_key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
			disk_num_bytes =
				btrfs_file_extent_disk_num_bytes(leaf, fi);
			if (extent_end <= start) {
				path->slots[0]++;
				goto next_slot;
			}
			if (disk_bytenr == 0)
				goto out_check;
			if (btrfs_file_extent_compression(leaf, fi) ||
			    btrfs_file_extent_encryption(leaf, fi) ||
			    btrfs_file_extent_other_encoding(leaf, fi))
				goto out_check;
			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
				goto out_check;
			if (btrfs_extent_readonly(root, disk_bytenr))
				goto out_check;
			if (btrfs_cross_ref_exist(trans, root, ino,
						  found_key.offset -
						  extent_offset, disk_bytenr))
				goto out_check;
			disk_bytenr += extent_offset;
			disk_bytenr += cur_offset - found_key.offset;
			num_bytes = min(end + 1, extent_end) - cur_offset;
			/*
			 * force cow if csums exist in the range.
			 * This ensures that the csums for a given extent
			 * are either valid or do not exist.
			 */
			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
				goto out_check;
			nocow = 1;
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = found_key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
			extent_end = ALIGN(extent_end, root->sectorsize);
		} else {
			BUG_ON(1);
		}
out_check:
		if (extent_end <= start) {
			path->slots[0]++;
			goto next_slot;
		}
		if (!nocow) {
			if (cow_start == (u64)-1)
				cow_start = cur_offset;
			cur_offset = extent_end;
			if (cur_offset > end)
				break;
			path->slots[0]++;
			goto next_slot;
		}

		btrfs_release_path(path);
		if (cow_start != (u64)-1) {
			ret = cow_file_range(inode, locked_page,
					     cow_start, found_key.offset - 1,
					     page_started, nr_written, 1);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
			cow_start = (u64)-1;
		}

		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct extent_map *em;
			struct extent_map_tree *em_tree;
			em_tree = &BTRFS_I(inode)->extent_tree;
			em = alloc_extent_map();
			BUG_ON(!em); /* -ENOMEM */
			em->start = cur_offset;
			em->orig_start = found_key.offset - extent_offset;
			em->len = num_bytes;
			em->block_len = num_bytes;
			em->block_start = disk_bytenr;
			em->orig_block_len = disk_num_bytes;
			em->ram_bytes = ram_bytes;
			em->bdev = root->fs_info->fs_devices->latest_bdev;
			em->mod_start = em->start;
			em->mod_len = em->len;
			set_bit(EXTENT_FLAG_PINNED, &em->flags);
			set_bit(EXTENT_FLAG_FILLING, &em->flags);
			em->generation = -1;
			while (1) {
				write_lock(&em_tree->lock);
				ret = add_extent_mapping(em_tree, em, 1);
				write_unlock(&em_tree->lock);
				if (ret != -EEXIST) {
					free_extent_map(em);
					break;
				}
				btrfs_drop_extent_cache(inode, em->start,
						em->start + em->len - 1, 0);
			}
			type = BTRFS_ORDERED_PREALLOC;
		} else {
			type = BTRFS_ORDERED_NOCOW;
		}

		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
					       num_bytes, num_bytes, type);
		BUG_ON(ret); /* -ENOMEM */

		if (root->root_key.objectid ==
		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
			ret = btrfs_reloc_clone_csums(inode, cur_offset,
						      num_bytes);
			if (ret) {
				btrfs_abort_transaction(trans, root, ret);
				goto error;
			}
		}

		extent_clear_unlock_delalloc(inode, cur_offset,
					     cur_offset + num_bytes - 1,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC, PAGE_UNLOCK |
					     PAGE_SET_PRIVATE2);
		cur_offset = extent_end;
		if (cur_offset > end)
			break;
	}
	btrfs_release_path(path);

	if (cur_offset <= end && cow_start == (u64)-1) {
		cow_start = cur_offset;
		cur_offset = end;
	}

	if (cow_start != (u64)-1) {
		ret = cow_file_range(inode, locked_page, cow_start, end,
				     page_started, nr_written, 1);
		if (ret) {
			btrfs_abort_transaction(trans, root, ret);
			goto error;
		}
	}

error:
	err = btrfs_end_transaction(trans, root);
	if (!ret)
		ret = err;

	if (ret && cur_offset < end)
		extent_clear_unlock_delalloc(inode, cur_offset, end,
					     locked_page, EXTENT_LOCKED |
					     EXTENT_DELALLOC | EXTENT_DEFRAG |
					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
					     PAGE_CLEAR_DIRTY |
					     PAGE_SET_WRITEBACK |
					     PAGE_END_WRITEBACK);
	btrfs_free_path(path);
	return ret;
}

/*
 * extent_io.c callback to do delayed allocation processing
 */
static int run_delalloc_range(struct inode *inode, struct page *locked_page,
			      u64 start, u64 end, int *page_started,
			      unsigned long *nr_written)
{
	int ret;
	struct btrfs_root *root = BTRFS_I(inode)->root;

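	/*
	 * Pick a strategy: rewrite in place for nodatacow and prealloc
	 * inodes, plain COW when compression is off, or the async path
	 * that compresses ranges on the worker threads.
	 */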
	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 1, nr_written);
	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
	} else if (!btrfs_test_opt(root, COMPRESS) &&
		   !(BTRFS_I(inode)->force_compress) &&
		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
		ret = cow_file_range(inode, locked_page, start, end,
				      page_started, nr_written, 1);
	} else {
		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
			&BTRFS_I(inode)->runtime_flags);
		ret = cow_file_range_async(inode, locked_page, start, end,
					   page_started, nr_written);
	}
	return ret;
}

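/*
 * extent_io.c split_extent_hook: splitting a delalloc extent state leaves
 * us with one more outstanding extent to reserve metadata for, so bump the
 * count here (btrfs_merge_extent_hook below does the reverse).
 */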
static void btrfs_split_extent_hook(struct inode *inode,
				    struct extent_state *orig, u64 split)
{
	/* not delalloc, ignore it */
	if (!(orig->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents++;
	spin_unlock(&BTRFS_I(inode)->lock);
}

/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents so we can keep track of new extents that are just merged onto old
 * extents, such as when we are doing sequential writes, so we can properly
 * account for the metadata space we'll need.
 */
static void btrfs_merge_extent_hook(struct inode *inode,
				    struct extent_state *new,
				    struct extent_state *other)
{
	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return;

	spin_lock(&BTRFS_I(inode)->lock);
	BTRFS_I(inode)->outstanding_extents--;
	spin_unlock(&BTRFS_I(inode)->lock);
}

static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
				      struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
			      &root->delalloc_inodes);
		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			&BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes++;
		if (root->nr_delalloc_inodes == 1) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(!list_empty(&root->delalloc_root));
			list_add_tail(&root->delalloc_root,
				      &root->fs_info->delalloc_roots);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

static void btrfs_del_delalloc_inode(struct btrfs_root *root,
				     struct inode *inode)
{
	spin_lock(&root->delalloc_lock);
	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			  &BTRFS_I(inode)->runtime_flags);
		root->nr_delalloc_inodes--;
		if (!root->nr_delalloc_inodes) {
			spin_lock(&root->fs_info->delalloc_root_lock);
			BUG_ON(list_empty(&root->delalloc_root));
			list_del_init(&root->delalloc_root);
			spin_unlock(&root->fs_info->delalloc_root_lock);
		}
	}
	spin_unlock(&root->delalloc_lock);
}

/*
 * extent_io.c set_bit_hook, used to track delayed allocation
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
static void btrfs_set_bit_hook(struct inode *inode,
			       struct extent_state *state, unsigned long *bits)
{

	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents++;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes += len;
		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
					 &BTRFS_I(inode)->runtime_flags))
			btrfs_add_delalloc_inodes(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
static void btrfs_clear_bit_hook(struct inode *inode,
				 struct extent_state *state,
				 unsigned long *bits)
{
	/*
	 * set_bit and clear_bit hooks normally require _irqsave/restore
	 * but in this case, we are only testing for the DELALLOC
	 * bit, which is only set or cleared with irqs on
	 */
	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
		struct btrfs_root *root = BTRFS_I(inode)->root;
		u64 len = state->end + 1 - state->start;
		bool do_list = !btrfs_is_free_space_inode(inode);

		if (*bits & EXTENT_FIRST_DELALLOC) {
			*bits &= ~EXTENT_FIRST_DELALLOC;
		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
			spin_lock(&BTRFS_I(inode)->lock);
			BTRFS_I(inode)->outstanding_extents--;
			spin_unlock(&BTRFS_I(inode)->lock);
		}

		if (*bits & EXTENT_DO_ACCOUNTING)
			btrfs_delalloc_release_metadata(inode, len);

		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
		    && do_list && !(state->state & EXTENT_NORESERVE))
			btrfs_free_reserved_data_space(inode, len);

		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
				     root->fs_info->delalloc_batch);
		spin_lock(&BTRFS_I(inode)->lock);
		BTRFS_I(inode)->delalloc_bytes -= len;
		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
			     &BTRFS_I(inode)->runtime_flags))
			btrfs_del_delalloc_inode(root, inode);
		spin_unlock(&BTRFS_I(inode)->lock);
	}
}

/*
 * extent_io.c merge_bio_hook, this must check the chunk tree to make sure
 * we don't create bios that span stripes or chunks
 */
int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
			 size_t size, struct bio *bio,
			 unsigned long bio_flags)
{
	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
	u64 logical = (u64)bio->bi_sector << 9;
	u64 length = 0;
	u64 map_length;
	int ret;

	if (bio_flags & EXTENT_BIO_COMPRESSED)
		return 0;

	length = bio->bi_size;
	map_length = length;
	ret = btrfs_map_block(root->fs_info, rw, logical,
			      &map_length, NULL, 0);
	/* Will always return 0 with map_multi == NULL */
	BUG_ON(ret < 0);
	if (map_length < length + size)
		return 1;
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached to the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_start(struct inode *inode, int rw,
				    struct bio *bio, int mirror_num,
				    unsigned long bio_flags,
				    u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;

	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
	BUG_ON(ret); /* -ENOMEM */
	return 0;
}

/*
 * in order to insert checksums into the metadata in large chunks,
 * we wait until bio submission time.  All the pages in the bio are
 * checksummed and sums are attached onto the ordered extent record.
 *
 * At IO completion time the csums attached to the ordered extent record
 * are inserted into the btree
 */
static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
	if (ret)
		bio_endio(bio, ret);
	return ret;
}

/*
 * extent_io.c submission hook. This does the right thing for csum calculation
 * on write, or reading the csums from the tree before a read
 */
static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
			  int mirror_num, unsigned long bio_flags,
			  u64 bio_offset)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	int skip_sum;
	int metadata = 0;
	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);

	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;

	if (btrfs_is_free_space_inode(inode))
		metadata = 2;

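	/*
	 * Reads fetch (or skip) csums before the bio is mapped; writes
	 * either hand csum generation to the async helpers or do it
	 * inline below.
	 */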
1662 	if (!(rw & REQ_WRITE)) {
1663 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1664 		if (ret)
1665 			goto out;
1666 
1667 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1668 			ret = btrfs_submit_compressed_read(inode, bio,
1669 							   mirror_num,
1670 							   bio_flags);
1671 			goto out;
1672 		} else if (!skip_sum) {
1673 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1674 			if (ret)
1675 				goto out;
1676 		}
1677 		goto mapit;
1678 	} else if (async && !skip_sum) {
1679 		/* csum items have already been cloned */
1680 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1681 			goto mapit;
1682 		/* we're doing a write, do the async checksumming */
1683 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1684 				   inode, rw, bio, mirror_num,
1685 				   bio_flags, bio_offset,
1686 				   __btrfs_submit_bio_start,
1687 				   __btrfs_submit_bio_done);
1688 		goto out;
1689 	} else if (!skip_sum) {
1690 		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1691 		if (ret)
1692 			goto out;
1693 	}
1694 
1695 mapit:
1696 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1697 
1698 out:
1699 	if (ret < 0)
1700 		bio_endio(bio, ret);
1701 	return ret;
1702 }
1703 
1704 /*
1705  * given a list of ordered sums record them in the inode.  This happens
1706  * at IO completion time based on sums calculated at bio submission time.
1707  */
1708 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1709 			     struct inode *inode, u64 file_offset,
1710 			     struct list_head *list)
1711 {
1712 	struct btrfs_ordered_sum *sum;
1713 
1714 	list_for_each_entry(sum, list, list) {
1715 		trans->adding_csums = 1;
1716 		btrfs_csum_file_blocks(trans,
1717 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1718 		trans->adding_csums = 0;
1719 	}
1720 	return 0;
1721 }
1722 
1723 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1724 			      struct extent_state **cached_state)
1725 {
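	/*
	 * 'end' is the inclusive last byte of the delalloc range, so a
	 * well-formed range can never end exactly on a page boundary
	 */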
1726 	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1727 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1728 				   cached_state, GFP_NOFS);
1729 }
1730 
1731 /* see btrfs_writepage_start_hook for details on why this is required */
1732 struct btrfs_writepage_fixup {
1733 	struct page *page;
1734 	struct btrfs_work work;
1735 };
1736 
1737 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1738 {
1739 	struct btrfs_writepage_fixup *fixup;
1740 	struct btrfs_ordered_extent *ordered;
1741 	struct extent_state *cached_state = NULL;
1742 	struct page *page;
1743 	struct inode *inode;
1744 	u64 page_start;
1745 	u64 page_end;
1746 	int ret;
1747 
1748 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
1749 	page = fixup->page;
1750 again:
1751 	lock_page(page);
1752 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1753 		ClearPageChecked(page);
1754 		goto out_page;
1755 	}
1756 
1757 	inode = page->mapping->host;
1758 	page_start = page_offset(page);
1759 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1760 
1761 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1762 			 &cached_state);
1763 
1764 	/* already ordered? We're done */
1765 	if (PagePrivate2(page))
1766 		goto out;
1767 
1768 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
1769 	if (ordered) {
1770 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1771 				     page_end, &cached_state, GFP_NOFS);
1772 		unlock_page(page);
1773 		btrfs_start_ordered_extent(inode, ordered, 1);
1774 		btrfs_put_ordered_extent(ordered);
1775 		goto again;
1776 	}
1777 
1778 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1779 	if (ret) {
1780 		mapping_set_error(page->mapping, ret);
1781 		end_extent_writepage(page, ret, page_start, page_end);
1782 		ClearPageChecked(page);
1783 		goto out;
1784 	}
1785 
1786 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1787 	ClearPageChecked(page);
1788 	set_page_dirty(page);
1789 out:
1790 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1791 			     &cached_state, GFP_NOFS);
1792 out_page:
1793 	unlock_page(page);
1794 	page_cache_release(page);
1795 	kfree(fixup);
1796 }
1797 
1798 /*
1799  * There are a few paths in the higher layers of the kernel that directly
1800  * set the page dirty bit without asking the filesystem if it is a
1801  * good idea.  This causes problems because we want to make sure COW
1802  * properly happens and the data=ordered rules are followed.
1803  *
1804  * In our case any range that doesn't have the ORDERED bit set
1805  * hasn't been properly set up for IO.  We kick off an async process
1806  * to fix it up.  The async helper will wait for ordered extents, set
1807  * the delalloc bit and make it safe to write the page.
1808  */
1809 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1810 {
1811 	struct inode *inode = page->mapping->host;
1812 	struct btrfs_writepage_fixup *fixup;
1813 	struct btrfs_root *root = BTRFS_I(inode)->root;
1814 
1815 	/* this page is properly in the ordered list */
1816 	if (TestClearPagePrivate2(page))
1817 		return 0;
1818 
1819 	if (PageChecked(page))
1820 		return -EAGAIN;
1821 
1822 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1823 	if (!fixup)
1824 		return -EAGAIN;
1825 
1826 	SetPageChecked(page);
1827 	page_cache_get(page);
1828 	fixup->work.func = btrfs_writepage_fixup_worker;
1829 	fixup->page = page;
1830 	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
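	/*
	 * -EBUSY tells the caller that the fixup worker now owns this
	 * page; the worker redirties it once the delalloc/ordered state
	 * has been repaired
	 */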
1831 	return -EBUSY;
1832 }
1833 
1834 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1835 				       struct inode *inode, u64 file_pos,
1836 				       u64 disk_bytenr, u64 disk_num_bytes,
1837 				       u64 num_bytes, u64 ram_bytes,
1838 				       u8 compression, u8 encryption,
1839 				       u16 other_encoding, int extent_type)
1840 {
1841 	struct btrfs_root *root = BTRFS_I(inode)->root;
1842 	struct btrfs_file_extent_item *fi;
1843 	struct btrfs_path *path;
1844 	struct extent_buffer *leaf;
1845 	struct btrfs_key ins;
1846 	int ret;
1847 
1848 	path = btrfs_alloc_path();
1849 	if (!path)
1850 		return -ENOMEM;
1851 
1852 	path->leave_spinning = 1;
1853 
1854 	/*
1855 	 * we may be replacing one extent in the tree with another.
1856 	 * The new extent is pinned in the extent map, and we don't want
1857 	 * to drop it from the cache until it is completely in the btree.
1858 	 *
1859 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
1860 	 * The caller is expected to unpin it and allow it to be merged
1861 	 * with the others.
1862 	 */
1863 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
1864 				 file_pos + num_bytes, 0);
1865 	if (ret)
1866 		goto out;
1867 
1868 	ins.objectid = btrfs_ino(inode);
1869 	ins.offset = file_pos;
1870 	ins.type = BTRFS_EXTENT_DATA_KEY;
1871 	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1872 	if (ret)
1873 		goto out;
1874 	leaf = path->nodes[0];
1875 	fi = btrfs_item_ptr(leaf, path->slots[0],
1876 			    struct btrfs_file_extent_item);
1877 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1878 	btrfs_set_file_extent_type(leaf, fi, extent_type);
1879 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1880 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1881 	btrfs_set_file_extent_offset(leaf, fi, 0);
1882 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1883 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1884 	btrfs_set_file_extent_compression(leaf, fi, compression);
1885 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
1886 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1887 
1888 	btrfs_mark_buffer_dirty(leaf);
1889 	btrfs_release_path(path);
1890 
1891 	inode_add_bytes(inode, num_bytes);
1892 
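	/*
	 * reuse 'ins' to key the new extent item by disk bytenr so the
	 * reserved allocation is recorded in the extent tree as well
	 */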
1893 	ins.objectid = disk_bytenr;
1894 	ins.offset = disk_num_bytes;
1895 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1896 	ret = btrfs_alloc_reserved_file_extent(trans, root,
1897 					root->root_key.objectid,
1898 					btrfs_ino(inode), file_pos, &ins);
1899 out:
1900 	btrfs_free_path(path);
1901 
1902 	return ret;
1903 }
1904 
1905 /* snapshot-aware defrag */
1906 struct sa_defrag_extent_backref {
1907 	struct rb_node node;
1908 	struct old_sa_defrag_extent *old;
1909 	u64 root_id;
1910 	u64 inum;
1911 	u64 file_pos;
1912 	u64 extent_offset;
1913 	u64 num_bytes;
1914 	u64 generation;
1915 };
1916 
1917 struct old_sa_defrag_extent {
1918 	struct list_head list;
1919 	struct new_sa_defrag_extent *new;
1920 
1921 	u64 extent_offset;
1922 	u64 bytenr;
1923 	u64 offset;
1924 	u64 len;
1925 	int count;
1926 };
1927 
1928 struct new_sa_defrag_extent {
1929 	struct rb_root root;
1930 	struct list_head head;
1931 	struct btrfs_path *path;
1932 	struct inode *inode;
1933 	u64 file_pos;
1934 	u64 len;
1935 	u64 bytenr;
1936 	u64 disk_len;
1937 	u8 compress_type;
1938 };
1939 
1940 static int backref_comp(struct sa_defrag_extent_backref *b1,
1941 			struct sa_defrag_extent_backref *b2)
1942 {
1943 	if (b1->root_id < b2->root_id)
1944 		return -1;
1945 	else if (b1->root_id > b2->root_id)
1946 		return 1;
1947 
1948 	if (b1->inum < b2->inum)
1949 		return -1;
1950 	else if (b1->inum > b2->inum)
1951 		return 1;
1952 
1953 	if (b1->file_pos < b2->file_pos)
1954 		return -1;
1955 	else if (b1->file_pos > b2->file_pos)
1956 		return 1;
1957 
1958 	/*
1959 	 * [------------------------------] ===> (a range of space)
1960 	 *     |<--->|   |<---->| =============> (fs/file tree A)
1961 	 * |<---------------------------->| ===> (fs/file tree B)
1962 	 *
1963  * A range of space can refer to two file extents in one tree while
1964  * referring to only one file extent in another tree.
1965  *
1966  * So we may process the same disk offset more than once (two extents
1967  * in A) and land on the same extent (one extent in B), then insert
1968  * two identical backrefs (both referring to the extent in B).
1969 	 */
1970 	return 0;
1971 }
1972 
1973 static void backref_insert(struct rb_root *root,
1974 			   struct sa_defrag_extent_backref *backref)
1975 {
1976 	struct rb_node **p = &root->rb_node;
1977 	struct rb_node *parent = NULL;
1978 	struct sa_defrag_extent_backref *entry;
1979 	int ret;
1980 
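	/*
	 * equal keys (backref_comp() == 0) descend to the right, so the
	 * duplicate backrefs described above are kept side by side
	 * instead of being rejected
	 */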
1981 	while (*p) {
1982 		parent = *p;
1983 		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
1984 
1985 		ret = backref_comp(backref, entry);
1986 		if (ret < 0)
1987 			p = &(*p)->rb_left;
1988 		else
1989 			p = &(*p)->rb_right;
1990 	}
1991 
1992 	rb_link_node(&backref->node, parent, p);
1993 	rb_insert_color(&backref->node, root);
1994 }
1995 
1996 /*
1997  * Note the backref might have changed, and in this case we just return 0.
1998  */
1999 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2000 				       void *ctx)
2001 {
2002 	struct btrfs_file_extent_item *extent;
2003 	struct btrfs_fs_info *fs_info;
2004 	struct old_sa_defrag_extent *old = ctx;
2005 	struct new_sa_defrag_extent *new = old->new;
2006 	struct btrfs_path *path = new->path;
2007 	struct btrfs_key key;
2008 	struct btrfs_root *root;
2009 	struct sa_defrag_extent_backref *backref;
2010 	struct extent_buffer *leaf;
2011 	struct inode *inode = new->inode;
2012 	int slot;
2013 	int ret;
2014 	u64 extent_offset;
2015 	u64 num_bytes;
2016 
2017 	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2018 	    inum == btrfs_ino(inode))
2019 		return 0;
2020 
2021 	key.objectid = root_id;
2022 	key.type = BTRFS_ROOT_ITEM_KEY;
2023 	key.offset = (u64)-1;
2024 
2025 	fs_info = BTRFS_I(inode)->root->fs_info;
2026 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2027 	if (IS_ERR(root)) {
2028 		if (PTR_ERR(root) == -ENOENT)
2029 			return 0;
2030 		WARN_ON(1);
2031 		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2032 			 inum, offset, root_id);
2033 		return PTR_ERR(root);
2034 	}
2035 
2036 	key.objectid = inum;
2037 	key.type = BTRFS_EXTENT_DATA_KEY;
2038 	if (offset > (u64)-1 << 32)
2039 		key.offset = 0;
2040 	else
2041 		key.offset = offset;
2042 
2043 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2044 	if (ret < 0) {
2045 		WARN_ON(1);
2046 		return ret;
2047 	}
2048 	ret = 0;
2049 
2050 	while (1) {
2051 		cond_resched();
2052 
2053 		leaf = path->nodes[0];
2054 		slot = path->slots[0];
2055 
2056 		if (slot >= btrfs_header_nritems(leaf)) {
2057 			ret = btrfs_next_leaf(root, path);
2058 			if (ret < 0) {
2059 				goto out;
2060 			} else if (ret > 0) {
2061 				ret = 0;
2062 				goto out;
2063 			}
2064 			continue;
2065 		}
2066 
2067 		path->slots[0]++;
2068 
2069 		btrfs_item_key_to_cpu(leaf, &key, slot);
2070 
2071 		if (key.objectid > inum)
2072 			goto out;
2073 
2074 		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2075 			continue;
2076 
2077 		extent = btrfs_item_ptr(leaf, slot,
2078 					struct btrfs_file_extent_item);
2079 
2080 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2081 			continue;
2082 
2083 		/*
2084 		 * 'offset' refers to the exact key.offset,
2085 		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2086 		 * (key.offset - extent_offset).
2087 		 */
2088 		if (key.offset != offset)
2089 			continue;
2090 
2091 		extent_offset = btrfs_file_extent_offset(leaf, extent);
2092 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2093 
2094 		if (extent_offset >= old->extent_offset + old->offset +
2095 		    old->len || extent_offset + num_bytes <=
2096 		    old->extent_offset + old->offset)
2097 			continue;
2098 		break;
2099 	}
2100 
2101 	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2102 	if (!backref) {
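		/*
		 * the allocation failure is reported as -ENOENT because
		 * the caller treats -ENOENT as "no backref recorded" and
		 * anything else as fatal (see the BUG_ON in
		 * record_extent_backrefs)
		 */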
2103 		ret = -ENOENT;
2104 		goto out;
2105 	}
2106 
2107 	backref->root_id = root_id;
2108 	backref->inum = inum;
2109 	backref->file_pos = offset;
2110 	backref->num_bytes = num_bytes;
2111 	backref->extent_offset = extent_offset;
2112 	backref->generation = btrfs_file_extent_generation(leaf, extent);
2113 	backref->old = old;
2114 	backref_insert(&new->root, backref);
2115 	old->count++;
2116 out:
2117 	btrfs_release_path(path);
2118 	WARN_ON(ret);
2119 	return ret;
2120 }
2121 
2122 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2123 				   struct new_sa_defrag_extent *new)
2124 {
2125 	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2126 	struct old_sa_defrag_extent *old, *tmp;
2127 	int ret;
2128 
2129 	new->path = path;
2130 
2131 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2132 		ret = iterate_inodes_from_logical(old->bytenr +
2133 						  old->extent_offset, fs_info,
2134 						  path, record_one_backref,
2135 						  old);
2136 		BUG_ON(ret < 0 && ret != -ENOENT);
2137 
2138 		/* no backref to be processed for this extent */
2139 		if (!old->count) {
2140 			list_del(&old->list);
2141 			kfree(old);
2142 		}
2143 	}
2144 
2145 	if (list_empty(&new->head))
2146 		return false;
2147 
2148 	return true;
2149 }
2150 
2151 static int relink_is_mergable(struct extent_buffer *leaf,
2152 			      struct btrfs_file_extent_item *fi,
2153 			      struct new_sa_defrag_extent *new)
2154 {
2155 	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2156 		return 0;
2157 
2158 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2159 		return 0;
2160 
2161 	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2162 		return 0;
2163 
2164 	if (btrfs_file_extent_encryption(leaf, fi) ||
2165 	    btrfs_file_extent_other_encoding(leaf, fi))
2166 		return 0;
2167 
2168 	return 1;
2169 }
2170 
2171 /*
2172  * Note the backref might have changed, and in this case we just return 0.
2173  */
2174 static noinline int relink_extent_backref(struct btrfs_path *path,
2175 				 struct sa_defrag_extent_backref *prev,
2176 				 struct sa_defrag_extent_backref *backref)
2177 {
2178 	struct btrfs_file_extent_item *extent;
2179 	struct btrfs_file_extent_item *item;
2180 	struct btrfs_ordered_extent *ordered;
2181 	struct btrfs_trans_handle *trans;
2182 	struct btrfs_fs_info *fs_info;
2183 	struct btrfs_root *root;
2184 	struct btrfs_key key;
2185 	struct extent_buffer *leaf;
2186 	struct old_sa_defrag_extent *old = backref->old;
2187 	struct new_sa_defrag_extent *new = old->new;
2188 	struct inode *src_inode = new->inode;
2189 	struct inode *inode;
2190 	struct extent_state *cached = NULL;
2191 	int ret = 0;
2192 	u64 start;
2193 	u64 len;
2194 	u64 lock_start;
2195 	u64 lock_end;
2196 	bool merge = false;
2197 	int index;
2198 
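	/*
	 * if this backref is contiguous with the previously relinked one
	 * in the same file, we may be able to extend that extent item
	 * below instead of inserting a new one
	 */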
2199 	if (prev && prev->root_id == backref->root_id &&
2200 	    prev->inum == backref->inum &&
2201 	    prev->file_pos + prev->num_bytes == backref->file_pos)
2202 		merge = true;
2203 
2204 	/* step 1: get root */
2205 	key.objectid = backref->root_id;
2206 	key.type = BTRFS_ROOT_ITEM_KEY;
2207 	key.offset = (u64)-1;
2208 
2209 	fs_info = BTRFS_I(src_inode)->root->fs_info;
2210 	index = srcu_read_lock(&fs_info->subvol_srcu);
2211 
2212 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2213 	if (IS_ERR(root)) {
2214 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2215 		if (PTR_ERR(root) == -ENOENT)
2216 			return 0;
2217 		return PTR_ERR(root);
2218 	}
2219 
2220 	/* step 2: get inode */
2221 	key.objectid = backref->inum;
2222 	key.type = BTRFS_INODE_ITEM_KEY;
2223 	key.offset = 0;
2224 
2225 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2226 	if (IS_ERR(inode)) {
2227 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2228 		return 0;
2229 	}
2230 
2231 	srcu_read_unlock(&fs_info->subvol_srcu, index);
2232 
2233 	/* step 3: relink backref */
2234 	lock_start = backref->file_pos;
2235 	lock_end = backref->file_pos + backref->num_bytes - 1;
2236 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2237 			 0, &cached);
2238 
2239 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2240 	if (ordered) {
2241 		btrfs_put_ordered_extent(ordered);
2242 		goto out_unlock;
2243 	}
2244 
2245 	trans = btrfs_join_transaction(root);
2246 	if (IS_ERR(trans)) {
2247 		ret = PTR_ERR(trans);
2248 		goto out_unlock;
2249 	}
2250 
2251 	key.objectid = backref->inum;
2252 	key.type = BTRFS_EXTENT_DATA_KEY;
2253 	key.offset = backref->file_pos;
2254 
2255 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2256 	if (ret < 0) {
2257 		goto out_free_path;
2258 	} else if (ret > 0) {
2259 		ret = 0;
2260 		goto out_free_path;
2261 	}
2262 
2263 	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2264 				struct btrfs_file_extent_item);
2265 
2266 	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2267 	    backref->generation)
2268 		goto out_free_path;
2269 
2270 	btrfs_release_path(path);
2271 
2272 	start = backref->file_pos;
2273 	if (backref->extent_offset < old->extent_offset + old->offset)
2274 		start += old->extent_offset + old->offset -
2275 			 backref->extent_offset;
2276 
2277 	len = min(backref->extent_offset + backref->num_bytes,
2278 		  old->extent_offset + old->offset + old->len);
2279 	len -= max(backref->extent_offset, old->extent_offset + old->offset);
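	/*
	 * [start, start + len) is the part of this backref's file extent
	 * that overlaps the old (pre-defrag) extent's range
	 */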
2280 
2281 	ret = btrfs_drop_extents(trans, root, inode, start,
2282 				 start + len, 1);
2283 	if (ret)
2284 		goto out_free_path;
2285 again:
2286 	key.objectid = btrfs_ino(inode);
2287 	key.type = BTRFS_EXTENT_DATA_KEY;
2288 	key.offset = start;
2289 
2290 	path->leave_spinning = 1;
2291 	if (merge) {
2292 		struct btrfs_file_extent_item *fi;
2293 		u64 extent_len;
2294 		struct btrfs_key found_key;
2295 
2296 		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2297 		if (ret < 0)
2298 			goto out_free_path;
2299 
2300 		path->slots[0]--;
2301 		leaf = path->nodes[0];
2302 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2303 
2304 		fi = btrfs_item_ptr(leaf, path->slots[0],
2305 				    struct btrfs_file_extent_item);
2306 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2307 
2308 		if (extent_len + found_key.offset == start &&
2309 		    relink_is_mergable(leaf, fi, new)) {
2310 			btrfs_set_file_extent_num_bytes(leaf, fi,
2311 							extent_len + len);
2312 			btrfs_mark_buffer_dirty(leaf);
2313 			inode_add_bytes(inode, len);
2314 
2315 			ret = 1;
2316 			goto out_free_path;
2317 		} else {
2318 			merge = false;
2319 			btrfs_release_path(path);
2320 			goto again;
2321 		}
2322 	}
2323 
2324 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2325 					sizeof(*extent));
2326 	if (ret) {
2327 		btrfs_abort_transaction(trans, root, ret);
2328 		goto out_free_path;
2329 	}
2330 
2331 	leaf = path->nodes[0];
2332 	item = btrfs_item_ptr(leaf, path->slots[0],
2333 				struct btrfs_file_extent_item);
2334 	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2335 	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2336 	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2337 	btrfs_set_file_extent_num_bytes(leaf, item, len);
2338 	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2339 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2340 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2341 	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2342 	btrfs_set_file_extent_encryption(leaf, item, 0);
2343 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2344 
2345 	btrfs_mark_buffer_dirty(leaf);
2346 	inode_add_bytes(inode, len);
2347 	btrfs_release_path(path);
2348 
2349 	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2350 			new->disk_len, 0,
2351 			backref->root_id, backref->inum,
2352 			new->file_pos, 0);	/* start - extent_offset */
2353 	if (ret) {
2354 		btrfs_abort_transaction(trans, root, ret);
2355 		goto out_free_path;
2356 	}
2357 
2358 	ret = 1;
2359 out_free_path:
2360 	btrfs_release_path(path);
2361 	path->leave_spinning = 0;
2362 	btrfs_end_transaction(trans, root);
2363 out_unlock:
2364 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2365 			     &cached, GFP_NOFS);
2366 	iput(inode);
2367 	return ret;
2368 }
2369 
2370 static void relink_file_extents(struct new_sa_defrag_extent *new)
2371 {
2372 	struct btrfs_path *path;
2373 	struct old_sa_defrag_extent *old, *tmp;
2374 	struct sa_defrag_extent_backref *backref;
2375 	struct sa_defrag_extent_backref *prev = NULL;
2376 	struct inode *inode;
2377 	struct btrfs_root *root;
2378 	struct rb_node *node;
2379 	int ret;
2380 
2381 	inode = new->inode;
2382 	root = BTRFS_I(inode)->root;
2383 
2384 	path = btrfs_alloc_path();
2385 	if (!path)
2386 		return;
2387 
2388 	if (!record_extent_backrefs(path, new)) {
2389 		btrfs_free_path(path);
2390 		goto out;
2391 	}
2392 	btrfs_release_path(path);
2393 
2394 	while (1) {
2395 		node = rb_first(&new->root);
2396 		if (!node)
2397 			break;
2398 		rb_erase(node, &new->root);
2399 
2400 		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2401 
2402 		ret = relink_extent_backref(path, prev, backref);
2403 		WARN_ON(ret < 0);
2404 
2405 		kfree(prev);
2406 
2407 		if (ret == 1)
2408 			prev = backref;
2409 		else
2410 			prev = NULL;
2411 		cond_resched();
2412 	}
2413 	kfree(prev);
2414 
2415 	btrfs_free_path(path);
2416 
2417 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2418 		list_del(&old->list);
2419 		kfree(old);
2420 	}
2421 out:
2422 	atomic_dec(&root->fs_info->defrag_running);
2423 	wake_up(&root->fs_info->transaction_wait);
2424 
2425 	kfree(new);
2426 }
2427 
2428 static struct new_sa_defrag_extent *
2429 record_old_file_extents(struct inode *inode,
2430 			struct btrfs_ordered_extent *ordered)
2431 {
2432 	struct btrfs_root *root = BTRFS_I(inode)->root;
2433 	struct btrfs_path *path;
2434 	struct btrfs_key key;
2435 	struct old_sa_defrag_extent *old, *tmp;
2436 	struct new_sa_defrag_extent *new;
2437 	int ret;
2438 
2439 	new = kmalloc(sizeof(*new), GFP_NOFS);
2440 	if (!new)
2441 		return NULL;
2442 
2443 	new->inode = inode;
2444 	new->file_pos = ordered->file_offset;
2445 	new->len = ordered->len;
2446 	new->bytenr = ordered->start;
2447 	new->disk_len = ordered->disk_len;
2448 	new->compress_type = ordered->compress_type;
2449 	new->root = RB_ROOT;
2450 	INIT_LIST_HEAD(&new->head);
2451 
2452 	path = btrfs_alloc_path();
2453 	if (!path)
2454 		goto out_kfree;
2455 
2456 	key.objectid = btrfs_ino(inode);
2457 	key.type = BTRFS_EXTENT_DATA_KEY;
2458 	key.offset = new->file_pos;
2459 
2460 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2461 	if (ret < 0)
2462 		goto out_free_path;
2463 	if (ret > 0 && path->slots[0] > 0)
2464 		path->slots[0]--;
2465 
2466 	/* find out all the old extents for the file range */
2467 	while (1) {
2468 		struct btrfs_file_extent_item *extent;
2469 		struct extent_buffer *l;
2470 		int slot;
2471 		u64 num_bytes;
2472 		u64 offset;
2473 		u64 end;
2474 		u64 disk_bytenr;
2475 		u64 extent_offset;
2476 
2477 		l = path->nodes[0];
2478 		slot = path->slots[0];
2479 
2480 		if (slot >= btrfs_header_nritems(l)) {
2481 			ret = btrfs_next_leaf(root, path);
2482 			if (ret < 0)
2483 				goto out_free_list;
2484 			else if (ret > 0)
2485 				break;
2486 			continue;
2487 		}
2488 
2489 		btrfs_item_key_to_cpu(l, &key, slot);
2490 
2491 		if (key.objectid != btrfs_ino(inode))
2492 			break;
2493 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2494 			break;
2495 		if (key.offset >= new->file_pos + new->len)
2496 			break;
2497 
2498 		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2499 
2500 		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2501 		if (key.offset + num_bytes < new->file_pos)
2502 			goto next;
2503 
2504 		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2505 		if (!disk_bytenr)
2506 			goto next;
2507 
2508 		extent_offset = btrfs_file_extent_offset(l, extent);
2509 
2510 		old = kmalloc(sizeof(*old), GFP_NOFS);
2511 		if (!old)
2512 			goto out_free_list;
2513 
2514 		offset = max(new->file_pos, key.offset);
2515 		end = min(new->file_pos + new->len, key.offset + num_bytes);
2516 
2517 		old->bytenr = disk_bytenr;
2518 		old->extent_offset = extent_offset;
2519 		old->offset = offset - key.offset;
2520 		old->len = end - offset;
2521 		old->new = new;
2522 		old->count = 0;
2523 		list_add_tail(&old->list, &new->head);
2524 next:
2525 		path->slots[0]++;
2526 		cond_resched();
2527 	}
2528 
2529 	btrfs_free_path(path);
2530 	atomic_inc(&root->fs_info->defrag_running);
2531 
2532 	return new;
2533 
2534 out_free_list:
2535 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2536 		list_del(&old->list);
2537 		kfree(old);
2538 	}
2539 out_free_path:
2540 	btrfs_free_path(path);
2541 out_kfree:
2542 	kfree(new);
2543 	return NULL;
2544 }
2545 
2552 /* as ordered data IO finishes, this gets called so we can finish
2553  * an ordered extent if the range of bytes in the file it covers is
2554  * fully written.
2555  */
2556 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2557 {
2558 	struct inode *inode = ordered_extent->inode;
2559 	struct btrfs_root *root = BTRFS_I(inode)->root;
2560 	struct btrfs_trans_handle *trans = NULL;
2561 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2562 	struct extent_state *cached_state = NULL;
2563 	struct new_sa_defrag_extent *new = NULL;
2564 	int compress_type = 0;
2565 	int ret;
2566 	bool nolock;
2567 
2568 	nolock = btrfs_is_free_space_inode(inode);
2569 
2570 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2571 		ret = -EIO;
2572 		goto out;
2573 	}
2574 
2575 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2576 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2577 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2578 		if (nolock)
2579 			trans = btrfs_join_transaction_nolock(root);
2580 		else
2581 			trans = btrfs_join_transaction(root);
2582 		if (IS_ERR(trans)) {
2583 			ret = PTR_ERR(trans);
2584 			trans = NULL;
2585 			goto out;
2586 		}
2587 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2588 		ret = btrfs_update_inode_fallback(trans, root, inode);
2589 		if (ret) /* -ENOMEM or corruption */
2590 			btrfs_abort_transaction(trans, root, ret);
2591 		goto out;
2592 	}
2593 
2594 	lock_extent_bits(io_tree, ordered_extent->file_offset,
2595 			 ordered_extent->file_offset + ordered_extent->len - 1,
2596 			 0, &cached_state);
2597 
2598 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
2599 			ordered_extent->file_offset + ordered_extent->len - 1,
2600 			EXTENT_DEFRAG, 1, cached_state);
2601 	if (ret) {
2602 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2603 		if (last_snapshot >= BTRFS_I(inode)->generation)
2604 			/* the inode is shared */
2605 			new = record_old_file_extents(inode, ordered_extent);
2606 
2607 		clear_extent_bit(io_tree, ordered_extent->file_offset,
2608 			ordered_extent->file_offset + ordered_extent->len - 1,
2609 			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2610 	}
2611 
2612 	if (nolock)
2613 		trans = btrfs_join_transaction_nolock(root);
2614 	else
2615 		trans = btrfs_join_transaction(root);
2616 	if (IS_ERR(trans)) {
2617 		ret = PTR_ERR(trans);
2618 		trans = NULL;
2619 		goto out_unlock;
2620 	}
2621 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2622 
2623 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2624 		compress_type = ordered_extent->compress_type;
2625 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2626 		BUG_ON(compress_type);
2627 		ret = btrfs_mark_extent_written(trans, inode,
2628 						ordered_extent->file_offset,
2629 						ordered_extent->file_offset +
2630 						ordered_extent->len);
2631 	} else {
2632 		BUG_ON(root == root->fs_info->tree_root);
2633 		ret = insert_reserved_file_extent(trans, inode,
2634 						ordered_extent->file_offset,
2635 						ordered_extent->start,
2636 						ordered_extent->disk_len,
2637 						ordered_extent->len,
2638 						ordered_extent->len,
2639 						compress_type, 0, 0,
2640 						BTRFS_FILE_EXTENT_REG);
2641 	}
2642 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2643 			   ordered_extent->file_offset, ordered_extent->len,
2644 			   trans->transid);
2645 	if (ret < 0) {
2646 		btrfs_abort_transaction(trans, root, ret);
2647 		goto out_unlock;
2648 	}
2649 
2650 	add_pending_csums(trans, inode, ordered_extent->file_offset,
2651 			  &ordered_extent->list);
2652 
2653 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2654 	ret = btrfs_update_inode_fallback(trans, root, inode);
2655 	if (ret) { /* -ENOMEM or corruption */
2656 		btrfs_abort_transaction(trans, root, ret);
2657 		goto out_unlock;
2658 	}
2659 	ret = 0;
2660 out_unlock:
2661 	unlock_extent_cached(io_tree, ordered_extent->file_offset,
2662 			     ordered_extent->file_offset +
2663 			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
2664 out:
2665 	if (root != root->fs_info->tree_root)
2666 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2667 	if (trans)
2668 		btrfs_end_transaction(trans, root);
2669 
2670 	if (ret) {
2671 		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
2672 				      ordered_extent->file_offset +
2673 				      ordered_extent->len - 1, NULL, GFP_NOFS);
2674 
2675 		/*
2676 		 * If the ordered extent had an IOERR or something else went
2677 		 * wrong we need to return the space for this ordered extent
2678 		 * back to the allocator.
2679 		 */
2680 		if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2681 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2682 			btrfs_free_reserved_extent(root, ordered_extent->start,
2683 						   ordered_extent->disk_len);
2684 	}
2685 
2687 	/*
2688 	 * This needs to be done to make sure anybody waiting knows we are done
2689 	 * updating everything for this ordered extent.
2690 	 */
2691 	btrfs_remove_ordered_extent(inode, ordered_extent);
2692 
2693 	/* for snapshot-aware defrag */
2694 	if (new)
2695 		relink_file_extents(new);
2696 
2697 	/* once for us */
2698 	btrfs_put_ordered_extent(ordered_extent);
2699 	/* once for the tree */
2700 	btrfs_put_ordered_extent(ordered_extent);
2701 
2702 	return ret;
2703 }
2704 
2705 static void finish_ordered_fn(struct btrfs_work *work)
2706 {
2707 	struct btrfs_ordered_extent *ordered_extent;
2708 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2709 	btrfs_finish_ordered_io(ordered_extent);
2710 }
2711 
2712 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2713 				struct extent_state *state, int uptodate)
2714 {
2715 	struct inode *inode = page->mapping->host;
2716 	struct btrfs_root *root = BTRFS_I(inode)->root;
2717 	struct btrfs_ordered_extent *ordered_extent = NULL;
2718 	struct btrfs_workers *workers;
2719 
2720 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2721 
2722 	ClearPagePrivate2(page);
2723 	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2724 					    end - start + 1, uptodate))
2725 		return 0;
2726 
2727 	ordered_extent->work.func = finish_ordered_fn;
2728 	ordered_extent->work.flags = 0;
2729 
2730 	if (btrfs_is_free_space_inode(inode))
2731 		workers = &root->fs_info->endio_freespace_worker;
2732 	else
2733 		workers = &root->fs_info->endio_write_workers;
2734 	btrfs_queue_worker(workers, &ordered_extent->work);
2735 
2736 	return 0;
2737 }
2738 
2739 /*
2740  * when reads are done, we need to check csums to verify the data is correct
2741  * if there's a match, we allow the bio to finish.  If not, the code in
2742  * extent_io.c will try to find good copies for us.
2743  */
2744 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2745 				      u64 phy_offset, struct page *page,
2746 				      u64 start, u64 end, int mirror)
2747 {
2748 	size_t offset = start - page_offset(page);
2749 	struct inode *inode = page->mapping->host;
2750 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2751 	char *kaddr;
2752 	struct btrfs_root *root = BTRFS_I(inode)->root;
2753 	u32 csum_expected;
2754 	u32 csum = ~(u32)0;
2755 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2756 	                              DEFAULT_RATELIMIT_BURST);
2757 
2758 	if (PageChecked(page)) {
2759 		ClearPageChecked(page);
2760 		goto good;
2761 	}
2762 
2763 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2764 		goto good;
2765 
2766 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2767 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2768 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2769 				  GFP_NOFS);
2770 		return 0;
2771 	}
2772 
2773 	phy_offset >>= inode->i_sb->s_blocksize_bits;
2774 	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2775 
2776 	kaddr = kmap_atomic(page);
2777 	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
2778 	btrfs_csum_final(csum, (char *)&csum);
2779 	if (csum != csum_expected)
2780 		goto zeroit;
2781 
2782 	kunmap_atomic(kaddr);
2783 good:
2784 	return 0;
2785 
2786 zeroit:
2787 	if (__ratelimit(&_rs))
2788 		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2789 			btrfs_ino(page->mapping->host), start, csum, csum_expected);
2790 	memset(kaddr + offset, 1, end - start + 1);
2791 	flush_dcache_page(page);
2792 	kunmap_atomic(kaddr);
2793 	if (csum_expected == 0)
2794 		return 0;
2795 	return -EIO;
2796 }
2797 
2798 struct delayed_iput {
2799 	struct list_head list;
2800 	struct inode *inode;
2801 };
2802 
2803 /* JDM: If this is fs-wide, why can't we add a pointer to
2804  * btrfs_inode instead and avoid the allocation? */
2805 void btrfs_add_delayed_iput(struct inode *inode)
2806 {
2807 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2808 	struct delayed_iput *delayed;
2809 
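	/*
	 * if ours is not the last reference, just drop it here; only the
	 * final iput is deferred to the delayed list
	 */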
2810 	if (atomic_add_unless(&inode->i_count, -1, 1))
2811 		return;
2812 
2813 	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2814 	delayed->inode = inode;
2815 
2816 	spin_lock(&fs_info->delayed_iput_lock);
2817 	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2818 	spin_unlock(&fs_info->delayed_iput_lock);
2819 }
2820 
2821 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2822 {
2823 	LIST_HEAD(list);
2824 	struct btrfs_fs_info *fs_info = root->fs_info;
2825 	struct delayed_iput *delayed;
2826 	int empty;
2827 
2828 	spin_lock(&fs_info->delayed_iput_lock);
2829 	empty = list_empty(&fs_info->delayed_iputs);
2830 	spin_unlock(&fs_info->delayed_iput_lock);
2831 	if (empty)
2832 		return;
2833 
2834 	spin_lock(&fs_info->delayed_iput_lock);
2835 	list_splice_init(&fs_info->delayed_iputs, &list);
2836 	spin_unlock(&fs_info->delayed_iput_lock);
2837 
2838 	while (!list_empty(&list)) {
2839 		delayed = list_entry(list.next, struct delayed_iput, list);
2840 		list_del(&delayed->list);
2841 		iput(delayed->inode);
2842 		kfree(delayed);
2843 	}
2844 }
2845 
2846 /*
2847  * This is called at transaction commit time. If there are no orphan
2848  * files in the subvolume, it removes the orphan item and frees the
2849  * block_rsv structure.
2850  */
2851 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2852 			      struct btrfs_root *root)
2853 {
2854 	struct btrfs_block_rsv *block_rsv;
2855 	int ret;
2856 
2857 	if (atomic_read(&root->orphan_inodes) ||
2858 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2859 		return;
2860 
2861 	spin_lock(&root->orphan_lock);
2862 	if (atomic_read(&root->orphan_inodes)) {
2863 		spin_unlock(&root->orphan_lock);
2864 		return;
2865 	}
2866 
2867 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2868 		spin_unlock(&root->orphan_lock);
2869 		return;
2870 	}
2871 
2872 	block_rsv = root->orphan_block_rsv;
2873 	root->orphan_block_rsv = NULL;
2874 	spin_unlock(&root->orphan_lock);
2875 
2876 	if (root->orphan_item_inserted &&
2877 	    btrfs_root_refs(&root->root_item) > 0) {
2878 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2879 					    root->root_key.objectid);
2880 		if (ret)
2881 			btrfs_abort_transaction(trans, root, ret);
2882 		else
2883 			root->orphan_item_inserted = 0;
2884 	}
2885 
2886 	if (block_rsv) {
2887 		WARN_ON(block_rsv->size > 0);
2888 		btrfs_free_block_rsv(root, block_rsv);
2889 	}
2890 }
2891 
2892 /*
2893  * This creates an orphan entry for the given inode in case something goes
2894  * wrong in the middle of an unlink/truncate.
2895  *
2896  * NOTE: the caller of this function should reserve 5 units of
2897  *	 metadata for it.
2898  */
2899 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2900 {
2901 	struct btrfs_root *root = BTRFS_I(inode)->root;
2902 	struct btrfs_block_rsv *block_rsv = NULL;
2903 	int reserve = 0;
2904 	int insert = 0;
2905 	int ret;
2906 
2907 	if (!root->orphan_block_rsv) {
2908 		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2909 		if (!block_rsv)
2910 			return -ENOMEM;
2911 	}
2912 
2913 	spin_lock(&root->orphan_lock);
2914 	if (!root->orphan_block_rsv) {
2915 		root->orphan_block_rsv = block_rsv;
2916 	} else if (block_rsv) {
2917 		btrfs_free_block_rsv(root, block_rsv);
2918 		block_rsv = NULL;
2919 	}
2920 
2921 	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2922 			      &BTRFS_I(inode)->runtime_flags)) {
2923 #if 0
2924 		/*
2925 		 * For proper ENOSPC handling, we should do orphan
2926 		 * cleanup when mounting. But this introduces a backward
2927 		 * compatibility issue.
2928 		 */
2929 		if (!xchg(&root->orphan_item_inserted, 1))
2930 			insert = 2;
2931 		else
2932 			insert = 1;
2933 #endif
2934 		insert = 1;
2935 		atomic_inc(&root->orphan_inodes);
2936 	}
2937 
2938 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2939 			      &BTRFS_I(inode)->runtime_flags))
2940 		reserve = 1;
2941 	spin_unlock(&root->orphan_lock);
2942 
2943 	/* grab metadata reservation from transaction handle */
2944 	if (reserve) {
2945 		ret = btrfs_orphan_reserve_metadata(trans, inode);
2946 		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
2947 	}
2948 
2949 	/* insert an orphan item to track this unlinked/truncated file */
2950 	if (insert >= 1) {
2951 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2952 		if (ret) {
2953 			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2954 				  &BTRFS_I(inode)->runtime_flags);
2955 			if (reserve) {
2956 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2957 					  &BTRFS_I(inode)->runtime_flags);
2958 				btrfs_orphan_release_metadata(inode);
2959 			}
2960 			if (ret != -EEXIST) {
2961 				btrfs_abort_transaction(trans, root, ret);
2962 				return ret;
2963 			}
2964 		}
2965 		ret = 0;
2966 	}
2967 
2968 	/* insert an orphan item to track that the subvolume contains orphan files */
2969 	if (insert >= 2) {
2970 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2971 					       root->root_key.objectid);
2972 		if (ret && ret != -EEXIST) {
2973 			btrfs_abort_transaction(trans, root, ret);
2974 			return ret;
2975 		}
2976 	}
2977 	return 0;
2978 }
2979 
2980 /*
2981  * We have done the truncate/delete so we can go ahead and remove the orphan
2982  * item for this particular inode.
2983  */
2984 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
2985 			    struct inode *inode)
2986 {
2987 	struct btrfs_root *root = BTRFS_I(inode)->root;
2988 	int delete_item = 0;
2989 	int release_rsv = 0;
2990 	int ret = 0;
2991 
2992 	spin_lock(&root->orphan_lock);
2993 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2994 			       &BTRFS_I(inode)->runtime_flags))
2995 		delete_item = 1;
2996 
2997 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2998 			       &BTRFS_I(inode)->runtime_flags))
2999 		release_rsv = 1;
3000 	spin_unlock(&root->orphan_lock);
3001 
3002 	if (trans && delete_item)
3003 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
3004 
3005 	if (release_rsv) {
3006 		btrfs_orphan_release_metadata(inode);
3007 		atomic_dec(&root->orphan_inodes);
3008 	}
3009 
3010 	return ret;
3011 }
3012 
3013 /*
3014  * this cleans up any orphans that may be left on the list from the last use
3015  * of this root.
3016  */
3017 int btrfs_orphan_cleanup(struct btrfs_root *root)
3018 {
3019 	struct btrfs_path *path;
3020 	struct extent_buffer *leaf;
3021 	struct btrfs_key key, found_key;
3022 	struct btrfs_trans_handle *trans;
3023 	struct inode *inode;
3024 	u64 last_objectid = 0;
3025 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3026 
3027 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3028 		return 0;
3029 
3030 	path = btrfs_alloc_path();
3031 	if (!path) {
3032 		ret = -ENOMEM;
3033 		goto out;
3034 	}
3035 	path->reada = -1;
3036 
3037 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3038 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3039 	key.offset = (u64)-1;
3040 
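	/*
	 * search from the highest possible key; each pass of the loop
	 * below processes the highest-keyed orphan item still in the tree
	 */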
3041 	while (1) {
3042 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3043 		if (ret < 0)
3044 			goto out;
3045 
3046 		/*
3047 		 * ret == 0 means we found what we were searching for, which
3048 		 * is weird, but possible.  Only adjust the path if we didn't
3049 		 * find the key, and check whether what we did find matches
3050 		 */
3051 		if (ret > 0) {
3052 			ret = 0;
3053 			if (path->slots[0] == 0)
3054 				break;
3055 			path->slots[0]--;
3056 		}
3057 
3058 		/* pull out the item */
3059 		leaf = path->nodes[0];
3060 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3061 
3062 		/* make sure the item matches what we want */
3063 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3064 			break;
3065 		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3066 			break;
3067 
3068 		/* release the path since we're done with it */
3069 		btrfs_release_path(path);
3070 
3071 		/*
3072 		 * this is basically btrfs_lookup, without the root-crossing
3073 		 * part.  We store the inode number in the offset of the
3074 		 * orphan item.
3075 		 */
3076 
3077 		if (found_key.offset == last_objectid) {
3078 			btrfs_err(root->fs_info,
3079 				"Error removing orphan entry, stopping orphan cleanup");
3080 			ret = -EINVAL;
3081 			goto out;
3082 		}
3083 
3084 		last_objectid = found_key.offset;
3085 
3086 		found_key.objectid = found_key.offset;
3087 		found_key.type = BTRFS_INODE_ITEM_KEY;
3088 		found_key.offset = 0;
3089 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3090 		ret = PTR_RET(inode);
3091 		if (ret && ret != -ESTALE)
3092 			goto out;
3093 
3094 		if (ret == -ESTALE && root == root->fs_info->tree_root) {
3095 			struct btrfs_root *dead_root;
3096 			struct btrfs_fs_info *fs_info = root->fs_info;
3097 			int is_dead_root = 0;
3098 
3099 			/*
3100 			 * this is an orphan in the tree root. Currently these
3101 			 * could come from 2 sources:
3102 			 *  a) a snapshot deletion in progress
3103 			 *  b) a free space cache inode
3104 			 * We need to distinguish those two, as the snapshot
3105 			 * orphan must not get deleted.
3106 			 * find_dead_roots already ran before us, so if this
3107 			 * is a snapshot deletion, we should find the root
3108 			 * in the dead_roots list
3109 			 */
3110 			spin_lock(&fs_info->trans_lock);
3111 			list_for_each_entry(dead_root, &fs_info->dead_roots,
3112 					    root_list) {
3113 				if (dead_root->root_key.objectid ==
3114 				    found_key.objectid) {
3115 					is_dead_root = 1;
3116 					break;
3117 				}
3118 			}
3119 			spin_unlock(&fs_info->trans_lock);
3120 			if (is_dead_root) {
3121 				/* prevent this orphan from being found again */
3122 				key.offset = found_key.objectid - 1;
3123 				continue;
3124 			}
3125 		}
3126 		/*
3127 		 * Inode is already gone but the orphan item is still there,
3128 		 * kill the orphan item.
3129 		 */
3130 		if (ret == -ESTALE) {
3131 			trans = btrfs_start_transaction(root, 1);
3132 			if (IS_ERR(trans)) {
3133 				ret = PTR_ERR(trans);
3134 				goto out;
3135 			}
3136 			btrfs_debug(root->fs_info, "auto deleting %Lu",
3137 				found_key.objectid);
3138 			ret = btrfs_del_orphan_item(trans, root,
3139 						    found_key.objectid);
3140 			btrfs_end_transaction(trans, root);
3141 			if (ret)
3142 				goto out;
3143 			continue;
3144 		}
3145 
3146 		/*
3147 		 * add this inode to the orphan list so btrfs_orphan_del does
3148 		 * the proper thing when we hit it
3149 		 */
3150 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3151 			&BTRFS_I(inode)->runtime_flags);
3152 		atomic_inc(&root->orphan_inodes);
3153 
3154 		/* if we have links, this was a truncate, let's do that */
3155 		if (inode->i_nlink) {
3156 			if (!S_ISREG(inode->i_mode)) {
3157 				WARN_ON(1);
3158 				iput(inode);
3159 				continue;
3160 			}
3161 			nr_truncate++;
3162 
3163 			/* 1 for the orphan item deletion. */
3164 			trans = btrfs_start_transaction(root, 1);
3165 			if (IS_ERR(trans)) {
3166 				iput(inode);
3167 				ret = PTR_ERR(trans);
3168 				goto out;
3169 			}
3170 			ret = btrfs_orphan_add(trans, inode);
3171 			btrfs_end_transaction(trans, root);
3172 			if (ret) {
3173 				iput(inode);
3174 				goto out;
3175 			}
3176 
3177 			ret = btrfs_truncate(inode);
3178 			if (ret)
3179 				btrfs_orphan_del(NULL, inode);
3180 		} else {
3181 			nr_unlink++;
3182 		}
3183 
3184 		/* this will do delete_inode and everything for us */
3185 		iput(inode);
3186 		if (ret)
3187 			goto out;
3188 	}
3189 	/* release the path since we're done with it */
3190 	btrfs_release_path(path);
3191 
3192 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3193 
3194 	if (root->orphan_block_rsv)
3195 		btrfs_block_rsv_release(root, root->orphan_block_rsv,
3196 					(u64)-1);
3197 
3198 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
3199 		trans = btrfs_join_transaction(root);
3200 		if (!IS_ERR(trans))
3201 			btrfs_end_transaction(trans, root);
3202 	}
3203 
3204 	if (nr_unlink)
3205 		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3206 	if (nr_truncate)
3207 		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3208 
3209 out:
3210 	if (ret)
3211 		btrfs_crit(root->fs_info,
3212 			"could not do orphan cleanup %d", ret);
3213 	btrfs_free_path(path);
3214 	return ret;
3215 }
3216 
3217 /*
3218  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3219  * don't find any xattrs, we know there can't be any acls.
3220  *
3221  * slot is the slot the inode is in, objectid is the objectid of the inode
3222  */
3223 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3224 					  int slot, u64 objectid)
3225 {
3226 	u32 nritems = btrfs_header_nritems(leaf);
3227 	struct btrfs_key found_key;
3228 	static u64 xattr_access = 0;
3229 	static u64 xattr_default = 0;
3230 	int scanned = 0;
3231 
3232 	if (!xattr_access) {
3233 		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3234 					strlen(POSIX_ACL_XATTR_ACCESS));
3235 		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3236 					strlen(POSIX_ACL_XATTR_DEFAULT));
3237 	}
3238 
3239 	slot++;
3240 	while (slot < nritems) {
3241 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3242 
3243 		/* we found a different objectid, there must not be acls */
3244 		if (found_key.objectid != objectid)
3245 			return 0;
3246 
3247 		/* we found an xattr, assume we've got an acl */
3248 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3249 			if (found_key.offset == xattr_access ||
3250 			    found_key.offset == xattr_default)
3251 				return 1;
3252 		}
3253 
3254 		/*
3255 		 * we found a key greater than an xattr key, there can't
3256 		 * be any acls later on
3257 		 */
3258 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3259 			return 0;
3260 
3261 		slot++;
3262 		scanned++;
3263 
3264 		/*
3265 		 * it goes inode, inode backrefs, xattrs, extents,
3266 		 * so if there are a ton of hard links to an inode there can
3267 		 * be a lot of backrefs.  Don't waste time searching too hard,
3268 		 * this is just an optimization
3269 		 */
3270 		if (scanned >= 8)
3271 			break;
3272 	}
3273 	/* we hit the end of the leaf before we found an xattr or
3274 	 * something larger than an xattr.  We have to assume the inode
3275 	 * has acls
3276 	 */
3277 	return 1;
3278 }
3279 
3280 /*
3281  * read an inode from the btree into the in-memory inode
3282  */
3283 static void btrfs_read_locked_inode(struct inode *inode)
3284 {
3285 	struct btrfs_path *path;
3286 	struct extent_buffer *leaf;
3287 	struct btrfs_inode_item *inode_item;
3288 	struct btrfs_timespec *tspec;
3289 	struct btrfs_root *root = BTRFS_I(inode)->root;
3290 	struct btrfs_key location;
3291 	int maybe_acls;
3292 	u32 rdev;
3293 	int ret;
3294 	bool filled = false;
3295 
3296 	ret = btrfs_fill_inode(inode, &rdev);
3297 	if (!ret)
3298 		filled = true;
3299 
3300 	path = btrfs_alloc_path();
3301 	if (!path)
3302 		goto make_bad;
3303 
3304 	path->leave_spinning = 1;
3305 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3306 
3307 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3308 	if (ret)
3309 		goto make_bad;
3310 
3311 	leaf = path->nodes[0];
3312 
3313 	if (filled)
3314 		goto cache_acl;
3315 
3316 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3317 				    struct btrfs_inode_item);
3318 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3319 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3320 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3321 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3322 	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3323 
3324 	tspec = btrfs_inode_atime(inode_item);
3325 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3326 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3327 
3328 	tspec = btrfs_inode_mtime(inode_item);
3329 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3330 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3331 
3332 	tspec = btrfs_inode_ctime(inode_item);
3333 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3334 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3335 
3336 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3337 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3338 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3339 
3340 	/*
3341 	 * If we were modified in the current generation and evicted from memory
3342 	 * and then re-read we need to do a full sync since we don't have any
3343 	 * idea about which extents were modified before we were evicted from
3344 	 * cache.
3345 	 */
3346 	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3347 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3348 			&BTRFS_I(inode)->runtime_flags);
3349 
3350 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3351 	inode->i_generation = BTRFS_I(inode)->generation;
3352 	inode->i_rdev = 0;
3353 	rdev = btrfs_inode_rdev(leaf, inode_item);
3354 
3355 	BTRFS_I(inode)->index_cnt = (u64)-1;
3356 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3357 cache_acl:
3358 	/*
3359 	 * try to precache a NULL acl entry for files that don't have
3360 	 * any xattrs or acls
3361 	 */
3362 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3363 					   btrfs_ino(inode));
3364 	if (!maybe_acls)
3365 		cache_no_acl(inode);
3366 
3367 	btrfs_free_path(path);
3368 
3369 	switch (inode->i_mode & S_IFMT) {
3370 	case S_IFREG:
3371 		inode->i_mapping->a_ops = &btrfs_aops;
3372 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3373 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3374 		inode->i_fop = &btrfs_file_operations;
3375 		inode->i_op = &btrfs_file_inode_operations;
3376 		break;
3377 	case S_IFDIR:
3378 		inode->i_fop = &btrfs_dir_file_operations;
3379 		if (root == root->fs_info->tree_root)
3380 			inode->i_op = &btrfs_dir_ro_inode_operations;
3381 		else
3382 			inode->i_op = &btrfs_dir_inode_operations;
3383 		break;
3384 	case S_IFLNK:
3385 		inode->i_op = &btrfs_symlink_inode_operations;
3386 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3387 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3388 		break;
3389 	default:
3390 		inode->i_op = &btrfs_special_inode_operations;
3391 		init_special_inode(inode, inode->i_mode, rdev);
3392 		break;
3393 	}
3394 
3395 	btrfs_update_iflags(inode);
3396 	return;
3397 
3398 make_bad:
3399 	btrfs_free_path(path);
3400 	make_bad_inode(inode);
3401 }
3402 
3403 /*
3404  * given a leaf and an inode, copy the inode fields into the leaf
3405  */
3406 static void fill_inode_item(struct btrfs_trans_handle *trans,
3407 			    struct extent_buffer *leaf,
3408 			    struct btrfs_inode_item *item,
3409 			    struct inode *inode)
3410 {
3411 	struct btrfs_map_token token;
3412 
3413 	btrfs_init_map_token(&token);
3414 
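	/*
	 * the map token caches the extent buffer mapping so the run of
	 * set_token helpers below can reuse it instead of remapping the
	 * buffer for every field
	 */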
3415 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3416 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3417 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3418 				   &token);
3419 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3420 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3421 
3422 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3423 				     inode->i_atime.tv_sec, &token);
3424 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3425 				      inode->i_atime.tv_nsec, &token);
3426 
3427 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3428 				     inode->i_mtime.tv_sec, &token);
3429 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3430 				      inode->i_mtime.tv_nsec, &token);
3431 
3432 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3433 				     inode->i_ctime.tv_sec, &token);
3434 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3435 				      inode->i_ctime.tv_nsec, &token);
3436 
3437 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3438 				     &token);
3439 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3440 					 &token);
3441 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3442 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3443 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3444 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3445 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3446 }
3447 
3448 /*
3449  * copy everything in the in-memory inode into the btree.
3450  */
3451 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3452 				struct btrfs_root *root, struct inode *inode)
3453 {
3454 	struct btrfs_inode_item *inode_item;
3455 	struct btrfs_path *path;
3456 	struct extent_buffer *leaf;
3457 	int ret;
3458 
3459 	path = btrfs_alloc_path();
3460 	if (!path)
3461 		return -ENOMEM;
3462 
3463 	path->leave_spinning = 1;
3464 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3465 				 1);
3466 	if (ret) {
3467 		if (ret > 0)
3468 			ret = -ENOENT;
3469 		goto failed;
3470 	}
3471 
3472 	btrfs_unlock_up_safe(path, 1);
3473 	leaf = path->nodes[0];
3474 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3475 				    struct btrfs_inode_item);
3476 
3477 	fill_inode_item(trans, leaf, inode_item, inode);
3478 	btrfs_mark_buffer_dirty(leaf);
3479 	btrfs_set_inode_last_trans(trans, inode);
3480 	ret = 0;
3481 failed:
3482 	btrfs_free_path(path);
3483 	return ret;
3484 }
3485 
3486 /*
3487  * copy everything in the in-memory inode into the btree.
3488  */
3489 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3490 				struct btrfs_root *root, struct inode *inode)
3491 {
3492 	int ret;
3493 
3494 	/*
3495 	 * If the inode is a free space inode, we can deadlock during commit
3496 	 * if we put it into the delayed code.
3497 	 *
3498 	 * The data relocation inode should also be directly updated
3499 	 * without delay
3500 	 */
3501 	if (!btrfs_is_free_space_inode(inode)
3502 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3503 		btrfs_update_root_times(trans, root);
3504 
3505 		ret = btrfs_delayed_update_inode(trans, root, inode);
3506 		if (!ret)
3507 			btrfs_set_inode_last_trans(trans, inode);
3508 		return ret;
3509 	}
3510 
3511 	return btrfs_update_inode_item(trans, root, inode);
3512 }
3513 
3514 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3515 					 struct btrfs_root *root,
3516 					 struct inode *inode)
3517 {
3518 	int ret;
3519 
3520 	ret = btrfs_update_inode(trans, root, inode);
3521 	if (ret == -ENOSPC)
3522 		return btrfs_update_inode_item(trans, root, inode);
3523 	return ret;
3524 }
3525 
3526 /*
3527  * unlink helper that gets used here in inode.c and in the tree logging
3528  * recovery code.  It removes a link in a directory with a given name, and
3529  * also drops the inode's back references to the directory.
3530  */
3531 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3532 				struct btrfs_root *root,
3533 				struct inode *dir, struct inode *inode,
3534 				const char *name, int name_len)
3535 {
3536 	struct btrfs_path *path;
3537 	int ret = 0;
3538 	struct extent_buffer *leaf;
3539 	struct btrfs_dir_item *di;
3540 	struct btrfs_key key;
3541 	u64 index;
3542 	u64 ino = btrfs_ino(inode);
3543 	u64 dir_ino = btrfs_ino(dir);
3544 
3545 	path = btrfs_alloc_path();
3546 	if (!path) {
3547 		ret = -ENOMEM;
3548 		goto out;
3549 	}
3550 
3551 	path->leave_spinning = 1;
3552 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3553 				    name, name_len, -1);
3554 	if (IS_ERR(di)) {
3555 		ret = PTR_ERR(di);
3556 		goto err;
3557 	}
3558 	if (!di) {
3559 		ret = -ENOENT;
3560 		goto err;
3561 	}
3562 	leaf = path->nodes[0];
3563 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3564 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3565 	if (ret)
3566 		goto err;
3567 	btrfs_release_path(path);
3568 
3569 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3570 				  dir_ino, &index);
3571 	if (ret) {
3572 		btrfs_info(root->fs_info,
3573 			"failed to delete reference to %.*s, inode %llu parent %llu",
3574 			name_len, name, ino, dir_ino);
3575 		btrfs_abort_transaction(trans, root, ret);
3576 		goto err;
3577 	}
3578 
3579 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3580 	if (ret) {
3581 		btrfs_abort_transaction(trans, root, ret);
3582 		goto err;
3583 	}
3584 
3585 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3586 					 inode, dir_ino);
3587 	if (ret != 0 && ret != -ENOENT) {
3588 		btrfs_abort_transaction(trans, root, ret);
3589 		goto err;
3590 	}
3591 
3592 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3593 					   dir, index);
3594 	if (ret == -ENOENT)
3595 		ret = 0;
3596 	else if (ret)
3597 		btrfs_abort_transaction(trans, root, ret);
3598 err:
3599 	btrfs_free_path(path);
3600 	if (ret)
3601 		goto out;
3602 
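	/*
	 * Each name is accounted twice in the directory's i_size, once for
	 * the dir item and once for the dir index, hence name_len * 2.
	 */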
3603 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3604 	inode_inc_iversion(inode);
3605 	inode_inc_iversion(dir);
3606 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3607 	ret = btrfs_update_inode(trans, root, dir);
3608 out:
3609 	return ret;
3610 }
3611 
3612 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3613 		       struct btrfs_root *root,
3614 		       struct inode *dir, struct inode *inode,
3615 		       const char *name, int name_len)
3616 {
3617 	int ret;
3618 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3619 	if (!ret) {
3620 		btrfs_drop_nlink(inode);
3621 		ret = btrfs_update_inode(trans, root, inode);
3622 	}
3623 	return ret;
3624 }
3625 
3626 /*
3627  * helper to start transaction for unlink and rmdir.
3628  *
3629  * unlink and rmdir are special in btrfs: they do not always free space, so
3630  * if we cannot make our reservation the normal way, try to see if there is
3631  * plenty of slack room in the global reserve to migrate from; otherwise we
3632  * cannot allow the unlink to occur.
3633  */
3634 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3635 {
3636 	struct btrfs_trans_handle *trans;
3637 	struct btrfs_root *root = BTRFS_I(dir)->root;
3638 	int ret;
3639 
3640 	/*
3641 	 * 1 for the possible orphan item
3642 	 * 1 for the dir item
3643 	 * 1 for the dir index
3644 	 * 1 for the inode ref
3645 	 * 1 for the inode
3646 	 */
3647 	trans = btrfs_start_transaction(root, 5);
3648 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3649 		return trans;
3650 
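	/*
	 * We hit ENOSPC above.  Retry with no reservation and ask
	 * btrfs_cond_migrate_bytes() to move enough slack from the global
	 * reserve into the transaction's block rsv.
	 */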
3651 	if (PTR_ERR(trans) == -ENOSPC) {
3652 		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3653 
3654 		trans = btrfs_start_transaction(root, 0);
3655 		if (IS_ERR(trans))
3656 			return trans;
3657 		ret = btrfs_cond_migrate_bytes(root->fs_info,
3658 					       &root->fs_info->trans_block_rsv,
3659 					       num_bytes, 5);
3660 		if (ret) {
3661 			btrfs_end_transaction(trans, root);
3662 			return ERR_PTR(ret);
3663 		}
3664 		trans->block_rsv = &root->fs_info->trans_block_rsv;
3665 		trans->bytes_reserved = num_bytes;
3666 	}
3667 	return trans;
3668 }
3669 
3670 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3671 {
3672 	struct btrfs_root *root = BTRFS_I(dir)->root;
3673 	struct btrfs_trans_handle *trans;
3674 	struct inode *inode = dentry->d_inode;
3675 	int ret;
3676 
3677 	trans = __unlink_start_trans(dir);
3678 	if (IS_ERR(trans))
3679 		return PTR_ERR(trans);
3680 
3681 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3682 
3683 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3684 				 dentry->d_name.name, dentry->d_name.len);
3685 	if (ret)
3686 		goto out;
3687 
3688 	if (inode->i_nlink == 0) {
3689 		ret = btrfs_orphan_add(trans, inode);
3690 		if (ret)
3691 			goto out;
3692 	}
3693 
3694 out:
3695 	btrfs_end_transaction(trans, root);
3696 	btrfs_btree_balance_dirty(root);
3697 	return ret;
3698 }
3699 
3700 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3701 			struct btrfs_root *root,
3702 			struct inode *dir, u64 objectid,
3703 			const char *name, int name_len)
3704 {
3705 	struct btrfs_path *path;
3706 	struct extent_buffer *leaf;
3707 	struct btrfs_dir_item *di;
3708 	struct btrfs_key key;
3709 	u64 index;
3710 	int ret;
3711 	u64 dir_ino = btrfs_ino(dir);
3712 
3713 	path = btrfs_alloc_path();
3714 	if (!path)
3715 		return -ENOMEM;
3716 
3717 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3718 				   name, name_len, -1);
3719 	if (IS_ERR_OR_NULL(di)) {
3720 		if (!di)
3721 			ret = -ENOENT;
3722 		else
3723 			ret = PTR_ERR(di);
3724 		goto out;
3725 	}
3726 
3727 	leaf = path->nodes[0];
3728 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3729 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3730 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3731 	if (ret) {
3732 		btrfs_abort_transaction(trans, root, ret);
3733 		goto out;
3734 	}
3735 	btrfs_release_path(path);
3736 
3737 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3738 				 objectid, root->root_key.objectid,
3739 				 dir_ino, &index, name, name_len);
3740 	if (ret < 0) {
3741 		if (ret != -ENOENT) {
3742 			btrfs_abort_transaction(trans, root, ret);
3743 			goto out;
3744 		}
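		/*
		 * The root ref is missing, so fall back to searching the
		 * dir index item by name to recover the index number.
		 */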
3745 		di = btrfs_search_dir_index_item(root, path, dir_ino,
3746 						 name, name_len);
3747 		if (IS_ERR_OR_NULL(di)) {
3748 			if (!di)
3749 				ret = -ENOENT;
3750 			else
3751 				ret = PTR_ERR(di);
3752 			btrfs_abort_transaction(trans, root, ret);
3753 			goto out;
3754 		}
3755 
3756 		leaf = path->nodes[0];
3757 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3758 		btrfs_release_path(path);
3759 		index = key.offset;
3760 	}
3761 	btrfs_release_path(path);
3762 
3763 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3764 	if (ret) {
3765 		btrfs_abort_transaction(trans, root, ret);
3766 		goto out;
3767 	}
3768 
3769 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3770 	inode_inc_iversion(dir);
3771 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3772 	ret = btrfs_update_inode_fallback(trans, root, dir);
3773 	if (ret)
3774 		btrfs_abort_transaction(trans, root, ret);
3775 out:
3776 	btrfs_free_path(path);
3777 	return ret;
3778 }
3779 
3780 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3781 {
3782 	struct inode *inode = dentry->d_inode;
3783 	int err = 0;
3784 	struct btrfs_root *root = BTRFS_I(dir)->root;
3785 	struct btrfs_trans_handle *trans;
3786 
3787 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3788 		return -ENOTEMPTY;
3789 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3790 		return -EPERM;
3791 
3792 	trans = __unlink_start_trans(dir);
3793 	if (IS_ERR(trans))
3794 		return PTR_ERR(trans);
3795 
3796 	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3797 		err = btrfs_unlink_subvol(trans, root, dir,
3798 					  BTRFS_I(inode)->location.objectid,
3799 					  dentry->d_name.name,
3800 					  dentry->d_name.len);
3801 		goto out;
3802 	}
3803 
3804 	err = btrfs_orphan_add(trans, inode);
3805 	if (err)
3806 		goto out;
3807 
3808 	/* now the directory is empty */
3809 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3810 				 dentry->d_name.name, dentry->d_name.len);
3811 	if (!err)
3812 		btrfs_i_size_write(inode, 0);
3813 out:
3814 	btrfs_end_transaction(trans, root);
3815 	btrfs_btree_balance_dirty(root);
3816 
3817 	return err;
3818 }
3819 
3820 /*
3821  * this can truncate away extent items, csum items and directory items.
3822  * It starts at a high offset and removes keys until it can't find
3823  * any higher than new_size
3824  *
3825  * csum items that cross the new i_size are truncated to the new size
3826  * as well.
3827  *
3828  * min_type is the minimum key type to truncate down to.  If set to 0, this
3829  * will kill all the items on this inode, including the INODE_ITEM_KEY.
3830  */
3831 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3832 			       struct btrfs_root *root,
3833 			       struct inode *inode,
3834 			       u64 new_size, u32 min_type)
3835 {
3836 	struct btrfs_path *path;
3837 	struct extent_buffer *leaf;
3838 	struct btrfs_file_extent_item *fi;
3839 	struct btrfs_key key;
3840 	struct btrfs_key found_key;
3841 	u64 extent_start = 0;
3842 	u64 extent_num_bytes = 0;
3843 	u64 extent_offset = 0;
3844 	u64 item_end = 0;
3845 	u32 found_type = (u8)-1;
3846 	int found_extent;
3847 	int del_item;
3848 	int pending_del_nr = 0;
3849 	int pending_del_slot = 0;
3850 	int extent_type = -1;
3851 	int ret;
3852 	int err = 0;
3853 	u64 ino = btrfs_ino(inode);
3854 
3855 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3856 
3857 	path = btrfs_alloc_path();
3858 	if (!path)
3859 		return -ENOMEM;
3860 	path->reada = -1;
3861 
3862 	/*
3863 	 * We want to drop from the next block forward in case this new size is
3864 	 * not block aligned since we will be keeping the last block of the
3865 	 * extent just the way it is.
3866 	 */
3867 	if (root->ref_cows || root == root->fs_info->tree_root)
3868 		btrfs_drop_extent_cache(inode, ALIGN(new_size,
3869 					root->sectorsize), (u64)-1, 0);
3870 
3871 	/*
3872 	 * This function is also used to drop the items in the log tree before
3873 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3874 	 * it is used to drop the loged items. So we shouldn't kill the delayed
3875 	 * items.
3876 	 */
3877 	if (min_type == 0 && root == BTRFS_I(inode)->root)
3878 		btrfs_kill_delayed_inode_items(inode);
3879 
3880 	key.objectid = ino;
3881 	key.offset = (u64)-1;
3882 	key.type = (u8)-1;
3883 
3884 search_again:
3885 	path->leave_spinning = 1;
3886 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3887 	if (ret < 0) {
3888 		err = ret;
3889 		goto out;
3890 	}
3891 
3892 	if (ret > 0) {
3893 		/* there are no items in the tree for us to truncate, we're
3894 		 * done
3895 		 */
3896 		if (path->slots[0] == 0)
3897 			goto out;
3898 		path->slots[0]--;
3899 	}
3900 
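	/*
	 * Walk backwards from the highest possible key for this inode,
	 * batching contiguous deletions in pending_del_slot/pending_del_nr
	 * so each run can be removed with a single btrfs_del_items() call.
	 */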
3901 	while (1) {
3902 		fi = NULL;
3903 		leaf = path->nodes[0];
3904 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3905 		found_type = btrfs_key_type(&found_key);
3906 
3907 		if (found_key.objectid != ino)
3908 			break;
3909 
3910 		if (found_type < min_type)
3911 			break;
3912 
3913 		item_end = found_key.offset;
3914 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
3915 			fi = btrfs_item_ptr(leaf, path->slots[0],
3916 					    struct btrfs_file_extent_item);
3917 			extent_type = btrfs_file_extent_type(leaf, fi);
3918 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3919 				item_end +=
3920 				    btrfs_file_extent_num_bytes(leaf, fi);
3921 			} else {
3922 				item_end += btrfs_file_extent_inline_len(leaf,
3923 									 fi);
3924 			}
3925 			item_end--;
3926 		}
3927 		if (found_type > min_type) {
3928 			del_item = 1;
3929 		} else {
3930 			if (item_end < new_size)
3931 				break;
3932 			if (found_key.offset >= new_size)
3933 				del_item = 1;
3934 			else
3935 				del_item = 0;
3936 		}
3937 		found_extent = 0;
3938 		/* FIXME, shrink the extent if the ref count is only 1 */
3939 		if (found_type != BTRFS_EXTENT_DATA_KEY)
3940 			goto delete;
3941 
3942 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3943 			u64 num_dec;
3944 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3945 			if (!del_item) {
3946 				u64 orig_num_bytes =
3947 					btrfs_file_extent_num_bytes(leaf, fi);
3948 				extent_num_bytes = ALIGN(new_size -
3949 						found_key.offset,
3950 						root->sectorsize);
3951 				btrfs_set_file_extent_num_bytes(leaf, fi,
3952 							 extent_num_bytes);
3953 				num_dec = (orig_num_bytes -
3954 					   extent_num_bytes);
3955 				if (root->ref_cows && extent_start != 0)
3956 					inode_sub_bytes(inode, num_dec);
3957 				btrfs_mark_buffer_dirty(leaf);
3958 			} else {
3959 				extent_num_bytes =
3960 					btrfs_file_extent_disk_num_bytes(leaf,
3961 									 fi);
3962 				extent_offset = found_key.offset -
3963 					btrfs_file_extent_offset(leaf, fi);
3964 
3965 				/* FIXME blocksize != 4096 */
3966 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
3967 				if (extent_start != 0) {
3968 					found_extent = 1;
3969 					if (root->ref_cows)
3970 						inode_sub_bytes(inode, num_dec);
3971 				}
3972 			}
3973 		} else {
3974 			/*
3975 			 * we can't truncate inline items that have had
3976 			 * special encodings
3977 			 */
3978 			if (!del_item &&
3979 			    btrfs_file_extent_compression(leaf, fi) == 0 &&
3980 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
3981 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
3982 				u32 size = new_size - found_key.offset;
3983 
3984 				if (root->ref_cows) {
3985 					inode_sub_bytes(inode, item_end + 1 -
3986 							new_size);
3987 				}
3988 				size =
3989 				    btrfs_file_extent_calc_inline_size(size);
3990 				btrfs_truncate_item(root, path, size, 1);
3991 			} else if (root->ref_cows) {
3992 				inode_sub_bytes(inode, item_end + 1 -
3993 						found_key.offset);
3994 			}
3995 		}
3996 delete:
3997 		if (del_item) {
3998 			if (!pending_del_nr) {
3999 				/* no pending yet, add ourselves */
4000 				pending_del_slot = path->slots[0];
4001 				pending_del_nr = 1;
4002 			} else if (pending_del_nr &&
4003 				   path->slots[0] + 1 == pending_del_slot) {
4004 				/* hop on the pending chunk */
4005 				pending_del_nr++;
4006 				pending_del_slot = path->slots[0];
4007 			} else {
4008 				BUG();
4009 			}
4010 		} else {
4011 			break;
4012 		}
4013 		if (found_extent && (root->ref_cows ||
4014 				     root == root->fs_info->tree_root)) {
4015 			btrfs_set_path_blocking(path);
4016 			ret = btrfs_free_extent(trans, root, extent_start,
4017 						extent_num_bytes, 0,
4018 						btrfs_header_owner(leaf),
4019 						ino, extent_offset, 0);
4020 			BUG_ON(ret);
4021 		}
4022 
4023 		if (found_type == BTRFS_INODE_ITEM_KEY)
4024 			break;
4025 
4026 		if (path->slots[0] == 0 ||
4027 		    path->slots[0] != pending_del_slot) {
4028 			if (pending_del_nr) {
4029 				ret = btrfs_del_items(trans, root, path,
4030 						pending_del_slot,
4031 						pending_del_nr);
4032 				if (ret) {
4033 					btrfs_abort_transaction(trans,
4034 								root, ret);
4035 					goto error;
4036 				}
4037 				pending_del_nr = 0;
4038 			}
4039 			btrfs_release_path(path);
4040 			goto search_again;
4041 		} else {
4042 			path->slots[0]--;
4043 		}
4044 	}
4045 out:
4046 	if (pending_del_nr) {
4047 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4048 				      pending_del_nr);
4049 		if (ret)
4050 			btrfs_abort_transaction(trans, root, ret);
4051 	}
4052 error:
4053 	btrfs_free_path(path);
4054 	return err;
4055 }
4056 
4057 /*
4058  * btrfs_truncate_page - read, zero a chunk and write a page
4059  * @inode - inode that we're zeroing
4060  * @from - the offset to start zeroing
4061  * @len - the length to zero, 0 to zero everything from the offset to the
4062  *	end of the page
4063  * @front - zero up to the offset instead of from the offset on
4064  *
4065  * This will find the page for the "from" offset, cow the page and zero the
4066  * part we want to zero.  This is used with truncate and hole punching.
4067  */
4068 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4069 			int front)
4070 {
4071 	struct address_space *mapping = inode->i_mapping;
4072 	struct btrfs_root *root = BTRFS_I(inode)->root;
4073 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4074 	struct btrfs_ordered_extent *ordered;
4075 	struct extent_state *cached_state = NULL;
4076 	char *kaddr;
4077 	u32 blocksize = root->sectorsize;
4078 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
4079 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
4080 	struct page *page;
4081 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4082 	int ret = 0;
4083 	u64 page_start;
4084 	u64 page_end;
4085 
4086 	if ((offset & (blocksize - 1)) == 0 &&
4087 	    (!len || ((len & (blocksize - 1)) == 0)))
4088 		goto out;
4089 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4090 	if (ret)
4091 		goto out;
4092 
4093 again:
4094 	page = find_or_create_page(mapping, index, mask);
4095 	if (!page) {
4096 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4097 		ret = -ENOMEM;
4098 		goto out;
4099 	}
4100 
4101 	page_start = page_offset(page);
4102 	page_end = page_start + PAGE_CACHE_SIZE - 1;
4103 
4104 	if (!PageUptodate(page)) {
4105 		ret = btrfs_readpage(NULL, page);
4106 		lock_page(page);
4107 		if (page->mapping != mapping) {
4108 			unlock_page(page);
4109 			page_cache_release(page);
4110 			goto again;
4111 		}
4112 		if (!PageUptodate(page)) {
4113 			ret = -EIO;
4114 			goto out_unlock;
4115 		}
4116 	}
4117 	wait_on_page_writeback(page);
4118 
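	/*
	 * Lock the extent range and recheck for a racing ordered extent;
	 * if one is found, drop everything, wait for it to finish and
	 * retry from the page lookup.
	 */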
4119 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4120 	set_page_extent_mapped(page);
4121 
4122 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
4123 	if (ordered) {
4124 		unlock_extent_cached(io_tree, page_start, page_end,
4125 				     &cached_state, GFP_NOFS);
4126 		unlock_page(page);
4127 		page_cache_release(page);
4128 		btrfs_start_ordered_extent(inode, ordered, 1);
4129 		btrfs_put_ordered_extent(ordered);
4130 		goto again;
4131 	}
4132 
4133 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4134 			  EXTENT_DIRTY | EXTENT_DELALLOC |
4135 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4136 			  0, 0, &cached_state, GFP_NOFS);
4137 
4138 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4139 					&cached_state);
4140 	if (ret) {
4141 		unlock_extent_cached(io_tree, page_start, page_end,
4142 				     &cached_state, GFP_NOFS);
4143 		goto out_unlock;
4144 	}
4145 
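	/*
	 * Zero the requested chunk through a kmap: everything before the
	 * offset when punching from the front, otherwise len bytes
	 * starting at the offset.
	 */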
4146 	if (offset != PAGE_CACHE_SIZE) {
4147 		if (!len)
4148 			len = PAGE_CACHE_SIZE - offset;
4149 		kaddr = kmap(page);
4150 		if (front)
4151 			memset(kaddr, 0, offset);
4152 		else
4153 			memset(kaddr + offset, 0, len);
4154 		flush_dcache_page(page);
4155 		kunmap(page);
4156 	}
4157 	ClearPageChecked(page);
4158 	set_page_dirty(page);
4159 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4160 			     GFP_NOFS);
4161 
4162 out_unlock:
4163 	if (ret)
4164 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4165 	unlock_page(page);
4166 	page_cache_release(page);
4167 out:
4168 	return ret;
4169 }
4170 
4171 /*
4172  * This function puts in dummy file extents for the area we're creating a hole
4173  * for.  So if we are truncating this file to a larger size we need to insert
4174  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4175  * for the range between oldsize and size.
4176  */
4177 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4178 {
4179 	struct btrfs_trans_handle *trans;
4180 	struct btrfs_root *root = BTRFS_I(inode)->root;
4181 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4182 	struct extent_map *em = NULL;
4183 	struct extent_state *cached_state = NULL;
4184 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4185 	u64 hole_start = ALIGN(oldsize, root->sectorsize);
4186 	u64 block_end = ALIGN(size, root->sectorsize);
4187 	u64 last_byte;
4188 	u64 cur_offset;
4189 	u64 hole_size;
4190 	int err = 0;
4191 
4192 	/*
4193 	 * If our size started in the middle of a page we need to zero out the
4194 	 * rest of the page before we expand the i_size, otherwise we could
4195 	 * expose stale data.
4196 	 */
4197 	err = btrfs_truncate_page(inode, oldsize, 0, 0);
4198 	if (err)
4199 		return err;
4200 
4201 	if (size <= hole_start)
4202 		return 0;
4203 
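	/*
	 * Flush and wait for any ordered extent in the hole range, then
	 * take the extent lock; retry if a new ordered extent raced in
	 * before we got the lock.
	 */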
4204 	while (1) {
4205 		struct btrfs_ordered_extent *ordered;
4206 		btrfs_wait_ordered_range(inode, hole_start,
4207 					 block_end - hole_start);
4208 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4209 				 &cached_state);
4210 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
4211 		if (!ordered)
4212 			break;
4213 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4214 				     &cached_state, GFP_NOFS);
4215 		btrfs_put_ordered_extent(ordered);
4216 	}
4217 
4218 	cur_offset = hole_start;
4219 	while (1) {
4220 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4221 				block_end - cur_offset, 0);
4222 		if (IS_ERR(em)) {
4223 			err = PTR_ERR(em);
4224 			em = NULL;
4225 			break;
4226 		}
4227 		last_byte = min(extent_map_end(em), block_end);
4228 		last_byte = ALIGN(last_byte, root->sectorsize);
4229 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4230 			struct extent_map *hole_em;
4231 			hole_size = last_byte - cur_offset;
4232 
4233 			trans = btrfs_start_transaction(root, 3);
4234 			if (IS_ERR(trans)) {
4235 				err = PTR_ERR(trans);
4236 				break;
4237 			}
4238 
4239 			err = btrfs_drop_extents(trans, root, inode,
4240 						 cur_offset,
4241 						 cur_offset + hole_size, 1);
4242 			if (err) {
4243 				btrfs_abort_transaction(trans, root, err);
4244 				btrfs_end_transaction(trans, root);
4245 				break;
4246 			}
4247 
4248 			err = btrfs_insert_file_extent(trans, root,
4249 					btrfs_ino(inode), cur_offset, 0,
4250 					0, hole_size, 0, hole_size,
4251 					0, 0, 0);
4252 			if (err) {
4253 				btrfs_abort_transaction(trans, root, err);
4254 				btrfs_end_transaction(trans, root);
4255 				break;
4256 			}
4257 
4258 			btrfs_drop_extent_cache(inode, cur_offset,
4259 						cur_offset + hole_size - 1, 0);
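			/*
			 * Cache an explicit hole extent map; on allocation
			 * failure just flag the inode for a full fsync
			 * instead of failing the expansion.
			 */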
4260 			hole_em = alloc_extent_map();
4261 			if (!hole_em) {
4262 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4263 					&BTRFS_I(inode)->runtime_flags);
4264 				goto next;
4265 			}
4266 			hole_em->start = cur_offset;
4267 			hole_em->len = hole_size;
4268 			hole_em->orig_start = cur_offset;
4269 
4270 			hole_em->block_start = EXTENT_MAP_HOLE;
4271 			hole_em->block_len = 0;
4272 			hole_em->orig_block_len = 0;
4273 			hole_em->ram_bytes = hole_size;
4274 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4275 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4276 			hole_em->generation = trans->transid;
4277 
4278 			while (1) {
4279 				write_lock(&em_tree->lock);
4280 				err = add_extent_mapping(em_tree, hole_em, 1);
4281 				write_unlock(&em_tree->lock);
4282 				if (err != -EEXIST)
4283 					break;
4284 				btrfs_drop_extent_cache(inode, cur_offset,
4285 							cur_offset +
4286 							hole_size - 1, 0);
4287 			}
4288 			free_extent_map(hole_em);
4289 next:
4290 			btrfs_update_inode(trans, root, inode);
4291 			btrfs_end_transaction(trans, root);
4292 		}
4293 		free_extent_map(em);
4294 		em = NULL;
4295 		cur_offset = last_byte;
4296 		if (cur_offset >= block_end)
4297 			break;
4298 	}
4299 
4300 	free_extent_map(em);
4301 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4302 			     GFP_NOFS);
4303 	return err;
4304 }
4305 
4306 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4307 {
4308 	struct btrfs_root *root = BTRFS_I(inode)->root;
4309 	struct btrfs_trans_handle *trans;
4310 	loff_t oldsize = i_size_read(inode);
4311 	loff_t newsize = attr->ia_size;
4312 	int mask = attr->ia_valid;
4313 	int ret;
4314 
4315 	/*
4316 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4317 	 * special case where we need to update the times despite not having
4318 	 * these flags set.  For all other operations the VFS set these flags
4319 	 * explicitly if it wants a timestamp update.
4320 	 */
4321 	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
4322 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
4323 
4324 	if (newsize > oldsize) {
4325 		truncate_pagecache(inode, oldsize, newsize);
4326 		ret = btrfs_cont_expand(inode, oldsize, newsize);
4327 		if (ret)
4328 			return ret;
4329 
4330 		trans = btrfs_start_transaction(root, 1);
4331 		if (IS_ERR(trans))
4332 			return PTR_ERR(trans);
4333 
4334 		i_size_write(inode, newsize);
4335 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4336 		ret = btrfs_update_inode(trans, root, inode);
4337 		btrfs_end_transaction(trans, root);
4338 	} else {
4339 
4340 		/*
4341 		 * We're truncating a file that used to have good data down to
4342 		 * zero. Make sure it gets into the ordered flush list so that
4343 		 * any new writes get down to disk quickly.
4344 		 */
4345 		if (newsize == 0)
4346 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4347 				&BTRFS_I(inode)->runtime_flags);
4348 
4349 		/*
4350 		 * 1 for the orphan item we're going to add
4351 		 * 1 for the orphan item deletion.
4352 		 */
4353 		trans = btrfs_start_transaction(root, 2);
4354 		if (IS_ERR(trans))
4355 			return PTR_ERR(trans);
4356 
4357 		/*
4358 		 * We need to do this in case we fail at _any_ point during the
4359 		 * actual truncate.  Once we do the truncate_setsize we could
4360 		 * invalidate pages which forces any outstanding ordered io to
4361 		 * be instantly completed which will give us extents that need
4362 		 * to be truncated.  If we fail to add the orphan item we could
4363 		 * have left over extents that were never meant to live, so we
4364 		 * need to guarantee from this point on that everything will
4365 		 * be consistent.
4366 		 */
4367 		ret = btrfs_orphan_add(trans, inode);
4368 		btrfs_end_transaction(trans, root);
4369 		if (ret)
4370 			return ret;
4371 
4372 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
4373 		truncate_setsize(inode, newsize);
4374 
4375 		/* Disable non-locked read DIO to avoid an endless truncate */
4376 		btrfs_inode_block_unlocked_dio(inode);
4377 		inode_dio_wait(inode);
4378 		btrfs_inode_resume_unlocked_dio(inode);
4379 
4380 		ret = btrfs_truncate(inode);
4381 		if (ret && inode->i_nlink)
4382 			btrfs_orphan_del(NULL, inode);
4383 	}
4384 
4385 	return ret;
4386 }
4387 
4388 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4389 {
4390 	struct inode *inode = dentry->d_inode;
4391 	struct btrfs_root *root = BTRFS_I(inode)->root;
4392 	int err;
4393 
4394 	if (btrfs_root_readonly(root))
4395 		return -EROFS;
4396 
4397 	err = inode_change_ok(inode, attr);
4398 	if (err)
4399 		return err;
4400 
4401 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4402 		err = btrfs_setsize(inode, attr);
4403 		if (err)
4404 			return err;
4405 	}
4406 
4407 	if (attr->ia_valid) {
4408 		setattr_copy(inode, attr);
4409 		inode_inc_iversion(inode);
4410 		err = btrfs_dirty_inode(inode);
4411 
4412 		if (!err && attr->ia_valid & ATTR_MODE)
4413 			err = btrfs_acl_chmod(inode);
4414 	}
4415 
4416 	return err;
4417 }
4418 
4419 void btrfs_evict_inode(struct inode *inode)
4420 {
4421 	struct btrfs_trans_handle *trans;
4422 	struct btrfs_root *root = BTRFS_I(inode)->root;
4423 	struct btrfs_block_rsv *rsv, *global_rsv;
4424 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4425 	int ret;
4426 
4427 	trace_btrfs_inode_evict(inode);
4428 
4429 	truncate_inode_pages(&inode->i_data, 0);
4430 	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
4431 			       btrfs_is_free_space_inode(inode)))
4432 		goto no_delete;
4433 
4434 	if (is_bad_inode(inode)) {
4435 		btrfs_orphan_del(NULL, inode);
4436 		goto no_delete;
4437 	}
4438 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4439 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
4440 
4441 	if (root->fs_info->log_root_recovering) {
4442 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4443 				 &BTRFS_I(inode)->runtime_flags));
4444 		goto no_delete;
4445 	}
4446 
4447 	if (inode->i_nlink > 0) {
4448 		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
4449 		goto no_delete;
4450 	}
4451 
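	/*
	 * Flush any pending delayed-inode updates into the tree before we
	 * start dropping this inode's items.
	 */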
4452 	ret = btrfs_commit_inode_delayed_inode(inode);
4453 	if (ret) {
4454 		btrfs_orphan_del(NULL, inode);
4455 		goto no_delete;
4456 	}
4457 
4458 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4459 	if (!rsv) {
4460 		btrfs_orphan_del(NULL, inode);
4461 		goto no_delete;
4462 	}
4463 	rsv->size = min_size;
4464 	rsv->failfast = 1;
4465 	global_rsv = &root->fs_info->global_block_rsv;
4466 
4467 	btrfs_i_size_write(inode, 0);
4468 
4469 	/*
4470 	 * This is a bit simpler than btrfs_truncate since we've already
4471 	 * reserved our space for our orphan item in the unlink, so we just
4472 	 * need to reserve some slack space in case we add bytes and update
4473 	 * inode item when doing the truncate.
4474 	 */
4475 	while (1) {
4476 		ret = btrfs_block_rsv_refill(root, rsv, min_size,
4477 					     BTRFS_RESERVE_FLUSH_LIMIT);
4478 
4479 		/*
4480 		 * Try to steal from the global reserve since we will
4481 		 * likely not use this space anyway; we want to try as
4482 		 * hard as possible to get this to work.
4483 		 */
4484 		if (ret)
4485 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4486 
4487 		if (ret) {
4488 			btrfs_warn(root->fs_info,
4489 				"Could not get space for a delete, will truncate on mount %d",
4490 				ret);
4491 			btrfs_orphan_del(NULL, inode);
4492 			btrfs_free_block_rsv(root, rsv);
4493 			goto no_delete;
4494 		}
4495 
4496 		trans = btrfs_join_transaction(root);
4497 		if (IS_ERR(trans)) {
4498 			btrfs_orphan_del(NULL, inode);
4499 			btrfs_free_block_rsv(root, rsv);
4500 			goto no_delete;
4501 		}
4502 
4503 		trans->block_rsv = rsv;
4504 
4505 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4506 		if (ret != -ENOSPC)
4507 			break;
4508 
4509 		trans->block_rsv = &root->fs_info->trans_block_rsv;
4510 		btrfs_end_transaction(trans, root);
4511 		trans = NULL;
4512 		btrfs_btree_balance_dirty(root);
4513 	}
4514 
4515 	btrfs_free_block_rsv(root, rsv);
4516 
4517 	/*
4518 	 * Errors here aren't a big deal, it just means we leave orphan items
4519 	 * in the tree.  They will be cleaned up on the next mount.
4520 	 */
4521 	if (ret == 0) {
4522 		trans->block_rsv = root->orphan_block_rsv;
4523 		btrfs_orphan_del(trans, inode);
4524 	} else {
4525 		btrfs_orphan_del(NULL, inode);
4526 	}
4527 
4528 	trans->block_rsv = &root->fs_info->trans_block_rsv;
4529 	if (!(root == root->fs_info->tree_root ||
4530 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4531 		btrfs_return_ino(root, btrfs_ino(inode));
4532 
4533 	btrfs_end_transaction(trans, root);
4534 	btrfs_btree_balance_dirty(root);
4535 no_delete:
4536 	btrfs_remove_delayed_node(inode);
4537 	clear_inode(inode);
4538 	return;
4539 }
4540 
4541 /*
4542  * this returns the key found in the dir entry in the location pointer.
4543  * If no dir entries were found, location->objectid is 0.
4544  */
4545 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4546 			       struct btrfs_key *location)
4547 {
4548 	const char *name = dentry->d_name.name;
4549 	int namelen = dentry->d_name.len;
4550 	struct btrfs_dir_item *di;
4551 	struct btrfs_path *path;
4552 	struct btrfs_root *root = BTRFS_I(dir)->root;
4553 	int ret = 0;
4554 
4555 	path = btrfs_alloc_path();
4556 	if (!path)
4557 		return -ENOMEM;
4558 
4559 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4560 				    namelen, 0);
4561 	if (IS_ERR(di))
4562 		ret = PTR_ERR(di);
4563 
4564 	if (IS_ERR_OR_NULL(di))
4565 		goto out_err;
4566 
4567 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4568 out:
4569 	btrfs_free_path(path);
4570 	return ret;
4571 out_err:
4572 	location->objectid = 0;
4573 	goto out;
4574 }
4575 
4576 /*
4577  * when we hit a tree root in a directory, the btrfs part of the inode
4578  * needs to be changed to reflect the root directory of the tree root.  This
4579  * is kind of like crossing a mount point.
4580  */
4581 static int fixup_tree_root_location(struct btrfs_root *root,
4582 				    struct inode *dir,
4583 				    struct dentry *dentry,
4584 				    struct btrfs_key *location,
4585 				    struct btrfs_root **sub_root)
4586 {
4587 	struct btrfs_path *path;
4588 	struct btrfs_root *new_root;
4589 	struct btrfs_root_ref *ref;
4590 	struct extent_buffer *leaf;
4591 	int ret;
4592 	int err = 0;
4593 
4594 	path = btrfs_alloc_path();
4595 	if (!path) {
4596 		err = -ENOMEM;
4597 		goto out;
4598 	}
4599 
4600 	err = -ENOENT;
4601 	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
4602 				  BTRFS_I(dir)->root->root_key.objectid,
4603 				  location->objectid);
4604 	if (ret) {
4605 		if (ret < 0)
4606 			err = ret;
4607 		goto out;
4608 	}
4609 
4610 	leaf = path->nodes[0];
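	/*
	 * Verify the root ref really points back at this directory and
	 * carries the dentry's name before crossing into the subvolume.
	 */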
4611 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4612 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4613 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4614 		goto out;
4615 
4616 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4617 				   (unsigned long)(ref + 1),
4618 				   dentry->d_name.len);
4619 	if (ret)
4620 		goto out;
4621 
4622 	btrfs_release_path(path);
4623 
4624 	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4625 	if (IS_ERR(new_root)) {
4626 		err = PTR_ERR(new_root);
4627 		goto out;
4628 	}
4629 
4630 	*sub_root = new_root;
4631 	location->objectid = btrfs_root_dirid(&new_root->root_item);
4632 	location->type = BTRFS_INODE_ITEM_KEY;
4633 	location->offset = 0;
4634 	err = 0;
4635 out:
4636 	btrfs_free_path(path);
4637 	return err;
4638 }
4639 
4640 static void inode_tree_add(struct inode *inode)
4641 {
4642 	struct btrfs_root *root = BTRFS_I(inode)->root;
4643 	struct btrfs_inode *entry;
4644 	struct rb_node **p;
4645 	struct rb_node *parent;
4646 	u64 ino = btrfs_ino(inode);
4647 
4648 	if (inode_unhashed(inode))
4649 		return;
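	/*
	 * The per-root rb-tree is keyed by inode number; if a stale entry
	 * for the same number is found (an inode on its way to being
	 * freed), drop it and retry the insertion.
	 */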
4650 again:
4651 	parent = NULL;
4652 	spin_lock(&root->inode_lock);
4653 	p = &root->inode_tree.rb_node;
4654 	while (*p) {
4655 		parent = *p;
4656 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
4657 
4658 		if (ino < btrfs_ino(&entry->vfs_inode))
4659 			p = &parent->rb_left;
4660 		else if (ino > btrfs_ino(&entry->vfs_inode))
4661 			p = &parent->rb_right;
4662 		else {
4663 			WARN_ON(!(entry->vfs_inode.i_state &
4664 				  (I_WILL_FREE | I_FREEING)));
4665 			rb_erase(parent, &root->inode_tree);
4666 			RB_CLEAR_NODE(parent);
4667 			spin_unlock(&root->inode_lock);
4668 			goto again;
4669 		}
4670 	}
4671 	rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
4672 	rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4673 	spin_unlock(&root->inode_lock);
4674 }
4675 
4676 static void inode_tree_del(struct inode *inode)
4677 {
4678 	struct btrfs_root *root = BTRFS_I(inode)->root;
4679 	int empty = 0;
4680 
4681 	spin_lock(&root->inode_lock);
4682 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4683 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4684 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4685 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4686 	}
4687 	spin_unlock(&root->inode_lock);
4688 
4689 	/*
4690 	 * Free space cache has inodes in the tree root, but the tree root has a
4691 	 * root_refs of 0, so this could end up dropping the tree root as a
4692 	 * snapshot, so we need the extra root != fs_info->tree_root check to
4693 	 * make sure we don't drop it.
4694 	 */
4695 	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
4696 	    root != root->fs_info->tree_root) {
4697 		synchronize_srcu(&root->fs_info->subvol_srcu);
4698 		spin_lock(&root->inode_lock);
4699 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4700 		spin_unlock(&root->inode_lock);
4701 		if (empty)
4702 			btrfs_add_dead_root(root);
4703 	}
4704 }
4705 
4706 void btrfs_invalidate_inodes(struct btrfs_root *root)
4707 {
4708 	struct rb_node *node;
4709 	struct rb_node *prev;
4710 	struct btrfs_inode *entry;
4711 	struct inode *inode;
4712 	u64 objectid = 0;
4713 
4714 	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4715 
4716 	spin_lock(&root->inode_lock);
4717 again:
4718 	node = root->inode_tree.rb_node;
4719 	prev = NULL;
4720 	while (node) {
4721 		prev = node;
4722 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4723 
4724 		if (objectid < btrfs_ino(&entry->vfs_inode))
4725 			node = node->rb_left;
4726 		else if (objectid > btrfs_ino(&entry->vfs_inode))
4727 			node = node->rb_right;
4728 		else
4729 			break;
4730 	}
4731 	if (!node) {
4732 		while (prev) {
4733 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4734 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
4735 				node = prev;
4736 				break;
4737 			}
4738 			prev = rb_next(prev);
4739 		}
4740 	}
4741 	while (node) {
4742 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4743 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
4744 		inode = igrab(&entry->vfs_inode);
4745 		if (inode) {
4746 			spin_unlock(&root->inode_lock);
4747 			if (atomic_read(&inode->i_count) > 1)
4748 				d_prune_aliases(inode);
4749 			/*
4750 			 * btrfs_drop_inode will have it removed from
4751 			 * the inode cache when its usage count
4752 			 * hits zero.
4753 			 */
4754 			iput(inode);
4755 			cond_resched();
4756 			spin_lock(&root->inode_lock);
4757 			goto again;
4758 		}
4759 
4760 		if (cond_resched_lock(&root->inode_lock))
4761 			goto again;
4762 
4763 		node = rb_next(node);
4764 	}
4765 	spin_unlock(&root->inode_lock);
4766 }
4767 
4768 static int btrfs_init_locked_inode(struct inode *inode, void *p)
4769 {
4770 	struct btrfs_iget_args *args = p;
4771 	inode->i_ino = args->ino;
4772 	BTRFS_I(inode)->root = args->root;
4773 	return 0;
4774 }
4775 
4776 static int btrfs_find_actor(struct inode *inode, void *opaque)
4777 {
4778 	struct btrfs_iget_args *args = opaque;
4779 	return args->ino == btrfs_ino(inode) &&
4780 		args->root == BTRFS_I(inode)->root;
4781 }
4782 
4783 static struct inode *btrfs_iget_locked(struct super_block *s,
4784 				       u64 objectid,
4785 				       struct btrfs_root *root)
4786 {
4787 	struct inode *inode;
4788 	struct btrfs_iget_args args;
4789 	args.ino = objectid;
4790 	args.root = root;
4791 
4792 	inode = iget5_locked(s, objectid, btrfs_find_actor,
4793 			     btrfs_init_locked_inode,
4794 			     (void *)&args);
4795 	return inode;
4796 }
4797 
4798 /* Get an inode object given its location and corresponding root.
4799  * Returns in *new whether the inode was read from disk.
4800  */
4801 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4802 			 struct btrfs_root *root, int *new)
4803 {
4804 	struct inode *inode;
4805 
4806 	inode = btrfs_iget_locked(s, location->objectid, root);
4807 	if (!inode)
4808 		return ERR_PTR(-ENOMEM);
4809 
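	/*
	 * A freshly allocated (I_NEW) inode has to be filled from disk;
	 * if that leaves a bad inode, drop it and return -ESTALE.
	 */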
4810 	if (inode->i_state & I_NEW) {
4811 		BTRFS_I(inode)->root = root;
4812 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4813 		btrfs_read_locked_inode(inode);
4814 		if (!is_bad_inode(inode)) {
4815 			inode_tree_add(inode);
4816 			unlock_new_inode(inode);
4817 			if (new)
4818 				*new = 1;
4819 		} else {
4820 			unlock_new_inode(inode);
4821 			iput(inode);
4822 			inode = ERR_PTR(-ESTALE);
4823 		}
4824 	}
4825 
4826 	return inode;
4827 }
4828 
4829 static struct inode *new_simple_dir(struct super_block *s,
4830 				    struct btrfs_key *key,
4831 				    struct btrfs_root *root)
4832 {
4833 	struct inode *inode = new_inode(s);
4834 
4835 	if (!inode)
4836 		return ERR_PTR(-ENOMEM);
4837 
4838 	BTRFS_I(inode)->root = root;
4839 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4840 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4841 
4842 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4843 	inode->i_op = &btrfs_dir_ro_inode_operations;
4844 	inode->i_fop = &simple_dir_operations;
4845 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
4846 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4847 
4848 	return inode;
4849 }
4850 
4851 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4852 {
4853 	struct inode *inode;
4854 	struct btrfs_root *root = BTRFS_I(dir)->root;
4855 	struct btrfs_root *sub_root = root;
4856 	struct btrfs_key location;
4857 	int index;
4858 	int ret = 0;
4859 
4860 	if (dentry->d_name.len > BTRFS_NAME_LEN)
4861 		return ERR_PTR(-ENAMETOOLONG);
4862 
4863 	ret = btrfs_inode_by_name(dir, dentry, &location);
4864 	if (ret < 0)
4865 		return ERR_PTR(ret);
4866 
4867 	if (location.objectid == 0)
4868 		return NULL;
4869 
4870 	if (location.type == BTRFS_INODE_ITEM_KEY) {
4871 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
4872 		return inode;
4873 	}
4874 
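	/*
	 * The entry points at a subvolume root, so hop into the sub_root
	 * much like crossing a mount point, under subvol_srcu protection.
	 */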
4875 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
4876 
4877 	index = srcu_read_lock(&root->fs_info->subvol_srcu);
4878 	ret = fixup_tree_root_location(root, dir, dentry,
4879 				       &location, &sub_root);
4880 	if (ret < 0) {
4881 		if (ret != -ENOENT)
4882 			inode = ERR_PTR(ret);
4883 		else
4884 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
4885 	} else {
4886 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
4887 	}
4888 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4889 
4890 	if (!IS_ERR(inode) && root != sub_root) {
4891 		down_read(&root->fs_info->cleanup_work_sem);
4892 		if (!(inode->i_sb->s_flags & MS_RDONLY))
4893 			ret = btrfs_orphan_cleanup(sub_root);
4894 		up_read(&root->fs_info->cleanup_work_sem);
4895 		if (ret) {
4896 			iput(inode);
4897 			inode = ERR_PTR(ret);
4898 		}
4899 	}
4900 
4901 	return inode;
4902 }
4903 
4904 static int btrfs_dentry_delete(const struct dentry *dentry)
4905 {
4906 	struct btrfs_root *root;
4907 	struct inode *inode = dentry->d_inode;
4908 
4909 	if (!inode && !IS_ROOT(dentry))
4910 		inode = dentry->d_parent->d_inode;
4911 
4912 	if (inode) {
4913 		root = BTRFS_I(inode)->root;
4914 		if (btrfs_root_refs(&root->root_item) == 0)
4915 			return 1;
4916 
4917 		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4918 			return 1;
4919 	}
4920 	return 0;
4921 }
4922 
4923 static void btrfs_dentry_release(struct dentry *dentry)
4924 {
4925 	kfree(dentry->d_fsdata);
4927 }
4928 
4929 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4930 				   unsigned int flags)
4931 {
4932 	struct dentry *ret;
4933 
4934 	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4935 	return ret;
4936 }
4937 
4938 unsigned char btrfs_filetype_table[] = {
4939 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4940 };
4941 
4942 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
4943 {
4944 	struct inode *inode = file_inode(file);
4945 	struct btrfs_root *root = BTRFS_I(inode)->root;
4946 	struct btrfs_item *item;
4947 	struct btrfs_dir_item *di;
4948 	struct btrfs_key key;
4949 	struct btrfs_key found_key;
4950 	struct btrfs_path *path;
4951 	struct list_head ins_list;
4952 	struct list_head del_list;
4953 	int ret;
4954 	struct extent_buffer *leaf;
4955 	int slot;
4956 	unsigned char d_type;
4957 	int over = 0;
4958 	u32 di_cur;
4959 	u32 di_total;
4960 	u32 di_len;
4961 	int key_type = BTRFS_DIR_INDEX_KEY;
4962 	char tmp_name[32];
4963 	char *name_ptr;
4964 	int name_len;
4965 	int is_curr = 0;	/* ctx->pos points to the current index? */
4966 
4967 	/* FIXME, use a real flag for deciding about the key type */
4968 	if (root->fs_info->tree_root == root)
4969 		key_type = BTRFS_DIR_ITEM_KEY;
4970 
4971 	if (!dir_emit_dots(file, ctx))
4972 		return 0;
4973 
4974 	path = btrfs_alloc_path();
4975 	if (!path)
4976 		return -ENOMEM;
4977 
4978 	path->reada = 1;
4979 
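	/*
	 * Dir index items may still sit in the delayed-inode lists without
	 * having reached disk; grab them so they can be merged with what
	 * the tree search below returns.
	 */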
4980 	if (key_type == BTRFS_DIR_INDEX_KEY) {
4981 		INIT_LIST_HEAD(&ins_list);
4982 		INIT_LIST_HEAD(&del_list);
4983 		btrfs_get_delayed_items(inode, &ins_list, &del_list);
4984 	}
4985 
4986 	btrfs_set_key_type(&key, key_type);
4987 	key.offset = ctx->pos;
4988 	key.objectid = btrfs_ino(inode);
4989 
4990 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4991 	if (ret < 0)
4992 		goto err;
4993 
4994 	while (1) {
4995 		leaf = path->nodes[0];
4996 		slot = path->slots[0];
4997 		if (slot >= btrfs_header_nritems(leaf)) {
4998 			ret = btrfs_next_leaf(root, path);
4999 			if (ret < 0)
5000 				goto err;
5001 			else if (ret > 0)
5002 				break;
5003 			continue;
5004 		}
5005 
5006 		item = btrfs_item_nr(leaf, slot);
5007 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5008 
5009 		if (found_key.objectid != key.objectid)
5010 			break;
5011 		if (btrfs_key_type(&found_key) != key_type)
5012 			break;
5013 		if (found_key.offset < ctx->pos)
5014 			goto next;
5015 		if (key_type == BTRFS_DIR_INDEX_KEY &&
5016 		    btrfs_should_delete_dir_index(&del_list,
5017 						  found_key.offset))
5018 			goto next;
5019 
5020 		ctx->pos = found_key.offset;
5021 		is_curr = 1;
5022 
5023 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5024 		di_cur = 0;
5025 		di_total = btrfs_item_size(leaf, item);
5026 
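		/*
		 * A single tree item can pack several dir entries (name
		 * hash collisions for DIR_ITEM keys), so walk all of them.
		 */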
5027 		while (di_cur < di_total) {
5028 			struct btrfs_key location;
5029 
5030 			if (verify_dir_item(root, leaf, di))
5031 				break;
5032 
5033 			name_len = btrfs_dir_name_len(leaf, di);
5034 			if (name_len <= sizeof(tmp_name)) {
5035 				name_ptr = tmp_name;
5036 			} else {
5037 				name_ptr = kmalloc(name_len, GFP_NOFS);
5038 				if (!name_ptr) {
5039 					ret = -ENOMEM;
5040 					goto err;
5041 				}
5042 			}
5043 			read_extent_buffer(leaf, name_ptr,
5044 					   (unsigned long)(di + 1), name_len);
5045 
5046 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5047 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
5048 
5049 
5050 			/* is this a reference to our own snapshot? If so
5051 			 * skip it.
5052 			 *
5053 			 * In contrast to old kernels, we insert the snapshot's
5054 			 * dir item and dir index after it has been created, so
5055 			 * we won't find a reference to our own snapshot. We
5056 			 * still keep the following code for backward
5057 			 * compatibility.
5058 			 */
5059 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
5060 			    location.objectid == root->root_key.objectid) {
5061 				over = 0;
5062 				goto skip;
5063 			}
5064 			over = !dir_emit(ctx, name_ptr, name_len,
5065 				       location.objectid, d_type);
5066 
5067 skip:
5068 			if (name_ptr != tmp_name)
5069 				kfree(name_ptr);
5070 
5071 			if (over)
5072 				goto nopos;
5073 			di_len = btrfs_dir_name_len(leaf, di) +
5074 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5075 			di_cur += di_len;
5076 			di = (struct btrfs_dir_item *)((char *)di + di_len);
5077 		}
5078 next:
5079 		path->slots[0]++;
5080 	}
5081 
5082 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5083 		if (is_curr)
5084 			ctx->pos++;
5085 		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5086 		if (ret)
5087 			goto nopos;
5088 	}
5089 
5090 	/* Reached end of directory/root. Bump pos past the last item. */
5091 	ctx->pos++;
5092 
5093 	/*
5094 	 * Stop new entries from being returned after we return the last
5095 	 * entry.
5096 	 *
5097 	 * New directory entries are assigned a strictly increasing
5098 	 * offset.  This means that new entries created during readdir
5099 	 * are *guaranteed* to be seen in the future by that readdir.
5100 	 * This has broken buggy programs which operate on names as
5101 	 * they're returned by readdir.  Until we re-use freed offsets
5102 	 * we have this hack to stop new entries from being returned
5103 	 * under the assumption that they'll never reach this huge
5104 	 * offset.
5105 	 *
5106 	 * This is being careful not to overflow 32bit loff_t unless the
5107 	 * last entry requires it because doing so has broken 32bit apps
5108 	 * in the past.
5109 	 */
5110 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5111 		if (ctx->pos >= INT_MAX)
5112 			ctx->pos = LLONG_MAX;
5113 		else
5114 			ctx->pos = INT_MAX;
5115 	}
5116 nopos:
5117 	ret = 0;
5118 err:
5119 	if (key_type == BTRFS_DIR_INDEX_KEY)
5120 		btrfs_put_delayed_items(&ins_list, &del_list);
5121 	btrfs_free_path(path);
5122 	return ret;
5123 }
5124 
5125 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5126 {
5127 	struct btrfs_root *root = BTRFS_I(inode)->root;
5128 	struct btrfs_trans_handle *trans;
5129 	int ret = 0;
5130 	bool nolock = false;
5131 
5132 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5133 		return 0;
5134 
5135 	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5136 		nolock = true;
5137 
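	/*
	 * Inode items reach disk through the transaction machinery, so a
	 * data integrity writeback means committing the running
	 * transaction.
	 */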
5138 	if (wbc->sync_mode == WB_SYNC_ALL) {
5139 		if (nolock)
5140 			trans = btrfs_join_transaction_nolock(root);
5141 		else
5142 			trans = btrfs_join_transaction(root);
5143 		if (IS_ERR(trans))
5144 			return PTR_ERR(trans);
5145 		ret = btrfs_commit_transaction(trans, root);
5146 	}
5147 	return ret;
5148 }
5149 
5150 /*
5151  * This is somewhat expensive, updating the tree every time the
5152  * inode changes.  But the inode is most likely to be found in cache.
5153  * FIXME: needs more benchmarking... there are no reasons other than
5154  * performance to keep or drop this code.
5155  */
5156 static int btrfs_dirty_inode(struct inode *inode)
5157 {
5158 	struct btrfs_root *root = BTRFS_I(inode)->root;
5159 	struct btrfs_trans_handle *trans;
5160 	int ret;
5161 
5162 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5163 		return 0;
5164 
5165 	trans = btrfs_join_transaction(root);
5166 	if (IS_ERR(trans))
5167 		return PTR_ERR(trans);
5168 
5169 	ret = btrfs_update_inode(trans, root, inode);
5170 	if (ret == -ENOSPC) {
5171 		/* whoops, let's try again with the full transaction */
5172 		btrfs_end_transaction(trans, root);
5173 		trans = btrfs_start_transaction(root, 1);
5174 		if (IS_ERR(trans))
5175 			return PTR_ERR(trans);
5176 
5177 		ret = btrfs_update_inode(trans, root, inode);
5178 	}
5179 	btrfs_end_transaction(trans, root);
5180 	if (BTRFS_I(inode)->delayed_node)
5181 		btrfs_balance_delayed_items(root);
5182 
5183 	return ret;
5184 }
5185 
5186 /*
5187  * This is a copy of file_update_time.  We need this so we can return error on
5188  * ENOSPC for updating the inode in the case of file write and mmap writes.
5189  */
5190 static int btrfs_update_time(struct inode *inode, struct timespec *now,
5191 			     int flags)
5192 {
5193 	struct btrfs_root *root = BTRFS_I(inode)->root;
5194 
5195 	if (btrfs_root_readonly(root))
5196 		return -EROFS;
5197 
5198 	if (flags & S_VERSION)
5199 		inode_inc_iversion(inode);
5200 	if (flags & S_CTIME)
5201 		inode->i_ctime = *now;
5202 	if (flags & S_MTIME)
5203 		inode->i_mtime = *now;
5204 	if (flags & S_ATIME)
5205 		inode->i_atime = *now;
5206 	return btrfs_dirty_inode(inode);
5207 }
5208 
5209 /*
5210  * find the highest existing sequence number in a directory
5211  * and then set the in-memory index_cnt variable to the first
5212  * free sequence number
5213  */
5214 static int btrfs_set_inode_index_count(struct inode *inode)
5215 {
5216 	struct btrfs_root *root = BTRFS_I(inode)->root;
5217 	struct btrfs_key key, found_key;
5218 	struct btrfs_path *path;
5219 	struct extent_buffer *leaf;
5220 	int ret;
5221 
5222 	key.objectid = btrfs_ino(inode);
5223 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5224 	key.offset = (u64)-1;
5225 
5226 	path = btrfs_alloc_path();
5227 	if (!path)
5228 		return -ENOMEM;
5229 
5230 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5231 	if (ret < 0)
5232 		goto out;
5233 	/* FIXME: we should be able to handle this */
5234 	if (ret == 0)
5235 		goto out;
5236 	ret = 0;
5237 
5238 	/*
5239 	 * MAGIC NUMBER EXPLANATION:
5240 	 * since we search a directory based on f_pos, and '.' and '..'
5241 	 * have f_pos of 0 and 1 respectively, everybody else has to
5242 	 * start at 2
5243 	 */
5244 	if (path->slots[0] == 0) {
5245 		BTRFS_I(inode)->index_cnt = 2;
5246 		goto out;
5247 	}
5248 
5249 	path->slots[0]--;
5250 
5251 	leaf = path->nodes[0];
5252 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5253 
5254 	if (found_key.objectid != btrfs_ino(inode) ||
5255 	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5256 		BTRFS_I(inode)->index_cnt = 2;
5257 		goto out;
5258 	}
5259 
5260 	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5261 out:
5262 	btrfs_free_path(path);
5263 	return ret;
5264 }
5265 
5266 /*
5267  * helper to find a free sequence number in a given directory.  The current
5268  * code is very simple; later versions will do smarter things in the btree
5269  */
5270 int btrfs_set_inode_index(struct inode *dir, u64 *index)
5271 {
5272 	int ret = 0;
5273 
5274 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5275 		ret = btrfs_inode_delayed_dir_index_count(dir);
5276 		if (ret) {
5277 			ret = btrfs_set_inode_index_count(dir);
5278 			if (ret)
5279 				return ret;
5280 		}
5281 	}
5282 
5283 	*index = BTRFS_I(dir)->index_cnt;
5284 	BTRFS_I(dir)->index_cnt++;
5285 
5286 	return ret;
5287 }
5288 
5289 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5290 				     struct btrfs_root *root,
5291 				     struct inode *dir,
5292 				     const char *name, int name_len,
5293 				     u64 ref_objectid, u64 objectid,
5294 				     umode_t mode, u64 *index)
5295 {
5296 	struct inode *inode;
5297 	struct btrfs_inode_item *inode_item;
5298 	struct btrfs_key *location;
5299 	struct btrfs_path *path;
5300 	struct btrfs_inode_ref *ref;
5301 	struct btrfs_key key[2];
5302 	u32 sizes[2];
5303 	unsigned long ptr;
5304 	int ret;
5305 	int owner;
5306 
5307 	path = btrfs_alloc_path();
5308 	if (!path)
5309 		return ERR_PTR(-ENOMEM);
5310 
5311 	inode = new_inode(root->fs_info->sb);
5312 	if (!inode) {
5313 		btrfs_free_path(path);
5314 		return ERR_PTR(-ENOMEM);
5315 	}
5316 
5317 	/*
5318 	 * we have to initialize this early, so we can reclaim the inode
5319 	 * number if we fail afterwards in this function.
5320 	 */
5321 	inode->i_ino = objectid;
5322 
5323 	if (dir) {
5324 		trace_btrfs_inode_request(dir);
5325 
5326 		ret = btrfs_set_inode_index(dir, index);
5327 		if (ret) {
5328 			btrfs_free_path(path);
5329 			iput(inode);
5330 			return ERR_PTR(ret);
5331 		}
5332 	}
5333 	/*
5334 	 * index_cnt is ignored for everything but a dir,
5335 	 * btrfs_set_inode_index_count has an explanation for the magic
5336 	 * number
5337 	 */
5338 	BTRFS_I(inode)->index_cnt = 2;
5339 	BTRFS_I(inode)->root = root;
5340 	BTRFS_I(inode)->generation = trans->transid;
5341 	inode->i_generation = BTRFS_I(inode)->generation;
5342 
5343 	/*
5344 	 * We could have gotten an inode number from somebody who was fsynced
5345 	 * and then removed in this same transaction, so let's just set full
5346 	 * sync since it will be a full sync anyway and this will blow away the
5347 	 * old info in the log.
5348 	 */
5349 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5350 
5351 	if (S_ISDIR(mode))
5352 		owner = 0;
5353 	else
5354 		owner = 1;
5355 
5356 	key[0].objectid = objectid;
5357 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5358 	key[0].offset = 0;
5359 
5360 	/*
5361 	 * Start new inodes with an inode_ref. This is slightly more
5362 	 * efficient for small numbers of hard links since they will
5363 	 * be packed into one item. Extended refs will kick in if we
5364 	 * add more hard links than can fit in the ref item.
5365 	 */
5366 	key[1].objectid = objectid;
5367 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5368 	key[1].offset = ref_objectid;
5369 
5370 	sizes[0] = sizeof(struct btrfs_inode_item);
5371 	sizes[1] = name_len + sizeof(*ref);
5372 
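	/*
	 * Insert the inode item and its first inode ref as one batched
	 * operation so they end up next to each other in the leaf.
	 */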
5373 	path->leave_spinning = 1;
5374 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
5375 	if (ret != 0)
5376 		goto fail;
5377 
5378 	inode_init_owner(inode, dir, mode);
5379 	inode_set_bytes(inode, 0);
5380 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5381 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5382 				  struct btrfs_inode_item);
5383 	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5384 			     sizeof(*inode_item));
5385 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
5386 
5387 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5388 			     struct btrfs_inode_ref);
5389 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5390 	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5391 	ptr = (unsigned long)(ref + 1);
5392 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
5393 
5394 	btrfs_mark_buffer_dirty(path->nodes[0]);
5395 	btrfs_free_path(path);
5396 
5397 	location = &BTRFS_I(inode)->location;
5398 	location->objectid = objectid;
5399 	location->offset = 0;
5400 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5401 
5402 	btrfs_inherit_iflags(inode, dir);
5403 
5404 	if (S_ISREG(mode)) {
5405 		if (btrfs_test_opt(root, NODATASUM))
5406 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5407 		if (btrfs_test_opt(root, NODATACOW))
5408 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5409 				BTRFS_INODE_NODATASUM;
5410 	}
5411 
5412 	insert_inode_hash(inode);
5413 	inode_tree_add(inode);
5414 
5415 	trace_btrfs_inode_new(inode);
5416 	btrfs_set_inode_last_trans(trans, inode);
5417 
5418 	btrfs_update_root_times(trans, root);
5419 
5420 	return inode;
5421 fail:
5422 	if (dir)
5423 		BTRFS_I(dir)->index_cnt--;
5424 	btrfs_free_path(path);
5425 	iput(inode);
5426 	return ERR_PTR(ret);
5427 }
5428 
5429 static inline u8 btrfs_inode_type(struct inode *inode)
5430 {
5431 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5432 }
5433 
5434 /*
5435  * utility function to add 'inode' into 'parent_inode' with
5436  * a given name and a given sequence number.
5437  * if 'add_backref' is true, also insert a backref from the
5438  * inode to the parent directory.
5439  */
5440 int btrfs_add_link(struct btrfs_trans_handle *trans,
5441 		   struct inode *parent_inode, struct inode *inode,
5442 		   const char *name, int name_len, int add_backref, u64 index)
5443 {
5444 	int ret = 0;
5445 	struct btrfs_key key;
5446 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5447 	u64 ino = btrfs_ino(inode);
5448 	u64 parent_ino = btrfs_ino(parent_inode);
5449 
5450 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5451 		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5452 	} else {
5453 		key.objectid = ino;
5454 		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5455 		key.offset = 0;
5456 	}
5457 
5458 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5459 		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5460 					 key.objectid, root->root_key.objectid,
5461 					 parent_ino, index, name, name_len);
5462 	} else if (add_backref) {
5463 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5464 					     parent_ino, index);
5465 	}
5466 
5467 	/* Nothing to clean up yet */
5468 	if (ret)
5469 		return ret;
5470 
5471 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
5472 				    parent_inode, &key,
5473 				    btrfs_inode_type(inode), index);
5474 	if (ret == -EEXIST || ret == -EOVERFLOW)
5475 		goto fail_dir_item;
5476 	else if (ret) {
5477 		btrfs_abort_transaction(trans, root, ret);
5478 		return ret;
5479 	}
5480 
5481 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
5482 			   name_len * 2);
5483 	inode_inc_iversion(parent_inode);
5484 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5485 	ret = btrfs_update_inode(trans, root, parent_inode);
5486 	if (ret)
5487 		btrfs_abort_transaction(trans, root, ret);
5488 	return ret;
5489 
5490 fail_dir_item:
5491 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5492 		u64 local_index;
5493 		int err;
5494 		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5495 				 key.objectid, root->root_key.objectid,
5496 				 parent_ino, &local_index, name, name_len);
5497 
5498 	} else if (add_backref) {
5499 		u64 local_index;
5500 		int err;
5501 
5502 		err = btrfs_del_inode_ref(trans, root, name, name_len,
5503 					  ino, parent_ino, &local_index);
5504 	}
5505 	return ret;
5506 }
5507 
5508 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5509 			    struct inode *dir, struct dentry *dentry,
5510 			    struct inode *inode, int backref, u64 index)
5511 {
5512 	int err = btrfs_add_link(trans, dir, inode,
5513 				 dentry->d_name.name, dentry->d_name.len,
5514 				 backref, index);
5515 	if (err > 0)
5516 		err = -EEXIST;
5517 	return err;
5518 }
5519 
5520 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5521 			umode_t mode, dev_t rdev)
5522 {
5523 	struct btrfs_trans_handle *trans;
5524 	struct btrfs_root *root = BTRFS_I(dir)->root;
5525 	struct inode *inode = NULL;
5526 	int err;
5527 	int drop_inode = 0;
5528 	u64 objectid;
5529 	u64 index = 0;
5530 
5531 	if (!new_valid_dev(rdev))
5532 		return -EINVAL;
5533 
5534 	/*
5535 	 * 2 for inode item and ref
5536 	 * 2 for dir items
5537 	 * 1 for xattr if selinux is on
5538 	 */
5539 	trans = btrfs_start_transaction(root, 5);
5540 	if (IS_ERR(trans))
5541 		return PTR_ERR(trans);
5542 
5543 	err = btrfs_find_free_ino(root, &objectid);
5544 	if (err)
5545 		goto out_unlock;
5546 
5547 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5548 				dentry->d_name.len, btrfs_ino(dir), objectid,
5549 				mode, &index);
5550 	if (IS_ERR(inode)) {
5551 		err = PTR_ERR(inode);
5552 		goto out_unlock;
5553 	}
5554 
5555 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5556 	if (err) {
5557 		drop_inode = 1;
5558 		goto out_unlock;
5559 	}
5560 
5561 	/*
5562 	 * If the active LSM wants to access the inode during
5563 	 * d_instantiate it needs these. Smack checks to see
5564 	 * if the filesystem supports xattrs by looking at the
5565 	 * ops vector.
5566 	 */
5567 
5568 	inode->i_op = &btrfs_special_inode_operations;
5569 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5570 	if (err)
5571 		drop_inode = 1;
5572 	else {
5573 		init_special_inode(inode, inode->i_mode, rdev);
5574 		btrfs_update_inode(trans, root, inode);
5575 		d_instantiate(dentry, inode);
5576 	}
5577 out_unlock:
5578 	btrfs_end_transaction(trans, root);
5579 	btrfs_btree_balance_dirty(root);
5580 	if (drop_inode) {
5581 		inode_dec_link_count(inode);
5582 		iput(inode);
5583 	}
5584 	return err;
5585 }
5586 
5587 static int btrfs_create(struct inode *dir, struct dentry *dentry,
5588 			umode_t mode, bool excl)
5589 {
5590 	struct btrfs_trans_handle *trans;
5591 	struct btrfs_root *root = BTRFS_I(dir)->root;
5592 	struct inode *inode = NULL;
5593 	int drop_inode_on_err = 0;
5594 	int err;
5595 	u64 objectid;
5596 	u64 index = 0;
5597 
5598 	/*
5599 	 * 2 for inode item and ref
5600 	 * 2 for dir items
5601 	 * 1 for xattr if selinux is on
5602 	 */
5603 	trans = btrfs_start_transaction(root, 5);
5604 	if (IS_ERR(trans))
5605 		return PTR_ERR(trans);
5606 
5607 	err = btrfs_find_free_ino(root, &objectid);
5608 	if (err)
5609 		goto out_unlock;
5610 
5611 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5612 				dentry->d_name.len, btrfs_ino(dir), objectid,
5613 				mode, &index);
5614 	if (IS_ERR(inode)) {
5615 		err = PTR_ERR(inode);
5616 		goto out_unlock;
5617 	}
5618 	drop_inode_on_err = 1;
5619 
5620 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5621 	if (err)
5622 		goto out_unlock;
5623 
5624 	err = btrfs_update_inode(trans, root, inode);
5625 	if (err)
5626 		goto out_unlock;
5627 
5628 	/*
5629 	 * If the active LSM wants to access the inode during
5630 	 * d_instantiate it needs these. Smack checks to see
5631 	 * if the filesystem supports xattrs by looking at the
5632 	 * ops vector.
5633 	 */
5634 	inode->i_fop = &btrfs_file_operations;
5635 	inode->i_op = &btrfs_file_inode_operations;
5636 
5637 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5638 	if (err)
5639 		goto out_unlock;
5640 
5641 	inode->i_mapping->a_ops = &btrfs_aops;
5642 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5643 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5644 	d_instantiate(dentry, inode);
5645 
5646 out_unlock:
5647 	btrfs_end_transaction(trans, root);
5648 	if (err && drop_inode_on_err) {
5649 		inode_dec_link_count(inode);
5650 		iput(inode);
5651 	}
5652 	btrfs_btree_balance_dirty(root);
5653 	return err;
5654 }
5655 
5656 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5657 		      struct dentry *dentry)
5658 {
5659 	struct btrfs_trans_handle *trans;
5660 	struct btrfs_root *root = BTRFS_I(dir)->root;
5661 	struct inode *inode = old_dentry->d_inode;
5662 	u64 index;
5663 	int err;
5664 	int drop_inode = 0;
5665 
5666 	/* do not allow sys_link's with other subvols of the same device */
5667 	if (root->objectid != BTRFS_I(inode)->root->objectid)
5668 		return -EXDEV;
5669 
5670 	if (inode->i_nlink >= BTRFS_LINK_MAX)
5671 		return -EMLINK;
5672 
5673 	err = btrfs_set_inode_index(dir, &index);
5674 	if (err)
5675 		goto fail;
5676 
5677 	/*
5678 	 * 2 items for inode and inode ref
5679 	 * 2 items for dir items
5680 	 * 1 item for parent inode
5681 	 */
5682 	trans = btrfs_start_transaction(root, 5);
5683 	if (IS_ERR(trans)) {
5684 		err = PTR_ERR(trans);
5685 		goto fail;
5686 	}
5687 
5688 	btrfs_inc_nlink(inode);
5689 	inode_inc_iversion(inode);
5690 	inode->i_ctime = CURRENT_TIME;
5691 	ihold(inode);
5692 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5693 
5694 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5695 
5696 	if (err) {
5697 		drop_inode = 1;
5698 	} else {
5699 		struct dentry *parent = dentry->d_parent;
5700 		err = btrfs_update_inode(trans, root, inode);
5701 		if (err)
5702 			goto fail;
5703 		d_instantiate(dentry, inode);
5704 		btrfs_log_new_name(trans, inode, NULL, parent);
5705 	}
5706 
5707 	btrfs_end_transaction(trans, root);
5708 fail:
5709 	if (drop_inode) {
5710 		inode_dec_link_count(inode);
5711 		iput(inode);
5712 	}
5713 	btrfs_btree_balance_dirty(root);
5714 	return err;
5715 }
5716 
5717 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5718 {
5719 	struct inode *inode = NULL;
5720 	struct btrfs_trans_handle *trans;
5721 	struct btrfs_root *root = BTRFS_I(dir)->root;
5722 	int err = 0;
5723 	int drop_on_err = 0;
5724 	u64 objectid = 0;
5725 	u64 index = 0;
5726 
5727 	/*
5728 	 * 2 items for inode and ref
5729 	 * 2 items for dir items
5730 	 * 1 for xattr if selinux is on
5731 	 */
5732 	trans = btrfs_start_transaction(root, 5);
5733 	if (IS_ERR(trans))
5734 		return PTR_ERR(trans);
5735 
5736 	err = btrfs_find_free_ino(root, &objectid);
5737 	if (err)
5738 		goto out_fail;
5739 
5740 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5741 				dentry->d_name.len, btrfs_ino(dir), objectid,
5742 				S_IFDIR | mode, &index);
5743 	if (IS_ERR(inode)) {
5744 		err = PTR_ERR(inode);
5745 		goto out_fail;
5746 	}
5747 
5748 	drop_on_err = 1;
5749 
5750 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5751 	if (err)
5752 		goto out_fail;
5753 
5754 	inode->i_op = &btrfs_dir_inode_operations;
5755 	inode->i_fop = &btrfs_dir_file_operations;
5756 
5757 	btrfs_i_size_write(inode, 0);
5758 	err = btrfs_update_inode(trans, root, inode);
5759 	if (err)
5760 		goto out_fail;
5761 
5762 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
5763 			     dentry->d_name.len, 0, index);
5764 	if (err)
5765 		goto out_fail;
5766 
5767 	d_instantiate(dentry, inode);
5768 	drop_on_err = 0;
5769 
5770 out_fail:
5771 	btrfs_end_transaction(trans, root);
5772 	if (drop_on_err)
5773 		iput(inode);
5774 	btrfs_btree_balance_dirty(root);
5775 	return err;
5776 }
5777 
5778 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
5779  * and an extent that you want to insert, deal with overlap and insert
5780  * the new extent into the tree.
5781  */
5782 static int merge_extent_mapping(struct extent_map_tree *em_tree,
5783 				struct extent_map *existing,
5784 				struct extent_map *em,
5785 				u64 map_start, u64 map_len)
5786 {
5787 	u64 start_diff;
5788 
5789 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
5790 	start_diff = map_start - em->start;
5791 	em->start = map_start;
5792 	em->len = map_len;
5793 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
5794 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
5795 		em->block_start += start_diff;
5796 		em->block_len -= start_diff;
5797 	}
5798 	return add_extent_mapping(em_tree, em, 0);
5799 }
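
/*
 * Worked example (illustrative only, not part of the original file):
 * say 'em' describes file range [0, 16k) backed by disk bytenr 1M, but
 * only [4k, 8k) is actually missing from the tree, so map_start = 4k
 * and map_len = 4k.  Then start_diff = 4k and the mapping inserted is
 * trimmed to [4k, 8k), with em->block_start bumped to 1M + 4k and
 * em->block_len shrunk by 4k.  Compressed extents keep their block
 * range untouched, since the whole compressed blob is needed to read
 * any part of it.
 */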
5800 
5801 static noinline int uncompress_inline(struct btrfs_path *path,
5802 				      struct inode *inode, struct page *page,
5803 				      size_t pg_offset, u64 extent_offset,
5804 				      struct btrfs_file_extent_item *item)
5805 {
5806 	int ret;
5807 	struct extent_buffer *leaf = path->nodes[0];
5808 	char *tmp;
5809 	size_t max_size;
5810 	unsigned long inline_size;
5811 	unsigned long ptr;
5812 	int compress_type;
5813 
5814 	WARN_ON(pg_offset != 0);
5815 	compress_type = btrfs_file_extent_compression(leaf, item);
5816 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
5817 	inline_size = btrfs_file_extent_inline_item_len(leaf,
5818 					btrfs_item_nr(leaf, path->slots[0]));
5819 	tmp = kmalloc(inline_size, GFP_NOFS);
5820 	if (!tmp)
5821 		return -ENOMEM;
5822 	ptr = btrfs_file_extent_inline_start(item);
5823 
5824 	read_extent_buffer(leaf, tmp, ptr, inline_size);
5825 
5826 	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
5827 	ret = btrfs_decompress(compress_type, tmp, page,
5828 			       extent_offset, inline_size, max_size);
5829 	if (ret) {
5830 		char *kaddr = kmap_atomic(page);
5831 		unsigned long copy_size = min_t(u64,
5832 				  PAGE_CACHE_SIZE - pg_offset,
5833 				  max_size - extent_offset);
5834 		memset(kaddr + pg_offset, 0, copy_size);
5835 		kunmap_atomic(kaddr);
5836 	}
5837 	kfree(tmp);
5838 	return 0;
5839 }
5840 
5841 /*
5842  * a bit scary, this does extent mapping from logical file offset to the disk.
5843  * the ugly parts come from merging extents from the disk with the in-ram
5844  * representation.  This gets more complex because of the data=ordered code,
5845  * where the in-ram extents might be locked pending data=ordered completion.
5846  *
5847  * This also copies inline extents directly into the page.
5848  */
5849 
5850 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
5851 				    size_t pg_offset, u64 start, u64 len,
5852 				    int create)
5853 {
5854 	int ret;
5855 	int err = 0;
5856 	u64 bytenr;
5857 	u64 extent_start = 0;
5858 	u64 extent_end = 0;
5859 	u64 objectid = btrfs_ino(inode);
5860 	u32 found_type;
5861 	struct btrfs_path *path = NULL;
5862 	struct btrfs_root *root = BTRFS_I(inode)->root;
5863 	struct btrfs_file_extent_item *item;
5864 	struct extent_buffer *leaf;
5865 	struct btrfs_key found_key;
5866 	struct extent_map *em = NULL;
5867 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5868 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5869 	struct btrfs_trans_handle *trans = NULL;
5870 	int compress_type;
5871 
5872 again:
5873 	read_lock(&em_tree->lock);
5874 	em = lookup_extent_mapping(em_tree, start, len);
5875 	if (em)
5876 		em->bdev = root->fs_info->fs_devices->latest_bdev;
5877 	read_unlock(&em_tree->lock);
5878 
5879 	if (em) {
5880 		if (em->start > start || em->start + em->len <= start)
5881 			free_extent_map(em);
5882 		else if (em->block_start == EXTENT_MAP_INLINE && page)
5883 			free_extent_map(em);
5884 		else
5885 			goto out;
5886 	}
5887 	em = alloc_extent_map();
5888 	if (!em) {
5889 		err = -ENOMEM;
5890 		goto out;
5891 	}
5892 	em->bdev = root->fs_info->fs_devices->latest_bdev;
5893 	em->start = EXTENT_MAP_HOLE;
5894 	em->orig_start = EXTENT_MAP_HOLE;
5895 	em->len = (u64)-1;
5896 	em->block_len = (u64)-1;
5897 
5898 	if (!path) {
5899 		path = btrfs_alloc_path();
5900 		if (!path) {
5901 			err = -ENOMEM;
5902 			goto out;
5903 		}
5904 		/*
5905 		 * Chances are we'll be called again, so go ahead and do
5906 		 * readahead
5907 		 */
5908 		path->reada = 1;
5909 	}
5910 
5911 	ret = btrfs_lookup_file_extent(trans, root, path,
5912 				       objectid, start, trans != NULL);
5913 	if (ret < 0) {
5914 		err = ret;
5915 		goto out;
5916 	}
5917 
5918 	if (ret != 0) {
5919 		if (path->slots[0] == 0)
5920 			goto not_found;
5921 		path->slots[0]--;
5922 	}
5923 
5924 	leaf = path->nodes[0];
5925 	item = btrfs_item_ptr(leaf, path->slots[0],
5926 			      struct btrfs_file_extent_item);
5927 	/* are we inside the extent that was found? */
5928 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5929 	found_type = btrfs_key_type(&found_key);
5930 	if (found_key.objectid != objectid ||
5931 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5932 		goto not_found;
5933 	}
5934 
5935 	found_type = btrfs_file_extent_type(leaf, item);
5936 	extent_start = found_key.offset;
5937 	compress_type = btrfs_file_extent_compression(leaf, item);
5938 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5939 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5940 		extent_end = extent_start +
5941 		       btrfs_file_extent_num_bytes(leaf, item);
5942 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5943 		size_t size;
5944 		size = btrfs_file_extent_inline_len(leaf, item);
5945 		extent_end = ALIGN(extent_start + size, root->sectorsize);
5946 	}
5947 
5948 	if (start >= extent_end) {
5949 		path->slots[0]++;
5950 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
5951 			ret = btrfs_next_leaf(root, path);
5952 			if (ret < 0) {
5953 				err = ret;
5954 				goto out;
5955 			}
5956 			if (ret > 0)
5957 				goto not_found;
5958 			leaf = path->nodes[0];
5959 		}
5960 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5961 		if (found_key.objectid != objectid ||
5962 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
5963 			goto not_found;
5964 		if (start + len <= found_key.offset)
5965 			goto not_found;
5966 		em->start = start;
5967 		em->orig_start = start;
5968 		em->len = found_key.offset - start;
5969 		goto not_found_em;
5970 	}
5971 
5972 	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
5973 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5974 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5975 		em->start = extent_start;
5976 		em->len = extent_end - extent_start;
5977 		em->orig_start = extent_start -
5978 				 btrfs_file_extent_offset(leaf, item);
5979 		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
5980 								      item);
5981 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
5982 		if (bytenr == 0) {
5983 			em->block_start = EXTENT_MAP_HOLE;
5984 			goto insert;
5985 		}
5986 		if (compress_type != BTRFS_COMPRESS_NONE) {
5987 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5988 			em->compress_type = compress_type;
5989 			em->block_start = bytenr;
5990 			em->block_len = em->orig_block_len;
5991 		} else {
5992 			bytenr += btrfs_file_extent_offset(leaf, item);
5993 			em->block_start = bytenr;
5994 			em->block_len = em->len;
5995 			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
5996 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
5997 		}
5998 		goto insert;
5999 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6000 		unsigned long ptr;
6001 		char *map;
6002 		size_t size;
6003 		size_t extent_offset;
6004 		size_t copy_size;
6005 
6006 		em->block_start = EXTENT_MAP_INLINE;
6007 		if (!page || create) {
6008 			em->start = extent_start;
6009 			em->len = extent_end - extent_start;
6010 			goto out;
6011 		}
6012 
6013 		size = btrfs_file_extent_inline_len(leaf, item);
6014 		extent_offset = page_offset(page) + pg_offset - extent_start;
6015 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6016 				size - extent_offset);
6017 		em->start = extent_start + extent_offset;
6018 		em->len = ALIGN(copy_size, root->sectorsize);
6019 		em->orig_block_len = em->len;
6020 		em->orig_start = em->start;
6021 		if (compress_type) {
6022 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6023 			em->compress_type = compress_type;
6024 		}
6025 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6026 		if (create == 0 && !PageUptodate(page)) {
6027 			if (btrfs_file_extent_compression(leaf, item) !=
6028 			    BTRFS_COMPRESS_NONE) {
6029 				ret = uncompress_inline(path, inode, page,
6030 							pg_offset,
6031 							extent_offset, item);
6032 				BUG_ON(ret); /* -ENOMEM */
6033 			} else {
6034 				map = kmap(page);
6035 				read_extent_buffer(leaf, map + pg_offset, ptr,
6036 						   copy_size);
6037 				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6038 					memset(map + pg_offset + copy_size, 0,
6039 					       PAGE_CACHE_SIZE - pg_offset -
6040 					       copy_size);
6041 				}
6042 				kunmap(page);
6043 			}
6044 			flush_dcache_page(page);
6045 		} else if (create && PageUptodate(page)) {
6046 			BUG();
6047 			if (!trans) {
6048 				kunmap(page);
6049 				free_extent_map(em);
6050 				em = NULL;
6051 
6052 				btrfs_release_path(path);
6053 				trans = btrfs_join_transaction(root);
6054 
6055 				if (IS_ERR(trans))
6056 					return ERR_CAST(trans);
6057 				goto again;
6058 			}
6059 			map = kmap(page);
6060 			write_extent_buffer(leaf, map + pg_offset, ptr,
6061 					    copy_size);
6062 			kunmap(page);
6063 			btrfs_mark_buffer_dirty(leaf);
6064 		}
6065 		set_extent_uptodate(io_tree, em->start,
6066 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
6067 		goto insert;
6068 	} else {
6069 		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
6070 	}
6071 not_found:
6072 	em->start = start;
6073 	em->orig_start = start;
6074 	em->len = len;
6075 not_found_em:
6076 	em->block_start = EXTENT_MAP_HOLE;
6077 	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6078 insert:
6079 	btrfs_release_path(path);
6080 	if (em->start > start || extent_map_end(em) <= start) {
6081 		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6082 			em->start, em->len, start, len);
6083 		err = -EIO;
6084 		goto out;
6085 	}
6086 
6087 	err = 0;
6088 	write_lock(&em_tree->lock);
6089 	ret = add_extent_mapping(em_tree, em, 0);
6090 	/* it is possible that someone inserted the extent into the tree
6091 	 * while we had the lock dropped.  It is also possible that
6092 	 * an overlapping map exists in the tree
6093 	 */
6094 	if (ret == -EEXIST) {
6095 		struct extent_map *existing;
6096 
6097 		ret = 0;
6098 
6099 		existing = lookup_extent_mapping(em_tree, start, len);
6100 		if (existing && (existing->start > start ||
6101 		    existing->start + existing->len <= start)) {
6102 			free_extent_map(existing);
6103 			existing = NULL;
6104 		}
6105 		if (!existing) {
6106 			existing = lookup_extent_mapping(em_tree, em->start,
6107 							 em->len);
6108 			if (existing) {
6109 				err = merge_extent_mapping(em_tree, existing,
6110 							   em, start,
6111 							   root->sectorsize);
6112 				free_extent_map(existing);
6113 				if (err) {
6114 					free_extent_map(em);
6115 					em = NULL;
6116 				}
6117 			} else {
6118 				err = -EIO;
6119 				free_extent_map(em);
6120 				em = NULL;
6121 			}
6122 		} else {
6123 			free_extent_map(em);
6124 			em = existing;
6125 			err = 0;
6126 		}
6127 	}
6128 	write_unlock(&em_tree->lock);
6129 out:
6130 
6131 	if (em)
6132 		trace_btrfs_get_extent(root, em);
6133 
6134 	if (path)
6135 		btrfs_free_path(path);
6136 	if (trans) {
6137 		ret = btrfs_end_transaction(trans, root);
6138 		if (!err)
6139 			err = ret;
6140 	}
6141 	if (err) {
6142 		free_extent_map(em);
6143 		return ERR_PTR(err);
6144 	}
6145 	BUG_ON(!em); /* Error is always set */
6146 	return em;
6147 }
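
/*
 * Usage sketch (illustrative only, not part of the original file): map
 * a range of an inode for a read, the way btrfs_get_blocks_direct()
 * below does.  A NULL page and create == 0 ask just for the mapping;
 * the example_ name is hypothetical.
 */
static inline int example_lookup_mapping(struct inode *inode, u64 start,
					 u64 len)
{
	struct extent_map *em;

	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
	if (IS_ERR(em))
		return PTR_ERR(em);
	/* EXTENT_MAP_HOLE in em->block_start means nothing is on disk */
	free_extent_map(em);
	return 0;
}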
6148 
6149 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6150 					   size_t pg_offset, u64 start, u64 len,
6151 					   int create)
6152 {
6153 	struct extent_map *em;
6154 	struct extent_map *hole_em = NULL;
6155 	u64 range_start = start;
6156 	u64 end;
6157 	u64 found;
6158 	u64 found_end;
6159 	int err = 0;
6160 
6161 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6162 	if (IS_ERR(em))
6163 		return em;
6164 	if (em) {
6165 		/*
6166 		 * if our em maps to
6167 		 * -  a hole or
6168 		 * -  a pre-alloc extent,
6169 		 * there might actually be delalloc bytes behind it.
6170 		 */
6171 		if (em->block_start != EXTENT_MAP_HOLE &&
6172 		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6173 			return em;
6174 		else
6175 			hole_em = em;
6176 	}
6177 
6178 	/* check to see if we've wrapped (len == -1 or similar) */
6179 	end = start + len;
6180 	if (end < start)
6181 		end = (u64)-1;
6182 	else
6183 		end -= 1;
6184 
6185 	em = NULL;
6186 
6187 	/* ok, we didn't find anything, let's look for delalloc */
6188 	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6189 				 end, len, EXTENT_DELALLOC, 1);
6190 	found_end = range_start + found;
6191 	if (found_end < range_start)
6192 		found_end = (u64)-1;
6193 
6194 	/*
6195 	 * we didn't find anything useful, return
6196 	 * the original results from get_extent()
6197 	 */
6198 	if (range_start > end || found_end <= start) {
6199 		em = hole_em;
6200 		hole_em = NULL;
6201 		goto out;
6202 	}
6203 
6204 	/* adjust the range_start to make sure it doesn't
6205 	 * go backwards from the start they passed in
6206 	 */
6207 	range_start = max(start, range_start);
6208 	found = found_end - range_start;
6209 
6210 	if (found > 0) {
6211 		u64 hole_start = start;
6212 		u64 hole_len = len;
6213 
6214 		em = alloc_extent_map();
6215 		if (!em) {
6216 			err = -ENOMEM;
6217 			goto out;
6218 		}
6219 		/*
6220 		 * when btrfs_get_extent can't find anything it
6221 		 * returns one huge hole
6222 		 *
6223 		 * make sure what it found really fits our range, and
6224 		 * adjust to make sure it is based on the start from
6225 		 * the caller
6226 		 */
6227 		if (hole_em) {
6228 			u64 calc_end = extent_map_end(hole_em);
6229 
6230 			if (calc_end <= start || (hole_em->start > end)) {
6231 				free_extent_map(hole_em);
6232 				hole_em = NULL;
6233 			} else {
6234 				hole_start = max(hole_em->start, start);
6235 				hole_len = calc_end - hole_start;
6236 			}
6237 		}
6238 		em->bdev = NULL;
6239 		if (hole_em && range_start > hole_start) {
6240 			/* our hole starts before our delalloc, so we
6241 			 * have to return just the parts of the hole
6242 			 * that go until the delalloc starts
6243 			 */
6244 			em->len = min(hole_len,
6245 				      range_start - hole_start);
6246 			em->start = hole_start;
6247 			em->orig_start = hole_start;
6248 			/*
6249 			 * don't adjust block start at all,
6250 			 * it is fixed at EXTENT_MAP_HOLE
6251 			 */
6252 			em->block_start = hole_em->block_start;
6253 			em->block_len = hole_len;
6254 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6255 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6256 		} else {
6257 			em->start = range_start;
6258 			em->len = found;
6259 			em->orig_start = range_start;
6260 			em->block_start = EXTENT_MAP_DELALLOC;
6261 			em->block_len = found;
6262 		}
6263 	} else if (hole_em) {
6264 		return hole_em;
6265 	}
6266 out:
6267 
6268 	free_extent_map(hole_em);
6269 	if (err) {
6270 		free_extent_map(em);
6271 		return ERR_PTR(err);
6272 	}
6273 	return em;
6274 }
6275 
6276 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6277 						  u64 start, u64 len)
6278 {
6279 	struct btrfs_root *root = BTRFS_I(inode)->root;
6280 	struct extent_map *em;
6281 	struct btrfs_key ins;
6282 	u64 alloc_hint;
6283 	int ret;
6284 
6285 	alloc_hint = get_extent_allocation_hint(inode, start, len);
6286 	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6287 				   alloc_hint, &ins, 1);
6288 	if (ret)
6289 		return ERR_PTR(ret);
6290 
6291 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6292 			      ins.offset, ins.offset, ins.offset, 0);
6293 	if (IS_ERR(em)) {
6294 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6295 		return em;
6296 	}
6297 
6298 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6299 					   ins.offset, ins.offset, 0);
6300 	if (ret) {
6301 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6302 		free_extent_map(em);
6303 		return ERR_PTR(ret);
6304 	}
6305 
6306 	return em;
6307 }
6308 
6309 /*
6310  * returns 1 when the nocow is safe, < 0 on error, 0 if the
6311  * block must be cow'd
6312  */
6313 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6314 			      u64 *orig_start, u64 *orig_block_len,
6315 			      u64 *ram_bytes)
6316 {
6317 	struct btrfs_trans_handle *trans;
6318 	struct btrfs_path *path;
6319 	int ret;
6320 	struct extent_buffer *leaf;
6321 	struct btrfs_root *root = BTRFS_I(inode)->root;
6322 	struct btrfs_file_extent_item *fi;
6323 	struct btrfs_key key;
6324 	u64 disk_bytenr;
6325 	u64 backref_offset;
6326 	u64 extent_end;
6327 	u64 num_bytes;
6328 	int slot;
6329 	int found_type;
6330 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);
6331 	path = btrfs_alloc_path();
6332 	if (!path)
6333 		return -ENOMEM;
6334 
6335 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6336 				       offset, 0);
6337 	if (ret < 0)
6338 		goto out;
6339 
6340 	slot = path->slots[0];
6341 	if (ret == 1) {
6342 		if (slot == 0) {
6343 			/* can't find the item, must cow */
6344 			ret = 0;
6345 			goto out;
6346 		}
6347 		slot--;
6348 	}
6349 	ret = 0;
6350 	leaf = path->nodes[0];
6351 	btrfs_item_key_to_cpu(leaf, &key, slot);
6352 	if (key.objectid != btrfs_ino(inode) ||
6353 	    key.type != BTRFS_EXTENT_DATA_KEY) {
6354 		/* not our file or wrong item type, must cow */
6355 		goto out;
6356 	}
6357 
6358 	if (key.offset > offset) {
6359 		/* Wrong offset, must cow */
6360 		goto out;
6361 	}
6362 
6363 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6364 	found_type = btrfs_file_extent_type(leaf, fi);
6365 	if (found_type != BTRFS_FILE_EXTENT_REG &&
6366 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6367 		/* not a regular extent, must cow */
6368 		goto out;
6369 	}
6370 
6371 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6372 		goto out;
6373 
6374 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6375 	if (disk_bytenr == 0)
6376 		goto out;
6377 
6378 	if (btrfs_file_extent_compression(leaf, fi) ||
6379 	    btrfs_file_extent_encryption(leaf, fi) ||
6380 	    btrfs_file_extent_other_encoding(leaf, fi))
6381 		goto out;
6382 
6383 	backref_offset = btrfs_file_extent_offset(leaf, fi);
6384 
6385 	if (orig_start) {
6386 		*orig_start = key.offset - backref_offset;
6387 		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6388 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6389 	}
6390 
6391 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6392 
6393 	if (btrfs_extent_readonly(root, disk_bytenr))
6394 		goto out;
6395 
6396 	/*
6397 	 * look for other files referencing this extent; if we
6398 	 * find any we must cow
6399 	 */
6400 	trans = btrfs_join_transaction(root);
6401 	if (IS_ERR(trans)) {
6402 		ret = 0;
6403 		goto out;
6404 	}
6405 
6406 	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6407 				    key.offset - backref_offset, disk_bytenr);
6408 	btrfs_end_transaction(trans, root);
6409 	if (ret) {
6410 		ret = 0;
6411 		goto out;
6412 	}
6413 
6414 	/*
6415 	 * adjust disk_bytenr and num_bytes to cover just the bytes
6416 	 * in this extent we are about to write.  If there
6417 	 * are any csums in that range we have to cow in order
6418 	 * to keep the csums correct
6419 	 */
6420 	disk_bytenr += backref_offset;
6421 	disk_bytenr += offset - key.offset;
6422 	num_bytes = min(offset + *len, extent_end) - offset;
6423 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6424 		goto out;
6425 	/*
6426 	 * all of the above have passed, it is safe to overwrite this extent
6427 	 * all of the above checks have passed; it is safe to overwrite this extent
6428 	 */
6429 	*len = num_bytes;
6430 	ret = 1;
6431 out:
6432 	btrfs_free_path(path);
6433 	return ret;
6434 }
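
/*
 * Usage sketch (illustrative only, not part of the original file):
 * this mirrors how btrfs_get_blocks_direct() below decides whether a
 * DIO write may overwrite the existing extent instead of allocating a
 * new one; the example_ name is hypothetical.
 */
static inline bool example_write_can_skip_cow(struct inode *inode,
					      u64 offset, u64 len)
{
	u64 orig_start, orig_block_len, ram_bytes;

	return can_nocow_extent(inode, offset, &len, &orig_start,
				&orig_block_len, &ram_bytes) == 1;
}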
6435 
6436 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6437 			      struct extent_state **cached_state, int writing)
6438 {
6439 	struct btrfs_ordered_extent *ordered;
6440 	int ret = 0;
6441 
6442 	while (1) {
6443 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6444 				 0, cached_state);
6445 		/*
6446 		 * We're concerned with the entire range that we're going to be
6447 		 * doing DIO to, so we need to make sure there are no ordered
6448 		 * extents in this range.
6449 		 */
6450 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
6451 						     lockend - lockstart + 1);
6452 
6453 		/*
6454 		 * We need to make sure there are no buffered pages in this
6455 		 * range either, we could have raced between the invalidate in
6456 		 * generic_file_direct_write and locking the extent.  The
6457 		 * invalidate needs to happen so that reads after a write do not
6458 		 * get stale data.
6459 		 */
6460 		if (!ordered && (!writing ||
6461 		    !test_range_bit(&BTRFS_I(inode)->io_tree,
6462 				    lockstart, lockend, EXTENT_UPTODATE, 0,
6463 				    *cached_state)))
6464 			break;
6465 
6466 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6467 				     cached_state, GFP_NOFS);
6468 
6469 		if (ordered) {
6470 			btrfs_start_ordered_extent(inode, ordered, 1);
6471 			btrfs_put_ordered_extent(ordered);
6472 		} else {
6473 			/* Screw you mmap */
6474 			ret = filemap_write_and_wait_range(inode->i_mapping,
6475 							   lockstart,
6476 							   lockend);
6477 			if (ret)
6478 				break;
6479 
6480 			/*
6481 			 * If we found a page that couldn't be invalidated just
6482 			 * fall back to buffered.
6483 			 */
6484 			ret = invalidate_inode_pages2_range(inode->i_mapping,
6485 					lockstart >> PAGE_CACHE_SHIFT,
6486 					lockend >> PAGE_CACHE_SHIFT);
6487 			if (ret)
6488 				break;
6489 		}
6490 
6491 		cond_resched();
6492 	}
6493 
6494 	return ret;
6495 }
6496 
6497 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6498 					   u64 len, u64 orig_start,
6499 					   u64 block_start, u64 block_len,
6500 					   u64 orig_block_len, u64 ram_bytes,
6501 					   int type)
6502 {
6503 	struct extent_map_tree *em_tree;
6504 	struct extent_map *em;
6505 	struct btrfs_root *root = BTRFS_I(inode)->root;
6506 	int ret;
6507 
6508 	em_tree = &BTRFS_I(inode)->extent_tree;
6509 	em = alloc_extent_map();
6510 	if (!em)
6511 		return ERR_PTR(-ENOMEM);
6512 
6513 	em->start = start;
6514 	em->orig_start = orig_start;
6515 	em->mod_start = start;
6516 	em->mod_len = len;
6517 	em->len = len;
6518 	em->block_len = block_len;
6519 	em->block_start = block_start;
6520 	em->bdev = root->fs_info->fs_devices->latest_bdev;
6521 	em->orig_block_len = orig_block_len;
6522 	em->ram_bytes = ram_bytes;
6523 	em->generation = -1;
6524 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
6525 	if (type == BTRFS_ORDERED_PREALLOC)
6526 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
6527 
6528 	do {
6529 		btrfs_drop_extent_cache(inode, em->start,
6530 				em->start + em->len - 1, 0);
6531 		write_lock(&em_tree->lock);
6532 		ret = add_extent_mapping(em_tree, em, 1);
6533 		write_unlock(&em_tree->lock);
6534 	} while (ret == -EEXIST);
6535 
6536 	if (ret) {
6537 		free_extent_map(em);
6538 		return ERR_PTR(ret);
6539 	}
6540 
6541 	return em;
6542 }
6543 
6544 
6545 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6546 				   struct buffer_head *bh_result, int create)
6547 {
6548 	struct extent_map *em;
6549 	struct btrfs_root *root = BTRFS_I(inode)->root;
6550 	struct extent_state *cached_state = NULL;
6551 	u64 start = iblock << inode->i_blkbits;
6552 	u64 lockstart, lockend;
6553 	u64 len = bh_result->b_size;
6554 	int unlock_bits = EXTENT_LOCKED;
6555 	int ret = 0;
6556 
6557 	if (create)
6558 		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6559 	else
6560 		len = min_t(u64, len, root->sectorsize);
6561 
6562 	lockstart = start;
6563 	lockend = start + len - 1;
6564 
6565 	/*
6566 	 * If this errors out it's because we couldn't invalidate pagecache for
6567 	 * this range and we need to fall back to buffered.
6568 	 */
6569 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6570 		return -ENOTBLK;
6571 
6572 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6573 	if (IS_ERR(em)) {
6574 		ret = PTR_ERR(em);
6575 		goto unlock_err;
6576 	}
6577 
6578 	/*
6579 	 * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
6580 	 * io.  INLINE is special, and we could probably kludge it in here, but
6581 	 * it's still buffered so for safety let's just fall back to the generic
6582 	 * buffered path.
6583 	 *
6584 	 * For COMPRESSED we _have_ to read the entire extent in so we can
6585 	 * decompress it, so there will be buffering required no matter what we
6586 	 * do, so go ahead and fallback to buffered.
6587 	 *
6588 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
6589 	 * to buffered IO.  Don't blame me, this is the price we pay for using
6590 	 * the generic code.
6591 	 */
6592 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
6593 	    em->block_start == EXTENT_MAP_INLINE) {
6594 		free_extent_map(em);
6595 		ret = -ENOTBLK;
6596 		goto unlock_err;
6597 	}
6598 
6599 	/* Just a good old-fashioned hole, return */
6600 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6601 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6602 		free_extent_map(em);
6603 		goto unlock_err;
6604 	}
6605 
6606 	/*
6607 	 * We don't allocate a new extent in the following cases
6608 	 *
6609 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use the
6610 	 * existing extent.
6611 	 * 2) The extent is marked as PREALLOC.  We're good to go here and can
6612 	 * just use the extent.
6613 	 *
6614 	 */
6615 	if (!create) {
6616 		len = min(len, em->len - (start - em->start));
6617 		lockstart = start + len;
6618 		goto unlock;
6619 	}
6620 
6621 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
6622 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
6623 	     em->block_start != EXTENT_MAP_HOLE)) {
6624 		int type;
6625 		int ret;
6626 		u64 block_start, orig_start, orig_block_len, ram_bytes;
6627 
6628 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6629 			type = BTRFS_ORDERED_PREALLOC;
6630 		else
6631 			type = BTRFS_ORDERED_NOCOW;
6632 		len = min(len, em->len - (start - em->start));
6633 		block_start = em->block_start + (start - em->start);
6634 
6635 		if (can_nocow_extent(inode, start, &len, &orig_start,
6636 				     &orig_block_len, &ram_bytes) == 1) {
6637 			if (type == BTRFS_ORDERED_PREALLOC) {
6638 				free_extent_map(em);
6639 				em = create_pinned_em(inode, start, len,
6640 						       orig_start,
6641 						       block_start, len,
6642 						       orig_block_len,
6643 						       ram_bytes, type);
6644 				if (IS_ERR(em))
6645 					goto unlock_err;
6646 			}
6647 
6648 			ret = btrfs_add_ordered_extent_dio(inode, start,
6649 					   block_start, len, len, type);
6650 			if (ret) {
6651 				free_extent_map(em);
6652 				goto unlock_err;
6653 			}
6654 			goto unlock;
6655 		}
6656 	}
6657 
6658 	/*
6659 	 * this will cow the extent, reset the len in case we changed
6660 	 * it above
6661 	 */
6662 	len = bh_result->b_size;
6663 	free_extent_map(em);
6664 	em = btrfs_new_extent_direct(inode, start, len);
6665 	if (IS_ERR(em)) {
6666 		ret = PTR_ERR(em);
6667 		goto unlock_err;
6668 	}
6669 	len = min(len, em->len - (start - em->start));
6670 unlock:
6671 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
6672 		inode->i_blkbits;
6673 	bh_result->b_size = len;
6674 	bh_result->b_bdev = em->bdev;
6675 	set_buffer_mapped(bh_result);
6676 	if (create) {
6677 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6678 			set_buffer_new(bh_result);
6679 
6680 		/*
6681 		 * Need to update the i_size under the extent lock so buffered
6682 		 * readers will get the updated i_size when we unlock.
6683 		 */
6684 		if (start + len > i_size_read(inode))
6685 			i_size_write(inode, start + len);
6686 
6687 		spin_lock(&BTRFS_I(inode)->lock);
6688 		BTRFS_I(inode)->outstanding_extents++;
6689 		spin_unlock(&BTRFS_I(inode)->lock);
6690 
6691 		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6692 				     lockstart + len - 1, EXTENT_DELALLOC, NULL,
6693 				     &cached_state, GFP_NOFS);
6694 		BUG_ON(ret);
6695 	}
6696 
6697 	/*
6698 	 * In the case of write we need to clear and unlock the entire range,
6699 	 * in the case of read we need to unlock only the end area that we
6700 	 * aren't using if there is any leftover space.
6701 	 */
6702 	if (lockstart < lockend) {
6703 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6704 				 lockend, unlock_bits, 1, 0,
6705 				 &cached_state, GFP_NOFS);
6706 	} else {
6707 		free_extent_state(cached_state);
6708 	}
6709 
6710 	free_extent_map(em);
6711 
6712 	return 0;
6713 
6714 unlock_err:
6715 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6716 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6717 	return ret;
6718 }
6719 
6720 static void btrfs_endio_direct_read(struct bio *bio, int err)
6721 {
6722 	struct btrfs_dio_private *dip = bio->bi_private;
6723 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
6724 	struct bio_vec *bvec = bio->bi_io_vec;
6725 	struct inode *inode = dip->inode;
6726 	struct btrfs_root *root = BTRFS_I(inode)->root;
6727 	struct bio *dio_bio;
6728 	u32 *csums = (u32 *)dip->csum;
6729 	int index = 0;
6730 	u64 start;
6731 
6732 	start = dip->logical_offset;
6733 	do {
6734 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6735 			struct page *page = bvec->bv_page;
6736 			char *kaddr;
6737 			u32 csum = ~(u32)0;
6738 			unsigned long flags;
6739 
6740 			local_irq_save(flags);
6741 			kaddr = kmap_atomic(page);
6742 			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
6743 					       csum, bvec->bv_len);
6744 			btrfs_csum_final(csum, (char *)&csum);
6745 			kunmap_atomic(kaddr);
6746 			local_irq_restore(flags);
6747 
6748 			flush_dcache_page(bvec->bv_page);
6749 			if (csum != csums[index]) {
6750 				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
6751 					  btrfs_ino(inode), start, csum,
6752 					  csums[index]);
6753 				err = -EIO;
6754 			}
6755 		}
6756 
6757 		start += bvec->bv_len;
6758 		bvec++;
6759 		index++;
6760 	} while (bvec <= bvec_end);
6761 
6762 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6763 		      dip->logical_offset + dip->bytes - 1);
6764 	dio_bio = dip->dio_bio;
6765 
6766 	kfree(dip);
6767 
6768 	/* If we had a csum failure make sure to clear the uptodate flag */
6769 	if (err)
6770 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6771 	dio_end_io(dio_bio, err);
6772 	bio_put(bio);
6773 }
6774 
6775 static void btrfs_endio_direct_write(struct bio *bio, int err)
6776 {
6777 	struct btrfs_dio_private *dip = bio->bi_private;
6778 	struct inode *inode = dip->inode;
6779 	struct btrfs_root *root = BTRFS_I(inode)->root;
6780 	struct btrfs_ordered_extent *ordered = NULL;
6781 	u64 ordered_offset = dip->logical_offset;
6782 	u64 ordered_bytes = dip->bytes;
6783 	struct bio *dio_bio;
6784 	int ret;
6785 
6786 	if (err)
6787 		goto out_done;
6788 again:
6789 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
6790 						   &ordered_offset,
6791 						   ordered_bytes, !err);
6792 	if (!ret)
6793 		goto out_test;
6794 
6795 	ordered->work.func = finish_ordered_fn;
6796 	ordered->work.flags = 0;
6797 	btrfs_queue_worker(&root->fs_info->endio_write_workers,
6798 			   &ordered->work);
6799 out_test:
6800 	/*
6801 	 * our bio might span multiple ordered extents.  If we haven't
6802 	 * completed the accounting for the whole dio, go back and try again
6803 	 */
6804 	if (ordered_offset < dip->logical_offset + dip->bytes) {
6805 		ordered_bytes = dip->logical_offset + dip->bytes -
6806 			ordered_offset;
6807 		ordered = NULL;
6808 		goto again;
6809 	}
6810 out_done:
6811 	dio_bio = dip->dio_bio;
6812 
6813 	kfree(dip);
6814 
6815 	/* If we had an error make sure to clear the uptodate flag */
6816 	if (err)
6817 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6818 	dio_end_io(dio_bio, err);
6819 	bio_put(bio);
6820 }
6821 
6822 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
6823 				    struct bio *bio, int mirror_num,
6824 				    unsigned long bio_flags, u64 offset)
6825 {
6826 	int ret;
6827 	struct btrfs_root *root = BTRFS_I(inode)->root;
6828 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
6829 	BUG_ON(ret); /* -ENOMEM */
6830 	return 0;
6831 }
6832 
6833 static void btrfs_end_dio_bio(struct bio *bio, int err)
6834 {
6835 	struct btrfs_dio_private *dip = bio->bi_private;
6836 
6837 	if (err) {
6838 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
6839 		      "sector %#Lx len %u err no %d\n",
6840 		      btrfs_ino(dip->inode), bio->bi_rw,
6841 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
6842 		dip->errors = 1;
6843 
6844 		/*
6845 		 * before the atomic variable goes to zero, we must make sure
6846 		 * dip->errors is perceived to be set.
6847 		 */
6848 		smp_mb__before_atomic_dec();
6849 	}
6850 
6851 	/* if there are more bios still pending for this dio, just exit */
6852 	if (!atomic_dec_and_test(&dip->pending_bios))
6853 		goto out;
6854 
6855 	if (dip->errors) {
6856 		bio_io_error(dip->orig_bio);
6857 	} else {
6858 		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
6859 		bio_endio(dip->orig_bio, 0);
6860 	}
6861 out:
6862 	bio_put(bio);
6863 }
6864 
6865 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
6866 				       u64 first_sector, gfp_t gfp_flags)
6867 {
6868 	int nr_vecs = bio_get_nr_vecs(bdev);
6869 	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
6870 }
6871 
6872 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6873 					 int rw, u64 file_offset, int skip_sum,
6874 					 int async_submit)
6875 {
6876 	struct btrfs_dio_private *dip = bio->bi_private;
6877 	int write = rw & REQ_WRITE;
6878 	struct btrfs_root *root = BTRFS_I(inode)->root;
6879 	int ret;
6880 
6881 	if (async_submit)
6882 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6883 
6884 	bio_get(bio);
6885 
6886 	if (!write) {
6887 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6888 		if (ret)
6889 			goto err;
6890 	}
6891 
6892 	if (skip_sum)
6893 		goto map;
6894 
6895 	if (write && async_submit) {
6896 		ret = btrfs_wq_submit_bio(root->fs_info,
6897 				   inode, rw, bio, 0, 0,
6898 				   file_offset,
6899 				   __btrfs_submit_bio_start_direct_io,
6900 				   __btrfs_submit_bio_done);
6901 		goto err;
6902 	} else if (write) {
6903 		/*
6904 		 * If we aren't doing async submit, calculate the csum of the
6905 		 * bio now.
6906 		 */
6907 		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6908 		if (ret)
6909 			goto err;
6910 	} else if (!skip_sum) {
6911 		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
6912 						file_offset);
6913 		if (ret)
6914 			goto err;
6915 	}
6916 
6917 map:
6918 	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
6919 err:
6920 	bio_put(bio);
6921 	return ret;
6922 }
6923 
6924 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6925 				    int skip_sum)
6926 {
6927 	struct inode *inode = dip->inode;
6928 	struct btrfs_root *root = BTRFS_I(inode)->root;
6929 	struct bio *bio;
6930 	struct bio *orig_bio = dip->orig_bio;
6931 	struct bio_vec *bvec = orig_bio->bi_io_vec;
6932 	u64 start_sector = orig_bio->bi_sector;
6933 	u64 file_offset = dip->logical_offset;
6934 	u64 submit_len = 0;
6935 	u64 map_length;
6936 	int nr_pages = 0;
6937 	int ret = 0;
6938 	int async_submit = 0;
6939 
6940 	map_length = orig_bio->bi_size;
6941 	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6942 			      &map_length, NULL, 0);
6943 	if (ret) {
6944 		bio_put(orig_bio);
6945 		return -EIO;
6946 	}
6947 
6948 	if (map_length >= orig_bio->bi_size) {
6949 		bio = orig_bio;
6950 		goto submit;
6951 	}
6952 
6953 	/* async crcs make it difficult to collect full stripe writes. */
6954 	if (btrfs_get_alloc_profile(root, 1) &
6955 	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
6956 		async_submit = 0;
6957 	else
6958 		async_submit = 1;
6959 
6960 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
6961 	if (!bio)
6962 		return -ENOMEM;
6963 	bio->bi_private = dip;
6964 	bio->bi_end_io = btrfs_end_dio_bio;
6965 	atomic_inc(&dip->pending_bios);
6966 
6967 	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
6968 		if (unlikely(map_length < submit_len + bvec->bv_len ||
6969 		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
6970 				 bvec->bv_offset) < bvec->bv_len)) {
6971 			/*
6972 			 * inc the count before we submit the bio so
6973 			 * we know the end IO handler won't run before
6974 			 * the count is incremented. Otherwise, the dip might get freed
6975 			 * before we're done setting it up
6976 			 */
6977 			atomic_inc(&dip->pending_bios);
6978 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
6979 						     file_offset, skip_sum,
6980 						     async_submit);
6981 			if (ret) {
6982 				bio_put(bio);
6983 				atomic_dec(&dip->pending_bios);
6984 				goto out_err;
6985 			}
6986 
6987 			start_sector += submit_len >> 9;
6988 			file_offset += submit_len;
6989 
6990 			submit_len = 0;
6991 			nr_pages = 0;
6992 
6993 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
6994 						  start_sector, GFP_NOFS);
6995 			if (!bio)
6996 				goto out_err;
6997 			bio->bi_private = dip;
6998 			bio->bi_end_io = btrfs_end_dio_bio;
6999 
7000 			map_length = orig_bio->bi_size;
7001 			ret = btrfs_map_block(root->fs_info, rw,
7002 					      start_sector << 9,
7003 					      &map_length, NULL, 0);
7004 			if (ret) {
7005 				bio_put(bio);
7006 				goto out_err;
7007 			}
7008 		} else {
7009 			submit_len += bvec->bv_len;
7010 			nr_pages++;
7011 			bvec++;
7012 		}
7013 	}
7014 
7015 submit:
7016 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7017 				     async_submit);
7018 	if (!ret)
7019 		return 0;
7020 
7021 	bio_put(bio);
7022 out_err:
7023 	dip->errors = 1;
7024 	/*
7025 	 * before the atomic variable goes to zero, we must
7026 	 * make sure dip->errors is perceived to be set.
7027 	 */
7028 	smp_mb__before_atomic_dec();
7029 	if (atomic_dec_and_test(&dip->pending_bios))
7030 		bio_io_error(dip->orig_bio);
7031 
7032 	/* bio_end_io() will handle error, so we needn't return it */
7033 	return 0;
7034 }
7035 
7036 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7037 				struct inode *inode, loff_t file_offset)
7038 {
7039 	struct btrfs_root *root = BTRFS_I(inode)->root;
7040 	struct btrfs_dio_private *dip;
7041 	struct bio *io_bio;
7042 	int skip_sum;
7043 	int sum_len;
7044 	int write = rw & REQ_WRITE;
7045 	int ret = 0;
7046 	u16 csum_size;
7047 
7048 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7049 
7050 	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7051 	if (!io_bio) {
7052 		ret = -ENOMEM;
7053 		goto free_ordered;
7054 	}
7055 
7056 	if (!skip_sum && !write) {
7057 		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7058 		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
7059 		sum_len *= csum_size;
7060 	} else {
7061 		sum_len = 0;
7062 	}
7063 
7064 	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7065 	if (!dip) {
7066 		ret = -ENOMEM;
7067 		goto free_io_bio;
7068 	}
7069 
7070 	dip->private = dio_bio->bi_private;
7071 	dip->inode = inode;
7072 	dip->logical_offset = file_offset;
7073 	dip->bytes = dio_bio->bi_size;
7074 	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7075 	io_bio->bi_private = dip;
7076 	dip->errors = 0;
7077 	dip->orig_bio = io_bio;
7078 	dip->dio_bio = dio_bio;
7079 	atomic_set(&dip->pending_bios, 0);
7080 
7081 	if (write)
7082 		io_bio->bi_end_io = btrfs_endio_direct_write;
7083 	else
7084 		io_bio->bi_end_io = btrfs_endio_direct_read;
7085 
7086 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7087 	if (!ret)
7088 		return;
7089 
7090 free_io_bio:
7091 	bio_put(io_bio);
7092 
7093 free_ordered:
7094 	/*
7095 	 * If this is a write, we need to clean up the reserved space and kill
7096 	 * the ordered extent.
7097 	 */
7098 	if (write) {
7099 		struct btrfs_ordered_extent *ordered;
7100 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7101 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7102 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7103 			btrfs_free_reserved_extent(root, ordered->start,
7104 						   ordered->disk_len);
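		/*
		 * Descriptive note (added, not in the original file): the two
		 * puts below are believed intentional, one for the reference
		 * taken by the lookup above and one to drop the ordered
		 * extent's initial reference, since its IO will never run.
		 */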
7105 		btrfs_put_ordered_extent(ordered);
7106 		btrfs_put_ordered_extent(ordered);
7107 	}
7108 	bio_endio(dio_bio, ret);
7109 }
7110 
7111 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7112 			const struct iovec *iov, loff_t offset,
7113 			unsigned long nr_segs)
7114 {
7115 	int seg;
7116 	int i;
7117 	size_t size;
7118 	unsigned long addr;
7119 	unsigned blocksize_mask = root->sectorsize - 1;
7120 	ssize_t retval = -EINVAL;
7121 	loff_t end = offset;
7122 
7123 	if (offset & blocksize_mask)
7124 		goto out;
7125 
7126 	/* Check the memory alignment.  Blocks cannot straddle pages */
7127 	for (seg = 0; seg < nr_segs; seg++) {
7128 		addr = (unsigned long)iov[seg].iov_base;
7129 		size = iov[seg].iov_len;
7130 		end += size;
7131 		if ((addr & blocksize_mask) || (size & blocksize_mask))
7132 			goto out;
7133 
7134 		/* If this is a write we don't need to check anymore */
7135 		if (rw & WRITE)
7136 			continue;
7137 
7138 		/*
7139 		 * Check to make sure we don't have duplicate iov_base's in this
7140 		 * iovec; if so we return -EINVAL, otherwise we'll get csum errors
7141 		 * when reading back.
7142 		 */
7143 		for (i = seg + 1; i < nr_segs; i++) {
7144 			if (iov[seg].iov_base == iov[i].iov_base)
7145 				goto out;
7146 		}
7147 	}
7148 	retval = 0;
7149 out:
7150 	return retval;
7151 }
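
/*
 * Worked example (illustrative only, not part of the original file):
 * with a 4k sectorsize the mask is 0xfff.  A single 8k iovec at a
 * page-aligned address with offset 4096 passes every check; the same
 * buffer at offset 2048, or one whose base address ends in 0x200,
 * trips the mask test and btrfs_direct_IO() returns 0 so the request
 * falls back to buffered IO.
 */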
7152 
7153 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7154 			const struct iovec *iov, loff_t offset,
7155 			unsigned long nr_segs)
7156 {
7157 	struct file *file = iocb->ki_filp;
7158 	struct inode *inode = file->f_mapping->host;
7159 	size_t count = 0;
7160 	int flags = 0;
7161 	bool wakeup = true;
7162 	bool relock = false;
7163 	ssize_t ret;
7164 
7165 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
7166 			    offset, nr_segs))
7167 		return 0;
7168 
7169 	atomic_inc(&inode->i_dio_count);
7170 	smp_mb__after_atomic_inc();
7171 
7172 	/*
7173 	 * The generic stuff only does filemap_write_and_wait_range, which isn't
7174 	 * enough if we've written compressed pages to this area, so we need to
7175 	 * call btrfs_wait_ordered_range to make absolutely sure that any
7176 	 * outstanding dirty pages are on disk.
7177 	 */
7178 	count = iov_length(iov, nr_segs);
7179 	btrfs_wait_ordered_range(inode, offset, count);
7180 
7181 	if (rw & WRITE) {
7182 		/*
7183 		 * If the write DIO is beyond the EOF, we need to update
7184 		 * the isize, but it is protected by i_mutex.  So we cannot
7185 		 * unlock the i_mutex in this case.
7186 		 */
7187 		if (offset + count <= inode->i_size) {
7188 			mutex_unlock(&inode->i_mutex);
7189 			relock = true;
7190 		}
7191 		ret = btrfs_delalloc_reserve_space(inode, count);
7192 		if (ret)
7193 			goto out;
7194 	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7195 				     &BTRFS_I(inode)->runtime_flags))) {
7196 		inode_dio_done(inode);
7197 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
7198 		wakeup = false;
7199 	}
7200 
7201 	ret = __blockdev_direct_IO(rw, iocb, inode,
7202 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7203 			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7204 			btrfs_submit_direct, flags);
7205 	if (rw & WRITE) {
7206 		if (ret < 0 && ret != -EIOCBQUEUED)
7207 			btrfs_delalloc_release_space(inode, count);
7208 		else if (ret >= 0 && (size_t)ret < count)
7209 			btrfs_delalloc_release_space(inode,
7210 						     count - (size_t)ret);
7211 		else
7212 			btrfs_delalloc_release_metadata(inode, 0);
7213 	}
7214 out:
7215 	if (wakeup)
7216 		inode_dio_done(inode);
7217 	if (relock)
7218 		mutex_lock(&inode->i_mutex);
7219 
7220 	return ret;
7221 }
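
/*
 * Worked example (illustrative only, not part of the original file) of
 * the space accounting above for a DIO write: with count = 1M reserved
 * up front, a short write of ret = 512k releases the unused 512k
 * (count - ret); a write that consumed everything (or was queued with
 * -EIOCBQUEUED) releases just the metadata reservation; any other
 * error hands the whole 1M back.
 */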
7222 
7223 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
7224 
7225 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7226 		__u64 start, __u64 len)
7227 {
7228 	int	ret;
7229 
7230 	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7231 	if (ret)
7232 		return ret;
7233 
7234 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7235 }
7236 
7237 int btrfs_readpage(struct file *file, struct page *page)
7238 {
7239 	struct extent_io_tree *tree;
7240 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7241 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7242 }
7243 
7244 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7245 {
7246 	struct extent_io_tree *tree;
7247 
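	/*
	 * If we're called from direct reclaim (PF_MEMALLOC), writing the
	 * page here could recurse into allocation and transaction work, so
	 * punt it back to the flusher threads by redirtying it.
	 */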
7249 	if (current->flags & PF_MEMALLOC) {
7250 		redirty_page_for_writepage(wbc, page);
7251 		unlock_page(page);
7252 		return 0;
7253 	}
7254 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7255 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7256 }
7257 
7258 static int btrfs_writepages(struct address_space *mapping,
7259 			    struct writeback_control *wbc)
7260 {
7261 	struct extent_io_tree *tree;
7262 
7263 	tree = &BTRFS_I(mapping->host)->io_tree;
7264 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7265 }
7266 
7267 static int
7268 btrfs_readpages(struct file *file, struct address_space *mapping,
7269 		struct list_head *pages, unsigned nr_pages)
7270 {
7271 	struct extent_io_tree *tree;
7272 	tree = &BTRFS_I(mapping->host)->io_tree;
7273 	return extent_readpages(tree, mapping, pages, nr_pages,
7274 				btrfs_get_extent);
7275 }

static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7277 {
7278 	struct extent_io_tree *tree;
7279 	struct extent_map_tree *map;
7280 	int ret;
7281 
7282 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7283 	map = &BTRFS_I(page->mapping->host)->extent_tree;
7284 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7285 	if (ret == 1) {
7286 		ClearPagePrivate(page);
7287 		set_page_private(page, 0);
7288 		page_cache_release(page);
7289 	}
7290 	return ret;
7291 }
7292 
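/*
 * A page that is still dirty or under writeback cannot give up its extent
 * state yet, so refuse to release it; everything else goes through
 * __btrfs_releasepage() above.
 */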
7293 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7294 {
7295 	if (PageWriteback(page) || PageDirty(page))
7296 		return 0;
7297 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7298 }
7299 
7300 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7301 				 unsigned int length)
7302 {
7303 	struct inode *inode = page->mapping->host;
7304 	struct extent_io_tree *tree;
7305 	struct btrfs_ordered_extent *ordered;
7306 	struct extent_state *cached_state = NULL;
7307 	u64 page_start = page_offset(page);
7308 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7309 
7310 	/*
7311 	 * we have the page locked, so new writeback can't start,
7312 	 * and the dirty bit won't be cleared while we are here.
7313 	 *
7314 	 * Wait for IO on this page so that we can safely clear
7315 	 * the PagePrivate2 bit and do ordered accounting
7316 	 */
7317 	wait_on_page_writeback(page);
7318 
7319 	tree = &BTRFS_I(inode)->io_tree;
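	/*
	 * A nonzero offset means only part of the page is invalidated; keep
	 * the ordered accounting intact and just attempt a plain release.
	 */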
7320 	if (offset) {
7321 		btrfs_releasepage(page, GFP_NOFS);
7322 		return;
7323 	}
7324 	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7325 	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
7326 	if (ordered) {
7327 		/*
7328 		 * IO on this page will never be started, so we need
7329 		 * to account for any ordered extents now
7330 		 */
7331 		clear_extent_bit(tree, page_start, page_end,
7332 				 EXTENT_DIRTY | EXTENT_DELALLOC |
7333 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7334 				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
7335 		/*
7336 		 * whoever cleared the private bit is responsible
7337 		 * for the finish_ordered_io
7338 		 */
7339 		if (TestClearPagePrivate2(page) &&
7340 		    btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
7341 						   PAGE_CACHE_SIZE, 1)) {
7342 			btrfs_finish_ordered_io(ordered);
7343 		}
7344 		btrfs_put_ordered_extent(ordered);
7345 		cached_state = NULL;
7346 		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7347 	}
7348 	clear_extent_bit(tree, page_start, page_end,
7349 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7350 		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7351 		 &cached_state, GFP_NOFS);
7352 	__btrfs_releasepage(page, GFP_NOFS);
7353 
7354 	ClearPageChecked(page);
7355 	if (PagePrivate(page)) {
7356 		ClearPagePrivate(page);
7357 		set_page_private(page, 0);
7358 		page_cache_release(page);
7359 	}
7360 }
7361 
7362 /*
7363  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7364  * called from a page fault handler when a page is first dirtied. Hence we must
7365  * be careful to check for EOF conditions here. We set the page up correctly
7366  * for a written page which means we get ENOSPC checking when writing into
7367  * holes and correct delalloc and unwritten extent mapping on filesystems that
7368  * support these features.
7369  *
7370  * We are not allowed to take the i_mutex here so we have to play games to
7371  * protect against truncate races as the page could now be beyond EOF.  Because
7372  * vmtruncate() writes the inode size before removing pages, once we have the
7373  * page lock we can determine safely if the page is beyond EOF. If it is not
7374  * beyond EOF, then the page is guaranteed safe against truncation until we
7375  * unlock the page.
7376  */
7377 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7378 {
7379 	struct page *page = vmf->page;
7380 	struct inode *inode = file_inode(vma->vm_file);
7381 	struct btrfs_root *root = BTRFS_I(inode)->root;
7382 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7383 	struct btrfs_ordered_extent *ordered;
7384 	struct extent_state *cached_state = NULL;
7385 	char *kaddr;
7386 	unsigned long zero_start;
7387 	loff_t size;
7388 	int ret;
7389 	int reserved = 0;
7390 	u64 page_start;
7391 	u64 page_end;
7392 
7393 	sb_start_pagefault(inode->i_sb);
7394 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7395 	if (!ret) {
7396 		ret = file_update_time(vma->vm_file);
7397 		reserved = 1;
7398 	}
7399 	if (ret) {
7400 		if (ret == -ENOMEM)
7401 			ret = VM_FAULT_OOM;
7402 		else /* -ENOSPC, -EIO, etc */
7403 			ret = VM_FAULT_SIGBUS;
7404 		if (reserved)
7405 			goto out;
7406 		goto out_noreserve;
7407 	}
7408 
7409 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7410 again:
7411 	lock_page(page);
7412 	size = i_size_read(inode);
7413 	page_start = page_offset(page);
7414 	page_end = page_start + PAGE_CACHE_SIZE - 1;
7415 
7416 	if ((page->mapping != inode->i_mapping) ||
7417 	    (page_start >= size)) {
7418 		/* page got truncated out from underneath us */
7419 		goto out_unlock;
7420 	}
7421 	wait_on_page_writeback(page);
7422 
7423 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7424 	set_page_extent_mapped(page);
7425 
7426 	/*
7427 	 * we can't set the delalloc bits if there are pending ordered
7428 	 * extents.  Drop our locks and wait for them to finish
7429 	 */
7430 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
7431 	if (ordered) {
7432 		unlock_extent_cached(io_tree, page_start, page_end,
7433 				     &cached_state, GFP_NOFS);
7434 		unlock_page(page);
7435 		btrfs_start_ordered_extent(inode, ordered, 1);
7436 		btrfs_put_ordered_extent(ordered);
7437 		goto again;
7438 	}
7439 
	/*
	 * XXX - page_mkwrite gets called every time the page is dirtied, even
	 * if it was already dirty, so for space accounting reasons we need to
	 * clear any delalloc bits for the range we are about to save.  There
	 * is probably a better way to do this, but for now keep consistent
	 * with prepare_pages in the normal write path.
	 */
7447 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7448 			  EXTENT_DIRTY | EXTENT_DELALLOC |
7449 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7450 			  0, 0, &cached_state, GFP_NOFS);
7451 
7452 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7453 					&cached_state);
7454 	if (ret) {
7455 		unlock_extent_cached(io_tree, page_start, page_end,
7456 				     &cached_state, GFP_NOFS);
7457 		ret = VM_FAULT_SIGBUS;
7458 		goto out_unlock;
7459 	}
7460 	ret = 0;
7461 
7462 	/* page is wholly or partially inside EOF */
7463 	if (page_start + PAGE_CACHE_SIZE > size)
7464 		zero_start = size & ~PAGE_CACHE_MASK;
7465 	else
7466 		zero_start = PAGE_CACHE_SIZE;
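	/*
	 * e.g. with 4K pages, i_size = 10000 and page_start = 8192 give
	 * zero_start = 10000 & 0xfff = 1808, so bytes 1808..4095 of the
	 * page are zeroed below.
	 */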
7467 
7468 	if (zero_start != PAGE_CACHE_SIZE) {
7469 		kaddr = kmap(page);
7470 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7471 		flush_dcache_page(page);
7472 		kunmap(page);
7473 	}
7474 	ClearPageChecked(page);
7475 	set_page_dirty(page);
7476 	SetPageUptodate(page);
7477 
7478 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
7479 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7480 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7481 
7482 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7483 
7484 out_unlock:
7485 	if (!ret) {
7486 		sb_end_pagefault(inode->i_sb);
7487 		return VM_FAULT_LOCKED;
7488 	}
7489 	unlock_page(page);
7490 out:
7491 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7492 out_noreserve:
7493 	sb_end_pagefault(inode->i_sb);
7494 	return ret;
7495 }
7496 
7497 static int btrfs_truncate(struct inode *inode)
7498 {
7499 	struct btrfs_root *root = BTRFS_I(inode)->root;
7500 	struct btrfs_block_rsv *rsv;
7501 	int ret = 0;
7502 	int err = 0;
7503 	struct btrfs_trans_handle *trans;
7504 	u64 mask = root->sectorsize - 1;
7505 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7506 
7507 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7508 	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
7509 
	/*
	 * Yes ladies and gentlemen, this is indeed ugly.  The fact is we have
	 * 3 things going on here:
	 *
	 * 1) We need to reserve space for our orphan item and the space to
	 * delete our orphan item.  Lord knows we don't want to have a dangling
	 * orphan item because we didn't reserve space to remove it.
	 *
	 * 2) We need to reserve space to update our inode.
	 *
	 * 3) We need to have something to cache all the space that is going to
	 * be freed up by the truncate operation, but also have some slack
	 * space reserved in case it uses space during the truncate (thank you
	 * very much snapshotting).
	 *
	 * And we need these to all be separate.  The fact is we can use a lot
	 * of space doing the truncate, and we have no earthly idea how much
	 * space we will use, so we need the truncate reservation to be
	 * separate so it doesn't end up using space reserved for updating the
	 * inode or removing the orphan item.  We also need to be able to stop
	 * the transaction and start a new one, which means we need to be able
	 * to update the inode several times, and we have no way of knowing how
	 * many times that will be, so we can't just reserve 1 item for the
	 * entirety of the operation, so that has to be done separately as
	 * well.  Then there is the orphan item, which does indeed need to be
	 * held on to for the whole operation, and we need nobody to touch this
	 * reserved space except the orphan code.
	 *
	 * So that leaves us with
	 *
	 * 1) root->orphan_block_rsv - for the orphan deletion.
	 * 2) rsv - for the truncate reservation, which we will steal from the
	 * transaction reservation.
	 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
	 * updating the inode.
	 */
7546 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
7547 	if (!rsv)
7548 		return -ENOMEM;
7549 	rsv->size = min_size;
7550 	rsv->failfast = 1;
7551 
7552 	/*
7553 	 * 1 for the truncate slack space
7554 	 * 1 for updating the inode.
7555 	 */
7556 	trans = btrfs_start_transaction(root, 2);
7557 	if (IS_ERR(trans)) {
7558 		err = PTR_ERR(trans);
7559 		goto out;
7560 	}
7561 
7562 	/* Migrate the slack space for the truncate to our reserve */
7563 	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
7564 				      min_size);
7565 	BUG_ON(ret);
7566 
7567 	/*
7568 	 * setattr is responsible for setting the ordered_data_close flag,
7569 	 * but that is only tested during the last file release.  That
7570 	 * could happen well after the next commit, leaving a great big
7571 	 * window where new writes may get lost if someone chooses to write
7572 	 * to this file after truncating to zero
7573 	 *
7574 	 * The inode doesn't have any dirty data here, and so if we commit
7575 	 * this is a noop.  If someone immediately starts writing to the inode
7576 	 * it is very likely we'll catch some of their writes in this
7577 	 * transaction, and the commit will find this file on the ordered
7578 	 * data list with good things to send down.
7579 	 *
7580 	 * This is a best effort solution, there is still a window where
7581 	 * using truncate to replace the contents of the file will
7582 	 * end up with a zero length file after a crash.
7583 	 */
7584 	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7585 					   &BTRFS_I(inode)->runtime_flags))
7586 		btrfs_add_ordered_operation(trans, root, inode);
7587 
7588 	/*
7589 	 * So if we truncate and then write and fsync we normally would just
7590 	 * write the extents that changed, which is a problem if we need to
7591 	 * first truncate that entire inode.  So set this flag so we write out
7592 	 * all of the extents in the inode to the sync log so we're completely
7593 	 * safe.
7594 	 */
7595 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
7596 	trans->block_rsv = rsv;
7597 
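	/*
	 * Truncate in passes: each pass removes items until the reservation
	 * runs dry (-ENOSPC), then we end the transaction, refill rsv from
	 * the transaction reservation, and start a new one.
	 */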
7598 	while (1) {
7599 		ret = btrfs_truncate_inode_items(trans, root, inode,
7600 						 inode->i_size,
7601 						 BTRFS_EXTENT_DATA_KEY);
7602 		if (ret != -ENOSPC) {
7603 			err = ret;
7604 			break;
7605 		}
7606 
7607 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7608 		ret = btrfs_update_inode(trans, root, inode);
7609 		if (ret) {
7610 			err = ret;
7611 			break;
7612 		}
7613 
7614 		btrfs_end_transaction(trans, root);
7615 		btrfs_btree_balance_dirty(root);
7616 
7617 		trans = btrfs_start_transaction(root, 2);
7618 		if (IS_ERR(trans)) {
7619 			ret = err = PTR_ERR(trans);
7620 			trans = NULL;
7621 			break;
7622 		}
7623 
7624 		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7625 					      rsv, min_size);
7626 		BUG_ON(ret);	/* shouldn't happen */
7627 		trans->block_rsv = rsv;
7628 	}
7629 
7630 	if (ret == 0 && inode->i_nlink > 0) {
7631 		trans->block_rsv = root->orphan_block_rsv;
7632 		ret = btrfs_orphan_del(trans, inode);
7633 		if (ret)
7634 			err = ret;
7635 	}
7636 
7637 	if (trans) {
7638 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7639 		ret = btrfs_update_inode(trans, root, inode);
7640 		if (ret && !err)
7641 			err = ret;
7642 
7643 		ret = btrfs_end_transaction(trans, root);
7644 		btrfs_btree_balance_dirty(root);
7645 	}
7646 
7647 out:
7648 	btrfs_free_block_rsv(root, rsv);
7649 
7650 	if (ret && !err)
7651 		err = ret;
7652 
7653 	return err;
7654 }
7655 
7656 /*
7657  * create a new subvolume directory/inode (helper for the ioctl).
7658  */
7659 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7660 			     struct btrfs_root *new_root, u64 new_dirid)
7661 {
7662 	struct inode *inode;
7663 	int err;
7664 	u64 index = 0;
7665 
7666 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
7667 				new_dirid, new_dirid,
7668 				S_IFDIR | (~current_umask() & S_IRWXUGO),
7669 				&index);
7670 	if (IS_ERR(inode))
7671 		return PTR_ERR(inode);
7672 	inode->i_op = &btrfs_dir_inode_operations;
7673 	inode->i_fop = &btrfs_dir_file_operations;
7674 
7675 	set_nlink(inode, 1);
7676 	btrfs_i_size_write(inode, 0);
7677 
7678 	err = btrfs_update_inode(trans, new_root, inode);
7679 
7680 	iput(inode);
7681 	return err;
7682 }
7683 
7684 struct inode *btrfs_alloc_inode(struct super_block *sb)
7685 {
7686 	struct btrfs_inode *ei;
7687 	struct inode *inode;
7688 
7689 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
7690 	if (!ei)
7691 		return NULL;
7692 
7693 	ei->root = NULL;
7694 	ei->generation = 0;
7695 	ei->last_trans = 0;
7696 	ei->last_sub_trans = 0;
7697 	ei->logged_trans = 0;
7698 	ei->delalloc_bytes = 0;
7699 	ei->disk_i_size = 0;
7700 	ei->flags = 0;
7701 	ei->csum_bytes = 0;
7702 	ei->index_cnt = (u64)-1;
7703 	ei->last_unlink_trans = 0;
7704 	ei->last_log_commit = 0;
7705 
7706 	spin_lock_init(&ei->lock);
7707 	ei->outstanding_extents = 0;
7708 	ei->reserved_extents = 0;
7709 
7710 	ei->runtime_flags = 0;
7711 	ei->force_compress = BTRFS_COMPRESS_NONE;
7712 
7713 	ei->delayed_node = NULL;
7714 
7715 	inode = &ei->vfs_inode;
7716 	extent_map_tree_init(&ei->extent_tree);
7717 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
7718 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7719 	ei->io_tree.track_uptodate = 1;
7720 	ei->io_failure_tree.track_uptodate = 1;
7721 	atomic_set(&ei->sync_writers, 0);
7722 	mutex_init(&ei->log_mutex);
7723 	mutex_init(&ei->delalloc_mutex);
7724 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
7725 	INIT_LIST_HEAD(&ei->delalloc_inodes);
7726 	INIT_LIST_HEAD(&ei->ordered_operations);
7727 	RB_CLEAR_NODE(&ei->rb_node);
7728 
7729 	return inode;
7730 }
7731 
7732 static void btrfs_i_callback(struct rcu_head *head)
7733 {
7734 	struct inode *inode = container_of(head, struct inode, i_rcu);
7735 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7736 }
7737 
7738 void btrfs_destroy_inode(struct inode *inode)
7739 {
7740 	struct btrfs_ordered_extent *ordered;
7741 	struct btrfs_root *root = BTRFS_I(inode)->root;
7742 
7743 	WARN_ON(!hlist_empty(&inode->i_dentry));
7744 	WARN_ON(inode->i_data.nrpages);
7745 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
7746 	WARN_ON(BTRFS_I(inode)->reserved_extents);
7747 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
7748 	WARN_ON(BTRFS_I(inode)->csum_bytes);
7749 
7750 	/*
7751 	 * This can happen where we create an inode, but somebody else also
7752 	 * created the same inode and we need to destroy the one we already
7753 	 * created.
7754 	 */
7755 	if (!root)
7756 		goto free;
7757 
7758 	/*
7759 	 * Make sure we're properly removed from the ordered operation
7760 	 * lists.
7761 	 */
7762 	smp_mb();
7763 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7764 		spin_lock(&root->fs_info->ordered_root_lock);
7765 		list_del_init(&BTRFS_I(inode)->ordered_operations);
7766 		spin_unlock(&root->fs_info->ordered_root_lock);
7767 	}
7768 
7769 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
7770 		     &BTRFS_I(inode)->runtime_flags)) {
7771 		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
7772 			btrfs_ino(inode));
7773 		atomic_dec(&root->orphan_inodes);
7774 	}
7775 
7776 	while (1) {
7777 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7778 		if (!ordered)
7779 			break;
7780 		else {
7781 			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
7782 				ordered->file_offset, ordered->len);
7783 			btrfs_remove_ordered_extent(inode, ordered);
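			/*
			 * Put once for our lookup reference and once for the
			 * reference the normal IO-completion path would have
			 * dropped, since that IO will never run.
			 */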
7784 			btrfs_put_ordered_extent(ordered);
7785 			btrfs_put_ordered_extent(ordered);
7786 		}
7787 	}
7788 	inode_tree_del(inode);
7789 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
7790 free:
7791 	call_rcu(&inode->i_rcu, btrfs_i_callback);
7792 }
7793 
7794 int btrfs_drop_inode(struct inode *inode)
7795 {
7796 	struct btrfs_root *root = BTRFS_I(inode)->root;
7797 
7798 	if (root == NULL)
7799 		return 1;
7800 
7801 	/* the snap/subvol tree is on deleting */
7802 	if (btrfs_root_refs(&root->root_item) == 0 &&
7803 	    root != root->fs_info->tree_root)
7804 		return 1;
7805 	else
7806 		return generic_drop_inode(inode);
7807 }
7808 
7809 static void init_once(void *foo)
7810 {
7811 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
7812 
7813 	inode_init_once(&ei->vfs_inode);
7814 }
7815 
7816 void btrfs_destroy_cachep(void)
7817 {
7818 	/*
7819 	 * Make sure all delayed rcu free inodes are flushed before we
7820 	 * destroy cache.
7821 	 */
7822 	rcu_barrier();
7823 	if (btrfs_inode_cachep)
7824 		kmem_cache_destroy(btrfs_inode_cachep);
7825 	if (btrfs_trans_handle_cachep)
7826 		kmem_cache_destroy(btrfs_trans_handle_cachep);
7827 	if (btrfs_transaction_cachep)
7828 		kmem_cache_destroy(btrfs_transaction_cachep);
7829 	if (btrfs_path_cachep)
7830 		kmem_cache_destroy(btrfs_path_cachep);
7831 	if (btrfs_free_space_cachep)
7832 		kmem_cache_destroy(btrfs_free_space_cachep);
7833 	if (btrfs_delalloc_work_cachep)
7834 		kmem_cache_destroy(btrfs_delalloc_work_cachep);
7835 }
7836 
7837 int btrfs_init_cachep(void)
7838 {
7839 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7840 			sizeof(struct btrfs_inode), 0,
7841 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7842 	if (!btrfs_inode_cachep)
7843 		goto fail;
7844 
7845 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7846 			sizeof(struct btrfs_trans_handle), 0,
7847 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7848 	if (!btrfs_trans_handle_cachep)
7849 		goto fail;
7850 
7851 	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7852 			sizeof(struct btrfs_transaction), 0,
7853 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7854 	if (!btrfs_transaction_cachep)
7855 		goto fail;
7856 
7857 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
7858 			sizeof(struct btrfs_path), 0,
7859 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7860 	if (!btrfs_path_cachep)
7861 		goto fail;
7862 
7863 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7864 			sizeof(struct btrfs_free_space), 0,
7865 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7866 	if (!btrfs_free_space_cachep)
7867 		goto fail;
7868 
7869 	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7870 			sizeof(struct btrfs_delalloc_work), 0,
7871 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7872 			NULL);
7873 	if (!btrfs_delalloc_work_cachep)
7874 		goto fail;
7875 
7876 	return 0;
7877 fail:
7878 	btrfs_destroy_cachep();
7879 	return -ENOMEM;
7880 }
7881 
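/*
 * stat->blocks counts the bytes already allocated on disk plus any
 * outstanding delalloc bytes, each rounded up to the fs blocksize.
 */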
7882 static int btrfs_getattr(struct vfsmount *mnt,
7883 			 struct dentry *dentry, struct kstat *stat)
7884 {
7885 	u64 delalloc_bytes;
7886 	struct inode *inode = dentry->d_inode;
7887 	u32 blocksize = inode->i_sb->s_blocksize;
7888 
7889 	generic_fillattr(inode, stat);
7890 	stat->dev = BTRFS_I(inode)->root->anon_dev;
7891 	stat->blksize = PAGE_CACHE_SIZE;
7892 
7893 	spin_lock(&BTRFS_I(inode)->lock);
7894 	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
7895 	spin_unlock(&BTRFS_I(inode)->lock);
7896 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7897 			ALIGN(delalloc_bytes, blocksize)) >> 9;
7898 	return 0;
7899 }
7900 
7901 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7902 			   struct inode *new_dir, struct dentry *new_dentry)
7903 {
7904 	struct btrfs_trans_handle *trans;
7905 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
7906 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7907 	struct inode *new_inode = new_dentry->d_inode;
7908 	struct inode *old_inode = old_dentry->d_inode;
7909 	struct timespec ctime = CURRENT_TIME;
7910 	u64 index = 0;
7911 	u64 root_objectid;
7912 	int ret;
7913 	u64 old_ino = btrfs_ino(old_inode);
7914 
7915 	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
7916 		return -EPERM;
7917 
7918 	/* we only allow rename subvolume link between subvolumes */
7919 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
7920 		return -EXDEV;
7921 
7922 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
7923 	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
7924 		return -ENOTEMPTY;
7925 
7926 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
7927 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7928 		return -ENOTEMPTY;
7929 
7930 
	/* check for collisions, even if the name isn't there */
7932 	ret = btrfs_check_dir_item_collision(root, new_dir->i_ino,
7933 			     new_dentry->d_name.name,
7934 			     new_dentry->d_name.len);
7935 
7936 	if (ret) {
7937 		if (ret == -EEXIST) {
			/* we shouldn't get -EEXIST without a new_inode */
7940 			if (!new_inode) {
7941 				WARN_ON(1);
7942 				return ret;
7943 			}
7944 		} else {
7945 			/* maybe -EOVERFLOW */
7946 			return ret;
7947 		}
7948 	}
7949 	ret = 0;
7950 
	/*
	 * We're using rename to replace one file with another, and the
	 * replacement file is large.  Start IO on it now so we don't add
	 * too much work to the end of the transaction.
	 */
7956 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
7957 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
7958 		filemap_flush(old_inode->i_mapping);
7959 
7960 	/* close the racy window with snapshot create/destroy ioctl */
7961 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
7962 		down_read(&root->fs_info->subvol_sem);
7963 	/*
7964 	 * We want to reserve the absolute worst case amount of items.  So if
7965 	 * both inodes are subvols and we need to unlink them then that would
7966 	 * require 4 item modifications, but if they are both normal inodes it
	 * would require 5 item modifications, so we'll assume they're normal
7968 	 * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
7969 	 * should cover the worst case number of items we'll modify.
7970 	 */
7971 	trans = btrfs_start_transaction(root, 11);
7972 	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out_notrans;
	}
7976 
7977 	if (dest != root)
7978 		btrfs_record_root_in_trans(trans, dest);
7979 
7980 	ret = btrfs_set_inode_index(new_dir, &index);
7981 	if (ret)
7982 		goto out_fail;
7983 
7984 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
7985 		/* force full log commit if subvolume involved. */
7986 		root->fs_info->last_trans_log_full_commit = trans->transid;
7987 	} else {
7988 		ret = btrfs_insert_inode_ref(trans, dest,
7989 					     new_dentry->d_name.name,
7990 					     new_dentry->d_name.len,
7991 					     old_ino,
7992 					     btrfs_ino(new_dir), index);
7993 		if (ret)
7994 			goto out_fail;
7995 		/*
7996 		 * this is an ugly little race, but the rename is required
7997 		 * to make sure that if we crash, the inode is either at the
7998 		 * old name or the new one.  pinning the log transaction lets
7999 		 * us make sure we don't allow a log commit to come in after
8000 		 * we unlink the name but before we add the new name back in.
8001 		 */
8002 		btrfs_pin_log_trans(root);
8003 	}
8004 	/*
8005 	 * make sure the inode gets flushed if it is replacing
8006 	 * something.
8007 	 */
8008 	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8009 		btrfs_add_ordered_operation(trans, root, old_inode);
8010 
8011 	inode_inc_iversion(old_dir);
8012 	inode_inc_iversion(new_dir);
8013 	inode_inc_iversion(old_inode);
8014 	old_dir->i_ctime = old_dir->i_mtime = ctime;
8015 	new_dir->i_ctime = new_dir->i_mtime = ctime;
8016 	old_inode->i_ctime = ctime;
8017 
8018 	if (old_dentry->d_parent != new_dentry->d_parent)
8019 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8020 
8021 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8022 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8023 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8024 					old_dentry->d_name.name,
8025 					old_dentry->d_name.len);
8026 	} else {
8027 		ret = __btrfs_unlink_inode(trans, root, old_dir,
8028 					old_dentry->d_inode,
8029 					old_dentry->d_name.name,
8030 					old_dentry->d_name.len);
8031 		if (!ret)
8032 			ret = btrfs_update_inode(trans, root, old_inode);
8033 	}
8034 	if (ret) {
8035 		btrfs_abort_transaction(trans, root, ret);
8036 		goto out_fail;
8037 	}
8038 
8039 	if (new_inode) {
8040 		inode_inc_iversion(new_inode);
8041 		new_inode->i_ctime = CURRENT_TIME;
8042 		if (unlikely(btrfs_ino(new_inode) ==
8043 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8044 			root_objectid = BTRFS_I(new_inode)->location.objectid;
8045 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
8046 						root_objectid,
8047 						new_dentry->d_name.name,
8048 						new_dentry->d_name.len);
8049 			BUG_ON(new_inode->i_nlink == 0);
8050 		} else {
8051 			ret = btrfs_unlink_inode(trans, dest, new_dir,
8052 						 new_dentry->d_inode,
8053 						 new_dentry->d_name.name,
8054 						 new_dentry->d_name.len);
8055 		}
8056 		if (!ret && new_inode->i_nlink == 0)
8057 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8058 		if (ret) {
8059 			btrfs_abort_transaction(trans, root, ret);
8060 			goto out_fail;
8061 		}
8062 	}
8063 
8064 	ret = btrfs_add_link(trans, new_dir, old_inode,
8065 			     new_dentry->d_name.name,
8066 			     new_dentry->d_name.len, 0, index);
8067 	if (ret) {
8068 		btrfs_abort_transaction(trans, root, ret);
8069 		goto out_fail;
8070 	}
8071 
8072 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8073 		struct dentry *parent = new_dentry->d_parent;
8074 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
8075 		btrfs_end_log_trans(root);
8076 	}
8077 out_fail:
8078 	btrfs_end_transaction(trans, root);
8079 out_notrans:
8080 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8081 		up_read(&root->fs_info->subvol_sem);
8082 
8083 	return ret;
8084 }
8085 
8086 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8087 {
8088 	struct btrfs_delalloc_work *delalloc_work;
8089 
8090 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
8091 				     work);
8092 	if (delalloc_work->wait)
8093 		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
8094 	else
8095 		filemap_flush(delalloc_work->inode->i_mapping);
8096 
8097 	if (delalloc_work->delay_iput)
8098 		btrfs_add_delayed_iput(delalloc_work->inode);
8099 	else
8100 		iput(delalloc_work->inode);
8101 	complete(&delalloc_work->completion);
8102 }
8103 
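/*
 * Allocate a flush work item for one inode.  The caller queues work->work
 * on a worker thread (see __start_delalloc_inodes()) and must pair it with
 * btrfs_wait_and_free_delalloc_work() to reap the completion.
 */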
8104 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8105 						    int wait, int delay_iput)
8106 {
8107 	struct btrfs_delalloc_work *work;
8108 
8109 	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8110 	if (!work)
8111 		return NULL;
8112 
8113 	init_completion(&work->completion);
8114 	INIT_LIST_HEAD(&work->list);
8115 	work->inode = inode;
8116 	work->wait = wait;
8117 	work->delay_iput = delay_iput;
8118 	work->work.func = btrfs_run_delalloc_work;
8119 
8120 	return work;
8121 }
8122 
8123 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8124 {
8125 	wait_for_completion(&work->completion);
8126 	kmem_cache_free(btrfs_delalloc_work_cachep, work);
8127 }
8128 
8129 /*
8130  * some fairly slow code that needs optimization. This walks the list
8131  * of all the inodes with pending delalloc and forces them to disk.
8132  */
8133 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8134 {
8135 	struct btrfs_inode *binode;
8136 	struct inode *inode;
8137 	struct btrfs_delalloc_work *work, *next;
8138 	struct list_head works;
8139 	struct list_head splice;
8140 	int ret = 0;
8141 
8142 	INIT_LIST_HEAD(&works);
8143 	INIT_LIST_HEAD(&splice);
8144 
8145 	spin_lock(&root->delalloc_lock);
8146 	list_splice_init(&root->delalloc_inodes, &splice);
8147 	while (!list_empty(&splice)) {
8148 		binode = list_entry(splice.next, struct btrfs_inode,
8149 				    delalloc_inodes);
8150 
8151 		list_move_tail(&binode->delalloc_inodes,
8152 			       &root->delalloc_inodes);
8153 		inode = igrab(&binode->vfs_inode);
8154 		if (!inode) {
8155 			cond_resched_lock(&root->delalloc_lock);
8156 			continue;
8157 		}
8158 		spin_unlock(&root->delalloc_lock);
8159 
8160 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8161 		if (unlikely(!work)) {
8162 			ret = -ENOMEM;
8163 			goto out;
8164 		}
8165 		list_add_tail(&work->list, &works);
8166 		btrfs_queue_worker(&root->fs_info->flush_workers,
8167 				   &work->work);
8168 
8169 		cond_resched();
8170 		spin_lock(&root->delalloc_lock);
8171 	}
8172 	spin_unlock(&root->delalloc_lock);
8173 
8174 	list_for_each_entry_safe(work, next, &works, list) {
8175 		list_del_init(&work->list);
8176 		btrfs_wait_and_free_delalloc_work(work);
8177 	}
8178 	return 0;
8179 out:
8180 	list_for_each_entry_safe(work, next, &works, list) {
8181 		list_del_init(&work->list);
8182 		btrfs_wait_and_free_delalloc_work(work);
8183 	}
8184 
8185 	if (!list_empty_careful(&splice)) {
8186 		spin_lock(&root->delalloc_lock);
8187 		list_splice_tail(&splice, &root->delalloc_inodes);
8188 		spin_unlock(&root->delalloc_lock);
8189 	}
8190 	return ret;
8191 }
8192 
8193 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8194 {
8195 	int ret;
8196 
8197 	if (root->fs_info->sb->s_flags & MS_RDONLY)
8198 		return -EROFS;
8199 
8200 	ret = __start_delalloc_inodes(root, delay_iput);
8201 	/*
8202 	 * the filemap_flush will queue IO into the worker threads, but
8203 	 * we have to make sure the IO is actually started and that
8204 	 * ordered extents get created before we return
8205 	 */
8206 	atomic_inc(&root->fs_info->async_submit_draining);
8207 	while (atomic_read(&root->fs_info->nr_async_submits) ||
8208 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
8209 		wait_event(root->fs_info->async_submit_wait,
8210 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8211 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8212 	}
8213 	atomic_dec(&root->fs_info->async_submit_draining);
8214 	return ret;
8215 }
8216 
8217 int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8218 				    int delay_iput)
8219 {
8220 	struct btrfs_root *root;
8221 	struct list_head splice;
8222 	int ret;
8223 
8224 	if (fs_info->sb->s_flags & MS_RDONLY)
8225 		return -EROFS;
8226 
8227 	INIT_LIST_HEAD(&splice);
8228 
8229 	spin_lock(&fs_info->delalloc_root_lock);
8230 	list_splice_init(&fs_info->delalloc_roots, &splice);
8231 	while (!list_empty(&splice)) {
8232 		root = list_first_entry(&splice, struct btrfs_root,
8233 					delalloc_root);
8234 		root = btrfs_grab_fs_root(root);
8235 		BUG_ON(!root);
8236 		list_move_tail(&root->delalloc_root,
8237 			       &fs_info->delalloc_roots);
8238 		spin_unlock(&fs_info->delalloc_root_lock);
8239 
8240 		ret = __start_delalloc_inodes(root, delay_iput);
8241 		btrfs_put_fs_root(root);
8242 		if (ret)
8243 			goto out;
8244 
8245 		spin_lock(&fs_info->delalloc_root_lock);
8246 	}
8247 	spin_unlock(&fs_info->delalloc_root_lock);
8248 
8249 	atomic_inc(&fs_info->async_submit_draining);
8250 	while (atomic_read(&fs_info->nr_async_submits) ||
8251 	      atomic_read(&fs_info->async_delalloc_pages)) {
8252 		wait_event(fs_info->async_submit_wait,
8253 		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
8254 		    atomic_read(&fs_info->async_delalloc_pages) == 0));
8255 	}
8256 	atomic_dec(&fs_info->async_submit_draining);
8257 	return 0;
8258 out:
8259 	if (!list_empty_careful(&splice)) {
8260 		spin_lock(&fs_info->delalloc_root_lock);
8261 		list_splice_tail(&splice, &fs_info->delalloc_roots);
8262 		spin_unlock(&fs_info->delalloc_root_lock);
8263 	}
8264 	return ret;
8265 }
8266 
8267 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8268 			 const char *symname)
8269 {
8270 	struct btrfs_trans_handle *trans;
8271 	struct btrfs_root *root = BTRFS_I(dir)->root;
8272 	struct btrfs_path *path;
8273 	struct btrfs_key key;
8274 	struct inode *inode = NULL;
8275 	int err;
8276 	int drop_inode = 0;
8277 	u64 objectid;
	u64 index = 0;
8279 	int name_len;
8280 	int datasize;
8281 	unsigned long ptr;
8282 	struct btrfs_file_extent_item *ei;
8283 	struct extent_buffer *leaf;
8284 
8285 	name_len = strlen(symname) + 1;
8286 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8287 		return -ENAMETOOLONG;
8288 
8289 	/*
8290 	 * 2 items for inode item and ref
8291 	 * 2 items for dir items
8292 	 * 1 item for xattr if selinux is on
8293 	 */
8294 	trans = btrfs_start_transaction(root, 5);
8295 	if (IS_ERR(trans))
8296 		return PTR_ERR(trans);
8297 
8298 	err = btrfs_find_free_ino(root, &objectid);
8299 	if (err)
8300 		goto out_unlock;
8301 
8302 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8303 				dentry->d_name.len, btrfs_ino(dir), objectid,
8304 				S_IFLNK|S_IRWXUGO, &index);
8305 	if (IS_ERR(inode)) {
8306 		err = PTR_ERR(inode);
8307 		goto out_unlock;
8308 	}
8309 
8310 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8311 	if (err) {
8312 		drop_inode = 1;
8313 		goto out_unlock;
8314 	}
8315 
	/*
	 * If the active LSM wants to access the inode during
	 * d_instantiate it needs these. Smack checks to see
	 * if the filesystem supports xattrs by looking at the
	 * ops vector.
	 */
8322 	inode->i_fop = &btrfs_file_operations;
8323 	inode->i_op = &btrfs_file_inode_operations;
8324 
8325 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8326 	if (err)
8327 		drop_inode = 1;
8328 	else {
8329 		inode->i_mapping->a_ops = &btrfs_aops;
8330 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8331 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8332 	}
8333 	if (drop_inode)
8334 		goto out_unlock;
8335 
8336 	path = btrfs_alloc_path();
8337 	if (!path) {
8338 		err = -ENOMEM;
8339 		drop_inode = 1;
8340 		goto out_unlock;
8341 	}
8342 	key.objectid = btrfs_ino(inode);
8343 	key.offset = 0;
8344 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8345 	datasize = btrfs_file_extent_calc_inline_size(name_len);
8346 	err = btrfs_insert_empty_item(trans, root, path, &key,
8347 				      datasize);
8348 	if (err) {
8349 		drop_inode = 1;
8350 		btrfs_free_path(path);
8351 		goto out_unlock;
8352 	}
8353 	leaf = path->nodes[0];
8354 	ei = btrfs_item_ptr(leaf, path->slots[0],
8355 			    struct btrfs_file_extent_item);
8356 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8357 	btrfs_set_file_extent_type(leaf, ei,
8358 				   BTRFS_FILE_EXTENT_INLINE);
8359 	btrfs_set_file_extent_encryption(leaf, ei, 0);
8360 	btrfs_set_file_extent_compression(leaf, ei, 0);
8361 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8362 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8363 
8364 	ptr = btrfs_file_extent_inline_start(ei);
8365 	write_extent_buffer(leaf, symname, ptr, name_len);
8366 	btrfs_mark_buffer_dirty(leaf);
8367 	btrfs_free_path(path);
8368 
8369 	inode->i_op = &btrfs_symlink_inode_operations;
8370 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
8371 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
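	/* i_size is the target length without the NUL stored in the extent */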
8372 	inode_set_bytes(inode, name_len);
8373 	btrfs_i_size_write(inode, name_len - 1);
8374 	err = btrfs_update_inode(trans, root, inode);
8375 	if (err)
8376 		drop_inode = 1;
8377 
8378 out_unlock:
8379 	if (!err)
8380 		d_instantiate(dentry, inode);
8381 	btrfs_end_transaction(trans, root);
8382 	if (drop_inode) {
8383 		inode_dec_link_count(inode);
8384 		iput(inode);
8385 	}
8386 	btrfs_btree_balance_dirty(root);
8387 	return err;
8388 }
8389 
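/*
 * Allocate PREALLOC file extents covering [start, start + num_bytes).
 * Each extent is at most 256M and at least min_size bytes; when no
 * transaction is passed in, one is started and ended around each extent so
 * a large preallocation doesn't pin a single transaction open throughout.
 */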
8390 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8391 				       u64 start, u64 num_bytes, u64 min_size,
8392 				       loff_t actual_len, u64 *alloc_hint,
8393 				       struct btrfs_trans_handle *trans)
8394 {
8395 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8396 	struct extent_map *em;
8397 	struct btrfs_root *root = BTRFS_I(inode)->root;
8398 	struct btrfs_key ins;
8399 	u64 cur_offset = start;
8400 	u64 i_size;
8401 	u64 cur_bytes;
8402 	int ret = 0;
8403 	bool own_trans = true;
8404 
8405 	if (trans)
8406 		own_trans = false;
8407 	while (num_bytes > 0) {
8408 		if (own_trans) {
8409 			trans = btrfs_start_transaction(root, 3);
8410 			if (IS_ERR(trans)) {
8411 				ret = PTR_ERR(trans);
8412 				break;
8413 			}
8414 		}
8415 
8416 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8417 		cur_bytes = max(cur_bytes, min_size);
8418 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8419 					   *alloc_hint, &ins, 1);
8420 		if (ret) {
8421 			if (own_trans)
8422 				btrfs_end_transaction(trans, root);
8423 			break;
8424 		}
8425 
8426 		ret = insert_reserved_file_extent(trans, inode,
8427 						  cur_offset, ins.objectid,
8428 						  ins.offset, ins.offset,
8429 						  ins.offset, 0, 0, 0,
8430 						  BTRFS_FILE_EXTENT_PREALLOC);
8431 		if (ret) {
8432 			btrfs_abort_transaction(trans, root, ret);
8433 			if (own_trans)
8434 				btrfs_end_transaction(trans, root);
8435 			break;
8436 		}
8437 		btrfs_drop_extent_cache(inode, cur_offset,
					cur_offset + ins.offset - 1, 0);
8439 
8440 		em = alloc_extent_map();
8441 		if (!em) {
8442 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8443 				&BTRFS_I(inode)->runtime_flags);
8444 			goto next;
8445 		}
8446 
8447 		em->start = cur_offset;
8448 		em->orig_start = cur_offset;
8449 		em->len = ins.offset;
8450 		em->block_start = ins.objectid;
8451 		em->block_len = ins.offset;
8452 		em->orig_block_len = ins.offset;
8453 		em->ram_bytes = ins.offset;
8454 		em->bdev = root->fs_info->fs_devices->latest_bdev;
8455 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8456 		em->generation = trans->transid;
8457 
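		/*
		 * If a stale mapping already covers this range the insert
		 * returns -EEXIST; drop it from the cache and try again.
		 */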
8458 		while (1) {
8459 			write_lock(&em_tree->lock);
8460 			ret = add_extent_mapping(em_tree, em, 1);
8461 			write_unlock(&em_tree->lock);
8462 			if (ret != -EEXIST)
8463 				break;
8464 			btrfs_drop_extent_cache(inode, cur_offset,
8465 						cur_offset + ins.offset - 1,
8466 						0);
8467 		}
8468 		free_extent_map(em);
8469 next:
8470 		num_bytes -= ins.offset;
8471 		cur_offset += ins.offset;
8472 		*alloc_hint = ins.objectid + ins.offset;
8473 
8474 		inode_inc_iversion(inode);
8475 		inode->i_ctime = CURRENT_TIME;
8476 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8477 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8478 		    (actual_len > inode->i_size) &&
8479 		    (cur_offset > inode->i_size)) {
8480 			if (cur_offset > actual_len)
8481 				i_size = actual_len;
8482 			else
8483 				i_size = cur_offset;
8484 			i_size_write(inode, i_size);
8485 			btrfs_ordered_update_i_size(inode, i_size, NULL);
8486 		}
8487 
8488 		ret = btrfs_update_inode(trans, root, inode);
8489 
8490 		if (ret) {
8491 			btrfs_abort_transaction(trans, root, ret);
8492 			if (own_trans)
8493 				btrfs_end_transaction(trans, root);
8494 			break;
8495 		}
8496 
8497 		if (own_trans)
8498 			btrfs_end_transaction(trans, root);
8499 	}
8500 	return ret;
8501 }
8502 
8503 int btrfs_prealloc_file_range(struct inode *inode, int mode,
8504 			      u64 start, u64 num_bytes, u64 min_size,
8505 			      loff_t actual_len, u64 *alloc_hint)
8506 {
8507 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8508 					   min_size, actual_len, alloc_hint,
8509 					   NULL);
8510 }
8511 
8512 int btrfs_prealloc_file_range_trans(struct inode *inode,
8513 				    struct btrfs_trans_handle *trans, int mode,
8514 				    u64 start, u64 num_bytes, u64 min_size,
8515 				    loff_t actual_len, u64 *alloc_hint)
8516 {
8517 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8518 					   min_size, actual_len, alloc_hint, trans);
8519 }
8520 
8521 static int btrfs_set_page_dirty(struct page *page)
8522 {
8523 	return __set_page_dirty_nobuffers(page);
8524 }
8525 
8526 static int btrfs_permission(struct inode *inode, int mask)
8527 {
8528 	struct btrfs_root *root = BTRFS_I(inode)->root;
8529 	umode_t mode = inode->i_mode;
8530 
8531 	if (mask & MAY_WRITE &&
8532 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
8533 		if (btrfs_root_readonly(root))
8534 			return -EROFS;
8535 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
8536 			return -EACCES;
8537 	}
8538 	return generic_permission(inode, mask);
8539 }
8540 
8541 static const struct inode_operations btrfs_dir_inode_operations = {
8542 	.getattr	= btrfs_getattr,
8543 	.lookup		= btrfs_lookup,
8544 	.create		= btrfs_create,
8545 	.unlink		= btrfs_unlink,
8546 	.link		= btrfs_link,
8547 	.mkdir		= btrfs_mkdir,
8548 	.rmdir		= btrfs_rmdir,
8549 	.rename		= btrfs_rename,
8550 	.symlink	= btrfs_symlink,
8551 	.setattr	= btrfs_setattr,
8552 	.mknod		= btrfs_mknod,
8553 	.setxattr	= btrfs_setxattr,
8554 	.getxattr	= btrfs_getxattr,
8555 	.listxattr	= btrfs_listxattr,
8556 	.removexattr	= btrfs_removexattr,
8557 	.permission	= btrfs_permission,
8558 	.get_acl	= btrfs_get_acl,
8559 };
8560 static const struct inode_operations btrfs_dir_ro_inode_operations = {
8561 	.lookup		= btrfs_lookup,
8562 	.permission	= btrfs_permission,
8563 	.get_acl	= btrfs_get_acl,
8564 };
8565 
8566 static const struct file_operations btrfs_dir_file_operations = {
8567 	.llseek		= generic_file_llseek,
8568 	.read		= generic_read_dir,
8569 	.iterate	= btrfs_real_readdir,
8570 	.unlocked_ioctl	= btrfs_ioctl,
8571 #ifdef CONFIG_COMPAT
8572 	.compat_ioctl	= btrfs_ioctl,
8573 #endif
8574 	.release        = btrfs_release_file,
8575 	.fsync		= btrfs_sync_file,
8576 };
8577 
8578 static struct extent_io_ops btrfs_extent_io_ops = {
8579 	.fill_delalloc = run_delalloc_range,
8580 	.submit_bio_hook = btrfs_submit_bio_hook,
8581 	.merge_bio_hook = btrfs_merge_bio_hook,
8582 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
8583 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
8584 	.writepage_start_hook = btrfs_writepage_start_hook,
8585 	.set_bit_hook = btrfs_set_bit_hook,
8586 	.clear_bit_hook = btrfs_clear_bit_hook,
8587 	.merge_extent_hook = btrfs_merge_extent_hook,
8588 	.split_extent_hook = btrfs_split_extent_hook,
8589 };
8590 
8591 /*
8592  * btrfs doesn't support the bmap operation because swapfiles
8593  * use bmap to make a mapping of extents in the file.  They assume
8594  * these extents won't change over the life of the file and they
8595  * use the bmap result to do IO directly to the drive.
8596  *
8597  * the btrfs bmap call would return logical addresses that aren't
8598  * suitable for IO and they also will change frequently as COW
8599  * operations happen.  So, swapfile + btrfs == corruption.
8600  *
8601  * For now we're avoiding this by dropping bmap.
8602  */
8603 static const struct address_space_operations btrfs_aops = {
8604 	.readpage	= btrfs_readpage,
8605 	.writepage	= btrfs_writepage,
8606 	.writepages	= btrfs_writepages,
8607 	.readpages	= btrfs_readpages,
8608 	.direct_IO	= btrfs_direct_IO,
8609 	.invalidatepage = btrfs_invalidatepage,
8610 	.releasepage	= btrfs_releasepage,
8611 	.set_page_dirty	= btrfs_set_page_dirty,
8612 	.error_remove_page = generic_error_remove_page,
8613 };
8614 
8615 static const struct address_space_operations btrfs_symlink_aops = {
8616 	.readpage	= btrfs_readpage,
8617 	.writepage	= btrfs_writepage,
8618 	.invalidatepage = btrfs_invalidatepage,
8619 	.releasepage	= btrfs_releasepage,
8620 };
8621 
8622 static const struct inode_operations btrfs_file_inode_operations = {
8623 	.getattr	= btrfs_getattr,
8624 	.setattr	= btrfs_setattr,
8625 	.setxattr	= btrfs_setxattr,
8626 	.getxattr	= btrfs_getxattr,
8627 	.listxattr      = btrfs_listxattr,
8628 	.removexattr	= btrfs_removexattr,
8629 	.permission	= btrfs_permission,
8630 	.fiemap		= btrfs_fiemap,
8631 	.get_acl	= btrfs_get_acl,
8632 	.update_time	= btrfs_update_time,
8633 };
8634 static const struct inode_operations btrfs_special_inode_operations = {
8635 	.getattr	= btrfs_getattr,
8636 	.setattr	= btrfs_setattr,
8637 	.permission	= btrfs_permission,
8638 	.setxattr	= btrfs_setxattr,
8639 	.getxattr	= btrfs_getxattr,
8640 	.listxattr	= btrfs_listxattr,
8641 	.removexattr	= btrfs_removexattr,
8642 	.get_acl	= btrfs_get_acl,
8643 	.update_time	= btrfs_update_time,
8644 };
8645 static const struct inode_operations btrfs_symlink_inode_operations = {
8646 	.readlink	= generic_readlink,
8647 	.follow_link	= page_follow_link_light,
8648 	.put_link	= page_put_link,
8649 	.getattr	= btrfs_getattr,
8650 	.setattr	= btrfs_setattr,
8651 	.permission	= btrfs_permission,
8652 	.setxattr	= btrfs_setxattr,
8653 	.getxattr	= btrfs_getxattr,
8654 	.listxattr	= btrfs_listxattr,
8655 	.removexattr	= btrfs_removexattr,
8656 	.get_acl	= btrfs_get_acl,
8657 	.update_time	= btrfs_update_time,
8658 };
8659 
8660 const struct dentry_operations btrfs_dentry_operations = {
8661 	.d_delete	= btrfs_dentry_delete,
8662 	.d_release	= btrfs_dentry_release,
8663 };
8664