xref: /openbmc/linux/fs/btrfs/inode.c (revision cd5d5810)
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18 
19 #include <linux/kernel.h>
20 #include <linux/bio.h>
21 #include <linux/buffer_head.h>
22 #include <linux/file.h>
23 #include <linux/fs.h>
24 #include <linux/pagemap.h>
25 #include <linux/highmem.h>
26 #include <linux/time.h>
27 #include <linux/init.h>
28 #include <linux/string.h>
29 #include <linux/backing-dev.h>
30 #include <linux/mpage.h>
31 #include <linux/swap.h>
32 #include <linux/writeback.h>
33 #include <linux/statfs.h>
34 #include <linux/compat.h>
35 #include <linux/aio.h>
36 #include <linux/bit_spinlock.h>
37 #include <linux/xattr.h>
38 #include <linux/posix_acl.h>
39 #include <linux/falloc.h>
40 #include <linux/slab.h>
41 #include <linux/ratelimit.h>
42 #include <linux/mount.h>
43 #include <linux/btrfs.h>
44 #include <linux/blkdev.h>
45 #include <linux/posix_acl_xattr.h>
46 #include "compat.h"
47 #include "ctree.h"
48 #include "disk-io.h"
49 #include "transaction.h"
50 #include "btrfs_inode.h"
51 #include "print-tree.h"
52 #include "ordered-data.h"
53 #include "xattr.h"
54 #include "tree-log.h"
55 #include "volumes.h"
56 #include "compression.h"
57 #include "locking.h"
58 #include "free-space-cache.h"
59 #include "inode-map.h"
60 #include "backref.h"
61 #include "hash.h"
62 
63 struct btrfs_iget_args {
64 	u64 ino;
65 	struct btrfs_root *root;
66 };
67 
68 static const struct inode_operations btrfs_dir_inode_operations;
69 static const struct inode_operations btrfs_symlink_inode_operations;
70 static const struct inode_operations btrfs_dir_ro_inode_operations;
71 static const struct inode_operations btrfs_special_inode_operations;
72 static const struct inode_operations btrfs_file_inode_operations;
73 static const struct address_space_operations btrfs_aops;
74 static const struct address_space_operations btrfs_symlink_aops;
75 static const struct file_operations btrfs_dir_file_operations;
76 static struct extent_io_ops btrfs_extent_io_ops;
77 
78 static struct kmem_cache *btrfs_inode_cachep;
79 static struct kmem_cache *btrfs_delalloc_work_cachep;
80 struct kmem_cache *btrfs_trans_handle_cachep;
81 struct kmem_cache *btrfs_transaction_cachep;
82 struct kmem_cache *btrfs_path_cachep;
83 struct kmem_cache *btrfs_free_space_cachep;
84 
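/*
 * map the S_IFMT bits of an inode's mode (shifted down by S_SHIFT) to the
 * BTRFS_FT_* type values stored in directory entries
 */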
85 #define S_SHIFT 12
86 static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
87 	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
88 	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
89 	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
90 	[S_IFBLK >> S_SHIFT]	= BTRFS_FT_BLKDEV,
91 	[S_IFIFO >> S_SHIFT]	= BTRFS_FT_FIFO,
92 	[S_IFSOCK >> S_SHIFT]	= BTRFS_FT_SOCK,
93 	[S_IFLNK >> S_SHIFT]	= BTRFS_FT_SYMLINK,
94 };
95 
96 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
97 static int btrfs_truncate(struct inode *inode);
98 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
99 static noinline int cow_file_range(struct inode *inode,
100 				   struct page *locked_page,
101 				   u64 start, u64 end, int *page_started,
102 				   unsigned long *nr_written, int unlock);
103 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
104 					   u64 len, u64 orig_start,
105 					   u64 block_start, u64 block_len,
106 					   u64 orig_block_len, u64 ram_bytes,
107 					   int type);
108 
109 static int btrfs_dirty_inode(struct inode *inode);
110 
111 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
112 				     struct inode *inode,  struct inode *dir,
113 				     const struct qstr *qstr)
114 {
115 	int err;
116 
117 	err = btrfs_init_acl(trans, inode, dir);
118 	if (!err)
119 		err = btrfs_xattr_security_init(trans, inode, dir, qstr);
120 	return err;
121 }
122 
123 /*
124  * this does all the hard work for inserting an inline extent into
125  * the btree.  The caller should have done a btrfs_drop_extents so that
126  * no overlapping inline items exist in the btree
127  */
128 static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
129 				struct btrfs_root *root, struct inode *inode,
130 				u64 start, size_t size, size_t compressed_size,
131 				int compress_type,
132 				struct page **compressed_pages)
133 {
134 	struct btrfs_key key;
135 	struct btrfs_path *path;
136 	struct extent_buffer *leaf;
137 	struct page *page = NULL;
138 	char *kaddr;
139 	unsigned long ptr;
140 	struct btrfs_file_extent_item *ei;
141 	int err = 0;
142 	int ret;
143 	size_t cur_size = size;
144 	size_t datasize;
145 	unsigned long offset;
146 
147 	if (compressed_size && compressed_pages)
148 		cur_size = compressed_size;
149 
150 	path = btrfs_alloc_path();
151 	if (!path)
152 		return -ENOMEM;
153 
154 	path->leave_spinning = 1;
155 
156 	key.objectid = btrfs_ino(inode);
157 	key.offset = start;
158 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
159 	datasize = btrfs_file_extent_calc_inline_size(cur_size);
160 
161 	inode_add_bytes(inode, size);
162 	ret = btrfs_insert_empty_item(trans, root, path, &key,
163 				      datasize);
164 	if (ret) {
165 		err = ret;
166 		goto fail;
167 	}
168 	leaf = path->nodes[0];
169 	ei = btrfs_item_ptr(leaf, path->slots[0],
170 			    struct btrfs_file_extent_item);
171 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
172 	btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
173 	btrfs_set_file_extent_encryption(leaf, ei, 0);
174 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
175 	btrfs_set_file_extent_ram_bytes(leaf, ei, size);
176 	ptr = btrfs_file_extent_inline_start(ei);
177 
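	/*
	 * copy the file data into the leaf right after the extent item:
	 * compressed bytes come from the compressed_pages array, uncompressed
	 * bytes straight from the page cache
	 */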
178 	if (compress_type != BTRFS_COMPRESS_NONE) {
179 		struct page *cpage;
180 		int i = 0;
181 		while (compressed_size > 0) {
182 			cpage = compressed_pages[i];
183 			cur_size = min_t(unsigned long, compressed_size,
184 				       PAGE_CACHE_SIZE);
185 
186 			kaddr = kmap_atomic(cpage);
187 			write_extent_buffer(leaf, kaddr, ptr, cur_size);
188 			kunmap_atomic(kaddr);
189 
190 			i++;
191 			ptr += cur_size;
192 			compressed_size -= cur_size;
193 		}
194 		btrfs_set_file_extent_compression(leaf, ei,
195 						  compress_type);
196 	} else {
197 		page = find_get_page(inode->i_mapping,
198 				     start >> PAGE_CACHE_SHIFT);
199 		btrfs_set_file_extent_compression(leaf, ei, 0);
200 		kaddr = kmap_atomic(page);
201 		offset = start & (PAGE_CACHE_SIZE - 1);
202 		write_extent_buffer(leaf, kaddr + offset, ptr, size);
203 		kunmap_atomic(kaddr);
204 		page_cache_release(page);
205 	}
206 	btrfs_mark_buffer_dirty(leaf);
207 	btrfs_free_path(path);
208 
209 	/*
210 	 * we're an inline extent, so nobody can
211 	 * extend the file past i_size without locking
212 	 * a page we already have locked.
213 	 *
214 	 * We must do any isize and inode updates
215 	 * before we unlock the pages.  Otherwise we
216 	 * could end up racing with unlink.
217 	 */
218 	BTRFS_I(inode)->disk_i_size = inode->i_size;
219 	ret = btrfs_update_inode(trans, root, inode);
220 
221 	return ret;
222 fail:
223 	btrfs_free_path(path);
224 	return err;
225 }
226 
227 
228 /*
229  * conditionally insert an inline extent into the file.  This
230  * does the checks required to make sure the data is small enough
231  * to fit as an inline extent.
232  */
233 static noinline int cow_file_range_inline(struct btrfs_root *root,
234 					  struct inode *inode, u64 start,
235 					  u64 end, size_t compressed_size,
236 					  int compress_type,
237 					  struct page **compressed_pages)
238 {
239 	struct btrfs_trans_handle *trans;
240 	u64 isize = i_size_read(inode);
241 	u64 actual_end = min(end + 1, isize);
242 	u64 inline_len = actual_end - start;
243 	u64 aligned_end = ALIGN(end, root->sectorsize);
244 	u64 data_len = inline_len;
245 	int ret;
246 
247 	if (compressed_size)
248 		data_len = compressed_size;
249 
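	/*
	 * inline extents must start at file offset 0, fit in a single page
	 * and a single leaf item, cover the tail of the file and stay under
	 * the max_inline mount option; uncompressed data that ends exactly on
	 * a sector boundary is better written as a normal extent.  Returning
	 * 1 tells the caller to fall back to a regular COW write.
	 */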
250 	if (start > 0 ||
251 	    actual_end >= PAGE_CACHE_SIZE ||
252 	    data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) ||
253 	    (!compressed_size &&
254 	    (actual_end & (root->sectorsize - 1)) == 0) ||
255 	    end + 1 < isize ||
256 	    data_len > root->fs_info->max_inline) {
257 		return 1;
258 	}
259 
260 	trans = btrfs_join_transaction(root);
261 	if (IS_ERR(trans))
262 		return PTR_ERR(trans);
263 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
264 
265 	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
266 	if (ret) {
267 		btrfs_abort_transaction(trans, root, ret);
268 		goto out;
269 	}
270 
271 	if (isize > actual_end)
272 		inline_len = min_t(u64, isize, actual_end);
273 	ret = insert_inline_extent(trans, root, inode, start,
274 				   inline_len, compressed_size,
275 				   compress_type, compressed_pages);
276 	if (ret && ret != -ENOSPC) {
277 		btrfs_abort_transaction(trans, root, ret);
278 		goto out;
279 	} else if (ret == -ENOSPC) {
280 		ret = 1;
281 		goto out;
282 	}
283 
284 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
285 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
286 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
287 out:
288 	btrfs_end_transaction(trans, root);
289 	return ret;
290 }
291 
292 struct async_extent {
293 	u64 start;
294 	u64 ram_size;
295 	u64 compressed_size;
296 	struct page **pages;
297 	unsigned long nr_pages;
298 	int compress_type;
299 	struct list_head list;
300 };
301 
302 struct async_cow {
303 	struct inode *inode;
304 	struct btrfs_root *root;
305 	struct page *locked_page;
306 	u64 start;
307 	u64 end;
308 	struct list_head extents;
309 	struct btrfs_work work;
310 };
311 
312 static noinline int add_async_extent(struct async_cow *cow,
313 				     u64 start, u64 ram_size,
314 				     u64 compressed_size,
315 				     struct page **pages,
316 				     unsigned long nr_pages,
317 				     int compress_type)
318 {
319 	struct async_extent *async_extent;
320 
321 	async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
322 	BUG_ON(!async_extent); /* -ENOMEM */
323 	async_extent->start = start;
324 	async_extent->ram_size = ram_size;
325 	async_extent->compressed_size = compressed_size;
326 	async_extent->pages = pages;
327 	async_extent->nr_pages = nr_pages;
328 	async_extent->compress_type = compress_type;
329 	list_add_tail(&async_extent->list, &cow->extents);
330 	return 0;
331 }
332 
333 /*
334  * we create compressed extents in two phases.  The first
335  * phase compresses a range of pages that have already been
336  * locked (both pages and state bits are locked).
337  *
338  * This is done inside an ordered work queue, and the compression
339  * is spread across many cpus.  The actual IO submission is step
340  * two, and the ordered work queue takes care of making sure that
341  * happens in the same order things were put onto the queue by
342  * writepages and friends.
343  *
344  * If this code finds it can't get good compression, it puts an
345  * entry onto the work queue to write the uncompressed bytes.  This
346  * makes sure that both compressed inodes and uncompressed inodes
347  * are written in the same order that the flusher thread sent them
348  * down.
349  */
350 static noinline int compress_file_range(struct inode *inode,
351 					struct page *locked_page,
352 					u64 start, u64 end,
353 					struct async_cow *async_cow,
354 					int *num_added)
355 {
356 	struct btrfs_root *root = BTRFS_I(inode)->root;
357 	u64 num_bytes;
358 	u64 blocksize = root->sectorsize;
359 	u64 actual_end;
360 	u64 isize = i_size_read(inode);
361 	int ret = 0;
362 	struct page **pages = NULL;
363 	unsigned long nr_pages;
364 	unsigned long nr_pages_ret = 0;
365 	unsigned long total_compressed = 0;
366 	unsigned long total_in = 0;
367 	unsigned long max_compressed = 128 * 1024;
368 	unsigned long max_uncompressed = 128 * 1024;
369 	int i;
370 	int will_compress;
371 	int compress_type = root->fs_info->compress_type;
372 	int redirty = 0;
373 
374 	/* if this is a small write inside eof, kick off a defrag */
375 	if ((end - start + 1) < 16 * 1024 &&
376 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
377 		btrfs_add_inode_defrag(NULL, inode);
378 
379 	actual_end = min_t(u64, isize, end + 1);
380 again:
381 	will_compress = 0;
382 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
383 	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
384 
385 	/*
386 	 * we don't want to send crud past the end of i_size through
387 	 * compression; that's just a waste of CPU time.  So, if the
388 	 * end of the file is before the start of our current
389 	 * requested range of bytes, we bail out to the uncompressed
390 	 * cleanup code that can deal with all of this.
391 	 *
392 	 * It isn't really the fastest way to fix things, but this is a
393 	 * very uncommon corner.
394 	 */
395 	if (actual_end <= start)
396 		goto cleanup_and_bail_uncompressed;
397 
398 	total_compressed = actual_end - start;
399 
400 	/* we want to make sure that the amount of ram required to uncompress
401 	 * an extent is reasonable, so we limit the total size in ram
402 	 * of a compressed extent to 128k.  This is a crucial number
403 	 * because it also controls how easily we can spread reads across
404 	 * cpus for decompression.
405 	 *
406 	 * We also want to make sure the amount of IO required to do
407 	 * a random read is reasonably small, so we limit the size of
408 	 * a compressed extent to 128k.
409 	 */
410 	total_compressed = min(total_compressed, max_uncompressed);
411 	num_bytes = ALIGN(end - start + 1, blocksize);
412 	num_bytes = max(blocksize,  num_bytes);
413 	total_in = 0;
414 	ret = 0;
415 
416 	/*
417 	 * we do compression for mount -o compress and when the
418 	 * inode has not been flagged as nocompress.  This flag can
419 	 * change at any time if we discover bad compression ratios.
420 	 */
421 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
422 	    (btrfs_test_opt(root, COMPRESS) ||
423 	     (BTRFS_I(inode)->force_compress) ||
424 	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
425 		WARN_ON(pages);
426 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
427 		if (!pages) {
428 			/* just bail out to the uncompressed code */
429 			goto cont;
430 		}
431 
432 		if (BTRFS_I(inode)->force_compress)
433 			compress_type = BTRFS_I(inode)->force_compress;
434 
435 		/*
436 		 * we need to call clear_page_dirty_for_io on each
437 		 * page in the range.  Otherwise applications with the file
438 		 * mmap'd can wander in and change the page contents while
439 		 * we are compressing them.
440 		 *
441 		 * If the compression fails for any reason, we set the pages
442 		 * dirty again later on.
443 		 */
444 		extent_range_clear_dirty_for_io(inode, start, end);
445 		redirty = 1;
446 		ret = btrfs_compress_pages(compress_type,
447 					   inode->i_mapping, start,
448 					   total_compressed, pages,
449 					   nr_pages, &nr_pages_ret,
450 					   &total_in,
451 					   &total_compressed,
452 					   max_compressed);
453 
454 		if (!ret) {
455 			unsigned long offset = total_compressed &
456 				(PAGE_CACHE_SIZE - 1);
457 			struct page *page = pages[nr_pages_ret - 1];
458 			char *kaddr;
459 
460 			/* zero the tail end of the last page; we might be
461 			 * sending it down to disk
462 			 */
463 			if (offset) {
464 				kaddr = kmap_atomic(page);
465 				memset(kaddr + offset, 0,
466 				       PAGE_CACHE_SIZE - offset);
467 				kunmap_atomic(kaddr);
468 			}
469 			will_compress = 1;
470 		}
471 	}
472 cont:
473 	if (start == 0) {
474 		/* let's try to make an inline extent */
475 		if (ret || total_in < (actual_end - start)) {
476 			/* we didn't compress the entire range, try
477 			 * to make an uncompressed inline extent.
478 			 */
479 			ret = cow_file_range_inline(root, inode, start, end,
480 						    0, 0, NULL);
481 		} else {
482 			/* try making a compressed inline extent */
483 			ret = cow_file_range_inline(root, inode, start, end,
484 						    total_compressed,
485 						    compress_type, pages);
486 		}
487 		if (ret <= 0) {
488 			unsigned long clear_flags = EXTENT_DELALLOC |
489 				EXTENT_DEFRAG;
490 			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
491 
492 			/*
493 			 * inline extent creation worked or returned an error;
494 			 * we don't need to create any more async work items.
495 			 * Unlock and free up our temp pages.
496 			 */
497 			extent_clear_unlock_delalloc(inode, start, end, NULL,
498 						     clear_flags, PAGE_UNLOCK |
499 						     PAGE_CLEAR_DIRTY |
500 						     PAGE_SET_WRITEBACK |
501 						     PAGE_END_WRITEBACK);
502 			goto free_pages_out;
503 		}
504 	}
505 
506 	if (will_compress) {
507 		/*
508 		 * we aren't doing an inline extent; round the compressed size
509 		 * up to a block size boundary so the allocator does sane
510 		 * things
511 		 */
512 		total_compressed = ALIGN(total_compressed, blocksize);
513 
514 		/*
515 		 * one last check to make sure the compression is really a
516 		 * win: compare the page count read with the blocks on disk
517 		 */
518 		total_in = ALIGN(total_in, PAGE_CACHE_SIZE);
519 		if (total_compressed >= total_in) {
520 			will_compress = 0;
521 		} else {
522 			num_bytes = total_in;
523 		}
524 	}
525 	if (!will_compress && pages) {
526 		/*
527 		 * the compression code ran but failed to make things smaller;
528 		 * free any pages it allocated and our page pointer array
529 		 */
530 		for (i = 0; i < nr_pages_ret; i++) {
531 			WARN_ON(pages[i]->mapping);
532 			page_cache_release(pages[i]);
533 		}
534 		kfree(pages);
535 		pages = NULL;
536 		total_compressed = 0;
537 		nr_pages_ret = 0;
538 
539 		/* flag the file so we don't compress in the future */
540 		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
541 		    !(BTRFS_I(inode)->force_compress)) {
542 			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
543 		}
544 	}
545 	if (will_compress) {
546 		*num_added += 1;
547 
548 		/* the async work queues will take care of doing actual
549 		 * allocation on disk for these compressed pages,
550 		 * and will submit them to the elevator.
551 		 */
552 		add_async_extent(async_cow, start, num_bytes,
553 				 total_compressed, pages, nr_pages_ret,
554 				 compress_type);
555 
556 		if (start + num_bytes < end) {
557 			start += num_bytes;
558 			pages = NULL;
559 			cond_resched();
560 			goto again;
561 		}
562 	} else {
563 cleanup_and_bail_uncompressed:
564 		/*
565 		 * No compression, but we still need to write the pages in
566 		 * the file we've been given so far.  Redirty the locked
567 		 * page if it corresponds to our extent and set things up
568 		 * for the async work queue to run cow_file_range to do
569 		 * the normal delalloc dance
570 		 */
571 		if (page_offset(locked_page) >= start &&
572 		    page_offset(locked_page) <= end) {
573 			__set_page_dirty_nobuffers(locked_page);
574 			/* unlocked later on in the async handlers */
575 		}
576 		if (redirty)
577 			extent_range_redirty_for_io(inode, start, end);
578 		add_async_extent(async_cow, start, end - start + 1,
579 				 0, NULL, 0, BTRFS_COMPRESS_NONE);
580 		*num_added += 1;
581 	}
582 
583 out:
584 	return ret;
585 
586 free_pages_out:
587 	for (i = 0; i < nr_pages_ret; i++) {
588 		WARN_ON(pages[i]->mapping);
589 		page_cache_release(pages[i]);
590 	}
591 	kfree(pages);
592 
593 	goto out;
594 }
595 
596 /*
597  * phase two of compressed writeback.  This is the ordered portion
598  * of the code, which only gets called in the order the work was
599  * queued.  We walk all the async extents created by compress_file_range
600  * and send them down to the disk.
601  */
602 static noinline int submit_compressed_extents(struct inode *inode,
603 					      struct async_cow *async_cow)
604 {
605 	struct async_extent *async_extent;
606 	u64 alloc_hint = 0;
607 	struct btrfs_key ins;
608 	struct extent_map *em;
609 	struct btrfs_root *root = BTRFS_I(inode)->root;
610 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
611 	struct extent_io_tree *io_tree;
612 	int ret = 0;
613 
614 	if (list_empty(&async_cow->extents))
615 		return 0;
616 
617 again:
618 	while (!list_empty(&async_cow->extents)) {
619 		async_extent = list_entry(async_cow->extents.next,
620 					  struct async_extent, list);
621 		list_del(&async_extent->list);
622 
623 		io_tree = &BTRFS_I(inode)->io_tree;
624 
625 retry:
626 		/* did the compression code fall back to uncompressed IO? */
627 		if (!async_extent->pages) {
628 			int page_started = 0;
629 			unsigned long nr_written = 0;
630 
631 			lock_extent(io_tree, async_extent->start,
632 					 async_extent->start +
633 					 async_extent->ram_size - 1);
634 
635 			/* allocate blocks */
636 			ret = cow_file_range(inode, async_cow->locked_page,
637 					     async_extent->start,
638 					     async_extent->start +
639 					     async_extent->ram_size - 1,
640 					     &page_started, &nr_written, 0);
641 
642 			/* JDM XXX */
643 
644 			/*
645 			 * if page_started, cow_file_range inserted an
646 			 * inline extent and took care of all the unlocking
647 			 * and IO for us.  Otherwise, we need to submit
648 			 * all those pages down to the drive.
649 			 */
650 			if (!page_started && !ret)
651 				extent_write_locked_range(io_tree,
652 						  inode, async_extent->start,
653 						  async_extent->start +
654 						  async_extent->ram_size - 1,
655 						  btrfs_get_extent,
656 						  WB_SYNC_ALL);
657 			else if (ret)
658 				unlock_page(async_cow->locked_page);
659 			kfree(async_extent);
660 			cond_resched();
661 			continue;
662 		}
663 
664 		lock_extent(io_tree, async_extent->start,
665 			    async_extent->start + async_extent->ram_size - 1);
666 
667 		ret = btrfs_reserve_extent(root,
668 					   async_extent->compressed_size,
669 					   async_extent->compressed_size,
670 					   0, alloc_hint, &ins, 1);
671 		if (ret) {
672 			int i;
673 
674 			for (i = 0; i < async_extent->nr_pages; i++) {
675 				WARN_ON(async_extent->pages[i]->mapping);
676 				page_cache_release(async_extent->pages[i]);
677 			}
678 			kfree(async_extent->pages);
679 			async_extent->nr_pages = 0;
680 			async_extent->pages = NULL;
681 
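			/*
			 * the compressed pages were freed above, so on ENOSPC
			 * the retry takes the !async_extent->pages path and
			 * writes the range out uncompressed instead
			 */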
682 			if (ret == -ENOSPC) {
683 				unlock_extent(io_tree, async_extent->start,
684 					      async_extent->start +
685 					      async_extent->ram_size - 1);
686 				goto retry;
687 			}
688 			goto out_free;
689 		}
690 
691 		/*
692 		 * here we're doing allocation and writeback of the
693 		 * compressed pages
694 		 */
695 		btrfs_drop_extent_cache(inode, async_extent->start,
696 					async_extent->start +
697 					async_extent->ram_size - 1, 0);
698 
699 		em = alloc_extent_map();
700 		if (!em) {
701 			ret = -ENOMEM;
702 			goto out_free_reserve;
703 		}
704 		em->start = async_extent->start;
705 		em->len = async_extent->ram_size;
706 		em->orig_start = em->start;
707 		em->mod_start = em->start;
708 		em->mod_len = em->len;
709 
710 		em->block_start = ins.objectid;
711 		em->block_len = ins.offset;
712 		em->orig_block_len = ins.offset;
713 		em->ram_bytes = async_extent->ram_size;
714 		em->bdev = root->fs_info->fs_devices->latest_bdev;
715 		em->compress_type = async_extent->compress_type;
716 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
717 		set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
718 		em->generation = -1;
719 
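		/*
		 * insert the new extent map; if a stale mapping collides,
		 * drop it from the cache and try again
		 */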
720 		while (1) {
721 			write_lock(&em_tree->lock);
722 			ret = add_extent_mapping(em_tree, em, 1);
723 			write_unlock(&em_tree->lock);
724 			if (ret != -EEXIST) {
725 				free_extent_map(em);
726 				break;
727 			}
728 			btrfs_drop_extent_cache(inode, async_extent->start,
729 						async_extent->start +
730 						async_extent->ram_size - 1, 0);
731 		}
732 
733 		if (ret)
734 			goto out_free_reserve;
735 
736 		ret = btrfs_add_ordered_extent_compress(inode,
737 						async_extent->start,
738 						ins.objectid,
739 						async_extent->ram_size,
740 						ins.offset,
741 						BTRFS_ORDERED_COMPRESSED,
742 						async_extent->compress_type);
743 		if (ret)
744 			goto out_free_reserve;
745 
746 		/*
747 		 * clear dirty, set writeback and unlock the pages.
748 		 */
749 		extent_clear_unlock_delalloc(inode, async_extent->start,
750 				async_extent->start +
751 				async_extent->ram_size - 1,
752 				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
753 				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
754 				PAGE_SET_WRITEBACK);
755 		ret = btrfs_submit_compressed_write(inode,
756 				    async_extent->start,
757 				    async_extent->ram_size,
758 				    ins.objectid,
759 				    ins.offset, async_extent->pages,
760 				    async_extent->nr_pages);
761 		alloc_hint = ins.objectid + ins.offset;
762 		kfree(async_extent);
763 		if (ret)
764 			goto out;
765 		cond_resched();
766 	}
767 	ret = 0;
768 out:
769 	return ret;
770 out_free_reserve:
771 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
772 out_free:
773 	extent_clear_unlock_delalloc(inode, async_extent->start,
774 				     async_extent->start +
775 				     async_extent->ram_size - 1,
776 				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
777 				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
778 				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
779 				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
780 	kfree(async_extent);
781 	goto again;
782 }
783 
784 static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
785 				      u64 num_bytes)
786 {
787 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
788 	struct extent_map *em;
789 	u64 alloc_hint = 0;
790 
791 	read_lock(&em_tree->lock);
792 	em = search_extent_mapping(em_tree, start, num_bytes);
793 	if (em) {
794 		/*
795 		 * if block start isn't an actual block number then find the
796 		 * first block in this inode and use that as a hint.  If that
797 		 * block is also bogus then just don't worry about it.
798 		 */
799 		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
800 			free_extent_map(em);
801 			em = search_extent_mapping(em_tree, 0, 0);
802 			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
803 				alloc_hint = em->block_start;
804 			if (em)
805 				free_extent_map(em);
806 		} else {
807 			alloc_hint = em->block_start;
808 			free_extent_map(em);
809 		}
810 	}
811 	read_unlock(&em_tree->lock);
812 
813 	return alloc_hint;
814 }
815 
816 /*
817  * when extent_io.c finds a delayed allocation range in the file,
818  * the callbacks end up in this code.  The basic idea is to
819  * allocate extents on disk for the range, and create ordered data structs
820  * in ram to track those extents.
821  *
822  * locked_page is the page that writepage had locked already.  We use
823  * it to make sure we don't do extra locks or unlocks.
824  *
825  * *page_started is set to one if we unlock locked_page and do everything
826  * required to start IO on it.  It may be clean and already done with
827  * IO when we return.
828  */
829 static noinline int cow_file_range(struct inode *inode,
830 				   struct page *locked_page,
831 				   u64 start, u64 end, int *page_started,
832 				   unsigned long *nr_written,
833 				   int unlock)
834 {
835 	struct btrfs_root *root = BTRFS_I(inode)->root;
836 	u64 alloc_hint = 0;
837 	u64 num_bytes;
838 	unsigned long ram_size;
839 	u64 disk_num_bytes;
840 	u64 cur_alloc_size;
841 	u64 blocksize = root->sectorsize;
842 	struct btrfs_key ins;
843 	struct extent_map *em;
844 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
845 	int ret = 0;
846 
847 	BUG_ON(btrfs_is_free_space_inode(inode));
848 
849 	num_bytes = ALIGN(end - start + 1, blocksize);
850 	num_bytes = max(blocksize,  num_bytes);
851 	disk_num_bytes = num_bytes;
852 
853 	/* if this is a small write inside eof, kick off defrag */
854 	if (num_bytes < 64 * 1024 &&
855 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
856 		btrfs_add_inode_defrag(NULL, inode);
857 
858 	if (start == 0) {
859 		/* let's try to make an inline extent */
860 		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
861 					    NULL);
862 		if (ret == 0) {
863 			extent_clear_unlock_delalloc(inode, start, end, NULL,
864 				     EXTENT_LOCKED | EXTENT_DELALLOC |
865 				     EXTENT_DEFRAG, PAGE_UNLOCK |
866 				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
867 				     PAGE_END_WRITEBACK);
868 
869 			*nr_written = *nr_written +
870 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
871 			*page_started = 1;
872 			goto out;
873 		} else if (ret < 0) {
874 			goto out_unlock;
875 		}
876 	}
877 
878 	BUG_ON(disk_num_bytes >
879 	       btrfs_super_total_bytes(root->fs_info->super_copy));
880 
881 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
882 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
883 
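	/*
	 * allocate the range in chunks: each pass reserves the largest
	 * extent the allocator can give us (at least one sector), inserts a
	 * matching extent map and ordered extent, then advances start
	 */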
884 	while (disk_num_bytes > 0) {
885 		unsigned long op;
886 
887 		cur_alloc_size = disk_num_bytes;
888 		ret = btrfs_reserve_extent(root, cur_alloc_size,
889 					   root->sectorsize, 0, alloc_hint,
890 					   &ins, 1);
891 		if (ret < 0)
892 			goto out_unlock;
893 
894 		em = alloc_extent_map();
895 		if (!em) {
896 			ret = -ENOMEM;
897 			goto out_reserve;
898 		}
899 		em->start = start;
900 		em->orig_start = em->start;
901 		ram_size = ins.offset;
902 		em->len = ins.offset;
903 		em->mod_start = em->start;
904 		em->mod_len = em->len;
905 
906 		em->block_start = ins.objectid;
907 		em->block_len = ins.offset;
908 		em->orig_block_len = ins.offset;
909 		em->ram_bytes = ram_size;
910 		em->bdev = root->fs_info->fs_devices->latest_bdev;
911 		set_bit(EXTENT_FLAG_PINNED, &em->flags);
912 		em->generation = -1;
913 
914 		while (1) {
915 			write_lock(&em_tree->lock);
916 			ret = add_extent_mapping(em_tree, em, 1);
917 			write_unlock(&em_tree->lock);
918 			if (ret != -EEXIST) {
919 				free_extent_map(em);
920 				break;
921 			}
922 			btrfs_drop_extent_cache(inode, start,
923 						start + ram_size - 1, 0);
924 		}
925 		if (ret)
926 			goto out_reserve;
927 
928 		cur_alloc_size = ins.offset;
929 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
930 					       ram_size, cur_alloc_size, 0);
931 		if (ret)
932 			goto out_reserve;
933 
934 		if (root->root_key.objectid ==
935 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
936 			ret = btrfs_reloc_clone_csums(inode, start,
937 						      cur_alloc_size);
938 			if (ret)
939 				goto out_reserve;
940 		}
941 
942 		if (disk_num_bytes < cur_alloc_size)
943 			break;
944 
945 		/* we're not doing compressed IO; don't unlock the first
946 		 * page (which the caller expects to stay locked), don't
947 		 * clear any dirty bits and don't set any writeback bits
948 		 *
949 		 * Do set the Private2 bit so we know this page was properly
950 		 * set up for writepage
951 		 */
952 		op = unlock ? PAGE_UNLOCK : 0;
953 		op |= PAGE_SET_PRIVATE2;
954 
955 		extent_clear_unlock_delalloc(inode, start,
956 					     start + ram_size - 1, locked_page,
957 					     EXTENT_LOCKED | EXTENT_DELALLOC,
958 					     op);
959 		disk_num_bytes -= cur_alloc_size;
960 		num_bytes -= cur_alloc_size;
961 		alloc_hint = ins.objectid + ins.offset;
962 		start += cur_alloc_size;
963 	}
964 out:
965 	return ret;
966 
967 out_reserve:
968 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
969 out_unlock:
970 	extent_clear_unlock_delalloc(inode, start, end, locked_page,
971 				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
972 				     EXTENT_DELALLOC | EXTENT_DEFRAG,
973 				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
974 				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
975 	goto out;
976 }
977 
978 /*
979  * work queue callback to start compression on a file's pages
980  */
981 static noinline void async_cow_start(struct btrfs_work *work)
982 {
983 	struct async_cow *async_cow;
984 	int num_added = 0;
985 	async_cow = container_of(work, struct async_cow, work);
986 
987 	compress_file_range(async_cow->inode, async_cow->locked_page,
988 			    async_cow->start, async_cow->end, async_cow,
989 			    &num_added);
990 	if (num_added == 0) {
991 		btrfs_add_delayed_iput(async_cow->inode);
992 		async_cow->inode = NULL;
993 	}
994 }
995 
996 /*
997  * work queue callback to submit previously compressed pages
998  */
999 static noinline void async_cow_submit(struct btrfs_work *work)
1000 {
1001 	struct async_cow *async_cow;
1002 	struct btrfs_root *root;
1003 	unsigned long nr_pages;
1004 
1005 	async_cow = container_of(work, struct async_cow, work);
1006 
1007 	root = async_cow->root;
1008 	nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >>
1009 		PAGE_CACHE_SHIFT;
1010 
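	/*
	 * this chunk's pages are accounted for; wake anyone throttled in
	 * cow_file_range_async once enough async delalloc work has drained
	 */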
1011 	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
1012 	    5 * 1024 * 1024 &&
1013 	    waitqueue_active(&root->fs_info->async_submit_wait))
1014 		wake_up(&root->fs_info->async_submit_wait);
1015 
1016 	if (async_cow->inode)
1017 		submit_compressed_extents(async_cow->inode, async_cow);
1018 }
1019 
1020 static noinline void async_cow_free(struct btrfs_work *work)
1021 {
1022 	struct async_cow *async_cow;
1023 	async_cow = container_of(work, struct async_cow, work);
1024 	if (async_cow->inode)
1025 		btrfs_add_delayed_iput(async_cow->inode);
1026 	kfree(async_cow);
1027 }
1028 
1029 static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1030 				u64 start, u64 end, int *page_started,
1031 				unsigned long *nr_written)
1032 {
1033 	struct async_cow *async_cow;
1034 	struct btrfs_root *root = BTRFS_I(inode)->root;
1035 	unsigned long nr_pages;
1036 	u64 cur_end;
1037 	int limit = 10 * 1024 * 1024;
1038 
1039 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
1040 			 1, 0, NULL, GFP_NOFS);
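	/*
	 * split the range into 512K chunks (one big chunk if the inode is
	 * flagged nocompress) and queue an async_cow work item per chunk,
	 * throttling when too many delalloc pages are already in flight
	 */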
1041 	while (start < end) {
1042 		async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
1043 		BUG_ON(!async_cow); /* -ENOMEM */
1044 		async_cow->inode = igrab(inode);
1045 		async_cow->root = root;
1046 		async_cow->locked_page = locked_page;
1047 		async_cow->start = start;
1048 
1049 		if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS)
1050 			cur_end = end;
1051 		else
1052 			cur_end = min(end, start + 512 * 1024 - 1);
1053 
1054 		async_cow->end = cur_end;
1055 		INIT_LIST_HEAD(&async_cow->extents);
1056 
1057 		async_cow->work.func = async_cow_start;
1058 		async_cow->work.ordered_func = async_cow_submit;
1059 		async_cow->work.ordered_free = async_cow_free;
1060 		async_cow->work.flags = 0;
1061 
1062 		nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1063 			PAGE_CACHE_SHIFT;
1064 		atomic_add(nr_pages, &root->fs_info->async_delalloc_pages);
1065 
1066 		btrfs_queue_worker(&root->fs_info->delalloc_workers,
1067 				   &async_cow->work);
1068 
1069 		if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) {
1070 			wait_event(root->fs_info->async_submit_wait,
1071 			   (atomic_read(&root->fs_info->async_delalloc_pages) <
1072 			    limit));
1073 		}
1074 
1075 		while (atomic_read(&root->fs_info->async_submit_draining) &&
1076 		      atomic_read(&root->fs_info->async_delalloc_pages)) {
1077 			wait_event(root->fs_info->async_submit_wait,
1078 			  (atomic_read(&root->fs_info->async_delalloc_pages) ==
1079 			   0));
1080 		}
1081 
1082 		*nr_written += nr_pages;
1083 		start = cur_end + 1;
1084 	}
1085 	*page_started = 1;
1086 	return 0;
1087 }
1088 
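/*
 * return 1 if any checksum items exist for the given byte range, freeing
 * whatever entries the lookup added to the list
 */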
1089 static noinline int csum_exist_in_range(struct btrfs_root *root,
1090 					u64 bytenr, u64 num_bytes)
1091 {
1092 	int ret;
1093 	struct btrfs_ordered_sum *sums;
1094 	LIST_HEAD(list);
1095 
1096 	ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
1097 				       bytenr + num_bytes - 1, &list, 0);
1098 	if (ret == 0 && list_empty(&list))
1099 		return 0;
1100 
1101 	while (!list_empty(&list)) {
1102 		sums = list_entry(list.next, struct btrfs_ordered_sum, list);
1103 		list_del(&sums->list);
1104 		kfree(sums);
1105 	}
1106 	return 1;
1107 }
1108 
1109 /*
1110  * callback for nocow writeback.  This checks for snapshots or COW copies
1111  * of the extents that exist in the file, and COWs the file as required.
1112  *
1113  * If no cow copies or snapshots exist, we write directly to the existing
1114  * blocks on disk
1115  */
1116 static noinline int run_delalloc_nocow(struct inode *inode,
1117 				       struct page *locked_page,
1118 			      u64 start, u64 end, int *page_started, int force,
1119 			      unsigned long *nr_written)
1120 {
1121 	struct btrfs_root *root = BTRFS_I(inode)->root;
1122 	struct btrfs_trans_handle *trans;
1123 	struct extent_buffer *leaf;
1124 	struct btrfs_path *path;
1125 	struct btrfs_file_extent_item *fi;
1126 	struct btrfs_key found_key;
1127 	u64 cow_start;
1128 	u64 cur_offset;
1129 	u64 extent_end;
1130 	u64 extent_offset;
1131 	u64 disk_bytenr;
1132 	u64 num_bytes;
1133 	u64 disk_num_bytes;
1134 	u64 ram_bytes;
1135 	int extent_type;
1136 	int ret, err;
1137 	int type;
1138 	int nocow;
1139 	int check_prev = 1;
1140 	bool nolock;
1141 	u64 ino = btrfs_ino(inode);
1142 
1143 	path = btrfs_alloc_path();
1144 	if (!path) {
1145 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1146 					     EXTENT_LOCKED | EXTENT_DELALLOC |
1147 					     EXTENT_DO_ACCOUNTING |
1148 					     EXTENT_DEFRAG, PAGE_UNLOCK |
1149 					     PAGE_CLEAR_DIRTY |
1150 					     PAGE_SET_WRITEBACK |
1151 					     PAGE_END_WRITEBACK);
1152 		return -ENOMEM;
1153 	}
1154 
1155 	nolock = btrfs_is_free_space_inode(inode);
1156 
1157 	if (nolock)
1158 		trans = btrfs_join_transaction_nolock(root);
1159 	else
1160 		trans = btrfs_join_transaction(root);
1161 
1162 	if (IS_ERR(trans)) {
1163 		extent_clear_unlock_delalloc(inode, start, end, locked_page,
1164 					     EXTENT_LOCKED | EXTENT_DELALLOC |
1165 					     EXTENT_DO_ACCOUNTING |
1166 					     EXTENT_DEFRAG, PAGE_UNLOCK |
1167 					     PAGE_CLEAR_DIRTY |
1168 					     PAGE_SET_WRITEBACK |
1169 					     PAGE_END_WRITEBACK);
1170 		btrfs_free_path(path);
1171 		return PTR_ERR(trans);
1172 	}
1173 
1174 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1175 
1176 	cow_start = (u64)-1;
1177 	cur_offset = start;
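	/*
	 * walk each file extent item overlapping [start, end] and decide,
	 * extent by extent, whether we can write in place (nocow) or must
	 * fall back to cow_file_range for that stretch
	 */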
1178 	while (1) {
1179 		ret = btrfs_lookup_file_extent(trans, root, path, ino,
1180 					       cur_offset, 0);
1181 		if (ret < 0) {
1182 			btrfs_abort_transaction(trans, root, ret);
1183 			goto error;
1184 		}
1185 		if (ret > 0 && path->slots[0] > 0 && check_prev) {
1186 			leaf = path->nodes[0];
1187 			btrfs_item_key_to_cpu(leaf, &found_key,
1188 					      path->slots[0] - 1);
1189 			if (found_key.objectid == ino &&
1190 			    found_key.type == BTRFS_EXTENT_DATA_KEY)
1191 				path->slots[0]--;
1192 		}
1193 		check_prev = 0;
1194 next_slot:
1195 		leaf = path->nodes[0];
1196 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1197 			ret = btrfs_next_leaf(root, path);
1198 			if (ret < 0) {
1199 				btrfs_abort_transaction(trans, root, ret);
1200 				goto error;
1201 			}
1202 			if (ret > 0)
1203 				break;
1204 			leaf = path->nodes[0];
1205 		}
1206 
1207 		nocow = 0;
1208 		disk_bytenr = 0;
1209 		num_bytes = 0;
1210 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1211 
1212 		if (found_key.objectid > ino ||
1213 		    found_key.type > BTRFS_EXTENT_DATA_KEY ||
1214 		    found_key.offset > end)
1215 			break;
1216 
1217 		if (found_key.offset > cur_offset) {
1218 			extent_end = found_key.offset;
1219 			extent_type = 0;
1220 			goto out_check;
1221 		}
1222 
1223 		fi = btrfs_item_ptr(leaf, path->slots[0],
1224 				    struct btrfs_file_extent_item);
1225 		extent_type = btrfs_file_extent_type(leaf, fi);
1226 
1227 		ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1228 		if (extent_type == BTRFS_FILE_EXTENT_REG ||
1229 		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1230 			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1231 			extent_offset = btrfs_file_extent_offset(leaf, fi);
1232 			extent_end = found_key.offset +
1233 				btrfs_file_extent_num_bytes(leaf, fi);
1234 			disk_num_bytes =
1235 				btrfs_file_extent_disk_num_bytes(leaf, fi);
1236 			if (extent_end <= start) {
1237 				path->slots[0]++;
1238 				goto next_slot;
1239 			}
1240 			if (disk_bytenr == 0)
1241 				goto out_check;
1242 			if (btrfs_file_extent_compression(leaf, fi) ||
1243 			    btrfs_file_extent_encryption(leaf, fi) ||
1244 			    btrfs_file_extent_other_encoding(leaf, fi))
1245 				goto out_check;
1246 			if (extent_type == BTRFS_FILE_EXTENT_REG && !force)
1247 				goto out_check;
1248 			if (btrfs_extent_readonly(root, disk_bytenr))
1249 				goto out_check;
1250 			if (btrfs_cross_ref_exist(trans, root, ino,
1251 						  found_key.offset -
1252 						  extent_offset, disk_bytenr))
1253 				goto out_check;
1254 			disk_bytenr += extent_offset;
1255 			disk_bytenr += cur_offset - found_key.offset;
1256 			num_bytes = min(end + 1, extent_end) - cur_offset;
1257 			/*
1258 			 * force cow if csum exists in the range.
1259 			 * this ensures that csums for a given extent are
1260 			 * either valid or do not exist.
1261 			 */
1262 			if (csum_exist_in_range(root, disk_bytenr, num_bytes))
1263 				goto out_check;
1264 			nocow = 1;
1265 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
1266 			extent_end = found_key.offset +
1267 				btrfs_file_extent_inline_len(leaf, fi);
1268 			extent_end = ALIGN(extent_end, root->sectorsize);
1269 		} else {
1270 			BUG_ON(1);
1271 		}
1272 out_check:
1273 		if (extent_end <= start) {
1274 			path->slots[0]++;
1275 			goto next_slot;
1276 		}
1277 		if (!nocow) {
1278 			if (cow_start == (u64)-1)
1279 				cow_start = cur_offset;
1280 			cur_offset = extent_end;
1281 			if (cur_offset > end)
1282 				break;
1283 			path->slots[0]++;
1284 			goto next_slot;
1285 		}
1286 
1287 		btrfs_release_path(path);
1288 		if (cow_start != (u64)-1) {
1289 			ret = cow_file_range(inode, locked_page,
1290 					     cow_start, found_key.offset - 1,
1291 					     page_started, nr_written, 1);
1292 			if (ret) {
1293 				btrfs_abort_transaction(trans, root, ret);
1294 				goto error;
1295 			}
1296 			cow_start = (u64)-1;
1297 		}
1298 
1299 		if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
1300 			struct extent_map *em;
1301 			struct extent_map_tree *em_tree;
1302 			em_tree = &BTRFS_I(inode)->extent_tree;
1303 			em = alloc_extent_map();
1304 			BUG_ON(!em); /* -ENOMEM */
1305 			em->start = cur_offset;
1306 			em->orig_start = found_key.offset - extent_offset;
1307 			em->len = num_bytes;
1308 			em->block_len = num_bytes;
1309 			em->block_start = disk_bytenr;
1310 			em->orig_block_len = disk_num_bytes;
1311 			em->ram_bytes = ram_bytes;
1312 			em->bdev = root->fs_info->fs_devices->latest_bdev;
1313 			em->mod_start = em->start;
1314 			em->mod_len = em->len;
1315 			set_bit(EXTENT_FLAG_PINNED, &em->flags);
1316 			set_bit(EXTENT_FLAG_FILLING, &em->flags);
1317 			em->generation = -1;
1318 			while (1) {
1319 				write_lock(&em_tree->lock);
1320 				ret = add_extent_mapping(em_tree, em, 1);
1321 				write_unlock(&em_tree->lock);
1322 				if (ret != -EEXIST) {
1323 					free_extent_map(em);
1324 					break;
1325 				}
1326 				btrfs_drop_extent_cache(inode, em->start,
1327 						em->start + em->len - 1, 0);
1328 			}
1329 			type = BTRFS_ORDERED_PREALLOC;
1330 		} else {
1331 			type = BTRFS_ORDERED_NOCOW;
1332 		}
1333 
1334 		ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr,
1335 					       num_bytes, num_bytes, type);
1336 		BUG_ON(ret); /* -ENOMEM */
1337 
1338 		if (root->root_key.objectid ==
1339 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
1340 			ret = btrfs_reloc_clone_csums(inode, cur_offset,
1341 						      num_bytes);
1342 			if (ret) {
1343 				btrfs_abort_transaction(trans, root, ret);
1344 				goto error;
1345 			}
1346 		}
1347 
1348 		extent_clear_unlock_delalloc(inode, cur_offset,
1349 					     cur_offset + num_bytes - 1,
1350 					     locked_page, EXTENT_LOCKED |
1351 					     EXTENT_DELALLOC, PAGE_UNLOCK |
1352 					     PAGE_SET_PRIVATE2);
1353 		cur_offset = extent_end;
1354 		if (cur_offset > end)
1355 			break;
1356 	}
1357 	btrfs_release_path(path);
1358 
1359 	if (cur_offset <= end && cow_start == (u64)-1) {
1360 		cow_start = cur_offset;
1361 		cur_offset = end;
1362 	}
1363 
1364 	if (cow_start != (u64)-1) {
1365 		ret = cow_file_range(inode, locked_page, cow_start, end,
1366 				     page_started, nr_written, 1);
1367 		if (ret) {
1368 			btrfs_abort_transaction(trans, root, ret);
1369 			goto error;
1370 		}
1371 	}
1372 
1373 error:
1374 	err = btrfs_end_transaction(trans, root);
1375 	if (!ret)
1376 		ret = err;
1377 
1378 	if (ret && cur_offset < end)
1379 		extent_clear_unlock_delalloc(inode, cur_offset, end,
1380 					     locked_page, EXTENT_LOCKED |
1381 					     EXTENT_DELALLOC | EXTENT_DEFRAG |
1382 					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
1383 					     PAGE_CLEAR_DIRTY |
1384 					     PAGE_SET_WRITEBACK |
1385 					     PAGE_END_WRITEBACK);
1386 	btrfs_free_path(path);
1387 	return ret;
1388 }
1389 
1390 /*
1391  * extent_io.c callback to do delayed allocation processing
1392  */
1393 static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1394 			      u64 start, u64 end, int *page_started,
1395 			      unsigned long *nr_written)
1396 {
1397 	int ret;
1398 	struct btrfs_root *root = BTRFS_I(inode)->root;
1399 
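	/*
	 * pick a strategy: nocow paths for NODATACOW and PREALLOC inodes,
	 * plain cow when compression is off, async compressed cow otherwise
	 */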
1400 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) {
1401 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1402 					 page_started, 1, nr_written);
1403 	} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) {
1404 		ret = run_delalloc_nocow(inode, locked_page, start, end,
1405 					 page_started, 0, nr_written);
1406 	} else if (!btrfs_test_opt(root, COMPRESS) &&
1407 		   !(BTRFS_I(inode)->force_compress) &&
1408 		   !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) {
1409 		ret = cow_file_range(inode, locked_page, start, end,
1410 				      page_started, nr_written, 1);
1411 	} else {
1412 		set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1413 			&BTRFS_I(inode)->runtime_flags);
1414 		ret = cow_file_range_async(inode, locked_page, start, end,
1415 					   page_started, nr_written);
1416 	}
1417 	return ret;
1418 }
1419 
1420 static void btrfs_split_extent_hook(struct inode *inode,
1421 				    struct extent_state *orig, u64 split)
1422 {
1423 	/* not delalloc, ignore it */
1424 	if (!(orig->state & EXTENT_DELALLOC))
1425 		return;
1426 
1427 	spin_lock(&BTRFS_I(inode)->lock);
1428 	BTRFS_I(inode)->outstanding_extents++;
1429 	spin_unlock(&BTRFS_I(inode)->lock);
1430 }
1431 
1432 /*
1433  * extent_io.c merge_extent_hook, used to track delayed allocation extents
1434  * that get merged onto old ones, such as when we are doing sequential
1435  * writes, so we can properly account for the metadata space we'll
1436  * need.
1437  */
1438 static void btrfs_merge_extent_hook(struct inode *inode,
1439 				    struct extent_state *new,
1440 				    struct extent_state *other)
1441 {
1442 	/* not delalloc, ignore it */
1443 	if (!(other->state & EXTENT_DELALLOC))
1444 		return;
1445 
1446 	spin_lock(&BTRFS_I(inode)->lock);
1447 	BTRFS_I(inode)->outstanding_extents--;
1448 	spin_unlock(&BTRFS_I(inode)->lock);
1449 }
1450 
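/*
 * put the inode on its root's delalloc list; the root's first delalloc
 * inode also puts the root itself on the fs-wide delalloc_roots list
 */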
1451 static void btrfs_add_delalloc_inodes(struct btrfs_root *root,
1452 				      struct inode *inode)
1453 {
1454 	spin_lock(&root->delalloc_lock);
1455 	if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1456 		list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1457 			      &root->delalloc_inodes);
1458 		set_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1459 			&BTRFS_I(inode)->runtime_flags);
1460 		root->nr_delalloc_inodes++;
1461 		if (root->nr_delalloc_inodes == 1) {
1462 			spin_lock(&root->fs_info->delalloc_root_lock);
1463 			BUG_ON(!list_empty(&root->delalloc_root));
1464 			list_add_tail(&root->delalloc_root,
1465 				      &root->fs_info->delalloc_roots);
1466 			spin_unlock(&root->fs_info->delalloc_root_lock);
1467 		}
1468 	}
1469 	spin_unlock(&root->delalloc_lock);
1470 }
1471 
1472 static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1473 				     struct inode *inode)
1474 {
1475 	spin_lock(&root->delalloc_lock);
1476 	if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1477 		list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1478 		clear_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1479 			  &BTRFS_I(inode)->runtime_flags);
1480 		root->nr_delalloc_inodes--;
1481 		if (!root->nr_delalloc_inodes) {
1482 			spin_lock(&root->fs_info->delalloc_root_lock);
1483 			BUG_ON(list_empty(&root->delalloc_root));
1484 			list_del_init(&root->delalloc_root);
1485 			spin_unlock(&root->fs_info->delalloc_root_lock);
1486 		}
1487 	}
1488 	spin_unlock(&root->delalloc_lock);
1489 }
1490 
1491 /*
1492  * extent_io.c set_bit_hook, used to track delayed allocation
1493  * bytes in this file, and to maintain the list of inodes that
1494  * have pending delalloc work to be done.
1495  */
1496 static void btrfs_set_bit_hook(struct inode *inode,
1497 			       struct extent_state *state, unsigned long *bits)
1498 {
1499 
1500 	/*
1501 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1502 	 * but in this case, we are only testing for the DELALLOC
1503 	 * bit, which is only set or cleared with irqs on
1504 	 */
1505 	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1506 		struct btrfs_root *root = BTRFS_I(inode)->root;
1507 		u64 len = state->end + 1 - state->start;
1508 		bool do_list = !btrfs_is_free_space_inode(inode);
1509 
1510 		if (*bits & EXTENT_FIRST_DELALLOC) {
1511 			*bits &= ~EXTENT_FIRST_DELALLOC;
1512 		} else {
1513 			spin_lock(&BTRFS_I(inode)->lock);
1514 			BTRFS_I(inode)->outstanding_extents++;
1515 			spin_unlock(&BTRFS_I(inode)->lock);
1516 		}
1517 
1518 		__percpu_counter_add(&root->fs_info->delalloc_bytes, len,
1519 				     root->fs_info->delalloc_batch);
1520 		spin_lock(&BTRFS_I(inode)->lock);
1521 		BTRFS_I(inode)->delalloc_bytes += len;
1522 		if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1523 					 &BTRFS_I(inode)->runtime_flags))
1524 			btrfs_add_delalloc_inodes(root, inode);
1525 		spin_unlock(&BTRFS_I(inode)->lock);
1526 	}
1527 }
1528 
1529 /*
1530  * extent_io.c clear_bit_hook, see set_bit_hook for why
1531  */
1532 static void btrfs_clear_bit_hook(struct inode *inode,
1533 				 struct extent_state *state,
1534 				 unsigned long *bits)
1535 {
1536 	/*
1537 	 * set_bit and clear_bit hooks normally require _irqsave/restore
1538 	 * but in this case, we are only testing for the DELALLOC
1539 	 * bit, which is only set or cleared with irqs on
1540 	 */
1541 	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1542 		struct btrfs_root *root = BTRFS_I(inode)->root;
1543 		u64 len = state->end + 1 - state->start;
1544 		bool do_list = !btrfs_is_free_space_inode(inode);
1545 
1546 		if (*bits & EXTENT_FIRST_DELALLOC) {
1547 			*bits &= ~EXTENT_FIRST_DELALLOC;
1548 		} else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1549 			spin_lock(&BTRFS_I(inode)->lock);
1550 			BTRFS_I(inode)->outstanding_extents--;
1551 			spin_unlock(&BTRFS_I(inode)->lock);
1552 		}
1553 
1554 		if (*bits & EXTENT_DO_ACCOUNTING)
1555 			btrfs_delalloc_release_metadata(inode, len);
1556 
1557 		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1558 		    && do_list && !(state->state & EXTENT_NORESERVE))
1559 			btrfs_free_reserved_data_space(inode, len);
1560 
1561 		__percpu_counter_add(&root->fs_info->delalloc_bytes, -len,
1562 				     root->fs_info->delalloc_batch);
1563 		spin_lock(&BTRFS_I(inode)->lock);
1564 		BTRFS_I(inode)->delalloc_bytes -= len;
1565 		if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1566 		    test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
1567 			     &BTRFS_I(inode)->runtime_flags))
1568 			btrfs_del_delalloc_inode(root, inode);
1569 		spin_unlock(&BTRFS_I(inode)->lock);
1570 	}
1571 }
1572 
1573 /*
1574  * extent_io.c merge_bio_hook: this must check the chunk tree to make sure
1575  * we don't create bios that span stripes or chunks
1576  */
1577 int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
1578 			 size_t size, struct bio *bio,
1579 			 unsigned long bio_flags)
1580 {
1581 	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
1582 	u64 logical = (u64)bio->bi_sector << 9;
1583 	u64 length = 0;
1584 	u64 map_length;
1585 	int ret;
1586 
1587 	if (bio_flags & EXTENT_BIO_COMPRESSED)
1588 		return 0;
1589 
1590 	length = bio->bi_size;
1591 	map_length = length;
1592 	ret = btrfs_map_block(root->fs_info, rw, logical,
1593 			      &map_length, NULL, 0);
1594 	/* Will always return 0 with map_multi == NULL */
1595 	BUG_ON(ret < 0);
1596 	if (map_length < length + size)
1597 		return 1;
1598 	return 0;
1599 }
1600 
1601 /*
1602  * in order to insert checksums into the metadata in large chunks,
1603  * we wait until bio submission time.   All the pages in the bio are
1604  * checksummed and sums are attached onto the ordered extent record.
1605  *
1606  * At IO completion time the csums attached to the ordered extent record
1607  * are inserted into the btree
1608  */
1609 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
1610 				    struct bio *bio, int mirror_num,
1611 				    unsigned long bio_flags,
1612 				    u64 bio_offset)
1613 {
1614 	struct btrfs_root *root = BTRFS_I(inode)->root;
1615 	int ret = 0;
1616 
1617 	ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1618 	BUG_ON(ret); /* -ENOMEM */
1619 	return 0;
1620 }
1621 
1622 /*
1623  * in order to insert checksums into the metadata in large chunks,
1624  * we wait until bio submission time.   All the pages in the bio are
1625  * checksummed and sums are attached onto the ordered extent record.
1626  *
1627  * At IO completion time the csums attached to the ordered extent record
1628  * are inserted into the btree
1629  */
1630 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
1631 			  int mirror_num, unsigned long bio_flags,
1632 			  u64 bio_offset)
1633 {
1634 	struct btrfs_root *root = BTRFS_I(inode)->root;
1635 	int ret;
1636 
1637 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 1);
1638 	if (ret)
1639 		bio_endio(bio, ret);
1640 	return ret;
1641 }
1642 
1643 /*
1644  * extent_io.c submission hook. This does the right thing for csum calculation
1645  * on write, or reading the csums from the tree before a read
1646  */
1647 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1648 			  int mirror_num, unsigned long bio_flags,
1649 			  u64 bio_offset)
1650 {
1651 	struct btrfs_root *root = BTRFS_I(inode)->root;
1652 	int ret = 0;
1653 	int skip_sum;
1654 	int metadata = 0;
1655 	int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
1656 
1657 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1658 
1659 	if (btrfs_is_free_space_inode(inode))
1660 		metadata = 2;
1661 
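	/*
	 * reads: hand completion off to the end_io workqueue and look up
	 * csums before mapping the bio; writes: checksum inline, or via the
	 * async helpers when no one is doing synchronous writes to the inode
	 */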
1662 	if (!(rw & REQ_WRITE)) {
1663 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, metadata);
1664 		if (ret)
1665 			goto out;
1666 
1667 		if (bio_flags & EXTENT_BIO_COMPRESSED) {
1668 			ret = btrfs_submit_compressed_read(inode, bio,
1669 							   mirror_num,
1670 							   bio_flags);
1671 			goto out;
1672 		} else if (!skip_sum) {
1673 			ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1674 			if (ret)
1675 				goto out;
1676 		}
1677 		goto mapit;
1678 	} else if (async && !skip_sum) {
1679 		/* csum items have already been cloned */
1680 		if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
1681 			goto mapit;
1682 		/* we're doing a write, do the async checksumming */
1683 		ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
1684 				   inode, rw, bio, mirror_num,
1685 				   bio_flags, bio_offset,
1686 				   __btrfs_submit_bio_start,
1687 				   __btrfs_submit_bio_done);
1688 		goto out;
1689 	} else if (!skip_sum) {
1690 		ret = btrfs_csum_one_bio(root, inode, bio, 0, 0);
1691 		if (ret)
1692 			goto out;
1693 	}
1694 
1695 mapit:
1696 	ret = btrfs_map_bio(root, rw, bio, mirror_num, 0);
1697 
1698 out:
1699 	if (ret < 0)
1700 		bio_endio(bio, ret);
1701 	return ret;
1702 }
1703 
1704 /*
1705  * given a list of ordered sums, record them in the inode.  This happens
1706  * at IO completion time based on sums calculated at bio submission time.
1707  */
1708 static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1709 			     struct inode *inode, u64 file_offset,
1710 			     struct list_head *list)
1711 {
1712 	struct btrfs_ordered_sum *sum;
1713 
1714 	list_for_each_entry(sum, list, list) {
1715 		trans->adding_csums = 1;
1716 		btrfs_csum_file_blocks(trans,
1717 		       BTRFS_I(inode)->root->fs_info->csum_root, sum);
1718 		trans->adding_csums = 0;
1719 	}
1720 	return 0;
1721 }
1722 
1723 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
1724 			      struct extent_state **cached_state)
1725 {
1726 	WARN_ON((end & (PAGE_CACHE_SIZE - 1)) == 0);
1727 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
1728 				   cached_state, GFP_NOFS);
1729 }
1730 
1731 /* see btrfs_writepage_start_hook for details on why this is required */
1732 struct btrfs_writepage_fixup {
1733 	struct page *page;
1734 	struct btrfs_work work;
1735 };
1736 
1737 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
1738 {
1739 	struct btrfs_writepage_fixup *fixup;
1740 	struct btrfs_ordered_extent *ordered;
1741 	struct extent_state *cached_state = NULL;
1742 	struct page *page;
1743 	struct inode *inode;
1744 	u64 page_start;
1745 	u64 page_end;
1746 	int ret;
1747 
1748 	fixup = container_of(work, struct btrfs_writepage_fixup, work);
1749 	page = fixup->page;
1750 again:
1751 	lock_page(page);
1752 	if (!page->mapping || !PageDirty(page) || !PageChecked(page)) {
1753 		ClearPageChecked(page);
1754 		goto out_page;
1755 	}
1756 
1757 	inode = page->mapping->host;
1758 	page_start = page_offset(page);
1759 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
1760 
1761 	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
1762 			 &cached_state);
1763 
1764 	/* already ordered? We're done */
1765 	if (PagePrivate2(page))
1766 		goto out;
1767 
1768 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
1769 	if (ordered) {
1770 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
1771 				     page_end, &cached_state, GFP_NOFS);
1772 		unlock_page(page);
1773 		btrfs_start_ordered_extent(inode, ordered, 1);
1774 		btrfs_put_ordered_extent(ordered);
1775 		goto again;
1776 	}
1777 
1778 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
1779 	if (ret) {
1780 		mapping_set_error(page->mapping, ret);
1781 		end_extent_writepage(page, ret, page_start, page_end);
1782 		ClearPageChecked(page);
1783 		goto out;
1784 	}
1785 
1786 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
1787 	ClearPageChecked(page);
1788 	set_page_dirty(page);
1789 out:
1790 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
1791 			     &cached_state, GFP_NOFS);
1792 out_page:
1793 	unlock_page(page);
1794 	page_cache_release(page);
1795 	kfree(fixup);
1796 }
1797 
1798 /*
1799  * There are a few paths in the higher layers of the kernel that directly
1800  * set the page dirty bit without asking the filesystem if it is a
1801  * good idea.  This causes problems because we want to make sure COW
1802  * properly happens and the data=ordered rules are followed.
1803  *
1804  * In our case any range that doesn't have the ORDERED bit set
1805  * hasn't been properly set up for IO.  We kick off an async process
1806  * to fix it up.  The async helper will wait for ordered extents, set
1807  * the delalloc bit and make it safe to write the page.
1808  */
1809 static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1810 {
1811 	struct inode *inode = page->mapping->host;
1812 	struct btrfs_writepage_fixup *fixup;
1813 	struct btrfs_root *root = BTRFS_I(inode)->root;
1814 
1815 	/* this page is properly in the ordered list */
1816 	if (TestClearPagePrivate2(page))
1817 		return 0;
1818 
1819 	if (PageChecked(page))
1820 		return -EAGAIN;
1821 
1822 	fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
1823 	if (!fixup)
1824 		return -EAGAIN;
1825 
1826 	SetPageChecked(page);
1827 	page_cache_get(page);
1828 	fixup->work.func = btrfs_writepage_fixup_worker;
1829 	fixup->page = page;
1830 	btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work);
1831 	return -EBUSY;
1832 }
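
/*
 * Editor's note: the three return values above are interpreted by the
 * caller (__extent_writepage() in extent_io.c in this era): 0 lets the
 * write proceed, -EBUSY means a fixup worker was queued and the page is
 * counted as skipped, and -EAGAIN makes the caller redirty the page and
 * retry later.
 */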
1833 
1834 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1835 				       struct inode *inode, u64 file_pos,
1836 				       u64 disk_bytenr, u64 disk_num_bytes,
1837 				       u64 num_bytes, u64 ram_bytes,
1838 				       u8 compression, u8 encryption,
1839 				       u16 other_encoding, int extent_type)
1840 {
1841 	struct btrfs_root *root = BTRFS_I(inode)->root;
1842 	struct btrfs_file_extent_item *fi;
1843 	struct btrfs_path *path;
1844 	struct extent_buffer *leaf;
1845 	struct btrfs_key ins;
1846 	int ret;
1847 
1848 	path = btrfs_alloc_path();
1849 	if (!path)
1850 		return -ENOMEM;
1851 
1852 	path->leave_spinning = 1;
1853 
1854 	/*
1855 	 * we may be replacing one extent in the tree with another.
1856 	 * The new extent is pinned in the extent map, and we don't want
1857 	 * to drop it from the cache until it is completely in the btree.
1858 	 *
1859 	 * So, tell btrfs_drop_extents to leave this extent in the cache.
1860 	 * the caller is expected to unpin it and allow it to be merged
1861 	 * with the others.
1862 	 */
1863 	ret = btrfs_drop_extents(trans, root, inode, file_pos,
1864 				 file_pos + num_bytes, 0);
1865 	if (ret)
1866 		goto out;
1867 
1868 	ins.objectid = btrfs_ino(inode);
1869 	ins.offset = file_pos;
1870 	ins.type = BTRFS_EXTENT_DATA_KEY;
1871 	ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
1872 	if (ret)
1873 		goto out;
1874 	leaf = path->nodes[0];
1875 	fi = btrfs_item_ptr(leaf, path->slots[0],
1876 			    struct btrfs_file_extent_item);
1877 	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
1878 	btrfs_set_file_extent_type(leaf, fi, extent_type);
1879 	btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr);
1880 	btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes);
1881 	btrfs_set_file_extent_offset(leaf, fi, 0);
1882 	btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
1883 	btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes);
1884 	btrfs_set_file_extent_compression(leaf, fi, compression);
1885 	btrfs_set_file_extent_encryption(leaf, fi, encryption);
1886 	btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
1887 
1888 	btrfs_mark_buffer_dirty(leaf);
1889 	btrfs_release_path(path);
1890 
1891 	inode_add_bytes(inode, num_bytes);
1892 
1893 	ins.objectid = disk_bytenr;
1894 	ins.offset = disk_num_bytes;
1895 	ins.type = BTRFS_EXTENT_ITEM_KEY;
1896 	ret = btrfs_alloc_reserved_file_extent(trans, root,
1897 					root->root_key.objectid,
1898 					btrfs_ino(inode), file_pos, &ins);
1899 out:
1900 	btrfs_free_path(path);
1901 
1902 	return ret;
1903 }
1904 
1905 /* snapshot-aware defrag */
1906 struct sa_defrag_extent_backref {
1907 	struct rb_node node;
1908 	struct old_sa_defrag_extent *old;
1909 	u64 root_id;
1910 	u64 inum;
1911 	u64 file_pos;
1912 	u64 extent_offset;
1913 	u64 num_bytes;
1914 	u64 generation;
1915 };
1916 
1917 struct old_sa_defrag_extent {
1918 	struct list_head list;
1919 	struct new_sa_defrag_extent *new;
1920 
1921 	u64 extent_offset;
1922 	u64 bytenr;
1923 	u64 offset;
1924 	u64 len;
1925 	int count;
1926 };
1927 
1928 struct new_sa_defrag_extent {
1929 	struct rb_root root;
1930 	struct list_head head;
1931 	struct btrfs_path *path;
1932 	struct inode *inode;
1933 	u64 file_pos;
1934 	u64 len;
1935 	u64 bytenr;
1936 	u64 disk_len;
1937 	u8 compress_type;
1938 };
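
/*
 * Editor's note on how these three structures relate: one
 * new_sa_defrag_extent is built per defragged ordered extent.  Its
 * ->head lists the old_sa_defrag_extent ranges that covered the file
 * range before the defrag (each old->new points back), and ->root is an
 * rb-tree of every sa_defrag_extent_backref found for those old
 * extents, sorted by (root_id, inum, file_pos); each backref->old names
 * the old extent it was recorded for.
 */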
1939 
1940 static int backref_comp(struct sa_defrag_extent_backref *b1,
1941 			struct sa_defrag_extent_backref *b2)
1942 {
1943 	if (b1->root_id < b2->root_id)
1944 		return -1;
1945 	else if (b1->root_id > b2->root_id)
1946 		return 1;
1947 
1948 	if (b1->inum < b2->inum)
1949 		return -1;
1950 	else if (b1->inum > b2->inum)
1951 		return 1;
1952 
1953 	if (b1->file_pos < b2->file_pos)
1954 		return -1;
1955 	else if (b1->file_pos > b2->file_pos)
1956 		return 1;
1957 
1958 	/*
1959 	 * [------------------------------] ===> (a range of space)
1960 	 *     |<--->|   |<---->| =============> (fs/file tree A)
1961 	 * |<---------------------------->| ===> (fs/file tree B)
1962 	 *
1963 	 * A range of space can refer to two file extents in one tree while
1964 	 * refer to only one file extent in another tree.
1965 	 *
1966  * So we may process the same disk offset more than once (two extents
1967  * in A) while landing on the same extent (one extent in B), and thus
1968  * insert two identical backrefs (both referring to the extent in B).
1969 	 */
1970 	return 0;
1971 }
1972 
1973 static void backref_insert(struct rb_root *root,
1974 			   struct sa_defrag_extent_backref *backref)
1975 {
1976 	struct rb_node **p = &root->rb_node;
1977 	struct rb_node *parent = NULL;
1978 	struct sa_defrag_extent_backref *entry;
1979 	int ret;
1980 
1981 	while (*p) {
1982 		parent = *p;
1983 		entry = rb_entry(parent, struct sa_defrag_extent_backref, node);
1984 
1985 		ret = backref_comp(backref, entry);
1986 		if (ret < 0)
1987 			p = &(*p)->rb_left;
1988 		else
1989 			p = &(*p)->rb_right;
1990 	}
1991 
1992 	rb_link_node(&backref->node, parent, p);
1993 	rb_insert_color(&backref->node, root);
1994 }
1995 
1996 /*
1997  * Note the backref might have changed, and in this case we just return 0.
1998  */
1999 static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
2000 				       void *ctx)
2001 {
2002 	struct btrfs_file_extent_item *extent;
2003 	struct btrfs_fs_info *fs_info;
2004 	struct old_sa_defrag_extent *old = ctx;
2005 	struct new_sa_defrag_extent *new = old->new;
2006 	struct btrfs_path *path = new->path;
2007 	struct btrfs_key key;
2008 	struct btrfs_root *root;
2009 	struct sa_defrag_extent_backref *backref;
2010 	struct extent_buffer *leaf;
2011 	struct inode *inode = new->inode;
2012 	int slot;
2013 	int ret;
2014 	u64 extent_offset;
2015 	u64 num_bytes;
2016 
2017 	if (BTRFS_I(inode)->root->root_key.objectid == root_id &&
2018 	    inum == btrfs_ino(inode))
2019 		return 0;
2020 
2021 	key.objectid = root_id;
2022 	key.type = BTRFS_ROOT_ITEM_KEY;
2023 	key.offset = (u64)-1;
2024 
2025 	fs_info = BTRFS_I(inode)->root->fs_info;
2026 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2027 	if (IS_ERR(root)) {
2028 		if (PTR_ERR(root) == -ENOENT)
2029 			return 0;
2030 		WARN_ON(1);
2031 		pr_debug("inum=%llu, offset=%llu, root_id=%llu\n",
2032 			 inum, offset, root_id);
2033 		return PTR_ERR(root);
2034 	}
2035 
2036 	key.objectid = inum;
2037 	key.type = BTRFS_EXTENT_DATA_KEY;
2038 	if (offset > (u64)-1 << 32)
2039 		key.offset = 0;
2040 	else
2041 		key.offset = offset;
2042 
2043 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2044 	if (ret < 0) {
2045 		WARN_ON(1);
2046 		return ret;
2047 	}
2048 	ret = 0;
2049 
2050 	while (1) {
2051 		cond_resched();
2052 
2053 		leaf = path->nodes[0];
2054 		slot = path->slots[0];
2055 
2056 		if (slot >= btrfs_header_nritems(leaf)) {
2057 			ret = btrfs_next_leaf(root, path);
2058 			if (ret < 0) {
2059 				goto out;
2060 			} else if (ret > 0) {
2061 				ret = 0;
2062 				goto out;
2063 			}
2064 			continue;
2065 		}
2066 
2067 		path->slots[0]++;
2068 
2069 		btrfs_item_key_to_cpu(leaf, &key, slot);
2070 
2071 		if (key.objectid > inum)
2072 			goto out;
2073 
2074 		if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY)
2075 			continue;
2076 
2077 		extent = btrfs_item_ptr(leaf, slot,
2078 					struct btrfs_file_extent_item);
2079 
2080 		if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr)
2081 			continue;
2082 
2083 		/*
2084 		 * 'offset' refers to the exact key.offset,
2085 		 * NOT the 'offset' field in btrfs_extent_data_ref, ie.
2086 		 * (key.offset - extent_offset).
2087 		 */
2088 		if (key.offset != offset)
2089 			continue;
2090 
2091 		extent_offset = btrfs_file_extent_offset(leaf, extent);
2092 		num_bytes = btrfs_file_extent_num_bytes(leaf, extent);
2093 
2094 		if (extent_offset >= old->extent_offset + old->offset +
2095 		    old->len || extent_offset + num_bytes <=
2096 		    old->extent_offset + old->offset)
2097 			continue;
2098 		break;
2099 	}
2100 
2101 	backref = kmalloc(sizeof(*backref), GFP_NOFS);
2102 	if (!backref) {
2103 		ret = -ENOMEM;
2104 		goto out;
2105 	}
2106 
2107 	backref->root_id = root_id;
2108 	backref->inum = inum;
2109 	backref->file_pos = offset;
2110 	backref->num_bytes = num_bytes;
2111 	backref->extent_offset = extent_offset;
2112 	backref->generation = btrfs_file_extent_generation(leaf, extent);
2113 	backref->old = old;
2114 	backref_insert(&new->root, backref);
2115 	old->count++;
2116 out:
2117 	btrfs_release_path(path);
2118 	WARN_ON(ret);
2119 	return ret;
2120 }
2121 
2122 static noinline bool record_extent_backrefs(struct btrfs_path *path,
2123 				   struct new_sa_defrag_extent *new)
2124 {
2125 	struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info;
2126 	struct old_sa_defrag_extent *old, *tmp;
2127 	int ret;
2128 
2129 	new->path = path;
2130 
2131 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2132 		ret = iterate_inodes_from_logical(old->bytenr +
2133 						  old->extent_offset, fs_info,
2134 						  path, record_one_backref,
2135 						  old);
2136 		BUG_ON(ret < 0 && ret != -ENOENT && ret != -ENOMEM);
2137 
2138 		/* no backref to be processed for this extent */
2139 		if (!old->count) {
2140 			list_del(&old->list);
2141 			kfree(old);
2142 		}
2143 	}
2144 
2145 	if (list_empty(&new->head))
2146 		return false;
2147 
2148 	return true;
2149 }
2150 
2151 static int relink_is_mergable(struct extent_buffer *leaf,
2152 			      struct btrfs_file_extent_item *fi,
2153 			      struct new_sa_defrag_extent *new)
2154 {
2155 	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
2156 		return 0;
2157 
2158 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2159 		return 0;
2160 
2161 	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
2162 		return 0;
2163 
2164 	if (btrfs_file_extent_encryption(leaf, fi) ||
2165 	    btrfs_file_extent_other_encoding(leaf, fi))
2166 		return 0;
2167 
2168 	return 1;
2169 }
2170 
2171 /*
2172  * Note the backref might have changed, and in this case we just return 0.
2173  */
2174 static noinline int relink_extent_backref(struct btrfs_path *path,
2175 				 struct sa_defrag_extent_backref *prev,
2176 				 struct sa_defrag_extent_backref *backref)
2177 {
2178 	struct btrfs_file_extent_item *extent;
2179 	struct btrfs_file_extent_item *item;
2180 	struct btrfs_ordered_extent *ordered;
2181 	struct btrfs_trans_handle *trans;
2182 	struct btrfs_fs_info *fs_info;
2183 	struct btrfs_root *root;
2184 	struct btrfs_key key;
2185 	struct extent_buffer *leaf;
2186 	struct old_sa_defrag_extent *old = backref->old;
2187 	struct new_sa_defrag_extent *new = old->new;
2188 	struct inode *src_inode = new->inode;
2189 	struct inode *inode;
2190 	struct extent_state *cached = NULL;
2191 	int ret = 0;
2192 	u64 start;
2193 	u64 len;
2194 	u64 lock_start;
2195 	u64 lock_end;
2196 	bool merge = false;
2197 	int index;
2198 
2199 	if (prev && prev->root_id == backref->root_id &&
2200 	    prev->inum == backref->inum &&
2201 	    prev->file_pos + prev->num_bytes == backref->file_pos)
2202 		merge = true;
2203 
2204 	/* step 1: get root */
2205 	key.objectid = backref->root_id;
2206 	key.type = BTRFS_ROOT_ITEM_KEY;
2207 	key.offset = (u64)-1;
2208 
2209 	fs_info = BTRFS_I(src_inode)->root->fs_info;
2210 	index = srcu_read_lock(&fs_info->subvol_srcu);
2211 
2212 	root = btrfs_read_fs_root_no_name(fs_info, &key);
2213 	if (IS_ERR(root)) {
2214 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2215 		if (PTR_ERR(root) == -ENOENT)
2216 			return 0;
2217 		return PTR_ERR(root);
2218 	}
2219 
2220 	/* step 2: get inode */
2221 	key.objectid = backref->inum;
2222 	key.type = BTRFS_INODE_ITEM_KEY;
2223 	key.offset = 0;
2224 
2225 	inode = btrfs_iget(fs_info->sb, &key, root, NULL);
2226 	if (IS_ERR(inode)) {
2227 		srcu_read_unlock(&fs_info->subvol_srcu, index);
2228 		return 0;
2229 	}
2230 
2231 	srcu_read_unlock(&fs_info->subvol_srcu, index);
2232 
2233 	/* step 3: relink backref */
2234 	lock_start = backref->file_pos;
2235 	lock_end = backref->file_pos + backref->num_bytes - 1;
2236 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2237 			 0, &cached);
2238 
2239 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
2240 	if (ordered) {
2241 		btrfs_put_ordered_extent(ordered);
2242 		goto out_unlock;
2243 	}
2244 
2245 	trans = btrfs_join_transaction(root);
2246 	if (IS_ERR(trans)) {
2247 		ret = PTR_ERR(trans);
2248 		goto out_unlock;
2249 	}
2250 
2251 	key.objectid = backref->inum;
2252 	key.type = BTRFS_EXTENT_DATA_KEY;
2253 	key.offset = backref->file_pos;
2254 
2255 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2256 	if (ret < 0) {
2257 		goto out_free_path;
2258 	} else if (ret > 0) {
2259 		ret = 0;
2260 		goto out_free_path;
2261 	}
2262 
2263 	extent = btrfs_item_ptr(path->nodes[0], path->slots[0],
2264 				struct btrfs_file_extent_item);
2265 
2266 	if (btrfs_file_extent_generation(path->nodes[0], extent) !=
2267 	    backref->generation)
2268 		goto out_free_path;
2269 
2270 	btrfs_release_path(path);
2271 
2272 	start = backref->file_pos;
2273 	if (backref->extent_offset < old->extent_offset + old->offset)
2274 		start += old->extent_offset + old->offset -
2275 			 backref->extent_offset;
2276 
2277 	len = min(backref->extent_offset + backref->num_bytes,
2278 		  old->extent_offset + old->offset + old->len);
2279 	len -= max(backref->extent_offset, old->extent_offset + old->offset);
2280 
2281 	ret = btrfs_drop_extents(trans, root, inode, start,
2282 				 start + len, 1);
2283 	if (ret)
2284 		goto out_free_path;
2285 again:
2286 	key.objectid = btrfs_ino(inode);
2287 	key.type = BTRFS_EXTENT_DATA_KEY;
2288 	key.offset = start;
2289 
2290 	path->leave_spinning = 1;
2291 	if (merge) {
2292 		struct btrfs_file_extent_item *fi;
2293 		u64 extent_len;
2294 		struct btrfs_key found_key;
2295 
2296 		ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
2297 		if (ret < 0)
2298 			goto out_free_path;
2299 
2300 		path->slots[0]--;
2301 		leaf = path->nodes[0];
2302 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2303 
2304 		fi = btrfs_item_ptr(leaf, path->slots[0],
2305 				    struct btrfs_file_extent_item);
2306 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
2307 
2308 		if (extent_len + found_key.offset == start &&
2309 		    relink_is_mergable(leaf, fi, new)) {
2310 			btrfs_set_file_extent_num_bytes(leaf, fi,
2311 							extent_len + len);
2312 			btrfs_mark_buffer_dirty(leaf);
2313 			inode_add_bytes(inode, len);
2314 
2315 			ret = 1;
2316 			goto out_free_path;
2317 		} else {
2318 			merge = false;
2319 			btrfs_release_path(path);
2320 			goto again;
2321 		}
2322 	}
2323 
2324 	ret = btrfs_insert_empty_item(trans, root, path, &key,
2325 					sizeof(*extent));
2326 	if (ret) {
2327 		btrfs_abort_transaction(trans, root, ret);
2328 		goto out_free_path;
2329 	}
2330 
2331 	leaf = path->nodes[0];
2332 	item = btrfs_item_ptr(leaf, path->slots[0],
2333 				struct btrfs_file_extent_item);
2334 	btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr);
2335 	btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len);
2336 	btrfs_set_file_extent_offset(leaf, item, start - new->file_pos);
2337 	btrfs_set_file_extent_num_bytes(leaf, item, len);
2338 	btrfs_set_file_extent_ram_bytes(leaf, item, new->len);
2339 	btrfs_set_file_extent_generation(leaf, item, trans->transid);
2340 	btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG);
2341 	btrfs_set_file_extent_compression(leaf, item, new->compress_type);
2342 	btrfs_set_file_extent_encryption(leaf, item, 0);
2343 	btrfs_set_file_extent_other_encoding(leaf, item, 0);
2344 
2345 	btrfs_mark_buffer_dirty(leaf);
2346 	inode_add_bytes(inode, len);
2347 	btrfs_release_path(path);
2348 
2349 	ret = btrfs_inc_extent_ref(trans, root, new->bytenr,
2350 			new->disk_len, 0,
2351 			backref->root_id, backref->inum,
2352 			new->file_pos, 0);	/* start - extent_offset */
2353 	if (ret) {
2354 		btrfs_abort_transaction(trans, root, ret);
2355 		goto out_free_path;
2356 	}
2357 
2358 	ret = 1;
2359 out_free_path:
2360 	btrfs_release_path(path);
2361 	path->leave_spinning = 0;
2362 	btrfs_end_transaction(trans, root);
2363 out_unlock:
2364 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
2365 			     &cached, GFP_NOFS);
2366 	iput(inode);
2367 	return ret;
2368 }
2369 
2370 static void relink_file_extents(struct new_sa_defrag_extent *new)
2371 {
2372 	struct btrfs_path *path;
2373 	struct old_sa_defrag_extent *old, *tmp;
2374 	struct sa_defrag_extent_backref *backref;
2375 	struct sa_defrag_extent_backref *prev = NULL;
2376 	struct inode *inode;
2377 	struct btrfs_root *root;
2378 	struct rb_node *node;
2379 	int ret;
2380 
2381 	inode = new->inode;
2382 	root = BTRFS_I(inode)->root;
2383 
2384 	path = btrfs_alloc_path();
2385 	if (!path)
2386 		return;
2387 
2388 	if (!record_extent_backrefs(path, new)) {
2389 		btrfs_free_path(path);
2390 		goto out;
2391 	}
2392 	btrfs_release_path(path);
2393 
2394 	while (1) {
2395 		node = rb_first(&new->root);
2396 		if (!node)
2397 			break;
2398 		rb_erase(node, &new->root);
2399 
2400 		backref = rb_entry(node, struct sa_defrag_extent_backref, node);
2401 
2402 		ret = relink_extent_backref(path, prev, backref);
2403 		WARN_ON(ret < 0);
2404 
2405 		kfree(prev);
2406 
2407 		if (ret == 1)
2408 			prev = backref;
2409 		else
2410 			prev = NULL;
2411 		cond_resched();
2412 	}
2413 	kfree(prev);
2414 
2415 	btrfs_free_path(path);
2416 
2417 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2418 		list_del(&old->list);
2419 		kfree(old);
2420 	}
2421 out:
2422 	atomic_dec(&root->fs_info->defrag_running);
2423 	wake_up(&root->fs_info->transaction_wait);
2424 
2425 	kfree(new);
2426 }
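
/*
 * Editor's note: prev is only carried forward when
 * relink_extent_backref() returned 1, i.e. the backref was actually
 * written into the tree, so the merge test at the top of that function
 * never compares against a backref that failed to relink.
 */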
2427 
2428 static struct new_sa_defrag_extent *
2429 record_old_file_extents(struct inode *inode,
2430 			struct btrfs_ordered_extent *ordered)
2431 {
2432 	struct btrfs_root *root = BTRFS_I(inode)->root;
2433 	struct btrfs_path *path;
2434 	struct btrfs_key key;
2435 	struct old_sa_defrag_extent *old, *tmp;
2436 	struct new_sa_defrag_extent *new;
2437 	int ret;
2438 
2439 	new = kmalloc(sizeof(*new), GFP_NOFS);
2440 	if (!new)
2441 		return NULL;
2442 
2443 	new->inode = inode;
2444 	new->file_pos = ordered->file_offset;
2445 	new->len = ordered->len;
2446 	new->bytenr = ordered->start;
2447 	new->disk_len = ordered->disk_len;
2448 	new->compress_type = ordered->compress_type;
2449 	new->root = RB_ROOT;
2450 	INIT_LIST_HEAD(&new->head);
2451 
2452 	path = btrfs_alloc_path();
2453 	if (!path)
2454 		goto out_kfree;
2455 
2456 	key.objectid = btrfs_ino(inode);
2457 	key.type = BTRFS_EXTENT_DATA_KEY;
2458 	key.offset = new->file_pos;
2459 
2460 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2461 	if (ret < 0)
2462 		goto out_free_path;
2463 	if (ret > 0 && path->slots[0] > 0)
2464 		path->slots[0]--;
2465 
2466 	/* find out all the old extents for the file range */
2467 	while (1) {
2468 		struct btrfs_file_extent_item *extent;
2469 		struct extent_buffer *l;
2470 		int slot;
2471 		u64 num_bytes;
2472 		u64 offset;
2473 		u64 end;
2474 		u64 disk_bytenr;
2475 		u64 extent_offset;
2476 
2477 		l = path->nodes[0];
2478 		slot = path->slots[0];
2479 
2480 		if (slot >= btrfs_header_nritems(l)) {
2481 			ret = btrfs_next_leaf(root, path);
2482 			if (ret < 0)
2483 				goto out_free_list;
2484 			else if (ret > 0)
2485 				break;
2486 			continue;
2487 		}
2488 
2489 		btrfs_item_key_to_cpu(l, &key, slot);
2490 
2491 		if (key.objectid != btrfs_ino(inode))
2492 			break;
2493 		if (key.type != BTRFS_EXTENT_DATA_KEY)
2494 			break;
2495 		if (key.offset >= new->file_pos + new->len)
2496 			break;
2497 
2498 		extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item);
2499 
2500 		num_bytes = btrfs_file_extent_num_bytes(l, extent);
2501 		if (key.offset + num_bytes < new->file_pos)
2502 			goto next;
2503 
2504 		disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent);
2505 		if (!disk_bytenr)
2506 			goto next;
2507 
2508 		extent_offset = btrfs_file_extent_offset(l, extent);
2509 
2510 		old = kmalloc(sizeof(*old), GFP_NOFS);
2511 		if (!old)
2512 			goto out_free_list;
2513 
2514 		offset = max(new->file_pos, key.offset);
2515 		end = min(new->file_pos + new->len, key.offset + num_bytes);
2516 
2517 		old->bytenr = disk_bytenr;
2518 		old->extent_offset = extent_offset;
2519 		old->offset = offset - key.offset;
2520 		old->len = end - offset;
2521 		old->new = new;
2522 		old->count = 0;
2523 		list_add_tail(&old->list, &new->head);
2524 next:
2525 		path->slots[0]++;
2526 		cond_resched();
2527 	}
2528 
2529 	btrfs_free_path(path);
2530 	atomic_inc(&root->fs_info->defrag_running);
2531 
2532 	return new;
2533 
2534 out_free_list:
2535 	list_for_each_entry_safe(old, tmp, &new->head, list) {
2536 		list_del(&old->list);
2537 		kfree(old);
2538 	}
2539 out_free_path:
2540 	btrfs_free_path(path);
2541 out_kfree:
2542 	kfree(new);
2543 	return NULL;
2544 }
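
/*
 * Editor's note: a worked example of the clamping above.  Suppose the
 * new extent covers file range [8K, 24K) and an old file extent item
 * at key.offset = 4K spans 8K, i.e. [4K, 12K).  Then
 *
 *	offset      = max(8K, 4K)   = 8K
 *	end         = min(24K, 12K) = 12K
 *	old->offset = 8K - 4K       = 4K	(offset into the old extent)
 *	old->len    = 12K - 8K      = 4K	(length of the overlap)
 *
 * so only the overlapping 4K of the old extent is recorded.
 */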
2545 
2546 /* as ordered data IO finishes, this gets called so we can finish
2547  * an ordered extent if the range of bytes in the file it covers is
2548  * fully written.
2549  */
2556 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
2557 {
2558 	struct inode *inode = ordered_extent->inode;
2559 	struct btrfs_root *root = BTRFS_I(inode)->root;
2560 	struct btrfs_trans_handle *trans = NULL;
2561 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2562 	struct extent_state *cached_state = NULL;
2563 	struct new_sa_defrag_extent *new = NULL;
2564 	int compress_type = 0;
2565 	int ret = 0;
2566 	u64 logical_len = ordered_extent->len;
2567 	bool nolock;
2568 	bool truncated = false;
2569 
2570 	nolock = btrfs_is_free_space_inode(inode);
2571 
2572 	if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
2573 		ret = -EIO;
2574 		goto out;
2575 	}
2576 
2577 	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
2578 		truncated = true;
2579 		logical_len = ordered_extent->truncated_len;
2580 		/* Truncated the entire extent, don't bother adding */
2581 		if (!logical_len)
2582 			goto out;
2583 	}
2584 
2585 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
2586 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
2587 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2588 		if (nolock)
2589 			trans = btrfs_join_transaction_nolock(root);
2590 		else
2591 			trans = btrfs_join_transaction(root);
2592 		if (IS_ERR(trans)) {
2593 			ret = PTR_ERR(trans);
2594 			trans = NULL;
2595 			goto out;
2596 		}
2597 		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2598 		ret = btrfs_update_inode_fallback(trans, root, inode);
2599 		if (ret) /* -ENOMEM or corruption */
2600 			btrfs_abort_transaction(trans, root, ret);
2601 		goto out;
2602 	}
2603 
2604 	lock_extent_bits(io_tree, ordered_extent->file_offset,
2605 			 ordered_extent->file_offset + ordered_extent->len - 1,
2606 			 0, &cached_state);
2607 
2608 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
2609 			ordered_extent->file_offset + ordered_extent->len - 1,
2610 			EXTENT_DEFRAG, 1, cached_state);
2611 	if (ret) {
2612 		u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item);
2613 		if (last_snapshot >= BTRFS_I(inode)->generation)
2614 			/* the inode is shared */
2615 			new = record_old_file_extents(inode, ordered_extent);
2616 
2617 		clear_extent_bit(io_tree, ordered_extent->file_offset,
2618 			ordered_extent->file_offset + ordered_extent->len - 1,
2619 			EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS);
2620 	}
2621 
2622 	if (nolock)
2623 		trans = btrfs_join_transaction_nolock(root);
2624 	else
2625 		trans = btrfs_join_transaction(root);
2626 	if (IS_ERR(trans)) {
2627 		ret = PTR_ERR(trans);
2628 		trans = NULL;
2629 		goto out_unlock;
2630 	}
2631 	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
2632 
2633 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
2634 		compress_type = ordered_extent->compress_type;
2635 	if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
2636 		BUG_ON(compress_type);
2637 		ret = btrfs_mark_extent_written(trans, inode,
2638 						ordered_extent->file_offset,
2639 						ordered_extent->file_offset +
2640 						logical_len);
2641 	} else {
2642 		BUG_ON(root == root->fs_info->tree_root);
2643 		ret = insert_reserved_file_extent(trans, inode,
2644 						ordered_extent->file_offset,
2645 						ordered_extent->start,
2646 						ordered_extent->disk_len,
2647 						logical_len, logical_len,
2648 						compress_type, 0, 0,
2649 						BTRFS_FILE_EXTENT_REG);
2650 	}
2651 	unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
2652 			   ordered_extent->file_offset, ordered_extent->len,
2653 			   trans->transid);
2654 	if (ret < 0) {
2655 		btrfs_abort_transaction(trans, root, ret);
2656 		goto out_unlock;
2657 	}
2658 
2659 	add_pending_csums(trans, inode, ordered_extent->file_offset,
2660 			  &ordered_extent->list);
2661 
2662 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
2663 	ret = btrfs_update_inode_fallback(trans, root, inode);
2664 	if (ret) { /* -ENOMEM or corruption */
2665 		btrfs_abort_transaction(trans, root, ret);
2666 		goto out_unlock;
2667 	}
2668 	ret = 0;
2669 out_unlock:
2670 	unlock_extent_cached(io_tree, ordered_extent->file_offset,
2671 			     ordered_extent->file_offset +
2672 			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
2673 out:
2674 	if (root != root->fs_info->tree_root)
2675 		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
2676 	if (trans)
2677 		btrfs_end_transaction(trans, root);
2678 
2679 	if (ret || truncated) {
2680 		u64 start, end;
2681 
2682 		if (truncated)
2683 			start = ordered_extent->file_offset + logical_len;
2684 		else
2685 			start = ordered_extent->file_offset;
2686 		end = ordered_extent->file_offset + ordered_extent->len - 1;
2687 		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
2688 
2689 		/* Drop the cache for the part of the extent we didn't write. */
2690 		btrfs_drop_extent_cache(inode, start, end, 0);
2691 
2692 		/*
2693 		 * If the ordered extent had an IOERR or something else went
2694 		 * wrong we need to return the space for this ordered extent
2695 		 * back to the allocator.  We only free the extent in the
2696 		 * truncated case if we didn't write out the extent at all.
2697 		 */
2698 		if ((ret || !logical_len) &&
2699 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
2700 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
2701 			btrfs_free_reserved_extent(root, ordered_extent->start,
2702 						   ordered_extent->disk_len);
2703 	}
2704 
2706 	/*
2707 	 * This needs to be done to make sure anybody waiting knows we are done
2708 	 * updating everything for this ordered extent.
2709 	 */
2710 	btrfs_remove_ordered_extent(inode, ordered_extent);
2711 
2712 	/* for snapshot-aware defrag */
2713 	if (new)
2714 		relink_file_extents(new);
2715 
2716 	/* once for us */
2717 	btrfs_put_ordered_extent(ordered_extent);
2718 	/* once for the tree */
2719 	btrfs_put_ordered_extent(ordered_extent);
2720 
2721 	return ret;
2722 }
2723 
2724 static void finish_ordered_fn(struct btrfs_work *work)
2725 {
2726 	struct btrfs_ordered_extent *ordered_extent;
2727 	ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
2728 	btrfs_finish_ordered_io(ordered_extent);
2729 }
2730 
2731 static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2732 				struct extent_state *state, int uptodate)
2733 {
2734 	struct inode *inode = page->mapping->host;
2735 	struct btrfs_root *root = BTRFS_I(inode)->root;
2736 	struct btrfs_ordered_extent *ordered_extent = NULL;
2737 	struct btrfs_workers *workers;
2738 
2739 	trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2740 
2741 	ClearPagePrivate2(page);
2742 	if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
2743 					    end - start + 1, uptodate))
2744 		return 0;
2745 
2746 	ordered_extent->work.func = finish_ordered_fn;
2747 	ordered_extent->work.flags = 0;
2748 
2749 	if (btrfs_is_free_space_inode(inode))
2750 		workers = &root->fs_info->endio_freespace_worker;
2751 	else
2752 		workers = &root->fs_info->endio_write_workers;
2753 	btrfs_queue_worker(workers, &ordered_extent->work);
2754 
2755 	return 0;
2756 }
2757 
2758 /*
2759  * when reads are done, we need to check csums to verify the data is correct
2760  * if there's a match, we allow the bio to finish.  If not, the code in
2761  * extent_io.c will try to find good copies for us.
2762  */
2763 static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
2764 				      u64 phy_offset, struct page *page,
2765 				      u64 start, u64 end, int mirror)
2766 {
2767 	size_t offset = start - page_offset(page);
2768 	struct inode *inode = page->mapping->host;
2769 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
2770 	char *kaddr;
2771 	struct btrfs_root *root = BTRFS_I(inode)->root;
2772 	u32 csum_expected;
2773 	u32 csum = ~(u32)0;
2774 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
2775 	                              DEFAULT_RATELIMIT_BURST);
2776 
2777 	if (PageChecked(page)) {
2778 		ClearPageChecked(page);
2779 		goto good;
2780 	}
2781 
2782 	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
2783 		goto good;
2784 
2785 	if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
2786 	    test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
2787 		clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM,
2788 				  GFP_NOFS);
2789 		return 0;
2790 	}
2791 
2792 	phy_offset >>= inode->i_sb->s_blocksize_bits;
2793 	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
2794 
2795 	kaddr = kmap_atomic(page);
2796 	csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1);
2797 	btrfs_csum_final(csum, (char *)&csum);
2798 	if (csum != csum_expected)
2799 		goto zeroit;
2800 
2801 	kunmap_atomic(kaddr);
2802 good:
2803 	return 0;
2804 
2805 zeroit:
2806 	if (__ratelimit(&_rs))
2807 		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
2808 			btrfs_ino(page->mapping->host), start, csum, csum_expected);
2809 	memset(kaddr + offset, 1, end - start + 1);
2810 	flush_dcache_page(page);
2811 	kunmap_atomic(kaddr);
2812 	if (csum_expected == 0)
2813 		return 0;
2814 	return -EIO;
2815 }
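
/*
 * Editor's note: a minimal sketch of the csum indexing above, assuming
 * 4-byte crc32c sums and one sum per filesystem block; the helper name
 * is illustrative only.
 */
#if 0
static u32 expected_csum(struct btrfs_io_bio *io_bio, u64 phy_offset,
			 unsigned int blocksize_bits)
{
	/* one u32 sum per block, indexed by block number within the bio */
	return ((u32 *)io_bio->csum)[phy_offset >> blocksize_bits];
}
#endif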
2816 
2817 struct delayed_iput {
2818 	struct list_head list;
2819 	struct inode *inode;
2820 };
2821 
2822 /* JDM: If this is fs-wide, why can't we add a pointer to
2823  * btrfs_inode instead and avoid the allocation? */
2824 void btrfs_add_delayed_iput(struct inode *inode)
2825 {
2826 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2827 	struct delayed_iput *delayed;
2828 
2829 	if (atomic_add_unless(&inode->i_count, -1, 1))
2830 		return;
2831 
2832 	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
2833 	delayed->inode = inode;
2834 
2835 	spin_lock(&fs_info->delayed_iput_lock);
2836 	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
2837 	spin_unlock(&fs_info->delayed_iput_lock);
2838 }
2839 
2840 void btrfs_run_delayed_iputs(struct btrfs_root *root)
2841 {
2842 	LIST_HEAD(list);
2843 	struct btrfs_fs_info *fs_info = root->fs_info;
2844 	struct delayed_iput *delayed;
2845 	int empty;
2846 
2847 	spin_lock(&fs_info->delayed_iput_lock);
2848 	empty = list_empty(&fs_info->delayed_iputs);
2849 	spin_unlock(&fs_info->delayed_iput_lock);
2850 	if (empty)
2851 		return;
2852 
2853 	spin_lock(&fs_info->delayed_iput_lock);
2854 	list_splice_init(&fs_info->delayed_iputs, &list);
2855 	spin_unlock(&fs_info->delayed_iput_lock);
2856 
2857 	while (!list_empty(&list)) {
2858 		delayed = list_entry(list.next, struct delayed_iput, list);
2859 		list_del(&delayed->list);
2860 		iput(delayed->inode);
2861 		kfree(delayed);
2862 	}
2863 }
2864 
2865 /*
2866  * This is called in transaction commit time. If there are no orphan
2867  * files in the subvolume, it removes orphan item and frees block_rsv
2868  * structure.
2869  */
2870 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2871 			      struct btrfs_root *root)
2872 {
2873 	struct btrfs_block_rsv *block_rsv;
2874 	int ret;
2875 
2876 	if (atomic_read(&root->orphan_inodes) ||
2877 	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
2878 		return;
2879 
2880 	spin_lock(&root->orphan_lock);
2881 	if (atomic_read(&root->orphan_inodes)) {
2882 		spin_unlock(&root->orphan_lock);
2883 		return;
2884 	}
2885 
2886 	if (root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE) {
2887 		spin_unlock(&root->orphan_lock);
2888 		return;
2889 	}
2890 
2891 	block_rsv = root->orphan_block_rsv;
2892 	root->orphan_block_rsv = NULL;
2893 	spin_unlock(&root->orphan_lock);
2894 
2895 	if (root->orphan_item_inserted &&
2896 	    btrfs_root_refs(&root->root_item) > 0) {
2897 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
2898 					    root->root_key.objectid);
2899 		if (ret)
2900 			btrfs_abort_transaction(trans, root, ret);
2901 		else
2902 			root->orphan_item_inserted = 0;
2903 	}
2904 
2905 	if (block_rsv) {
2906 		WARN_ON(block_rsv->size > 0);
2907 		btrfs_free_block_rsv(root, block_rsv);
2908 	}
2909 }
2910 
2911 /*
2912  * This creates an orphan entry for the given inode in case something goes
2913  * wrong in the middle of an unlink/truncate.
2914  *
2915  * NOTE: caller of this function should reserve 5 units of metadata for
2916  *	 this function.
2917  */
2918 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2919 {
2920 	struct btrfs_root *root = BTRFS_I(inode)->root;
2921 	struct btrfs_block_rsv *block_rsv = NULL;
2922 	int reserve = 0;
2923 	int insert = 0;
2924 	int ret;
2925 
2926 	if (!root->orphan_block_rsv) {
2927 		block_rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
2928 		if (!block_rsv)
2929 			return -ENOMEM;
2930 	}
2931 
2932 	spin_lock(&root->orphan_lock);
2933 	if (!root->orphan_block_rsv) {
2934 		root->orphan_block_rsv = block_rsv;
2935 	} else if (block_rsv) {
2936 		btrfs_free_block_rsv(root, block_rsv);
2937 		block_rsv = NULL;
2938 	}
2939 
2940 	if (!test_and_set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2941 			      &BTRFS_I(inode)->runtime_flags)) {
2942 #if 0
2943 		/*
2944 		 * For proper ENOSPC handling, we should do orphan
2945 		 * cleanup when mounting. But this introduces backward
2946 		 * compatibility issue.
2947 		 */
2948 		if (!xchg(&root->orphan_item_inserted, 1))
2949 			insert = 2;
2950 		else
2951 			insert = 1;
2952 #endif
2953 		insert = 1;
2954 		atomic_inc(&root->orphan_inodes);
2955 	}
2956 
2957 	if (!test_and_set_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2958 			      &BTRFS_I(inode)->runtime_flags))
2959 		reserve = 1;
2960 	spin_unlock(&root->orphan_lock);
2961 
2962 	/* grab metadata reservation from transaction handle */
2963 	if (reserve) {
2964 		ret = btrfs_orphan_reserve_metadata(trans, inode);
2965 		BUG_ON(ret); /* -ENOSPC in reservation; Logic error? JDM */
2966 	}
2967 
2968 	/* insert an orphan item to track this unlinked/truncated file */
2969 	if (insert >= 1) {
2970 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2971 		if (ret) {
2972 			if (reserve) {
2973 				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
2974 					  &BTRFS_I(inode)->runtime_flags);
2975 				btrfs_orphan_release_metadata(inode);
2976 			}
2977 			if (ret != -EEXIST) {
2978 				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
2979 					  &BTRFS_I(inode)->runtime_flags);
2980 				btrfs_abort_transaction(trans, root, ret);
2981 				return ret;
2982 			}
2983 		}
2984 		ret = 0;
2985 	}
2986 
2987 	/* insert an orphan item to track subvolume contains orphan files */
2988 	if (insert >= 2) {
2989 		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2990 					       root->root_key.objectid);
2991 		if (ret && ret != -EEXIST) {
2992 			btrfs_abort_transaction(trans, root, ret);
2993 			return ret;
2994 		}
2995 	}
2996 	return 0;
2997 }
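
/*
 * Editor's note: the orphan item inserted above keys the inode like
 * this (matching the lookup loop in btrfs_orphan_cleanup() below):
 */
#if 0
key.objectid = BTRFS_ORPHAN_OBJECTID;
key.type = BTRFS_ORPHAN_ITEM_KEY;
key.offset = btrfs_ino(inode);	/* the inode number lives in offset */
#endif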
2998 
2999 /*
3000  * We have done the truncate/delete so we can go ahead and remove the orphan
3001  * item for this particular inode.
3002  */
3003 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3004 			    struct inode *inode)
3005 {
3006 	struct btrfs_root *root = BTRFS_I(inode)->root;
3007 	int delete_item = 0;
3008 	int release_rsv = 0;
3009 	int ret = 0;
3010 
3011 	spin_lock(&root->orphan_lock);
3012 	if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3013 			       &BTRFS_I(inode)->runtime_flags))
3014 		delete_item = 1;
3015 
3016 	if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
3017 			       &BTRFS_I(inode)->runtime_flags))
3018 		release_rsv = 1;
3019 	spin_unlock(&root->orphan_lock);
3020 
3021 	if (trans && delete_item)
3022 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
3023 
3024 	if (release_rsv) {
3025 		btrfs_orphan_release_metadata(inode);
3026 		atomic_dec(&root->orphan_inodes);
3027 	}
3028 
3029 	return ret;
3030 }
3031 
3032 /*
3033  * this cleans up any orphans that may be left on the list from the last use
3034  * of this root.
3035  */
3036 int btrfs_orphan_cleanup(struct btrfs_root *root)
3037 {
3038 	struct btrfs_path *path;
3039 	struct extent_buffer *leaf;
3040 	struct btrfs_key key, found_key;
3041 	struct btrfs_trans_handle *trans;
3042 	struct inode *inode;
3043 	u64 last_objectid = 0;
3044 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
3045 
3046 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
3047 		return 0;
3048 
3049 	path = btrfs_alloc_path();
3050 	if (!path) {
3051 		ret = -ENOMEM;
3052 		goto out;
3053 	}
3054 	path->reada = -1;
3055 
3056 	key.objectid = BTRFS_ORPHAN_OBJECTID;
3057 	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
3058 	key.offset = (u64)-1;
3059 
3060 	while (1) {
3061 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3062 		if (ret < 0)
3063 			goto out;
3064 
3065 		/*
3066 		 * ret == 0 means we found what we were searching for, which
3067 		 * is weird, but possible, so only screw with path if we didn't
3068 		 * find the key and see if we have stuff that matches
3069 		 */
3070 		if (ret > 0) {
3071 			ret = 0;
3072 			if (path->slots[0] == 0)
3073 				break;
3074 			path->slots[0]--;
3075 		}
3076 
3077 		/* pull out the item */
3078 		leaf = path->nodes[0];
3079 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3080 
3081 		/* make sure the item matches what we want */
3082 		if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3083 			break;
3084 		if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY)
3085 			break;
3086 
3087 		/* release the path since we're done with it */
3088 		btrfs_release_path(path);
3089 
3090 		/*
3091 		 * this is basically btrfs_lookup, without the crossing-root
3092 		 * part.  We store the inode number in the offset field of
3093 		 * the orphan item.
3094 		 */
3095 
3096 		if (found_key.offset == last_objectid) {
3097 			btrfs_err(root->fs_info,
3098 				"Error removing orphan entry, stopping orphan cleanup");
3099 			ret = -EINVAL;
3100 			goto out;
3101 		}
3102 
3103 		last_objectid = found_key.offset;
3104 
3105 		found_key.objectid = found_key.offset;
3106 		found_key.type = BTRFS_INODE_ITEM_KEY;
3107 		found_key.offset = 0;
3108 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
3109 		ret = PTR_ERR_OR_ZERO(inode);
3110 		if (ret && ret != -ESTALE)
3111 			goto out;
3112 
3113 		if (ret == -ESTALE && root == root->fs_info->tree_root) {
3114 			struct btrfs_root *dead_root;
3115 			struct btrfs_fs_info *fs_info = root->fs_info;
3116 			int is_dead_root = 0;
3117 
3118 			/*
3119 			 * this is an orphan in the tree root. Currently these
3120 			 * could come from 2 sources:
3121 			 *  a) a snapshot deletion in progress
3122 			 *  b) a free space cache inode
3123 			 * We need to distinguish those two, as the snapshot
3124 			 * orphan must not get deleted.
3125 			 * find_dead_roots already ran before us, so if this
3126 			 * is a snapshot deletion, we should find the root
3127 			 * in the dead_roots list
3128 			 */
3129 			spin_lock(&fs_info->trans_lock);
3130 			list_for_each_entry(dead_root, &fs_info->dead_roots,
3131 					    root_list) {
3132 				if (dead_root->root_key.objectid ==
3133 				    found_key.objectid) {
3134 					is_dead_root = 1;
3135 					break;
3136 				}
3137 			}
3138 			spin_unlock(&fs_info->trans_lock);
3139 			if (is_dead_root) {
3140 				/* prevent this orphan from being found again */
3141 				key.offset = found_key.objectid - 1;
3142 				continue;
3143 			}
3144 		}
3145 		/*
3146 		 * Inode is already gone but the orphan item is still there,
3147 		 * kill the orphan item.
3148 		 */
3149 		if (ret == -ESTALE) {
3150 			trans = btrfs_start_transaction(root, 1);
3151 			if (IS_ERR(trans)) {
3152 				ret = PTR_ERR(trans);
3153 				goto out;
3154 			}
3155 			btrfs_debug(root->fs_info, "auto deleting %Lu",
3156 				found_key.objectid);
3157 			ret = btrfs_del_orphan_item(trans, root,
3158 						    found_key.objectid);
3159 			btrfs_end_transaction(trans, root);
3160 			if (ret)
3161 				goto out;
3162 			continue;
3163 		}
3164 
3165 		/*
3166 		 * add this inode to the orphan list so btrfs_orphan_del does
3167 		 * the proper thing when we hit it
3168 		 */
3169 		set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
3170 			&BTRFS_I(inode)->runtime_flags);
3171 		atomic_inc(&root->orphan_inodes);
3172 
3173 		/* if we have links, this was a truncate, lets do that */
3174 		if (inode->i_nlink) {
3175 			if (!S_ISREG(inode->i_mode)) {
3176 				WARN_ON(1);
3177 				iput(inode);
3178 				continue;
3179 			}
3180 			nr_truncate++;
3181 
3182 			/* 1 for the orphan item deletion. */
3183 			trans = btrfs_start_transaction(root, 1);
3184 			if (IS_ERR(trans)) {
3185 				iput(inode);
3186 				ret = PTR_ERR(trans);
3187 				goto out;
3188 			}
3189 			ret = btrfs_orphan_add(trans, inode);
3190 			btrfs_end_transaction(trans, root);
3191 			if (ret) {
3192 				iput(inode);
3193 				goto out;
3194 			}
3195 
3196 			ret = btrfs_truncate(inode);
3197 			if (ret)
3198 				btrfs_orphan_del(NULL, inode);
3199 		} else {
3200 			nr_unlink++;
3201 		}
3202 
3203 		/* this will do delete_inode and everything for us */
3204 		iput(inode);
3205 		if (ret)
3206 			goto out;
3207 	}
3208 	/* release the path since we're done with it */
3209 	btrfs_release_path(path);
3210 
3211 	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
3212 
3213 	if (root->orphan_block_rsv)
3214 		btrfs_block_rsv_release(root, root->orphan_block_rsv,
3215 					(u64)-1);
3216 
3217 	if (root->orphan_block_rsv || root->orphan_item_inserted) {
3218 		trans = btrfs_join_transaction(root);
3219 		if (!IS_ERR(trans))
3220 			btrfs_end_transaction(trans, root);
3221 	}
3222 
3223 	if (nr_unlink)
3224 		btrfs_debug(root->fs_info, "unlinked %d orphans", nr_unlink);
3225 	if (nr_truncate)
3226 		btrfs_debug(root->fs_info, "truncated %d orphans", nr_truncate);
3227 
3228 out:
3229 	if (ret)
3230 		btrfs_crit(root->fs_info,
3231 			"could not do orphan cleanup %d", ret);
3232 	btrfs_free_path(path);
3233 	return ret;
3234 }
3235 
3236 /*
3237  * very simple check to peek ahead in the leaf looking for xattrs.  If we
3238  * don't find any xattrs, we know there can't be any acls.
3239  *
3240  * slot is the slot the inode is in, objectid is the objectid of the inode
3241  */
3242 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3243 					  int slot, u64 objectid)
3244 {
3245 	u32 nritems = btrfs_header_nritems(leaf);
3246 	struct btrfs_key found_key;
3247 	static u64 xattr_access = 0;
3248 	static u64 xattr_default = 0;
3249 	int scanned = 0;
3250 
3251 	if (!xattr_access) {
3252 		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
3253 					strlen(POSIX_ACL_XATTR_ACCESS));
3254 		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
3255 					strlen(POSIX_ACL_XATTR_DEFAULT));
3256 	}
3257 
3258 	slot++;
3259 	while (slot < nritems) {
3260 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
3261 
3262 		/* we found a different objectid, there must not be acls */
3263 		if (found_key.objectid != objectid)
3264 			return 0;
3265 
3266 		/* we found an xattr, assume we've got an acl */
3267 		if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3268 			if (found_key.offset == xattr_access ||
3269 			    found_key.offset == xattr_default)
3270 				return 1;
3271 		}
3272 
3273 		/*
3274 		 * we found a key greater than an xattr key, there can't
3275 		 * be any acls later on
3276 		 */
3277 		if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3278 			return 0;
3279 
3280 		slot++;
3281 		scanned++;
3282 
3283 		/*
3284 		 * it goes inode, inode backrefs, xattrs, extents,
3285 		 * so if there are a ton of hard links to an inode there can
3286 		 * be a lot of backrefs.  Don't waste time searching too hard,
3287 		 * this is just an optimization
3288 		 */
3289 		if (scanned >= 8)
3290 			break;
3291 	}
3292 	/* we hit the end of the leaf before we found an xattr or
3293 	 * something larger than an xattr.  We have to assume the inode
3294 	 * has acls
3295 	 */
3296 	return 1;
3297 }
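
/*
 * Editor's note: the scan above relies on the key-type ordering of
 * items sharing one objectid within a leaf:
 *
 *	BTRFS_INODE_ITEM_KEY < BTRFS_INODE_REF_KEY <
 *	BTRFS_XATTR_ITEM_KEY < BTRFS_EXTENT_DATA_KEY
 *
 * so the first key type greater than BTRFS_XATTR_ITEM_KEY proves no
 * xattrs, and therefore no ACLs, exist for the inode.
 */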
3298 
3299 /*
3300  * read an inode from the btree into the in-memory inode
3301  */
3302 static void btrfs_read_locked_inode(struct inode *inode)
3303 {
3304 	struct btrfs_path *path;
3305 	struct extent_buffer *leaf;
3306 	struct btrfs_inode_item *inode_item;
3307 	struct btrfs_timespec *tspec;
3308 	struct btrfs_root *root = BTRFS_I(inode)->root;
3309 	struct btrfs_key location;
3310 	int maybe_acls;
3311 	u32 rdev;
3312 	int ret;
3313 	bool filled = false;
3314 
3315 	ret = btrfs_fill_inode(inode, &rdev);
3316 	if (!ret)
3317 		filled = true;
3318 
3319 	path = btrfs_alloc_path();
3320 	if (!path)
3321 		goto make_bad;
3322 
3323 	path->leave_spinning = 1;
3324 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
3325 
3326 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3327 	if (ret)
3328 		goto make_bad;
3329 
3330 	leaf = path->nodes[0];
3331 
3332 	if (filled)
3333 		goto cache_acl;
3334 
3335 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3336 				    struct btrfs_inode_item);
3337 	inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3338 	set_nlink(inode, btrfs_inode_nlink(leaf, inode_item));
3339 	i_uid_write(inode, btrfs_inode_uid(leaf, inode_item));
3340 	i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3341 	btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3342 
3343 	tspec = btrfs_inode_atime(inode_item);
3344 	inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3345 	inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3346 
3347 	tspec = btrfs_inode_mtime(inode_item);
3348 	inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3349 	inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3350 
3351 	tspec = btrfs_inode_ctime(inode_item);
3352 	inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec);
3353 	inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3354 
3355 	inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3356 	BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
3357 	BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item);
3358 
3359 	/*
3360 	 * If we were modified in the current generation and evicted from memory
3361 	 * and then re-read we need to do a full sync since we don't have any
3362 	 * idea about which extents were modified before we were evicted from
3363 	 * cache.
3364 	 */
3365 	if (BTRFS_I(inode)->last_trans == root->fs_info->generation)
3366 		set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
3367 			&BTRFS_I(inode)->runtime_flags);
3368 
3369 	inode->i_version = btrfs_inode_sequence(leaf, inode_item);
3370 	inode->i_generation = BTRFS_I(inode)->generation;
3371 	inode->i_rdev = 0;
3372 	rdev = btrfs_inode_rdev(leaf, inode_item);
3373 
3374 	BTRFS_I(inode)->index_cnt = (u64)-1;
3375 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
3376 cache_acl:
3377 	/*
3378 	 * try to precache a NULL acl entry for files that don't have
3379 	 * any xattrs or acls
3380 	 */
3381 	maybe_acls = acls_after_inode_item(leaf, path->slots[0],
3382 					   btrfs_ino(inode));
3383 	if (!maybe_acls)
3384 		cache_no_acl(inode);
3385 
3386 	btrfs_free_path(path);
3387 
3388 	switch (inode->i_mode & S_IFMT) {
3389 	case S_IFREG:
3390 		inode->i_mapping->a_ops = &btrfs_aops;
3391 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3392 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3393 		inode->i_fop = &btrfs_file_operations;
3394 		inode->i_op = &btrfs_file_inode_operations;
3395 		break;
3396 	case S_IFDIR:
3397 		inode->i_fop = &btrfs_dir_file_operations;
3398 		if (root == root->fs_info->tree_root)
3399 			inode->i_op = &btrfs_dir_ro_inode_operations;
3400 		else
3401 			inode->i_op = &btrfs_dir_inode_operations;
3402 		break;
3403 	case S_IFLNK:
3404 		inode->i_op = &btrfs_symlink_inode_operations;
3405 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
3406 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3407 		break;
3408 	default:
3409 		inode->i_op = &btrfs_special_inode_operations;
3410 		init_special_inode(inode, inode->i_mode, rdev);
3411 		break;
3412 	}
3413 
3414 	btrfs_update_iflags(inode);
3415 	return;
3416 
3417 make_bad:
3418 	btrfs_free_path(path);
3419 	make_bad_inode(inode);
3420 }
3421 
3422 /*
3423  * given a leaf and an inode, copy the inode fields into the leaf
3424  */
3425 static void fill_inode_item(struct btrfs_trans_handle *trans,
3426 			    struct extent_buffer *leaf,
3427 			    struct btrfs_inode_item *item,
3428 			    struct inode *inode)
3429 {
3430 	struct btrfs_map_token token;
3431 
3432 	btrfs_init_map_token(&token);
3433 
3434 	btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token);
3435 	btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token);
3436 	btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size,
3437 				   &token);
3438 	btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3439 	btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3440 
3441 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item),
3442 				     inode->i_atime.tv_sec, &token);
3443 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item),
3444 				      inode->i_atime.tv_nsec, &token);
3445 
3446 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item),
3447 				     inode->i_mtime.tv_sec, &token);
3448 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item),
3449 				      inode->i_mtime.tv_nsec, &token);
3450 
3451 	btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item),
3452 				     inode->i_ctime.tv_sec, &token);
3453 	btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item),
3454 				      inode->i_ctime.tv_nsec, &token);
3455 
3456 	btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3457 				     &token);
3458 	btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
3459 					 &token);
3460 	btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token);
3461 	btrfs_set_token_inode_transid(leaf, item, trans->transid, &token);
3462 	btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token);
3463 	btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token);
3464 	btrfs_set_token_inode_block_group(leaf, item, 0, &token);
3465 }
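
/*
 * Editor's note: the map token lets the btrfs_set_token_* helpers above
 * reuse one cached page mapping of the extent buffer across all of the
 * field updates instead of re-mapping it for every setter.
 */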
3466 
3467 /*
3468  * copy everything in the in-memory inode into the btree.
3469  */
3470 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
3471 				struct btrfs_root *root, struct inode *inode)
3472 {
3473 	struct btrfs_inode_item *inode_item;
3474 	struct btrfs_path *path;
3475 	struct extent_buffer *leaf;
3476 	int ret;
3477 
3478 	path = btrfs_alloc_path();
3479 	if (!path)
3480 		return -ENOMEM;
3481 
3482 	path->leave_spinning = 1;
3483 	ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
3484 				 1);
3485 	if (ret) {
3486 		if (ret > 0)
3487 			ret = -ENOENT;
3488 		goto failed;
3489 	}
3490 
3491 	btrfs_unlock_up_safe(path, 1);
3492 	leaf = path->nodes[0];
3493 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
3494 				    struct btrfs_inode_item);
3495 
3496 	fill_inode_item(trans, leaf, inode_item, inode);
3497 	btrfs_mark_buffer_dirty(leaf);
3498 	btrfs_set_inode_last_trans(trans, inode);
3499 	ret = 0;
3500 failed:
3501 	btrfs_free_path(path);
3502 	return ret;
3503 }
3504 
3505 /*
3506  * copy everything in the in-memory inode into the btree.
3507  */
3508 noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
3509 				struct btrfs_root *root, struct inode *inode)
3510 {
3511 	int ret;
3512 
3513 	/*
3514 	 * If the inode is a free space inode, we can deadlock during commit
3515 	 * if we put it into the delayed code.
3516 	 *
3517 	 * The data relocation inode should also be directly updated
3518 	 * without delay
3519 	 */
3520 	if (!btrfs_is_free_space_inode(inode)
3521 	    && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
3522 		btrfs_update_root_times(trans, root);
3523 
3524 		ret = btrfs_delayed_update_inode(trans, root, inode);
3525 		if (!ret)
3526 			btrfs_set_inode_last_trans(trans, inode);
3527 		return ret;
3528 	}
3529 
3530 	return btrfs_update_inode_item(trans, root, inode);
3531 }
3532 
3533 noinline int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
3534 					 struct btrfs_root *root,
3535 					 struct inode *inode)
3536 {
3537 	int ret;
3538 
3539 	ret = btrfs_update_inode(trans, root, inode);
3540 	if (ret == -ENOSPC)
3541 		return btrfs_update_inode_item(trans, root, inode);
3542 	return ret;
3543 }
3544 
3545 /*
3546  * unlink helper that gets used here in inode.c and in the tree logging
3547  * recovery code.  It removes a link in a directory with a given name, and
3548  * also drops the back refs in the inode to the directory
3549  */
3550 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3551 				struct btrfs_root *root,
3552 				struct inode *dir, struct inode *inode,
3553 				const char *name, int name_len)
3554 {
3555 	struct btrfs_path *path;
3556 	int ret = 0;
3557 	struct extent_buffer *leaf;
3558 	struct btrfs_dir_item *di;
3559 	struct btrfs_key key;
3560 	u64 index;
3561 	u64 ino = btrfs_ino(inode);
3562 	u64 dir_ino = btrfs_ino(dir);
3563 
3564 	path = btrfs_alloc_path();
3565 	if (!path) {
3566 		ret = -ENOMEM;
3567 		goto out;
3568 	}
3569 
3570 	path->leave_spinning = 1;
3571 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3572 				    name, name_len, -1);
3573 	if (IS_ERR(di)) {
3574 		ret = PTR_ERR(di);
3575 		goto err;
3576 	}
3577 	if (!di) {
3578 		ret = -ENOENT;
3579 		goto err;
3580 	}
3581 	leaf = path->nodes[0];
3582 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3583 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3584 	if (ret)
3585 		goto err;
3586 	btrfs_release_path(path);
3587 
3588 	ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
3589 				  dir_ino, &index);
3590 	if (ret) {
3591 		btrfs_info(root->fs_info,
3592 			"failed to delete reference to %.*s, inode %llu parent %llu",
3593 			name_len, name, ino, dir_ino);
3594 		btrfs_abort_transaction(trans, root, ret);
3595 		goto err;
3596 	}
3597 
3598 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3599 	if (ret) {
3600 		btrfs_abort_transaction(trans, root, ret);
3601 		goto err;
3602 	}
3603 
3604 	ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
3605 					 inode, dir_ino);
3606 	if (ret != 0 && ret != -ENOENT) {
3607 		btrfs_abort_transaction(trans, root, ret);
3608 		goto err;
3609 	}
3610 
3611 	ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
3612 					   dir, index);
3613 	if (ret == -ENOENT)
3614 		ret = 0;
3615 	else if (ret)
3616 		btrfs_abort_transaction(trans, root, ret);
3617 err:
3618 	btrfs_free_path(path);
3619 	if (ret)
3620 		goto out;
3621 
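	/*
	 * A directory's i_size is the sum of the name lengths of its entries,
	 * counted twice: once for the dir item and once for the dir index
	 * item, hence the "name_len * 2" below.
	 */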
3622 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3623 	inode_inc_iversion(inode);
3624 	inode_inc_iversion(dir);
3625 	inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3626 	ret = btrfs_update_inode(trans, root, dir);
3627 out:
3628 	return ret;
3629 }
3630 
3631 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
3632 		       struct btrfs_root *root,
3633 		       struct inode *dir, struct inode *inode,
3634 		       const char *name, int name_len)
3635 {
3636 	int ret;
3637 	ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
3638 	if (!ret) {
3639 		btrfs_drop_nlink(inode);
3640 		ret = btrfs_update_inode(trans, root, inode);
3641 	}
3642 	return ret;
3643 }
3644 
3645 /*
3646  * helper to start transaction for unlink and rmdir.
3647  *
3648  * unlink and rmdir are special in btrfs: they do not always free space, so
3649  * if we cannot make our reservations the normal way, try to see whether there
3650  * is enough slack room in the global reserve to migrate; otherwise we cannot
3651  * allow the unlink to occur.
3652  */
3653 static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
3654 {
3655 	struct btrfs_trans_handle *trans;
3656 	struct btrfs_root *root = BTRFS_I(dir)->root;
3657 	int ret;
3658 
3659 	/*
3660 	 * 1 for the possible orphan item
3661 	 * 1 for the dir item
3662 	 * 1 for the dir index
3663 	 * 1 for the inode ref
3664 	 * 1 for the inode
3665 	 */
3666 	trans = btrfs_start_transaction(root, 5);
3667 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
3668 		return trans;
3669 
3670 	if (PTR_ERR(trans) == -ENOSPC) {
3671 		u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3672 
3673 		trans = btrfs_start_transaction(root, 0);
3674 		if (IS_ERR(trans))
3675 			return trans;
3676 		ret = btrfs_cond_migrate_bytes(root->fs_info,
3677 					       &root->fs_info->trans_block_rsv,
3678 					       num_bytes, 5);
3679 		if (ret) {
3680 			btrfs_end_transaction(trans, root);
3681 			return ERR_PTR(ret);
3682 		}
3683 		trans->block_rsv = &root->fs_info->trans_block_rsv;
3684 		trans->bytes_reserved = num_bytes;
3685 	}
3686 	return trans;
3687 }
3688 
3689 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
3690 {
3691 	struct btrfs_root *root = BTRFS_I(dir)->root;
3692 	struct btrfs_trans_handle *trans;
3693 	struct inode *inode = dentry->d_inode;
3694 	int ret;
3695 
3696 	trans = __unlink_start_trans(dir);
3697 	if (IS_ERR(trans))
3698 		return PTR_ERR(trans);
3699 
3700 	btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
3701 
3702 	ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3703 				 dentry->d_name.name, dentry->d_name.len);
3704 	if (ret)
3705 		goto out;
3706 
3707 	if (inode->i_nlink == 0) {
3708 		ret = btrfs_orphan_add(trans, inode);
3709 		if (ret)
3710 			goto out;
3711 	}
3712 
3713 out:
3714 	btrfs_end_transaction(trans, root);
3715 	btrfs_btree_balance_dirty(root);
3716 	return ret;
3717 }
3718 
3719 int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
3720 			struct btrfs_root *root,
3721 			struct inode *dir, u64 objectid,
3722 			const char *name, int name_len)
3723 {
3724 	struct btrfs_path *path;
3725 	struct extent_buffer *leaf;
3726 	struct btrfs_dir_item *di;
3727 	struct btrfs_key key;
3728 	u64 index;
3729 	int ret;
3730 	u64 dir_ino = btrfs_ino(dir);
3731 
3732 	path = btrfs_alloc_path();
3733 	if (!path)
3734 		return -ENOMEM;
3735 
3736 	di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
3737 				   name, name_len, -1);
3738 	if (IS_ERR_OR_NULL(di)) {
3739 		if (!di)
3740 			ret = -ENOENT;
3741 		else
3742 			ret = PTR_ERR(di);
3743 		goto out;
3744 	}
3745 
3746 	leaf = path->nodes[0];
3747 	btrfs_dir_item_key_to_cpu(leaf, di, &key);
3748 	WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
3749 	ret = btrfs_delete_one_dir_name(trans, root, path, di);
3750 	if (ret) {
3751 		btrfs_abort_transaction(trans, root, ret);
3752 		goto out;
3753 	}
3754 	btrfs_release_path(path);
3755 
3756 	ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
3757 				 objectid, root->root_key.objectid,
3758 				 dir_ino, &index, name, name_len);
3759 	if (ret < 0) {
3760 		if (ret != -ENOENT) {
3761 			btrfs_abort_transaction(trans, root, ret);
3762 			goto out;
3763 		}
3764 		di = btrfs_search_dir_index_item(root, path, dir_ino,
3765 						 name, name_len);
3766 		if (IS_ERR_OR_NULL(di)) {
3767 			if (!di)
3768 				ret = -ENOENT;
3769 			else
3770 				ret = PTR_ERR(di);
3771 			btrfs_abort_transaction(trans, root, ret);
3772 			goto out;
3773 		}
3774 
3775 		leaf = path->nodes[0];
3776 		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3777 		btrfs_release_path(path);
3778 		index = key.offset;
3779 	}
3780 	btrfs_release_path(path);
3781 
3782 	ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
3783 	if (ret) {
3784 		btrfs_abort_transaction(trans, root, ret);
3785 		goto out;
3786 	}
3787 
3788 	btrfs_i_size_write(dir, dir->i_size - name_len * 2);
3789 	inode_inc_iversion(dir);
3790 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
3791 	ret = btrfs_update_inode_fallback(trans, root, dir);
3792 	if (ret)
3793 		btrfs_abort_transaction(trans, root, ret);
3794 out:
3795 	btrfs_free_path(path);
3796 	return ret;
3797 }
3798 
3799 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
3800 {
3801 	struct inode *inode = dentry->d_inode;
3802 	int err = 0;
3803 	struct btrfs_root *root = BTRFS_I(dir)->root;
3804 	struct btrfs_trans_handle *trans;
3805 
3806 	if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
3807 		return -ENOTEMPTY;
3808 	if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
3809 		return -EPERM;
3810 
3811 	trans = __unlink_start_trans(dir);
3812 	if (IS_ERR(trans))
3813 		return PTR_ERR(trans);
3814 
3815 	if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
3816 		err = btrfs_unlink_subvol(trans, root, dir,
3817 					  BTRFS_I(inode)->location.objectid,
3818 					  dentry->d_name.name,
3819 					  dentry->d_name.len);
3820 		goto out;
3821 	}
3822 
3823 	err = btrfs_orphan_add(trans, inode);
3824 	if (err)
3825 		goto out;
3826 
3827 	/* now the directory is empty */
3828 	err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
3829 				 dentry->d_name.name, dentry->d_name.len);
3830 	if (!err)
3831 		btrfs_i_size_write(inode, 0);
3832 out:
3833 	btrfs_end_transaction(trans, root);
3834 	btrfs_btree_balance_dirty(root);
3835 
3836 	return err;
3837 }
3838 
3839 /*
3840  * this can truncate away extent items, csum items and directory items.
3841  * It starts at a high offset and removes keys until it can't find
3842  * any higher than new_size
3843  *
3844  * csum items that cross the new i_size are truncated to the new size
3845  * as well.
3846  *
3847  * min_type is the minimum key type to truncate down to.  If set to 0, this
3848  * will kill all the items on this inode, including the INODE_ITEM_KEY.
3849  */
3850 int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3851 			       struct btrfs_root *root,
3852 			       struct inode *inode,
3853 			       u64 new_size, u32 min_type)
3854 {
3855 	struct btrfs_path *path;
3856 	struct extent_buffer *leaf;
3857 	struct btrfs_file_extent_item *fi;
3858 	struct btrfs_key key;
3859 	struct btrfs_key found_key;
3860 	u64 extent_start = 0;
3861 	u64 extent_num_bytes = 0;
3862 	u64 extent_offset = 0;
3863 	u64 item_end = 0;
3864 	u64 last_size = (u64)-1;
3865 	u32 found_type = (u8)-1;
3866 	int found_extent;
3867 	int del_item;
3868 	int pending_del_nr = 0;
3869 	int pending_del_slot = 0;
3870 	int extent_type = -1;
3871 	int ret;
3872 	int err = 0;
3873 	u64 ino = btrfs_ino(inode);
3874 
3875 	BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3876 
3877 	path = btrfs_alloc_path();
3878 	if (!path)
3879 		return -ENOMEM;
3880 	path->reada = -1;
3881 
3882 	/*
3883 	 * We want to drop from the next block forward in case this new size is
3884 	 * not block aligned since we will be keeping the last block of the
3885 	 * extent just the way it is.
3886 	 */
3887 	if (root->ref_cows || root == root->fs_info->tree_root)
3888 		btrfs_drop_extent_cache(inode, ALIGN(new_size,
3889 					root->sectorsize), (u64)-1, 0);
3890 
3891 	/*
3892 	 * This function is also used to drop the items in the log tree before
3893 	 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3894  * it is used to drop the logged items.  So we shouldn't kill the delayed
3895 	 * items.
3896 	 */
3897 	if (min_type == 0 && root == BTRFS_I(inode)->root)
3898 		btrfs_kill_delayed_inode_items(inode);
3899 
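	/*
	 * Start the search at the largest possible key for this inode and
	 * walk the items backwards, deleting until we hit an item below
	 * new_size or run out of keys of at least min_type.
	 */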
3900 	key.objectid = ino;
3901 	key.offset = (u64)-1;
3902 	key.type = (u8)-1;
3903 
3904 search_again:
3905 	path->leave_spinning = 1;
3906 	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3907 	if (ret < 0) {
3908 		err = ret;
3909 		goto out;
3910 	}
3911 
3912 	if (ret > 0) {
3913 		/* there are no items in the tree for us to truncate, we're
3914 		 * done
3915 		 */
3916 		if (path->slots[0] == 0)
3917 			goto out;
3918 		path->slots[0]--;
3919 	}
3920 
3921 	while (1) {
3922 		fi = NULL;
3923 		leaf = path->nodes[0];
3924 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3925 		found_type = btrfs_key_type(&found_key);
3926 
3927 		if (found_key.objectid != ino)
3928 			break;
3929 
3930 		if (found_type < min_type)
3931 			break;
3932 
3933 		item_end = found_key.offset;
3934 		if (found_type == BTRFS_EXTENT_DATA_KEY) {
3935 			fi = btrfs_item_ptr(leaf, path->slots[0],
3936 					    struct btrfs_file_extent_item);
3937 			extent_type = btrfs_file_extent_type(leaf, fi);
3938 			if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3939 				item_end +=
3940 				    btrfs_file_extent_num_bytes(leaf, fi);
3941 			} else {
3942 				item_end += btrfs_file_extent_inline_len(leaf,
3943 									 fi);
3944 			}
3945 			item_end--;
3946 		}
3947 		if (found_type > min_type) {
3948 			del_item = 1;
3949 		} else {
3950 			if (item_end < new_size)
3951 				break;
3952 			if (found_key.offset >= new_size)
3953 				del_item = 1;
3954 			else
3955 				del_item = 0;
3956 		}
3957 		found_extent = 0;
3958 		/* FIXME, shrink the extent if the ref count is only 1 */
3959 		if (found_type != BTRFS_EXTENT_DATA_KEY)
3960 			goto delete;
3961 
3962 		if (del_item)
3963 			last_size = found_key.offset;
3964 		else
3965 			last_size = new_size;
3966 
3967 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
3968 			u64 num_dec;
3969 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
3970 			if (!del_item) {
3971 				u64 orig_num_bytes =
3972 					btrfs_file_extent_num_bytes(leaf, fi);
3973 				extent_num_bytes = ALIGN(new_size -
3974 						found_key.offset,
3975 						root->sectorsize);
3976 				btrfs_set_file_extent_num_bytes(leaf, fi,
3977 							 extent_num_bytes);
3978 				num_dec = (orig_num_bytes -
3979 					   extent_num_bytes);
3980 				if (root->ref_cows && extent_start != 0)
3981 					inode_sub_bytes(inode, num_dec);
3982 				btrfs_mark_buffer_dirty(leaf);
3983 			} else {
3984 				extent_num_bytes =
3985 					btrfs_file_extent_disk_num_bytes(leaf,
3986 									 fi);
3987 				extent_offset = found_key.offset -
3988 					btrfs_file_extent_offset(leaf, fi);
3989 
3990 				/* FIXME blocksize != 4096 */
3991 				num_dec = btrfs_file_extent_num_bytes(leaf, fi);
3992 				if (extent_start != 0) {
3993 					found_extent = 1;
3994 					if (root->ref_cows)
3995 						inode_sub_bytes(inode, num_dec);
3996 				}
3997 			}
3998 		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
3999 			/*
4000 			 * we can't truncate inline items that have had
4001 			 * special encodings
4002 			 */
4003 			if (!del_item &&
4004 			    btrfs_file_extent_compression(leaf, fi) == 0 &&
4005 			    btrfs_file_extent_encryption(leaf, fi) == 0 &&
4006 			    btrfs_file_extent_other_encoding(leaf, fi) == 0) {
4007 				u32 size = new_size - found_key.offset;
4008 
4009 				if (root->ref_cows) {
4010 					inode_sub_bytes(inode, item_end + 1 -
4011 							new_size);
4012 				}
4013 				size =
4014 				    btrfs_file_extent_calc_inline_size(size);
4015 				btrfs_truncate_item(root, path, size, 1);
4016 			} else if (root->ref_cows) {
4017 				inode_sub_bytes(inode, item_end + 1 -
4018 						found_key.offset);
4019 			}
4020 		}
4021 delete:
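		/*
		 * Deletions are batched: runs of adjacent slots accumulate in
		 * pending_del_slot/pending_del_nr and get removed with a
		 * single btrfs_del_items() call.
		 */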
4022 		if (del_item) {
4023 			if (!pending_del_nr) {
4024 				/* no pending yet, add ourselves */
4025 				pending_del_slot = path->slots[0];
4026 				pending_del_nr = 1;
4027 			} else if (pending_del_nr &&
4028 				   path->slots[0] + 1 == pending_del_slot) {
4029 				/* hop on the pending chunk */
4030 				pending_del_nr++;
4031 				pending_del_slot = path->slots[0];
4032 			} else {
4033 				BUG();
4034 			}
4035 		} else {
4036 			break;
4037 		}
4038 		if (found_extent && (root->ref_cows ||
4039 				     root == root->fs_info->tree_root)) {
4040 			btrfs_set_path_blocking(path);
4041 			ret = btrfs_free_extent(trans, root, extent_start,
4042 						extent_num_bytes, 0,
4043 						btrfs_header_owner(leaf),
4044 						ino, extent_offset, 0);
4045 			BUG_ON(ret);
4046 		}
4047 
4048 		if (found_type == BTRFS_INODE_ITEM_KEY)
4049 			break;
4050 
4051 		if (path->slots[0] == 0 ||
4052 		    path->slots[0] != pending_del_slot) {
4053 			if (pending_del_nr) {
4054 				ret = btrfs_del_items(trans, root, path,
4055 						pending_del_slot,
4056 						pending_del_nr);
4057 				if (ret) {
4058 					btrfs_abort_transaction(trans,
4059 								root, ret);
4060 					goto error;
4061 				}
4062 				pending_del_nr = 0;
4063 			}
4064 			btrfs_release_path(path);
4065 			goto search_again;
4066 		} else {
4067 			path->slots[0]--;
4068 		}
4069 	}
4070 out:
4071 	if (pending_del_nr) {
4072 		ret = btrfs_del_items(trans, root, path, pending_del_slot,
4073 				      pending_del_nr);
4074 		if (ret)
4075 			btrfs_abort_transaction(trans, root, ret);
4076 	}
4077 error:
4078 	if (last_size != (u64)-1)
4079 		btrfs_ordered_update_i_size(inode, last_size, NULL);
4080 	btrfs_free_path(path);
4081 	return err;
4082 }
4083 
4084 /*
4085  * btrfs_truncate_page - read, zero a chunk and write a page
4086  * @inode - inode that we're zeroing
4087  * @from - the offset to start zeroing
4088  * @len - the length to zero, 0 to zero the entire range relative to the
4089  *	offset
4090  * @front - zero up to the offset instead of from the offset on
4091  *
4092  * This will find the page for the "from" offset, cow it and zero the
4093  * part we want to zero.  This is used with truncate and hole punching.
4094  */
4095 int btrfs_truncate_page(struct inode *inode, loff_t from, loff_t len,
4096 			int front)
4097 {
4098 	struct address_space *mapping = inode->i_mapping;
4099 	struct btrfs_root *root = BTRFS_I(inode)->root;
4100 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4101 	struct btrfs_ordered_extent *ordered;
4102 	struct extent_state *cached_state = NULL;
4103 	char *kaddr;
4104 	u32 blocksize = root->sectorsize;
4105 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
4106 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
4107 	struct page *page;
4108 	gfp_t mask = btrfs_alloc_write_mask(mapping);
4109 	int ret = 0;
4110 	u64 page_start;
4111 	u64 page_end;
4112 
4113 	if ((offset & (blocksize - 1)) == 0 &&
4114 	    (!len || ((len & (blocksize - 1)) == 0)))
4115 		goto out;
4116 	ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
4117 	if (ret)
4118 		goto out;
4119 
4120 again:
4121 	page = find_or_create_page(mapping, index, mask);
4122 	if (!page) {
4123 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4124 		ret = -ENOMEM;
4125 		goto out;
4126 	}
4127 
4128 	page_start = page_offset(page);
4129 	page_end = page_start + PAGE_CACHE_SIZE - 1;
4130 
4131 	if (!PageUptodate(page)) {
4132 		ret = btrfs_readpage(NULL, page);
4133 		lock_page(page);
4134 		if (page->mapping != mapping) {
4135 			unlock_page(page);
4136 			page_cache_release(page);
4137 			goto again;
4138 		}
4139 		if (!PageUptodate(page)) {
4140 			ret = -EIO;
4141 			goto out_unlock;
4142 		}
4143 	}
4144 	wait_on_page_writeback(page);
4145 
4146 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
4147 	set_page_extent_mapped(page);
4148 
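	/*
	 * If an ordered extent is still outstanding against this page, drop
	 * our locks, wait for it to complete and retry from scratch.
	 */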
4149 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
4150 	if (ordered) {
4151 		unlock_extent_cached(io_tree, page_start, page_end,
4152 				     &cached_state, GFP_NOFS);
4153 		unlock_page(page);
4154 		page_cache_release(page);
4155 		btrfs_start_ordered_extent(inode, ordered, 1);
4156 		btrfs_put_ordered_extent(ordered);
4157 		goto again;
4158 	}
4159 
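	/*
	 * Clear any stale delalloc/accounting bits on this range so the
	 * btrfs_set_extent_delalloc() call below doesn't double count the
	 * reservation.
	 */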
4160 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
4161 			  EXTENT_DIRTY | EXTENT_DELALLOC |
4162 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4163 			  0, 0, &cached_state, GFP_NOFS);
4164 
4165 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
4166 					&cached_state);
4167 	if (ret) {
4168 		unlock_extent_cached(io_tree, page_start, page_end,
4169 				     &cached_state, GFP_NOFS);
4170 		goto out_unlock;
4171 	}
4172 
4173 	if (offset != PAGE_CACHE_SIZE) {
4174 		if (!len)
4175 			len = PAGE_CACHE_SIZE - offset;
4176 		kaddr = kmap(page);
4177 		if (front)
4178 			memset(kaddr, 0, offset);
4179 		else
4180 			memset(kaddr + offset, 0, len);
4181 		flush_dcache_page(page);
4182 		kunmap(page);
4183 	}
4184 	ClearPageChecked(page);
4185 	set_page_dirty(page);
4186 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
4187 			     GFP_NOFS);
4188 
4189 out_unlock:
4190 	if (ret)
4191 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
4192 	unlock_page(page);
4193 	page_cache_release(page);
4194 out:
4195 	return ret;
4196 }
4197 
4198 /*
4199  * This function puts in dummy file extents for the area we're creating a hole
4200  * for.  So if we are truncating this file to a larger size we need to insert
4201  * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE
4202  * for the range between oldsize and size.
4203  */
4204 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
4205 {
4206 	struct btrfs_trans_handle *trans;
4207 	struct btrfs_root *root = BTRFS_I(inode)->root;
4208 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4209 	struct extent_map *em = NULL;
4210 	struct extent_state *cached_state = NULL;
4211 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4212 	u64 hole_start = ALIGN(oldsize, root->sectorsize);
4213 	u64 block_end = ALIGN(size, root->sectorsize);
4214 	u64 last_byte;
4215 	u64 cur_offset;
4216 	u64 hole_size;
4217 	int err = 0;
4218 
4219 	/*
4220 	 * If our size started in the middle of a page we need to zero out the
4221 	 * rest of the page before we expand the i_size, otherwise we could
4222 	 * rest of the page before we expand the i_size; otherwise we could
4223 	 */
4224 	err = btrfs_truncate_page(inode, oldsize, 0, 0);
4225 	if (err)
4226 		return err;
4227 
4228 	if (size <= hole_start)
4229 		return 0;
4230 
4231 	while (1) {
4232 		struct btrfs_ordered_extent *ordered;
4233 		btrfs_wait_ordered_range(inode, hole_start,
4234 					 block_end - hole_start);
4235 		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
4236 				 &cached_state);
4237 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
4238 		if (!ordered)
4239 			break;
4240 		unlock_extent_cached(io_tree, hole_start, block_end - 1,
4241 				     &cached_state, GFP_NOFS);
4242 		btrfs_put_ordered_extent(ordered);
4243 	}
4244 
4245 	cur_offset = hole_start;
4246 	while (1) {
4247 		em = btrfs_get_extent(inode, NULL, 0, cur_offset,
4248 				block_end - cur_offset, 0);
4249 		if (IS_ERR(em)) {
4250 			err = PTR_ERR(em);
4251 			em = NULL;
4252 			break;
4253 		}
4254 		last_byte = min(extent_map_end(em), block_end);
4255 		last_byte = ALIGN(last_byte, root->sectorsize);
4256 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
4257 			struct extent_map *hole_em;
4258 			hole_size = last_byte - cur_offset;
4259 
4260 			trans = btrfs_start_transaction(root, 3);
4261 			if (IS_ERR(trans)) {
4262 				err = PTR_ERR(trans);
4263 				break;
4264 			}
4265 
4266 			err = btrfs_drop_extents(trans, root, inode,
4267 						 cur_offset,
4268 						 cur_offset + hole_size, 1);
4269 			if (err) {
4270 				btrfs_abort_transaction(trans, root, err);
4271 				btrfs_end_transaction(trans, root);
4272 				break;
4273 			}
4274 
4275 			err = btrfs_insert_file_extent(trans, root,
4276 					btrfs_ino(inode), cur_offset, 0,
4277 					0, hole_size, 0, hole_size,
4278 					0, 0, 0);
4279 			if (err) {
4280 				btrfs_abort_transaction(trans, root, err);
4281 				btrfs_end_transaction(trans, root);
4282 				break;
4283 			}
4284 
4285 			btrfs_drop_extent_cache(inode, cur_offset,
4286 						cur_offset + hole_size - 1, 0);
4287 			hole_em = alloc_extent_map();
4288 			if (!hole_em) {
4289 				set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4290 					&BTRFS_I(inode)->runtime_flags);
4291 				goto next;
4292 			}
4293 			hole_em->start = cur_offset;
4294 			hole_em->len = hole_size;
4295 			hole_em->orig_start = cur_offset;
4296 
4297 			hole_em->block_start = EXTENT_MAP_HOLE;
4298 			hole_em->block_len = 0;
4299 			hole_em->orig_block_len = 0;
4300 			hole_em->ram_bytes = hole_size;
4301 			hole_em->bdev = root->fs_info->fs_devices->latest_bdev;
4302 			hole_em->compress_type = BTRFS_COMPRESS_NONE;
4303 			hole_em->generation = trans->transid;
4304 
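			/*
			 * add_extent_mapping() returns -EEXIST if a racing
			 * extent map covers this range; drop the cached
			 * range and retry until the hole mapping goes in.
			 */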
4305 			while (1) {
4306 				write_lock(&em_tree->lock);
4307 				err = add_extent_mapping(em_tree, hole_em, 1);
4308 				write_unlock(&em_tree->lock);
4309 				if (err != -EEXIST)
4310 					break;
4311 				btrfs_drop_extent_cache(inode, cur_offset,
4312 							cur_offset +
4313 							hole_size - 1, 0);
4314 			}
4315 			free_extent_map(hole_em);
4316 next:
4317 			btrfs_update_inode(trans, root, inode);
4318 			btrfs_end_transaction(trans, root);
4319 		}
4320 		free_extent_map(em);
4321 		em = NULL;
4322 		cur_offset = last_byte;
4323 		if (cur_offset >= block_end)
4324 			break;
4325 	}
4326 
4327 	free_extent_map(em);
4328 	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
4329 			     GFP_NOFS);
4330 	return err;
4331 }
4332 
4333 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
4334 {
4335 	struct btrfs_root *root = BTRFS_I(inode)->root;
4336 	struct btrfs_trans_handle *trans;
4337 	loff_t oldsize = i_size_read(inode);
4338 	loff_t newsize = attr->ia_size;
4339 	int mask = attr->ia_valid;
4340 	int ret;
4341 
4342 	/*
4343 	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
4344 	 * special case where we need to update the times despite not having
4345 	 * these flags set.  For all other operations the VFS sets these flags
4346 	 * explicitly if it wants a timestamp update.
4347 	 */
4348 	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
4349 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
4350 
4351 	if (newsize > oldsize) {
4352 		truncate_pagecache(inode, newsize);
4353 		ret = btrfs_cont_expand(inode, oldsize, newsize);
4354 		if (ret)
4355 			return ret;
4356 
4357 		trans = btrfs_start_transaction(root, 1);
4358 		if (IS_ERR(trans))
4359 			return PTR_ERR(trans);
4360 
4361 		i_size_write(inode, newsize);
4362 		btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
4363 		ret = btrfs_update_inode(trans, root, inode);
4364 		btrfs_end_transaction(trans, root);
4365 	} else {
4366 
4367 		/*
4368 		 * We're truncating a file that used to have good data down to
4369 		 * zero. Make sure it gets into the ordered flush list so that
4370 		 * any new writes get down to disk quickly.
4371 		 */
4372 		if (newsize == 0)
4373 			set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
4374 				&BTRFS_I(inode)->runtime_flags);
4375 
4376 		/*
4377 		 * 1 for the orphan item we're going to add
4378 		 * 1 for the orphan item deletion.
4379 		 */
4380 		trans = btrfs_start_transaction(root, 2);
4381 		if (IS_ERR(trans))
4382 			return PTR_ERR(trans);
4383 
4384 		/*
4385 		 * We need to do this in case we fail at _any_ point during the
4386 		 * actual truncate.  Once we do the truncate_setsize we could
4387 		 * invalidate pages which forces any outstanding ordered io to
4388 		 * be instantly completed which will give us extents that need
4389 		 * to be truncated.  If we fail to get an orphan inode down we
4390 		 * could have left over extents that were never meant to live,
4391 		 * so we need to guarantee from this point on that everything
4392 		 * will be consistent.
4393 		 */
4394 		ret = btrfs_orphan_add(trans, inode);
4395 		btrfs_end_transaction(trans, root);
4396 		if (ret)
4397 			return ret;
4398 
4399 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
4400 		truncate_setsize(inode, newsize);
4401 
4402 		/* Disable nonlocked read DIO to avoid the endless truncate */
4403 		btrfs_inode_block_unlocked_dio(inode);
4404 		inode_dio_wait(inode);
4405 		btrfs_inode_resume_unlocked_dio(inode);
4406 
4407 		ret = btrfs_truncate(inode);
4408 		if (ret && inode->i_nlink) {
4409 			int err;
4410 
4411 			/*
4412 			 * failed to truncate; disk_i_size is only adjusted down
4413 			 * as we remove extents, so it should represent the true
4414 			 * size of the inode.  Reset the in-memory size and
4415 			 * delete our orphan entry.
4416 			 */
4417 			trans = btrfs_join_transaction(root);
4418 			if (IS_ERR(trans)) {
4419 				btrfs_orphan_del(NULL, inode);
4420 				return ret;
4421 			}
4422 			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
4423 			err = btrfs_orphan_del(trans, inode);
4424 			if (err)
4425 				btrfs_abort_transaction(trans, root, err);
4426 			btrfs_end_transaction(trans, root);
4427 		}
4428 	}
4429 
4430 	return ret;
4431 }
4432 
4433 static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
4434 {
4435 	struct inode *inode = dentry->d_inode;
4436 	struct btrfs_root *root = BTRFS_I(inode)->root;
4437 	int err;
4438 
4439 	if (btrfs_root_readonly(root))
4440 		return -EROFS;
4441 
4442 	err = inode_change_ok(inode, attr);
4443 	if (err)
4444 		return err;
4445 
4446 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
4447 		err = btrfs_setsize(inode, attr);
4448 		if (err)
4449 			return err;
4450 	}
4451 
4452 	if (attr->ia_valid) {
4453 		setattr_copy(inode, attr);
4454 		inode_inc_iversion(inode);
4455 		err = btrfs_dirty_inode(inode);
4456 
4457 		if (!err && attr->ia_valid & ATTR_MODE)
4458 			err = btrfs_acl_chmod(inode);
4459 	}
4460 
4461 	return err;
4462 }
4463 
4464 void btrfs_evict_inode(struct inode *inode)
4465 {
4466 	struct btrfs_trans_handle *trans;
4467 	struct btrfs_root *root = BTRFS_I(inode)->root;
4468 	struct btrfs_block_rsv *rsv, *global_rsv;
4469 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
4470 	int ret;
4471 
4472 	trace_btrfs_inode_evict(inode);
4473 
4474 	truncate_inode_pages(&inode->i_data, 0);
4475 	if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
4476 			       btrfs_is_free_space_inode(inode)))
4477 		goto no_delete;
4478 
4479 	if (is_bad_inode(inode)) {
4480 		btrfs_orphan_del(NULL, inode);
4481 		goto no_delete;
4482 	}
4483 	/* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
4484 	btrfs_wait_ordered_range(inode, 0, (u64)-1);
4485 
4486 	if (root->fs_info->log_root_recovering) {
4487 		BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
4488 				 &BTRFS_I(inode)->runtime_flags));
4489 		goto no_delete;
4490 	}
4491 
4492 	if (inode->i_nlink > 0) {
4493 		BUG_ON(btrfs_root_refs(&root->root_item) != 0);
4494 		goto no_delete;
4495 	}
4496 
4497 	ret = btrfs_commit_inode_delayed_inode(inode);
4498 	if (ret) {
4499 		btrfs_orphan_del(NULL, inode);
4500 		goto no_delete;
4501 	}
4502 
4503 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
4504 	if (!rsv) {
4505 		btrfs_orphan_del(NULL, inode);
4506 		goto no_delete;
4507 	}
4508 	rsv->size = min_size;
4509 	rsv->failfast = 1;
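	/*
	 * With ->failfast set, reservations from this rsv return -ENOSPC
	 * right away instead of flushing, which lets the truncate loop
	 * below refill and retry in min_size steps.
	 */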
4510 	global_rsv = &root->fs_info->global_block_rsv;
4511 
4512 	btrfs_i_size_write(inode, 0);
4513 
4514 	/*
4515 	 * This is a bit simpler than btrfs_truncate since we've already
4516 	 * reserved our space for our orphan item in the unlink, so we just
4517 	 * need to reserve some slack space in case we add bytes and update
4518 	 * inode item when doing the truncate.
4519 	 */
4520 	while (1) {
4521 		ret = btrfs_block_rsv_refill(root, rsv, min_size,
4522 					     BTRFS_RESERVE_FLUSH_LIMIT);
4523 
4524 		/*
4525 		 * Try to steal from the global reserve.  Since we will
4526 		 * likely not use this space anyway, we want to try as
4527 		 * hard as possible to get this to work.
4528 		 */
4529 		if (ret)
4530 			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
4531 
4532 		if (ret) {
4533 			btrfs_warn(root->fs_info,
4534 				"Could not get space for a delete, will truncate on mount %d",
4535 				ret);
4536 			btrfs_orphan_del(NULL, inode);
4537 			btrfs_free_block_rsv(root, rsv);
4538 			goto no_delete;
4539 		}
4540 
4541 		trans = btrfs_join_transaction(root);
4542 		if (IS_ERR(trans)) {
4543 			btrfs_orphan_del(NULL, inode);
4544 			btrfs_free_block_rsv(root, rsv);
4545 			goto no_delete;
4546 		}
4547 
4548 		trans->block_rsv = rsv;
4549 
4550 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
4551 		if (ret != -ENOSPC)
4552 			break;
4553 
4554 		trans->block_rsv = &root->fs_info->trans_block_rsv;
4555 		btrfs_end_transaction(trans, root);
4556 		trans = NULL;
4557 		btrfs_btree_balance_dirty(root);
4558 	}
4559 
4560 	btrfs_free_block_rsv(root, rsv);
4561 
4562 	/*
4563 	 * Errors here aren't a big deal; they just mean we leave orphan items
4564 	 * in the tree.  They will be cleaned up on the next mount.
4565 	 */
4566 	if (ret == 0) {
4567 		trans->block_rsv = root->orphan_block_rsv;
4568 		btrfs_orphan_del(trans, inode);
4569 	} else {
4570 		btrfs_orphan_del(NULL, inode);
4571 	}
4572 
4573 	trans->block_rsv = &root->fs_info->trans_block_rsv;
4574 	if (!(root == root->fs_info->tree_root ||
4575 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
4576 		btrfs_return_ino(root, btrfs_ino(inode));
4577 
4578 	btrfs_end_transaction(trans, root);
4579 	btrfs_btree_balance_dirty(root);
4580 no_delete:
4581 	btrfs_remove_delayed_node(inode);
4582 	clear_inode(inode);
4583 	return;
4584 }
4585 
4586 /*
4587  * this returns the key found in the dir entry in the location pointer.
4588  * If no dir entries were found, location->objectid is 0.
4589  */
4590 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
4591 			       struct btrfs_key *location)
4592 {
4593 	const char *name = dentry->d_name.name;
4594 	int namelen = dentry->d_name.len;
4595 	struct btrfs_dir_item *di;
4596 	struct btrfs_path *path;
4597 	struct btrfs_root *root = BTRFS_I(dir)->root;
4598 	int ret = 0;
4599 
4600 	path = btrfs_alloc_path();
4601 	if (!path)
4602 		return -ENOMEM;
4603 
4604 	di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
4605 				    namelen, 0);
4606 	if (IS_ERR(di))
4607 		ret = PTR_ERR(di);
4608 
4609 	if (IS_ERR_OR_NULL(di))
4610 		goto out_err;
4611 
4612 	btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
4613 out:
4614 	btrfs_free_path(path);
4615 	return ret;
4616 out_err:
4617 	location->objectid = 0;
4618 	goto out;
4619 }
4620 
4621 /*
4622  * when we hit a tree root in a directory, the btrfs part of the inode
4623  * needs to be changed to reflect the root directory of the tree root.  This
4624  * is kind of like crossing a mount point.
4625  */
4626 static int fixup_tree_root_location(struct btrfs_root *root,
4627 				    struct inode *dir,
4628 				    struct dentry *dentry,
4629 				    struct btrfs_key *location,
4630 				    struct btrfs_root **sub_root)
4631 {
4632 	struct btrfs_path *path;
4633 	struct btrfs_root *new_root;
4634 	struct btrfs_root_ref *ref;
4635 	struct extent_buffer *leaf;
4636 	int ret;
4637 	int err = 0;
4638 
4639 	path = btrfs_alloc_path();
4640 	if (!path) {
4641 		err = -ENOMEM;
4642 		goto out;
4643 	}
4644 
4645 	err = -ENOENT;
4646 	ret = btrfs_find_root_ref(root->fs_info->tree_root, path,
4647 				  BTRFS_I(dir)->root->root_key.objectid,
4648 				  location->objectid);
4649 	if (ret) {
4650 		if (ret < 0)
4651 			err = ret;
4652 		goto out;
4653 	}
4654 
4655 	leaf = path->nodes[0];
4656 	ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
4657 	if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
4658 	    btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
4659 		goto out;
4660 
4661 	ret = memcmp_extent_buffer(leaf, dentry->d_name.name,
4662 				   (unsigned long)(ref + 1),
4663 				   dentry->d_name.len);
4664 	if (ret)
4665 		goto out;
4666 
4667 	btrfs_release_path(path);
4668 
4669 	new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
4670 	if (IS_ERR(new_root)) {
4671 		err = PTR_ERR(new_root);
4672 		goto out;
4673 	}
4674 
4675 	*sub_root = new_root;
4676 	location->objectid = btrfs_root_dirid(&new_root->root_item);
4677 	location->type = BTRFS_INODE_ITEM_KEY;
4678 	location->offset = 0;
4679 	err = 0;
4680 out:
4681 	btrfs_free_path(path);
4682 	return err;
4683 }
4684 
4685 static void inode_tree_add(struct inode *inode)
4686 {
4687 	struct btrfs_root *root = BTRFS_I(inode)->root;
4688 	struct btrfs_inode *entry;
4689 	struct rb_node **p;
4690 	struct rb_node *parent;
4691 	struct rb_node *new = &BTRFS_I(inode)->rb_node;
4692 	u64 ino = btrfs_ino(inode);
4693 
4694 	if (inode_unhashed(inode))
4695 		return;
4696 	parent = NULL;
4697 	spin_lock(&root->inode_lock);
4698 	p = &root->inode_tree.rb_node;
4699 	while (*p) {
4700 		parent = *p;
4701 		entry = rb_entry(parent, struct btrfs_inode, rb_node);
4702 
4703 		if (ino < btrfs_ino(&entry->vfs_inode))
4704 			p = &parent->rb_left;
4705 		else if (ino > btrfs_ino(&entry->vfs_inode))
4706 			p = &parent->rb_right;
4707 		else {
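			/*
			 * An inode with this number is already in the tree;
			 * it must be on its way out of the icache, so take
			 * over its slot in the rb tree.
			 */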
4708 			WARN_ON(!(entry->vfs_inode.i_state &
4709 				  (I_WILL_FREE | I_FREEING)));
4710 			rb_replace_node(parent, new, &root->inode_tree);
4711 			RB_CLEAR_NODE(parent);
4712 			spin_unlock(&root->inode_lock);
4713 			return;
4714 		}
4715 	}
4716 	rb_link_node(new, parent, p);
4717 	rb_insert_color(new, &root->inode_tree);
4718 	spin_unlock(&root->inode_lock);
4719 }
4720 
4721 static void inode_tree_del(struct inode *inode)
4722 {
4723 	struct btrfs_root *root = BTRFS_I(inode)->root;
4724 	int empty = 0;
4725 
4726 	spin_lock(&root->inode_lock);
4727 	if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) {
4728 		rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree);
4729 		RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
4730 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4731 	}
4732 	spin_unlock(&root->inode_lock);
4733 
4734 	/*
4735 	 * The free space cache keeps inodes in the tree root, but the tree root
4736 	 * has a root_refs of 0, so this could end up dropping the tree root as a
4737 	 * snapshot.  We need the extra root != root->fs_info->tree_root check to
4738 	 * make sure we don't drop it.
4739 	 */
4740 	if (empty && btrfs_root_refs(&root->root_item) == 0 &&
4741 	    root != root->fs_info->tree_root) {
4742 		synchronize_srcu(&root->fs_info->subvol_srcu);
4743 		spin_lock(&root->inode_lock);
4744 		empty = RB_EMPTY_ROOT(&root->inode_tree);
4745 		spin_unlock(&root->inode_lock);
4746 		if (empty)
4747 			btrfs_add_dead_root(root);
4748 	}
4749 }
4750 
4751 void btrfs_invalidate_inodes(struct btrfs_root *root)
4752 {
4753 	struct rb_node *node;
4754 	struct rb_node *prev;
4755 	struct btrfs_inode *entry;
4756 	struct inode *inode;
4757 	u64 objectid = 0;
4758 
4759 	WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4760 
4761 	spin_lock(&root->inode_lock);
4762 again:
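	/*
	 * Find the inode with the smallest number >= objectid: descend the
	 * rb tree first, and if there is no exact match walk forward from
	 * the last node visited to its in-order successor.
	 */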
4763 	node = root->inode_tree.rb_node;
4764 	prev = NULL;
4765 	while (node) {
4766 		prev = node;
4767 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4768 
4769 		if (objectid < btrfs_ino(&entry->vfs_inode))
4770 			node = node->rb_left;
4771 		else if (objectid > btrfs_ino(&entry->vfs_inode))
4772 			node = node->rb_right;
4773 		else
4774 			break;
4775 	}
4776 	if (!node) {
4777 		while (prev) {
4778 			entry = rb_entry(prev, struct btrfs_inode, rb_node);
4779 			if (objectid <= btrfs_ino(&entry->vfs_inode)) {
4780 				node = prev;
4781 				break;
4782 			}
4783 			prev = rb_next(prev);
4784 		}
4785 	}
4786 	while (node) {
4787 		entry = rb_entry(node, struct btrfs_inode, rb_node);
4788 		objectid = btrfs_ino(&entry->vfs_inode) + 1;
4789 		inode = igrab(&entry->vfs_inode);
4790 		if (inode) {
4791 			spin_unlock(&root->inode_lock);
4792 			if (atomic_read(&inode->i_count) > 1)
4793 				d_prune_aliases(inode);
4794 			/*
4795 			 * btrfs_drop_inode will have it removed from
4796 			 * the inode cache when its usage count
4797 			 * hits zero.
4798 			 */
4799 			iput(inode);
4800 			cond_resched();
4801 			spin_lock(&root->inode_lock);
4802 			goto again;
4803 		}
4804 
4805 		if (cond_resched_lock(&root->inode_lock))
4806 			goto again;
4807 
4808 		node = rb_next(node);
4809 	}
4810 	spin_unlock(&root->inode_lock);
4811 }
4812 
4813 static int btrfs_init_locked_inode(struct inode *inode, void *p)
4814 {
4815 	struct btrfs_iget_args *args = p;
4816 	inode->i_ino = args->ino;
4817 	BTRFS_I(inode)->root = args->root;
4818 	return 0;
4819 }
4820 
4821 static int btrfs_find_actor(struct inode *inode, void *opaque)
4822 {
4823 	struct btrfs_iget_args *args = opaque;
4824 	return args->ino == btrfs_ino(inode) &&
4825 		args->root == BTRFS_I(inode)->root;
4826 }
4827 
4828 static struct inode *btrfs_iget_locked(struct super_block *s,
4829 				       u64 objectid,
4830 				       struct btrfs_root *root)
4831 {
4832 	struct inode *inode;
4833 	struct btrfs_iget_args args;
4834 	args.ino = objectid;
4835 	args.root = root;
4836 
4837 	inode = iget5_locked(s, objectid, btrfs_find_actor,
4838 			     btrfs_init_locked_inode,
4839 			     (void *)&args);
4840 	return inode;
4841 }
4842 
4843 /* Get an inode object given its location and corresponding root.
4844  * Returns in *new whether the inode was read from disk
4845  */
4846 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4847 			 struct btrfs_root *root, int *new)
4848 {
4849 	struct inode *inode;
4850 
4851 	inode = btrfs_iget_locked(s, location->objectid, root);
4852 	if (!inode)
4853 		return ERR_PTR(-ENOMEM);
4854 
4855 	if (inode->i_state & I_NEW) {
4856 		BTRFS_I(inode)->root = root;
4857 		memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4858 		btrfs_read_locked_inode(inode);
4859 		if (!is_bad_inode(inode)) {
4860 			inode_tree_add(inode);
4861 			unlock_new_inode(inode);
4862 			if (new)
4863 				*new = 1;
4864 		} else {
4865 			unlock_new_inode(inode);
4866 			iput(inode);
4867 			inode = ERR_PTR(-ESTALE);
4868 		}
4869 	}
4870 
4871 	return inode;
4872 }
4873 
4874 static struct inode *new_simple_dir(struct super_block *s,
4875 				    struct btrfs_key *key,
4876 				    struct btrfs_root *root)
4877 {
4878 	struct inode *inode = new_inode(s);
4879 
4880 	if (!inode)
4881 		return ERR_PTR(-ENOMEM);
4882 
4883 	BTRFS_I(inode)->root = root;
4884 	memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
4885 	set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags);
4886 
4887 	inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID;
4888 	inode->i_op = &btrfs_dir_ro_inode_operations;
4889 	inode->i_fop = &simple_dir_operations;
4890 	inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
4891 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4892 
4893 	return inode;
4894 }
4895 
4896 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4897 {
4898 	struct inode *inode;
4899 	struct btrfs_root *root = BTRFS_I(dir)->root;
4900 	struct btrfs_root *sub_root = root;
4901 	struct btrfs_key location;
4902 	int index;
4903 	int ret = 0;
4904 
4905 	if (dentry->d_name.len > BTRFS_NAME_LEN)
4906 		return ERR_PTR(-ENAMETOOLONG);
4907 
4908 	ret = btrfs_inode_by_name(dir, dentry, &location);
4909 	if (ret < 0)
4910 		return ERR_PTR(ret);
4911 
4912 	if (location.objectid == 0)
4913 		return NULL;
4914 
4915 	if (location.type == BTRFS_INODE_ITEM_KEY) {
4916 		inode = btrfs_iget(dir->i_sb, &location, root, NULL);
4917 		return inode;
4918 	}
4919 
4920 	BUG_ON(location.type != BTRFS_ROOT_ITEM_KEY);
4921 
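	/*
	 * The entry points at the root of another subvolume.  Resolve it
	 * like a mount point crossing, under subvol_srcu so the subvolume
	 * root can't go away while we use it.
	 */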
4922 	index = srcu_read_lock(&root->fs_info->subvol_srcu);
4923 	ret = fixup_tree_root_location(root, dir, dentry,
4924 				       &location, &sub_root);
4925 	if (ret < 0) {
4926 		if (ret != -ENOENT)
4927 			inode = ERR_PTR(ret);
4928 		else
4929 			inode = new_simple_dir(dir->i_sb, &location, sub_root);
4930 	} else {
4931 		inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
4932 	}
4933 	srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4934 
4935 	if (!IS_ERR(inode) && root != sub_root) {
4936 		down_read(&root->fs_info->cleanup_work_sem);
4937 		if (!(inode->i_sb->s_flags & MS_RDONLY))
4938 			ret = btrfs_orphan_cleanup(sub_root);
4939 		up_read(&root->fs_info->cleanup_work_sem);
4940 		if (ret) {
4941 			iput(inode);
4942 			inode = ERR_PTR(ret);
4943 		}
4944 	}
4945 
4946 	return inode;
4947 }
4948 
4949 static int btrfs_dentry_delete(const struct dentry *dentry)
4950 {
4951 	struct btrfs_root *root;
4952 	struct inode *inode = dentry->d_inode;
4953 
4954 	if (!inode && !IS_ROOT(dentry))
4955 		inode = dentry->d_parent->d_inode;
4956 
4957 	if (inode) {
4958 		root = BTRFS_I(inode)->root;
4959 		if (btrfs_root_refs(&root->root_item) == 0)
4960 			return 1;
4961 
4962 		if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
4963 			return 1;
4964 	}
4965 	return 0;
4966 }
4967 
4968 static void btrfs_dentry_release(struct dentry *dentry)
4969 {
4970 	if (dentry->d_fsdata)
4971 		kfree(dentry->d_fsdata);
4972 }
4973 
4974 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4975 				   unsigned int flags)
4976 {
4977 	struct dentry *ret;
4978 
4979 	ret = d_splice_alias(btrfs_lookup_dentry(dir, dentry), dentry);
4980 	return ret;
4981 }
4982 
4983 unsigned char btrfs_filetype_table[] = {
4984 	DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4985 };
4986 
4987 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
4988 {
4989 	struct inode *inode = file_inode(file);
4990 	struct btrfs_root *root = BTRFS_I(inode)->root;
4991 	struct btrfs_item *item;
4992 	struct btrfs_dir_item *di;
4993 	struct btrfs_key key;
4994 	struct btrfs_key found_key;
4995 	struct btrfs_path *path;
4996 	struct list_head ins_list;
4997 	struct list_head del_list;
4998 	int ret;
4999 	struct extent_buffer *leaf;
5000 	int slot;
5001 	unsigned char d_type;
5002 	int over = 0;
5003 	u32 di_cur;
5004 	u32 di_total;
5005 	u32 di_len;
5006 	int key_type = BTRFS_DIR_INDEX_KEY;
5007 	char tmp_name[32];
5008 	char *name_ptr;
5009 	int name_len;
5010 	int is_curr = 0;	/* ctx->pos points to the current index? */
5011 
5012 	/* FIXME, use a real flag for deciding about the key type */
5013 	if (root->fs_info->tree_root == root)
5014 		key_type = BTRFS_DIR_ITEM_KEY;
5015 
5016 	if (!dir_emit_dots(file, ctx))
5017 		return 0;
5018 
5019 	path = btrfs_alloc_path();
5020 	if (!path)
5021 		return -ENOMEM;
5022 
5023 	path->reada = 1;
5024 
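	/*
	 * For index-based readdir we also have to merge in directory entries
	 * that so far exist only in the delayed items tree, and skip entries
	 * that are queued there for deletion.
	 */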
5025 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5026 		INIT_LIST_HEAD(&ins_list);
5027 		INIT_LIST_HEAD(&del_list);
5028 		btrfs_get_delayed_items(inode, &ins_list, &del_list);
5029 	}
5030 
5031 	btrfs_set_key_type(&key, key_type);
5032 	key.offset = ctx->pos;
5033 	key.objectid = btrfs_ino(inode);
5034 
5035 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5036 	if (ret < 0)
5037 		goto err;
5038 
5039 	while (1) {
5040 		leaf = path->nodes[0];
5041 		slot = path->slots[0];
5042 		if (slot >= btrfs_header_nritems(leaf)) {
5043 			ret = btrfs_next_leaf(root, path);
5044 			if (ret < 0)
5045 				goto err;
5046 			else if (ret > 0)
5047 				break;
5048 			continue;
5049 		}
5050 
5051 		item = btrfs_item_nr(leaf, slot);
5052 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
5053 
5054 		if (found_key.objectid != key.objectid)
5055 			break;
5056 		if (btrfs_key_type(&found_key) != key_type)
5057 			break;
5058 		if (found_key.offset < ctx->pos)
5059 			goto next;
5060 		if (key_type == BTRFS_DIR_INDEX_KEY &&
5061 		    btrfs_should_delete_dir_index(&del_list,
5062 						  found_key.offset))
5063 			goto next;
5064 
5065 		ctx->pos = found_key.offset;
5066 		is_curr = 1;
5067 
5068 		di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
5069 		di_cur = 0;
5070 		di_total = btrfs_item_size(leaf, item);
5071 
5072 		while (di_cur < di_total) {
5073 			struct btrfs_key location;
5074 
5075 			if (verify_dir_item(root, leaf, di))
5076 				break;
5077 
5078 			name_len = btrfs_dir_name_len(leaf, di);
5079 			if (name_len <= sizeof(tmp_name)) {
5080 				name_ptr = tmp_name;
5081 			} else {
5082 				name_ptr = kmalloc(name_len, GFP_NOFS);
5083 				if (!name_ptr) {
5084 					ret = -ENOMEM;
5085 					goto err;
5086 				}
5087 			}
5088 			read_extent_buffer(leaf, name_ptr,
5089 					   (unsigned long)(di + 1), name_len);
5090 
5091 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
5092 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
5093 
5094 
5095 			/* is this a reference to our own snapshot? If so
5096 			 * skip it.
5097 			 *
5098 			 * In contrast to old kernels, we insert the snapshot's
5099 			 * dir item and dir index after it has been created, so
5100 			 * we won't find a reference to our own snapshot. We
5101 			 * still keep the following code for backward
5102 			 * compatibility.
5103 			 */
5104 			if (location.type == BTRFS_ROOT_ITEM_KEY &&
5105 			    location.objectid == root->root_key.objectid) {
5106 				over = 0;
5107 				goto skip;
5108 			}
5109 			over = !dir_emit(ctx, name_ptr, name_len,
5110 				       location.objectid, d_type);
5111 
5112 skip:
5113 			if (name_ptr != tmp_name)
5114 				kfree(name_ptr);
5115 
5116 			if (over)
5117 				goto nopos;
5118 			di_len = btrfs_dir_name_len(leaf, di) +
5119 				 btrfs_dir_data_len(leaf, di) + sizeof(*di);
5120 			di_cur += di_len;
5121 			di = (struct btrfs_dir_item *)((char *)di + di_len);
5122 		}
5123 next:
5124 		path->slots[0]++;
5125 	}
5126 
5127 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5128 		if (is_curr)
5129 			ctx->pos++;
5130 		ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
5131 		if (ret)
5132 			goto nopos;
5133 	}
5134 
5135 	/* Reached end of directory/root. Bump pos past the last item. */
5136 	ctx->pos++;
5137 
5138 	/*
5139 	 * Stop new entries from being returned after we return the last
5140 	 * entry.
5141 	 *
5142 	 * New directory entries are assigned a strictly increasing
5143 	 * offset.  This means that new entries created during readdir
5144 	 * are *guaranteed* to be seen in the future by that readdir.
5145 	 * This has broken buggy programs which operate on names as
5146 	 * they're returned by readdir.  Until we re-use freed offsets
5147 	 * we have this hack to stop new entries from being returned
5148 	 * under the assumption that they'll never reach this huge
5149 	 * offset.
5150 	 *
5151 	 * This is being careful not to overflow 32bit loff_t unless the
5152 	 * last entry requires it because doing so has broken 32bit apps
5153 	 * in the past.
5154 	 */
5155 	if (key_type == BTRFS_DIR_INDEX_KEY) {
5156 		if (ctx->pos >= INT_MAX)
5157 			ctx->pos = LLONG_MAX;
5158 		else
5159 			ctx->pos = INT_MAX;
5160 	}
5161 nopos:
5162 	ret = 0;
5163 err:
5164 	if (key_type == BTRFS_DIR_INDEX_KEY)
5165 		btrfs_put_delayed_items(&ins_list, &del_list);
5166 	btrfs_free_path(path);
5167 	return ret;
5168 }
5169 
5170 int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
5171 {
5172 	struct btrfs_root *root = BTRFS_I(inode)->root;
5173 	struct btrfs_trans_handle *trans;
5174 	int ret = 0;
5175 	bool nolock = false;
5176 
5177 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5178 		return 0;
5179 
5180 	if (btrfs_fs_closing(root->fs_info) && btrfs_is_free_space_inode(inode))
5181 		nolock = true;
5182 
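	/*
	 * Inode items are normally written back via delayed items at
	 * transaction commit time, so for a data integrity sync the only
	 * way to make the inode item durable is to commit the running
	 * transaction.
	 */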
5183 	if (wbc->sync_mode == WB_SYNC_ALL) {
5184 		if (nolock)
5185 			trans = btrfs_join_transaction_nolock(root);
5186 		else
5187 			trans = btrfs_join_transaction(root);
5188 		if (IS_ERR(trans))
5189 			return PTR_ERR(trans);
5190 		ret = btrfs_commit_transaction(trans, root);
5191 	}
5192 	return ret;
5193 }
5194 
5195 /*
5196  * This is somewhat expensive, updating the tree every time the
5197  * inode changes.  But it is most likely to find the inode in cache.
5198  * FIXME, needs more benchmarking...there are no reasons other than performance
5199  * to keep or drop this code.
5200  */
5201 static int btrfs_dirty_inode(struct inode *inode)
5202 {
5203 	struct btrfs_root *root = BTRFS_I(inode)->root;
5204 	struct btrfs_trans_handle *trans;
5205 	int ret;
5206 
5207 	if (test_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags))
5208 		return 0;
5209 
5210 	trans = btrfs_join_transaction(root);
5211 	if (IS_ERR(trans))
5212 		return PTR_ERR(trans);
5213 
5214 	ret = btrfs_update_inode(trans, root, inode);
5215 	if (ret == -ENOSPC) {
5216 		/* whoops, lets try again with the full transaction */
5217 		btrfs_end_transaction(trans, root);
5218 		trans = btrfs_start_transaction(root, 1);
5219 		if (IS_ERR(trans))
5220 			return PTR_ERR(trans);
5221 
5222 		ret = btrfs_update_inode(trans, root, inode);
5223 	}
5224 	btrfs_end_transaction(trans, root);
5225 	if (BTRFS_I(inode)->delayed_node)
5226 		btrfs_balance_delayed_items(root);
5227 
5228 	return ret;
5229 }
5230 
5231 /*
5232  * This is a copy of file_update_time.  We need it so we can return an error
5233  * on ENOSPC when updating the inode in the file write and mmap write paths.
5234  */
5235 static int btrfs_update_time(struct inode *inode, struct timespec *now,
5236 			     int flags)
5237 {
5238 	struct btrfs_root *root = BTRFS_I(inode)->root;
5239 
5240 	if (btrfs_root_readonly(root))
5241 		return -EROFS;
5242 
5243 	if (flags & S_VERSION)
5244 		inode_inc_iversion(inode);
5245 	if (flags & S_CTIME)
5246 		inode->i_ctime = *now;
5247 	if (flags & S_MTIME)
5248 		inode->i_mtime = *now;
5249 	if (flags & S_ATIME)
5250 		inode->i_atime = *now;
5251 	return btrfs_dirty_inode(inode);
5252 }
5253 
5254 /*
5255  * find the highest existing sequence number in a directory
5256  * and then set the in-memory index_cnt variable to reflect
5257  * free sequence numbers
5258  */
5259 static int btrfs_set_inode_index_count(struct inode *inode)
5260 {
5261 	struct btrfs_root *root = BTRFS_I(inode)->root;
5262 	struct btrfs_key key, found_key;
5263 	struct btrfs_path *path;
5264 	struct extent_buffer *leaf;
5265 	int ret;
5266 
5267 	key.objectid = btrfs_ino(inode);
5268 	btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
5269 	key.offset = (u64)-1;
5270 
5271 	path = btrfs_alloc_path();
5272 	if (!path)
5273 		return -ENOMEM;
5274 
5275 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5276 	if (ret < 0)
5277 		goto out;
5278 	/* FIXME: we should be able to handle this */
5279 	if (ret == 0)
5280 		goto out;
5281 	ret = 0;
5282 
5283 	/*
5284 	 * MAGIC NUMBER EXPLANATION:
5285 	 * since we search a directory based on f_pos we have to start at 2
5286 	 * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody
5287 	 * else has to start at 2
5288 	 */
5289 	if (path->slots[0] == 0) {
5290 		BTRFS_I(inode)->index_cnt = 2;
5291 		goto out;
5292 	}
5293 
5294 	path->slots[0]--;
5295 
5296 	leaf = path->nodes[0];
5297 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5298 
5299 	if (found_key.objectid != btrfs_ino(inode) ||
5300 	    btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
5301 		BTRFS_I(inode)->index_cnt = 2;
5302 		goto out;
5303 	}
5304 
5305 	BTRFS_I(inode)->index_cnt = found_key.offset + 1;
5306 out:
5307 	btrfs_free_path(path);
5308 	return ret;
5309 }
5310 
5311 /*
5312  * helper to find a free sequence number in a given directory.  The current
5313  * code is very simple; later versions will do smarter things in the btree
5314  */
5315 int btrfs_set_inode_index(struct inode *dir, u64 *index)
5316 {
5317 	int ret = 0;
5318 
5319 	if (BTRFS_I(dir)->index_cnt == (u64)-1) {
5320 		ret = btrfs_inode_delayed_dir_index_count(dir);
5321 		if (ret) {
5322 			ret = btrfs_set_inode_index_count(dir);
5323 			if (ret)
5324 				return ret;
5325 		}
5326 	}
5327 
5328 	*index = BTRFS_I(dir)->index_cnt;
5329 	BTRFS_I(dir)->index_cnt++;
5330 
5331 	return ret;
5332 }
5333 
5334 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5335 				     struct btrfs_root *root,
5336 				     struct inode *dir,
5337 				     const char *name, int name_len,
5338 				     u64 ref_objectid, u64 objectid,
5339 				     umode_t mode, u64 *index)
5340 {
5341 	struct inode *inode;
5342 	struct btrfs_inode_item *inode_item;
5343 	struct btrfs_key *location;
5344 	struct btrfs_path *path;
5345 	struct btrfs_inode_ref *ref;
5346 	struct btrfs_key key[2];
5347 	u32 sizes[2];
5348 	unsigned long ptr;
5349 	int ret;
5350 	int owner;
5351 
5352 	path = btrfs_alloc_path();
5353 	if (!path)
5354 		return ERR_PTR(-ENOMEM);
5355 
5356 	inode = new_inode(root->fs_info->sb);
5357 	if (!inode) {
5358 		btrfs_free_path(path);
5359 		return ERR_PTR(-ENOMEM);
5360 	}
5361 
5362 	/*
5363 	 * we have to initialize this early, so we can reclaim the inode
5364 	 * number if we fail afterwards in this function.
5365 	 */
5366 	inode->i_ino = objectid;
5367 
5368 	if (dir) {
5369 		trace_btrfs_inode_request(dir);
5370 
5371 		ret = btrfs_set_inode_index(dir, index);
5372 		if (ret) {
5373 			btrfs_free_path(path);
5374 			iput(inode);
5375 			return ERR_PTR(ret);
5376 		}
5377 	}
5378 	/*
5379 	 * index_cnt is ignored for everything but a dir,
5380 	 * btrfs_set_inode_index_count has an explanation for the magic
5381 	 * number
5382 	 */
5383 	BTRFS_I(inode)->index_cnt = 2;
5384 	BTRFS_I(inode)->root = root;
5385 	BTRFS_I(inode)->generation = trans->transid;
5386 	inode->i_generation = BTRFS_I(inode)->generation;
5387 
5388 	/*
5389 	 * We could have gotten an inode number from somebody who was fsynced
5390 	 * and then removed in this same transaction, so let's just set full
5391 	 * sync since it will be a full sync anyway and this will blow away the
5392 	 * old info in the log.
5393 	 */
5394 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
5395 
5396 	if (S_ISDIR(mode))
5397 		owner = 0;
5398 	else
5399 		owner = 1;
5400 
5401 	key[0].objectid = objectid;
5402 	btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
5403 	key[0].offset = 0;
5404 
5405 	/*
5406 	 * Start new inodes with an inode_ref. This is slightly more
5407 	 * efficient for small numbers of hard links since they will
5408 	 * be packed into one item. Extended refs will kick in if we
5409 	 * add more hard links than can fit in the ref item.
5410 	 */
5411 	key[1].objectid = objectid;
5412 	btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY);
5413 	key[1].offset = ref_objectid;
5414 
5415 	sizes[0] = sizeof(struct btrfs_inode_item);
5416 	sizes[1] = name_len + sizeof(*ref);
5417 
5418 	path->leave_spinning = 1;
5419 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
5420 	if (ret != 0)
5421 		goto fail;
5422 
5423 	inode_init_owner(inode, dir, mode);
5424 	inode_set_bytes(inode, 0);
5425 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
5426 	inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5427 				  struct btrfs_inode_item);
5428 	memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
5429 			     sizeof(*inode_item));
5430 	fill_inode_item(trans, path->nodes[0], inode_item, inode);
5431 
5432 	ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
5433 			     struct btrfs_inode_ref);
5434 	btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
5435 	btrfs_set_inode_ref_index(path->nodes[0], ref, *index);
5436 	ptr = (unsigned long)(ref + 1);
5437 	write_extent_buffer(path->nodes[0], name, ptr, name_len);
5438 
5439 	btrfs_mark_buffer_dirty(path->nodes[0]);
5440 	btrfs_free_path(path);
5441 
5442 	location = &BTRFS_I(inode)->location;
5443 	location->objectid = objectid;
5444 	location->offset = 0;
5445 	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
5446 
5447 	btrfs_inherit_iflags(inode, dir);
5448 
5449 	if (S_ISREG(mode)) {
5450 		if (btrfs_test_opt(root, NODATASUM))
5451 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
5452 		if (btrfs_test_opt(root, NODATACOW))
5453 			BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
5454 				BTRFS_INODE_NODATASUM;
5455 	}
5456 
5457 	insert_inode_hash(inode);
5458 	inode_tree_add(inode);
5459 
5460 	trace_btrfs_inode_new(inode);
5461 	btrfs_set_inode_last_trans(trans, inode);
5462 
5463 	btrfs_update_root_times(trans, root);
5464 
5465 	return inode;
5466 fail:
5467 	if (dir)
5468 		BTRFS_I(dir)->index_cnt--;
5469 	btrfs_free_path(path);
5470 	iput(inode);
5471 	return ERR_PTR(ret);
5472 }
5473 
5474 static inline u8 btrfs_inode_type(struct inode *inode)
5475 {
5476 	return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT];
5477 }
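
/*
 * For example, a regular file has (i_mode & S_IFMT) == S_IFREG
 * (0100000 octal); shifting right by S_SHIFT (12) gives index 8,
 * which btrfs_type_by_mode maps to BTRFS_FT_REG_FILE.
 */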
5478 
5479 /*
5480  * utility function to add 'inode' into 'parent_inode' with
5481  * a given name and a given sequence number.
5482  * if 'add_backref' is true, also insert a backref from the
5483  * inode to the parent directory.
5484  */
5485 int btrfs_add_link(struct btrfs_trans_handle *trans,
5486 		   struct inode *parent_inode, struct inode *inode,
5487 		   const char *name, int name_len, int add_backref, u64 index)
5488 {
5489 	int ret = 0;
5490 	struct btrfs_key key;
5491 	struct btrfs_root *root = BTRFS_I(parent_inode)->root;
5492 	u64 ino = btrfs_ino(inode);
5493 	u64 parent_ino = btrfs_ino(parent_inode);
5494 
5495 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5496 		memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
5497 	} else {
5498 		key.objectid = ino;
5499 		btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
5500 		key.offset = 0;
5501 	}
5502 
5503 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5504 		ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
5505 					 key.objectid, root->root_key.objectid,
5506 					 parent_ino, index, name, name_len);
5507 	} else if (add_backref) {
5508 		ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
5509 					     parent_ino, index);
5510 	}
5511 
5512 	/* Nothing to clean up yet */
5513 	if (ret)
5514 		return ret;
5515 
5516 	ret = btrfs_insert_dir_item(trans, root, name, name_len,
5517 				    parent_inode, &key,
5518 				    btrfs_inode_type(inode), index);
5519 	if (ret == -EEXIST || ret == -EOVERFLOW)
5520 		goto fail_dir_item;
5521 	else if (ret) {
5522 		btrfs_abort_transaction(trans, root, ret);
5523 		return ret;
5524 	}
5525 
5526 	btrfs_i_size_write(parent_inode, parent_inode->i_size +
5527 			   name_len * 2);
5528 	inode_inc_iversion(parent_inode);
5529 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
5530 	ret = btrfs_update_inode(trans, root, parent_inode);
5531 	if (ret)
5532 		btrfs_abort_transaction(trans, root, ret);
5533 	return ret;
5534 
5535 fail_dir_item:
5536 	if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
5537 		u64 local_index;
5538 		int err;
5539 		err = btrfs_del_root_ref(trans, root->fs_info->tree_root,
5540 				 key.objectid, root->root_key.objectid,
5541 				 parent_ino, &local_index, name, name_len);
5542 
5543 	} else if (add_backref) {
5544 		u64 local_index;
5545 		int err;
5546 
5547 		err = btrfs_del_inode_ref(trans, root, name, name_len,
5548 					  ino, parent_ino, &local_index);
5549 	}
5550 	return ret;
5551 }
5552 
5553 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
5554 			    struct inode *dir, struct dentry *dentry,
5555 			    struct inode *inode, int backref, u64 index)
5556 {
5557 	int err = btrfs_add_link(trans, dir, inode,
5558 				 dentry->d_name.name, dentry->d_name.len,
5559 				 backref, index);
5560 	if (err > 0)
5561 		err = -EEXIST;
5562 	return err;
5563 }
5564 
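/*
 * Create a special file.  E.g. "mknod /mnt/fifo p" (an illustrative
 * path) reaches here via the VFS with S_IFIFO set in 'mode' and
 * rdev == 0.
 */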
5565 static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
5566 			umode_t mode, dev_t rdev)
5567 {
5568 	struct btrfs_trans_handle *trans;
5569 	struct btrfs_root *root = BTRFS_I(dir)->root;
5570 	struct inode *inode = NULL;
5571 	int err;
5572 	int drop_inode = 0;
5573 	u64 objectid;
5574 	u64 index = 0;
5575 
5576 	if (!new_valid_dev(rdev))
5577 		return -EINVAL;
5578 
5579 	/*
5580 	 * 2 for inode item and ref
5581 	 * 2 for dir items
5582 	 * 1 for xattr if selinux is on
5583 	 */
5584 	trans = btrfs_start_transaction(root, 5);
5585 	if (IS_ERR(trans))
5586 		return PTR_ERR(trans);
5587 
5588 	err = btrfs_find_free_ino(root, &objectid);
5589 	if (err)
5590 		goto out_unlock;
5591 
5592 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5593 				dentry->d_name.len, btrfs_ino(dir), objectid,
5594 				mode, &index);
5595 	if (IS_ERR(inode)) {
5596 		err = PTR_ERR(inode);
5597 		goto out_unlock;
5598 	}
5599 
5600 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5601 	if (err) {
5602 		drop_inode = 1;
5603 		goto out_unlock;
5604 	}
5605 
5606 	/*
5607 	 * If the active LSM wants to access the inode during
5608 	 * d_instantiate it needs these. Smack checks to see
5609 	 * if the filesystem supports xattrs by looking at the
5610 	 * ops vector.
5611 	 */
5612 
5613 	inode->i_op = &btrfs_special_inode_operations;
5614 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5615 	if (err)
5616 		drop_inode = 1;
5617 	else {
5618 		init_special_inode(inode, inode->i_mode, rdev);
5619 		btrfs_update_inode(trans, root, inode);
5620 		d_instantiate(dentry, inode);
5621 	}
5622 out_unlock:
5623 	btrfs_end_transaction(trans, root);
5624 	btrfs_btree_balance_dirty(root);
5625 	if (drop_inode) {
5626 		inode_dec_link_count(inode);
5627 		iput(inode);
5628 	}
5629 	return err;
5630 }
5631 
5632 static int btrfs_create(struct inode *dir, struct dentry *dentry,
5633 			umode_t mode, bool excl)
5634 {
5635 	struct btrfs_trans_handle *trans;
5636 	struct btrfs_root *root = BTRFS_I(dir)->root;
5637 	struct inode *inode = NULL;
5638 	int drop_inode_on_err = 0;
5639 	int err;
5640 	u64 objectid;
5641 	u64 index = 0;
5642 
5643 	/*
5644 	 * 2 for inode item and ref
5645 	 * 2 for dir items
5646 	 * 1 for xattr if selinux is on
5647 	 */
5648 	trans = btrfs_start_transaction(root, 5);
5649 	if (IS_ERR(trans))
5650 		return PTR_ERR(trans);
5651 
5652 	err = btrfs_find_free_ino(root, &objectid);
5653 	if (err)
5654 		goto out_unlock;
5655 
5656 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5657 				dentry->d_name.len, btrfs_ino(dir), objectid,
5658 				mode, &index);
5659 	if (IS_ERR(inode)) {
5660 		err = PTR_ERR(inode);
5661 		goto out_unlock;
5662 	}
5663 	drop_inode_on_err = 1;
5664 
5665 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5666 	if (err)
5667 		goto out_unlock;
5668 
5669 	err = btrfs_update_inode(trans, root, inode);
5670 	if (err)
5671 		goto out_unlock;
5672 
5673 	/*
5674 	 * If the active LSM wants to access the inode during
5675 	 * d_instantiate it needs these. Smack checks to see
5676 	 * if the filesystem supports xattrs by looking at the
5677 	 * ops vector.
5678 	 */
5679 	inode->i_fop = &btrfs_file_operations;
5680 	inode->i_op = &btrfs_file_inode_operations;
5681 
5682 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
5683 	if (err)
5684 		goto out_unlock;
5685 
5686 	inode->i_mapping->a_ops = &btrfs_aops;
5687 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
5688 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
5689 	d_instantiate(dentry, inode);
5690 
5691 out_unlock:
5692 	btrfs_end_transaction(trans, root);
5693 	if (err && drop_inode_on_err) {
5694 		inode_dec_link_count(inode);
5695 		iput(inode);
5696 	}
5697 	btrfs_btree_balance_dirty(root);
5698 	return err;
5699 }
5700 
5701 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
5702 		      struct dentry *dentry)
5703 {
5704 	struct btrfs_trans_handle *trans;
5705 	struct btrfs_root *root = BTRFS_I(dir)->root;
5706 	struct inode *inode = old_dentry->d_inode;
5707 	u64 index;
5708 	int err;
5709 	int drop_inode = 0;
5710 
5711 	/* do not allow hard links across subvolumes of the same device */
5712 	if (root->objectid != BTRFS_I(inode)->root->objectid)
5713 		return -EXDEV;
5714 
5715 	if (inode->i_nlink >= BTRFS_LINK_MAX)
5716 		return -EMLINK;
5717 
5718 	err = btrfs_set_inode_index(dir, &index);
5719 	if (err)
5720 		goto fail;
5721 
5722 	/*
5723 	 * 2 items for inode and inode ref
5724 	 * 2 items for dir items
5725 	 * 1 item for parent inode
5726 	 */
5727 	trans = btrfs_start_transaction(root, 5);
5728 	if (IS_ERR(trans)) {
5729 		err = PTR_ERR(trans);
5730 		goto fail;
5731 	}
5732 
5733 	btrfs_inc_nlink(inode);
5734 	inode_inc_iversion(inode);
5735 	inode->i_ctime = CURRENT_TIME;
5736 	ihold(inode);
5737 	set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
5738 
5739 	err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
5740 
5741 	if (err) {
5742 		drop_inode = 1;
5743 	} else {
5744 		struct dentry *parent = dentry->d_parent;
5745 		err = btrfs_update_inode(trans, root, inode);
5746 		if (err)
5747 			goto fail;
5748 		d_instantiate(dentry, inode);
5749 		btrfs_log_new_name(trans, inode, NULL, parent);
5750 	}
5751 
5752 	btrfs_end_transaction(trans, root);
5753 fail:
5754 	if (drop_inode) {
5755 		inode_dec_link_count(inode);
5756 		iput(inode);
5757 	}
5758 	btrfs_btree_balance_dirty(root);
5759 	return err;
5760 }
5761 
5762 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
5763 {
5764 	struct inode *inode = NULL;
5765 	struct btrfs_trans_handle *trans;
5766 	struct btrfs_root *root = BTRFS_I(dir)->root;
5767 	int err = 0;
5768 	int drop_on_err = 0;
5769 	u64 objectid = 0;
5770 	u64 index = 0;
5771 
5772 	/*
5773 	 * 2 items for inode and ref
5774 	 * 2 items for dir items
5775 	 * 1 for xattr if selinux is on
5776 	 */
5777 	trans = btrfs_start_transaction(root, 5);
5778 	if (IS_ERR(trans))
5779 		return PTR_ERR(trans);
5780 
5781 	err = btrfs_find_free_ino(root, &objectid);
5782 	if (err)
5783 		goto out_fail;
5784 
5785 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5786 				dentry->d_name.len, btrfs_ino(dir), objectid,
5787 				S_IFDIR | mode, &index);
5788 	if (IS_ERR(inode)) {
5789 		err = PTR_ERR(inode);
5790 		goto out_fail;
5791 	}
5792 
5793 	drop_on_err = 1;
5794 
5795 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
5796 	if (err)
5797 		goto out_fail;
5798 
5799 	inode->i_op = &btrfs_dir_inode_operations;
5800 	inode->i_fop = &btrfs_dir_file_operations;
5801 
5802 	btrfs_i_size_write(inode, 0);
5803 	err = btrfs_update_inode(trans, root, inode);
5804 	if (err)
5805 		goto out_fail;
5806 
5807 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
5808 			     dentry->d_name.len, 0, index);
5809 	if (err)
5810 		goto out_fail;
5811 
5812 	d_instantiate(dentry, inode);
5813 	drop_on_err = 0;
5814 
5815 out_fail:
5816 	btrfs_end_transaction(trans, root);
5817 	if (drop_on_err)
5818 		iput(inode);
5819 	btrfs_btree_balance_dirty(root);
5820 	return err;
5821 }
5822 
5823 /* helper for btrfs_get_extent.  Given an existing extent in the tree,
5824  * and an extent that you want to insert, deal with overlap and insert
5825  * the new extent into the tree.
5826  */
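
/*
 * E.g. if the new 'em' spans [0, 128k) but an existing map already
 * covers the first 64k, the caller passes map_start = 64k: the new em
 * is trimmed to begin 64k in and to cover map_len bytes, with
 * block_start/block_len shifted by the same 64k for ordinary
 * (uncompressed) extents.
 */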
5827 static int merge_extent_mapping(struct extent_map_tree *em_tree,
5828 				struct extent_map *existing,
5829 				struct extent_map *em,
5830 				u64 map_start, u64 map_len)
5831 {
5832 	u64 start_diff;
5833 
5834 	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
5835 	start_diff = map_start - em->start;
5836 	em->start = map_start;
5837 	em->len = map_len;
5838 	if (em->block_start < EXTENT_MAP_LAST_BYTE &&
5839 	    !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
5840 		em->block_start += start_diff;
5841 		em->block_len -= start_diff;
5842 	}
5843 	return add_extent_mapping(em_tree, em, 0);
5844 }
5845 
5846 static noinline int uncompress_inline(struct btrfs_path *path,
5847 				      struct inode *inode, struct page *page,
5848 				      size_t pg_offset, u64 extent_offset,
5849 				      struct btrfs_file_extent_item *item)
5850 {
5851 	int ret;
5852 	struct extent_buffer *leaf = path->nodes[0];
5853 	char *tmp;
5854 	size_t max_size;
5855 	unsigned long inline_size;
5856 	unsigned long ptr;
5857 	int compress_type;
5858 
5859 	WARN_ON(pg_offset != 0);
5860 	compress_type = btrfs_file_extent_compression(leaf, item);
5861 	max_size = btrfs_file_extent_ram_bytes(leaf, item);
5862 	inline_size = btrfs_file_extent_inline_item_len(leaf,
5863 					btrfs_item_nr(leaf, path->slots[0]));
5864 	tmp = kmalloc(inline_size, GFP_NOFS);
5865 	if (!tmp)
5866 		return -ENOMEM;
5867 	ptr = btrfs_file_extent_inline_start(item);
5868 
5869 	read_extent_buffer(leaf, tmp, ptr, inline_size);
5870 
5871 	max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
5872 	ret = btrfs_decompress(compress_type, tmp, page,
5873 			       extent_offset, inline_size, max_size);
5874 	if (ret) {
5875 		char *kaddr = kmap_atomic(page);
5876 		unsigned long copy_size = min_t(u64,
5877 				  PAGE_CACHE_SIZE - pg_offset,
5878 				  max_size - extent_offset);
5879 		memset(kaddr + pg_offset, 0, copy_size);
5880 		kunmap_atomic(kaddr);
5881 	}
5882 	kfree(tmp);
5883 	return 0;
5884 }
5885 
5886 /*
5887  * a bit scary, this does extent mapping from logical file offset to the disk.
5888  * the ugly parts come from merging extents from the disk with the in-ram
5889  * representation.  This gets more complex because of the data=ordered code,
5890  * where the in-ram extents might be locked pending data=ordered completion.
5891  *
5892  * This also copies inline extents directly into the page.
5893  */
5894 
5895 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
5896 				    size_t pg_offset, u64 start, u64 len,
5897 				    int create)
5898 {
5899 	int ret;
5900 	int err = 0;
5901 	u64 bytenr;
5902 	u64 extent_start = 0;
5903 	u64 extent_end = 0;
5904 	u64 objectid = btrfs_ino(inode);
5905 	u32 found_type;
5906 	struct btrfs_path *path = NULL;
5907 	struct btrfs_root *root = BTRFS_I(inode)->root;
5908 	struct btrfs_file_extent_item *item;
5909 	struct extent_buffer *leaf;
5910 	struct btrfs_key found_key;
5911 	struct extent_map *em = NULL;
5912 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5913 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5914 	struct btrfs_trans_handle *trans = NULL;
5915 	int compress_type;
5916 
5917 again:
5918 	read_lock(&em_tree->lock);
5919 	em = lookup_extent_mapping(em_tree, start, len);
5920 	if (em)
5921 		em->bdev = root->fs_info->fs_devices->latest_bdev;
5922 	read_unlock(&em_tree->lock);
5923 
5924 	if (em) {
5925 		if (em->start > start || em->start + em->len <= start)
5926 			free_extent_map(em);
5927 		else if (em->block_start == EXTENT_MAP_INLINE && page)
5928 			free_extent_map(em);
5929 		else
5930 			goto out;
5931 	}
5932 	em = alloc_extent_map();
5933 	if (!em) {
5934 		err = -ENOMEM;
5935 		goto out;
5936 	}
5937 	em->bdev = root->fs_info->fs_devices->latest_bdev;
5938 	em->start = EXTENT_MAP_HOLE;
5939 	em->orig_start = EXTENT_MAP_HOLE;
5940 	em->len = (u64)-1;
5941 	em->block_len = (u64)-1;
5942 
5943 	if (!path) {
5944 		path = btrfs_alloc_path();
5945 		if (!path) {
5946 			err = -ENOMEM;
5947 			goto out;
5948 		}
5949 		/*
5950 		 * Chances are we'll be called again, so go ahead and do
5951 		 * readahead
5952 		 */
5953 		path->reada = 1;
5954 	}
5955 
5956 	ret = btrfs_lookup_file_extent(trans, root, path,
5957 				       objectid, start, trans != NULL);
5958 	if (ret < 0) {
5959 		err = ret;
5960 		goto out;
5961 	}
5962 
5963 	if (ret != 0) {
5964 		if (path->slots[0] == 0)
5965 			goto not_found;
5966 		path->slots[0]--;
5967 	}
5968 
5969 	leaf = path->nodes[0];
5970 	item = btrfs_item_ptr(leaf, path->slots[0],
5971 			      struct btrfs_file_extent_item);
5972 	/* are we inside the extent that was found? */
5973 	btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5974 	found_type = btrfs_key_type(&found_key);
5975 	if (found_key.objectid != objectid ||
5976 	    found_type != BTRFS_EXTENT_DATA_KEY) {
5977 		goto not_found;
5978 	}
5979 
5980 	found_type = btrfs_file_extent_type(leaf, item);
5981 	extent_start = found_key.offset;
5982 	compress_type = btrfs_file_extent_compression(leaf, item);
5983 	if (found_type == BTRFS_FILE_EXTENT_REG ||
5984 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5985 		extent_end = extent_start +
5986 		       btrfs_file_extent_num_bytes(leaf, item);
5987 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
5988 		size_t size;
5989 		size = btrfs_file_extent_inline_len(leaf, item);
5990 		extent_end = ALIGN(extent_start + size, root->sectorsize);
5991 	}
5992 
5993 	if (start >= extent_end) {
5994 		path->slots[0]++;
5995 		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
5996 			ret = btrfs_next_leaf(root, path);
5997 			if (ret < 0) {
5998 				err = ret;
5999 				goto out;
6000 			}
6001 			if (ret > 0)
6002 				goto not_found;
6003 			leaf = path->nodes[0];
6004 		}
6005 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6006 		if (found_key.objectid != objectid ||
6007 		    found_key.type != BTRFS_EXTENT_DATA_KEY)
6008 			goto not_found;
6009 		if (start + len <= found_key.offset)
6010 			goto not_found;
6011 		em->start = start;
6012 		em->orig_start = start;
6013 		em->len = found_key.offset - start;
6014 		goto not_found_em;
6015 	}
6016 
6017 	em->ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
6018 	if (found_type == BTRFS_FILE_EXTENT_REG ||
6019 	    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
6020 		em->start = extent_start;
6021 		em->len = extent_end - extent_start;
6022 		em->orig_start = extent_start -
6023 				 btrfs_file_extent_offset(leaf, item);
6024 		em->orig_block_len = btrfs_file_extent_disk_num_bytes(leaf,
6025 								      item);
6026 		bytenr = btrfs_file_extent_disk_bytenr(leaf, item);
6027 		if (bytenr == 0) {
6028 			em->block_start = EXTENT_MAP_HOLE;
6029 			goto insert;
6030 		}
6031 		if (compress_type != BTRFS_COMPRESS_NONE) {
6032 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6033 			em->compress_type = compress_type;
6034 			em->block_start = bytenr;
6035 			em->block_len = em->orig_block_len;
6036 		} else {
6037 			bytenr += btrfs_file_extent_offset(leaf, item);
6038 			em->block_start = bytenr;
6039 			em->block_len = em->len;
6040 			if (found_type == BTRFS_FILE_EXTENT_PREALLOC)
6041 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6042 		}
6043 		goto insert;
6044 	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
6045 		unsigned long ptr;
6046 		char *map;
6047 		size_t size;
6048 		size_t extent_offset;
6049 		size_t copy_size;
6050 
6051 		em->block_start = EXTENT_MAP_INLINE;
6052 		if (!page || create) {
6053 			em->start = extent_start;
6054 			em->len = extent_end - extent_start;
6055 			goto out;
6056 		}
6057 
6058 		size = btrfs_file_extent_inline_len(leaf, item);
6059 		extent_offset = page_offset(page) + pg_offset - extent_start;
6060 		copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset,
6061 				size - extent_offset);
6062 		em->start = extent_start + extent_offset;
6063 		em->len = ALIGN(copy_size, root->sectorsize);
6064 		em->orig_block_len = em->len;
6065 		em->orig_start = em->start;
6066 		if (compress_type) {
6067 			set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
6068 			em->compress_type = compress_type;
6069 		}
6070 		ptr = btrfs_file_extent_inline_start(item) + extent_offset;
6071 		if (create == 0 && !PageUptodate(page)) {
6072 			if (btrfs_file_extent_compression(leaf, item) !=
6073 			    BTRFS_COMPRESS_NONE) {
6074 				ret = uncompress_inline(path, inode, page,
6075 							pg_offset,
6076 							extent_offset, item);
6077 				BUG_ON(ret); /* -ENOMEM */
6078 			} else {
6079 				map = kmap(page);
6080 				read_extent_buffer(leaf, map + pg_offset, ptr,
6081 						   copy_size);
6082 				if (pg_offset + copy_size < PAGE_CACHE_SIZE) {
6083 					memset(map + pg_offset + copy_size, 0,
6084 					       PAGE_CACHE_SIZE - pg_offset -
6085 					       copy_size);
6086 				}
6087 				kunmap(page);
6088 			}
6089 			flush_dcache_page(page);
6090 		} else if (create && PageUptodate(page)) {
6091 			BUG();
6092 			if (!trans) {
6093 				kunmap(page);
6094 				free_extent_map(em);
6095 				em = NULL;
6096 
6097 				btrfs_release_path(path);
6098 				trans = btrfs_join_transaction(root);
6099 
6100 				if (IS_ERR(trans))
6101 					return ERR_CAST(trans);
6102 				goto again;
6103 			}
6104 			map = kmap(page);
6105 			write_extent_buffer(leaf, map + pg_offset, ptr,
6106 					    copy_size);
6107 			kunmap(page);
6108 			btrfs_mark_buffer_dirty(leaf);
6109 		}
6110 		set_extent_uptodate(io_tree, em->start,
6111 				    extent_map_end(em) - 1, NULL, GFP_NOFS);
6112 		goto insert;
6113 	} else {
6114 		WARN(1, KERN_ERR "btrfs unknown found_type %d\n", found_type);
6115 	}
6116 not_found:
6117 	em->start = start;
6118 	em->orig_start = start;
6119 	em->len = len;
6120 not_found_em:
6121 	em->block_start = EXTENT_MAP_HOLE;
6122 	set_bit(EXTENT_FLAG_VACANCY, &em->flags);
6123 insert:
6124 	btrfs_release_path(path);
6125 	if (em->start > start || extent_map_end(em) <= start) {
6126 		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
6127 			em->start, em->len, start, len);
6128 		err = -EIO;
6129 		goto out;
6130 	}
6131 
6132 	err = 0;
6133 	write_lock(&em_tree->lock);
6134 	ret = add_extent_mapping(em_tree, em, 0);
6135 	/* it is possible that someone inserted the extent into the tree
6136 	 * while we had the lock dropped.  It is also possible that
6137 	 * an overlapping map exists in the tree
6138 	 */
6139 	if (ret == -EEXIST) {
6140 		struct extent_map *existing;
6141 
6142 		ret = 0;
6143 
6144 		existing = lookup_extent_mapping(em_tree, start, len);
6145 		if (existing && (existing->start > start ||
6146 		    existing->start + existing->len <= start)) {
6147 			free_extent_map(existing);
6148 			existing = NULL;
6149 		}
6150 		if (!existing) {
6151 			existing = lookup_extent_mapping(em_tree, em->start,
6152 							 em->len);
6153 			if (existing) {
6154 				err = merge_extent_mapping(em_tree, existing,
6155 							   em, start,
6156 							   root->sectorsize);
6157 				free_extent_map(existing);
6158 				if (err) {
6159 					free_extent_map(em);
6160 					em = NULL;
6161 				}
6162 			} else {
6163 				err = -EIO;
6164 				free_extent_map(em);
6165 				em = NULL;
6166 			}
6167 		} else {
6168 			free_extent_map(em);
6169 			em = existing;
6170 			err = 0;
6171 		}
6172 	}
6173 	write_unlock(&em_tree->lock);
6174 out:
6175 
6176 	if (em)
6177 		trace_btrfs_get_extent(root, em);
6178 
6179 	if (path)
6180 		btrfs_free_path(path);
6181 	if (trans) {
6182 		ret = btrfs_end_transaction(trans, root);
6183 		if (!err)
6184 			err = ret;
6185 	}
6186 	if (err) {
6187 		free_extent_map(em);
6188 		return ERR_PTR(err);
6189 	}
6190 	BUG_ON(!em); /* Error is always set */
6191 	return em;
6192 }
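
/*
 * Read-side usage sketch (error handling elided; handle_hole() is an
 * illustrative helper, not a real function):
 *
 *	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 *	if (!IS_ERR(em)) {
 *		if (em->block_start == EXTENT_MAP_HOLE)
 *			handle_hole(em);
 *		free_extent_map(em);
 *	}
 */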
6193 
6194 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
6195 					   size_t pg_offset, u64 start, u64 len,
6196 					   int create)
6197 {
6198 	struct extent_map *em;
6199 	struct extent_map *hole_em = NULL;
6200 	u64 range_start = start;
6201 	u64 end;
6202 	u64 found;
6203 	u64 found_end;
6204 	int err = 0;
6205 
6206 	em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
6207 	if (IS_ERR(em))
6208 		return em;
6209 	if (em) {
6210 		/*
6211 		 * if our em maps to
6212 		 * -  a hole or
6213 		 * -  a pre-alloc extent,
6214 		 * there might actually be delalloc bytes behind it.
6215 		 */
6216 		if (em->block_start != EXTENT_MAP_HOLE &&
6217 		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6218 			return em;
6219 		else
6220 			hole_em = em;
6221 	}
6222 
6223 	/* check to see if we've wrapped (len == -1 or similar) */
6224 	end = start + len;
6225 	if (end < start)
6226 		end = (u64)-1;
6227 	else
6228 		end -= 1;
6229 
6230 	em = NULL;
6231 
6232 	/* ok, we didn't find anything, let's look for delalloc */
6233 	found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
6234 				 end, len, EXTENT_DELALLOC, 1);
6235 	found_end = range_start + found;
6236 	if (found_end < range_start)
6237 		found_end = (u64)-1;
6238 
6239 	/*
6240 	 * we didn't find anything useful, return
6241 	 * the original results from get_extent()
6242 	 */
6243 	if (range_start > end || found_end <= start) {
6244 		em = hole_em;
6245 		hole_em = NULL;
6246 		goto out;
6247 	}
6248 
6249 	/* adjust the range_start to make sure it doesn't
6250 	 * go backwards from the start they passed in
6251 	 */
6252 	range_start = max(start, range_start);
6253 	found = found_end - range_start;
6254 
6255 	if (found > 0) {
6256 		u64 hole_start = start;
6257 		u64 hole_len = len;
6258 
6259 		em = alloc_extent_map();
6260 		if (!em) {
6261 			err = -ENOMEM;
6262 			goto out;
6263 		}
6264 		/*
6265 		 * when btrfs_get_extent can't find anything it
6266 		 * returns one huge hole
6267 		 *
6268 		 * make sure what it found really fits our range, and
6269 		 * adjust to make sure it is based on the start from
6270 		 * the caller
6271 		 */
6272 		if (hole_em) {
6273 			u64 calc_end = extent_map_end(hole_em);
6274 
6275 			if (calc_end <= start || (hole_em->start > end)) {
6276 				free_extent_map(hole_em);
6277 				hole_em = NULL;
6278 			} else {
6279 				hole_start = max(hole_em->start, start);
6280 				hole_len = calc_end - hole_start;
6281 			}
6282 		}
6283 		em->bdev = NULL;
6284 		if (hole_em && range_start > hole_start) {
6285 			/* our hole starts before our delalloc, so we
6286 			 * have to return just the parts of the hole
6287 			 * that go until  the delalloc starts
6288 			 * that go until the delalloc starts
6289 			em->len = min(hole_len,
6290 				      range_start - hole_start);
6291 			em->start = hole_start;
6292 			em->orig_start = hole_start;
6293 			/*
6294 			 * don't adjust block start at all,
6295 			 * it is fixed at EXTENT_MAP_HOLE
6296 			 */
6297 			em->block_start = hole_em->block_start;
6298 			em->block_len = hole_len;
6299 			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
6300 				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
6301 		} else {
6302 			em->start = range_start;
6303 			em->len = found;
6304 			em->orig_start = range_start;
6305 			em->block_start = EXTENT_MAP_DELALLOC;
6306 			em->block_len = found;
6307 		}
6308 	} else if (hole_em) {
6309 		return hole_em;
6310 	}
6311 out:
6312 
6313 	free_extent_map(hole_em);
6314 	if (err) {
6315 		free_extent_map(em);
6316 		return ERR_PTR(err);
6317 	}
6318 	return em;
6319 }
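
/*
 * Example: a file with a 1M hole at offset 0 and 4k of not-yet-written
 * delalloc at 64k.  btrfs_get_extent() alone reports one big hole;
 * this wrapper clips the hole to [0, 64k), and a follow-up call at 64k
 * returns an EXTENT_MAP_DELALLOC mapping for the dirty range, so
 * fiemap can report the pending data.
 */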
6320 
6321 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
6322 						  u64 start, u64 len)
6323 {
6324 	struct btrfs_root *root = BTRFS_I(inode)->root;
6325 	struct extent_map *em;
6326 	struct btrfs_key ins;
6327 	u64 alloc_hint;
6328 	int ret;
6329 
6330 	alloc_hint = get_extent_allocation_hint(inode, start, len);
6331 	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
6332 				   alloc_hint, &ins, 1);
6333 	if (ret)
6334 		return ERR_PTR(ret);
6335 
6336 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
6337 			      ins.offset, ins.offset, ins.offset, 0);
6338 	if (IS_ERR(em)) {
6339 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6340 		return em;
6341 	}
6342 
6343 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
6344 					   ins.offset, ins.offset, 0);
6345 	if (ret) {
6346 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
6347 		free_extent_map(em);
6348 		return ERR_PTR(ret);
6349 	}
6350 
6351 	return em;
6352 }
6353 
6354 /*
6355  * returns 1 when nocow is safe, < 0 on error, 0 if the
6356  * block must be COWed
6357  */
6358 noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
6359 			      u64 *orig_start, u64 *orig_block_len,
6360 			      u64 *ram_bytes)
6361 {
6362 	struct btrfs_trans_handle *trans;
6363 	struct btrfs_path *path;
6364 	int ret;
6365 	struct extent_buffer *leaf;
6366 	struct btrfs_root *root = BTRFS_I(inode)->root;
6367 	struct btrfs_file_extent_item *fi;
6368 	struct btrfs_key key;
6369 	u64 disk_bytenr;
6370 	u64 backref_offset;
6371 	u64 extent_end;
6372 	u64 num_bytes;
6373 	int slot;
6374 	int found_type;
6375 	bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW);

6376 	path = btrfs_alloc_path();
6377 	if (!path)
6378 		return -ENOMEM;
6379 
6380 	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
6381 				       offset, 0);
6382 	if (ret < 0)
6383 		goto out;
6384 
6385 	slot = path->slots[0];
6386 	if (ret == 1) {
6387 		if (slot == 0) {
6388 			/* can't find the item, must cow */
6389 			ret = 0;
6390 			goto out;
6391 		}
6392 		slot--;
6393 	}
6394 	ret = 0;
6395 	leaf = path->nodes[0];
6396 	btrfs_item_key_to_cpu(leaf, &key, slot);
6397 	if (key.objectid != btrfs_ino(inode) ||
6398 	    key.type != BTRFS_EXTENT_DATA_KEY) {
6399 		/* not our file or wrong item type, must cow */
6400 		goto out;
6401 	}
6402 
6403 	if (key.offset > offset) {
6404 		/* Wrong offset, must cow */
6405 		goto out;
6406 	}
6407 
6408 	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
6409 	found_type = btrfs_file_extent_type(leaf, fi);
6410 	if (found_type != BTRFS_FILE_EXTENT_REG &&
6411 	    found_type != BTRFS_FILE_EXTENT_PREALLOC) {
6412 		/* not a regular extent, must cow */
6413 		goto out;
6414 	}
6415 
6416 	if (!nocow && found_type == BTRFS_FILE_EXTENT_REG)
6417 		goto out;
6418 
6419 	disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
6420 	if (disk_bytenr == 0)
6421 		goto out;
6422 
6423 	if (btrfs_file_extent_compression(leaf, fi) ||
6424 	    btrfs_file_extent_encryption(leaf, fi) ||
6425 	    btrfs_file_extent_other_encoding(leaf, fi))
6426 		goto out;
6427 
6428 	backref_offset = btrfs_file_extent_offset(leaf, fi);
6429 
6430 	if (orig_start) {
6431 		*orig_start = key.offset - backref_offset;
6432 		*orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi);
6433 		*ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6434 	}
6435 
6436 	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
6437 
6438 	if (btrfs_extent_readonly(root, disk_bytenr))
6439 		goto out;
6440 	btrfs_release_path(path);
6441 
6442 	/*
6443 	 * look for other files referencing this extent; if we
6444 	 * find any we must cow
6445 	 */
6446 	trans = btrfs_join_transaction(root);
6447 	if (IS_ERR(trans)) {
6448 		ret = 0;
6449 		goto out;
6450 	}
6451 
6452 	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
6453 				    key.offset - backref_offset, disk_bytenr);
6454 	btrfs_end_transaction(trans, root);
6455 	if (ret) {
6456 		ret = 0;
6457 		goto out;
6458 	}
6459 
6460 	/*
6461 	 * adjust disk_bytenr and num_bytes to cover just the bytes
6462 	 * in this extent we are about to write.  If there
6463 	 * are any csums in that range we have to cow in order
6464 	 * to keep the csums correct
6465 	 */
6466 	disk_bytenr += backref_offset;
6467 	disk_bytenr += offset - key.offset;
6468 	num_bytes = min(offset + *len, extent_end) - offset;
6469 	if (csum_exist_in_range(root, disk_bytenr, num_bytes))
6470 		goto out;
6471 	/*
6472 	 * all of the above have passed, it is safe to overwrite this extent
6473 	 * without cow
6474 	 */
6475 	*len = num_bytes;
6476 	ret = 1;
6477 out:
6478 	btrfs_free_path(path);
6479 	return ret;
6480 }
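
/*
 * Usage sketch for checking an overwrite before DIO (names here are
 * illustrative; 'len' is in/out and may be shrunk by the call):
 *
 *	u64 len = write_bytes, orig_start, orig_block_len, ram_bytes;
 *
 *	if (can_nocow_extent(inode, pos, &len, &orig_start,
 *			     &orig_block_len, &ram_bytes) == 1) {
 *		up to 'len' bytes at 'pos' may be written in place
 *	}
 */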
6481 
6482 static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
6483 			      struct extent_state **cached_state, int writing)
6484 {
6485 	struct btrfs_ordered_extent *ordered;
6486 	int ret = 0;
6487 
6488 	while (1) {
6489 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6490 				 0, cached_state);
6491 		/*
6492 		 * We're concerned with the entire range that we're going to be
6493 		 * doing DIO to, so we need to make sure there are no ordered
6494 		 * extents in this range.
6495 		 */
6496 		ordered = btrfs_lookup_ordered_range(inode, lockstart,
6497 						     lockend - lockstart + 1);
6498 
6499 		/*
6500 		 * We need to make sure there are no buffered pages in this
6501 		 * range either, we could have raced between the invalidate in
6502 		 * generic_file_direct_write and locking the extent.  The
6503 		 * invalidate needs to happen so that reads after a write do not
6504 		 * get stale data.
6505 		 */
6506 		if (!ordered && (!writing ||
6507 		    !test_range_bit(&BTRFS_I(inode)->io_tree,
6508 				    lockstart, lockend, EXTENT_UPTODATE, 0,
6509 				    *cached_state)))
6510 			break;
6511 
6512 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6513 				     cached_state, GFP_NOFS);
6514 
6515 		if (ordered) {
6516 			btrfs_start_ordered_extent(inode, ordered, 1);
6517 			btrfs_put_ordered_extent(ordered);
6518 		} else {
6519 			/* mmap may have raced in pagecache; flush it and retry */
6520 			ret = filemap_write_and_wait_range(inode->i_mapping,
6521 							   lockstart,
6522 							   lockend);
6523 			if (ret)
6524 				break;
6525 
6526 			/*
6527 			 * If we found a page that couldn't be invalidated just
6528 			 * fall back to buffered.
6529 			 */
6530 			ret = invalidate_inode_pages2_range(inode->i_mapping,
6531 					lockstart >> PAGE_CACHE_SHIFT,
6532 					lockend >> PAGE_CACHE_SHIFT);
6533 			if (ret)
6534 				break;
6535 		}
6536 
6537 		cond_resched();
6538 	}
6539 
6540 	return ret;
6541 }
6542 
6543 static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
6544 					   u64 len, u64 orig_start,
6545 					   u64 block_start, u64 block_len,
6546 					   u64 orig_block_len, u64 ram_bytes,
6547 					   int type)
6548 {
6549 	struct extent_map_tree *em_tree;
6550 	struct extent_map *em;
6551 	struct btrfs_root *root = BTRFS_I(inode)->root;
6552 	int ret;
6553 
6554 	em_tree = &BTRFS_I(inode)->extent_tree;
6555 	em = alloc_extent_map();
6556 	if (!em)
6557 		return ERR_PTR(-ENOMEM);
6558 
6559 	em->start = start;
6560 	em->orig_start = orig_start;
6561 	em->mod_start = start;
6562 	em->mod_len = len;
6563 	em->len = len;
6564 	em->block_len = block_len;
6565 	em->block_start = block_start;
6566 	em->bdev = root->fs_info->fs_devices->latest_bdev;
6567 	em->orig_block_len = orig_block_len;
6568 	em->ram_bytes = ram_bytes;
6569 	em->generation = -1;
6570 	set_bit(EXTENT_FLAG_PINNED, &em->flags);
6571 	if (type == BTRFS_ORDERED_PREALLOC)
6572 		set_bit(EXTENT_FLAG_FILLING, &em->flags);
6573 
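	/*
	 * A racing lookup can re-insert an overlapping cached mapping
	 * between our drop and our add, so retry until the add sticks.
	 */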
6574 	do {
6575 		btrfs_drop_extent_cache(inode, em->start,
6576 				em->start + em->len - 1, 0);
6577 		write_lock(&em_tree->lock);
6578 		ret = add_extent_mapping(em_tree, em, 1);
6579 		write_unlock(&em_tree->lock);
6580 	} while (ret == -EEXIST);
6581 
6582 	if (ret) {
6583 		free_extent_map(em);
6584 		return ERR_PTR(ret);
6585 	}
6586 
6587 	return em;
6588 }
6589 
6590 
6591 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
6592 				   struct buffer_head *bh_result, int create)
6593 {
6594 	struct extent_map *em;
6595 	struct btrfs_root *root = BTRFS_I(inode)->root;
6596 	struct extent_state *cached_state = NULL;
6597 	u64 start = iblock << inode->i_blkbits;
6598 	u64 lockstart, lockend;
6599 	u64 len = bh_result->b_size;
6600 	int unlock_bits = EXTENT_LOCKED;
6601 	int ret = 0;
6602 
6603 	if (create)
6604 		unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY;
6605 	else
6606 		len = min_t(u64, len, root->sectorsize);
6607 
6608 	lockstart = start;
6609 	lockend = start + len - 1;
6610 
6611 	/*
6612 	 * If this errors out it's because we couldn't invalidate pagecache for
6613 	 * this range and we need to fall back to buffered.
6614 	 */
6615 	if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create))
6616 		return -ENOTBLK;
6617 
6618 	em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
6619 	if (IS_ERR(em)) {
6620 		ret = PTR_ERR(em);
6621 		goto unlock_err;
6622 	}
6623 
6624 	/*
6625 	 * Ok, for INLINE and COMPRESSED extents we need to fall back on
6626 	 * buffered io.  INLINE is special, and we could probably kludge it in
6627 	 * here, but it's still buffered so for safety let's just fall back to
6628 	 * the generic buffered path.
6629 	 *
6630 	 * For COMPRESSED we _have_ to read the entire extent in so we can
6631 	 * decompress it, so there will be buffering required no matter what we
6632 	 * do, so go ahead and fallback to buffered.
6633 	 *
6634 	 * We return -ENOTBLK because that's what makes DIO go ahead and go back
6635 	 * to buffered IO.  Don't blame me, this is the price we pay for using
6636 	 * the generic code.
6637 	 */
6638 	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
6639 	    em->block_start == EXTENT_MAP_INLINE) {
6640 		free_extent_map(em);
6641 		ret = -ENOTBLK;
6642 		goto unlock_err;
6643 	}
6644 
6645 	/* Just a good old-fashioned hole, return */
6646 	if (!create && (em->block_start == EXTENT_MAP_HOLE ||
6647 			test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6648 		free_extent_map(em);
6649 		goto unlock_err;
6650 	}
6651 
6652 	/*
6653 	 * We don't allocate a new extent in the following cases:
6654 	 *
6655 	 * 1) The inode is marked as NODATACOW.  In this case we'll just use
6656 	 *    the existing extent.
6657 	 * 2) The extent is marked as PREALLOC.  We're good to go here and
6658 	 *    can just use the extent.
6659 	 *
6660 	 */
6661 	if (!create) {
6662 		len = min(len, em->len - (start - em->start));
6663 		lockstart = start + len;
6664 		goto unlock;
6665 	}
6666 
6667 	if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
6668 	    ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
6669 	     em->block_start != EXTENT_MAP_HOLE)) {
6670 		int type;
6671 		int ret;
6672 		u64 block_start, orig_start, orig_block_len, ram_bytes;
6673 
6674 		if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6675 			type = BTRFS_ORDERED_PREALLOC;
6676 		else
6677 			type = BTRFS_ORDERED_NOCOW;
6678 		len = min(len, em->len - (start - em->start));
6679 		block_start = em->block_start + (start - em->start);
6680 
6681 		if (can_nocow_extent(inode, start, &len, &orig_start,
6682 				     &orig_block_len, &ram_bytes) == 1) {
6683 			if (type == BTRFS_ORDERED_PREALLOC) {
6684 				free_extent_map(em);
6685 				em = create_pinned_em(inode, start, len,
6686 						       orig_start,
6687 						       block_start, len,
6688 						       orig_block_len,
6689 						       ram_bytes, type);
6690 				if (IS_ERR(em))
6691 					goto unlock_err;
6692 			}
6693 
6694 			ret = btrfs_add_ordered_extent_dio(inode, start,
6695 					   block_start, len, len, type);
6696 			if (ret) {
6697 				free_extent_map(em);
6698 				goto unlock_err;
6699 			}
6700 			goto unlock;
6701 		}
6702 	}
6703 
6704 	/*
6705 	 * this will cow the extent; reset the len in case we changed
6706 	 * it above
6707 	 */
6708 	len = bh_result->b_size;
6709 	free_extent_map(em);
6710 	em = btrfs_new_extent_direct(inode, start, len);
6711 	if (IS_ERR(em)) {
6712 		ret = PTR_ERR(em);
6713 		goto unlock_err;
6714 	}
6715 	len = min(len, em->len - (start - em->start));
6716 unlock:
6717 	bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
6718 		inode->i_blkbits;
6719 	bh_result->b_size = len;
6720 	bh_result->b_bdev = em->bdev;
6721 	set_buffer_mapped(bh_result);
6722 	if (create) {
6723 		if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
6724 			set_buffer_new(bh_result);
6725 
6726 		/*
6727 		 * Need to update the i_size under the extent lock so buffered
6728 		 * readers will get the updated i_size when we unlock.
6729 		 */
6730 		if (start + len > i_size_read(inode))
6731 			i_size_write(inode, start + len);
6732 
6733 		spin_lock(&BTRFS_I(inode)->lock);
6734 		BTRFS_I(inode)->outstanding_extents++;
6735 		spin_unlock(&BTRFS_I(inode)->lock);
6736 
6737 		ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6738 				     lockstart + len - 1, EXTENT_DELALLOC, NULL,
6739 				     &cached_state, GFP_NOFS);
6740 		BUG_ON(ret);
6741 	}
6742 
6743 	/*
6744 	 * In the case of write we need to clear and unlock the entire range,
6745 	 * in the case of read we need to unlock only the end area that we
6746 	 * aren't using if there is any left over space.
6747 	 */
6748 	if (lockstart < lockend) {
6749 		clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
6750 				 lockend, unlock_bits, 1, 0,
6751 				 &cached_state, GFP_NOFS);
6752 	} else {
6753 		free_extent_state(cached_state);
6754 	}
6755 
6756 	free_extent_map(em);
6757 
6758 	return 0;
6759 
6760 unlock_err:
6761 	clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
6762 			 unlock_bits, 1, 0, &cached_state, GFP_NOFS);
6763 	return ret;
6764 }
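
/*
 * On success, bh_result carries the mapping: b_blocknr is the disk
 * block ((em->block_start + (start - em->start)) >> i_blkbits) and
 * b_size is clamped to however much of the request this extent can
 * satisfy; __blockdev_direct_IO calls back in for the remainder.
 */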
6765 
6766 static void btrfs_endio_direct_read(struct bio *bio, int err)
6767 {
6768 	struct btrfs_dio_private *dip = bio->bi_private;
6769 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
6770 	struct bio_vec *bvec = bio->bi_io_vec;
6771 	struct inode *inode = dip->inode;
6772 	struct btrfs_root *root = BTRFS_I(inode)->root;
6773 	struct bio *dio_bio;
6774 	u32 *csums = (u32 *)dip->csum;
6775 	int index = 0;
6776 	u64 start;
6777 
6778 	start = dip->logical_offset;
6779 	do {
6780 		if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
6781 			struct page *page = bvec->bv_page;
6782 			char *kaddr;
6783 			u32 csum = ~(u32)0;
6784 			unsigned long flags;
6785 
6786 			local_irq_save(flags);
6787 			kaddr = kmap_atomic(page);
6788 			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
6789 					       csum, bvec->bv_len);
6790 			btrfs_csum_final(csum, (char *)&csum);
6791 			kunmap_atomic(kaddr);
6792 			local_irq_restore(flags);
6793 
6794 			flush_dcache_page(bvec->bv_page);
6795 			if (csum != csums[index]) {
6796 				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
6797 					  btrfs_ino(inode), start, csum,
6798 					  csums[index]);
6799 				err = -EIO;
6800 			}
6801 		}
6802 
6803 		start += bvec->bv_len;
6804 		bvec++;
6805 		index++;
6806 	} while (bvec <= bvec_end);
6807 
6808 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
6809 		      dip->logical_offset + dip->bytes - 1);
6810 	dio_bio = dip->dio_bio;
6811 
6812 	kfree(dip);
6813 
6814 	/* If we had a csum failure make sure to clear the uptodate flag */
6815 	if (err)
6816 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6817 	dio_end_io(dio_bio, err);
6818 	bio_put(bio);
6819 }
6820 
6821 static void btrfs_endio_direct_write(struct bio *bio, int err)
6822 {
6823 	struct btrfs_dio_private *dip = bio->bi_private;
6824 	struct inode *inode = dip->inode;
6825 	struct btrfs_root *root = BTRFS_I(inode)->root;
6826 	struct btrfs_ordered_extent *ordered = NULL;
6827 	u64 ordered_offset = dip->logical_offset;
6828 	u64 ordered_bytes = dip->bytes;
6829 	struct bio *dio_bio;
6830 	int ret;
6831 
6832 	if (err)
6833 		goto out_done;
6834 again:
6835 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
6836 						   &ordered_offset,
6837 						   ordered_bytes, !err);
6838 	if (!ret)
6839 		goto out_test;
6840 
6841 	ordered->work.func = finish_ordered_fn;
6842 	ordered->work.flags = 0;
6843 	btrfs_queue_worker(&root->fs_info->endio_write_workers,
6844 			   &ordered->work);
6845 out_test:
6846 	/*
6847 	 * our bio might span multiple ordered extents.  If we haven't
6848 	 * completed the accounting for the whole dio, go back and try again
6849 	 */
6850 	if (ordered_offset < dip->logical_offset + dip->bytes) {
6851 		ordered_bytes = dip->logical_offset + dip->bytes -
6852 			ordered_offset;
6853 		ordered = NULL;
6854 		goto again;
6855 	}
6856 out_done:
6857 	dio_bio = dip->dio_bio;
6858 
6859 	kfree(dip);
6860 
6861 	/* If we had an error make sure to clear the uptodate flag */
6862 	if (err)
6863 		clear_bit(BIO_UPTODATE, &dio_bio->bi_flags);
6864 	dio_end_io(dio_bio, err);
6865 	bio_put(bio);
6866 }
6867 
6868 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
6869 				    struct bio *bio, int mirror_num,
6870 				    unsigned long bio_flags, u64 offset)
6871 {
6872 	int ret;
6873 	struct btrfs_root *root = BTRFS_I(inode)->root;
6874 	ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
6875 	BUG_ON(ret); /* -ENOMEM */
6876 	return 0;
6877 }
6878 
6879 static void btrfs_end_dio_bio(struct bio *bio, int err)
6880 {
6881 	struct btrfs_dio_private *dip = bio->bi_private;
6882 
6883 	if (err) {
6884 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
6885 		      "sector %#Lx len %u errno %d\n",
6886 		      btrfs_ino(dip->inode), bio->bi_rw,
6887 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
6888 		dip->errors = 1;
6889 
6890 		/*
6891 		 * before the atomic variable goes to zero, we must make sure
6892 		 * dip->errors is perceived to be set.
6893 		 */
6894 		smp_mb__before_atomic_dec();
6895 	}
6896 
6897 	/* if there are more bios still pending for this dio, just exit */
6898 	if (!atomic_dec_and_test(&dip->pending_bios))
6899 		goto out;
6900 
6901 	if (dip->errors) {
6902 		bio_io_error(dip->orig_bio);
6903 	} else {
6904 		set_bit(BIO_UPTODATE, &dip->dio_bio->bi_flags);
6905 		bio_endio(dip->orig_bio, 0);
6906 	}
6907 out:
6908 	bio_put(bio);
6909 }
6910 
6911 static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
6912 				       u64 first_sector, gfp_t gfp_flags)
6913 {
6914 	int nr_vecs = bio_get_nr_vecs(bdev);
6915 	return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
6916 }
6917 
6918 static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
6919 					 int rw, u64 file_offset, int skip_sum,
6920 					 int async_submit)
6921 {
6922 	struct btrfs_dio_private *dip = bio->bi_private;
6923 	int write = rw & REQ_WRITE;
6924 	struct btrfs_root *root = BTRFS_I(inode)->root;
6925 	int ret;
6926 
6927 	if (async_submit)
6928 		async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
6929 
6930 	bio_get(bio);
6931 
6932 	if (!write) {
6933 		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
6934 		if (ret)
6935 			goto err;
6936 	}
6937 
6938 	if (skip_sum)
6939 		goto map;
6940 
6941 	if (write && async_submit) {
6942 		ret = btrfs_wq_submit_bio(root->fs_info,
6943 				   inode, rw, bio, 0, 0,
6944 				   file_offset,
6945 				   __btrfs_submit_bio_start_direct_io,
6946 				   __btrfs_submit_bio_done);
6947 		goto err;
6948 	} else if (write) {
6949 		/*
6950 		 * If we aren't doing async submit, calculate the csum of the
6951 		 * bio now.
6952 		 */
6953 		ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
6954 		if (ret)
6955 			goto err;
6956 	} else if (!skip_sum) {
6957 		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
6958 						file_offset);
6959 		if (ret)
6960 			goto err;
6961 	}
6962 
6963 map:
6964 	ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
6965 err:
6966 	bio_put(bio);
6967 	return ret;
6968 }
6969 
6970 static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
6971 				    int skip_sum)
6972 {
6973 	struct inode *inode = dip->inode;
6974 	struct btrfs_root *root = BTRFS_I(inode)->root;
6975 	struct bio *bio;
6976 	struct bio *orig_bio = dip->orig_bio;
6977 	struct bio_vec *bvec = orig_bio->bi_io_vec;
6978 	u64 start_sector = orig_bio->bi_sector;
6979 	u64 file_offset = dip->logical_offset;
6980 	u64 submit_len = 0;
6981 	u64 map_length;
6982 	int nr_pages = 0;
6983 	int ret = 0;
6984 	int async_submit = 0;
6985 
6986 	map_length = orig_bio->bi_size;
6987 	ret = btrfs_map_block(root->fs_info, rw, start_sector << 9,
6988 			      &map_length, NULL, 0);
6989 	if (ret) {
6990 		bio_put(orig_bio);
6991 		return -EIO;
6992 	}
6993 
6994 	if (map_length >= orig_bio->bi_size) {
6995 		bio = orig_bio;
6996 		goto submit;
6997 	}
6998 
6999 	/* async crcs make it difficult to collect full stripe writes. */
7000 	if (btrfs_get_alloc_profile(root, 1) &
7001 	    (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7002 		async_submit = 0;
7003 	else
7004 		async_submit = 1;
7005 
7006 	bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
7007 	if (!bio)
7008 		return -ENOMEM;
7009 	bio->bi_private = dip;
7010 	bio->bi_end_io = btrfs_end_dio_bio;
7011 	atomic_inc(&dip->pending_bios);
7012 
7013 	while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
7014 		if (unlikely(map_length < submit_len + bvec->bv_len ||
7015 		    bio_add_page(bio, bvec->bv_page, bvec->bv_len,
7016 				 bvec->bv_offset) < bvec->bv_len)) {
7017 			/*
7018 			 * inc the count before we submit the bio so the
7019 			 * end IO handler can't run and see a zero count
7020 			 * while we're still setting things up.  Otherwise
7021 			 * the dip might get freed before we're done with it
7022 			 */
7023 			atomic_inc(&dip->pending_bios);
7024 			ret = __btrfs_submit_dio_bio(bio, inode, rw,
7025 						     file_offset, skip_sum,
7026 						     async_submit);
7027 			if (ret) {
7028 				bio_put(bio);
7029 				atomic_dec(&dip->pending_bios);
7030 				goto out_err;
7031 			}
7032 
7033 			start_sector += submit_len >> 9;
7034 			file_offset += submit_len;
7035 
7036 			submit_len = 0;
7037 			nr_pages = 0;
7038 
7039 			bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
7040 						  start_sector, GFP_NOFS);
7041 			if (!bio)
7042 				goto out_err;
7043 			bio->bi_private = dip;
7044 			bio->bi_end_io = btrfs_end_dio_bio;
7045 
7046 			map_length = orig_bio->bi_size;
7047 			ret = btrfs_map_block(root->fs_info, rw,
7048 					      start_sector << 9,
7049 					      &map_length, NULL, 0);
7050 			if (ret) {
7051 				bio_put(bio);
7052 				goto out_err;
7053 			}
7054 		} else {
7055 			submit_len += bvec->bv_len;
7056 			nr_pages++;
7057 			bvec++;
7058 		}
7059 	}
7060 
7061 submit:
7062 	ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
7063 				     async_submit);
7064 	if (!ret)
7065 		return 0;
7066 
7067 	bio_put(bio);
7068 out_err:
7069 	dip->errors = 1;
7070 	/*
7071 	 * before the atomic variable goes to zero, we must
7072 	 * make sure dip->errors is perceived to be set.
7073 	 */
7074 	smp_mb__before_atomic_dec();
7075 	if (atomic_dec_and_test(&dip->pending_bios))
7076 		bio_io_error(dip->orig_bio);
7077 
7078 	/* bio_end_io() will handle the error, so we needn't return it */
7079 	return 0;
7080 }
7081 
7082 static void btrfs_submit_direct(int rw, struct bio *dio_bio,
7083 				struct inode *inode, loff_t file_offset)
7084 {
7085 	struct btrfs_root *root = BTRFS_I(inode)->root;
7086 	struct btrfs_dio_private *dip;
7087 	struct bio *io_bio;
7088 	int skip_sum;
7089 	int sum_len;
7090 	int write = rw & REQ_WRITE;
7091 	int ret = 0;
7092 	u16 csum_size;
7093 
7094 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
7095 
7096 	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
7097 	if (!io_bio) {
7098 		ret = -ENOMEM;
7099 		goto free_ordered;
7100 	}
7101 
7102 	if (!skip_sum && !write) {
7103 		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
7104 		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
7105 		sum_len *= csum_size;
7106 	} else {
7107 		sum_len = 0;
7108 	}
7109 
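	/*
	 * e.g. a 64k read on a 4k-block fs with crc32c checksums needs
	 * (64k >> 12) * 4 = 64 bytes of csum space after the dip.
	 */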
7110 	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
7111 	if (!dip) {
7112 		ret = -ENOMEM;
7113 		goto free_io_bio;
7114 	}
7115 
7116 	dip->private = dio_bio->bi_private;
7117 	dip->inode = inode;
7118 	dip->logical_offset = file_offset;
7119 	dip->bytes = dio_bio->bi_size;
7120 	dip->disk_bytenr = (u64)dio_bio->bi_sector << 9;
7121 	io_bio->bi_private = dip;
7122 	dip->errors = 0;
7123 	dip->orig_bio = io_bio;
7124 	dip->dio_bio = dio_bio;
7125 	atomic_set(&dip->pending_bios, 0);
7126 
7127 	if (write)
7128 		io_bio->bi_end_io = btrfs_endio_direct_write;
7129 	else
7130 		io_bio->bi_end_io = btrfs_endio_direct_read;
7131 
7132 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
7133 	if (!ret)
7134 		return;
7135 
7136 free_io_bio:
7137 	bio_put(io_bio);
7138 
7139 free_ordered:
7140 	/*
7141 	 * If this is a write, we need to clean up the reserved space and kill
7142 	 * the ordered extent.
7143 	 */
7144 	if (write) {
7145 		struct btrfs_ordered_extent *ordered;
7146 		ordered = btrfs_lookup_ordered_extent(inode, file_offset);
7147 		if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
7148 		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
7149 			btrfs_free_reserved_extent(root, ordered->start,
7150 						   ordered->disk_len);
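		/*
		 * drop both the lookup reference and the ordered extent's
		 * own reference, since this extent will never complete
		 */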
7151 		btrfs_put_ordered_extent(ordered);
7152 		btrfs_put_ordered_extent(ordered);
7153 	}
7154 	bio_endio(dio_bio, ret);
7155 }
7156 
7157 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
7158 			const struct iovec *iov, loff_t offset,
7159 			unsigned long nr_segs)
7160 {
7161 	int seg;
7162 	int i;
7163 	size_t size;
7164 	unsigned long addr;
7165 	unsigned blocksize_mask = root->sectorsize - 1;
7166 	ssize_t retval = -EINVAL;
7167 	loff_t end = offset;
7168 
7169 	if (offset & blocksize_mask)
7170 		goto out;
7171 
7172 	/* Check the memory alignment.  Blocks cannot straddle pages */
7173 	for (seg = 0; seg < nr_segs; seg++) {
7174 		addr = (unsigned long)iov[seg].iov_base;
7175 		size = iov[seg].iov_len;
7176 		end += size;
7177 		if ((addr & blocksize_mask) || (size & blocksize_mask))
7178 			goto out;
7179 
7180 		/* If this is a write we don't need to check anymore */
7181 		if (rw & WRITE)
7182 			continue;
7183 
7184 		/*
7185 		 * Check to make sure we don't have duplicate iov_base's in this
7186 		 * iovec; if we do, return -EINVAL, otherwise we'll get csum errors
7187 		 * when reading back.
7188 		 */
7189 		for (i = seg + 1; i < nr_segs; i++) {
7190 			if (iov[seg].iov_base == iov[i].iov_base)
7191 				goto out;
7192 		}
7193 	}
7194 	retval = 0;
7195 out:
7196 	return retval;
7197 }
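
/*
 * Example: with a 4k sectorsize, a DIO read at file offset 4096 into a
 * buffer that is 512-byte but not 4k aligned fails the
 * (addr & blocksize_mask) test; btrfs_direct_IO() then returns 0 and
 * the VFS falls back to buffered IO.
 */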
7198 
7199 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
7200 			const struct iovec *iov, loff_t offset,
7201 			unsigned long nr_segs)
7202 {
7203 	struct file *file = iocb->ki_filp;
7204 	struct inode *inode = file->f_mapping->host;
7205 	size_t count = 0;
7206 	int flags = 0;
7207 	bool wakeup = true;
7208 	bool relock = false;
7209 	ssize_t ret;
7210 
7211 	if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
7212 			    offset, nr_segs))
7213 		return 0;
7214 
7215 	atomic_inc(&inode->i_dio_count);
7216 	smp_mb__after_atomic_inc();
7217 
7218 	/*
7219 	 * The generic stuff only does filemap_write_and_wait_range, which isn't
7220 	 * enough if we've written compressed pages to this area, so we need to
7221 	 * call btrfs_wait_ordered_range to make absolutely sure that any
7222 	 * outstanding dirty pages are on disk.
7223 	 */
7224 	count = iov_length(iov, nr_segs);
7225 	btrfs_wait_ordered_range(inode, offset, count);
7226 
7227 	if (rw & WRITE) {
7228 		/*
7229 		 * If the write DIO is beyond the EOF, we need to update
7230 		 * the i_size, but it is protected by i_mutex, so we
7231 		 * cannot unlock the i_mutex in that case.
7232 		 */
7233 		if (offset + count <= inode->i_size) {
7234 			mutex_unlock(&inode->i_mutex);
7235 			relock = true;
7236 		}
7237 		ret = btrfs_delalloc_reserve_space(inode, count);
7238 		if (ret)
7239 			goto out;
7240 	} else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
7241 				     &BTRFS_I(inode)->runtime_flags))) {
7242 		inode_dio_done(inode);
7243 		flags = DIO_LOCKING | DIO_SKIP_HOLES;
7244 		wakeup = false;
7245 	}
7246 
7247 	ret = __blockdev_direct_IO(rw, iocb, inode,
7248 			BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
7249 			iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
7250 			btrfs_submit_direct, flags);
7251 	if (rw & WRITE) {
7252 		if (ret < 0 && ret != -EIOCBQUEUED)
7253 			btrfs_delalloc_release_space(inode, count);
7254 		else if (ret >= 0 && (size_t)ret < count)
7255 			btrfs_delalloc_release_space(inode,
7256 						     count - (size_t)ret);
7257 		else
7258 			btrfs_delalloc_release_metadata(inode, 0);
7259 	}
7260 out:
7261 	if (wakeup)
7262 		inode_dio_done(inode);
7263 	if (relock)
7264 		mutex_lock(&inode->i_mutex);
7265 
7266 	return ret;
7267 }
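
/*
 * Worked example of the reservation arithmetic above (a sketch with
 * made-up numbers): for a 1MiB direct write we reserve 1MiB of
 * delalloc space up front.  If __blockdev_direct_IO() only manages to
 * write 512KiB (ret = 524288), the unused 1MiB - 512KiB = 512KiB is
 * released; if everything was written (ret == count) or the IO was
 * queued asynchronously (-EIOCBQUEUED), only the over-reserved
 * metadata is released and the data reservation stands.
 */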
7268 
7269 #define BTRFS_FIEMAP_FLAGS	(FIEMAP_FLAG_SYNC)
7270 
7271 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
7272 		__u64 start, __u64 len)
7273 {
7274 	int	ret;
7275 
7276 	ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS);
7277 	if (ret)
7278 		return ret;
7279 
7280 	return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
7281 }
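
/*
 * Userspace view of the fiemap hook above (an illustrative sketch,
 * not part of this file).  FIEMAP_FLAG_SYNC is the only flag btrfs
 * accepts here; anything else makes fiemap_check_flags() fail with
 * -EBADR:
 *
 *	struct fiemap *fm = calloc(1, sizeof(*fm) +
 *				   32 * sizeof(struct fiemap_extent));
 *
 *	fm->fm_start = 0;
 *	fm->fm_length = FIEMAP_MAX_OFFSET;
 *	fm->fm_flags = FIEMAP_FLAG_SYNC;
 *	fm->fm_extent_count = 32;
 *	ioctl(fd, FS_IOC_FIEMAP, fm);
 */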
7282 
7283 int btrfs_readpage(struct file *file, struct page *page)
7284 {
7285 	struct extent_io_tree *tree;
7286 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7287 	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
7288 }
7289 
7290 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
7291 {
7292 	struct extent_io_tree *tree;
7293 
7294 
7295 	if (current->flags & PF_MEMALLOC) {
7296 		redirty_page_for_writepage(wbc, page);
7297 		unlock_page(page);
7298 		return 0;
7299 	}
7300 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7301 	return extent_write_full_page(tree, page, btrfs_get_extent, wbc);
7302 }
7303 
7304 static int btrfs_writepages(struct address_space *mapping,
7305 			    struct writeback_control *wbc)
7306 {
7307 	struct extent_io_tree *tree;
7308 
7309 	tree = &BTRFS_I(mapping->host)->io_tree;
7310 	return extent_writepages(tree, mapping, btrfs_get_extent, wbc);
7311 }
7312 
7313 static int
7314 btrfs_readpages(struct file *file, struct address_space *mapping,
7315 		struct list_head *pages, unsigned nr_pages)
7316 {
7317 	struct extent_io_tree *tree;
7318 	tree = &BTRFS_I(mapping->host)->io_tree;
7319 	return extent_readpages(tree, mapping, pages, nr_pages,
7320 				btrfs_get_extent);
7321 }
7322 static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7323 {
7324 	struct extent_io_tree *tree;
7325 	struct extent_map_tree *map;
7326 	int ret;
7327 
7328 	tree = &BTRFS_I(page->mapping->host)->io_tree;
7329 	map = &BTRFS_I(page->mapping->host)->extent_tree;
7330 	ret = try_release_extent_mapping(map, tree, page, gfp_flags);
7331 	if (ret == 1) {
7332 		ClearPagePrivate(page);
7333 		set_page_private(page, 0);
7334 		page_cache_release(page);
7335 	}
7336 	return ret;
7337 }
7338 
7339 static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
7340 {
7341 	if (PageWriteback(page) || PageDirty(page))
7342 		return 0;
7343 	return __btrfs_releasepage(page, gfp_flags & GFP_NOFS);
7344 }
7345 
7346 static void btrfs_invalidatepage(struct page *page, unsigned int offset,
7347 				 unsigned int length)
7348 {
7349 	struct inode *inode = page->mapping->host;
7350 	struct extent_io_tree *tree;
7351 	struct btrfs_ordered_extent *ordered;
7352 	struct extent_state *cached_state = NULL;
7353 	u64 page_start = page_offset(page);
7354 	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
7355 
7356 	/*
7357 	 * we have the page locked, so new writeback can't start,
7358 	 * and the dirty bit won't be cleared while we are here.
7359 	 *
7360 	 * Wait for IO on this page so that we can safely clear
7361 	 * the PagePrivate2 bit and do ordered accounting
7362 	 */
7363 	wait_on_page_writeback(page);
7364 
7365 	tree = &BTRFS_I(inode)->io_tree;
7366 	if (offset) {
7367 		btrfs_releasepage(page, GFP_NOFS);
7368 		return;
7369 	}
7370 	lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7371 	ordered = btrfs_lookup_ordered_extent(inode, page_offset(page));
7372 	if (ordered) {
7373 		/*
7374 		 * IO on this page will never be started, so we need
7375 		 * to account for any ordered extents now
7376 		 */
7377 		clear_extent_bit(tree, page_start, page_end,
7378 				 EXTENT_DIRTY | EXTENT_DELALLOC |
7379 				 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7380 				 EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS);
7381 		/*
7382 		 * whoever cleared the private bit is responsible
7383 		 * for the finish_ordered_io
7384 		 */
7385 		if (TestClearPagePrivate2(page)) {
7386 			struct btrfs_ordered_inode_tree *tree;
7387 			u64 new_len;
7388 
7389 			tree = &BTRFS_I(inode)->ordered_tree;
7390 
7391 			spin_lock_irq(&tree->lock);
7392 			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7393 			new_len = page_start - ordered->file_offset;
7394 			if (new_len < ordered->truncated_len)
7395 				ordered->truncated_len = new_len;
7396 			spin_unlock_irq(&tree->lock);
7397 
7398 			if (btrfs_dec_test_ordered_pending(inode, &ordered,
7399 							   page_start,
7400 							   PAGE_CACHE_SIZE, 1))
7401 				btrfs_finish_ordered_io(ordered);
7402 		}
7403 		btrfs_put_ordered_extent(ordered);
7404 		cached_state = NULL;
7405 		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
7406 	}
7407 	clear_extent_bit(tree, page_start, page_end,
7408 		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
7409 		 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1,
7410 		 &cached_state, GFP_NOFS);
7411 	__btrfs_releasepage(page, GFP_NOFS);
7412 
7413 	ClearPageChecked(page);
7414 	if (PagePrivate(page)) {
7415 		ClearPagePrivate(page);
7416 		set_page_private(page, 0);
7417 		page_cache_release(page);
7418 	}
7419 }
7420 
7421 /*
7422  * btrfs_page_mkwrite() is not allowed to change the file size as it gets
7423  * called from a page fault handler when a page is first dirtied. Hence we must
7424  * be careful to check for EOF conditions here. We set the page up correctly
7425  * for a written page which means we get ENOSPC checking when writing into
7426  * holes and correct delalloc and unwritten extent mapping on filesystems that
7427  * support these features.
7428  *
7429  * We are not allowed to take the i_mutex here so we have to play games to
7430  * protect against truncate races as the page could now be beyond EOF.  Because
7431  * vmtruncate() writes the inode size before removing pages, once we have the
7432  * page lock we can determine safely if the page is beyond EOF. If it is not
7433  * beyond EOF, then the page is guaranteed safe against truncation until we
7434  * unlock the page.
7435  */
7436 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
7437 {
7438 	struct page *page = vmf->page;
7439 	struct inode *inode = file_inode(vma->vm_file);
7440 	struct btrfs_root *root = BTRFS_I(inode)->root;
7441 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
7442 	struct btrfs_ordered_extent *ordered;
7443 	struct extent_state *cached_state = NULL;
7444 	char *kaddr;
7445 	unsigned long zero_start;
7446 	loff_t size;
7447 	int ret;
7448 	int reserved = 0;
7449 	u64 page_start;
7450 	u64 page_end;
7451 
7452 	sb_start_pagefault(inode->i_sb);
7453 	ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
7454 	if (!ret) {
7455 		ret = file_update_time(vma->vm_file);
7456 		reserved = 1;
7457 	}
7458 	if (ret) {
7459 		if (ret == -ENOMEM)
7460 			ret = VM_FAULT_OOM;
7461 		else /* -ENOSPC, -EIO, etc */
7462 			ret = VM_FAULT_SIGBUS;
7463 		if (reserved)
7464 			goto out;
7465 		goto out_noreserve;
7466 	}
7467 
7468 	ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
7469 again:
7470 	lock_page(page);
7471 	size = i_size_read(inode);
7472 	page_start = page_offset(page);
7473 	page_end = page_start + PAGE_CACHE_SIZE - 1;
7474 
7475 	if ((page->mapping != inode->i_mapping) ||
7476 	    (page_start >= size)) {
7477 		/* page got truncated out from underneath us */
7478 		goto out_unlock;
7479 	}
7480 	wait_on_page_writeback(page);
7481 
7482 	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
7483 	set_page_extent_mapped(page);
7484 
7485 	/*
7486 	 * we can't set the delalloc bits if there are pending ordered
7487 	 * extents.  Drop our locks and wait for them to finish
7488 	 */
7489 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
7490 	if (ordered) {
7491 		unlock_extent_cached(io_tree, page_start, page_end,
7492 				     &cached_state, GFP_NOFS);
7493 		unlock_page(page);
7494 		btrfs_start_ordered_extent(inode, ordered, 1);
7495 		btrfs_put_ordered_extent(ordered);
7496 		goto again;
7497 	}
7498 
7499 	/*
7500 	 * XXX - page_mkwrite gets called every time the page is dirtied, even
7501 	 * if it was already dirty, so for space accounting reasons we need to
7502 	 * clear any delalloc bits for the range we are fixing to save.  There
7503 	 * is probably a better way to do this, but for now keep consistent with
7504 	 * prepare_pages in the normal write path.
7505 	 */
7506 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
7507 			  EXTENT_DIRTY | EXTENT_DELALLOC |
7508 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
7509 			  0, 0, &cached_state, GFP_NOFS);
7510 
7511 	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
7512 					&cached_state);
7513 	if (ret) {
7514 		unlock_extent_cached(io_tree, page_start, page_end,
7515 				     &cached_state, GFP_NOFS);
7516 		ret = VM_FAULT_SIGBUS;
7517 		goto out_unlock;
7518 	}
7519 	ret = 0;
7520 
7521 	/* page is wholly or partially inside EOF */
7522 	if (page_start + PAGE_CACHE_SIZE > size)
7523 		zero_start = size & ~PAGE_CACHE_MASK;
7524 	else
7525 		zero_start = PAGE_CACHE_SIZE;
7526 
7527 	if (zero_start != PAGE_CACHE_SIZE) {
7528 		kaddr = kmap(page);
7529 		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
7530 		flush_dcache_page(page);
7531 		kunmap(page);
7532 	}
7533 	ClearPageChecked(page);
7534 	set_page_dirty(page);
7535 	SetPageUptodate(page);
7536 
7537 	BTRFS_I(inode)->last_trans = root->fs_info->generation;
7538 	BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
7539 	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
7540 
7541 	unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
7542 
7543 out_unlock:
7544 	if (!ret) {
7545 		sb_end_pagefault(inode->i_sb);
7546 		return VM_FAULT_LOCKED;
7547 	}
7548 	unlock_page(page);
7549 out:
7550 	btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
7551 out_noreserve:
7552 	sb_end_pagefault(inode->i_sb);
7553 	return ret;
7554 }
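
/*
 * A minimal userspace sequence that ends up in btrfs_page_mkwrite()
 * (illustrative only): the first store into a clean, shared, writable
 * mapping faults, and the fault handler calls back into the
 * filesystem so it can reserve space and set up delalloc:
 *
 *	int fd = open("file", O_RDWR);
 *	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
 *		       MAP_SHARED, fd, 0);
 *
 *	p[0] = 'x';	(faults into btrfs_page_mkwrite())
 */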
7555 
7556 static int btrfs_truncate(struct inode *inode)
7557 {
7558 	struct btrfs_root *root = BTRFS_I(inode)->root;
7559 	struct btrfs_block_rsv *rsv;
7560 	int ret = 0;
7561 	int err = 0;
7562 	struct btrfs_trans_handle *trans;
7563 	u64 mask = root->sectorsize - 1;
7564 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
7565 
7566 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
7567 
7568 	/*
7569 	 * Yes, ladies and gentlemen, this is indeed ugly.  The fact is we have
7570 	 * 3 things going on here
7571 	 *
7572 	 * 1) We need to reserve space for our orphan item and the space to
7573 	 * delete our orphan item.  Lord knows we don't want to have a dangling
7574 	 * orphan item because we didn't reserve space to remove it.
7575 	 *
7576 	 * 2) We need to reserve space to update our inode.
7577 	 *
7578 	 * 3) We need to have something to cache all the space that is going to
7579 	 * be free'd up by the truncate operation, but also have some slack
7580 	 * space reserved in case it uses space during the truncate (thank you
7581 	 * very much snapshotting).
7582 	 *
7583 	 * And we need these to all be separate.  The fact is we can use a lot of
7584 	 * space doing the truncate, and we have no earthly idea how much space
7585 	 * we will use, so we need the truncate reservation to be separate so it
7586 	 * doesn't end up using space reserved for updating the inode or
7587 	 * removing the orphan item.  We also need to be able to stop the
7588 	 * transaction and start a new one, which means we need to be able to
7589 	 * update the inode several times, and we have no way of knowing how
7590 	 * many times that will be, so we can't just reserve 1 item for the
7591 	 * entirety of the operation; that has to be done separately as well.
7592 	 * Then there is the orphan item, which does indeed need to be held on
7593 	 * to for the whole operation, and we need nobody to touch this reserved
7594 	 * space except the orphan code.
7595 	 *
7596 	 * So that leaves us with
7597 	 *
7598 	 * 1) root->orphan_block_rsv - for the orphan deletion.
7599 	 * 2) rsv - for the truncate reservation, which we will steal from the
7600 	 * transaction reservation.
7601 	 * 3) fs_info->trans_block_rsv - this will have 1 items worth left for
7602 	 * updating the inode.
7603 	 */
7604 	rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP);
7605 	if (!rsv)
7606 		return -ENOMEM;
7607 	rsv->size = min_size;
7608 	rsv->failfast = 1;
7609 
7610 	/*
7611 	 * 1 for the truncate slack space
7612 	 * 1 for updating the inode.
7613 	 */
7614 	trans = btrfs_start_transaction(root, 2);
7615 	if (IS_ERR(trans)) {
7616 		err = PTR_ERR(trans);
7617 		goto out;
7618 	}
7619 
7620 	/* Migrate the slack space for the truncate to our reserve */
7621 	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
7622 				      min_size);
7623 	BUG_ON(ret);
7624 
7625 	/*
7626 	 * setattr is responsible for setting the ordered_data_close flag,
7627 	 * but that is only tested during the last file release.  That
7628 	 * could happen well after the next commit, leaving a great big
7629 	 * window where new writes may get lost if someone chooses to write
7630 	 * to this file after truncating to zero
7631 	 *
7632 	 * The inode doesn't have any dirty data here, and so if we commit
7633 	 * this is a noop.  If someone immediately starts writing to the inode
7634 	 * it is very likely we'll catch some of their writes in this
7635 	 * transaction, and the commit will find this file on the ordered
7636 	 * data list with good things to send down.
7637 	 *
7638 	 * This is a best effort solution, there is still a window where
7639 	 * using truncate to replace the contents of the file will
7640 	 * end up with a zero length file after a crash.
7641 	 */
7642 	if (inode->i_size == 0 && test_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
7643 					   &BTRFS_I(inode)->runtime_flags))
7644 		btrfs_add_ordered_operation(trans, root, inode);
7645 
7646 	/*
7647 	 * So if we truncate and then write and fsync we normally would just
7648 	 * write the extents that changed, which is a problem if we need to
7649 	 * first truncate that entire inode.  So set this flag so we write out
7650 	 * all of the extents in the inode to the sync log so we're completely
7651 	 * safe.
7652 	 */
7653 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
7654 	trans->block_rsv = rsv;
7655 
7656 	while (1) {
7657 		ret = btrfs_truncate_inode_items(trans, root, inode,
7658 						 inode->i_size,
7659 						 BTRFS_EXTENT_DATA_KEY);
7660 		if (ret != -ENOSPC) {
7661 			err = ret;
7662 			break;
7663 		}
7664 
7665 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7666 		ret = btrfs_update_inode(trans, root, inode);
7667 		if (ret) {
7668 			err = ret;
7669 			break;
7670 		}
7671 
7672 		btrfs_end_transaction(trans, root);
7673 		btrfs_btree_balance_dirty(root);
7674 
7675 		trans = btrfs_start_transaction(root, 2);
7676 		if (IS_ERR(trans)) {
7677 			ret = err = PTR_ERR(trans);
7678 			trans = NULL;
7679 			break;
7680 		}
7681 
7682 		ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv,
7683 					      rsv, min_size);
7684 		BUG_ON(ret);	/* shouldn't happen */
7685 		trans->block_rsv = rsv;
7686 	}
7687 
7688 	if (ret == 0 && inode->i_nlink > 0) {
7689 		trans->block_rsv = root->orphan_block_rsv;
7690 		ret = btrfs_orphan_del(trans, inode);
7691 		if (ret)
7692 			err = ret;
7693 	}
7694 
7695 	if (trans) {
7696 		trans->block_rsv = &root->fs_info->trans_block_rsv;
7697 		ret = btrfs_update_inode(trans, root, inode);
7698 		if (ret && !err)
7699 			err = ret;
7700 
7701 		ret = btrfs_end_transaction(trans, root);
7702 		btrfs_btree_balance_dirty(root);
7703 	}
7704 
7705 out:
7706 	btrfs_free_block_rsv(root, rsv);
7707 
7708 	if (ret && !err)
7709 		err = ret;
7710 
7711 	return err;
7712 }
7713 
7714 /*
7715  * create a new subvolume directory/inode (helper for the ioctl).
7716  */
7717 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
7718 			     struct btrfs_root *new_root, u64 new_dirid)
7719 {
7720 	struct inode *inode;
7721 	int err;
7722 	u64 index = 0;
7723 
7724 	inode = btrfs_new_inode(trans, new_root, NULL, "..", 2,
7725 				new_dirid, new_dirid,
7726 				S_IFDIR | (~current_umask() & S_IRWXUGO),
7727 				&index);
7728 	if (IS_ERR(inode))
7729 		return PTR_ERR(inode);
7730 	inode->i_op = &btrfs_dir_inode_operations;
7731 	inode->i_fop = &btrfs_dir_file_operations;
7732 
7733 	set_nlink(inode, 1);
7734 	btrfs_i_size_write(inode, 0);
7735 
7736 	err = btrfs_update_inode(trans, new_root, inode);
7737 
7738 	iput(inode);
7739 	return err;
7740 }
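
/*
 * Worked example for the mode computation above: with the common
 * umask of 022, ~current_umask() & S_IRWXUGO == 0755, so the new
 * subvolume root directory comes out as drwxr-xr-x.
 */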
7741 
7742 struct inode *btrfs_alloc_inode(struct super_block *sb)
7743 {
7744 	struct btrfs_inode *ei;
7745 	struct inode *inode;
7746 
7747 	ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
7748 	if (!ei)
7749 		return NULL;
7750 
7751 	ei->root = NULL;
7752 	ei->generation = 0;
7753 	ei->last_trans = 0;
7754 	ei->last_sub_trans = 0;
7755 	ei->logged_trans = 0;
7756 	ei->delalloc_bytes = 0;
7757 	ei->disk_i_size = 0;
7758 	ei->flags = 0;
7759 	ei->csum_bytes = 0;
7760 	ei->index_cnt = (u64)-1;
7761 	ei->last_unlink_trans = 0;
7762 	ei->last_log_commit = 0;
7763 
7764 	spin_lock_init(&ei->lock);
7765 	ei->outstanding_extents = 0;
7766 	ei->reserved_extents = 0;
7767 
7768 	ei->runtime_flags = 0;
7769 	ei->force_compress = BTRFS_COMPRESS_NONE;
7770 
7771 	ei->delayed_node = NULL;
7772 
7773 	inode = &ei->vfs_inode;
7774 	extent_map_tree_init(&ei->extent_tree);
7775 	extent_io_tree_init(&ei->io_tree, &inode->i_data);
7776 	extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
7777 	ei->io_tree.track_uptodate = 1;
7778 	ei->io_failure_tree.track_uptodate = 1;
7779 	atomic_set(&ei->sync_writers, 0);
7780 	mutex_init(&ei->log_mutex);
7781 	mutex_init(&ei->delalloc_mutex);
7782 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
7783 	INIT_LIST_HEAD(&ei->delalloc_inodes);
7784 	INIT_LIST_HEAD(&ei->ordered_operations);
7785 	RB_CLEAR_NODE(&ei->rb_node);
7786 
7787 	return inode;
7788 }
7789 
7790 static void btrfs_i_callback(struct rcu_head *head)
7791 {
7792 	struct inode *inode = container_of(head, struct inode, i_rcu);
7793 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7794 }
7795 
7796 void btrfs_destroy_inode(struct inode *inode)
7797 {
7798 	struct btrfs_ordered_extent *ordered;
7799 	struct btrfs_root *root = BTRFS_I(inode)->root;
7800 
7801 	WARN_ON(!hlist_empty(&inode->i_dentry));
7802 	WARN_ON(inode->i_data.nrpages);
7803 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
7804 	WARN_ON(BTRFS_I(inode)->reserved_extents);
7805 	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
7806 	WARN_ON(BTRFS_I(inode)->csum_bytes);
7807 
7808 	/*
7809 	 * This can happen when we create an inode, but somebody else also
7810 	 * created the same inode and we need to destroy the one we already
7811 	 * created.
7812 	 */
7813 	if (!root)
7814 		goto free;
7815 
7816 	/*
7817 	 * Make sure we're properly removed from the ordered operation
7818 	 * lists.
7819 	 */
7820 	smp_mb();
7821 	if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
7822 		spin_lock(&root->fs_info->ordered_root_lock);
7823 		list_del_init(&BTRFS_I(inode)->ordered_operations);
7824 		spin_unlock(&root->fs_info->ordered_root_lock);
7825 	}
7826 
7827 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
7828 		     &BTRFS_I(inode)->runtime_flags)) {
7829 		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
7830 			btrfs_ino(inode));
7831 		atomic_dec(&root->orphan_inodes);
7832 	}
7833 
7834 	while (1) {
7835 		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7836 		if (!ordered)
7837 			break;
7838 		else {
7839 			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
7840 				ordered->file_offset, ordered->len);
7841 			btrfs_remove_ordered_extent(inode, ordered);
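			/*
			 * Two puts (a clarifying note, not in the original):
			 * one for the reference taken by the lookup, one for
			 * the ordered extent's base reference, since its IO
			 * will never be finished now.
			 */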
7842 			btrfs_put_ordered_extent(ordered);
7843 			btrfs_put_ordered_extent(ordered);
7844 		}
7845 	}
7846 	inode_tree_del(inode);
7847 	btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
7848 free:
7849 	call_rcu(&inode->i_rcu, btrfs_i_callback);
7850 }
7851 
7852 int btrfs_drop_inode(struct inode *inode)
7853 {
7854 	struct btrfs_root *root = BTRFS_I(inode)->root;
7855 
7856 	if (root == NULL)
7857 		return 1;
7858 
7859 	/* the snap/subvol tree is being deleted */
7860 	if (btrfs_root_refs(&root->root_item) == 0 &&
7861 	    root != root->fs_info->tree_root)
7862 		return 1;
7863 	else
7864 		return generic_drop_inode(inode);
7865 }
7866 
7867 static void init_once(void *foo)
7868 {
7869 	struct btrfs_inode *ei = (struct btrfs_inode *) foo;
7870 
7871 	inode_init_once(&ei->vfs_inode);
7872 }
7873 
7874 void btrfs_destroy_cachep(void)
7875 {
7876 	/*
7877 	 * Make sure all delayed rcu free inodes are flushed before we
7878 	 * destroy cache.
7879 	 */
7880 	rcu_barrier();
7881 	if (btrfs_inode_cachep)
7882 		kmem_cache_destroy(btrfs_inode_cachep);
7883 	if (btrfs_trans_handle_cachep)
7884 		kmem_cache_destroy(btrfs_trans_handle_cachep);
7885 	if (btrfs_transaction_cachep)
7886 		kmem_cache_destroy(btrfs_transaction_cachep);
7887 	if (btrfs_path_cachep)
7888 		kmem_cache_destroy(btrfs_path_cachep);
7889 	if (btrfs_free_space_cachep)
7890 		kmem_cache_destroy(btrfs_free_space_cachep);
7891 	if (btrfs_delalloc_work_cachep)
7892 		kmem_cache_destroy(btrfs_delalloc_work_cachep);
7893 }
7894 
7895 int btrfs_init_cachep(void)
7896 {
7897 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7898 			sizeof(struct btrfs_inode), 0,
7899 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
7900 	if (!btrfs_inode_cachep)
7901 		goto fail;
7902 
7903 	btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
7904 			sizeof(struct btrfs_trans_handle), 0,
7905 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7906 	if (!btrfs_trans_handle_cachep)
7907 		goto fail;
7908 
7909 	btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction",
7910 			sizeof(struct btrfs_transaction), 0,
7911 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7912 	if (!btrfs_transaction_cachep)
7913 		goto fail;
7914 
7915 	btrfs_path_cachep = kmem_cache_create("btrfs_path",
7916 			sizeof(struct btrfs_path), 0,
7917 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7918 	if (!btrfs_path_cachep)
7919 		goto fail;
7920 
7921 	btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
7922 			sizeof(struct btrfs_free_space), 0,
7923 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
7924 	if (!btrfs_free_space_cachep)
7925 		goto fail;
7926 
7927 	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
7928 			sizeof(struct btrfs_delalloc_work), 0,
7929 			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
7930 			NULL);
7931 	if (!btrfs_delalloc_work_cachep)
7932 		goto fail;
7933 
7934 	return 0;
7935 fail:
7936 	btrfs_destroy_cachep();
7937 	return -ENOMEM;
7938 }
7939 
7940 static int btrfs_getattr(struct vfsmount *mnt,
7941 			 struct dentry *dentry, struct kstat *stat)
7942 {
7943 	u64 delalloc_bytes;
7944 	struct inode *inode = dentry->d_inode;
7945 	u32 blocksize = inode->i_sb->s_blocksize;
7946 
7947 	generic_fillattr(inode, stat);
7948 	stat->dev = BTRFS_I(inode)->root->anon_dev;
7949 	stat->blksize = PAGE_CACHE_SIZE;
7950 
7951 	spin_lock(&BTRFS_I(inode)->lock);
7952 	delalloc_bytes = BTRFS_I(inode)->delalloc_bytes;
7953 	spin_unlock(&BTRFS_I(inode)->lock);
7954 	stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) +
7955 			ALIGN(delalloc_bytes, blocksize)) >> 9;
7956 	return 0;
7957 }
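
/*
 * Worked example of the st_blocks computation above (made-up sizes):
 * with a 4096-byte blocksize, 6000 bytes on disk and 3000 bytes of
 * outstanding delalloc give ALIGN(6000, 4096) + ALIGN(3000, 4096) =
 * 8192 + 4096 = 12288 bytes, i.e. 12288 >> 9 = 24 512-byte blocks.
 */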
7958 
7959 static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
7960 			   struct inode *new_dir, struct dentry *new_dentry)
7961 {
7962 	struct btrfs_trans_handle *trans;
7963 	struct btrfs_root *root = BTRFS_I(old_dir)->root;
7964 	struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7965 	struct inode *new_inode = new_dentry->d_inode;
7966 	struct inode *old_inode = old_dentry->d_inode;
7967 	struct timespec ctime = CURRENT_TIME;
7968 	u64 index = 0;
7969 	u64 root_objectid;
7970 	int ret;
7971 	u64 old_ino = btrfs_ino(old_inode);
7972 
7973 	if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
7974 		return -EPERM;
7975 
7976 	/* we only allow rename subvolume link between subvolumes */
7977 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
7978 		return -EXDEV;
7979 
7980 	if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
7981 	    (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
7982 		return -ENOTEMPTY;
7983 
7984 	if (S_ISDIR(old_inode->i_mode) && new_inode &&
7985 	    new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
7986 		return -ENOTEMPTY;
7987 
7988 
7989 	/* check for collisions, even if the name isn't there */
7990 	ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino,
7991 			     new_dentry->d_name.name,
7992 			     new_dentry->d_name.len);
7993 
7994 	if (ret) {
7995 		if (ret == -EEXIST) {
7996 			/* we shouldn't get -EEXIST
7997 			 * without a new_inode */
7998 			if (!new_inode) {
7999 				WARN_ON(1);
8000 				return ret;
8001 			}
8002 		} else {
8003 			/* maybe -EOVERFLOW */
8004 			return ret;
8005 		}
8006 	}
8007 	ret = 0;
8008 
8009 	/*
8010 	 * we're using rename to replace one file with another,
8011 	 * and the replacement file is large.  Start IO on it now so
8012 	 * we don't add too much work to the end of the transaction.
8013 	 */
8014 	if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size &&
8015 	    old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
8016 		filemap_flush(old_inode->i_mapping);
8017 
8018 	/* close the racy window with snapshot create/destroy ioctl */
8019 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8020 		down_read(&root->fs_info->subvol_sem);
8021 	/*
8022 	 * We want to reserve the absolute worst case amount of items.  So if
8023 	 * both inodes are subvols and we need to unlink them then that would
8024 	 * require 4 item modifications, but if they are both normal inodes it
8025 	 * would require 5 item modifications, so we'll assume their normal
8026 	 * would require 5 item modifications, so we'll assume they're normal
8027 	 * should cover the worst case number of items we'll modify.
8028 	 */
8029 	trans = btrfs_start_transaction(root, 11);
8030 	if (IS_ERR(trans)) {
8031 		ret = PTR_ERR(trans);
8032 		goto out_notrans;
8033 	}
8034 
8035 	if (dest != root)
8036 		btrfs_record_root_in_trans(trans, dest);
8037 
8038 	ret = btrfs_set_inode_index(new_dir, &index);
8039 	if (ret)
8040 		goto out_fail;
8041 
8042 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8043 		/* force full log commit if subvolume involved. */
8044 		root->fs_info->last_trans_log_full_commit = trans->transid;
8045 	} else {
8046 		ret = btrfs_insert_inode_ref(trans, dest,
8047 					     new_dentry->d_name.name,
8048 					     new_dentry->d_name.len,
8049 					     old_ino,
8050 					     btrfs_ino(new_dir), index);
8051 		if (ret)
8052 			goto out_fail;
8053 		/*
8054 		 * this is an ugly little race, but the rename is required
8055 		 * to make sure that if we crash, the inode is either at the
8056 		 * old name or the new one.  pinning the log transaction lets
8057 		 * us make sure we don't allow a log commit to come in after
8058 		 * we unlink the name but before we add the new name back in.
8059 		 */
8060 		btrfs_pin_log_trans(root);
8061 	}
8062 	/*
8063 	 * make sure the inode gets flushed if it is replacing
8064 	 * something.
8065 	 */
8066 	if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
8067 		btrfs_add_ordered_operation(trans, root, old_inode);
8068 
8069 	inode_inc_iversion(old_dir);
8070 	inode_inc_iversion(new_dir);
8071 	inode_inc_iversion(old_inode);
8072 	old_dir->i_ctime = old_dir->i_mtime = ctime;
8073 	new_dir->i_ctime = new_dir->i_mtime = ctime;
8074 	old_inode->i_ctime = ctime;
8075 
8076 	if (old_dentry->d_parent != new_dentry->d_parent)
8077 		btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
8078 
8079 	if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8080 		root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
8081 		ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
8082 					old_dentry->d_name.name,
8083 					old_dentry->d_name.len);
8084 	} else {
8085 		ret = __btrfs_unlink_inode(trans, root, old_dir,
8086 					old_dentry->d_inode,
8087 					old_dentry->d_name.name,
8088 					old_dentry->d_name.len);
8089 		if (!ret)
8090 			ret = btrfs_update_inode(trans, root, old_inode);
8091 	}
8092 	if (ret) {
8093 		btrfs_abort_transaction(trans, root, ret);
8094 		goto out_fail;
8095 	}
8096 
8097 	if (new_inode) {
8098 		inode_inc_iversion(new_inode);
8099 		new_inode->i_ctime = CURRENT_TIME;
8100 		if (unlikely(btrfs_ino(new_inode) ==
8101 			     BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8102 			root_objectid = BTRFS_I(new_inode)->location.objectid;
8103 			ret = btrfs_unlink_subvol(trans, dest, new_dir,
8104 						root_objectid,
8105 						new_dentry->d_name.name,
8106 						new_dentry->d_name.len);
8107 			BUG_ON(new_inode->i_nlink == 0);
8108 		} else {
8109 			ret = btrfs_unlink_inode(trans, dest, new_dir,
8110 						 new_dentry->d_inode,
8111 						 new_dentry->d_name.name,
8112 						 new_dentry->d_name.len);
8113 		}
8114 		if (!ret && new_inode->i_nlink == 0)
8115 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
8116 		if (ret) {
8117 			btrfs_abort_transaction(trans, root, ret);
8118 			goto out_fail;
8119 		}
8120 	}
8121 
8122 	ret = btrfs_add_link(trans, new_dir, old_inode,
8123 			     new_dentry->d_name.name,
8124 			     new_dentry->d_name.len, 0, index);
8125 	if (ret) {
8126 		btrfs_abort_transaction(trans, root, ret);
8127 		goto out_fail;
8128 	}
8129 
8130 	if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8131 		struct dentry *parent = new_dentry->d_parent;
8132 		btrfs_log_new_name(trans, old_inode, old_dir, parent);
8133 		btrfs_end_log_trans(root);
8134 	}
8135 out_fail:
8136 	btrfs_end_transaction(trans, root);
8137 out_notrans:
8138 	if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8139 		up_read(&root->fs_info->subvol_sem);
8140 
8141 	return ret;
8142 }
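
/*
 * Note on the -EXDEV above (an illustrative aside): renaming an
 * ordinary inode between two subvolumes is refused, and tools like
 * mv(1) then fall back to copy + unlink, just as they do across real
 * mount points.
 */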
8143 
8144 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8145 {
8146 	struct btrfs_delalloc_work *delalloc_work;
8147 
8148 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
8149 				     work);
8150 	if (delalloc_work->wait)
8151 		btrfs_wait_ordered_range(delalloc_work->inode, 0, (u64)-1);
8152 	else
8153 		filemap_flush(delalloc_work->inode->i_mapping);
8154 
8155 	if (delalloc_work->delay_iput)
8156 		btrfs_add_delayed_iput(delalloc_work->inode);
8157 	else
8158 		iput(delalloc_work->inode);
8159 	complete(&delalloc_work->completion);
8160 }
8161 
8162 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8163 						    int wait, int delay_iput)
8164 {
8165 	struct btrfs_delalloc_work *work;
8166 
8167 	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
8168 	if (!work)
8169 		return NULL;
8170 
8171 	init_completion(&work->completion);
8172 	INIT_LIST_HEAD(&work->list);
8173 	work->inode = inode;
8174 	work->wait = wait;
8175 	work->delay_iput = delay_iput;
8176 	work->work.func = btrfs_run_delalloc_work;
8177 
8178 	return work;
8179 }
8180 
8181 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
8182 {
8183 	wait_for_completion(&work->completion);
8184 	kmem_cache_free(btrfs_delalloc_work_cachep, work);
8185 }
8186 
8187 /*
8188  * some fairly slow code that needs optimization. This walks the list
8189  * of all the inodes with pending delalloc and forces them to disk.
8190  */
8191 static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8192 {
8193 	struct btrfs_inode *binode;
8194 	struct inode *inode;
8195 	struct btrfs_delalloc_work *work, *next;
8196 	struct list_head works;
8197 	struct list_head splice;
8198 	int ret = 0;
8199 
8200 	INIT_LIST_HEAD(&works);
8201 	INIT_LIST_HEAD(&splice);
8202 
8203 	spin_lock(&root->delalloc_lock);
8204 	list_splice_init(&root->delalloc_inodes, &splice);
8205 	while (!list_empty(&splice)) {
8206 		binode = list_entry(splice.next, struct btrfs_inode,
8207 				    delalloc_inodes);
8208 
8209 		list_move_tail(&binode->delalloc_inodes,
8210 			       &root->delalloc_inodes);
8211 		inode = igrab(&binode->vfs_inode);
8212 		if (!inode) {
8213 			cond_resched_lock(&root->delalloc_lock);
8214 			continue;
8215 		}
8216 		spin_unlock(&root->delalloc_lock);
8217 
8218 		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
8219 		if (unlikely(!work)) {
8220 			if (delay_iput)
8221 				btrfs_add_delayed_iput(inode);
8222 			else
8223 				iput(inode);
8224 			ret = -ENOMEM;
8225 			goto out;
8226 		}
8227 		list_add_tail(&work->list, &works);
8228 		btrfs_queue_worker(&root->fs_info->flush_workers,
8229 				   &work->work);
8230 
8231 		cond_resched();
8232 		spin_lock(&root->delalloc_lock);
8233 	}
8234 	spin_unlock(&root->delalloc_lock);
8235 
8236 	list_for_each_entry_safe(work, next, &works, list) {
8237 		list_del_init(&work->list);
8238 		btrfs_wait_and_free_delalloc_work(work);
8239 	}
8240 	return 0;
8241 out:
8242 	list_for_each_entry_safe(work, next, &works, list) {
8243 		list_del_init(&work->list);
8244 		btrfs_wait_and_free_delalloc_work(work);
8245 	}
8246 
8247 	if (!list_empty_careful(&splice)) {
8248 		spin_lock(&root->delalloc_lock);
8249 		list_splice_tail(&splice, &root->delalloc_inodes);
8250 		spin_unlock(&root->delalloc_lock);
8251 	}
8252 	return ret;
8253 }
8254 
8255 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
8256 {
8257 	int ret;
8258 
8259 	if (root->fs_info->sb->s_flags & MS_RDONLY)
8260 		return -EROFS;
8261 
8262 	ret = __start_delalloc_inodes(root, delay_iput);
8263 	/*
8264 	 * the filemap_flush will queue IO into the worker threads, but
8265 	 * we have to make sure the IO is actually started and that
8266 	 * ordered extents get created before we return
8267 	 */
8268 	atomic_inc(&root->fs_info->async_submit_draining);
8269 	while (atomic_read(&root->fs_info->nr_async_submits) ||
8270 	      atomic_read(&root->fs_info->async_delalloc_pages)) {
8271 		wait_event(root->fs_info->async_submit_wait,
8272 		   (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
8273 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
8274 	}
8275 	atomic_dec(&root->fs_info->async_submit_draining);
8276 	return ret;
8277 }
8278 
8279 int btrfs_start_all_delalloc_inodes(struct btrfs_fs_info *fs_info,
8280 				    int delay_iput)
8281 {
8282 	struct btrfs_root *root;
8283 	struct list_head splice;
8284 	int ret;
8285 
8286 	if (fs_info->sb->s_flags & MS_RDONLY)
8287 		return -EROFS;
8288 
8289 	INIT_LIST_HEAD(&splice);
8290 
8291 	spin_lock(&fs_info->delalloc_root_lock);
8292 	list_splice_init(&fs_info->delalloc_roots, &splice);
8293 	while (!list_empty(&splice)) {
8294 		root = list_first_entry(&splice, struct btrfs_root,
8295 					delalloc_root);
8296 		root = btrfs_grab_fs_root(root);
8297 		BUG_ON(!root);
8298 		list_move_tail(&root->delalloc_root,
8299 			       &fs_info->delalloc_roots);
8300 		spin_unlock(&fs_info->delalloc_root_lock);
8301 
8302 		ret = __start_delalloc_inodes(root, delay_iput);
8303 		btrfs_put_fs_root(root);
8304 		if (ret)
8305 			goto out;
8306 
8307 		spin_lock(&fs_info->delalloc_root_lock);
8308 	}
8309 	spin_unlock(&fs_info->delalloc_root_lock);
8310 
8311 	atomic_inc(&fs_info->async_submit_draining);
8312 	while (atomic_read(&fs_info->nr_async_submits) ||
8313 	      atomic_read(&fs_info->async_delalloc_pages)) {
8314 		wait_event(fs_info->async_submit_wait,
8315 		   (atomic_read(&fs_info->nr_async_submits) == 0 &&
8316 		    atomic_read(&fs_info->async_delalloc_pages) == 0));
8317 	}
8318 	atomic_dec(&fs_info->async_submit_draining);
8319 	return 0;
8320 out:
8321 	if (!list_empty_careful(&splice)) {
8322 		spin_lock(&fs_info->delalloc_root_lock);
8323 		list_splice_tail(&splice, &fs_info->delalloc_roots);
8324 		spin_unlock(&fs_info->delalloc_root_lock);
8325 	}
8326 	return ret;
8327 }
8328 
8329 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
8330 			 const char *symname)
8331 {
8332 	struct btrfs_trans_handle *trans;
8333 	struct btrfs_root *root = BTRFS_I(dir)->root;
8334 	struct btrfs_path *path;
8335 	struct btrfs_key key;
8336 	struct inode *inode = NULL;
8337 	int err;
8338 	int drop_inode = 0;
8339 	u64 objectid;
8340 	u64 index = 0;
8341 	int name_len;
8342 	int datasize;
8343 	unsigned long ptr;
8344 	struct btrfs_file_extent_item *ei;
8345 	struct extent_buffer *leaf;
8346 
8347 	name_len = strlen(symname) + 1;
8348 	if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
8349 		return -ENAMETOOLONG;
8350 
8351 	/*
8352 	 * 2 items for inode item and ref
8353 	 * 2 items for dir items
8354 	 * 1 item for xattr if selinux is on
8355 	 */
8356 	trans = btrfs_start_transaction(root, 5);
8357 	if (IS_ERR(trans))
8358 		return PTR_ERR(trans);
8359 
8360 	err = btrfs_find_free_ino(root, &objectid);
8361 	if (err)
8362 		goto out_unlock;
8363 
8364 	inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
8365 				dentry->d_name.len, btrfs_ino(dir), objectid,
8366 				S_IFLNK|S_IRWXUGO, &index);
8367 	if (IS_ERR(inode)) {
8368 		err = PTR_ERR(inode);
8369 		goto out_unlock;
8370 	}
8371 
8372 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
8373 	if (err) {
8374 		drop_inode = 1;
8375 		goto out_unlock;
8376 	}
8377 
8378 	/*
8379 	 * If the active LSM wants to access the inode during
8380 	 * d_instantiate, it needs these. Smack checks to see
8381 	 * if the filesystem supports xattrs by looking at the
8382 	 * ops vector.
8383 	 */
8384 	inode->i_fop = &btrfs_file_operations;
8385 	inode->i_op = &btrfs_file_inode_operations;
8386 
8387 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
8388 	if (err)
8389 		drop_inode = 1;
8390 	else {
8391 		inode->i_mapping->a_ops = &btrfs_aops;
8392 		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8393 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
8394 	}
8395 	if (drop_inode)
8396 		goto out_unlock;
8397 
8398 	path = btrfs_alloc_path();
8399 	if (!path) {
8400 		err = -ENOMEM;
8401 		drop_inode = 1;
8402 		goto out_unlock;
8403 	}
8404 	key.objectid = btrfs_ino(inode);
8405 	key.offset = 0;
8406 	btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
8407 	datasize = btrfs_file_extent_calc_inline_size(name_len);
8408 	err = btrfs_insert_empty_item(trans, root, path, &key,
8409 				      datasize);
8410 	if (err) {
8411 		drop_inode = 1;
8412 		btrfs_free_path(path);
8413 		goto out_unlock;
8414 	}
8415 	leaf = path->nodes[0];
8416 	ei = btrfs_item_ptr(leaf, path->slots[0],
8417 			    struct btrfs_file_extent_item);
8418 	btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8419 	btrfs_set_file_extent_type(leaf, ei,
8420 				   BTRFS_FILE_EXTENT_INLINE);
8421 	btrfs_set_file_extent_encryption(leaf, ei, 0);
8422 	btrfs_set_file_extent_compression(leaf, ei, 0);
8423 	btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8424 	btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8425 
8426 	ptr = btrfs_file_extent_inline_start(ei);
8427 	write_extent_buffer(leaf, symname, ptr, name_len);
8428 	btrfs_mark_buffer_dirty(leaf);
8429 	btrfs_free_path(path);
8430 
8431 	inode->i_op = &btrfs_symlink_inode_operations;
8432 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
8433 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
8434 	inode_set_bytes(inode, name_len);
8435 	btrfs_i_size_write(inode, name_len - 1);
8436 	err = btrfs_update_inode(trans, root, inode);
8437 	if (err)
8438 		drop_inode = 1;
8439 
8440 out_unlock:
8441 	if (!err)
8442 		d_instantiate(dentry, inode);
8443 	btrfs_end_transaction(trans, root);
8444 	if (drop_inode) {
8445 		inode_dec_link_count(inode);
8446 		iput(inode);
8447 	}
8448 	btrfs_btree_balance_dirty(root);
8449 	return err;
8450 }
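
/*
 * What the function above leaves on disk, in short: the target string
 * plus its trailing NUL lands in an inline file extent, while i_size
 * is the target length without the NUL, which is exactly what a
 * userspace reader sees (illustrative sketch):
 *
 *	char buf[256];
 *	ssize_t n = readlink("link", buf, sizeof(buf) - 1);
 *
 *	(n == strlen(target); the inline extent holds n + 1 bytes)
 */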
8451 
8452 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8453 				       u64 start, u64 num_bytes, u64 min_size,
8454 				       loff_t actual_len, u64 *alloc_hint,
8455 				       struct btrfs_trans_handle *trans)
8456 {
8457 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
8458 	struct extent_map *em;
8459 	struct btrfs_root *root = BTRFS_I(inode)->root;
8460 	struct btrfs_key ins;
8461 	u64 cur_offset = start;
8462 	u64 i_size;
8463 	u64 cur_bytes;
8464 	int ret = 0;
8465 	bool own_trans = true;
8466 
8467 	if (trans)
8468 		own_trans = false;
8469 	while (num_bytes > 0) {
8470 		if (own_trans) {
8471 			trans = btrfs_start_transaction(root, 3);
8472 			if (IS_ERR(trans)) {
8473 				ret = PTR_ERR(trans);
8474 				break;
8475 			}
8476 		}
8477 
8478 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
8479 		cur_bytes = max(cur_bytes, min_size);
8480 		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
8481 					   *alloc_hint, &ins, 1);
8482 		if (ret) {
8483 			if (own_trans)
8484 				btrfs_end_transaction(trans, root);
8485 			break;
8486 		}
8487 
8488 		ret = insert_reserved_file_extent(trans, inode,
8489 						  cur_offset, ins.objectid,
8490 						  ins.offset, ins.offset,
8491 						  ins.offset, 0, 0, 0,
8492 						  BTRFS_FILE_EXTENT_PREALLOC);
8493 		if (ret) {
8494 			btrfs_abort_transaction(trans, root, ret);
8495 			if (own_trans)
8496 				btrfs_end_transaction(trans, root);
8497 			break;
8498 		}
8499 		btrfs_drop_extent_cache(inode, cur_offset,
8500 					cur_offset + ins.offset -1, 0);
8501 
8502 		em = alloc_extent_map();
8503 		if (!em) {
8504 			set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
8505 				&BTRFS_I(inode)->runtime_flags);
8506 			goto next;
8507 		}
8508 
8509 		em->start = cur_offset;
8510 		em->orig_start = cur_offset;
8511 		em->len = ins.offset;
8512 		em->block_start = ins.objectid;
8513 		em->block_len = ins.offset;
8514 		em->orig_block_len = ins.offset;
8515 		em->ram_bytes = ins.offset;
8516 		em->bdev = root->fs_info->fs_devices->latest_bdev;
8517 		set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
8518 		em->generation = trans->transid;
8519 
8520 		while (1) {
8521 			write_lock(&em_tree->lock);
8522 			ret = add_extent_mapping(em_tree, em, 1);
8523 			write_unlock(&em_tree->lock);
8524 			if (ret != -EEXIST)
8525 				break;
8526 			btrfs_drop_extent_cache(inode, cur_offset,
8527 						cur_offset + ins.offset - 1,
8528 						0);
8529 		}
8530 		free_extent_map(em);
8531 next:
8532 		num_bytes -= ins.offset;
8533 		cur_offset += ins.offset;
8534 		*alloc_hint = ins.objectid + ins.offset;
8535 
8536 		inode_inc_iversion(inode);
8537 		inode->i_ctime = CURRENT_TIME;
8538 		BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8539 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8540 		    (actual_len > inode->i_size) &&
8541 		    (cur_offset > inode->i_size)) {
8542 			if (cur_offset > actual_len)
8543 				i_size = actual_len;
8544 			else
8545 				i_size = cur_offset;
8546 			i_size_write(inode, i_size);
8547 			btrfs_ordered_update_i_size(inode, i_size, NULL);
8548 		}
8549 
8550 		ret = btrfs_update_inode(trans, root, inode);
8551 
8552 		if (ret) {
8553 			btrfs_abort_transaction(trans, root, ret);
8554 			if (own_trans)
8555 				btrfs_end_transaction(trans, root);
8556 			break;
8557 		}
8558 
8559 		if (own_trans)
8560 			btrfs_end_transaction(trans, root);
8561 	}
8562 	return ret;
8563 }
8564 
8565 int btrfs_prealloc_file_range(struct inode *inode, int mode,
8566 			      u64 start, u64 num_bytes, u64 min_size,
8567 			      loff_t actual_len, u64 *alloc_hint)
8568 {
8569 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8570 					   min_size, actual_len, alloc_hint,
8571 					   NULL);
8572 }
8573 
8574 int btrfs_prealloc_file_range_trans(struct inode *inode,
8575 				    struct btrfs_trans_handle *trans, int mode,
8576 				    u64 start, u64 num_bytes, u64 min_size,
8577 				    loff_t actual_len, u64 *alloc_hint)
8578 {
8579 	return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8580 					   min_size, actual_len, alloc_hint, trans);
8581 }
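
/*
 * Userspace entry points for the preallocation helpers above (an
 * illustrative sketch): plain fallocate() extends i_size when the
 * range goes past EOF, while FALLOC_FL_KEEP_SIZE preallocates the
 * extents but leaves i_size alone, matching the mode check in
 * __btrfs_prealloc_file_range():
 *
 *	fallocate(fd, 0, 0, 1 << 20);
 *	fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20);
 */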
8582 
8583 static int btrfs_set_page_dirty(struct page *page)
8584 {
8585 	return __set_page_dirty_nobuffers(page);
8586 }
8587 
8588 static int btrfs_permission(struct inode *inode, int mask)
8589 {
8590 	struct btrfs_root *root = BTRFS_I(inode)->root;
8591 	umode_t mode = inode->i_mode;
8592 
8593 	if (mask & MAY_WRITE &&
8594 	    (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
8595 		if (btrfs_root_readonly(root))
8596 			return -EROFS;
8597 		if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
8598 			return -EACCES;
8599 	}
8600 	return generic_permission(inode, mask);
8601 }
8602 
8603 static const struct inode_operations btrfs_dir_inode_operations = {
8604 	.getattr	= btrfs_getattr,
8605 	.lookup		= btrfs_lookup,
8606 	.create		= btrfs_create,
8607 	.unlink		= btrfs_unlink,
8608 	.link		= btrfs_link,
8609 	.mkdir		= btrfs_mkdir,
8610 	.rmdir		= btrfs_rmdir,
8611 	.rename		= btrfs_rename,
8612 	.symlink	= btrfs_symlink,
8613 	.setattr	= btrfs_setattr,
8614 	.mknod		= btrfs_mknod,
8615 	.setxattr	= btrfs_setxattr,
8616 	.getxattr	= btrfs_getxattr,
8617 	.listxattr	= btrfs_listxattr,
8618 	.removexattr	= btrfs_removexattr,
8619 	.permission	= btrfs_permission,
8620 	.get_acl	= btrfs_get_acl,
8621 	.update_time	= btrfs_update_time,
8622 };
8623 static const struct inode_operations btrfs_dir_ro_inode_operations = {
8624 	.lookup		= btrfs_lookup,
8625 	.permission	= btrfs_permission,
8626 	.get_acl	= btrfs_get_acl,
8627 	.update_time	= btrfs_update_time,
8628 };
8629 
8630 static const struct file_operations btrfs_dir_file_operations = {
8631 	.llseek		= generic_file_llseek,
8632 	.read		= generic_read_dir,
8633 	.iterate	= btrfs_real_readdir,
8634 	.unlocked_ioctl	= btrfs_ioctl,
8635 #ifdef CONFIG_COMPAT
8636 	.compat_ioctl	= btrfs_ioctl,
8637 #endif
8638 	.release        = btrfs_release_file,
8639 	.fsync		= btrfs_sync_file,
8640 };
8641 
8642 static struct extent_io_ops btrfs_extent_io_ops = {
8643 	.fill_delalloc = run_delalloc_range,
8644 	.submit_bio_hook = btrfs_submit_bio_hook,
8645 	.merge_bio_hook = btrfs_merge_bio_hook,
8646 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
8647 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
8648 	.writepage_start_hook = btrfs_writepage_start_hook,
8649 	.set_bit_hook = btrfs_set_bit_hook,
8650 	.clear_bit_hook = btrfs_clear_bit_hook,
8651 	.merge_extent_hook = btrfs_merge_extent_hook,
8652 	.split_extent_hook = btrfs_split_extent_hook,
8653 };
8654 
8655 /*
8656  * btrfs doesn't support the bmap operation because swapfiles
8657  * use bmap to make a mapping of extents in the file.  They assume
8658  * these extents won't change over the life of the file and they
8659  * use the bmap result to do IO directly to the drive.
8660  *
8661  * the btrfs bmap call would return logical addresses that aren't
8662  * suitable for IO and they also will change frequently as COW
8663  * operations happen.  So, swapfile + btrfs == corruption.
8664  *
8665  * For now we're avoiding this by dropping bmap.
8666  */
8667 static const struct address_space_operations btrfs_aops = {
8668 	.readpage	= btrfs_readpage,
8669 	.writepage	= btrfs_writepage,
8670 	.writepages	= btrfs_writepages,
8671 	.readpages	= btrfs_readpages,
8672 	.direct_IO	= btrfs_direct_IO,
8673 	.invalidatepage = btrfs_invalidatepage,
8674 	.releasepage	= btrfs_releasepage,
8675 	.set_page_dirty	= btrfs_set_page_dirty,
8676 	.error_remove_page = generic_error_remove_page,
8677 };
8678 
8679 static const struct address_space_operations btrfs_symlink_aops = {
8680 	.readpage	= btrfs_readpage,
8681 	.writepage	= btrfs_writepage,
8682 	.invalidatepage = btrfs_invalidatepage,
8683 	.releasepage	= btrfs_releasepage,
8684 };
8685 
8686 static const struct inode_operations btrfs_file_inode_operations = {
8687 	.getattr	= btrfs_getattr,
8688 	.setattr	= btrfs_setattr,
8689 	.setxattr	= btrfs_setxattr,
8690 	.getxattr	= btrfs_getxattr,
8691 	.listxattr      = btrfs_listxattr,
8692 	.removexattr	= btrfs_removexattr,
8693 	.permission	= btrfs_permission,
8694 	.fiemap		= btrfs_fiemap,
8695 	.get_acl	= btrfs_get_acl,
8696 	.update_time	= btrfs_update_time,
8697 };
8698 static const struct inode_operations btrfs_special_inode_operations = {
8699 	.getattr	= btrfs_getattr,
8700 	.setattr	= btrfs_setattr,
8701 	.permission	= btrfs_permission,
8702 	.setxattr	= btrfs_setxattr,
8703 	.getxattr	= btrfs_getxattr,
8704 	.listxattr	= btrfs_listxattr,
8705 	.removexattr	= btrfs_removexattr,
8706 	.get_acl	= btrfs_get_acl,
8707 	.update_time	= btrfs_update_time,
8708 };
8709 static const struct inode_operations btrfs_symlink_inode_operations = {
8710 	.readlink	= generic_readlink,
8711 	.follow_link	= page_follow_link_light,
8712 	.put_link	= page_put_link,
8713 	.getattr	= btrfs_getattr,
8714 	.setattr	= btrfs_setattr,
8715 	.permission	= btrfs_permission,
8716 	.setxattr	= btrfs_setxattr,
8717 	.getxattr	= btrfs_getxattr,
8718 	.listxattr	= btrfs_listxattr,
8719 	.removexattr	= btrfs_removexattr,
8720 	.get_acl	= btrfs_get_acl,
8721 	.update_time	= btrfs_update_time,
8722 };
8723 
8724 const struct dentry_operations btrfs_dentry_operations = {
8725 	.d_delete	= btrfs_dentry_delete,
8726 	.d_release	= btrfs_dentry_release,
8727 };
8728