/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include <linux/slab.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"


/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 struct iov_iter *i)
{
	size_t copied;
	int pg = 0;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

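	/*
	 * A note on the retry logic below: fault the next chunk of the
	 * iovec in, copy it into the prepared page, and if the copy came
	 * up short, shrink the request to the current iovec segment and
	 * retry.  A zero-length copy after a successful fault-in can
	 * happen when, for example, the page was reclaimed between the
	 * fault-in and the copy, or the segment was shorter than count.
	 */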
	while (write_bytes > 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[pg];
again:
		if (unlikely(iov_iter_fault_in_readable(i, count)))
			return -EFAULT;

		/* Copy data from userspace to the current page */
		copied = iov_iter_copy_from_user(page, i, offset, count);

		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		iov_iter_advance(i, copied);
		write_bytes -= copied;

		if (unlikely(copied == 0)) {
			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
				      iov_iter_single_seg_count(i));
			goto again;
		}

		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
			offset += copied;
		} else {
			pg++;
			offset = 0;
		}
	}
	return 0;
}

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
		/* PageChecked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here.
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}

/*
 * after copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * this also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
	int i;
	struct inode *inode = fdentry(file)->d_inode;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

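	/*
	 * Round the dirty range out to sector boundaries.  For example,
	 * with a 4K sectorsize, pos = 5000 and write_bytes = 100 give
	 * start_pos = 4096 and num_bytes = 4096 (one full sector).
	 */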
	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;
	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
					NULL);
	BUG_ON(err);

	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		/* we've only changed i_size in ram, and we haven't updated
		 * the disk i_size.  There is no need to log the inode
		 * at this time.
		 */
	}
	return 0;
}

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
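	/*
	 * Walk every extent map that overlaps [start, end], removing it
	 * and re-inserting trimmed front/back pieces as needed.  The
	 * split maps are allocated before taking the tree lock so the
	 * locked section stays allocation-free.
	 */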
	while (1) {
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

		write_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			write_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			if (testend && em->start + em->len >= start + len) {
				free_extent_map(em);
				write_unlock(&em_tree->lock);
				break;
			}
			start = em->start + em->len;
			if (testend)
				len = start + len - (em->start + em->len);
			free_extent_map(em);
			write_unlock(&em_tree->lock);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		write_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 */
int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
		       u64 start, u64 end, u64 *hint_byte, int drop_cache)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 search_start = start;
	u64 disk_bytenr = 0;
	u64 num_bytes = 0;
	u64 extent_offset = 0;
	u64 extent_end = 0;
	int del_nr = 0;
	int del_slot = 0;
	int extent_type;
	int recow;
	int ret;

	if (drop_cache)
		btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

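	/*
	 * Each pass of the loop handles one file extent item that overlaps
	 * the range.  The four possible overlaps (range inside the extent,
	 * covering its tail, covering its head, or covering it entirely)
	 * are diagrammed inline below.
	 */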
	while (1) {
		recow = 0;
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			break;
		if (ret > 0 && path->slots[0] > 0 && search_start == start) {
			leaf = path->nodes[0];
			btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
			if (key.objectid == inode->i_ino &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}
		ret = 0;
next_slot:
		leaf = path->nodes[0];
		if (path->slots[0] >= btrfs_header_nritems(leaf)) {
			BUG_ON(del_nr > 0);
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				break;
			if (ret > 0) {
				ret = 0;
				break;
			}
			leaf = path->nodes[0];
			recow = 1;
		}

		btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
		if (key.objectid > inode->i_ino ||
		    key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
			break;

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_type = btrfs_file_extent_type(leaf, fi);

		if (extent_type == BTRFS_FILE_EXTENT_REG ||
		    extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
			disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
			num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
			extent_offset = btrfs_file_extent_offset(leaf, fi);
			extent_end = key.offset +
				btrfs_file_extent_num_bytes(leaf, fi);
		} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
			extent_end = key.offset +
				btrfs_file_extent_inline_len(leaf, fi);
		} else {
			WARN_ON(1);
			extent_end = search_start;
		}

		if (extent_end <= search_start) {
			path->slots[0]++;
			goto next_slot;
		}

		search_start = max(key.offset, start);
		if (recow) {
			btrfs_release_path(root, path);
			continue;
		}

		/*
		 *     | - range to drop - |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end < extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = start;
			ret = btrfs_duplicate_item(trans, root, path,
						   &new_key);
			if (ret == -EAGAIN) {
				btrfs_release_path(root, path);
				continue;
			}
			if (ret < 0)
				break;

			leaf = path->nodes[0];
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);

			extent_offset += start - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - start);
			btrfs_mark_buffer_dirty(leaf);

			if (disk_bytenr > 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						new_key.objectid,
						start - extent_offset);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
			key.offset = start;
		}
		/*
		 *  | ---- range to drop ----- |
		 *      | -------- extent -------- |
		 */
		if (start <= key.offset && end < extent_end) {
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			memcpy(&new_key, &key, sizeof(new_key));
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			extent_offset += end - key.offset;
			btrfs_set_file_extent_offset(leaf, fi, extent_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, end - key.offset);
				*hint_byte = disk_bytenr;
			}
			break;
		}

		search_start = extent_end;
		/*
		 *       | ---- range to drop ----- |
		 *  | -------- extent -------- |
		 */
		if (start > key.offset && end >= extent_end) {
			BUG_ON(del_nr > 0);
			BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE);

			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			btrfs_mark_buffer_dirty(leaf);
			if (disk_bytenr > 0) {
				inode_sub_bytes(inode, extent_end - start);
				*hint_byte = disk_bytenr;
			}
			if (end == extent_end)
				break;

			path->slots[0]++;
			goto next_slot;
		}

		/*
		 *  | ---- range to drop ----- |
		 *    | ------ extent ------ |
		 */
		if (start <= key.offset && end >= extent_end) {
			if (del_nr == 0) {
				del_slot = path->slots[0];
				del_nr = 1;
			} else {
				BUG_ON(del_slot + del_nr != path->slots[0]);
				del_nr++;
			}

			if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
				inode_sub_bytes(inode,
						extent_end - key.offset);
				extent_end = ALIGN(extent_end,
						   root->sectorsize);
			} else if (disk_bytenr > 0) {
				ret = btrfs_free_extent(trans, root,
						disk_bytenr, num_bytes, 0,
						root->root_key.objectid,
						key.objectid, key.offset -
						extent_offset);
				BUG_ON(ret);
				inode_sub_bytes(inode,
						extent_end - key.offset);
				*hint_byte = disk_bytenr;
			}

			if (end == extent_end)
				break;

			if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
				path->slots[0]++;
				goto next_slot;
			}

			ret = btrfs_del_items(trans, root, path, del_slot,
					      del_nr);
			BUG_ON(ret);

			del_nr = 0;
			del_slot = 0;

			btrfs_release_path(root, path);
			continue;
		}

		BUG_ON(1);
	}

	if (del_nr > 0) {
		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}

	btrfs_free_path(path);
	return ret;
}

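/*
 * helper: returns 1 if the extent item at 'slot' is a plain (uncompressed,
 * unencrypted) regular extent backed by the same disk extent described by
 * bytenr/orig_offset, i.e. a candidate for merging with its neighbour.
 * On success, *start and *end are set to the item's file range.
 */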
static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 orig_offset,
			    u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}

/*
 * Mark the extent in the range start - end as written.
 *
 * This changes the extent type from 'pre-allocated' to 'regular'. If only
 * part of the extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	struct btrfs_key new_key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split;
	int del_nr = 0;
	int del_slot = 0;
	int recow;
	int ret;

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	recow = 0;
	split = start;
	key.objectid = inode->i_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = split;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != inode->i_ino ||
	       key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	BUG_ON(btrfs_file_extent_type(leaf, fi) !=
	       BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
	memcpy(&new_key, &key, sizeof(new_key));

	if (start == key.offset && end < extent_end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1,
				     inode->i_ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			new_key.offset = end;
			btrfs_set_item_key_safe(trans, root, path, &new_key);
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							extent_end - end);
			btrfs_set_file_extent_offset(leaf, fi,
						     end - orig_offset);
			fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							end - other_start);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

	if (start > key.offset && end == extent_end) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1,
				     inode->i_ino, bytenr, orig_offset,
				     &other_start, &other_end)) {
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							start - key.offset);
			path->slots[0]++;
			new_key.offset = start;
			btrfs_set_item_key_safe(trans, root, path, &new_key);

			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - start);
			btrfs_set_file_extent_offset(leaf, fi,
						     start - orig_offset);
			btrfs_mark_buffer_dirty(leaf);
			goto out;
		}
	}

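	/*
	 * The written range starts and/or ends inside the prealloc extent:
	 * split the item at 'split' (once or twice) so the written piece
	 * gets its own file extent item, taking an extra reference on the
	 * disk extent for each new item.
	 */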
	while (start > key.offset || end < extent_end) {
		if (key.offset == start)
			split = end;

		new_key.offset = split;
		ret = btrfs_duplicate_item(trans, root, path, &new_key);
		if (ret == -EAGAIN) {
			btrfs_release_path(root, path);
			goto again;
		}
		BUG_ON(ret < 0);

		leaf = path->nodes[0];
		fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						split - key.offset);

		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);

		btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - split);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
					   root->root_key.objectid,
					   inode->i_ino, orig_offset);
		BUG_ON(ret);

		if (split == start) {
			key.offset = start;
		} else {
			BUG_ON(start != key.offset);
			path->slots[0]--;
			extent_end = end;
		}
		recow = 1;
	}

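	/*
	 * Now try to fold the written piece into neighbouring extent items
	 * that point at the same disk extent, dropping the extra references
	 * taken by the splits above.
	 */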
	other_start = end;
	other_end = 0;
	if (extent_mergeable(leaf, path->slots[0] + 1,
			     inode->i_ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(root, path);
			goto again;
		}
		extent_end = other_end;
		del_slot = path->slots[0] + 1;
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					inode->i_ino, orig_offset);
		BUG_ON(ret);
	}
	other_start = 0;
	other_end = start;
	if (extent_mergeable(leaf, path->slots[0] - 1,
			     inode->i_ino, bytenr, orig_offset,
			     &other_start, &other_end)) {
		if (recow) {
			btrfs_release_path(root, path);
			goto again;
		}
		key.offset = other_start;
		del_slot = path->slots[0];
		del_nr++;
		ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
					0, root->root_key.objectid,
					inode->i_ino, orig_offset);
		BUG_ON(ret);
	}
	if (del_nr == 0) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_mark_buffer_dirty(leaf);
	} else {
		fi = btrfs_item_ptr(leaf, del_slot - 1,
			   struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi,
					   BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
	}
out:
	btrfs_free_path(path);
	return 0;
}

/*
 * this gets pages into the page cache and locks them down; it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
{
	struct extent_state *cached_state = NULL;
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, start_pos);
		if (err)
			return err;
	}

	memset(pages, 0, num_pages * sizeof(struct page *));
again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			err = -ENOMEM;
			BUG_ON(1);
		}
		wait_on_page_writeback(pages[i]);
	}
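	/*
	 * If any ordered extent (pending data=ordered writeback) still
	 * overlaps the range, drop the pages, wait for it to finish and
	 * start over; otherwise clear stale delalloc state so this write
	 * is accounted from a clean slate.
	 */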
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent_bits(&BTRFS_I(inode)->io_tree,
				 start_pos, last_pos - 1, 0, &cached_state,
				 GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
				  EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
				  GFP_NOFS);
		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
				     start_pos, last_pos - 1, &cached_state,
				     GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
}

static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
				    const struct iovec *iov,
				    unsigned long nr_segs, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page *pinned[2];
	struct page **pages = NULL;
	struct iov_iter i;
	loff_t *ppos = &iocb->ki_pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	size_t count;
	size_t ocount;
	int ret = 0;
	int nrptrs;
	unsigned long first_index;
	unsigned long last_index;
	int will_write;
	int buffered = 0;

	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	pinned[0] = NULL;
	pinned[1] = NULL;

	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	mutex_lock(&inode->i_mutex);

	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
	if (err)
		goto out;
	count = ocount;

	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;

	if (count == 0)
		goto out;

	err = file_remove_suid(file);
	if (err)
		goto out;

	file_update_time(file);
	BTRFS_I(inode)->sequence++;

	if (unlikely(file->f_flags & O_DIRECT)) {
		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
							pos, ppos, count,
							ocount);
		/*
		 * the generic O_DIRECT will update in-memory i_size after the
		 * DIOs are done.  But our endio handlers that update the on
		 * disk i_size never update past the in memory i_size.  So we
		 * need one more update here to catch any additions to the
		 * file
		 */
		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
			mark_inode_dirty(inode);
		}

		if (num_written < 0) {
			ret = num_written;
			num_written = 0;
			goto out;
		} else if (num_written == count) {
			/* pick up pos changes done by the generic code */
			pos = *ppos;
			goto out;
		}
		/*
		 * We are going to do buffered for the rest of the range, so we
		 * need to make sure to invalidate the buffered pages when we're
		 * done.
		 */
		buffered = 1;
		pos += num_written;
	}

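	/*
	 * Size the page pointer array: enough pages to cover the iov, but
	 * capped at one page worth of pointers (PAGE_CACHE_SIZE /
	 * sizeof(struct page *) entries).
	 */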
	iov_iter_init(&i, iov, nr_segs, count, num_written);
	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
		     (sizeof(struct page *)));
	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		ret = -ENOMEM;
		goto out;
	}

	/* generic_write_checks can change our pos */
	start_pos = pos;

	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

	/*
	 * there are lots of better ways to do this, but this code
	 * makes sure the first and last page in the file range are
	 * up to date and ready for cow
	 */
	if ((pos & (PAGE_CACHE_SIZE - 1))) {
		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
		if (!PageUptodate(pinned[0])) {
			ret = btrfs_readpage(NULL, pinned[0]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[0]);
		} else {
			unlock_page(pinned[0]);
		}
	}
	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
		if (!PageUptodate(pinned[1])) {
			ret = btrfs_readpage(NULL, pinned[1]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[1]);
		} else {
			unlock_page(pinned[1]);
		}
	}

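	/*
	 * Main buffered loop: reserve delalloc space, lock and prepare a
	 * batch of pages, copy from the iovec, then mark the pages dirty
	 * and release them, one nrptrs-sized chunk at a time.
	 */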
	while (iov_iter_count(&i) > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(iov_iter_count(&i),
					 nrptrs * (size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		memset(pages, 0, sizeof(struct page *) * nrptrs);

		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
		if (ret)
			goto out;

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_delalloc_release_space(inode, write_bytes);
			goto out;
		}

		ret = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, &i);
		if (ret == 0) {
			dirty_and_release_pages(NULL, root, file, pages,
						num_pages, pos, write_bytes);
		}

		btrfs_drop_pages(pages, num_pages);
		if (ret) {
			btrfs_delalloc_release_space(inode, write_bytes);
			goto out;
		}

		if (will_write) {
			filemap_fdatawrite_range(inode->i_mapping, pos,
						 pos + write_bytes - 1);
		} else {
			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
							   num_pages);
			if (num_pages <
			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
				btrfs_btree_balance_dirty(root, 1);
			btrfs_throttle(root);
		}

		pos += write_bytes;
		num_written += write_bytes;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (ret)
		err = ret;

	kfree(pages);
	if (pinned[0])
		page_cache_release(pinned[0]);
	if (pinned[1])
		page_cache_release(pinned[1]);
	*ppos = pos;

	/*
	 * we want to make sure fsync finds this change
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1,
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 0);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			if (ret == 0) {
				ret = btrfs_sync_log(trans, root);
				if (ret == 0)
					btrfs_end_transaction(trans, root);
				else
					btrfs_commit_transaction(trans, root);
			} else if (ret != BTRFS_NO_LOG_SYNC) {
				btrfs_commit_transaction(trans, root);
			} else {
				btrfs_end_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT && buffered) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application was using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	/* we wait first, since the writeback may change the inode */
	root->log_batch++;
	/* the VFS called filemap_fdatawrite for us */
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans)
		goto out;

	/*
	 * if the last transaction that changed this file was before
	 * the current transaction, we can bail out now without any
	 * syncing
	 */
	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

	/*
	 * ok we haven't committed the transaction yet, let's do a commit
	 */
	if (file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 0);
	if (IS_ERR(trans)) {
		ret = PTR_ERR(trans);
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0)
		goto out;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&dentry->d_inode->i_mutex);

	if (ret != BTRFS_NO_LOG_SYNC) {
		if (ret > 0) {
			ret = btrfs_commit_transaction(trans, root);
		} else {
			ret = btrfs_sync_log(trans, root);
			if (ret == 0)
				ret = btrfs_end_transaction(trans, root);
			else
				ret = btrfs_commit_transaction(trans, root);
		}
	} else {
		ret = btrfs_end_transaction(trans, root);
	}
	mutex_lock(&dentry->d_inode->i_mutex);
out:
	return ret > 0 ? -EIO : ret;
}

static const struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	struct address_space *mapping = filp->f_mapping;

	if (!mapping->a_ops->readpage)
		return -ENOEXEC;

	file_accessed(filp);
	vma->vm_ops = &btrfs_file_vm_ops;
	vma->vm_flags |= VM_CAN_NONLINEAR;

	return 0;
}

const struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.aio_write	= btrfs_file_aio_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};