xref: /openbmc/linux/fs/btrfs/file.c (revision 2c59b0b7)
/*
 * Copyright (C) 2007 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/highmem.h>
#include <linux/time.h>
#include <linux/init.h>
#include <linux/string.h>
#include <linux/backing-dev.h>
#include <linux/mpage.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/statfs.h>
#include <linux/compat.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "ioctl.h"
#include "print-tree.h"
#include "tree-log.h"
#include "locking.h"
#include "compat.h"

/* simple helper to fault in pages and copy.  This should go away
 * and be replaced with calls into generic code.
 */
static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
					 int write_bytes,
					 struct page **prepared_pages,
					 const char __user *buf)
{
	long page_fault = 0;
	int i;
	int offset = pos & (PAGE_CACHE_SIZE - 1);

	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
		size_t count = min_t(size_t,
				     PAGE_CACHE_SIZE - offset, write_bytes);
		struct page *page = prepared_pages[i];
		fault_in_pages_readable(buf, count);

		/* Copy data from userspace to the current page */
		kmap(page);
		page_fault = __copy_from_user(page_address(page) + offset,
					      buf, count);
		/* Flush processor's dcache for this page */
		flush_dcache_page(page);
		kunmap(page);
		buf += count;
		write_bytes -= count;

		if (page_fault)
			break;
	}
	return page_fault ? -EFAULT : 0;
}
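
/*
 * Worked example (hypothetical values, for illustration only): with 4K
 * pages, a write of 6000 bytes at pos 5096 gives
 * offset = 5096 & 4095 = 1000.  The first pass copies
 * min(4096 - 1000, 6000) = 3096 bytes into prepared_pages[0] at offset
 * 1000; the second pass copies the remaining 2904 bytes into
 * prepared_pages[1] at offset 0, offset having been reset to 0 by the
 * loop increment.
 */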

/*
 * unlocks pages after btrfs_file_write is done with them
 */
static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
{
	size_t i;
	for (i = 0; i < num_pages; i++) {
		if (!pages[i])
			break;
		/* PageChecked is some magic around finding pages that
		 * have been modified without going through
		 * btrfs_set_page_dirty; clear it here.
		 */
		ClearPageChecked(pages[i]);
		unlock_page(pages[i]);
		mark_page_accessed(pages[i]);
		page_cache_release(pages[i]);
	}
}
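
/*
 * Reference-counting note: each page released here still holds the
 * reference taken by grab_cache_page() in prepare_pages() below, so the
 * page_cache_release() above pairs with that grab rather than with any
 * reference owned by the caller.
 */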

/*
 * After copy_from_user, pages need to be dirtied and we need to make
 * sure holes are created between the current EOF and the start of
 * any next extents (if required).
 *
 * This also makes the decision about creating an inline extent vs
 * doing real data extents, marking pages dirty and delalloc as required.
 */
static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
	int i;
	struct inode *inode = fdentry(file)->d_inode;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	u64 hint_byte;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;

	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
	trans = btrfs_join_transaction(root, 1);
	if (!trans) {
		err = -ENOMEM;
		goto out_unlock;
	}
	btrfs_set_trans_block_group(trans, inode);
	hint_byte = 0;

	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);

	/* check for reserved extents on each page; we don't want
	 * to reset the delalloc bit on things that already have
	 * extents reserved.
	 */
	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		/* we've only changed i_size in RAM, and we haven't updated
		 * the disk i_size.  There is no need to log the inode
		 * at this time.
		 */
	}
	err = btrfs_end_transaction(trans, root);
out_unlock:
	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
	return err;
}
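
/*
 * Alignment example (hypothetical numbers, assuming a 4K sectorsize):
 * a write of 100 bytes at pos 4200 gives start_pos = 4096,
 * num_bytes = (100 + 4200 - 4096 + 4095) & ~4095 = 4096 and
 * end_of_last_block = 8191, so the whole sector containing the write
 * is locked and marked delalloc, not just the 100 dirtied bytes.
 */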

/*
 * this drops all the extents in the cache that intersect the range
 * [start, end].  Existing extents are split as required.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while (1) {
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

		spin_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			spin_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			spin_unlock(&em_tree->lock);
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->orig_start = em->orig_start;
			split->block_start = em->block_start;

			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
				split->orig_start = em->orig_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
				split->orig_start = split->start;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		spin_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree */
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}
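
/*
 * Split illustration (hypothetical extent, for reference only):
 * dropping the middle of a cached mapping that covers [4K, 16K), say
 * the range [8K, 12K), leaves two survivors; the head piece is built
 * in 'split' and the tail in 'split2':
 *
 *    [4K ...... 8K)    dropped    [12K ...... 16K)
 *      head ('split')               tail ('split2')
 *
 * For an uncompressed extent the tail's block_start is advanced by
 * diff = start + len - em->start; a compressed extent keeps the
 * original block_start/block_len because the on-disk compressed bytes
 * cannot be carved up.
 */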

/*
 * this is very complex, but the basic idea is to drop all extents
 * in the range start - end.  hint_byte is filled in with a block number
 * that would be a good hint to the block allocator for this file.
 *
 * If an extent intersects the range but is not entirely inside the range
 * it is either truncated or split.  Anything entirely inside the range
 * is deleted from the tree.
 *
 * inline_limit is used to tell this code which offsets in the file to keep
 * if they contain inline extents.
 */
noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode,
		       u64 start, u64 end, u64 locked_end,
		       u64 inline_limit, u64 *hint_byte)
{
	u64 extent_end = 0;
	u64 search_start = start;
	u64 ram_bytes = 0;
	u64 disk_bytenr = 0;
	u64 orig_locked_end = locked_end;
	u8 compression;
	u8 encryption;
	u16 other_encoding = 0;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_file_extent_item old;
	int keep;
	int slot;
	int bookend;
	int found_type = 0;
	int found_extent;
	int found_inline;
	int recow;
	int ret;

	inline_limit = 0;
	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while (1) {
		recow = 0;
		btrfs_release_path(root, path);
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			if (path->slots[0] == 0) {
				ret = 0;
				goto out;
			}
			path->slots[0]--;
		}
next_slot:
		keep = 0;
		bookend = 0;
		found_extent = 0;
		found_inline = 0;
		compression = 0;
		encryption = 0;
		extent = NULL;
		leaf = path->nodes[0];
		slot = path->slots[0];
		ret = 0;
		btrfs_item_key_to_cpu(leaf, &key, slot);
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
		    key.offset >= end) {
			goto out;
		}
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != inode->i_ino) {
			goto out;
		}
		if (recow) {
			search_start = max(key.offset, start);
			continue;
		}
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
			found_type = btrfs_file_extent_type(leaf, extent);
			compression = btrfs_file_extent_compression(leaf,
								    extent);
			encryption = btrfs_file_extent_encryption(leaf,
								  extent);
			other_encoding = btrfs_file_extent_other_encoding(leaf,
								  extent);
			if (found_type == BTRFS_FILE_EXTENT_REG ||
			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				extent_end =
				     btrfs_file_extent_disk_bytenr(leaf,
								   extent);
				if (extent_end)
					*hint_byte = extent_end;

				extent_end = key.offset +
				     btrfs_file_extent_num_bytes(leaf, extent);
				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
								extent);
				found_extent = 1;
			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				found_inline = 1;
				extent_end = key.offset +
				     btrfs_file_extent_inline_len(leaf, extent);
			}
		} else {
			extent_end = search_start;
		}

		/* we found nothing we can drop */
		if ((!found_extent && !found_inline) ||
		    search_start >= extent_end) {
			int nextret;
			u32 nritems;
			nritems = btrfs_header_nritems(leaf);
			if (slot >= nritems - 1) {
				nextret = btrfs_next_leaf(root, path);
				if (nextret)
					goto out;
				recow = 1;
			} else {
				path->slots[0]++;
			}
			goto next_slot;
		}

		if (end <= extent_end && start >= key.offset && found_inline)
			*hint_byte = EXTENT_MAP_INLINE;

		if (found_extent) {
			read_extent_buffer(leaf, &old, (unsigned long)extent,
					   sizeof(old));
		}

		if (end < extent_end && end >= key.offset) {
			bookend = 1;
			if (found_inline && start <= key.offset)
				keep = 1;
		}

		if (bookend && found_extent) {
			if (locked_end < extent_end) {
				ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
						locked_end, extent_end - 1,
						GFP_NOFS);
				if (!ret) {
					btrfs_release_path(root, path);
					lock_extent(&BTRFS_I(inode)->io_tree,
						locked_end, extent_end - 1,
						GFP_NOFS);
					locked_end = extent_end;
					continue;
				}
				locked_end = extent_end;
			}
			disk_bytenr = le64_to_cpu(old.disk_bytenr);
			if (disk_bytenr != 0) {
				ret = btrfs_inc_extent_ref(trans, root,
					   disk_bytenr,
					   le64_to_cpu(old.disk_num_bytes), 0,
					   root->root_key.objectid,
					   key.objectid, key.offset -
					   le64_to_cpu(old.offset));
				BUG_ON(ret);
			}
		}

		if (found_inline) {
			u64 mask = root->sectorsize - 1;
			search_start = (extent_end + mask) & ~mask;
		} else
			search_start = extent_end;

		/* truncate existing extent */
		if (start > key.offset) {
			u64 new_num;
			u64 old_num;
			keep = 1;
			WARN_ON(start & (root->sectorsize - 1));
			if (found_extent) {
				new_num = start - key.offset;
				old_num = btrfs_file_extent_num_bytes(leaf,
								      extent);
				*hint_byte =
					btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				if (btrfs_file_extent_disk_bytenr(leaf,
								  extent)) {
					inode_sub_bytes(inode, old_num -
							new_num);
				}
				btrfs_set_file_extent_num_bytes(leaf,
							extent, new_num);
				btrfs_mark_buffer_dirty(leaf);
			} else if (key.offset < inline_limit &&
				   (end > extent_end) &&
				   (inline_limit < extent_end)) {
				u32 new_size;
				new_size = btrfs_file_extent_calc_inline_size(
						   inline_limit - key.offset);
				inode_sub_bytes(inode, extent_end -
						inline_limit);
				btrfs_set_file_extent_ram_bytes(leaf, extent,
							new_size);
				if (!compression && !encryption) {
					btrfs_truncate_item(trans, root, path,
							    new_size, 1);
				}
			}
		}
		/* delete the entire extent */
		if (!keep) {
			if (found_inline)
				inode_sub_bytes(inode, extent_end -
						key.offset);
			ret = btrfs_del_item(trans, root, path);
			/* TODO update progress marker and return */
			BUG_ON(ret);
			extent = NULL;
			btrfs_release_path(root, path);
			/* the extent will be freed later */
		}
		if (bookend && found_inline && start <= key.offset) {
			u32 new_size;
			new_size = btrfs_file_extent_calc_inline_size(
						   extent_end - end);
			inode_sub_bytes(inode, end - key.offset);
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							new_size);
			if (!compression && !encryption)
				ret = btrfs_truncate_item(trans, root, path,
							  new_size, 0);
			BUG_ON(ret);
		}
		/* create bookend, splitting the extent in two */
		if (bookend && found_extent) {
			struct btrfs_key ins;
			ins.objectid = inode->i_ino;
			ins.offset = end;
			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);

			btrfs_release_path(root, path);
			path->leave_spinning = 1;
			ret = btrfs_insert_empty_item(trans, root, path, &ins,
						      sizeof(*extent));
			BUG_ON(ret);

			leaf = path->nodes[0];
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_file_extent_item);
			write_extent_buffer(leaf, &old,
					    (unsigned long)extent, sizeof(old));

			btrfs_set_file_extent_compression(leaf, extent,
							  compression);
			btrfs_set_file_extent_encryption(leaf, extent,
							 encryption);
			btrfs_set_file_extent_other_encoding(leaf, extent,
							     other_encoding);
			btrfs_set_file_extent_offset(leaf, extent,
				    le64_to_cpu(old.offset) + end - key.offset);
			WARN_ON(le64_to_cpu(old.num_bytes) <
				(extent_end - end));
			btrfs_set_file_extent_num_bytes(leaf, extent,
							extent_end - end);

			/*
			 * set the ram bytes to the size of the full extent
			 * before splitting.  This is a worst case flag,
			 * but it's the best we can do because we don't know
			 * how splitting affects compression
			 */
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							ram_bytes);
			btrfs_set_file_extent_type(leaf, extent, found_type);

			btrfs_unlock_up_safe(path, 1);
			btrfs_mark_buffer_dirty(path->nodes[0]);
			btrfs_set_lock_blocking(path->nodes[0]);

			path->leave_spinning = 0;
			btrfs_release_path(root, path);
			if (disk_bytenr != 0)
				inode_add_bytes(inode, extent_end - end);
		}

		if (found_extent && !keep) {
			u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr);

			if (old_disk_bytenr != 0) {
				inode_sub_bytes(inode,
						le64_to_cpu(old.num_bytes));
				ret = btrfs_free_extent(trans, root,
						old_disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						0, root->root_key.objectid,
						key.objectid, key.offset -
						le64_to_cpu(old.offset));
				BUG_ON(ret);
				*hint_byte = old_disk_bytenr;
			}
		}

		if (search_start >= end) {
			ret = 0;
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	if (locked_end > orig_locked_end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end,
			      locked_end - 1, GFP_NOFS);
	}
	return ret;
}
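
/*
 * Bookend example (hypothetical layout, for illustration only):
 * dropping [8K, 12K) over a single on-disk extent mapping the file
 * range [4K, 16K) truncates the original item to cover [4K, 8K) and
 * inserts a "bookend" item at offset 12K pointing into the same disk
 * extent, with its offset field advanced by (end - key.offset).  Both
 * items then share one disk extent, which is why the reference count
 * is bumped with btrfs_inc_extent_ref() before the split and only
 * dropped with btrfs_free_extent() when an item goes away entirely.
 */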

static int extent_mergeable(struct extent_buffer *leaf, int slot,
			    u64 objectid, u64 bytenr, u64 *start, u64 *end)
{
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 extent_end;

	if (slot < 0 || slot >= btrfs_header_nritems(leaf))
		return 0;

	btrfs_item_key_to_cpu(leaf, &key, slot);
	if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
		return 0;

	fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
	    btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
	    btrfs_file_extent_compression(leaf, fi) ||
	    btrfs_file_extent_encryption(leaf, fi) ||
	    btrfs_file_extent_other_encoding(leaf, fi))
		return 0;

	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	if ((*start && *start != key.offset) || (*end && *end != extent_end))
		return 0;

	*start = key.offset;
	*end = extent_end;
	return 1;
}
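
/*
 * Usage note: *start and *end act as in/out constraints.  A caller
 * that passes other_start = end and other_end = 0 (as
 * btrfs_mark_extent_written() does below) only accepts a neighbour
 * whose key.offset equals end, i.e. an extent beginning exactly where
 * the written range stops; on success the neighbour's real bounds are
 * handed back through the same pointers.
 */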

/*
 * Mark extent in the range start - end as written.
 *
 * This changes extent type from 'pre-allocated' to 'regular'. If only
 * part of extent is marked as written, the extent will be split into
 * two or three.
 */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
			      struct btrfs_root *root,
			      struct inode *inode, u64 start, u64 end)
{
	struct extent_buffer *leaf;
	struct btrfs_path *path;
	struct btrfs_file_extent_item *fi;
	struct btrfs_key key;
	u64 bytenr;
	u64 num_bytes;
	u64 extent_end;
	u64 orig_offset;
	u64 other_start;
	u64 other_end;
	u64 split = start;
	u64 locked_end = end;
	int extent_type;
	int split_end = 1;
	int ret;

	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	BUG_ON(!path);
again:
	key.objectid = inode->i_ino;
	key.type = BTRFS_EXTENT_DATA_KEY;
	if (split == start)
		key.offset = split;
	else
		key.offset = split - 1;

	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
	if (ret > 0 && path->slots[0] > 0)
		path->slots[0]--;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
	BUG_ON(key.objectid != inode->i_ino ||
	       key.type != BTRFS_EXTENT_DATA_KEY);
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	extent_type = btrfs_file_extent_type(leaf, fi);
	BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
	extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
	BUG_ON(key.offset > start || extent_end < end);

	bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
	num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
	orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);

	if (key.offset == start)
		split = end;

	if (key.offset == start && extent_end == end) {
		int del_nr = 0;
		int del_slot = 0;
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			extent_end = other_end;
			del_slot = path->slots[0] + 1;
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
						inode->i_ino, orig_offset);
			BUG_ON(ret);
		}
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			key.offset = other_start;
			del_slot = path->slots[0];
			del_nr++;
			ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
						0, root->root_key.objectid,
						inode->i_ino, orig_offset);
			BUG_ON(ret);
		}
		split_end = 0;
		if (del_nr == 0) {
			btrfs_set_file_extent_type(leaf, fi,
						   BTRFS_FILE_EXTENT_REG);
			goto done;
		}

		fi = btrfs_item_ptr(leaf, del_slot - 1,
				    struct btrfs_file_extent_item);
		btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
		btrfs_set_file_extent_num_bytes(leaf, fi,
						extent_end - key.offset);
		btrfs_mark_buffer_dirty(leaf);

		ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
		BUG_ON(ret);
		goto release;
	} else if (split == start) {
		if (locked_end < extent_end) {
			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
			if (!ret) {
				btrfs_release_path(root, path);
				lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
				locked_end = extent_end;
				goto again;
			}
			locked_end = extent_end;
		}
		btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
	} else {
		BUG_ON(key.offset != start);
		key.offset = split;
		btrfs_set_file_extent_offset(leaf, fi, key.offset -
					     orig_offset);
		btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
		btrfs_set_item_key_safe(trans, root, path, &key);
		extent_end = split;
	}

	if (extent_end == end) {
		split_end = 0;
		extent_type = BTRFS_FILE_EXTENT_REG;
	}
	if (extent_end == end && split == start) {
		other_start = end;
		other_end = 0;
		if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			path->slots[0]++;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			key.offset = split;
			btrfs_set_item_key_safe(trans, root, path, &key);
			btrfs_set_file_extent_offset(leaf, fi, key.offset -
						     orig_offset);
			btrfs_set_file_extent_num_bytes(leaf, fi,
							other_end - split);
			goto done;
		}
	}
	if (extent_end == end && split == end) {
		other_start = 0;
		other_end = start;
		if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
				     bytenr, &other_start, &other_end)) {
			path->slots[0]--;
			fi = btrfs_item_ptr(leaf, path->slots[0],
					    struct btrfs_file_extent_item);
			btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
							other_start);
			goto done;
		}
	}

	btrfs_mark_buffer_dirty(leaf);

	ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
				   root->root_key.objectid,
				   inode->i_ino, orig_offset);
	BUG_ON(ret);
	btrfs_release_path(root, path);

	key.offset = start;
	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
	BUG_ON(ret);

	leaf = path->nodes[0];
	fi = btrfs_item_ptr(leaf, path->slots[0],
			    struct btrfs_file_extent_item);
	btrfs_set_file_extent_generation(leaf, fi, trans->transid);
	btrfs_set_file_extent_type(leaf, fi, extent_type);
	btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
	btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset);
	btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
	btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
	btrfs_set_file_extent_compression(leaf, fi, 0);
	btrfs_set_file_extent_encryption(leaf, fi, 0);
	btrfs_set_file_extent_other_encoding(leaf, fi, 0);
done:
	btrfs_mark_buffer_dirty(leaf);

release:
	btrfs_release_path(root, path);
	if (split_end && split == start) {
		split = end;
		goto again;
	}
	if (locked_end > end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
			      GFP_NOFS);
	}
	btrfs_free_path(path);
	return 0;
}
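
/*
 * Two-pass sketch (hypothetical ranges, for illustration only):
 * marking [8K, 12K) written inside a preallocated extent covering
 * [4K, 16K) first runs with split == start: the original item is
 * shrunk to [4K, 8K) and a new preallocated item is inserted for
 * [8K, 16K).  The "goto again" with split = end then moves that new
 * item to [12K, 16K) and inserts a regular (written) item for
 * [8K, 12K).  When the written range abuts a mergeable neighbour,
 * extent_mergeable() lets the code grow the neighbour instead of
 * inserting another item.
 */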

/*
 * this gets pages into the page cache and locks them down; it also properly
 * waits for data=ordered extents to finish before allowing the pages to be
 * modified.
 */
static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 struct page **pages, size_t num_pages,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
{
	int i;
	unsigned long index = pos >> PAGE_CACHE_SHIFT;
	struct inode *inode = fdentry(file)->d_inode;
	int err = 0;
	u64 start_pos;
	u64 last_pos;

	start_pos = pos & ~((u64)root->sectorsize - 1);
	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

	if (start_pos > inode->i_size) {
		err = btrfs_cont_expand(inode, start_pos);
		if (err)
			return err;
	}

	memset(pages, 0, num_pages * sizeof(struct page *));
again:
	for (i = 0; i < num_pages; i++) {
		pages[i] = grab_cache_page(inode->i_mapping, index + i);
		if (!pages[i]) {
			err = -ENOMEM;
			BUG_ON(1);
		}
		wait_on_page_writeback(pages[i]);
	}
	if (start_pos < inode->i_size) {
		struct btrfs_ordered_extent *ordered;
		lock_extent(&BTRFS_I(inode)->io_tree,
			    start_pos, last_pos - 1, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent(&BTRFS_I(inode)->io_tree,
				      start_pos, last_pos - 1, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
			}
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}
		if (ordered)
			btrfs_put_ordered_extent(ordered);

		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
				  GFP_NOFS);
		unlock_extent(&BTRFS_I(inode)->io_tree,
			      start_pos, last_pos - 1, GFP_NOFS);
	}
	for (i = 0; i < num_pages; i++) {
		clear_page_dirty_for_io(pages[i]);
		set_page_extent_mapped(pages[i]);
		WARN_ON(!PageLocked(pages[i]));
	}
	return 0;
}
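
/*
 * Retry illustration (informal): if the pages grabbed above overlap a
 * pending ordered extent (say an earlier write to the same range is
 * still being checksummed and written back), the function drops the
 * page locks, waits for the ordered range to finish, and starts over,
 * so a copy never lands on pages whose ordered I/O is still in flight.
 */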

static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
				size_t count, loff_t *ppos)
{
	loff_t pos;
	loff_t start_pos;
	ssize_t num_written = 0;
	ssize_t err = 0;
	int ret = 0;
	struct inode *inode = fdentry(file)->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct page **pages = NULL;
	int nrptrs;
	struct page *pinned[2];
	unsigned long first_index;
	unsigned long last_index;
	int will_write;

	will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
		      (file->f_flags & O_DIRECT));

	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
	pinned[0] = NULL;
	pinned[1] = NULL;

	pos = *ppos;
	start_pos = pos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
	current->backing_dev_info = inode->i_mapping->backing_dev_info;
	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out_nolock;
	if (count == 0)
		goto out_nolock;

	err = file_remove_suid(file);
	if (err)
		goto out_nolock;
	file_update_time(file);

	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
	if (!pages) {
		err = -ENOMEM;
		goto out_nolock;
	}

	mutex_lock(&inode->i_mutex);
	BTRFS_I(inode)->sequence++;
	first_index = pos >> PAGE_CACHE_SHIFT;
	last_index = (pos + count) >> PAGE_CACHE_SHIFT;

	/*
	 * there are lots of better ways to do this, but this code
	 * makes sure the first and last page in the file range are
	 * up to date and ready for COW
	 */
	if ((pos & (PAGE_CACHE_SIZE - 1))) {
		pinned[0] = grab_cache_page(inode->i_mapping, first_index);
		if (!PageUptodate(pinned[0])) {
			ret = btrfs_readpage(NULL, pinned[0]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[0]);
		} else {
			unlock_page(pinned[0]);
		}
	}
	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
		if (!PageUptodate(pinned[1])) {
			ret = btrfs_readpage(NULL, pinned[1]);
			BUG_ON(ret);
			wait_on_page_locked(pinned[1]);
		} else {
			unlock_page(pinned[1]);
		}
	}

	while (count > 0) {
		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
		size_t write_bytes = min(count, nrptrs *
					(size_t)PAGE_CACHE_SIZE -
					 offset);
		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
					PAGE_CACHE_SHIFT;

		WARN_ON(num_pages > nrptrs);
		memset(pages, 0, sizeof(struct page *) * nrptrs);

		ret = btrfs_check_data_free_space(root, inode, write_bytes);
		if (ret)
			goto out;

		ret = prepare_pages(root, file, pages, num_pages,
				    pos, first_index, last_index,
				    write_bytes);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			goto out;
		}

		ret = btrfs_copy_from_user(pos, num_pages,
					   write_bytes, pages, buf);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			btrfs_drop_pages(pages, num_pages);
			goto out;
		}

		ret = dirty_and_release_pages(NULL, root, file, pages,
					      num_pages, pos, write_bytes);
		btrfs_drop_pages(pages, num_pages);
		if (ret) {
			btrfs_free_reserved_data_space(root, inode,
						       write_bytes);
			goto out;
		}

		if (will_write) {
			btrfs_fdatawrite_range(inode->i_mapping, pos,
					       pos + write_bytes - 1,
					       WB_SYNC_ALL);
		} else {
			balance_dirty_pages_ratelimited_nr(inode->i_mapping,
							   num_pages);
			if (num_pages <
			    (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
				btrfs_btree_balance_dirty(root, 1);
			btrfs_throttle(root);
		}

		buf += write_bytes;
		count -= write_bytes;
		pos += write_bytes;
		num_written += write_bytes;

		cond_resched();
	}
out:
	mutex_unlock(&inode->i_mutex);
	if (ret)
		err = ret;

out_nolock:
	kfree(pages);
	if (pinned[0])
		page_cache_release(pinned[0]);
	if (pinned[1])
		page_cache_release(pinned[1]);
	*ppos = pos;

	/*
	 * we want to make sure fsync finds this change,
	 * but we haven't joined a transaction running right now.
	 *
	 * Later on, someone is sure to update the inode and get the
	 * real transid recorded.
	 *
	 * We set last_trans now to the fs_info generation + 1;
	 * this will either be one more than the running transaction
	 * or the generation used for the next transaction if there isn't
	 * one running right now.
	 */
	BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;

	if (num_written > 0 && will_write) {
		struct btrfs_trans_handle *trans;

		err = btrfs_wait_ordered_range(inode, start_pos, num_written);
		if (err)
			num_written = err;

		if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
			trans = btrfs_start_transaction(root, 1);
			ret = btrfs_log_dentry_safe(trans, root,
						    file->f_dentry);
			if (ret == 0) {
				ret = btrfs_sync_log(trans, root);
				if (ret == 0)
					btrfs_end_transaction(trans, root);
				else
					btrfs_commit_transaction(trans, root);
			} else {
				btrfs_commit_transaction(trans, root);
			}
		}
		if (file->f_flags & O_DIRECT) {
			invalidate_mapping_pages(inode->i_mapping,
			      start_pos >> PAGE_CACHE_SHIFT,
			     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
		}
	}
	current->backing_dev_info = NULL;
	return num_written ? num_written : err;
}
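
/*
 * Sizing example (hypothetical values, assuming 4K pages and 8-byte
 * pointers): for a 1MB write, nrptrs = min(1MB / 4K, 4096 / 8) =
 * min(256, 512) = 256, so the pages array is a 2K allocation and a
 * page-aligned write is copied in a single 1MB pass through the loop
 * above.
 */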

int btrfs_release_file(struct inode *inode, struct file *filp)
{
	/*
	 * ordered_data_close is set by setattr when we are about to truncate
	 * a file from a non-zero size to a zero size.  This tries to
	 * flush down new bytes that may have been written if the
	 * application were using truncate to replace a file in place.
	 */
	if (BTRFS_I(inode)->ordered_data_close) {
		BTRFS_I(inode)->ordered_data_close = 0;
		btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
		if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
			filemap_flush(inode->i_mapping);
	}
	if (filp->private_data)
		btrfs_ioctl_trans_end(filp);
	return 0;
}

/*
 * fsync call for both files and directories.  This logs the inode into
 * the tree log instead of forcing full commits whenever possible.
 *
 * It needs to call filemap_fdatawait so that all ordered extent updates
 * in the metadata btree are up to date for copying to the log.
 *
 * It drops the inode mutex before doing the tree log commit.  This is an
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
{
	struct inode *inode = dentry->d_inode;
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret = 0;
	struct btrfs_trans_handle *trans;

	/*
	 * check the transaction that last modified this inode
	 * and see if it's already been committed
	 */
	if (!BTRFS_I(inode)->last_trans)
		goto out;

	mutex_lock(&root->fs_info->trans_mutex);
	if (BTRFS_I(inode)->last_trans <=
	    root->fs_info->last_trans_committed) {
		BTRFS_I(inode)->last_trans = 0;
		mutex_unlock(&root->fs_info->trans_mutex);
		goto out;
	}
	mutex_unlock(&root->fs_info->trans_mutex);

	root->log_batch++;
	filemap_fdatawrite(inode->i_mapping);
	btrfs_wait_ordered_range(inode, 0, (u64)-1);
	root->log_batch++;

	if (datasync && !(inode->i_state & I_DIRTY_PAGES))
		goto out;
	/*
	 * ok, we haven't committed the transaction yet; let's do a commit
	 */
	if (file && file->private_data)
		btrfs_ioctl_trans_end(file);

	trans = btrfs_start_transaction(root, 1);
	if (!trans) {
		ret = -ENOMEM;
		goto out;
	}

	ret = btrfs_log_dentry_safe(trans, root, dentry);
	if (ret < 0)
		goto out;

	/* we've logged all the items and now have a consistent
	 * version of the file in the log.  It is possible that
	 * someone will come in and modify the file, but that's
	 * fine because the log is consistent on disk, and we
	 * have references to all of the file's extents.
	 *
	 * It is possible that someone will come in and log the
	 * file again, but that will end up using the synchronization
	 * inside btrfs_sync_log to keep things safe.
	 */
	mutex_unlock(&dentry->d_inode->i_mutex);

	if (ret > 0) {
		ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_sync_log(trans, root);
		if (ret == 0)
			ret = btrfs_end_transaction(trans, root);
		else
			ret = btrfs_commit_transaction(trans, root);
	}
	mutex_lock(&dentry->d_inode->i_mutex);
out:
	return ret > 0 ? -EIO : ret;
}
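
/*
 * Fast-path example (informal): if this inode was last changed in
 * transaction 41 and last_trans_committed is already 42, the checks
 * above return without starting a tree-log or transaction commit,
 * because the commit of transaction 41 already made the file's
 * metadata durable.
 */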

static struct vm_operations_struct btrfs_file_vm_ops = {
	.fault		= filemap_fault,
	.page_mkwrite	= btrfs_page_mkwrite,
};

static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
{
	vma->vm_ops = &btrfs_file_vm_ops;
	file_accessed(filp);
	return 0;
}

struct file_operations btrfs_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.splice_read	= generic_file_splice_read,
	.write		= btrfs_file_write,
	.mmap		= btrfs_file_mmap,
	.open		= generic_file_open,
	.release	= btrfs_release_file,
	.fsync		= btrfs_sync_file,
	.unlocked_ioctl	= btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= btrfs_ioctl,
#endif
};