xref: /openbmc/linux/fs/btrfs/reflink.c (revision a61e1e0d)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/iversion.h>
4 #include "ctree.h"
5 #include "reflink.h"
6 #include "transaction.h"
7 
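/*
 * Maximum length handled per deduplication pass: btrfs_extent_same() below
 * splits larger requests into chunks of this size plus a final tail, so a
 * single pass never has to lock an unbounded range.
 */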
8 #define BTRFS_MAX_DEDUPE_LEN	SZ_16M
9 
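/*
 * Finish the transaction used for one cloning step: bump the inode's
 * iversion, optionally update mtime/ctime, extend i_size up to the end of
 * the cloned range (capped at destoff + olen, since extents are rounded up
 * to the block size at eof but the file size must not be) and write back
 * the updated inode item before ending the transaction.
 */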
10 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
11 				     struct inode *inode,
12 				     u64 endoff,
13 				     const u64 destoff,
14 				     const u64 olen,
15 				     int no_time_update)
16 {
17 	struct btrfs_root *root = BTRFS_I(inode)->root;
18 	int ret;
19 
20 	inode_inc_iversion(inode);
21 	if (!no_time_update)
22 		inode->i_mtime = inode->i_ctime = current_time(inode);
23 	/*
24 	 * We round up to the block size at eof when determining which
25 	 * extents to clone above, but shouldn't round up the file size.
26 	 */
27 	if (endoff > destoff + olen)
28 		endoff = destoff + olen;
29 	if (endoff > inode->i_size) {
30 		i_size_write(inode, endoff);
31 		btrfs_inode_safe_disk_i_size_write(inode, 0);
32 	}
33 
34 	ret = btrfs_update_inode(trans, root, inode);
35 	if (ret) {
36 		btrfs_abort_transaction(trans, ret);
37 		btrfs_end_transaction(trans);
38 		goto out;
39 	}
40 	ret = btrfs_end_transaction(trans);
41 out:
42 	return ret;
43 }
44 
45 /*
46  * Make sure we do not end up inserting an inline extent into a file that
47  * already has other (non-inline) extents. If a file has an inline extent it
48  * cannot have any other extents and the (single) inline extent must start at
49  * file offset 0. Failing to respect these rules will lead to file corruption,
50  * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc.
51  *
52  * We can have extents that have been already written to disk or we can have
53  * dirty ranges still in delalloc, in which case the extent maps and items are
54  * created only when we run delalloc, and the delalloc ranges might fall outside
55  * the range we are currently locking in the inode's io tree. So we check the
56  * inode's i_size because of that (i_size updates are done while holding the
57  * i_mutex, which we are holding here).
58  * We also check to see if the inode has a size not greater than "datal" but has
59  * extents beyond it, due to a fallocate with FALLOC_FL_KEEP_SIZE (and we are
60  * protected against such concurrent fallocate calls by the i_mutex).
61  *
62  * If the file has no extents but a size greater than datal, do not allow the
63  * copy because we would need to turn the inline extent into a non-inline one (even
64  * with NO_HOLES enabled). If we find our destination inode only has one inline
65  * extent, just overwrite it with the source inline extent if its size is less
66  * than the source extent's size, or we could copy the source inline extent's
67  * data into the destination inode's inline extent if the latter is greater than
68  * the former.
69  */
70 static int clone_copy_inline_extent(struct inode *dst,
71 				    struct btrfs_trans_handle *trans,
72 				    struct btrfs_path *path,
73 				    struct btrfs_key *new_key,
74 				    const u64 drop_start,
75 				    const u64 datal,
76 				    const u64 size,
77 				    const char *inline_data)
78 {
79 	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
80 	struct btrfs_root *root = BTRFS_I(dst)->root;
81 	const u64 aligned_end = ALIGN(new_key->offset + datal,
82 				      fs_info->sectorsize);
83 	int ret;
84 	struct btrfs_key key;
85 
86 	if (new_key->offset > 0)
87 		return -EOPNOTSUPP;
88 
89 	key.objectid = btrfs_ino(BTRFS_I(dst));
90 	key.type = BTRFS_EXTENT_DATA_KEY;
91 	key.offset = 0;
92 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
93 	if (ret < 0) {
94 		return ret;
95 	} else if (ret > 0) {
96 		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
97 			ret = btrfs_next_leaf(root, path);
98 			if (ret < 0)
99 				return ret;
100 			else if (ret > 0)
101 				goto copy_inline_extent;
102 		}
103 		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
104 		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
105 		    key.type == BTRFS_EXTENT_DATA_KEY) {
106 			ASSERT(key.offset > 0);
107 			return -EOPNOTSUPP;
108 		}
109 	} else if (i_size_read(dst) <= datal) {
110 		struct btrfs_file_extent_item *ei;
111 		u64 ext_len;
112 
113 		/*
114 		 * If the file size is <= datal, make sure there are no other
115 		 * extents following (can happen due to a fallocate call with
116 		 * the flag FALLOC_FL_KEEP_SIZE).
117 		 */
118 		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
119 				    struct btrfs_file_extent_item);
120 		/*
121 		 * If it's an inline extent, it cannot have other extents
122 		 * following it.
123 		 */
124 		if (btrfs_file_extent_type(path->nodes[0], ei) ==
125 		    BTRFS_FILE_EXTENT_INLINE)
126 			goto copy_inline_extent;
127 
128 		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
129 		if (ext_len > aligned_end)
130 			return -EOPNOTSUPP;
131 
132 		ret = btrfs_next_item(root, path);
133 		if (ret < 0) {
134 			return ret;
135 		} else if (ret == 0) {
136 			btrfs_item_key_to_cpu(path->nodes[0], &key,
137 					      path->slots[0]);
138 			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
139 			    key.type == BTRFS_EXTENT_DATA_KEY)
140 				return -EOPNOTSUPP;
141 		}
142 	}
143 
144 copy_inline_extent:
145 	/*
146 	 * We have no extent items, or we have an extent at offset 0 which may
147 	 * or may not be inlined. All these cases are dealt the same way.
148 	 * or may not be inlined. All these cases are dealt with in the same way.
149 	if (i_size_read(dst) > datal) {
150 		/*
151 		 * If the destination inode has an inline extent, copying into
152 		 * it would require copying the data from the source inline
153 		 * extent into the beginning of the destination's inline extent.
154 		 * But this is really complex: both extents can be compressed,
155 		 * or just one of them, which would require decompressing and
156 		 * re-compressing data (which could increase the new compressed
157 		 * size, not allowing the compressed data to fit anymore in an
158 		 * inline extent).
159 		 * So just don't support this case for now (it should be rare,
160 		 * we are not really saving space when cloning inline extents).
161 		 */
162 		return -EOPNOTSUPP;
163 	}
164 
165 	btrfs_release_path(path);
166 	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
167 	if (ret)
168 		return ret;
169 	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
170 	if (ret)
171 		return ret;
172 
173 	write_extent_buffer(path->nodes[0], inline_data,
174 			    btrfs_item_ptr_offset(path->nodes[0],
175 						  path->slots[0]),
176 			    size);
177 	inode_add_bytes(dst, datal);
178 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);
179 
180 	return 0;
181 }
182 
183 /**
184  * btrfs_clone() - clone a range from one file to another
185  *
186  * @src: Inode to clone from
187  * @inode: Inode to clone to
188  * @off: Offset within source to start clone from
189  * @olen: Original length, passed by user, of range to clone
190  * @olen_aligned: Block-aligned value of olen
191  * @destoff: Offset within @inode to start clone
192  * @no_time_update: If set, do not update mtime/ctime on the target inode
193  */
194 static int btrfs_clone(struct inode *src, struct inode *inode,
195 		       const u64 off, const u64 olen, const u64 olen_aligned,
196 		       const u64 destoff, int no_time_update)
197 {
198 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
199 	struct btrfs_root *root = BTRFS_I(inode)->root;
200 	struct btrfs_path *path = NULL;
201 	struct extent_buffer *leaf;
202 	struct btrfs_trans_handle *trans;
203 	char *buf = NULL;
204 	struct btrfs_key key;
205 	u32 nritems;
206 	int slot;
207 	int ret;
208 	const u64 len = olen_aligned;
209 	u64 last_dest_end = destoff;
210 
211 	ret = -ENOMEM;
212 	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
213 	if (!buf)
214 		return ret;
215 
216 	path = btrfs_alloc_path();
217 	if (!path) {
218 		kvfree(buf);
219 		return ret;
220 	}
221 
222 	path->reada = READA_FORWARD;
223 	/* Clone data */
224 	key.objectid = btrfs_ino(BTRFS_I(src));
225 	key.type = BTRFS_EXTENT_DATA_KEY;
226 	key.offset = off;
227 
228 	while (1) {
229 		u64 next_key_min_offset = key.offset + 1;
230 		struct btrfs_file_extent_item *extent;
231 		int type;
232 		u32 size;
233 		struct btrfs_key new_key;
234 		u64 disko = 0, diskl = 0;
235 		u64 datao = 0, datal = 0;
236 		u64 drop_start;
237 
238 		/* Note the key will change type as we walk through the tree */
239 		path->leave_spinning = 1;
240 		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
241 				0, 0);
242 		if (ret < 0)
243 			goto out;
244 		/*
245 		 * On the first search, if no extent item that starts at offset
246 		 * off was found but the previous item is an extent item, it
247 		 * might overlap our target range, so process it.
248 		 */
249 		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
250 			btrfs_item_key_to_cpu(path->nodes[0], &key,
251 					      path->slots[0] - 1);
252 			if (key.type == BTRFS_EXTENT_DATA_KEY)
253 				path->slots[0]--;
254 		}
255 
256 		nritems = btrfs_header_nritems(path->nodes[0]);
257 process_slot:
258 		if (path->slots[0] >= nritems) {
259 			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
260 			if (ret < 0)
261 				goto out;
262 			if (ret > 0)
263 				break;
264 			nritems = btrfs_header_nritems(path->nodes[0]);
265 		}
266 		leaf = path->nodes[0];
267 		slot = path->slots[0];
268 
269 		btrfs_item_key_to_cpu(leaf, &key, slot);
270 		if (key.type > BTRFS_EXTENT_DATA_KEY ||
271 		    key.objectid != btrfs_ino(BTRFS_I(src)))
272 			break;
273 
274 		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);
275 
276 		extent = btrfs_item_ptr(leaf, slot,
277 					struct btrfs_file_extent_item);
278 		type = btrfs_file_extent_type(leaf, extent);
279 		if (type == BTRFS_FILE_EXTENT_REG ||
280 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
281 			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
282 			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
283 			datao = btrfs_file_extent_offset(leaf, extent);
284 			datal = btrfs_file_extent_num_bytes(leaf, extent);
285 		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
286 			/* Take upper bound, may be compressed */
287 			datal = btrfs_file_extent_ram_bytes(leaf, extent);
288 		}
289 
290 		/*
291 		 * The first search might have left us at an extent item that
292 		 * ends before our target range's start, which can happen if we
293 		 * have holes and the NO_HOLES feature enabled.
294 		 */
295 		if (key.offset + datal <= off) {
296 			path->slots[0]++;
297 			goto process_slot;
298 		} else if (key.offset >= off + len) {
299 			break;
300 		}
301 		next_key_min_offset = key.offset + datal;
302 		size = btrfs_item_size_nr(leaf, slot);
303 		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
304 				   size);
305 
306 		btrfs_release_path(path);
307 		path->leave_spinning = 0;
308 
309 		memcpy(&new_key, &key, sizeof(new_key));
310 		new_key.objectid = btrfs_ino(BTRFS_I(inode));
311 		if (off <= key.offset)
312 			new_key.offset = key.offset + destoff - off;
313 		else
314 			new_key.offset = destoff;
315 
316 		/*
317 		 * Deal with a hole that doesn't have an extent item that
318 		 * represents it (NO_HOLES feature enabled).
319 		 * This hole is either in the middle of the cloning range or at
320 		 * the beginning (fully overlaps it or partially overlaps it).
321 		 */
322 		if (new_key.offset != last_dest_end)
323 			drop_start = last_dest_end;
324 		else
325 			drop_start = new_key.offset;
326 
327 		if (type == BTRFS_FILE_EXTENT_REG ||
328 		    type == BTRFS_FILE_EXTENT_PREALLOC) {
329 			struct btrfs_clone_extent_info clone_info;
330 
331 			/*
332 			 *    a  | --- range to clone ---|  b
333 			 * | ------------- extent ------------- |
334 			 */
335 
336 			/* Subtract range b */
337 			if (key.offset + datal > off + len)
338 				datal = off + len - key.offset;
339 
340 			/* Subtract range a */
341 			if (off > key.offset) {
342 				datao += off - key.offset;
343 				datal -= off - key.offset;
344 			}
345 
346 			clone_info.disk_offset = disko;
347 			clone_info.disk_len = diskl;
348 			clone_info.data_offset = datao;
349 			clone_info.data_len = datal;
350 			clone_info.file_offset = new_key.offset;
351 			clone_info.extent_buf = buf;
352 			clone_info.item_size = size;
353 			ret = btrfs_punch_hole_range(inode, path, drop_start,
354 					new_key.offset + datal - 1, &clone_info,
355 					&trans);
356 			if (ret)
357 				goto out;
358 		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
359 			/*
360 			 * Inline extents always have to start at file offset 0
361 			 * and can never be bigger than the sector size. We can
362 			 * never clone only parts of an inline extent, since all
363 			 * reflink operations must start at a sector size aligned
364 			 * offset, and the length must be aligned too or end at
365 			 * the i_size (which implies the whole inlined data).
366 			 */
367 			ASSERT(key.offset == 0);
368 			ASSERT(datal <= fs_info->sectorsize);
369 			if (key.offset != 0 || datal > fs_info->sectorsize) {
370 				ret = -EUCLEAN;
				goto out;
			}
371 
372 			/*
373 			 * If our extent is inline, we know we will drop or
374 			 * adjust at most 1 extent item in the destination root.
375 			 *
376 			 * 1 - adjusting old extent (we may have to split it)
377 			 * 1 - add new extent
378 			 * 1 - inode update
379 			 */
380 			trans = btrfs_start_transaction(root, 3);
381 			if (IS_ERR(trans)) {
382 				ret = PTR_ERR(trans);
383 				goto out;
384 			}
385 
386 			ret = clone_copy_inline_extent(inode, trans, path,
387 						       &new_key, drop_start,
388 						       datal, size, buf);
389 			if (ret) {
390 				if (ret != -EOPNOTSUPP)
391 					btrfs_abort_transaction(trans, ret);
392 				btrfs_end_transaction(trans);
393 				goto out;
394 			}
395 		}
396 
397 		btrfs_release_path(path);
398 
399 		last_dest_end = ALIGN(new_key.offset + datal,
400 				      fs_info->sectorsize);
401 		ret = clone_finish_inode_update(trans, inode, last_dest_end,
402 						destoff, olen, no_time_update);
403 		if (ret)
404 			goto out;
405 		if (new_key.offset + datal >= destoff + len)
406 			break;
407 
408 		btrfs_release_path(path);
409 		key.offset = next_key_min_offset;
410 
411 		if (fatal_signal_pending(current)) {
412 			ret = -EINTR;
413 			goto out;
414 		}
415 	}
416 	ret = 0;
417 
418 	if (last_dest_end < destoff + len) {
419 		/*
420 		 * We have an implicit hole that fully or partially overlaps our
421 		 * cloning range at its end. This means that we either have the
422 		 * NO_HOLES feature enabled or the implicit hole happened due to
423 		 * mixing buffered and direct IO writes against this file.
424 		 */
425 		btrfs_release_path(path);
426 		path->leave_spinning = 0;
427 
428 		ret = btrfs_punch_hole_range(inode, path, last_dest_end,
429 				destoff + len - 1, NULL, &trans);
430 		if (ret)
431 			goto out;
432 
433 		ret = clone_finish_inode_update(trans, inode, destoff + len,
434 						destoff, olen, no_time_update);
435 	}
436 
437 out:
438 	btrfs_free_path(path);
439 	kvfree(buf);
440 	return ret;
441 }
442 
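/*
 * Helpers to lock/unlock the io tree ranges of both inodes involved in a
 * reflink. The lock helper orders the two locks consistently (by inode
 * address, or by offset when both ranges belong to the same inode) so that
 * concurrent reflink operations cannot deadlock on each other's ranges.
 */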
443 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
444 				       struct inode *inode2, u64 loff2, u64 len)
445 {
446 	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
447 	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
448 }
449 
450 static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
451 				     struct inode *inode2, u64 loff2, u64 len)
452 {
453 	if (inode1 < inode2) {
454 		swap(inode1, inode2);
455 		swap(loff1, loff2);
456 	} else if (inode1 == inode2 && loff2 < loff1) {
457 		swap(loff1, loff2);
458 	}
459 	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
460 	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
461 }
462 
463 static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
464 				   struct inode *dst, u64 dst_loff)
465 {
466 	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
467 	int ret;
468 
469 	/*
470 	 * Lock destination range to serialize with concurrent readpages() and
471 	 * source range to serialize with relocation.
472 	 */
473 	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
474 	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
475 	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
476 
477 	return ret;
478 }
479 
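/*
 * Deduplicate a range in chunks of at most BTRFS_MAX_DEDUPE_LEN plus a
 * final tail. The destination root is marked as having a deduplication in
 * progress for the duration, and the operation bails out with -EAGAIN if a
 * send is currently using that root.
 */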
480 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
481 			     struct inode *dst, u64 dst_loff)
482 {
483 	int ret;
484 	u64 i, tail_len, chunk_count;
485 	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
486 
487 	spin_lock(&root_dst->root_item_lock);
488 	if (root_dst->send_in_progress) {
489 		btrfs_warn_rl(root_dst->fs_info,
490 "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
491 			      root_dst->root_key.objectid,
492 			      root_dst->send_in_progress);
493 		spin_unlock(&root_dst->root_item_lock);
494 		return -EAGAIN;
495 	}
496 	root_dst->dedupe_in_progress++;
497 	spin_unlock(&root_dst->root_item_lock);
498 
499 	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
500 	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
501 
502 	for (i = 0; i < chunk_count; i++) {
503 		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
504 					      dst, dst_loff);
505 		if (ret)
506 			goto out;
507 
508 		loff += BTRFS_MAX_DEDUPE_LEN;
509 		dst_loff += BTRFS_MAX_DEDUPE_LEN;
510 	}
511 
512 	if (tail_len > 0)
513 		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
514 out:
515 	spin_lock(&root_dst->root_item_lock);
516 	root_dst->dedupe_in_progress--;
517 	spin_unlock(&root_dst->root_item_lock);
518 
519 	return ret;
520 }
521 
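/*
 * Clone a range between two files. If the clone starts beyond the
 * destination's current size the file is expanded first and writeback of a
 * possibly truncated eof block is waited for, then the clone is done under
 * the extent locks and the affected page cache range is truncated so that
 * subsequent reads see the cloned data.
 */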
522 static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
523 					u64 off, u64 olen, u64 destoff)
524 {
525 	struct inode *inode = file_inode(file);
526 	struct inode *src = file_inode(file_src);
527 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
528 	int ret;
529 	u64 len = olen;
530 	u64 bs = fs_info->sb->s_blocksize;
531 
532 	/*
533 	 * VFS's generic_remap_file_range_prep() protects us from cloning the
534 	 * eof block into the middle of a file, which would result in corruption
535 	 * if the file size is not blocksize aligned. So we don't need to check
536 	 * for that case here.
537 	 */
538 	if (off + len == src->i_size)
539 		len = ALIGN(src->i_size, bs) - off;
540 
541 	if (destoff > inode->i_size) {
542 		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);
543 
544 		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
545 		if (ret)
546 			return ret;
547 		/*
548 		 * We may have truncated the last block if the inode's size is
549 		 * not sector size aligned, so we need to wait for writeback to
550 		 * complete before proceeding further, otherwise we can race
551 		 * with cloning and attempt to increment a reference to an
552 		 * extent that no longer exists (writeback completed right after
553 		 * we found the previous extent covering eof and before we
554 		 * attempted to increment its reference count).
555 		 */
556 		ret = btrfs_wait_ordered_range(inode, wb_start,
557 					       destoff - wb_start);
558 		if (ret)
559 			return ret;
560 	}
561 
562 	/*
563 	 * Lock destination range to serialize with concurrent readpages() and
564 	 * source range to serialize with relocation.
565 	 */
566 	btrfs_double_extent_lock(src, off, inode, destoff, len);
567 	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
568 	btrfs_double_extent_unlock(src, off, inode, destoff, len);
569 	/*
570 	 * Truncate page cache pages so that future reads will see the cloned
571 	 * data immediately and not the previous data.
572 	 */
573 	truncate_inode_pages_range(&inode->i_data,
574 				round_down(destoff, PAGE_SIZE),
575 				round_up(destoff + len, PAGE_SIZE) - 1);
576 
577 	return ret;
578 }
579 
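/*
 * Btrfs specific preparation before remapping a range: reject clones across
 * mounts/filesystems and clones into read-only roots (both checks skipped
 * for deduplication requests), refuse mixing NODATASUM and checksummed
 * inodes, flush delalloc and wait for ordered extents on both ranges (see
 * the comments below for why the generic helper's writeback is not enough)
 * and finally run the generic checks in generic_remap_file_range_prep().
 */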
580 static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
581 				       struct file *file_out, loff_t pos_out,
582 				       loff_t *len, unsigned int remap_flags)
583 {
584 	struct inode *inode_in = file_inode(file_in);
585 	struct inode *inode_out = file_inode(file_out);
586 	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
587 	bool same_inode = inode_out == inode_in;
588 	u64 wb_len;
589 	int ret;
590 
591 	if (!(remap_flags & REMAP_FILE_DEDUP)) {
592 		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;
593 
594 		if (btrfs_root_readonly(root_out))
595 			return -EROFS;
596 
597 		if (file_in->f_path.mnt != file_out->f_path.mnt ||
598 		    inode_in->i_sb != inode_out->i_sb)
599 			return -EXDEV;
600 	}
601 
602 	/* Don't make the dst file partly checksummed */
603 	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
604 	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
605 		return -EINVAL;
606 	}
607 
608 	/*
609 	 * Now that the inodes are locked, we need to start writeback ourselves
610 	 * and can not rely on the writeback from the VFS's generic helper
611 	 * generic_remap_file_range_prep() because:
612 	 *
613 	 * 1) For compression we must call filemap_fdatawrite_range()
614 	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic
615 	 *    helper only calls it once;
616 	 *
617 	 * 2) filemap_fdatawrite_range(), called by the generic helper, only
618 	 *    waits for the writeback to complete, i.e. for IO to be done, and
619 	 *    not for the ordered extents to complete. We need to wait for them
620 	 *    to complete so that new file extent items are in the fs tree.
621 	 */
622 	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
623 		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
624 	else
625 		wb_len = ALIGN(*len, bs);
626 
627 	/*
628 	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
629 	 * any in progress could create new ordered extents after we wait for
630 	 * existing ordered extents below).
631 	 */
632 	inode_dio_wait(inode_in);
633 	if (!same_inode)
634 		inode_dio_wait(inode_out);
635 
636 	/*
637 	 * Workaround to make sure NOCOW buffered writes reach disk as NOCOW.
638 	 *
639 	 * Btrfs' back references do not have a block level granularity, they
640 	 * work at the whole extent level.
641 	 * A NOCOW buffered write without data space reserved may not be able
642 	 * to fall back to CoW due to lack of data space, and thus could cause
643 	 * data loss.
644 	 *
645 	 * Here we take a shortcut by flushing the whole inode, so that all
646 	 * NOCOW writes reach disk as NOCOW before we increase the
647 	 * reference of the extent. We could do better by only flushing NOCOW
648 	 * data, but that needs extra accounting.
649 	 *
650 	 * Also, we don't need to check ASYNC_EXTENT, as async extents will be
651 	 * CoWed anyway, not affecting the NOCOW part.
652 	 */
653 	ret = filemap_flush(inode_in->i_mapping);
654 	if (ret < 0)
655 		return ret;
656 
657 	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
658 				       wb_len);
659 	if (ret < 0)
660 		return ret;
661 	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
662 				       wb_len);
663 	if (ret < 0)
664 		return ret;
665 
666 	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
667 					    len, remap_flags);
668 }
669 
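/*
 * Entry point for the VFS ->remap_file_range() operation, typically reached
 * through the FICLONE/FICLONERANGE and FIDEDUPERANGE ioctls (and by
 * copy_file_range(), which may attempt a clone first). Takes the inode
 * lock(s), runs the preparation above and then dispatches to either the
 * deduplication or the clone path. Returns the number of bytes remapped or
 * a negative errno.
 *
 * Illustrative userspace sketch (not part of this file; the fd names are
 * placeholders):
 *
 *	struct file_clone_range fcr = {
 *		.src_fd = src_fd,
 *		.src_offset = 0,
 *		.src_length = 0,	// 0 means clone up to source EOF
 *		.dest_offset = 0,
 *	};
 *	ioctl(dst_fd, FICLONERANGE, &fcr);
 */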
670 loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
671 		struct file *dst_file, loff_t destoff, loff_t len,
672 		unsigned int remap_flags)
673 {
674 	struct inode *src_inode = file_inode(src_file);
675 	struct inode *dst_inode = file_inode(dst_file);
676 	bool same_inode = dst_inode == src_inode;
677 	int ret;
678 
679 	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
680 		return -EINVAL;
681 
682 	if (same_inode)
683 		inode_lock(src_inode);
684 	else
685 		lock_two_nondirectories(src_inode, dst_inode);
686 
687 	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
688 					  &len, remap_flags);
689 	if (ret < 0 || len == 0)
690 		goto out_unlock;
691 
692 	if (remap_flags & REMAP_FILE_DEDUP)
693 		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
694 	else
695 		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
696 
697 out_unlock:
698 	if (same_inode)
699 		inode_unlock(src_inode);
700 	else
701 		unlock_two_nondirectories(src_inode, dst_inode);
702 
703 	return ret < 0 ? ret : len;
704 }
705