xref: /openbmc/linux/fs/btrfs/reflink.c (revision 6a177381)
1 // SPDX-License-Identifier: GPL-2.0
2 
3 #include <linux/iversion.h>
4 #include "ctree.h"
5 #include "reflink.h"
6 #include "transaction.h"
7 
8 #define BTRFS_MAX_DEDUPE_LEN	SZ_16M
9 
/*
 * Finalize a clone/dedupe step on the destination inode: bump the inode
 * version, optionally update timestamps, extend i_size if the cloned range
 * grew the file, write back the inode item and end the transaction.
 *
 * @trans:          transaction handle; always ended before returning
 * @inode:          destination inode that received the cloned extents
 * @endoff:         end offset (exclusive) of the cloned range, possibly
 *                  rounded up to the block size by the caller
 * @destoff:        destination file offset where the clone started
 * @olen:           original (user supplied, unaligned) length of the clone
 * @no_time_update: non-zero to skip the mtime/ctime update (dedupe path)
 *
 * Returns 0 on success or a negative errno; on btrfs_update_inode() failure
 * the transaction is aborted before being ended.
 */
static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
				     struct inode *inode,
				     u64 endoff,
				     const u64 destoff,
				     const u64 olen,
				     int no_time_update)
{
	struct btrfs_root *root = BTRFS_I(inode)->root;
	int ret;

	inode_inc_iversion(inode);
	if (!no_time_update)
		inode->i_mtime = inode->i_ctime = current_time(inode);
	/*
	 * We round up to the block size at eof when determining which
	 * extents to clone above, but shouldn't round up the file size.
	 */
	if (endoff > destoff + olen)
		endoff = destoff + olen;
	if (endoff > inode->i_size) {
		i_size_write(inode, endoff);
		btrfs_inode_safe_disk_i_size_write(inode, 0);
	}

	ret = btrfs_update_inode(trans, root, inode);
	if (ret) {
		/* Failing to persist the inode item here leaves metadata
		 * inconsistent with the cloned extents, so abort. */
		btrfs_abort_transaction(trans, ret);
		btrfs_end_transaction(trans);
		goto out;
	}
	ret = btrfs_end_transaction(trans);
out:
	return ret;
}
44 
45 /*
46  * Make sure we do not end up inserting an inline extent into a file that has
47  * already other (non-inline) extents. If a file has an inline extent it can
48  * not have any other extents and the (single) inline extent must start at the
49  * file offset 0. Failing to respect these rules will lead to file corruption,
50  * resulting in EIO errors on read/write operations, hitting BUG_ON's in mm, etc
51  *
52  * We can have extents that have been already written to disk or we can have
53  * dirty ranges still in delalloc, in which case the extent maps and items are
54  * created only when we run delalloc, and the delalloc ranges might fall outside
55  * the range we are currently locking in the inode's io tree. So we check the
56  * inode's i_size because of that (i_size updates are done while holding the
57  * i_mutex, which we are holding here).
58  * We also check to see if the inode has a size not greater than "datal" but has
59  * extents beyond it, due to an fallocate with FALLOC_FL_KEEP_SIZE (and we are
60  * protected against such concurrent fallocate calls by the i_mutex).
61  *
62  * If the file has no extents but a size greater than datal, do not allow the
63  * copy because we would need turn the inline extent into a non-inline one (even
64  * with NO_HOLES enabled). If we find our destination inode only has one inline
65  * extent, just overwrite it with the source inline extent if its size is less
66  * than the source extent's size, or we could copy the source inline extent's
67  * data into the destination inode's inline extent if the later is greater then
68  * the former.
69  */
/*
 * Copy an inline extent from the source into the destination inode (see the
 * long comment above for the rules on when this is allowed).
 *
 * @dst:         destination inode
 * @trans:       running transaction (caller started it, caller ends it)
 * @path:        btree path, released and reused inside this function
 * @new_key:     key for the new extent item in the destination root
 * @drop_start:  start offset from which to drop existing extents in @dst
 * @datal:       uncompressed data length of the inline extent
 * @skip:        number of leading data bytes to skip from @inline_data
 * @size:        item size to insert (extent item header + data)
 * @inline_data: copy of the source extent item (header + inline data)
 *
 * Returns 0 on success, -EOPNOTSUPP when the copy would violate the inline
 * extent rules, or another negative errno on btree failures.
 */
static int clone_copy_inline_extent(struct inode *dst,
				    struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct btrfs_key *new_key,
				    const u64 drop_start,
				    const u64 datal,
				    const u64 skip,
				    const u64 size,
				    char *inline_data)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(dst->i_sb);
	struct btrfs_root *root = BTRFS_I(dst)->root;
	const u64 aligned_end = ALIGN(new_key->offset + datal,
				      fs_info->sectorsize);
	int ret;
	struct btrfs_key key;

	/* An inline extent can only exist at file offset 0. */
	if (new_key->offset > 0)
		return -EOPNOTSUPP;

	key.objectid = btrfs_ino(BTRFS_I(dst));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = 0;
	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0) {
		return ret;
	} else if (ret > 0) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			ret = btrfs_next_leaf(root, path);
			if (ret < 0)
				return ret;
			else if (ret > 0)
				goto copy_inline_extent;
		}
		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
		if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
		    key.type == BTRFS_EXTENT_DATA_KEY) {
			/*
			 * The destination has an extent item at a non-zero
			 * offset (the search for offset 0 returned > 0), so
			 * inserting an inline extent at offset 0 would break
			 * the inline extent invariants.
			 */
			ASSERT(key.offset > 0);
			return -EOPNOTSUPP;
		}
	} else if (i_size_read(dst) <= datal) {
		struct btrfs_file_extent_item *ei;
		u64 ext_len;

		/*
		 * If the file size is <= datal, make sure there are no other
		 * extents following (can happen do to an fallocate call with
		 * the flag FALLOC_FL_KEEP_SIZE).
		 */
		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
				    struct btrfs_file_extent_item);
		/*
		 * If it's an inline extent, it can not have other extents
		 * following it.
		 */
		if (btrfs_file_extent_type(path->nodes[0], ei) ==
		    BTRFS_FILE_EXTENT_INLINE)
			goto copy_inline_extent;

		ext_len = btrfs_file_extent_num_bytes(path->nodes[0], ei);
		if (ext_len > aligned_end)
			return -EOPNOTSUPP;

		ret = btrfs_next_item(root, path);
		if (ret < 0) {
			return ret;
		} else if (ret == 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0]);
			if (key.objectid == btrfs_ino(BTRFS_I(dst)) &&
			    key.type == BTRFS_EXTENT_DATA_KEY)
				return -EOPNOTSUPP;
		}
	}

copy_inline_extent:
	/*
	 * We have no extent items, or we have an extent at offset 0 which may
	 * or may not be inlined. All these cases are dealt the same way.
	 */
	if (i_size_read(dst) > datal) {
		/*
		 * If the destination inode has an inline extent.
		 * This would require copying the data from the source inline
		 * extent into the beginning of the destination's inline extent.
		 * But this is really complex, both extents can be compressed
		 * or just one of them, which would require decompressing and
		 * re-compressing data (which could increase the new compressed
		 * size, not allowing the compressed data to fit anymore in an
		 * inline extent).
		 * So just don't support this case for now (it should be rare,
		 * we are not really saving space when cloning inline extents).
		 */
		return -EOPNOTSUPP;
	}

	btrfs_release_path(path);
	/* Drop whatever was there before inserting the new inline extent. */
	ret = btrfs_drop_extents(trans, root, dst, drop_start, aligned_end, 1);
	if (ret)
		return ret;
	ret = btrfs_insert_empty_item(trans, root, path, new_key, size);
	if (ret)
		return ret;

	if (skip) {
		const u32 start = btrfs_file_extent_calc_inline_size(0);

		/* Shift the data left to drop the skipped leading bytes. */
		memmove(inline_data + start, inline_data + start + skip, datal);
	}

	write_extent_buffer(path->nodes[0], inline_data,
			    btrfs_item_ptr_offset(path->nodes[0],
						  path->slots[0]),
			    size);
	inode_add_bytes(dst, datal);
	/* Inline extent cloning is not replayed by the log tree. */
	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(dst)->runtime_flags);

	return 0;
}
189 
190 /**
191  * btrfs_clone() - clone a range from inode file to another
192  *
193  * @src: Inode to clone from
194  * @inode: Inode to clone to
195  * @off: Offset within source to start clone from
196  * @olen: Original length, passed by user, of range to clone
197  * @olen_aligned: Block-aligned value of olen
198  * @destoff: Offset within @inode to start clone
199  * @no_time_update: Whether to update mtime/ctime on the target inode
200  */
static int btrfs_clone(struct inode *src, struct inode *inode,
		       const u64 off, const u64 olen, const u64 olen_aligned,
		       const u64 destoff, int no_time_update)
{
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	struct btrfs_root *root = BTRFS_I(inode)->root;
	struct btrfs_path *path = NULL;
	struct extent_buffer *leaf;
	struct btrfs_trans_handle *trans;
	char *buf = NULL;
	struct btrfs_key key;
	u32 nritems;
	int slot;
	int ret;
	const u64 len = olen_aligned;
	/* End (exclusive) of the last range we cloned into the destination. */
	u64 last_dest_end = destoff;

	ret = -ENOMEM;
	/* Big enough to hold any item copied out of a source leaf. */
	buf = kvmalloc(fs_info->nodesize, GFP_KERNEL);
	if (!buf)
		return ret;

	path = btrfs_alloc_path();
	if (!path) {
		kvfree(buf);
		return ret;
	}

	path->reada = READA_FORWARD;
	/* Clone data */
	key.objectid = btrfs_ino(BTRFS_I(src));
	key.type = BTRFS_EXTENT_DATA_KEY;
	key.offset = off;

	/* Walk the source extent items overlapping [off, off + len). */
	while (1) {
		u64 next_key_min_offset = key.offset + 1;
		struct btrfs_file_extent_item *extent;
		int type;
		u32 size;
		struct btrfs_key new_key;
		u64 disko = 0, diskl = 0;
		u64 datao = 0, datal = 0;
		u8 comp;
		u64 drop_start;

		/* Note the key will change type as we walk through the tree */
		path->leave_spinning = 1;
		ret = btrfs_search_slot(NULL, BTRFS_I(src)->root, &key, path,
				0, 0);
		if (ret < 0)
			goto out;
		/*
		 * First search, if no extent item that starts at offset off was
		 * found but the previous item is an extent item, it's possible
		 * it might overlap our target range, therefore process it.
		 */
		if (key.offset == off && ret > 0 && path->slots[0] > 0) {
			btrfs_item_key_to_cpu(path->nodes[0], &key,
					      path->slots[0] - 1);
			if (key.type == BTRFS_EXTENT_DATA_KEY)
				path->slots[0]--;
		}

		nritems = btrfs_header_nritems(path->nodes[0]);
process_slot:
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(BTRFS_I(src)->root, path);
			if (ret < 0)
				goto out;
			if (ret > 0)
				break;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		leaf = path->nodes[0];
		slot = path->slots[0];

		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* Stop once we leave the source inode's extent data items. */
		if (key.type > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != btrfs_ino(BTRFS_I(src)))
			break;

		ASSERT(key.type == BTRFS_EXTENT_DATA_KEY);

		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		comp = btrfs_file_extent_compression(leaf, extent);
		type = btrfs_file_extent_type(leaf, extent);
		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			disko = btrfs_file_extent_disk_bytenr(leaf, extent);
			diskl = btrfs_file_extent_disk_num_bytes(leaf, extent);
			datao = btrfs_file_extent_offset(leaf, extent);
			datal = btrfs_file_extent_num_bytes(leaf, extent);
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Take upper bound, may be compressed */
			datal = btrfs_file_extent_ram_bytes(leaf, extent);
		}

		/*
		 * The first search might have left us at an extent item that
		 * ends before our target range's start, can happen if we have
		 * holes and NO_HOLES feature enabled.
		 */
		if (key.offset + datal <= off) {
			path->slots[0]++;
			goto process_slot;
		} else if (key.offset >= off + len) {
			break;
		}
		next_key_min_offset = key.offset + datal;
		size = btrfs_item_size_nr(leaf, slot);
		/* Copy the raw item so we can reuse it in the destination. */
		read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot),
				   size);

		btrfs_release_path(path);
		path->leave_spinning = 0;

		memcpy(&new_key, &key, sizeof(new_key));
		new_key.objectid = btrfs_ino(BTRFS_I(inode));
		if (off <= key.offset)
			new_key.offset = key.offset + destoff - off;
		else
			new_key.offset = destoff;

		/*
		 * Deal with a hole that doesn't have an extent item that
		 * represents it (NO_HOLES feature enabled).
		 * This hole is either in the middle of the cloning range or at
		 * the beginning (fully overlaps it or partially overlaps it).
		 */
		if (new_key.offset != last_dest_end)
			drop_start = last_dest_end;
		else
			drop_start = new_key.offset;

		if (type == BTRFS_FILE_EXTENT_REG ||
		    type == BTRFS_FILE_EXTENT_PREALLOC) {
			struct btrfs_clone_extent_info clone_info;

			/*
			 *    a  | --- range to clone ---|  b
			 * | ------------- extent ------------- |
			 */

			/* Subtract range b */
			if (key.offset + datal > off + len)
				datal = off + len - key.offset;

			/* Subtract range a */
			if (off > key.offset) {
				datao += off - key.offset;
				datal -= off - key.offset;
			}

			clone_info.disk_offset = disko;
			clone_info.disk_len = diskl;
			clone_info.data_offset = datao;
			clone_info.data_len = datal;
			clone_info.file_offset = new_key.offset;
			clone_info.extent_buf = buf;
			clone_info.item_size = size;
			/*
			 * Drops the destination range and inserts the cloned
			 * extent; on success it leaves a running transaction
			 * in @trans for clone_finish_inode_update() below.
			 */
			ret = btrfs_punch_hole_range(inode, path, drop_start,
					new_key.offset + datal - 1, &clone_info,
					&trans);
			if (ret)
				goto out;
		} else if (type == BTRFS_FILE_EXTENT_INLINE) {
			/* Bytes to trim from the front and back of the data. */
			u64 skip = 0;
			u64 trim = 0;

			if (off > key.offset) {
				skip = off - key.offset;
				new_key.offset += skip;
			}

			if (key.offset + datal > off + len)
				trim = key.offset + datal - (off + len);

			/* Can not partially clone compressed inline data. */
			if (comp && (skip || trim)) {
				ret = -EINVAL;
				goto out;
			}
			size -= skip + trim;
			datal -= skip + trim;

			/*
			 * If our extent is inline, we know we will drop or
			 * adjust at most 1 extent item in the destination root.
			 *
			 * 1 - adjusting old extent (we may have to split it)
			 * 1 - add new extent
			 * 1 - inode update
			 */
			trans = btrfs_start_transaction(root, 3);
			if (IS_ERR(trans)) {
				ret = PTR_ERR(trans);
				goto out;
			}

			ret = clone_copy_inline_extent(inode, trans, path,
						       &new_key, drop_start,
						       datal, skip, size, buf);
			if (ret) {
				if (ret != -EOPNOTSUPP)
					btrfs_abort_transaction(trans, ret);
				btrfs_end_transaction(trans);
				goto out;
			}
		}

		btrfs_release_path(path);

		last_dest_end = ALIGN(new_key.offset + datal,
				      fs_info->sectorsize);
		/* Ends the transaction set up by the branch above. */
		ret = clone_finish_inode_update(trans, inode, last_dest_end,
						destoff, olen, no_time_update);
		if (ret)
			goto out;
		if (new_key.offset + datal >= destoff + len)
			break;

		btrfs_release_path(path);
		key.offset = next_key_min_offset;

		/* Cloning can take long; let a fatal signal interrupt it. */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;
			goto out;
		}
	}
	ret = 0;

	if (last_dest_end < destoff + len) {
		/*
		 * We have an implicit hole that fully or partially overlaps our
		 * cloning range at its end. This means that we either have the
		 * NO_HOLES feature enabled or the implicit hole happened due to
		 * mixing buffered and direct IO writes against this file.
		 */
		btrfs_release_path(path);
		path->leave_spinning = 0;

		ret = btrfs_punch_hole_range(inode, path, last_dest_end,
				destoff + len - 1, NULL, &trans);
		if (ret)
			goto out;

		ret = clone_finish_inode_update(trans, inode, destoff + len,
						destoff, olen, no_time_update);
	}

out:
	btrfs_free_path(path);
	kvfree(buf);
	return ret;
}
456 
457 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
458 				       struct inode *inode2, u64 loff2, u64 len)
459 {
460 	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
461 	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
462 }
463 
464 static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
465 				     struct inode *inode2, u64 loff2, u64 len)
466 {
467 	if (inode1 < inode2) {
468 		swap(inode1, inode2);
469 		swap(loff1, loff2);
470 	} else if (inode1 == inode2 && loff2 < loff1) {
471 		swap(loff1, loff2);
472 	}
473 	lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
474 	lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
475 }
476 
477 static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
478 				   struct inode *dst, u64 dst_loff)
479 {
480 	const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
481 	int ret;
482 
483 	/*
484 	 * Lock destination range to serialize with concurrent readpages() and
485 	 * source range to serialize with relocation.
486 	 */
487 	btrfs_double_extent_lock(src, loff, dst, dst_loff, len);
488 	ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1);
489 	btrfs_double_extent_unlock(src, loff, dst, dst_loff, len);
490 
491 	return ret;
492 }
493 
494 static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
495 			     struct inode *dst, u64 dst_loff)
496 {
497 	int ret;
498 	u64 i, tail_len, chunk_count;
499 	struct btrfs_root *root_dst = BTRFS_I(dst)->root;
500 
501 	spin_lock(&root_dst->root_item_lock);
502 	if (root_dst->send_in_progress) {
503 		btrfs_warn_rl(root_dst->fs_info,
504 "cannot deduplicate to root %llu while send operations are using it (%d in progress)",
505 			      root_dst->root_key.objectid,
506 			      root_dst->send_in_progress);
507 		spin_unlock(&root_dst->root_item_lock);
508 		return -EAGAIN;
509 	}
510 	root_dst->dedupe_in_progress++;
511 	spin_unlock(&root_dst->root_item_lock);
512 
513 	tail_len = olen % BTRFS_MAX_DEDUPE_LEN;
514 	chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN);
515 
516 	for (i = 0; i < chunk_count; i++) {
517 		ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN,
518 					      dst, dst_loff);
519 		if (ret)
520 			goto out;
521 
522 		loff += BTRFS_MAX_DEDUPE_LEN;
523 		dst_loff += BTRFS_MAX_DEDUPE_LEN;
524 	}
525 
526 	if (tail_len > 0)
527 		ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff);
528 out:
529 	spin_lock(&root_dst->root_item_lock);
530 	root_dst->dedupe_in_progress--;
531 	spin_unlock(&root_dst->root_item_lock);
532 
533 	return ret;
534 }
535 
/*
 * Clone a range from @file_src into @file (the FICLONE/FICLONERANGE path).
 *
 * @file:     destination file
 * @file_src: source file
 * @off:      source offset to clone from
 * @olen:     length of the range to clone (0 was already expanded by the
 *            caller to mean "to EOF")
 * @destoff:  destination offset to clone into
 *
 * Returns 0 on success or a negative errno.
 */
static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
					u64 off, u64 olen, u64 destoff)
{
	struct inode *inode = file_inode(file);
	struct inode *src = file_inode(file_src);
	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
	int ret;
	u64 len = olen;
	u64 bs = fs_info->sb->s_blocksize;

	/*
	 * TODO:
	 * - split compressed inline extents.  annoying: we need to
	 *   decompress into destination's address_space (the file offset
	 *   may change, so source mapping won't do), then recompress (or
	 *   otherwise reinsert) a subrange.
	 *
	 * - split destination inode's inline extents.  The inline extents can
	 *   be either compressed or non-compressed.
	 */

	/*
	 * VFS's generic_remap_file_range_prep() protects us from cloning the
	 * eof block into the middle of a file, which would result in corruption
	 * if the file size is not blocksize aligned. So we don't need to check
	 * for that case here.
	 */
	if (off + len == src->i_size)
		len = ALIGN(src->i_size, bs) - off;

	if (destoff > inode->i_size) {
		const u64 wb_start = ALIGN_DOWN(inode->i_size, bs);

		/* Fill the gap between current EOF and destoff with a hole. */
		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
		if (ret)
			return ret;
		/*
		 * We may have truncated the last block if the inode's size is
		 * not sector size aligned, so we need to wait for writeback to
		 * complete before proceeding further, otherwise we can race
		 * with cloning and attempt to increment a reference to an
		 * extent that no longer exists (writeback completed right after
		 * we found the previous extent covering eof and before we
		 * attempted to increment its reference count).
		 */
		ret = btrfs_wait_ordered_range(inode, wb_start,
					       destoff - wb_start);
		if (ret)
			return ret;
	}

	/*
	 * Lock destination range to serialize with concurrent readpages() and
	 * source range to serialize with relocation.
	 */
	btrfs_double_extent_lock(src, off, inode, destoff, len);
	ret = btrfs_clone(src, inode, off, olen, len, destoff, 0);
	btrfs_double_extent_unlock(src, off, inode, destoff, len);
	/*
	 * Truncate page cache pages so that future reads will see the cloned
	 * data immediately and not the previous data.
	 */
	truncate_inode_pages_range(&inode->i_data,
				round_down(destoff, PAGE_SIZE),
				round_up(destoff + len, PAGE_SIZE) - 1);

	return ret;
}
604 
/*
 * Btrfs-specific preparation before a clone or dedupe: sanity checks, flush
 * and wait for any dirty data and ordered extents in both ranges, then defer
 * to generic_remap_file_range_prep() for the generic VFS checks.
 *
 * @file_in/@pos_in:   source file and offset
 * @file_out/@pos_out: destination file and offset
 * @len:               in/out requested length (may be adjusted by the
 *                     generic helper)
 * @remap_flags:       REMAP_FILE_* flags from the caller
 *
 * Called with both inodes locked. Returns 0 on success or a negative errno.
 */
static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       loff_t *len, unsigned int remap_flags)
{
	struct inode *inode_in = file_inode(file_in);
	struct inode *inode_out = file_inode(file_out);
	u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
	bool same_inode = inode_out == inode_in;
	u64 wb_len;
	int ret;

	if (!(remap_flags & REMAP_FILE_DEDUP)) {
		struct btrfs_root *root_out = BTRFS_I(inode_out)->root;

		if (btrfs_root_readonly(root_out))
			return -EROFS;

		/* Cloning only works within a single btrfs mount. */
		if (file_in->f_path.mnt != file_out->f_path.mnt ||
		    inode_in->i_sb != inode_out->i_sb)
			return -EXDEV;
	}

	/* Don't make the dst file partly checksummed */
	if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) !=
	    (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) {
		return -EINVAL;
	}

	/*
	 * Now that the inodes are locked, we need to start writeback ourselves
	 * and can not rely on the writeback from the VFS's generic helper
	 * generic_remap_file_range_prep() because:
	 *
	 * 1) For compression we must call filemap_fdatawrite_range() range
	 *    twice (btrfs_fdatawrite_range() does it for us), and the generic
	 *    helper only calls it once;
	 *
	 * 2) filemap_fdatawrite_range(), called by the generic helper only
	 *    waits for the writeback to complete, i.e. for IO to be done, and
	 *    not for the ordered extents to complete. We need to wait for them
	 *    to complete so that new file extent items are in the fs tree.
	 */
	if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP))
		wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs);
	else
		wb_len = ALIGN(*len, bs);

	/*
	 * Since we don't lock ranges, wait for ongoing lockless dio writes (as
	 * any in progress could create its ordered extents after we wait for
	 * existing ordered extents below).
	 */
	inode_dio_wait(inode_in);
	if (!same_inode)
		inode_dio_wait(inode_out);

	/*
	 * Workaround to make sure NOCOW buffered write reach disk as NOCOW.
	 *
	 * Btrfs' back references do not have a block level granularity, they
	 * work at the whole extent level.
	 * NOCOW buffered write without data space reserved may not be able
	 * to fall back to CoW due to lack of data space, thus could cause
	 * data loss.
	 *
	 * Here we take a shortcut by flushing the whole inode, so that all
	 * nocow write should reach disk as nocow before we increase the
	 * reference of the extent. We could do better by only flushing NOCOW
	 * data, but that needs extra accounting.
	 *
	 * Also we don't need to check ASYNC_EXTENT, as async extent will be
	 * CoWed anyway, not affecting nocow part.
	 */
	ret = filemap_flush(inode_in->i_mapping);
	if (ret < 0)
		return ret;

	ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs),
				       wb_len);
	if (ret < 0)
		return ret;
	ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs),
				       wb_len);
	if (ret < 0)
		return ret;

	return generic_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
					    len, remap_flags);
}
694 
/*
 * Entry point for the VFS ->remap_file_range() operation: locks the inodes,
 * runs the btrfs-specific preparation and dispatches to either the dedupe
 * path (REMAP_FILE_DEDUP) or the clone path.
 *
 * Returns the number of bytes remapped on success (possibly 0), or a
 * negative errno on failure.
 */
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
		struct file *dst_file, loff_t destoff, loff_t len,
		unsigned int remap_flags)
{
	struct inode *src_inode = file_inode(src_file);
	struct inode *dst_inode = file_inode(dst_file);
	bool same_inode = dst_inode == src_inode;
	int ret;

	if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
		return -EINVAL;

	/* lock_two_nondirectories() can not take the same inode twice. */
	if (same_inode)
		inode_lock(src_inode);
	else
		lock_two_nondirectories(src_inode, dst_inode);

	ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
					  &len, remap_flags);
	if (ret < 0 || len == 0)
		goto out_unlock;

	if (remap_flags & REMAP_FILE_DEDUP)
		ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff);
	else
		ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);

out_unlock:
	if (same_inode)
		inode_unlock(src_inode);
	else
		unlock_two_nondirectories(src_inode, dst_inode);

	return ret < 0 ? ret : len;
}
730