xref: /openbmc/linux/fs/xfs/xfs_reflink.c (revision 8730046c)
1 /*
2  * Copyright (C) 2016 Oracle.  All Rights Reserved.
3  *
4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License
8  * as published by the Free Software Foundation; either version 2
9  * of the License, or (at your option) any later version.
10  *
11  * This program is distributed in the hope that it would be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write the Free Software Foundation,
18  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
19  */
20 #include "xfs.h"
21 #include "xfs_fs.h"
22 #include "xfs_shared.h"
23 #include "xfs_format.h"
24 #include "xfs_log_format.h"
25 #include "xfs_trans_resv.h"
26 #include "xfs_mount.h"
27 #include "xfs_defer.h"
28 #include "xfs_da_format.h"
29 #include "xfs_da_btree.h"
30 #include "xfs_inode.h"
31 #include "xfs_trans.h"
32 #include "xfs_inode_item.h"
33 #include "xfs_bmap.h"
34 #include "xfs_bmap_util.h"
35 #include "xfs_error.h"
36 #include "xfs_dir2.h"
37 #include "xfs_dir2_priv.h"
38 #include "xfs_ioctl.h"
39 #include "xfs_trace.h"
40 #include "xfs_log.h"
41 #include "xfs_icache.h"
42 #include "xfs_pnfs.h"
43 #include "xfs_btree.h"
44 #include "xfs_refcount_btree.h"
45 #include "xfs_refcount.h"
46 #include "xfs_bmap_btree.h"
47 #include "xfs_trans_space.h"
48 #include "xfs_bit.h"
49 #include "xfs_alloc.h"
50 #include "xfs_quota_defs.h"
51 #include "xfs_quota.h"
52 #include "xfs_btree.h"
53 #include "xfs_bmap_btree.h"
54 #include "xfs_reflink.h"
55 #include "xfs_iomap.h"
56 #include "xfs_rmap_btree.h"
57 #include "xfs_sb.h"
58 #include "xfs_ag_resv.h"
59 
60 /*
61  * Copy on Write of Shared Blocks
62  *
63  * XFS must preserve "the usual" file semantics even when two files share
64  * the same physical blocks.  This means that a write to one file must not
65  * alter the blocks in a different file; the way that we'll do that is
66  * through the use of a copy-on-write mechanism.  At a high level, that
67  * means that when we want to write to a shared block, we allocate a new
68  * block, write the data to the new block, and if that succeeds we map the
69  * new block into the file.
70  *
71  * XFS provides a "delayed allocation" mechanism that defers the allocation
72  * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
73  * possible.  This reduces fragmentation by enabling the filesystem to ask
74  * for bigger chunks less often, which is exactly what we want for CoW.
75  *
76  * The delalloc mechanism begins when the kernel wants to make a block
77  * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
78  * create a delalloc mapping, which is a regular in-core extent, but without
79  * a real startblock.  (For delalloc mappings, the startblock encodes both
80  * a flag that this is a delalloc mapping, and a worst-case estimate of how
81  * many blocks might be required to put the mapping into the BMBT.)  delalloc
82  * mappings are a reservation against the free space in the filesystem;
83  * adjacent mappings can also be combined into fewer larger mappings.
84  *
85  * When dirty pages are being written out (typically in writepage), the
86  * delalloc reservations are converted into real mappings by allocating
87  * blocks and replacing the delalloc mapping with real ones.  A delalloc
88  * mapping can be replaced by several real ones if the free space is
89  * fragmented.
90  *
91  * We want to adapt the delalloc mechanism for copy-on-write, since the
92  * write paths are similar.  The first two steps (creating the reservation
93  * and allocating the blocks) are exactly the same as delalloc except that
94  * the mappings must be stored in a separate CoW fork because we do not want
95  * to disturb the mapping in the data fork until we're sure that the write
96  * succeeded.  IO completion in this case is the process of removing the old
97  * mapping from the data fork and moving the new mapping from the CoW fork to
98  * the data fork.  This will be discussed shortly.
99  *
100  * For now, unaligned directio writes will be bounced back to the page cache.
101  * Block-aligned directio writes will use the same mechanism as buffered
102  * writes.
103  *
104  * CoW remapping must be done after the data block write completes,
105  * because we don't want to destroy the old data fork map until we're sure
106  * the new block has been written.  Since the new mappings are kept in a
107  * separate fork, we can simply iterate these mappings to find the ones
108  * that cover the file blocks that we just CoW'd.  For each extent, simply
109  * unmap the corresponding range in the data fork, map the new range into
110  * the data fork, and remove the extent from the CoW fork.
111  *
112  * Since the remapping operation can be applied to an arbitrary file
113  * range, we record the need for the remap step as a flag in the ioend
114  * instead of declaring a new IO type.  This is required for direct io
115  * because we only have ioend for the whole dio, and we have to be able to
116  * remember the presence of unwritten blocks and CoW blocks with a single
117  * ioend structure.  Better yet, the more ground we can cover with one
118  * ioend, the better.
119  */
120 
121 /*
122  * Given an AG extent, find the lowest-numbered run of shared blocks
123  * within that range and return the range in fbno/flen.  If
124  * find_end_of_shared is true, return the longest contiguous extent of
125  * shared blocks.  If there are no shared extents, fbno and flen will
126  * be set to NULLAGBLOCK and 0, respectively.
127  */
128 int
129 xfs_reflink_find_shared(
130 	struct xfs_mount	*mp,
131 	xfs_agnumber_t		agno,
132 	xfs_agblock_t		agbno,
133 	xfs_extlen_t		aglen,
134 	xfs_agblock_t		*fbno,
135 	xfs_extlen_t		*flen,
136 	bool			find_end_of_shared)
137 {
138 	struct xfs_buf		*agbp;
139 	struct xfs_btree_cur	*cur;
140 	int			error;
141 
142 	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
143 	if (error)
144 		return error;
145 
146 	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
147 
148 	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
149 			find_end_of_shared);
150 
151 	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
152 
153 	xfs_buf_relse(agbp);
154 	return error;
155 }
156 
157 /*
158  * Trim the mapping to the next block where there's a change in the
159  * shared/unshared status.  More specifically, this means that we
160  * find the lowest-numbered extent of shared blocks that coincides with
161  * the given block mapping.  If the shared extent overlaps the start of
162  * the mapping, trim the mapping to the end of the shared extent.  If
163  * the shared region intersects the mapping, trim the mapping to the
164  * start of the shared extent.  If there are no shared regions that
165  * overlap, just return the original extent.
166  */
167 int
168 xfs_reflink_trim_around_shared(
169 	struct xfs_inode	*ip,
170 	struct xfs_bmbt_irec	*irec,
171 	bool			*shared,
172 	bool			*trimmed)
173 {
174 	xfs_agnumber_t		agno;
175 	xfs_agblock_t		agbno;
176 	xfs_extlen_t		aglen;
177 	xfs_agblock_t		fbno;
178 	xfs_extlen_t		flen;
179 	int			error = 0;
180 
181 	/* Holes, unwritten, and delalloc extents cannot be shared */
182 	if (!xfs_is_reflink_inode(ip) ||
183 	    ISUNWRITTEN(irec) ||
184 	    irec->br_startblock == HOLESTARTBLOCK ||
185 	    irec->br_startblock == DELAYSTARTBLOCK ||
186 	    isnullstartblock(irec->br_startblock)) {
187 		*shared = false;
188 		return 0;
189 	}
190 
191 	trace_xfs_reflink_trim_around_shared(ip, irec);
192 
193 	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
194 	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
195 	aglen = irec->br_blockcount;
196 
197 	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
198 			aglen, &fbno, &flen, true);
199 	if (error)
200 		return error;
201 
202 	*shared = *trimmed = false;
203 	if (fbno == NULLAGBLOCK) {
204 		/* No shared blocks at all. */
205 		return 0;
206 	} else if (fbno == agbno) {
207 		/*
208 		 * The start of this extent is shared.  Truncate the
209 		 * mapping at the end of the shared region so that a
210 		 * subsequent iteration starts at the start of the
211 		 * unshared region.
212 		 */
213 		irec->br_blockcount = flen;
214 		*shared = true;
215 		if (flen != aglen)
216 			*trimmed = true;
217 		return 0;
218 	} else {
219 		/*
220 		 * There's a shared extent midway through this extent.
221 		 * Truncate the mapping at the start of the shared
222 		 * extent so that a subsequent iteration starts at the
223 		 * start of the shared region.
224 		 */
225 		irec->br_blockcount = fbno - agbno;
226 		*trimmed = true;
227 		return 0;
228 	}
229 }
230 
231 /*
232  * Trim the passed in imap to the next shared/unshared extent boundary, and
233  * if imap->br_startoff points to a shared extent reserve space for it in the
234  * COW fork.  In this case *shared is set to true, else to false.
235  *
236  * Note that imap will always contain the block numbers for the existing blocks
237  * in the data fork, as the upper layers need them for read-modify-write
238  * operations.
239  */
240 int
241 xfs_reflink_reserve_cow(
242 	struct xfs_inode	*ip,
243 	struct xfs_bmbt_irec	*imap,
244 	bool			*shared)
245 {
246 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
247 	struct xfs_bmbt_irec	got;
248 	int			error = 0;
249 	bool			eof = false, trimmed;
250 	xfs_extnum_t		idx;
251 
252 	/*
253 	 * Search the COW fork extent list first.  This serves two purposes:
254 	 * first this implement the speculative preallocation using cowextisze,
255 	 * so that we also unshared block adjacent to shared blocks instead
256 	 * of just the shared blocks themselves.  Second the lookup in the
257 	 * extent list is generally faster than going out to the shared extent
258 	 * tree.
259 	 */
260 
261 	if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got))
262 		eof = true;
263 	if (!eof && got.br_startoff <= imap->br_startoff) {
264 		trace_xfs_reflink_cow_found(ip, imap);
265 		xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
266 
267 		*shared = true;
268 		return 0;
269 	}
270 
271 	/* Trim the mapping to the nearest shared extent boundary. */
272 	error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
273 	if (error)
274 		return error;
275 
276 	/* Not shared?  Just report the (potentially capped) extent. */
277 	if (!*shared)
278 		return 0;
279 
280 	/*
281 	 * Fork all the shared blocks from our write offset until the end of
282 	 * the extent.
283 	 */
284 	error = xfs_qm_dqattach_locked(ip, 0);
285 	if (error)
286 		return error;
287 
288 	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff,
289 			imap->br_blockcount, 0, &got, &idx, eof);
290 	if (error == -ENOSPC || error == -EDQUOT)
291 		trace_xfs_reflink_cow_enospc(ip, imap);
292 	if (error)
293 		return error;
294 
295 	trace_xfs_reflink_cow_alloc(ip, &got);
296 	return 0;
297 }
298 
299 /* Allocate all CoW reservations covering a range of blocks in a file. */
300 static int
301 __xfs_reflink_allocate_cow(
302 	struct xfs_inode	*ip,
303 	xfs_fileoff_t		*offset_fsb,
304 	xfs_fileoff_t		end_fsb)
305 {
306 	struct xfs_mount	*mp = ip->i_mount;
307 	struct xfs_bmbt_irec	imap;
308 	struct xfs_defer_ops	dfops;
309 	struct xfs_trans	*tp;
310 	xfs_fsblock_t		first_block;
311 	int			nimaps = 1, error;
312 	bool			shared;
313 
314 	xfs_defer_init(&dfops, &first_block);
315 
316 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
317 			XFS_TRANS_RESERVE, &tp);
318 	if (error)
319 		return error;
320 
321 	xfs_ilock(ip, XFS_ILOCK_EXCL);
322 
323 	/* Read extent from the source file. */
324 	nimaps = 1;
325 	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
326 			&imap, &nimaps, 0);
327 	if (error)
328 		goto out_unlock;
329 	ASSERT(nimaps == 1);
330 
331 	error = xfs_reflink_reserve_cow(ip, &imap, &shared);
332 	if (error)
333 		goto out_trans_cancel;
334 
335 	if (!shared) {
336 		*offset_fsb = imap.br_startoff + imap.br_blockcount;
337 		goto out_trans_cancel;
338 	}
339 
340 	xfs_trans_ijoin(tp, ip, 0);
341 	error = xfs_bmapi_write(tp, ip, imap.br_startoff, imap.br_blockcount,
342 			XFS_BMAPI_COWFORK, &first_block,
343 			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
344 			&imap, &nimaps, &dfops);
345 	if (error)
346 		goto out_trans_cancel;
347 
348 	error = xfs_defer_finish(&tp, &dfops, NULL);
349 	if (error)
350 		goto out_trans_cancel;
351 
352 	error = xfs_trans_commit(tp);
353 
354 	*offset_fsb = imap.br_startoff + imap.br_blockcount;
355 out_unlock:
356 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
357 	return error;
358 out_trans_cancel:
359 	xfs_defer_cancel(&dfops);
360 	xfs_trans_cancel(tp);
361 	goto out_unlock;
362 }
363 
364 /* Allocate all CoW reservations covering a part of a file. */
365 int
366 xfs_reflink_allocate_cow_range(
367 	struct xfs_inode	*ip,
368 	xfs_off_t		offset,
369 	xfs_off_t		count)
370 {
371 	struct xfs_mount	*mp = ip->i_mount;
372 	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
373 	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
374 	int			error;
375 
376 	ASSERT(xfs_is_reflink_inode(ip));
377 
378 	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
379 
380 	/*
381 	 * Make sure that the dquots are there.
382 	 */
383 	error = xfs_qm_dqattach(ip, 0);
384 	if (error)
385 		return error;
386 
387 	while (offset_fsb < end_fsb) {
388 		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
389 		if (error) {
390 			trace_xfs_reflink_allocate_cow_range_error(ip, error,
391 					_RET_IP_);
392 			break;
393 		}
394 	}
395 
396 	return error;
397 }
398 
399 /*
400  * Find the CoW reservation for a given byte offset of a file.
401  */
402 bool
403 xfs_reflink_find_cow_mapping(
404 	struct xfs_inode		*ip,
405 	xfs_off_t			offset,
406 	struct xfs_bmbt_irec		*imap)
407 {
408 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
409 	xfs_fileoff_t			offset_fsb;
410 	struct xfs_bmbt_irec		got;
411 	xfs_extnum_t			idx;
412 
413 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
414 	ASSERT(xfs_is_reflink_inode(ip));
415 
416 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
417 	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
418 		return false;
419 	if (got.br_startoff > offset_fsb)
420 		return false;
421 
422 	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
423 			&got);
424 	*imap = got;
425 	return true;
426 }
427 
428 /*
429  * Trim an extent to end at the next CoW reservation past offset_fsb.
430  */
431 void
432 xfs_reflink_trim_irec_to_next_cow(
433 	struct xfs_inode		*ip,
434 	xfs_fileoff_t			offset_fsb,
435 	struct xfs_bmbt_irec		*imap)
436 {
437 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
438 	struct xfs_bmbt_irec		got;
439 	xfs_extnum_t			idx;
440 
441 	if (!xfs_is_reflink_inode(ip))
442 		return;
443 
444 	/* Find the extent in the CoW fork. */
445 	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
446 		return;
447 
448 	/* This is the extent before; try sliding up one. */
449 	if (got.br_startoff < offset_fsb) {
450 		if (!xfs_iext_get_extent(ifp, idx + 1, &got))
451 			return;
452 	}
453 
454 	if (got.br_startoff >= imap->br_startoff + imap->br_blockcount)
455 		return;
456 
457 	imap->br_blockcount = got.br_startoff - imap->br_startoff;
458 	trace_xfs_reflink_trim_irec(ip, imap);
459 }
460 
461 /*
462  * Cancel all pending CoW reservations for some block range of an inode.
463  */
464 int
465 xfs_reflink_cancel_cow_blocks(
466 	struct xfs_inode		*ip,
467 	struct xfs_trans		**tpp,
468 	xfs_fileoff_t			offset_fsb,
469 	xfs_fileoff_t			end_fsb)
470 {
471 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
472 	struct xfs_bmbt_irec		got, del;
473 	xfs_extnum_t			idx;
474 	xfs_fsblock_t			firstfsb;
475 	struct xfs_defer_ops		dfops;
476 	int				error = 0;
477 
478 	if (!xfs_is_reflink_inode(ip))
479 		return 0;
480 	if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got))
481 		return 0;
482 
483 	while (got.br_startoff < end_fsb) {
484 		del = got;
485 		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
486 		trace_xfs_reflink_cancel_cow(ip, &del);
487 
488 		if (isnullstartblock(del.br_startblock)) {
489 			error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
490 					&idx, &got, &del);
491 			if (error)
492 				break;
493 		} else {
494 			xfs_trans_ijoin(*tpp, ip, 0);
495 			xfs_defer_init(&dfops, &firstfsb);
496 
497 			/* Free the CoW orphan record. */
498 			error = xfs_refcount_free_cow_extent(ip->i_mount,
499 					&dfops, del.br_startblock,
500 					del.br_blockcount);
501 			if (error)
502 				break;
503 
504 			xfs_bmap_add_free(ip->i_mount, &dfops,
505 					del.br_startblock, del.br_blockcount,
506 					NULL);
507 
508 			/* Update quota accounting */
509 			xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
510 					-(long)del.br_blockcount);
511 
512 			/* Roll the transaction */
513 			error = xfs_defer_finish(tpp, &dfops, ip);
514 			if (error) {
515 				xfs_defer_cancel(&dfops);
516 				break;
517 			}
518 
519 			/* Remove the mapping from the CoW fork. */
520 			xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
521 		}
522 
523 		if (!xfs_iext_get_extent(ifp, ++idx, &got))
524 			break;
525 	}
526 
527 	/* clear tag if cow fork is emptied */
528 	if (!ifp->if_bytes)
529 		xfs_inode_clear_cowblocks_tag(ip);
530 
531 	return error;
532 }
533 
534 /*
535  * Cancel all pending CoW reservations for some byte range of an inode.
536  */
537 int
538 xfs_reflink_cancel_cow_range(
539 	struct xfs_inode	*ip,
540 	xfs_off_t		offset,
541 	xfs_off_t		count)
542 {
543 	struct xfs_trans	*tp;
544 	xfs_fileoff_t		offset_fsb;
545 	xfs_fileoff_t		end_fsb;
546 	int			error;
547 
548 	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
549 	ASSERT(xfs_is_reflink_inode(ip));
550 
551 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
552 	if (count == NULLFILEOFF)
553 		end_fsb = NULLFILEOFF;
554 	else
555 		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
556 
557 	/* Start a rolling transaction to remove the mappings */
558 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
559 			0, 0, 0, &tp);
560 	if (error)
561 		goto out;
562 
563 	xfs_ilock(ip, XFS_ILOCK_EXCL);
564 	xfs_trans_ijoin(tp, ip, 0);
565 
566 	/* Scrape out the old CoW reservations */
567 	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
568 	if (error)
569 		goto out_cancel;
570 
571 	error = xfs_trans_commit(tp);
572 
573 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
574 	return error;
575 
576 out_cancel:
577 	xfs_trans_cancel(tp);
578 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
579 out:
580 	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
581 	return error;
582 }
583 
584 /*
585  * Remap parts of a file's data fork after a successful CoW.
586  */
587 int
588 xfs_reflink_end_cow(
589 	struct xfs_inode		*ip,
590 	xfs_off_t			offset,
591 	xfs_off_t			count)
592 {
593 	struct xfs_ifork		*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
594 	struct xfs_bmbt_irec		got, del;
595 	struct xfs_trans		*tp;
596 	xfs_fileoff_t			offset_fsb;
597 	xfs_fileoff_t			end_fsb;
598 	xfs_fsblock_t			firstfsb;
599 	struct xfs_defer_ops		dfops;
600 	int				error;
601 	unsigned int			resblks;
602 	xfs_filblks_t			rlen;
603 	xfs_extnum_t			idx;
604 
605 	trace_xfs_reflink_end_cow(ip, offset, count);
606 
607 	/* No COW extents?  That's easy! */
608 	if (ifp->if_bytes == 0)
609 		return 0;
610 
611 	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
612 	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
613 
614 	/* Start a rolling transaction to switch the mappings */
615 	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
616 	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
617 			resblks, 0, 0, &tp);
618 	if (error)
619 		goto out;
620 
621 	xfs_ilock(ip, XFS_ILOCK_EXCL);
622 	xfs_trans_ijoin(tp, ip, 0);
623 
624 	/* If there is a hole at end_fsb - 1 go to the previous extent */
625 	if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) ||
626 	    got.br_startoff > end_fsb) {
627 		ASSERT(idx > 0);
628 		xfs_iext_get_extent(ifp, --idx, &got);
629 	}
630 
631 	/* Walk backwards until we're out of the I/O range... */
632 	while (got.br_startoff + got.br_blockcount > offset_fsb) {
633 		del = got;
634 		xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
635 
636 		/* Extent delete may have bumped idx forward */
637 		if (!del.br_blockcount) {
638 			idx--;
639 			goto next_extent;
640 		}
641 
642 		ASSERT(!isnullstartblock(got.br_startblock));
643 
644 		/* Unmap the old blocks in the data fork. */
645 		xfs_defer_init(&dfops, &firstfsb);
646 		rlen = del.br_blockcount;
647 		error = __xfs_bunmapi(tp, ip, del.br_startoff, &rlen, 0, 1,
648 				&firstfsb, &dfops);
649 		if (error)
650 			goto out_defer;
651 
652 		/* Trim the extent to whatever got unmapped. */
653 		if (rlen) {
654 			xfs_trim_extent(&del, del.br_startoff + rlen,
655 				del.br_blockcount - rlen);
656 		}
657 		trace_xfs_reflink_cow_remap(ip, &del);
658 
659 		/* Free the CoW orphan record. */
660 		error = xfs_refcount_free_cow_extent(tp->t_mountp, &dfops,
661 				del.br_startblock, del.br_blockcount);
662 		if (error)
663 			goto out_defer;
664 
665 		/* Map the new blocks into the data fork. */
666 		error = xfs_bmap_map_extent(tp->t_mountp, &dfops, ip, &del);
667 		if (error)
668 			goto out_defer;
669 
670 		/* Remove the mapping from the CoW fork. */
671 		xfs_bmap_del_extent_cow(ip, &idx, &got, &del);
672 
673 		error = xfs_defer_finish(&tp, &dfops, ip);
674 		if (error)
675 			goto out_defer;
676 next_extent:
677 		if (!xfs_iext_get_extent(ifp, idx, &got))
678 			break;
679 	}
680 
681 	error = xfs_trans_commit(tp);
682 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
683 	if (error)
684 		goto out;
685 	return 0;
686 
687 out_defer:
688 	xfs_defer_cancel(&dfops);
689 	xfs_trans_cancel(tp);
690 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
691 out:
692 	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
693 	return error;
694 }
695 
696 /*
697  * Free leftover CoW reservations that didn't get cleaned out.
698  */
699 int
700 xfs_reflink_recover_cow(
701 	struct xfs_mount	*mp)
702 {
703 	xfs_agnumber_t		agno;
704 	int			error = 0;
705 
706 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
707 		return 0;
708 
709 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
710 		error = xfs_refcount_recover_cow_leftovers(mp, agno);
711 		if (error)
712 			break;
713 	}
714 
715 	return error;
716 }
717 
718 /*
719  * Reflinking (Block) Ranges of Two Files Together
720  *
721  * First, ensure that the reflink flag is set on both inodes.  The flag is an
722  * optimization to avoid unnecessary refcount btree lookups in the write path.
723  *
724  * Now we can iteratively remap the range of extents (and holes) in src to the
725  * corresponding ranges in dest.  Let drange and srange denote the ranges of
726  * logical blocks in dest and src touched by the reflink operation.
727  *
728  * While the length of drange is greater than zero,
729  *    - Read src's bmbt at the start of srange ("imap")
730  *    - If imap doesn't exist, make imap appear to start at the end of srange
731  *      with zero length.
732  *    - If imap starts before srange, advance imap to start at srange.
733  *    - If imap goes beyond srange, truncate imap to end at the end of srange.
734  *    - Punch (imap start - srange start + imap len) blocks from dest at
735  *      offset (drange start).
736  *    - If imap points to a real range of pblks,
737  *         > Increase the refcount of the imap's pblks
738  *         > Map imap's pblks into dest at the offset
739  *           (drange start + imap start - srange start)
740  *    - Advance drange and srange by (imap start - srange start + imap len)
741  *
742  * Finally, if the reflink made dest longer, update both the in-core and
743  * on-disk file sizes.
744  *
745  * ASCII Art Demonstration:
746  *
747  * Let's say we want to reflink this source file:
748  *
749  * ----SSSSSSS-SSSSS----SSSSSS (src file)
750  *   <-------------------->
751  *
752  * into this destination file:
753  *
754  * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
755  *        <-------------------->
756  * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
757  * Observe that the range has different logical offsets in either file.
758  *
759  * Consider that the first extent in the source file doesn't line up with our
760  * reflink range.  Unmapping and remapping are separate operations, so we can
761  * unmap more blocks from the destination file than we remap.
762  *
763  * ----SSSSSSS-SSSSS----SSSSSS
764  *   <------->
765  * --DDDDD---------DDDDD--DDD
766  *        <------->
767  *
768  * Now remap the source extent into the destination file:
769  *
770  * ----SSSSSSS-SSSSS----SSSSSS
771  *   <------->
772  * --DDDDD--SSSSSSSDDDDD--DDD
773  *        <------->
774  *
775  * Do likewise with the second hole and extent in our range.  Holes in the
776  * unmap range don't affect our operation.
777  *
778  * ----SSSSSSS-SSSSS----SSSSSS
779  *            <---->
780  * --DDDDD--SSSSSSS-SSSSS-DDD
781  *                 <---->
782  *
783  * Finally, unmap and remap part of the third extent.  This will increase the
784  * size of the destination file.
785  *
786  * ----SSSSSSS-SSSSS----SSSSSS
787  *                  <----->
788  * --DDDDD--SSSSSSS-SSSSS----SSS
789  *                       <----->
790  *
791  * Once we update the destination file's i_size, we're done.
792  */
793 
794 /*
795  * Ensure the reflink bit is set in both inodes.
796  */
797 STATIC int
798 xfs_reflink_set_inode_flag(
799 	struct xfs_inode	*src,
800 	struct xfs_inode	*dest)
801 {
802 	struct xfs_mount	*mp = src->i_mount;
803 	int			error;
804 	struct xfs_trans	*tp;
805 
806 	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
807 		return 0;
808 
809 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
810 	if (error)
811 		goto out_error;
812 
813 	/* Lock both files against IO */
814 	if (src->i_ino == dest->i_ino)
815 		xfs_ilock(src, XFS_ILOCK_EXCL);
816 	else
817 		xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL);
818 
819 	if (!xfs_is_reflink_inode(src)) {
820 		trace_xfs_reflink_set_inode_flag(src);
821 		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
822 		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
823 		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
824 		xfs_ifork_init_cow(src);
825 	} else
826 		xfs_iunlock(src, XFS_ILOCK_EXCL);
827 
828 	if (src->i_ino == dest->i_ino)
829 		goto commit_flags;
830 
831 	if (!xfs_is_reflink_inode(dest)) {
832 		trace_xfs_reflink_set_inode_flag(dest);
833 		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
834 		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
835 		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
836 		xfs_ifork_init_cow(dest);
837 	} else
838 		xfs_iunlock(dest, XFS_ILOCK_EXCL);
839 
840 commit_flags:
841 	error = xfs_trans_commit(tp);
842 	if (error)
843 		goto out_error;
844 	return error;
845 
846 out_error:
847 	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
848 	return error;
849 }
850 
851 /*
852  * Update destination inode size & cowextsize hint, if necessary.
853  */
854 STATIC int
855 xfs_reflink_update_dest(
856 	struct xfs_inode	*dest,
857 	xfs_off_t		newlen,
858 	xfs_extlen_t		cowextsize)
859 {
860 	struct xfs_mount	*mp = dest->i_mount;
861 	struct xfs_trans	*tp;
862 	int			error;
863 
864 	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
865 		return 0;
866 
867 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
868 	if (error)
869 		goto out_error;
870 
871 	xfs_ilock(dest, XFS_ILOCK_EXCL);
872 	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
873 
874 	if (newlen > i_size_read(VFS_I(dest))) {
875 		trace_xfs_reflink_update_inode_size(dest, newlen);
876 		i_size_write(VFS_I(dest), newlen);
877 		dest->i_d.di_size = newlen;
878 	}
879 
880 	if (cowextsize) {
881 		dest->i_d.di_cowextsize = cowextsize;
882 		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
883 	}
884 
885 	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
886 
887 	error = xfs_trans_commit(tp);
888 	if (error)
889 		goto out_error;
890 	return error;
891 
892 out_error:
893 	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
894 	return error;
895 }
896 
897 /*
898  * Do we have enough reserve in this AG to handle a reflink?  The refcount
899  * btree already reserved all the space it needs, but the rmap btree can grow
900  * infinitely, so we won't allow more reflinks when the AG is down to the
901  * btree reserves.
902  */
903 static int
904 xfs_reflink_ag_has_free_space(
905 	struct xfs_mount	*mp,
906 	xfs_agnumber_t		agno)
907 {
908 	struct xfs_perag	*pag;
909 	int			error = 0;
910 
911 	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
912 		return 0;
913 
914 	pag = xfs_perag_get(mp, agno);
915 	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) ||
916 	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
917 		error = -ENOSPC;
918 	xfs_perag_put(pag);
919 	return error;
920 }
921 
922 /*
923  * Unmap a range of blocks from a file, then map other blocks into the hole.
924  * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
925  * The extent irec is mapped into dest at irec->br_startoff.
926  */
927 STATIC int
928 xfs_reflink_remap_extent(
929 	struct xfs_inode	*ip,
930 	struct xfs_bmbt_irec	*irec,
931 	xfs_fileoff_t		destoff,
932 	xfs_off_t		new_isize)
933 {
934 	struct xfs_mount	*mp = ip->i_mount;
935 	struct xfs_trans	*tp;
936 	xfs_fsblock_t		firstfsb;
937 	unsigned int		resblks;
938 	struct xfs_defer_ops	dfops;
939 	struct xfs_bmbt_irec	uirec;
940 	bool			real_extent;
941 	xfs_filblks_t		rlen;
942 	xfs_filblks_t		unmap_len;
943 	xfs_off_t		newlen;
944 	int			error;
945 
946 	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
947 	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
948 
949 	/* Only remap normal extents. */
950 	real_extent =  (irec->br_startblock != HOLESTARTBLOCK &&
951 			irec->br_startblock != DELAYSTARTBLOCK &&
952 			!ISUNWRITTEN(irec));
953 
954 	/* No reflinking if we're low on space */
955 	if (real_extent) {
956 		error = xfs_reflink_ag_has_free_space(mp,
957 				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
958 		if (error)
959 			goto out;
960 	}
961 
962 	/* Start a rolling transaction to switch the mappings */
963 	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
964 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
965 	if (error)
966 		goto out;
967 
968 	xfs_ilock(ip, XFS_ILOCK_EXCL);
969 	xfs_trans_ijoin(tp, ip, 0);
970 
971 	/* If we're not just clearing space, then do we have enough quota? */
972 	if (real_extent) {
973 		error = xfs_trans_reserve_quota_nblks(tp, ip,
974 				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
975 		if (error)
976 			goto out_cancel;
977 	}
978 
979 	trace_xfs_reflink_remap(ip, irec->br_startoff,
980 				irec->br_blockcount, irec->br_startblock);
981 
982 	/* Unmap the old blocks in the data fork. */
983 	rlen = unmap_len;
984 	while (rlen) {
985 		xfs_defer_init(&dfops, &firstfsb);
986 		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
987 				&firstfsb, &dfops);
988 		if (error)
989 			goto out_defer;
990 
991 		/*
992 		 * Trim the extent to whatever got unmapped.
993 		 * Remember, bunmapi works backwards.
994 		 */
995 		uirec.br_startblock = irec->br_startblock + rlen;
996 		uirec.br_startoff = irec->br_startoff + rlen;
997 		uirec.br_blockcount = unmap_len - rlen;
998 		unmap_len = rlen;
999 
1000 		/* If this isn't a real mapping, we're done. */
1001 		if (!real_extent || uirec.br_blockcount == 0)
1002 			goto next_extent;
1003 
1004 		trace_xfs_reflink_remap(ip, uirec.br_startoff,
1005 				uirec.br_blockcount, uirec.br_startblock);
1006 
1007 		/* Update the refcount tree */
1008 		error = xfs_refcount_increase_extent(mp, &dfops, &uirec);
1009 		if (error)
1010 			goto out_defer;
1011 
1012 		/* Map the new blocks into the data fork. */
1013 		error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec);
1014 		if (error)
1015 			goto out_defer;
1016 
1017 		/* Update quota accounting. */
1018 		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
1019 				uirec.br_blockcount);
1020 
1021 		/* Update dest isize if needed. */
1022 		newlen = XFS_FSB_TO_B(mp,
1023 				uirec.br_startoff + uirec.br_blockcount);
1024 		newlen = min_t(xfs_off_t, newlen, new_isize);
1025 		if (newlen > i_size_read(VFS_I(ip))) {
1026 			trace_xfs_reflink_update_inode_size(ip, newlen);
1027 			i_size_write(VFS_I(ip), newlen);
1028 			ip->i_d.di_size = newlen;
1029 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1030 		}
1031 
1032 next_extent:
1033 		/* Process all the deferred stuff. */
1034 		error = xfs_defer_finish(&tp, &dfops, ip);
1035 		if (error)
1036 			goto out_defer;
1037 	}
1038 
1039 	error = xfs_trans_commit(tp);
1040 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1041 	if (error)
1042 		goto out;
1043 	return 0;
1044 
1045 out_defer:
1046 	xfs_defer_cancel(&dfops);
1047 out_cancel:
1048 	xfs_trans_cancel(tp);
1049 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1050 out:
1051 	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1052 	return error;
1053 }
1054 
1055 /*
1056  * Iteratively remap one file's extents (and holes) to another's.
1057  */
1058 STATIC int
1059 xfs_reflink_remap_blocks(
1060 	struct xfs_inode	*src,
1061 	xfs_fileoff_t		srcoff,
1062 	struct xfs_inode	*dest,
1063 	xfs_fileoff_t		destoff,
1064 	xfs_filblks_t		len,
1065 	xfs_off_t		new_isize)
1066 {
1067 	struct xfs_bmbt_irec	imap;
1068 	int			nimaps;
1069 	int			error = 0;
1070 	xfs_filblks_t		range_len;
1071 
1072 	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
1073 	while (len) {
1074 		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
1075 				dest, destoff);
1076 		/* Read extent from the source file */
1077 		nimaps = 1;
1078 		xfs_ilock(src, XFS_ILOCK_EXCL);
1079 		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
1080 		xfs_iunlock(src, XFS_ILOCK_EXCL);
1081 		if (error)
1082 			goto err;
1083 		ASSERT(nimaps == 1);
1084 
1085 		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
1086 				&imap);
1087 
1088 		/* Translate imap into the destination file. */
1089 		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
1090 		imap.br_startoff += destoff - srcoff;
1091 
1092 		/* Clear dest from destoff to the end of imap and map it in. */
1093 		error = xfs_reflink_remap_extent(dest, &imap, destoff,
1094 				new_isize);
1095 		if (error)
1096 			goto err;
1097 
1098 		if (fatal_signal_pending(current)) {
1099 			error = -EINTR;
1100 			goto err;
1101 		}
1102 
1103 		/* Advance drange/srange */
1104 		srcoff += range_len;
1105 		destoff += range_len;
1106 		len -= range_len;
1107 	}
1108 
1109 	return 0;
1110 
1111 err:
1112 	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
1113 	return error;
1114 }
1115 
1116 /*
1117  * Link a range of blocks from one file to another.
1118  */
1119 int
1120 xfs_reflink_remap_range(
1121 	struct file		*file_in,
1122 	loff_t			pos_in,
1123 	struct file		*file_out,
1124 	loff_t			pos_out,
1125 	u64			len,
1126 	bool			is_dedupe)
1127 {
1128 	struct inode		*inode_in = file_inode(file_in);
1129 	struct xfs_inode	*src = XFS_I(inode_in);
1130 	struct inode		*inode_out = file_inode(file_out);
1131 	struct xfs_inode	*dest = XFS_I(inode_out);
1132 	struct xfs_mount	*mp = src->i_mount;
1133 	bool			same_inode = (inode_in == inode_out);
1134 	xfs_fileoff_t		sfsbno, dfsbno;
1135 	xfs_filblks_t		fsblen;
1136 	xfs_extlen_t		cowextsize;
1137 	ssize_t			ret;
1138 
1139 	if (!xfs_sb_version_hasreflink(&mp->m_sb))
1140 		return -EOPNOTSUPP;
1141 
1142 	if (XFS_FORCED_SHUTDOWN(mp))
1143 		return -EIO;
1144 
1145 	/* Lock both files against IO */
1146 	lock_two_nondirectories(inode_in, inode_out);
1147 	if (same_inode)
1148 		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
1149 	else
1150 		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
1151 
1152 	/* Check file eligibility and prepare for block sharing. */
1153 	ret = -EINVAL;
1154 	/* Don't reflink realtime inodes */
1155 	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
1156 		goto out_unlock;
1157 
1158 	/* Don't share DAX file data for now. */
1159 	if (IS_DAX(inode_in) || IS_DAX(inode_out))
1160 		goto out_unlock;
1161 
1162 	ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
1163 			&len, is_dedupe);
1164 	if (ret <= 0)
1165 		goto out_unlock;
1166 
1167 	trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
1168 
1169 	/* Set flags and remap blocks. */
1170 	ret = xfs_reflink_set_inode_flag(src, dest);
1171 	if (ret)
1172 		goto out_unlock;
1173 
1174 	dfsbno = XFS_B_TO_FSBT(mp, pos_out);
1175 	sfsbno = XFS_B_TO_FSBT(mp, pos_in);
1176 	fsblen = XFS_B_TO_FSB(mp, len);
1177 	ret = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
1178 			pos_out + len);
1179 	if (ret)
1180 		goto out_unlock;
1181 
1182 	/* Zap any page cache for the destination file's range. */
1183 	truncate_inode_pages_range(&inode_out->i_data, pos_out,
1184 				   PAGE_ALIGN(pos_out + len) - 1);
1185 
1186 	/*
1187 	 * Carry the cowextsize hint from src to dest if we're sharing the
1188 	 * entire source file to the entire destination file, the source file
1189 	 * has a cowextsize hint, and the destination file does not.
1190 	 */
1191 	cowextsize = 0;
1192 	if (pos_in == 0 && len == i_size_read(inode_in) &&
1193 	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
1194 	    pos_out == 0 && len >= i_size_read(inode_out) &&
1195 	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
1196 		cowextsize = src->i_d.di_cowextsize;
1197 
1198 	ret = xfs_reflink_update_dest(dest, pos_out + len, cowextsize);
1199 
1200 out_unlock:
1201 	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
1202 	if (!same_inode)
1203 		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
1204 	unlock_two_nondirectories(inode_in, inode_out);
1205 	if (ret)
1206 		trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
1207 	return ret;
1208 }
1209 
1210 /*
1211  * The user wants to preemptively CoW all shared blocks in this file,
1212  * which enables us to turn off the reflink flag.  Iterate all
1213  * extents which are not prealloc/delalloc to see which ranges are
1214  * mentioned in the refcount tree, then read those blocks into the
1215  * pagecache, dirty them, fsync them back out, and then we can update
1216  * the inode flag.  What happens if we run out of memory? :)
1217  */
1218 STATIC int
1219 xfs_reflink_dirty_extents(
1220 	struct xfs_inode	*ip,
1221 	xfs_fileoff_t		fbno,
1222 	xfs_filblks_t		end,
1223 	xfs_off_t		isize)
1224 {
1225 	struct xfs_mount	*mp = ip->i_mount;
1226 	xfs_agnumber_t		agno;
1227 	xfs_agblock_t		agbno;
1228 	xfs_extlen_t		aglen;
1229 	xfs_agblock_t		rbno;
1230 	xfs_extlen_t		rlen;
1231 	xfs_off_t		fpos;
1232 	xfs_off_t		flen;
1233 	struct xfs_bmbt_irec	map[2];
1234 	int			nmaps;
1235 	int			error = 0;
1236 
1237 	while (end - fbno > 0) {
1238 		nmaps = 1;
1239 		/*
1240 		 * Look for extents in the file.  Skip holes, delalloc, or
1241 		 * unwritten extents; they can't be reflinked.
1242 		 */
1243 		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
1244 		if (error)
1245 			goto out;
1246 		if (nmaps == 0)
1247 			break;
1248 		if (map[0].br_startblock == HOLESTARTBLOCK ||
1249 		    map[0].br_startblock == DELAYSTARTBLOCK ||
1250 		    ISUNWRITTEN(&map[0]))
1251 			goto next;
1252 
1253 		map[1] = map[0];
1254 		while (map[1].br_blockcount) {
1255 			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
1256 			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
1257 			aglen = map[1].br_blockcount;
1258 
1259 			error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
1260 					&rbno, &rlen, true);
1261 			if (error)
1262 				goto out;
1263 			if (rbno == NULLAGBLOCK)
1264 				break;
1265 
1266 			/* Dirty the pages */
1267 			xfs_iunlock(ip, XFS_ILOCK_EXCL);
1268 			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
1269 					(rbno - agbno));
1270 			flen = XFS_FSB_TO_B(mp, rlen);
1271 			if (fpos + flen > isize)
1272 				flen = isize - fpos;
1273 			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
1274 					&xfs_iomap_ops);
1275 			xfs_ilock(ip, XFS_ILOCK_EXCL);
1276 			if (error)
1277 				goto out;
1278 
1279 			map[1].br_blockcount -= (rbno - agbno + rlen);
1280 			map[1].br_startoff += (rbno - agbno + rlen);
1281 			map[1].br_startblock += (rbno - agbno + rlen);
1282 		}
1283 
1284 next:
1285 		fbno = map[0].br_startoff + map[0].br_blockcount;
1286 	}
1287 out:
1288 	return error;
1289 }
1290 
1291 /* Clear the inode reflink flag if there are no shared extents. */
1292 int
1293 xfs_reflink_clear_inode_flag(
1294 	struct xfs_inode	*ip,
1295 	struct xfs_trans	**tpp)
1296 {
1297 	struct xfs_mount	*mp = ip->i_mount;
1298 	xfs_fileoff_t		fbno;
1299 	xfs_filblks_t		end;
1300 	xfs_agnumber_t		agno;
1301 	xfs_agblock_t		agbno;
1302 	xfs_extlen_t		aglen;
1303 	xfs_agblock_t		rbno;
1304 	xfs_extlen_t		rlen;
1305 	struct xfs_bmbt_irec	map;
1306 	int			nmaps;
1307 	int			error = 0;
1308 
1309 	ASSERT(xfs_is_reflink_inode(ip));
1310 
1311 	fbno = 0;
1312 	end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
1313 	while (end - fbno > 0) {
1314 		nmaps = 1;
1315 		/*
1316 		 * Look for extents in the file.  Skip holes, delalloc, or
1317 		 * unwritten extents; they can't be reflinked.
1318 		 */
1319 		error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
1320 		if (error)
1321 			return error;
1322 		if (nmaps == 0)
1323 			break;
1324 		if (map.br_startblock == HOLESTARTBLOCK ||
1325 		    map.br_startblock == DELAYSTARTBLOCK ||
1326 		    ISUNWRITTEN(&map))
1327 			goto next;
1328 
1329 		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
1330 		agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
1331 		aglen = map.br_blockcount;
1332 
1333 		error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
1334 				&rbno, &rlen, false);
1335 		if (error)
1336 			return error;
1337 		/* Is there still a shared block here? */
1338 		if (rbno != NULLAGBLOCK)
1339 			return 0;
1340 next:
1341 		fbno = map.br_startoff + map.br_blockcount;
1342 	}
1343 
1344 	/*
1345 	 * We didn't find any shared blocks so turn off the reflink flag.
1346 	 * First, get rid of any leftover CoW mappings.
1347 	 */
1348 	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
1349 	if (error)
1350 		return error;
1351 
1352 	/* Clear the inode flag. */
1353 	trace_xfs_reflink_unset_inode_flag(ip);
1354 	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
1355 	xfs_inode_clear_cowblocks_tag(ip);
1356 	xfs_trans_ijoin(*tpp, ip, 0);
1357 	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
1358 
1359 	return error;
1360 }
1361 
1362 /*
1363  * Clear the inode reflink flag if there are no shared extents and the size
1364  * hasn't changed.
1365  */
1366 STATIC int
1367 xfs_reflink_try_clear_inode_flag(
1368 	struct xfs_inode	*ip)
1369 {
1370 	struct xfs_mount	*mp = ip->i_mount;
1371 	struct xfs_trans	*tp;
1372 	int			error = 0;
1373 
1374 	/* Start a rolling transaction to remove the mappings */
1375 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
1376 	if (error)
1377 		return error;
1378 
1379 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1380 	xfs_trans_ijoin(tp, ip, 0);
1381 
1382 	error = xfs_reflink_clear_inode_flag(ip, &tp);
1383 	if (error)
1384 		goto cancel;
1385 
1386 	error = xfs_trans_commit(tp);
1387 	if (error)
1388 		goto out;
1389 
1390 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1391 	return 0;
1392 cancel:
1393 	xfs_trans_cancel(tp);
1394 out:
1395 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1396 	return error;
1397 }
1398 
1399 /*
1400  * Pre-COW all shared blocks within a given byte range of a file and turn off
1401  * the reflink flag if we unshare all of the file's blocks.
1402  */
1403 int
1404 xfs_reflink_unshare(
1405 	struct xfs_inode	*ip,
1406 	xfs_off_t		offset,
1407 	xfs_off_t		len)
1408 {
1409 	struct xfs_mount	*mp = ip->i_mount;
1410 	xfs_fileoff_t		fbno;
1411 	xfs_filblks_t		end;
1412 	xfs_off_t		isize;
1413 	int			error;
1414 
1415 	if (!xfs_is_reflink_inode(ip))
1416 		return 0;
1417 
1418 	trace_xfs_reflink_unshare(ip, offset, len);
1419 
1420 	inode_dio_wait(VFS_I(ip));
1421 
1422 	/* Try to CoW the selected ranges */
1423 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1424 	fbno = XFS_B_TO_FSBT(mp, offset);
1425 	isize = i_size_read(VFS_I(ip));
1426 	end = XFS_B_TO_FSB(mp, offset + len);
1427 	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
1428 	if (error)
1429 		goto out_unlock;
1430 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1431 
1432 	/* Wait for the IO to finish */
1433 	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1434 	if (error)
1435 		goto out;
1436 
1437 	/* Turn off the reflink flag if possible. */
1438 	error = xfs_reflink_try_clear_inode_flag(ip);
1439 	if (error)
1440 		goto out;
1441 
1442 	return 0;
1443 
1444 out_unlock:
1445 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1446 out:
1447 	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
1448 	return error;
1449 }
1450