10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0+
23993baebSDarrick J. Wong /*
33993baebSDarrick J. Wong * Copyright (C) 2016 Oracle. All Rights Reserved.
43993baebSDarrick J. Wong * Author: Darrick J. Wong <darrick.wong@oracle.com>
53993baebSDarrick J. Wong */
63993baebSDarrick J. Wong #include "xfs.h"
73993baebSDarrick J. Wong #include "xfs_fs.h"
83993baebSDarrick J. Wong #include "xfs_shared.h"
93993baebSDarrick J. Wong #include "xfs_format.h"
103993baebSDarrick J. Wong #include "xfs_log_format.h"
113993baebSDarrick J. Wong #include "xfs_trans_resv.h"
123993baebSDarrick J. Wong #include "xfs_mount.h"
133993baebSDarrick J. Wong #include "xfs_defer.h"
143993baebSDarrick J. Wong #include "xfs_inode.h"
153993baebSDarrick J. Wong #include "xfs_trans.h"
163993baebSDarrick J. Wong #include "xfs_bmap.h"
173993baebSDarrick J. Wong #include "xfs_bmap_util.h"
183993baebSDarrick J. Wong #include "xfs_trace.h"
193993baebSDarrick J. Wong #include "xfs_icache.h"
20174edb0eSDarrick J. Wong #include "xfs_btree.h"
213993baebSDarrick J. Wong #include "xfs_refcount_btree.h"
223993baebSDarrick J. Wong #include "xfs_refcount.h"
233993baebSDarrick J. Wong #include "xfs_bmap_btree.h"
243993baebSDarrick J. Wong #include "xfs_trans_space.h"
253993baebSDarrick J. Wong #include "xfs_bit.h"
263993baebSDarrick J. Wong #include "xfs_alloc.h"
273993baebSDarrick J. Wong #include "xfs_quota.h"
283993baebSDarrick J. Wong #include "xfs_reflink.h"
292a06705cSDarrick J. Wong #include "xfs_iomap.h"
309bbafc71SDave Chinner #include "xfs_ag.h"
316fa164b8SDarrick J. Wong #include "xfs_ag_resv.h"
323993baebSDarrick J. Wong
333993baebSDarrick J. Wong /*
343993baebSDarrick J. Wong * Copy on Write of Shared Blocks
353993baebSDarrick J. Wong *
363993baebSDarrick J. Wong * XFS must preserve "the usual" file semantics even when two files share
373993baebSDarrick J. Wong * the same physical blocks. This means that a write to one file must not
383993baebSDarrick J. Wong * alter the blocks in a different file; the way that we'll do that is
393993baebSDarrick J. Wong * through the use of a copy-on-write mechanism. At a high level, that
403993baebSDarrick J. Wong * means that when we want to write to a shared block, we allocate a new
413993baebSDarrick J. Wong * block, write the data to the new block, and if that succeeds we map the
423993baebSDarrick J. Wong * new block into the file.
433993baebSDarrick J. Wong *
443993baebSDarrick J. Wong * XFS provides a "delayed allocation" mechanism that defers the allocation
453993baebSDarrick J. Wong * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
463993baebSDarrick J. Wong * possible. This reduces fragmentation by enabling the filesystem to ask
473993baebSDarrick J. Wong * for bigger chunks less often, which is exactly what we want for CoW.
483993baebSDarrick J. Wong *
493993baebSDarrick J. Wong * The delalloc mechanism begins when the kernel wants to make a block
503993baebSDarrick J. Wong * writable (write_begin or page_mkwrite). If the offset is not mapped, we
513993baebSDarrick J. Wong * create a delalloc mapping, which is a regular in-core extent, but without
523993baebSDarrick J. Wong * a real startblock. (For delalloc mappings, the startblock encodes both
533993baebSDarrick J. Wong * a flag that this is a delalloc mapping, and a worst-case estimate of how
543993baebSDarrick J. Wong * many blocks might be required to put the mapping into the BMBT.) delalloc
553993baebSDarrick J. Wong * mappings are a reservation against the free space in the filesystem;
563993baebSDarrick J. Wong * adjacent mappings can also be combined into fewer larger mappings.
573993baebSDarrick J. Wong *
585eda4300SDarrick J. Wong * As an optimization, the CoW extent size hint (cowextsz) creates
595eda4300SDarrick J. Wong * outsized aligned delalloc reservations in the hope of landing out of
605eda4300SDarrick J. Wong * order nearby CoW writes in a single extent on disk, thereby reducing
615eda4300SDarrick J. Wong * fragmentation and improving future performance.
625eda4300SDarrick J. Wong *
635eda4300SDarrick J. Wong * D: --RRRRRRSSSRRRRRRRR--- (data fork)
645eda4300SDarrick J. Wong * C: ------DDDDDDD--------- (CoW fork)
655eda4300SDarrick J. Wong *
663993baebSDarrick J. Wong * When dirty pages are being written out (typically in writepage), the
675eda4300SDarrick J. Wong * delalloc reservations are converted into unwritten mappings by
685eda4300SDarrick J. Wong * allocating blocks and replacing the delalloc mapping with real ones.
695eda4300SDarrick J. Wong * A delalloc mapping can be replaced by several unwritten ones if the
705eda4300SDarrick J. Wong * free space is fragmented.
715eda4300SDarrick J. Wong *
725eda4300SDarrick J. Wong * D: --RRRRRRSSSRRRRRRRR---
735eda4300SDarrick J. Wong * C: ------UUUUUUU---------
743993baebSDarrick J. Wong *
753993baebSDarrick J. Wong * We want to adapt the delalloc mechanism for copy-on-write, since the
763993baebSDarrick J. Wong * write paths are similar. The first two steps (creating the reservation
773993baebSDarrick J. Wong * and allocating the blocks) are exactly the same as delalloc except that
783993baebSDarrick J. Wong * the mappings must be stored in a separate CoW fork because we do not want
793993baebSDarrick J. Wong * to disturb the mapping in the data fork until we're sure that the write
803993baebSDarrick J. Wong * succeeded. IO completion in this case is the process of removing the old
813993baebSDarrick J. Wong * mapping from the data fork and moving the new mapping from the CoW fork to
823993baebSDarrick J. Wong * the data fork. This will be discussed shortly.
833993baebSDarrick J. Wong *
843993baebSDarrick J. Wong * For now, unaligned directio writes will be bounced back to the page cache.
853993baebSDarrick J. Wong * Block-aligned directio writes will use the same mechanism as buffered
863993baebSDarrick J. Wong * writes.
873993baebSDarrick J. Wong *
885eda4300SDarrick J. Wong * Just prior to submitting the actual disk write requests, we convert
895eda4300SDarrick J. Wong * the extents representing the range of the file actually being written
905eda4300SDarrick J. Wong * (as opposed to extra pieces created for the cowextsize hint) to real
915eda4300SDarrick J. Wong * extents. This will become important in the next step:
925eda4300SDarrick J. Wong *
935eda4300SDarrick J. Wong * D: --RRRRRRSSSRRRRRRRR---
945eda4300SDarrick J. Wong * C: ------UUrrUUU---------
955eda4300SDarrick J. Wong *
963993baebSDarrick J. Wong * CoW remapping must be done after the data block write completes,
973993baebSDarrick J. Wong * because we don't want to destroy the old data fork map until we're sure
983993baebSDarrick J. Wong * the new block has been written. Since the new mappings are kept in a
993993baebSDarrick J. Wong * separate fork, we can simply iterate these mappings to find the ones
1003993baebSDarrick J. Wong * that cover the file blocks that we just CoW'd. For each extent, simply
1013993baebSDarrick J. Wong * unmap the corresponding range in the data fork, map the new range into
1025eda4300SDarrick J. Wong * the data fork, and remove the extent from the CoW fork. Because of
1035eda4300SDarrick J. Wong * the presence of the cowextsize hint, however, we must be careful
1045eda4300SDarrick J. Wong * only to remap the blocks that we've actually written out -- we must
1055eda4300SDarrick J. Wong * never remap delalloc reservations nor CoW staging blocks that have
1065eda4300SDarrick J. Wong * yet to be written. This corresponds exactly to the real extents in
1075eda4300SDarrick J. Wong * the CoW fork:
1085eda4300SDarrick J. Wong *
1095eda4300SDarrick J. Wong * D: --RRRRRRrrSRRRRRRRR---
1105eda4300SDarrick J. Wong * C: ------UU--UUU---------
1113993baebSDarrick J. Wong *
1123993baebSDarrick J. Wong * Since the remapping operation can be applied to an arbitrary file
1133993baebSDarrick J. Wong * range, we record the need for the remap step as a flag in the ioend
1143993baebSDarrick J. Wong * instead of declaring a new IO type. This is required for direct io
1153993baebSDarrick J. Wong * because we only have ioend for the whole dio, and we have to be able to
1163993baebSDarrick J. Wong * remember the presence of unwritten blocks and CoW blocks with a single
1173993baebSDarrick J. Wong * ioend structure. Better yet, the more ground we can cover with one
1183993baebSDarrick J. Wong * ioend, the better.
1193993baebSDarrick J. Wong */
1202a06705cSDarrick J. Wong
/*
 * Given an AG extent, find the lowest-numbered run of shared blocks
 * within that range and return the range in fbno/flen.  If
 * find_end_of_shared is true, return the longest contiguous extent of
 * shared blocks.  If there are no shared extents, fbno and flen will
 * be set to NULLAGBLOCK and 0, respectively.
 */
static int
xfs_reflink_find_shared(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	xfs_agblock_t		agbno,
	xfs_extlen_t		aglen,
	xfs_agblock_t		*fbno,
	xfs_extlen_t		*flen,
	bool			find_end_of_shared)
{
	struct xfs_buf		*agbp;
	struct xfs_btree_cur	*cur;
	int			error;

	/* Read the AGF so we can walk this AG's refcount btree. */
	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		return error;

	cur = xfs_refcountbt_init_cursor(pag->pag_mount, tp, agbp, pag);

	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
			find_end_of_shared);

	/* Tear down the cursor before releasing the AGF buffer it pins. */
	xfs_btree_del_cursor(cur, error);

	xfs_trans_brelse(tp, agbp);
	return error;
}
1562a06705cSDarrick J. Wong
/*
 * Trim the mapping to the next block where there's a change in the
 * shared/unshared status.  More specifically, this means that we
 * find the lowest-numbered extent of shared blocks that coincides with
 * the given block mapping.  If the shared extent overlaps the start of
 * the mapping, trim the mapping to the end of the shared extent.  If
 * the shared region intersects the mapping, trim the mapping to the
 * start of the shared extent.  If there are no shared regions that
 * overlap, just return the original extent.
 */
int
xfs_reflink_trim_around_shared(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*irec,
	bool			*shared)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	xfs_agblock_t		agbno;
	xfs_extlen_t		aglen;
	xfs_agblock_t		fbno;
	xfs_extlen_t		flen;
	int			error = 0;

	/* Holes, unwritten, and delalloc extents cannot be shared */
	if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) {
		*shared = false;
		return 0;
	}

	trace_xfs_reflink_trim_around_shared(ip, irec);

	/* Translate the mapping's start into AG-relative coordinates. */
	pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, irec->br_startblock));
	agbno = XFS_FSB_TO_AGBNO(mp, irec->br_startblock);
	aglen = irec->br_blockcount;

	error = xfs_reflink_find_shared(pag, NULL, agbno, aglen, &fbno, &flen,
			true);
	xfs_perag_put(pag);
	if (error)
		return error;

	*shared = false;
	if (fbno == NULLAGBLOCK) {
		/* No shared blocks at all. */
		return 0;
	}

	if (fbno == agbno) {
		/*
		 * The start of this extent is shared.  Truncate the
		 * mapping at the end of the shared region so that a
		 * subsequent iteration starts at the start of the
		 * unshared region.
		 */
		irec->br_blockcount = flen;
		*shared = true;
		return 0;
	}

	/*
	 * There's a shared extent midway through this extent.
	 * Truncate the mapping at the start of the shared
	 * extent so that a subsequent iteration starts at the
	 * start of the shared region.
	 */
	irec->br_blockcount = fbno - agbno;
	return 0;
}
2262a06705cSDarrick J. Wong
227aa124436Szhengbin int
xfs_bmap_trim_cow(struct xfs_inode * ip,struct xfs_bmbt_irec * imap,bool * shared)228aa124436Szhengbin xfs_bmap_trim_cow(
22966ae56a5SChristoph Hellwig struct xfs_inode *ip,
23066ae56a5SChristoph Hellwig struct xfs_bmbt_irec *imap,
23166ae56a5SChristoph Hellwig bool *shared)
23266ae56a5SChristoph Hellwig {
23366ae56a5SChristoph Hellwig /* We can't update any real extents in always COW mode. */
23466ae56a5SChristoph Hellwig if (xfs_is_always_cow_inode(ip) &&
23566ae56a5SChristoph Hellwig !isnullstartblock(imap->br_startblock)) {
23666ae56a5SChristoph Hellwig *shared = true;
23766ae56a5SChristoph Hellwig return 0;
23866ae56a5SChristoph Hellwig }
23966ae56a5SChristoph Hellwig
24066ae56a5SChristoph Hellwig /* Trim the mapping to the nearest shared extent boundary. */
24166ae56a5SChristoph Hellwig return xfs_reflink_trim_around_shared(ip, imap, shared);
24266ae56a5SChristoph Hellwig }
24366ae56a5SChristoph Hellwig
/*
 * Convert the unwritten CoW fork extents within [offset_fsb,
 * offset_fsb + count_fsb) to written (XFS_EXT_NORM) state.
 * Caller is expected to hold the inode locked for this update.
 */
static int
xfs_reflink_convert_cow_locked(
	struct xfs_inode	*ip,
	xfs_fileoff_t		offset_fsb,
	xfs_filblks_t		count_fsb)
{
	struct xfs_iext_cursor	icur;
	struct xfs_bmbt_irec	got;
	struct xfs_btree_cur	*dummy_cur = NULL;
	int			dummy_logflags;
	int			error = 0;

	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
		return 0;

	do {
		/* Stop once we've walked past the end of the range. */
		if (got.br_startoff >= offset_fsb + count_fsb)
			break;
		/*
		 * Already written -- skip it.  Note that "continue" jumps
		 * to the while condition, which advances the cursor.
		 */
		if (got.br_state == XFS_EXT_NORM)
			continue;
		/* Delalloc reservations should have been allocated by now. */
		if (WARN_ON_ONCE(isnullstartblock(got.br_startblock)))
			return -EIO;

		xfs_trim_extent(&got, offset_fsb, count_fsb);
		if (!got.br_blockcount)
			continue;

		got.br_state = XFS_EXT_NORM;
		error = xfs_bmap_add_extent_unwritten_real(NULL, ip,
				XFS_COW_FORK, &icur, &dummy_cur, &got,
				&dummy_logflags);
		if (error)
			return error;
	} while (xfs_iext_next_extent(ip->i_cowfp, &icur, &got));

	return error;
}
2815eda4300SDarrick J. Wong
2825eda4300SDarrick J. Wong /* Convert all of the unwritten CoW extents in a file's range to real ones. */
2835eda4300SDarrick J. Wong int
xfs_reflink_convert_cow(struct xfs_inode * ip,xfs_off_t offset,xfs_off_t count)2845eda4300SDarrick J. Wong xfs_reflink_convert_cow(
2855eda4300SDarrick J. Wong struct xfs_inode *ip,
2865eda4300SDarrick J. Wong xfs_off_t offset,
2875eda4300SDarrick J. Wong xfs_off_t count)
2885eda4300SDarrick J. Wong {
2895eda4300SDarrick J. Wong struct xfs_mount *mp = ip->i_mount;
2905eda4300SDarrick J. Wong xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset);
2915eda4300SDarrick J. Wong xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count);
292b121459cSChristoph Hellwig xfs_filblks_t count_fsb = end_fsb - offset_fsb;
29326b91c72SChristoph Hellwig int error;
294b121459cSChristoph Hellwig
295b121459cSChristoph Hellwig ASSERT(count != 0);
2965eda4300SDarrick J. Wong
2975eda4300SDarrick J. Wong xfs_ilock(ip, XFS_ILOCK_EXCL);
29826b91c72SChristoph Hellwig error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
2995eda4300SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
3005eda4300SDarrick J. Wong return error;
3015eda4300SDarrick J. Wong }
3025eda4300SDarrick J. Wong
/*
 * Find the extent that maps the given range in the COW fork.  Even if the
 * extent is not shared we might have a preallocation for it in the COW
 * fork.  If so we use it rather than trigger a new allocation.
 */
static int
xfs_find_trim_cow_extent(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	bool			*found)
{
	xfs_fileoff_t		offset_fsb = imap->br_startoff;
	xfs_filblks_t		count_fsb = imap->br_blockcount;
	struct xfs_iext_cursor	icur;

	*found = false;

	/*
	 * If we don't find an overlapping extent, trim the range we need to
	 * allocate to fit the hole we found.
	 */
	if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, cmap))
		cmap->br_startoff = offset_fsb + count_fsb;
	if (cmap->br_startoff > offset_fsb) {
		/* CoW extent starts beyond imap; shrink imap to the gap. */
		xfs_trim_extent(imap, imap->br_startoff,
				cmap->br_startoff - imap->br_startoff);
		return xfs_bmap_trim_cow(ip, imap, shared);
	}

	*shared = true;
	if (isnullstartblock(cmap->br_startblock)) {
		/* Delalloc reservation; the caller must fill it in. */
		xfs_trim_extent(imap, cmap->br_startoff, cmap->br_blockcount);
		return 0;
	}

	/* real extent found - no need to allocate */
	xfs_trim_extent(cmap, offset_fsb, count_fsb);
	*found = true;
	return 0;
}
345df307077SDave Chinner
346d6211330SChandan Babu R static int
xfs_reflink_convert_unwritten(struct xfs_inode * ip,struct xfs_bmbt_irec * imap,struct xfs_bmbt_irec * cmap,bool convert_now)347d6211330SChandan Babu R xfs_reflink_convert_unwritten(
348d6211330SChandan Babu R struct xfs_inode *ip,
349d6211330SChandan Babu R struct xfs_bmbt_irec *imap,
350d6211330SChandan Babu R struct xfs_bmbt_irec *cmap,
351d6211330SChandan Babu R bool convert_now)
352d6211330SChandan Babu R {
353d6211330SChandan Babu R xfs_fileoff_t offset_fsb = imap->br_startoff;
354d6211330SChandan Babu R xfs_filblks_t count_fsb = imap->br_blockcount;
355d6211330SChandan Babu R int error;
356d6211330SChandan Babu R
357d6211330SChandan Babu R /*
358d6211330SChandan Babu R * cmap might larger than imap due to cowextsize hint.
359d6211330SChandan Babu R */
360d6211330SChandan Babu R xfs_trim_extent(cmap, offset_fsb, count_fsb);
361d6211330SChandan Babu R
362d6211330SChandan Babu R /*
363d6211330SChandan Babu R * COW fork extents are supposed to remain unwritten until we're ready
364d6211330SChandan Babu R * to initiate a disk write. For direct I/O we are going to write the
365d6211330SChandan Babu R * data and need the conversion, but for buffered writes we're done.
366d6211330SChandan Babu R */
367d6211330SChandan Babu R if (!convert_now || cmap->br_state == XFS_EXT_NORM)
368d6211330SChandan Babu R return 0;
369d6211330SChandan Babu R
370d6211330SChandan Babu R trace_xfs_reflink_convert_cow(ip, cmap);
371d6211330SChandan Babu R
372d6211330SChandan Babu R error = xfs_reflink_convert_cow_locked(ip, offset_fsb, count_fsb);
373d6211330SChandan Babu R if (!error)
374d6211330SChandan Babu R cmap->br_state = XFS_EXT_NORM;
375d6211330SChandan Babu R
376d6211330SChandan Babu R return error;
377d6211330SChandan Babu R }
378d6211330SChandan Babu R
/*
 * Allocate a real, unwritten CoW fork extent to cover a hole in the CoW
 * fork at imap->br_startoff.  The ILOCK is cycled (dropped, then retaken
 * exclusively, tracked via *lockmode) around the transaction allocation,
 * so the CoW fork is re-checked afterwards.
 */
static int
xfs_reflink_fill_cow_hole(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	xfs_filblks_t		resaligned;
	xfs_extlen_t		resblks;
	int			nimaps;
	int			error;
	bool			found;

	/* Size the block reservation to the cowextsize-aligned request. */
	resaligned = xfs_aligned_fsb_count(imap->br_startoff,
		imap->br_blockcount, xfs_get_cowextsz_hint(ip));
	resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);

	xfs_iunlock(ip, *lockmode);
	*lockmode = 0;

	error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, resblks, 0,
			false, &tp);
	if (error)
		return error;

	*lockmode = XFS_ILOCK_EXCL;

	/* Re-check the CoW fork; it may have changed while unlocked. */
	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error || !*shared)
		goto out_trans_cancel;

	if (found) {
		/* Someone else allocated for us; no transaction needed. */
		xfs_trans_cancel(tp);
		goto convert;
	}

	/* Allocate the entire reservation as unwritten blocks. */
	nimaps = 1;
	error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
			XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0, cmap,
			&nimaps);
	if (error)
		goto out_trans_cancel;

	/* Mark the inode as having CoW blocks for background reclaim. */
	xfs_inode_set_cowblocks_tag(ip);
	error = xfs_trans_commit(tp);
	if (error)
		return error;

convert:
	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
4390613f16cSDarrick J. Wong
/*
 * Replace delalloc reservations in the CoW fork with real, unwritten
 * extents, looping until the allocated range reaches imap->br_startoff.
 * The ILOCK is cycled (tracked via *lockmode) on every pass, so the CoW
 * fork is re-checked after each transaction allocation.
 */
static int
xfs_reflink_fill_delalloc(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			nimaps;
	int			error;
	bool			found;

	do {
		xfs_iunlock(ip, *lockmode);
		*lockmode = 0;

		error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, 0, 0,
				false, &tp);
		if (error)
			return error;

		*lockmode = XFS_ILOCK_EXCL;

		/* Re-check the CoW fork; it may have changed while unlocked. */
		error = xfs_find_trim_cow_extent(ip, imap, cmap, shared,
				&found);
		if (error || !*shared)
			goto out_trans_cancel;

		if (found) {
			/* A real extent appeared; nothing left to allocate. */
			xfs_trans_cancel(tp);
			break;
		}

		ASSERT(isnullstartblock(cmap->br_startblock) ||
		       cmap->br_startblock == DELAYSTARTBLOCK);

		/*
		 * Replace delalloc reservation with an unwritten extent.
		 */
		nimaps = 1;
		error = xfs_bmapi_write(tp, ip, cmap->br_startoff,
				cmap->br_blockcount,
				XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC, 0,
				cmap, &nimaps);
		if (error)
			goto out_trans_cancel;

		/* Mark the inode as having CoW blocks for reclaim. */
		xfs_inode_set_cowblocks_tag(ip);
		error = xfs_trans_commit(tp);
		if (error)
			return error;
	} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);

	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);

out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}
502d6211330SChandan Babu R
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
	struct xfs_inode	*ip,
	struct xfs_bmbt_irec	*imap,
	struct xfs_bmbt_irec	*cmap,
	bool			*shared,
	uint			*lockmode,
	bool			convert_now)
{
	int			error;
	bool			found;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	if (!ip->i_cowfp) {
		/* Lazily set up the CoW fork the first time we need it. */
		ASSERT(!xfs_is_reflink_inode(ip));
		xfs_ifork_init_cow(ip);
	}

	error = xfs_find_trim_cow_extent(ip, imap, cmap, shared, &found);
	if (error || !*shared)
		return error;

	/* CoW fork has a real extent */
	if (found)
		return xfs_reflink_convert_unwritten(ip, imap, cmap,
				convert_now);

	/*
	 * CoW fork does not have an extent and data extent is shared.
	 * Allocate a real extent in the CoW fork.
	 */
	if (cmap->br_startoff > imap->br_startoff)
		return xfs_reflink_fill_cow_hole(ip, imap, cmap, shared,
				lockmode, convert_now);

	/*
	 * CoW fork has a delalloc reservation. Replace it with a real extent.
	 * There may or may not be a data fork mapping.
	 */
	if (isnullstartblock(cmap->br_startblock) ||
	    cmap->br_startblock == DELAYSTARTBLOCK)
		return xfs_reflink_fill_delalloc(ip, imap, cmap, shared,
				lockmode, convert_now);

	/* Shouldn't get here. */
	ASSERT(0);
	return -EFSCORRUPTED;
}
552d6211330SChandan Babu R
553ef473667SDarrick J. Wong /*
5543802a345SChristoph Hellwig * Cancel CoW reservations for some block range of an inode.
5553802a345SChristoph Hellwig *
5563802a345SChristoph Hellwig * If cancel_real is true this function cancels all COW fork extents for the
5573802a345SChristoph Hellwig * inode; if cancel_real is false, real extents are not cleared.
558c5295c6aSDave Chinner *
559c5295c6aSDave Chinner * Caller must have already joined the inode to the current transaction. The
560c5295c6aSDave Chinner * inode will be joined to the transaction returned to the caller.
56143caeb18SDarrick J. Wong */
56243caeb18SDarrick J. Wong int
xfs_reflink_cancel_cow_blocks(struct xfs_inode * ip,struct xfs_trans ** tpp,xfs_fileoff_t offset_fsb,xfs_fileoff_t end_fsb,bool cancel_real)56343caeb18SDarrick J. Wong xfs_reflink_cancel_cow_blocks(
56443caeb18SDarrick J. Wong struct xfs_inode *ip,
56543caeb18SDarrick J. Wong struct xfs_trans **tpp,
56643caeb18SDarrick J. Wong xfs_fileoff_t offset_fsb,
5673802a345SChristoph Hellwig xfs_fileoff_t end_fsb,
5683802a345SChristoph Hellwig bool cancel_real)
56943caeb18SDarrick J. Wong {
570732436efSDarrick J. Wong struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
571df5ab1b5SChristoph Hellwig struct xfs_bmbt_irec got, del;
572b2b1712aSChristoph Hellwig struct xfs_iext_cursor icur;
573df5ab1b5SChristoph Hellwig int error = 0;
57443caeb18SDarrick J. Wong
/* Nothing to cancel if the inode carries no CoW fork data at all. */
57551d62690SChristoph Hellwig if (!xfs_inode_has_cow_data(ip))
57643caeb18SDarrick J. Wong return 0;
/*
 * Position the cursor on the last CoW extent starting before end_fsb;
 * if there is none, the requested range has nothing to cancel.
 */
57741caabd0SChristoph Hellwig if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
5783e0ee78fSChristoph Hellwig return 0;
5793e0ee78fSChristoph Hellwig
58041caabd0SChristoph Hellwig /* Walk backwards until we're out of the I/O range... */
58141caabd0SChristoph Hellwig while (got.br_startoff + got.br_blockcount > offset_fsb) {
/* Work on a copy clamped to [offset_fsb, end_fsb); @got stays whole. */
5823e0ee78fSChristoph Hellwig del = got;
5833e0ee78fSChristoph Hellwig xfs_trim_extent(&del, offset_fsb, end_fsb - offset_fsb);
58441caabd0SChristoph Hellwig
58541caabd0SChristoph Hellwig /* Extent delete may have bumped ext forward */
58641caabd0SChristoph Hellwig if (!del.br_blockcount) {
58741caabd0SChristoph Hellwig xfs_iext_prev(ifp, &icur);
58841caabd0SChristoph Hellwig goto next_extent;
58941caabd0SChristoph Hellwig }
59041caabd0SChristoph Hellwig
5913e0ee78fSChristoph Hellwig trace_xfs_reflink_cancel_cow(ip, &del);
5923e0ee78fSChristoph Hellwig
/*
 * Delalloc reservations have no on-disk presence; dropping the
 * incore extent releases the block reservation as well.
 */
5933e0ee78fSChristoph Hellwig if (isnullstartblock(del.br_startblock)) {
5943e0ee78fSChristoph Hellwig error = xfs_bmap_del_extent_delay(ip, XFS_COW_FORK,
595b2b1712aSChristoph Hellwig &icur, &got, &del);
59643caeb18SDarrick J. Wong if (error)
59743caeb18SDarrick J. Wong break;
/*
 * Real (allocated) CoW staging extents are only torn down when
 * unwritten, or when the caller asked for everything to go via
 * cancel_real.
 */
5983802a345SChristoph Hellwig } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) {
599692b6cddSDave Chinner ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER);
60043caeb18SDarrick J. Wong
601174edb0eSDarrick J. Wong /* Free the CoW orphan record. */
60274b4c5d4SDarrick J. Wong xfs_refcount_free_cow_extent(*tpp, del.br_startblock,
60374b4c5d4SDarrick J. Wong del.br_blockcount);
604174edb0eSDarrick J. Wong
/* Queue the staging blocks for freeing as a deferred op. */
6057dfee17bSDave Chinner error = xfs_free_extent_later(*tpp, del.br_startblock,
606b742d7b4SDave Chinner del.br_blockcount, NULL,
607b742d7b4SDave Chinner XFS_AG_RESV_NONE);
6087dfee17bSDave Chinner if (error)
6097dfee17bSDave Chinner break;
61043caeb18SDarrick J. Wong
61143caeb18SDarrick J. Wong /* Roll the transaction */
6129e28a242SBrian Foster error = xfs_defer_finish(tpp);
6139b1f4e98SBrian Foster if (error)
61443caeb18SDarrick J. Wong break;
61543caeb18SDarrick J. Wong
61643caeb18SDarrick J. Wong /* Remove the mapping from the CoW fork. */
617b2b1712aSChristoph Hellwig xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
6184b4c1326SDarrick J. Wong
6194b4c1326SDarrick J. Wong /* Remove the quota reservation */
62085546500SDarrick J. Wong error = xfs_quota_unreserve_blkres(ip,
62185546500SDarrick J. Wong del.br_blockcount);
6224b4c1326SDarrick J. Wong if (error)
6234b4c1326SDarrick J. Wong break;
6249d40fba8SDarrick J. Wong } else {
6259d40fba8SDarrick J. Wong /* Didn't do anything, push cursor back. */
6269d40fba8SDarrick J. Wong xfs_iext_prev(ifp, &icur);
62743caeb18SDarrick J. Wong }
62841caabd0SChristoph Hellwig next_extent:
62941caabd0SChristoph Hellwig if (!xfs_iext_get_extent(ifp, &icur, &got))
630c17a8ef4SBrian Foster break;
63143caeb18SDarrick J. Wong }
63243caeb18SDarrick J. Wong
63343caeb18SDarrick J. Wong /* clear tag if cow fork is emptied */
634c17a8ef4SBrian Foster if (!ifp->if_bytes)
635c17a8ef4SBrian Foster xfs_inode_clear_cowblocks_tag(ip);
63643caeb18SDarrick J. Wong return error;
63743caeb18SDarrick J. Wong }
63843caeb18SDarrick J. Wong
63943caeb18SDarrick J. Wong /*
6403802a345SChristoph Hellwig * Cancel CoW reservations for some byte range of an inode.
6413802a345SChristoph Hellwig *
6423802a345SChristoph Hellwig * If cancel_real is true this function cancels all COW fork extents for the
6433802a345SChristoph Hellwig * inode; if cancel_real is false, real extents are not cleared.
64443caeb18SDarrick J. Wong */
64543caeb18SDarrick J. Wong int
xfs_reflink_cancel_cow_range(struct xfs_inode * ip,xfs_off_t offset,xfs_off_t count,bool cancel_real)64643caeb18SDarrick J. Wong xfs_reflink_cancel_cow_range(
64743caeb18SDarrick J. Wong struct xfs_inode *ip,
64843caeb18SDarrick J. Wong xfs_off_t offset,
6493802a345SChristoph Hellwig xfs_off_t count,
6503802a345SChristoph Hellwig bool cancel_real)
65143caeb18SDarrick J. Wong {
65243caeb18SDarrick J. Wong struct xfs_trans *tp;
65343caeb18SDarrick J. Wong xfs_fileoff_t offset_fsb;
65443caeb18SDarrick J. Wong xfs_fileoff_t end_fsb;
65543caeb18SDarrick J. Wong int error;
65643caeb18SDarrick J. Wong
65743caeb18SDarrick J. Wong trace_xfs_reflink_cancel_cow_range(ip, offset, count);
65866ae56a5SChristoph Hellwig ASSERT(ip->i_cowfp);
65943caeb18SDarrick J. Wong
/*
 * Convert the byte range to filesystem blocks.  The start rounds down
 * (FSBT) and the end rounds up (FSB) so the whole byte range is
 * covered; count == NULLFILEOFF means "to end of file".
 */
66043caeb18SDarrick J. Wong offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
66143caeb18SDarrick J. Wong if (count == NULLFILEOFF)
66243caeb18SDarrick J. Wong end_fsb = NULLFILEOFF;
66343caeb18SDarrick J. Wong else
66443caeb18SDarrick J. Wong end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
66543caeb18SDarrick J. Wong
66643caeb18SDarrick J. Wong /* Start a rolling transaction to remove the mappings */
66743caeb18SDarrick J. Wong error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
66873d30d48SChristoph Hellwig 0, 0, 0, &tp);
66943caeb18SDarrick J. Wong if (error)
67043caeb18SDarrick J. Wong goto out;
67143caeb18SDarrick J. Wong
/* Join with lock_flags 0 so commit does not drop the ILOCK for us. */
67243caeb18SDarrick J. Wong xfs_ilock(ip, XFS_ILOCK_EXCL);
67343caeb18SDarrick J. Wong xfs_trans_ijoin(tp, ip, 0);
67443caeb18SDarrick J. Wong
67543caeb18SDarrick J. Wong /* Scrape out the old CoW reservations */
6763802a345SChristoph Hellwig error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb,
6773802a345SChristoph Hellwig cancel_real);
67843caeb18SDarrick J. Wong if (error)
67943caeb18SDarrick J. Wong goto out_cancel;
68043caeb18SDarrick J. Wong
68143caeb18SDarrick J. Wong error = xfs_trans_commit(tp);
68243caeb18SDarrick J. Wong
68343caeb18SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
68443caeb18SDarrick J. Wong return error;
68543caeb18SDarrick J. Wong
68643caeb18SDarrick J. Wong out_cancel:
68743caeb18SDarrick J. Wong xfs_trans_cancel(tp);
68843caeb18SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
68943caeb18SDarrick J. Wong out:
69043caeb18SDarrick J. Wong trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
69143caeb18SDarrick J. Wong return error;
69243caeb18SDarrick J. Wong }
69343caeb18SDarrick J. Wong
69443caeb18SDarrick J. Wong /*
695d6f215f3SDarrick J. Wong * Remap part of the CoW fork into the data fork.
696d6f215f3SDarrick J. Wong *
697d6f215f3SDarrick J. Wong * We aim to remap the range starting at @offset_fsb and ending at @end_fsb
698d6f215f3SDarrick J. Wong * into the data fork; this function will remap what it can (at the end of the
699d6f215f3SDarrick J. Wong * range) and update @end_fsb appropriately. Each remap gets its own
700d6f215f3SDarrick J. Wong * transaction because we can end up merging and splitting bmbt blocks for
701d6f215f3SDarrick J. Wong * every remap operation and we'd like to keep the block reservation
702d6f215f3SDarrick J. Wong * requirements as low as possible.
70343caeb18SDarrick J. Wong */
704d6f215f3SDarrick J. Wong STATIC int
xfs_reflink_end_cow_extent(struct xfs_inode * ip,xfs_fileoff_t * offset_fsb,xfs_fileoff_t end_fsb)705d6f215f3SDarrick J. Wong xfs_reflink_end_cow_extent(
70643caeb18SDarrick J. Wong struct xfs_inode *ip,
707df2fd88fSDarrick J. Wong xfs_fileoff_t *offset_fsb,
708df2fd88fSDarrick J. Wong xfs_fileoff_t end_fsb)
70943caeb18SDarrick J. Wong {
710b2b1712aSChristoph Hellwig struct xfs_iext_cursor icur;
711df2fd88fSDarrick J. Wong struct xfs_bmbt_irec got, del, data;
712d6f215f3SDarrick J. Wong struct xfs_mount *mp = ip->i_mount;
713d6f215f3SDarrick J. Wong struct xfs_trans *tp;
714732436efSDarrick J. Wong struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK);
715d6f215f3SDarrick J. Wong unsigned int resblks;
716df2fd88fSDarrick J. Wong int nmaps;
717d6f215f3SDarrick J. Wong int error;
71843caeb18SDarrick J. Wong
/*
 * Reserve enough blocks for one extent insertion into the data fork;
 * XFS_TRANS_RESERVE lets this I/O-completion path dip into reserves.
 */
719d6f215f3SDarrick J. Wong resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
720d6f215f3SDarrick J. Wong error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
72173d30d48SChristoph Hellwig XFS_TRANS_RESERVE, &tp);
722d6f215f3SDarrick J. Wong if (error)
723d6f215f3SDarrick J. Wong return error;
72443caeb18SDarrick J. Wong
725fe0be23eSDarrick J. Wong /*
726d6f215f3SDarrick J. Wong * Lock the inode. We have to ijoin without automatic unlock because
727d6f215f3SDarrick J. Wong * the lead transaction is the refcountbt record deletion; the data
728d6f215f3SDarrick J. Wong * fork update follows as a deferred log item.
729fe0be23eSDarrick J. Wong */
73043caeb18SDarrick J. Wong xfs_ilock(ip, XFS_ILOCK_EXCL);
73143caeb18SDarrick J. Wong xfs_trans_ijoin(tp, ip, 0);
73243caeb18SDarrick J. Wong
/*
 * Make sure the data fork can absorb the new mapping, upgrading the
 * inode to large extent counters if the count would overflow.
 */
7335f1d5bbfSChandan Babu R error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK,
7345f1d5bbfSChandan Babu R XFS_IEXT_REFLINK_END_COW_CNT);
7354f86bb4bSChandan Babu R if (error == -EFBIG)
7364f86bb4bSChandan Babu R error = xfs_iext_count_upgrade(tp, ip,
7374f86bb4bSChandan Babu R XFS_IEXT_REFLINK_END_COW_CNT);
7385f1d5bbfSChandan Babu R if (error)
7395f1d5bbfSChandan Babu R goto out_cancel;
7405f1d5bbfSChandan Babu R
741e12199f8SChristoph Hellwig /*
742dc56015fSChristoph Hellwig * In case of racing, overlapping AIO writes no COW extents might be
743dc56015fSChristoph Hellwig * left by the time I/O completes for the loser of the race. In that
744dc56015fSChristoph Hellwig * case we are done.
745e12199f8SChristoph Hellwig */
746df2fd88fSDarrick J. Wong if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) ||
747df2fd88fSDarrick J. Wong got.br_startoff >= end_fsb) {
748df2fd88fSDarrick J. Wong *offset_fsb = end_fsb;
749e12199f8SChristoph Hellwig goto out_cancel;
750d6f215f3SDarrick J. Wong }
751c1112b6eSChristoph Hellwig
7525eda4300SDarrick J. Wong /*
753d6f215f3SDarrick J. Wong * Only remap real extents that contain data. With AIO, speculative
754d6f215f3SDarrick J. Wong * preallocations can leak into the range we are called upon, and we
755df2fd88fSDarrick J. Wong * need to skip them. Preserve @got for the eventual CoW fork
756df2fd88fSDarrick J. Wong * deletion; from now on @del represents the mapping that we're
757df2fd88fSDarrick J. Wong * actually remapping.
758d6f215f3SDarrick J. Wong */
759df2fd88fSDarrick J. Wong while (!xfs_bmap_is_written_extent(&got)) {
760df2fd88fSDarrick J. Wong if (!xfs_iext_next_extent(ifp, &icur, &got) ||
761df2fd88fSDarrick J. Wong got.br_startoff >= end_fsb) {
762df2fd88fSDarrick J. Wong *offset_fsb = end_fsb;
763d6f215f3SDarrick J. Wong goto out_cancel;
764d6f215f3SDarrick J. Wong }
765df2fd88fSDarrick J. Wong }
766df2fd88fSDarrick J. Wong del = got;
767767a94d8SChristoph Hellwig xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb);
7685eda4300SDarrick J. Wong
769df2fd88fSDarrick J. Wong /* Grab the corresponding mapping in the data fork. */
770df2fd88fSDarrick J. Wong nmaps = 1;
771df2fd88fSDarrick J. Wong error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data,
772df2fd88fSDarrick J. Wong &nmaps, 0);
77343caeb18SDarrick J. Wong if (error)
774c8eac49eSBrian Foster goto out_cancel;
77543caeb18SDarrick J. Wong
776df2fd88fSDarrick J. Wong /* We can only remap the smaller of the two extent sizes. */
777df2fd88fSDarrick J. Wong data.br_blockcount = min(data.br_blockcount, del.br_blockcount);
778df2fd88fSDarrick J. Wong del.br_blockcount = data.br_blockcount;
779df2fd88fSDarrick J. Wong
780df2fd88fSDarrick J. Wong trace_xfs_reflink_cow_remap_from(ip, &del);
781df2fd88fSDarrick J. Wong trace_xfs_reflink_cow_remap_to(ip, &data);
782df2fd88fSDarrick J. Wong
783df2fd88fSDarrick J. Wong if (xfs_bmap_is_real_extent(&data)) {
784df2fd88fSDarrick J. Wong /*
785df2fd88fSDarrick J. Wong * If the extent we're remapping is backed by storage (written
786df2fd88fSDarrick J. Wong * or not), unmap the extent and drop its refcount.
787df2fd88fSDarrick J. Wong */
788df2fd88fSDarrick J. Wong xfs_bmap_unmap_extent(tp, ip, &data);
789df2fd88fSDarrick J. Wong xfs_refcount_decrease_extent(tp, &data);
790df2fd88fSDarrick J. Wong xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
791df2fd88fSDarrick J. Wong -data.br_blockcount);
792df2fd88fSDarrick J. Wong } else if (data.br_startblock == DELAYSTARTBLOCK) {
793df2fd88fSDarrick J. Wong int done;
794df2fd88fSDarrick J. Wong
795df2fd88fSDarrick J. Wong /*
796df2fd88fSDarrick J. Wong * If the extent we're remapping is a delalloc reservation,
797df2fd88fSDarrick J. Wong * we can use the regular bunmapi function to release the
798df2fd88fSDarrick J. Wong * incore state. Dropping the delalloc reservation takes care
799df2fd88fSDarrick J. Wong * of the quota reservation for us.
800df2fd88fSDarrick J. Wong */
801df2fd88fSDarrick J. Wong error = xfs_bunmapi(NULL, ip, data.br_startoff,
802df2fd88fSDarrick J. Wong data.br_blockcount, 0, 1, &done);
803df2fd88fSDarrick J. Wong if (error)
804df2fd88fSDarrick J. Wong goto out_cancel;
805df2fd88fSDarrick J. Wong ASSERT(done);
806df2fd88fSDarrick J. Wong }
80743caeb18SDarrick J. Wong
808174edb0eSDarrick J. Wong /* Free the CoW orphan record. */
80974b4c5d4SDarrick J. Wong xfs_refcount_free_cow_extent(tp, del.br_startblock, del.br_blockcount);
810174edb0eSDarrick J. Wong
81143caeb18SDarrick J. Wong /* Map the new blocks into the data fork. */
8123e08f42aSDarrick J. Wong xfs_bmap_map_extent(tp, ip, &del);
81343caeb18SDarrick J. Wong
8144b4c1326SDarrick J. Wong /* Charge this new data fork mapping to the on-disk quota. */
8154b4c1326SDarrick J. Wong xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_DELBCOUNT,
8164b4c1326SDarrick J. Wong (long)del.br_blockcount);
8174b4c1326SDarrick J. Wong
81843caeb18SDarrick J. Wong /* Remove the mapping from the CoW fork. */
819b2b1712aSChristoph Hellwig xfs_bmap_del_extent_cow(ip, &icur, &got, &del);
82043caeb18SDarrick J. Wong
82143caeb18SDarrick J. Wong error = xfs_trans_commit(tp);
82243caeb18SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
82343caeb18SDarrick J. Wong if (error)
824d6f215f3SDarrick J. Wong return error;
825d6f215f3SDarrick J. Wong
826d6f215f3SDarrick J. Wong /* Update the caller about how much progress we made. */
827df2fd88fSDarrick J. Wong *offset_fsb = del.br_startoff + del.br_blockcount;
82843caeb18SDarrick J. Wong return 0;
82943caeb18SDarrick J. Wong
830e12199f8SChristoph Hellwig out_cancel:
83143caeb18SDarrick J. Wong xfs_trans_cancel(tp);
83243caeb18SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
833d6f215f3SDarrick J. Wong return error;
834d6f215f3SDarrick J. Wong }
835d6f215f3SDarrick J. Wong
836d6f215f3SDarrick J. Wong /*
837d6f215f3SDarrick J. Wong * Remap parts of a file's data fork after a successful CoW.
838d6f215f3SDarrick J. Wong */
839d6f215f3SDarrick J. Wong int
xfs_reflink_end_cow(struct xfs_inode * ip,xfs_off_t offset,xfs_off_t count)840d6f215f3SDarrick J. Wong xfs_reflink_end_cow(
841d6f215f3SDarrick J. Wong struct xfs_inode *ip,
842d6f215f3SDarrick J. Wong xfs_off_t offset,
843d6f215f3SDarrick J. Wong xfs_off_t count)
844d6f215f3SDarrick J. Wong {
845d6f215f3SDarrick J. Wong xfs_fileoff_t offset_fsb;
846d6f215f3SDarrick J. Wong xfs_fileoff_t end_fsb;
847d6f215f3SDarrick J. Wong int error = 0;
848d6f215f3SDarrick J. Wong
849d6f215f3SDarrick J. Wong trace_xfs_reflink_end_cow(ip, offset, count);
850d6f215f3SDarrick J. Wong
/* Round the start down and the end up so the byte range is covered. */
851d6f215f3SDarrick J. Wong offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
852d6f215f3SDarrick J. Wong end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
853d6f215f3SDarrick J. Wong
854d6f215f3SDarrick J. Wong /*
855df2fd88fSDarrick J. Wong * Walk forwards until we've remapped the I/O range. The loop function
856d6f215f3SDarrick J. Wong * repeatedly cycles the ILOCK to allocate one transaction per remapped
857d6f215f3SDarrick J. Wong * extent.
858d6f215f3SDarrick J. Wong *
859b63da6c8SRandy Dunlap * If we're being called by writeback then the pages will still
860d6f215f3SDarrick J. Wong * have PageWriteback set, which prevents races with reflink remapping
861d6f215f3SDarrick J. Wong * and truncate. Reflink remapping prevents races with writeback by
862d6f215f3SDarrick J. Wong * taking the iolock and mmaplock before flushing the pages and
863d6f215f3SDarrick J. Wong * remapping, which means there won't be any further writeback or page
864d6f215f3SDarrick J. Wong * cache dirtying until the reflink completes.
865d6f215f3SDarrick J. Wong *
866d6f215f3SDarrick J. Wong * We should never have two threads issuing writeback for the same file
867d6f215f3SDarrick J. Wong * region. There are also post-eof checks in the writeback
868d6f215f3SDarrick J. Wong * preparation code so that we don't bother writing out pages that are
869d6f215f3SDarrick J. Wong * about to be truncated.
870d6f215f3SDarrick J. Wong *
871d6f215f3SDarrick J. Wong * If we're being called as part of directio write completion, the dio
872d6f215f3SDarrick J. Wong * count is still elevated, which reflink and truncate will wait for.
873d6f215f3SDarrick J. Wong * Reflink remapping takes the iolock and mmaplock and waits for
874d6f215f3SDarrick J. Wong * pending dio to finish, which should prevent any directio until the
875d6f215f3SDarrick J. Wong * remap completes. Multiple concurrent directio writes to the same
876d6f215f3SDarrick J. Wong * region are handled by end_cow processing only occurring for the
877d6f215f3SDarrick J. Wong * threads which succeed; the outcome of multiple overlapping direct
878d6f215f3SDarrick J. Wong * writes is not well defined anyway.
879d6f215f3SDarrick J. Wong *
880d6f215f3SDarrick J. Wong * It's possible that a buffered write and a direct write could collide
881d6f215f3SDarrick J. Wong * here (the buffered write stumbles in after the dio flushes and
882d6f215f3SDarrick J. Wong * invalidates the page cache and immediately queues writeback), but we
883d6f215f3SDarrick J. Wong * have never supported this 100%. If either disk write succeeds the
884d6f215f3SDarrick J. Wong * blocks will be remapped.
885d6f215f3SDarrick J. Wong */
/* Each iteration advances offset_fsb; the helper reports progress. */
886d6f215f3SDarrick J. Wong while (end_fsb > offset_fsb && !error)
887df2fd88fSDarrick J. Wong error = xfs_reflink_end_cow_extent(ip, &offset_fsb, end_fsb);
888d6f215f3SDarrick J. Wong
889d6f215f3SDarrick J. Wong if (error)
89043caeb18SDarrick J. Wong trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
89143caeb18SDarrick J. Wong return error;
89243caeb18SDarrick J. Wong }
893174edb0eSDarrick J. Wong
894174edb0eSDarrick J. Wong /*
8957993f1a4SDarrick J. Wong * Free all CoW staging blocks that are still referenced by the ondisk refcount
8967993f1a4SDarrick J. Wong * metadata. The ondisk metadata does not track which inode created the
8977993f1a4SDarrick J. Wong * staging extent, so callers must ensure that there are no cached inodes with
8987993f1a4SDarrick J. Wong * live CoW staging extents.
899174edb0eSDarrick J. Wong */
900174edb0eSDarrick J. Wong int
xfs_reflink_recover_cow(struct xfs_mount * mp)901174edb0eSDarrick J. Wong xfs_reflink_recover_cow(
902174edb0eSDarrick J. Wong struct xfs_mount *mp)
903174edb0eSDarrick J. Wong {
904934933c3SDave Chinner struct xfs_perag *pag;
905174edb0eSDarrick J. Wong xfs_agnumber_t agno;
906174edb0eSDarrick J. Wong int error = 0;
907174edb0eSDarrick J. Wong
/* Non-reflink filesystems cannot have CoW staging extents. */
90838c26bfdSDave Chinner if (!xfs_has_reflink(mp))
909174edb0eSDarrick J. Wong return 0;
910174edb0eSDarrick J. Wong
/*
 * Scan every AG for leftover CoW staging extents.  On error we must
 * drop the perag reference ourselves before breaking out, since
 * for_each_perag only releases it on a normal loop advance.
 */
911934933c3SDave Chinner for_each_perag(mp, agno, pag) {
912a81a0621SDave Chinner error = xfs_refcount_recover_cow_leftovers(mp, pag);
913934933c3SDave Chinner if (error) {
914c4d5660aSDave Chinner xfs_perag_rele(pag);
915174edb0eSDarrick J. Wong break;
916174edb0eSDarrick J. Wong }
917934933c3SDave Chinner }
918174edb0eSDarrick J. Wong
919174edb0eSDarrick J. Wong return error;
920174edb0eSDarrick J. Wong }
921862bb360SDarrick J. Wong
922862bb360SDarrick J. Wong /*
923862bb360SDarrick J. Wong * Reflinking (Block) Ranges of Two Files Together
924862bb360SDarrick J. Wong *
925862bb360SDarrick J. Wong * First, ensure that the reflink flag is set on both inodes. The flag is an
926862bb360SDarrick J. Wong * optimization to avoid unnecessary refcount btree lookups in the write path.
927862bb360SDarrick J. Wong *
928862bb360SDarrick J. Wong * Now we can iteratively remap the range of extents (and holes) in src to the
929862bb360SDarrick J. Wong * corresponding ranges in dest. Let drange and srange denote the ranges of
930862bb360SDarrick J. Wong * logical blocks in dest and src touched by the reflink operation.
931862bb360SDarrick J. Wong *
932862bb360SDarrick J. Wong * While the length of drange is greater than zero,
933862bb360SDarrick J. Wong * - Read src's bmbt at the start of srange ("imap")
934862bb360SDarrick J. Wong * - If imap doesn't exist, make imap appear to start at the end of srange
935862bb360SDarrick J. Wong * with zero length.
936862bb360SDarrick J. Wong * - If imap starts before srange, advance imap to start at srange.
937862bb360SDarrick J. Wong * - If imap goes beyond srange, truncate imap to end at the end of srange.
938862bb360SDarrick J. Wong * - Punch (imap start - srange start + imap len) blocks from dest at
939862bb360SDarrick J. Wong * offset (drange start).
940862bb360SDarrick J. Wong * - If imap points to a real range of pblks,
941862bb360SDarrick J. Wong * > Increase the refcount of the imap's pblks
942862bb360SDarrick J. Wong * > Map imap's pblks into dest at the offset
943862bb360SDarrick J. Wong * (drange start + imap start - srange start)
944862bb360SDarrick J. Wong * - Advance drange and srange by (imap start - srange start + imap len)
945862bb360SDarrick J. Wong *
946862bb360SDarrick J. Wong * Finally, if the reflink made dest longer, update both the in-core and
947862bb360SDarrick J. Wong * on-disk file sizes.
948862bb360SDarrick J. Wong *
949862bb360SDarrick J. Wong * ASCII Art Demonstration:
950862bb360SDarrick J. Wong *
951862bb360SDarrick J. Wong * Let's say we want to reflink this source file:
952862bb360SDarrick J. Wong *
953862bb360SDarrick J. Wong * ----SSSSSSS-SSSSS----SSSSSS (src file)
954862bb360SDarrick J. Wong * <-------------------->
955862bb360SDarrick J. Wong *
956862bb360SDarrick J. Wong * into this destination file:
957862bb360SDarrick J. Wong *
958862bb360SDarrick J. Wong * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
959862bb360SDarrick J. Wong * <-------------------->
960862bb360SDarrick J. Wong * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
961862bb360SDarrick J. Wong * Observe that the range has different logical offsets in either file.
962862bb360SDarrick J. Wong *
963862bb360SDarrick J. Wong * Consider that the first extent in the source file doesn't line up with our
964862bb360SDarrick J. Wong * reflink range. Unmapping and remapping are separate operations, so we can
965862bb360SDarrick J. Wong * unmap more blocks from the destination file than we remap.
966862bb360SDarrick J. Wong *
967862bb360SDarrick J. Wong * ----SSSSSSS-SSSSS----SSSSSS
968862bb360SDarrick J. Wong * <------->
969862bb360SDarrick J. Wong * --DDDDD---------DDDDD--DDD
970862bb360SDarrick J. Wong * <------->
971862bb360SDarrick J. Wong *
972862bb360SDarrick J. Wong * Now remap the source extent into the destination file:
973862bb360SDarrick J. Wong *
974862bb360SDarrick J. Wong * ----SSSSSSS-SSSSS----SSSSSS
975862bb360SDarrick J. Wong * <------->
976862bb360SDarrick J. Wong * --DDDDD--SSSSSSSDDDDD--DDD
977862bb360SDarrick J. Wong * <------->
978862bb360SDarrick J. Wong *
979862bb360SDarrick J. Wong * Do likewise with the second hole and extent in our range. Holes in the
980862bb360SDarrick J. Wong * unmap range don't affect our operation.
981862bb360SDarrick J. Wong *
982862bb360SDarrick J. Wong * ----SSSSSSS-SSSSS----SSSSSS
983862bb360SDarrick J. Wong * <---->
984862bb360SDarrick J. Wong * --DDDDD--SSSSSSS-SSSSS-DDD
985862bb360SDarrick J. Wong * <---->
986862bb360SDarrick J. Wong *
987862bb360SDarrick J. Wong * Finally, unmap and remap part of the third extent. This will increase the
988862bb360SDarrick J. Wong * size of the destination file.
989862bb360SDarrick J. Wong *
990862bb360SDarrick J. Wong * ----SSSSSSS-SSSSS----SSSSSS
991862bb360SDarrick J. Wong * <----->
992862bb360SDarrick J. Wong * --DDDDD--SSSSSSS-SSSSS----SSS
993862bb360SDarrick J. Wong * <----->
994862bb360SDarrick J. Wong *
995862bb360SDarrick J. Wong * Once we update the destination file's i_size, we're done.
996862bb360SDarrick J. Wong */
997862bb360SDarrick J. Wong
998862bb360SDarrick J. Wong /*
999862bb360SDarrick J. Wong * Ensure the reflink bit is set in both inodes.
1000862bb360SDarrick J. Wong */
1001862bb360SDarrick J. Wong STATIC int
xfs_reflink_set_inode_flag(struct xfs_inode * src,struct xfs_inode * dest)1002862bb360SDarrick J. Wong xfs_reflink_set_inode_flag(
1003862bb360SDarrick J. Wong struct xfs_inode *src,
1004862bb360SDarrick J. Wong struct xfs_inode *dest)
1005862bb360SDarrick J. Wong {
1006862bb360SDarrick J. Wong struct xfs_mount *mp = src->i_mount;
1007862bb360SDarrick J. Wong int error;
1008862bb360SDarrick J. Wong struct xfs_trans *tp;
1009862bb360SDarrick J. Wong
/* Fast path: both inodes already have the reflink flag set. */
1010862bb360SDarrick J. Wong if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
1011862bb360SDarrick J. Wong return 0;
1012862bb360SDarrick J. Wong
1013862bb360SDarrick J. Wong error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1014862bb360SDarrick J. Wong if (error)
1015862bb360SDarrick J. Wong goto out_error;
1016862bb360SDarrick J. Wong
1017862bb360SDarrick J. Wong /* Lock both files against IO */
1018862bb360SDarrick J. Wong if (src->i_ino == dest->i_ino)
1019862bb360SDarrick J. Wong xfs_ilock(src, XFS_ILOCK_EXCL)
1020862bb360SDarrick J. Wong else
10217c2d238aSDarrick J. Wong xfs_lock_two_inodes(src, XFS_ILOCK_EXCL, dest, XFS_ILOCK_EXCL);
1022862bb360SDarrick J. Wong
/*
 * Inodes that need the flag are joined with XFS_ILOCK_EXCL so the
 * commit drops their lock; ones that already have it are unlocked
 * immediately and never join the transaction.
 */
1023862bb360SDarrick J. Wong if (!xfs_is_reflink_inode(src)) {
1024862bb360SDarrick J. Wong trace_xfs_reflink_set_inode_flag(src);
1025862bb360SDarrick J. Wong xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
10263e09ab8fSChristoph Hellwig src->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1027862bb360SDarrick J. Wong xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
1028862bb360SDarrick J. Wong xfs_ifork_init_cow(src);
1029862bb360SDarrick J. Wong } else
1030862bb360SDarrick J. Wong xfs_iunlock(src, XFS_ILOCK_EXCL);
1031862bb360SDarrick J. Wong
/* Same inode: src handling above already covered it. */
1032862bb360SDarrick J. Wong if (src->i_ino == dest->i_ino)
1033862bb360SDarrick J. Wong goto commit_flags;
1034862bb360SDarrick J. Wong
1035862bb360SDarrick J. Wong if (!xfs_is_reflink_inode(dest)) {
1036862bb360SDarrick J. Wong trace_xfs_reflink_set_inode_flag(dest);
1037862bb360SDarrick J. Wong xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
10383e09ab8fSChristoph Hellwig dest->i_diflags2 |= XFS_DIFLAG2_REFLINK;
1039862bb360SDarrick J. Wong xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1040862bb360SDarrick J. Wong xfs_ifork_init_cow(dest);
1041862bb360SDarrick J. Wong } else
1042862bb360SDarrick J. Wong xfs_iunlock(dest, XFS_ILOCK_EXCL);
1043862bb360SDarrick J. Wong
1044862bb360SDarrick J. Wong commit_flags:
1045862bb360SDarrick J. Wong error = xfs_trans_commit(tp);
1046862bb360SDarrick J. Wong if (error)
1047862bb360SDarrick J. Wong goto out_error;
1048862bb360SDarrick J. Wong return error;
1049862bb360SDarrick J. Wong
1050862bb360SDarrick J. Wong out_error:
1051862bb360SDarrick J. Wong trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
1052862bb360SDarrick J. Wong return error;
1053862bb360SDarrick J. Wong }
1054862bb360SDarrick J. Wong
1055862bb360SDarrick J. Wong /*
1056f7ca3522SDarrick J. Wong * Update destination inode size & cowextsize hint, if necessary.
1057862bb360SDarrick J. Wong */
10583fc9f5e4SDarrick J. Wong int
xfs_reflink_update_dest(struct xfs_inode * dest,xfs_off_t newlen,xfs_extlen_t cowextsize,unsigned int remap_flags)1059862bb360SDarrick J. Wong xfs_reflink_update_dest(
1060862bb360SDarrick J. Wong struct xfs_inode *dest,
1061f7ca3522SDarrick J. Wong xfs_off_t newlen,
1062c5ecb423SChristoph Hellwig xfs_extlen_t cowextsize,
1063a91ae49bSDarrick J. Wong unsigned int remap_flags)
1064862bb360SDarrick J. Wong {
1065862bb360SDarrick J. Wong struct xfs_mount *mp = dest->i_mount;
1066862bb360SDarrick J. Wong struct xfs_trans *tp;
1067862bb360SDarrick J. Wong int error;
1068862bb360SDarrick J. Wong
/* No size growth and no cowextsize hint to set: nothing to log. */
1069bf4a1fcfSDarrick J. Wong if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
1070862bb360SDarrick J. Wong return 0;
1071862bb360SDarrick J. Wong
1072862bb360SDarrick J. Wong error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1073862bb360SDarrick J. Wong if (error)
1074862bb360SDarrick J. Wong goto out_error;
1075862bb360SDarrick J. Wong
1076862bb360SDarrick J. Wong xfs_ilock(dest, XFS_ILOCK_EXCL);
1077862bb360SDarrick J. Wong xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
1078862bb360SDarrick J. Wong
/* Extend both the VFS and on-disk file sizes if the remap grew dest. */
1079f7ca3522SDarrick J. Wong if (newlen > i_size_read(VFS_I(dest))) {
1080862bb360SDarrick J. Wong trace_xfs_reflink_update_inode_size(dest, newlen);
1081862bb360SDarrick J. Wong i_size_write(VFS_I(dest), newlen);
108213d2c10bSChristoph Hellwig dest->i_disk_size = newlen;
1083f7ca3522SDarrick J. Wong }
1084f7ca3522SDarrick J. Wong
/* Propagate a CoW extent size hint to the destination, if requested. */
1085f7ca3522SDarrick J. Wong if (cowextsize) {
1086b33ce57dSChristoph Hellwig dest->i_cowextsize = cowextsize;
10873e09ab8fSChristoph Hellwig dest->i_diflags2 |= XFS_DIFLAG2_COWEXTSIZE;
1088f7ca3522SDarrick J. Wong }
1089f7ca3522SDarrick J. Wong
1090862bb360SDarrick J. Wong xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
1091862bb360SDarrick J. Wong
1092862bb360SDarrick J. Wong error = xfs_trans_commit(tp);
1093862bb360SDarrick J. Wong if (error)
1094862bb360SDarrick J. Wong goto out_error;
1095862bb360SDarrick J. Wong return error;
1096862bb360SDarrick J. Wong
1097862bb360SDarrick J. Wong out_error:
1098862bb360SDarrick J. Wong trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
1099862bb360SDarrick J. Wong return error;
1100862bb360SDarrick J. Wong }
1101862bb360SDarrick J. Wong
1102862bb360SDarrick J. Wong /*
11036fa164b8SDarrick J. Wong * Do we have enough reserve in this AG to handle a reflink? The refcount
11046fa164b8SDarrick J. Wong * btree already reserved all the space it needs, but the rmap btree can grow
11056fa164b8SDarrick J. Wong * infinitely, so we won't allow more reflinks when the AG is down to the
11066fa164b8SDarrick J. Wong * btree reserves.
11076fa164b8SDarrick J. Wong */
11086fa164b8SDarrick J. Wong static int
xfs_reflink_ag_has_free_space(struct xfs_mount * mp,xfs_agnumber_t agno)11096fa164b8SDarrick J. Wong xfs_reflink_ag_has_free_space(
11106fa164b8SDarrick J. Wong struct xfs_mount *mp,
11116fa164b8SDarrick J. Wong xfs_agnumber_t agno)
11126fa164b8SDarrick J. Wong {
11136fa164b8SDarrick J. Wong struct xfs_perag *pag;
11146fa164b8SDarrick J. Wong int error = 0;
11156fa164b8SDarrick J. Wong
/* Without the rmap btree there is no unbounded reservation to protect. */
111638c26bfdSDave Chinner if (!xfs_has_rmapbt(mp))
11176fa164b8SDarrick J. Wong return 0;
11186fa164b8SDarrick J. Wong
/*
 * Take a perag reference just long enough to check whether either the
 * rmapbt or metadata reservation is critically low; ENOSPC vetoes the
 * reflink.
 */
11196fa164b8SDarrick J. Wong pag = xfs_perag_get(mp, agno);
112021592863SBrian Foster if (xfs_ag_resv_critical(pag, XFS_AG_RESV_RMAPBT) ||
11216fa164b8SDarrick J. Wong xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
11226fa164b8SDarrick J. Wong error = -ENOSPC;
11236fa164b8SDarrick J. Wong xfs_perag_put(pag);
11246fa164b8SDarrick J. Wong return error;
11256fa164b8SDarrick J. Wong }
11266fa164b8SDarrick J. Wong
11276fa164b8SDarrick J. Wong /*
112800fd1d56SDarrick J. Wong * Remap the given extent into the file. The dmap blockcount will be set to
112900fd1d56SDarrick J. Wong * the number of blocks that were actually remapped.
1130862bb360SDarrick J. Wong */
1131862bb360SDarrick J. Wong STATIC int
xfs_reflink_remap_extent(struct xfs_inode * ip,struct xfs_bmbt_irec * dmap,xfs_off_t new_isize)1132862bb360SDarrick J. Wong xfs_reflink_remap_extent(
1133862bb360SDarrick J. Wong struct xfs_inode *ip,
113400fd1d56SDarrick J. Wong struct xfs_bmbt_irec *dmap,
1135862bb360SDarrick J. Wong xfs_off_t new_isize)
1136862bb360SDarrick J. Wong {
113700fd1d56SDarrick J. Wong struct xfs_bmbt_irec smap;
1138862bb360SDarrick J. Wong struct xfs_mount *mp = ip->i_mount;
1139862bb360SDarrick J. Wong struct xfs_trans *tp;
1140862bb360SDarrick J. Wong xfs_off_t newlen;
1141f273387bSDarrick J. Wong int64_t qdelta = 0;
114200fd1d56SDarrick J. Wong unsigned int resblks;
11434ca74205SDarrick J. Wong bool quota_reserved = true;
114400fd1d56SDarrick J. Wong bool smap_real;
114500fd1d56SDarrick J. Wong bool dmap_written = xfs_bmap_is_written_extent(dmap);
1146ee898d78SChandan Babu R int iext_delta = 0;
114700fd1d56SDarrick J. Wong int nimaps;
1148862bb360SDarrick J. Wong int error;
1149862bb360SDarrick J. Wong
1150f273387bSDarrick J. Wong /*
1151f273387bSDarrick J. Wong * Start a rolling transaction to switch the mappings.
1152f273387bSDarrick J. Wong *
1153f273387bSDarrick J. Wong * Adding a written extent to the extent map can cause a bmbt split,
1154f273387bSDarrick J. Wong * and removing a mapped extent from the extent can cause a bmbt split.
1155f273387bSDarrick J. Wong * The two operations cannot both cause a split since they operate on
1156f273387bSDarrick J. Wong * the same index in the bmap btree, so we only need a reservation for
1157f273387bSDarrick J. Wong * one bmbt split if either thing is happening. However, we haven't
1158f273387bSDarrick J. Wong * locked the inode yet, so we reserve assuming this is the case.
11594ca74205SDarrick J. Wong *
11604ca74205SDarrick J. Wong * The first allocation call tries to reserve enough space to handle
11614ca74205SDarrick J. Wong * mapping dmap into a sparse part of the file plus the bmbt split. We
11624ca74205SDarrick J. Wong * haven't locked the inode or read the existing mapping yet, so we do
11634ca74205SDarrick J. Wong * not know for sure that we need the space. This should succeed most
11644ca74205SDarrick J. Wong * of the time.
11654ca74205SDarrick J. Wong *
11664ca74205SDarrick J. Wong * If the first attempt fails, try again but reserving only enough
11674ca74205SDarrick J. Wong * space to handle a bmbt split. This is the hard minimum requirement,
11684ca74205SDarrick J. Wong * and we revisit quota reservations later when we know more about what
11694ca74205SDarrick J. Wong * we're remapping.
1170f273387bSDarrick J. Wong */
117100fd1d56SDarrick J. Wong resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
11724ca74205SDarrick J. Wong error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
11734ca74205SDarrick J. Wong resblks + dmap->br_blockcount, 0, false, &tp);
11744ca74205SDarrick J. Wong if (error == -EDQUOT || error == -ENOSPC) {
11754ca74205SDarrick J. Wong quota_reserved = false;
11764ca74205SDarrick J. Wong error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write,
11774ca74205SDarrick J. Wong resblks, 0, false, &tp);
11784ca74205SDarrick J. Wong }
1179862bb360SDarrick J. Wong if (error)
1180862bb360SDarrick J. Wong goto out;
1181862bb360SDarrick J. Wong
118283895227SDarrick J. Wong /*
118300fd1d56SDarrick J. Wong * Read what's currently mapped in the destination file into smap.
118400fd1d56SDarrick J. Wong * If smap isn't a hole, we will have to remove it before we can add
118500fd1d56SDarrick J. Wong * dmap to the destination file.
118683895227SDarrick J. Wong */
118700fd1d56SDarrick J. Wong nimaps = 1;
118800fd1d56SDarrick J. Wong error = xfs_bmapi_read(ip, dmap->br_startoff, dmap->br_blockcount,
118900fd1d56SDarrick J. Wong &smap, &nimaps, 0);
119000fd1d56SDarrick J. Wong if (error)
119100fd1d56SDarrick J. Wong goto out_cancel;
119200fd1d56SDarrick J. Wong ASSERT(nimaps == 1 && smap.br_startoff == dmap->br_startoff);
119300fd1d56SDarrick J. Wong smap_real = xfs_bmap_is_real_extent(&smap);
119400fd1d56SDarrick J. Wong
119500fd1d56SDarrick J. Wong /*
119600fd1d56SDarrick J. Wong * We can only remap as many blocks as the smaller of the two extent
119700fd1d56SDarrick J. Wong * maps, because we can only remap one extent at a time.
119800fd1d56SDarrick J. Wong */
119900fd1d56SDarrick J. Wong dmap->br_blockcount = min(dmap->br_blockcount, smap.br_blockcount);
120000fd1d56SDarrick J. Wong ASSERT(dmap->br_blockcount == smap.br_blockcount);
120100fd1d56SDarrick J. Wong
120200fd1d56SDarrick J. Wong trace_xfs_reflink_remap_extent_dest(ip, &smap);
120300fd1d56SDarrick J. Wong
1204168eae80SDarrick J. Wong /*
1205168eae80SDarrick J. Wong * Two extents mapped to the same physical block must not have
1206168eae80SDarrick J. Wong * different states; that's filesystem corruption. Move on to the next
1207168eae80SDarrick J. Wong * extent if they're both holes or both the same physical extent.
1208168eae80SDarrick J. Wong */
1209168eae80SDarrick J. Wong if (dmap->br_startblock == smap.br_startblock) {
1210168eae80SDarrick J. Wong if (dmap->br_state != smap.br_state)
1211168eae80SDarrick J. Wong error = -EFSCORRUPTED;
1212168eae80SDarrick J. Wong goto out_cancel;
1213168eae80SDarrick J. Wong }
1214168eae80SDarrick J. Wong
1215168eae80SDarrick J. Wong /* If both extents are unwritten, leave them alone. */
1216168eae80SDarrick J. Wong if (dmap->br_state == XFS_EXT_UNWRITTEN &&
1217168eae80SDarrick J. Wong smap.br_state == XFS_EXT_UNWRITTEN)
1218168eae80SDarrick J. Wong goto out_cancel;
1219168eae80SDarrick J. Wong
122000fd1d56SDarrick J. Wong /* No reflinking if the AG of the dest mapping is low on space. */
122100fd1d56SDarrick J. Wong if (dmap_written) {
122200fd1d56SDarrick J. Wong error = xfs_reflink_ag_has_free_space(mp,
122300fd1d56SDarrick J. Wong XFS_FSB_TO_AGNO(mp, dmap->br_startblock));
122400fd1d56SDarrick J. Wong if (error)
122500fd1d56SDarrick J. Wong goto out_cancel;
122600fd1d56SDarrick J. Wong }
122700fd1d56SDarrick J. Wong
122800fd1d56SDarrick J. Wong /*
1229f273387bSDarrick J. Wong * Increase quota reservation if we think the quota block counter for
123000fd1d56SDarrick J. Wong * this file could increase.
123100fd1d56SDarrick J. Wong *
123200fd1d56SDarrick J. Wong * If we are mapping a written extent into the file, we need to have
123300fd1d56SDarrick J. Wong * enough quota block count reservation to handle the blocks in that
123494b941fdSDarrick J. Wong * extent. We log only the delta to the quota block counts, so if the
123594b941fdSDarrick J. Wong * extent we're unmapping also has blocks allocated to it, we don't
123694b941fdSDarrick J. Wong * need a quota reservation for the extent itself.
123700fd1d56SDarrick J. Wong *
123800fd1d56SDarrick J. Wong * Note that if we're replacing a delalloc reservation with a written
123900fd1d56SDarrick J. Wong * extent, we have to take the full quota reservation because removing
124000fd1d56SDarrick J. Wong * the delalloc reservation gives the block count back to the quota
124100fd1d56SDarrick J. Wong * count. This is suboptimal, but the VFS flushed the dest range
124200fd1d56SDarrick J. Wong * before we started. That should have removed all the delalloc
124300fd1d56SDarrick J. Wong * reservations, but we code defensively.
1244766aabd5SDarrick J. Wong *
1245766aabd5SDarrick J. Wong * xfs_trans_alloc_inode above already tried to grab an even larger
1246766aabd5SDarrick J. Wong * quota reservation, and kicked off a blockgc scan if it couldn't.
1247766aabd5SDarrick J. Wong * If we can't get a potentially smaller quota reservation now, we're
1248766aabd5SDarrick J. Wong * done.
124900fd1d56SDarrick J. Wong */
12504ca74205SDarrick J. Wong if (!quota_reserved && !smap_real && dmap_written) {
1251f273387bSDarrick J. Wong error = xfs_trans_reserve_quota_nblks(tp, ip,
1252f273387bSDarrick J. Wong dmap->br_blockcount, 0, false);
1253862bb360SDarrick J. Wong if (error)
1254862bb360SDarrick J. Wong goto out_cancel;
1255aa5d0ba0SDarrick J. Wong }
1256862bb360SDarrick J. Wong
1257ee898d78SChandan Babu R if (smap_real)
1258ee898d78SChandan Babu R ++iext_delta;
1259ee898d78SChandan Babu R
1260ee898d78SChandan Babu R if (dmap_written)
1261ee898d78SChandan Babu R ++iext_delta;
1262ee898d78SChandan Babu R
1263ee898d78SChandan Babu R error = xfs_iext_count_may_overflow(ip, XFS_DATA_FORK, iext_delta);
12644f86bb4bSChandan Babu R if (error == -EFBIG)
12654f86bb4bSChandan Babu R error = xfs_iext_count_upgrade(tp, ip, iext_delta);
1266ee898d78SChandan Babu R if (error)
1267ee898d78SChandan Babu R goto out_cancel;
1268ee898d78SChandan Babu R
126900fd1d56SDarrick J. Wong if (smap_real) {
127000fd1d56SDarrick J. Wong /*
127100fd1d56SDarrick J. Wong * If the extent we're unmapping is backed by storage (written
127200fd1d56SDarrick J. Wong * or not), unmap the extent and drop its refcount.
127300fd1d56SDarrick J. Wong */
127400fd1d56SDarrick J. Wong xfs_bmap_unmap_extent(tp, ip, &smap);
127500fd1d56SDarrick J. Wong xfs_refcount_decrease_extent(tp, &smap);
127600fd1d56SDarrick J. Wong qdelta -= smap.br_blockcount;
127700fd1d56SDarrick J. Wong } else if (smap.br_startblock == DELAYSTARTBLOCK) {
1278f1e6a8d7SDarrick J. Wong int done;
1279862bb360SDarrick J. Wong
1280862bb360SDarrick J. Wong /*
128100fd1d56SDarrick J. Wong * If the extent we're unmapping is a delalloc reservation,
128200fd1d56SDarrick J. Wong * we can use the regular bunmapi function to release the
128300fd1d56SDarrick J. Wong * incore state. Dropping the delalloc reservation takes care
128400fd1d56SDarrick J. Wong * of the quota reservation for us.
1285862bb360SDarrick J. Wong */
1286f1e6a8d7SDarrick J. Wong error = xfs_bunmapi(NULL, ip, smap.br_startoff,
1287f1e6a8d7SDarrick J. Wong smap.br_blockcount, 0, 1, &done);
128800fd1d56SDarrick J. Wong if (error)
128900fd1d56SDarrick J. Wong goto out_cancel;
1290f1e6a8d7SDarrick J. Wong ASSERT(done);
129100fd1d56SDarrick J. Wong }
1292862bb360SDarrick J. Wong
129300fd1d56SDarrick J. Wong /*
129400fd1d56SDarrick J. Wong * If the extent we're sharing is backed by written storage, increase
129500fd1d56SDarrick J. Wong * its refcount and map it into the file.
129600fd1d56SDarrick J. Wong */
129700fd1d56SDarrick J. Wong if (dmap_written) {
129800fd1d56SDarrick J. Wong xfs_refcount_increase_extent(tp, dmap);
129900fd1d56SDarrick J. Wong xfs_bmap_map_extent(tp, ip, dmap);
130000fd1d56SDarrick J. Wong qdelta += dmap->br_blockcount;
130100fd1d56SDarrick J. Wong }
1302862bb360SDarrick J. Wong
130300fd1d56SDarrick J. Wong xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, qdelta);
1304862bb360SDarrick J. Wong
1305862bb360SDarrick J. Wong /* Update dest isize if needed. */
130600fd1d56SDarrick J. Wong newlen = XFS_FSB_TO_B(mp, dmap->br_startoff + dmap->br_blockcount);
1307862bb360SDarrick J. Wong newlen = min_t(xfs_off_t, newlen, new_isize);
1308862bb360SDarrick J. Wong if (newlen > i_size_read(VFS_I(ip))) {
1309862bb360SDarrick J. Wong trace_xfs_reflink_update_inode_size(ip, newlen);
1310862bb360SDarrick J. Wong i_size_write(VFS_I(ip), newlen);
131113d2c10bSChristoph Hellwig ip->i_disk_size = newlen;
1312862bb360SDarrick J. Wong xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1313862bb360SDarrick J. Wong }
1314862bb360SDarrick J. Wong
131500fd1d56SDarrick J. Wong /* Commit everything and unlock. */
1316862bb360SDarrick J. Wong error = xfs_trans_commit(tp);
131700fd1d56SDarrick J. Wong goto out_unlock;
1318862bb360SDarrick J. Wong
1319862bb360SDarrick J. Wong out_cancel:
1320862bb360SDarrick J. Wong xfs_trans_cancel(tp);
132100fd1d56SDarrick J. Wong out_unlock:
1322862bb360SDarrick J. Wong xfs_iunlock(ip, XFS_ILOCK_EXCL);
1323862bb360SDarrick J. Wong out:
132400fd1d56SDarrick J. Wong if (error)
1325862bb360SDarrick J. Wong trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
1326862bb360SDarrick J. Wong return error;
1327862bb360SDarrick J. Wong }
1328862bb360SDarrick J. Wong
132900fd1d56SDarrick J. Wong /* Remap a range of one file to the other. */
13303fc9f5e4SDarrick J. Wong int
xfs_reflink_remap_blocks(struct xfs_inode * src,loff_t pos_in,struct xfs_inode * dest,loff_t pos_out,loff_t remap_len,loff_t * remapped)1331862bb360SDarrick J. Wong xfs_reflink_remap_blocks(
1332862bb360SDarrick J. Wong struct xfs_inode *src,
13339f04aaffSDarrick J. Wong loff_t pos_in,
1334862bb360SDarrick J. Wong struct xfs_inode *dest,
13359f04aaffSDarrick J. Wong loff_t pos_out,
13363f68c1f5SDarrick J. Wong loff_t remap_len,
13373f68c1f5SDarrick J. Wong loff_t *remapped)
1338862bb360SDarrick J. Wong {
1339862bb360SDarrick J. Wong struct xfs_bmbt_irec imap;
134000fd1d56SDarrick J. Wong struct xfs_mount *mp = src->i_mount;
134100fd1d56SDarrick J. Wong xfs_fileoff_t srcoff = XFS_B_TO_FSBT(mp, pos_in);
134200fd1d56SDarrick J. Wong xfs_fileoff_t destoff = XFS_B_TO_FSBT(mp, pos_out);
13439f04aaffSDarrick J. Wong xfs_filblks_t len;
13443f68c1f5SDarrick J. Wong xfs_filblks_t remapped_len = 0;
13459f04aaffSDarrick J. Wong xfs_off_t new_isize = pos_out + remap_len;
1346862bb360SDarrick J. Wong int nimaps;
1347862bb360SDarrick J. Wong int error = 0;
13489f04aaffSDarrick J. Wong
134900fd1d56SDarrick J. Wong len = min_t(xfs_filblks_t, XFS_B_TO_FSB(mp, remap_len),
135000fd1d56SDarrick J. Wong XFS_MAX_FILEOFF);
1351862bb360SDarrick J. Wong
135200fd1d56SDarrick J. Wong trace_xfs_reflink_remap_blocks(src, srcoff, len, dest, destoff);
135301c2e13dSDarrick J. Wong
135400fd1d56SDarrick J. Wong while (len > 0) {
135500fd1d56SDarrick J. Wong unsigned int lock_mode;
135601c2e13dSDarrick J. Wong
1357862bb360SDarrick J. Wong /* Read extent from the source file */
1358862bb360SDarrick J. Wong nimaps = 1;
135901c2e13dSDarrick J. Wong lock_mode = xfs_ilock_data_map_shared(src);
1360862bb360SDarrick J. Wong error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
136101c2e13dSDarrick J. Wong xfs_iunlock(src, lock_mode);
1362862bb360SDarrick J. Wong if (error)
13639f04aaffSDarrick J. Wong break;
136400fd1d56SDarrick J. Wong /*
136500fd1d56SDarrick J. Wong * The caller supposedly flushed all dirty pages in the source
136600fd1d56SDarrick J. Wong * file range, which means that writeback should have allocated
136700fd1d56SDarrick J. Wong * or deleted all delalloc reservations in that range. If we
136800fd1d56SDarrick J. Wong * find one, that's a good sign that something is seriously
136900fd1d56SDarrick J. Wong * wrong here.
137000fd1d56SDarrick J. Wong */
137100fd1d56SDarrick J. Wong ASSERT(nimaps == 1 && imap.br_startoff == srcoff);
137200fd1d56SDarrick J. Wong if (imap.br_startblock == DELAYSTARTBLOCK) {
137300fd1d56SDarrick J. Wong ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
137400fd1d56SDarrick J. Wong error = -EFSCORRUPTED;
137500fd1d56SDarrick J. Wong break;
137600fd1d56SDarrick J. Wong }
1377862bb360SDarrick J. Wong
137800fd1d56SDarrick J. Wong trace_xfs_reflink_remap_extent_src(src, &imap);
1379862bb360SDarrick J. Wong
138000fd1d56SDarrick J. Wong /* Remap into the destination file at the given offset. */
138100fd1d56SDarrick J. Wong imap.br_startoff = destoff;
138200fd1d56SDarrick J. Wong error = xfs_reflink_remap_extent(dest, &imap, new_isize);
1383862bb360SDarrick J. Wong if (error)
13849f04aaffSDarrick J. Wong break;
1385862bb360SDarrick J. Wong
1386862bb360SDarrick J. Wong if (fatal_signal_pending(current)) {
1387862bb360SDarrick J. Wong error = -EINTR;
13889f04aaffSDarrick J. Wong break;
1389862bb360SDarrick J. Wong }
1390862bb360SDarrick J. Wong
1391862bb360SDarrick J. Wong /* Advance drange/srange */
139200fd1d56SDarrick J. Wong srcoff += imap.br_blockcount;
139300fd1d56SDarrick J. Wong destoff += imap.br_blockcount;
139400fd1d56SDarrick J. Wong len -= imap.br_blockcount;
139500fd1d56SDarrick J. Wong remapped_len += imap.br_blockcount;
1396862bb360SDarrick J. Wong }
1397862bb360SDarrick J. Wong
13989f04aaffSDarrick J. Wong if (error)
1399862bb360SDarrick J. Wong trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
14003f68c1f5SDarrick J. Wong *remapped = min_t(loff_t, remap_len,
14013f68c1f5SDarrick J. Wong XFS_FSB_TO_B(src->i_mount, remapped_len));
1402862bb360SDarrick J. Wong return error;
1403862bb360SDarrick J. Wong }
1404862bb360SDarrick J. Wong
1405862bb360SDarrick J. Wong /*
1406410fdc72SDarrick J. Wong * If we're reflinking to a point past the destination file's EOF, we must
1407410fdc72SDarrick J. Wong * zero any speculative post-EOF preallocations that sit between the old EOF
1408410fdc72SDarrick J. Wong * and the destination file offset.
1409410fdc72SDarrick J. Wong */
1410410fdc72SDarrick J. Wong static int
xfs_reflink_zero_posteof(struct xfs_inode * ip,loff_t pos)1411410fdc72SDarrick J. Wong xfs_reflink_zero_posteof(
1412410fdc72SDarrick J. Wong struct xfs_inode *ip,
1413410fdc72SDarrick J. Wong loff_t pos)
1414410fdc72SDarrick J. Wong {
1415410fdc72SDarrick J. Wong loff_t isize = i_size_read(VFS_I(ip));
1416410fdc72SDarrick J. Wong
1417410fdc72SDarrick J. Wong if (pos <= isize)
1418410fdc72SDarrick J. Wong return 0;
1419410fdc72SDarrick J. Wong
1420410fdc72SDarrick J. Wong trace_xfs_zero_eof(ip, isize, pos - isize);
1421f1ba5fafSShiyang Ruan return xfs_zero_range(ip, isize, pos - isize, NULL);
1422410fdc72SDarrick J. Wong }
1423410fdc72SDarrick J. Wong
/*
 * Prepare two files for range cloning.  Upon a successful return both inodes
 * will have the iolock and mmaplock held, the page cache of the out file will
 * be truncated, and any leases on the out file will have been broken.  This
 * function borrows heavily from xfs_file_aio_write_checks.
 *
 * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
 * checked that the bytes beyond EOF physically match. Hence we cannot use the
 * EOF block in the source dedupe range because it's not a complete block match,
 * hence can introduce a corruption into the file that has its block replaced.
 *
 * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
 * "block aligned" for the purposes of cloning entire files.  However, if the
 * source file range includes the EOF block and it lands within the existing EOF
 * of the destination file, then we can expose stale data from beyond the source
 * file EOF in the destination file.
 *
 * XFS doesn't support partial block sharing, so in both cases we have check
 * these cases ourselves. For dedupe, we can simply round the length to dedupe
 * down to the previous whole block and ignore the partial EOF block. While this
 * means we can't dedupe the last block of a file, this is an acceptable
 * tradeoff for simplicity on implementation.
 *
 * For cloning, we want to share the partial EOF block if it is also the new EOF
 * block of the destination file. If the partial EOF block lies inside the
 * existing destination EOF, then we have to abort the clone to avoid exposing
 * stale data in the destination file. Hence we reject these clone attempts with
 * -EINVAL in this case.
 */
int
xfs_reflink_remap_prep(
	struct file		*file_in,
	loff_t			pos_in,
	struct file		*file_out,
	loff_t			pos_out,
	loff_t			*len,
	unsigned int		remap_flags)
{
	struct inode		*inode_in = file_inode(file_in);
	struct xfs_inode	*src = XFS_I(inode_in);
	struct inode		*inode_out = file_inode(file_out);
	struct xfs_inode	*dest = XFS_I(inode_out);
	int			ret;

	/* Lock both files against IO */
	ret = xfs_ilock2_io_mmap(src, dest);
	if (ret)
		return ret;

	/* Check file eligibility and prepare for block sharing. */
	/* Default error code for the eligibility checks that follow. */
	ret = -EINVAL;
	/* Don't reflink realtime inodes */
	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
		goto out_unlock;

	/* Don't share DAX file data with non-DAX file. */
	if (IS_DAX(inode_in) != IS_DAX(inode_out))
		goto out_unlock;

	/* Generic/DAX prep may shrink *len (see block comment above). */
	if (!IS_DAX(inode_in))
		ret = generic_remap_file_range_prep(file_in, pos_in, file_out,
				pos_out, len, remap_flags);
	else
		ret = dax_remap_file_range_prep(file_in, pos_in, file_out,
				pos_out, len, remap_flags, &xfs_read_iomap_ops);
	if (ret || *len == 0)
		goto out_unlock;

	/* Attach dquots to dest inode before changing block map */
	ret = xfs_qm_dqattach(dest);
	if (ret)
		goto out_unlock;

	/*
	 * Zero existing post-eof speculative preallocations in the destination
	 * file.
	 */
	ret = xfs_reflink_zero_posteof(dest, pos_out);
	if (ret)
		goto out_unlock;

	/* Set flags and remap blocks. */
	ret = xfs_reflink_set_inode_flag(src, dest);
	if (ret)
		goto out_unlock;

	/*
	 * If pos_out > EOF, we may have dirtied blocks between EOF and
	 * pos_out. In that case, we need to extend the flush and unmap to cover
	 * from EOF to the end of the copy length.
	 */
	if (pos_out > XFS_ISIZE(dest)) {
		loff_t	flen = *len + (pos_out - XFS_ISIZE(dest));
		ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
	} else {
		ret = xfs_flush_unmap_range(dest, pos_out, *len);
	}
	if (ret)
		goto out_unlock;

	/*
	 * NOTE(review): XFS_IREMAPPING presumably serializes page faults on
	 * the source against the remap, and the demote drops the source's
	 * exclusive io/mmap locks to shared for the copy — confirm against
	 * the flag's definition and the remap-range caller.
	 */
	xfs_iflags_set(src, XFS_IREMAPPING);
	if (inode_in != inode_out)
		xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL);

	return 0;
out_unlock:
	xfs_iunlock2_io_mmap(src, dest);
	return ret;
}
15330d41e1d2SDarrick J. Wong
1534ea7cdd7bSDarrick J. Wong /* Does this inode need the reflink flag? */
1535ea7cdd7bSDarrick J. Wong int
xfs_reflink_inode_has_shared_extents(struct xfs_trans * tp,struct xfs_inode * ip,bool * has_shared)1536ea7cdd7bSDarrick J. Wong xfs_reflink_inode_has_shared_extents(
1537ea7cdd7bSDarrick J. Wong struct xfs_trans *tp,
1538ea7cdd7bSDarrick J. Wong struct xfs_inode *ip,
1539ea7cdd7bSDarrick J. Wong bool *has_shared)
1540ea7cdd7bSDarrick J. Wong {
1541ea7cdd7bSDarrick J. Wong struct xfs_bmbt_irec got;
1542ea7cdd7bSDarrick J. Wong struct xfs_mount *mp = ip->i_mount;
1543ea7cdd7bSDarrick J. Wong struct xfs_ifork *ifp;
1544b2b1712aSChristoph Hellwig struct xfs_iext_cursor icur;
1545ea7cdd7bSDarrick J. Wong bool found;
1546ea7cdd7bSDarrick J. Wong int error;
1547ea7cdd7bSDarrick J. Wong
1548732436efSDarrick J. Wong ifp = xfs_ifork_ptr(ip, XFS_DATA_FORK);
1549ea7cdd7bSDarrick J. Wong error = xfs_iread_extents(tp, ip, XFS_DATA_FORK);
1550ea7cdd7bSDarrick J. Wong if (error)
1551ea7cdd7bSDarrick J. Wong return error;
1552ea7cdd7bSDarrick J. Wong
1553ea7cdd7bSDarrick J. Wong *has_shared = false;
1554b2b1712aSChristoph Hellwig found = xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got);
1555ea7cdd7bSDarrick J. Wong while (found) {
155608d3e84fSDave Chinner struct xfs_perag *pag;
155708d3e84fSDave Chinner xfs_agblock_t agbno;
155808d3e84fSDave Chinner xfs_extlen_t aglen;
155908d3e84fSDave Chinner xfs_agblock_t rbno;
156008d3e84fSDave Chinner xfs_extlen_t rlen;
156108d3e84fSDave Chinner
1562ea7cdd7bSDarrick J. Wong if (isnullstartblock(got.br_startblock) ||
1563ea7cdd7bSDarrick J. Wong got.br_state != XFS_EXT_NORM)
1564ea7cdd7bSDarrick J. Wong goto next;
156508d3e84fSDave Chinner
156608d3e84fSDave Chinner pag = xfs_perag_get(mp, XFS_FSB_TO_AGNO(mp, got.br_startblock));
1567ea7cdd7bSDarrick J. Wong agbno = XFS_FSB_TO_AGBNO(mp, got.br_startblock);
1568ea7cdd7bSDarrick J. Wong aglen = got.br_blockcount;
156908d3e84fSDave Chinner error = xfs_reflink_find_shared(pag, tp, agbno, aglen,
1570ea7cdd7bSDarrick J. Wong &rbno, &rlen, false);
157108d3e84fSDave Chinner xfs_perag_put(pag);
1572ea7cdd7bSDarrick J. Wong if (error)
1573ea7cdd7bSDarrick J. Wong return error;
157408d3e84fSDave Chinner
1575ea7cdd7bSDarrick J. Wong /* Is there still a shared block here? */
1576ea7cdd7bSDarrick J. Wong if (rbno != NULLAGBLOCK) {
1577ea7cdd7bSDarrick J. Wong *has_shared = true;
1578ea7cdd7bSDarrick J. Wong return 0;
1579ea7cdd7bSDarrick J. Wong }
1580ea7cdd7bSDarrick J. Wong next:
1581b2b1712aSChristoph Hellwig found = xfs_iext_next_extent(ifp, &icur, &got);
1582ea7cdd7bSDarrick J. Wong }
1583ea7cdd7bSDarrick J. Wong
1584ea7cdd7bSDarrick J. Wong return 0;
1585ea7cdd7bSDarrick J. Wong }
1586ea7cdd7bSDarrick J. Wong
/*
 * Clear the inode reflink flag if there are no shared extents.
 *
 * The caller is responsible for joining the inode to the transaction passed in.
 * The inode will be joined to the transaction that is returned to the caller.
 * (xfs_reflink_cancel_cow_blocks below may roll *tpp, hence the double
 * pointer.)
 */
int
xfs_reflink_clear_inode_flag(
	struct xfs_inode	*ip,
	struct xfs_trans	**tpp)
{
	bool			needs_flag;
	int			error = 0;

	ASSERT(xfs_is_reflink_inode(ip));

	/* If it's not safe to free the CoW fork blocks, leave the flag set. */
	if (!xfs_can_free_cowblocks(ip))
		return 0;

	/* Keep the flag (and return 0) if any extent is still shared. */
	error = xfs_reflink_inode_has_shared_extents(*tpp, ip, &needs_flag);
	if (error || needs_flag)
		return error;

	/*
	 * We didn't find any shared blocks so turn off the reflink flag.
	 * First, get rid of any leftover CoW mappings.
	 */
	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, XFS_MAX_FILEOFF,
			true);
	if (error)
		return error;

	/* Clear the inode flag. */
	trace_xfs_reflink_unset_inode_flag(ip);
	ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
	xfs_inode_clear_cowblocks_tag(ip);
	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);

	return error;
}
162798cc2db5SDarrick J. Wong
/*
 * Clear the inode reflink flag if there are no shared extents and the size
 * hasn't changed.
 *
 * Wraps xfs_reflink_clear_inode_flag in its own rolling transaction so it
 * can be called outside any existing transaction context.
 */
STATIC int
xfs_reflink_try_clear_inode_flag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp;
	int			error = 0;

	/* Start a rolling transaction to remove the mappings */
	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
	if (error)
		return error;

	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, ip, 0);

	error = xfs_reflink_clear_inode_flag(ip, &tp);
	if (error)
		goto cancel;

	/* Commit consumes the transaction; only the ILOCK remains to drop. */
	error = xfs_trans_commit(tp);
	if (error)
		goto out;

	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return 0;
cancel:
	xfs_trans_cancel(tp);
out:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	return error;
}
166498cc2db5SDarrick J. Wong
/*
 * Pre-COW all shared blocks within a given byte range of a file and turn off
 * the reflink flag if we unshare all of the file's blocks.
 *
 * Returns 0 if the range was unshared (or the inode wasn't reflinked at
 * all); on failure the error is traced and returned.
 */
int
xfs_reflink_unshare(
	struct xfs_inode	*ip,
	xfs_off_t		offset,
	xfs_off_t		len)
{
	struct inode		*inode = VFS_I(ip);
	int			error;

	/* Nothing can be shared if the inode isn't marked reflinked. */
	if (!xfs_is_reflink_inode(ip))
		return 0;

	trace_xfs_reflink_unshare(ip, offset, len);

	/* Wait out in-flight direct I/O before rewriting the mappings. */
	inode_dio_wait(inode);

	/* DAX and buffered files take different iomap unshare paths. */
	if (IS_DAX(inode))
		error = dax_file_unshare(inode, offset, len,
				&xfs_dax_write_iomap_ops);
	else
		error = iomap_file_unshare(inode, offset, len,
				&xfs_buffered_write_iomap_ops);
	if (error)
		goto out;

	/* Push the newly written copies to disk before deciding on the flag. */
	error = filemap_write_and_wait_range(inode->i_mapping, offset,
			offset + len - 1);
	if (error)
		goto out;

	/* Turn off the reflink flag if possible. */
	error = xfs_reflink_try_clear_inode_flag(ip);
	if (error)
		goto out;
	return 0;

out:
	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
	return error;
}
1709