1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * Copyright (c) 2012 Red Hat, Inc.
4  * All Rights Reserved.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it would be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write the Free Software Foundation,
17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_shared.h"
22 #include "xfs_format.h"
23 #include "xfs_log_format.h"
24 #include "xfs_trans_resv.h"
25 #include "xfs_bit.h"
26 #include "xfs_mount.h"
27 #include "xfs_da_format.h"
28 #include "xfs_inode.h"
29 #include "xfs_btree.h"
30 #include "xfs_trans.h"
31 #include "xfs_extfree_item.h"
32 #include "xfs_alloc.h"
33 #include "xfs_bmap.h"
34 #include "xfs_bmap_util.h"
35 #include "xfs_bmap_btree.h"
36 #include "xfs_rtalloc.h"
37 #include "xfs_error.h"
38 #include "xfs_quota.h"
39 #include "xfs_trans_space.h"
40 #include "xfs_trace.h"
41 #include "xfs_icache.h"
42 #include "xfs_log.h"
43 
44 /* Kernel only BMAP related definitions and functions */
45 
46 /*
47  * Convert the given file system block to a disk block.  We have to treat it
48  * differently based on whether the file is a realtime file or not, because
49  * the bmap code does.
50  */
51 xfs_daddr_t
52 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
53 {
54 	return (XFS_IS_REALTIME_INODE(ip) ?
55 		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) :
56 		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
57 }
58 
59 /*
60  * Routine to zero an extent on disk allocated to the specified inode.
61  *
62  * The VFS functions take a linearised filesystem block offset, so we have to
63  * convert the sparse xfs fsb to the right format first.
64  * VFS types are real funky, too.
65  */
66 int
67 xfs_zero_extent(
68 	struct xfs_inode *ip,
69 	xfs_fsblock_t	start_fsb,
70 	xfs_off_t	count_fsb)
71 {
72 	struct xfs_mount *mp = ip->i_mount;
73 	xfs_daddr_t	sector = xfs_fsb_to_db(ip, start_fsb);
74 	sector_t	block = XFS_BB_TO_FSBT(mp, sector);
75 	ssize_t		size = XFS_FSB_TO_B(mp, count_fsb);
76 
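	/*
	 * DAX inodes have no page cache in front of them, so zero the
	 * backing sectors on the device directly.
	 */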
77 	if (IS_DAX(VFS_I(ip)))
78 		return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
79 				sector, size);
80 
81 	/*
82 	 * let the block layer decide on the fastest method of
83 	 * implementing the zeroing.
84 	 */
85 	return sb_issue_zeroout(mp->m_super, block, count_fsb, GFP_NOFS);
86 
87 }
88 
89 /*
90  * Routine to be called at transaction's end by xfs_bmapi() and xfs_bunmapi()
91  * callers.  Frees all the extents that need freeing, which must be done
92  * last due to locking considerations.  We never free any extents in
93  * the first transaction.
94  *
95  * If an inode *ip is provided, rejoin it to the transaction if
96  * the transaction was committed.
97  */
98 int						/* error */
99 xfs_bmap_finish(
100 	struct xfs_trans		**tp,	/* transaction pointer addr */
101 	struct xfs_bmap_free		*flist,	/* i/o: list extents to free */
102 	struct xfs_inode		*ip)
103 {
104 	struct xfs_efd_log_item		*efd;	/* extent free data */
105 	struct xfs_efi_log_item		*efi;	/* extent free intention */
106 	int				error;	/* error return value */
107 	int				committed; /* xact committed or not */
108 	struct xfs_bmap_free_item	*free;	/* free extent item */
109 	struct xfs_bmap_free_item	*next;	/* next item on free list */
110 
111 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
112 	if (flist->xbf_count == 0)
113 		return 0;
114 
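	/*
	 * Log an extent free intent (EFI) recording every extent we are
	 * about to free, so that log recovery can redo the frees if we
	 * crash after this transaction commits.
	 */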
115 	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
116 	for (free = flist->xbf_first; free; free = free->xbfi_next)
117 		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
118 			free->xbfi_blockcount);
119 
120 	error = __xfs_trans_roll(tp, ip, &committed);
121 	if (error) {
122 		/*
123 		 * If the transaction was committed, drop the EFD reference
124 		 * since we're bailing out of here. The other reference is
125 		 * dropped when the EFI hits the AIL.
126 		 *
127 		 * If the transaction was not committed, the EFI is freed by the
128 		 * EFI item unlock handler on abort. Also, we have a new
129 		 * transaction so we should return committed=1 even though we're
130 		 * returning an error.
131 		 */
132 		if (committed) {
133 			xfs_efi_release(efi);
134 			xfs_force_shutdown((*tp)->t_mountp,
135 				(error == -EFSCORRUPTED) ?
136 					SHUTDOWN_CORRUPT_INCORE :
137 					SHUTDOWN_META_IO_ERROR);
138 		}
139 		return error;
140 	}
141 
142 	/*
143 	 * Get an EFD and free each extent in the list, logging to the EFD in
144 	 * the process. The remaining bmap free list is cleaned up by the caller
145 	 * on error.
146 	 */
147 	efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
148 	for (free = flist->xbf_first; free != NULL; free = next) {
149 		next = free->xbfi_next;
150 
151 		error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
152 					      free->xbfi_blockcount);
153 		if (error)
154 			return error;
155 
156 		xfs_bmap_del_free(flist, NULL, free);
157 	}
158 
159 	return 0;
160 }
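
/*
 * Typical caller pattern, as used by the space manipulation functions later
 * in this file (a sketch only; transaction setup and error unwinding are
 * elided):
 *
 *	xfs_bmap_init(&free_list, &firstfsb);
 *	error = xfs_bmapi_write(tp, ip, ..., &free_list);
 *	if (!error)
 *		error = xfs_bmap_finish(&tp, &free_list, NULL);
 *	if (error)
 *		xfs_bmap_cancel(&free_list);
 *	else
 *		error = xfs_trans_commit(tp);
 */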
161 
162 int
163 xfs_bmap_rtalloc(
164 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
165 {
166 	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
167 	int		error;		/* error return value */
168 	xfs_mount_t	*mp;		/* mount point structure */
169 	xfs_extlen_t	prod = 0;	/* product factor for allocators */
170 	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
171 	xfs_extlen_t	align;		/* minimum allocation alignment */
172 	xfs_rtblock_t	rtb;
173 
174 	mp = ap->ip->i_mount;
175 	align = xfs_get_extsz_hint(ap->ip);
176 	prod = align / mp->m_sb.sb_rextsize;
177 	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
178 					align, 1, ap->eof, 0,
179 					ap->conv, &ap->offset, &ap->length);
180 	if (error)
181 		return error;
182 	ASSERT(ap->length);
183 	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
184 
185 	/*
186 	 * If the offset & length are not perfectly aligned
187 	 * then kill prod; it will just get us in trouble.
188 	 */
189 	if (do_mod(ap->offset, align) || ap->length % align)
190 		prod = 1;
191 	/*
192 	 * Set ralen to be the actual requested length in rtextents.
193 	 */
194 	ralen = ap->length / mp->m_sb.sb_rextsize;
195 	/*
196 	 * If the old value was close enough to MAXEXTLEN that
197 	 * we rounded up to it, cut it back so it's valid again.
198 	 * Note that if it's a really large request (bigger than
199 	 * MAXEXTLEN), we don't hear about that number, and can't
200 	 * adjust the starting point to match it.
201 	 */
202 	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
203 		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
204 
205 	/*
206 	 * Lock out other modifications to the RT bitmap inode.
207 	 */
208 	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
209 	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
210 
211 	/*
212 	 * If it's an allocation to an empty file at offset 0,
213 	 * pick an extent that will space things out in the rt area.
214 	 */
215 	if (ap->eof && ap->offset == 0) {
216 		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
217 
218 		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
219 		if (error)
220 			return error;
221 		ap->blkno = rtx * mp->m_sb.sb_rextsize;
222 	} else {
223 		ap->blkno = 0;
224 	}
225 
226 	xfs_bmap_adjacent(ap);
227 
228 	/*
229 	 * Realtime allocation, done through xfs_rtallocate_extent.
230 	 */
231 	atype = ap->blkno == 0 ?  XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
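	/*
	 * ap->blkno is in filesystem blocks here, but the realtime
	 * allocator works in units of realtime extents, so convert it
	 * before the call (and scale the result back up by sb_rextsize
	 * after a successful allocation).
	 */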
232 	do_div(ap->blkno, mp->m_sb.sb_rextsize);
233 	rtb = ap->blkno;
234 	ap->length = ralen;
235 	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
236 				&ralen, atype, ap->wasdel, prod, &rtb)))
237 		return error;
238 	if (rtb == NULLFSBLOCK && prod > 1 &&
239 	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
240 					   ap->length, &ralen, atype,
241 					   ap->wasdel, 1, &rtb)))
242 		return error;
243 	ap->blkno = rtb;
244 	if (ap->blkno != NULLFSBLOCK) {
245 		ap->blkno *= mp->m_sb.sb_rextsize;
246 		ralen *= mp->m_sb.sb_rextsize;
247 		ap->length = ralen;
248 		ap->ip->i_d.di_nblocks += ralen;
249 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
250 		if (ap->wasdel)
251 			ap->ip->i_delayed_blks -= ralen;
252 		/*
253 		 * Adjust the disk quota also. This was reserved
254 		 * earlier.
255 		 */
256 		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
257 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
258 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
259 
260 		/* Zero the extent if we were asked to do so */
261 		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
262 			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
263 			if (error)
264 				return error;
265 		}
266 	} else {
267 		ap->length = 0;
268 	}
269 	return 0;
270 }
271 
272 /*
273  * Check if the endoff is outside the last extent. If so the caller will grow
274  * the allocation to a stripe unit boundary.  All offsets are considered outside
275  * the end of file for an empty fork, so 1 is returned in *eof in that case.
276  */
277 int
278 xfs_bmap_eof(
279 	struct xfs_inode	*ip,
280 	xfs_fileoff_t		endoff,
281 	int			whichfork,
282 	int			*eof)
283 {
284 	struct xfs_bmbt_irec	rec;
285 	int			error;
286 
287 	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
288 	if (error || *eof)
289 		return error;
290 
291 	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
292 	return 0;
293 }
294 
295 /*
296  * Extent tree block counting routines.
297  */
298 
299 /*
300  * Count leaf blocks given a range of extent records.
301  */
302 STATIC void
303 xfs_bmap_count_leaves(
304 	xfs_ifork_t		*ifp,
305 	xfs_extnum_t		idx,
306 	int			numrecs,
307 	int			*count)
308 {
309 	int		b;
310 
311 	for (b = 0; b < numrecs; b++) {
312 		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
313 		*count += xfs_bmbt_get_blockcount(frp);
314 	}
315 }
316 
317 /*
318  * Count leaf blocks given a range of extent records originally
319  * in btree format.
320  */
321 STATIC void
322 xfs_bmap_disk_count_leaves(
323 	struct xfs_mount	*mp,
324 	struct xfs_btree_block	*block,
325 	int			numrecs,
326 	int			*count)
327 {
328 	int		b;
329 	xfs_bmbt_rec_t	*frp;
330 
331 	for (b = 1; b <= numrecs; b++) {
332 		frp = XFS_BMBT_REC_ADDR(mp, block, b);
333 		*count += xfs_bmbt_disk_get_blockcount(frp);
334 	}
335 }
336 
337 /*
338  * Recursively walks each level of a btree
339  * to count total fsblocks in use.
340  */
341 STATIC int                                     /* error */
342 xfs_bmap_count_tree(
343 	xfs_mount_t     *mp,            /* file system mount point */
344 	xfs_trans_t     *tp,            /* transaction pointer */
345 	xfs_ifork_t	*ifp,		/* inode fork pointer */
346 	xfs_fsblock_t   blockno,	/* file system block number */
347 	int             levelin,	/* level in btree */
348 	int		*count)		/* Count of blocks */
349 {
350 	int			error;
351 	xfs_buf_t		*bp, *nbp;
352 	int			level = levelin;
353 	__be64			*pp;
354 	xfs_fsblock_t           bno = blockno;
355 	xfs_fsblock_t		nextbno;
356 	struct xfs_btree_block	*block, *nextblock;
357 	int			numrecs;
358 
359 	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
360 						&xfs_bmbt_buf_ops);
361 	if (error)
362 		return error;
363 	*count += 1;
364 	block = XFS_BUF_TO_BLOCK(bp);
365 
366 	if (--level) {
367 		/* Not at node above leaves, count this level of nodes */
368 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
369 		while (nextbno != NULLFSBLOCK) {
370 			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
371 						XFS_BMAP_BTREE_REF,
372 						&xfs_bmbt_buf_ops);
373 			if (error)
374 				return error;
375 			*count += 1;
376 			nextblock = XFS_BUF_TO_BLOCK(nbp);
377 			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
378 			xfs_trans_brelse(tp, nbp);
379 		}
380 
381 		/* Dive to the next level */
382 		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
383 		bno = be64_to_cpu(*pp);
384 		if (unlikely((error =
385 		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
386 			xfs_trans_brelse(tp, bp);
387 			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
388 					 XFS_ERRLEVEL_LOW, mp);
389 			return -EFSCORRUPTED;
390 		}
391 		xfs_trans_brelse(tp, bp);
392 	} else {
393 		/* count all level 1 nodes and their leaves */
394 		for (;;) {
395 			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
396 			numrecs = be16_to_cpu(block->bb_numrecs);
397 			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
398 			xfs_trans_brelse(tp, bp);
399 			if (nextbno == NULLFSBLOCK)
400 				break;
401 			bno = nextbno;
402 			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
403 						XFS_BMAP_BTREE_REF,
404 						&xfs_bmbt_buf_ops);
405 			if (error)
406 				return error;
407 			*count += 1;
408 			block = XFS_BUF_TO_BLOCK(bp);
409 		}
410 	}
411 	return 0;
412 }
413 
414 /*
415  * Count fsblocks of the given fork.
416  */
417 int						/* error */
418 xfs_bmap_count_blocks(
419 	xfs_trans_t		*tp,		/* transaction pointer */
420 	xfs_inode_t		*ip,		/* incore inode */
421 	int			whichfork,	/* data or attr fork */
422 	int			*count)		/* out: count of blocks */
423 {
424 	struct xfs_btree_block	*block;	/* current btree block */
425 	xfs_fsblock_t		bno;	/* block # of "block" */
426 	xfs_ifork_t		*ifp;	/* fork structure */
427 	int			level;	/* btree level, for checking */
428 	xfs_mount_t		*mp;	/* file system mount structure */
429 	__be64			*pp;	/* pointer to block address */
430 
431 	bno = NULLFSBLOCK;
432 	mp = ip->i_mount;
433 	ifp = XFS_IFORK_PTR(ip, whichfork);
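	/*
	 * For an extent format fork the records are in the in-core extent
	 * list, so just sum their block counts directly; there are no
	 * btree blocks to walk or count.
	 */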
434 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) {
435 		xfs_bmap_count_leaves(ifp, 0,
436 			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
437 			count);
438 		return 0;
439 	}
440 
441 	/*
442 	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
443 	 */
444 	block = ifp->if_broot;
445 	level = be16_to_cpu(block->bb_level);
446 	ASSERT(level > 0);
447 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
448 	bno = be64_to_cpu(*pp);
449 	ASSERT(bno != NULLFSBLOCK);
450 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
451 	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
452 
453 	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
454 		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
455 				 mp);
456 		return -EFSCORRUPTED;
457 	}
458 
459 	return 0;
460 }
461 
462 /*
463  * returns 1 for success, 0 if we failed to map the extent.
464  */
465 STATIC int
466 xfs_getbmapx_fix_eof_hole(
467 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
468 	struct getbmapx		*out,		/* output structure */
469 	int			prealloced,	/* this is a file with
470 						 * preallocated data space */
471 	__int64_t		end,		/* last block requested */
472 	xfs_fsblock_t		startblock)
473 {
474 	__int64_t		fixlen;
475 	xfs_mount_t		*mp;		/* file system mount point */
476 	xfs_ifork_t		*ifp;		/* inode fork pointer */
477 	xfs_extnum_t		lastx;		/* last extent pointer */
478 	xfs_fileoff_t		fileblock;
479 
480 	if (startblock == HOLESTARTBLOCK) {
481 		mp = ip->i_mount;
482 		out->bmv_block = -1;
483 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
484 		fixlen -= out->bmv_offset;
485 		if (prealloced && out->bmv_offset + out->bmv_length == end) {
486 			/* Came to hole at EOF. Trim it. */
487 			if (fixlen <= 0)
488 				return 0;
489 			out->bmv_length = fixlen;
490 		}
491 	} else {
492 		if (startblock == DELAYSTARTBLOCK)
493 			out->bmv_block = -2;
494 		else
495 			out->bmv_block = xfs_fsb_to_db(ip, startblock);
496 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
497 		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
498 		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
499 		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
500 			out->bmv_oflags |= BMV_OF_LAST;
501 	}
502 
503 	return 1;
504 }
505 
506 /*
507  * Get inode's extents as described in bmv, and format for output.
508  * Calls formatter to fill the user's buffer until all extents
509  * are mapped, until the passed-in bmv->bmv_count slots have
510  * been filled, or until the formatter short-circuits the loop,
511  * if it is tracking filled-in extents on its own.
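 *
 * The formatter copies each struct getbmapx record to its destination and
 * may set *full to end the walk early; the XFS_IOC_GETBMAP* ioctl handlers
 * in xfs_ioctl.c supply such formatters.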
512  */
513 int						/* error code */
514 xfs_getbmap(
515 	xfs_inode_t		*ip,
516 	struct getbmapx		*bmv,		/* user bmap structure */
517 	xfs_bmap_format_t	formatter,	/* format to user */
518 	void			*arg)		/* formatter arg */
519 {
520 	__int64_t		bmvend;		/* last block requested */
521 	int			error = 0;	/* return value */
522 	__int64_t		fixlen;		/* length for -1 case */
523 	int			i;		/* extent number */
524 	int			lock;		/* lock state */
525 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
526 	xfs_mount_t		*mp;		/* file system mount point */
527 	int			nex;		/* # of user extents can do */
528 	int			nexleft;	/* # of user extents left */
529 	int			subnex;		/* # of bmapi's can do */
530 	int			nmap;		/* number of map entries */
531 	struct getbmapx		*out;		/* output structure */
532 	int			whichfork;	/* data or attr fork */
533 	int			prealloced;	/* this is a file with
534 						 * preallocated data space */
535 	int			iflags;		/* interface flags */
536 	int			bmapi_flags;	/* flags for xfs_bmapi */
537 	int			cur_ext = 0;
538 
539 	mp = ip->i_mount;
540 	iflags = bmv->bmv_iflags;
541 	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
542 
543 	if (whichfork == XFS_ATTR_FORK) {
544 		if (XFS_IFORK_Q(ip)) {
545 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
546 			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
547 			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
548 				return -EINVAL;
549 		} else if (unlikely(
550 			   ip->i_d.di_aformat != 0 &&
551 			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
552 			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
553 					 ip->i_mount);
554 			return -EFSCORRUPTED;
555 		}
556 
557 		prealloced = 0;
558 		fixlen = 1LL << 32;
559 	} else {
560 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
561 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
562 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
563 			return -EINVAL;
564 
565 		if (xfs_get_extsz_hint(ip) ||
566 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
567 			prealloced = 1;
568 			fixlen = mp->m_super->s_maxbytes;
569 		} else {
570 			prealloced = 0;
571 			fixlen = XFS_ISIZE(ip);
572 		}
573 	}
574 
575 	if (bmv->bmv_length == -1) {
576 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
577 		bmv->bmv_length =
578 			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
579 	} else if (bmv->bmv_length == 0) {
580 		bmv->bmv_entries = 0;
581 		return 0;
582 	} else if (bmv->bmv_length < 0) {
583 		return -EINVAL;
584 	}
585 
586 	nex = bmv->bmv_count - 1;
587 	if (nex <= 0)
588 		return -EINVAL;
589 	bmvend = bmv->bmv_offset + bmv->bmv_length;
590 
591 
592 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
593 		return -ENOMEM;
594 	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
595 	if (!out)
596 		return -ENOMEM;
597 
598 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
599 	if (whichfork == XFS_DATA_FORK) {
600 		if (!(iflags & BMV_IF_DELALLOC) &&
601 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
602 			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
603 			if (error)
604 				goto out_unlock_iolock;
605 
606 			/*
607 			 * Even after flushing the inode, there can still be
608 			 * delalloc blocks on the inode beyond EOF due to
609 			 * speculative preallocation.  These are not removed
610 			 * until the release function is called or the inode
611 			 * is inactivated.  Hence we cannot assert here that
612 			 * ip->i_delayed_blks == 0.
613 			 */
614 		}
615 
616 		lock = xfs_ilock_data_map_shared(ip);
617 	} else {
618 		lock = xfs_ilock_attr_map_shared(ip);
619 	}
620 
621 	/*
622 	 * Don't let nex be bigger than the number of extents
623 	 * we can have assuming alternating holes and real extents.
624 	 */
625 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
626 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
627 
628 	bmapi_flags = xfs_bmapi_aflag(whichfork);
629 	if (!(iflags & BMV_IF_PREALLOC))
630 		bmapi_flags |= XFS_BMAPI_IGSTATE;
631 
632 	/*
633 	 * Allocate enough space to handle "subnex" maps at a time.
634 	 */
635 	error = -ENOMEM;
636 	subnex = 16;
637 	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
638 	if (!map)
639 		goto out_unlock_ilock;
640 
641 	bmv->bmv_entries = 0;
642 
643 	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
644 	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
645 		error = 0;
646 		goto out_free_map;
647 	}
648 
649 	nexleft = nex;
650 
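	/*
	 * Walk the mappings in chunks: read up to subnex mappings per
	 * xfs_bmapi_read() call and translate each returned mapping into
	 * a getbmapx record, until the user's slots or the requested
	 * range are exhausted.
	 */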
651 	do {
652 		nmap = (nexleft > subnex) ? subnex : nexleft;
653 		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
654 				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
655 				       map, &nmap, bmapi_flags);
656 		if (error)
657 			goto out_free_map;
658 		ASSERT(nmap <= subnex);
659 
660 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
661 			out[cur_ext].bmv_oflags = 0;
662 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
663 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
664 			else if (map[i].br_startblock == DELAYSTARTBLOCK)
665 				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
666 			out[cur_ext].bmv_offset =
667 				XFS_FSB_TO_BB(mp, map[i].br_startoff);
668 			out[cur_ext].bmv_length =
669 				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
670 			out[cur_ext].bmv_unused1 = 0;
671 			out[cur_ext].bmv_unused2 = 0;
672 
673 			/*
674 			 * delayed allocation extents that start beyond EOF can
675 			 * occur due to speculative EOF allocation when the
676 			 * delalloc extent is larger than the largest freespace
677 			 * extent at conversion time. These extents cannot be
678 			 * converted by data writeback, so can exist here even
679 			 * if we are not supposed to be finding delalloc
680 			 * extents.
681 			 */
682 			if (map[i].br_startblock == DELAYSTARTBLOCK &&
683 			    map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
684 				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
685 
686 			if (map[i].br_startblock == HOLESTARTBLOCK &&
687 			    whichfork == XFS_ATTR_FORK) {
688 				/* came to the end of attribute fork */
689 				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
690 				goto out_free_map;
691 			}
692 
693 			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
694 					prealloced, bmvend,
695 					map[i].br_startblock))
696 				goto out_free_map;
697 
698 			bmv->bmv_offset =
699 				out[cur_ext].bmv_offset +
700 				out[cur_ext].bmv_length;
701 			bmv->bmv_length =
702 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
703 
704 			/*
705 			 * In case we don't want to return the hole,
706 			 * don't increase cur_ext so that we can reuse
707 			 * it in the next loop.
708 			 */
709 			if ((iflags & BMV_IF_NO_HOLES) &&
710 			    map[i].br_startblock == HOLESTARTBLOCK) {
711 				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
712 				continue;
713 			}
714 
715 			nexleft--;
716 			bmv->bmv_entries++;
717 			cur_ext++;
718 		}
719 	} while (nmap && nexleft && bmv->bmv_length);
720 
721  out_free_map:
722 	kmem_free(map);
723  out_unlock_ilock:
724 	xfs_iunlock(ip, lock);
725  out_unlock_iolock:
726 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
727 
728 	for (i = 0; i < cur_ext; i++) {
729 		int full = 0;	/* user array is full */
730 
731 		/* format results & advance arg */
732 		error = formatter(&arg, &out[i], &full);
733 		if (error || full)
734 			break;
735 	}
736 
737 	kmem_free(out);
738 	return error;
739 }
740 
741 /*
742  * Dead simple method of punching delayed allocation blocks from a range in
743  * the inode.  Walks a block at a time, so it will be slow, but it is only
744  * executed in rare error cases, so the overhead is not critical.  This will
745  * always punch out both the start and end blocks, even if the ranges only
746  * partially overlap them, so it is up to the caller to ensure that partial
747  * blocks are not passed in.
748  */
749 int
750 xfs_bmap_punch_delalloc_range(
751 	struct xfs_inode	*ip,
752 	xfs_fileoff_t		start_fsb,
753 	xfs_fileoff_t		length)
754 {
755 	xfs_fileoff_t		remaining = length;
756 	int			error = 0;
757 
758 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
759 
760 	do {
761 		int		done;
762 		xfs_bmbt_irec_t	imap;
763 		int		nimaps = 1;
764 		xfs_fsblock_t	firstblock;
765 		xfs_bmap_free_t flist;
766 
767 		/*
768 		 * Map the range first and check that it is a delalloc extent
769 		 * before trying to unmap the range. Otherwise we will be
770 		 * trying to remove a real extent (which requires a
771 		 * transaction) or a hole, which is probably a bad idea...
772 		 */
773 		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
774 				       XFS_BMAPI_ENTIRE);
775 
776 		if (error) {
777 			/* something went badly wrong, just bail */
778 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
779 				xfs_alert(ip->i_mount,
780 			"Failed delalloc mapping lookup ino %lld fsb %lld.",
781 						ip->i_ino, start_fsb);
782 			}
783 			break;
784 		}
785 		if (!nimaps) {
786 			/* nothing there */
787 			goto next_block;
788 		}
789 		if (imap.br_startblock != DELAYSTARTBLOCK) {
790 			/* been converted, ignore */
791 			goto next_block;
792 		}
793 		WARN_ON(imap.br_blockcount == 0);
794 
795 		/*
796 		 * Note: while we initialise the firstblock/flist pair, they
797 		 * should never be used because blocks should never be
798 		 * allocated or freed for a delalloc extent and hence we don't
799 		 * need to cancel or finish them after the xfs_bunmapi() call.
800 		 */
801 		xfs_bmap_init(&flist, &firstblock);
802 		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
803 					&flist, &done);
804 		if (error)
805 			break;
806 
807 		ASSERT(!flist.xbf_count && !flist.xbf_first);
808 next_block:
809 		start_fsb++;
810 		remaining--;
811 	} while (remaining > 0);
812 
813 	return error;
814 }
815 
816 /*
817  * Test whether it is appropriate to check an inode for and free post EOF
818  * blocks. The 'force' parameter determines whether we should also consider
819  * regular files that are marked preallocated or append-only.
820  */
821 bool
822 xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
823 {
824 	/* prealloc/delalloc exists only on regular files */
825 	if (!S_ISREG(ip->i_d.di_mode))
826 		return false;
827 
828 	/*
829 	 * Zero sized files with no cached pages and no delalloc blocks will not
830 	 * have speculative prealloc/delalloc blocks to remove.
831 	 */
832 	if (VFS_I(ip)->i_size == 0 &&
833 	    VFS_I(ip)->i_mapping->nrpages == 0 &&
834 	    ip->i_delayed_blks == 0)
835 		return false;
836 
837 	/* If we haven't read in the extent list, then don't do it now. */
838 	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
839 		return false;
840 
841 	/*
842 	 * Do not free real preallocated or append-only files unless the file
843 	 * has delalloc blocks and we are forced to remove them.
844 	 */
845 	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
846 		if (!force || ip->i_delayed_blks == 0)
847 			return false;
848 
849 	return true;
850 }
851 
852 /*
853  * This is called by xfs_inactive to free any blocks beyond eof
854  * when the link count isn't zero and by xfs_dm_punch_hole() when
855  * punching a hole to EOF.
856  */
857 int
858 xfs_free_eofblocks(
859 	xfs_mount_t	*mp,
860 	xfs_inode_t	*ip,
861 	bool		need_iolock)
862 {
863 	xfs_trans_t	*tp;
864 	int		error;
865 	xfs_fileoff_t	end_fsb;
866 	xfs_fileoff_t	last_fsb;
867 	xfs_filblks_t	map_len;
868 	int		nimaps;
869 	xfs_bmbt_irec_t	imap;
870 
871 	/*
872 	 * Figure out if there are any blocks beyond the end
873 	 * of the file.  If not, then there is nothing to do.
874 	 */
875 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
876 	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
877 	if (last_fsb <= end_fsb)
878 		return 0;
879 	map_len = last_fsb - end_fsb;
880 
881 	nimaps = 1;
882 	xfs_ilock(ip, XFS_ILOCK_SHARED);
883 	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
884 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
885 
886 	if (!error && (nimaps != 0) &&
887 	    (imap.br_startblock != HOLESTARTBLOCK ||
888 	     ip->i_delayed_blks)) {
889 		/*
890 		 * Attach the dquots to the inode up front.
891 		 */
892 		error = xfs_qm_dqattach(ip, 0);
893 		if (error)
894 			return error;
895 
896 		/*
897 		 * There are blocks after the end of file.
898 		 * Free them up now by truncating the file to
899 		 * its current size.
900 		 */
901 		tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
902 
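		/*
		 * Take the iolock without blocking; callers such as the
		 * background eofblocks scanner would rather back off and
		 * retry later than stall here waiting for the lock.
		 */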
903 		if (need_iolock) {
904 			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
905 				xfs_trans_cancel(tp);
906 				return -EAGAIN;
907 			}
908 		}
909 
910 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
911 		if (error) {
912 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
913 			xfs_trans_cancel(tp);
914 			if (need_iolock)
915 				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
916 			return error;
917 		}
918 
919 		xfs_ilock(ip, XFS_ILOCK_EXCL);
920 		xfs_trans_ijoin(tp, ip, 0);
921 
922 		/*
923 		 * Do not update the on-disk file size.  If we update the
924 		 * on-disk file size and then the system crashes before the
925 		 * contents of the file are flushed to disk then the files
926 		 * may be full of holes (ie NULL files bug).
927 		 */
928 		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
929 					      XFS_ISIZE(ip));
930 		if (error) {
931 			/*
932 			 * If we get an error at this point we simply don't
933 			 * bother truncating the file.
934 			 */
935 			xfs_trans_cancel(tp);
936 		} else {
937 			error = xfs_trans_commit(tp);
938 			if (!error)
939 				xfs_inode_clear_eofblocks_tag(ip);
940 		}
941 
942 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
943 		if (need_iolock)
944 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
945 	}
946 	return error;
947 }
948 
949 int
950 xfs_alloc_file_space(
951 	struct xfs_inode	*ip,
952 	xfs_off_t		offset,
953 	xfs_off_t		len,
954 	int			alloc_type)
955 {
956 	xfs_mount_t		*mp = ip->i_mount;
957 	xfs_off_t		count;
958 	xfs_filblks_t		allocated_fsb;
959 	xfs_filblks_t		allocatesize_fsb;
960 	xfs_extlen_t		extsz, temp;
961 	xfs_fileoff_t		startoffset_fsb;
962 	xfs_fsblock_t		firstfsb;
963 	int			nimaps;
964 	int			quota_flag;
965 	int			rt;
966 	xfs_trans_t		*tp;
967 	xfs_bmbt_irec_t		imaps[1], *imapp;
968 	xfs_bmap_free_t		free_list;
969 	uint			qblocks, resblks, resrtextents;
970 	int			error;
971 
972 	trace_xfs_alloc_file_space(ip);
973 
974 	if (XFS_FORCED_SHUTDOWN(mp))
975 		return -EIO;
976 
977 	error = xfs_qm_dqattach(ip, 0);
978 	if (error)
979 		return error;
980 
981 	if (len <= 0)
982 		return -EINVAL;
983 
984 	rt = XFS_IS_REALTIME_INODE(ip);
985 	extsz = xfs_get_extsz_hint(ip);
986 
987 	count = len;
988 	imapp = &imaps[0];
989 	nimaps = 1;
990 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
991 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
992 
993 	/*
994 	 * Allocate file space until done or until there is an error
995 	 */
996 	while (allocatesize_fsb && !error) {
997 		xfs_fileoff_t	s, e;
998 
999 		/*
1000 		 * Determine space reservations for data/realtime.
1001 		 */
1002 		if (unlikely(extsz)) {
1003 			s = startoffset_fsb;
1004 			do_div(s, extsz);
1005 			s *= extsz;
1006 			e = startoffset_fsb + allocatesize_fsb;
1007 			if ((temp = do_mod(startoffset_fsb, extsz)))
1008 				e += temp;
1009 			if ((temp = do_mod(e, extsz)))
1010 				e += extsz - temp;
1011 		} else {
1012 			s = 0;
1013 			e = allocatesize_fsb;
1014 		}
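		/*
		 * For example, with extsz = 4, startoffset_fsb = 6 and
		 * allocatesize_fsb = 5 the above yields s = 4 and e = 16: a
		 * conservative over-estimate of the aligned range, which is
		 * fine because the reservation below only needs to be an
		 * upper bound.
		 */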
1015 
1016 		/*
1017 		 * The transaction reservation is limited to a 32-bit block
1018 		 * count, hence we need to limit the number of blocks we are
1019 		 * trying to reserve to avoid an overflow. We can't allocate
1020 		 * more than @nimaps extents, and an extent is limited on disk
1021 		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1022 		 */
1023 		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1024 		if (unlikely(rt)) {
1025 			resrtextents = qblocks = resblks;
1026 			resrtextents /= mp->m_sb.sb_rextsize;
1027 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1028 			quota_flag = XFS_QMOPT_RES_RTBLKS;
1029 		} else {
1030 			resrtextents = 0;
1031 			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1032 			quota_flag = XFS_QMOPT_RES_REGBLKS;
1033 		}
1034 
1035 		/*
1036 		 * Allocate and setup the transaction.
1037 		 */
1038 		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1039 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1040 					  resblks, resrtextents);
1041 		/*
1042 		 * Check for running out of space
1043 		 */
1044 		if (error) {
1045 			/*
1046 			 * Free the transaction structure.
1047 			 */
1048 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1049 			xfs_trans_cancel(tp);
1050 			break;
1051 		}
1052 		xfs_ilock(ip, XFS_ILOCK_EXCL);
1053 		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1054 						      0, quota_flag);
1055 		if (error)
1056 			goto error1;
1057 
1058 		xfs_trans_ijoin(tp, ip, 0);
1059 
1060 		xfs_bmap_init(&free_list, &firstfsb);
1061 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1062 					allocatesize_fsb, alloc_type, &firstfsb,
1063 					resblks, imapp, &nimaps, &free_list);
1064 		if (error)
1065 			goto error0;
1066 
1067 		/*
1068 		 * Complete the transaction
1069 		 */
1070 		error = xfs_bmap_finish(&tp, &free_list, NULL);
1071 		if (error)
1072 			goto error0;
1073 
1074 		error = xfs_trans_commit(tp);
1075 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1076 		if (error)
1077 			break;
1078 
1079 		allocated_fsb = imapp->br_blockcount;
1080 
1081 		if (nimaps == 0) {
1082 			error = -ENOSPC;
1083 			break;
1084 		}
1085 
1086 		startoffset_fsb += allocated_fsb;
1087 		allocatesize_fsb -= allocated_fsb;
1088 	}
1089 
1090 	return error;
1091 
1092 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1093 	xfs_bmap_cancel(&free_list);
1094 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1095 
1096 error1:	/* Just cancel transaction */
1097 	xfs_trans_cancel(tp);
1098 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1099 	return error;
1100 }
1101 
1102 /*
1103  * Zero file bytes between startoff and endoff inclusive.
1104  * The iolock is held exclusive and no blocks are buffered.
1105  *
1106  * This function is used by xfs_free_file_space() to zero
1107  * partial blocks when the range to free is not block aligned.
1108  * When unreserving space with boundaries that are not block
1109  * aligned we round up the start and round down the end
1110  * boundaries and then use this function to zero the parts of
1111  * the blocks that got dropped during the rounding.
1112  */
1113 STATIC int
1114 xfs_zero_remaining_bytes(
1115 	xfs_inode_t		*ip,
1116 	xfs_off_t		startoff,
1117 	xfs_off_t		endoff)
1118 {
1119 	xfs_bmbt_irec_t		imap;
1120 	xfs_fileoff_t		offset_fsb;
1121 	xfs_off_t		lastoffset;
1122 	xfs_off_t		offset;
1123 	xfs_buf_t		*bp;
1124 	xfs_mount_t		*mp = ip->i_mount;
1125 	int			nimap;
1126 	int			error = 0;
1127 
1128 	/*
1129 	 * Avoid doing I/O beyond eof - it's not necessary
1130 	 * since nothing can read beyond eof.  The space will
1131 	 * be zeroed when the file is extended anyway.
1132 	 */
1133 	if (startoff >= XFS_ISIZE(ip))
1134 		return 0;
1135 
1136 	if (endoff > XFS_ISIZE(ip))
1137 		endoff = XFS_ISIZE(ip);
1138 
1139 	for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
1140 		uint lock_mode;
1141 
1142 		offset_fsb = XFS_B_TO_FSBT(mp, offset);
1143 		nimap = 1;
1144 
1145 		lock_mode = xfs_ilock_data_map_shared(ip);
1146 		error = xfs_bmapi_read(ip, offset_fsb, 1, &imap, &nimap, 0);
1147 		xfs_iunlock(ip, lock_mode);
1148 
1149 		if (error || nimap < 1)
1150 			break;
1151 		ASSERT(imap.br_blockcount >= 1);
1152 		ASSERT(imap.br_startoff == offset_fsb);
1153 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1154 
1155 		if (imap.br_startblock == HOLESTARTBLOCK ||
1156 		    imap.br_state == XFS_EXT_UNWRITTEN) {
1157 			/* skip the entire extent */
1158 			lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff +
1159 						      imap.br_blockcount) - 1;
1160 			continue;
1161 		}
1162 
1163 		lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1;
1164 		if (lastoffset > endoff)
1165 			lastoffset = endoff;
1166 
1167 		/* DAX can just zero the backing device directly */
1168 		if (IS_DAX(VFS_I(ip))) {
1169 			error = dax_zero_page_range(VFS_I(ip), offset,
1170 						    lastoffset - offset + 1,
1171 						    xfs_get_blocks_direct);
1172 			if (error)
1173 				return error;
1174 			continue;
1175 		}
1176 
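		/*
		 * Read the block through an uncached buffer to bypass the
		 * page cache, zero the sub-block range in memory, then
		 * write the buffer straight back out.
		 */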
1177 		error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ?
1178 				mp->m_rtdev_targp : mp->m_ddev_targp,
1179 				xfs_fsb_to_db(ip, imap.br_startblock),
1180 				BTOBB(mp->m_sb.sb_blocksize),
1181 				0, &bp, NULL);
1182 		if (error)
1183 			return error;
1184 
1185 		memset(bp->b_addr +
1186 				(offset - XFS_FSB_TO_B(mp, imap.br_startoff)),
1187 		       0, lastoffset - offset + 1);
1188 
1189 		error = xfs_bwrite(bp);
1190 		xfs_buf_relse(bp);
1191 		if (error)
1192 			return error;
1193 	}
1194 	return error;
1195 }
1196 
1197 int
1198 xfs_free_file_space(
1199 	struct xfs_inode	*ip,
1200 	xfs_off_t		offset,
1201 	xfs_off_t		len)
1202 {
1203 	int			done;
1204 	xfs_fileoff_t		endoffset_fsb;
1205 	int			error;
1206 	xfs_fsblock_t		firstfsb;
1207 	xfs_bmap_free_t		free_list;
1208 	xfs_bmbt_irec_t		imap;
1209 	xfs_off_t		ioffset;
1210 	xfs_off_t		iendoffset;
1211 	xfs_extlen_t		mod = 0;
1212 	xfs_mount_t		*mp;
1213 	int			nimap;
1214 	uint			resblks;
1215 	xfs_off_t		rounding;
1216 	int			rt;
1217 	xfs_fileoff_t		startoffset_fsb;
1218 	xfs_trans_t		*tp;
1219 
1220 	mp = ip->i_mount;
1221 
1222 	trace_xfs_free_file_space(ip);
1223 
1224 	error = xfs_qm_dqattach(ip, 0);
1225 	if (error)
1226 		return error;
1227 
1228 	error = 0;
1229 	if (len <= 0)	/* if nothing being freed */
1230 		return error;
1231 	rt = XFS_IS_REALTIME_INODE(ip);
1232 	startoffset_fsb	= XFS_B_TO_FSB(mp, offset);
1233 	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1234 
1235 	/* wait for the completion of any pending DIOs */
1236 	inode_dio_wait(VFS_I(ip));
1237 
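	/*
	 * Round the range out to the larger of the block size and the page
	 * size, then write back and remove the page cache over it so that
	 * no stale pages survive the blocks being freed.
	 */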
1238 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
1239 	ioffset = round_down(offset, rounding);
1240 	iendoffset = round_up(offset + len, rounding) - 1;
1241 	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset,
1242 					     iendoffset);
1243 	if (error)
1244 		goto out;
1245 	truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset);
1246 
1247 	/*
1248 	 * Need to zero the stuff we're not freeing, on disk.
1249 	 * If it's a realtime file & can't use unwritten extents then we
1250 	 * actually need to zero the extent edges.  Otherwise xfs_bunmapi
1251 	 * will take care of it for us.
1252 	 */
1253 	if (rt && !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1254 		nimap = 1;
1255 		error = xfs_bmapi_read(ip, startoffset_fsb, 1,
1256 					&imap, &nimap, 0);
1257 		if (error)
1258 			goto out;
1259 		ASSERT(nimap == 0 || nimap == 1);
1260 		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1261 			xfs_daddr_t	block;
1262 
1263 			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1264 			block = imap.br_startblock;
1265 			mod = do_div(block, mp->m_sb.sb_rextsize);
1266 			if (mod)
1267 				startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1268 		}
1269 		nimap = 1;
1270 		error = xfs_bmapi_read(ip, endoffset_fsb - 1, 1,
1271 					&imap, &nimap, 0);
1272 		if (error)
1273 			goto out;
1274 		ASSERT(nimap == 0 || nimap == 1);
1275 		if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1276 			ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1277 			mod++;
1278 			if (mod && (mod != mp->m_sb.sb_rextsize))
1279 				endoffset_fsb -= mod;
1280 		}
1281 	}
1282 	if ((done = (endoffset_fsb <= startoffset_fsb)))
1283 		/*
1284 		 * One contiguous piece to clear
1285 		 */
1286 		error = xfs_zero_remaining_bytes(ip, offset, offset + len - 1);
1287 	else {
1288 		/*
1289 		 * Some full blocks, possibly two pieces to clear
1290 		 */
1291 		if (offset < XFS_FSB_TO_B(mp, startoffset_fsb))
1292 			error = xfs_zero_remaining_bytes(ip, offset,
1293 				XFS_FSB_TO_B(mp, startoffset_fsb) - 1);
1294 		if (!error &&
1295 		    XFS_FSB_TO_B(mp, endoffset_fsb) < offset + len)
1296 			error = xfs_zero_remaining_bytes(ip,
1297 				XFS_FSB_TO_B(mp, endoffset_fsb),
1298 				offset + len - 1);
1299 	}
1300 
1301 	/*
1302 	 * free file space until done or until there is an error
1303 	 */
1304 	resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1305 	while (!error && !done) {
1306 
1307 		/*
1308 		 * allocate and setup the transaction. Allow this
1309 		 * transaction to dip into the reserve blocks to ensure
1310 		 * the freeing of the space succeeds at ENOSPC.
1311 		 */
1312 		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1313 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0);
1314 
1315 		/*
1316 		 * check for running out of space
1317 		 */
1318 		if (error) {
1319 			/*
1320 			 * Free the transaction structure.
1321 			 */
1322 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1323 			xfs_trans_cancel(tp);
1324 			break;
1325 		}
1326 		xfs_ilock(ip, XFS_ILOCK_EXCL);
1327 		error = xfs_trans_reserve_quota(tp, mp,
1328 				ip->i_udquot, ip->i_gdquot, ip->i_pdquot,
1329 				resblks, 0, XFS_QMOPT_RES_REGBLKS);
1330 		if (error)
1331 			goto error1;
1332 
1333 		xfs_trans_ijoin(tp, ip, 0);
1334 
1335 		/*
1336 		 * issue the bunmapi() call to free the blocks
1337 		 */
1338 		xfs_bmap_init(&free_list, &firstfsb);
1339 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
1340 				  endoffset_fsb - startoffset_fsb,
1341 				  0, 2, &firstfsb, &free_list, &done);
1342 		if (error)
1343 			goto error0;
1344 
1345 		/*
1346 		 * complete the transaction
1347 		 */
1348 		error = xfs_bmap_finish(&tp, &free_list, NULL);
1349 		if (error)
1350 			goto error0;
1351 
1352 		error = xfs_trans_commit(tp);
1353 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1354 	}
1355 
1356  out:
1357 	return error;
1358 
1359  error0:
1360 	xfs_bmap_cancel(&free_list);
1361  error1:
1362 	xfs_trans_cancel(tp);
1363 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1364 	goto out;
1365 }
1366 
1367 /*
1368  * Preallocate and zero a range of a file. This mechanism has the allocation
1369  * semantics of fallocate and in addition converts data in the range to zeroes.
1370  */
1371 int
1372 xfs_zero_file_space(
1373 	struct xfs_inode	*ip,
1374 	xfs_off_t		offset,
1375 	xfs_off_t		len)
1376 {
1377 	struct xfs_mount	*mp = ip->i_mount;
1378 	uint			blksize;
1379 	int			error;
1380 
1381 	trace_xfs_zero_file_space(ip);
1382 
1383 	blksize = 1 << mp->m_sb.sb_blocklog;
1384 
1385 	/*
1386 	 * Punch a hole and prealloc the range. We use hole punch rather than
1387 	 * unwritten extent conversion for two reasons:
1388 	 *
1389 	 * 1.) Hole punch handles partial block zeroing for us.
1390 	 *
1391 	 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
1392 	 * by virtue of the hole punch.
1393 	 */
1394 	error = xfs_free_file_space(ip, offset, len);
1395 	if (error)
1396 		goto out;
1397 
1398 	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
1399 				     round_up(offset + len, blksize) -
1400 				     round_down(offset, blksize),
1401 				     XFS_BMAPI_PREALLOC);
1402 out:
1403 	return error;
1404 
1405 }
1406 
1407 /*
1408  * @next_fsb will keep track of the extent currently undergoing shift.
1409  * @stop_fsb will keep track of the extent at which we have to stop.
1410  * If we are shifting left, we will start with block (offset + len) and
1411  * shift each extent up to the last extent.
1412  * If we are shifting right, we will start with the last extent inside file
1413  * space and continue until we reach the block corresponding to offset.
1414  */
1415 static int
1416 xfs_shift_file_space(
1417 	struct xfs_inode        *ip,
1418 	xfs_off_t               offset,
1419 	xfs_off_t               len,
1420 	enum shift_direction	direction)
1421 {
1422 	int			done = 0;
1423 	struct xfs_mount	*mp = ip->i_mount;
1424 	struct xfs_trans	*tp;
1425 	int			error;
1426 	struct xfs_bmap_free	free_list;
1427 	xfs_fsblock_t		first_block;
1428 	xfs_fileoff_t		stop_fsb;
1429 	xfs_fileoff_t		next_fsb;
1430 	xfs_fileoff_t		shift_fsb;
1431 
1432 	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1433 
1434 	if (direction == SHIFT_LEFT) {
1435 		next_fsb = XFS_B_TO_FSB(mp, offset + len);
1436 		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1437 	} else {
1438 		/*
1439 		 * If right shift, delegate the work of initializing next_fsb
1440 		 * to xfs_bmap_shift_extents, since it runs with the ilock held.
1441 		 */
1442 		next_fsb = NULLFSBLOCK;
1443 		stop_fsb = XFS_B_TO_FSB(mp, offset);
1444 	}
1445 
1446 	shift_fsb = XFS_B_TO_FSB(mp, len);
1447 
1448 	/*
1449 	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1450 	 * into the accessible region of the file.
1451 	 */
1452 	if (xfs_can_free_eofblocks(ip, true)) {
1453 		error = xfs_free_eofblocks(mp, ip, false);
1454 		if (error)
1455 			return error;
1456 	}
1457 
1458 	/*
1459 	 * Writeback and invalidate cache for the remainder of the file as we're
1460 	 * about to shift down every extent from offset to EOF.
1461 	 */
1462 	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1463 					     offset, -1);
1464 	if (error)
1465 		return error;
1466 	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1467 					offset >> PAGE_CACHE_SHIFT, -1);
1468 	if (error)
1469 		return error;
1470 
1471 	/*
1472 	 * The extent shifting code works on extent granularity. So, if
1473 	 * stop_fsb is not the starting block of an extent, we need to
1474 	 * split the extent at stop_fsb.
1475 	 */
1476 	if (direction == SHIFT_RIGHT) {
1477 		error = xfs_bmap_split_extent(ip, stop_fsb);
1478 		if (error)
1479 			return error;
1480 	}
1481 
1482 	while (!error && !done) {
1483 		tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
1484 		/*
1485 		 * We need to reserve blocks for the transaction: after
1486 		 * shifting an extent into a hole, adjacent extents may turn
1487 		 * out to be mergeable, and merging the records can free a
1488 		 * block during the update.
1489 		 */
1490 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
1491 				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0);
1492 		if (error) {
1493 			xfs_trans_cancel(tp);
1494 			break;
1495 		}
1496 
1497 		xfs_ilock(ip, XFS_ILOCK_EXCL);
1498 		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1499 				ip->i_gdquot, ip->i_pdquot,
1500 				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1501 				XFS_QMOPT_RES_REGBLKS);
1502 		if (error)
1503 			goto out_trans_cancel;
1504 
1505 		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1506 
1507 		xfs_bmap_init(&free_list, &first_block);
1508 
1509 		/*
1510 		 * We are using the write transaction, in which at most two
1511 		 * bmbt updates are allowed.
1512 		 */
1513 		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1514 				&done, stop_fsb, &first_block, &free_list,
1515 				direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1516 		if (error)
1517 			goto out_bmap_cancel;
1518 
1519 		error = xfs_bmap_finish(&tp, &free_list, NULL);
1520 		if (error)
1521 			goto out_bmap_cancel;
1522 
1523 		error = xfs_trans_commit(tp);
1524 	}
1525 
1526 	return error;
1527 
1528 out_bmap_cancel:
1529 	xfs_bmap_cancel(&free_list);
1530 out_trans_cancel:
1531 	xfs_trans_cancel(tp);
1532 	return error;
1533 }
1534 
1535 /*
1536  * xfs_collapse_file_space()
1537  *	This routine frees disk space and shifts extents for the given file.
1538  *	The first thing we do is free the data blocks in the specified range
1539  *	by calling xfs_free_file_space(), which also syncs dirty data and
1540  *	invalidates the page cache over the region the collapse range is
1541  *	working on.  Then we shift the extent records left to cover the hole.
1542  * RETURNS:
1543  *	0 on success
1544  *	errno on error
1545  *
1546  */
1547 int
1548 xfs_collapse_file_space(
1549 	struct xfs_inode	*ip,
1550 	xfs_off_t		offset,
1551 	xfs_off_t		len)
1552 {
1553 	int error;
1554 
1555 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1556 	trace_xfs_collapse_file_space(ip);
1557 
1558 	error = xfs_free_file_space(ip, offset, len);
1559 	if (error)
1560 		return error;
1561 
1562 	return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1563 }
1564 
1565 /*
1566  * xfs_insert_file_space()
1567  *	This routine creates hole space by shifting extents for the given file.
1568  *	The first thing we do is sync dirty data and invalidate the page cache
1569  *	over the region the insert range is working on.  Then we split the
1570  *	extent at the given offset into two by calling xfs_bmap_split_extent,
1571  *	and shift all extent records lying between [offset, last allocated
1572  *	extent] to the right to make room for the hole.
1573  * RETURNS:
1574  *	0 on success
1575  *	errno on error
1576  */
1577 int
1578 xfs_insert_file_space(
1579 	struct xfs_inode	*ip,
1580 	loff_t			offset,
1581 	loff_t			len)
1582 {
1583 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1584 	trace_xfs_insert_file_space(ip);
1585 
1586 	return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1587 }
1588 
1589 /*
1590  * We need to check that the format of the data fork in the temporary inode is
1591  * valid for the target inode before doing the swap. This is not a problem with
1592  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1593  * data fork depending on the space the attribute fork is taking so we can get
1594  * invalid formats on the target inode.
1595  *
1596  * E.g. target has space for 7 extents in extent format, temp inode only has
1597  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
1598  * btree, but when swapped it needs to be in extent format. Hence we can't just
1599  * blindly swap data forks on attr2 filesystems.
1600  *
1601  * Note that we check the swap in both directions so that we don't end up with
1602  * a corrupt temporary inode, either.
1603  *
1604  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1605  * inode will prevent this situation from occurring, so all we do here is
1606  * reject and log the attempt.  Basically, we are putting the responsibility on
1607  * userspace to get this right.
1608  */
1609 static int
1610 xfs_swap_extents_check_format(
1611 	xfs_inode_t	*ip,	/* target inode */
1612 	xfs_inode_t	*tip)	/* tmp inode */
1613 {
1614 
1615 	/* Should never get a local format */
1616 	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1617 	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1618 		return -EINVAL;
1619 
1620 	/*
1621 	 * if the target inode has fewer extents than the temporary inode then
1622 	 * why did userspace call us?
1623 	 */
1624 	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1625 		return -EINVAL;
1626 
1627 	/*
1628 	 * if the target inode is in extent form and the temp inode is in btree
1629 	 * form then we will end up with the target inode in the wrong format
1630 	 * as we already know there are fewer extents in the temp inode.
1631 	 */
1632 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1633 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1634 		return -EINVAL;
1635 
1636 	/* Check temp in extent form to max in target */
1637 	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1638 	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1639 			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1640 		return -EINVAL;
1641 
1642 	/* Check target in extent form to max in temp */
1643 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1644 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1645 			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1646 		return -EINVAL;
1647 
1648 	/*
1649 	 * If we are in a btree format, check that the temp root block will fit
1650 	 * in the target and that it has enough extents to be in btree format
1651 	 * in the target.
1652 	 *
1653 	 * Note that we have to be careful to allow btree->extent conversions
1654 	 * (a common defrag case) which will occur when the temp inode is in
1655 	 * extent format...
1656 	 */
1657 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1658 		if (XFS_IFORK_BOFF(ip) &&
1659 		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1660 			return -EINVAL;
1661 		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1662 		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1663 			return -EINVAL;
1664 	}
1665 
1666 	/* Reciprocal target->temp btree format checks */
1667 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1668 		if (XFS_IFORK_BOFF(tip) &&
1669 		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1670 			return -EINVAL;
1671 		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1672 		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1673 			return -EINVAL;
1674 	}
1675 
1676 	return 0;
1677 }
1678 
1679 static int
1680 xfs_swap_extent_flush(
1681 	struct xfs_inode	*ip)
1682 {
1683 	int	error;
1684 
1685 	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1686 	if (error)
1687 		return error;
1688 	truncate_pagecache_range(VFS_I(ip), 0, -1);
1689 
1690 	/* Verify O_DIRECT for ftmp */
1691 	if (VFS_I(ip)->i_mapping->nrpages)
1692 		return -EINVAL;
1693 	return 0;
1694 }
1695 
1696 int
1697 xfs_swap_extents(
1698 	xfs_inode_t	*ip,	/* target inode */
1699 	xfs_inode_t	*tip,	/* tmp inode */
1700 	xfs_swapext_t	*sxp)
1701 {
1702 	xfs_mount_t	*mp = ip->i_mount;
1703 	xfs_trans_t	*tp;
1704 	xfs_bstat_t	*sbp = &sxp->sx_stat;
1705 	xfs_ifork_t	*tempifp, *ifp, *tifp;
1706 	int		src_log_flags, target_log_flags;
1707 	int		error = 0;
1708 	int		aforkblks = 0;
1709 	int		taforkblks = 0;
1710 	__uint64_t	tmp;
1711 	int		lock_flags;
1712 
1713 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1714 	if (!tempifp) {
1715 		error = -ENOMEM;
1716 		goto out;
1717 	}
1718 
1719 	/*
1720 	 * Lock the inodes against other IO, page faults and truncate to
1721 	 * begin with.  Then we can ensure the inodes are flushed and have no
1722 	 * begin with.  Then we can safely ensure the inodes are flushed and
1723 	 * have no page cache.  Once we have done this we can take the ilocks and
1724 	 */
1725 	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1726 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1727 	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1728 
1729 	/* Verify that both files have the same format */
1730 	if ((ip->i_d.di_mode & S_IFMT) != (tip->i_d.di_mode & S_IFMT)) {
1731 		error = -EINVAL;
1732 		goto out_unlock;
1733 	}
1734 
1735 	/* Verify both files are either realtime or non-realtime */
1736 	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1737 		error = -EINVAL;
1738 		goto out_unlock;
1739 	}
1740 
1741 	error = xfs_swap_extent_flush(ip);
1742 	if (error)
1743 		goto out_unlock;
1744 	error = xfs_swap_extent_flush(tip);
1745 	if (error)
1746 		goto out_unlock;
1747 
1748 	tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
1749 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
1750 	if (error) {
1751 		xfs_trans_cancel(tp);
1752 		goto out_unlock;
1753 	}
1754 
1755 	/*
1756 	 * Lock and join the inodes to the transaction so that transaction commit
1757 	 * or cancel will unlock the inodes from this point onwards.
1758 	 */
1759 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1760 	lock_flags |= XFS_ILOCK_EXCL;
1761 	xfs_trans_ijoin(tp, ip, lock_flags);
1762 	xfs_trans_ijoin(tp, tip, lock_flags);
1763 
1764 
1765 	/* Verify all data are being swapped */
1766 	if (sxp->sx_offset != 0 ||
1767 	    sxp->sx_length != ip->i_d.di_size ||
1768 	    sxp->sx_length != tip->i_d.di_size) {
1769 		error = -EFAULT;
1770 		goto out_trans_cancel;
1771 	}
1772 
1773 	trace_xfs_swap_extent_before(ip, 0);
1774 	trace_xfs_swap_extent_before(tip, 1);
1775 
1776 	/* check inode formats now that data is flushed */
1777 	error = xfs_swap_extents_check_format(ip, tip);
1778 	if (error) {
1779 		xfs_notice(mp,
1780 		    "%s: inode 0x%llx format is incompatible for exchanging.",
1781 				__func__, ip->i_ino);
1782 		goto out_trans_cancel;
1783 	}
1784 
1785 	/*
1786 	 * Compare the current change & modify times with those
1787 	 * passed in.  If they differ, we abort this swap.
1788 	 * This is the mechanism used to assure the calling
1789 	 * process that the file was not changed out from
1790 	 * under it.
1791 	 */
1792 	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1793 	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1794 	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1795 	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1796 		error = -EBUSY;
1797 		goto out_trans_cancel;
1798 	}
1799 	/*
1800 	 * Count the number of extended attribute blocks
1801 	 */
1802 	if (((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1803 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1804 		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
1805 		if (error)
1806 			goto out_trans_cancel;
1807 	}
1808 	if (((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1809 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1810 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1811 			&taforkblks);
1812 		if (error)
1813 			goto out_trans_cancel;
1814 	}
1815 
1816 	/*
1817 	 * Before we've swapped the forks, lets set the owners of the forks
1818 	 * appropriately. We have to do this as we are demand paging the btree
1819 	 * buffers, and so the validation done on read will expect the owner
1820 	 * field to be correctly set. Once we change the owners, we can swap the
1821 	 * inode forks.
1822 	 *
1823 	 * Note the trickiness in setting the log flags - we set the owner log
1824 	 * flag on the opposite inode (i.e. the inode we are setting the new
1825 	 * owner to be) because once we swap the forks and log that, log
1826 	 * recovery is going to see the fork as owned by the swapped inode,
1827 	 * not the pre-swapped inodes.
1828 	 */
1829 	src_log_flags = XFS_ILOG_CORE;
1830 	target_log_flags = XFS_ILOG_CORE;
1831 	if (ip->i_d.di_version == 3 &&
1832 	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1833 		target_log_flags |= XFS_ILOG_DOWNER;
1834 		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1835 					      tip->i_ino, NULL);
1836 		if (error)
1837 			goto out_trans_cancel;
1838 	}
1839 
1840 	if (tip->i_d.di_version == 3 &&
1841 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1842 		src_log_flags |= XFS_ILOG_DOWNER;
1843 		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1844 					      ip->i_ino, NULL);
1845 		if (error)
1846 			goto out_trans_cancel;
1847 	}
1848 
1849 	/*
1850 	 * Swap the data forks of the inodes
1851 	 */
1852 	ifp = &ip->i_df;
1853 	tifp = &tip->i_df;
1854 	*tempifp = *ifp;	/* struct copy */
1855 	*ifp = *tifp;		/* struct copy */
1856 	*tifp = *tempifp;	/* struct copy */
1857 
1858 	/*
1859 	 * Fix the on-disk inode values
1860 	 */
1861 	tmp = (__uint64_t)ip->i_d.di_nblocks;
1862 	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1863 	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1864 
1865 	tmp = (__uint64_t) ip->i_d.di_nextents;
1866 	ip->i_d.di_nextents = tip->i_d.di_nextents;
1867 	tip->i_d.di_nextents = tmp;
1868 
1869 	tmp = (__uint64_t) ip->i_d.di_format;
1870 	ip->i_d.di_format = tip->i_d.di_format;
1871 	tip->i_d.di_format = tmp;
1872 
1873 	/*
1874 	 * The extents in the source inode could still contain speculative
1875 	 * preallocation beyond EOF (e.g. the file is open but not modified
1876 	 * while defrag is in progress). In that case, we need to copy over the
1877 	 * number of delalloc blocks the data fork in the source inode is
1878 	 * tracking beyond EOF so that when the fork is truncated away when the
1879 	 * temporary inode is unlinked we don't underrun the i_delayed_blks
1880 	 * counter on that inode.
1881 	 */
1882 	ASSERT(tip->i_delayed_blks == 0);
1883 	tip->i_delayed_blks = ip->i_delayed_blks;
1884 	ip->i_delayed_blks = 0;
1885 
1886 	switch (ip->i_d.di_format) {
1887 	case XFS_DINODE_FMT_EXTENTS:
1888 		/* If the extents fit in the inode, fix the
1889 		 * pointer.  Otherwise it's already NULL or
1890 		 * pointing to the extent.
1891 		 */
1892 		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1893 			ifp->if_u1.if_extents =
1894 				ifp->if_u2.if_inline_ext;
1895 		}
1896 		src_log_flags |= XFS_ILOG_DEXT;
1897 		break;
1898 	case XFS_DINODE_FMT_BTREE:
1899 		ASSERT(ip->i_d.di_version < 3 ||
1900 		       (src_log_flags & XFS_ILOG_DOWNER));
1901 		src_log_flags |= XFS_ILOG_DBROOT;
1902 		break;
1903 	}
1904 
1905 	switch (tip->i_d.di_format) {
1906 	case XFS_DINODE_FMT_EXTENTS:
1907 		/* If the extents fit in the inode, fix the
1908 		 * pointer.  Otherwise it's already NULL or
1909 		 * pointing to the extent.
1910 		 */
1911 		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1912 			tifp->if_u1.if_extents =
1913 				tifp->if_u2.if_inline_ext;
1914 		}
1915 		target_log_flags |= XFS_ILOG_DEXT;
1916 		break;
1917 	case XFS_DINODE_FMT_BTREE:
1918 		target_log_flags |= XFS_ILOG_DBROOT;
1919 		ASSERT(tip->i_d.di_version < 3 ||
1920 		       (target_log_flags & XFS_ILOG_DOWNER));
1921 		break;
1922 	}
1923 
1924 	xfs_trans_log_inode(tp, ip,  src_log_flags);
1925 	xfs_trans_log_inode(tp, tip, target_log_flags);
1926 
1927 	/*
1928 	 * If this is a synchronous mount, make sure that the
1929 	 * transaction goes to disk before returning to the user.
1930 	 */
1931 	if (mp->m_flags & XFS_MOUNT_WSYNC)
1932 		xfs_trans_set_sync(tp);
1933 
1934 	error = xfs_trans_commit(tp);
1935 
1936 	trace_xfs_swap_extent_after(ip, 0);
1937 	trace_xfs_swap_extent_after(tip, 1);
1938 out:
1939 	kmem_free(tempifp);
1940 	return error;
1941 
1942 out_unlock:
1943 	xfs_iunlock(ip, lock_flags);
1944 	xfs_iunlock(tip, lock_flags);
1945 	goto out;
1946 
1947 out_trans_cancel:
1948 	xfs_trans_cancel(tp);
1949 	goto out;
1950 }
1951