xref: /openbmc/linux/fs/xfs/xfs_bmap_util.c (revision 77a87824)
1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * Copyright (c) 2012 Red Hat, Inc.
4  * All Rights Reserved.
5  *
6  * This program is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU General Public License as
8  * published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope that it would be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  * GNU General Public License for more details.
14  *
15  * You should have received a copy of the GNU General Public License
16  * along with this program; if not, write the Free Software Foundation,
17  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
18  */
19 #include "xfs.h"
20 #include "xfs_fs.h"
21 #include "xfs_shared.h"
22 #include "xfs_format.h"
23 #include "xfs_log_format.h"
24 #include "xfs_trans_resv.h"
25 #include "xfs_bit.h"
26 #include "xfs_mount.h"
27 #include "xfs_da_format.h"
28 #include "xfs_inode.h"
29 #include "xfs_btree.h"
30 #include "xfs_trans.h"
31 #include "xfs_extfree_item.h"
32 #include "xfs_alloc.h"
33 #include "xfs_bmap.h"
34 #include "xfs_bmap_util.h"
35 #include "xfs_bmap_btree.h"
36 #include "xfs_rtalloc.h"
37 #include "xfs_error.h"
38 #include "xfs_quota.h"
39 #include "xfs_trans_space.h"
40 #include "xfs_trace.h"
41 #include "xfs_icache.h"
42 #include "xfs_log.h"
43 
44 /* Kernel only BMAP related definitions and functions */
45 
46 /*
47  * Convert the given file system block to a disk block.  We have to treat it
48  * differently based on whether the file is a real time file or not, because the
49  * bmap code does.
50  */
51 xfs_daddr_t
52 xfs_fsb_to_db(struct xfs_inode *ip, xfs_fsblock_t fsb)
53 {
54 	return (XFS_IS_REALTIME_INODE(ip) ?
55 		 (xfs_daddr_t)XFS_FSB_TO_BB((ip)->i_mount, (fsb)) :
56 		 XFS_FSB_TO_DADDR((ip)->i_mount, (fsb)));
57 }
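
/*
 * Worked example (illustrative editor's note, not part of the original
 * source): with 4k blocks, sb_blocklog == 12 and there are 8 basic
 * 512-byte blocks per fsblock, so for a realtime inode the mapping is
 * a simple linear scaling:
 *
 *	xfs_fsb_to_db(ip, 100) == XFS_FSB_TO_BB(mp, 100) == 800
 *
 * For a data device inode the fsblock is first decomposed into an AG
 * number and an AG block, so numerically adjacent fsblocks in
 * different AGs can map to widely separated disk addresses.
 */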
58 
59 /*
60  * Routine to zero an extent on disk allocated to the specific inode.
61  *
62  * The VFS functions take a linearised filesystem block offset, so we have to
63  * convert the sparse xfs fsb to the right format first.
64  * VFS types are real funky, too.
65  */
66 int
67 xfs_zero_extent(
68 	struct xfs_inode *ip,
69 	xfs_fsblock_t	start_fsb,
70 	xfs_off_t	count_fsb)
71 {
72 	struct xfs_mount *mp = ip->i_mount;
73 	xfs_daddr_t	sector = xfs_fsb_to_db(ip, start_fsb);
74 	sector_t	block = XFS_BB_TO_FSBT(mp, sector);
75 
76 	return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)),
77 		block << (mp->m_super->s_blocksize_bits - 9),
78 		count_fsb << (mp->m_super->s_blocksize_bits - 9),
79 		GFP_NOFS, true);
80 }
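
/*
 * Worked example (illustrative, assuming 4k filesystem blocks): the
 * shift count is s_blocksize_bits - 9 == 3, so a linearised block of
 * 16 and a count of 4 fsblocks are passed to blkdev_issue_zeroout()
 * as sector 128 and 32 sectors, the 512-byte units the block layer
 * expects.
 */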
81 
82 /* Sort bmap items by AG. */
83 static int
84 xfs_bmap_free_list_cmp(
85 	void			*priv,
86 	struct list_head	*a,
87 	struct list_head	*b)
88 {
89 	struct xfs_mount	*mp = priv;
90 	struct xfs_bmap_free_item	*ra;
91 	struct xfs_bmap_free_item	*rb;
92 
93 	ra = container_of(a, struct xfs_bmap_free_item, xbfi_list);
94 	rb = container_of(b, struct xfs_bmap_free_item, xbfi_list);
95 	return  XFS_FSB_TO_AGNO(mp, ra->xbfi_startblock) -
96 		XFS_FSB_TO_AGNO(mp, rb->xbfi_startblock);
97 }
98 
99 /*
100  * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
101  * caller.  Frees all the extents that need freeing, which must be done
102  * last due to locking considerations.  We never free any extents in
103  * the first transaction.
104  *
105  * If an inode *ip is provided, rejoin it to the transaction if
106  * the transaction was committed.
107  */
108 int						/* error */
109 xfs_bmap_finish(
110 	struct xfs_trans		**tp,	/* transaction pointer addr */
111 	struct xfs_bmap_free		*flist,	/* i/o: list extents to free */
112 	struct xfs_inode		*ip)
113 {
114 	struct xfs_efd_log_item		*efd;	/* extent free data */
115 	struct xfs_efi_log_item		*efi;	/* extent free intention */
116 	int				error;	/* error return value */
117 	int				committed; /* xact committed or not */
118 	struct xfs_bmap_free_item	*free;	/* free extent item */
119 
120 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
121 	if (flist->xbf_count == 0)
122 		return 0;
123 
124 	list_sort((*tp)->t_mountp, &flist->xbf_flist, xfs_bmap_free_list_cmp);
125 
126 	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
127 	list_for_each_entry(free, &flist->xbf_flist, xbfi_list)
128 		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
129 			free->xbfi_blockcount);
130 
131 	error = __xfs_trans_roll(tp, ip, &committed);
132 	if (error) {
133 		/*
134 		 * If the transaction was committed, drop the EFD reference
135 		 * since we're bailing out of here. The other reference is
136 		 * dropped when the EFI hits the AIL.
137 		 *
138 		 * If the transaction was not committed, the EFI is freed by the
139 		 * EFI item unlock handler on abort. Also, we have a new
140 		 * transaction so we should return committed=1 even though we're
141 		 * returning an error.
142 		 */
143 		if (committed) {
144 			xfs_efi_release(efi);
145 			xfs_force_shutdown((*tp)->t_mountp,
146 					   SHUTDOWN_META_IO_ERROR);
147 		}
148 		return error;
149 	}
150 
151 	/*
152 	 * Get an EFD and free each extent in the list, logging to the EFD in
153 	 * the process. The remaining bmap free list is cleaned up by the caller
154 	 * on error.
155 	 */
156 	efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count);
157 	while (!list_empty(&flist->xbf_flist)) {
158 		free = list_first_entry(&flist->xbf_flist,
159 				struct xfs_bmap_free_item, xbfi_list);
160 		error = xfs_trans_free_extent(*tp, efd, free->xbfi_startblock,
161 					      free->xbfi_blockcount);
162 		if (error)
163 			return error;
164 
165 		xfs_bmap_del_free(flist, free);
166 	}
167 
168 	return 0;
169 }
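
/*
 * Illustrative sketch of the intent/done ordering implemented above
 * (editor's note, not from the original source):
 *
 *	trans 1:  log EFI covering extents A and B	(intent)
 *	roll to trans 2
 *	trans 2:  free A, free B, log EFD		(done)
 *
 * If the system crashes between the two transactions, log recovery
 * finds an EFI without a matching EFD and replays the frees.
 */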
170 
171 int
172 xfs_bmap_rtalloc(
173 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
174 {
175 	xfs_alloctype_t	atype = 0;	/* type for allocation routines */
176 	int		error;		/* error return value */
177 	xfs_mount_t	*mp;		/* mount point structure */
178 	xfs_extlen_t	prod = 0;	/* product factor for allocators */
179 	xfs_extlen_t	ralen = 0;	/* realtime allocation length */
180 	xfs_extlen_t	align;		/* minimum allocation alignment */
181 	xfs_rtblock_t	rtb;
182 
183 	mp = ap->ip->i_mount;
184 	align = xfs_get_extsz_hint(ap->ip);
185 	prod = align / mp->m_sb.sb_rextsize;
186 	error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
187 					align, 1, ap->eof, 0,
188 					ap->conv, &ap->offset, &ap->length);
189 	if (error)
190 		return error;
191 	ASSERT(ap->length);
192 	ASSERT(ap->length % mp->m_sb.sb_rextsize == 0);
193 
194 	/*
195 	 * If the offset & length are not perfectly aligned
196 	 * then kill prod; it will just get us in trouble.
197 	 */
198 	if (do_mod(ap->offset, align) || ap->length % align)
199 		prod = 1;
200 	/*
201 	 * Set ralen to be the actual requested length in rtextents.
202 	 */
203 	ralen = ap->length / mp->m_sb.sb_rextsize;
204 	/*
205 	 * If the old value was close enough to MAXEXTLEN that
206 	 * we rounded up to it, cut it back so it's valid again.
207 	 * Note that if it's a really large request (bigger than
208 	 * MAXEXTLEN), we don't hear about that number, and can't
209 	 * adjust the starting point to match it.
210 	 */
211 	if (ralen * mp->m_sb.sb_rextsize >= MAXEXTLEN)
212 		ralen = MAXEXTLEN / mp->m_sb.sb_rextsize;
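	/*
	 * Worked example (illustrative, not in the original source): with
	 * MAXEXTLEN == (1 << 21) - 1 and sb_rextsize == 16, the clamp is
	 * 2097151 / 16 == 131071 rtextents, i.e. 2097136 blocks, safely
	 * below MAXEXTLEN again.
	 */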
213 
214 	/*
215 	 * Lock out modifications to both the RT bitmap and summary inodes
216 	 */
217 	xfs_ilock(mp->m_rbmip, XFS_ILOCK_EXCL);
218 	xfs_trans_ijoin(ap->tp, mp->m_rbmip, XFS_ILOCK_EXCL);
219 	xfs_ilock(mp->m_rsumip, XFS_ILOCK_EXCL);
220 	xfs_trans_ijoin(ap->tp, mp->m_rsumip, XFS_ILOCK_EXCL);
221 
222 	/*
223 	 * If it's an allocation to an empty file at offset 0,
224 	 * pick an extent that will space things out in the rt area.
225 	 */
226 	if (ap->eof && ap->offset == 0) {
227 		xfs_rtblock_t uninitialized_var(rtx); /* realtime extent no */
228 
229 		error = xfs_rtpick_extent(mp, ap->tp, ralen, &rtx);
230 		if (error)
231 			return error;
232 		ap->blkno = rtx * mp->m_sb.sb_rextsize;
233 	} else {
234 		ap->blkno = 0;
235 	}
236 
237 	xfs_bmap_adjacent(ap);
238 
239 	/*
240 	 * Realtime allocation, done through xfs_rtallocate_extent.
241 	 */
242 	atype = ap->blkno == 0 ? XFS_ALLOCTYPE_ANY_AG : XFS_ALLOCTYPE_NEAR_BNO;
243 	do_div(ap->blkno, mp->m_sb.sb_rextsize);
244 	rtb = ap->blkno;
245 	ap->length = ralen;
246 	if ((error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1, ap->length,
247 				&ralen, atype, ap->wasdel, prod, &rtb)))
248 		return error;
249 	if (rtb == NULLFSBLOCK && prod > 1 &&
250 	    (error = xfs_rtallocate_extent(ap->tp, ap->blkno, 1,
251 					   ap->length, &ralen, atype,
252 					   ap->wasdel, 1, &rtb)))
253 		return error;
254 	ap->blkno = rtb;
255 	if (ap->blkno != NULLFSBLOCK) {
256 		ap->blkno *= mp->m_sb.sb_rextsize;
257 		ralen *= mp->m_sb.sb_rextsize;
258 		ap->length = ralen;
259 		ap->ip->i_d.di_nblocks += ralen;
260 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
261 		if (ap->wasdel)
262 			ap->ip->i_delayed_blks -= ralen;
263 		/*
264 		 * Adjust the disk quota also. This was reserved
265 		 * earlier.
266 		 */
267 		xfs_trans_mod_dquot_byino(ap->tp, ap->ip,
268 			ap->wasdel ? XFS_TRANS_DQ_DELRTBCOUNT :
269 					XFS_TRANS_DQ_RTBCOUNT, (long) ralen);
270 
271 		/* Zero the extent if we were asked to do so */
272 		if (ap->userdata & XFS_ALLOC_USERDATA_ZERO) {
273 			error = xfs_zero_extent(ap->ip, ap->blkno, ap->length);
274 			if (error)
275 				return error;
276 		}
277 	} else {
278 		ap->length = 0;
279 	}
280 	return 0;
281 }
282 
283 /*
284  * Check if the endoff is outside the last extent. If so the caller will grow
285  * the allocation to a stripe unit boundary.  All offsets are considered outside
286  * the end of file for an empty fork, so 1 is returned in *eof in that case.
287  */
288 int
289 xfs_bmap_eof(
290 	struct xfs_inode	*ip,
291 	xfs_fileoff_t		endoff,
292 	int			whichfork,
293 	int			*eof)
294 {
295 	struct xfs_bmbt_irec	rec;
296 	int			error;
297 
298 	error = xfs_bmap_last_extent(NULL, ip, whichfork, &rec, eof);
299 	if (error || *eof)
300 		return error;
301 
302 	*eof = endoff >= rec.br_startoff + rec.br_blockcount;
303 	return 0;
304 }
305 
306 /*
307  * Extent tree block counting routines.
308  */
309 
310 /*
311  * Count leaf blocks given a range of extent records.
312  */
313 STATIC void
314 xfs_bmap_count_leaves(
315 	xfs_ifork_t		*ifp,
316 	xfs_extnum_t		idx,
317 	int			numrecs,
318 	int			*count)
319 {
320 	int		b;
321 
322 	for (b = 0; b < numrecs; b++) {
323 		xfs_bmbt_rec_host_t *frp = xfs_iext_get_ext(ifp, idx + b);
324 		*count += xfs_bmbt_get_blockcount(frp);
325 	}
326 }
327 
328 /*
329  * Count leaf blocks given a range of extent records originally
330  * in btree format.
331  */
332 STATIC void
333 xfs_bmap_disk_count_leaves(
334 	struct xfs_mount	*mp,
335 	struct xfs_btree_block	*block,
336 	int			numrecs,
337 	int			*count)
338 {
339 	int		b;
340 	xfs_bmbt_rec_t	*frp;
341 
342 	for (b = 1; b <= numrecs; b++) {
343 		frp = XFS_BMBT_REC_ADDR(mp, block, b);
344 		*count += xfs_bmbt_disk_get_blockcount(frp);
345 	}
346 }
347 
348 /*
349  * Recursively walks each level of a btree
350  * to count total fsblocks in use.
351  */
352 STATIC int                                     /* error */
353 xfs_bmap_count_tree(
354 	xfs_mount_t     *mp,            /* file system mount point */
355 	xfs_trans_t     *tp,            /* transaction pointer */
356 	xfs_ifork_t	*ifp,		/* inode fork pointer */
357 	xfs_fsblock_t   blockno,	/* file system block number */
358 	int             levelin,	/* level in btree */
359 	int		*count)		/* Count of blocks */
360 {
361 	int			error;
362 	xfs_buf_t		*bp, *nbp;
363 	int			level = levelin;
364 	__be64			*pp;
365 	xfs_fsblock_t           bno = blockno;
366 	xfs_fsblock_t		nextbno;
367 	struct xfs_btree_block	*block, *nextblock;
368 	int			numrecs;
369 
370 	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
371 						&xfs_bmbt_buf_ops);
372 	if (error)
373 		return error;
374 	*count += 1;
375 	block = XFS_BUF_TO_BLOCK(bp);
376 
377 	if (--level) {
378 		/* Not yet at the node level just above the leaves, count this level of nodes */
379 		nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
380 		while (nextbno != NULLFSBLOCK) {
381 			error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
382 						XFS_BMAP_BTREE_REF,
383 						&xfs_bmbt_buf_ops);
384 			if (error)
385 				return error;
386 			*count += 1;
387 			nextblock = XFS_BUF_TO_BLOCK(nbp);
388 			nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
389 			xfs_trans_brelse(tp, nbp);
390 		}
391 
392 		/* Dive to the next level */
393 		pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
394 		bno = be64_to_cpu(*pp);
395 		if (unlikely((error =
396 		     xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
397 			xfs_trans_brelse(tp, bp);
398 			XFS_ERROR_REPORT("xfs_bmap_count_tree(1)",
399 					 XFS_ERRLEVEL_LOW, mp);
400 			return -EFSCORRUPTED;
401 		}
402 		xfs_trans_brelse(tp, bp);
403 	} else {
404 		/* count all level 1 nodes and their leaves */
405 		for (;;) {
406 			nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
407 			numrecs = be16_to_cpu(block->bb_numrecs);
408 			xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
409 			xfs_trans_brelse(tp, bp);
410 			if (nextbno == NULLFSBLOCK)
411 				break;
412 			bno = nextbno;
413 			error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
414 						XFS_BMAP_BTREE_REF,
415 						&xfs_bmbt_buf_ops);
416 			if (error)
417 				return error;
418 			*count += 1;
419 			block = XFS_BUF_TO_BLOCK(bp);
420 		}
421 	}
422 	return 0;
423 }
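
/*
 * Illustrative example (editor's note, not from the original source):
 * when xfs_bmap_count_tree() is entered with levelin == 1 it lands in
 * the else branch straight away: the leaf blocks form a chain linked
 * through bb_rightsib, and the walk adds 1 to *count per leaf block
 * visited plus the blockcount of every record inside it, i.e. btree
 * overhead blocks plus the blocks mapped by the extents themselves.
 */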
424 
425 /*
426  * Count fsblocks of the given fork.
427  */
428 static int					/* error */
429 xfs_bmap_count_blocks(
430 	xfs_trans_t		*tp,		/* transaction pointer */
431 	xfs_inode_t		*ip,		/* incore inode */
432 	int			whichfork,	/* data or attr fork */
433 	int			*count)		/* out: count of blocks */
434 {
435 	struct xfs_btree_block	*block;	/* current btree block */
436 	xfs_fsblock_t		bno;	/* block # of "block" */
437 	xfs_ifork_t		*ifp;	/* fork structure */
438 	int			level;	/* btree level, for checking */
439 	xfs_mount_t		*mp;	/* file system mount structure */
440 	__be64			*pp;	/* pointer to block address */
441 
442 	bno = NULLFSBLOCK;
443 	mp = ip->i_mount;
444 	ifp = XFS_IFORK_PTR(ip, whichfork);
445 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS) {
446 		xfs_bmap_count_leaves(ifp, 0,
447 			ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t),
448 			count);
449 		return 0;
450 	}
451 
452 	/*
453 	 * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out.
454 	 */
455 	block = ifp->if_broot;
456 	level = be16_to_cpu(block->bb_level);
457 	ASSERT(level > 0);
458 	pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
459 	bno = be64_to_cpu(*pp);
460 	ASSERT(bno != NULLFSBLOCK);
461 	ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
462 	ASSERT(XFS_FSB_TO_AGBNO(mp, bno) < mp->m_sb.sb_agblocks);
463 
464 	if (unlikely(xfs_bmap_count_tree(mp, tp, ifp, bno, level, count) < 0)) {
465 		XFS_ERROR_REPORT("xfs_bmap_count_blocks(2)", XFS_ERRLEVEL_LOW,
466 				 mp);
467 		return -EFSCORRUPTED;
468 	}
469 
470 	return 0;
471 }
472 
473 /*
474  * Returns 1 for success, 0 if we failed to map the extent.
475  */
476 STATIC int
477 xfs_getbmapx_fix_eof_hole(
478 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
479 	struct getbmapx		*out,		/* output structure */
480 	int			prealloced,	/* this is a file with
481 						 * preallocated data space */
482 	__int64_t		end,		/* last block requested */
483 	xfs_fsblock_t		startblock)
484 {
485 	__int64_t		fixlen;
486 	xfs_mount_t		*mp;		/* file system mount point */
487 	xfs_ifork_t		*ifp;		/* inode fork pointer */
488 	xfs_extnum_t		lastx;		/* last extent pointer */
489 	xfs_fileoff_t		fileblock;
490 
491 	if (startblock == HOLESTARTBLOCK) {
492 		mp = ip->i_mount;
493 		out->bmv_block = -1;
494 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, XFS_ISIZE(ip)));
495 		fixlen -= out->bmv_offset;
496 		if (prealloced && out->bmv_offset + out->bmv_length == end) {
497 			/* Came to hole at EOF. Trim it. */
498 			if (fixlen <= 0)
499 				return 0;
500 			out->bmv_length = fixlen;
501 		}
502 	} else {
503 		if (startblock == DELAYSTARTBLOCK)
504 			out->bmv_block = -2;
505 		else
506 			out->bmv_block = xfs_fsb_to_db(ip, startblock);
507 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
508 		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
509 		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
510 		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
511 			out->bmv_oflags |= BMV_OF_LAST;
512 	}
513 
514 	return 1;
515 }
516 
517 /*
518  * Get inode's extents as described in bmv, and format for output.
519  * Calls formatter to fill the user's buffer until all extents
520  * are mapped, until the passed-in bmv->bmv_count slots have
521  * been filled, or until the formatter short-circuits the loop,
522  * if it is tracking filled-in extents on its own.
523  */
524 int						/* error code */
525 xfs_getbmap(
526 	xfs_inode_t		*ip,
527 	struct getbmapx		*bmv,		/* user bmap structure */
528 	xfs_bmap_format_t	formatter,	/* format to user */
529 	void			*arg)		/* formatter arg */
530 {
531 	__int64_t		bmvend;		/* last block requested */
532 	int			error = 0;	/* return value */
533 	__int64_t		fixlen;		/* length for -1 case */
534 	int			i;		/* extent number */
535 	int			lock;		/* lock state */
536 	xfs_bmbt_irec_t		*map;		/* buffer for user's data */
537 	xfs_mount_t		*mp;		/* file system mount point */
538 	int			nex;		/* # of user extents can do */
539 	int			nexleft;	/* # of user extents left */
540 	int			subnex;		/* # of bmapi's can do */
541 	int			nmap;		/* number of map entries */
542 	struct getbmapx		*out;		/* output structure */
543 	int			whichfork;	/* data or attr fork */
544 	int			prealloced;	/* this is a file with
545 						 * preallocated data space */
546 	int			iflags;		/* interface flags */
547 	int			bmapi_flags;	/* flags for xfs_bmapi */
548 	int			cur_ext = 0;
549 
550 	mp = ip->i_mount;
551 	iflags = bmv->bmv_iflags;
552 	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
553 
554 	if (whichfork == XFS_ATTR_FORK) {
555 		if (XFS_IFORK_Q(ip)) {
556 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
557 			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
558 			    ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)
559 				return -EINVAL;
560 		} else if (unlikely(
561 			   ip->i_d.di_aformat != 0 &&
562 			   ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS)) {
563 			XFS_ERROR_REPORT("xfs_getbmap", XFS_ERRLEVEL_LOW,
564 					 ip->i_mount);
565 			return -EFSCORRUPTED;
566 		}
567 
568 		prealloced = 0;
569 		fixlen = 1LL << 32;
570 	} else {
571 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
572 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
573 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
574 			return -EINVAL;
575 
576 		if (xfs_get_extsz_hint(ip) ||
577 		    ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC|XFS_DIFLAG_APPEND)){
578 			prealloced = 1;
579 			fixlen = mp->m_super->s_maxbytes;
580 		} else {
581 			prealloced = 0;
582 			fixlen = XFS_ISIZE(ip);
583 		}
584 	}
585 
586 	if (bmv->bmv_length == -1) {
587 		fixlen = XFS_FSB_TO_BB(mp, XFS_B_TO_FSB(mp, fixlen));
588 		bmv->bmv_length =
589 			max_t(__int64_t, fixlen - bmv->bmv_offset, 0);
590 	} else if (bmv->bmv_length == 0) {
591 		bmv->bmv_entries = 0;
592 		return 0;
593 	} else if (bmv->bmv_length < 0) {
594 		return -EINVAL;
595 	}
596 
597 	nex = bmv->bmv_count - 1;
598 	if (nex <= 0)
599 		return -EINVAL;
600 	bmvend = bmv->bmv_offset + bmv->bmv_length;
601 
602 
603 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
604 		return -ENOMEM;
605 	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
606 	if (!out)
607 		return -ENOMEM;
608 
609 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
610 	if (whichfork == XFS_DATA_FORK) {
611 		if (!(iflags & BMV_IF_DELALLOC) &&
612 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
613 			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
614 			if (error)
615 				goto out_unlock_iolock;
616 
617 			/*
618 			 * Even after flushing the inode, there can still be
619 			 * delalloc blocks on the inode beyond EOF due to
620 			 * speculative preallocation.  These are not removed
621 			 * until the release function is called or the inode
622 			 * is inactivated.  Hence we cannot assert here that
623 			 * ip->i_delayed_blks == 0.
624 			 */
625 		}
626 
627 		lock = xfs_ilock_data_map_shared(ip);
628 	} else {
629 		lock = xfs_ilock_attr_map_shared(ip);
630 	}
631 
632 	/*
633 	 * Don't let nex be bigger than the number of extents
634 	 * we can have assuming alternating holes and real extents.
635 	 */
636 	if (nex > XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1)
637 		nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
638 
639 	bmapi_flags = xfs_bmapi_aflag(whichfork);
640 	if (!(iflags & BMV_IF_PREALLOC))
641 		bmapi_flags |= XFS_BMAPI_IGSTATE;
642 
643 	/*
644 	 * Allocate enough space to handle "subnex" maps at a time.
645 	 */
646 	error = -ENOMEM;
647 	subnex = 16;
648 	map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS);
649 	if (!map)
650 		goto out_unlock_ilock;
651 
652 	bmv->bmv_entries = 0;
653 
654 	if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0 &&
655 	    (whichfork == XFS_ATTR_FORK || !(iflags & BMV_IF_DELALLOC))) {
656 		error = 0;
657 		goto out_free_map;
658 	}
659 
660 	nexleft = nex;
661 
662 	do {
663 		nmap = (nexleft > subnex) ? subnex : nexleft;
664 		error = xfs_bmapi_read(ip, XFS_BB_TO_FSBT(mp, bmv->bmv_offset),
665 				       XFS_BB_TO_FSB(mp, bmv->bmv_length),
666 				       map, &nmap, bmapi_flags);
667 		if (error)
668 			goto out_free_map;
669 		ASSERT(nmap <= subnex);
670 
671 		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
672 			out[cur_ext].bmv_oflags = 0;
673 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
674 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
675 			else if (map[i].br_startblock == DELAYSTARTBLOCK)
676 				out[cur_ext].bmv_oflags |= BMV_OF_DELALLOC;
677 			out[cur_ext].bmv_offset =
678 				XFS_FSB_TO_BB(mp, map[i].br_startoff);
679 			out[cur_ext].bmv_length =
680 				XFS_FSB_TO_BB(mp, map[i].br_blockcount);
681 			out[cur_ext].bmv_unused1 = 0;
682 			out[cur_ext].bmv_unused2 = 0;
683 
684 			/*
685 			 * delayed allocation extents that start beyond EOF can
686 			 * occur due to speculative EOF allocation when the
687 			 * delalloc extent is larger than the largest freespace
688 			 * extent at conversion time. These extents cannot be
689 			 * converted by data writeback, so can exist here even
690 			 * if we are not supposed to be finding delalloc
691 			 * extents.
692 			 */
693 			if (map[i].br_startblock == DELAYSTARTBLOCK &&
694 			    map[i].br_startoff <= XFS_B_TO_FSB(mp, XFS_ISIZE(ip)))
695 				ASSERT((iflags & BMV_IF_DELALLOC) != 0);
696 
697 			if (map[i].br_startblock == HOLESTARTBLOCK &&
698 			    whichfork == XFS_ATTR_FORK) {
699 				/* came to the end of attribute fork */
700 				out[cur_ext].bmv_oflags |= BMV_OF_LAST;
701 				goto out_free_map;
702 			}
703 
704 			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
705 					prealloced, bmvend,
706 					map[i].br_startblock))
707 				goto out_free_map;
708 
709 			bmv->bmv_offset =
710 				out[cur_ext].bmv_offset +
711 				out[cur_ext].bmv_length;
712 			bmv->bmv_length =
713 				max_t(__int64_t, 0, bmvend - bmv->bmv_offset);
714 
715 			/*
716 			 * In case we don't want to return the hole,
717 			 * don't increase cur_ext so that we can reuse
718 			 * it in the next loop.
719 			 */
720 			if ((iflags & BMV_IF_NO_HOLES) &&
721 			    map[i].br_startblock == HOLESTARTBLOCK) {
722 				memset(&out[cur_ext], 0, sizeof(out[cur_ext]));
723 				continue;
724 			}
725 
726 			nexleft--;
727 			bmv->bmv_entries++;
728 			cur_ext++;
729 		}
730 	} while (nmap && nexleft && bmv->bmv_length);
731 
732  out_free_map:
733 	kmem_free(map);
734  out_unlock_ilock:
735 	xfs_iunlock(ip, lock);
736  out_unlock_iolock:
737 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
738 
739 	for (i = 0; i < cur_ext; i++) {
740 		int full = 0;	/* user array is full */
741 
742 		/* format results & advance arg */
743 		error = formatter(&arg, &out[i], &full);
744 		if (error || full)
745 			break;
746 	}
747 
748 	kmem_free(out);
749 	return error;
750 }
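
/*
 * Unit note with a worked example (illustrative, not from the original
 * source): getbmapx offsets and lengths are reported in 512-byte basic
 * blocks.  On a 4k block filesystem an extent with br_startoff == 2
 * and br_blockcount == 4 comes back as bmv_offset == 16 and
 * bmv_length == 32.
 */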
751 
752 /*
753  * Dead simple method of punching delayed allocation blocks from a range in
754  * the inode. Walks a block at a time so it will be slow, but it is only executed in
755  * rare error cases so the overhead is not critical. This will always punch out
756  * both the start and end blocks, even if the ranges only partially overlap
757  * them, so it is up to the caller to ensure that partial blocks are not
758  * passed in.
759  */
760 int
761 xfs_bmap_punch_delalloc_range(
762 	struct xfs_inode	*ip,
763 	xfs_fileoff_t		start_fsb,
764 	xfs_fileoff_t		length)
765 {
766 	xfs_fileoff_t		remaining = length;
767 	int			error = 0;
768 
769 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
770 
771 	do {
772 		int		done;
773 		xfs_bmbt_irec_t	imap;
774 		int		nimaps = 1;
775 		xfs_fsblock_t	firstblock;
776 		xfs_bmap_free_t flist;
777 
778 		/*
779 		 * Map the range first and check that it is a delalloc extent
780 		 * before trying to unmap the range. Otherwise we will be
781 		 * trying to remove a real extent (which requires a
782 		 * transaction) or a hole, which is probably a bad idea...
783 		 */
784 		error = xfs_bmapi_read(ip, start_fsb, 1, &imap, &nimaps,
785 				       XFS_BMAPI_ENTIRE);
786 
787 		if (error) {
788 			/* something screwed, just bail */
789 			if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
790 				xfs_alert(ip->i_mount,
791 			"Failed delalloc mapping lookup ino %lld fsb %lld.",
792 						ip->i_ino, start_fsb);
793 			}
794 			break;
795 		}
796 		if (!nimaps) {
797 			/* nothing there */
798 			goto next_block;
799 		}
800 		if (imap.br_startblock != DELAYSTARTBLOCK) {
801 			/* been converted, ignore */
802 			goto next_block;
803 		}
804 		WARN_ON(imap.br_blockcount == 0);
805 
806 		/*
807 		 * Note: while we initialise the firstblock/flist pair, they
808 		 * should never be used because blocks should never be
809 		 * allocated or freed for a delalloc extent, and hence we don't
810 		 * need to cancel or finish them after the xfs_bunmapi() call.
811 		 */
812 		xfs_bmap_init(&flist, &firstblock);
813 		error = xfs_bunmapi(NULL, ip, start_fsb, 1, 0, 1, &firstblock,
814 					&flist, &done);
815 		if (error)
816 			break;
817 
818 		ASSERT(!flist.xbf_count && list_empty(&flist.xbf_flist));
819 next_block:
820 		start_fsb++;
821 		remaining--;
822 	} while (remaining > 0);
823 
824 	return error;
825 }
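
/*
 * Illustrative example (editor's note, not from the original source):
 * punching fsblocks [8, 12) issues up to four single-block
 * xfs_bunmapi() calls; blocks that turn out to map a hole or an
 * already converted real extent are simply skipped via the next_block
 * path above.
 */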
826 
827 /*
828  * Test whether it is appropriate to check an inode for and free post-EOF
829  * blocks. The 'force' parameter determines whether we should also consider
830  * regular files that are marked preallocated or append-only.
831  */
832 bool
833 xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
834 {
835 	/* prealloc/delalloc exists only on regular files */
836 	if (!S_ISREG(VFS_I(ip)->i_mode))
837 		return false;
838 
839 	/*
840 	 * Zero sized files with no cached pages and delalloc blocks will not
841 	 * have speculative prealloc/delalloc blocks to remove.
842 	 */
843 	if (VFS_I(ip)->i_size == 0 &&
844 	    VFS_I(ip)->i_mapping->nrpages == 0 &&
845 	    ip->i_delayed_blks == 0)
846 		return false;
847 
848 	/* If we haven't read in the extent list, then don't do it now. */
849 	if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
850 		return false;
851 
852 	/*
853 	 * Do not free real preallocated or append-only files unless the file
854 	 * has delalloc blocks and we are forced to remove them.
855 	 */
856 	if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
857 		if (!force || ip->i_delayed_blks == 0)
858 			return false;
859 
860 	return true;
861 }
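
/*
 * Behaviour sketch (illustrative, not from the original source): a
 * PREALLOC or APPEND marked file is only eligible when force is true
 * and it still carries delalloc blocks; any other regular file with an
 * in-memory extent list and either a nonzero size, cached pages or
 * delalloc blocks is a candidate.
 */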
862 
863 /*
864  * This is called by xfs_inactive to free any blocks beyond eof
865  * when the link count isn't zero and by xfs_dm_punch_hole() when
866  * punching a hole to EOF.
867  */
868 int
869 xfs_free_eofblocks(
870 	xfs_mount_t	*mp,
871 	xfs_inode_t	*ip,
872 	bool		need_iolock)
873 {
874 	xfs_trans_t	*tp;
875 	int		error;
876 	xfs_fileoff_t	end_fsb;
877 	xfs_fileoff_t	last_fsb;
878 	xfs_filblks_t	map_len;
879 	int		nimaps;
880 	xfs_bmbt_irec_t	imap;
881 
882 	/*
883 	 * Figure out if there are any blocks beyond the end
884 	 * of the file.  If not, then there is nothing to do.
885 	 */
886 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_ISIZE(ip));
887 	last_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
888 	if (last_fsb <= end_fsb)
889 		return 0;
890 	map_len = last_fsb - end_fsb;
891 
892 	nimaps = 1;
893 	xfs_ilock(ip, XFS_ILOCK_SHARED);
894 	error = xfs_bmapi_read(ip, end_fsb, map_len, &imap, &nimaps, 0);
895 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
896 
897 	if (!error && (nimaps != 0) &&
898 	    (imap.br_startblock != HOLESTARTBLOCK ||
899 	     ip->i_delayed_blks)) {
900 		/*
901 		 * Attach the dquots to the inode up front.
902 		 */
903 		error = xfs_qm_dqattach(ip, 0);
904 		if (error)
905 			return error;
906 
907 		/*
908 		 * There are blocks after the end of file.
909 		 * Free them up now by truncating the file to
910 		 * its current size.
911 		 */
912 		if (need_iolock) {
913 			if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
914 				return -EAGAIN;
915 		}
916 
917 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0,
918 				&tp);
919 		if (error) {
920 			ASSERT(XFS_FORCED_SHUTDOWN(mp));
921 			if (need_iolock)
922 				xfs_iunlock(ip, XFS_IOLOCK_EXCL);
923 			return error;
924 		}
925 
926 		xfs_ilock(ip, XFS_ILOCK_EXCL);
927 		xfs_trans_ijoin(tp, ip, 0);
928 
929 		/*
930 		 * Do not update the on-disk file size.  If we update the
931 		 * on-disk file size and then the system crashes before the
932 		 * contents of the file are flushed to disk then the files
933 		 * may be full of holes (i.e. the NULL files bug).
934 		 */
935 		error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK,
936 					      XFS_ISIZE(ip));
937 		if (error) {
938 			/*
939 			 * If we get an error at this point we simply don't
940 			 * bother truncating the file.
941 			 */
942 			xfs_trans_cancel(tp);
943 		} else {
944 			error = xfs_trans_commit(tp);
945 			if (!error)
946 				xfs_inode_clear_eofblocks_tag(ip);
947 		}
948 
949 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
950 		if (need_iolock)
951 			xfs_iunlock(ip, XFS_IOLOCK_EXCL);
952 	}
953 	return error;
954 }
955 
956 int
957 xfs_alloc_file_space(
958 	struct xfs_inode	*ip,
959 	xfs_off_t		offset,
960 	xfs_off_t		len,
961 	int			alloc_type)
962 {
963 	xfs_mount_t		*mp = ip->i_mount;
964 	xfs_off_t		count;
965 	xfs_filblks_t		allocated_fsb;
966 	xfs_filblks_t		allocatesize_fsb;
967 	xfs_extlen_t		extsz, temp;
968 	xfs_fileoff_t		startoffset_fsb;
969 	xfs_fsblock_t		firstfsb;
970 	int			nimaps;
971 	int			quota_flag;
972 	int			rt;
973 	xfs_trans_t		*tp;
974 	xfs_bmbt_irec_t		imaps[1], *imapp;
975 	xfs_bmap_free_t		free_list;
976 	uint			qblocks, resblks, resrtextents;
977 	int			error;
978 
979 	trace_xfs_alloc_file_space(ip);
980 
981 	if (XFS_FORCED_SHUTDOWN(mp))
982 		return -EIO;
983 
984 	error = xfs_qm_dqattach(ip, 0);
985 	if (error)
986 		return error;
987 
988 	if (len <= 0)
989 		return -EINVAL;
990 
991 	rt = XFS_IS_REALTIME_INODE(ip);
992 	extsz = xfs_get_extsz_hint(ip);
993 
994 	count = len;
995 	imapp = &imaps[0];
996 	nimaps = 1;
997 	startoffset_fsb	= XFS_B_TO_FSBT(mp, offset);
998 	allocatesize_fsb = XFS_B_TO_FSB(mp, count);
999 
1000 	/*
1001 	 * Allocate file space until done or until there is an error
1002 	 */
1003 	while (allocatesize_fsb && !error) {
1004 		xfs_fileoff_t	s, e;
1005 
1006 		/*
1007 		 * Determine space reservations for data/realtime.
1008 		 */
1009 		if (unlikely(extsz)) {
1010 			s = startoffset_fsb;
1011 			do_div(s, extsz);
1012 			s *= extsz;
1013 			e = startoffset_fsb + allocatesize_fsb;
1014 			if ((temp = do_mod(startoffset_fsb, extsz)))
1015 				e += temp;
1016 			if ((temp = do_mod(e, extsz)))
1017 				e += extsz - temp;
1018 		} else {
1019 			s = 0;
1020 			e = allocatesize_fsb;
1021 		}
1022 
1023 		/*
1024 		 * The transaction reservation is limited to a 32-bit block
1025 		 * count, hence we need to limit the number of blocks we are
1026 		 * trying to reserve to avoid an overflow. We can't allocate
1027 		 * more than @nimaps extents, and an extent is limited on disk
1028 		 * to MAXEXTLEN (21 bits), so use that to enforce the limit.
1029 		 */
1030 		resblks = min_t(xfs_fileoff_t, (e - s), (MAXEXTLEN * nimaps));
1031 		if (unlikely(rt)) {
1032 			resrtextents = qblocks = resblks;
1033 			resrtextents /= mp->m_sb.sb_rextsize;
1034 			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1035 			quota_flag = XFS_QMOPT_RES_RTBLKS;
1036 		} else {
1037 			resrtextents = 0;
1038 			resblks = qblocks = XFS_DIOSTRAT_SPACE_RES(mp, resblks);
1039 			quota_flag = XFS_QMOPT_RES_REGBLKS;
1040 		}
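
		/*
		 * Worked sizing note (illustrative, not from the original
		 * source): for the non-realtime case the data block count
		 * "e - s", clamped to MAXEXTLEN * nimaps, is fed into
		 * XFS_DIOSTRAT_SPACE_RES(), which adds the worst case
		 * metadata overhead, keeping the total inside the 32-bit
		 * transaction block reservation.
		 */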
1041 
1042 		/*
1043 		 * Allocate and setup the transaction.
1044 		 */
1045 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
1046 				resrtextents, 0, &tp);
1047 
1048 		/*
1049 		 * Check for running out of space
1050 		 */
1051 		if (error) {
1052 			/*
1053 			 * Free the transaction structure.
1054 			 */
1055 			ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1056 			break;
1057 		}
1058 		xfs_ilock(ip, XFS_ILOCK_EXCL);
1059 		error = xfs_trans_reserve_quota_nblks(tp, ip, qblocks,
1060 						      0, quota_flag);
1061 		if (error)
1062 			goto error1;
1063 
1064 		xfs_trans_ijoin(tp, ip, 0);
1065 
1066 		xfs_bmap_init(&free_list, &firstfsb);
1067 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
1068 					allocatesize_fsb, alloc_type, &firstfsb,
1069 					resblks, imapp, &nimaps, &free_list);
1070 		if (error)
1071 			goto error0;
1072 
1073 		/*
1074 		 * Complete the transaction
1075 		 */
1076 		error = xfs_bmap_finish(&tp, &free_list, NULL);
1077 		if (error)
1078 			goto error0;
1079 
1080 		error = xfs_trans_commit(tp);
1081 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1082 		if (error)
1083 			break;
1084 
1085 		allocated_fsb = imapp->br_blockcount;
1086 
1087 		if (nimaps == 0) {
1088 			error = -ENOSPC;
1089 			break;
1090 		}
1091 
1092 		startoffset_fsb += allocated_fsb;
1093 		allocatesize_fsb -= allocated_fsb;
1094 	}
1095 
1096 	return error;
1097 
1098 error0:	/* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */
1099 	xfs_bmap_cancel(&free_list);
1100 	xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag);
1101 
1102 error1:	/* Just cancel transaction */
1103 	xfs_trans_cancel(tp);
1104 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1105 	return error;
1106 }
1107 
1108 static int
1109 xfs_unmap_extent(
1110 	struct xfs_inode	*ip,
1111 	xfs_fileoff_t		startoffset_fsb,
1112 	xfs_filblks_t		len_fsb,
1113 	int			*done)
1114 {
1115 	struct xfs_mount	*mp = ip->i_mount;
1116 	struct xfs_trans	*tp;
1117 	struct xfs_bmap_free	free_list;
1118 	xfs_fsblock_t		firstfsb;
1119 	uint			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0);
1120 	int			error;
1121 
1122 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
1123 	if (error) {
1124 		ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp));
1125 		return error;
1126 	}
1127 
1128 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1129 	error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot, ip->i_gdquot,
1130 			ip->i_pdquot, resblks, 0, XFS_QMOPT_RES_REGBLKS);
1131 	if (error)
1132 		goto out_trans_cancel;
1133 
1134 	xfs_trans_ijoin(tp, ip, 0);
1135 
1136 	xfs_bmap_init(&free_list, &firstfsb);
1137 	error = xfs_bunmapi(tp, ip, startoffset_fsb, len_fsb, 0, 2, &firstfsb,
1138 			&free_list, done);
1139 	if (error)
1140 		goto out_bmap_cancel;
1141 
1142 	error = xfs_bmap_finish(&tp, &free_list, NULL);
1143 	if (error)
1144 		goto out_bmap_cancel;
1145 
1146 	error = xfs_trans_commit(tp);
1147 out_unlock:
1148 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1149 	return error;
1150 
1151 out_bmap_cancel:
1152 	xfs_bmap_cancel(&free_list);
1153 out_trans_cancel:
1154 	xfs_trans_cancel(tp);
1155 	goto out_unlock;
1156 }
1157 
1158 static int
1159 xfs_adjust_extent_unmap_boundaries(
1160 	struct xfs_inode	*ip,
1161 	xfs_fileoff_t		*startoffset_fsb,
1162 	xfs_fileoff_t		*endoffset_fsb)
1163 {
1164 	struct xfs_mount	*mp = ip->i_mount;
1165 	struct xfs_bmbt_irec	imap;
1166 	int			nimap, error;
1167 	xfs_extlen_t		mod = 0;
1168 
1169 	nimap = 1;
1170 	error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
1171 	if (error)
1172 		return error;
1173 
1174 	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1175 		xfs_daddr_t	block;
1176 
1177 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1178 		block = imap.br_startblock;
1179 		mod = do_div(block, mp->m_sb.sb_rextsize);
1180 		if (mod)
1181 			*startoffset_fsb += mp->m_sb.sb_rextsize - mod;
1182 	}
1183 
1184 	nimap = 1;
1185 	error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
1186 	if (error)
1187 		return error;
1188 
1189 	if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
1190 		ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
1191 		mod++;
1192 		if (mod && mod != mp->m_sb.sb_rextsize)
1193 			*endoffset_fsb -= mod;
1194 	}
1195 
1196 	return 0;
1197 }
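
/*
 * Worked example (illustrative, not from the original source): with
 * sb_rextsize == 16, if the first block of the range sits 5 fsblocks
 * past an rt extent boundary the start is pushed forward by 11 blocks,
 * and a misaligned end is trimmed back, so only whole rt extents are
 * left for xfs_bunmapi() to unmap.
 */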
1198 
1199 static int
1200 xfs_flush_unmap_range(
1201 	struct xfs_inode	*ip,
1202 	xfs_off_t		offset,
1203 	xfs_off_t		len)
1204 {
1205 	struct xfs_mount	*mp = ip->i_mount;
1206 	struct inode		*inode = VFS_I(ip);
1207 	xfs_off_t		rounding, start, end;
1208 	int			error;
1209 
1210 	/* wait for the completion of any pending DIOs */
1211 	inode_dio_wait(inode);
1212 
1213 	rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_SIZE);
1214 	start = round_down(offset, rounding);
1215 	end = round_up(offset + len, rounding) - 1;
1216 
1217 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
1218 	if (error)
1219 		return error;
1220 	truncate_pagecache_range(inode, start, end);
1221 	return 0;
1222 }
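
/*
 * Worked example (illustrative, not from the original source): with 1k
 * filesystem blocks and 4k pages, rounding == PAGE_SIZE, so a punch of
 * bytes 5000-6999 flushes and truncates the page cache for the whole
 * byte range 4096-8191.
 */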
1223 
1224 int
1225 xfs_free_file_space(
1226 	struct xfs_inode	*ip,
1227 	xfs_off_t		offset,
1228 	xfs_off_t		len)
1229 {
1230 	struct xfs_mount	*mp = ip->i_mount;
1231 	xfs_fileoff_t		startoffset_fsb;
1232 	xfs_fileoff_t		endoffset_fsb;
1233 	int			done = 0, error;
1234 
1235 	trace_xfs_free_file_space(ip);
1236 
1237 	error = xfs_qm_dqattach(ip, 0);
1238 	if (error)
1239 		return error;
1240 
1241 	if (len <= 0)	/* if nothing being freed */
1242 		return 0;
1243 
1244 	error = xfs_flush_unmap_range(ip, offset, len);
1245 	if (error)
1246 		return error;
1247 
1248 	startoffset_fsb = XFS_B_TO_FSB(mp, offset);
1249 	endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
1250 
1251 	/*
1252 	 * Need to zero the stuff we're not freeing, on disk.  If it's a RT file
1253 	 * and we can't use unwritten extents then we actually need to zero
1254 	 * the whole extent, otherwise we just need to take care of the block
1255 	 * boundaries, and xfs_bunmapi will handle the rest.
1256 	 */
1257 	if (XFS_IS_REALTIME_INODE(ip) &&
1258 	    !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
1259 		error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
1260 				&endoffset_fsb);
1261 		if (error)
1262 			return error;
1263 	}
1264 
1265 	if (endoffset_fsb > startoffset_fsb) {
1266 		while (!done) {
1267 			error = xfs_unmap_extent(ip, startoffset_fsb,
1268 					endoffset_fsb - startoffset_fsb, &done);
1269 			if (error)
1270 				return error;
1271 		}
1272 	}
1273 
1274 	/*
1275 	 * Now that we've unmapped all full blocks we'll have to zero out any
1276 	 * partial block at the beginning and/or end.  xfs_zero_range is
1277 	 * smart enough to skip any holes, including those we just created.
1278 	 */
1279 	return xfs_zero_range(ip, offset, len, NULL);
1280 }
1281 
1282 /*
1283  * Preallocate and zero a range of a file. This mechanism has the allocation
1284  * semantics of fallocate and in addition converts data in the range to zeroes.
1285  */
1286 int
1287 xfs_zero_file_space(
1288 	struct xfs_inode	*ip,
1289 	xfs_off_t		offset,
1290 	xfs_off_t		len)
1291 {
1292 	struct xfs_mount	*mp = ip->i_mount;
1293 	uint			blksize;
1294 	int			error;
1295 
1296 	trace_xfs_zero_file_space(ip);
1297 
1298 	blksize = 1 << mp->m_sb.sb_blocklog;
1299 
1300 	/*
1301 	 * Punch a hole and prealloc the range. We use hole punch rather than
1302 	 * unwritten extent conversion for two reasons:
1303 	 *
1304 	 * 1.) Hole punch handles partial block zeroing for us.
1305 	 *
1306 	 * 2.) If prealloc returns ENOSPC, the file range is still zero-valued
1307 	 * by virtue of the hole punch.
1308 	 */
1309 	error = xfs_free_file_space(ip, offset, len);
1310 	if (error)
1311 		goto out;
1312 
1313 	error = xfs_alloc_file_space(ip, round_down(offset, blksize),
1314 				     round_up(offset + len, blksize) -
1315 				     round_down(offset, blksize),
1316 				     XFS_BMAPI_PREALLOC);
1317 out:
1318 	return error;
1319 
1320 }
1321 
1322 /*
1323  * @next_fsb will keep track of the extent currently undergoing shift.
1324  * @stop_fsb will keep track of the extent at which we have to stop.
1325  * If we are shifting left, we will start with the block at (offset + len)
1326  * and shift each extent up to and including the last one.
1327  * If we are shifting right, we will start with last extent inside file space
1328  * and continue until we reach the block corresponding to offset.
1329  */
1330 static int
1331 xfs_shift_file_space(
1332 	struct xfs_inode        *ip,
1333 	xfs_off_t               offset,
1334 	xfs_off_t               len,
1335 	enum shift_direction	direction)
1336 {
1337 	int			done = 0;
1338 	struct xfs_mount	*mp = ip->i_mount;
1339 	struct xfs_trans	*tp;
1340 	int			error;
1341 	struct xfs_bmap_free	free_list;
1342 	xfs_fsblock_t		first_block;
1343 	xfs_fileoff_t		stop_fsb;
1344 	xfs_fileoff_t		next_fsb;
1345 	xfs_fileoff_t		shift_fsb;
1346 
1347 	ASSERT(direction == SHIFT_LEFT || direction == SHIFT_RIGHT);
1348 
1349 	if (direction == SHIFT_LEFT) {
1350 		next_fsb = XFS_B_TO_FSB(mp, offset + len);
1351 		stop_fsb = XFS_B_TO_FSB(mp, VFS_I(ip)->i_size);
1352 	} else {
1353 		/*
1354 		 * For a right shift, delegate the initialization of next_fsb
1355 		 * to xfs_bmap_shift_extents() as it runs with the ilock held.
1356 		 */
1357 		next_fsb = NULLFSBLOCK;
1358 		stop_fsb = XFS_B_TO_FSB(mp, offset);
1359 	}
1360 
1361 	shift_fsb = XFS_B_TO_FSB(mp, len);
1362 
1363 	/*
1364 	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
1365 	 * into the accessible region of the file.
1366 	 */
1367 	if (xfs_can_free_eofblocks(ip, true)) {
1368 		error = xfs_free_eofblocks(mp, ip, false);
1369 		if (error)
1370 			return error;
1371 	}
1372 
1373 	/*
1374 	 * Writeback and invalidate the page cache for the remainder of the
1375 	 * file as we're about to shift every extent from offset to EOF.
1376 	 */
1377 	error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
1378 					     offset, -1);
1379 	if (error)
1380 		return error;
1381 	error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
1382 					offset >> PAGE_SHIFT, -1);
1383 	if (error)
1384 		return error;
1385 
1386 	/*
1387 	 * The extent shifting code works on extent granularity. So, if
1388 	 * stop_fsb is not the starting block of extent, we need to split
1389 	 * the extent at stop_fsb.
1390 	 */
1391 	if (direction == SHIFT_RIGHT) {
1392 		error = xfs_bmap_split_extent(ip, stop_fsb);
1393 		if (error)
1394 			return error;
1395 	}
1396 
1397 	while (!error && !done) {
1398 		/*
1399 		 * We need to reserve a permanent block count for the transaction.
1400 		 * This comes into play when, after shifting an extent into a
1401 		 * hole, we find that the adjacent extents can be merged, which
1402 		 * may lead to freeing of a block during the record update.
1403 		 */
1404 		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write,
1405 				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0, 0, &tp);
1406 		if (error)
1407 			break;
1408 
1409 		xfs_ilock(ip, XFS_ILOCK_EXCL);
1410 		error = xfs_trans_reserve_quota(tp, mp, ip->i_udquot,
1411 				ip->i_gdquot, ip->i_pdquot,
1412 				XFS_DIOSTRAT_SPACE_RES(mp, 0), 0,
1413 				XFS_QMOPT_RES_REGBLKS);
1414 		if (error)
1415 			goto out_trans_cancel;
1416 
1417 		xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1418 
1419 		xfs_bmap_init(&free_list, &first_block);
1420 
1421 		/*
1422 		 * We are using the write transaction, in which a maximum of
1423 		 * two bmbt updates are allowed.
1424 		 */
1425 		error = xfs_bmap_shift_extents(tp, ip, &next_fsb, shift_fsb,
1426 				&done, stop_fsb, &first_block, &free_list,
1427 				direction, XFS_BMAP_MAX_SHIFT_EXTENTS);
1428 		if (error)
1429 			goto out_bmap_cancel;
1430 
1431 		error = xfs_bmap_finish(&tp, &free_list, NULL);
1432 		if (error)
1433 			goto out_bmap_cancel;
1434 
1435 		error = xfs_trans_commit(tp);
1436 	}
1437 
1438 	return error;
1439 
1440 out_bmap_cancel:
1441 	xfs_bmap_cancel(&free_list);
1442 out_trans_cancel:
1443 	xfs_trans_cancel(tp);
1444 	return error;
1445 }
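
/*
 * Illustrative example (editor's note, not from the original source):
 * collapsing 1MB at offset 4MB starts at the fsblock backing 5MB and
 * walks towards EOF, moving each extent 1MB worth of fsblocks to the
 * left; inserting the same range first splits any extent straddling
 * 4MB, then works backwards from the last extent, shifting everything
 * at or beyond 4MB to the right.
 */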
1446 
1447 /*
1448  * xfs_collapse_file_space()
1449  *	This routine frees disk space and shifts extents for the given file.
1450  *	The first thing we do is free the data blocks in the specified range
1451  *	by calling xfs_free_file_space(), which also syncs dirty data
1452  *	and invalidates the page cache over the region the collapse range
1453  *	is working on. Then we shift the extent records left to cover the hole.
1454  * RETURNS:
1455  *	0 on success
1456  *	errno on error
1457  *
1458  */
1459 int
1460 xfs_collapse_file_space(
1461 	struct xfs_inode	*ip,
1462 	xfs_off_t		offset,
1463 	xfs_off_t		len)
1464 {
1465 	int error;
1466 
1467 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1468 	trace_xfs_collapse_file_space(ip);
1469 
1470 	error = xfs_free_file_space(ip, offset, len);
1471 	if (error)
1472 		return error;
1473 
1474 	return xfs_shift_file_space(ip, offset, len, SHIFT_LEFT);
1475 }
1476 
1477 /*
1478  * xfs_insert_file_space()
1479  *	This routine creates a hole by shifting extents for the given file.
1480  *	The first thing we do is sync dirty data and invalidate the page cache
1481  *	over the region the insert range is working on. Then we split the
1482  *	extent at the given offset into two by calling xfs_bmap_split_extent,
1483  *	and shift all extent records lying between [offset,
1484  *	last allocated extent] to the right to make room for the hole.
1485  * RETURNS:
1486  *	0 on success
1487  *	errno on error
1488  */
1489 int
1490 xfs_insert_file_space(
1491 	struct xfs_inode	*ip,
1492 	loff_t			offset,
1493 	loff_t			len)
1494 {
1495 	ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL));
1496 	trace_xfs_insert_file_space(ip);
1497 
1498 	return xfs_shift_file_space(ip, offset, len, SHIFT_RIGHT);
1499 }
1500 
1501 /*
1502  * We need to check that the format of the data fork in the temporary inode is
1503  * valid for the target inode before doing the swap. This is not a problem with
1504  * attr1 because of the fixed fork offset, but attr2 has a dynamically sized
1505  * data fork depending on the space the attribute fork is taking so we can get
1506  * invalid formats on the target inode.
1507  *
1508  * E.g. target has space for 7 extents in extent format, temp inode only has
1509  * space for 6.  If we defragment down to 7 extents, then the tmp format is a
1510  * btree, but when swapped it needs to be in extent format. Hence we can't just
1511  * blindly swap data forks on attr2 filesystems.
1512  *
1513  * Note that we check the swap in both directions so that we don't end up with
1514  * a corrupt temporary inode, either.
1515  *
1516  * Note that fixing the way xfs_fsr sets up the attribute fork in the source
1517  * inode will prevent this situation from occurring, so all we do here is
1518  * reject and log the attempt. Basically we are putting the responsibility on
1519  * userspace to get this right.
1520  */
1521 static int
1522 xfs_swap_extents_check_format(
1523 	xfs_inode_t	*ip,	/* target inode */
1524 	xfs_inode_t	*tip)	/* tmp inode */
1525 {
1526 
1527 	/* Should never get a local format */
1528 	if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL ||
1529 	    tip->i_d.di_format == XFS_DINODE_FMT_LOCAL)
1530 		return -EINVAL;
1531 
1532 	/*
1533 	 * if the target inode has fewer extents than the temporary inode then
1534 	 * why did userspace call us?
1535 	 */
1536 	if (ip->i_d.di_nextents < tip->i_d.di_nextents)
1537 		return -EINVAL;
1538 
1539 	/*
1540 	 * if the target inode is in extent form and the temp inode is in btree
1541 	 * form then we will end up with the target inode in the wrong format
1542 	 * as we already know there are fewer extents in the temp inode.
1543 	 */
1544 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1545 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE)
1546 		return -EINVAL;
1547 
1548 	/* Check temp in extent form to max in target */
1549 	if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1550 	    XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) >
1551 			XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1552 		return -EINVAL;
1553 
1554 	/* Check target in extent form to max in temp */
1555 	if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS &&
1556 	    XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) >
1557 			XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1558 		return -EINVAL;
1559 
1560 	/*
1561 	 * If we are in a btree format, check that the temp root block will fit
1562 	 * in the target and that it has enough extents to be in btree format
1563 	 * in the target.
1564 	 *
1565 	 * Note that we have to be careful to allow btree->extent conversions
1566 	 * (a common defrag case) which will occur when the temp inode is in
1567 	 * extent format...
1568 	 */
1569 	if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1570 		if (XFS_IFORK_BOFF(ip) &&
1571 		    XFS_BMAP_BMDR_SPACE(tip->i_df.if_broot) > XFS_IFORK_BOFF(ip))
1572 			return -EINVAL;
1573 		if (XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <=
1574 		    XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK))
1575 			return -EINVAL;
1576 	}
1577 
1578 	/* Reciprocal target->temp btree format checks */
1579 	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1580 		if (XFS_IFORK_BOFF(tip) &&
1581 		    XFS_BMAP_BMDR_SPACE(ip->i_df.if_broot) > XFS_IFORK_BOFF(tip))
1582 			return -EINVAL;
1583 		if (XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <=
1584 		    XFS_IFORK_MAXEXT(tip, XFS_DATA_FORK))
1585 			return -EINVAL;
1586 	}
1587 
1588 	return 0;
1589 }
1590 
1591 static int
1592 xfs_swap_extent_flush(
1593 	struct xfs_inode	*ip)
1594 {
1595 	int	error;
1596 
1597 	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
1598 	if (error)
1599 		return error;
1600 	truncate_pagecache_range(VFS_I(ip), 0, -1);
1601 
1602 	/* Verify O_DIRECT for ftmp */
1603 	if (VFS_I(ip)->i_mapping->nrpages)
1604 		return -EINVAL;
1605 	return 0;
1606 }
1607 
1608 int
1609 xfs_swap_extents(
1610 	xfs_inode_t	*ip,	/* target inode */
1611 	xfs_inode_t	*tip,	/* tmp inode */
1612 	xfs_swapext_t	*sxp)
1613 {
1614 	xfs_mount_t	*mp = ip->i_mount;
1615 	xfs_trans_t	*tp;
1616 	xfs_bstat_t	*sbp = &sxp->sx_stat;
1617 	xfs_ifork_t	*tempifp, *ifp, *tifp;
1618 	int		src_log_flags, target_log_flags;
1619 	int		error = 0;
1620 	int		aforkblks = 0;
1621 	int		taforkblks = 0;
1622 	__uint64_t	tmp;
1623 	int		lock_flags;
1624 
1625 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
1626 	if (!tempifp) {
1627 		error = -ENOMEM;
1628 		goto out;
1629 	}
1630 
1631 	/*
1632 	 * Lock the inodes against other IO, page faults and truncate to
1633 	 * begin with.  Then we can ensure the inodes are flushed and have no
1634 	 * page cache safely. Once we have done this we can take the ilocks and
1635 	 * do the rest of the checks.
1636 	 */
1637 	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
1638 	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
1639 	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
1640 
1641 	/* Verify that both files have the same format */
1642 	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
1643 		error = -EINVAL;
1644 		goto out_unlock;
1645 	}
1646 
1647 	/* Verify both files are either real-time or non-realtime */
1648 	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
1649 		error = -EINVAL;
1650 		goto out_unlock;
1651 	}
1652 
1653 	error = xfs_swap_extent_flush(ip);
1654 	if (error)
1655 		goto out_unlock;
1656 	error = xfs_swap_extent_flush(tip);
1657 	if (error)
1658 		goto out_unlock;
1659 
1660 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
1661 	if (error)
1662 		goto out_unlock;
1663 
1664 	/*
1665 	 * Lock and join the inodes to the transaction so that transaction commit
1666 	 * or cancel will unlock the inodes from this point onwards.
1667 	 */
1668 	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
1669 	lock_flags |= XFS_ILOCK_EXCL;
1670 	xfs_trans_ijoin(tp, ip, lock_flags);
1671 	xfs_trans_ijoin(tp, tip, lock_flags);
1672 
1673 
1674 	/* Verify all data are being swapped */
1675 	if (sxp->sx_offset != 0 ||
1676 	    sxp->sx_length != ip->i_d.di_size ||
1677 	    sxp->sx_length != tip->i_d.di_size) {
1678 		error = -EFAULT;
1679 		goto out_trans_cancel;
1680 	}
1681 
1682 	trace_xfs_swap_extent_before(ip, 0);
1683 	trace_xfs_swap_extent_before(tip, 1);
1684 
1685 	/* check inode formats now that data is flushed */
1686 	error = xfs_swap_extents_check_format(ip, tip);
1687 	if (error) {
1688 		xfs_notice(mp,
1689 		    "%s: inode 0x%llx format is incompatible for exchanging.",
1690 				__func__, ip->i_ino);
1691 		goto out_trans_cancel;
1692 	}
1693 
1694 	/*
1695 	 * Compare the current change & modify times with that
1696 	 * passed in.  If they differ, we abort this swap.
1697 	 * This is the mechanism used to assure the calling
1698 	 * process that the file was not changed out from
1699 	 * under it.
1700 	 */
1701 	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
1702 	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
1703 	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
1704 	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
1705 		error = -EBUSY;
1706 		goto out_trans_cancel;
1707 	}
1708 	/*
1709 	 * Count the number of extended attribute blocks
1710 	 */
1711 	if (((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
1712 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1713 		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
1714 		if (error)
1715 			goto out_trans_cancel;
1716 	}
1717 	if (((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
1718 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
1719 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
1720 			&taforkblks);
1721 		if (error)
1722 			goto out_trans_cancel;
1723 	}
1724 
1725 	/*
1726 	 * Before we've swapped the forks, let's set the owners of the forks
1727 	 * appropriately. We have to do this as we are demand paging the btree
1728 	 * buffers, and so the validation done on read will expect the owner
1729 	 * field to be correctly set. Once we change the owners, we can swap the
1730 	 * inode forks.
1731 	 *
1732 	 * Note the trickiness in setting the log flags - we set the owner log
1733 	 * flag on the opposite inode (i.e. the inode we are setting the new
1734 	 * owner to be) because once we swap the forks and log that, log
1735 	 * recovery is going to see the fork as owned by the swapped inode,
1736 	 * not the pre-swapped inodes.
1737 	 */
1738 	src_log_flags = XFS_ILOG_CORE;
1739 	target_log_flags = XFS_ILOG_CORE;
1740 	if (ip->i_d.di_version == 3 &&
1741 	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1742 		target_log_flags |= XFS_ILOG_DOWNER;
1743 		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
1744 					      tip->i_ino, NULL);
1745 		if (error)
1746 			goto out_trans_cancel;
1747 	}
1748 
1749 	if (tip->i_d.di_version == 3 &&
1750 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
1751 		src_log_flags |= XFS_ILOG_DOWNER;
1752 		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
1753 					      ip->i_ino, NULL);
1754 		if (error)
1755 			goto out_trans_cancel;
1756 	}
1757 
1758 	/*
1759 	 * Swap the data forks of the inodes
1760 	 */
1761 	ifp = &ip->i_df;
1762 	tifp = &tip->i_df;
1763 	*tempifp = *ifp;	/* struct copy */
1764 	*ifp = *tifp;		/* struct copy */
1765 	*tifp = *tempifp;	/* struct copy */
1766 
1767 	/*
1768 	 * Fix the on-disk inode values
1769 	 */
1770 	tmp = (__uint64_t)ip->i_d.di_nblocks;
1771 	ip->i_d.di_nblocks = tip->i_d.di_nblocks - taforkblks + aforkblks;
1772 	tip->i_d.di_nblocks = tmp + taforkblks - aforkblks;
1773 
1774 	tmp = (__uint64_t) ip->i_d.di_nextents;
1775 	ip->i_d.di_nextents = tip->i_d.di_nextents;
1776 	tip->i_d.di_nextents = tmp;
1777 
1778 	tmp = (__uint64_t) ip->i_d.di_format;
1779 	ip->i_d.di_format = tip->i_d.di_format;
1780 	tip->i_d.di_format = tmp;
1781 
1782 	/*
1783 	 * The extents in the source inode could still contain speculative
1784 	 * preallocation beyond EOF (e.g. the file is open but not modified
1785 	 * while defrag is in progress). In that case, we need to copy over the
1786 	 * number of delalloc blocks the data fork in the source inode is
1787 	 * tracking beyond EOF so that when the fork is truncated away when the
1788 	 * temporary inode is unlinked we don't underrun the i_delayed_blks
1789 	 * counter on that inode.
1790 	 */
1791 	ASSERT(tip->i_delayed_blks == 0);
1792 	tip->i_delayed_blks = ip->i_delayed_blks;
1793 	ip->i_delayed_blks = 0;
1794 
1795 	switch (ip->i_d.di_format) {
1796 	case XFS_DINODE_FMT_EXTENTS:
1797 		/* If the extents fit in the inode, fix the
1798 		 * pointer.  Otherwise it's already NULL or
1799 		 * pointing to the extent.
1800 		 */
1801 		if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1802 			ifp->if_u1.if_extents =
1803 				ifp->if_u2.if_inline_ext;
1804 		}
1805 		src_log_flags |= XFS_ILOG_DEXT;
1806 		break;
1807 	case XFS_DINODE_FMT_BTREE:
1808 		ASSERT(ip->i_d.di_version < 3 ||
1809 		       (src_log_flags & XFS_ILOG_DOWNER));
1810 		src_log_flags |= XFS_ILOG_DBROOT;
1811 		break;
1812 	}
1813 
1814 	switch (tip->i_d.di_format) {
1815 	case XFS_DINODE_FMT_EXTENTS:
1816 		/* If the extents fit in the inode, fix the
1817 		 * pointer.  Otherwise it's already NULL or
1818 		 * pointing to the extent.
1819 		 */
1820 		if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) {
1821 			tifp->if_u1.if_extents =
1822 				tifp->if_u2.if_inline_ext;
1823 		}
1824 		target_log_flags |= XFS_ILOG_DEXT;
1825 		break;
1826 	case XFS_DINODE_FMT_BTREE:
1827 		target_log_flags |= XFS_ILOG_DBROOT;
1828 		ASSERT(tip->i_d.di_version < 3 ||
1829 		       (target_log_flags & XFS_ILOG_DOWNER));
1830 		break;
1831 	}
1832 
1833 	xfs_trans_log_inode(tp, ip,  src_log_flags);
1834 	xfs_trans_log_inode(tp, tip, target_log_flags);
1835 
1836 	/*
1837 	 * If this is a synchronous mount, make sure that the
1838 	 * transaction goes to disk before returning to the user.
1839 	 */
1840 	if (mp->m_flags & XFS_MOUNT_WSYNC)
1841 		xfs_trans_set_sync(tp);
1842 
1843 	error = xfs_trans_commit(tp);
1844 
1845 	trace_xfs_swap_extent_after(ip, 0);
1846 	trace_xfs_swap_extent_after(tip, 1);
1847 out:
1848 	kmem_free(tempifp);
1849 	return error;
1850 
1851 out_unlock:
1852 	xfs_iunlock(ip, lock_flags);
1853 	xfs_iunlock(tip, lock_flags);
1854 	goto out;
1855 
1856 out_trans_cancel:
1857 	xfs_trans_cancel(tp);
1858 	goto out;
1859 }
1860