xref: /openbmc/linux/fs/xfs/xfs_inode.c (revision fa96acadf1eb712fca6d59922ad93787c87e44ec)
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_ialloc_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_utils.h"
#include "xfs_quota.h"
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"

kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
		return ip->i_d.di_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * This is a wrapper routine around the xfs_ilock() routine used to centralize
 * some grungy code.  It is used in places that wish to lock the inode solely
 * for reading the extents.  The reason these places can't just call
 * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
 * extents from disk for a file in b-tree format.  If the inode is in b-tree
 * format, then we need to lock the inode exclusively until the extents are read
 * in.  Locking it exclusively all the time would limit our parallelism
 * unnecessarily, though.  What we do instead is check to see if the extents
 * have been read in yet, and only lock the inode exclusively if they have not.
 *
 * The function returns a value which should be given to the corresponding
 * xfs_iunlock_map_shared().  This value is the mode in which the lock was
 * actually taken.
 */
uint
xfs_ilock_map_shared(
	xfs_inode_t	*ip)
{
	uint	lock_mode;

	if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
	    ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
		lock_mode = XFS_ILOCK_EXCL;
	} else {
		lock_mode = XFS_ILOCK_SHARED;
	}

	xfs_ilock(ip, lock_mode);

	return lock_mode;
}

/*
 * This is simply the unlock routine to go with xfs_ilock_map_shared().
 * All it does is call xfs_iunlock() with the given lock_mode.
 */
void
xfs_iunlock_map_shared(
	xfs_inode_t	*ip,
	unsigned int	lock_mode)
{
	xfs_iunlock(ip, lock_mode);
}
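
/*
 * Example (illustrative sketch only, not called from this file): a
 * caller that wants to walk the extent list pairs the two helpers and
 * passes the returned mode back unchanged, so that the correct shared
 * or exclusive lock is dropped:
 *
 *	uint lock_mode;
 *
 *	lock_mode = xfs_ilock_map_shared(ip);
 *	(... read the extent list ...)
 *	xfs_iunlock_map_shared(ip, lock_mode);
 */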

/*
 * The xfs inode contains 2 locks: a multi-reader lock called the
 * i_iolock and a multi-reader lock called the i_lock.  This routine
 * allows either or both of the locks to be obtained.
 *
 * The 2 locks should always be ordered so that the IO lock is
 * obtained first in order to prevent deadlock.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  It can be:
 *		XFS_IOLOCK_SHARED,
 *		XFS_IOLOCK_EXCL,
 *		XFS_ILOCK_SHARED,
 *		XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

	if (lock_flags & XFS_ILOCK_EXCL)
		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
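
/*
 * Example (sketch, not part of this file's call graph): taking both
 * locks exclusively for an operation that changes both file data and
 * inode metadata, then dropping them with the same flags:
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	(... modify the file ...)
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *
 * The IO lock is always acquired before the inode lock, as the body
 * above enforces, which is what keeps the ordering deadlock-free.
 */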

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       locked.  See the comment for xfs_ilock() for a list
 *	 of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!mrtryupdate(&ip->i_iolock))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!mrtryaccess(&ip->i_iolock))
			goto out;
	}
	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!mrtryupdate(&ip->i_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!mrtryaccess(&ip->i_lock))
			goto out_undo_iolock;
	}
	return 1;

 out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);
 out:
	return 0;
}
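
/*
 * Example (hedged sketch): a caller running in a context that must not
 * block uses the trylock variant and backs off on failure:
 *
 *	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
 *		return EAGAIN;	(caller retries later)
 *	(... read inode fields ...)
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 */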

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       unlocked.  See the comment for xfs_ilock() for a list
 *	 of valid values for this parameter.
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
	ASSERT(lock_flags != 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrunlock_excl(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		mrunlock_shared(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * give up write locks.  the i/o lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrdemote(&ip->i_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrdemote(&ip->i_iolock);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
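
/*
 * Example (sketch): a writer that has finished updating the inode but
 * still needs read access can downgrade without a drop-and-retake
 * window in which another writer could slip in:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	(... modify ...)
 *	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
 *	(... keep reading under the now-shared lock ...)
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 */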

#ifdef DEBUG
int
xfs_isilocked(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
		if (!(lock_flags & XFS_ILOCK_SHARED))
			return !!ip->i_lock.mr_writer;
		return rwsem_is_locked(&ip->i_lock.mr_lock);
	}

	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
		if (!(lock_flags & XFS_IOLOCK_SHARED))
			return !!ip->i_iolock.mr_writer;
		return rwsem_is_locked(&ip->i_iolock.mr_lock);
	}

	ASSERT(0);
	return 0;
}
#endif

void
__xfs_iflock(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (xfs_isiflocked(ip))
			io_schedule();
	} while (!xfs_iflock_nowait(ip));

	finish_wait(wq, &wait.wait);
}

#ifdef DEBUG
/*
 * Make sure that the extents in the given memory buffer
 * are valid.
 */
STATIC void
xfs_validate_extents(
	xfs_ifork_t		*ifp,
	int			nrecs,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_host_t	rec;
	int			i;

	for (i = 0; i < nrecs; i++) {
		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
		rec.l0 = get_unaligned(&ep->l0);
		rec.l1 = get_unaligned(&ep->l1);
		xfs_bmbt_get_all(&rec, &irec);
		if (fmt == XFS_EXTFMT_NOSTATE)
			ASSERT(irec.br_state == XFS_EXT_NORM);
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ifp, nrecs, fmt)
#endif /* DEBUG */

/*
 * Check that none of the inodes in the buffer have a next
 * unlinked field of 0.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	int		i;
	int		j;
	xfs_dinode_t	*dip;

	j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (i = 0; i < j; i++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					i * mp->m_sb.sb_inodesize);
		if (!dip->di_next_unlinked)  {
			xfs_alert(mp,
	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
				bp);
			ASSERT(dip->di_next_unlinked);
		}
	}
}
#endif

/*
 * This routine is called to map an inode to the buffer containing the on-disk
 * version of the inode.  It returns a pointer to the buffer containing the
 * on-disk inode in the bpp parameter, and in the dipp parameter it returns a
 * pointer to the on-disk inode within that buffer.
 *
 * If a non-zero error is returned, then the contents of bpp and dipp are
 * undefined.
 */
int
xfs_imap_to_bp(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	struct xfs_imap		*imap,
	struct xfs_dinode	**dipp,
	struct xfs_buf		**bpp,
	uint			buf_flags,
	uint			iget_flags)
{
	struct xfs_buf		*bp;
	int			error;
	int			i;
	int			ni;

	buf_flags |= XBF_UNMAPPED;
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
				   (int)imap->im_len, buf_flags, &bp);
	if (error) {
		if (error != EAGAIN) {
			xfs_warn(mp,
				"%s: xfs_trans_read_buf() returned error %d.",
				__func__, error);
		} else {
			ASSERT(buf_flags & XBF_TRYLOCK);
		}
		return error;
	}

	/*
	 * Validate the magic number and version of every inode in the buffer
	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
	 */
#ifdef DEBUG
	ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
#else	/* usual case */
	ni = 1;
#endif

	for (i = 0; i < ni; i++) {
		int		di_ok;
		xfs_dinode_t	*dip;

		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					(i << mp->m_sb.sb_inodelog));
		di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
			    XFS_DINODE_GOOD_VERSION(dip->di_version);
		if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
						XFS_ERRTAG_ITOBP_INOTOBP,
						XFS_RANDOM_ITOBP_INOTOBP))) {
			if (iget_flags & XFS_IGET_UNTRUSTED) {
				xfs_trans_brelse(tp, bp);
				return XFS_ERROR(EINVAL);
			}
			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
					     mp, dip);
#ifdef DEBUG
			xfs_emerg(mp,
				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
				(unsigned long long)imap->im_blkno, i,
				be16_to_cpu(dip->di_magic));
			ASSERT(0);
#endif
			xfs_trans_brelse(tp, bp);
			return XFS_ERROR(EFSCORRUPTED);
		}
	}

	xfs_inobp_check(mp, bp);

	*bpp = bp;
	*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
	return 0;
}
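
/*
 * Example (sketch; xfs_iread() below is the real in-tree user): map an
 * inode, read its on-disk copy, and release the buffer when done:
 *
 *	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, 0);
 *	if (error)
 *		return error;
 *	(... inspect dip ...)
 *	xfs_trans_brelse(tp, bp);
 */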

/*
 * Move inode type and inode format specific information from the
 * on-disk inode to the in-core inode.  For fifos, devs, and sockets
 * this means setting if_rdev to the proper value.  For files, directories,
 * and symlinks this means bringing in the in-line data or extent
 * pointers.  For a file in B-tree format, only the root is immediately
 * brought in-core.  The rest will be in-lined in if_extents when it
 * is first referenced (see xfs_iread_extents()).
 */
STATIC int
xfs_iformat(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip)
{
	xfs_attr_shortform_t	*atp;
	int			size;
	int			error = 0;
	xfs_fsize_t             di_size;

	if (unlikely(be32_to_cpu(dip->di_nextents) +
		     be16_to_cpu(dip->di_anextents) >
		     be64_to_cpu(dip->di_nblocks))) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
			(unsigned long long)ip->i_ino,
			(int)(be32_to_cpu(dip->di_nextents) +
			      be16_to_cpu(dip->di_anextents)),
			(unsigned long long)
				be64_to_cpu(dip->di_nblocks));
		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
		xfs_warn(ip->i_mount, "corrupt dinode %Lu, forkoff = 0x%x.",
			(unsigned long long)ip->i_ino,
			dip->di_forkoff);
		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) &&
		     !ip->i_mount->m_rtdev_targp)) {
		xfs_warn(ip->i_mount,
			"corrupt dinode %Lu, has realtime flag set.",
			ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat(realtime)",
				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
					      ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}
		ip->i_d.di_size = 0;
		ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
		break;

	case S_IFREG:
	case S_IFLNK:
	case S_IFDIR:
		switch (dip->di_format) {
		case XFS_DINODE_FMT_LOCAL:
			/*
			 * no local regular files yet
			 */
			if (unlikely(S_ISREG(be16_to_cpu(dip->di_mode)))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (local format for regular file).",
					(unsigned long long) ip->i_ino);
				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			di_size = be64_to_cpu(dip->di_size);
			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
				xfs_warn(ip->i_mount,
			"corrupt inode %Lu (bad size %Ld for local inode).",
					(unsigned long long) ip->i_ino,
					(long long) di_size);
				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
						     XFS_ERRLEVEL_LOW,
						     ip->i_mount, dip);
				return XFS_ERROR(EFSCORRUPTED);
			}

			size = (int)di_size;
			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
			break;
		case XFS_DINODE_FMT_EXTENTS:
			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
			break;
		case XFS_DINODE_FMT_BTREE:
			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
			break;
		default:
			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
					 ip->i_mount);
			return XFS_ERROR(EFSCORRUPTED);
		}
		break;

	default:
		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	if (error) {
		return error;
	}
	if (!XFS_DFORK_Q(dip))
		return 0;

	ASSERT(ip->i_afp == NULL);
	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP | KM_NOFS);

	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
		size = be16_to_cpu(atp->hdr.totsize);

		if (unlikely(size < sizeof(struct xfs_attr_sf_hdr))) {
			xfs_warn(ip->i_mount,
				"corrupt inode %Lu (bad attr fork size %Ld).",
				(unsigned long long) ip->i_ino,
				(long long) size);
			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
					     XFS_ERRLEVEL_LOW,
					     ip->i_mount, dip);
			return XFS_ERROR(EFSCORRUPTED);
		}

		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_BTREE:
		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
		break;
	default:
		error = XFS_ERROR(EFSCORRUPTED);
		break;
	}
	if (error) {
		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
		ip->i_afp = NULL;
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
	}
	return error;
}

/*
 * The file is in-lined in the on-disk inode.
 * If it fits into if_inline_data, then copy
 * it there, otherwise allocate a buffer for it
 * and copy the data there.  Either way, set
 * if_data to point at the data.
 * If we allocate a buffer for the data, make
 * sure that its size is a multiple of 4 and
 * record the real size in i_real_bytes.
 */
STATIC int
xfs_iformat_local(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork,
	int		size)
{
	xfs_ifork_t	*ifp;
	int		real_size;

	/*
	 * If the size is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount,
	"corrupt inode %Lu (bad size %d for local fork, size = %d).",
			(unsigned long long) ip->i_ino, size,
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}
	ifp = XFS_IFORK_PTR(ip, whichfork);
	real_size = 0;
	if (size == 0)
		ifp->if_u1.if_data = NULL;
	else if (size <= sizeof(ifp->if_u2.if_inline_data))
		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
	else {
		real_size = roundup(size, 4);
		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP | KM_NOFS);
	}
	ifp->if_bytes = size;
	ifp->if_real_bytes = real_size;
	if (size)
		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFINLINE;
	return 0;
}

/*
 * The file consists of a set of extents all
 * of which fit into the on-disk inode.
 * If there are few enough extents to fit into
 * the if_inline_ext, then copy them there.
 * Otherwise allocate a buffer for them and copy
 * them into it.  Either way, set if_extents
 * to point at the extents.
 */
STATIC int
xfs_iformat_extents(
	xfs_inode_t	*ip,
	xfs_dinode_t	*dip,
	int		whichfork)
{
	xfs_bmbt_rec_t	*dp;
	xfs_ifork_t	*ifp;
	int		nex;
	int		size;
	int		i;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
	size = nex * (uint)sizeof(xfs_bmbt_rec_t);

	/*
	 * If the number of extents is unreasonable, then something
	 * is wrong and we just bail out rather than crash in
	 * kmem_alloc() or memcpy() below.
	 */
	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).",
			(unsigned long long) ip->i_ino, nex);
		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
				     ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_real_bytes = 0;
	if (nex == 0)
		ifp->if_u1.if_extents = NULL;
	else if (nex <= XFS_INLINE_EXTS)
		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
	else
		xfs_iext_add(ifp, 0, nex);

	ifp->if_bytes = size;
	if (size) {
		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
		xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip));
		for (i = 0; i < nex; i++, dp++) {
			xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
			ep->l0 = get_unaligned_be64(&dp->l0);
			ep->l1 = get_unaligned_be64(&dp->l1);
		}
		XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork);
		if (whichfork != XFS_DATA_FORK ||
			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
				if (unlikely(xfs_check_nostate_extents(
				    ifp, 0, nex))) {
					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
							 XFS_ERRLEVEL_LOW,
							 ip->i_mount);
					return XFS_ERROR(EFSCORRUPTED);
				}
	}
	ifp->if_flags |= XFS_IFEXTENTS;
	return 0;
}

/*
 * The file has too many extents to fit into
 * the inode, so they are in B-tree format.
 * Allocate a buffer for the root of the B-tree
 * and copy the root into it.  The i_extents
 * field will remain NULL until all of the
 * extents are read in (when they are needed).
 */
STATIC int
xfs_iformat_btree(
	xfs_inode_t		*ip,
	xfs_dinode_t		*dip,
	int			whichfork)
{
	xfs_bmdr_block_t	*dfp;
	xfs_ifork_t		*ifp;
	/* REFERENCED */
	int			nrecs;
	int			size;

	ifp = XFS_IFORK_PTR(ip, whichfork);
	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
	size = XFS_BMAP_BROOT_SPACE(dfp);
	nrecs = be16_to_cpu(dfp->bb_numrecs);

	/*
	 * blow out if -- the fork has fewer extents than can fit in the
	 * fork (the fork shouldn't be in btree format), the root btree
	 * block has more records than can fit into the fork, or the
	 * number of extents is greater than the number of blocks.
	 */
	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <=
			XFS_IFORK_MAXEXT(ip, whichfork) ||
		     XFS_BMDR_SPACE_CALC(nrecs) >
			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork) ||
		     XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
		xfs_warn(ip->i_mount, "corrupt inode %Lu (btree).",
			(unsigned long long) ip->i_ino);
		XFS_CORRUPTION_ERROR("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
				 ip->i_mount, dip);
		return XFS_ERROR(EFSCORRUPTED);
	}

	ifp->if_broot_bytes = size;
	ifp->if_broot = kmem_alloc(size, KM_SLEEP | KM_NOFS);
	ASSERT(ifp->if_broot != NULL);
	/*
	 * Copy and convert from the on-disk structure
	 * to the in-memory structure.
	 */
	xfs_bmdr_to_bmbt(ip->i_mount, dfp,
			 XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
			 ifp->if_broot, size);
	ifp->if_flags &= ~XFS_IFEXTENTS;
	ifp->if_flags |= XFS_IFBROOT;

	return 0;
}

STATIC void
xfs_dinode_from_disk(
	xfs_icdinode_t		*to,
	xfs_dinode_t		*from)
{
	to->di_magic = be16_to_cpu(from->di_magic);
	to->di_mode = be16_to_cpu(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = be16_to_cpu(from->di_onlink);
	to->di_uid = be32_to_cpu(from->di_uid);
	to->di_gid = be32_to_cpu(from->di_gid);
	to->di_nlink = be32_to_cpu(from->di_nlink);
	to->di_projid_lo = be16_to_cpu(from->di_projid_lo);
	to->di_projid_hi = be16_to_cpu(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = be16_to_cpu(from->di_flushiter);
	to->di_atime.t_sec = be32_to_cpu(from->di_atime.t_sec);
	to->di_atime.t_nsec = be32_to_cpu(from->di_atime.t_nsec);
	to->di_mtime.t_sec = be32_to_cpu(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = be32_to_cpu(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = be32_to_cpu(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = be32_to_cpu(from->di_ctime.t_nsec);
	to->di_size = be64_to_cpu(from->di_size);
	to->di_nblocks = be64_to_cpu(from->di_nblocks);
	to->di_extsize = be32_to_cpu(from->di_extsize);
	to->di_nextents = be32_to_cpu(from->di_nextents);
	to->di_anextents = be16_to_cpu(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat	= from->di_aformat;
	to->di_dmevmask	= be32_to_cpu(from->di_dmevmask);
	to->di_dmstate	= be16_to_cpu(from->di_dmstate);
	to->di_flags	= be16_to_cpu(from->di_flags);
	to->di_gen	= be32_to_cpu(from->di_gen);
}

void
xfs_dinode_to_disk(
	xfs_dinode_t		*to,
	xfs_icdinode_t		*from)
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = cpu_to_be16(from->di_onlink);
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
	to->di_flushiter = cpu_to_be16(from->di_flushiter);
	to->di_atime.t_sec = cpu_to_be32(from->di_atime.t_sec);
	to->di_atime.t_nsec = cpu_to_be32(from->di_atime.t_nsec);
	to->di_mtime.t_sec = cpu_to_be32(from->di_mtime.t_sec);
	to->di_mtime.t_nsec = cpu_to_be32(from->di_mtime.t_nsec);
	to->di_ctime.t_sec = cpu_to_be32(from->di_ctime.t_sec);
	to->di_ctime.t_nsec = cpu_to_be32(from->di_ctime.t_nsec);
	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);
}

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}

/*
 * Read the disk inode attributes into the in-core inode structure.
 */
int
xfs_iread(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	uint		iget_flags)
{
	xfs_buf_t	*bp;
	xfs_dinode_t	*dip;
	int		error;

	/*
	 * Fill in the location information in the in-core inode.
	 */
	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
	if (error)
		return error;

	/*
	 * Get pointers to the on-disk inode and the buffer containing it.
	 */
	error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0, iget_flags);
	if (error)
		return error;

	/*
	 * If we got something that isn't an inode it means someone
	 * (nfs or dmi) has a stale handle.
	 */
	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) {
#ifdef DEBUG
		xfs_alert(mp,
			"%s: dip->di_magic (0x%x) != XFS_DINODE_MAGIC (0x%x)",
			__func__, be16_to_cpu(dip->di_magic), XFS_DINODE_MAGIC);
#endif /* DEBUG */
		error = XFS_ERROR(EINVAL);
		goto out_brelse;
	}

	/*
	 * If the on-disk inode is already linked to a directory
	 * entry, copy all of the inode into the in-core inode.
	 * xfs_iformat() handles copying in the inode format
	 * specific information.
	 * Otherwise, just get the truly permanent information.
	 */
	if (dip->di_mode) {
		xfs_dinode_from_disk(&ip->i_d, dip);
		error = xfs_iformat(ip, dip);
		if (error)  {
#ifdef DEBUG
			xfs_alert(mp, "%s: xfs_iformat() returned error %d",
				__func__, error);
#endif /* DEBUG */
			goto out_brelse;
		}
	} else {
		ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
		ip->i_d.di_version = dip->di_version;
		ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
		ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
		/*
		 * Make sure to pull in the mode here as well in
		 * case the inode is released without being used.
		 * This ensures that xfs_inactive() will see that
		 * the inode is already free and not try to mess
		 * with the uninitialized part of it.
		 */
		ip->i_d.di_mode = 0;
	}

	/*
	 * The inode format changed when we moved the link count and
	 * made it 32 bits long.  If this is an old format inode,
	 * convert it in memory to look like a new one.  If it gets
	 * flushed to disk we will convert back before flushing or
	 * logging it.  We zero out the new projid field and the old link
	 * count field.  We'll handle clearing the pad field (the remains
	 * of the old uuid field) when we actually convert the inode to
	 * the new format. We don't change the version number so that we
	 * can distinguish this from a real new format inode.
	 */
	if (ip->i_d.di_version == 1) {
		ip->i_d.di_nlink = ip->i_d.di_onlink;
		ip->i_d.di_onlink = 0;
		xfs_set_projid(ip, 0);
	}

	ip->i_delayed_blks = 0;

	/*
	 * Mark the buffer containing the inode as something to keep
	 * around for a while.  This helps to keep recently accessed
	 * meta-data in-core longer.
	 */
	xfs_buf_set_ref(bp, XFS_INO_REF);

	/*
	 * Use xfs_trans_brelse() to release the buffer containing the
	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
	 * in xfs_imap_to_bp() above.  If tp is NULL, this is just a normal
	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
	 * will only release the buffer if it is not dirty within the
	 * transaction.  It will be OK to release the buffer in this case,
	 * because inodes on disk are never destroyed and we will be
	 * locking the new in-core inode before putting it in the hash
	 * table where other processes can find it.  Thus we don't have
	 * to worry about the inode being changed just because we released
	 * the buffer.
	 */
 out_brelse:
	xfs_trans_brelse(tp, bp);
	return error;
}

/*
 * Read in extents from a btree-format inode.
 * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
 */
int
xfs_iread_extents(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip,
	int		whichfork)
{
	int		error;
	xfs_ifork_t	*ifp;
	xfs_extnum_t	nextents;

	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
				 ip->i_mount);
		return XFS_ERROR(EFSCORRUPTED);
	}
	nextents = XFS_IFORK_NEXTENTS(ip, whichfork);
	ifp = XFS_IFORK_PTR(ip, whichfork);

	/*
	 * We know that the size is valid (it's checked in iformat_btree)
	 */
	ifp->if_bytes = ifp->if_real_bytes = 0;
	ifp->if_flags |= XFS_IFEXTENTS;
	xfs_iext_add(ifp, 0, nextents);
	error = xfs_bmap_read_extents(tp, ip, whichfork);
	if (error) {
		xfs_iext_destroy(ifp);
		ifp->if_flags &= ~XFS_IFEXTENTS;
		return error;
	}
	xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip));
	return 0;
}
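
/*
 * Example (hedged sketch of the usual caller idiom): make sure the
 * extent list is in-core before walking it:
 *
 *	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 *		error = xfs_iread_extents(tp, ip, whichfork);
 *		if (error)
 *			return error;
 *	}
 */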

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget()
 * to obtain the in-core version of the allocated inode.  Finally,
 * fill in the inode and log its initial contents.  In this case,
 * ialloc_context would be set to NULL.
 *
 * If xfs_dialloc() does not have an available inode,
 * it will replenish its supply by doing an allocation. Since we can
 * only do one allocation within a transaction without deadlocks, we
 * must commit the current transaction before returning the inode itself.
 * In this case, therefore, we will return with a non-NULL ialloc_context.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	xfs_inode_t	**ipp)
{
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, &ino);
	if (error)
		return error;
	if (*ialloc_context || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(tp->t_mountp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = current_fsuid();
	ip->i_d.di_gid = current_fsgid();
	xfs_set_projid(ip, prid);
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1 inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;
	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if (S_ISDIR(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if (S_ISREG(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
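
/*
 * Example (hedged sketch of the two-phase protocol described above;
 * the in-tree caller lives elsewhere, e.g. xfs_dir_ialloc()):
 *
 *	ialloc_context = NULL;
 *	error = xfs_ialloc(tp, dp, mode, 1, 0, prid, okalloc,
 *			   &ialloc_context, &ip);
 *	if (!error && ialloc_context) {
 *		(hold ialloc_context across a commit of tp, start a
 *		 new transaction, then call xfs_ialloc() again; the
 *		 second call returns the locked in-core inode)
 *	}
 */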

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	struct xfs_trans	*ntp;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		first_unmap_block;
	xfs_fileoff_t		last_block;
	xfs_filblks_t		unmap_len;
	int			committed;
	int			error = 0;
	int			done = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (first_unmap_block == last_block)
		return 0;

	ASSERT(first_unmap_block < last_block);
	unmap_len = last_block - first_unmap_block + 1;
	while (!done) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(tp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(whichfork),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    &done);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (committed)
			xfs_trans_ijoin(tp, ip, 0);
		if (error)
			goto out_bmap_cancel;

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		ntp = xfs_trans_dup(tp);
		error = xfs_trans_commit(tp, 0);
		tp = ntp;

		xfs_trans_ijoin(tp, ip, 0);

		if (error)
			goto out;

		/*
		 * Transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		error = xfs_trans_reserve(tp, 0,
					XFS_ITRUNCATE_LOG_RES(mp), 0,
					XFS_TRANS_PERM_LOG_RES,
					XFS_ITRUNCATE_LOG_COUNT);
		if (error)
			goto out;
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
out_bmap_cancel:
	/*
	 * If the bunmapi call encounters an error, return to the caller where
	 * the transaction can be properly aborted.  We just need to make sure
	 * we're not holding any resources that we were not when we came in.
	 */
	xfs_bmap_cancel(&free_list);
	goto out;
}
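
/*
 * Example (sketch of the caller contract spelled out above): the
 * caller owns the transaction pointer across the call and must commit
 * or cancel whatever transaction comes back:
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	(tp may now be a different transaction than the one passed in;
 *	 commit or cancel that one, with the inode still locked/joined)
 */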

/*
 * This is called when the inode's link count goes to 0.
 * We place the on-disk inode on a list in the AGI.  It
 * will be pulled from this list when the inode is freed.
 */
int
xfs_iunlink(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agino_t	agino;
	short		bucket_index;
	int		offset;
	int		error;

	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	mp = tp->t_mountp;

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
	if (error)
		return error;
	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index]);
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);

	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
		/*
		 * There is already another inode in the bucket we need
		 * to add ourselves to.  Add us at the front of the list.
		 * Here we put the head pointer into our next pointer,
		 * and then we fall through to point the head at us.
		 */
		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				       0, 0);
		if (error)
			return error;

		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
		offset = ip->i_imap.im_boffset +
			offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, ibp);
		xfs_trans_log_buf(tp, ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, ibp);
	}

	/*
	 * Point the bucket head pointer at the inode being inserted.
	 */
	ASSERT(agino != 0);
	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		(sizeof(xfs_agino_t) * bucket_index);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));
	return 0;
}
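
/*
 * The on-disk result is a simple singly linked list per AGI bucket,
 * newest insertion at the head (sketch):
 *
 *	agi_unlinked[bucket] -> inode A -> inode B -> NULLAGINO
 *
 * where each arrow is an inode's di_next_unlinked field.
 * xfs_iunlink_remove() below walks this list to unhook an entry.
 */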

/*
 * Pull the on-disk inode from the AGI unlinked list.
 */
STATIC int
xfs_iunlink_remove(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ino_t	next_ino;
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agnumber_t	agno;
	xfs_agino_t	agino;
	xfs_agino_t	next_agino;
	xfs_buf_t	*last_ibp;
	xfs_dinode_t	*last_dip = NULL;
	short		bucket_index;
	int		offset, last_offset = 0;
	int		error;

	mp = tp->t_mountp;
	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		return error;

	agi = XFS_BUF_TO_AGI(agibp);

	/*
	 * Get the index into the agi hash table for the
	 * list this inode will go on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
	ASSERT(agi->agi_unlinked[bucket_index]);

	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
		/*
		 * We're at the head of the list.  Get the inode's on-disk
		 * buffer to see if there is anyone after us on the list.
		 * Only modify our next pointer if it is not already NULLAGINO.
		 * This saves us the overhead of dealing with the buffer when
		 * there is no need to change it.
		 */
		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				       0, 0);
		if (error) {
			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the bucket head pointer at the next inode.
		 */
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
		offset = offsetof(xfs_agi_t, agi_unlinked) +
			(sizeof(xfs_agino_t) * bucket_index);
		xfs_trans_log_buf(tp, agibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
	} else {
		/*
		 * We need to search the list for the inode being freed.
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			struct xfs_imap	imap;

			if (last_ibp)
				xfs_trans_brelse(tp, last_ibp);

			imap.im_blkno = 0;
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);

			error = xfs_imap(mp, tp, next_ino, &imap, 0);
			if (error) {
				xfs_warn(mp,
	"%s: xfs_imap returned error %d.",
					 __func__, error);
				return error;
			}

			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
					       &last_ibp, 0, 0);
			if (error) {
				xfs_warn(mp,
	"%s: xfs_imap_to_bp returned error %d.",
					__func__, error);
				return error;
			}

			last_offset = imap.im_boffset;
			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}

		/*
		 * Now last_ibp points to the buffer previous to us on the
		 * unlinked list.  Pull us from the list.
		 */
		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
				       0, 0);
		if (error) {
			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
				__func__, error);
			return error;
		}
		next_agino = be32_to_cpu(dip->di_next_unlinked);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
			offset = ip->i_imap.im_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}
1713 
1714 /*
1715  * A big issue when freeing the inode cluster is that we _cannot_ skip any
1716  * inodes that are in memory - they all must be marked stale and attached to
1717  * the cluster buffer.
1718  */
1719 STATIC int
1720 xfs_ifree_cluster(
1721 	xfs_inode_t	*free_ip,
1722 	xfs_trans_t	*tp,
1723 	xfs_ino_t	inum)
1724 {
1725 	xfs_mount_t		*mp = free_ip->i_mount;
1726 	int			blks_per_cluster;
1727 	int			nbufs;
1728 	int			ninodes;
1729 	int			i, j;
1730 	xfs_daddr_t		blkno;
1731 	xfs_buf_t		*bp;
1732 	xfs_inode_t		*ip;
1733 	xfs_inode_log_item_t	*iip;
1734 	xfs_log_item_t		*lip;
1735 	struct xfs_perag	*pag;
1736 
1737 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
1738 	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
1739 		blks_per_cluster = 1;
1740 		ninodes = mp->m_sb.sb_inopblock;
1741 		nbufs = XFS_IALLOC_BLOCKS(mp);
1742 	} else {
1743 		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
1744 					mp->m_sb.sb_blocksize;
1745 		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
1746 		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1747 	}
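	/*
	 * Worked example, assuming a hypothetical geometry of 4096 byte
	 * blocks, 16384 byte inode clusters and 512 byte inodes:
	 * blks_per_cluster = 16384 / 4096 = 4 and sb_inopblock = 8, so
	 * ninodes = 4 * 8 = 32 inodes are freed per cluster buffer.
	 */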
1748 
1749 	for (j = 0; j < nbufs; j++, inum += ninodes) {
1750 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1751 					 XFS_INO_TO_AGBNO(mp, inum));
1752 
1753 		/*
1754 		 * We obtain and lock the backing buffer first in the process
1755 		 * here, as we have to ensure that any dirty inode that we
1756 		 * can't get the flush lock on is attached to the buffer.
1757 		 * If we scan the in-memory inodes first, then buffer IO can
1758 		 * complete before we get a lock on it, and hence we may fail
1759 		 * to mark all the active inodes on the buffer stale.
1760 		 */
1761 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1762 					mp->m_bsize * blks_per_cluster, 0);
1763 
1764 		if (!bp)
1765 			return ENOMEM;
1766 		/*
1767 		 * Walk the inodes already attached to the buffer and mark them
1768 		 * stale. These will all have the flush locks held, so an
1769 		 * in-memory inode walk can't lock them. By marking them all
1770 		 * stale first, we will not attempt to lock them in the loop
1771 		 * below as the XFS_ISTALE flag will be set.
1772 		 */
1773 		lip = bp->b_fspriv;
1774 		while (lip) {
1775 			if (lip->li_type == XFS_LI_INODE) {
1776 				iip = (xfs_inode_log_item_t *)lip;
1777 				ASSERT(iip->ili_logged == 1);
1778 				lip->li_cb = xfs_istale_done;
1779 				xfs_trans_ail_copy_lsn(mp->m_ail,
1780 							&iip->ili_flush_lsn,
1781 							&iip->ili_item.li_lsn);
1782 				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1783 			}
1784 			lip = lip->li_bio_list;
1785 		}
1786 
1788 		/*
1789 		 * For each inode in memory attempt to add it to the inode
1790 		 * buffer and set it up for being staled on buffer IO
1791 		 * completion.  This is safe as we've locked out tail pushing
1792 		 * and flushing by locking the buffer.
1793 		 *
1794 		 * We have already marked every inode that was part of a
1795 		 * transaction stale above, which means there is no point in
1796 		 * even trying to lock them.
1797 		 */
1798 		for (i = 0; i < ninodes; i++) {
1799 retry:
1800 			rcu_read_lock();
1801 			ip = radix_tree_lookup(&pag->pag_ici_root,
1802 					XFS_INO_TO_AGINO(mp, (inum + i)));
1803 
1804 			/* Inode not in memory, nothing to do */
1805 			if (!ip) {
1806 				rcu_read_unlock();
1807 				continue;
1808 			}
1809 
1810 			/*
1811 			 * because this is an RCU protected lookup, we could
1812 			 * find a recently freed or even reallocated inode
1813 			 * during the lookup. We need to check under the
1814 			 * i_flags_lock for a valid inode here. Skip it if it
1815 			 * is not valid, the wrong inode or stale.
1816 			 */
1817 			spin_lock(&ip->i_flags_lock);
1818 			if (ip->i_ino != inum + i ||
1819 			    __xfs_iflags_test(ip, XFS_ISTALE)) {
1820 				spin_unlock(&ip->i_flags_lock);
1821 				rcu_read_unlock();
1822 				continue;
1823 			}
1824 			spin_unlock(&ip->i_flags_lock);
1825 
1826 			/*
1827 			 * Don't try to lock/unlock the current inode, but we
1828 			 * _cannot_ skip the other inodes that we did not find
1829 			 * in the list attached to the buffer and are not
1830 			 * already marked stale. If we can't lock it, back off
1831 			 * and retry.
1832 			 */
1833 			if (ip != free_ip &&
1834 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1835 				rcu_read_unlock();
1836 				delay(1);
1837 				goto retry;
1838 			}
1839 			rcu_read_unlock();
1840 
1841 			xfs_iflock(ip);
1842 			xfs_iflags_set(ip, XFS_ISTALE);
1843 
1844 			/*
1845 			 * we don't need to attach clean inodes or those only
1846 			 * with unlogged changes (which we throw away, anyway).
1847 			 */
1848 			iip = ip->i_itemp;
1849 			if (!iip || xfs_inode_clean(ip)) {
1850 				ASSERT(ip != free_ip);
1851 				xfs_ifunlock(ip);
1852 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
1853 				continue;
1854 			}
1855 
1856 			iip->ili_last_fields = iip->ili_fields;
1857 			iip->ili_fields = 0;
1858 			iip->ili_logged = 1;
1859 			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
1860 						&iip->ili_item.li_lsn);
1861 
1862 			xfs_buf_attach_iodone(bp, xfs_istale_done,
1863 						  &iip->ili_item);
1864 
1865 			if (ip != free_ip)
1866 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
1867 		}
1868 
1869 		xfs_trans_stale_inode_buf(tp, bp);
1870 		xfs_trans_binval(tp, bp);
1871 	}
1872 
1873 	xfs_perag_put(pag);
1874 	return 0;
1875 }
1876 
1877 /*
1878  * This is called to return an inode to the inode free list.
1879  * The inode should already be truncated to 0 length and have
1880  * no pages associated with it.  This routine also assumes that
1881  * the inode is already a part of the transaction.
1882  *
1883  * The on-disk copy of the inode will have been added to the list
1884  * of unlinked inodes in the AGI. We need to remove the inode from
1885  * that list atomically with respect to freeing it here.
1886  */
1887 int
1888 xfs_ifree(
1889 	xfs_trans_t	*tp,
1890 	xfs_inode_t	*ip,
1891 	xfs_bmap_free_t	*flist)
1892 {
1893 	int			error;
1894 	int			delete;
1895 	xfs_ino_t		first_ino;
1896 	xfs_dinode_t		*dip;
1897 	xfs_buf_t		*ibp;
1898 
1899 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1900 	ASSERT(ip->i_d.di_nlink == 0);
1901 	ASSERT(ip->i_d.di_nextents == 0);
1902 	ASSERT(ip->i_d.di_anextents == 0);
1903 	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
1904 	ASSERT(ip->i_d.di_nblocks == 0);
1905 
1906 	/*
1907 	 * Pull the on-disk inode from the AGI unlinked list.
1908 	 */
1909 	error = xfs_iunlink_remove(tp, ip);
1910 	if (error != 0) {
1911 		return error;
1912 	}
1913 
1914 	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
1915 	if (error != 0) {
1916 		return error;
1917 	}
1918 	ip->i_d.di_mode = 0;		/* mark incore inode as free */
1919 	ip->i_d.di_flags = 0;
1920 	ip->i_d.di_dmevmask = 0;
1921 	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
1922 	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1923 	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1924 	/*
1925 	 * Bump the generation count so no one will be confused
1926 	 * by reincarnations of this inode.
1927 	 */
1928 	ip->i_d.di_gen++;
1929 
1930 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1931 
1932 	error = xfs_imap_to_bp(ip->i_mount, tp, &ip->i_imap, &dip, &ibp,
1933 			       0, 0);
1934 	if (error)
1935 		return error;
1936 
1937 	/*
1938 	 * Clear the on-disk di_mode. This is to prevent xfs_bulkstat
1939 	 * from picking up this inode when it is reclaimed (its incore state
1940 	 * initialized but not flushed to disk yet). The in-core di_mode is
1941 	 * already cleared and a corresponding transaction logged.
1942 	 * The hack here just synchronizes the in-core to on-disk
1943 	 * di_mode value in advance before the actual inode sync to disk.
1944 	 * This is OK because the inode is already unlinked and will never
1945 	 * change its di_mode again for this inode generation.
1946 	 * This is a temporary hack that would require a proper fix
1947 	 * in the future.
1948 	 */
1949 	dip->di_mode = 0;
1950 
1951 	if (delete) {
1952 		error = xfs_ifree_cluster(ip, tp, first_ino);
1953 	}
1954 
1955 	return error;
1956 }
1957 
1958 /*
1959  * Reallocate the space for if_broot based on the number of records
1960  * being added or deleted as indicated in rec_diff.  Move the records
1961  * and pointers in if_broot to fit the new size.  When shrinking this
1962  * will eliminate holes between the records and pointers created by
1963  * the caller.  When growing this will create holes to be filled in
1964  * by the caller.
1965  *
1966  * The caller must not request to add more records than would fit in
1967  * the on-disk inode root.  If the if_broot is currently NULL, then
1968  * if we are adding records, one will be allocated.  The caller must also
1969  * not request that the number of records go below zero, although
1970  * it can go to zero.
1971  *
1972  * ip -- the inode whose if_broot area is changing
1973  * rec_diff -- the change in the number of records, positive or negative,
1974  *	 requested for the if_broot array.
1975  */
1976 void
1977 xfs_iroot_realloc(
1978 	xfs_inode_t		*ip,
1979 	int			rec_diff,
1980 	int			whichfork)
1981 {
1982 	struct xfs_mount	*mp = ip->i_mount;
1983 	int			cur_max;
1984 	xfs_ifork_t		*ifp;
1985 	struct xfs_btree_block	*new_broot;
1986 	int			new_max;
1987 	size_t			new_size;
1988 	char			*np;
1989 	char			*op;
1990 
1991 	/*
1992 	 * Handle the degenerate case quietly.
1993 	 */
1994 	if (rec_diff == 0) {
1995 		return;
1996 	}
1997 
1998 	ifp = XFS_IFORK_PTR(ip, whichfork);
1999 	if (rec_diff > 0) {
2000 		/*
2001 		 * If there wasn't any memory allocated before, just
2002 		 * allocate it now and get out.
2003 		 */
2004 		if (ifp->if_broot_bytes == 0) {
2005 			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2006 			ifp->if_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2007 			ifp->if_broot_bytes = (int)new_size;
2008 			return;
2009 		}
2010 
2011 		/*
2012 		 * If there is already an existing if_broot, then we need
2013 		 * to realloc() it and shift the pointers to their new
2014 		 * location.  The records don't change location because
2015 		 * they are kept butted up against the btree block header.
2016 		 */
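		/*
		 * Sketch of the broot layout assumed here: records first,
		 * pointers last, with any slack in between:
		 *
		 *   [ header | rec 1 .. rec N | ...gap... | ptr 1 .. ptr N ]
		 *
		 * Growing the buffer therefore only moves the pointer
		 * array (op -> np below); the records stay put right
		 * after the header.
		 */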
2017 		cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2018 		new_max = cur_max + rec_diff;
2019 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2020 		ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
2021 				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2022 				KM_SLEEP | KM_NOFS);
2023 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2024 						     ifp->if_broot_bytes);
2025 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2026 						     (int)new_size);
2027 		ifp->if_broot_bytes = (int)new_size;
2028 		ASSERT(ifp->if_broot_bytes <=
2029 			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2030 		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2031 		return;
2032 	}
2033 
2034 	/*
2035 	 * rec_diff is less than 0.  In this case, we are shrinking the
2036 	 * if_broot buffer.  It must already exist.  If we go to zero
2037 	 * records, just get rid of the root and clear the status bit.
2038 	 */
2039 	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2040 	cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
2041 	new_max = cur_max + rec_diff;
2042 	ASSERT(new_max >= 0);
2043 	if (new_max > 0)
2044 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2045 	else
2046 		new_size = 0;
2047 	if (new_size > 0) {
2048 		new_broot = kmem_alloc(new_size, KM_SLEEP | KM_NOFS);
2049 		/*
2050 		 * First copy over the btree block header.
2051 		 */
2052 		memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
2053 	} else {
2054 		new_broot = NULL;
2055 		ifp->if_flags &= ~XFS_IFBROOT;
2056 	}
2057 
2058 	/*
2059 	 * Only copy the records and pointers if there are any.
2060 	 */
2061 	if (new_max > 0) {
2062 		/*
2063 		 * First copy the records.
2064 		 */
2065 		op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
2066 		np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
2067 		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2068 
2069 		/*
2070 		 * Then copy the pointers.
2071 		 */
2072 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
2073 						     ifp->if_broot_bytes);
2074 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
2075 						     (int)new_size);
2076 		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2077 	}
2078 	kmem_free(ifp->if_broot);
2079 	ifp->if_broot = new_broot;
2080 	ifp->if_broot_bytes = (int)new_size;
2081 	ASSERT(ifp->if_broot_bytes <=
2082 		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2083 	return;
2084 }
2085 
2086 
2087 /*
2088  * This is called when the amount of space needed for if_data
2089  * is increased or decreased.  The change in size is indicated by
2090  * the number of bytes that need to be added or deleted in the
2091  * byte_diff parameter.
2092  *
2093  * If the amount of space needed has decreased below the size of the
2094  * inline buffer, then switch to using the inline buffer.  Otherwise,
2095  * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2096  * to what is needed.
2097  *
2098  * ip -- the inode whose if_data area is changing
2099  * byte_diff -- the change in the number of bytes, positive or negative,
2100  *	 requested for the if_data array.
2101  */
2102 void
2103 xfs_idata_realloc(
2104 	xfs_inode_t	*ip,
2105 	int		byte_diff,
2106 	int		whichfork)
2107 {
2108 	xfs_ifork_t	*ifp;
2109 	int		new_size;
2110 	int		real_size;
2111 
2112 	if (byte_diff == 0) {
2113 		return;
2114 	}
2115 
2116 	ifp = XFS_IFORK_PTR(ip, whichfork);
2117 	new_size = (int)ifp->if_bytes + byte_diff;
2118 	ASSERT(new_size >= 0);
2119 
2120 	if (new_size == 0) {
2121 		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2122 			kmem_free(ifp->if_u1.if_data);
2123 		}
2124 		ifp->if_u1.if_data = NULL;
2125 		real_size = 0;
2126 	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2127 		/*
2128 		 * If the data can fit in if_inline_data, copy it
2129 		 * from the malloc'd vector and free it.
2130 		 */
2131 		if (ifp->if_u1.if_data == NULL) {
2132 			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2133 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2134 			ASSERT(ifp->if_real_bytes != 0);
2135 			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2136 			      new_size);
2137 			kmem_free(ifp->if_u1.if_data);
2138 			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2139 		}
2140 		real_size = 0;
2141 	} else {
2142 		/*
2143 		 * Stuck with malloc/realloc.
2144 		 * For inline data, the underlying buffer must be
2145 		 * a multiple of 4 bytes in size so that it can be
2146 		 * logged and stay on word boundaries.  We enforce
2147 		 * that here.
2148 		 */
2149 		real_size = roundup(new_size, 4);
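		/* e.g. a new_size of 10 bytes rounds up to a real_size of 12 */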
2150 		if (ifp->if_u1.if_data == NULL) {
2151 			ASSERT(ifp->if_real_bytes == 0);
2152 			ifp->if_u1.if_data = kmem_alloc(real_size,
2153 							KM_SLEEP | KM_NOFS);
2154 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2155 			/*
2156 			 * Only do the realloc if the underlying size
2157 			 * is really changing.
2158 			 */
2159 			if (ifp->if_real_bytes != real_size) {
2160 				ifp->if_u1.if_data =
2161 					kmem_realloc(ifp->if_u1.if_data,
2162 							real_size,
2163 							ifp->if_real_bytes,
2164 							KM_SLEEP | KM_NOFS);
2165 			}
2166 		} else {
2167 			ASSERT(ifp->if_real_bytes == 0);
2168 			ifp->if_u1.if_data = kmem_alloc(real_size,
2169 							KM_SLEEP | KM_NOFS);
2170 			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2171 				ifp->if_bytes);
2172 		}
2173 	}
2174 	ifp->if_real_bytes = real_size;
2175 	ifp->if_bytes = new_size;
2176 	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2177 }
2178 
2179 void
2180 xfs_idestroy_fork(
2181 	xfs_inode_t	*ip,
2182 	int		whichfork)
2183 {
2184 	xfs_ifork_t	*ifp;
2185 
2186 	ifp = XFS_IFORK_PTR(ip, whichfork);
2187 	if (ifp->if_broot != NULL) {
2188 		kmem_free(ifp->if_broot);
2189 		ifp->if_broot = NULL;
2190 	}
2191 
2192 	/*
2193 	 * If the format is local, then we can't have an extents
2194 	 * array so just look for an inline data array.  If we're
2195 	 * not local then we may or may not have an extents list,
2196 	 * so check and free it up if we do.
2197 	 */
2198 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2199 		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2200 		    (ifp->if_u1.if_data != NULL)) {
2201 			ASSERT(ifp->if_real_bytes != 0);
2202 			kmem_free(ifp->if_u1.if_data);
2203 			ifp->if_u1.if_data = NULL;
2204 			ifp->if_real_bytes = 0;
2205 		}
2206 	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2207 		   ((ifp->if_flags & XFS_IFEXTIREC) ||
2208 		    ((ifp->if_u1.if_extents != NULL) &&
2209 		     (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)))) {
2210 		ASSERT(ifp->if_real_bytes != 0);
2211 		xfs_iext_destroy(ifp);
2212 	}
2213 	ASSERT(ifp->if_u1.if_extents == NULL ||
2214 	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2215 	ASSERT(ifp->if_real_bytes == 0);
2216 	if (whichfork == XFS_ATTR_FORK) {
2217 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2218 		ip->i_afp = NULL;
2219 	}
2220 }
2221 
2222 /*
2223  * This is called to unpin an inode.  The caller must have the inode locked
2224  * in at least shared mode so that the buffer cannot be subsequently pinned
2225  * once someone is waiting for it to be unpinned.
2226  */
2227 static void
2228 xfs_iunpin(
2229 	struct xfs_inode	*ip)
2230 {
2231 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2232 
2233 	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2234 
2235 	/* Give the log a push to start the unpinning I/O */
2236 	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2237 
2238 }
2239 
2240 static void
2241 __xfs_iunpin_wait(
2242 	struct xfs_inode	*ip)
2243 {
2244 	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2245 	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2246 
2247 	xfs_iunpin(ip);
2248 
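	/*
	 * Standard wait-bit sleep pattern: re-check the pin count after
	 * prepare_to_wait() so a wakeup that races with the check is
	 * not lost, and loop until the count reaches zero.
	 */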
2249 	do {
2250 		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2251 		if (xfs_ipincount(ip))
2252 			io_schedule();
2253 	} while (xfs_ipincount(ip));
2254 	finish_wait(wq, &wait.wait);
2255 }
2256 
2257 void
2258 xfs_iunpin_wait(
2259 	struct xfs_inode	*ip)
2260 {
2261 	if (xfs_ipincount(ip))
2262 		__xfs_iunpin_wait(ip);
2263 }
2264 
2265 /*
2266  * xfs_iextents_copy()
2267  *
2268  * This is called to copy the REAL extents (as opposed to the delayed
2269  * allocation extents) from the inode into the given buffer.  It
2270  * returns the number of bytes copied into the buffer.
2271  *
2272  * If there are no delayed allocation extents, then we can just
2273  * memcpy() the extents into the buffer.  Otherwise, we need to
2274  * examine each extent in turn and skip those which are delayed.
2275  */
2276 int
2277 xfs_iextents_copy(
2278 	xfs_inode_t		*ip,
2279 	xfs_bmbt_rec_t		*dp,
2280 	int			whichfork)
2281 {
2282 	int			copied;
2283 	int			i;
2284 	xfs_ifork_t		*ifp;
2285 	int			nrecs;
2286 	xfs_fsblock_t		start_block;
2287 
2288 	ifp = XFS_IFORK_PTR(ip, whichfork);
2289 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2290 	ASSERT(ifp->if_bytes > 0);
2291 
2292 	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2293 	XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork);
2294 	ASSERT(nrecs > 0);
2295 
2296 	/*
2297 	 * There may be delayed allocation extents in the
2298 	 * inode, so copy the extents one at a time and skip
2299 	 * the delayed ones.  There must be at least one
2300 	 * non-delayed extent.
2301 	 */
2302 	copied = 0;
2303 	for (i = 0; i < nrecs; i++) {
2304 		xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i);
2305 		start_block = xfs_bmbt_get_startblock(ep);
2306 		if (isnullstartblock(start_block)) {
2307 			/*
2308 			 * It's a delayed allocation extent, so skip it.
2309 			 */
2310 			continue;
2311 		}
2312 
2313 		/* Translate to on disk format */
2314 		put_unaligned(cpu_to_be64(ep->l0), &dp->l0);
2315 		put_unaligned(cpu_to_be64(ep->l1), &dp->l1);
2316 		dp++;
2317 		copied++;
2318 	}
2319 	ASSERT(copied != 0);
2320 	xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip));
2321 
2322 	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2323 }
2324 
2325 /*
2326  * Each of the following cases stores data into the same region
2327  * of the on-disk inode, so only one of them can be valid at
2328  * any given time. While it is possible to have conflicting formats
2329  * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2330  * in EXTENTS format, this can only happen when the fork has
2331  * changed formats after being modified but before being flushed.
2332  * In these cases, the format always takes precedence, because the
2333  * format indicates the current state of the fork.
2334  */
2335 /*ARGSUSED*/
2336 STATIC void
2337 xfs_iflush_fork(
2338 	xfs_inode_t		*ip,
2339 	xfs_dinode_t		*dip,
2340 	xfs_inode_log_item_t	*iip,
2341 	int			whichfork,
2342 	xfs_buf_t		*bp)
2343 {
2344 	char			*cp;
2345 	xfs_ifork_t		*ifp;
2346 	xfs_mount_t		*mp;
2347 #ifdef XFS_TRANS_DEBUG
2348 	int			first;
2349 #endif
2350 	static const short	brootflag[2] =
2351 		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2352 	static const short	dataflag[2] =
2353 		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2354 	static const short	extflag[2] =
2355 		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
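	/*
	 * The flag arrays above are indexed by fork: XFS_DATA_FORK (0)
	 * selects the data fork flag and XFS_ATTR_FORK (1) the attr
	 * fork flag, e.g. dataflag[XFS_ATTR_FORK] == XFS_ILOG_ADATA.
	 */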
2356 
2357 	if (!iip)
2358 		return;
2359 	ifp = XFS_IFORK_PTR(ip, whichfork);
2360 	/*
2361 	 * This can happen if we gave up in iformat in an error path,
2362 	 * for the attribute fork.
2363 	 */
2364 	if (!ifp) {
2365 		ASSERT(whichfork == XFS_ATTR_FORK);
2366 		return;
2367 	}
2368 	cp = XFS_DFORK_PTR(dip, whichfork);
2369 	mp = ip->i_mount;
2370 	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2371 	case XFS_DINODE_FMT_LOCAL:
2372 		if ((iip->ili_fields & dataflag[whichfork]) &&
2373 		    (ifp->if_bytes > 0)) {
2374 			ASSERT(ifp->if_u1.if_data != NULL);
2375 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2376 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2377 		}
2378 		break;
2379 
2380 	case XFS_DINODE_FMT_EXTENTS:
2381 		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2382 		       !(iip->ili_fields & extflag[whichfork]));
2383 		if ((iip->ili_fields & extflag[whichfork]) &&
2384 		    (ifp->if_bytes > 0)) {
2385 			ASSERT(xfs_iext_get_ext(ifp, 0));
2386 			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2387 			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2388 				whichfork);
2389 		}
2390 		break;
2391 
2392 	case XFS_DINODE_FMT_BTREE:
2393 		if ((iip->ili_fields & brootflag[whichfork]) &&
2394 		    (ifp->if_broot_bytes > 0)) {
2395 			ASSERT(ifp->if_broot != NULL);
2396 			ASSERT(ifp->if_broot_bytes <=
2397 			       (XFS_IFORK_SIZE(ip, whichfork) +
2398 				XFS_BROOT_SIZE_ADJ));
2399 			xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
2400 				(xfs_bmdr_block_t *)cp,
2401 				XFS_DFORK_SIZE(dip, mp, whichfork));
2402 		}
2403 		break;
2404 
2405 	case XFS_DINODE_FMT_DEV:
2406 		if (iip->ili_fields & XFS_ILOG_DEV) {
2407 			ASSERT(whichfork == XFS_DATA_FORK);
2408 			xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
2409 		}
2410 		break;
2411 
2412 	case XFS_DINODE_FMT_UUID:
2413 		if (iip->ili_fields & XFS_ILOG_UUID) {
2414 			ASSERT(whichfork == XFS_DATA_FORK);
2415 			memcpy(XFS_DFORK_DPTR(dip),
2416 			       &ip->i_df.if_u2.if_uuid,
2417 			       sizeof(uuid_t));
2418 		}
2419 		break;
2420 
2421 	default:
2422 		ASSERT(0);
2423 		break;
2424 	}
2425 }
2426 
2427 STATIC int
2428 xfs_iflush_cluster(
2429 	xfs_inode_t	*ip,
2430 	xfs_buf_t	*bp)
2431 {
2432 	xfs_mount_t		*mp = ip->i_mount;
2433 	struct xfs_perag	*pag;
2434 	unsigned long		first_index, mask;
2435 	unsigned long		inodes_per_cluster;
2436 	int			ilist_size;
2437 	xfs_inode_t		**ilist;
2438 	xfs_inode_t		*iq;
2439 	int			nr_found;
2440 	int			clcount = 0;
2441 	int			bufwasdelwri;
2442 	int			i;
2443 
2444 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2445 
2446 	inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2447 	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2448 	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2449 	if (!ilist)
2450 		goto out_put;
2451 
2452 	mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2453 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
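	/*
	 * Example, assuming a hypothetical 8192 byte cluster with
	 * 256 byte inodes (sb_inodelog = 8): 8192 >> 8 = 32 inodes
	 * per cluster, mask = ~(32 - 1) = ~0x1f, so first_index is
	 * our agino rounded down to the start of its cluster.
	 */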
2454 	rcu_read_lock();
2455 	/* really need a gang lookup range call here */
2456 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2457 					first_index, inodes_per_cluster);
2458 	if (nr_found == 0)
2459 		goto out_free;
2460 
2461 	for (i = 0; i < nr_found; i++) {
2462 		iq = ilist[i];
2463 		if (iq == ip)
2464 			continue;
2465 
2466 		/*
2467 		 * because this is an RCU protected lookup, we could find a
2468 		 * recently freed or even reallocated inode during the lookup.
2469 		 * We need to check under the i_flags_lock for a valid inode
2470 		 * here. Skip it if it is not valid or the wrong inode.
2471 		 */
2472 		spin_lock(&iq->i_flags_lock);
2473 		if (!iq->i_ino ||
2474 		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2475 			spin_unlock(&iq->i_flags_lock);
2476 			continue;
2477 		}
2478 		spin_unlock(&iq->i_flags_lock);
2479 
2480 		/*
2481 		 * Do an un-protected check to see if the inode is dirty and
2482 		 * is a candidate for flushing.  These checks will be repeated
2483 		 * later after the appropriate locks are acquired.
2484 		 */
2485 		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2486 			continue;
2487 
2488 		/*
2489 		 * Try to get locks.  If any are unavailable or it is pinned,
2490 		 * then this inode cannot be flushed and is skipped.
2491 		 */
2492 
2493 		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2494 			continue;
2495 		if (!xfs_iflock_nowait(iq)) {
2496 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
2497 			continue;
2498 		}
2499 		if (xfs_ipincount(iq)) {
2500 			xfs_ifunlock(iq);
2501 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
2502 			continue;
2503 		}
2504 
2505 		/*
2506 		 * arriving here means that this inode can be flushed.  First
2507 		 * re-check that it's dirty before flushing.
2508 		 */
2509 		if (!xfs_inode_clean(iq)) {
2510 			int	error;
2511 			error = xfs_iflush_int(iq, bp);
2512 			if (error) {
2513 				xfs_iunlock(iq, XFS_ILOCK_SHARED);
2514 				goto cluster_corrupt_out;
2515 			}
2516 			clcount++;
2517 		} else {
2518 			xfs_ifunlock(iq);
2519 		}
2520 		xfs_iunlock(iq, XFS_ILOCK_SHARED);
2521 	}
2522 
2523 	if (clcount) {
2524 		XFS_STATS_INC(xs_icluster_flushcnt);
2525 		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2526 	}
2527 
2528 out_free:
2529 	rcu_read_unlock();
2530 	kmem_free(ilist);
2531 out_put:
2532 	xfs_perag_put(pag);
2533 	return 0;
2534 
2536 cluster_corrupt_out:
2537 	/*
2538 	 * Corruption detected in the clustering loop.  Invalidate the
2539 	 * inode buffer and shut down the filesystem.
2540 	 */
2541 	rcu_read_unlock();
2542 	/*
2543 	 * Clean up the buffer.  If it was delwri, just release it --
2544 	 * brelse can handle it with no problems.  If not, shut down the
2545 	 * filesystem before releasing the buffer.
2546 	 */
2547 	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2548 	if (bufwasdelwri)
2549 		xfs_buf_relse(bp);
2550 
2551 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2552 
2553 	if (!bufwasdelwri) {
2554 		/*
2555 		 * Just like incore_relse: if we have b_iodone functions,
2556 		 * mark the buffer as an error and call them.  Otherwise
2557 		 * mark it as stale and brelse.
2558 		 */
2559 		if (bp->b_iodone) {
2560 			XFS_BUF_UNDONE(bp);
2561 			xfs_buf_stale(bp);
2562 			xfs_buf_ioerror(bp, EIO);
2563 			xfs_buf_ioend(bp, 0);
2564 		} else {
2565 			xfs_buf_stale(bp);
2566 			xfs_buf_relse(bp);
2567 		}
2568 	}
2569 
2570 	/*
2571 	 * Unlocks the flush lock
2572 	 */
2573 	xfs_iflush_abort(iq, false);
2574 	kmem_free(ilist);
2575 	xfs_perag_put(pag);
2576 	return XFS_ERROR(EFSCORRUPTED);
2577 }
2578 
2579 /*
2580  * Flush dirty inode metadata into the backing buffer.
2581  *
2582  * The caller must have the inode lock and the inode flush lock held.  The
2583  * inode lock will still be held upon return to the caller, and the inode
2584  * flush lock will be released after the inode has reached the disk.
2585  *
2586  * The caller must write out the buffer returned in *bpp and release it.
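 *
 * Typical caller pattern (sketch only, error handling elided), as used
 * by inode reclaim:
 *
 *	error = xfs_iflush(ip, &bp);
 *	if (!error) {
 *		error = xfs_bwrite(bp);
 *		xfs_buf_relse(bp);
 *	}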
2587  */
2588 int
2589 xfs_iflush(
2590 	struct xfs_inode	*ip,
2591 	struct xfs_buf		**bpp)
2592 {
2593 	struct xfs_mount	*mp = ip->i_mount;
2594 	struct xfs_buf		*bp;
2595 	struct xfs_dinode	*dip;
2596 	int			error;
2597 
2598 	XFS_STATS_INC(xs_iflush_count);
2599 
2600 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2601 	ASSERT(xfs_isiflocked(ip));
2602 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2603 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2604 
2605 	*bpp = NULL;
2606 
2607 	xfs_iunpin_wait(ip);
2608 
2609 	/*
2610 	 * For stale inodes we cannot rely on the backing buffer remaining
2611 	 * stale in cache for the remaining life of the stale inode and so
2612 	 * xfs_imap_to_bp() below may give us a buffer that no longer contains
2613 	 * inodes. We have to check this after ensuring the inode is
2614 	 * unpinned so that it is safe to reclaim the stale inode after the
2615 	 * flush call.
2616 	 */
2617 	if (xfs_iflags_test(ip, XFS_ISTALE)) {
2618 		xfs_ifunlock(ip);
2619 		return 0;
2620 	}
2621 
2622 	/*
2623 	 * This may have been unpinned because the filesystem is shutting
2624 	 * down forcibly. If that's the case we must not write this inode
2625 	 * to disk, because the log record didn't make it to disk.
2626 	 *
2627 	 * We also have to remove the log item from the AIL in this case,
2628 	 * as we wait for an empty AIL as part of the unmount process.
2629 	 */
2630 	if (XFS_FORCED_SHUTDOWN(mp)) {
2631 		error = XFS_ERROR(EIO);
2632 		goto abort_out;
2633 	}
2634 
2635 	/*
2636 	 * Get the buffer containing the on-disk inode.
2637 	 */
2638 	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
2639 			       0);
2640 	if (error || !bp) {
2641 		xfs_ifunlock(ip);
2642 		return error;
2643 	}
2644 
2645 	/*
2646 	 * First flush out the inode that xfs_iflush was called with.
2647 	 */
2648 	error = xfs_iflush_int(ip, bp);
2649 	if (error)
2650 		goto corrupt_out;
2651 
2652 	/*
2653 	 * If the buffer is pinned then push on the log now so we won't
2654 	 * get stuck waiting in the write for too long.
2655 	 */
2656 	if (xfs_buf_ispinned(bp))
2657 		xfs_log_force(mp, 0);
2658 
2659 	/*
2660 	 * inode clustering:
2661 	 * see if other inodes can be gathered into this write
2662 	 */
2663 	error = xfs_iflush_cluster(ip, bp);
2664 	if (error)
2665 		goto cluster_corrupt_out;
2666 
2667 	*bpp = bp;
2668 	return 0;
2669 
2670 corrupt_out:
2671 	xfs_buf_relse(bp);
2672 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
2673 cluster_corrupt_out:
2674 	error = XFS_ERROR(EFSCORRUPTED);
2675 abort_out:
2676 	/*
2677 	 * Unlocks the flush lock
2678 	 */
2679 	xfs_iflush_abort(ip, false);
2680 	return error;
2681 }
2682 
2683 
2684 STATIC int
2685 xfs_iflush_int(
2686 	xfs_inode_t		*ip,
2687 	xfs_buf_t		*bp)
2688 {
2689 	xfs_inode_log_item_t	*iip;
2690 	xfs_dinode_t		*dip;
2691 	xfs_mount_t		*mp;
2692 #ifdef XFS_TRANS_DEBUG
2693 	int			first;
2694 #endif
2695 
2696 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2697 	ASSERT(xfs_isiflocked(ip));
2698 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
2699 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
2700 
2701 	iip = ip->i_itemp;
2702 	mp = ip->i_mount;
2703 
2704 	/* set *dip = inode's place in the buffer */
2705 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
2706 
2707 	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
2708 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
2709 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2710 			"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
2711 			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
2712 		goto corrupt_out;
2713 	}
2714 	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
2715 				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
2716 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2717 			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
2718 			__func__, ip->i_ino, ip, ip->i_d.di_magic);
2719 		goto corrupt_out;
2720 	}
2721 	if (S_ISREG(ip->i_d.di_mode)) {
2722 		if (XFS_TEST_ERROR(
2723 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2724 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
2725 		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
2726 			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2727 				"%s: Bad regular inode %Lu, ptr 0x%p",
2728 				__func__, ip->i_ino, ip);
2729 			goto corrupt_out;
2730 		}
2731 	} else if (S_ISDIR(ip->i_d.di_mode)) {
2732 		if (XFS_TEST_ERROR(
2733 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
2734 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
2735 		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
2736 		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
2737 			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2738 				"%s: Bad directory inode %Lu, ptr 0x%p",
2739 				__func__, ip->i_ino, ip);
2740 			goto corrupt_out;
2741 		}
2742 	}
2743 	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
2744 				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
2745 				XFS_RANDOM_IFLUSH_5)) {
2746 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2747 			"%s: detected corrupt incore inode %Lu, "
2748 			"total extents = %d, nblocks = %Ld, ptr 0x%p",
2749 			__func__, ip->i_ino,
2750 			ip->i_d.di_nextents + ip->i_d.di_anextents,
2751 			ip->i_d.di_nblocks, ip);
2752 		goto corrupt_out;
2753 	}
2754 	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
2755 				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
2756 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
2757 			"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
2758 			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
2759 		goto corrupt_out;
2760 	}
2761 	/*
2762 	 * Bump the flush iteration count, used to detect flushes which
2763 	 * postdate a log record during recovery.
2764 	 */
2766 	ip->i_d.di_flushiter++;
2767 
2768 	/*
2769 	 * Copy the dirty parts of the inode into the on-disk
2770 	 * inode.  We always copy out the core of the inode,
2771 	 * because if the inode is dirty at all the core must
2772 	 * be.
2773 	 */
2774 	xfs_dinode_to_disk(dip, &ip->i_d);
2775 
2776 	/* Wrap, we never let the log put out DI_MAX_FLUSH */
2777 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
2778 		ip->i_d.di_flushiter = 0;
2779 
2780 	/*
2781 	 * If this is really an old format inode and the superblock version
2782 	 * has not been updated to support only new format inodes, then
2783 	 * convert back to the old inode format.  If the superblock version
2784 	 * has been updated, then make the conversion permanent.
2785 	 */
2786 	ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
2787 	if (ip->i_d.di_version == 1) {
2788 		if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
2789 			/*
2790 			 * Convert it back.
2791 			 */
2792 			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
2793 			dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
2794 		} else {
2795 			/*
2796 			 * The superblock version has already been bumped,
2797 			 * so just make the conversion to the new inode
2798 			 * format permanent.
2799 			 */
2800 			ip->i_d.di_version = 2;
2801 			dip->di_version = 2;
2802 			ip->i_d.di_onlink = 0;
2803 			dip->di_onlink = 0;
2804 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
2805 			memset(&(dip->di_pad[0]), 0,
2806 			      sizeof(dip->di_pad));
2807 			ASSERT(xfs_get_projid(ip) == 0);
2808 		}
2809 	}
2810 
2811 	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
2812 	if (XFS_IFORK_Q(ip))
2813 		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
2814 	xfs_inobp_check(mp, bp);
2815 
2816 	/*
2817 	 * We've recorded everything logged in the inode, so we'd like to clear
2818 	 * the ili_fields bits so we don't log and flush things unnecessarily.
2819 	 * However, we can't stop logging all this information until the data
2820 	 * we've copied into the disk buffer is written to disk.  If we did we
2821 	 * might overwrite the copy of the inode in the log with all the data
2822 	 * after re-logging only part of it, and in the face of a crash we
2823 	 * wouldn't have all the data we need to recover.
2824 	 *
2825 	 * What we do is move the bits to the ili_last_fields field.  When
2826 	 * logging the inode, these bits are moved back to the ili_fields field.
2827 	 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
2828 	 * know that the information those bits represent is permanently on
2829 	 * disk.  As long as the flush completes before the inode is logged
2830 	 * again, then both ili_fields and ili_last_fields will be cleared.
2831 	 *
2832 	 * We can play with the ili_fields bits here, because the inode lock
2833 	 * must be held exclusively in order to set bits there and the flush
2834 	 * lock protects the ili_last_fields bits.  Set ili_logged so the flush
2835 	 * done routine can tell whether or not to look in the AIL.  Also, store
2836 	 * the current LSN of the inode so that we can tell whether the item has
2837 	 * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
2838 	 * need the AIL lock, because it is a 64 bit value that cannot be read
2839 	 * atomically.
2840 	 */
2841 	if (iip != NULL && iip->ili_fields != 0) {
2842 		iip->ili_last_fields = iip->ili_fields;
2843 		iip->ili_fields = 0;
2844 		iip->ili_logged = 1;
2845 
2846 		xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2847 					&iip->ili_item.li_lsn);
2848 
2849 		/*
2850 		 * Attach the function xfs_iflush_done to the inode's
2851 		 * buffer.  This will remove the inode from the AIL
2852 		 * and unlock the inode's flush lock when the inode is
2853 		 * completely written to disk.
2854 		 */
2855 		xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
2856 
2857 		ASSERT(bp->b_fspriv != NULL);
2858 		ASSERT(bp->b_iodone != NULL);
2859 	} else {
2860 		/*
2861 		 * We're flushing an inode which is not in the AIL and has
2862 		 * not been logged.  For this case we can immediately drop
2863 		 * the inode flush lock because we can avoid the whole
2864 		 * AIL state thing.  It's OK to drop the flush lock now,
2865 		 * because we've already locked the buffer and to do anything
2866 		 * you really need both.
2867 		 */
2868 		if (iip != NULL) {
2869 			ASSERT(iip->ili_logged == 0);
2870 			ASSERT(iip->ili_last_fields == 0);
2871 			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
2872 		}
2873 		xfs_ifunlock(ip);
2874 	}
2875 
2876 	return 0;
2877 
2878 corrupt_out:
2879 	return XFS_ERROR(EFSCORRUPTED);
2880 }
2881 
2882 /*
2883  * Return a pointer to the extent record at file index idx.
2884  */
2885 xfs_bmbt_rec_host_t *
2886 xfs_iext_get_ext(
2887 	xfs_ifork_t	*ifp,		/* inode fork pointer */
2888 	xfs_extnum_t	idx)		/* index of target extent */
2889 {
2890 	ASSERT(idx >= 0);
2891 	ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
2892 
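	/*
	 * Extents live in one of three layouts: an indirection array of
	 * page-sized buffers (XFS_IFEXTIREC), a single direct buffer, or
	 * the small inline buffer. Sketch, assuming XFS_IEXT_BUFSZ is
	 * 4096 bytes: each irec page holds 4096 / 16 = 256 records, so
	 * with a full first page idx = 300 maps to erp_idx 1 and
	 * page_idx 44.
	 */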
2893 	if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) {
2894 		return ifp->if_u1.if_ext_irec->er_extbuf;
2895 	} else if (ifp->if_flags & XFS_IFEXTIREC) {
2896 		xfs_ext_irec_t	*erp;		/* irec pointer */
2897 		int		erp_idx = 0;	/* irec index */
2898 		xfs_extnum_t	page_idx = idx;	/* ext index in target list */
2899 
2900 		erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
2901 		return &erp->er_extbuf[page_idx];
2902 	} else if (ifp->if_bytes) {
2903 		return &ifp->if_u1.if_extents[idx];
2904 	} else {
2905 		return NULL;
2906 	}
2907 }
2908 
2909 /*
2910  * Insert new item(s) into the extent records for incore inode
2911  * fork 'ifp'.  'count' new items are inserted at index 'idx'.
2912  */
2913 void
2914 xfs_iext_insert(
2915 	xfs_inode_t	*ip,		/* incore inode pointer */
2916 	xfs_extnum_t	idx,		/* starting index of new items */
2917 	xfs_extnum_t	count,		/* number of inserted items */
2918 	xfs_bmbt_irec_t	*new,		/* items to insert */
2919 	int		state)		/* type of extent conversion */
2920 {
2921 	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
2922 	xfs_extnum_t	i;		/* extent record index */
2923 
2924 	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
2925 
2926 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
2927 	xfs_iext_add(ifp, idx, count);
2928 	for (i = idx; i < idx + count; i++, new++)
2929 		xfs_bmbt_set_all(xfs_iext_get_ext(ifp, i), new);
2930 }
2931 
2932 /*
2933  * This is called when the amount of space required for incore file
2934  * extents needs to be increased. The ext_diff parameter stores the
2935  * number of new extents being added and the idx parameter contains
2936  * the extent index where the new extents will be added. If the new
2937  * extents are being appended, then we just need to (re)allocate and
2938  * initialize the space. Otherwise, if the new extents are being
2939  * inserted into the middle of the existing entries, a bit more work
2940  * is required to make room for the new extents to be inserted. The
2941  * caller is responsible for filling in the new extent entries upon
2942  * return.
2943  */
2944 void
2945 xfs_iext_add(
2946 	xfs_ifork_t	*ifp,		/* inode fork pointer */
2947 	xfs_extnum_t	idx,		/* index to begin adding exts */
2948 	int		ext_diff)	/* number of extents to add */
2949 {
2950 	int		byte_diff;	/* new bytes being added */
2951 	int		new_size;	/* size of extents after adding */
2952 	xfs_extnum_t	nextents;	/* number of extents in file */
2953 
2954 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2955 	ASSERT((idx >= 0) && (idx <= nextents));
2956 	byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t);
2957 	new_size = ifp->if_bytes + byte_diff;
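	/*
	 * e.g. each xfs_bmbt_rec_t is two 64 bit words (16 bytes), so
	 * adding 3 extents grows the fork by byte_diff = 48 bytes.
	 */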
2958 	/*
2959 	 * If the new number of extents (nextents + ext_diff)
2960 	 * fits inside the inode, then continue to use the inline
2961 	 * extent buffer.
2962 	 */
2963 	if (nextents + ext_diff <= XFS_INLINE_EXTS) {
2964 		if (idx < nextents) {
2965 			memmove(&ifp->if_u2.if_inline_ext[idx + ext_diff],
2966 				&ifp->if_u2.if_inline_ext[idx],
2967 				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
2968 			memset(&ifp->if_u2.if_inline_ext[idx], 0, byte_diff);
2969 		}
2970 		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2971 		ifp->if_real_bytes = 0;
2972 	}
2973 	/*
2974 	 * Otherwise use a linear (direct) extent list.
2975 	 * If the extents are currently inside the inode,
2976 	 * xfs_iext_realloc_direct will switch us from
2977 	 * inline to direct extent allocation mode.
2978 	 */
2979 	else if (nextents + ext_diff <= XFS_LINEAR_EXTS) {
2980 		xfs_iext_realloc_direct(ifp, new_size);
2981 		if (idx < nextents) {
2982 			memmove(&ifp->if_u1.if_extents[idx + ext_diff],
2983 				&ifp->if_u1.if_extents[idx],
2984 				(nextents - idx) * sizeof(xfs_bmbt_rec_t));
2985 			memset(&ifp->if_u1.if_extents[idx], 0, byte_diff);
2986 		}
2987 	}
2988 	/* Indirection array */
2989 	else {
2990 		xfs_ext_irec_t	*erp;
2991 		int		erp_idx = 0;
2992 		int		page_idx = idx;
2993 
2994 		ASSERT(nextents + ext_diff > XFS_LINEAR_EXTS);
2995 		if (ifp->if_flags & XFS_IFEXTIREC) {
2996 			erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 1);
2997 		} else {
2998 			xfs_iext_irec_init(ifp);
2999 			ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3000 			erp = ifp->if_u1.if_ext_irec;
3001 		}
3002 		/* Extents fit in target extent page */
3003 		if (erp && erp->er_extcount + ext_diff <= XFS_LINEAR_EXTS) {
3004 			if (page_idx < erp->er_extcount) {
3005 				memmove(&erp->er_extbuf[page_idx + ext_diff],
3006 					&erp->er_extbuf[page_idx],
3007 					(erp->er_extcount - page_idx) *
3008 					sizeof(xfs_bmbt_rec_t));
3009 				memset(&erp->er_extbuf[page_idx], 0, byte_diff);
3010 			}
3011 			erp->er_extcount += ext_diff;
3012 			xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3013 		}
3014 		/* Insert a new extent page */
3015 		else if (erp) {
3016 			xfs_iext_add_indirect_multi(ifp,
3017 				erp_idx, page_idx, ext_diff);
3018 		}
3019 		/*
3020 		 * If extent(s) are being appended to the last page in
3021 		 * the indirection array and the new extent(s) don't fit
3022 		 * in the page, then erp is NULL and erp_idx is set to
3023 		 * the next index needed in the indirection array.
3024 		 */
3025 		else {
3026 			int	count = ext_diff;
3027 
3028 			while (count) {
3029 				erp = xfs_iext_irec_new(ifp, erp_idx);
3030 				erp->er_extcount = count;
3031 				count -= MIN(count, (int)XFS_LINEAR_EXTS);
3032 				if (count) {
3033 					erp_idx++;
3034 				}
3035 			}
3036 		}
3037 	}
3038 	ifp->if_bytes = new_size;
3039 }
3040 
3041 /*
3042  * This is called when incore extents are being added to the indirection
3043  * array and the new extents do not fit in the target extent list. The
3044  * erp_idx parameter contains the irec index for the target extent list
3045  * in the indirection array, and the idx parameter contains the extent
3046  * index within the list. The number of extents being added is stored
3047  * in the count parameter.
3048  *
3049  *    |-------|   |-------|
3050  *    |       |   |       |    idx - number of extents before idx
3051  *    |  idx  |   | count |
3052  *    |       |   |       |    count - number of extents being inserted at idx
3053  *    |-------|   |-------|
3054  *    | count |   | nex2  |    nex2 - number of extents after idx + count
3055  *    |-------|   |-------|
3056  */
3057 void
3058 xfs_iext_add_indirect_multi(
3059 	xfs_ifork_t	*ifp,			/* inode fork pointer */
3060 	int		erp_idx,		/* target extent irec index */
3061 	xfs_extnum_t	idx,			/* index within target list */
3062 	int		count)			/* new extents being added */
3063 {
3064 	int		byte_diff;		/* new bytes being added */
3065 	xfs_ext_irec_t	*erp;			/* pointer to irec entry */
3066 	xfs_extnum_t	ext_diff;		/* number of extents to add */
3067 	xfs_extnum_t	ext_cnt;		/* new extents still needed */
3068 	xfs_extnum_t	nex2;			/* extents after idx + count */
3069 	xfs_bmbt_rec_t	*nex2_ep = NULL;	/* temp list for nex2 extents */
3070 	int		nlists;			/* number of irec's (lists) */
3071 
3072 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3073 	erp = &ifp->if_u1.if_ext_irec[erp_idx];
3074 	nex2 = erp->er_extcount - idx;
3075 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3076 
3077 	/*
3078 	 * Save second part of target extent list
3079 	 * (all extents past idx + count).
	 */
3080 	if (nex2) {
3081 		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3082 		nex2_ep = (xfs_bmbt_rec_t *) kmem_alloc(byte_diff, KM_NOFS);
3083 		memmove(nex2_ep, &erp->er_extbuf[idx], byte_diff);
3084 		erp->er_extcount -= nex2;
3085 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -nex2);
3086 		memset(&erp->er_extbuf[idx], 0, byte_diff);
3087 	}
3088 
3089 	/*
3090 	 * Add the new extents to the end of the target
3091 	 * list, then allocate new irec record(s) and
3092 	 * extent buffer(s) as needed to store the rest
3093 	 * of the new extents.
3094 	 */
3095 	ext_cnt = count;
3096 	ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS - erp->er_extcount);
3097 	if (ext_diff) {
3098 		erp->er_extcount += ext_diff;
3099 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3100 		ext_cnt -= ext_diff;
3101 	}
3102 	while (ext_cnt) {
3103 		erp_idx++;
3104 		erp = xfs_iext_irec_new(ifp, erp_idx);
3105 		ext_diff = MIN(ext_cnt, (int)XFS_LINEAR_EXTS);
3106 		erp->er_extcount = ext_diff;
3107 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, ext_diff);
3108 		ext_cnt -= ext_diff;
3109 	}
3110 
3111 	/* Add nex2 extents back to indirection array */
3112 	if (nex2) {
3113 		xfs_extnum_t	ext_avail;
3114 		int		i;
3115 
3116 		byte_diff = nex2 * sizeof(xfs_bmbt_rec_t);
3117 		ext_avail = XFS_LINEAR_EXTS - erp->er_extcount;
3118 		i = 0;
3119 		/*
3120 		 * If nex2 extents fit in the current page, append
3121 		 * nex2_ep after the new extents.
3122 		 */
3123 		if (nex2 <= ext_avail) {
3124 			i = erp->er_extcount;
3125 		}
3126 		/*
3127 		 * Otherwise, check if space is available in the
3128 		 * next page.
3129 		 */
3130 		else if ((erp_idx < nlists - 1) &&
3131 			 (nex2 <= (ext_avail = XFS_LINEAR_EXTS -
3132 			  ifp->if_u1.if_ext_irec[erp_idx+1].er_extcount))) {
3133 			erp_idx++;
3134 			erp++;
3135 			/* Create a hole for nex2 extents */
3136 			memmove(&erp->er_extbuf[nex2], erp->er_extbuf,
3137 				erp->er_extcount * sizeof(xfs_bmbt_rec_t));
3138 		}
3139 		/*
3140 		 * Final choice, create a new extent page for
3141 		 * nex2 extents.
3142 		 */
3143 		else {
3144 			erp_idx++;
3145 			erp = xfs_iext_irec_new(ifp, erp_idx);
3146 		}
3147 		memmove(&erp->er_extbuf[i], nex2_ep, byte_diff);
3148 		kmem_free(nex2_ep);
3149 		erp->er_extcount += nex2;
3150 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, nex2);
3151 	}
3152 }
3153 
3154 /*
3155  * This is called when the amount of space required for incore file
3156  * extents needs to be decreased. The ext_diff parameter stores the
3157  * number of extents to be removed and the idx parameter contains
3158  * the extent index where the extents will be removed from.
3159  *
3160  * If the amount of space needed has decreased below the linear
3161  * limit, XFS_IEXT_BUFSZ, then switch to using the contiguous
3162  * extent array.  Otherwise, use kmem_realloc() to adjust the
3163  * size to what is needed.
3164  */
3165 void
3166 xfs_iext_remove(
3167 	xfs_inode_t	*ip,		/* incore inode pointer */
3168 	xfs_extnum_t	idx,		/* index to begin removing exts */
3169 	int		ext_diff,	/* number of extents to remove */
3170 	int		state)		/* type of extent conversion */
3171 {
3172 	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
3173 	xfs_extnum_t	nextents;	/* number of extents in file */
3174 	int		new_size;	/* size of extents after removal */
3175 
3176 	trace_xfs_iext_remove(ip, idx, state, _RET_IP_);
3177 
3178 	ASSERT(ext_diff > 0);
3179 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3180 	new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t);
3181 
3182 	if (new_size == 0) {
3183 		xfs_iext_destroy(ifp);
3184 	} else if (ifp->if_flags & XFS_IFEXTIREC) {
3185 		xfs_iext_remove_indirect(ifp, idx, ext_diff);
3186 	} else if (ifp->if_real_bytes) {
3187 		xfs_iext_remove_direct(ifp, idx, ext_diff);
3188 	} else {
3189 		xfs_iext_remove_inline(ifp, idx, ext_diff);
3190 	}
3191 	ifp->if_bytes = new_size;
3192 }
3193 
3194 /*
3195  * This removes ext_diff extents from the inline buffer, beginning
3196  * at extent index idx.
3197  */
3198 void
3199 xfs_iext_remove_inline(
3200 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3201 	xfs_extnum_t	idx,		/* index to begin removing exts */
3202 	int		ext_diff)	/* number of extents to remove */
3203 {
3204 	int		nextents;	/* number of extents in file */
3205 
3206 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3207 	ASSERT(idx < XFS_INLINE_EXTS);
3208 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3209 	ASSERT(((nextents - ext_diff) > 0) &&
3210 		(nextents - ext_diff) < XFS_INLINE_EXTS);
3211 
3212 	if (idx + ext_diff < nextents) {
3213 		memmove(&ifp->if_u2.if_inline_ext[idx],
3214 			&ifp->if_u2.if_inline_ext[idx + ext_diff],
3215 			(nextents - (idx + ext_diff)) *
3216 			 sizeof(xfs_bmbt_rec_t));
3217 		memset(&ifp->if_u2.if_inline_ext[nextents - ext_diff],
3218 			0, ext_diff * sizeof(xfs_bmbt_rec_t));
3219 	} else {
3220 		memset(&ifp->if_u2.if_inline_ext[idx], 0,
3221 			ext_diff * sizeof(xfs_bmbt_rec_t));
3222 	}
3223 }
3224 
3225 /*
3226  * This removes ext_diff extents from a linear (direct) extent list,
3227  * beginning at extent index idx. If the extents are being removed
3228  * from the end of the list (i.e. truncate) then we just need to re-
3229  * allocate the list to remove the extra space. Otherwise, if the
3230  * extents are being removed from the middle of the existing extent
3231  * entries, then we first need to move the extent records beginning
3232  * at idx + ext_diff up in the list to overwrite the records being
3233  * removed, then remove the extra space via kmem_realloc.
3234  */
3235 void
3236 xfs_iext_remove_direct(
3237 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3238 	xfs_extnum_t	idx,		/* index to begin removing exts */
3239 	int		ext_diff)	/* number of extents to remove */
3240 {
3241 	xfs_extnum_t	nextents;	/* number of extents in file */
3242 	int		new_size;	/* size of extents after removal */
3243 
3244 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3245 	new_size = ifp->if_bytes -
3246 		(ext_diff * sizeof(xfs_bmbt_rec_t));
3247 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3248 
3249 	if (new_size == 0) {
3250 		xfs_iext_destroy(ifp);
3251 		return;
3252 	}
3253 	/* Move extents up in the list (if needed) */
3254 	if (idx + ext_diff < nextents) {
3255 		memmove(&ifp->if_u1.if_extents[idx],
3256 			&ifp->if_u1.if_extents[idx + ext_diff],
3257 			(nextents - (idx + ext_diff)) *
3258 			 sizeof(xfs_bmbt_rec_t));
3259 	}
3260 	memset(&ifp->if_u1.if_extents[nextents - ext_diff],
3261 		0, ext_diff * sizeof(xfs_bmbt_rec_t));
3262 	/*
3263 	 * Reallocate the direct extent list. If the extents
3264 	 * will fit inside the inode then xfs_iext_realloc_direct
3265 	 * will switch from direct to inline extent allocation
3266 	 * mode for us.
3267 	 */
3268 	xfs_iext_realloc_direct(ifp, new_size);
3269 	ifp->if_bytes = new_size;
3270 }
3271 
3272 /*
3273  * This is called when incore extents are being removed from the
3274  * indirection array and the extents being removed span multiple extent
3275  * buffers. The idx parameter contains the file extent index where we
3276  * want to begin removing extents, and the count parameter contains
3277  * how many extents need to be removed.
3278  *
3279  *    |-------|   |-------|
3280  *    | nex1  |   |       |    nex1 - number of extents before idx
3281  *    |-------|   | count |
3282  *    |       |   |       |    count - number of extents being removed at idx
3283  *    | count |   |-------|
3284  *    |       |   | nex2  |    nex2 - number of extents after idx + count
3285  *    |-------|   |-------|
3286  */
3287 void
3288 xfs_iext_remove_indirect(
3289 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3290 	xfs_extnum_t	idx,		/* index to begin removing extents */
3291 	int		count)		/* number of extents to remove */
3292 {
3293 	xfs_ext_irec_t	*erp;		/* indirection array pointer */
3294 	int		erp_idx = 0;	/* indirection array index */
3295 	xfs_extnum_t	ext_cnt;	/* extents left to remove */
3296 	xfs_extnum_t	ext_diff;	/* extents to remove in current list */
3297 	xfs_extnum_t	nex1;		/* number of extents before idx */
3298 	xfs_extnum_t	nex2;		/* extents after idx + count */
3299 	int		page_idx = idx;	/* index in target extent list */
3300 
3301 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3302 	erp = xfs_iext_idx_to_irec(ifp, &page_idx, &erp_idx, 0);
3303 	ASSERT(erp != NULL);
3304 	nex1 = page_idx;
3305 	ext_cnt = count;
3306 	while (ext_cnt) {
3307 		nex2 = MAX((erp->er_extcount - (nex1 + ext_cnt)), 0);
3308 		ext_diff = MIN(ext_cnt, (erp->er_extcount - nex1));
3309 		/*
3310 		 * Check for deletion of entire list;
3311 		 * xfs_iext_irec_remove() updates extent offsets.
3312 		 */
3313 		if (ext_diff == erp->er_extcount) {
3314 			xfs_iext_irec_remove(ifp, erp_idx);
3315 			ext_cnt -= ext_diff;
3316 			nex1 = 0;
3317 			if (ext_cnt) {
3318 				ASSERT(erp_idx < ifp->if_real_bytes /
3319 					XFS_IEXT_BUFSZ);
3320 				erp = &ifp->if_u1.if_ext_irec[erp_idx];
3321 				nex1 = 0;
3322 				continue;
3323 			} else {
3324 				break;
3325 			}
3326 		}
3327 		/* Move extents up (if needed) */
3328 		if (nex2) {
3329 			memmove(&erp->er_extbuf[nex1],
3330 				&erp->er_extbuf[nex1 + ext_diff],
3331 				nex2 * sizeof(xfs_bmbt_rec_t));
3332 		}
3333 		/* Zero out rest of page */
3334 		memset(&erp->er_extbuf[nex1 + nex2], 0, (XFS_IEXT_BUFSZ -
3335 			((nex1 + nex2) * sizeof(xfs_bmbt_rec_t))));
3336 		/* Update remaining counters */
3337 		erp->er_extcount -= ext_diff;
3338 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1, -ext_diff);
3339 		ext_cnt -= ext_diff;
3340 		nex1 = 0;
3341 		erp_idx++;
3342 		erp++;
3343 	}
3344 	ifp->if_bytes -= count * sizeof(xfs_bmbt_rec_t);
3345 	xfs_iext_irec_compact(ifp);
3346 }
3347 
3348 /*
3349  * Create, destroy, or resize a linear (direct) block of extents.
3350  */
3351 void
3352 xfs_iext_realloc_direct(
3353 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3354 	int		new_size)	/* new size of extents */
3355 {
3356 	int		rnew_size;	/* real new size of extents */
3357 
3358 	rnew_size = new_size;
3359 
3360 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC) ||
3361 		((new_size >= 0) && (new_size <= XFS_IEXT_BUFSZ) &&
3362 		 (new_size != ifp->if_real_bytes)));
3363 
3364 	/* Free extent records */
3365 	if (new_size == 0) {
3366 		xfs_iext_destroy(ifp);
3367 	}
3368 	/* Resize direct extent list and zero any new bytes */
3369 	else if (ifp->if_real_bytes) {
3370 		/* Check if extents will fit inside the inode */
3371 		if (new_size <= XFS_INLINE_EXTS * sizeof(xfs_bmbt_rec_t)) {
3372 			xfs_iext_direct_to_inline(ifp, new_size /
3373 				(uint)sizeof(xfs_bmbt_rec_t));
3374 			ifp->if_bytes = new_size;
3375 			return;
3376 		}
3377 		if (!is_power_of_2(new_size)) {
3378 			rnew_size = roundup_pow_of_two(new_size);
3379 		}
3380 		if (rnew_size != ifp->if_real_bytes) {
3381 			ifp->if_u1.if_extents =
3382 				kmem_realloc(ifp->if_u1.if_extents,
3383 						rnew_size,
3384 						ifp->if_real_bytes, KM_NOFS);
3385 		}
3386 		if (rnew_size > ifp->if_real_bytes) {
3387 			memset(&ifp->if_u1.if_extents[ifp->if_bytes /
3388 				(uint)sizeof(xfs_bmbt_rec_t)], 0,
3389 				rnew_size - ifp->if_real_bytes);
3390 		}
3391 	}
3392 	/*
3393 	 * Switch from the inline extent buffer to a direct
3394 	 * extent list. Be sure to include the inline extent
3395 	 * bytes in new_size.
3396 	 */
3397 	else {
3398 		new_size += ifp->if_bytes;
3399 		if (!is_power_of_2(new_size)) {
3400 			rnew_size = roundup_pow_of_two(new_size);
3401 		}
3402 		xfs_iext_inline_to_direct(ifp, rnew_size);
3403 	}
3404 	ifp->if_real_bytes = rnew_size;
3405 	ifp->if_bytes = new_size;
3406 }
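/*
 * Editorial illustration, not in the original source: the sizing
 * policy above in isolation.  Once a request no longer fits the inline
 * buffer, the direct list is always allocated in power-of-two byte
 * sizes, so repeated small extensions tend to reuse one allocation.
 * demo_round_up_pow2() is a userspace stand-in for the kernel's
 * roundup_pow_of_two() from <linux/log2.h>; the worked values below
 * assume 16-byte records purely for illustration.
 */
#if 0	/* illustrative sketch; not part of the build */
static unsigned int
demo_round_up_pow2(unsigned int x)
{
	unsigned int	p = 1;

	while (p < x)
		p <<= 1;
	return p;
}

/*
 * E.g. growing a direct list one 16-byte record at a time:
 *   48 bytes -> 64-byte allocation
 *   64 bytes -> 64-byte allocation (unchanged, no realloc)
 *   80 bytes -> 128-byte allocation
 */
#endif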
3407 
3408 /*
3409  * Switch from linear (direct) extent records to inline buffer.
3410  */
3411 void
3412 xfs_iext_direct_to_inline(
3413 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3414 	xfs_extnum_t	nextents)	/* number of extents in file */
3415 {
3416 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
3417 	ASSERT(nextents <= XFS_INLINE_EXTS);
3418 	/*
3419 	 * The inline buffer was zeroed when we switched
3420 	 * from inline to direct extent allocation mode,
3421 	 * so we don't need to clear it here.
3422 	 */
3423 	memcpy(ifp->if_u2.if_inline_ext, ifp->if_u1.if_extents,
3424 		nextents * sizeof(xfs_bmbt_rec_t));
3425 	kmem_free(ifp->if_u1.if_extents);
3426 	ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
3427 	ifp->if_real_bytes = 0;
3428 }
3429 
3430 /*
3431  * Switch from inline buffer to linear (direct) extent records.
3432  * new_size should already be rounded up to the next power of 2
3433  * by the caller (when appropriate), so use new_size as it is.
3434  * However, since new_size may be rounded up, we can't update
3435  * if_bytes here. It is the caller's responsibility to update
3436  * if_bytes upon return.
3437  */
3438 void
3439 xfs_iext_inline_to_direct(
3440 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3441 	int		new_size)	/* new size of extent list, in bytes */
3442 {
3443 	ifp->if_u1.if_extents = kmem_alloc(new_size, KM_NOFS);
3444 	memset(ifp->if_u1.if_extents, 0, new_size);
3445 	if (ifp->if_bytes) {
3446 		memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
3447 			ifp->if_bytes);
3448 		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3449 			sizeof(xfs_bmbt_rec_t));
3450 	}
3451 	ifp->if_real_bytes = new_size;
3452 }
3453 
3454 /*
3455  * Resize an extent indirection array to new_size bytes.
3456  */
3457 STATIC void
3458 xfs_iext_realloc_indirect(
3459 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3460 	int		new_size)	/* new indirection array size */
3461 {
3462 	int		nlists;		/* number of irec's (ex lists) */
3463 	int		size;		/* current indirection array size */
3464 
3465 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3466 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3467 	size = nlists * sizeof(xfs_ext_irec_t);
3468 	ASSERT(ifp->if_real_bytes);
3469 	ASSERT((new_size >= 0) && (new_size != size));
3470 	if (new_size == 0) {
3471 		xfs_iext_destroy(ifp);
3472 	} else {
3473 		ifp->if_u1.if_ext_irec = (xfs_ext_irec_t *)
3474 			kmem_realloc(ifp->if_u1.if_ext_irec,
3475 				new_size, size, KM_NOFS);
3476 	}
3477 }
3478 
3479 /*
3480  * Switch from indirection array to linear (direct) extent allocations.
3481  */
3482 STATIC void
3483 xfs_iext_indirect_to_direct(
3484 	xfs_ifork_t	*ifp)		/* inode fork pointer */
3485 {
3486 	xfs_bmbt_rec_host_t *ep;	/* extent record pointer */
3487 	xfs_extnum_t	nextents;	/* number of extents in file */
3488 	int		size;		/* size of file extents */
3489 
3490 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3491 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3492 	ASSERT(nextents <= XFS_LINEAR_EXTS);
3493 	size = nextents * sizeof(xfs_bmbt_rec_t);
3494 
3495 	xfs_iext_irec_compact_pages(ifp);
3496 	ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ);
3497 
3498 	ep = ifp->if_u1.if_ext_irec->er_extbuf;
3499 	kmem_free(ifp->if_u1.if_ext_irec);
3500 	ifp->if_flags &= ~XFS_IFEXTIREC;
3501 	ifp->if_u1.if_extents = ep;
3502 	ifp->if_bytes = size;
3503 	if (nextents < XFS_LINEAR_EXTS) {
3504 		xfs_iext_realloc_direct(ifp, size);
3505 	}
3506 }
3507 
3508 /*
3509  * Free incore file extents.
3510  */
3511 void
3512 xfs_iext_destroy(
3513 	xfs_ifork_t	*ifp)		/* inode fork pointer */
3514 {
3515 	if (ifp->if_flags & XFS_IFEXTIREC) {
3516 		int	erp_idx;
3517 		int	nlists;
3518 
3519 		nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3520 		for (erp_idx = nlists - 1; erp_idx >= 0 ; erp_idx--) {
3521 			xfs_iext_irec_remove(ifp, erp_idx);
3522 		}
3523 		ifp->if_flags &= ~XFS_IFEXTIREC;
3524 	} else if (ifp->if_real_bytes) {
3525 		kmem_free(ifp->if_u1.if_extents);
3526 	} else if (ifp->if_bytes) {
3527 		memset(ifp->if_u2.if_inline_ext, 0, XFS_INLINE_EXTS *
3528 			sizeof(xfs_bmbt_rec_t));
3529 	}
3530 	ifp->if_u1.if_extents = NULL;
3531 	ifp->if_real_bytes = 0;
3532 	ifp->if_bytes = 0;
3533 }
3534 
3535 /*
3536  * Return a pointer to the extent record containing filesystem block
3537  * bno or, if no extent contains bno, to the first extent record after
 * it (NULL when bno lies beyond the last extent).  The corresponding
 * index is stored in *idxp.
 */
3538 xfs_bmbt_rec_host_t *			/* pointer to found extent record */
3539 xfs_iext_bno_to_ext(
3540 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3541 	xfs_fileoff_t	bno,		/* block number to search for */
3542 	xfs_extnum_t	*idxp)		/* index of target extent */
3543 {
3544 	xfs_bmbt_rec_host_t *base;	/* pointer to first extent */
3545 	xfs_filblks_t	blockcount = 0;	/* number of blocks in extent */
3546 	xfs_bmbt_rec_host_t *ep = NULL;	/* pointer to target extent */
3547 	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
3548 	int		high;		/* upper boundary in search */
3549 	xfs_extnum_t	idx = 0;	/* index of target extent */
3550 	int		low;		/* lower boundary in search */
3551 	xfs_extnum_t	nextents;	/* number of file extents */
3552 	xfs_fileoff_t	startoff = 0;	/* start offset of extent */
3553 
3554 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3555 	if (nextents == 0) {
3556 		*idxp = 0;
3557 		return NULL;
3558 	}
3559 	low = 0;
3560 	if (ifp->if_flags & XFS_IFEXTIREC) {
3561 		/* Find target extent list */
3562 		int	erp_idx = 0;
3563 		erp = xfs_iext_bno_to_irec(ifp, bno, &erp_idx);
3564 		base = erp->er_extbuf;
3565 		high = erp->er_extcount - 1;
3566 	} else {
3567 		base = ifp->if_u1.if_extents;
3568 		high = nextents - 1;
3569 	}
3570 	/* Binary search extent records */
3571 	while (low <= high) {
3572 		idx = (low + high) >> 1;
3573 		ep = base + idx;
3574 		startoff = xfs_bmbt_get_startoff(ep);
3575 		blockcount = xfs_bmbt_get_blockcount(ep);
3576 		if (bno < startoff) {
3577 			high = idx - 1;
3578 		} else if (bno >= startoff + blockcount) {
3579 			low = idx + 1;
3580 		} else {
3581 			/* Convert back to file-based extent index */
3582 			if (ifp->if_flags & XFS_IFEXTIREC) {
3583 				idx += erp->er_extoff;
3584 			}
3585 			*idxp = idx;
3586 			return ep;
3587 		}
3588 	}
3589 	/* Convert back to file-based extent index */
3590 	if (ifp->if_flags & XFS_IFEXTIREC) {
3591 		idx += erp->er_extoff;
3592 	}
3593 	if (bno >= startoff + blockcount) {
3594 		if (++idx == nextents) {
3595 			ep = NULL;
3596 		} else {
3597 			ep = xfs_iext_get_ext(ifp, idx);
3598 		}
3599 	}
3600 	*idxp = idx;
3601 	return ep;
3602 }
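/*
 * Editorial illustration, not in the original source: the search above
 * reduced to a flat array of (startoff, blockcount) pairs.  The kernel
 * version additionally selects the right indirection buffer first and
 * re-biases the returned index by er_extoff; demo_* names are
 * hypothetical.
 */
#if 0	/* illustrative sketch; not part of the build */
struct demo_ext {
	unsigned long long	startoff;
	unsigned long long	blockcount;
};

/*
 * Return the index of the extent containing bno, or of the first
 * extent beyond it; return -1 when bno lies past the last extent.
 */
static int
demo_bno_to_ext(const struct demo_ext *ext, int nextents,
		unsigned long long bno)
{
	int	low = 0, high = nextents - 1, idx = 0;

	while (low <= high) {
		idx = (low + high) >> 1;
		if (bno < ext[idx].startoff)
			high = idx - 1;
		else if (bno >= ext[idx].startoff + ext[idx].blockcount)
			low = idx + 1;
		else
			return idx;	/* bno falls inside ext[idx] */
	}
	/* Missed: step past idx if bno lies beyond the probed extent. */
	if (nextents && bno >= ext[idx].startoff + ext[idx].blockcount)
		idx++;
	return idx == nextents ? -1 : idx;
}
#endif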
3603 
3604 /*
3605  * Return a pointer to the indirection array entry containing the
3606  * extent record for filesystem block bno. Store the index of the
3607  * target irec in *erp_idxp.
3608  */
3609 xfs_ext_irec_t *			/* pointer to found extent record */
3610 xfs_iext_bno_to_irec(
3611 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3612 	xfs_fileoff_t	bno,		/* block number to search for */
3613 	int		*erp_idxp)	/* irec index of target ext list */
3614 {
3615 	xfs_ext_irec_t	*erp = NULL;	/* indirection array pointer */
3616 	xfs_ext_irec_t	*erp_next;	/* next indirection array entry */
3617 	int		erp_idx;	/* indirection array index */
3618 	int		nlists;		/* number of extent irec's (lists) */
3619 	int		high;		/* binary search upper limit */
3620 	int		low;		/* binary search lower limit */
3621 
3622 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3623 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3624 	erp_idx = 0;
3625 	low = 0;
3626 	high = nlists - 1;
3627 	while (low <= high) {
3628 		erp_idx = (low + high) >> 1;
3629 		erp = &ifp->if_u1.if_ext_irec[erp_idx];
3630 		erp_next = erp_idx < nlists - 1 ? erp + 1 : NULL;
3631 		if (bno < xfs_bmbt_get_startoff(erp->er_extbuf)) {
3632 			high = erp_idx - 1;
3633 		} else if (erp_next && bno >=
3634 			   xfs_bmbt_get_startoff(erp_next->er_extbuf)) {
3635 			low = erp_idx + 1;
3636 		} else {
3637 			break;
3638 		}
3639 	}
3640 	*erp_idxp = erp_idx;
3641 	return erp;
3642 }
3643 
3644 /*
3645  * Return a pointer to the indirection array entry containing the
3646  * extent record at file extent index *idxp. Store the index of the
3647  * target irec in *erp_idxp and store the page index of the target
3648  * extent record in *idxp.
3649  */
3650 xfs_ext_irec_t *
3651 xfs_iext_idx_to_irec(
3652 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3653 	xfs_extnum_t	*idxp,		/* extent index (file -> page) */
3654 	int		*erp_idxp,	/* pointer to target irec */
3655 	int		realloc)	/* new bytes were just added */
3656 {
3657 	xfs_ext_irec_t	*prev;		/* pointer to previous irec */
3658 	xfs_ext_irec_t	*erp = NULL;	/* pointer to current irec */
3659 	int		erp_idx;	/* indirection array index */
3660 	int		nlists;		/* number of irec's (ex lists) */
3661 	int		high;		/* binary search upper limit */
3662 	int		low;		/* binary search lower limit */
3663 	xfs_extnum_t	page_idx = *idxp; /* extent index in target list */
3664 
3665 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3666 	ASSERT(page_idx >= 0);
3667 	ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t));
3668 	ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc);
3669 
3670 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3671 	erp_idx = 0;
3672 	low = 0;
3673 	high = nlists - 1;
3674 
3675 	/* Binary search extent irec's */
3676 	while (low <= high) {
3677 		erp_idx = (low + high) >> 1;
3678 		erp = &ifp->if_u1.if_ext_irec[erp_idx];
3679 		prev = erp_idx > 0 ? erp - 1 : NULL;
3680 		if (page_idx < erp->er_extoff || (page_idx == erp->er_extoff &&
3681 		     realloc && prev && prev->er_extcount < XFS_LINEAR_EXTS)) {
3682 			high = erp_idx - 1;
3683 		} else if (page_idx > erp->er_extoff + erp->er_extcount ||
3684 			   (page_idx == erp->er_extoff + erp->er_extcount &&
3685 			    !realloc)) {
3686 			low = erp_idx + 1;
3687 		} else if (page_idx == erp->er_extoff + erp->er_extcount &&
3688 			   erp->er_extcount == XFS_LINEAR_EXTS) {
3689 			ASSERT(realloc);
3690 			page_idx = 0;
3691 			erp_idx++;
3692 			erp = erp_idx < nlists ? erp + 1 : NULL;
3693 			break;
3694 		} else {
3695 			page_idx -= erp->er_extoff;
3696 			break;
3697 		}
3698 	}
3699 	*idxp = page_idx;
3700 	*erp_idxp = erp_idx;
3701 	return erp;
3702 }
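/*
 * Editorial illustration, not in the original source: the index
 * conversion above without the realloc edge cases.  er_extoff is the
 * running total of extents in all earlier lists, so a file-wide index
 * splits into (list, index-within-list) by finding the last list whose
 * er_extoff is <= the index.  A linear scan gives the same answer as
 * the binary search; demo_* names are hypothetical.
 */
#if 0	/* illustrative sketch; not part of the build */
static void
demo_idx_to_list(const int *extoff, int nlists, int file_idx,
		 int *list_idx, int *page_idx)
{
	int	i;

	for (i = nlists - 1; i > 0; i--)
		if (file_idx >= extoff[i])
			break;
	*list_idx = i;
	*page_idx = file_idx - extoff[i];
}
#endif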
3703 
3704 /*
3705  * Allocate and initialize an indirection array once the space needed
3706  * for incore extents increases above XFS_IEXT_BUFSZ.
3707  */
3708 void
3709 xfs_iext_irec_init(
3710 	xfs_ifork_t	*ifp)		/* inode fork pointer */
3711 {
3712 	xfs_ext_irec_t	*erp;		/* indirection array pointer */
3713 	xfs_extnum_t	nextents;	/* number of extents in file */
3714 
3715 	ASSERT(!(ifp->if_flags & XFS_IFEXTIREC));
3716 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3717 	ASSERT(nextents <= XFS_LINEAR_EXTS);
3718 
3719 	erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS);
3720 
3721 	if (nextents == 0) {
3722 		ifp->if_u1.if_extents = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3723 	} else if (!ifp->if_real_bytes) {
3724 		xfs_iext_inline_to_direct(ifp, XFS_IEXT_BUFSZ);
3725 	} else if (ifp->if_real_bytes < XFS_IEXT_BUFSZ) {
3726 		xfs_iext_realloc_direct(ifp, XFS_IEXT_BUFSZ);
3727 	}
3728 	erp->er_extbuf = ifp->if_u1.if_extents;
3729 	erp->er_extcount = nextents;
3730 	erp->er_extoff = 0;
3731 
3732 	ifp->if_flags |= XFS_IFEXTIREC;
3733 	ifp->if_real_bytes = XFS_IEXT_BUFSZ;
3734 	ifp->if_bytes = nextents * sizeof(xfs_bmbt_rec_t);
3735 	ifp->if_u1.if_ext_irec = erp;
3736 
3739 
3740 /*
3741  * Allocate and initialize a new entry in the indirection array.
3742  */
3743 xfs_ext_irec_t *
3744 xfs_iext_irec_new(
3745 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3746 	int		erp_idx)	/* index for new irec */
3747 {
3748 	xfs_ext_irec_t	*erp;		/* indirection array pointer */
3749 	int		i;		/* loop counter */
3750 	int		nlists;		/* number of irec's (ex lists) */
3751 
3752 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3753 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3754 
3755 	/* Resize indirection array */
3756 	xfs_iext_realloc_indirect(ifp, ++nlists *
3757 				  sizeof(xfs_ext_irec_t));
3758 	/*
3759 	 * Move records down in the array so the
3760 	 * new page can use erp_idx.
3761 	 */
3762 	erp = ifp->if_u1.if_ext_irec;
3763 	for (i = nlists - 1; i > erp_idx; i--) {
3764 		memmove(&erp[i], &erp[i-1], sizeof(xfs_ext_irec_t));
3765 	}
3766 	ASSERT(i == erp_idx);
3767 
3768 	/* Initialize new extent record */
3769 	erp = ifp->if_u1.if_ext_irec;
3770 	erp[erp_idx].er_extbuf = kmem_alloc(XFS_IEXT_BUFSZ, KM_NOFS);
3771 	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3772 	memset(erp[erp_idx].er_extbuf, 0, XFS_IEXT_BUFSZ);
3773 	erp[erp_idx].er_extcount = 0;
3774 	erp[erp_idx].er_extoff = erp_idx > 0 ?
3775 		erp[erp_idx-1].er_extoff + erp[erp_idx-1].er_extcount : 0;
3776 	return &erp[erp_idx];
3777 }
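/*
 * Editorial illustration, not in the original source: the
 * insert-a-slot idiom above in isolation.  The loop above moves one
 * record at a time from the end down; a single memmove() over the
 * whole tail, sketched here on plain ints, is equivalent.
 * demo_insert_slot() is a hypothetical helper.
 */
#if 0	/* illustrative sketch; not part of the build */
#include <string.h>

static void
demo_insert_slot(int *arr, int nused, int idx, int val)
{
	/* Caller guarantees room for nused + 1 entries. */
	memmove(&arr[idx + 1], &arr[idx], (nused - idx) * sizeof(int));
	arr[idx] = val;
}
#endif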
3778 
3779 /*
3780  * Remove a record from the indirection array.
3781  */
3782 void
3783 xfs_iext_irec_remove(
3784 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3785 	int		erp_idx)	/* irec index to remove */
3786 {
3787 	xfs_ext_irec_t	*erp;		/* indirection array pointer */
3788 	int		i;		/* loop counter */
3789 	int		nlists;		/* number of irec's (ex lists) */
3790 
3791 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3792 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3793 	erp = &ifp->if_u1.if_ext_irec[erp_idx];
3794 	if (erp->er_extbuf) {
3795 		xfs_iext_irec_update_extoffs(ifp, erp_idx + 1,
3796 			-erp->er_extcount);
3797 		kmem_free(erp->er_extbuf);
3798 	}
3799 	/* Compact extent records */
3800 	erp = ifp->if_u1.if_ext_irec;
3801 	for (i = erp_idx; i < nlists - 1; i++) {
3802 		memmove(&erp[i], &erp[i+1], sizeof(xfs_ext_irec_t));
3803 	}
3804 	/*
3805 	 * Manually free the last extent record from the indirection
3806 	 * array.  A call to xfs_iext_realloc_indirect() with a size
3807 	 * of zero would result in a call to xfs_iext_destroy() which
3808 	 * would in turn call this function again, creating a nasty
3809 	 * infinite loop.
3810 	 */
3811 	if (--nlists) {
3812 		xfs_iext_realloc_indirect(ifp,
3813 			nlists * sizeof(xfs_ext_irec_t));
3814 	} else {
3815 		kmem_free(ifp->if_u1.if_ext_irec);
3816 	}
3817 	ifp->if_real_bytes = nlists * XFS_IEXT_BUFSZ;
3818 }
3819 
3820 /*
3821  * This is called to clean up large amounts of unused memory allocated
3822  * by the indirection array.  Before compacting anything though, verify
3823  * that the indirection array is still needed and switch back to the
3824  * linear extent list (or even the inline buffer) if possible.  The
3825  * compaction policy is as follows:
3826  *
3827  *    Full Compaction: Extents fit into a single page (or inline buffer)
3828  * Partial Compaction: Extents occupy less than 50% of allocated space
3829  *      No Compaction: Extents occupy at least 50% of allocated space
3830  */
3831 void
3832 xfs_iext_irec_compact(
3833 	xfs_ifork_t	*ifp)		/* inode fork pointer */
3834 {
3835 	xfs_extnum_t	nextents;	/* number of extents in file */
3836 	int		nlists;		/* number of irec's (ex lists) */
3837 
3838 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3839 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3840 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
3841 
3842 	if (nextents == 0) {
3843 		xfs_iext_destroy(ifp);
3844 	} else if (nextents <= XFS_INLINE_EXTS) {
3845 		xfs_iext_indirect_to_direct(ifp);
3846 		xfs_iext_direct_to_inline(ifp, nextents);
3847 	} else if (nextents <= XFS_LINEAR_EXTS) {
3848 		xfs_iext_indirect_to_direct(ifp);
3849 	} else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) {
3850 		xfs_iext_irec_compact_pages(ifp);
3851 	}
3852 }
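/*
 * Editorial illustration, not in the original source: the three-way
 * policy above restated as a decision helper.  DEMO_INLINE_EXTS and
 * DEMO_LINEAR_EXTS stand in for XFS_INLINE_EXTS and XFS_LINEAR_EXTS;
 * the values are assumed purely for illustration, and nextents == 0
 * (which maps to xfs_iext_destroy() in the code above) is omitted.
 */
#if 0	/* illustrative sketch; not part of the build */
#define DEMO_INLINE_EXTS	2
#define DEMO_LINEAR_EXTS	256

enum demo_compaction {
	DEMO_FULL_TO_INLINE,	/* extents fit the inline buffer */
	DEMO_FULL_TO_DIRECT,	/* extents fit a single page */
	DEMO_PARTIAL_PAGES,	/* under 50% of page space used */
	DEMO_NO_COMPACTION	/* at least 50% of page space used */
};

static enum demo_compaction
demo_compact_policy(int nextents, int nlists)
{
	if (nextents <= DEMO_INLINE_EXTS)
		return DEMO_FULL_TO_INLINE;
	if (nextents <= DEMO_LINEAR_EXTS)
		return DEMO_FULL_TO_DIRECT;
	if (nextents < (nlists * DEMO_LINEAR_EXTS) >> 1)
		return DEMO_PARTIAL_PAGES;
	return DEMO_NO_COMPACTION;
}
#endif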
3853 
3854 /*
3855  * Combine extents from neighboring extent pages.
3856  */
3857 void
3858 xfs_iext_irec_compact_pages(
3859 	xfs_ifork_t	*ifp)		/* inode fork pointer */
3860 {
3861 	xfs_ext_irec_t	*erp, *erp_next;/* pointers to irec entries */
3862 	int		erp_idx = 0;	/* indirection array index */
3863 	int		nlists;		/* number of irec's (ex lists) */
3864 
3865 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3866 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3867 	while (erp_idx < nlists - 1) {
3868 		erp = &ifp->if_u1.if_ext_irec[erp_idx];
3869 		erp_next = erp + 1;
3870 		if (erp_next->er_extcount <=
3871 		    (XFS_LINEAR_EXTS - erp->er_extcount)) {
3872 			memcpy(&erp->er_extbuf[erp->er_extcount],
3873 				erp_next->er_extbuf, erp_next->er_extcount *
3874 				sizeof(xfs_bmbt_rec_t));
3875 			erp->er_extcount += erp_next->er_extcount;
3876 			/*
3877 			 * Free page before removing extent record
3878 			 * so er_extoffs don't get modified in
3879 			 * xfs_iext_irec_remove.
3880 			 */
3881 			kmem_free(erp_next->er_extbuf);
3882 			erp_next->er_extbuf = NULL;
3883 			xfs_iext_irec_remove(ifp, erp_idx + 1);
3884 			nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3885 		} else {
3886 			erp_idx++;
3887 		}
3888 	}
3889 }
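/*
 * Editorial illustration, not in the original source: the merge test
 * above in isolation.  Two neighbouring pages are combined only when
 * every record of the second fits into the first page's free space,
 * so records are never split across pages.  demo_can_merge() is a
 * hypothetical helper; page_capacity corresponds to XFS_LINEAR_EXTS.
 */
#if 0	/* illustrative sketch; not part of the build */
static int
demo_can_merge(int count_a, int count_b, int page_capacity)
{
	return count_b <= page_capacity - count_a;
}
#endif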
3890 
3891 /*
3892  * This is called to update the er_extoff field in the indirection
3893  * array when extents have been added or removed from one of the
3894  * extent lists. erp_idx contains the irec index to begin updating
3895  * at and ext_diff contains the number of extents that were added
3896  * or removed.
3897  */
3898 void
3899 xfs_iext_irec_update_extoffs(
3900 	xfs_ifork_t	*ifp,		/* inode fork pointer */
3901 	int		erp_idx,	/* irec index to update */
3902 	int		ext_diff)	/* number of new extents */
3903 {
3904 	int		i;		/* loop counter */
3905 	int		nlists;		/* number of irec's (ex lists) */
3906 
3907 	ASSERT(ifp->if_flags & XFS_IFEXTIREC);
3908 	nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ;
3909 	for (i = erp_idx; i < nlists; i++) {
3910 		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
3911 	}
3912 }
3913