xref: /openbmc/linux/fs/xfs/xfs_inode.c (revision d5cb9783536a41df9f9cba5b0a1d78047ed787f7)
1 /*
2  * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_imap.h"
25 #include "xfs_trans.h"
26 #include "xfs_trans_priv.h"
27 #include "xfs_sb.h"
28 #include "xfs_ag.h"
29 #include "xfs_dir.h"
30 #include "xfs_dir2.h"
31 #include "xfs_dmapi.h"
32 #include "xfs_mount.h"
33 #include "xfs_bmap_btree.h"
34 #include "xfs_alloc_btree.h"
35 #include "xfs_ialloc_btree.h"
36 #include "xfs_dir_sf.h"
37 #include "xfs_dir2_sf.h"
38 #include "xfs_attr_sf.h"
39 #include "xfs_dinode.h"
40 #include "xfs_inode.h"
41 #include "xfs_buf_item.h"
42 #include "xfs_inode_item.h"
43 #include "xfs_btree.h"
44 #include "xfs_alloc.h"
45 #include "xfs_ialloc.h"
46 #include "xfs_bmap.h"
47 #include "xfs_rw.h"
48 #include "xfs_error.h"
49 #include "xfs_utils.h"
50 #include "xfs_dir2_trace.h"
51 #include "xfs_quota.h"
52 #include "xfs_mac.h"
53 #include "xfs_acl.h"
54 
55 
56 kmem_zone_t *xfs_ifork_zone;
57 kmem_zone_t *xfs_inode_zone;
58 kmem_zone_t *xfs_chashlist_zone;
59 
60 /*
61  * Used in xfs_itruncate().  This is the maximum number of extents
62  * freed from a file in a single transaction.
63  */
64 #define	XFS_ITRUNC_MAX_EXTENTS	2
65 
66 STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);
67 STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int);
68 STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int);
69 STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int);
70 
71 
#ifdef DEBUG
/*
 * Debug-only walk over the nrecs extent records starting at ep.
 *
 * Each record is copied out with unaligned-safe loads and decoded,
 * using the on-disk decoder when "disk" is non-zero and the in-core
 * decoder otherwise.  When fmt is XFS_EXTFMT_NOSTATE every decoded
 * record is asserted to be in the XFS_EXT_NORM state.
 */
STATIC void
xfs_validate_extents(
	xfs_bmbt_rec_t		*ep,
	int			nrecs,
	int			disk,
	xfs_exntfmt_t		fmt)
{
	xfs_bmbt_irec_t		irec;
	xfs_bmbt_rec_t		copy;
	int			remaining;

	for (remaining = nrecs; remaining > 0; remaining--, ep++) {
		/* The source records may not be naturally aligned. */
		copy.l0 = get_unaligned((__uint64_t*)&ep->l0);
		copy.l1 = get_unaligned((__uint64_t*)&ep->l1);

		if (!disk)
			xfs_bmbt_get_all(&copy, &irec);
		else
			xfs_bmbt_disk_get_all(&copy, &irec);

		if (fmt == XFS_EXTFMT_NOSTATE) {
			ASSERT(irec.br_state == XFS_EXT_NORM);
		}
	}
}
#else /* DEBUG */
#define xfs_validate_extents(ep, nrecs, disk, fmt)
#endif /* DEBUG */
103 
/*
 * Debug check over an inode cluster buffer: none of the on-disk
 * inodes in the buffer may have a next-unlinked field of 0.  The
 * first offender is reported and then trips an ASSERT.
 */
#if defined(DEBUG)
void
xfs_inobp_check(
	xfs_mount_t	*mp,
	xfs_buf_t	*bp)
{
	xfs_dinode_t	*dip;
	int		ninodes;
	int		idx;

	/* Number of inodes covered by one inode cluster. */
	ninodes = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;

	for (idx = 0; idx < ninodes; idx++) {
		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
					idx * mp->m_sb.sb_inodesize);
		if (dip->di_next_unlinked)
			continue;

		xfs_fs_cmn_err(CE_ALERT, mp,
			"Detected a bogus zero next_unlinked field in incore inode buffer 0x%p.  About to pop an ASSERT.",
			bp);
		ASSERT(dip->di_next_unlinked);
	}
}
#endif
132 
133 /*
134  * This routine is called to map an inode number within a file
135  * system to the buffer containing the on-disk version of the
136  * inode.  It returns a pointer to the buffer containing the
137  * on-disk inode in the bpp parameter, and in the dip parameter
138  * it returns a pointer to the on-disk inode within that buffer.
139  *
140  * If a non-zero error is returned, then the contents of bpp and
141  * dipp are undefined.
142  *
143  * Use xfs_imap() to determine the size and location of the
144  * buffer to read from disk.
145  */
146 STATIC int
147 xfs_inotobp(
148 	xfs_mount_t	*mp,
149 	xfs_trans_t	*tp,
150 	xfs_ino_t	ino,
151 	xfs_dinode_t	**dipp,
152 	xfs_buf_t	**bpp,
153 	int		*offset)
154 {
155 	int		di_ok;
156 	xfs_imap_t	imap;
157 	xfs_buf_t	*bp;
158 	int		error;
159 	xfs_dinode_t	*dip;
160 
161 	/*
162 	 * Call the space managment code to find the location of the
163 	 * inode on disk.
164 	 */
165 	imap.im_blkno = 0;
166 	error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
167 	if (error != 0) {
168 		cmn_err(CE_WARN,
169 	"xfs_inotobp: xfs_imap()  returned an "
170 	"error %d on %s.  Returning error.", error, mp->m_fsname);
171 		return error;
172 	}
173 
174 	/*
175 	 * If the inode number maps to a block outside the bounds of the
176 	 * file system then return NULL rather than calling read_buf
177 	 * and panicing when we get an error from the driver.
178 	 */
179 	if ((imap.im_blkno + imap.im_len) >
180 	    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
181 		cmn_err(CE_WARN,
182 	"xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
183 	"of the file system %s.  Returning EINVAL.",
184 			(unsigned long long)imap.im_blkno,
185 			imap.im_len, mp->m_fsname);
186 		return XFS_ERROR(EINVAL);
187 	}
188 
189 	/*
190 	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
191 	 * default to just a read_buf() call.
192 	 */
193 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
194 				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
195 
196 	if (error) {
197 		cmn_err(CE_WARN,
198 	"xfs_inotobp: xfs_trans_read_buf()  returned an "
199 	"error %d on %s.  Returning error.", error, mp->m_fsname);
200 		return error;
201 	}
202 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
203 	di_ok =
204 		INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
205 		XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
206 	if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
207 			XFS_RANDOM_ITOBP_INOTOBP))) {
208 		XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
209 		xfs_trans_brelse(tp, bp);
210 		cmn_err(CE_WARN,
211 	"xfs_inotobp: XFS_TEST_ERROR()  returned an "
212 	"error on %s.  Returning EFSCORRUPTED.",  mp->m_fsname);
213 		return XFS_ERROR(EFSCORRUPTED);
214 	}
215 
216 	xfs_inobp_check(mp, bp);
217 
218 	/*
219 	 * Set *dipp to point to the on-disk inode in the buffer.
220 	 */
221 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
222 	*bpp = bp;
223 	*offset = imap.im_boffset;
224 	return 0;
225 }
226 
227 
228 /*
229  * This routine is called to map an inode to the buffer containing
230  * the on-disk version of the inode.  It returns a pointer to the
231  * buffer containing the on-disk inode in the bpp parameter, and in
232  * the dip parameter it returns a pointer to the on-disk inode within
233  * that buffer.
234  *
235  * If a non-zero error is returned, then the contents of bpp and
236  * dipp are undefined.
237  *
238  * If the inode is new and has not yet been initialized, use xfs_imap()
239  * to determine the size and location of the buffer to read from disk.
240  * If the inode has already been mapped to its buffer and read in once,
241  * then use the mapping information stored in the inode rather than
242  * calling xfs_imap().  This allows us to avoid the overhead of looking
243  * at the inode btree for small block file systems (see xfs_dilocate()).
244  * We can tell whether the inode has been mapped in before by comparing
245  * its disk block address to 0.  Only uninitialized inodes will have
246  * 0 for the disk block address.
247  */
248 int
249 xfs_itobp(
250 	xfs_mount_t	*mp,
251 	xfs_trans_t	*tp,
252 	xfs_inode_t	*ip,
253 	xfs_dinode_t	**dipp,
254 	xfs_buf_t	**bpp,
255 	xfs_daddr_t	bno)
256 {
257 	xfs_buf_t	*bp;
258 	int		error;
259 	xfs_imap_t	imap;
260 #ifdef __KERNEL__
261 	int		i;
262 	int		ni;
263 #endif
264 
265 	if (ip->i_blkno == (xfs_daddr_t)0) {
266 		/*
267 		 * Call the space management code to find the location of the
268 		 * inode on disk.
269 		 */
270 		imap.im_blkno = bno;
271 		error = xfs_imap(mp, tp, ip->i_ino, &imap, XFS_IMAP_LOOKUP);
272 		if (error != 0) {
273 			return error;
274 		}
275 
276 		/*
277 		 * If the inode number maps to a block outside the bounds
278 		 * of the file system then return NULL rather than calling
279 		 * read_buf and panicing when we get an error from the
280 		 * driver.
281 		 */
282 		if ((imap.im_blkno + imap.im_len) >
283 		    XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
284 #ifdef DEBUG
285 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
286 					"(imap.im_blkno (0x%llx) "
287 					"+ imap.im_len (0x%llx)) > "
288 					" XFS_FSB_TO_BB(mp, "
289 					"mp->m_sb.sb_dblocks) (0x%llx)",
290 					(unsigned long long) imap.im_blkno,
291 					(unsigned long long) imap.im_len,
292 					XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
293 #endif /* DEBUG */
294 			return XFS_ERROR(EINVAL);
295 		}
296 
297 		/*
298 		 * Fill in the fields in the inode that will be used to
299 		 * map the inode to its buffer from now on.
300 		 */
301 		ip->i_blkno = imap.im_blkno;
302 		ip->i_len = imap.im_len;
303 		ip->i_boffset = imap.im_boffset;
304 	} else {
305 		/*
306 		 * We've already mapped the inode once, so just use the
307 		 * mapping that we saved the first time.
308 		 */
309 		imap.im_blkno = ip->i_blkno;
310 		imap.im_len = ip->i_len;
311 		imap.im_boffset = ip->i_boffset;
312 	}
313 	ASSERT(bno == 0 || bno == imap.im_blkno);
314 
315 	/*
316 	 * Read in the buffer.  If tp is NULL, xfs_trans_read_buf() will
317 	 * default to just a read_buf() call.
318 	 */
319 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
320 				   (int)imap.im_len, XFS_BUF_LOCK, &bp);
321 
322 	if (error) {
323 #ifdef DEBUG
324 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
325 				"xfs_trans_read_buf() returned error %d, "
326 				"imap.im_blkno 0x%llx, imap.im_len 0x%llx",
327 				error, (unsigned long long) imap.im_blkno,
328 				(unsigned long long) imap.im_len);
329 #endif /* DEBUG */
330 		return error;
331 	}
332 #ifdef __KERNEL__
333 	/*
334 	 * Validate the magic number and version of every inode in the buffer
335 	 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
336 	 */
337 #ifdef DEBUG
338 	ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
339 #else
340 	ni = 1;
341 #endif
342 	for (i = 0; i < ni; i++) {
343 		int		di_ok;
344 		xfs_dinode_t	*dip;
345 
346 		dip = (xfs_dinode_t *)xfs_buf_offset(bp,
347 					(i << mp->m_sb.sb_inodelog));
348 		di_ok = INT_GET(dip->di_core.di_magic, ARCH_CONVERT) == XFS_DINODE_MAGIC &&
349 			    XFS_DINODE_GOOD_VERSION(INT_GET(dip->di_core.di_version, ARCH_CONVERT));
350 		if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
351 				 XFS_RANDOM_ITOBP_INOTOBP))) {
352 #ifdef DEBUG
353 			prdev("bad inode magic/vsn daddr %lld #%d (magic=%x)",
354 				mp->m_ddev_targp,
355 				(unsigned long long)imap.im_blkno, i,
356 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT));
357 #endif
358 			XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
359 					     mp, dip);
360 			xfs_trans_brelse(tp, bp);
361 			return XFS_ERROR(EFSCORRUPTED);
362 		}
363 	}
364 #endif	/* __KERNEL__ */
365 
366 	xfs_inobp_check(mp, bp);
367 
368 	/*
369 	 * Mark the buffer as an inode buffer now that it looks good
370 	 */
371 	XFS_BUF_SET_VTYPE(bp, B_FS_INO);
372 
373 	/*
374 	 * Set *dipp to point to the on-disk inode in the buffer.
375 	 */
376 	*dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
377 	*bpp = bp;
378 	return 0;
379 }
380 
381 /*
382  * Move inode type and inode format specific information from the
383  * on-disk inode to the in-core inode.  For fifos, devs, and sockets
384  * this means set if_rdev to the proper value.  For files, directories,
385  * and symlinks this means to bring in the in-line data or extent
386  * pointers.  For a file in B-tree format, only the root is immediately
387  * brought in-core.  The rest will be in-lined in if_extents when it
388  * is first referenced (see xfs_iread_extents()).
389  */
390 STATIC int
391 xfs_iformat(
392 	xfs_inode_t		*ip,
393 	xfs_dinode_t		*dip)
394 {
395 	xfs_attr_shortform_t	*atp;
396 	int			size;
397 	int			error;
398 	xfs_fsize_t             di_size;
399 	ip->i_df.if_ext_max =
400 		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
401 	error = 0;
402 
403 	if (unlikely(
404 	    INT_GET(dip->di_core.di_nextents, ARCH_CONVERT) +
405 		INT_GET(dip->di_core.di_anextents, ARCH_CONVERT) >
406 	    INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT))) {
407 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
408 			"corrupt dinode %Lu, extent total = %d, nblocks = %Lu."
409 			"  Unmount and run xfs_repair.",
410 			(unsigned long long)ip->i_ino,
411 			(int)(INT_GET(dip->di_core.di_nextents, ARCH_CONVERT)
412 			    + INT_GET(dip->di_core.di_anextents, ARCH_CONVERT)),
413 			(unsigned long long)
414 			INT_GET(dip->di_core.di_nblocks, ARCH_CONVERT));
415 		XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
416 				     ip->i_mount, dip);
417 		return XFS_ERROR(EFSCORRUPTED);
418 	}
419 
420 	if (unlikely(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT) > ip->i_mount->m_sb.sb_inodesize)) {
421 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
422 			"corrupt dinode %Lu, forkoff = 0x%x."
423 			"  Unmount and run xfs_repair.",
424 			(unsigned long long)ip->i_ino,
425 			(int)(INT_GET(dip->di_core.di_forkoff, ARCH_CONVERT)));
426 		XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
427 				     ip->i_mount, dip);
428 		return XFS_ERROR(EFSCORRUPTED);
429 	}
430 
431 	switch (ip->i_d.di_mode & S_IFMT) {
432 	case S_IFIFO:
433 	case S_IFCHR:
434 	case S_IFBLK:
435 	case S_IFSOCK:
436 		if (unlikely(INT_GET(dip->di_core.di_format, ARCH_CONVERT) != XFS_DINODE_FMT_DEV)) {
437 			XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
438 					      ip->i_mount, dip);
439 			return XFS_ERROR(EFSCORRUPTED);
440 		}
441 		ip->i_d.di_size = 0;
442 		ip->i_df.if_u2.if_rdev = INT_GET(dip->di_u.di_dev, ARCH_CONVERT);
443 		break;
444 
445 	case S_IFREG:
446 	case S_IFLNK:
447 	case S_IFDIR:
448 		switch (INT_GET(dip->di_core.di_format, ARCH_CONVERT)) {
449 		case XFS_DINODE_FMT_LOCAL:
450 			/*
451 			 * no local regular files yet
452 			 */
453 			if (unlikely((INT_GET(dip->di_core.di_mode, ARCH_CONVERT) & S_IFMT) == S_IFREG)) {
454 				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
455 					"corrupt inode (local format for regular file) %Lu.  Unmount and run xfs_repair.",
456 					(unsigned long long) ip->i_ino);
457 				XFS_CORRUPTION_ERROR("xfs_iformat(4)",
458 						     XFS_ERRLEVEL_LOW,
459 						     ip->i_mount, dip);
460 				return XFS_ERROR(EFSCORRUPTED);
461 			}
462 
463 			di_size = INT_GET(dip->di_core.di_size, ARCH_CONVERT);
464 			if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
465 				xfs_fs_cmn_err(CE_WARN, ip->i_mount,
466 					"corrupt inode %Lu (bad size %Ld for local inode).  Unmount and run xfs_repair.",
467 					(unsigned long long) ip->i_ino,
468 					(long long) di_size);
469 				XFS_CORRUPTION_ERROR("xfs_iformat(5)",
470 						     XFS_ERRLEVEL_LOW,
471 						     ip->i_mount, dip);
472 				return XFS_ERROR(EFSCORRUPTED);
473 			}
474 
475 			size = (int)di_size;
476 			error = xfs_iformat_local(ip, dip, XFS_DATA_FORK, size);
477 			break;
478 		case XFS_DINODE_FMT_EXTENTS:
479 			error = xfs_iformat_extents(ip, dip, XFS_DATA_FORK);
480 			break;
481 		case XFS_DINODE_FMT_BTREE:
482 			error = xfs_iformat_btree(ip, dip, XFS_DATA_FORK);
483 			break;
484 		default:
485 			XFS_ERROR_REPORT("xfs_iformat(6)", XFS_ERRLEVEL_LOW,
486 					 ip->i_mount);
487 			return XFS_ERROR(EFSCORRUPTED);
488 		}
489 		break;
490 
491 	default:
492 		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
493 		return XFS_ERROR(EFSCORRUPTED);
494 	}
495 	if (error) {
496 		return error;
497 	}
498 	if (!XFS_DFORK_Q(dip))
499 		return 0;
500 	ASSERT(ip->i_afp == NULL);
501 	ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
502 	ip->i_afp->if_ext_max =
503 		XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
504 	switch (INT_GET(dip->di_core.di_aformat, ARCH_CONVERT)) {
505 	case XFS_DINODE_FMT_LOCAL:
506 		atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
507 		size = (int)INT_GET(atp->hdr.totsize, ARCH_CONVERT);
508 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
509 		break;
510 	case XFS_DINODE_FMT_EXTENTS:
511 		error = xfs_iformat_extents(ip, dip, XFS_ATTR_FORK);
512 		break;
513 	case XFS_DINODE_FMT_BTREE:
514 		error = xfs_iformat_btree(ip, dip, XFS_ATTR_FORK);
515 		break;
516 	default:
517 		error = XFS_ERROR(EFSCORRUPTED);
518 		break;
519 	}
520 	if (error) {
521 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
522 		ip->i_afp = NULL;
523 		xfs_idestroy_fork(ip, XFS_DATA_FORK);
524 	}
525 	return error;
526 }
527 
528 /*
529  * The file is in-lined in the on-disk inode.
530  * If it fits into if_inline_data, then copy
531  * it there, otherwise allocate a buffer for it
532  * and copy the data there.  Either way, set
533  * if_data to point at the data.
534  * If we allocate a buffer for the data, make
535  * sure that its size is a multiple of 4 and
536  * record the real size in i_real_bytes.
537  */
538 STATIC int
539 xfs_iformat_local(
540 	xfs_inode_t	*ip,
541 	xfs_dinode_t	*dip,
542 	int		whichfork,
543 	int		size)
544 {
545 	xfs_ifork_t	*ifp;
546 	int		real_size;
547 
548 	/*
549 	 * If the size is unreasonable, then something
550 	 * is wrong and we just bail out rather than crash in
551 	 * kmem_alloc() or memcpy() below.
552 	 */
553 	if (unlikely(size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
554 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
555 			"corrupt inode %Lu (bad size %d for local fork, size = %d).  Unmount and run xfs_repair.",
556 			(unsigned long long) ip->i_ino, size,
557 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork));
558 		XFS_CORRUPTION_ERROR("xfs_iformat_local", XFS_ERRLEVEL_LOW,
559 				     ip->i_mount, dip);
560 		return XFS_ERROR(EFSCORRUPTED);
561 	}
562 	ifp = XFS_IFORK_PTR(ip, whichfork);
563 	real_size = 0;
564 	if (size == 0)
565 		ifp->if_u1.if_data = NULL;
566 	else if (size <= sizeof(ifp->if_u2.if_inline_data))
567 		ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
568 	else {
569 		real_size = roundup(size, 4);
570 		ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
571 	}
572 	ifp->if_bytes = size;
573 	ifp->if_real_bytes = real_size;
574 	if (size)
575 		memcpy(ifp->if_u1.if_data, XFS_DFORK_PTR(dip, whichfork), size);
576 	ifp->if_flags &= ~XFS_IFEXTENTS;
577 	ifp->if_flags |= XFS_IFINLINE;
578 	return 0;
579 }
580 
581 /*
582  * The file consists of a set of extents all
583  * of which fit into the on-disk inode.
584  * If there are few enough extents to fit into
585  * the if_inline_ext, then copy them there.
586  * Otherwise allocate a buffer for them and copy
587  * them into it.  Either way, set if_extents
588  * to point at the extents.
589  */
590 STATIC int
591 xfs_iformat_extents(
592 	xfs_inode_t	*ip,
593 	xfs_dinode_t	*dip,
594 	int		whichfork)
595 {
596 	xfs_bmbt_rec_t	*ep, *dp;
597 	xfs_ifork_t	*ifp;
598 	int		nex;
599 	int		real_size;
600 	int		size;
601 	int		i;
602 
603 	ifp = XFS_IFORK_PTR(ip, whichfork);
604 	nex = XFS_DFORK_NEXTENTS(dip, whichfork);
605 	size = nex * (uint)sizeof(xfs_bmbt_rec_t);
606 
607 	/*
608 	 * If the number of extents is unreasonable, then something
609 	 * is wrong and we just bail out rather than crash in
610 	 * kmem_alloc() or memcpy() below.
611 	 */
612 	if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) {
613 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
614 			"corrupt inode %Lu ((a)extents = %d).  Unmount and run xfs_repair.",
615 			(unsigned long long) ip->i_ino, nex);
616 		XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW,
617 				     ip->i_mount, dip);
618 		return XFS_ERROR(EFSCORRUPTED);
619 	}
620 
621 	real_size = 0;
622 	if (nex == 0)
623 		ifp->if_u1.if_extents = NULL;
624 	else if (nex <= XFS_INLINE_EXTS)
625 		ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
626 	else {
627 		ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
628 		ASSERT(ifp->if_u1.if_extents != NULL);
629 		real_size = size;
630 	}
631 	ifp->if_bytes = size;
632 	ifp->if_real_bytes = real_size;
633 	if (size) {
634 		dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork);
635 		xfs_validate_extents(dp, nex, 1, XFS_EXTFMT_INODE(ip));
636 		ep = ifp->if_u1.if_extents;
637 		for (i = 0; i < nex; i++, ep++, dp++) {
638 			ep->l0 = INT_GET(get_unaligned((__uint64_t*)&dp->l0),
639 								ARCH_CONVERT);
640 			ep->l1 = INT_GET(get_unaligned((__uint64_t*)&dp->l1),
641 								ARCH_CONVERT);
642 		}
643 		xfs_bmap_trace_exlist("xfs_iformat_extents", ip, nex,
644 			whichfork);
645 		if (whichfork != XFS_DATA_FORK ||
646 			XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE)
647 				if (unlikely(xfs_check_nostate_extents(
648 				    ifp->if_u1.if_extents, nex))) {
649 					XFS_ERROR_REPORT("xfs_iformat_extents(2)",
650 							 XFS_ERRLEVEL_LOW,
651 							 ip->i_mount);
652 					return XFS_ERROR(EFSCORRUPTED);
653 				}
654 	}
655 	ifp->if_flags |= XFS_IFEXTENTS;
656 	return 0;
657 }
658 
659 /*
660  * The file has too many extents to fit into
661  * the inode, so they are in B-tree format.
662  * Allocate a buffer for the root of the B-tree
663  * and copy the root into it.  The i_extents
664  * field will remain NULL until all of the
665  * extents are read in (when they are needed).
666  */
667 STATIC int
668 xfs_iformat_btree(
669 	xfs_inode_t		*ip,
670 	xfs_dinode_t		*dip,
671 	int			whichfork)
672 {
673 	xfs_bmdr_block_t	*dfp;
674 	xfs_ifork_t		*ifp;
675 	/* REFERENCED */
676 	int			nrecs;
677 	int			size;
678 
679 	ifp = XFS_IFORK_PTR(ip, whichfork);
680 	dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
681 	size = XFS_BMAP_BROOT_SPACE(dfp);
682 	nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
683 
684 	/*
685 	 * blow out if -- fork has less extents than can fit in
686 	 * fork (fork shouldn't be a btree format), root btree
687 	 * block has more records than can fit into the fork,
688 	 * or the number of extents is greater than the number of
689 	 * blocks.
690 	 */
691 	if (unlikely(XFS_IFORK_NEXTENTS(ip, whichfork) <= ifp->if_ext_max
692 	    || XFS_BMDR_SPACE_CALC(nrecs) >
693 			XFS_DFORK_SIZE(dip, ip->i_mount, whichfork)
694 	    || XFS_IFORK_NEXTENTS(ip, whichfork) > ip->i_d.di_nblocks)) {
695 		xfs_fs_cmn_err(CE_WARN, ip->i_mount,
696 			"corrupt inode %Lu (btree).  Unmount and run xfs_repair.",
697 			(unsigned long long) ip->i_ino);
698 		XFS_ERROR_REPORT("xfs_iformat_btree", XFS_ERRLEVEL_LOW,
699 				 ip->i_mount);
700 		return XFS_ERROR(EFSCORRUPTED);
701 	}
702 
703 	ifp->if_broot_bytes = size;
704 	ifp->if_broot = kmem_alloc(size, KM_SLEEP);
705 	ASSERT(ifp->if_broot != NULL);
706 	/*
707 	 * Copy and convert from the on-disk structure
708 	 * to the in-memory structure.
709 	 */
710 	xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
711 		ifp->if_broot, size);
712 	ifp->if_flags &= ~XFS_IFEXTENTS;
713 	ifp->if_flags |= XFS_IFBROOT;
714 
715 	return 0;
716 }
717 
718 /*
719  * xfs_xlate_dinode_core - translate an xfs_inode_core_t between ondisk
720  * and native format
721  *
722  * buf  = on-disk representation
723  * dip  = native representation
724  * dir  = direction - +ve -> disk to native
725  *                    -ve -> native to disk
726  */
727 void
728 xfs_xlate_dinode_core(
729 	xfs_caddr_t		buf,
730 	xfs_dinode_core_t	*dip,
731 	int			dir)
732 {
733 	xfs_dinode_core_t	*buf_core = (xfs_dinode_core_t *)buf;
734 	xfs_dinode_core_t	*mem_core = (xfs_dinode_core_t *)dip;
735 	xfs_arch_t		arch = ARCH_CONVERT;
736 
737 	ASSERT(dir);
738 
739 	INT_XLATE(buf_core->di_magic, mem_core->di_magic, dir, arch);
740 	INT_XLATE(buf_core->di_mode, mem_core->di_mode, dir, arch);
741 	INT_XLATE(buf_core->di_version,	mem_core->di_version, dir, arch);
742 	INT_XLATE(buf_core->di_format, mem_core->di_format, dir, arch);
743 	INT_XLATE(buf_core->di_onlink, mem_core->di_onlink, dir, arch);
744 	INT_XLATE(buf_core->di_uid, mem_core->di_uid, dir, arch);
745 	INT_XLATE(buf_core->di_gid, mem_core->di_gid, dir, arch);
746 	INT_XLATE(buf_core->di_nlink, mem_core->di_nlink, dir, arch);
747 	INT_XLATE(buf_core->di_projid, mem_core->di_projid, dir, arch);
748 
749 	if (dir > 0) {
750 		memcpy(mem_core->di_pad, buf_core->di_pad,
751 			sizeof(buf_core->di_pad));
752 	} else {
753 		memcpy(buf_core->di_pad, mem_core->di_pad,
754 			sizeof(buf_core->di_pad));
755 	}
756 
757 	INT_XLATE(buf_core->di_flushiter, mem_core->di_flushiter, dir, arch);
758 
759 	INT_XLATE(buf_core->di_atime.t_sec, mem_core->di_atime.t_sec,
760 			dir, arch);
761 	INT_XLATE(buf_core->di_atime.t_nsec, mem_core->di_atime.t_nsec,
762 			dir, arch);
763 	INT_XLATE(buf_core->di_mtime.t_sec, mem_core->di_mtime.t_sec,
764 			dir, arch);
765 	INT_XLATE(buf_core->di_mtime.t_nsec, mem_core->di_mtime.t_nsec,
766 			dir, arch);
767 	INT_XLATE(buf_core->di_ctime.t_sec, mem_core->di_ctime.t_sec,
768 			dir, arch);
769 	INT_XLATE(buf_core->di_ctime.t_nsec, mem_core->di_ctime.t_nsec,
770 			dir, arch);
771 	INT_XLATE(buf_core->di_size, mem_core->di_size, dir, arch);
772 	INT_XLATE(buf_core->di_nblocks, mem_core->di_nblocks, dir, arch);
773 	INT_XLATE(buf_core->di_extsize, mem_core->di_extsize, dir, arch);
774 	INT_XLATE(buf_core->di_nextents, mem_core->di_nextents, dir, arch);
775 	INT_XLATE(buf_core->di_anextents, mem_core->di_anextents, dir, arch);
776 	INT_XLATE(buf_core->di_forkoff, mem_core->di_forkoff, dir, arch);
777 	INT_XLATE(buf_core->di_aformat, mem_core->di_aformat, dir, arch);
778 	INT_XLATE(buf_core->di_dmevmask, mem_core->di_dmevmask, dir, arch);
779 	INT_XLATE(buf_core->di_dmstate, mem_core->di_dmstate, dir, arch);
780 	INT_XLATE(buf_core->di_flags, mem_core->di_flags, dir, arch);
781 	INT_XLATE(buf_core->di_gen, mem_core->di_gen, dir, arch);
782 }
783 
784 STATIC uint
785 _xfs_dic2xflags(
786 	xfs_dinode_core_t	*dic,
787 	__uint16_t		di_flags)
788 {
789 	uint			flags = 0;
790 
791 	if (di_flags & XFS_DIFLAG_ANY) {
792 		if (di_flags & XFS_DIFLAG_REALTIME)
793 			flags |= XFS_XFLAG_REALTIME;
794 		if (di_flags & XFS_DIFLAG_PREALLOC)
795 			flags |= XFS_XFLAG_PREALLOC;
796 		if (di_flags & XFS_DIFLAG_IMMUTABLE)
797 			flags |= XFS_XFLAG_IMMUTABLE;
798 		if (di_flags & XFS_DIFLAG_APPEND)
799 			flags |= XFS_XFLAG_APPEND;
800 		if (di_flags & XFS_DIFLAG_SYNC)
801 			flags |= XFS_XFLAG_SYNC;
802 		if (di_flags & XFS_DIFLAG_NOATIME)
803 			flags |= XFS_XFLAG_NOATIME;
804 		if (di_flags & XFS_DIFLAG_NODUMP)
805 			flags |= XFS_XFLAG_NODUMP;
806 		if (di_flags & XFS_DIFLAG_RTINHERIT)
807 			flags |= XFS_XFLAG_RTINHERIT;
808 		if (di_flags & XFS_DIFLAG_PROJINHERIT)
809 			flags |= XFS_XFLAG_PROJINHERIT;
810 		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
811 			flags |= XFS_XFLAG_NOSYMLINKS;
812 	}
813 
814 	return flags;
815 }
816 
817 uint
818 xfs_ip2xflags(
819 	xfs_inode_t		*ip)
820 {
821 	xfs_dinode_core_t	*dic = &ip->i_d;
822 
823 	return _xfs_dic2xflags(dic, dic->di_flags) |
824 		(XFS_CFORK_Q(dic) ? XFS_XFLAG_HASATTR : 0);
825 }
826 
827 uint
828 xfs_dic2xflags(
829 	xfs_dinode_core_t	*dic)
830 {
831 	return _xfs_dic2xflags(dic, INT_GET(dic->di_flags, ARCH_CONVERT)) |
832 		(XFS_CFORK_Q_DISK(dic) ? XFS_XFLAG_HASATTR : 0);
833 }
834 
835 /*
836  * Given a mount structure and an inode number, return a pointer
837  * to a newly allocated in-core inode coresponding to the given
838  * inode number.
839  *
840  * Initialize the inode's attributes and extent pointers if it
841  * already has them (it will not if the inode has no links).
842  */
843 int
844 xfs_iread(
845 	xfs_mount_t	*mp,
846 	xfs_trans_t	*tp,
847 	xfs_ino_t	ino,
848 	xfs_inode_t	**ipp,
849 	xfs_daddr_t	bno)
850 {
851 	xfs_buf_t	*bp;
852 	xfs_dinode_t	*dip;
853 	xfs_inode_t	*ip;
854 	int		error;
855 
856 	ASSERT(xfs_inode_zone != NULL);
857 
858 	ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
859 	ip->i_ino = ino;
860 	ip->i_mount = mp;
861 
862 	/*
863 	 * Get pointer's to the on-disk inode and the buffer containing it.
864 	 * If the inode number refers to a block outside the file system
865 	 * then xfs_itobp() will return NULL.  In this case we should
866 	 * return NULL as well.  Set i_blkno to 0 so that xfs_itobp() will
867 	 * know that this is a new incore inode.
868 	 */
869 	error = xfs_itobp(mp, tp, ip, &dip, &bp, bno);
870 
871 	if (error != 0) {
872 		kmem_zone_free(xfs_inode_zone, ip);
873 		return error;
874 	}
875 
876 	/*
877 	 * Initialize inode's trace buffers.
878 	 * Do this before xfs_iformat in case it adds entries.
879 	 */
880 #ifdef XFS_BMAP_TRACE
881 	ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_SLEEP);
882 #endif
883 #ifdef XFS_BMBT_TRACE
884 	ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_SLEEP);
885 #endif
886 #ifdef XFS_RW_TRACE
887 	ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_SLEEP);
888 #endif
889 #ifdef XFS_ILOCK_TRACE
890 	ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_SLEEP);
891 #endif
892 #ifdef XFS_DIR2_TRACE
893 	ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_SLEEP);
894 #endif
895 
896 	/*
897 	 * If we got something that isn't an inode it means someone
898 	 * (nfs or dmi) has a stale handle.
899 	 */
900 	if (INT_GET(dip->di_core.di_magic, ARCH_CONVERT) != XFS_DINODE_MAGIC) {
901 		kmem_zone_free(xfs_inode_zone, ip);
902 		xfs_trans_brelse(tp, bp);
903 #ifdef DEBUG
904 		xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
905 				"dip->di_core.di_magic (0x%x) != "
906 				"XFS_DINODE_MAGIC (0x%x)",
907 				INT_GET(dip->di_core.di_magic, ARCH_CONVERT),
908 				XFS_DINODE_MAGIC);
909 #endif /* DEBUG */
910 		return XFS_ERROR(EINVAL);
911 	}
912 
913 	/*
914 	 * If the on-disk inode is already linked to a directory
915 	 * entry, copy all of the inode into the in-core inode.
916 	 * xfs_iformat() handles copying in the inode format
917 	 * specific information.
918 	 * Otherwise, just get the truly permanent information.
919 	 */
920 	if (dip->di_core.di_mode) {
921 		xfs_xlate_dinode_core((xfs_caddr_t)&dip->di_core,
922 		     &(ip->i_d), 1);
923 		error = xfs_iformat(ip, dip);
924 		if (error)  {
925 			kmem_zone_free(xfs_inode_zone, ip);
926 			xfs_trans_brelse(tp, bp);
927 #ifdef DEBUG
928 			xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
929 					"xfs_iformat() returned error %d",
930 					error);
931 #endif /* DEBUG */
932 			return error;
933 		}
934 	} else {
935 		ip->i_d.di_magic = INT_GET(dip->di_core.di_magic, ARCH_CONVERT);
936 		ip->i_d.di_version = INT_GET(dip->di_core.di_version, ARCH_CONVERT);
937 		ip->i_d.di_gen = INT_GET(dip->di_core.di_gen, ARCH_CONVERT);
938 		ip->i_d.di_flushiter = INT_GET(dip->di_core.di_flushiter, ARCH_CONVERT);
939 		/*
940 		 * Make sure to pull in the mode here as well in
941 		 * case the inode is released without being used.
942 		 * This ensures that xfs_inactive() will see that
943 		 * the inode is already free and not try to mess
944 		 * with the uninitialized part of it.
945 		 */
946 		ip->i_d.di_mode = 0;
947 		/*
948 		 * Initialize the per-fork minima and maxima for a new
949 		 * inode here.  xfs_iformat will do it for old inodes.
950 		 */
951 		ip->i_df.if_ext_max =
952 			XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
953 	}
954 
955 	INIT_LIST_HEAD(&ip->i_reclaim);
956 
957 	/*
958 	 * The inode format changed when we moved the link count and
959 	 * made it 32 bits long.  If this is an old format inode,
960 	 * convert it in memory to look like a new one.  If it gets
961 	 * flushed to disk we will convert back before flushing or
962 	 * logging it.  We zero out the new projid field and the old link
963 	 * count field.  We'll handle clearing the pad field (the remains
964 	 * of the old uuid field) when we actually convert the inode to
965 	 * the new format. We don't change the version number so that we
966 	 * can distinguish this from a real new format inode.
967 	 */
968 	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
969 		ip->i_d.di_nlink = ip->i_d.di_onlink;
970 		ip->i_d.di_onlink = 0;
971 		ip->i_d.di_projid = 0;
972 	}
973 
974 	ip->i_delayed_blks = 0;
975 
976 	/*
977 	 * Mark the buffer containing the inode as something to keep
978 	 * around for a while.  This helps to keep recently accessed
979 	 * meta-data in-core longer.
980 	 */
981 	 XFS_BUF_SET_REF(bp, XFS_INO_REF);
982 
983 	/*
984 	 * Use xfs_trans_brelse() to release the buffer containing the
985 	 * on-disk inode, because it was acquired with xfs_trans_read_buf()
986 	 * in xfs_itobp() above.  If tp is NULL, this is just a normal
987 	 * brelse().  If we're within a transaction, then xfs_trans_brelse()
988 	 * will only release the buffer if it is not dirty within the
989 	 * transaction.  It will be OK to release the buffer in this case,
990 	 * because inodes on disk are never destroyed and we will be
991 	 * locking the new in-core inode before putting it in the hash
992 	 * table where other processes can find it.  Thus we don't have
993 	 * to worry about the inode being changed just because we released
994 	 * the buffer.
995 	 */
996 	xfs_trans_brelse(tp, bp);
997 	*ipp = ip;
998 	return 0;
999 }
1000 
1001 /*
1002  * Read in extents from a btree-format inode.
1003  * Allocate and fill in if_extents.  Real work is done in xfs_bmap.c.
1004  */
1005 int
1006 xfs_iread_extents(
1007 	xfs_trans_t	*tp,
1008 	xfs_inode_t	*ip,
1009 	int		whichfork)
1010 {
1011 	int		error;
1012 	xfs_ifork_t	*ifp;
1013 	size_t		size;
1014 
1015 	if (unlikely(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE)) {
1016 		XFS_ERROR_REPORT("xfs_iread_extents", XFS_ERRLEVEL_LOW,
1017 				 ip->i_mount);
1018 		return XFS_ERROR(EFSCORRUPTED);
1019 	}
1020 	size = XFS_IFORK_NEXTENTS(ip, whichfork) * (uint)sizeof(xfs_bmbt_rec_t);
1021 	ifp = XFS_IFORK_PTR(ip, whichfork);
1022 	/*
1023 	 * We know that the size is valid (it's checked in iformat_btree)
1024 	 */
1025 	ifp->if_u1.if_extents = kmem_alloc(size, KM_SLEEP);
1026 	ASSERT(ifp->if_u1.if_extents != NULL);
1027 	ifp->if_lastex = NULLEXTNUM;
1028 	ifp->if_bytes = ifp->if_real_bytes = (int)size;
1029 	ifp->if_flags |= XFS_IFEXTENTS;
1030 	error = xfs_bmap_read_extents(tp, ip, whichfork);
1031 	if (error) {
1032 		kmem_free(ifp->if_u1.if_extents, size);
1033 		ifp->if_u1.if_extents = NULL;
1034 		ifp->if_bytes = ifp->if_real_bytes = 0;
1035 		ifp->if_flags &= ~XFS_IFEXTENTS;
1036 		return error;
1037 	}
1038 	xfs_validate_extents((xfs_bmbt_rec_t *)ifp->if_u1.if_extents,
1039 		XFS_IFORK_NEXTENTS(ip, whichfork), 0, XFS_EXTFMT_INODE(ip));
1040 	return 0;
1041 }
1042 
1043 /*
1044  * Allocate an inode on disk and return a copy of its in-core version.
1045  * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
1046  * appropriately within the inode.  The uid and gid for the inode are
1047  * set according to the contents of the given cred structure.
1048  *
1049  * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
1050  * has a free inode available, call xfs_iget()
1051  * to obtain the in-core version of the allocated inode.  Finally,
1052  * fill in the inode and log its initial contents.  In this case,
1053  * ialloc_context would be set to NULL and call_again set to false.
1054  *
1055  * If xfs_dialloc() does not have an available inode,
1056  * it will replenish its supply by doing an allocation. Since we can
1057  * only do one allocation within a transaction without deadlocks, we
1058  * must commit the current transaction before returning the inode itself.
1059  * In this case, therefore, we will set call_again to true and return.
1060  * The caller should then commit the current transaction, start a new
1061  * transaction, and call xfs_ialloc() again to actually get the inode.
1062  *
1063  * To ensure that some other process does not grab the inode that
1064  * was allocated during the first call to xfs_ialloc(), this routine
1065  * also returns the [locked] bp pointing to the head of the freelist
1066  * as ialloc_context.  The caller should hold this buffer across
1067  * the commit and pass it back into this routine on the second call.
1068  */
1069 int
1070 xfs_ialloc(
1071 	xfs_trans_t	*tp,
1072 	xfs_inode_t	*pip,
1073 	mode_t		mode,
1074 	xfs_nlink_t	nlink,
1075 	xfs_dev_t	rdev,
1076 	cred_t		*cr,
1077 	xfs_prid_t	prid,
1078 	int		okalloc,
1079 	xfs_buf_t	**ialloc_context,
1080 	boolean_t	*call_again,
1081 	xfs_inode_t	**ipp)
1082 {
1083 	xfs_ino_t	ino;
1084 	xfs_inode_t	*ip;
1085 	vnode_t		*vp;
1086 	uint		flags;
1087 	int		error;
1088 
1089 	/*
1090 	 * Call the space management code to pick
1091 	 * the on-disk inode to be allocated.
1092 	 */
1093 	error = xfs_dialloc(tp, pip->i_ino, mode, okalloc,
1094 			    ialloc_context, call_again, &ino);
1095 	if (error != 0) {
1096 		return error;
1097 	}
1098 	if (*call_again || ino == NULLFSINO) {
1099 		*ipp = NULL;
1100 		return 0;
1101 	}
1102 	ASSERT(*ialloc_context == NULL);
1103 
1104 	/*
1105 	 * Get the in-core inode with the lock held exclusively.
1106 	 * This is because we're setting fields here we need
1107 	 * to prevent others from looking at until we're done.
1108 	 */
1109 	error = xfs_trans_iget(tp->t_mountp, tp, ino,
1110 			IGET_CREATE, XFS_ILOCK_EXCL, &ip);
1111 	if (error != 0) {
1112 		return error;
1113 	}
1114 	ASSERT(ip != NULL);
1115 
1116 	vp = XFS_ITOV(ip);
1117 	ip->i_d.di_mode = (__uint16_t)mode;
1118 	ip->i_d.di_onlink = 0;
1119 	ip->i_d.di_nlink = nlink;
1120 	ASSERT(ip->i_d.di_nlink == nlink);
1121 	ip->i_d.di_uid = current_fsuid(cr);
1122 	ip->i_d.di_gid = current_fsgid(cr);
1123 	ip->i_d.di_projid = prid;
1124 	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
1125 
1126 	/*
1127 	 * If the superblock version is up to where we support new format
1128 	 * inodes and this is currently an old format inode, then change
1129 	 * the inode version number now.  This way we only do the conversion
1130 	 * here rather than here and in the flush/logging code.
1131 	 */
1132 	if (XFS_SB_VERSION_HASNLINK(&tp->t_mountp->m_sb) &&
1133 	    ip->i_d.di_version == XFS_DINODE_VERSION_1) {
1134 		ip->i_d.di_version = XFS_DINODE_VERSION_2;
1135 		/*
1136 		 * We've already zeroed the old link count, the projid field,
1137 		 * and the pad field.
1138 		 */
1139 	}
1140 
1141 	/*
1142 	 * Project ids won't be stored on disk if we are using a version 1 inode.
1143 	 */
1144 	if ( (prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
1145 		xfs_bump_ino_vers2(tp, ip);
1146 
1147 	if (XFS_INHERIT_GID(pip, vp->v_vfsp)) {
1148 		ip->i_d.di_gid = pip->i_d.di_gid;
1149 		if ((pip->i_d.di_mode & S_ISGID) && (mode & S_IFMT) == S_IFDIR) {
1150 			ip->i_d.di_mode |= S_ISGID;
1151 		}
1152 	}
1153 
1154 	/*
1155 	 * If the group ID of the new file does not match the effective group
1156 	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
1157 	 * (and only if the irix_sgid_inherit compatibility variable is set).
1158 	 */
1159 	if ((irix_sgid_inherit) &&
1160 	    (ip->i_d.di_mode & S_ISGID) &&
1161 	    (!in_group_p((gid_t)ip->i_d.di_gid))) {
1162 		ip->i_d.di_mode &= ~S_ISGID;
1163 	}
1164 
1165 	ip->i_d.di_size = 0;
1166 	ip->i_d.di_nextents = 0;
1167 	ASSERT(ip->i_d.di_nblocks == 0);
1168 	xfs_ichgtime(ip, XFS_ICHGTIME_CHG|XFS_ICHGTIME_ACC|XFS_ICHGTIME_MOD);
1169 	/*
1170 	 * di_gen will have been taken care of in xfs_iread.
1171 	 */
1172 	ip->i_d.di_extsize = 0;
1173 	ip->i_d.di_dmevmask = 0;
1174 	ip->i_d.di_dmstate = 0;
1175 	ip->i_d.di_flags = 0;
1176 	flags = XFS_ILOG_CORE;
1177 	switch (mode & S_IFMT) {
1178 	case S_IFIFO:
1179 	case S_IFCHR:
1180 	case S_IFBLK:
1181 	case S_IFSOCK:
1182 		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
1183 		ip->i_df.if_u2.if_rdev = rdev;
1184 		ip->i_df.if_flags = 0;
1185 		flags |= XFS_ILOG_DEV;
1186 		break;
1187 	case S_IFREG:
1188 	case S_IFDIR:
1189 		if (unlikely(pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
1190 			uint	di_flags = 0;
1191 
1192 			if ((mode & S_IFMT) == S_IFDIR) {
1193 				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
1194 					di_flags |= XFS_DIFLAG_RTINHERIT;
1195 			} else {
1196 				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) {
1197 					di_flags |= XFS_DIFLAG_REALTIME;
1198 					ip->i_iocore.io_flags |= XFS_IOCORE_RT;
1199 				}
1200 			}
1201 			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
1202 			    xfs_inherit_noatime)
1203 				di_flags |= XFS_DIFLAG_NOATIME;
1204 			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
1205 			    xfs_inherit_nodump)
1206 				di_flags |= XFS_DIFLAG_NODUMP;
1207 			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
1208 			    xfs_inherit_sync)
1209 				di_flags |= XFS_DIFLAG_SYNC;
1210 			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
1211 			    xfs_inherit_nosymlinks)
1212 				di_flags |= XFS_DIFLAG_NOSYMLINKS;
1213 			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
1214 				di_flags |= XFS_DIFLAG_PROJINHERIT;
1215 			ip->i_d.di_flags |= di_flags;
1216 		}
1217 		/* FALLTHROUGH */
1218 	case S_IFLNK:
1219 		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
1220 		ip->i_df.if_flags = XFS_IFEXTENTS;
1221 		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
1222 		ip->i_df.if_u1.if_extents = NULL;
1223 		break;
1224 	default:
1225 		ASSERT(0);
1226 	}
1227 	/*
1228 	 * Attribute fork settings for new inode.
1229 	 */
1230 	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
1231 	ip->i_d.di_anextents = 0;
1232 
1233 	/*
1234 	 * Log the new values stuffed into the inode.
1235 	 */
1236 	xfs_trans_log_inode(tp, ip, flags);
1237 
1238 	/* now that we have an i_mode  we can set Linux inode ops (& unlock) */
1239 	VFS_INIT_VNODE(XFS_MTOVFS(tp->t_mountp), vp, XFS_ITOBHV(ip), 1);
1240 
1241 	*ipp = ip;
1242 	return 0;
1243 }
1244 
/*
 * Debug-only sanity check: assert that a regular file has no blocks
 * mapped past the given size.  Anything that is not a regular file,
 * and any realtime file, is left unchecked.
 */
#ifdef DEBUG
void
xfs_isize_check(
	xfs_mount_t	*mp,
	xfs_inode_t	*ip,
	xfs_fsize_t	isize)
{
	xfs_bmbt_irec_t	imaps[2];
	int		nimaps = 2;
	xfs_fileoff_t	eof_fsb;
	xfs_fileoff_t	max_fsb;

	if ((ip->i_d.di_mode & S_IFMT) != S_IFREG ||
	    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))
		return;

	/* Look at everything from the size up to the largest offset. */
	eof_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)isize);
	max_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));

	/*
	 * bmapi may fail, for instance because the filesystem is
	 * shutting down; in that case there is nothing to verify.
	 */
	if (xfs_bmapi(NULL, ip, eof_fsb, max_fsb - eof_fsb,
		      XFS_BMAPI_ENTIRE, NULL, 0, imaps, &nimaps, NULL))
		return;

	/* The whole range must come back as a single hole. */
	ASSERT(nimaps == 1);
	ASSERT(imaps[0].br_startblock == HOLESTARTBLOCK);
}
#endif	/* DEBUG */
1285 
1286 /*
1287  * Calculate the last possible buffered byte in a file.  This must
1288  * include data that was buffered beyond the EOF by the write code.
1289  * This also needs to deal with overflowing the xfs_fsize_t type
1290  * which can happen for sizes near the limit.
1291  *
1292  * We also need to take into account any blocks beyond the EOF.  It
1293  * may be the case that they were buffered by a write which failed.
1294  * In that case the pages will still be in memory, but the inode size
1295  * will never have been updated.
1296  */
1297 xfs_fsize_t
1298 xfs_file_last_byte(
1299 	xfs_inode_t	*ip)
1300 {
1301 	xfs_mount_t	*mp;
1302 	xfs_fsize_t	last_byte;
1303 	xfs_fileoff_t	last_block;
1304 	xfs_fileoff_t	size_last_block;
1305 	int		error;
1306 
1307 	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS));
1308 
1309 	mp = ip->i_mount;
1310 	/*
1311 	 * Only check for blocks beyond the EOF if the extents have
1312 	 * been read in.  This eliminates the need for the inode lock,
1313 	 * and it also saves us from looking when it really isn't
1314 	 * necessary.
1315 	 */
1316 	if (ip->i_df.if_flags & XFS_IFEXTENTS) {
1317 		error = xfs_bmap_last_offset(NULL, ip, &last_block,
1318 			XFS_DATA_FORK);
1319 		if (error) {
1320 			last_block = 0;
1321 		}
1322 	} else {
1323 		last_block = 0;
1324 	}
1325 	size_last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)ip->i_d.di_size);
1326 	last_block = XFS_FILEOFF_MAX(last_block, size_last_block);
1327 
1328 	last_byte = XFS_FSB_TO_B(mp, last_block);
1329 	if (last_byte < 0) {
1330 		return XFS_MAXIOFFSET(mp);
1331 	}
1332 	last_byte += (1 << mp->m_writeio_log);
1333 	if (last_byte < 0) {
1334 		return XFS_MAXIOFFSET(mp);
1335 	}
1336 	return last_byte;
1337 }
1338 
#if defined(XFS_RW_TRACE)
/* Split a 64-bit quantity into two pointer-sized trace arguments. */
#define	XFS_ITRUNC_HI32(v)	\
	((void*)(unsigned long)(((v) >> 32) & 0xffffffff))
#define	XFS_ITRUNC_LO32(v)	\
	((void*)(unsigned long)((v) & 0xffffffff))

/*
 * Add a truncate event to the inode's read/write trace buffer, if it
 * has one.  The current size, the new size and both toss offsets are
 * each recorded as a high and a low 32-bit half.
 */
STATIC void
xfs_itrunc_trace(
	int		tag,
	xfs_inode_t	*ip,
	int		flag,
	xfs_fsize_t	new_size,
	xfs_off_t	toss_start,
	xfs_off_t	toss_finish)
{
	if (ip->i_rwtrace != NULL) {
		ktrace_enter(ip->i_rwtrace,
			     (void*)((long)tag),
			     (void*)ip,
			     XFS_ITRUNC_HI32(ip->i_d.di_size),
			     XFS_ITRUNC_LO32(ip->i_d.di_size),
			     (void*)((long)flag),
			     XFS_ITRUNC_HI32(new_size),
			     XFS_ITRUNC_LO32(new_size),
			     XFS_ITRUNC_HI32(toss_start),
			     XFS_ITRUNC_LO32(toss_start),
			     XFS_ITRUNC_HI32(toss_finish),
			     XFS_ITRUNC_LO32(toss_finish),
			     (void*)(unsigned long)current_cpu(),
			     (void*)0,
			     (void*)0,
			     (void*)0,
			     (void*)0);
	}
}

#undef XFS_ITRUNC_HI32
#undef XFS_ITRUNC_LO32
#else
/* Tracing is compiled out: the call expands to nothing. */
#define	xfs_itrunc_trace(tag, ip, flag, new_size, toss_start, toss_finish)
#endif
1374 
1375 /*
1376  * Start the truncation of the file to new_size.  The new size
1377  * must be smaller than the current size.  This routine will
1378  * clear the buffer and page caches of file data in the removed
1379  * range, and xfs_itruncate_finish() will remove the underlying
1380  * disk blocks.
1381  *
1382  * The inode must have its I/O lock locked EXCLUSIVELY, and it
1383  * must NOT have the inode lock held at all.  This is because we're
1384  * calling into the buffer/page cache code and we can't hold the
1385  * inode lock when we do so.
1386  *
1387  * The flags parameter can have either the value XFS_ITRUNC_DEFINITE
1388  * or XFS_ITRUNC_MAYBE.  The XFS_ITRUNC_MAYBE value should be used
1389  * in the case that the caller is locking things out of order and
1390  * may not be able to call xfs_itruncate_finish() with the inode lock
1391  * held without dropping the I/O lock.  If the caller must drop the
1392  * I/O lock before calling xfs_itruncate_finish(), then xfs_itruncate_start()
1393  * must be called again with all the same restrictions as the initial
1394  * call.
1395  */
1396 void
1397 xfs_itruncate_start(
1398 	xfs_inode_t	*ip,
1399 	uint		flags,
1400 	xfs_fsize_t	new_size)
1401 {
1402 	xfs_fsize_t	last_byte;
1403 	xfs_off_t	toss_start;
1404 	xfs_mount_t	*mp;
1405 	vnode_t		*vp;
1406 
1407 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1408 	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1409 	ASSERT((flags == XFS_ITRUNC_DEFINITE) ||
1410 	       (flags == XFS_ITRUNC_MAYBE));
1411 
1412 	mp = ip->i_mount;
1413 	vp = XFS_ITOV(ip);
1414 	/*
1415 	 * Call VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES() to get rid of pages and buffers
1416 	 * overlapping the region being removed.  We have to use
1417 	 * the less efficient VOP_FLUSHINVAL_PAGES() in the case that the
1418 	 * caller may not be able to finish the truncate without
1419 	 * dropping the inode's I/O lock.  Make sure
1420 	 * to catch any pages brought in by buffers overlapping
1421 	 * the EOF by searching out beyond the isize by our
1422 	 * block size. We round new_size up to a block boundary
1423 	 * so that we don't toss things on the same block as
1424 	 * new_size but before it.
1425 	 *
1426 	 * Before calling VOP_TOSS_PAGES() or VOP_FLUSHINVAL_PAGES(), make sure to
1427 	 * call remapf() over the same region if the file is mapped.
1428 	 * This frees up mapped file references to the pages in the
1429 	 * given range and for the VOP_FLUSHINVAL_PAGES() case it ensures
1430 	 * that we get the latest mapped changes flushed out.
1431 	 */
1432 	toss_start = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1433 	toss_start = XFS_FSB_TO_B(mp, toss_start);
1434 	if (toss_start < 0) {
1435 		/*
1436 		 * The place to start tossing is beyond our maximum
1437 		 * file size, so there is no way that the data extended
1438 		 * out there.
1439 		 */
1440 		return;
1441 	}
1442 	last_byte = xfs_file_last_byte(ip);
1443 	xfs_itrunc_trace(XFS_ITRUNC_START, ip, flags, new_size, toss_start,
1444 			 last_byte);
1445 	if (last_byte > toss_start) {
1446 		if (flags & XFS_ITRUNC_DEFINITE) {
1447 			VOP_TOSS_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1448 		} else {
1449 			VOP_FLUSHINVAL_PAGES(vp, toss_start, -1, FI_REMAPF_LOCKED);
1450 		}
1451 	}
1452 
1453 #ifdef DEBUG
1454 	if (new_size == 0) {
1455 		ASSERT(VN_CACHED(vp) == 0);
1456 	}
1457 #endif
1458 }
1459 
1460 /*
1461  * Shrink the file to the given new_size.  The new
1462  * size must be smaller than the current size.
1463  * This will free up the underlying blocks
1464  * in the removed range after a call to xfs_itruncate_start()
1465  * or xfs_atruncate_start().
1466  *
1467  * The transaction passed to this routine must have made
1468  * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES.
1469  * This routine may commit the given transaction and
1470  * start new ones, so make sure everything involved in
1471  * the transaction is tidy before calling here.
1472  * Some transaction will be returned to the caller to be
1473  * committed.  The incoming transaction must already include
1474  * the inode, and both inode locks must be held exclusively.
1475  * The inode must also be "held" within the transaction.  On
1476  * return the inode will be "held" within the returned transaction.
1477  * This routine does NOT require any disk space to be reserved
1478  * for it within the transaction.
1479  *
1480  * The fork parameter must be either xfs_attr_fork or xfs_data_fork,
1481  * and it indicates the fork which is to be truncated.  For the
1482  * attribute fork we only support truncation to size 0.
1483  *
1484  * We use the sync parameter to indicate whether or not the first
1485  * transaction we perform might have to be synchronous.  For the attr fork,
1486  * it needs to be so if the unlink of the inode is not yet known to be
1487  * permanent in the log.  This keeps us from freeing and reusing the
1488  * blocks of the attribute fork before the unlink of the inode becomes
1489  * permanent.
1490  *
1491  * For the data fork, we normally have to run synchronously if we're
1492  * being called out of the inactive path or we're being called
1493  * out of the create path where we're truncating an existing file.
1494  * Either way, the truncate needs to be sync so blocks don't reappear
1495  * in the file with altered data in case of a crash.  wsync filesystems
1496  * can run the first case async because anything that shrinks the inode
1497  * has to run sync so by the time we're called here from inactive, the
1498  * inode size is permanently set to 0.
1499  *
1500  * Calls from the truncate path always need to be sync unless we're
1501  * in a wsync filesystem and the file has already been unlinked.
1502  *
1503  * The caller is responsible for correctly setting the sync parameter.
1504  * It gets too hard for us to guess here which path we're being called
1505  * out of just based on inode state.
1506  */
1507 int
1508 xfs_itruncate_finish(
1509 	xfs_trans_t	**tp,
1510 	xfs_inode_t	*ip,
1511 	xfs_fsize_t	new_size,
1512 	int		fork,
1513 	int		sync)
1514 {
1515 	xfs_fsblock_t	first_block;
1516 	xfs_fileoff_t	first_unmap_block;
1517 	xfs_fileoff_t	last_block;
1518 	xfs_filblks_t	unmap_len=0;
1519 	xfs_mount_t	*mp;
1520 	xfs_trans_t	*ntp;
1521 	int		done;
1522 	int		committed;
1523 	xfs_bmap_free_t	free_list;
1524 	int		error;
1525 
1526 	ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0);
1527 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0);
1528 	ASSERT((new_size == 0) || (new_size <= ip->i_d.di_size));
1529 	ASSERT(*tp != NULL);
1530 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
1531 	ASSERT(ip->i_transp == *tp);
1532 	ASSERT(ip->i_itemp != NULL);
1533 	ASSERT(ip->i_itemp->ili_flags & XFS_ILI_HOLD);
1534 
1535 
1536 	ntp = *tp;
1537 	mp = (ntp)->t_mountp;
1538 	ASSERT(! XFS_NOT_DQATTACHED(mp, ip));
1539 
1540 	/*
1541 	 * We only support truncating the entire attribute fork.
1542 	 */
1543 	if (fork == XFS_ATTR_FORK) {
1544 		new_size = 0LL;
1545 	}
1546 	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
1547 	xfs_itrunc_trace(XFS_ITRUNC_FINISH1, ip, 0, new_size, 0, 0);
1548 	/*
1549 	 * The first thing we do is set the size to new_size permanently
1550 	 * on disk.  This way we don't have to worry about anyone ever
1551 	 * being able to look at the data being freed even in the face
1552 	 * of a crash.  What we're getting around here is the case where
1553 	 * we free a block, it is allocated to another file, it is written
1554 	 * to, and then we crash.  If the new data gets written to the
1555 	 * file but the log buffers containing the free and reallocation
1556 	 * don't, then we'd end up with garbage in the blocks being freed.
1557 	 * As long as we make the new_size permanent before actually
1558 	 * freeing any blocks it doesn't matter if they get writtten to.
1559 	 *
1560 	 * The callers must signal into us whether or not the size
1561 	 * setting here must be synchronous.  There are a few cases
1562 	 * where it doesn't have to be synchronous.  Those cases
1563 	 * occur if the file is unlinked and we know the unlink is
1564 	 * permanent or if the blocks being truncated are guaranteed
1565 	 * to be beyond the inode eof (regardless of the link count)
1566 	 * and the eof value is permanent.  Both of these cases occur
1567 	 * only on wsync-mounted filesystems.  In those cases, we're
1568 	 * guaranteed that no user will ever see the data in the blocks
1569 	 * that are being truncated so the truncate can run async.
1570 	 * In the free beyond eof case, the file may wind up with
1571 	 * more blocks allocated to it than it needs if we crash
1572 	 * and that won't get fixed until the next time the file
1573 	 * is re-opened and closed but that's ok as that shouldn't
1574 	 * be too many blocks.
1575 	 *
1576 	 * However, we can't just make all wsync xactions run async
1577 	 * because there's one call out of the create path that needs
1578 	 * to run sync where it's truncating an existing file to size
1579 	 * 0 whose size is > 0.
1580 	 *
1581 	 * It's probably possible to come up with a test in this
1582 	 * routine that would correctly distinguish all the above
1583 	 * cases from the values of the function parameters and the
1584 	 * inode state but for sanity's sake, I've decided to let the
1585 	 * layers above just tell us.  It's simpler to correctly figure
1586 	 * out in the layer above exactly under what conditions we
1587 	 * can run async and I think it's easier for others read and
1588 	 * follow the logic in case something has to be changed.
1589 	 * cscope is your friend -- rcc.
1590 	 *
1591 	 * The attribute fork is much simpler.
1592 	 *
1593 	 * For the attribute fork we allow the caller to tell us whether
1594 	 * the unlink of the inode that led to this call is yet permanent
1595 	 * in the on disk log.  If it is not and we will be freeing extents
1596 	 * in this inode then we make the first transaction synchronous
1597 	 * to make sure that the unlink is permanent by the time we free
1598 	 * the blocks.
1599 	 */
1600 	if (fork == XFS_DATA_FORK) {
1601 		if (ip->i_d.di_nextents > 0) {
1602 			ip->i_d.di_size = new_size;
1603 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1604 		}
1605 	} else if (sync) {
1606 		ASSERT(!(mp->m_flags & XFS_MOUNT_WSYNC));
1607 		if (ip->i_d.di_anextents > 0)
1608 			xfs_trans_set_sync(ntp);
1609 	}
1610 	ASSERT(fork == XFS_DATA_FORK ||
1611 		(fork == XFS_ATTR_FORK &&
1612 			((sync && !(mp->m_flags & XFS_MOUNT_WSYNC)) ||
1613 			 (sync == 0 && (mp->m_flags & XFS_MOUNT_WSYNC)))));
1614 
1615 	/*
1616 	 * Since it is possible for space to become allocated beyond
1617 	 * the end of the file (in a crash where the space is allocated
1618 	 * but the inode size is not yet updated), simply remove any
1619 	 * blocks which show up between the new EOF and the maximum
1620 	 * possible file size.  If the first block to be removed is
1621 	 * beyond the maximum file size (ie it is the same as last_block),
1622 	 * then there is nothing to do.
1623 	 */
1624 	last_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)XFS_MAXIOFFSET(mp));
1625 	ASSERT(first_unmap_block <= last_block);
1626 	done = 0;
1627 	if (last_block == first_unmap_block) {
1628 		done = 1;
1629 	} else {
1630 		unmap_len = last_block - first_unmap_block + 1;
1631 	}
1632 	while (!done) {
1633 		/*
1634 		 * Free up up to XFS_ITRUNC_MAX_EXTENTS.  xfs_bunmapi()
1635 		 * will tell us whether it freed the entire range or
1636 		 * not.  If this is a synchronous mount (wsync),
1637 		 * then we can tell bunmapi to keep all the
1638 		 * transactions asynchronous since the unlink
1639 		 * transaction that made this inode inactive has
1640 		 * already hit the disk.  There's no danger of
1641 		 * the freed blocks being reused, there being a
1642 		 * crash, and the reused blocks suddenly reappearing
1643 		 * in this file with garbage in them once recovery
1644 		 * runs.
1645 		 */
1646 		XFS_BMAP_INIT(&free_list, &first_block);
1647 		error = xfs_bunmapi(ntp, ip, first_unmap_block,
1648 				    unmap_len,
1649 				    XFS_BMAPI_AFLAG(fork) |
1650 				      (sync ? 0 : XFS_BMAPI_ASYNC),
1651 				    XFS_ITRUNC_MAX_EXTENTS,
1652 				    &first_block, &free_list, &done);
1653 		if (error) {
1654 			/*
1655 			 * If the bunmapi call encounters an error,
1656 			 * return to the caller where the transaction
1657 			 * can be properly aborted.  We just need to
1658 			 * make sure we're not holding any resources
1659 			 * that we were not when we came in.
1660 			 */
1661 			xfs_bmap_cancel(&free_list);
1662 			return error;
1663 		}
1664 
1665 		/*
1666 		 * Duplicate the transaction that has the permanent
1667 		 * reservation and commit the old transaction.
1668 		 */
1669 		error = xfs_bmap_finish(tp, &free_list, first_block,
1670 					&committed);
1671 		ntp = *tp;
1672 		if (error) {
1673 			/*
1674 			 * If the bmap finish call encounters an error,
1675 			 * return to the caller where the transaction
1676 			 * can be properly aborted.  We just need to
1677 			 * make sure we're not holding any resources
1678 			 * that we were not when we came in.
1679 			 *
1680 			 * Aborting from this point might lose some
1681 			 * blocks in the file system, but oh well.
1682 			 */
1683 			xfs_bmap_cancel(&free_list);
1684 			if (committed) {
1685 				/*
1686 				 * If the passed in transaction committed
1687 				 * in xfs_bmap_finish(), then we want to
1688 				 * add the inode to this one before returning.
1689 				 * This keeps things simple for the higher
1690 				 * level code, because it always knows that
1691 				 * the inode is locked and held in the
1692 				 * transaction that returns to it whether
1693 				 * errors occur or not.  We don't mark the
1694 				 * inode dirty so that this transaction can
1695 				 * be easily aborted if possible.
1696 				 */
1697 				xfs_trans_ijoin(ntp, ip,
1698 					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1699 				xfs_trans_ihold(ntp, ip);
1700 			}
1701 			return error;
1702 		}
1703 
1704 		if (committed) {
1705 			/*
1706 			 * The first xact was committed,
1707 			 * so add the inode to the new one.
1708 			 * Mark it dirty so it will be logged
1709 			 * and moved forward in the log as
1710 			 * part of every commit.
1711 			 */
1712 			xfs_trans_ijoin(ntp, ip,
1713 					XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1714 			xfs_trans_ihold(ntp, ip);
1715 			xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1716 		}
1717 		ntp = xfs_trans_dup(ntp);
1718 		(void) xfs_trans_commit(*tp, 0, NULL);
1719 		*tp = ntp;
1720 		error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0,
1721 					  XFS_TRANS_PERM_LOG_RES,
1722 					  XFS_ITRUNCATE_LOG_COUNT);
1723 		/*
1724 		 * Add the inode being truncated to the next chained
1725 		 * transaction.
1726 		 */
1727 		xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1728 		xfs_trans_ihold(ntp, ip);
1729 		if (error)
1730 			return (error);
1731 	}
1732 	/*
1733 	 * Only update the size in the case of the data fork, but
1734 	 * always re-log the inode so that our permanent transaction
1735 	 * can keep on rolling it forward in the log.
1736 	 */
1737 	if (fork == XFS_DATA_FORK) {
1738 		xfs_isize_check(mp, ip, new_size);
1739 		ip->i_d.di_size = new_size;
1740 	}
1741 	xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1742 	ASSERT((new_size != 0) ||
1743 	       (fork == XFS_ATTR_FORK) ||
1744 	       (ip->i_delayed_blks == 0));
1745 	ASSERT((new_size != 0) ||
1746 	       (fork == XFS_ATTR_FORK) ||
1747 	       (ip->i_d.di_nextents == 0));
1748 	xfs_itrunc_trace(XFS_ITRUNC_FINISH2, ip, 0, new_size, 0, 0);
1749 	return 0;
1750 }
1751 
1752 
1753 /*
1754  * xfs_igrow_start
1755  *
1756  * Do the first part of growing a file: zero any data in the last
1757  * block that is beyond the old EOF.  We need to do this before
1758  * the inode is joined to the transaction to modify the i_size.
1759  * That way we can drop the inode lock and call into the buffer
1760  * cache to get the buffer mapping the EOF.
1761  */
1762 int
1763 xfs_igrow_start(
1764 	xfs_inode_t	*ip,
1765 	xfs_fsize_t	new_size,
1766 	cred_t		*credp)
1767 {
1768 	xfs_fsize_t	isize;
1769 	int		error;
1770 
1771 	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1772 	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1773 	ASSERT(new_size > ip->i_d.di_size);
1774 
1775 	error = 0;
1776 	isize = ip->i_d.di_size;
1777 	/*
1778 	 * Zero any pages that may have been created by
1779 	 * xfs_write_file() beyond the end of the file
1780 	 * and any blocks between the old and new file sizes.
1781 	 */
1782 	error = xfs_zero_eof(XFS_ITOV(ip), &ip->i_iocore, new_size, isize,
1783 				new_size);
1784 	return error;
1785 }
1786 
1787 /*
1788  * xfs_igrow_finish
1789  *
1790  * This routine is called to extend the size of a file.
1791  * The inode must have both the iolock and the ilock locked
1792  * for update and it must be a part of the current transaction.
1793  * The xfs_igrow_start() function must have been called previously.
1794  * If the change_flag is not zero, the inode change timestamp will
1795  * be updated.
1796  */
1797 void
1798 xfs_igrow_finish(
1799 	xfs_trans_t	*tp,
1800 	xfs_inode_t	*ip,
1801 	xfs_fsize_t	new_size,
1802 	int		change_flag)
1803 {
1804 	ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0);
1805 	ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0);
1806 	ASSERT(ip->i_transp == tp);
1807 	ASSERT(new_size > ip->i_d.di_size);
1808 
1809 	/*
1810 	 * Update the file size.  Update the inode change timestamp
1811 	 * if change_flag set.
1812 	 */
1813 	ip->i_d.di_size = new_size;
1814 	if (change_flag)
1815 		xfs_ichgtime(ip, XFS_ICHGTIME_CHG);
1816 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1817 
1818 }
1819 
1820 
1821 /*
1822  * This is called when the inode's link count goes to 0.
1823  * We place the on-disk inode on a list in the AGI.  It
1824  * will be pulled from this list when the inode is freed.
1825  */
1826 int
1827 xfs_iunlink(
1828 	xfs_trans_t	*tp,
1829 	xfs_inode_t	*ip)
1830 {
1831 	xfs_mount_t	*mp;
1832 	xfs_agi_t	*agi;
1833 	xfs_dinode_t	*dip;
1834 	xfs_buf_t	*agibp;
1835 	xfs_buf_t	*ibp;
1836 	xfs_agnumber_t	agno;
1837 	xfs_daddr_t	agdaddr;
1838 	xfs_agino_t	agino;
1839 	short		bucket_index;
1840 	int		offset;
1841 	int		error;
1842 	int		agi_ok;
1843 
1844 	ASSERT(ip->i_d.di_nlink == 0);
1845 	ASSERT(ip->i_d.di_mode != 0);
1846 	ASSERT(ip->i_transp == tp);
1847 
1848 	mp = tp->t_mountp;
1849 
1850 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1851 	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
1852 
1853 	/*
1854 	 * Get the agi buffer first.  It ensures lock ordering
1855 	 * on the list.
1856 	 */
1857 	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
1858 				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
1859 	if (error) {
1860 		return error;
1861 	}
1862 	/*
1863 	 * Validate the magic number of the agi block.
1864 	 */
1865 	agi = XFS_BUF_TO_AGI(agibp);
1866 	agi_ok =
1867 		be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
1868 		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
1869 	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
1870 			XFS_RANDOM_IUNLINK))) {
1871 		XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
1872 		xfs_trans_brelse(tp, agibp);
1873 		return XFS_ERROR(EFSCORRUPTED);
1874 	}
1875 	/*
1876 	 * Get the index into the agi hash table for the
1877 	 * list this inode will go on.
1878 	 */
1879 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1880 	ASSERT(agino != 0);
1881 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1882 	ASSERT(agi->agi_unlinked[bucket_index]);
1883 	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1884 
1885 	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO) {
1886 		/*
1887 		 * There is already another inode in the bucket we need
1888 		 * to add ourselves to.  Add us at the front of the list.
1889 		 * Here we put the head pointer into our next pointer,
1890 		 * and then we fall through to point the head at us.
1891 		 */
1892 		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
1893 		if (error) {
1894 			return error;
1895 		}
1896 		ASSERT(INT_GET(dip->di_next_unlinked, ARCH_CONVERT) == NULLAGINO);
1897 		ASSERT(dip->di_next_unlinked);
1898 		/* both on-disk, don't endian flip twice */
1899 		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1900 		offset = ip->i_boffset +
1901 			offsetof(xfs_dinode_t, di_next_unlinked);
1902 		xfs_trans_inode_buf(tp, ibp);
1903 		xfs_trans_log_buf(tp, ibp, offset,
1904 				  (offset + sizeof(xfs_agino_t) - 1));
1905 		xfs_inobp_check(mp, ibp);
1906 	}
1907 
1908 	/*
1909 	 * Point the bucket head pointer at the inode being inserted.
1910 	 */
1911 	ASSERT(agino != 0);
1912 	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1913 	offset = offsetof(xfs_agi_t, agi_unlinked) +
1914 		(sizeof(xfs_agino_t) * bucket_index);
1915 	xfs_trans_log_buf(tp, agibp, offset,
1916 			  (offset + sizeof(xfs_agino_t) - 1));
1917 	return 0;
1918 }
1919 
/*
 * Pull the on-disk inode from the AGI unlinked list.
 *
 * tp -- transaction used to read and log the AGI and inode buffers
 * ip -- the inode to take off the unlinked list of its AG
 *
 * The bucket is chosen from the AG-relative inode number.  If the inode
 * is the bucket head, the head is redirected to the inode's successor;
 * otherwise the list is walked from the head until the predecessor is
 * found and the predecessor's next pointer is redirected.  In both
 * cases the inode's own next pointer is reset to NULLAGINO if it was
 * not already.
 *
 * Returns 0 on success, the error reported by xfs_trans_read_buf(),
 * xfs_itobp() or xfs_inotobp(), or EFSCORRUPTED if the AGI header
 * fails validation.  Each failure is also reported via cmn_err().
 */
STATIC int
xfs_iunlink_remove(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_ino_t	next_ino;
	xfs_mount_t	*mp;
	xfs_agi_t	*agi;
	xfs_dinode_t	*dip;
	xfs_buf_t	*agibp;
	xfs_buf_t	*ibp;
	xfs_agnumber_t	agno;
	xfs_daddr_t	agdaddr;
	xfs_agino_t	agino;
	xfs_agino_t	next_agino;
	xfs_buf_t	*last_ibp;
	xfs_dinode_t	*last_dip;
	short		bucket_index;
	int		offset, last_offset;
	int		error;
	int		agi_ok;

	/*
	 * First pull the on-disk inode from the AGI unlinked list.
	 */
	mp = tp->t_mountp;

	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
	agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));

	/*
	 * Get the agi buffer first.  It ensures lock ordering
	 * on the list.
	 */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
				   XFS_FSS_TO_BB(mp, 1), 0, &agibp);
	if (error) {
		cmn_err(CE_WARN,
			"xfs_iunlink_remove: xfs_trans_read_buf()  returned an error %d on %s.  Returning error.",
			error, mp->m_fsname);
		return error;
	}
	/*
	 * Validate the magic number and version of the agi block.
	 */
	agi = XFS_BUF_TO_AGI(agibp);
	agi_ok =
		be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
		XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
	if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
			XFS_RANDOM_IUNLINK_REMOVE))) {
		XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
				     mp, agi);
		/* The AGI is not going to be modified, so release it. */
		xfs_trans_brelse(tp, agibp);
		cmn_err(CE_WARN,
			"xfs_iunlink_remove: XFS_TEST_ERROR()  returned an error on %s.  Returning EFSCORRUPTED.",
			 mp->m_fsname);
		return XFS_ERROR(EFSCORRUPTED);
	}
	/*
	 * Get the index into the agi hash table for the
	 * list this inode is expected to be on.
	 */
	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
	ASSERT(agino != 0);
	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != NULLAGINO);
	ASSERT(agi->agi_unlinked[bucket_index]);

	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
		/*
		 * We're at the head of the list.  Get the inode's
		 * on-disk buffer to see if there is anyone after us
		 * on the list.  Only modify our next pointer if it
		 * is not already NULLAGINO.  This saves us the overhead
		 * of dealing with the buffer when there is no need to
		 * change it.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
		if (error) {
			cmn_err(CE_WARN,
				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
				error, mp->m_fsname);
			return error;
		}
		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
		ASSERT(next_agino != 0);
		if (next_agino != NULLAGINO) {
			/* Clear our next pointer and log just that field. */
			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
			offset = ip->i_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			/* Nothing changed in the inode buffer; let it go. */
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the bucket head pointer at the next inode.
		 */
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
		/* Log only the modified bucket slot of the AGI. */
		offset = offsetof(xfs_agi_t, agi_unlinked) +
			(sizeof(xfs_agino_t) * bucket_index);
		xfs_trans_log_buf(tp, agibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
	} else {
		/*
		 * We need to search the list for the inode being freed.
		 * The bucket head differs from agino in this branch, so
		 * the loop body runs at least once and last_dip,
		 * last_ibp and last_offset are set before they are used
		 * below.
		 */
		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
		last_ibp = NULL;
		while (next_agino != agino) {
			/*
			 * If the last inode wasn't the one pointing to
			 * us, then release its buffer since we're not
			 * going to do anything with it.
			 */
			if (last_ibp != NULL) {
				xfs_trans_brelse(tp, last_ibp);
			}
			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
			error = xfs_inotobp(mp, tp, next_ino, &last_dip,
					    &last_ibp, &last_offset);
			if (error) {
				cmn_err(CE_WARN,
			"xfs_iunlink_remove: xfs_inotobp()  returned an error %d on %s.  Returning error.",
					error, mp->m_fsname);
				return error;
			}
			next_agino = INT_GET(last_dip->di_next_unlinked, ARCH_CONVERT);
			ASSERT(next_agino != NULLAGINO);
			ASSERT(next_agino != 0);
		}
		/*
		 * Now last_ibp points to the buffer previous to us on
		 * the unlinked list.  Pull us from the list.
		 */
		error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0);
		if (error) {
			cmn_err(CE_WARN,
				"xfs_iunlink_remove: xfs_itobp()  returned an error %d on %s.  Returning error.",
				error, mp->m_fsname);
			return error;
		}
		next_agino = INT_GET(dip->di_next_unlinked, ARCH_CONVERT);
		ASSERT(next_agino != 0);
		ASSERT(next_agino != agino);
		if (next_agino != NULLAGINO) {
			/* Clear our next pointer and log just that field. */
			INT_SET(dip->di_next_unlinked, ARCH_CONVERT, NULLAGINO);
			offset = ip->i_boffset +
				offsetof(xfs_dinode_t, di_next_unlinked);
			xfs_trans_inode_buf(tp, ibp);
			xfs_trans_log_buf(tp, ibp, offset,
					  (offset + sizeof(xfs_agino_t) - 1));
			xfs_inobp_check(mp, ibp);
		} else {
			/* Our next pointer is already NULLAGINO. */
			xfs_trans_brelse(tp, ibp);
		}
		/*
		 * Point the previous inode on the list to the next inode.
		 */
		INT_SET(last_dip->di_next_unlinked, ARCH_CONVERT, next_agino);
		ASSERT(next_agino != 0);
		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
		xfs_trans_inode_buf(tp, last_ibp);
		xfs_trans_log_buf(tp, last_ibp, offset,
				  (offset + sizeof(xfs_agino_t) - 1));
		xfs_inobp_check(mp, last_ibp);
	}
	return 0;
}
2097 
2098 static __inline__ int xfs_inode_clean(xfs_inode_t *ip)
2099 {
2100 	return (((ip->i_itemp == NULL) ||
2101 		!(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
2102 		(ip->i_update_core == 0));
2103 }
2104 
/*
 * Invalidate the on-disk inode cluster buffers of an inode chunk that
 * is being freed, and mark the in-core inodes that live in them stale.
 *
 * free_ip -- the inode whose freeing triggered this; the code below
 *	      treats its inode lock as already held by the caller
 * tp      -- the current transaction
 * inum    -- inode number of the first inode in the chunk
 *
 * For every cluster buffer of the chunk this:
 *   1. looks up each inode of the cluster in the in-core hash and, for
 *      dirty ones it can lock without blocking, sets XFS_ISTALE and
 *      remembers them in ip_found[];
 *   2. gets the cluster buffer in the transaction;
 *   3. redirects the callback of every inode log item already attached
 *      to the buffer to xfs_istale_done;
 *   4. attaches the log items of the remembered inodes to the buffer
 *      with the same callback;
 *   5. invalidates the buffer with xfs_trans_binval().
 */
STATIC void
xfs_ifree_cluster(
	xfs_inode_t	*free_ip,
	xfs_trans_t	*tp,
	xfs_ino_t	inum)
{
	xfs_mount_t		*mp = free_ip->i_mount;
	int			blks_per_cluster;
	int			nbufs;
	int			ninodes;
	int			i, j, found, pre_flushed;
	xfs_daddr_t		blkno;
	xfs_buf_t		*bp;
	xfs_ihash_t		*ih;
	xfs_inode_t		*ip, **ip_found;
	xfs_inode_log_item_t	*iip;
	xfs_log_item_t		*lip;
	SPLDECL(s);

	/*
	 * Work out how many blocks and inodes one buffer covers and how
	 * many such buffers make up the chunk.  When a filesystem block
	 * is at least as large as an inode cluster, each buffer is a
	 * single block.
	 */
	if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) {
		blks_per_cluster = 1;
		ninodes = mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp);
	} else {
		blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) /
					mp->m_sb.sb_blocksize;
		ninodes = blks_per_cluster * mp->m_sb.sb_inopblock;
		nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
	}

	/* Scratch array: at most one entry per inode in a buffer. */
	ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);

	for (j = 0; j < nbufs; j++, inum += ninodes) {
		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
					 XFS_INO_TO_AGBNO(mp, inum));


		/*
		 * Look for each inode in memory and attempt to lock it,
		 * we can be racing with flush and tail pushing here.
		 * any inode we get the locks on, add to an array of
		 * inode items to process later.
		 *
		 * Then get the buffer lock, we could beat a flush
		 * or tail pushing thread to the lock here, in which
		 * case they will go looking for the inode buffer
		 * and fail, we need some other form of interlock
		 * here.
		 */
		found = 0;
		for (i = 0; i < ninodes; i++) {
			/* Search the hash chain for inode number inum + i. */
			ih = XFS_IHASH(mp, inum + i);
			read_lock(&ih->ih_lock);
			for (ip = ih->ih_next; ip != NULL; ip = ip->i_next) {
				if (ip->i_ino == inum + i)
					break;
			}

			/* Inode not in memory or we found it already,
			 * nothing to do
			 */
			if (!ip || (ip->i_flags & XFS_ISTALE)) {
				read_unlock(&ih->ih_lock);
				continue;
			}

			/* Clean inodes need no further handling here. */
			if (xfs_inode_clean(ip)) {
				read_unlock(&ih->ih_lock);
				continue;
			}

			/* If we can get the locks then add it to the
			 * list, otherwise by the time we get the bp lock
			 * below it will already be attached to the
			 * inode buffer.
			 */

			/* This inode will already be locked - by us, lets
			 * keep it that way.
			 */

			if (ip == free_ip) {
				/* Only the flush lock is taken for free_ip. */
				if (xfs_iflock_nowait(ip)) {
					ip->i_flags |= XFS_ISTALE;

					/*
					 * Re-check now that the flush lock
					 * is held; if the inode is clean,
					 * drop the flush lock again.
					 */
					if (xfs_inode_clean(ip)) {
						xfs_ifunlock(ip);
					} else {
						ip_found[found++] = ip;
					}
				}
				read_unlock(&ih->ih_lock);
				continue;
			}

			/*
			 * Any other inode: try for the inode lock and then
			 * the flush lock, backing out of whatever was taken
			 * if the inode turns out clean or the flush lock is
			 * unavailable.
			 */
			if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
				if (xfs_iflock_nowait(ip)) {
					ip->i_flags |= XFS_ISTALE;

					if (xfs_inode_clean(ip)) {
						xfs_ifunlock(ip);
						xfs_iunlock(ip, XFS_ILOCK_EXCL);
					} else {
						ip_found[found++] = ip;
					}
				} else {
					xfs_iunlock(ip, XFS_ILOCK_EXCL);
				}
			}

			read_unlock(&ih->ih_lock);
		}

		/* Get the cluster buffer within the transaction. */
		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
					mp->m_bsize * blks_per_cluster,
					XFS_BUF_LOCK);

		/*
		 * Walk the log items already attached to the buffer.  Each
		 * inode log item gets xfs_istale_done as its callback, has
		 * its flush lsn set from its current lsn under the AIL
		 * lock, and has its inode marked stale.
		 */
		pre_flushed = 0;
		lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
		while (lip) {
			if (lip->li_type == XFS_LI_INODE) {
				iip = (xfs_inode_log_item_t *)lip;
				ASSERT(iip->ili_logged == 1);
				lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
				AIL_LOCK(mp,s);
				iip->ili_flush_lsn = iip->ili_item.li_lsn;
				AIL_UNLOCK(mp, s);
				iip->ili_inode->i_flags |= XFS_ISTALE;
				pre_flushed++;
			}
			lip = lip->li_bio_list;
		}

		/* Now process the inodes that were locked above. */
		for (i = 0; i < found; i++) {
			ip = ip_found[i];
			iip = ip->i_itemp;

			/*
			 * No log item: there is nothing to attach, so clear
			 * i_update_core and release both locks.
			 */
			if (!iip) {
				ip->i_update_core = 0;
				xfs_ifunlock(ip);
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
				continue;
			}

			/*
			 * Move the logged fields to ili_last_fields, clear
			 * the current ones and mark the item logged.
			 */
			iip->ili_last_fields = iip->ili_format.ilf_fields;
			iip->ili_format.ilf_fields = 0;
			iip->ili_logged = 1;
			AIL_LOCK(mp,s);
			iip->ili_flush_lsn = iip->ili_item.li_lsn;
			AIL_UNLOCK(mp, s);

			xfs_buf_attach_iodone(bp,
				(void(*)(xfs_buf_t*,xfs_log_item_t*))
				xfs_istale_done, (xfs_log_item_t *)iip);
			/*
			 * The inode lock of free_ip was not taken in this
			 * function, so it is not dropped here either.
			 */
			if (ip != free_ip) {
				xfs_iunlock(ip, XFS_ILOCK_EXCL);
			}
		}

		/*
		 * Flag the buffer as a stale inode buffer only if inode
		 * log items are attached to it; invalidate it in either
		 * case.
		 */
		if (found || pre_flushed)
			xfs_trans_stale_inode_buf(tp, bp);
		xfs_trans_binval(tp, bp);
	}

	kmem_free(ip_found, ninodes * sizeof(xfs_inode_t *));
}
2271 
2272 /*
2273  * This is called to return an inode to the inode free list.
2274  * The inode should already be truncated to 0 length and have
2275  * no pages associated with it.  This routine also assumes that
2276  * the inode is already a part of the transaction.
2277  *
2278  * The on-disk copy of the inode will have been added to the list
2279  * of unlinked inodes in the AGI. We need to remove the inode from
2280  * that list atomically with respect to freeing it here.
2281  */
2282 int
2283 xfs_ifree(
2284 	xfs_trans_t	*tp,
2285 	xfs_inode_t	*ip,
2286 	xfs_bmap_free_t	*flist)
2287 {
2288 	int			error;
2289 	int			delete;
2290 	xfs_ino_t		first_ino;
2291 
2292 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));
2293 	ASSERT(ip->i_transp == tp);
2294 	ASSERT(ip->i_d.di_nlink == 0);
2295 	ASSERT(ip->i_d.di_nextents == 0);
2296 	ASSERT(ip->i_d.di_anextents == 0);
2297 	ASSERT((ip->i_d.di_size == 0) ||
2298 	       ((ip->i_d.di_mode & S_IFMT) != S_IFREG));
2299 	ASSERT(ip->i_d.di_nblocks == 0);
2300 
2301 	/*
2302 	 * Pull the on-disk inode from the AGI unlinked list.
2303 	 */
2304 	error = xfs_iunlink_remove(tp, ip);
2305 	if (error != 0) {
2306 		return error;
2307 	}
2308 
2309 	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2310 	if (error != 0) {
2311 		return error;
2312 	}
2313 	ip->i_d.di_mode = 0;		/* mark incore inode as free */
2314 	ip->i_d.di_flags = 0;
2315 	ip->i_d.di_dmevmask = 0;
2316 	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
2317 	ip->i_df.if_ext_max =
2318 		XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
2319 	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2320 	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2321 	/*
2322 	 * Bump the generation count so no one will be confused
2323 	 * by reincarnations of this inode.
2324 	 */
2325 	ip->i_d.di_gen++;
2326 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2327 
2328 	if (delete) {
2329 		xfs_ifree_cluster(ip, tp, first_ino);
2330 	}
2331 
2332 	return 0;
2333 }
2334 
2335 /*
2336  * Reallocate the space for if_broot based on the number of records
2337  * being added or deleted as indicated in rec_diff.  Move the records
2338  * and pointers in if_broot to fit the new size.  When shrinking this
2339  * will eliminate holes between the records and pointers created by
2340  * the caller.  When growing this will create holes to be filled in
2341  * by the caller.
2342  *
2343  * The caller must not request to add more records than would fit in
2344  * the on-disk inode root.  If the if_broot is currently NULL, then
2345  * if we adding records one will be allocated.  The caller must also
2346  * not request that the number of records go below zero, although
2347  * it can go to zero.
2348  *
2349  * ip -- the inode whose if_broot area is changing
2350  * ext_diff -- the change in the number of records, positive or negative,
2351  *	 requested for the if_broot array.
2352  */
2353 void
2354 xfs_iroot_realloc(
2355 	xfs_inode_t		*ip,
2356 	int			rec_diff,
2357 	int			whichfork)
2358 {
2359 	int			cur_max;
2360 	xfs_ifork_t		*ifp;
2361 	xfs_bmbt_block_t	*new_broot;
2362 	int			new_max;
2363 	size_t			new_size;
2364 	char			*np;
2365 	char			*op;
2366 
2367 	/*
2368 	 * Handle the degenerate case quietly.
2369 	 */
2370 	if (rec_diff == 0) {
2371 		return;
2372 	}
2373 
2374 	ifp = XFS_IFORK_PTR(ip, whichfork);
2375 	if (rec_diff > 0) {
2376 		/*
2377 		 * If there wasn't any memory allocated before, just
2378 		 * allocate it now and get out.
2379 		 */
2380 		if (ifp->if_broot_bytes == 0) {
2381 			new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
2382 			ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
2383 								     KM_SLEEP);
2384 			ifp->if_broot_bytes = (int)new_size;
2385 			return;
2386 		}
2387 
2388 		/*
2389 		 * If there is already an existing if_broot, then we need
2390 		 * to realloc() it and shift the pointers to their new
2391 		 * location.  The records don't change location because
2392 		 * they are kept butted up against the btree block header.
2393 		 */
2394 		cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2395 		new_max = cur_max + rec_diff;
2396 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2397 		ifp->if_broot = (xfs_bmbt_block_t *)
2398 		  kmem_realloc(ifp->if_broot,
2399 				new_size,
2400 				(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
2401 				KM_SLEEP);
2402 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2403 						      ifp->if_broot_bytes);
2404 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2405 						      (int)new_size);
2406 		ifp->if_broot_bytes = (int)new_size;
2407 		ASSERT(ifp->if_broot_bytes <=
2408 			XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2409 		memmove(np, op, cur_max * (uint)sizeof(xfs_dfsbno_t));
2410 		return;
2411 	}
2412 
2413 	/*
2414 	 * rec_diff is less than 0.  In this case, we are shrinking the
2415 	 * if_broot buffer.  It must already exist.  If we go to zero
2416 	 * records, just get rid of the root and clear the status bit.
2417 	 */
2418 	ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
2419 	cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
2420 	new_max = cur_max + rec_diff;
2421 	ASSERT(new_max >= 0);
2422 	if (new_max > 0)
2423 		new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
2424 	else
2425 		new_size = 0;
2426 	if (new_size > 0) {
2427 		new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
2428 		/*
2429 		 * First copy over the btree block header.
2430 		 */
2431 		memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
2432 	} else {
2433 		new_broot = NULL;
2434 		ifp->if_flags &= ~XFS_IFBROOT;
2435 	}
2436 
2437 	/*
2438 	 * Only copy the records and pointers if there are any.
2439 	 */
2440 	if (new_max > 0) {
2441 		/*
2442 		 * First copy the records.
2443 		 */
2444 		op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
2445 						     ifp->if_broot_bytes);
2446 		np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
2447 						     (int)new_size);
2448 		memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
2449 
2450 		/*
2451 		 * Then copy the pointers.
2452 		 */
2453 		op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
2454 						     ifp->if_broot_bytes);
2455 		np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
2456 						     (int)new_size);
2457 		memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
2458 	}
2459 	kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2460 	ifp->if_broot = new_broot;
2461 	ifp->if_broot_bytes = (int)new_size;
2462 	ASSERT(ifp->if_broot_bytes <=
2463 		XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
2464 	return;
2465 }
2466 
2467 
2468 /*
2469  * This is called when the amount of space needed for if_extents
2470  * is increased or decreased.  The change in size is indicated by
2471  * the number of extents that need to be added or deleted in the
2472  * ext_diff parameter.
2473  *
2474  * If the amount of space needed has decreased below the size of the
2475  * inline buffer, then switch to using the inline buffer.  Otherwise,
2476  * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2477  * to what is needed.
2478  *
2479  * ip -- the inode whose if_extents area is changing
2480  * ext_diff -- the change in the number of extents, positive or negative,
2481  *	 requested for the if_extents array.
2482  */
2483 void
2484 xfs_iext_realloc(
2485 	xfs_inode_t	*ip,
2486 	int		ext_diff,
2487 	int		whichfork)
2488 {
2489 	int		byte_diff;
2490 	xfs_ifork_t	*ifp;
2491 	int		new_size;
2492 	uint		rnew_size;
2493 
2494 	if (ext_diff == 0) {
2495 		return;
2496 	}
2497 
2498 	ifp = XFS_IFORK_PTR(ip, whichfork);
2499 	byte_diff = ext_diff * (uint)sizeof(xfs_bmbt_rec_t);
2500 	new_size = (int)ifp->if_bytes + byte_diff;
2501 	ASSERT(new_size >= 0);
2502 
2503 	if (new_size == 0) {
2504 		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2505 			ASSERT(ifp->if_real_bytes != 0);
2506 			kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2507 		}
2508 		ifp->if_u1.if_extents = NULL;
2509 		rnew_size = 0;
2510 	} else if (new_size <= sizeof(ifp->if_u2.if_inline_ext)) {
2511 		/*
2512 		 * If the valid extents can fit in if_inline_ext,
2513 		 * copy them from the malloc'd vector and free it.
2514 		 */
2515 		if (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext) {
2516 			/*
2517 			 * For now, empty files are format EXTENTS,
2518 			 * so the if_extents pointer is null.
2519 			 */
2520 			if (ifp->if_u1.if_extents) {
2521 				memcpy(ifp->if_u2.if_inline_ext,
2522 					ifp->if_u1.if_extents, new_size);
2523 				kmem_free(ifp->if_u1.if_extents,
2524 					  ifp->if_real_bytes);
2525 			}
2526 			ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext;
2527 		}
2528 		rnew_size = 0;
2529 	} else {
2530 		rnew_size = new_size;
2531 		if ((rnew_size & (rnew_size - 1)) != 0)
2532 			rnew_size = xfs_iroundup(rnew_size);
2533 		/*
2534 		 * Stuck with malloc/realloc.
2535 		 */
2536 		if (ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext) {
2537 			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2538 				kmem_alloc(rnew_size, KM_SLEEP);
2539 			memcpy(ifp->if_u1.if_extents, ifp->if_u2.if_inline_ext,
2540 			      sizeof(ifp->if_u2.if_inline_ext));
2541 		} else if (rnew_size != ifp->if_real_bytes) {
2542 			ifp->if_u1.if_extents = (xfs_bmbt_rec_t *)
2543 			  kmem_realloc(ifp->if_u1.if_extents,
2544 					rnew_size,
2545 					ifp->if_real_bytes,
2546 					KM_NOFS);
2547 		}
2548 	}
2549 	ifp->if_real_bytes = rnew_size;
2550 	ifp->if_bytes = new_size;
2551 }
2552 
2553 
2554 /*
2555  * This is called when the amount of space needed for if_data
2556  * is increased or decreased.  The change in size is indicated by
2557  * the number of bytes that need to be added or deleted in the
2558  * byte_diff parameter.
2559  *
2560  * If the amount of space needed has decreased below the size of the
2561  * inline buffer, then switch to using the inline buffer.  Otherwise,
2562  * use kmem_realloc() or kmem_alloc() to adjust the size of the buffer
2563  * to what is needed.
2564  *
2565  * ip -- the inode whose if_data area is changing
2566  * byte_diff -- the change in the number of bytes, positive or negative,
2567  *	 requested for the if_data array.
2568  */
2569 void
2570 xfs_idata_realloc(
2571 	xfs_inode_t	*ip,
2572 	int		byte_diff,
2573 	int		whichfork)
2574 {
2575 	xfs_ifork_t	*ifp;
2576 	int		new_size;
2577 	int		real_size;
2578 
2579 	if (byte_diff == 0) {
2580 		return;
2581 	}
2582 
2583 	ifp = XFS_IFORK_PTR(ip, whichfork);
2584 	new_size = (int)ifp->if_bytes + byte_diff;
2585 	ASSERT(new_size >= 0);
2586 
2587 	if (new_size == 0) {
2588 		if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2589 			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2590 		}
2591 		ifp->if_u1.if_data = NULL;
2592 		real_size = 0;
2593 	} else if (new_size <= sizeof(ifp->if_u2.if_inline_data)) {
2594 		/*
2595 		 * If the valid extents/data can fit in if_inline_ext/data,
2596 		 * copy them from the malloc'd vector and free it.
2597 		 */
2598 		if (ifp->if_u1.if_data == NULL) {
2599 			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2600 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2601 			ASSERT(ifp->if_real_bytes != 0);
2602 			memcpy(ifp->if_u2.if_inline_data, ifp->if_u1.if_data,
2603 			      new_size);
2604 			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2605 			ifp->if_u1.if_data = ifp->if_u2.if_inline_data;
2606 		}
2607 		real_size = 0;
2608 	} else {
2609 		/*
2610 		 * Stuck with malloc/realloc.
2611 		 * For inline data, the underlying buffer must be
2612 		 * a multiple of 4 bytes in size so that it can be
2613 		 * logged and stay on word boundaries.  We enforce
2614 		 * that here.
2615 		 */
2616 		real_size = roundup(new_size, 4);
2617 		if (ifp->if_u1.if_data == NULL) {
2618 			ASSERT(ifp->if_real_bytes == 0);
2619 			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2620 		} else if (ifp->if_u1.if_data != ifp->if_u2.if_inline_data) {
2621 			/*
2622 			 * Only do the realloc if the underlying size
2623 			 * is really changing.
2624 			 */
2625 			if (ifp->if_real_bytes != real_size) {
2626 				ifp->if_u1.if_data =
2627 					kmem_realloc(ifp->if_u1.if_data,
2628 							real_size,
2629 							ifp->if_real_bytes,
2630 							KM_SLEEP);
2631 			}
2632 		} else {
2633 			ASSERT(ifp->if_real_bytes == 0);
2634 			ifp->if_u1.if_data = kmem_alloc(real_size, KM_SLEEP);
2635 			memcpy(ifp->if_u1.if_data, ifp->if_u2.if_inline_data,
2636 				ifp->if_bytes);
2637 		}
2638 	}
2639 	ifp->if_real_bytes = real_size;
2640 	ifp->if_bytes = new_size;
2641 	ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2642 }
2643 
2644 
2645 
2646 
2647 /*
2648  * Map inode to disk block and offset.
2649  *
2650  * mp -- the mount point structure for the current file system
2651  * tp -- the current transaction
2652  * ino -- the inode number of the inode to be located
2653  * imap -- this structure is filled in with the information necessary
2654  *	 to retrieve the given inode from disk
2655  * flags -- flags to pass to xfs_dilocate indicating whether or not
2656  *	 lookups in the inode btree were OK or not
2657  */
2658 int
2659 xfs_imap(
2660 	xfs_mount_t	*mp,
2661 	xfs_trans_t	*tp,
2662 	xfs_ino_t	ino,
2663 	xfs_imap_t	*imap,
2664 	uint		flags)
2665 {
2666 	xfs_fsblock_t	fsbno;
2667 	int		len;
2668 	int		off;
2669 	int		error;
2670 
2671 	fsbno = imap->im_blkno ?
2672 		XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2673 	error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2674 	if (error != 0) {
2675 		return error;
2676 	}
2677 	imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2678 	imap->im_len = XFS_FSB_TO_BB(mp, len);
2679 	imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2680 	imap->im_ioffset = (ushort)off;
2681 	imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2682 	return 0;
2683 }
2684 
2685 void
2686 xfs_idestroy_fork(
2687 	xfs_inode_t	*ip,
2688 	int		whichfork)
2689 {
2690 	xfs_ifork_t	*ifp;
2691 
2692 	ifp = XFS_IFORK_PTR(ip, whichfork);
2693 	if (ifp->if_broot != NULL) {
2694 		kmem_free(ifp->if_broot, ifp->if_broot_bytes);
2695 		ifp->if_broot = NULL;
2696 	}
2697 
2698 	/*
2699 	 * If the format is local, then we can't have an extents
2700 	 * array so just look for an inline data array.  If we're
2701 	 * not local then we may or may not have an extents list,
2702 	 * so check and free it up if we do.
2703 	 */
2704 	if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) {
2705 		if ((ifp->if_u1.if_data != ifp->if_u2.if_inline_data) &&
2706 		    (ifp->if_u1.if_data != NULL)) {
2707 			ASSERT(ifp->if_real_bytes != 0);
2708 			kmem_free(ifp->if_u1.if_data, ifp->if_real_bytes);
2709 			ifp->if_u1.if_data = NULL;
2710 			ifp->if_real_bytes = 0;
2711 		}
2712 	} else if ((ifp->if_flags & XFS_IFEXTENTS) &&
2713 		   (ifp->if_u1.if_extents != NULL) &&
2714 		   (ifp->if_u1.if_extents != ifp->if_u2.if_inline_ext)) {
2715 		ASSERT(ifp->if_real_bytes != 0);
2716 		kmem_free(ifp->if_u1.if_extents, ifp->if_real_bytes);
2717 		ifp->if_u1.if_extents = NULL;
2718 		ifp->if_real_bytes = 0;
2719 	}
2720 	ASSERT(ifp->if_u1.if_extents == NULL ||
2721 	       ifp->if_u1.if_extents == ifp->if_u2.if_inline_ext);
2722 	ASSERT(ifp->if_real_bytes == 0);
2723 	if (whichfork == XFS_ATTR_FORK) {
2724 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
2725 		ip->i_afp = NULL;
2726 	}
2727 }
2728 
2729 /*
2730  * This is called free all the memory associated with an inode.
2731  * It must free the inode itself and any buffers allocated for
2732  * if_extents/if_data and if_broot.  It must also free the lock
2733  * associated with the inode.
2734  */
2735 void
2736 xfs_idestroy(
2737 	xfs_inode_t	*ip)
2738 {
2739 
2740 	switch (ip->i_d.di_mode & S_IFMT) {
2741 	case S_IFREG:
2742 	case S_IFDIR:
2743 	case S_IFLNK:
2744 		xfs_idestroy_fork(ip, XFS_DATA_FORK);
2745 		break;
2746 	}
2747 	if (ip->i_afp)
2748 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
2749 	mrfree(&ip->i_lock);
2750 	mrfree(&ip->i_iolock);
2751 	freesema(&ip->i_flock);
2752 #ifdef XFS_BMAP_TRACE
2753 	ktrace_free(ip->i_xtrace);
2754 #endif
2755 #ifdef XFS_BMBT_TRACE
2756 	ktrace_free(ip->i_btrace);
2757 #endif
2758 #ifdef XFS_RW_TRACE
2759 	ktrace_free(ip->i_rwtrace);
2760 #endif
2761 #ifdef XFS_ILOCK_TRACE
2762 	ktrace_free(ip->i_lock_trace);
2763 #endif
2764 #ifdef XFS_DIR2_TRACE
2765 	ktrace_free(ip->i_dir_trace);
2766 #endif
2767 	if (ip->i_itemp) {
2768 		/* XXXdpd should be able to assert this but shutdown
2769 		 * is leaving the AIL behind. */
2770 		ASSERT(((ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL) == 0) ||
2771 		       XFS_FORCED_SHUTDOWN(ip->i_mount));
2772 		xfs_inode_item_destroy(ip);
2773 	}
2774 	kmem_zone_free(xfs_inode_zone, ip);
2775 }
2776 
2777 
/*
 * Increment the pin count of the given inode.
 * The count (ip->i_pincount) is updated with an atomic increment.
 * The caller must hold the inode lock for update.
 */
void
xfs_ipin(
	xfs_inode_t	*ip)
{
	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE));

	atomic_inc(&ip->i_pincount);
}
2790 
2791 /*
2792  * Decrement the pin count of the given inode, and wake up
2793  * anyone in xfs_iwait_unpin() if the count goes to 0.  The
2794  * inode must have been previoulsy pinned with a call to xfs_ipin().
2795  */
2796 void
2797 xfs_iunpin(
2798 	xfs_inode_t	*ip)
2799 {
2800 	ASSERT(atomic_read(&ip->i_pincount) > 0);
2801 
2802 	if (atomic_dec_and_test(&ip->i_pincount)) {
2803 		vnode_t	*vp = XFS_ITOV_NULL(ip);
2804 
2805 		/* make sync come back and flush this inode */
2806 		if (vp) {
2807 			struct inode	*inode = LINVFS_GET_IP(vp);
2808 
2809 			if (!(inode->i_state & I_NEW))
2810 				mark_inode_dirty_sync(inode);
2811 		}
2812 
2813 		wake_up(&ip->i_ipin_wait);
2814 	}
2815 }
2816 
2817 /*
2818  * This is called to wait for the given inode to be unpinned.
2819  * It will sleep until this happens.  The caller must have the
2820  * inode locked in at least shared mode so that the buffer cannot
2821  * be subsequently pinned once someone is waiting for it to be
2822  * unpinned.
2823  */
2824 STATIC void
2825 xfs_iunpin_wait(
2826 	xfs_inode_t	*ip)
2827 {
2828 	xfs_inode_log_item_t	*iip;
2829 	xfs_lsn_t	lsn;
2830 
2831 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2832 
2833 	if (atomic_read(&ip->i_pincount) == 0) {
2834 		return;
2835 	}
2836 
2837 	iip = ip->i_itemp;
2838 	if (iip && iip->ili_last_lsn) {
2839 		lsn = iip->ili_last_lsn;
2840 	} else {
2841 		lsn = (xfs_lsn_t)0;
2842 	}
2843 
2844 	/*
2845 	 * Give the log a push so we don't wait here too long.
2846 	 */
2847 	xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE);
2848 
2849 	wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2850 }
2851 
2852 
2853 /*
2854  * xfs_iextents_copy()
2855  *
2856  * This is called to copy the REAL extents (as opposed to the delayed
2857  * allocation extents) from the inode into the given buffer.  It
2858  * returns the number of bytes copied into the buffer.
2859  *
2860  * If there are no delayed allocation extents, then we can just
2861  * memcpy() the extents into the buffer.  Otherwise, we need to
2862  * examine each extent in turn and skip those which are delayed.
2863  */
2864 int
2865 xfs_iextents_copy(
2866 	xfs_inode_t		*ip,
2867 	xfs_bmbt_rec_t		*buffer,
2868 	int			whichfork)
2869 {
2870 	int			copied;
2871 	xfs_bmbt_rec_t		*dest_ep;
2872 	xfs_bmbt_rec_t		*ep;
2873 #ifdef XFS_BMAP_TRACE
2874 	static char		fname[] = "xfs_iextents_copy";
2875 #endif
2876 	int			i;
2877 	xfs_ifork_t		*ifp;
2878 	int			nrecs;
2879 	xfs_fsblock_t		start_block;
2880 
2881 	ifp = XFS_IFORK_PTR(ip, whichfork);
2882 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
2883 	ASSERT(ifp->if_bytes > 0);
2884 
2885 	nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
2886 	xfs_bmap_trace_exlist(fname, ip, nrecs, whichfork);
2887 	ASSERT(nrecs > 0);
2888 
2889 	/*
2890 	 * There are some delayed allocation extents in the
2891 	 * inode, so copy the extents one at a time and skip
2892 	 * the delayed ones.  There must be at least one
2893 	 * non-delayed extent.
2894 	 */
2895 	ep = ifp->if_u1.if_extents;
2896 	dest_ep = buffer;
2897 	copied = 0;
2898 	for (i = 0; i < nrecs; i++) {
2899 		start_block = xfs_bmbt_get_startblock(ep);
2900 		if (ISNULLSTARTBLOCK(start_block)) {
2901 			/*
2902 			 * It's a delayed allocation extent, so skip it.
2903 			 */
2904 			ep++;
2905 			continue;
2906 		}
2907 
2908 		/* Translate to on disk format */
2909 		put_unaligned(INT_GET(ep->l0, ARCH_CONVERT),
2910 			      (__uint64_t*)&dest_ep->l0);
2911 		put_unaligned(INT_GET(ep->l1, ARCH_CONVERT),
2912 			      (__uint64_t*)&dest_ep->l1);
2913 		dest_ep++;
2914 		ep++;
2915 		copied++;
2916 	}
2917 	ASSERT(copied != 0);
2918 	xfs_validate_extents(buffer, copied, 1, XFS_EXTFMT_INODE(ip));
2919 
2920 	return (copied * (uint)sizeof(xfs_bmbt_rec_t));
2921 }
2922 
2923 /*
2924  * Each of the following cases stores data into the same region
2925  * of the on-disk inode, so only one of them can be valid at
2926  * any given time. While it is possible to have conflicting formats
2927  * and log flags, e.g. having XFS_ILOG_?DATA set when the fork is
2928  * in EXTENTS format, this can only happen when the fork has
2929  * changed formats after being modified but before being flushed.
2930  * In these cases, the format always takes precedence, because the
2931  * format indicates the current state of the fork.
2932  */
2933 /*ARGSUSED*/
2934 STATIC int
2935 xfs_iflush_fork(
2936 	xfs_inode_t		*ip,
2937 	xfs_dinode_t		*dip,
2938 	xfs_inode_log_item_t	*iip,
2939 	int			whichfork,
2940 	xfs_buf_t		*bp)
2941 {
2942 	char			*cp;
2943 	xfs_ifork_t		*ifp;
2944 	xfs_mount_t		*mp;
2945 #ifdef XFS_TRANS_DEBUG
2946 	int			first;
2947 #endif
2948 	static const short	brootflag[2] =
2949 		{ XFS_ILOG_DBROOT, XFS_ILOG_ABROOT };
2950 	static const short	dataflag[2] =
2951 		{ XFS_ILOG_DDATA, XFS_ILOG_ADATA };
2952 	static const short	extflag[2] =
2953 		{ XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2954 
2955 	if (iip == NULL)
2956 		return 0;
2957 	ifp = XFS_IFORK_PTR(ip, whichfork);
2958 	/*
2959 	 * This can happen if we gave up in iformat in an error path,
2960 	 * for the attribute fork.
2961 	 */
2962 	if (ifp == NULL) {
2963 		ASSERT(whichfork == XFS_ATTR_FORK);
2964 		return 0;
2965 	}
2966 	cp = XFS_DFORK_PTR(dip, whichfork);
2967 	mp = ip->i_mount;
2968 	switch (XFS_IFORK_FORMAT(ip, whichfork)) {
2969 	case XFS_DINODE_FMT_LOCAL:
2970 		if ((iip->ili_format.ilf_fields & dataflag[whichfork]) &&
2971 		    (ifp->if_bytes > 0)) {
2972 			ASSERT(ifp->if_u1.if_data != NULL);
2973 			ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
2974 			memcpy(cp, ifp->if_u1.if_data, ifp->if_bytes);
2975 		}
2976 		if (whichfork == XFS_DATA_FORK) {
2977 			if (unlikely(XFS_DIR_SHORTFORM_VALIDATE_ONDISK(mp, dip))) {
2978 				XFS_ERROR_REPORT("xfs_iflush_fork",
2979 						 XFS_ERRLEVEL_LOW, mp);
2980 				return XFS_ERROR(EFSCORRUPTED);
2981 			}
2982 		}
2983 		break;
2984 
2985 	case XFS_DINODE_FMT_EXTENTS:
2986 		ASSERT((ifp->if_flags & XFS_IFEXTENTS) ||
2987 		       !(iip->ili_format.ilf_fields & extflag[whichfork]));
2988 		ASSERT((ifp->if_u1.if_extents != NULL) || (ifp->if_bytes == 0));
2989 		ASSERT((ifp->if_u1.if_extents == NULL) || (ifp->if_bytes > 0));
2990 		if ((iip->ili_format.ilf_fields & extflag[whichfork]) &&
2991 		    (ifp->if_bytes > 0)) {
2992 			ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) > 0);
2993 			(void)xfs_iextents_copy(ip, (xfs_bmbt_rec_t *)cp,
2994 				whichfork);
2995 		}
2996 		break;
2997 
2998 	case XFS_DINODE_FMT_BTREE:
2999 		if ((iip->ili_format.ilf_fields & brootflag[whichfork]) &&
3000 		    (ifp->if_broot_bytes > 0)) {
3001 			ASSERT(ifp->if_broot != NULL);
3002 			ASSERT(ifp->if_broot_bytes <=
3003 			       (XFS_IFORK_SIZE(ip, whichfork) +
3004 				XFS_BROOT_SIZE_ADJ));
3005 			xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
3006 				(xfs_bmdr_block_t *)cp,
3007 				XFS_DFORK_SIZE(dip, mp, whichfork));
3008 		}
3009 		break;
3010 
3011 	case XFS_DINODE_FMT_DEV:
3012 		if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
3013 			ASSERT(whichfork == XFS_DATA_FORK);
3014 			INT_SET(dip->di_u.di_dev, ARCH_CONVERT, ip->i_df.if_u2.if_rdev);
3015 		}
3016 		break;
3017 
3018 	case XFS_DINODE_FMT_UUID:
3019 		if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
3020 			ASSERT(whichfork == XFS_DATA_FORK);
3021 			memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
3022 				sizeof(uuid_t));
3023 		}
3024 		break;
3025 
3026 	default:
3027 		ASSERT(0);
3028 		break;
3029 	}
3030 
3031 	return 0;
3032 }
3033 
3034 /*
3035  * xfs_iflush() will write a modified inode's changes out to the
3036  * inode's on disk home.  The caller must have the inode lock held
3037  * in at least shared mode and the inode flush semaphore must be
3038  * held as well.  The inode lock will still be held upon return from
3039  * the call and the caller is free to unlock it.
3040  * The inode flush lock will be unlocked when the inode reaches the disk.
3041  * The flags indicate how the inode's buffer should be written out.
3042  */
3043 int
3044 xfs_iflush(
3045 	xfs_inode_t		*ip,
3046 	uint			flags)
3047 {
3048 	xfs_inode_log_item_t	*iip;
3049 	xfs_buf_t		*bp;
3050 	xfs_dinode_t		*dip;
3051 	xfs_mount_t		*mp;
3052 	int			error;
3053 	/* REFERENCED */
3054 	xfs_chash_t		*ch;
3055 	xfs_inode_t		*iq;
3056 	int			clcount;	/* count of inodes clustered */
3057 	int			bufwasdelwri;
3058 	enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3059 	SPLDECL(s);
3060 
3061 	XFS_STATS_INC(xs_iflush_count);
3062 
3063 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3064 	ASSERT(valusema(&ip->i_flock) <= 0);
3065 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3066 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
3067 
3068 	iip = ip->i_itemp;
3069 	mp = ip->i_mount;
3070 
3071 	/*
3072 	 * If the inode isn't dirty, then just release the inode
3073 	 * flush lock and do nothing.
3074 	 */
3075 	if ((ip->i_update_core == 0) &&
3076 	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3077 		ASSERT((iip != NULL) ?
3078 			 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3079 		xfs_ifunlock(ip);
3080 		return 0;
3081 	}
3082 
3083 	/*
3084 	 * We can't flush the inode until it is unpinned, so
3085 	 * wait for it.  We know noone new can pin it, because
3086 	 * we are holding the inode lock shared and you need
3087 	 * to hold it exclusively to pin the inode.
3088 	 */
3089 	xfs_iunpin_wait(ip);
3090 
3091 	/*
3092 	 * This may have been unpinned because the filesystem is shutting
3093 	 * down forcibly. If that's the case we must not write this inode
3094 	 * to disk, because the log record didn't make it to disk!
3095 	 */
3096 	if (XFS_FORCED_SHUTDOWN(mp)) {
3097 		ip->i_update_core = 0;
3098 		if (iip)
3099 			iip->ili_format.ilf_fields = 0;
3100 		xfs_ifunlock(ip);
3101 		return XFS_ERROR(EIO);
3102 	}
3103 
3104 	/*
3105 	 * Get the buffer containing the on-disk inode.
3106 	 */
3107 	error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0);
3108 	if (error != 0) {
3109 		xfs_ifunlock(ip);
3110 		return error;
3111 	}
3112 
3113 	/*
3114 	 * Decide how buffer will be flushed out.  This is done before
3115 	 * the call to xfs_iflush_int because this field is zeroed by it.
3116 	 */
3117 	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3118 		/*
3119 		 * Flush out the inode buffer according to the directions
3120 		 * of the caller.  In the cases where the caller has given
3121 		 * us a choice choose the non-delwri case.  This is because
3122 		 * the inode is in the AIL and we need to get it out soon.
3123 		 */
3124 		switch (flags) {
3125 		case XFS_IFLUSH_SYNC:
3126 		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3127 			flags = 0;
3128 			break;
3129 		case XFS_IFLUSH_ASYNC:
3130 		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3131 			flags = INT_ASYNC;
3132 			break;
3133 		case XFS_IFLUSH_DELWRI:
3134 			flags = INT_DELWRI;
3135 			break;
3136 		default:
3137 			ASSERT(0);
3138 			flags = 0;
3139 			break;
3140 		}
3141 	} else {
3142 		switch (flags) {
3143 		case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3144 		case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3145 		case XFS_IFLUSH_DELWRI:
3146 			flags = INT_DELWRI;
3147 			break;
3148 		case XFS_IFLUSH_ASYNC:
3149 			flags = INT_ASYNC;
3150 			break;
3151 		case XFS_IFLUSH_SYNC:
3152 			flags = 0;
3153 			break;
3154 		default:
3155 			ASSERT(0);
3156 			flags = 0;
3157 			break;
3158 		}
3159 	}
3160 
3161 	/*
3162 	 * First flush out the inode that xfs_iflush was called with.
3163 	 */
3164 	error = xfs_iflush_int(ip, bp);
3165 	if (error) {
3166 		goto corrupt_out;
3167 	}
3168 
3169 	/*
3170 	 * inode clustering:
3171 	 * see if other inodes can be gathered into this write
3172 	 */
3173 
3174 	ip->i_chash->chl_buf = bp;
3175 
3176 	ch = XFS_CHASH(mp, ip->i_blkno);
3177 	s = mutex_spinlock(&ch->ch_lock);
3178 
3179 	clcount = 0;
3180 	for (iq = ip->i_cnext; iq != ip; iq = iq->i_cnext) {
3181 		/*
3182 		 * Do an un-protected check to see if the inode is dirty and
3183 		 * is a candidate for flushing.  These checks will be repeated
3184 		 * later after the appropriate locks are acquired.
3185 		 */
3186 		iip = iq->i_itemp;
3187 		if ((iq->i_update_core == 0) &&
3188 		    ((iip == NULL) ||
3189 		     !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
3190 		      xfs_ipincount(iq) == 0) {
3191 			continue;
3192 		}
3193 
3194 		/*
3195 		 * Try to get locks.  If any are unavailable,
3196 		 * then this inode cannot be flushed and is skipped.
3197 		 */
3198 
3199 		/* get inode locks (just i_lock) */
3200 		if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
3201 			/* get inode flush lock */
3202 			if (xfs_iflock_nowait(iq)) {
3203 				/* check if pinned */
3204 				if (xfs_ipincount(iq) == 0) {
3205 					/* arriving here means that
3206 					 * this inode can be flushed.
3207 					 * first re-check that it's
3208 					 * dirty
3209 					 */
3210 					iip = iq->i_itemp;
3211 					if ((iq->i_update_core != 0)||
3212 					    ((iip != NULL) &&
3213 					     (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3214 						clcount++;
3215 						error = xfs_iflush_int(iq, bp);
3216 						if (error) {
3217 							xfs_iunlock(iq,
3218 								    XFS_ILOCK_SHARED);
3219 							goto cluster_corrupt_out;
3220 						}
3221 					} else {
3222 						xfs_ifunlock(iq);
3223 					}
3224 				} else {
3225 					xfs_ifunlock(iq);
3226 				}
3227 			}
3228 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
3229 		}
3230 	}
3231 	mutex_spinunlock(&ch->ch_lock, s);
3232 
3233 	if (clcount) {
3234 		XFS_STATS_INC(xs_icluster_flushcnt);
3235 		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3236 	}
3237 
3238 	/*
3239 	 * If the buffer is pinned then push on the log so we won't
3240 	 * get stuck waiting in the write for too long.
3241 	 */
3242 	if (XFS_BUF_ISPINNED(bp)){
3243 		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3244 	}
3245 
3246 	if (flags & INT_DELWRI) {
3247 		xfs_bdwrite(mp, bp);
3248 	} else if (flags & INT_ASYNC) {
3249 		xfs_bawrite(mp, bp);
3250 	} else {
3251 		error = xfs_bwrite(mp, bp);
3252 	}
3253 	return error;
3254 
3255 corrupt_out:
3256 	xfs_buf_relse(bp);
3257 	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3258 	xfs_iflush_abort(ip);
3259 	/*
3260 	 * Unlocks the flush lock
3261 	 */
3262 	return XFS_ERROR(EFSCORRUPTED);
3263 
3264 cluster_corrupt_out:
3265 	/* Corruption detected in the clustering loop.  Invalidate the
3266 	 * inode buffer and shut down the filesystem.
3267 	 */
3268 	mutex_spinunlock(&ch->ch_lock, s);
3269 
3270 	/*
3271 	 * Clean up the buffer.  If it was B_DELWRI, just release it --
3272 	 * brelse can handle it with no problems.  If not, shut down the
3273 	 * filesystem before releasing the buffer.
3274 	 */
3275 	if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
3276 		xfs_buf_relse(bp);
3277 	}
3278 
3279 	xfs_force_shutdown(mp, XFS_CORRUPT_INCORE);
3280 
3281 	if(!bufwasdelwri)  {
3282 		/*
3283 		 * Just like incore_relse: if we have b_iodone functions,
3284 		 * mark the buffer as an error and call them.  Otherwise
3285 		 * mark it as stale and brelse.
3286 		 */
3287 		if (XFS_BUF_IODONE_FUNC(bp)) {
3288 			XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3289 			XFS_BUF_UNDONE(bp);
3290 			XFS_BUF_STALE(bp);
3291 			XFS_BUF_SHUT(bp);
3292 			XFS_BUF_ERROR(bp,EIO);
3293 			xfs_biodone(bp);
3294 		} else {
3295 			XFS_BUF_STALE(bp);
3296 			xfs_buf_relse(bp);
3297 		}
3298 	}
3299 
3300 	xfs_iflush_abort(iq);
3301 	/*
3302 	 * Unlocks the flush lock
3303 	 */
3304 	return XFS_ERROR(EFSCORRUPTED);
3305 }
3306 
3307 
3308 STATIC int
3309 xfs_iflush_int(
3310 	xfs_inode_t		*ip,
3311 	xfs_buf_t		*bp)
3312 {
3313 	xfs_inode_log_item_t	*iip;
3314 	xfs_dinode_t		*dip;
3315 	xfs_mount_t		*mp;
3316 #ifdef XFS_TRANS_DEBUG
3317 	int			first;
3318 #endif
3319 	SPLDECL(s);
3320 
3321 	ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS));
3322 	ASSERT(valusema(&ip->i_flock) <= 0);
3323 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3324 	       ip->i_d.di_nextents > ip->i_df.if_ext_max);
3325 
3326 	iip = ip->i_itemp;
3327 	mp = ip->i_mount;
3328 
3329 
3330 	/*
3331 	 * If the inode isn't dirty, then just release the inode
3332 	 * flush lock and do nothing.
3333 	 */
3334 	if ((ip->i_update_core == 0) &&
3335 	    ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3336 		xfs_ifunlock(ip);
3337 		return 0;
3338 	}
3339 
3340 	/* set *dip = inode's place in the buffer */
3341 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
3342 
3343 	/*
3344 	 * Clear i_update_core before copying out the data.
3345 	 * This is for coordination with our timestamp updates
3346 	 * that don't hold the inode lock. They will always
3347 	 * update the timestamps BEFORE setting i_update_core,
3348 	 * so if we clear i_update_core after they set it we
3349 	 * are guaranteed to see their updates to the timestamps.
3350 	 * I believe that this depends on strongly ordered memory
3351 	 * semantics, but we have that.  We use the SYNCHRONIZE
3352 	 * macro to make sure that the compiler does not reorder
3353 	 * the i_update_core access below the data copy below.
3354 	 */
3355 	ip->i_update_core = 0;
3356 	SYNCHRONIZE();
3357 
3358 	if (XFS_TEST_ERROR(INT_GET(dip->di_core.di_magic,ARCH_CONVERT) != XFS_DINODE_MAGIC,
3359 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3360 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3361 		    "xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3362 			ip->i_ino, (int) INT_GET(dip->di_core.di_magic, ARCH_CONVERT), dip);
3363 		goto corrupt_out;
3364 	}
3365 	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3366 				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3367 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3368 			"xfs_iflush: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3369 			ip->i_ino, ip, ip->i_d.di_magic);
3370 		goto corrupt_out;
3371 	}
3372 	if ((ip->i_d.di_mode & S_IFMT) == S_IFREG) {
3373 		if (XFS_TEST_ERROR(
3374 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3375 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3376 		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3377 			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3378 				"xfs_iflush: Bad regular inode %Lu, ptr 0x%p",
3379 				ip->i_ino, ip);
3380 			goto corrupt_out;
3381 		}
3382 	} else if ((ip->i_d.di_mode & S_IFMT) == S_IFDIR) {
3383 		if (XFS_TEST_ERROR(
3384 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3385 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3386 		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3387 		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3388 			xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3389 				"xfs_iflush: Bad directory inode %Lu, ptr 0x%p",
3390 				ip->i_ino, ip);
3391 			goto corrupt_out;
3392 		}
3393 	}
3394 	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3395 				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3396 				XFS_RANDOM_IFLUSH_5)) {
3397 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3398 			"xfs_iflush: detected corrupt incore inode %Lu, total extents = %d, nblocks = %Ld, ptr 0x%p",
3399 			ip->i_ino,
3400 			ip->i_d.di_nextents + ip->i_d.di_anextents,
3401 			ip->i_d.di_nblocks,
3402 			ip);
3403 		goto corrupt_out;
3404 	}
3405 	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3406 				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3407 		xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
3408 			"xfs_iflush: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3409 			ip->i_ino, ip->i_d.di_forkoff, ip);
3410 		goto corrupt_out;
3411 	}
3412 	/*
3413 	 * bump the flush iteration count, used to detect flushes which
3414 	 * postdate a log record during recovery.
3415 	 */
3416 
3417 	ip->i_d.di_flushiter++;
3418 
3419 	/*
3420 	 * Copy the dirty parts of the inode into the on-disk
3421 	 * inode.  We always copy out the core of the inode,
3422 	 * because if the inode is dirty at all the core must
3423 	 * be.
3424 	 */
3425 	xfs_xlate_dinode_core((xfs_caddr_t)&(dip->di_core), &(ip->i_d), -1);
3426 
3427 	/* Wrap, we never let the log put out DI_MAX_FLUSH */
3428 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3429 		ip->i_d.di_flushiter = 0;
3430 
3431 	/*
3432 	 * If this is really an old format inode and the superblock version
3433 	 * has not been updated to support only new format inodes, then
3434 	 * convert back to the old inode format.  If the superblock version
3435 	 * has been updated, then make the conversion permanent.
3436 	 */
3437 	ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
3438 	       XFS_SB_VERSION_HASNLINK(&mp->m_sb));
3439 	if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
3440 		if (!XFS_SB_VERSION_HASNLINK(&mp->m_sb)) {
3441 			/*
3442 			 * Convert it back.
3443 			 */
3444 			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3445 			INT_SET(dip->di_core.di_onlink, ARCH_CONVERT, ip->i_d.di_nlink);
3446 		} else {
3447 			/*
3448 			 * The superblock version has already been bumped,
3449 			 * so just make the conversion to the new inode
3450 			 * format permanent.
3451 			 */
3452 			ip->i_d.di_version = XFS_DINODE_VERSION_2;
3453 			INT_SET(dip->di_core.di_version, ARCH_CONVERT, XFS_DINODE_VERSION_2);
3454 			ip->i_d.di_onlink = 0;
3455 			dip->di_core.di_onlink = 0;
3456 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3457 			memset(&(dip->di_core.di_pad[0]), 0,
3458 			      sizeof(dip->di_core.di_pad));
3459 			ASSERT(ip->i_d.di_projid == 0);
3460 		}
3461 	}
3462 
3463 	if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) {
3464 		goto corrupt_out;
3465 	}
3466 
3467 	if (XFS_IFORK_Q(ip)) {
3468 		/*
3469 		 * The only error from xfs_iflush_fork is on the data fork.
3470 		 */
3471 		(void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3472 	}
3473 	xfs_inobp_check(mp, bp);
3474 
3475 	/*
3476 	 * We've recorded everything logged in the inode, so we'd
3477 	 * like to clear the ilf_fields bits so we don't log and
3478 	 * flush things unnecessarily.  However, we can't stop
3479 	 * logging all this information until the data we've copied
3480 	 * into the disk buffer is written to disk.  If we did we might
3481 	 * overwrite the copy of the inode in the log with all the
3482 	 * data after re-logging only part of it, and in the face of
3483 	 * a crash we wouldn't have all the data we need to recover.
3484 	 *
3485 	 * What we do is move the bits to the ili_last_fields field.
3486 	 * When logging the inode, these bits are moved back to the
3487 	 * ilf_fields field.  In the xfs_iflush_done() routine we
3488 	 * clear ili_last_fields, since we know that the information
3489 	 * those bits represent is permanently on disk.  As long as
3490 	 * the flush completes before the inode is logged again, then
3491 	 * both ilf_fields and ili_last_fields will be cleared.
3492 	 *
3493 	 * We can play with the ilf_fields bits here, because the inode
3494 	 * lock must be held exclusively in order to set bits there
3495 	 * and the flush lock protects the ili_last_fields bits.
3496 	 * Set ili_logged so the flush done
3497 	 * routine can tell whether or not to look in the AIL.
3498 	 * Also, store the current LSN of the inode so that we can tell
3499 	 * whether the item has moved in the AIL from xfs_iflush_done().
3500 	 * In order to read the lsn we need the AIL lock, because
3501 	 * it is a 64 bit value that cannot be read atomically.
3502 	 */
3503 	if (iip != NULL && iip->ili_format.ilf_fields != 0) {
3504 		iip->ili_last_fields = iip->ili_format.ilf_fields;
3505 		iip->ili_format.ilf_fields = 0;
3506 		iip->ili_logged = 1;
3507 
3508 		ASSERT(sizeof(xfs_lsn_t) == 8);	/* don't lock if it shrinks */
3509 		AIL_LOCK(mp,s);
3510 		iip->ili_flush_lsn = iip->ili_item.li_lsn;
3511 		AIL_UNLOCK(mp, s);
3512 
3513 		/*
3514 		 * Attach the function xfs_iflush_done to the inode's
3515 		 * buffer.  This will remove the inode from the AIL
3516 		 * and unlock the inode's flush lock when the inode is
3517 		 * completely written to disk.
3518 		 */
3519 		xfs_buf_attach_iodone(bp, (void(*)(xfs_buf_t*,xfs_log_item_t*))
3520 				      xfs_iflush_done, (xfs_log_item_t *)iip);
3521 
3522 		ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
3523 		ASSERT(XFS_BUF_IODONE_FUNC(bp) != NULL);
3524 	} else {
3525 		/*
3526 		 * We're flushing an inode which is not in the AIL and has
3527 		 * not been logged but has i_update_core set.  For this
3528 		 * case we can use a B_DELWRI flush and immediately drop
3529 		 * the inode flush lock because we can avoid the whole
3530 		 * AIL state thing.  It's OK to drop the flush lock now,
3531 		 * because we've already locked the buffer and to do anything
3532 		 * you really need both.
3533 		 */
3534 		if (iip != NULL) {
3535 			ASSERT(iip->ili_logged == 0);
3536 			ASSERT(iip->ili_last_fields == 0);
3537 			ASSERT((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0);
3538 		}
3539 		xfs_ifunlock(ip);
3540 	}
3541 
3542 	return 0;
3543 
3544 corrupt_out:
3545 	return XFS_ERROR(EFSCORRUPTED);
3546 }
3547 
3548 
3549 /*
3550  * Flush all inactive inodes in mp.
3551  */
3552 void
3553 xfs_iflush_all(
3554 	xfs_mount_t	*mp)
3555 {
3556 	xfs_inode_t	*ip;
3557 	vnode_t		*vp;
3558 
3559  again:
3560 	XFS_MOUNT_ILOCK(mp);
3561 	ip = mp->m_inodes;
3562 	if (ip == NULL)
3563 		goto out;
3564 
3565 	do {
3566 		/* Make sure we skip markers inserted by sync */
3567 		if (ip->i_mount == NULL) {
3568 			ip = ip->i_mnext;
3569 			continue;
3570 		}
3571 
3572 		vp = XFS_ITOV_NULL(ip);
3573 		if (!vp) {
3574 			XFS_MOUNT_IUNLOCK(mp);
3575 			xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
3576 			goto again;
3577 		}
3578 
3579 		ASSERT(vn_count(vp) == 0);
3580 
3581 		ip = ip->i_mnext;
3582 	} while (ip != mp->m_inodes);
3583  out:
3584 	XFS_MOUNT_IUNLOCK(mp);
3585 }
3586 
3587 /*
3588  * xfs_iaccess: check accessibility of inode for mode.
3589  */
3590 int
3591 xfs_iaccess(
3592 	xfs_inode_t	*ip,
3593 	mode_t		mode,
3594 	cred_t		*cr)
3595 {
3596 	int		error;
3597 	mode_t		orgmode = mode;
3598 	struct inode	*inode = LINVFS_GET_IP(XFS_ITOV(ip));
3599 
3600 	if (mode & S_IWUSR) {
3601 		umode_t		imode = inode->i_mode;
3602 
3603 		if (IS_RDONLY(inode) &&
3604 		    (S_ISREG(imode) || S_ISDIR(imode) || S_ISLNK(imode)))
3605 			return XFS_ERROR(EROFS);
3606 
3607 		if (IS_IMMUTABLE(inode))
3608 			return XFS_ERROR(EACCES);
3609 	}
3610 
3611 	/*
3612 	 * If there's an Access Control List it's used instead of
3613 	 * the mode bits.
3614 	 */
3615 	if ((error = _ACL_XFS_IACCESS(ip, mode, cr)) != -1)
3616 		return error ? XFS_ERROR(error) : 0;
3617 
3618 	if (current_fsuid(cr) != ip->i_d.di_uid) {
3619 		mode >>= 3;
3620 		if (!in_group_p((gid_t)ip->i_d.di_gid))
3621 			mode >>= 3;
3622 	}
3623 
3624 	/*
3625 	 * If the DACs are ok we don't need any capability check.
3626 	 */
3627 	if ((ip->i_d.di_mode & mode) == mode)
3628 		return 0;
3629 	/*
3630 	 * Read/write DACs are always overridable.
3631 	 * Executable DACs are overridable if at least one exec bit is set.
3632 	 */
3633 	if (!(orgmode & S_IXUSR) ||
3634 	    (inode->i_mode & S_IXUGO) || S_ISDIR(inode->i_mode))
3635 		if (capable_cred(cr, CAP_DAC_OVERRIDE))
3636 			return 0;
3637 
3638 	if ((orgmode == S_IRUSR) ||
3639 	    (S_ISDIR(inode->i_mode) && (!(orgmode & S_IWUSR)))) {
3640 		if (capable_cred(cr, CAP_DAC_READ_SEARCH))
3641 			return 0;
3642 #ifdef	NOISE
3643 		cmn_err(CE_NOTE, "Ick: mode=%o, orgmode=%o", mode, orgmode);
3644 #endif	/* NOISE */
3645 		return XFS_ERROR(EACCES);
3646 	}
3647 	return XFS_ERROR(EACCES);
3648 }
3649 
/*
 * xfs_iroundup: round up argument to next power of two
 *
 * Zero and exact powers of two are returned unchanged.  Any other
 * value is rounded up to the next higher power of two.  The result
 * does not fit in 32 bits when the top bit of a non-power-of-two
 * argument is set; that case is asserted against, and with
 * assertions compiled out it yields 0.
 */
uint
xfs_iroundup(
	uint	v)
{
	if ((v & (v - 1)) == 0)
		return v;
	ASSERT((v & 0x80000000) == 0);

	/*
	 * Smear the highest set bit into every lower bit position,
	 * giving 2^k - 1, then add one to reach the power of two.
	 */
	v |= v >> 1;
	v |= v >> 2;
	v |= v >> 4;
	v |= v >> 8;
	v |= v >> 16;
	return v + 1;
}
3675 
#ifdef XFS_ILOCK_TRACE
ktrace_t	*xfs_ilock_trace_buf;

/*
 * Add one entry to the inode's lock trace buffer.  The entry holds
 * the inode, the lock operation, the lock flags, the caller's
 * address and the current cpu and pid; the remaining slots are NULL.
 */
void
xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
{
	/* 1 = LOCK, 3=UNLOCK, etc */
	void	*lock_op = (void *)(unsigned long)lock;
	/* XFS_ILOCK_EXCL etc */
	void	*lock_mode = (void *)(unsigned long)lockflags;
	void	*cpu = (void *)(unsigned long)current_cpu();
	void	*pid = (void *)(unsigned long)current_pid();

	ktrace_enter(ip->i_lock_trace,
		     (void *)ip,
		     lock_op,
		     lock_mode,
		     (void *)ra,		/* caller of ilock */
		     cpu,
		     pid,
		     NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL);
}
#endif
3692