xref: /openbmc/linux/fs/xfs/xfs_inode.c (revision 483eb062)
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include <linux/log2.h>

#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_inum.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_da_format.h"
#include "xfs_da_btree.h"
#include "xfs_dir2.h"
#include "xfs_attr_sf.h"
#include "xfs_attr.h"
#include "xfs_trans_space.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_bmap_util.h"
#include "xfs_error.h"
#include "xfs_quota.h"
#include "xfs_dinode.h"
#include "xfs_filestream.h"
#include "xfs_cksum.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_symlink.h"
#include "xfs_trans_priv.h"
#include "xfs_log.h"
#include "xfs_bmap_btree.h"

kmem_zone_t *xfs_inode_zone;

/*
 * Used in xfs_itruncate_extents().  This is the maximum number of extents
 * freed from a file in a single transaction.
 */
#define	XFS_ITRUNC_MAX_EXTENTS	2

STATIC int xfs_iflush_int(xfs_inode_t *, xfs_buf_t *);

/*
 * helper function to extract extent size hint from inode
 */
xfs_extlen_t
xfs_get_extsz_hint(
	struct xfs_inode	*ip)
{
	if ((ip->i_d.di_flags & XFS_DIFLAG_EXTSIZE) && ip->i_d.di_extsize)
		return ip->i_d.di_extsize;
	if (XFS_IS_REALTIME_INODE(ip))
		return ip->i_mount->m_sb.sb_rextsize;
	return 0;
}

/*
 * These two are wrapper routines around the xfs_ilock() routine used to
 * centralize some grungy code.  They are used in places that wish to lock the
 * inode solely for reading the extents.  The reason these places can't just
 * call xfs_ilock(ip, XFS_ILOCK_SHARED) is that the inode lock also guards the
 * reading in of the extents from disk for a file in b-tree format.  If the
 * inode is in b-tree format, then we need to lock the inode exclusively until
 * the extents are read in.  Locking it exclusively all the time would limit
 * our parallelism unnecessarily, though.  What we do instead is check to see
 * if the extents have been read in yet, and only lock the inode exclusively
 * if they have not.
 *
 * The functions return a value which should be given to the corresponding
 * xfs_iunlock() call.
 */
uint
xfs_ilock_data_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE &&
	    (ip->i_df.if_flags & XFS_IFEXTENTS) == 0)
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}

uint
xfs_ilock_attr_map_shared(
	struct xfs_inode	*ip)
{
	uint			lock_mode = XFS_ILOCK_SHARED;

	if (ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE &&
	    (ip->i_afp->if_flags & XFS_IFEXTENTS) == 0)
		lock_mode = XFS_ILOCK_EXCL;
	xfs_ilock(ip, lock_mode);
	return lock_mode;
}
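
/*
 * Illustrative sketch (not part of the original source): a typical caller
 * pattern for the wrappers above.  The returned lock_mode must be fed back
 * to xfs_iunlock() because the wrapper may have taken the lock exclusively.
 * walk_data_fork_extents() is a hypothetical helper.
 *
 *	uint	lock_mode;
 *
 *	lock_mode = xfs_ilock_data_map_shared(ip);
 *	error = walk_data_fork_extents(ip);
 *	xfs_iunlock(ip, lock_mode);
 */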

/*
 * The xfs inode contains 2 locks: a multi-reader lock called the
 * i_iolock and a multi-reader lock called the i_lock.  This routine
 * allows either or both of the locks to be obtained.
 *
 * The 2 locks should always be ordered so that the IO lock is
 * obtained first in order to prevent deadlock.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks
 *       to be locked.  It can be:
 *		XFS_IOLOCK_SHARED,
 *		XFS_IOLOCK_EXCL,
 *		XFS_ILOCK_SHARED,
 *		XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
 *		XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
 */
void
xfs_ilock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));

	if (lock_flags & XFS_ILOCK_EXCL)
		mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
	else if (lock_flags & XFS_ILOCK_SHARED)
		mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
}
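
/*
 * Illustrative sketch (not from the original source): taking both locks for
 * an operation that changes file size, then dropping them with the same
 * flags.  The IO lock is always acquired before the inode lock.
 *
 *	xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 *	// ... modify ip->i_d.di_size under the locks ...
 *	xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
 */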

/*
 * This is just like xfs_ilock(), except that the caller
 * is guaranteed not to sleep.  It returns 1 if it gets
 * the requested locks and 0 otherwise.  If the IO lock is
 * obtained but the inode lock cannot be, then the IO lock
 * is dropped before returning.
 *
 * ip -- the inode being locked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       locked.  See the comment for xfs_ilock() for a list
 *	 of valid values.
 */
int
xfs_ilock_nowait(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);

	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);

	if (lock_flags & XFS_IOLOCK_EXCL) {
		if (!mrtryupdate(&ip->i_iolock))
			goto out;
	} else if (lock_flags & XFS_IOLOCK_SHARED) {
		if (!mrtryaccess(&ip->i_iolock))
			goto out;
	}
	if (lock_flags & XFS_ILOCK_EXCL) {
		if (!mrtryupdate(&ip->i_lock))
			goto out_undo_iolock;
	} else if (lock_flags & XFS_ILOCK_SHARED) {
		if (!mrtryaccess(&ip->i_lock))
			goto out_undo_iolock;
	}
	return 1;

 out_undo_iolock:
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);
 out:
	return 0;
}
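
/*
 * Illustrative sketch (not from the original source): the usual trylock
 * pattern built on xfs_ilock_nowait(), backing off instead of sleeping when
 * the inode is contended.
 *
 *	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
 *		return EAGAIN;	// caller retries later
 *	// ... work on the inode ...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 */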

/*
 * xfs_iunlock() is used to drop the inode locks acquired with
 * xfs_ilock() and xfs_ilock_nowait().  The caller must pass
 * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
 * that we know which locks to drop.
 *
 * ip -- the inode being unlocked
 * lock_flags -- this parameter indicates the inode's locks to be
 *       unlocked.  See the comment for xfs_ilock() for a list
 *	 of valid values for this parameter.
 *
 */
void
xfs_iunlock(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	/*
	 * You can't set both SHARED and EXCL for the same lock,
	 * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
	 * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
	       (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
	ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
	       (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
	ASSERT(lock_flags != 0);

	if (lock_flags & XFS_IOLOCK_EXCL)
		mrunlock_excl(&ip->i_iolock);
	else if (lock_flags & XFS_IOLOCK_SHARED)
		mrunlock_shared(&ip->i_iolock);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrunlock_excl(&ip->i_lock);
	else if (lock_flags & XFS_ILOCK_SHARED)
		mrunlock_shared(&ip->i_lock);

	trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
}

/*
 * Give up write locks.  The I/O lock cannot be held nested
 * if it is being demoted.
 */
void
xfs_ilock_demote(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
	ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);

	if (lock_flags & XFS_ILOCK_EXCL)
		mrdemote(&ip->i_lock);
	if (lock_flags & XFS_IOLOCK_EXCL)
		mrdemote(&ip->i_iolock);

	trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
}
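
/*
 * Illustrative sketch (not from the original source): demoting an exclusive
 * hold to shared once the modification is done, so readers can proceed
 * while the caller keeps the lock.  The demoted lock is dropped as SHARED.
 *
 *	xfs_ilock(ip, XFS_ILOCK_EXCL);
 *	// ... exclusive-phase work ...
 *	xfs_ilock_demote(ip, XFS_ILOCK_EXCL);
 *	// ... read-only phase, concurrent with other readers ...
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 */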

#if defined(DEBUG) || defined(XFS_WARN)
int
xfs_isilocked(
	xfs_inode_t		*ip,
	uint			lock_flags)
{
	if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
		if (!(lock_flags & XFS_ILOCK_SHARED))
			return !!ip->i_lock.mr_writer;
		return rwsem_is_locked(&ip->i_lock.mr_lock);
	}

	if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
		if (!(lock_flags & XFS_IOLOCK_SHARED))
			return !!ip->i_iolock.mr_writer;
		return rwsem_is_locked(&ip->i_iolock.mr_lock);
	}

	ASSERT(0);
	return 0;
}
#endif

#ifdef DEBUG
int xfs_locked_n;
int xfs_small_retries;
int xfs_middle_retries;
int xfs_lots_retries;
int xfs_lock_delays;
#endif

/*
 * Bump the subclass so xfs_lock_inodes() acquires each lock with
 * a different value.
 */
static inline int
xfs_lock_inumorder(int lock_mode, int subclass)
{
	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_IOLOCK_SHIFT;
	if (lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))
		lock_mode |= (subclass + XFS_LOCK_INUMORDER) << XFS_ILOCK_SHIFT;

	return lock_mode;
}
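
/*
 * Illustrative note (not from the original source): the subclass ends up in
 * the lockdep dependency bits of the flags (masked by XFS_LOCK_DEP_MASK
 * above), so each inode in a multi-inode lock sequence carries its own
 * annotation and lockdep does not flag the ordered acquisitions.
 *
 *	xfs_ilock(ips[0], xfs_lock_inumorder(XFS_ILOCK_EXCL, 0));
 *	xfs_ilock(ips[1], xfs_lock_inumorder(XFS_ILOCK_EXCL, 1));
 */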

/*
 * The following routine will lock n inodes in exclusive mode.
 * We assume the caller calls us with the inodes in i_ino order.
 *
 * We need to detect deadlock where an inode that we lock
 * is in the AIL and we start waiting for another inode that is locked
 * by a thread in a long running transaction (such as truncate). This can
 * result in deadlock since the long running trans might need to wait
 * for the inode we just locked in order to push the tail and free space
 * in the log.
 */
void
xfs_lock_inodes(
	xfs_inode_t	**ips,
	int		inodes,
	uint		lock_mode)
{
	int		attempts = 0, i, j, try_lock;
	xfs_log_item_t	*lp;

	ASSERT(ips && (inodes >= 2)); /* we need at least two */

	try_lock = 0;
	i = 0;

again:
	for (; i < inodes; i++) {
		ASSERT(ips[i]);

		if (i && (ips[i] == ips[i-1]))	/* Already locked */
			continue;

		/*
		 * If try_lock is not set yet, make sure all locked inodes
		 * are not in the AIL.
		 * If any are, set try_lock to be used later.
		 */

		if (!try_lock) {
			for (j = (i - 1); j >= 0 && !try_lock; j--) {
				lp = (xfs_log_item_t *)ips[j]->i_itemp;
				if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
					try_lock++;
				}
			}
		}

		/*
		 * If any of the previous locks we have locked is in the AIL,
		 * we must TRY to get the second and subsequent locks. If
		 * we can't get any, we must release all we have
		 * and try again.
		 */

		if (try_lock) {
			/*
			 * try_lock means we have an inode locked that is
			 * in the AIL, so try_lock must be 0 if i is 0.
			 */
			ASSERT(i != 0);
			if (!xfs_ilock_nowait(ips[i], xfs_lock_inumorder(lock_mode, i))) {
				attempts++;

				/*
				 * Unlock all previous guys and try again.
				 * xfs_iunlock will try to push the tail
				 * if the inode is in the AIL.
				 */

				for (j = i - 1; j >= 0; j--) {

					/*
					 * Check to see if we've already
					 * unlocked this one.
					 * Not the first one going back,
					 * and the inode ptr is the same.
					 */
					if ((j != (i - 1)) && ips[j] == ips[j+1])
						continue;

					xfs_iunlock(ips[j], lock_mode);
				}

				if ((attempts % 5) == 0) {
					delay(1); /* Don't just spin the CPU */
#ifdef DEBUG
					xfs_lock_delays++;
#endif
				}
				i = 0;
				try_lock = 0;
				goto again;
			}
		} else {
			xfs_ilock(ips[i], xfs_lock_inumorder(lock_mode, i));
		}
	}

#ifdef DEBUG
	if (attempts) {
		if (attempts < 5) xfs_small_retries++;
		else if (attempts < 100) xfs_middle_retries++;
		else xfs_lots_retries++;
	} else {
		xfs_locked_n++;
	}
#endif
}
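
/*
 * Illustrative sketch (not from the original source): a caller locking a
 * batch of inodes.  The array must already be sorted by i_ino, as the
 * comment above requires; "ips" and "n" are hypothetical.  Each distinct
 * inode is later dropped with xfs_iunlock() using the same lock_mode.
 *
 *	// ips[0..n-1] sorted in ascending i_ino order, n >= 2
 *	xfs_lock_inodes(ips, n, XFS_ILOCK_EXCL);
 */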

/*
 * xfs_lock_two_inodes() can only be used to lock one type of lock
 * at a time - the iolock or the ilock, but not both at once. If
 * we lock both at once, lockdep will report false positives saying
 * we have violated locking orders.
 */
void
xfs_lock_two_inodes(
	xfs_inode_t		*ip0,
	xfs_inode_t		*ip1,
	uint			lock_mode)
{
	xfs_inode_t		*temp;
	int			attempts = 0;
	xfs_log_item_t		*lp;

	if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))
		ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0);
	ASSERT(ip0->i_ino != ip1->i_ino);

	if (ip0->i_ino > ip1->i_ino) {
		temp = ip0;
		ip0 = ip1;
		ip1 = temp;
	}

 again:
	xfs_ilock(ip0, xfs_lock_inumorder(lock_mode, 0));

	/*
	 * If the first lock we have locked is in the AIL, we must TRY to get
	 * the second lock. If we can't get it, we must release the first one
	 * and try again.
	 */
	lp = (xfs_log_item_t *)ip0->i_itemp;
	if (lp && (lp->li_flags & XFS_LI_IN_AIL)) {
		if (!xfs_ilock_nowait(ip1, xfs_lock_inumorder(lock_mode, 1))) {
			xfs_iunlock(ip0, lock_mode);
			if ((++attempts % 5) == 0)
				delay(1); /* Don't just spin the CPU */
			goto again;
		}
	} else {
		xfs_ilock(ip1, xfs_lock_inumorder(lock_mode, 1));
	}
}
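
/*
 * Illustrative sketch (not from the original source): xfs_lock_two_inodes()
 * orders the pair by inode number itself, so callers such as the link path
 * below can simply pass both inodes in either order:
 *
 *	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);
 *	// ... both inodes now held exclusively, in i_ino order ...
 */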


void
__xfs_iflock(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
		if (xfs_isiflocked(ip))
			io_schedule();
	} while (!xfs_iflock_nowait(ip));

	finish_wait(wq, &wait.wait);
}

STATIC uint
_xfs_dic2xflags(
	__uint16_t		di_flags)
{
	uint			flags = 0;

	if (di_flags & XFS_DIFLAG_ANY) {
		if (di_flags & XFS_DIFLAG_REALTIME)
			flags |= XFS_XFLAG_REALTIME;
		if (di_flags & XFS_DIFLAG_PREALLOC)
			flags |= XFS_XFLAG_PREALLOC;
		if (di_flags & XFS_DIFLAG_IMMUTABLE)
			flags |= XFS_XFLAG_IMMUTABLE;
		if (di_flags & XFS_DIFLAG_APPEND)
			flags |= XFS_XFLAG_APPEND;
		if (di_flags & XFS_DIFLAG_SYNC)
			flags |= XFS_XFLAG_SYNC;
		if (di_flags & XFS_DIFLAG_NOATIME)
			flags |= XFS_XFLAG_NOATIME;
		if (di_flags & XFS_DIFLAG_NODUMP)
			flags |= XFS_XFLAG_NODUMP;
		if (di_flags & XFS_DIFLAG_RTINHERIT)
			flags |= XFS_XFLAG_RTINHERIT;
		if (di_flags & XFS_DIFLAG_PROJINHERIT)
			flags |= XFS_XFLAG_PROJINHERIT;
		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
			flags |= XFS_XFLAG_NOSYMLINKS;
		if (di_flags & XFS_DIFLAG_EXTSIZE)
			flags |= XFS_XFLAG_EXTSIZE;
		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
			flags |= XFS_XFLAG_EXTSZINHERIT;
		if (di_flags & XFS_DIFLAG_NODEFRAG)
			flags |= XFS_XFLAG_NODEFRAG;
		if (di_flags & XFS_DIFLAG_FILESTREAM)
			flags |= XFS_XFLAG_FILESTREAM;
	}

	return flags;
}

uint
xfs_ip2xflags(
	xfs_inode_t		*ip)
{
	xfs_icdinode_t		*dic = &ip->i_d;

	return _xfs_dic2xflags(dic->di_flags) |
				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
}

uint
xfs_dic2xflags(
	xfs_dinode_t		*dip)
{
	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}

/*
 * Looks up an inode from "name".  If ci_name is not NULL, then a CI match
 * is allowed, otherwise it has to be an exact match.  If a CI match is found,
 * ci_name->name will point to the actual name (caller must free) or
 * will be set to NULL if an exact match is found.
 */
int
xfs_lookup(
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	xfs_inode_t		**ipp,
	struct xfs_name		*ci_name)
{
	xfs_ino_t		inum;
	int			error;
	uint			lock_mode;

	trace_xfs_lookup(dp, name);

	if (XFS_FORCED_SHUTDOWN(dp->i_mount))
		return XFS_ERROR(EIO);

	lock_mode = xfs_ilock_data_map_shared(dp);
	error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name);
	xfs_iunlock(dp, lock_mode);

	if (error)
		goto out;

	error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp);
	if (error)
		goto out_free_name;

	return 0;

out_free_name:
	if (ci_name)
		kmem_free(ci_name->name);
out:
	*ipp = NULL;
	return error;
}
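
/*
 * Illustrative sketch (not from the original source): a case-sensitive
 * lookup of one directory entry; passing NULL for ci_name demands an exact
 * match.  "name" is a hypothetical, already initialized struct xfs_name.
 *
 *	struct xfs_inode *ip;
 *
 *	error = xfs_lookup(dp, &name, &ip, NULL);
 *	if (!error)
 *		IRELE(ip);	// drop the reference xfs_iget() took
 */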

/*
 * Allocate an inode on disk and return a copy of its in-core version.
 * The in-core inode is locked exclusively.  Set mode, nlink, and rdev
 * appropriately within the inode.  The uid and gid for the inode are
 * set according to the contents of the given cred structure.
 *
 * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
 * has a free inode available, call xfs_iget() to obtain the in-core
 * version of the allocated inode.  Finally, fill in the inode and
 * log its initial contents.  In this case, ialloc_context would be
 * set to NULL.
 *
 * If xfs_dialloc() does not have an available inode, it will replenish
 * its supply by doing an allocation. Since we can only do one
 * allocation within a transaction without deadlocks, we must commit
 * the current transaction before returning the inode itself.
 * In this case, therefore, we will set ialloc_context and return.
 * The caller should then commit the current transaction, start a new
 * transaction, and call xfs_ialloc() again to actually get the inode.
 *
 * To ensure that some other process does not grab the inode that
 * was allocated during the first call to xfs_ialloc(), this routine
 * also returns the [locked] bp pointing to the head of the freelist
 * as ialloc_context.  The caller should hold this buffer across
 * the commit and pass it back into this routine on the second call.
 *
 * If we are allocating quota inodes, we do not have a parent inode
 * to attach to or associate with (i.e. pip == NULL) because they
 * are not linked into the directory structure - they are attached
 * directly to the superblock - and so have no parent.
 */
int
xfs_ialloc(
	xfs_trans_t	*tp,
	xfs_inode_t	*pip,
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,
	int		okalloc,
	xfs_buf_t	**ialloc_context,
	xfs_inode_t	**ipp)
{
	struct xfs_mount *mp = tp->t_mountp;
	xfs_ino_t	ino;
	xfs_inode_t	*ip;
	uint		flags;
	int		error;
	timespec_t	tv;
	int		filestreams = 0;

	/*
	 * Call the space management code to pick
	 * the on-disk inode to be allocated.
	 */
	error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
			    ialloc_context, &ino);
	if (error)
		return error;
	if (*ialloc_context || ino == NULLFSINO) {
		*ipp = NULL;
		return 0;
	}
	ASSERT(*ialloc_context == NULL);

	/*
	 * Get the in-core inode with the lock held exclusively.
	 * This is because we're setting fields here we need
	 * to prevent others from looking at until we're done.
	 */
	error = xfs_iget(mp, tp, ino, XFS_IGET_CREATE,
			 XFS_ILOCK_EXCL, &ip);
	if (error)
		return error;
	ASSERT(ip != NULL);

	ip->i_d.di_mode = mode;
	ip->i_d.di_onlink = 0;
	ip->i_d.di_nlink = nlink;
	ASSERT(ip->i_d.di_nlink == nlink);
	ip->i_d.di_uid = xfs_kuid_to_uid(current_fsuid());
	ip->i_d.di_gid = xfs_kgid_to_gid(current_fsgid());
	xfs_set_projid(ip, prid);
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));

	/*
	 * If the superblock version is up to where we support new format
	 * inodes and this is currently an old format inode, then change
	 * the inode version number now.  This way we only do the conversion
	 * here rather than here and in the flush/logging code.
	 */
	if (xfs_sb_version_hasnlink(&mp->m_sb) &&
	    ip->i_d.di_version == 1) {
		ip->i_d.di_version = 2;
		/*
		 * We've already zeroed the old link count, the projid field,
		 * and the pad field.
		 */
	}

	/*
	 * Project ids won't be stored on disk if we are using a version 1 inode.
	 */
	if ((prid != 0) && (ip->i_d.di_version == 1))
		xfs_bump_ino_vers2(tp, ip);

	if (pip && XFS_INHERIT_GID(pip)) {
		ip->i_d.di_gid = pip->i_d.di_gid;
		if ((pip->i_d.di_mode & S_ISGID) && S_ISDIR(mode)) {
			ip->i_d.di_mode |= S_ISGID;
		}
	}

	/*
	 * If the group ID of the new file does not match the effective group
	 * ID or one of the supplementary group IDs, the S_ISGID bit is cleared
	 * (and only if the irix_sgid_inherit compatibility variable is set).
	 */
	if ((irix_sgid_inherit) &&
	    (ip->i_d.di_mode & S_ISGID) &&
	    (!in_group_p(xfs_gid_to_kgid(ip->i_d.di_gid)))) {
		ip->i_d.di_mode &= ~S_ISGID;
	}

	ip->i_d.di_size = 0;
	ip->i_d.di_nextents = 0;
	ASSERT(ip->i_d.di_nblocks == 0);

	nanotime(&tv);
	ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec;
	ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec;
	ip->i_d.di_atime = ip->i_d.di_mtime;
	ip->i_d.di_ctime = ip->i_d.di_mtime;

	/*
	 * di_gen will have been taken care of in xfs_iread.
	 */
	ip->i_d.di_extsize = 0;
	ip->i_d.di_dmevmask = 0;
	ip->i_d.di_dmstate = 0;
	ip->i_d.di_flags = 0;

	if (ip->i_d.di_version == 3) {
		ASSERT(ip->i_d.di_ino == ino);
		ASSERT(uuid_equal(&ip->i_d.di_uuid, &mp->m_sb.sb_uuid));
		ip->i_d.di_crc = 0;
		ip->i_d.di_changecount = 1;
		ip->i_d.di_lsn = 0;
		ip->i_d.di_flags2 = 0;
		memset(&(ip->i_d.di_pad2[0]), 0, sizeof(ip->i_d.di_pad2));
		ip->i_d.di_crtime = ip->i_d.di_mtime;
	}

	flags = XFS_ILOG_CORE;
	switch (mode & S_IFMT) {
	case S_IFIFO:
	case S_IFCHR:
	case S_IFBLK:
	case S_IFSOCK:
		ip->i_d.di_format = XFS_DINODE_FMT_DEV;
		ip->i_df.if_u2.if_rdev = rdev;
		ip->i_df.if_flags = 0;
		flags |= XFS_ILOG_DEV;
		break;
	case S_IFREG:
		/*
		 * we can't set up filestreams until after the VFS inode
		 * is set up properly.
		 */
		if (pip && xfs_inode_is_filestream(pip))
			filestreams = 1;
		/* fall through */
	case S_IFDIR:
		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
			uint	di_flags = 0;

			if (S_ISDIR(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_RTINHERIT;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSZINHERIT;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			} else if (S_ISREG(mode)) {
				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
					di_flags |= XFS_DIFLAG_REALTIME;
				if (pip->i_d.di_flags & XFS_DIFLAG_EXTSZINHERIT) {
					di_flags |= XFS_DIFLAG_EXTSIZE;
					ip->i_d.di_extsize = pip->i_d.di_extsize;
				}
			}
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOATIME) &&
			    xfs_inherit_noatime)
				di_flags |= XFS_DIFLAG_NOATIME;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODUMP) &&
			    xfs_inherit_nodump)
				di_flags |= XFS_DIFLAG_NODUMP;
			if ((pip->i_d.di_flags & XFS_DIFLAG_SYNC) &&
			    xfs_inherit_sync)
				di_flags |= XFS_DIFLAG_SYNC;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) &&
			    xfs_inherit_nosymlinks)
				di_flags |= XFS_DIFLAG_NOSYMLINKS;
			if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
				di_flags |= XFS_DIFLAG_PROJINHERIT;
			if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) &&
			    xfs_inherit_nodefrag)
				di_flags |= XFS_DIFLAG_NODEFRAG;
			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
				di_flags |= XFS_DIFLAG_FILESTREAM;
			ip->i_d.di_flags |= di_flags;
		}
		/* FALLTHROUGH */
	case S_IFLNK:
		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
		ip->i_df.if_flags = XFS_IFEXTENTS;
		ip->i_df.if_bytes = ip->i_df.if_real_bytes = 0;
		ip->i_df.if_u1.if_extents = NULL;
		break;
	default:
		ASSERT(0);
	}
	/*
	 * Attribute fork settings for new inode.
	 */
	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
	ip->i_d.di_anextents = 0;

	/*
	 * Log the new values stuffed into the inode.
	 */
	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
	xfs_trans_log_inode(tp, ip, flags);

	/* now that we have an i_mode we can setup inode ops and unlock */
	xfs_setup_inode(ip);

	/* now we have set up the vfs inode we can associate the filestream */
	if (filestreams) {
		error = xfs_filestream_associate(pip, ip);
		if (error < 0)
			return -error;
		if (!error)
			xfs_iflags_set(ip, XFS_IFILESTREAM);
	}

	*ipp = ip;
	return 0;
}
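
/*
 * Illustrative sketch (not from the original source) of the two-phase
 * calling convention described above; xfs_dir_ialloc() below is the real
 * in-tree implementation of this loop.
 *
 *	error = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
 *			   &ialloc_context, &ip);
 *	if (!error && ialloc_context) {
 *		// hold the AGI buffer, commit tp, reserve a new
 *		// transaction, then call xfs_ialloc() a second time
 *	}
 */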

/*
 * Allocates a new inode from disk and returns a pointer to the
 * incore copy.  This routine will internally commit the current
 * transaction and allocate a new one if the Space Manager needed
 * to do an allocation to replenish the inode free-list.
 *
 * This routine is designed to be called from xfs_create and
 * xfs_create_dir.
 */
int
xfs_dir_ialloc(
	xfs_trans_t	**tpp,		/* input: current transaction;
					   output: may be a new transaction. */
	xfs_inode_t	*dp,		/* directory within which to allocate
					   the inode. */
	umode_t		mode,
	xfs_nlink_t	nlink,
	xfs_dev_t	rdev,
	prid_t		prid,		/* project id */
	int		okalloc,	/* ok to allocate new space */
	xfs_inode_t	**ipp,		/* pointer to inode; it will be
					   locked. */
	int		*committed)
{
	xfs_trans_t	*tp;
	xfs_trans_t	*ntp;
	xfs_inode_t	*ip;
	xfs_buf_t	*ialloc_context = NULL;
	int		code;
	void		*dqinfo;
	uint		tflags;

	tp = *tpp;
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);

	/*
	 * xfs_ialloc will return a pointer to an incore inode if
	 * the Space Manager has an available inode on the free
	 * list. Otherwise, it will do an allocation and replenish
	 * the freelist.  Since we can only do one allocation per
	 * transaction without deadlocks, we will need to commit the
	 * current transaction and start a new one.  We will then
	 * need to call xfs_ialloc again to get the inode.
	 *
	 * If xfs_ialloc did an allocation to replenish the freelist,
	 * it returns the bp containing the head of the freelist as
	 * ialloc_context. We will hold a lock on it across the
	 * transaction commit so that no other process can steal
	 * the inode(s) that we've just allocated.
	 */
	code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid, okalloc,
			  &ialloc_context, &ip);

	/*
	 * Return an error if we were unable to allocate a new inode.
	 * This should only happen if we run out of space on disk or
	 * encounter a disk error.
	 */
	if (code) {
		*ipp = NULL;
		return code;
	}
	if (!ialloc_context && !ip) {
		*ipp = NULL;
		return XFS_ERROR(ENOSPC);
	}

	/*
	 * If the AGI buffer is non-NULL, then we were unable to get an
	 * inode in one operation.  We need to commit the current
	 * transaction and call xfs_ialloc() again.  It is guaranteed
	 * to succeed the second time.
	 */
	if (ialloc_context) {
		struct xfs_trans_res tres;

		/*
		 * Normally, xfs_trans_commit releases all the locks.
		 * We call bhold to hang on to the ialloc_context across
		 * the commit.  Holding this buffer prevents any other
		 * processes from doing any allocations in this
		 * allocation group.
		 */
		xfs_trans_bhold(tp, ialloc_context);
		/*
		 * Save the log reservation so we can use
		 * them in the next transaction.
		 */
		tres.tr_logres = xfs_trans_get_log_res(tp);
		tres.tr_logcount = xfs_trans_get_log_count(tp);

		/*
		 * We want the quota changes to be associated with the next
		 * transaction, NOT this one. So, detach the dqinfo from this
		 * and attach it to the next transaction.
		 */
		dqinfo = NULL;
		tflags = 0;
		if (tp->t_dqinfo) {
			dqinfo = (void *)tp->t_dqinfo;
			tp->t_dqinfo = NULL;
			tflags = tp->t_flags & XFS_TRANS_DQ_DIRTY;
			tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY);
		}

		ntp = xfs_trans_dup(tp);
		code = xfs_trans_commit(tp, 0);
		tp = ntp;
		if (committed != NULL) {
			*committed = 1;
		}
		/*
		 * If we get an error during the commit processing,
		 * release the buffer that is still held and return
		 * to the caller.
		 */
		if (code) {
			xfs_buf_relse(ialloc_context);
			if (dqinfo) {
				tp->t_dqinfo = dqinfo;
				xfs_trans_free_dqinfo(tp);
			}
			*tpp = ntp;
			*ipp = NULL;
			return code;
		}

		/*
		 * transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
		code = xfs_trans_reserve(tp, &tres, 0, 0);

		/*
		 * Re-attach the quota info that we detached from prev trx.
		 */
		if (dqinfo) {
			tp->t_dqinfo = dqinfo;
			tp->t_flags |= tflags;
		}

		if (code) {
			xfs_buf_relse(ialloc_context);
			*tpp = ntp;
			*ipp = NULL;
			return code;
		}
		xfs_trans_bjoin(tp, ialloc_context);

		/*
		 * Call ialloc again. Since we've locked out all
		 * other allocations in this allocation group,
		 * this call should always succeed.
		 */
		code = xfs_ialloc(tp, dp, mode, nlink, rdev, prid,
				  okalloc, &ialloc_context, &ip);

		/*
		 * If we get an error at this point, return to the caller
		 * so that the current transaction can be aborted.
		 */
		if (code) {
			*tpp = tp;
			*ipp = NULL;
			return code;
		}
		ASSERT(!ialloc_context && ip);

	} else {
		if (committed != NULL)
			*committed = 0;
	}

	*ipp = ip;
	*tpp = tp;

	return 0;
}

/*
 * Decrement the link count on an inode & log the change.
 * If this causes the link count to go to zero, initiate the
 * logging activity required to truncate a file.
 */
int				/* error */
xfs_droplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	int	error;

	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	ASSERT(ip->i_d.di_nlink > 0);
	ip->i_d.di_nlink--;
	drop_nlink(VFS_I(ip));
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	error = 0;
	if (ip->i_d.di_nlink == 0) {
		/*
		 * We're dropping the last link to this file.
		 * Move the on-disk inode to the AGI unlinked list.
		 * From xfs_inactive() we will pull the inode from
		 * the list and free it.
		 */
		error = xfs_iunlink(tp, ip);
	}
	return error;
}

/*
 * This gets called when the inode's version needs to be changed from 1 to 2.
 * Currently this happens when the nlink field overflows the old 16-bit value
 * or when chproj is called to change the project for the first time.
 * As a side effect the superblock version will also get rev'd
 * to contain the NLINK bit.
 */
void
xfs_bump_ino_vers2(
	xfs_trans_t	*tp,
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(ip->i_d.di_version == 1);

	ip->i_d.di_version = 2;
	ip->i_d.di_onlink = 0;
	memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
	mp = tp->t_mountp;
	if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
		spin_lock(&mp->m_sb_lock);
		if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
			xfs_sb_version_addnlink(&mp->m_sb);
			spin_unlock(&mp->m_sb_lock);
			xfs_mod_sb(tp, XFS_SB_VERSIONNUM);
		} else {
			spin_unlock(&mp->m_sb_lock);
		}
	}
	/* Caller must log the inode */
}

/*
 * Increment the link count on an inode & log the change.
 */
int
xfs_bumplink(
	xfs_trans_t *tp,
	xfs_inode_t *ip)
{
	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);

	ASSERT(ip->i_d.di_nlink > 0);
	ip->i_d.di_nlink++;
	inc_nlink(VFS_I(ip));
	if ((ip->i_d.di_version == 1) &&
	    (ip->i_d.di_nlink > XFS_MAXLINK_1)) {
		/*
		 * The inode has increased its number of links beyond
		 * what can fit in an old format inode.  It now needs
		 * to be converted to a version 2 inode with a 32 bit
		 * link count.  If this is the first inode in the file
		 * system to do this, then we need to bump the superblock
		 * version number as well.
		 */
		xfs_bump_ino_vers2(tp, ip);
	}

	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
	return 0;
}

int
xfs_create(
	xfs_inode_t		*dp,
	struct xfs_name		*name,
	umode_t			mode,
	xfs_dev_t		rdev,
	xfs_inode_t		**ipp)
{
	int			is_dir = S_ISDIR(mode);
	struct xfs_mount	*mp = dp->i_mount;
	struct xfs_inode	*ip = NULL;
	struct xfs_trans	*tp = NULL;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	bool			unlock_dp_on_error = false;
	uint			cancel_flags;
	int			committed;
	prid_t			prid;
	struct xfs_dquot	*udqp = NULL;
	struct xfs_dquot	*gdqp = NULL;
	struct xfs_dquot	*pdqp = NULL;
	struct xfs_trans_res	tres;
	uint			resblks;

	trace_xfs_create(dp, name);

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	if (dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)
		prid = xfs_get_projid(dp);
	else
		prid = XFS_PROJID_DEFAULT;

	/*
	 * Make sure that we have allocated dquot(s) on disk.
	 */
	error = xfs_qm_vop_dqalloc(dp, xfs_kuid_to_uid(current_fsuid()),
					xfs_kgid_to_gid(current_fsgid()), prid,
					XFS_QMOPT_QUOTALL | XFS_QMOPT_INHERIT,
					&udqp, &gdqp, &pdqp);
	if (error)
		return error;

	if (is_dir) {
		rdev = 0;
		resblks = XFS_MKDIR_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_mkdir.tr_logres;
		tres.tr_logcount = XFS_MKDIR_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
	} else {
		resblks = XFS_CREATE_SPACE_RES(mp, name->len);
		tres.tr_logres = M_RES(mp)->tr_create.tr_logres;
		tres.tr_logcount = XFS_CREATE_LOG_COUNT;
		tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
	}

	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;

	/*
	 * Initially assume that the file does not exist and
	 * reserve the resources for that case.  If that is not
	 * the case we'll drop the one we have and get a more
	 * appropriate transaction later.
	 */
	tres.tr_logflags = XFS_TRANS_PERM_LOG_RES;
	error = xfs_trans_reserve(tp, &tres, resblks, 0);
	if (error == ENOSPC) {
		/* flush outstanding delalloc blocks and retry */
		xfs_flush_inodes(mp);
		error = xfs_trans_reserve(tp, &tres, resblks, 0);
	}
	if (error == ENOSPC) {
		/* No space at all so try a "no-allocation" reservation */
		resblks = 0;
		error = xfs_trans_reserve(tp, &tres, 0, 0);
	}
	if (error) {
		cancel_flags = 0;
		goto out_trans_cancel;
	}

	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
	unlock_dp_on_error = true;

	xfs_bmap_init(&free_list, &first_block);

	/*
	 * Reserve disk quota and the inode.
	 */
	error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp,
						pdqp, resblks, 1, 0);
	if (error)
		goto out_trans_cancel;

	error = xfs_dir_canenter(tp, dp, name, resblks);
	if (error)
		goto out_trans_cancel;

	/*
	 * A newly created regular or special file just has one directory
	 * entry pointing to it, but a directory also has the "." entry
	 * pointing to itself.
	 */
	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
			       prid, resblks > 0, &ip, &committed);
	if (error) {
		if (error == ENOSPC)
			goto out_trans_cancel;
		goto out_trans_abort;
	}

	/*
	 * Now we join the directory inode to the transaction.  We do not do it
	 * earlier because xfs_dir_ialloc might commit the previous transaction
	 * (and release all the locks).  An error from here on will result in
	 * the transaction cancel unlocking dp so don't do it explicitly in the
	 * error path.
	 */
	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
	unlock_dp_on_error = false;

	error = xfs_dir_createname(tp, dp, name, ip->i_ino,
					&first_block, &free_list, resblks ?
					resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
	if (error) {
		ASSERT(error != ENOSPC);
		goto out_trans_abort;
	}
	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);

	if (is_dir) {
		error = xfs_dir_init(tp, ip, dp);
		if (error)
			goto out_bmap_cancel;

		error = xfs_bumplink(tp, dp);
		if (error)
			goto out_bmap_cancel;
	}

	/*
	 * If this is a synchronous mount, make sure that the
	 * create transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
		xfs_trans_set_sync(tp);

	/*
	 * Attach the dquot(s) to the inodes and modify them incore.
	 * These ids of the inode couldn't have changed since the new
	 * inode has been locked ever since it was created.
	 */
	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error)
		goto out_bmap_cancel;

	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
	if (error)
		goto out_release_inode;

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	*ipp = ip;
	return 0;

 out_bmap_cancel:
	xfs_bmap_cancel(&free_list);
 out_trans_abort:
	cancel_flags |= XFS_TRANS_ABORT;
 out_trans_cancel:
	xfs_trans_cancel(tp, cancel_flags);
 out_release_inode:
	/*
	 * Wait until after the current transaction is aborted to
	 * release the inode.  This prevents recursive transactions
	 * and deadlocks from xfs_inactive.
	 */
	if (ip)
		IRELE(ip);

	xfs_qm_dqrele(udqp);
	xfs_qm_dqrele(gdqp);
	xfs_qm_dqrele(pdqp);

	if (unlock_dp_on_error)
		xfs_iunlock(dp, XFS_ILOCK_EXCL);
	return error;
}

int
xfs_link(
	xfs_inode_t		*tdp,
	xfs_inode_t		*sip,
	struct xfs_name		*target_name)
{
	xfs_mount_t		*mp = tdp->i_mount;
	xfs_trans_t		*tp;
	int			error;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	int			cancel_flags;
	int			committed;
	int			resblks;

	trace_xfs_link(tdp, target_name);

	ASSERT(!S_ISDIR(sip->i_d.di_mode));

	if (XFS_FORCED_SHUTDOWN(mp))
		return XFS_ERROR(EIO);

	error = xfs_qm_dqattach(sip, 0);
	if (error)
		goto std_return;

	error = xfs_qm_dqattach(tdp, 0);
	if (error)
		goto std_return;

	tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
	resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0);
	if (error == ENOSPC) {
		resblks = 0;
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0);
	}
	if (error) {
		cancel_flags = 0;
		goto error_return;
	}

	xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL);

	xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
	xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);

	/*
	 * If we are using project inheritance, we only allow hard link
	 * creation in our tree when the project IDs are the same; else
	 * the tree quota mechanism could be circumvented.
	 */
	if (unlikely((tdp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
		     (xfs_get_projid(tdp) != xfs_get_projid(sip)))) {
		error = XFS_ERROR(EXDEV);
		goto error_return;
	}

	error = xfs_dir_canenter(tp, tdp, target_name, resblks);
	if (error)
		goto error_return;

	xfs_bmap_init(&free_list, &first_block);

	error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
					&first_block, &free_list, resblks);
	if (error)
		goto abort_return;
	xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
	xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);

	error = xfs_bumplink(tp, sip);
	if (error)
		goto abort_return;

	/*
	 * If this is a synchronous mount, make sure that the
	 * link transaction goes to disk before returning to
	 * the user.
	 */
	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
		xfs_trans_set_sync(tp);
	}

	error = xfs_bmap_finish(&tp, &free_list, &committed);
	if (error) {
		xfs_bmap_cancel(&free_list);
		goto abort_return;
	}

	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);

 abort_return:
	cancel_flags |= XFS_TRANS_ABORT;
 error_return:
	xfs_trans_cancel(tp, cancel_flags);
 std_return:
	return error;
}

/*
 * Free up the underlying blocks past new_size.  The new size must be smaller
 * than the current size.  This routine can be used both for the attribute and
 * data fork, and does not modify the inode size, which is left to the caller.
 *
 * The transaction passed to this routine must have made a permanent log
 * reservation of at least XFS_ITRUNCATE_LOG_RES.  This routine may commit the
 * given transaction and start new ones, so make sure everything involved in
 * the transaction is tidy before calling here.  Some transaction will be
 * returned to the caller to be committed.  The incoming transaction must
 * already include the inode, and both inode locks must be held exclusively.
 * The inode must also be "held" within the transaction.  On return the inode
 * will be "held" within the returned transaction.  This routine does NOT
 * require any disk space to be reserved for it within the transaction.
 *
 * If we get an error, we must return with the inode locked and linked into the
 * current transaction. This keeps things simple for the higher level code,
 * because it always knows that the inode is locked and held in the transaction
 * that returns to it whether errors occur or not.  We don't mark the inode
 * dirty on error so that transactions can be easily aborted if possible.
 */
int
xfs_itruncate_extents(
	struct xfs_trans	**tpp,
	struct xfs_inode	*ip,
	int			whichfork,
	xfs_fsize_t		new_size)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_trans	*tp = *tpp;
	struct xfs_trans	*ntp;
	xfs_bmap_free_t		free_list;
	xfs_fsblock_t		first_block;
	xfs_fileoff_t		first_unmap_block;
	xfs_fileoff_t		last_block;
	xfs_filblks_t		unmap_len;
	int			committed;
	int			error = 0;
	int			done = 0;

	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
	ASSERT(!atomic_read(&VFS_I(ip)->i_count) ||
	       xfs_isilocked(ip, XFS_IOLOCK_EXCL));
	ASSERT(new_size <= XFS_ISIZE(ip));
	ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
	ASSERT(ip->i_itemp != NULL);
	ASSERT(ip->i_itemp->ili_lock_flags == 0);
	ASSERT(!XFS_NOT_DQATTACHED(mp, ip));

	trace_xfs_itruncate_extents_start(ip, new_size);

	/*
	 * Since it is possible for space to become allocated beyond
	 * the end of the file (in a crash where the space is allocated
	 * but the inode size is not yet updated), simply remove any
	 * blocks which show up between the new EOF and the maximum
	 * possible file size.  If the first block to be removed is
	 * beyond the maximum file size (ie it is the same as last_block),
	 * then there is nothing to do.
	 */
	first_unmap_block = XFS_B_TO_FSB(mp, (xfs_ufsize_t)new_size);
	last_block = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes);
	if (first_unmap_block == last_block)
		return 0;

	ASSERT(first_unmap_block < last_block);
	unmap_len = last_block - first_unmap_block + 1;
	while (!done) {
		xfs_bmap_init(&free_list, &first_block);
		error = xfs_bunmapi(tp, ip,
				    first_unmap_block, unmap_len,
				    xfs_bmapi_aflag(whichfork),
				    XFS_ITRUNC_MAX_EXTENTS,
				    &first_block, &free_list,
				    &done);
		if (error)
			goto out_bmap_cancel;

		/*
		 * Duplicate the transaction that has the permanent
		 * reservation and commit the old transaction.
		 */
		error = xfs_bmap_finish(&tp, &free_list, &committed);
		if (committed)
			xfs_trans_ijoin(tp, ip, 0);
		if (error)
			goto out_bmap_cancel;

		if (committed) {
			/*
			 * Mark the inode dirty so it will be logged and
			 * moved forward in the log as part of every commit.
			 */
			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
		}

		ntp = xfs_trans_dup(tp);
		error = xfs_trans_commit(tp, 0);
		tp = ntp;

		xfs_trans_ijoin(tp, ip, 0);

		if (error)
			goto out;

		/*
		 * Transaction commit worked ok so we can drop the extra ticket
		 * reference that we gained in xfs_trans_dup()
		 */
		xfs_log_ticket_put(tp->t_ticket);
		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
		if (error)
			goto out;
	}

	/*
	 * Always re-log the inode so that our permanent transaction can keep
	 * on rolling it forward in the log.
	 */
	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);

	trace_xfs_itruncate_extents_end(ip, new_size);

out:
	*tpp = tp;
	return error;
out_bmap_cancel:
	/*
	 * If the bunmapi call encounters an error, return to the caller where
	 * the transaction can be properly aborted.  We just need to make sure
	 * we're not holding any resources that we were not when we came in.
	 */
	xfs_bmap_cancel(&free_list);
	goto out;
}
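
/*
 * Illustrative sketch (not from the original source): the caller contract
 * described above, as xfs_inactive_truncate() below exercises it -- a
 * permanent reservation, the inode joined and held, and the returned
 * transaction committed (or cancelled with XFS_TRANS_ABORT) by the caller.
 *
 *	xfs_trans_ijoin(tp, ip, 0);
 *	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, new_size);
 *	// tp may now be a different transaction; commit or cancel it
 */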

int
xfs_release(
	xfs_inode_t	*ip)
{
	xfs_mount_t	*mp = ip->i_mount;
	int		error;

	if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0))
		return 0;

	/* If this is a read-only mount, don't do this (would generate I/O) */
	if (mp->m_flags & XFS_MOUNT_RDONLY)
		return 0;

	if (!XFS_FORCED_SHUTDOWN(mp)) {
		int truncated;

		/*
		 * If we are using filestreams, and we have an unlinked
		 * file that we are processing the last close on, then nothing
		 * will be able to reopen and write to this file. Purge this
		 * inode from the filestreams cache so that it doesn't delay
		 * teardown of the inode.
		 */
		if ((ip->i_d.di_nlink == 0) && xfs_inode_is_filestream(ip))
			xfs_filestream_deassociate(ip);

		/*
		 * If we previously truncated this file and removed old data
		 * in the process, we want to initiate "early" writeout on
		 * the last close.  This is an attempt to combat the notorious
		 * NULL files problem which is particularly noticeable from a
		 * truncate down, buffered (re-)write (delalloc), followed by
		 * a crash.  What we are effectively doing here is
		 * significantly reducing the time window where we'd otherwise
		 * be exposed to that problem.
		 */
		truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
		if (truncated) {
			xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
			if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
				error = -filemap_flush(VFS_I(ip)->i_mapping);
				if (error)
					return error;
			}
		}
	}

	if (ip->i_d.di_nlink == 0)
		return 0;

	if (xfs_can_free_eofblocks(ip, false)) {
		/*
		 * If we can't get the iolock just skip truncating the blocks
		 * past EOF because we could deadlock with the mmap_sem
		 * otherwise.  We'll get another chance to drop them once the
		 * last reference to the inode is dropped, so we'll never leak
		 * blocks permanently.
		 *
		 * Further, if the inode is being opened, written and closed
		 * frequently and we have delayed allocation blocks
		 * outstanding (e.g. streaming writes from the NFS server),
		 * truncating the blocks past EOF will cause fragmentation to
		 * occur.
		 *
		 * In this case don't do the truncation, either, but we have to
		 * be careful how we detect this case. Blocks beyond EOF show
		 * up as i_delayed_blks even when the inode is clean, so we
		 * need to truncate them away first before checking for a dirty
		 * release. Hence on the first dirty close we will still remove
		 * the speculative allocation, but after that we will leave it
		 * in place.
		 */
1646 		if (xfs_iflags_test(ip, XFS_IDIRTY_RELEASE))
1647 			return 0;
1648 
1649 		error = xfs_free_eofblocks(mp, ip, true);
1650 		if (error && error != EAGAIN)
1651 			return error;
1652 
1653 		/* delalloc blocks after truncation means it really is dirty */
1654 		if (ip->i_delayed_blks)
1655 			xfs_iflags_set(ip, XFS_IDIRTY_RELEASE);
1656 	}
1657 	return 0;
1658 }
1659 
1660 /*
1661  * xfs_inactive_truncate
1662  *
1663  * Called to perform a truncate when an inode becomes unlinked.
1664  */
1665 STATIC int
1666 xfs_inactive_truncate(
1667 	struct xfs_inode *ip)
1668 {
1669 	struct xfs_mount	*mp = ip->i_mount;
1670 	struct xfs_trans	*tp;
1671 	int			error;
1672 
1673 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1674 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0);
1675 	if (error) {
1676 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1677 		xfs_trans_cancel(tp, 0);
1678 		return error;
1679 	}
1680 
1681 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1682 	xfs_trans_ijoin(tp, ip, 0);
1683 
1684 	/*
1685 	 * Log the inode size first to prevent stale data exposure in the event
1686 	 * of a system crash before the truncate completes. See the related
1687 	 * comment in xfs_setattr_size() for details.
1688 	 */
1689 	ip->i_d.di_size = 0;
1690 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
1691 
1692 	error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0);
1693 	if (error)
1694 		goto error_trans_cancel;
1695 
1696 	ASSERT(ip->i_d.di_nextents == 0);
1697 
1698 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1699 	if (error)
1700 		goto error_unlock;
1701 
1702 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1703 	return 0;
1704 
1705 error_trans_cancel:
1706 	xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT);
1707 error_unlock:
1708 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1709 	return error;
1710 }
1711 
1712 /*
1713  * xfs_inactive_ifree()
1714  *
1715  * Perform the inode free when an inode is unlinked.
1716  */
1717 STATIC int
1718 xfs_inactive_ifree(
1719 	struct xfs_inode *ip)
1720 {
1721 	xfs_bmap_free_t		free_list;
1722 	xfs_fsblock_t		first_block;
1723 	int			committed;
1724 	struct xfs_mount	*mp = ip->i_mount;
1725 	struct xfs_trans	*tp;
1726 	int			error;
1727 
1728 	tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1729 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ifree, 0, 0);
1730 	if (error) {
1731 		ASSERT(XFS_FORCED_SHUTDOWN(mp));
1732 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES);
1733 		return error;
1734 	}
1735 
1736 	xfs_ilock(ip, XFS_ILOCK_EXCL);
1737 	xfs_trans_ijoin(tp, ip, 0);
1738 
1739 	xfs_bmap_init(&free_list, &first_block);
1740 	error = xfs_ifree(tp, ip, &free_list);
1741 	if (error) {
1742 		/*
1743 		 * If we fail to free the inode, shut down.  The cancel
1744 		 * might do that, we need to make sure.  Otherwise the
1745 		 * inode might be lost for a long time or forever.
1746 		 */
1747 		if (!XFS_FORCED_SHUTDOWN(mp)) {
1748 			xfs_notice(mp, "%s: xfs_ifree returned error %d",
1749 				__func__, error);
1750 			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1751 		}
1752 		xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT);
1753 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
1754 		return error;
1755 	}
1756 
1757 	/*
1758 	 * Credit the quota account(s). The inode is gone.
1759 	 */
1760 	xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1761 
1762 	/*
1763 	 * Just ignore errors at this point.  There is nothing we can
1764 	 * do except to try to keep going. Make sure it's not a silent
1765 	 * error.
1766 	 */
1767 	error = xfs_bmap_finish(&tp,  &free_list, &committed);
1768 	if (error)
1769 		xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
1770 			__func__, error);
1771 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1772 	if (error)
1773 		xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1774 			__func__, error);
1775 
1776 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
1777 	return 0;
1778 }
1779 
1780 /*
1781  * xfs_inactive
1782  *
1783  * This is called when the vnode reference count for the vnode
1784  * goes to zero.  If the file has been unlinked, then it must
1785  * now be truncated.  Also, we clear all of the read-ahead state
1786  * kept for the inode here since the file is now closed.
1787  */
1788 void
1789 xfs_inactive(
1790 	xfs_inode_t	*ip)
1791 {
1792 	struct xfs_mount	*mp;
1793 	int			error;
1794 	int			truncate = 0;
1795 
1796 	/*
1797 	 * If the inode is already free, then there can be nothing
1798 	 * to clean up here.
1799 	 */
1800 	if (ip->i_d.di_mode == 0) {
1801 		ASSERT(ip->i_df.if_real_bytes == 0);
1802 		ASSERT(ip->i_df.if_broot_bytes == 0);
1803 		return;
1804 	}
1805 
1806 	mp = ip->i_mount;
1807 
1808 	/* If this is a read-only mount, don't do this (would generate I/O) */
1809 	if (mp->m_flags & XFS_MOUNT_RDONLY)
1810 		return;
1811 
1812 	if (ip->i_d.di_nlink != 0) {
1813 		/*
1814 		 * force is true because we are evicting an inode from the
1815 		 * cache. Post-eof blocks must be freed, lest we end up with
1816 		 * broken free space accounting.
1817 		 */
1818 		if (xfs_can_free_eofblocks(ip, true))
1819 			xfs_free_eofblocks(mp, ip, false);
1820 
1821 		return;
1822 	}
1823 
1824 	if (S_ISREG(ip->i_d.di_mode) &&
1825 	    (ip->i_d.di_size != 0 || XFS_ISIZE(ip) != 0 ||
1826 	     ip->i_d.di_nextents > 0 || ip->i_delayed_blks > 0))
1827 		truncate = 1;
1828 
1829 	error = xfs_qm_dqattach(ip, 0);
1830 	if (error)
1831 		return;
1832 
1833 	if (S_ISLNK(ip->i_d.di_mode))
1834 		error = xfs_inactive_symlink(ip);
1835 	else if (truncate)
1836 		error = xfs_inactive_truncate(ip);
1837 	if (error)
1838 		return;
1839 
1840 	/*
1841 	 * If there are attributes associated with the file then blow them away
1842 	 * now.  The code calls a routine that recursively deconstructs the
1843 	 * attribute fork.  We need to just commit the current transaction
1844 	 * because we can't use it for xfs_attr_inactive().
1845 	 */
1846 	if (ip->i_d.di_anextents > 0) {
1847 		ASSERT(ip->i_d.di_forkoff != 0);
1848 
1849 		error = xfs_attr_inactive(ip);
1850 		if (error)
1851 			return;
1852 	}
1853 
1854 	if (ip->i_afp)
1855 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
1856 
1857 	ASSERT(ip->i_d.di_anextents == 0);
1858 
1859 	/*
1860 	 * Free the inode.
1861 	 */
1862 	error = xfs_inactive_ifree(ip);
1863 	if (error)
1864 		return;
1865 
1866 	/*
1867 	 * Release the dquots held by inode, if any.
1868 	 */
1869 	xfs_qm_dqdetach(ip);
1870 }
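
/*
 * Illustrative summary, not part of the original source - the dispatch
 * xfs_inactive() performs above, in brief:
 *
 *	di_mode == 0:		already free, nothing to do;
 *	di_nlink != 0:		trim post-EOF blocks only and return;
 *	symlink:		xfs_inactive_symlink();
 *	regular file w/ data:	xfs_inactive_truncate();
 *	finally:		xfs_attr_inactive() if an attr fork exists,
 *				then xfs_inactive_ifree() and
 *				xfs_qm_dqdetach().
 */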
1871 
1872 /*
1873  * This is called when the inode's link count goes to 0.
1874  * We place the on-disk inode on a list in the AGI.  It
1875  * will be pulled from this list when the inode is freed.
1876  */
1877 int
1878 xfs_iunlink(
1879 	xfs_trans_t	*tp,
1880 	xfs_inode_t	*ip)
1881 {
1882 	xfs_mount_t	*mp;
1883 	xfs_agi_t	*agi;
1884 	xfs_dinode_t	*dip;
1885 	xfs_buf_t	*agibp;
1886 	xfs_buf_t	*ibp;
1887 	xfs_agino_t	agino;
1888 	short		bucket_index;
1889 	int		offset;
1890 	int		error;
1891 
1892 	ASSERT(ip->i_d.di_nlink == 0);
1893 	ASSERT(ip->i_d.di_mode != 0);
1894 
1895 	mp = tp->t_mountp;
1896 
1897 	/*
1898 	 * Get the agi buffer first.  It ensures lock ordering
1899 	 * on the list.
1900 	 */
1901 	error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
1902 	if (error)
1903 		return error;
1904 	agi = XFS_BUF_TO_AGI(agibp);
1905 
1906 	/*
1907 	 * Get the index into the agi hash table for the
1908 	 * list this inode will go on.
1909 	 */
1910 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1911 	ASSERT(agino != 0);
1912 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1913 	ASSERT(agi->agi_unlinked[bucket_index]);
1914 	ASSERT(be32_to_cpu(agi->agi_unlinked[bucket_index]) != agino);
1915 
1916 	if (agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO)) {
1917 		/*
1918 		 * There is already another inode in the bucket we need
1919 		 * to add ourselves to.  Add us at the front of the list.
1920 		 * Here we put the head pointer into our next pointer,
1921 		 * and then we fall through to point the head at us.
1922 		 */
1923 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
1924 				       0, 0);
1925 		if (error)
1926 			return error;
1927 
1928 		ASSERT(dip->di_next_unlinked == cpu_to_be32(NULLAGINO));
1929 		dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
1930 		offset = ip->i_imap.im_boffset +
1931 			offsetof(xfs_dinode_t, di_next_unlinked);
1932 
1933 		/* need to recalc the inode CRC if appropriate */
1934 		xfs_dinode_calc_crc(mp, dip);
1935 
1936 		xfs_trans_inode_buf(tp, ibp);
1937 		xfs_trans_log_buf(tp, ibp, offset,
1938 				  (offset + sizeof(xfs_agino_t) - 1));
1939 		xfs_inobp_check(mp, ibp);
1940 	}
1941 
1942 	/*
1943 	 * Point the bucket head pointer at the inode being inserted.
1944 	 */
1945 	ASSERT(agino != 0);
1946 	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
1947 	offset = offsetof(xfs_agi_t, agi_unlinked) +
1948 		(sizeof(xfs_agino_t) * bucket_index);
1949 	xfs_trans_log_buf(tp, agibp, offset,
1950 			  (offset + sizeof(xfs_agino_t) - 1));
1951 	return 0;
1952 }
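
/*
 * Illustrative sketch, not part of the original source: stripped of the
 * buffer mapping, logging and CRC work, the list manipulation done by
 * xfs_iunlink() above is a plain push-front on a singly linked list whose
 * head lives in an AGI hash bucket and whose per-node link is the on-disk
 * inode's di_next_unlinked field. The helper below is hypothetical and
 * exists only to show that shape.
 */
static inline void
xfs_iunlink_push_front_sketch(
	__be32		*bucket,	/* &agi->agi_unlinked[bucket_index] */
	struct xfs_dinode *dip,		/* on-disk inode being inserted */
	xfs_agino_t	agino)		/* its AG-relative inode number */
{
	/* our next pointer takes the old head (NULLAGINO if list empty) */
	dip->di_next_unlinked = *bucket;
	/* the bucket head now points at us */
	*bucket = cpu_to_be32(agino);
}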
1953 
1954 /*
1955  * Pull the on-disk inode from the AGI unlinked list.
1956  */
1957 STATIC int
1958 xfs_iunlink_remove(
1959 	xfs_trans_t	*tp,
1960 	xfs_inode_t	*ip)
1961 {
1962 	xfs_ino_t	next_ino;
1963 	xfs_mount_t	*mp;
1964 	xfs_agi_t	*agi;
1965 	xfs_dinode_t	*dip;
1966 	xfs_buf_t	*agibp;
1967 	xfs_buf_t	*ibp;
1968 	xfs_agnumber_t	agno;
1969 	xfs_agino_t	agino;
1970 	xfs_agino_t	next_agino;
1971 	xfs_buf_t	*last_ibp;
1972 	xfs_dinode_t	*last_dip = NULL;
1973 	short		bucket_index;
1974 	int		offset, last_offset = 0;
1975 	int		error;
1976 
1977 	mp = tp->t_mountp;
1978 	agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
1979 
1980 	/*
1981 	 * Get the agi buffer first.  It ensures lock ordering
1982 	 * on the list.
1983 	 */
1984 	error = xfs_read_agi(mp, tp, agno, &agibp);
1985 	if (error)
1986 		return error;
1987 
1988 	agi = XFS_BUF_TO_AGI(agibp);
1989 
1990 	/*
1991 	 * Get the index into the agi hash table for the
1992 	 * list this inode is on.
1993 	 */
1994 	agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
1995 	ASSERT(agino != 0);
1996 	bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
1997 	ASSERT(agi->agi_unlinked[bucket_index] != cpu_to_be32(NULLAGINO));
1998 	ASSERT(agi->agi_unlinked[bucket_index]);
1999 
2000 	if (be32_to_cpu(agi->agi_unlinked[bucket_index]) == agino) {
2001 		/*
2002 		 * We're at the head of the list.  Get the inode's on-disk
2003 		 * buffer to see if there is anyone after us on the list.
2004 		 * Only modify our next pointer if it is not already NULLAGINO.
2005 		 * This saves us the overhead of dealing with the buffer when
2006 		 * there is no need to change it.
2007 		 */
2008 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2009 				       0, 0);
2010 		if (error) {
2011 			xfs_warn(mp, "%s: xfs_imap_to_bp returned error %d.",
2012 				__func__, error);
2013 			return error;
2014 		}
2015 		next_agino = be32_to_cpu(dip->di_next_unlinked);
2016 		ASSERT(next_agino != 0);
2017 		if (next_agino != NULLAGINO) {
2018 			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2019 			offset = ip->i_imap.im_boffset +
2020 				offsetof(xfs_dinode_t, di_next_unlinked);
2021 
2022 			/* need to recalc the inode CRC if appropriate */
2023 			xfs_dinode_calc_crc(mp, dip);
2024 
2025 			xfs_trans_inode_buf(tp, ibp);
2026 			xfs_trans_log_buf(tp, ibp, offset,
2027 					  (offset + sizeof(xfs_agino_t) - 1));
2028 			xfs_inobp_check(mp, ibp);
2029 		} else {
2030 			xfs_trans_brelse(tp, ibp);
2031 		}
2032 		/*
2033 		 * Point the bucket head pointer at the next inode.
2034 		 */
2035 		ASSERT(next_agino != 0);
2036 		ASSERT(next_agino != agino);
2037 		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
2038 		offset = offsetof(xfs_agi_t, agi_unlinked) +
2039 			(sizeof(xfs_agino_t) * bucket_index);
2040 		xfs_trans_log_buf(tp, agibp, offset,
2041 				  (offset + sizeof(xfs_agino_t) - 1));
2042 	} else {
2043 		/*
2044 		 * We need to search the list for the inode being freed.
2045 		 */
2046 		next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2047 		last_ibp = NULL;
2048 		while (next_agino != agino) {
2049 			struct xfs_imap	imap;
2050 
2051 			if (last_ibp)
2052 				xfs_trans_brelse(tp, last_ibp);
2053 
2054 			imap.im_blkno = 0;
2055 			next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
2056 
2057 			error = xfs_imap(mp, tp, next_ino, &imap, 0);
2058 			if (error) {
2059 				xfs_warn(mp,
2060 	"%s: xfs_imap returned error %d.",
2061 					 __func__, error);
2062 				return error;
2063 			}
2064 
2065 			error = xfs_imap_to_bp(mp, tp, &imap, &last_dip,
2066 					       &last_ibp, 0, 0);
2067 			if (error) {
2068 				xfs_warn(mp,
2069 	"%s: xfs_imap_to_bp returned error %d.",
2070 					__func__, error);
2071 				return error;
2072 			}
2073 
2074 			last_offset = imap.im_boffset;
2075 			next_agino = be32_to_cpu(last_dip->di_next_unlinked);
2076 			ASSERT(next_agino != NULLAGINO);
2077 			ASSERT(next_agino != 0);
2078 		}
2079 
2080 		/*
2081 		 * Now last_ibp points to the buffer previous to us on the
2082 		 * unlinked list.  Pull us from the list.
2083 		 */
2084 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &ibp,
2085 				       0, 0);
2086 		if (error) {
2087 			xfs_warn(mp, "%s: xfs_imap_to_bp(2) returned error %d.",
2088 				__func__, error);
2089 			return error;
2090 		}
2091 		next_agino = be32_to_cpu(dip->di_next_unlinked);
2092 		ASSERT(next_agino != 0);
2093 		ASSERT(next_agino != agino);
2094 		if (next_agino != NULLAGINO) {
2095 			dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
2096 			offset = ip->i_imap.im_boffset +
2097 				offsetof(xfs_dinode_t, di_next_unlinked);
2098 
2099 			/* need to recalc the inode CRC if appropriate */
2100 			xfs_dinode_calc_crc(mp, dip);
2101 
2102 			xfs_trans_inode_buf(tp, ibp);
2103 			xfs_trans_log_buf(tp, ibp, offset,
2104 					  (offset + sizeof(xfs_agino_t) - 1));
2105 			xfs_inobp_check(mp, ibp);
2106 		} else {
2107 			xfs_trans_brelse(tp, ibp);
2108 		}
2109 		/*
2110 		 * Point the previous inode on the list to the next inode.
2111 		 */
2112 		last_dip->di_next_unlinked = cpu_to_be32(next_agino);
2113 		ASSERT(next_agino != 0);
2114 		offset = last_offset + offsetof(xfs_dinode_t, di_next_unlinked);
2115 
2116 		/* need to recalc the inode CRC if appropriate */
2117 		xfs_dinode_calc_crc(mp, last_dip);
2118 
2119 		xfs_trans_inode_buf(tp, last_ibp);
2120 		xfs_trans_log_buf(tp, last_ibp, offset,
2121 				  (offset + sizeof(xfs_agino_t) - 1));
2122 		xfs_inobp_check(mp, last_ibp);
2123 	}
2124 	return 0;
2125 }
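
/*
 * Illustrative sketch, not part of the original source: modelling each
 * inode's di_next_unlinked link as an element of a hypothetical next[]
 * array and the AGI bucket as *head, the removal above is the classic
 * two-case singly-linked-list unlink. All names below are stand-ins.
 */
static inline void
xfs_iunlink_remove_sketch(
	xfs_agino_t	*head,		/* stand-in for the AGI bucket */
	xfs_agino_t	*next,		/* stand-in for di_next_unlinked */
	xfs_agino_t	agino)		/* entry being removed */
{
	xfs_agino_t	prev;

	if (*head == agino) {
		/* head case: repoint the bucket at our successor */
		*head = next[agino];
	} else {
		/* search case: walk to our predecessor, repoint its link */
		for (prev = *head; next[prev] != agino; prev = next[prev])
			;
		next[prev] = next[agino];
	}
	/* terminate our own link, matching the NULLAGINO writes above */
	next[agino] = NULLAGINO;
}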
2126 
2127 /*
2128  * A big issue when freeing the inode cluster is that we _cannot_ skip any
2129  * inodes that are in memory - they all must be marked stale and attached to
2130  * the cluster buffer.
2131  */
2132 STATIC int
2133 xfs_ifree_cluster(
2134 	xfs_inode_t	*free_ip,
2135 	xfs_trans_t	*tp,
2136 	xfs_ino_t	inum)
2137 {
2138 	xfs_mount_t		*mp = free_ip->i_mount;
2139 	int			blks_per_cluster;
2140 	int			inodes_per_cluster;
2141 	int			nbufs;
2142 	int			i, j;
2143 	xfs_daddr_t		blkno;
2144 	xfs_buf_t		*bp;
2145 	xfs_inode_t		*ip;
2146 	xfs_inode_log_item_t	*iip;
2147 	xfs_log_item_t		*lip;
2148 	struct xfs_perag	*pag;
2149 
2150 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2151 	blks_per_cluster = xfs_icluster_size_fsb(mp);
2152 	inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog;
2153 	nbufs = mp->m_ialloc_blks / blks_per_cluster;
2154 
2155 	for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) {
2156 		blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
2157 					 XFS_INO_TO_AGBNO(mp, inum));
2158 
2159 		/*
2160 		 * We obtain and lock the backing buffer first in the process
2161 		 * here, as we have to ensure that any dirty inode that we
2162 		 * can't get the flush lock on is attached to the buffer.
2163 		 * If we scan the in-memory inodes first, then buffer IO can
2164 		 * complete before we get a lock on it, and hence we may fail
2165 		 * to mark all the active inodes on the buffer stale.
2166 		 */
2167 		bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2168 					mp->m_bsize * blks_per_cluster,
2169 					XBF_UNMAPPED);
2170 
2171 		if (!bp)
2172 			return ENOMEM;
2173 
2174 		/*
2175 		 * This buffer may not have been correctly initialised as we
2176 		 * didn't read it from disk. That's not important because we are
2177 		 * only using it to mark the buffer as stale in the log, and to
2178 		 * attach stale cached inodes on it. That means it will never be
2179 		 * dispatched for IO. If it is, we want to know about it, and we
2180 		 * want it to fail. We can achieve this by adding a write
2181 		 * verifier to the buffer.
2182 		 */
2183 		 bp->b_ops = &xfs_inode_buf_ops;
2184 
2185 		/*
2186 		 * Walk the inodes already attached to the buffer and mark them
2187 		 * stale. These will all have the flush locks held, so an
2188 		 * in-memory inode walk can't lock them. By marking them all
2189 		 * stale first, we will not attempt to lock them in the loop
2190 		 * below as the XFS_ISTALE flag will be set.
2191 		 */
2192 		lip = bp->b_fspriv;
2193 		while (lip) {
2194 			if (lip->li_type == XFS_LI_INODE) {
2195 				iip = (xfs_inode_log_item_t *)lip;
2196 				ASSERT(iip->ili_logged == 1);
2197 				lip->li_cb = xfs_istale_done;
2198 				xfs_trans_ail_copy_lsn(mp->m_ail,
2199 							&iip->ili_flush_lsn,
2200 							&iip->ili_item.li_lsn);
2201 				xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2202 			}
2203 			lip = lip->li_bio_list;
2204 		}
2205 
2206 
2207 		/*
2208 		 * For each inode in memory attempt to add it to the inode
2209 		 * buffer and set it up for being staled on buffer IO
2210 		 * completion.  This is safe as we've locked out tail pushing
2211 		 * and flushing by locking the buffer.
2212 		 *
2213 		 * We have already marked every inode that was part of a
2214 		 * transaction stale above, which means there is no point in
2215 		 * even trying to lock them.
2216 		 */
2217 		for (i = 0; i < inodes_per_cluster; i++) {
2218 retry:
2219 			rcu_read_lock();
2220 			ip = radix_tree_lookup(&pag->pag_ici_root,
2221 					XFS_INO_TO_AGINO(mp, (inum + i)));
2222 
2223 			/* Inode not in memory, nothing to do */
2224 			if (!ip) {
2225 				rcu_read_unlock();
2226 				continue;
2227 			}
2228 
2229 			/*
2230 			 * because this is an RCU protected lookup, we could
2231 			 * find a recently freed or even reallocated inode
2232 			 * during the lookup. We need to check under the
2233 			 * i_flags_lock for a valid inode here. Skip it if it
2234 			 * is not valid, the wrong inode or stale.
2235 			 */
2236 			spin_lock(&ip->i_flags_lock);
2237 			if (ip->i_ino != inum + i ||
2238 			    __xfs_iflags_test(ip, XFS_ISTALE)) {
2239 				spin_unlock(&ip->i_flags_lock);
2240 				rcu_read_unlock();
2241 				continue;
2242 			}
2243 			spin_unlock(&ip->i_flags_lock);
2244 
2245 			/*
2246 			 * Don't try to lock/unlock the current inode, but we
2247 			 * _cannot_ skip the other inodes that we did not find
2248 			 * in the list attached to the buffer and are not
2249 			 * already marked stale. If we can't lock it, back off
2250 			 * and retry.
2251 			 */
2252 			if (ip != free_ip &&
2253 			    !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2254 				rcu_read_unlock();
2255 				delay(1);
2256 				goto retry;
2257 			}
2258 			rcu_read_unlock();
2259 
2260 			xfs_iflock(ip);
2261 			xfs_iflags_set(ip, XFS_ISTALE);
2262 
2263 			/*
2264 			 * we don't need to attach clean inodes or those only
2265 			 * with unlogged changes (which we throw away, anyway).
2266 			 */
2267 			iip = ip->i_itemp;
2268 			if (!iip || xfs_inode_clean(ip)) {
2269 				ASSERT(ip != free_ip);
2270 				xfs_ifunlock(ip);
2271 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
2272 				continue;
2273 			}
2274 
2275 			iip->ili_last_fields = iip->ili_fields;
2276 			iip->ili_fields = 0;
2277 			iip->ili_logged = 1;
2278 			xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2279 						&iip->ili_item.li_lsn);
2280 
2281 			xfs_buf_attach_iodone(bp, xfs_istale_done,
2282 						  &iip->ili_item);
2283 
2284 			if (ip != free_ip)
2285 				xfs_iunlock(ip, XFS_ILOCK_EXCL);
2286 		}
2287 
2288 		xfs_trans_stale_inode_buf(tp, bp);
2289 		xfs_trans_binval(tp, bp);
2290 	}
2291 
2292 	xfs_perag_put(pag);
2293 	return 0;
2294 }
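
/*
 * Illustrative sketch, not part of the original source: the in-memory
 * walk in xfs_ifree_cluster() uses the usual RCU lookup discipline -
 * the radix tree lookup itself is unlocked, so the candidate must be
 * revalidated under its i_flags_lock before it is trusted:
 *
 *	rcu_read_lock();
 *	ip = radix_tree_lookup(&pag->pag_ici_root,
 *			       XFS_INO_TO_AGINO(mp, ino));
 *	if (ip) {
 *		spin_lock(&ip->i_flags_lock);
 *		if (ip->i_ino != ino ||
 *		    __xfs_iflags_test(ip, XFS_ISTALE))
 *			ip = NULL;	/* freed, reused or already stale */
 *		spin_unlock(&ip->i_flags_lock);
 *	}
 *	rcu_read_unlock();
 */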
2295 
2296 /*
2297  * This is called to return an inode to the inode free list.
2298  * The inode should already be truncated to 0 length and have
2299  * no pages associated with it.  This routine also assumes that
2300  * the inode is already a part of the transaction.
2301  *
2302  * The on-disk copy of the inode will have been added to the list
2303  * of unlinked inodes in the AGI. We need to remove the inode from
2304  * that list atomically with respect to freeing it here.
2305  */
2306 int
2307 xfs_ifree(
2308 	xfs_trans_t	*tp,
2309 	xfs_inode_t	*ip,
2310 	xfs_bmap_free_t	*flist)
2311 {
2312 	int			error;
2313 	int			delete;
2314 	xfs_ino_t		first_ino;
2315 
2316 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2317 	ASSERT(ip->i_d.di_nlink == 0);
2318 	ASSERT(ip->i_d.di_nextents == 0);
2319 	ASSERT(ip->i_d.di_anextents == 0);
2320 	ASSERT(ip->i_d.di_size == 0 || !S_ISREG(ip->i_d.di_mode));
2321 	ASSERT(ip->i_d.di_nblocks == 0);
2322 
2323 	/*
2324 	 * Pull the on-disk inode from the AGI unlinked list.
2325 	 */
2326 	error = xfs_iunlink_remove(tp, ip);
2327 	if (error)
2328 		return error;
2329 
2330 	error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino);
2331 	if (error)
2332 		return error;
2333 
2334 	ip->i_d.di_mode = 0;		/* mark incore inode as free */
2335 	ip->i_d.di_flags = 0;
2336 	ip->i_d.di_dmevmask = 0;
2337 	ip->i_d.di_forkoff = 0;		/* mark the attr fork not in use */
2338 	ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
2339 	ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS;
2340 	/*
2341 	 * Bump the generation count so no one will be confused
2342 	 * by reincarnations of this inode.
2343 	 */
2344 	ip->i_d.di_gen++;
2345 	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2346 
2347 	if (delete)
2348 		error = xfs_ifree_cluster(ip, tp, first_ino);
2349 
2350 	return error;
2351 }
2352 
2353 /*
2354  * This is called to unpin an inode.  The caller must have the inode locked
2355  * in at least shared mode so that the buffer cannot be subsequently pinned
2356  * once someone is waiting for it to be unpinned.
2357  */
2358 static void
2359 xfs_iunpin(
2360 	struct xfs_inode	*ip)
2361 {
2362 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2363 
2364 	trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
2365 
2366 	/* Give the log a push to start the unpinning I/O */
2367 	xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0);
2368 
2369 }
2370 
2371 static void
2372 __xfs_iunpin_wait(
2373 	struct xfs_inode	*ip)
2374 {
2375 	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IPINNED_BIT);
2376 	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IPINNED_BIT);
2377 
2378 	xfs_iunpin(ip);
2379 
2380 	do {
2381 		prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
2382 		if (xfs_ipincount(ip))
2383 			io_schedule();
2384 	} while (xfs_ipincount(ip));
2385 	finish_wait(wq, &wait.wait);
2386 }
2387 
2388 void
2389 xfs_iunpin_wait(
2390 	struct xfs_inode	*ip)
2391 {
2392 	if (xfs_ipincount(ip))
2393 		__xfs_iunpin_wait(ip);
2394 }
2395 
2396 /*
2397  * Removing an inode from the namespace involves removing the directory entry
2398  * and dropping the link count on the inode. Removing the directory entry can
2399  * result in locking an AGF (directory blocks were freed) and removing a link
2400  * count can result in placing the inode on an unlinked list which results in
2401  * locking an AGI.
2402  *
2403  * The big problem here is that we have an ordering constraint on AGF and AGI
2404  * locking - inode allocation locks the AGI, then can allocate a new extent for
2405  * new inodes, locking the AGF after the AGI. Similarly, freeing the inode
2406  * removes the inode from the unlinked list, requiring that we lock the AGI
2407  * first, and then freeing the inode can result in an inode chunk being freed
2408  * and hence freeing disk space requiring that we lock an AGF.
2409  *
2410  * Hence the ordering that is imposed by other parts of the code is AGI before
2411  * AGF. This means we cannot remove the directory entry before we drop the inode
2412  * reference count and put it on the unlinked list as this results in a lock
2413  * order of AGF then AGI, and this can deadlock against inode allocation and
2414  * freeing. Therefore we must drop the link counts before we remove the
2415  * directory entry.
2416  *
2417  * This is still safe from a transactional point of view - it is not until we
2418  * get to xfs_bmap_finish() that we have the possibility of multiple
2419  * transactions in this operation. Hence as long as we remove the directory
2420  * entry and drop the link count in the first transaction of the remove
2421  * operation, there are no transactional constraints on the ordering here.
2422  */
2423 int
2424 xfs_remove(
2425 	xfs_inode_t             *dp,
2426 	struct xfs_name		*name,
2427 	xfs_inode_t		*ip)
2428 {
2429 	xfs_mount_t		*mp = dp->i_mount;
2430 	xfs_trans_t             *tp = NULL;
2431 	int			is_dir = S_ISDIR(ip->i_d.di_mode);
2432 	int                     error = 0;
2433 	xfs_bmap_free_t         free_list;
2434 	xfs_fsblock_t           first_block;
2435 	int			cancel_flags;
2436 	int			committed;
2437 	int			link_zero;
2438 	uint			resblks;
2439 	uint			log_count;
2440 
2441 	trace_xfs_remove(dp, name);
2442 
2443 	if (XFS_FORCED_SHUTDOWN(mp))
2444 		return XFS_ERROR(EIO);
2445 
2446 	error = xfs_qm_dqattach(dp, 0);
2447 	if (error)
2448 		goto std_return;
2449 
2450 	error = xfs_qm_dqattach(ip, 0);
2451 	if (error)
2452 		goto std_return;
2453 
2454 	if (is_dir) {
2455 		tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR);
2456 		log_count = XFS_DEFAULT_LOG_COUNT;
2457 	} else {
2458 		tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE);
2459 		log_count = XFS_REMOVE_LOG_COUNT;
2460 	}
2461 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2462 
2463 	/*
2464 	 * We try to get the real space reservation first,
2465 	 * allowing for directory btree deletion(s) implying
2466 	 * possible bmap insert(s).  If we can't get the space
2467 	 * reservation then we use 0 instead, and avoid the bmap
2468 	 * btree insert(s) in the directory code: if a bmap insert
2469 	 * tries to happen, the directory code instead trims the
2470 	 * LAST block from the directory.
2471 	 */
2472 	resblks = XFS_REMOVE_SPACE_RES(mp);
2473 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, resblks, 0);
2474 	if (error == ENOSPC) {
2475 		resblks = 0;
2476 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_remove, 0, 0);
2477 	}
2478 	if (error) {
2479 		ASSERT(error != ENOSPC);
2480 		cancel_flags = 0;
2481 		goto out_trans_cancel;
2482 	}
2483 
2484 	xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL);
2485 
2486 	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2487 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
2488 
2489 	/*
2490 	 * If we're removing a directory perform some additional validation.
2491 	 */
2492 	cancel_flags |= XFS_TRANS_ABORT;
2493 	if (is_dir) {
2494 		ASSERT(ip->i_d.di_nlink >= 2);
2495 		if (ip->i_d.di_nlink != 2) {
2496 			error = XFS_ERROR(ENOTEMPTY);
2497 			goto out_trans_cancel;
2498 		}
2499 		if (!xfs_dir_isempty(ip)) {
2500 			error = XFS_ERROR(ENOTEMPTY);
2501 			goto out_trans_cancel;
2502 		}
2503 
2504 		/* Drop the link from ip's "..".  */
2505 		error = xfs_droplink(tp, dp);
2506 		if (error)
2507 			goto out_trans_cancel;
2508 
2509 		/* Drop the "." link from ip to self.  */
2510 		error = xfs_droplink(tp, ip);
2511 		if (error)
2512 			goto out_trans_cancel;
2513 	} else {
2514 		/*
2515 		 * When removing a non-directory we need to log the parent
2516 		 * inode here.  For a directory this is done implicitly
2517 		 * by the xfs_droplink call for the ".." entry.
2518 		 */
2519 		xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
2520 	}
2521 	xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2522 
2523 	/* Drop the link from dp to ip. */
2524 	error = xfs_droplink(tp, ip);
2525 	if (error)
2526 		goto out_trans_cancel;
2527 
2528 	/* Determine if this is the last link while the inode is locked */
2529 	link_zero = (ip->i_d.di_nlink == 0);
2530 
2531 	xfs_bmap_init(&free_list, &first_block);
2532 	error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2533 					&first_block, &free_list, resblks);
2534 	if (error) {
2535 		ASSERT(error != ENOENT);
2536 		goto out_bmap_cancel;
2537 	}
2538 
2539 	/*
2540 	 * If this is a synchronous mount, make sure that the
2541 	 * remove transaction goes to disk before returning to
2542 	 * the user.
2543 	 */
2544 	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
2545 		xfs_trans_set_sync(tp);
2546 
2547 	error = xfs_bmap_finish(&tp, &free_list, &committed);
2548 	if (error)
2549 		goto out_bmap_cancel;
2550 
2551 	error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2552 	if (error)
2553 		goto std_return;
2554 
2555 	/*
2556 	 * If we are using filestreams, kill the stream association.
2557 	 * If the file is still open it may get a new one but that
2558 	 * will get killed on last close in xfs_close() so we don't
2559 	 * have to worry about that.
2560 	 */
2561 	if (!is_dir && link_zero && xfs_inode_is_filestream(ip))
2562 		xfs_filestream_deassociate(ip);
2563 
2564 	return 0;
2565 
2566  out_bmap_cancel:
2567 	xfs_bmap_cancel(&free_list);
2568  out_trans_cancel:
2569 	xfs_trans_cancel(tp, cancel_flags);
2570  std_return:
2571 	return error;
2572 }
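
/*
 * Illustrative note, not part of the original source: the AGI-before-AGF
 * rule described above is why xfs_remove() drops the link count before
 * removing the directory entry -
 *
 *	xfs_droplink(tp, ip);			may lock the AGI
 *	xfs_dir_removename(tp, dp, ...);	may lock an AGF
 *
 * Reversing the two could take AGF then AGI and deadlock against inode
 * allocation, which locks the AGI first and the AGF after it.
 */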
2573 
2574 /*
2575  * Enter all inodes for a rename transaction into a sorted array.
2576  */
2577 STATIC void
2578 xfs_sort_for_rename(
2579 	xfs_inode_t	*dp1,	/* in: old (source) directory inode */
2580 	xfs_inode_t	*dp2,	/* in: new (target) directory inode */
2581 	xfs_inode_t	*ip1,	/* in: inode of old entry */
2582 	xfs_inode_t	*ip2,	/* in: inode of new entry, if it
2583 				   already exists, NULL otherwise. */
2584 	xfs_inode_t	**i_tab,/* out: array of inode returned, sorted */
2585 	int		*num_inodes)  /* out: number of inodes in array */
2586 {
2587 	xfs_inode_t		*temp;
2588 	int			i, j;
2589 
2590 	/*
2591 	 * i_tab contains a list of pointers to inodes.  We initialize
2592 	 * the table here & we'll sort it.  We will then use it to
2593 	 * order the acquisition of the inode locks.
2594 	 *
2595 	 * Note that the table may contain duplicates.  e.g., dp1 == dp2.
2596 	 */
2597 	i_tab[0] = dp1;
2598 	i_tab[1] = dp2;
2599 	i_tab[2] = ip1;
2600 	if (ip2) {
2601 		*num_inodes = 4;
2602 		i_tab[3] = ip2;
2603 	} else {
2604 		*num_inodes = 3;
2605 		i_tab[3] = NULL;
2606 	}
2607 
2608 	/*
2609 	 * Sort the elements via bubble sort.  (Remember, there are at
2610 	 * most 4 elements to sort, so this is adequate.)
2611 	 */
2612 	for (i = 0; i < *num_inodes; i++) {
2613 		for (j = 1; j < *num_inodes; j++) {
2614 			if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) {
2615 				temp = i_tab[j];
2616 				i_tab[j] = i_tab[j-1];
2617 				i_tab[j-1] = temp;
2618 			}
2619 		}
2620 	}
2621 }
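
/*
 * Illustrative sketch, not part of the original source: hypothetical
 * caller shape showing why the sort matters. Every rename locks its
 * participants in ascending inode-number order, so all concurrent
 * renames share one global lock order and AB-BA deadlocks are ruled out:
 *
 *	xfs_inode_t	*inodes[4];
 *	int		num_inodes;
 *
 *	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
 *				inodes, &num_inodes);
 *	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
 */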
2622 
2623 /*
2624  * xfs_rename
2625  */
2626 int
2627 xfs_rename(
2628 	xfs_inode_t	*src_dp,
2629 	struct xfs_name	*src_name,
2630 	xfs_inode_t	*src_ip,
2631 	xfs_inode_t	*target_dp,
2632 	struct xfs_name	*target_name,
2633 	xfs_inode_t	*target_ip)
2634 {
2635 	xfs_trans_t	*tp = NULL;
2636 	xfs_mount_t	*mp = src_dp->i_mount;
2637 	int		new_parent;		/* moving to a new dir */
2638 	int		src_is_directory;	/* src_name is a directory */
2639 	int		error;
2640 	xfs_bmap_free_t free_list;
2641 	xfs_fsblock_t   first_block;
2642 	int		cancel_flags;
2643 	int		committed;
2644 	xfs_inode_t	*inodes[4];
2645 	int		spaceres;
2646 	int		num_inodes;
2647 
2648 	trace_xfs_rename(src_dp, target_dp, src_name, target_name);
2649 
2650 	new_parent = (src_dp != target_dp);
2651 	src_is_directory = S_ISDIR(src_ip->i_d.di_mode);
2652 
2653 	xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip,
2654 				inodes, &num_inodes);
2655 
2656 	xfs_bmap_init(&free_list, &first_block);
2657 	tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
2658 	cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2659 	spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
2660 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, spaceres, 0);
2661 	if (error == ENOSPC) {
2662 		spaceres = 0;
2663 		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_rename, 0, 0);
2664 	}
2665 	if (error) {
2666 		xfs_trans_cancel(tp, 0);
2667 		goto std_return;
2668 	}
2669 
2670 	/*
2671 	 * Attach the dquots to the inodes
2672 	 */
2673 	error = xfs_qm_vop_rename_dqattach(inodes);
2674 	if (error) {
2675 		xfs_trans_cancel(tp, cancel_flags);
2676 		goto std_return;
2677 	}
2678 
2679 	/*
2680 	 * Lock all the participating inodes. Depending upon whether
2681 	 * the target_name exists in the target directory, and
2682 	 * whether the target directory is the same as the source
2683 	 * directory, we can lock from 2 to 4 inodes.
2684 	 */
2685 	xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
2686 
2687 	/*
2688 	 * Join all the inodes to the transaction. From this point on,
2689 	 * we can rely on either trans_commit or trans_cancel to unlock
2690 	 * them.
2691 	 */
2692 	xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
2693 	if (new_parent)
2694 		xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
2695 	xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL);
2696 	if (target_ip)
2697 		xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL);
2698 
2699 	/*
2700 	 * If we are using project inheritance, we only allow renames
2701 	 * into our tree when the project IDs are the same; else the
2702 	 * tree quota mechanism would be circumvented.
2703 	 */
2704 	if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
2705 		     (xfs_get_projid(target_dp) != xfs_get_projid(src_ip)))) {
2706 		error = XFS_ERROR(EXDEV);
2707 		goto error_return;
2708 	}
2709 
2710 	/*
2711 	 * Set up the target.
2712 	 */
2713 	if (target_ip == NULL) {
2714 		/*
2715 		 * If there's no space reservation, check the entry will
2716 		 * fit before actually inserting it.
2717 		 */
2718 		error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
2719 		if (error)
2720 			goto error_return;
2721 		/*
2722 		 * If target does not exist and the rename crosses
2723 		 * directories, adjust the target directory link count
2724 		 * to account for the ".." reference from the new entry.
2725 		 */
2726 		error = xfs_dir_createname(tp, target_dp, target_name,
2727 						src_ip->i_ino, &first_block,
2728 						&free_list, spaceres);
2729 		if (error == ENOSPC)
2730 			goto error_return;
2731 		if (error)
2732 			goto abort_return;
2733 
2734 		xfs_trans_ichgtime(tp, target_dp,
2735 					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2736 
2737 		if (new_parent && src_is_directory) {
2738 			error = xfs_bumplink(tp, target_dp);
2739 			if (error)
2740 				goto abort_return;
2741 		}
2742 	} else { /* target_ip != NULL */
2743 		/*
2744 		 * If target exists and it's a directory, check that both
2745 		 * target and source are directories and that target can be
2746 		 * destroyed, or that neither is a directory.
2747 		 */
2748 		if (S_ISDIR(target_ip->i_d.di_mode)) {
2749 			/*
2750 			 * Make sure target dir is empty.
2751 			 */
2752 			if (!(xfs_dir_isempty(target_ip)) ||
2753 			    (target_ip->i_d.di_nlink > 2)) {
2754 				error = XFS_ERROR(EEXIST);
2755 				goto error_return;
2756 			}
2757 		}
2758 
2759 		/*
2760 		 * Link the source inode under the target name.
2761 		 * If the source inode is a directory and we are moving
2762 		 * it across directories, its ".." entry will be
2763 		 * inconsistent until we replace that down below.
2764 		 *
2765 		 * In case there is already an entry with the same
2766 		 * name at the destination directory, remove it first.
2767 		 */
2768 		error = xfs_dir_replace(tp, target_dp, target_name,
2769 					src_ip->i_ino,
2770 					&first_block, &free_list, spaceres);
2771 		if (error)
2772 			goto abort_return;
2773 
2774 		xfs_trans_ichgtime(tp, target_dp,
2775 					XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2776 
2777 		/*
2778 		 * Decrement the link count on the target since the target
2779 		 * dir no longer points to it.
2780 		 */
2781 		error = xfs_droplink(tp, target_ip);
2782 		if (error)
2783 			goto abort_return;
2784 
2785 		if (src_is_directory) {
2786 			/*
2787 			 * Drop the link from the old "." entry.
2788 			 */
2789 			error = xfs_droplink(tp, target_ip);
2790 			if (error)
2791 				goto abort_return;
2792 		}
2793 	} /* target_ip != NULL */
2794 
2795 	/*
2796 	 * Remove the source.
2797 	 */
2798 	if (new_parent && src_is_directory) {
2799 		/*
2800 		 * Rewrite the ".." entry to point to the new
2801 		 * directory.
2802 		 */
2803 		error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
2804 					target_dp->i_ino,
2805 					&first_block, &free_list, spaceres);
2806 		ASSERT(error != EEXIST);
2807 		if (error)
2808 			goto abort_return;
2809 	}
2810 
2811 	/*
2812 	 * We always want to hit the ctime on the source inode.
2813 	 *
2814 	 * This isn't strictly required by the standards since the source
2815 	 * inode isn't really being changed, but old unix file systems did
2816 	 * it and some incremental backup programs won't work without it.
2817 	 */
2818 	xfs_trans_ichgtime(tp, src_ip, XFS_ICHGTIME_CHG);
2819 	xfs_trans_log_inode(tp, src_ip, XFS_ILOG_CORE);
2820 
2821 	/*
2822 	 * Adjust the link count on src_dp.  This is necessary when
2823 	 * renaming a directory, either within one parent when
2824 	 * the target existed, or across two parent directories.
2825 	 */
2826 	if (src_is_directory && (new_parent || target_ip != NULL)) {
2827 
2828 		/*
2829 		 * Decrement link count on src_directory since the
2830 		 * entry that's moved no longer points to it.
2831 		 */
2832 		error = xfs_droplink(tp, src_dp);
2833 		if (error)
2834 			goto abort_return;
2835 	}
2836 
2837 	error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
2838 					&first_block, &free_list, spaceres);
2839 	if (error)
2840 		goto abort_return;
2841 
2842 	xfs_trans_ichgtime(tp, src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
2843 	xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
2844 	if (new_parent)
2845 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
2846 
2847 	/*
2848 	 * If this is a synchronous mount, make sure that the
2849 	 * rename transaction goes to disk before returning to
2850 	 * the user.
2851 	 */
2852 	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
2853 		xfs_trans_set_sync(tp);
2854 	}
2855 
2856 	error = xfs_bmap_finish(&tp, &free_list, &committed);
2857 	if (error) {
2858 		xfs_bmap_cancel(&free_list);
2859 		xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES |
2860 				 XFS_TRANS_ABORT));
2861 		goto std_return;
2862 	}
2863 
2864 	/*
2865 	 * trans_commit will unlock src_ip, target_ip & decrement
2866 	 * the vnode references.
2867 	 */
2868 	return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
2869 
2870  abort_return:
2871 	cancel_flags |= XFS_TRANS_ABORT;
2872  error_return:
2873 	xfs_bmap_cancel(&free_list);
2874 	xfs_trans_cancel(tp, cancel_flags);
2875  std_return:
2876 	return error;
2877 }
2878 
2879 STATIC int
2880 xfs_iflush_cluster(
2881 	xfs_inode_t	*ip,
2882 	xfs_buf_t	*bp)
2883 {
2884 	xfs_mount_t		*mp = ip->i_mount;
2885 	struct xfs_perag	*pag;
2886 	unsigned long		first_index, mask;
2887 	unsigned long		inodes_per_cluster;
2888 	int			ilist_size;
2889 	xfs_inode_t		**ilist;
2890 	xfs_inode_t		*iq;
2891 	int			nr_found;
2892 	int			clcount = 0;
2893 	int			bufwasdelwri;
2894 	int			i;
2895 
2896 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2897 
2898 	inodes_per_cluster = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog;
2899 	ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
2900 	ilist = kmem_alloc(ilist_size, KM_MAYFAIL|KM_NOFS);
2901 	if (!ilist)
2902 		goto out_put;
2903 
2904 	mask = ~(((mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog)) - 1);
2905 	first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2906 	rcu_read_lock();
2907 	/* really need a gang lookup range call here */
2908 	nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
2909 					first_index, inodes_per_cluster);
2910 	if (nr_found == 0)
2911 		goto out_free;
2912 
2913 	for (i = 0; i < nr_found; i++) {
2914 		iq = ilist[i];
2915 		if (iq == ip)
2916 			continue;
2917 
2918 		/*
2919 		 * because this is an RCU protected lookup, we could find a
2920 		 * recently freed or even reallocated inode during the lookup.
2921 		 * We need to check under the i_flags_lock for a valid inode
2922 		 * here. Skip it if it is not valid or the wrong inode.
2923 		 */
2924 		spin_lock(&iq->i_flags_lock);
2925 		if (!iq->i_ino ||
2926 		    (XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index) {
2927 			spin_unlock(&iq->i_flags_lock);
2928 			continue;
2929 		}
2930 		spin_unlock(&iq->i_flags_lock);
2931 
2932 		/*
2933 		 * Do an un-protected check to see if the inode is dirty and
2934 		 * is a candidate for flushing.  These checks will be repeated
2935 		 * later after the appropriate locks are acquired.
2936 		 */
2937 		if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
2938 			continue;
2939 
2940 		/*
2941 		 * Try to get locks.  If any are unavailable or it is pinned,
2942 		 * then this inode cannot be flushed and is skipped.
2943 		 */
2944 
2945 		if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
2946 			continue;
2947 		if (!xfs_iflock_nowait(iq)) {
2948 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
2949 			continue;
2950 		}
2951 		if (xfs_ipincount(iq)) {
2952 			xfs_ifunlock(iq);
2953 			xfs_iunlock(iq, XFS_ILOCK_SHARED);
2954 			continue;
2955 		}
2956 
2957 		/*
2958 		 * arriving here means that this inode can be flushed.  First
2959 		 * re-check that it's dirty before flushing.
2960 		 */
2961 		if (!xfs_inode_clean(iq)) {
2962 			int	error;
2963 			error = xfs_iflush_int(iq, bp);
2964 			if (error) {
2965 				xfs_iunlock(iq, XFS_ILOCK_SHARED);
2966 				goto cluster_corrupt_out;
2967 			}
2968 			clcount++;
2969 		} else {
2970 			xfs_ifunlock(iq);
2971 		}
2972 		xfs_iunlock(iq, XFS_ILOCK_SHARED);
2973 	}
2974 
2975 	if (clcount) {
2976 		XFS_STATS_INC(xs_icluster_flushcnt);
2977 		XFS_STATS_ADD(xs_icluster_flushinode, clcount);
2978 	}
2979 
2980 out_free:
2981 	rcu_read_unlock();
2982 	kmem_free(ilist);
2983 out_put:
2984 	xfs_perag_put(pag);
2985 	return 0;
2986 
2987 
2988 cluster_corrupt_out:
2989 	/*
2990 	 * Corruption detected in the clustering loop.  Invalidate the
2991 	 * inode buffer and shut down the filesystem.
2992 	 */
2993 	rcu_read_unlock();
2994 	/*
2995 	 * Clean up the buffer.  If it was delwri, just release it --
2996 	 * brelse can handle it with no problems.  If not, shut down the
2997 	 * filesystem before releasing the buffer.
2998 	 */
2999 	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
3000 	if (bufwasdelwri)
3001 		xfs_buf_relse(bp);
3002 
3003 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3004 
3005 	if (!bufwasdelwri) {
3006 		/*
3007 		 * Just like incore_relse: if we have b_iodone functions,
3008 		 * mark the buffer as an error and call them.  Otherwise
3009 		 * mark it as stale and brelse.
3010 		 */
3011 		if (bp->b_iodone) {
3012 			XFS_BUF_UNDONE(bp);
3013 			xfs_buf_stale(bp);
3014 			xfs_buf_ioerror(bp, EIO);
3015 			xfs_buf_ioend(bp, 0);
3016 		} else {
3017 			xfs_buf_stale(bp);
3018 			xfs_buf_relse(bp);
3019 		}
3020 	}
3021 
3022 	/*
3023 	 * Unlocks the flush lock
3024 	 */
3025 	xfs_iflush_abort(iq, false);
3026 	kmem_free(ilist);
3027 	xfs_perag_put(pag);
3028 	return XFS_ERROR(EFSCORRUPTED);
3029 }
3030 
3031 /*
3032  * Flush dirty inode metadata into the backing buffer.
3033  *
3034  * The caller must have the inode lock and the inode flush lock held.  The
3035  * inode lock will still be held upon return to the caller, and the inode
3036  * flush lock will be released after the inode has reached the disk.
3037  *
3038  * The caller must write out the buffer returned in *bpp and release it.
3039  */
3040 int
3041 xfs_iflush(
3042 	struct xfs_inode	*ip,
3043 	struct xfs_buf		**bpp)
3044 {
3045 	struct xfs_mount	*mp = ip->i_mount;
3046 	struct xfs_buf		*bp;
3047 	struct xfs_dinode	*dip;
3048 	int			error;
3049 
3050 	XFS_STATS_INC(xs_iflush_count);
3051 
3052 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3053 	ASSERT(xfs_isiflocked(ip));
3054 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3055 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3056 
3057 	*bpp = NULL;
3058 
3059 	xfs_iunpin_wait(ip);
3060 
3061 	/*
3062 	 * For stale inodes we cannot rely on the backing buffer remaining
3063 	 * stale in cache for the remaining life of the stale inode and so
3064 	 * xfs_imap_to_bp() below may give us a buffer that no longer
3065 	 * contains inodes. We have to check this after ensuring the inode is
3066 	 * unpinned so that it is safe to reclaim the stale inode after the
3067 	 * flush call.
3068 	 */
3069 	if (xfs_iflags_test(ip, XFS_ISTALE)) {
3070 		xfs_ifunlock(ip);
3071 		return 0;
3072 	}
3073 
3074 	/*
3075 	 * This may have been unpinned because the filesystem is shutting
3076 	 * down forcibly. If that's the case we must not write this inode
3077 	 * to disk, because the log record didn't make it to disk.
3078 	 *
3079 	 * We also have to remove the log item from the AIL in this case,
3080 	 * as we wait for an empty AIL as part of the unmount process.
3081 	 */
3082 	if (XFS_FORCED_SHUTDOWN(mp)) {
3083 		error = XFS_ERROR(EIO);
3084 		goto abort_out;
3085 	}
3086 
3087 	/*
3088 	 * Get the buffer containing the on-disk inode.
3089 	 */
3090 	error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK,
3091 			       0);
3092 	if (error || !bp) {
3093 		xfs_ifunlock(ip);
3094 		return error;
3095 	}
3096 
3097 	/*
3098 	 * First flush out the inode that xfs_iflush was called with.
3099 	 */
3100 	error = xfs_iflush_int(ip, bp);
3101 	if (error)
3102 		goto corrupt_out;
3103 
3104 	/*
3105 	 * If the buffer is pinned then push on the log now so we won't
3106 	 * get stuck waiting in the write for too long.
3107 	 */
3108 	if (xfs_buf_ispinned(bp))
3109 		xfs_log_force(mp, 0);
3110 
3111 	/*
3112 	 * inode clustering:
3113 	 * see if other inodes can be gathered into this write
3114 	 */
3115 	error = xfs_iflush_cluster(ip, bp);
3116 	if (error)
3117 		goto cluster_corrupt_out;
3118 
3119 	*bpp = bp;
3120 	return 0;
3121 
3122 corrupt_out:
3123 	xfs_buf_relse(bp);
3124 	xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3125 cluster_corrupt_out:
3126 	error = XFS_ERROR(EFSCORRUPTED);
3127 abort_out:
3128 	/*
3129 	 * Unlocks the flush lock
3130 	 */
3131 	xfs_iflush_abort(ip, false);
3132 	return error;
3133 }
3134 
3135 STATIC int
3136 xfs_iflush_int(
3137 	struct xfs_inode	*ip,
3138 	struct xfs_buf		*bp)
3139 {
3140 	struct xfs_inode_log_item *iip = ip->i_itemp;
3141 	struct xfs_dinode	*dip;
3142 	struct xfs_mount	*mp = ip->i_mount;
3143 
3144 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3145 	ASSERT(xfs_isiflocked(ip));
3146 	ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
3147 	       ip->i_d.di_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3148 	ASSERT(iip != NULL && iip->ili_fields != 0);
3149 
3150 	/* set *dip = inode's place in the buffer */
3151 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
3152 
3153 	if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC),
3154 			       mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
3155 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3156 			"%s: Bad inode %Lu magic number 0x%x, ptr 0x%p",
3157 			__func__, ip->i_ino, be16_to_cpu(dip->di_magic), dip);
3158 		goto corrupt_out;
3159 	}
3160 	if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
3161 				mp, XFS_ERRTAG_IFLUSH_2, XFS_RANDOM_IFLUSH_2)) {
3162 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3163 			"%s: Bad inode %Lu, ptr 0x%p, magic number 0x%x",
3164 			__func__, ip->i_ino, ip, ip->i_d.di_magic);
3165 		goto corrupt_out;
3166 	}
3167 	if (S_ISREG(ip->i_d.di_mode)) {
3168 		if (XFS_TEST_ERROR(
3169 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3170 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE),
3171 		    mp, XFS_ERRTAG_IFLUSH_3, XFS_RANDOM_IFLUSH_3)) {
3172 			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3173 				"%s: Bad regular inode %Lu, ptr 0x%p",
3174 				__func__, ip->i_ino, ip);
3175 			goto corrupt_out;
3176 		}
3177 	} else if (S_ISDIR(ip->i_d.di_mode)) {
3178 		if (XFS_TEST_ERROR(
3179 		    (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) &&
3180 		    (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) &&
3181 		    (ip->i_d.di_format != XFS_DINODE_FMT_LOCAL),
3182 		    mp, XFS_ERRTAG_IFLUSH_4, XFS_RANDOM_IFLUSH_4)) {
3183 			xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3184 				"%s: Bad directory inode %Lu, ptr 0x%p",
3185 				__func__, ip->i_ino, ip);
3186 			goto corrupt_out;
3187 		}
3188 	}
3189 	if (XFS_TEST_ERROR(ip->i_d.di_nextents + ip->i_d.di_anextents >
3190 				ip->i_d.di_nblocks, mp, XFS_ERRTAG_IFLUSH_5,
3191 				XFS_RANDOM_IFLUSH_5)) {
3192 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3193 			"%s: detected corrupt incore inode %Lu, "
3194 			"total extents = %d, nblocks = %Ld, ptr 0x%p",
3195 			__func__, ip->i_ino,
3196 			ip->i_d.di_nextents + ip->i_d.di_anextents,
3197 			ip->i_d.di_nblocks, ip);
3198 		goto corrupt_out;
3199 	}
3200 	if (XFS_TEST_ERROR(ip->i_d.di_forkoff > mp->m_sb.sb_inodesize,
3201 				mp, XFS_ERRTAG_IFLUSH_6, XFS_RANDOM_IFLUSH_6)) {
3202 		xfs_alert_tag(mp, XFS_PTAG_IFLUSH,
3203 			"%s: bad inode %Lu, forkoff 0x%x, ptr 0x%p",
3204 			__func__, ip->i_ino, ip->i_d.di_forkoff, ip);
3205 		goto corrupt_out;
3206 	}
3207 
3208 	/*
3209 	 * Inode item log recovery for v1/v2 inodes is dependent on the
3210 	 * di_flushiter count for correct sequencing. We bump the flush
3211 	 * iteration count so we can detect flushes which postdate a log record
3212 	 * during recovery. This is redundant as we now log every change and
3213 	 * hence this can't happen but we need to still do it to ensure
3214 	 * backwards compatibility with old kernels that predate logging all
3215 	 * inode changes.
3216 	 */
3217 	if (ip->i_d.di_version < 3)
3218 		ip->i_d.di_flushiter++;
3219 
3220 	/*
3221 	 * Copy the dirty parts of the inode into the on-disk
3222 	 * inode.  We always copy out the core of the inode,
3223 	 * because if the inode is dirty at all the core must
3224 	 * be.
3225 	 */
3226 	xfs_dinode_to_disk(dip, &ip->i_d);
3227 
3228 	/* Wrap, we never let the log put out DI_MAX_FLUSH */
3229 	if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3230 		ip->i_d.di_flushiter = 0;
3231 
3232 	/*
3233 	 * If this is really an old format inode and the superblock version
3234 	 * has not been updated to support only new format inodes, then
3235 	 * convert back to the old inode format.  If the superblock version
3236 	 * has been updated, then make the conversion permanent.
3237 	 */
3238 	ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
3239 	if (ip->i_d.di_version == 1) {
3240 		if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
3241 			/*
3242 			 * Convert it back.
3243 			 */
3244 			ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
3245 			dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
3246 		} else {
3247 			/*
3248 			 * The superblock version has already been bumped,
3249 			 * so just make the conversion to the new inode
3250 			 * format permanent.
3251 			 */
3252 			ip->i_d.di_version = 2;
3253 			dip->di_version = 2;
3254 			ip->i_d.di_onlink = 0;
3255 			dip->di_onlink = 0;
3256 			memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
3257 			memset(&(dip->di_pad[0]), 0,
3258 			      sizeof(dip->di_pad));
3259 			ASSERT(xfs_get_projid(ip) == 0);
3260 		}
3261 	}
3262 
3263 	xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
3264 	if (XFS_IFORK_Q(ip))
3265 		xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3266 	xfs_inobp_check(mp, bp);
3267 
3268 	/*
3269 	 * We've recorded everything logged in the inode, so we'd like to clear
3270 	 * the ili_fields bits so we don't log and flush things unnecessarily.
3271 	 * However, we can't stop logging all this information until the data
3272 	 * we've copied into the disk buffer is written to disk.  If we did we
3273 	 * might overwrite the copy of the inode in the log with all the data
3274 	 * after re-logging only part of it, and in the face of a crash we
3275 	 * wouldn't have all the data we need to recover.
3276 	 *
3277 	 * What we do is move the bits to the ili_last_fields field.  When
3278 	 * logging the inode, these bits are moved back to the ili_fields field.
3279 	 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3280 	 * know that the information those bits represent is permanently on
3281 	 * disk.  As long as the flush completes before the inode is logged
3282 	 * again, then both ili_fields and ili_last_fields will be cleared.
3283 	 *
3284 	 * We can play with the ili_fields bits here, because the inode lock
3285 	 * must be held exclusively in order to set bits there and the flush
3286 	 * lock protects the ili_last_fields bits.  Set ili_logged so the flush
3287 	 * done routine can tell whether or not to look in the AIL.  Also, store
3288 	 * the current LSN of the inode so that we can tell whether the item has
3289 	 * moved in the AIL from xfs_iflush_done().  In order to read the lsn we
3290 	 * need the AIL lock, because it is a 64 bit value that cannot be read
3291 	 * atomically.
3292 	 */
3293 	iip->ili_last_fields = iip->ili_fields;
3294 	iip->ili_fields = 0;
3295 	iip->ili_logged = 1;
3296 
3297 	xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3298 				&iip->ili_item.li_lsn);
3299 
3300 	/*
3301 	 * Attach the function xfs_iflush_done to the inode's
3302 	 * buffer.  This will remove the inode from the AIL
3303 	 * and unlock the inode's flush lock when the inode is
3304 	 * completely written to disk.
3305 	 */
3306 	xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3307 
3308 	/* update the lsn in the on disk inode if required */
3309 	if (ip->i_d.di_version == 3)
3310 		dip->di_lsn = cpu_to_be64(iip->ili_item.li_lsn);
3311 
3312 	/* generate the checksum. */
3313 	xfs_dinode_calc_crc(mp, dip);
3314 
3315 	ASSERT(bp->b_fspriv != NULL);
3316 	ASSERT(bp->b_iodone != NULL);
3317 	return 0;
3318 
3319 corrupt_out:
3320 	return XFS_ERROR(EFSCORRUPTED);
3321 }
3322
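
/*
 * Illustrative summary, not part of the original source: the dirty-state
 * handoff implemented at the end of xfs_iflush_int() is a three-step
 * protocol (the field names are real, the step labels are not):
 *
 *	flush:		ili_last_fields = ili_fields; ili_fields = 0;
 *	relog:		if the inode is logged again before the buffer I/O
 *			completes, the saved bits are moved back into
 *			ili_fields so nothing is lost on a crash;
 *	I/O done:	xfs_iflush_done() clears ili_last_fields, since the
 *			information those bits represent is now on disk.
 */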