xref: /openbmc/linux/fs/xfs/xfs_icache.c (revision 7fdff526)
10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0
26d8b79cfSDave Chinner /*
36d8b79cfSDave Chinner  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
46d8b79cfSDave Chinner  * All Rights Reserved.
56d8b79cfSDave Chinner  */
66d8b79cfSDave Chinner #include "xfs.h"
76d8b79cfSDave Chinner #include "xfs_fs.h"
85467b34bSDarrick J. Wong #include "xfs_shared.h"
96ca1c906SDave Chinner #include "xfs_format.h"
10239880efSDave Chinner #include "xfs_log_format.h"
11239880efSDave Chinner #include "xfs_trans_resv.h"
126d8b79cfSDave Chinner #include "xfs_sb.h"
136d8b79cfSDave Chinner #include "xfs_mount.h"
146d8b79cfSDave Chinner #include "xfs_inode.h"
15239880efSDave Chinner #include "xfs_trans.h"
16239880efSDave Chinner #include "xfs_trans_priv.h"
176d8b79cfSDave Chinner #include "xfs_inode_item.h"
186d8b79cfSDave Chinner #include "xfs_quota.h"
196d8b79cfSDave Chinner #include "xfs_trace.h"
206d8b79cfSDave Chinner #include "xfs_icache.h"
21c24b5dfaSDave Chinner #include "xfs_bmap_util.h"
22dc06f398SBrian Foster #include "xfs_dquot_item.h"
23dc06f398SBrian Foster #include "xfs_dquot.h"
2483104d44SDarrick J. Wong #include "xfs_reflink.h"
25bb8a66afSChristoph Hellwig #include "xfs_ialloc.h"
266d8b79cfSDave Chinner 
27f0e28280SJeff Layton #include <linux/iversion.h>
286d8b79cfSDave Chinner 
29c809d7e9SDarrick J. Wong /* Radix tree tags for incore inode tree. */
30c809d7e9SDarrick J. Wong 
31c809d7e9SDarrick J. Wong /* inode is to be reclaimed */
32c809d7e9SDarrick J. Wong #define XFS_ICI_RECLAIM_TAG	0
33c809d7e9SDarrick J. Wong /* Inode has speculative preallocations (posteof or cow) to clean. */
34c809d7e9SDarrick J. Wong #define XFS_ICI_BLOCKGC_TAG	1
35c809d7e9SDarrick J. Wong 
36c809d7e9SDarrick J. Wong /*
37c809d7e9SDarrick J. Wong  * The goal for walking incore inodes.  These can correspond with incore inode
38c809d7e9SDarrick J. Wong  * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
39c809d7e9SDarrick J. Wong  */
40c809d7e9SDarrick J. Wong enum xfs_icwalk_goal {
41c809d7e9SDarrick J. Wong 	/* Goals that are not related to tags; these must be < 0. */
42c809d7e9SDarrick J. Wong 	XFS_ICWALK_DQRELE	= -1,
43c809d7e9SDarrick J. Wong 
44c809d7e9SDarrick J. Wong 	/* Goals directly associated with tagged inodes. */
45c809d7e9SDarrick J. Wong 	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
46c809d7e9SDarrick J. Wong };
47c809d7e9SDarrick J. Wong 
48c809d7e9SDarrick J. Wong #define XFS_ICWALK_NULL_TAG	(-1U)
49c809d7e9SDarrick J. Wong 
50c809d7e9SDarrick J. Wong /* Compute the inode radix tree tag for this goal. */
51c809d7e9SDarrick J. Wong static inline unsigned int
52c809d7e9SDarrick J. Wong xfs_icwalk_tag(enum xfs_icwalk_goal goal)
53c809d7e9SDarrick J. Wong {
54c809d7e9SDarrick J. Wong 	return goal < 0 ? XFS_ICWALK_NULL_TAG : goal;
55c809d7e9SDarrick J. Wong }
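
/*
 * Note that XFS_ICWALK_DQRELE maps to XFS_ICWALK_NULL_TAG above: a dquot
 * release walk is not driven by a radix tree tag, so it has to visit every
 * inode cached in the AG rather than only the tagged ones.
 */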
56c809d7e9SDarrick J. Wong 
57*7fdff526SDarrick J. Wong static int xfs_icwalk(struct xfs_mount *mp,
58df600197SDarrick J. Wong 		int (*execute)(struct xfs_inode *ip, void *args),
59c809d7e9SDarrick J. Wong 		void *args, enum xfs_icwalk_goal goal);
60*7fdff526SDarrick J. Wong static int xfs_icwalk_ag(struct xfs_perag *pag,
61df600197SDarrick J. Wong 		int (*execute)(struct xfs_inode *ip, void *args),
62c809d7e9SDarrick J. Wong 		void *args, enum xfs_icwalk_goal goal);
63df600197SDarrick J. Wong 
6433479e05SDave Chinner /*
651ad2cfe0SDarrick J. Wong  * Private inode cache walk flags for struct xfs_eofblocks.  Must not coincide
661ad2cfe0SDarrick J. Wong  * with XFS_EOF_FLAGS_*.
671ad2cfe0SDarrick J. Wong  */
681ad2cfe0SDarrick J. Wong #define XFS_ICWALK_FLAG_DROP_UDQUOT	(1U << 31)
691ad2cfe0SDarrick J. Wong #define XFS_ICWALK_FLAG_DROP_GDQUOT	(1U << 30)
701ad2cfe0SDarrick J. Wong #define XFS_ICWALK_FLAG_DROP_PDQUOT	(1U << 29)
711ad2cfe0SDarrick J. Wong 
721ad2cfe0SDarrick J. Wong #define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_DROP_UDQUOT | \
731ad2cfe0SDarrick J. Wong 					 XFS_ICWALK_FLAG_DROP_GDQUOT | \
741ad2cfe0SDarrick J. Wong 					 XFS_ICWALK_FLAG_DROP_PDQUOT)
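
/*
 * The private walk flags deliberately occupy the highest bits of the flags
 * word so that they cannot collide with the public XFS_EOF_FLAGS_* values,
 * which are low-order bits.
 */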
751ad2cfe0SDarrick J. Wong 
761ad2cfe0SDarrick J. Wong /*
7733479e05SDave Chinner  * Allocate and initialise an xfs_inode.
7833479e05SDave Chinner  */
79638f4416SDave Chinner struct xfs_inode *
8033479e05SDave Chinner xfs_inode_alloc(
8133479e05SDave Chinner 	struct xfs_mount	*mp,
8233479e05SDave Chinner 	xfs_ino_t		ino)
8333479e05SDave Chinner {
8433479e05SDave Chinner 	struct xfs_inode	*ip;
8533479e05SDave Chinner 
8633479e05SDave Chinner 	/*
873050bd0bSCarlos Maiolino 	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
883050bd0bSCarlos Maiolino 	 * and return NULL here on ENOMEM.
8933479e05SDave Chinner 	 */
903050bd0bSCarlos Maiolino 	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);
913050bd0bSCarlos Maiolino 
9233479e05SDave Chinner 	if (inode_init_always(mp->m_super, VFS_I(ip))) {
93377bcd5fSCarlos Maiolino 		kmem_cache_free(xfs_inode_zone, ip);
9433479e05SDave Chinner 		return NULL;
9533479e05SDave Chinner 	}
9633479e05SDave Chinner 
97c19b3b05SDave Chinner 	/* VFS doesn't initialise i_mode! */
98c19b3b05SDave Chinner 	VFS_I(ip)->i_mode = 0;
99c19b3b05SDave Chinner 
100ff6d6af2SBill O'Donnell 	XFS_STATS_INC(mp, vn_active);
10133479e05SDave Chinner 	ASSERT(atomic_read(&ip->i_pincount) == 0);
10233479e05SDave Chinner 	ASSERT(ip->i_ino == 0);
10333479e05SDave Chinner 
10433479e05SDave Chinner 	/* initialise the xfs inode */
10533479e05SDave Chinner 	ip->i_ino = ino;
10633479e05SDave Chinner 	ip->i_mount = mp;
10733479e05SDave Chinner 	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
10833479e05SDave Chinner 	ip->i_afp = NULL;
1093993baebSDarrick J. Wong 	ip->i_cowfp = NULL;
1103ba738dfSChristoph Hellwig 	memset(&ip->i_df, 0, sizeof(ip->i_df));
11133479e05SDave Chinner 	ip->i_flags = 0;
11233479e05SDave Chinner 	ip->i_delayed_blks = 0;
1133e09ab8fSChristoph Hellwig 	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
1146e73a545SChristoph Hellwig 	ip->i_nblocks = 0;
1157821ea30SChristoph Hellwig 	ip->i_forkoff = 0;
1166772c1f1SDarrick J. Wong 	ip->i_sick = 0;
1176772c1f1SDarrick J. Wong 	ip->i_checked = 0;
118cb357bf3SDarrick J. Wong 	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
119cb357bf3SDarrick J. Wong 	INIT_LIST_HEAD(&ip->i_ioend_list);
120cb357bf3SDarrick J. Wong 	spin_lock_init(&ip->i_ioend_lock);
12133479e05SDave Chinner 
12233479e05SDave Chinner 	return ip;
12333479e05SDave Chinner }
12433479e05SDave Chinner 
12533479e05SDave Chinner STATIC void
12633479e05SDave Chinner xfs_inode_free_callback(
12733479e05SDave Chinner 	struct rcu_head		*head)
12833479e05SDave Chinner {
12933479e05SDave Chinner 	struct inode		*inode = container_of(head, struct inode, i_rcu);
13033479e05SDave Chinner 	struct xfs_inode	*ip = XFS_I(inode);
13133479e05SDave Chinner 
132c19b3b05SDave Chinner 	switch (VFS_I(ip)->i_mode & S_IFMT) {
13333479e05SDave Chinner 	case S_IFREG:
13433479e05SDave Chinner 	case S_IFDIR:
13533479e05SDave Chinner 	case S_IFLNK:
136ef838512SChristoph Hellwig 		xfs_idestroy_fork(&ip->i_df);
13733479e05SDave Chinner 		break;
13833479e05SDave Chinner 	}
13933479e05SDave Chinner 
140ef838512SChristoph Hellwig 	if (ip->i_afp) {
141ef838512SChristoph Hellwig 		xfs_idestroy_fork(ip->i_afp);
142ef838512SChristoph Hellwig 		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
143ef838512SChristoph Hellwig 	}
144ef838512SChristoph Hellwig 	if (ip->i_cowfp) {
145ef838512SChristoph Hellwig 		xfs_idestroy_fork(ip->i_cowfp);
146ef838512SChristoph Hellwig 		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
147ef838512SChristoph Hellwig 	}
14833479e05SDave Chinner 	if (ip->i_itemp) {
14922525c17SDave Chinner 		ASSERT(!test_bit(XFS_LI_IN_AIL,
15022525c17SDave Chinner 				 &ip->i_itemp->ili_item.li_flags));
15133479e05SDave Chinner 		xfs_inode_item_destroy(ip);
15233479e05SDave Chinner 		ip->i_itemp = NULL;
15333479e05SDave Chinner 	}
15433479e05SDave Chinner 
155377bcd5fSCarlos Maiolino 	kmem_cache_free(xfs_inode_zone, ip);
1561f2dcfe8SDave Chinner }
1571f2dcfe8SDave Chinner 
1588a17d7ddSDave Chinner static void
1598a17d7ddSDave Chinner __xfs_inode_free(
1608a17d7ddSDave Chinner 	struct xfs_inode	*ip)
1618a17d7ddSDave Chinner {
1628a17d7ddSDave Chinner 	/* asserts to verify all state is correct here */
1638a17d7ddSDave Chinner 	ASSERT(atomic_read(&ip->i_pincount) == 0);
16448d55e2aSDave Chinner 	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
1658a17d7ddSDave Chinner 	XFS_STATS_DEC(ip->i_mount, vn_active);
1668a17d7ddSDave Chinner 
1678a17d7ddSDave Chinner 	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
1688a17d7ddSDave Chinner }
1698a17d7ddSDave Chinner 
1701f2dcfe8SDave Chinner void
1711f2dcfe8SDave Chinner xfs_inode_free(
1721f2dcfe8SDave Chinner 	struct xfs_inode	*ip)
1731f2dcfe8SDave Chinner {
174718ecc50SDave Chinner 	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));
17598efe8afSBrian Foster 
17633479e05SDave Chinner 	/*
17733479e05SDave Chinner 	 * Because we use RCU freeing we need to ensure the inode always
17833479e05SDave Chinner 	 * appears to be reclaimed with an invalid inode number when in the
17933479e05SDave Chinner 	 * free state. The ip->i_flags_lock provides the barrier against lookup
18033479e05SDave Chinner 	 * races.
18133479e05SDave Chinner 	 */
18233479e05SDave Chinner 	spin_lock(&ip->i_flags_lock);
18333479e05SDave Chinner 	ip->i_flags = XFS_IRECLAIM;
18433479e05SDave Chinner 	ip->i_ino = 0;
18533479e05SDave Chinner 	spin_unlock(&ip->i_flags_lock);
18633479e05SDave Chinner 
1878a17d7ddSDave Chinner 	__xfs_inode_free(ip);
18833479e05SDave Chinner }
18933479e05SDave Chinner 
19033479e05SDave Chinner /*
19102511a5aSDave Chinner  * Queue background inode reclaim work if there are reclaimable inodes and there
19202511a5aSDave Chinner  * isn't reclaim work already scheduled or in progress.
193ad438c40SDave Chinner  */
194ad438c40SDave Chinner static void
195ad438c40SDave Chinner xfs_reclaim_work_queue(
196ad438c40SDave Chinner 	struct xfs_mount        *mp)
197ad438c40SDave Chinner {
198ad438c40SDave Chinner 
199ad438c40SDave Chinner 	rcu_read_lock();
200ad438c40SDave Chinner 	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
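		/*
		 * Schedule the next background pass one sixth of the sync
		 * interval from now: xfs_syncd_centisecs is in centiseconds,
		 * so multiplying by 10 converts it to milliseconds.
		 */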
201ad438c40SDave Chinner 		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
202ad438c40SDave Chinner 			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
203ad438c40SDave Chinner 	}
204ad438c40SDave Chinner 	rcu_read_unlock();
205ad438c40SDave Chinner }
206ad438c40SDave Chinner 
207ad438c40SDave Chinner static void
208ad438c40SDave Chinner xfs_perag_set_reclaim_tag(
209ad438c40SDave Chinner 	struct xfs_perag	*pag)
210ad438c40SDave Chinner {
211ad438c40SDave Chinner 	struct xfs_mount	*mp = pag->pag_mount;
212ad438c40SDave Chinner 
21395989c46SBrian Foster 	lockdep_assert_held(&pag->pag_ici_lock);
214ad438c40SDave Chinner 	if (pag->pag_ici_reclaimable++)
215ad438c40SDave Chinner 		return;
216ad438c40SDave Chinner 
217ad438c40SDave Chinner 	/* propagate the reclaim tag up into the perag radix tree */
218ad438c40SDave Chinner 	spin_lock(&mp->m_perag_lock);
219ad438c40SDave Chinner 	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
220ad438c40SDave Chinner 			   XFS_ICI_RECLAIM_TAG);
221ad438c40SDave Chinner 	spin_unlock(&mp->m_perag_lock);
222ad438c40SDave Chinner 
223ad438c40SDave Chinner 	/* schedule periodic background inode reclaim */
224ad438c40SDave Chinner 	xfs_reclaim_work_queue(mp);
225ad438c40SDave Chinner 
226ad438c40SDave Chinner 	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
227ad438c40SDave Chinner }
228ad438c40SDave Chinner 
229ad438c40SDave Chinner static void
230ad438c40SDave Chinner xfs_perag_clear_reclaim_tag(
231ad438c40SDave Chinner 	struct xfs_perag	*pag)
232ad438c40SDave Chinner {
233ad438c40SDave Chinner 	struct xfs_mount	*mp = pag->pag_mount;
234ad438c40SDave Chinner 
23595989c46SBrian Foster 	lockdep_assert_held(&pag->pag_ici_lock);
236ad438c40SDave Chinner 	if (--pag->pag_ici_reclaimable)
237ad438c40SDave Chinner 		return;
238ad438c40SDave Chinner 
239ad438c40SDave Chinner 	/* clear the reclaim tag from the perag radix tree */
240ad438c40SDave Chinner 	spin_lock(&mp->m_perag_lock);
241ad438c40SDave Chinner 	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
242ad438c40SDave Chinner 			     XFS_ICI_RECLAIM_TAG);
243ad438c40SDave Chinner 	spin_unlock(&mp->m_perag_lock);
244ad438c40SDave Chinner 	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
245ad438c40SDave Chinner }
246ad438c40SDave Chinner 
247ad438c40SDave Chinner 
248ad438c40SDave Chinner /*
249ad438c40SDave Chinner  * We set the inode flag atomically with the radix tree tag.
250ad438c40SDave Chinner  * Once we get tag lookups on the radix tree, this inode flag
251ad438c40SDave Chinner  * can go away.
252ad438c40SDave Chinner  */
253ad438c40SDave Chinner void
254ad438c40SDave Chinner xfs_inode_set_reclaim_tag(
255ad438c40SDave Chinner 	struct xfs_inode	*ip)
256ad438c40SDave Chinner {
257ad438c40SDave Chinner 	struct xfs_mount	*mp = ip->i_mount;
258ad438c40SDave Chinner 	struct xfs_perag	*pag;
259ad438c40SDave Chinner 
260ad438c40SDave Chinner 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
261ad438c40SDave Chinner 	spin_lock(&pag->pag_ici_lock);
262ad438c40SDave Chinner 	spin_lock(&ip->i_flags_lock);
263ad438c40SDave Chinner 
264ad438c40SDave Chinner 	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
265ad438c40SDave Chinner 			   XFS_ICI_RECLAIM_TAG);
266ad438c40SDave Chinner 	xfs_perag_set_reclaim_tag(pag);
267ad438c40SDave Chinner 	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
268ad438c40SDave Chinner 
269ad438c40SDave Chinner 	spin_unlock(&ip->i_flags_lock);
270ad438c40SDave Chinner 	spin_unlock(&pag->pag_ici_lock);
271ad438c40SDave Chinner 	xfs_perag_put(pag);
272ad438c40SDave Chinner }
273ad438c40SDave Chinner 
274ad438c40SDave Chinner STATIC void
275ad438c40SDave Chinner xfs_inode_clear_reclaim_tag(
276ad438c40SDave Chinner 	struct xfs_perag	*pag,
277ad438c40SDave Chinner 	xfs_ino_t		ino)
278ad438c40SDave Chinner {
279ad438c40SDave Chinner 	radix_tree_tag_clear(&pag->pag_ici_root,
280ad438c40SDave Chinner 			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
281ad438c40SDave Chinner 			     XFS_ICI_RECLAIM_TAG);
282ad438c40SDave Chinner 	xfs_perag_clear_reclaim_tag(pag);
283ad438c40SDave Chinner }
284ad438c40SDave Chinner 
285*7fdff526SDarrick J. Wong static inline void
286ae2c4ac2SBrian Foster xfs_inew_wait(
287ae2c4ac2SBrian Foster 	struct xfs_inode	*ip)
288ae2c4ac2SBrian Foster {
289ae2c4ac2SBrian Foster 	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
290ae2c4ac2SBrian Foster 	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);
291ae2c4ac2SBrian Foster 
292ae2c4ac2SBrian Foster 	do {
29321417136SIngo Molnar 		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
294ae2c4ac2SBrian Foster 		if (!xfs_iflags_test(ip, XFS_INEW))
295ae2c4ac2SBrian Foster 			break;
296ae2c4ac2SBrian Foster 		schedule();
297ae2c4ac2SBrian Foster 	} while (true);
29821417136SIngo Molnar 	finish_wait(wq, &wait.wq_entry);
299ae2c4ac2SBrian Foster }
300ae2c4ac2SBrian Foster 
301ad438c40SDave Chinner /*
30250997470SDave Chinner  * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
30350997470SDave Chinner  * part of the structure. This is made more complex by the fact we store
30450997470SDave Chinner  * information about the on-disk values in the VFS inode and so we can't just
30583e06f21SDave Chinner  * overwrite the values unconditionally. Hence we save the parameters we
30650997470SDave Chinner  * need to retain across reinitialisation, and rewrite them into the VFS inode
30783e06f21SDave Chinner  * after reinitialisation even if it fails.
30850997470SDave Chinner  */
30950997470SDave Chinner static int
31050997470SDave Chinner xfs_reinit_inode(
31150997470SDave Chinner 	struct xfs_mount	*mp,
31250997470SDave Chinner 	struct inode		*inode)
31350997470SDave Chinner {
31450997470SDave Chinner 	int		error;
31554d7b5c1SDave Chinner 	uint32_t	nlink = inode->i_nlink;
3169e9a2674SDave Chinner 	uint32_t	generation = inode->i_generation;
317f0e28280SJeff Layton 	uint64_t	version = inode_peek_iversion(inode);
318c19b3b05SDave Chinner 	umode_t		mode = inode->i_mode;
319acd1d715SAmir Goldstein 	dev_t		dev = inode->i_rdev;
3203d8f2821SChristoph Hellwig 	kuid_t		uid = inode->i_uid;
3213d8f2821SChristoph Hellwig 	kgid_t		gid = inode->i_gid;
32250997470SDave Chinner 
32350997470SDave Chinner 	error = inode_init_always(mp->m_super, inode);
32450997470SDave Chinner 
32554d7b5c1SDave Chinner 	set_nlink(inode, nlink);
3269e9a2674SDave Chinner 	inode->i_generation = generation;
327f0e28280SJeff Layton 	inode_set_iversion_queried(inode, version);
328c19b3b05SDave Chinner 	inode->i_mode = mode;
329acd1d715SAmir Goldstein 	inode->i_rdev = dev;
3303d8f2821SChristoph Hellwig 	inode->i_uid = uid;
3313d8f2821SChristoph Hellwig 	inode->i_gid = gid;
33250997470SDave Chinner 	return error;
33350997470SDave Chinner }
33450997470SDave Chinner 
33550997470SDave Chinner /*
336afca6c5bSDave Chinner  * If we are allocating a new inode, then check that what was returned is
337afca6c5bSDave Chinner  * actually a free, empty inode. If we are not allocating an inode, then
338afca6c5bSDave Chinner  * check that we didn't find a free inode.
339afca6c5bSDave Chinner  *
340afca6c5bSDave Chinner  * Returns:
341afca6c5bSDave Chinner  *	0		if the inode free state matches the lookup context
342afca6c5bSDave Chinner  *	-ENOENT		if the inode is free and we are not allocating
343afca6c5bSDave Chinner  *	-EFSCORRUPTED	if there is any state mismatch at all
344afca6c5bSDave Chinner  */
345afca6c5bSDave Chinner static int
346afca6c5bSDave Chinner xfs_iget_check_free_state(
347afca6c5bSDave Chinner 	struct xfs_inode	*ip,
348afca6c5bSDave Chinner 	int			flags)
349afca6c5bSDave Chinner {
350afca6c5bSDave Chinner 	if (flags & XFS_IGET_CREATE) {
351afca6c5bSDave Chinner 		/* should be a free inode */
352afca6c5bSDave Chinner 		if (VFS_I(ip)->i_mode != 0) {
353afca6c5bSDave Chinner 			xfs_warn(ip->i_mount,
354afca6c5bSDave Chinner "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
355afca6c5bSDave Chinner 				ip->i_ino, VFS_I(ip)->i_mode);
356afca6c5bSDave Chinner 			return -EFSCORRUPTED;
357afca6c5bSDave Chinner 		}
358afca6c5bSDave Chinner 
3596e73a545SChristoph Hellwig 		if (ip->i_nblocks != 0) {
360afca6c5bSDave Chinner 			xfs_warn(ip->i_mount,
361afca6c5bSDave Chinner "Corruption detected! Free inode 0x%llx has blocks allocated!",
362afca6c5bSDave Chinner 				ip->i_ino);
363afca6c5bSDave Chinner 			return -EFSCORRUPTED;
364afca6c5bSDave Chinner 		}
365afca6c5bSDave Chinner 		return 0;
366afca6c5bSDave Chinner 	}
367afca6c5bSDave Chinner 
368afca6c5bSDave Chinner 	/* should be an allocated inode */
369afca6c5bSDave Chinner 	if (VFS_I(ip)->i_mode == 0)
370afca6c5bSDave Chinner 		return -ENOENT;
371afca6c5bSDave Chinner 
372afca6c5bSDave Chinner 	return 0;
373afca6c5bSDave Chinner }
374afca6c5bSDave Chinner 
375afca6c5bSDave Chinner /*
37633479e05SDave Chinner  * Check the validity of the inode we just found in the cache
37733479e05SDave Chinner  */
37833479e05SDave Chinner static int
37933479e05SDave Chinner xfs_iget_cache_hit(
38033479e05SDave Chinner 	struct xfs_perag	*pag,
38133479e05SDave Chinner 	struct xfs_inode	*ip,
38233479e05SDave Chinner 	xfs_ino_t		ino,
38333479e05SDave Chinner 	int			flags,
38433479e05SDave Chinner 	int			lock_flags) __releases(RCU)
38533479e05SDave Chinner {
38633479e05SDave Chinner 	struct inode		*inode = VFS_I(ip);
38733479e05SDave Chinner 	struct xfs_mount	*mp = ip->i_mount;
38833479e05SDave Chinner 	int			error;
38933479e05SDave Chinner 
39033479e05SDave Chinner 	/*
39133479e05SDave Chinner 	 * check for re-use of an inode within an RCU grace period due to the
39233479e05SDave Chinner 	 * radix tree nodes not being updated yet. We monitor for this by
39333479e05SDave Chinner 	 * setting the inode number to zero before freeing the inode structure.
39433479e05SDave Chinner 	 * If the inode has been reallocated and set up, then the inode number
39533479e05SDave Chinner 	 * will not match, so check for that, too.
39633479e05SDave Chinner 	 */
39733479e05SDave Chinner 	spin_lock(&ip->i_flags_lock);
39833479e05SDave Chinner 	if (ip->i_ino != ino) {
39933479e05SDave Chinner 		trace_xfs_iget_skip(ip);
400ff6d6af2SBill O'Donnell 		XFS_STATS_INC(mp, xs_ig_frecycle);
4012451337dSDave Chinner 		error = -EAGAIN;
40233479e05SDave Chinner 		goto out_error;
40333479e05SDave Chinner 	}
40433479e05SDave Chinner 
40533479e05SDave Chinner 
40633479e05SDave Chinner 	/*
40733479e05SDave Chinner 	 * If we are racing with another cache hit that is currently
40833479e05SDave Chinner 	 * instantiating this inode or currently recycling it out of
40933479e05SDave Chinner 	 * reclaimable state, wait for the initialisation to complete
41033479e05SDave Chinner 	 * before continuing.
41133479e05SDave Chinner 	 *
41233479e05SDave Chinner 	 * XXX(hch): eventually we should do something equivalent to
41333479e05SDave Chinner 	 *	     wait_on_inode to wait for these flags to be cleared
41433479e05SDave Chinner 	 *	     instead of polling for it.
41533479e05SDave Chinner 	 */
41633479e05SDave Chinner 	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
41733479e05SDave Chinner 		trace_xfs_iget_skip(ip);
418ff6d6af2SBill O'Donnell 		XFS_STATS_INC(mp, xs_ig_frecycle);
4192451337dSDave Chinner 		error = -EAGAIN;
42033479e05SDave Chinner 		goto out_error;
42133479e05SDave Chinner 	}
42233479e05SDave Chinner 
42333479e05SDave Chinner 	/*
424afca6c5bSDave Chinner 	 * Check the inode free state is valid. This also detects lookup
425afca6c5bSDave Chinner 	 * racing with unlinks.
42633479e05SDave Chinner 	 */
427afca6c5bSDave Chinner 	error = xfs_iget_check_free_state(ip, flags);
428afca6c5bSDave Chinner 	if (error)
42933479e05SDave Chinner 		goto out_error;
43033479e05SDave Chinner 
43133479e05SDave Chinner 	/*
43233479e05SDave Chinner 	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
43333479e05SDave Chinner 	 * Need to carefully get it back into useable state.
43433479e05SDave Chinner 	 */
43533479e05SDave Chinner 	if (ip->i_flags & XFS_IRECLAIMABLE) {
43633479e05SDave Chinner 		trace_xfs_iget_reclaim(ip);
43733479e05SDave Chinner 
438378f681cSDarrick J. Wong 		if (flags & XFS_IGET_INCORE) {
439378f681cSDarrick J. Wong 			error = -EAGAIN;
440378f681cSDarrick J. Wong 			goto out_error;
441378f681cSDarrick J. Wong 		}
442378f681cSDarrick J. Wong 
44333479e05SDave Chinner 		/*
44433479e05SDave Chinner 		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
44533479e05SDave Chinner 		 * from stomping over us while we recycle the inode.  We can't
44633479e05SDave Chinner 		 * clear the radix tree reclaimable tag yet as it requires
44733479e05SDave Chinner 		 * pag_ici_lock to be held exclusive.
44833479e05SDave Chinner 		 */
44933479e05SDave Chinner 		ip->i_flags |= XFS_IRECLAIM;
45033479e05SDave Chinner 
45133479e05SDave Chinner 		spin_unlock(&ip->i_flags_lock);
45233479e05SDave Chinner 		rcu_read_unlock();
45333479e05SDave Chinner 
454d45344d6SIra Weiny 		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
45550997470SDave Chinner 		error = xfs_reinit_inode(mp, inode);
45633479e05SDave Chinner 		if (error) {
457756baca2SBrian Foster 			bool wake;
45833479e05SDave Chinner 			/*
45933479e05SDave Chinner 			 * Re-initializing the inode failed, and we are in deep
46033479e05SDave Chinner 			 * trouble.  Try to re-add it to the reclaim list.
46133479e05SDave Chinner 			 */
46233479e05SDave Chinner 			rcu_read_lock();
46333479e05SDave Chinner 			spin_lock(&ip->i_flags_lock);
464756baca2SBrian Foster 			wake = !!__xfs_iflags_test(ip, XFS_INEW);
46533479e05SDave Chinner 			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
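			/*
			 * Anyone stuck in xfs_inew_wait() is polling on
			 * __XFS_INEW_BIT, so wake them now that XFS_INEW has
			 * been cleared.
			 */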
466756baca2SBrian Foster 			if (wake)
467756baca2SBrian Foster 				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
46833479e05SDave Chinner 			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
46933479e05SDave Chinner 			trace_xfs_iget_reclaim_fail(ip);
47033479e05SDave Chinner 			goto out_error;
47133479e05SDave Chinner 		}
47233479e05SDave Chinner 
47333479e05SDave Chinner 		spin_lock(&pag->pag_ici_lock);
47433479e05SDave Chinner 		spin_lock(&ip->i_flags_lock);
47533479e05SDave Chinner 
47633479e05SDave Chinner 		/*
47733479e05SDave Chinner 		 * Clear the per-lifetime state in the inode as we are now
47833479e05SDave Chinner 		 * effectively a new inode and need to return to the initial
47933479e05SDave Chinner 		 * state before reuse occurs.
48033479e05SDave Chinner 		 */
48133479e05SDave Chinner 		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
48233479e05SDave Chinner 		ip->i_flags |= XFS_INEW;
483545c0889SDave Chinner 		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
48433479e05SDave Chinner 		inode->i_state = I_NEW;
4856772c1f1SDarrick J. Wong 		ip->i_sick = 0;
4866772c1f1SDarrick J. Wong 		ip->i_checked = 0;
48733479e05SDave Chinner 
48833479e05SDave Chinner 		spin_unlock(&ip->i_flags_lock);
48933479e05SDave Chinner 		spin_unlock(&pag->pag_ici_lock);
49033479e05SDave Chinner 	} else {
49133479e05SDave Chinner 		/* If the VFS inode is being torn down, pause and try again. */
49233479e05SDave Chinner 		if (!igrab(inode)) {
49333479e05SDave Chinner 			trace_xfs_iget_skip(ip);
4942451337dSDave Chinner 			error = -EAGAIN;
49533479e05SDave Chinner 			goto out_error;
49633479e05SDave Chinner 		}
49733479e05SDave Chinner 
49833479e05SDave Chinner 		/* We've got a live one. */
49933479e05SDave Chinner 		spin_unlock(&ip->i_flags_lock);
50033479e05SDave Chinner 		rcu_read_unlock();
50133479e05SDave Chinner 		trace_xfs_iget_hit(ip);
50233479e05SDave Chinner 	}
50333479e05SDave Chinner 
50433479e05SDave Chinner 	if (lock_flags != 0)
50533479e05SDave Chinner 		xfs_ilock(ip, lock_flags);
50633479e05SDave Chinner 
507378f681cSDarrick J. Wong 	if (!(flags & XFS_IGET_INCORE))
508dae2f8edSIra Weiny 		xfs_iflags_clear(ip, XFS_ISTALE);
509ff6d6af2SBill O'Donnell 	XFS_STATS_INC(mp, xs_ig_found);
51033479e05SDave Chinner 
51133479e05SDave Chinner 	return 0;
51233479e05SDave Chinner 
51333479e05SDave Chinner out_error:
51433479e05SDave Chinner 	spin_unlock(&ip->i_flags_lock);
51533479e05SDave Chinner 	rcu_read_unlock();
51633479e05SDave Chinner 	return error;
51733479e05SDave Chinner }
51833479e05SDave Chinner 
51933479e05SDave Chinner 
52033479e05SDave Chinner static int
52133479e05SDave Chinner xfs_iget_cache_miss(
52233479e05SDave Chinner 	struct xfs_mount	*mp,
52333479e05SDave Chinner 	struct xfs_perag	*pag,
52433479e05SDave Chinner 	xfs_trans_t		*tp,
52533479e05SDave Chinner 	xfs_ino_t		ino,
52633479e05SDave Chinner 	struct xfs_inode	**ipp,
52733479e05SDave Chinner 	int			flags,
52833479e05SDave Chinner 	int			lock_flags)
52933479e05SDave Chinner {
53033479e05SDave Chinner 	struct xfs_inode	*ip;
53133479e05SDave Chinner 	int			error;
53233479e05SDave Chinner 	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
53333479e05SDave Chinner 	int			iflags;
53433479e05SDave Chinner 
53533479e05SDave Chinner 	ip = xfs_inode_alloc(mp, ino);
53633479e05SDave Chinner 	if (!ip)
5372451337dSDave Chinner 		return -ENOMEM;
53833479e05SDave Chinner 
539bb8a66afSChristoph Hellwig 	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
54033479e05SDave Chinner 	if (error)
54133479e05SDave Chinner 		goto out_destroy;
54233479e05SDave Chinner 
543bb8a66afSChristoph Hellwig 	/*
544bb8a66afSChristoph Hellwig 	 * For version 5 superblocks, if we are initialising a new inode and we
545bb8a66afSChristoph Hellwig 	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
546bb8a66afSChristoph Hellwig 	 * simply build the new inode core with a random generation number.
547bb8a66afSChristoph Hellwig 	 *
548bb8a66afSChristoph Hellwig 	 * For version 4 (and older) superblocks, log recovery is dependent on
549965e0a1aSChristoph Hellwig 	 * the i_flushiter field being initialised from the current on-disk
550bb8a66afSChristoph Hellwig 	 * value and hence we must also read the inode off disk even when
551bb8a66afSChristoph Hellwig 	 * initializing new inodes.
552bb8a66afSChristoph Hellwig 	 */
553bb8a66afSChristoph Hellwig 	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
554bb8a66afSChristoph Hellwig 	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
555bb8a66afSChristoph Hellwig 		VFS_I(ip)->i_generation = prandom_u32();
556bb8a66afSChristoph Hellwig 	} else {
557bb8a66afSChristoph Hellwig 		struct xfs_buf		*bp;
558bb8a66afSChristoph Hellwig 
559af9dcddeSChristoph Hellwig 		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
560bb8a66afSChristoph Hellwig 		if (error)
561bb8a66afSChristoph Hellwig 			goto out_destroy;
562bb8a66afSChristoph Hellwig 
563af9dcddeSChristoph Hellwig 		error = xfs_inode_from_disk(ip,
564af9dcddeSChristoph Hellwig 				xfs_buf_offset(bp, ip->i_imap.im_boffset));
565bb8a66afSChristoph Hellwig 		if (!error)
566bb8a66afSChristoph Hellwig 			xfs_buf_set_ref(bp, XFS_INO_REF);
567bb8a66afSChristoph Hellwig 		xfs_trans_brelse(tp, bp);
568bb8a66afSChristoph Hellwig 
569bb8a66afSChristoph Hellwig 		if (error)
570bb8a66afSChristoph Hellwig 			goto out_destroy;
571bb8a66afSChristoph Hellwig 	}
572bb8a66afSChristoph Hellwig 
57333479e05SDave Chinner 	trace_xfs_iget_miss(ip);
57433479e05SDave Chinner 
575ee457001SDave Chinner 	/*
576afca6c5bSDave Chinner 	 * Check the inode free state is valid. This also detects lookup
577afca6c5bSDave Chinner 	 * racing with unlinks.
578ee457001SDave Chinner 	 */
579afca6c5bSDave Chinner 	error = xfs_iget_check_free_state(ip, flags);
580afca6c5bSDave Chinner 	if (error)
581ee457001SDave Chinner 		goto out_destroy;
58233479e05SDave Chinner 
58333479e05SDave Chinner 	/*
58433479e05SDave Chinner 	 * Preload the radix tree so we can insert safely under the
58533479e05SDave Chinner 	 * write spinlock. Note that we cannot sleep inside the preload
58633479e05SDave Chinner 	 * region. Since we can be called from transaction context, don't
58733479e05SDave Chinner 	 * recurse into the file system.
58833479e05SDave Chinner 	 */
58933479e05SDave Chinner 	if (radix_tree_preload(GFP_NOFS)) {
5902451337dSDave Chinner 		error = -EAGAIN;
59133479e05SDave Chinner 		goto out_destroy;
59233479e05SDave Chinner 	}
59333479e05SDave Chinner 
59433479e05SDave Chinner 	/*
59533479e05SDave Chinner 	 * Because the inode hasn't been added to the radix-tree yet it can't
59633479e05SDave Chinner 	 * be found by another thread, so we can do the non-sleeping lock here.
59733479e05SDave Chinner 	 */
59833479e05SDave Chinner 	if (lock_flags) {
59933479e05SDave Chinner 		if (!xfs_ilock_nowait(ip, lock_flags))
60033479e05SDave Chinner 			BUG();
60133479e05SDave Chinner 	}
60233479e05SDave Chinner 
60333479e05SDave Chinner 	/*
60433479e05SDave Chinner 	 * These values must be set before inserting the inode into the radix
60533479e05SDave Chinner 	 * tree as the moment it is inserted a concurrent lookup (allowed by the
60633479e05SDave Chinner 	 * RCU locking mechanism) can find it and that lookup must see that this
60733479e05SDave Chinner 	 * is an inode currently under construction (i.e. that XFS_INEW is set).
60833479e05SDave Chinner 	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
60933479e05SDave Chinner 	 * memory barrier that ensures this detection works correctly at lookup
61033479e05SDave Chinner 	 * time.
61133479e05SDave Chinner 	 */
61233479e05SDave Chinner 	iflags = XFS_INEW;
61333479e05SDave Chinner 	if (flags & XFS_IGET_DONTCACHE)
6142c567af4SIra Weiny 		d_mark_dontcache(VFS_I(ip));
615113a5683SChandra Seetharaman 	ip->i_udquot = NULL;
616113a5683SChandra Seetharaman 	ip->i_gdquot = NULL;
61792f8ff73SChandra Seetharaman 	ip->i_pdquot = NULL;
61833479e05SDave Chinner 	xfs_iflags_set(ip, iflags);
61933479e05SDave Chinner 
62033479e05SDave Chinner 	/* insert the new inode */
62133479e05SDave Chinner 	spin_lock(&pag->pag_ici_lock);
62233479e05SDave Chinner 	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
62333479e05SDave Chinner 	if (unlikely(error)) {
62433479e05SDave Chinner 		WARN_ON(error != -EEXIST);
625ff6d6af2SBill O'Donnell 		XFS_STATS_INC(mp, xs_ig_dup);
6262451337dSDave Chinner 		error = -EAGAIN;
62733479e05SDave Chinner 		goto out_preload_end;
62833479e05SDave Chinner 	}
62933479e05SDave Chinner 	spin_unlock(&pag->pag_ici_lock);
63033479e05SDave Chinner 	radix_tree_preload_end();
63133479e05SDave Chinner 
63233479e05SDave Chinner 	*ipp = ip;
63333479e05SDave Chinner 	return 0;
63433479e05SDave Chinner 
63533479e05SDave Chinner out_preload_end:
63633479e05SDave Chinner 	spin_unlock(&pag->pag_ici_lock);
63733479e05SDave Chinner 	radix_tree_preload_end();
63833479e05SDave Chinner 	if (lock_flags)
63933479e05SDave Chinner 		xfs_iunlock(ip, lock_flags);
64033479e05SDave Chinner out_destroy:
64133479e05SDave Chinner 	__destroy_inode(VFS_I(ip));
64233479e05SDave Chinner 	xfs_inode_free(ip);
64333479e05SDave Chinner 	return error;
64433479e05SDave Chinner }
64533479e05SDave Chinner 
64633479e05SDave Chinner /*
64702511a5aSDave Chinner  * Look up an inode by number in the given file system.  The inode is looked up
64802511a5aSDave Chinner  * in the cache held in each AG.  If the inode is found in the cache, initialise
64902511a5aSDave Chinner  * the vfs inode if necessary.
65033479e05SDave Chinner  *
65102511a5aSDave Chinner  * If it is not in core, read it in from the file system's device, add it to the
65202511a5aSDave Chinner  * cache and initialise the vfs inode.
65333479e05SDave Chinner  *
65433479e05SDave Chinner  * The inode is locked according to the value of the lock_flags parameter.
65502511a5aSDave Chinner  * Inode lookup is only done during metadata operations and not as part of the
65602511a5aSDave Chinner  * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
65733479e05SDave Chinner  */
65833479e05SDave Chinner int
65933479e05SDave Chinner xfs_iget(
66002511a5aSDave Chinner 	struct xfs_mount	*mp,
66102511a5aSDave Chinner 	struct xfs_trans	*tp,
66233479e05SDave Chinner 	xfs_ino_t		ino,
66333479e05SDave Chinner 	uint			flags,
66433479e05SDave Chinner 	uint			lock_flags,
66502511a5aSDave Chinner 	struct xfs_inode	**ipp)
66633479e05SDave Chinner {
66702511a5aSDave Chinner 	struct xfs_inode	*ip;
66802511a5aSDave Chinner 	struct xfs_perag	*pag;
66933479e05SDave Chinner 	xfs_agino_t		agino;
67002511a5aSDave Chinner 	int			error;
67133479e05SDave Chinner 
67233479e05SDave Chinner 	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
67333479e05SDave Chinner 
67433479e05SDave Chinner 	/* reject inode numbers outside existing AGs */
67533479e05SDave Chinner 	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
6762451337dSDave Chinner 		return -EINVAL;
67733479e05SDave Chinner 
678ff6d6af2SBill O'Donnell 	XFS_STATS_INC(mp, xs_ig_attempts);
6798774cf8bSLucas Stach 
68033479e05SDave Chinner 	/* get the perag structure and ensure that it's inode capable */
68133479e05SDave Chinner 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
68233479e05SDave Chinner 	agino = XFS_INO_TO_AGINO(mp, ino);
68333479e05SDave Chinner 
68433479e05SDave Chinner again:
68533479e05SDave Chinner 	error = 0;
68633479e05SDave Chinner 	rcu_read_lock();
68733479e05SDave Chinner 	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
68833479e05SDave Chinner 
68933479e05SDave Chinner 	if (ip) {
69033479e05SDave Chinner 		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
69133479e05SDave Chinner 		if (error)
69233479e05SDave Chinner 			goto out_error_or_again;
69333479e05SDave Chinner 	} else {
69433479e05SDave Chinner 		rcu_read_unlock();
695378f681cSDarrick J. Wong 		if (flags & XFS_IGET_INCORE) {
696ed438b47SDarrick J. Wong 			error = -ENODATA;
697378f681cSDarrick J. Wong 			goto out_error_or_again;
698378f681cSDarrick J. Wong 		}
699ff6d6af2SBill O'Donnell 		XFS_STATS_INC(mp, xs_ig_missed);
70033479e05SDave Chinner 
70133479e05SDave Chinner 		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
70233479e05SDave Chinner 							flags, lock_flags);
70333479e05SDave Chinner 		if (error)
70433479e05SDave Chinner 			goto out_error_or_again;
70533479e05SDave Chinner 	}
70633479e05SDave Chinner 	xfs_perag_put(pag);
70733479e05SDave Chinner 
70833479e05SDave Chinner 	*ipp = ip;
70933479e05SDave Chinner 
71033479e05SDave Chinner 	/*
71158c90473SDave Chinner 	 * If we have a real type for an on-disk inode, we can setup the inode
71233479e05SDave Chinner 	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
71333479e05SDave Chinner 	 */
714c19b3b05SDave Chinner 	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
71558c90473SDave Chinner 		xfs_setup_existing_inode(ip);
71633479e05SDave Chinner 	return 0;
71733479e05SDave Chinner 
71833479e05SDave Chinner out_error_or_again:
719378f681cSDarrick J. Wong 	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
72033479e05SDave Chinner 		delay(1);
72133479e05SDave Chinner 		goto again;
72233479e05SDave Chinner 	}
72333479e05SDave Chinner 	xfs_perag_put(pag);
72433479e05SDave Chinner 	return error;
72533479e05SDave Chinner }
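
/*
 * Illustrative caller sketch (not from this file): a typical metadata
 * operation looks up and locks the inode, does its work, then unlocks and
 * releases it.  Error handling is elided and the transaction pointer may be
 * NULL outside of transaction context.
 *
 *	struct xfs_inode	*ip;
 *	int			error;
 *
 *	error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	... operate on ip with the ILOCK held ...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);
 */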
72633479e05SDave Chinner 
7276d8b79cfSDave Chinner /*
728378f681cSDarrick J. Wong  * "Is this a cached inode that's also allocated?"
729378f681cSDarrick J. Wong  *
730378f681cSDarrick J. Wong  * Look up an inode by number in the given file system.  If the inode is
731378f681cSDarrick J. Wong  * in cache and isn't in purgatory, return 1 if the inode is allocated
732378f681cSDarrick J. Wong  * and 0 if it is not.  For all other cases (not in cache, being torn
733378f681cSDarrick J. Wong  * down, etc.), return a negative error code.
734378f681cSDarrick J. Wong  *
735378f681cSDarrick J. Wong  * The caller has to prevent inode allocation and freeing activity,
736378f681cSDarrick J. Wong  * presumably by locking the AGI buffer.   This is to ensure that an
737378f681cSDarrick J. Wong  * inode cannot transition from allocated to freed until the caller is
738378f681cSDarrick J. Wong  * ready to allow that.  If the inode is in an intermediate state (new,
739378f681cSDarrick J. Wong  * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
740378f681cSDarrick J. Wong  * inode is not in the cache, -ENOENT will be returned.  The caller must
741378f681cSDarrick J. Wong  * deal with these scenarios appropriately.
742378f681cSDarrick J. Wong  *
743378f681cSDarrick J. Wong  * This is a specialized use case for the online scrubber; if you're
744378f681cSDarrick J. Wong  * reading this, you probably want xfs_iget.
745378f681cSDarrick J. Wong  */
746378f681cSDarrick J. Wong int
747378f681cSDarrick J. Wong xfs_icache_inode_is_allocated(
748378f681cSDarrick J. Wong 	struct xfs_mount	*mp,
749378f681cSDarrick J. Wong 	struct xfs_trans	*tp,
750378f681cSDarrick J. Wong 	xfs_ino_t		ino,
751378f681cSDarrick J. Wong 	bool			*inuse)
752378f681cSDarrick J. Wong {
753378f681cSDarrick J. Wong 	struct xfs_inode	*ip;
754378f681cSDarrick J. Wong 	int			error;
755378f681cSDarrick J. Wong 
756378f681cSDarrick J. Wong 	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
757378f681cSDarrick J. Wong 	if (error)
758378f681cSDarrick J. Wong 		return error;
759378f681cSDarrick J. Wong 
760378f681cSDarrick J. Wong 	*inuse = !!(VFS_I(ip)->i_mode);
76144a8736bSDarrick J. Wong 	xfs_irele(ip);
762378f681cSDarrick J. Wong 	return 0;
763378f681cSDarrick J. Wong }
764378f681cSDarrick J. Wong 
765378f681cSDarrick J. Wong /*
7666d8b79cfSDave Chinner  * The inode lookup is done in batches to keep the amount of lock traffic and
7676d8b79cfSDave Chinner  * radix tree lookups to a minimum. The batch size is a trade off between
7686d8b79cfSDave Chinner  * lookup reduction and stack usage. This is in the reclaim path, so we can't
7696d8b79cfSDave Chinner  * be too greedy.
770df600197SDarrick J. Wong  *
771c1115c0cSDarrick J. Wong  * XXX: This will be moved closer to xfs_icwalk* once we get rid of the
772df600197SDarrick J. Wong  * separate reclaim walk functions.
7736d8b79cfSDave Chinner  */
7746d8b79cfSDave Chinner #define XFS_LOOKUP_BATCH	32
7756d8b79cfSDave Chinner 
7761ad2cfe0SDarrick J. Wong #ifdef CONFIG_XFS_QUOTA
777b9baaef4SDarrick J. Wong /* Decide if we want to grab this inode to drop its dquots. */
778b9baaef4SDarrick J. Wong static bool
779b9baaef4SDarrick J. Wong xfs_dqrele_igrab(
780b9baaef4SDarrick J. Wong 	struct xfs_inode	*ip)
781b9baaef4SDarrick J. Wong {
782b9baaef4SDarrick J. Wong 	bool			ret = false;
783b9baaef4SDarrick J. Wong 
784b9baaef4SDarrick J. Wong 	ASSERT(rcu_read_lock_held());
785b9baaef4SDarrick J. Wong 
786b9baaef4SDarrick J. Wong 	/* Check for stale RCU freed inode */
787b9baaef4SDarrick J. Wong 	spin_lock(&ip->i_flags_lock);
788b9baaef4SDarrick J. Wong 	if (!ip->i_ino)
789b9baaef4SDarrick J. Wong 		goto out_unlock;
790b9baaef4SDarrick J. Wong 
791b9baaef4SDarrick J. Wong 	/*
792b9baaef4SDarrick J. Wong 	 * Skip inodes that are anywhere in the reclaim machinery because we
793b9baaef4SDarrick J. Wong 	 * drop dquots before tagging an inode for reclamation.
794b9baaef4SDarrick J. Wong 	 */
795b9baaef4SDarrick J. Wong 	if (ip->i_flags & (XFS_IRECLAIM | XFS_IRECLAIMABLE))
796b9baaef4SDarrick J. Wong 		goto out_unlock;
797b9baaef4SDarrick J. Wong 
798b9baaef4SDarrick J. Wong 	/*
799b9baaef4SDarrick J. Wong 	 * The inode looks alive; try to grab a VFS reference so that it won't
800b9baaef4SDarrick J. Wong 	 * get destroyed.  If we got the reference, return true to say that
801b9baaef4SDarrick J. Wong 	 * we grabbed the inode.
802b9baaef4SDarrick J. Wong 	 *
803b9baaef4SDarrick J. Wong 	 * If we can't get the reference, then we know the inode had its VFS
804b9baaef4SDarrick J. Wong 	 * state torn down and hasn't yet entered the reclaim machinery.  Since
805b9baaef4SDarrick J. Wong 	 * we also know that dquots are detached from an inode before it enters
806b9baaef4SDarrick J. Wong 	 * reclaim, we can skip the inode.
807b9baaef4SDarrick J. Wong 	 */
808b9baaef4SDarrick J. Wong 	ret = igrab(VFS_I(ip)) != NULL;
809b9baaef4SDarrick J. Wong 
810b9baaef4SDarrick J. Wong out_unlock:
811b9baaef4SDarrick J. Wong 	spin_unlock(&ip->i_flags_lock);
812b9baaef4SDarrick J. Wong 	return ret;
813b9baaef4SDarrick J. Wong }
814b9baaef4SDarrick J. Wong 
8151ad2cfe0SDarrick J. Wong /* Drop this inode's dquots. */
8161ad2cfe0SDarrick J. Wong static int
8171ad2cfe0SDarrick J. Wong xfs_dqrele_inode(
8181ad2cfe0SDarrick J. Wong 	struct xfs_inode	*ip,
8191ad2cfe0SDarrick J. Wong 	void			*priv)
8201ad2cfe0SDarrick J. Wong {
8211ad2cfe0SDarrick J. Wong 	struct xfs_eofblocks	*eofb = priv;
8221ad2cfe0SDarrick J. Wong 
8239d2793ceSDarrick J. Wong 	if (xfs_iflags_test(ip, XFS_INEW))
8249d2793ceSDarrick J. Wong 		xfs_inew_wait(ip);
8259d2793ceSDarrick J. Wong 
8261ad2cfe0SDarrick J. Wong 	xfs_ilock(ip, XFS_ILOCK_EXCL);
8271ad2cfe0SDarrick J. Wong 	if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_UDQUOT) {
8281ad2cfe0SDarrick J. Wong 		xfs_qm_dqrele(ip->i_udquot);
8291ad2cfe0SDarrick J. Wong 		ip->i_udquot = NULL;
8301ad2cfe0SDarrick J. Wong 	}
8311ad2cfe0SDarrick J. Wong 	if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_GDQUOT) {
8321ad2cfe0SDarrick J. Wong 		xfs_qm_dqrele(ip->i_gdquot);
8331ad2cfe0SDarrick J. Wong 		ip->i_gdquot = NULL;
8341ad2cfe0SDarrick J. Wong 	}
8351ad2cfe0SDarrick J. Wong 	if (eofb->eof_flags & XFS_ICWALK_FLAG_DROP_PDQUOT) {
8361ad2cfe0SDarrick J. Wong 		xfs_qm_dqrele(ip->i_pdquot);
8371ad2cfe0SDarrick J. Wong 		ip->i_pdquot = NULL;
8381ad2cfe0SDarrick J. Wong 	}
8391ad2cfe0SDarrick J. Wong 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
8401ad2cfe0SDarrick J. Wong 	return 0;
8411ad2cfe0SDarrick J. Wong }
8421ad2cfe0SDarrick J. Wong 
8431ad2cfe0SDarrick J. Wong /*
8441ad2cfe0SDarrick J. Wong  * Detach all dquots from incore inodes if we can.  The caller must already
8451ad2cfe0SDarrick J. Wong  * have dropped the relevant XFS_[UGP]QUOTA_ACTIVE flags so that dquots will
8461ad2cfe0SDarrick J. Wong  * not get reattached.
8471ad2cfe0SDarrick J. Wong  */
8481ad2cfe0SDarrick J. Wong int
8491ad2cfe0SDarrick J. Wong xfs_dqrele_all_inodes(
8501ad2cfe0SDarrick J. Wong 	struct xfs_mount	*mp,
8511ad2cfe0SDarrick J. Wong 	unsigned int		qflags)
8521ad2cfe0SDarrick J. Wong {
8531ad2cfe0SDarrick J. Wong 	struct xfs_eofblocks	eofb = { .eof_flags = 0 };
8541ad2cfe0SDarrick J. Wong 
8551ad2cfe0SDarrick J. Wong 	if (qflags & XFS_UQUOTA_ACCT)
8561ad2cfe0SDarrick J. Wong 		eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_UDQUOT;
8571ad2cfe0SDarrick J. Wong 	if (qflags & XFS_GQUOTA_ACCT)
8581ad2cfe0SDarrick J. Wong 		eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_GDQUOT;
8591ad2cfe0SDarrick J. Wong 	if (qflags & XFS_PQUOTA_ACCT)
8601ad2cfe0SDarrick J. Wong 		eofb.eof_flags |= XFS_ICWALK_FLAG_DROP_PDQUOT;
8611ad2cfe0SDarrick J. Wong 
862*7fdff526SDarrick J. Wong 	return xfs_icwalk(mp, xfs_dqrele_inode, &eofb, XFS_ICWALK_DQRELE);
8631ad2cfe0SDarrick J. Wong }
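
/*
 * Illustrative usage (roughly the quota-off path): after the caller has
 * cleared the relevant XFS_[UGP]QUOTA_ACTIVE flags, it drops all cached
 * inode dquot references with something like
 *
 *	xfs_dqrele_all_inodes(mp, XFS_UQUOTA_ACCT | XFS_GQUOTA_ACCT);
 */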
864b9baaef4SDarrick J. Wong #else
865b9baaef4SDarrick J. Wong # define xfs_dqrele_igrab(ip)		(false)
8661ad2cfe0SDarrick J. Wong #endif /* CONFIG_XFS_QUOTA */
8671ad2cfe0SDarrick J. Wong 
868579b62faSBrian Foster /*
8696d8b79cfSDave Chinner  * Grab the inode for reclaim exclusively.
87050718b8dSDave Chinner  *
87150718b8dSDave Chinner  * We have found this inode via a lookup under RCU, so the inode may have
87250718b8dSDave Chinner  * already been freed, or it may be in the process of being recycled by
87350718b8dSDave Chinner  * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
87450718b8dSDave Chinner  * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
87550718b8dSDave Chinner  * will not be set. Hence we need to check for both these flag conditions to
87650718b8dSDave Chinner  * avoid inodes that are no longer reclaim candidates.
87750718b8dSDave Chinner  *
87850718b8dSDave Chinner  * Note: checking for other state flags here, under the i_flags_lock or not, is
87950718b8dSDave Chinner  * racy and should be avoided. Those races should be resolved only after we have
88050718b8dSDave Chinner  * ensured that we are able to reclaim this inode and the world can see that we
88150718b8dSDave Chinner  * are going to reclaim it.
88250718b8dSDave Chinner  *
88350718b8dSDave Chinner  * Return true if we grabbed it, false otherwise.
8846d8b79cfSDave Chinner  */
88550718b8dSDave Chinner static bool
8866d8b79cfSDave Chinner xfs_reclaim_inode_grab(
88750718b8dSDave Chinner 	struct xfs_inode	*ip)
8886d8b79cfSDave Chinner {
8896d8b79cfSDave Chinner 	ASSERT(rcu_read_lock_held());
8906d8b79cfSDave Chinner 
8916d8b79cfSDave Chinner 	spin_lock(&ip->i_flags_lock);
8926d8b79cfSDave Chinner 	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
8936d8b79cfSDave Chinner 	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
8946d8b79cfSDave Chinner 		/* not a reclaim candidate. */
8956d8b79cfSDave Chinner 		spin_unlock(&ip->i_flags_lock);
89650718b8dSDave Chinner 		return false;
8976d8b79cfSDave Chinner 	}
8986d8b79cfSDave Chinner 	__xfs_iflags_set(ip, XFS_IRECLAIM);
8996d8b79cfSDave Chinner 	spin_unlock(&ip->i_flags_lock);
90050718b8dSDave Chinner 	return true;
9016d8b79cfSDave Chinner }
9026d8b79cfSDave Chinner 
9036d8b79cfSDave Chinner /*
90402511a5aSDave Chinner  * Inode reclaim is non-blocking, so the default action if progress cannot be
90502511a5aSDave Chinner  * made is to "requeue" the inode for reclaim by unlocking it and clearing the
90602511a5aSDave Chinner  * XFS_IRECLAIM flag.  If we are in a shutdown state, we don't care about
90702511a5aSDave Chinner  * blocking anymore and hence we can wait for the inode to become reclaimable
90802511a5aSDave Chinner  * and then reclaim it.
9096d8b79cfSDave Chinner  *
91002511a5aSDave Chinner  * We do no IO here - if callers require inodes to be cleaned they must push the
91102511a5aSDave Chinner  * AIL first to trigger writeback of dirty inodes.  This enables writeback to be
91202511a5aSDave Chinner  * done in the background in a non-blocking manner, and enables memory reclaim
91302511a5aSDave Chinner  * to make progress without blocking.
9146d8b79cfSDave Chinner  */
9154d0bab3aSDave Chinner static void
9166d8b79cfSDave Chinner xfs_reclaim_inode(
9176d8b79cfSDave Chinner 	struct xfs_inode	*ip,
91850718b8dSDave Chinner 	struct xfs_perag	*pag)
9196d8b79cfSDave Chinner {
9208a17d7ddSDave Chinner 	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
9216d8b79cfSDave Chinner 
9229552e14dSDave Chinner 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
9236d8b79cfSDave Chinner 		goto out;
924718ecc50SDave Chinner 	if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING))
9259552e14dSDave Chinner 		goto out_iunlock;
9266d8b79cfSDave Chinner 
9276d8b79cfSDave Chinner 	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
9286d8b79cfSDave Chinner 		xfs_iunpin_wait(ip);
92988fc1879SBrian Foster 		xfs_iflush_abort(ip);
9306d8b79cfSDave Chinner 		goto reclaim;
9316d8b79cfSDave Chinner 	}
932617825feSDave Chinner 	if (xfs_ipincount(ip))
933718ecc50SDave Chinner 		goto out_clear_flush;
934617825feSDave Chinner 	if (!xfs_inode_clean(ip))
935718ecc50SDave Chinner 		goto out_clear_flush;
936617825feSDave Chinner 
937718ecc50SDave Chinner 	xfs_iflags_clear(ip, XFS_IFLUSHING);
9386d8b79cfSDave Chinner reclaim:
93998efe8afSBrian Foster 
9408a17d7ddSDave Chinner 	/*
9418a17d7ddSDave Chinner 	 * Because we use RCU freeing we need to ensure the inode always appears
9428a17d7ddSDave Chinner 	 * to be reclaimed with an invalid inode number when in the free state.
94398efe8afSBrian Foster 	 * We do this as early as possible under the ILOCK so that
944f2e9ad21SOmar Sandoval 	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
945f2e9ad21SOmar Sandoval 	 * detect races with us here. By doing this, we guarantee that once
946f2e9ad21SOmar Sandoval 	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
947f2e9ad21SOmar Sandoval 	 * it will see either a valid inode that will serialise correctly, or it
948f2e9ad21SOmar Sandoval 	 * will see an invalid inode that it can skip.
9498a17d7ddSDave Chinner 	 */
9508a17d7ddSDave Chinner 	spin_lock(&ip->i_flags_lock);
9518a17d7ddSDave Chinner 	ip->i_flags = XFS_IRECLAIM;
9528a17d7ddSDave Chinner 	ip->i_ino = 0;
9538a17d7ddSDave Chinner 	spin_unlock(&ip->i_flags_lock);
9548a17d7ddSDave Chinner 
9556d8b79cfSDave Chinner 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
9566d8b79cfSDave Chinner 
957ff6d6af2SBill O'Donnell 	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
9586d8b79cfSDave Chinner 	/*
9596d8b79cfSDave Chinner 	 * Remove the inode from the per-AG radix tree.
9606d8b79cfSDave Chinner 	 *
9616d8b79cfSDave Chinner 	 * Because radix_tree_delete won't complain even if the item was never
9626d8b79cfSDave Chinner 	 * added to the tree assert that it's been there before to catch
9636d8b79cfSDave Chinner 	 * problems with the inode life time early on.
9646d8b79cfSDave Chinner 	 */
9656d8b79cfSDave Chinner 	spin_lock(&pag->pag_ici_lock);
9666d8b79cfSDave Chinner 	if (!radix_tree_delete(&pag->pag_ici_root,
9678a17d7ddSDave Chinner 				XFS_INO_TO_AGINO(ip->i_mount, ino)))
9686d8b79cfSDave Chinner 		ASSERT(0);
969545c0889SDave Chinner 	xfs_perag_clear_reclaim_tag(pag);
9706d8b79cfSDave Chinner 	spin_unlock(&pag->pag_ici_lock);
9716d8b79cfSDave Chinner 
9726d8b79cfSDave Chinner 	/*
9736d8b79cfSDave Chinner 	 * Here we do an (almost) spurious inode lock in order to coordinate
9746d8b79cfSDave Chinner 	 * with inode cache radix tree lookups.  This is because the lookup
9756d8b79cfSDave Chinner 	 * can reference the inodes in the cache without taking references.
9766d8b79cfSDave Chinner 	 *
9776d8b79cfSDave Chinner 	 * We make that OK here by ensuring that we wait until the inode is
9786d8b79cfSDave Chinner 	 * unlocked after the lookup before we go ahead and free it.
9796d8b79cfSDave Chinner 	 */
9806d8b79cfSDave Chinner 	xfs_ilock(ip, XFS_ILOCK_EXCL);
9813ea06d73SDarrick J. Wong 	ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot);
9826d8b79cfSDave Chinner 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
98396355d5aSDave Chinner 	ASSERT(xfs_inode_clean(ip));
9846d8b79cfSDave Chinner 
9858a17d7ddSDave Chinner 	__xfs_inode_free(ip);
9864d0bab3aSDave Chinner 	return;
9876d8b79cfSDave Chinner 
988718ecc50SDave Chinner out_clear_flush:
989718ecc50SDave Chinner 	xfs_iflags_clear(ip, XFS_IFLUSHING);
9909552e14dSDave Chinner out_iunlock:
9916d8b79cfSDave Chinner 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
9929552e14dSDave Chinner out:
993617825feSDave Chinner 	xfs_iflags_clear(ip, XFS_IRECLAIM);
9946d8b79cfSDave Chinner }
9956d8b79cfSDave Chinner 
9966d8b79cfSDave Chinner /*
9976d8b79cfSDave Chinner  * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
9986d8b79cfSDave Chinner  * corrupted, we still want to try to reclaim all the inodes. If we don't,
9996d8b79cfSDave Chinner  * then a shutdown during the filesystem unmount reclaim walk will leak all the
10006d8b79cfSDave Chinner  * unreclaimed inodes.
1001617825feSDave Chinner  *
1002617825feSDave Chinner  * This walk returns nothing; callers that want to block until all dirty
1003617825feSDave Chinner  * inodes are written back and reclaimed loop until the per-AG reclaim tag
1004617825feSDave Chinner  * is no longer set (see xfs_reclaim_inodes()).
10056d8b79cfSDave Chinner  */
10064d0bab3aSDave Chinner static void
10076d8b79cfSDave Chinner xfs_reclaim_inodes_ag(
10086d8b79cfSDave Chinner 	struct xfs_mount	*mp,
10096d8b79cfSDave Chinner 	int			*nr_to_scan)
10106d8b79cfSDave Chinner {
10116d8b79cfSDave Chinner 	struct xfs_perag	*pag;
10120e8e2c63SDave Chinner 	xfs_agnumber_t		ag = 0;
10136d8b79cfSDave Chinner 
10146d8b79cfSDave Chinner 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
10156d8b79cfSDave Chinner 		unsigned long	first_index = 0;
10166d8b79cfSDave Chinner 		int		done = 0;
10176d8b79cfSDave Chinner 		int		nr_found = 0;
10186d8b79cfSDave Chinner 
10196d8b79cfSDave Chinner 		ag = pag->pag_agno + 1;
10206d8b79cfSDave Chinner 
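		/*
		 * Resume from wherever the previous pass through this AG left
		 * off; the cursor is written back (or reset to zero once the
		 * AG walk completes) at the bottom of the loop.
		 */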
10210e8e2c63SDave Chinner 		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
10226d8b79cfSDave Chinner 		do {
10236d8b79cfSDave Chinner 			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
10246d8b79cfSDave Chinner 			int	i;
10256d8b79cfSDave Chinner 
10266d8b79cfSDave Chinner 			rcu_read_lock();
10276d8b79cfSDave Chinner 			nr_found = radix_tree_gang_lookup_tag(
10286d8b79cfSDave Chinner 					&pag->pag_ici_root,
10296d8b79cfSDave Chinner 					(void **)batch, first_index,
10306d8b79cfSDave Chinner 					XFS_LOOKUP_BATCH,
10316d8b79cfSDave Chinner 					XFS_ICI_RECLAIM_TAG);
10326d8b79cfSDave Chinner 			if (!nr_found) {
10336d8b79cfSDave Chinner 				done = 1;
10346d8b79cfSDave Chinner 				rcu_read_unlock();
10356d8b79cfSDave Chinner 				break;
10366d8b79cfSDave Chinner 			}
10376d8b79cfSDave Chinner 
10386d8b79cfSDave Chinner 			/*
10396d8b79cfSDave Chinner 			 * Grab the inodes before we drop the lock. if we found
10406d8b79cfSDave Chinner 			 * Grab the inodes before we drop the lock. If we found
10416d8b79cfSDave Chinner 			 */
10426d8b79cfSDave Chinner 			for (i = 0; i < nr_found; i++) {
10436d8b79cfSDave Chinner 				struct xfs_inode *ip = batch[i];
10446d8b79cfSDave Chinner 
104550718b8dSDave Chinner 				if (done || !xfs_reclaim_inode_grab(ip))
10466d8b79cfSDave Chinner 					batch[i] = NULL;
10476d8b79cfSDave Chinner 
10486d8b79cfSDave Chinner 				/*
10496d8b79cfSDave Chinner 				 * Update the index for the next lookup. Catch
10506d8b79cfSDave Chinner 				 * overflows into the next AG range which can
10516d8b79cfSDave Chinner 				 * occur if we have inodes in the last block of
10526d8b79cfSDave Chinner 				 * the AG and we are currently pointing to the
10536d8b79cfSDave Chinner 				 * last inode.
10546d8b79cfSDave Chinner 				 *
10556d8b79cfSDave Chinner 				 * Because we may see inodes that are from the
10566d8b79cfSDave Chinner 				 * wrong AG due to RCU freeing and
10576d8b79cfSDave Chinner 				 * reallocation, only update the index if it
10586d8b79cfSDave Chinner 				 * lies in this AG. It was a race that lead us
10596d8b79cfSDave Chinner 				 * to see this inode, so another lookup from
10606d8b79cfSDave Chinner 				 * the same index will not find it again.
10616d8b79cfSDave Chinner 				 */
10626d8b79cfSDave Chinner 				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
10636d8b79cfSDave Chinner 								pag->pag_agno)
10646d8b79cfSDave Chinner 					continue;
10656d8b79cfSDave Chinner 				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
10666d8b79cfSDave Chinner 				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
10676d8b79cfSDave Chinner 					done = 1;
10686d8b79cfSDave Chinner 			}
10696d8b79cfSDave Chinner 
10706d8b79cfSDave Chinner 			/* unlock now that we've grabbed the inodes. */
10716d8b79cfSDave Chinner 			rcu_read_unlock();
10726d8b79cfSDave Chinner 
10736d8b79cfSDave Chinner 			for (i = 0; i < nr_found; i++) {
10744d0bab3aSDave Chinner 				if (batch[i])
10754d0bab3aSDave Chinner 					xfs_reclaim_inode(batch[i], pag);
10766d8b79cfSDave Chinner 			}
10776d8b79cfSDave Chinner 
10786d8b79cfSDave Chinner 			*nr_to_scan -= XFS_LOOKUP_BATCH;
10796d8b79cfSDave Chinner 			cond_resched();
10806d8b79cfSDave Chinner 		} while (nr_found && !done && *nr_to_scan > 0);
10816d8b79cfSDave Chinner 
10820e8e2c63SDave Chinner 		if (done)
10830e8e2c63SDave Chinner 			first_index = 0;
10840e8e2c63SDave Chinner 		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
10856d8b79cfSDave Chinner 		xfs_perag_put(pag);
10866d8b79cfSDave Chinner 	}
10876d8b79cfSDave Chinner }
10886d8b79cfSDave Chinner 
10894d0bab3aSDave Chinner void
10906d8b79cfSDave Chinner xfs_reclaim_inodes(
10914d0bab3aSDave Chinner 	struct xfs_mount	*mp)
10926d8b79cfSDave Chinner {
10936d8b79cfSDave Chinner 	int		nr_to_scan = INT_MAX;
10946d8b79cfSDave Chinner 
10954d0bab3aSDave Chinner 	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
1096617825feSDave Chinner 		xfs_ail_push_all_sync(mp->m_ail);
10974d0bab3aSDave Chinner 		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
10980f4ec0f1SZheng Bin 	}
10996d8b79cfSDave Chinner }
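
/*
 * Editor's illustration (not part of the original source): xfs_reclaim_inodes()
 * above is the "reclaim everything, synchronously" entry point.  A minimal
 * sketch of how a teardown path might use it, assuming the caller has already
 * stopped handing out new inode references; example_flush_inodes() is a
 * hypothetical helper, not something defined in this file.
 */
static void
example_flush_inodes(
	struct xfs_mount	*mp)
{
	/* Push dirty metadata to disk so reclaim mostly finds clean inodes. */
	xfs_log_force(mp, XFS_LOG_SYNC);

	/* Now reclaim every inode left in the cache before tearing down. */
	xfs_reclaim_inodes(mp);
}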
11006d8b79cfSDave Chinner 
11016d8b79cfSDave Chinner /*
110202511a5aSDave Chinner  * The shrinker infrastructure determines how many inodes we should scan for
110302511a5aSDave Chinner  * reclaim. We want as many clean inodes ready to reclaim as possible, so we
110402511a5aSDave Chinner  * push the AIL here. We also want to proactively free up memory if we can to
110502511a5aSDave Chinner  * push the AIL here. We also want to proactively free up memory if we can,
110602511a5aSDave Chinner  * to minimise the amount of work memory reclaim has to do, so we kick the
11076d8b79cfSDave Chinner  */
11080a234c6dSDave Chinner long
11096d8b79cfSDave Chinner xfs_reclaim_inodes_nr(
11106d8b79cfSDave Chinner 	struct xfs_mount	*mp,
11116d8b79cfSDave Chinner 	int			nr_to_scan)
11126d8b79cfSDave Chinner {
11136d8b79cfSDave Chinner 	/* kick background reclaimer and push the AIL */
11146d8b79cfSDave Chinner 	xfs_reclaim_work_queue(mp);
11156d8b79cfSDave Chinner 	xfs_ail_push_all(mp->m_ail);
11166d8b79cfSDave Chinner 
111750718b8dSDave Chinner 	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
1118617825feSDave Chinner 	return 0;
11196d8b79cfSDave Chinner }
11206d8b79cfSDave Chinner 
11216d8b79cfSDave Chinner /*
11226d8b79cfSDave Chinner  * Return the number of reclaimable inodes in the filesystem for
11236d8b79cfSDave Chinner  * the shrinker to determine how much to reclaim.
11246d8b79cfSDave Chinner  */
11256d8b79cfSDave Chinner int
11266d8b79cfSDave Chinner xfs_reclaim_inodes_count(
11276d8b79cfSDave Chinner 	struct xfs_mount	*mp)
11286d8b79cfSDave Chinner {
11296d8b79cfSDave Chinner 	struct xfs_perag	*pag;
11306d8b79cfSDave Chinner 	xfs_agnumber_t		ag = 0;
11316d8b79cfSDave Chinner 	int			reclaimable = 0;
11326d8b79cfSDave Chinner 
11336d8b79cfSDave Chinner 	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
11346d8b79cfSDave Chinner 		ag = pag->pag_agno + 1;
11356d8b79cfSDave Chinner 		reclaimable += pag->pag_ici_reclaimable;
11366d8b79cfSDave Chinner 		xfs_perag_put(pag);
11376d8b79cfSDave Chinner 	}
11386d8b79cfSDave Chinner 	return reclaimable;
11396d8b79cfSDave Chinner }
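
/*
 * Editor's illustration (not part of the original source): the count/scan
 * pair above is designed to sit behind the superblock shrinker callbacks.
 * A minimal sketch of that wiring, assuming the usual XFS_M() mapping from
 * sb->s_fs_info to the mount; the example_* names are placeholders for the
 * callbacks that are actually registered in the superblock code.
 */
static long
example_nr_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Tell the shrinker how many reclaimable inodes we hold. */
	return xfs_reclaim_inodes_count(XFS_M(sb));
}

static long
example_free_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Scan and reclaim up to the number of objects the VFS asked for. */
	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
}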
11406d8b79cfSDave Chinner 
114139b1cfd7SDarrick J. Wong STATIC bool
11423e3f9f58SBrian Foster xfs_inode_match_id(
11433e3f9f58SBrian Foster 	struct xfs_inode	*ip,
11443e3f9f58SBrian Foster 	struct xfs_eofblocks	*eofb)
11453e3f9f58SBrian Foster {
1146b9fe5052SDwight Engen 	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1147b9fe5052SDwight Engen 	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
114839b1cfd7SDarrick J. Wong 		return false;
11491b556048SBrian Foster 
1150b9fe5052SDwight Engen 	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1151b9fe5052SDwight Engen 	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
115239b1cfd7SDarrick J. Wong 		return false;
11531b556048SBrian Foster 
1154b9fe5052SDwight Engen 	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1155ceaf603cSChristoph Hellwig 	    ip->i_projid != eofb->eof_prid)
115639b1cfd7SDarrick J. Wong 		return false;
11571b556048SBrian Foster 
115839b1cfd7SDarrick J. Wong 	return true;
11593e3f9f58SBrian Foster }
11603e3f9f58SBrian Foster 
1161f4526397SBrian Foster /*
1162f4526397SBrian Foster  * A union-based inode filtering algorithm. Process the inode if any of the
1163f4526397SBrian Foster  * criteria match. This is for global/internal scans only.
1164f4526397SBrian Foster  */
116539b1cfd7SDarrick J. Wong STATIC bool
1166f4526397SBrian Foster xfs_inode_match_id_union(
1167f4526397SBrian Foster 	struct xfs_inode	*ip,
1168f4526397SBrian Foster 	struct xfs_eofblocks	*eofb)
1169f4526397SBrian Foster {
1170f4526397SBrian Foster 	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1171f4526397SBrian Foster 	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
117239b1cfd7SDarrick J. Wong 		return true;
1173f4526397SBrian Foster 
1174f4526397SBrian Foster 	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1175f4526397SBrian Foster 	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
117639b1cfd7SDarrick J. Wong 		return true;
1177f4526397SBrian Foster 
1178f4526397SBrian Foster 	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
1179ceaf603cSChristoph Hellwig 	    ip->i_projid == eofb->eof_prid)
118039b1cfd7SDarrick J. Wong 		return true;
1181f4526397SBrian Foster 
118239b1cfd7SDarrick J. Wong 	return false;
1183f4526397SBrian Foster }
1184f4526397SBrian Foster 
1185a91bf992SDarrick J. Wong /*
1186a91bf992SDarrick J. Wong  * Is this inode @ip eligible for eof/cow block reclamation, given some
1187a91bf992SDarrick J. Wong  * filtering parameters @eofb?  The inode is eligible if @eofb is null or
1188a91bf992SDarrick J. Wong  * if the predicate functions match.
1189a91bf992SDarrick J. Wong  */
1190a91bf992SDarrick J. Wong static bool
1191a91bf992SDarrick J. Wong xfs_inode_matches_eofb(
1192a91bf992SDarrick J. Wong 	struct xfs_inode	*ip,
1193a91bf992SDarrick J. Wong 	struct xfs_eofblocks	*eofb)
1194a91bf992SDarrick J. Wong {
119539b1cfd7SDarrick J. Wong 	bool			match;
1196a91bf992SDarrick J. Wong 
1197a91bf992SDarrick J. Wong 	if (!eofb)
1198a91bf992SDarrick J. Wong 		return true;
1199a91bf992SDarrick J. Wong 
1200a91bf992SDarrick J. Wong 	if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
1201a91bf992SDarrick J. Wong 		match = xfs_inode_match_id_union(ip, eofb);
1202a91bf992SDarrick J. Wong 	else
1203a91bf992SDarrick J. Wong 		match = xfs_inode_match_id(ip, eofb);
1204a91bf992SDarrick J. Wong 	if (!match)
1205a91bf992SDarrick J. Wong 		return false;
1206a91bf992SDarrick J. Wong 
1207a91bf992SDarrick J. Wong 	/* skip the inode if the file size is too small */
1208a91bf992SDarrick J. Wong 	if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
1209a91bf992SDarrick J. Wong 	    XFS_ISIZE(ip) < eofb->eof_min_file_size)
1210a91bf992SDarrick J. Wong 		return false;
1211a91bf992SDarrick J. Wong 
1212a91bf992SDarrick J. Wong 	return true;
1213a91bf992SDarrick J. Wong }
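
/*
 * Editor's sketch (not part of the original source): how a caller might
 * build an eofblocks filter for the matching logic above.  This one selects
 * inodes owned by @uid that are at least 1MiB long; SZ_1M comes from
 * <linux/sizes.h> and example_trim_for_uid() is a hypothetical helper, not
 * an interface this file provides.
 */
static int
example_trim_for_uid(
	struct xfs_mount	*mp,
	kuid_t			uid)
{
	struct xfs_eofblocks	eofb = {
		.eof_flags		= XFS_EOF_FLAGS_UID |
					  XFS_EOF_FLAGS_MINFILESIZE,
		.eof_uid		= uid,
		.eof_min_file_size	= SZ_1M,
	};

	/* Walk the cache and free preallocations on matching inodes only. */
	return xfs_blockgc_free_space(mp, &eofb);
}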
1214a91bf992SDarrick J. Wong 
12154d0bab3aSDave Chinner /*
12164d0bab3aSDave Chinner  * This is a fast pass over the inode cache to try to get reclaim moving on as
12174d0bab3aSDave Chinner  * many inodes as possible in a short period of time. It kicks itself every few
12184d0bab3aSDave Chinner  * seconds, as well as being kicked by the inode cache shrinker when memory
121902511a5aSDave Chinner  * goes low.
12204d0bab3aSDave Chinner  */
12214d0bab3aSDave Chinner void
12224d0bab3aSDave Chinner xfs_reclaim_worker(
12234d0bab3aSDave Chinner 	struct work_struct *work)
12244d0bab3aSDave Chinner {
12254d0bab3aSDave Chinner 	struct xfs_mount *mp = container_of(to_delayed_work(work),
12264d0bab3aSDave Chinner 					struct xfs_mount, m_reclaim_work);
12274d0bab3aSDave Chinner 	int		nr_to_scan = INT_MAX;
12284d0bab3aSDave Chinner 
12294d0bab3aSDave Chinner 	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
12304d0bab3aSDave Chinner 	xfs_reclaim_work_queue(mp);
12314d0bab3aSDave Chinner }
12324d0bab3aSDave Chinner 
12333e3f9f58SBrian Foster STATIC int
123441176a68SBrian Foster xfs_inode_free_eofblocks(
123541176a68SBrian Foster 	struct xfs_inode	*ip,
12360fa4a10aSDarrick J. Wong 	void			*args,
12370fa4a10aSDarrick J. Wong 	unsigned int		*lockflags)
123841176a68SBrian Foster {
12393e3f9f58SBrian Foster 	struct xfs_eofblocks	*eofb = args;
1240390600f8SDarrick J. Wong 	bool			wait;
1241390600f8SDarrick J. Wong 
1242390600f8SDarrick J. Wong 	wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
12435400da7dSBrian Foster 
1244ce2d3bbeSDarrick J. Wong 	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
1245ce2d3bbeSDarrick J. Wong 		return 0;
1246ce2d3bbeSDarrick J. Wong 
124741176a68SBrian Foster 	/*
124841176a68SBrian Foster 	 * If the mapping is dirty the operation can block and wait for some
124941176a68SBrian Foster 	 * time. Unless we are waiting, skip it.
125041176a68SBrian Foster 	 */
1251390600f8SDarrick J. Wong 	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
125241176a68SBrian Foster 		return 0;
125341176a68SBrian Foster 
1254a91bf992SDarrick J. Wong 	if (!xfs_inode_matches_eofb(ip, eofb))
12553e3f9f58SBrian Foster 		return 0;
12563e3f9f58SBrian Foster 
1257a36b9261SBrian Foster 	/*
1258a36b9261SBrian Foster 	 * If the caller is waiting, return -EAGAIN to keep the background
1259a36b9261SBrian Foster 	 * scanner moving and revisit the inode in a subsequent pass.
1260a36b9261SBrian Foster 	 */
1261c3155097SBrian Foster 	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1262390600f8SDarrick J. Wong 		if (wait)
1263390600f8SDarrick J. Wong 			return -EAGAIN;
1264390600f8SDarrick J. Wong 		return 0;
1265a36b9261SBrian Foster 	}
12660fa4a10aSDarrick J. Wong 	*lockflags |= XFS_IOLOCK_EXCL;
1267390600f8SDarrick J. Wong 
12682b156ff8SDarrick J. Wong 	if (xfs_can_free_eofblocks(ip, false))
12690fa4a10aSDarrick J. Wong 		return xfs_free_eofblocks(ip);
12702b156ff8SDarrick J. Wong 
12712b156ff8SDarrick J. Wong 	/* inode could be preallocated or append-only */
12722b156ff8SDarrick J. Wong 	trace_xfs_inode_free_eofblocks_invalid(ip);
12732b156ff8SDarrick J. Wong 	xfs_inode_clear_eofblocks_tag(ip);
12742b156ff8SDarrick J. Wong 	return 0;
127541176a68SBrian Foster }
127641176a68SBrian Foster 
1277f9296569SDarrick J. Wong /*
12789669f51dSDarrick J. Wong  * Background scanning to trim preallocated space. This is queued based on the
12799669f51dSDarrick J. Wong  * 'speculative_prealloc_lifetime' tunable (5m by default).
1280f9296569SDarrick J. Wong  */
12819669f51dSDarrick J. Wong static inline void
12829669f51dSDarrick J. Wong xfs_blockgc_queue(
1283894ecacfSDarrick J. Wong 	struct xfs_perag	*pag)
1284f9296569SDarrick J. Wong {
1285f9296569SDarrick J. Wong 	rcu_read_lock();
1286894ecacfSDarrick J. Wong 	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
12873fef46fcSDarrick J. Wong 		queue_delayed_work(pag->pag_mount->m_gc_workqueue,
1288894ecacfSDarrick J. Wong 				   &pag->pag_blockgc_work,
12899669f51dSDarrick J. Wong 				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
1290f9296569SDarrick J. Wong 	rcu_read_unlock();
1291f9296569SDarrick J. Wong }
1292f9296569SDarrick J. Wong 
129383104d44SDarrick J. Wong static void
1294ce2d3bbeSDarrick J. Wong xfs_blockgc_set_iflag(
1295ce2d3bbeSDarrick J. Wong 	struct xfs_inode	*ip,
1296ce2d3bbeSDarrick J. Wong 	unsigned long		iflag)
129727b52867SBrian Foster {
129827b52867SBrian Foster 	struct xfs_mount	*mp = ip->i_mount;
129927b52867SBrian Foster 	struct xfs_perag	*pag;
130027b52867SBrian Foster 	int			tagged;
130127b52867SBrian Foster 
1302ce2d3bbeSDarrick J. Wong 	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
1303ce2d3bbeSDarrick J. Wong 
130485a6e764SChristoph Hellwig 	/*
130585a6e764SChristoph Hellwig 	 * Don't bother locking the AG and looking up in the radix trees
130685a6e764SChristoph Hellwig 	 * if we already know that we have the tag set.
130785a6e764SChristoph Hellwig 	 */
1308ce2d3bbeSDarrick J. Wong 	if (ip->i_flags & iflag)
130985a6e764SChristoph Hellwig 		return;
131085a6e764SChristoph Hellwig 	spin_lock(&ip->i_flags_lock);
1311ce2d3bbeSDarrick J. Wong 	ip->i_flags |= iflag;
131285a6e764SChristoph Hellwig 	spin_unlock(&ip->i_flags_lock);
131385a6e764SChristoph Hellwig 
131427b52867SBrian Foster 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
131527b52867SBrian Foster 	spin_lock(&pag->pag_ici_lock);
131627b52867SBrian Foster 
1317ce2d3bbeSDarrick J. Wong 	tagged = radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG);
131827b52867SBrian Foster 	radix_tree_tag_set(&pag->pag_ici_root,
1319ce2d3bbeSDarrick J. Wong 			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1320ce2d3bbeSDarrick J. Wong 			   XFS_ICI_BLOCKGC_TAG);
132127b52867SBrian Foster 	if (!tagged) {
1322ce2d3bbeSDarrick J. Wong 		/* propagate the blockgc tag up into the perag radix tree */
132327b52867SBrian Foster 		spin_lock(&ip->i_mount->m_perag_lock);
132427b52867SBrian Foster 		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
132527b52867SBrian Foster 				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1326ce2d3bbeSDarrick J. Wong 				   XFS_ICI_BLOCKGC_TAG);
132727b52867SBrian Foster 		spin_unlock(&ip->i_mount->m_perag_lock);
132827b52867SBrian Foster 
1329579b62faSBrian Foster 		/* kick off background trimming */
1330894ecacfSDarrick J. Wong 		xfs_blockgc_queue(pag);
1331579b62faSBrian Foster 
1332ce2d3bbeSDarrick J. Wong 		trace_xfs_perag_set_blockgc(ip->i_mount, pag->pag_agno, -1,
1333ce2d3bbeSDarrick J. Wong 				_RET_IP_);
133427b52867SBrian Foster 	}
133527b52867SBrian Foster 
133627b52867SBrian Foster 	spin_unlock(&pag->pag_ici_lock);
133727b52867SBrian Foster 	xfs_perag_put(pag);
133827b52867SBrian Foster }
133927b52867SBrian Foster 
134027b52867SBrian Foster void
134183104d44SDarrick J. Wong xfs_inode_set_eofblocks_tag(
134227b52867SBrian Foster 	xfs_inode_t	*ip)
134327b52867SBrian Foster {
134483104d44SDarrick J. Wong 	trace_xfs_inode_set_eofblocks_tag(ip);
13459669f51dSDarrick J. Wong 	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
134683104d44SDarrick J. Wong }
134783104d44SDarrick J. Wong 
134883104d44SDarrick J. Wong static void
1349ce2d3bbeSDarrick J. Wong xfs_blockgc_clear_iflag(
1350ce2d3bbeSDarrick J. Wong 	struct xfs_inode	*ip,
1351ce2d3bbeSDarrick J. Wong 	unsigned long		iflag)
135283104d44SDarrick J. Wong {
135327b52867SBrian Foster 	struct xfs_mount	*mp = ip->i_mount;
135427b52867SBrian Foster 	struct xfs_perag	*pag;
1355ce2d3bbeSDarrick J. Wong 	bool			clear_tag;
1356ce2d3bbeSDarrick J. Wong 
1357ce2d3bbeSDarrick J. Wong 	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);
135827b52867SBrian Foster 
135985a6e764SChristoph Hellwig 	spin_lock(&ip->i_flags_lock);
1360ce2d3bbeSDarrick J. Wong 	ip->i_flags &= ~iflag;
1361ce2d3bbeSDarrick J. Wong 	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
136285a6e764SChristoph Hellwig 	spin_unlock(&ip->i_flags_lock);
136385a6e764SChristoph Hellwig 
1364ce2d3bbeSDarrick J. Wong 	if (!clear_tag)
1365ce2d3bbeSDarrick J. Wong 		return;
1366ce2d3bbeSDarrick J. Wong 
136727b52867SBrian Foster 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
136827b52867SBrian Foster 	spin_lock(&pag->pag_ici_lock);
136927b52867SBrian Foster 
137027b52867SBrian Foster 	radix_tree_tag_clear(&pag->pag_ici_root,
1371ce2d3bbeSDarrick J. Wong 			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
1372ce2d3bbeSDarrick J. Wong 			     XFS_ICI_BLOCKGC_TAG);
1373ce2d3bbeSDarrick J. Wong 	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) {
1374ce2d3bbeSDarrick J. Wong 		/* clear the blockgc tag from the perag radix tree */
137527b52867SBrian Foster 		spin_lock(&ip->i_mount->m_perag_lock);
137627b52867SBrian Foster 		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
137727b52867SBrian Foster 				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
1378ce2d3bbeSDarrick J. Wong 				     XFS_ICI_BLOCKGC_TAG);
137927b52867SBrian Foster 		spin_unlock(&ip->i_mount->m_perag_lock);
1380ce2d3bbeSDarrick J. Wong 		trace_xfs_perag_clear_blockgc(ip->i_mount, pag->pag_agno, -1,
1381ce2d3bbeSDarrick J. Wong 				_RET_IP_);
138227b52867SBrian Foster 	}
138327b52867SBrian Foster 
138427b52867SBrian Foster 	spin_unlock(&pag->pag_ici_lock);
138527b52867SBrian Foster 	xfs_perag_put(pag);
138627b52867SBrian Foster }
138727b52867SBrian Foster 
138883104d44SDarrick J. Wong void
138983104d44SDarrick J. Wong xfs_inode_clear_eofblocks_tag(
139083104d44SDarrick J. Wong 	xfs_inode_t	*ip)
139183104d44SDarrick J. Wong {
139283104d44SDarrick J. Wong 	trace_xfs_inode_clear_eofblocks_tag(ip);
1393ce2d3bbeSDarrick J. Wong 	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
139483104d44SDarrick J. Wong }
139583104d44SDarrick J. Wong 
139683104d44SDarrick J. Wong /*
1397be78ff0eSDarrick J. Wong  * Set ourselves up to free CoW blocks from this file.  If it's already clean
1398be78ff0eSDarrick J. Wong  * then we can bail out quickly, but otherwise we must back off if the file
1399be78ff0eSDarrick J. Wong  * is undergoing some kind of write.
1400be78ff0eSDarrick J. Wong  */
1401be78ff0eSDarrick J. Wong static bool
1402be78ff0eSDarrick J. Wong xfs_prep_free_cowblocks(
140351d62690SChristoph Hellwig 	struct xfs_inode	*ip)
1404be78ff0eSDarrick J. Wong {
1405be78ff0eSDarrick J. Wong 	/*
1406be78ff0eSDarrick J. Wong 	 * Just clear the tag if we have an empty cow fork or none at all. It's
1407be78ff0eSDarrick J. Wong 	 * possible the inode was fully unshared since it was originally tagged.
1408be78ff0eSDarrick J. Wong 	 */
140951d62690SChristoph Hellwig 	if (!xfs_inode_has_cow_data(ip)) {
1410be78ff0eSDarrick J. Wong 		trace_xfs_inode_free_cowblocks_invalid(ip);
1411be78ff0eSDarrick J. Wong 		xfs_inode_clear_cowblocks_tag(ip);
1412be78ff0eSDarrick J. Wong 		return false;
1413be78ff0eSDarrick J. Wong 	}
1414be78ff0eSDarrick J. Wong 
1415be78ff0eSDarrick J. Wong 	/*
1416be78ff0eSDarrick J. Wong 	 * If the mapping is dirty or under writeback we cannot touch the
1417be78ff0eSDarrick J. Wong 	 * CoW fork.  Leave it alone if we're in the midst of a directio.
1418be78ff0eSDarrick J. Wong 	 */
1419be78ff0eSDarrick J. Wong 	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
1420be78ff0eSDarrick J. Wong 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
1421be78ff0eSDarrick J. Wong 	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
1422be78ff0eSDarrick J. Wong 	    atomic_read(&VFS_I(ip)->i_dio_count))
1423be78ff0eSDarrick J. Wong 		return false;
1424be78ff0eSDarrick J. Wong 
1425be78ff0eSDarrick J. Wong 	return true;
1426be78ff0eSDarrick J. Wong }
1427be78ff0eSDarrick J. Wong 
1428be78ff0eSDarrick J. Wong /*
142983104d44SDarrick J. Wong  * Automatic CoW Reservation Freeing
143083104d44SDarrick J. Wong  *
143183104d44SDarrick J. Wong  * These functions automatically garbage collect leftover CoW reservations
143283104d44SDarrick J. Wong  * that were made on behalf of a cowextsize hint when we start to run out
143383104d44SDarrick J. Wong  * of quota or when the reservations sit around for too long.  If the file
143483104d44SDarrick J. Wong  * has dirty pages or is undergoing writeback, its CoW reservations will
143583104d44SDarrick J. Wong  * be retained.
143683104d44SDarrick J. Wong  *
143783104d44SDarrick J. Wong  * The actual garbage collection piggybacks off the same code that runs
143883104d44SDarrick J. Wong  * the speculative EOF preallocation garbage collector.
143983104d44SDarrick J. Wong  */
144083104d44SDarrick J. Wong STATIC int
144183104d44SDarrick J. Wong xfs_inode_free_cowblocks(
144283104d44SDarrick J. Wong 	struct xfs_inode	*ip,
14430fa4a10aSDarrick J. Wong 	void			*args,
14440fa4a10aSDarrick J. Wong 	unsigned int		*lockflags)
144583104d44SDarrick J. Wong {
144683104d44SDarrick J. Wong 	struct xfs_eofblocks	*eofb = args;
1447f41a0716SDarrick J. Wong 	bool			wait;
1448be78ff0eSDarrick J. Wong 	int			ret = 0;
144983104d44SDarrick J. Wong 
1450f41a0716SDarrick J. Wong 	wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);
1451f41a0716SDarrick J. Wong 
1452ce2d3bbeSDarrick J. Wong 	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
1453ce2d3bbeSDarrick J. Wong 		return 0;
1454ce2d3bbeSDarrick J. Wong 
145551d62690SChristoph Hellwig 	if (!xfs_prep_free_cowblocks(ip))
145683104d44SDarrick J. Wong 		return 0;
145783104d44SDarrick J. Wong 
1458a91bf992SDarrick J. Wong 	if (!xfs_inode_matches_eofb(ip, eofb))
145983104d44SDarrick J. Wong 		return 0;
146083104d44SDarrick J. Wong 
1461f41a0716SDarrick J. Wong 	/*
1462f41a0716SDarrick J. Wong 	 * If the caller is waiting, return -EAGAIN to keep the background
1463f41a0716SDarrick J. Wong 	 * scanner moving and revisit the inode in a subsequent pass.
1464f41a0716SDarrick J. Wong 	 */
14650fa4a10aSDarrick J. Wong 	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
14660fa4a10aSDarrick J. Wong 	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
1467f41a0716SDarrick J. Wong 		if (wait)
1468f41a0716SDarrick J. Wong 			return -EAGAIN;
1469f41a0716SDarrick J. Wong 		return 0;
1470f41a0716SDarrick J. Wong 	}
14710fa4a10aSDarrick J. Wong 	*lockflags |= XFS_IOLOCK_EXCL;
14720fa4a10aSDarrick J. Wong 
1473f41a0716SDarrick J. Wong 	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
1474f41a0716SDarrick J. Wong 		if (wait)
14750fa4a10aSDarrick J. Wong 			return -EAGAIN;
14760fa4a10aSDarrick J. Wong 		return 0;
1477f41a0716SDarrick J. Wong 	}
14780fa4a10aSDarrick J. Wong 	*lockflags |= XFS_MMAPLOCK_EXCL;
147983104d44SDarrick J. Wong 
1480be78ff0eSDarrick J. Wong 	/*
1481be78ff0eSDarrick J. Wong 	 * Check again, nobody else should be able to dirty blocks or change
1482be78ff0eSDarrick J. Wong 	 * the reflink iflag now that we have the first two locks held.
1483be78ff0eSDarrick J. Wong 	 */
148451d62690SChristoph Hellwig 	if (xfs_prep_free_cowblocks(ip))
14853802a345SChristoph Hellwig 		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
148683104d44SDarrick J. Wong 	return ret;
148783104d44SDarrick J. Wong }
148883104d44SDarrick J. Wong 
148983104d44SDarrick J. Wong void
149083104d44SDarrick J. Wong xfs_inode_set_cowblocks_tag(
149183104d44SDarrick J. Wong 	xfs_inode_t	*ip)
149283104d44SDarrick J. Wong {
14937b7381f0SBrian Foster 	trace_xfs_inode_set_cowblocks_tag(ip);
14949669f51dSDarrick J. Wong 	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
149583104d44SDarrick J. Wong }
149683104d44SDarrick J. Wong 
149783104d44SDarrick J. Wong void
149883104d44SDarrick J. Wong xfs_inode_clear_cowblocks_tag(
149983104d44SDarrick J. Wong 	xfs_inode_t	*ip)
150083104d44SDarrick J. Wong {
15017b7381f0SBrian Foster 	trace_xfs_inode_clear_cowblocks_tag(ip);
1502ce2d3bbeSDarrick J. Wong 	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
150383104d44SDarrick J. Wong }
1504d6b636ebSDarrick J. Wong 
1505894ecacfSDarrick J. Wong #define for_each_perag_tag(mp, next_agno, pag, tag) \
1506894ecacfSDarrick J. Wong 	for ((next_agno) = 0, (pag) = xfs_perag_get_tag((mp), 0, (tag)); \
1507894ecacfSDarrick J. Wong 		(pag) != NULL; \
1508894ecacfSDarrick J. Wong 		(next_agno) = (pag)->pag_agno + 1, \
1509894ecacfSDarrick J. Wong 		xfs_perag_put(pag), \
1510894ecacfSDarrick J. Wong 		(pag) = xfs_perag_get_tag((mp), (next_agno), (tag)))
1511894ecacfSDarrick J. Wong 
1512894ecacfSDarrick J. Wong 
1513d6b636ebSDarrick J. Wong /* Disable post-EOF and CoW block auto-reclamation. */
1514d6b636ebSDarrick J. Wong void
1515c9a6526fSDarrick J. Wong xfs_blockgc_stop(
1516d6b636ebSDarrick J. Wong 	struct xfs_mount	*mp)
1517d6b636ebSDarrick J. Wong {
1518894ecacfSDarrick J. Wong 	struct xfs_perag	*pag;
1519894ecacfSDarrick J. Wong 	xfs_agnumber_t		agno;
1520894ecacfSDarrick J. Wong 
1521894ecacfSDarrick J. Wong 	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1522894ecacfSDarrick J. Wong 		cancel_delayed_work_sync(&pag->pag_blockgc_work);
1523d6b636ebSDarrick J. Wong }
1524d6b636ebSDarrick J. Wong 
1525d6b636ebSDarrick J. Wong /* Enable post-EOF and CoW block auto-reclamation. */
1526d6b636ebSDarrick J. Wong void
1527c9a6526fSDarrick J. Wong xfs_blockgc_start(
1528d6b636ebSDarrick J. Wong 	struct xfs_mount	*mp)
1529d6b636ebSDarrick J. Wong {
1530894ecacfSDarrick J. Wong 	struct xfs_perag	*pag;
1531894ecacfSDarrick J. Wong 	xfs_agnumber_t		agno;
1532894ecacfSDarrick J. Wong 
1533894ecacfSDarrick J. Wong 	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
1534894ecacfSDarrick J. Wong 		xfs_blockgc_queue(pag);
1535d6b636ebSDarrick J. Wong }
15363d4feec0SDarrick J. Wong 
1537df600197SDarrick J. Wong /*
1538b9baaef4SDarrick J. Wong  * Decide if the given @ip is eligible for garbage collection of speculative
1539b9baaef4SDarrick J. Wong  * preallocations, and grab it if so.  Returns true if it's ready to go or
1540b9baaef4SDarrick J. Wong  * false if we should just ignore it.
1541df600197SDarrick J. Wong  */
1542df600197SDarrick J. Wong static bool
1543b9baaef4SDarrick J. Wong xfs_blockgc_igrab(
1544*7fdff526SDarrick J. Wong 	struct xfs_inode	*ip)
1545df600197SDarrick J. Wong {
1546df600197SDarrick J. Wong 	struct inode		*inode = VFS_I(ip);
1547df600197SDarrick J. Wong 
1548df600197SDarrick J. Wong 	ASSERT(rcu_read_lock_held());
1549df600197SDarrick J. Wong 
1550df600197SDarrick J. Wong 	/* Check for stale RCU freed inode */
1551df600197SDarrick J. Wong 	spin_lock(&ip->i_flags_lock);
1552df600197SDarrick J. Wong 	if (!ip->i_ino)
1553df600197SDarrick J. Wong 		goto out_unlock_noent;
1554df600197SDarrick J. Wong 
1555df600197SDarrick J. Wong 	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
1556*7fdff526SDarrick J. Wong 	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
1557df600197SDarrick J. Wong 		goto out_unlock_noent;
1558df600197SDarrick J. Wong 	spin_unlock(&ip->i_flags_lock);
1559df600197SDarrick J. Wong 
1560df600197SDarrick J. Wong 	/* nothing to sync during shutdown */
1561df600197SDarrick J. Wong 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1562df600197SDarrick J. Wong 		return false;
1563df600197SDarrick J. Wong 
1564df600197SDarrick J. Wong 	/* If we can't grab the inode, it must be on its way to reclaim. */
1565df600197SDarrick J. Wong 	if (!igrab(inode))
1566df600197SDarrick J. Wong 		return false;
1567df600197SDarrick J. Wong 
1568df600197SDarrick J. Wong 	/* inode is valid */
1569df600197SDarrick J. Wong 	return true;
1570df600197SDarrick J. Wong 
1571df600197SDarrick J. Wong out_unlock_noent:
1572df600197SDarrick J. Wong 	spin_unlock(&ip->i_flags_lock);
1573df600197SDarrick J. Wong 	return false;
1574df600197SDarrick J. Wong }
1575df600197SDarrick J. Wong 
157641956753SDarrick J. Wong /* Scan one incore inode for block preallocations that we can remove. */
157741956753SDarrick J. Wong static int
157841956753SDarrick J. Wong xfs_blockgc_scan_inode(
157941956753SDarrick J. Wong 	struct xfs_inode	*ip,
158041956753SDarrick J. Wong 	void			*args)
158185c5b270SDarrick J. Wong {
15820fa4a10aSDarrick J. Wong 	unsigned int		lockflags = 0;
158385c5b270SDarrick J. Wong 	int			error;
158485c5b270SDarrick J. Wong 
15850fa4a10aSDarrick J. Wong 	error = xfs_inode_free_eofblocks(ip, args, &lockflags);
158685c5b270SDarrick J. Wong 	if (error)
15870fa4a10aSDarrick J. Wong 		goto unlock;
158885c5b270SDarrick J. Wong 
15890fa4a10aSDarrick J. Wong 	error = xfs_inode_free_cowblocks(ip, args, &lockflags);
15900fa4a10aSDarrick J. Wong unlock:
15910fa4a10aSDarrick J. Wong 	if (lockflags)
15920fa4a10aSDarrick J. Wong 		xfs_iunlock(ip, lockflags);
159385c5b270SDarrick J. Wong 	return error;
159485c5b270SDarrick J. Wong }
159585c5b270SDarrick J. Wong 
15969669f51dSDarrick J. Wong /* Background worker that trims preallocated space. */
15979669f51dSDarrick J. Wong void
15989669f51dSDarrick J. Wong xfs_blockgc_worker(
15999669f51dSDarrick J. Wong 	struct work_struct	*work)
16009669f51dSDarrick J. Wong {
1601894ecacfSDarrick J. Wong 	struct xfs_perag	*pag = container_of(to_delayed_work(work),
1602894ecacfSDarrick J. Wong 					struct xfs_perag, pag_blockgc_work);
1603894ecacfSDarrick J. Wong 	struct xfs_mount	*mp = pag->pag_mount;
16049669f51dSDarrick J. Wong 	int			error;
16059669f51dSDarrick J. Wong 
16069669f51dSDarrick J. Wong 	if (!sb_start_write_trylock(mp->m_super))
16079669f51dSDarrick J. Wong 		return;
1608*7fdff526SDarrick J. Wong 	error = xfs_icwalk_ag(pag, xfs_blockgc_scan_inode, NULL,
1609c809d7e9SDarrick J. Wong 			XFS_ICWALK_BLOCKGC);
16109669f51dSDarrick J. Wong 	if (error)
1611894ecacfSDarrick J. Wong 		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
1612894ecacfSDarrick J. Wong 				pag->pag_agno, error);
16139669f51dSDarrick J. Wong 	sb_end_write(mp->m_super);
1614894ecacfSDarrick J. Wong 	xfs_blockgc_queue(pag);
16159669f51dSDarrick J. Wong }
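
/*
 * Editor's sketch (not part of the original source): a caller that cannot
 * wait for the next scheduled pass of the worker above could push each
 * tagged AG's delayed work to the head of the queue and then wait for it.
 * A rough illustration only; example_blockgc_flush() is hypothetical.
 */
static void
example_blockgc_flush(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	/* Make every pending blockgc worker run immediately... */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		mod_delayed_work(mp->m_gc_workqueue, &pag->pag_blockgc_work, 0);

	/* ...and wait for each of them to complete. */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		flush_delayed_work(&pag->pag_blockgc_work);
}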
16169669f51dSDarrick J. Wong 
161785c5b270SDarrick J. Wong /*
161885c5b270SDarrick J. Wong  * Try to free space in the filesystem by purging eofblocks and cowblocks.
161985c5b270SDarrick J. Wong  */
162085c5b270SDarrick J. Wong int
162185c5b270SDarrick J. Wong xfs_blockgc_free_space(
162285c5b270SDarrick J. Wong 	struct xfs_mount	*mp,
162385c5b270SDarrick J. Wong 	struct xfs_eofblocks	*eofb)
162485c5b270SDarrick J. Wong {
162585c5b270SDarrick J. Wong 	trace_xfs_blockgc_free_space(mp, eofb, _RET_IP_);
162685c5b270SDarrick J. Wong 
1627*7fdff526SDarrick J. Wong 	return xfs_icwalk(mp, xfs_blockgc_scan_inode, eofb,
1628c809d7e9SDarrick J. Wong 			XFS_ICWALK_BLOCKGC);
162985c5b270SDarrick J. Wong }
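
/*
 * Editor's note (not part of the original source): because
 * xfs_inode_matches_eofb() treats a NULL filter as "match everything", a
 * caller that simply wants to shed all speculative preallocations can pass
 * a NULL @eofb, as in this hypothetical helper (assuming, as the blockgc
 * worker above does with its NULL args, that the scan path tolerates a
 * NULL filter).
 */
static int
example_blockgc_free_everything(
	struct xfs_mount	*mp)
{
	return xfs_blockgc_free_space(mp, NULL);
}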
163085c5b270SDarrick J. Wong 
16313d4feec0SDarrick J. Wong /*
1632c237dd7cSDarrick J. Wong  * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
1633c237dd7cSDarrick J. Wong  * quota caused an allocation failure, so we make a best effort by including
1634c237dd7cSDarrick J. Wong  * each quota under low free space conditions (less than 1% free space) in the
1635c237dd7cSDarrick J. Wong  * scan.
1636111068f8SDarrick J. Wong  *
1637111068f8SDarrick J. Wong  * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
1638111068f8SDarrick J. Wong  * (XFS_EOF_FLAGS_SYNC), the caller also must not hold any inode's IOLOCK or
1639111068f8SDarrick J. Wong  * MMAPLOCK.
16403d4feec0SDarrick J. Wong  */
1641111068f8SDarrick J. Wong int
1642c237dd7cSDarrick J. Wong xfs_blockgc_free_dquots(
1643c237dd7cSDarrick J. Wong 	struct xfs_mount	*mp,
1644c237dd7cSDarrick J. Wong 	struct xfs_dquot	*udqp,
1645c237dd7cSDarrick J. Wong 	struct xfs_dquot	*gdqp,
1646c237dd7cSDarrick J. Wong 	struct xfs_dquot	*pdqp,
1647111068f8SDarrick J. Wong 	unsigned int		eof_flags)
16483d4feec0SDarrick J. Wong {
16493d4feec0SDarrick J. Wong 	struct xfs_eofblocks	eofb = {0};
16503d4feec0SDarrick J. Wong 	bool			do_work = false;
16513d4feec0SDarrick J. Wong 
1652c237dd7cSDarrick J. Wong 	if (!udqp && !gdqp && !pdqp)
1653c237dd7cSDarrick J. Wong 		return 0;
1654c237dd7cSDarrick J. Wong 
16553d4feec0SDarrick J. Wong 	/*
1656111068f8SDarrick J. Wong 	 * Run a scan to free blocks using the union filter to cover all
1657111068f8SDarrick J. Wong 	 * applicable quotas in a single scan.
16583d4feec0SDarrick J. Wong 	 */
1659111068f8SDarrick J. Wong 	eofb.eof_flags = XFS_EOF_FLAGS_UNION | eof_flags;
16603d4feec0SDarrick J. Wong 
1661c237dd7cSDarrick J. Wong 	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
1662c237dd7cSDarrick J. Wong 		eofb.eof_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
16633d4feec0SDarrick J. Wong 		eofb.eof_flags |= XFS_EOF_FLAGS_UID;
16643d4feec0SDarrick J. Wong 		do_work = true;
16653d4feec0SDarrick J. Wong 	}
16663d4feec0SDarrick J. Wong 
1667c237dd7cSDarrick J. Wong 	if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
1668c237dd7cSDarrick J. Wong 		eofb.eof_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
16693d4feec0SDarrick J. Wong 		eofb.eof_flags |= XFS_EOF_FLAGS_GID;
16703d4feec0SDarrick J. Wong 		do_work = true;
16713d4feec0SDarrick J. Wong 	}
16723d4feec0SDarrick J. Wong 
1673c237dd7cSDarrick J. Wong 	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
1674c237dd7cSDarrick J. Wong 		eofb.eof_prid = pdqp->q_id;
16753d4feec0SDarrick J. Wong 		eofb.eof_flags |= XFS_EOF_FLAGS_PRID;
16763d4feec0SDarrick J. Wong 		do_work = true;
16773d4feec0SDarrick J. Wong 	}
16783d4feec0SDarrick J. Wong 
16793d4feec0SDarrick J. Wong 	if (!do_work)
1680111068f8SDarrick J. Wong 		return 0;
16813d4feec0SDarrick J. Wong 
168285c5b270SDarrick J. Wong 	return xfs_blockgc_free_space(mp, &eofb);
1683c237dd7cSDarrick J. Wong }
1684c237dd7cSDarrick J. Wong 
1685c237dd7cSDarrick J. Wong /* Run cow/eofblocks scans on the quotas attached to the inode. */
1686c237dd7cSDarrick J. Wong int
1687c237dd7cSDarrick J. Wong xfs_blockgc_free_quota(
1688c237dd7cSDarrick J. Wong 	struct xfs_inode	*ip,
1689c237dd7cSDarrick J. Wong 	unsigned int		eof_flags)
1690c237dd7cSDarrick J. Wong {
1691c237dd7cSDarrick J. Wong 	return xfs_blockgc_free_dquots(ip->i_mount,
1692c237dd7cSDarrick J. Wong 			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
1693c237dd7cSDarrick J. Wong 			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
1694c237dd7cSDarrick J. Wong 			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), eof_flags);
16953d4feec0SDarrick J. Wong }
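
/*
 * Editor's sketch (not part of the original source): the kind of retry loop
 * the two helpers above are meant to serve.  When an allocation fails with
 * EDQUOT or ENOSPC, the caller frees speculative preallocations charged to
 * the inode's quotas and tries once more.  example_do_alloc() is a stand-in
 * for whatever reservation just failed, not a helper from this file; per
 * the comment above, no ILOCK may be held across the scan.
 */
static int example_do_alloc(struct xfs_inode *ip);	/* hypothetical */

static int
example_alloc_with_retry(
	struct xfs_inode	*ip)
{
	bool			retried = false;
	int			error;

retry:
	error = example_do_alloc(ip);
	if ((error == -EDQUOT || error == -ENOSPC) && !retried) {
		/* Free quota-charged preallocations, then try once more. */
		xfs_blockgc_free_quota(ip, 0);
		retried = true;
		goto retry;
	}
	return error;
}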
1696df600197SDarrick J. Wong 
1697df600197SDarrick J. Wong /* XFS Inode Cache Walking Code */
1698df600197SDarrick J. Wong 
1699df600197SDarrick J. Wong /*
1700b9baaef4SDarrick J. Wong  * Decide if we want to grab this inode in anticipation of doing work towards
1701b9baaef4SDarrick J. Wong  * the goal.  If selected, the VFS must hold a reference to this inode, which
1702b9baaef4SDarrick J. Wong  * will be released after processing.
1703b9baaef4SDarrick J. Wong  */
1704b9baaef4SDarrick J. Wong static inline bool
1705b9baaef4SDarrick J. Wong xfs_icwalk_igrab(
1706b9baaef4SDarrick J. Wong 	enum xfs_icwalk_goal	goal,
1707*7fdff526SDarrick J. Wong 	struct xfs_inode	*ip)
1708b9baaef4SDarrick J. Wong {
1709b9baaef4SDarrick J. Wong 	switch (goal) {
1710b9baaef4SDarrick J. Wong 	case XFS_ICWALK_DQRELE:
1711b9baaef4SDarrick J. Wong 		return xfs_dqrele_igrab(ip);
1712b9baaef4SDarrick J. Wong 	case XFS_ICWALK_BLOCKGC:
1713*7fdff526SDarrick J. Wong 		return xfs_blockgc_igrab(ip);
1714b9baaef4SDarrick J. Wong 	default:
1715b9baaef4SDarrick J. Wong 		return false;
1716b9baaef4SDarrick J. Wong 	}
1717b9baaef4SDarrick J. Wong }
1718b9baaef4SDarrick J. Wong 
1719b9baaef4SDarrick J. Wong /*
1720df600197SDarrick J. Wong  * For a given per-AG structure @pag, grab, @execute, and rele all incore
1721df600197SDarrick J. Wong  * inodes matching the given walk @goal.
1722df600197SDarrick J. Wong  */
1723df600197SDarrick J. Wong static int
1724c1115c0cSDarrick J. Wong xfs_icwalk_ag(
1725df600197SDarrick J. Wong 	struct xfs_perag	*pag,
1726df600197SDarrick J. Wong 	int			(*execute)(struct xfs_inode *ip, void *args),
1727df600197SDarrick J. Wong 	void			*args,
1728c809d7e9SDarrick J. Wong 	enum xfs_icwalk_goal	goal)
1729df600197SDarrick J. Wong {
1730df600197SDarrick J. Wong 	struct xfs_mount	*mp = pag->pag_mount;
1731df600197SDarrick J. Wong 	uint32_t		first_index;
1732df600197SDarrick J. Wong 	int			last_error = 0;
1733df600197SDarrick J. Wong 	int			skipped;
1734df600197SDarrick J. Wong 	bool			done;
1735df600197SDarrick J. Wong 	int			nr_found;
1736df600197SDarrick J. Wong 
1737df600197SDarrick J. Wong restart:
1738df600197SDarrick J. Wong 	done = false;
1739df600197SDarrick J. Wong 	skipped = 0;
1740df600197SDarrick J. Wong 	first_index = 0;
1741df600197SDarrick J. Wong 	nr_found = 0;
1742df600197SDarrick J. Wong 	do {
1743df600197SDarrick J. Wong 		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
1744c809d7e9SDarrick J. Wong 		unsigned int	tag = xfs_icwalk_tag(goal);
1745df600197SDarrick J. Wong 		int		error = 0;
1746df600197SDarrick J. Wong 		int		i;
1747df600197SDarrick J. Wong 
1748df600197SDarrick J. Wong 		rcu_read_lock();
1749df600197SDarrick J. Wong 
1750c809d7e9SDarrick J. Wong 		if (tag == XFS_ICWALK_NULL_TAG)
1751df600197SDarrick J. Wong 			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
1752df600197SDarrick J. Wong 					(void **)batch, first_index,
1753df600197SDarrick J. Wong 					XFS_LOOKUP_BATCH);
1754df600197SDarrick J. Wong 		else
1755df600197SDarrick J. Wong 			nr_found = radix_tree_gang_lookup_tag(
1756df600197SDarrick J. Wong 					&pag->pag_ici_root,
1757df600197SDarrick J. Wong 					(void **) batch, first_index,
1758df600197SDarrick J. Wong 					XFS_LOOKUP_BATCH, tag);
1759df600197SDarrick J. Wong 
1760df600197SDarrick J. Wong 		if (!nr_found) {
1761df600197SDarrick J. Wong 			rcu_read_unlock();
1762df600197SDarrick J. Wong 			break;
1763df600197SDarrick J. Wong 		}
1764df600197SDarrick J. Wong 
1765df600197SDarrick J. Wong 		/*
1766df600197SDarrick J. Wong 		 * Grab the inodes before we drop the lock. If we found
1767df600197SDarrick J. Wong 		 * nothing, nr == 0 and the loop will be skipped.
1768df600197SDarrick J. Wong 		 */
1769df600197SDarrick J. Wong 		for (i = 0; i < nr_found; i++) {
1770df600197SDarrick J. Wong 			struct xfs_inode *ip = batch[i];
1771df600197SDarrick J. Wong 
1772*7fdff526SDarrick J. Wong 			if (done || !xfs_icwalk_igrab(goal, ip))
1773df600197SDarrick J. Wong 				batch[i] = NULL;
1774df600197SDarrick J. Wong 
1775df600197SDarrick J. Wong 			/*
1776df600197SDarrick J. Wong 			 * Update the index for the next lookup. Catch
1777df600197SDarrick J. Wong 			 * overflows into the next AG range which can occur if
1778df600197SDarrick J. Wong 			 * we have inodes in the last block of the AG and we
1779df600197SDarrick J. Wong 			 * are currently pointing to the last inode.
1780df600197SDarrick J. Wong 			 *
1781df600197SDarrick J. Wong 			 * Because we may see inodes that are from the wrong AG
1782df600197SDarrick J. Wong 			 * due to RCU freeing and reallocation, only update the
1783df600197SDarrick J. Wong 			 * index if it lies in this AG. It was a race that led
1784df600197SDarrick J. Wong 			 * us to see this inode, so another lookup from the
1785df600197SDarrick J. Wong 			 * same index will not find it again.
1786df600197SDarrick J. Wong 			 */
1787df600197SDarrick J. Wong 			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
1788df600197SDarrick J. Wong 				continue;
1789df600197SDarrick J. Wong 			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
1790df600197SDarrick J. Wong 			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
1791df600197SDarrick J. Wong 				done = true;
1792df600197SDarrick J. Wong 		}
1793df600197SDarrick J. Wong 
1794df600197SDarrick J. Wong 		/* unlock now that we've grabbed the inodes. */
1795df600197SDarrick J. Wong 		rcu_read_unlock();
1796df600197SDarrick J. Wong 
1797df600197SDarrick J. Wong 		for (i = 0; i < nr_found; i++) {
1798df600197SDarrick J. Wong 			if (!batch[i])
1799df600197SDarrick J. Wong 				continue;
1800df600197SDarrick J. Wong 			error = execute(batch[i], args);
1801df600197SDarrick J. Wong 			xfs_irele(batch[i]);
1802df600197SDarrick J. Wong 			if (error == -EAGAIN) {
1803df600197SDarrick J. Wong 				skipped++;
1804df600197SDarrick J. Wong 				continue;
1805df600197SDarrick J. Wong 			}
1806df600197SDarrick J. Wong 			if (error && last_error != -EFSCORRUPTED)
1807df600197SDarrick J. Wong 				last_error = error;
1808df600197SDarrick J. Wong 		}
1809df600197SDarrick J. Wong 
1810df600197SDarrick J. Wong 		/* bail out if the filesystem is corrupted.  */
1811df600197SDarrick J. Wong 		if (error == -EFSCORRUPTED)
1812df600197SDarrick J. Wong 			break;
1813df600197SDarrick J. Wong 
1814df600197SDarrick J. Wong 		cond_resched();
1815df600197SDarrick J. Wong 
1816df600197SDarrick J. Wong 	} while (nr_found && !done);
1817df600197SDarrick J. Wong 
1818df600197SDarrick J. Wong 	if (skipped) {
1819df600197SDarrick J. Wong 		delay(1);
1820df600197SDarrick J. Wong 		goto restart;
1821df600197SDarrick J. Wong 	}
1822df600197SDarrick J. Wong 	return last_error;
1823df600197SDarrick J. Wong }
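
/*
 * Editor's sketch (not part of the original source): the shape of an
 * execute() callback for the walker above.  The walker takes a reference in
 * xfs_icwalk_igrab() before the call and drops it with xfs_irele()
 * afterwards, so the callback only does its own work and returns 0, or
 * -EAGAIN to have the inode revisited on the next pass.
 * example_count_inodes() is hypothetical; it would be passed to xfs_icwalk()
 * or xfs_icwalk_ag() along with a pointer to the counter as @args.
 */
static int
example_count_inodes(
	struct xfs_inode	*ip,
	void			*args)
{
	atomic_t		*count = args;

	atomic_inc(count);
	return 0;
}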
1824df600197SDarrick J. Wong 
1825df600197SDarrick J. Wong /* Fetch the next (possibly tagged) per-AG structure. */
1826df600197SDarrick J. Wong static inline struct xfs_perag *
1827c1115c0cSDarrick J. Wong xfs_icwalk_get_perag(
1828df600197SDarrick J. Wong 	struct xfs_mount	*mp,
1829df600197SDarrick J. Wong 	xfs_agnumber_t		agno,
1830c809d7e9SDarrick J. Wong 	enum xfs_icwalk_goal	goal)
1831df600197SDarrick J. Wong {
1832c809d7e9SDarrick J. Wong 	unsigned int		tag = xfs_icwalk_tag(goal);
1833c809d7e9SDarrick J. Wong 
1834c809d7e9SDarrick J. Wong 	if (tag == XFS_ICWALK_NULL_TAG)
1835df600197SDarrick J. Wong 		return xfs_perag_get(mp, agno);
1836df600197SDarrick J. Wong 	return xfs_perag_get_tag(mp, agno, tag);
1837df600197SDarrick J. Wong }
1838df600197SDarrick J. Wong 
1839df600197SDarrick J. Wong /*
1840df600197SDarrick J. Wong  * Call the @execute function on all incore inodes matching the radix tree
1841df600197SDarrick J. Wong  * Call the @execute function on all incore inodes matching the given walk
1842df600197SDarrick J. Wong  * @goal.
1843df600197SDarrick J. Wong static int
1844c1115c0cSDarrick J. Wong xfs_icwalk(
1845df600197SDarrick J. Wong 	struct xfs_mount	*mp,
1846df600197SDarrick J. Wong 	int			(*execute)(struct xfs_inode *ip, void *args),
1847df600197SDarrick J. Wong 	void			*args,
1848c809d7e9SDarrick J. Wong 	enum xfs_icwalk_goal	goal)
1849df600197SDarrick J. Wong {
1850df600197SDarrick J. Wong 	struct xfs_perag	*pag;
1851df600197SDarrick J. Wong 	int			error = 0;
1852df600197SDarrick J. Wong 	int			last_error = 0;
1853df600197SDarrick J. Wong 	xfs_agnumber_t		agno = 0;
1854df600197SDarrick J. Wong 
1855c809d7e9SDarrick J. Wong 	while ((pag = xfs_icwalk_get_perag(mp, agno, goal))) {
1856df600197SDarrick J. Wong 		agno = pag->pag_agno + 1;
1857*7fdff526SDarrick J. Wong 		error = xfs_icwalk_ag(pag, execute, args, goal);
1858df600197SDarrick J. Wong 		xfs_perag_put(pag);
1859df600197SDarrick J. Wong 		if (error) {
1860df600197SDarrick J. Wong 			last_error = error;
1861df600197SDarrick J. Wong 			if (error == -EFSCORRUPTED)
1862df600197SDarrick J. Wong 				break;
1863df600197SDarrick J. Wong 		}
1864df600197SDarrick J. Wong 	}
1865df600197SDarrick J. Wong 	return last_error;
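	/*
	 * Editor's note: BUILD_BUG_ON() is a compile-time assertion, so the
	 * check below is still evaluated at build time even though it sits
	 * after the return statement and is never reached at run time.
	 */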
1866df600197SDarrick J. Wong 	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_EOF_FLAGS_VALID);
1867df600197SDarrick J. Wong }
1868