16d8b79cfSDave Chinner /* 26d8b79cfSDave Chinner * Copyright (c) 2000-2005 Silicon Graphics, Inc. 36d8b79cfSDave Chinner * All Rights Reserved. 46d8b79cfSDave Chinner * 56d8b79cfSDave Chinner * This program is free software; you can redistribute it and/or 66d8b79cfSDave Chinner * modify it under the terms of the GNU General Public License as 76d8b79cfSDave Chinner * published by the Free Software Foundation. 86d8b79cfSDave Chinner * 96d8b79cfSDave Chinner * This program is distributed in the hope that it would be useful, 106d8b79cfSDave Chinner * but WITHOUT ANY WARRANTY; without even the implied warranty of 116d8b79cfSDave Chinner * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 126d8b79cfSDave Chinner * GNU General Public License for more details. 136d8b79cfSDave Chinner * 146d8b79cfSDave Chinner * You should have received a copy of the GNU General Public License 156d8b79cfSDave Chinner * along with this program; if not, write the Free Software Foundation, 166d8b79cfSDave Chinner * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 176d8b79cfSDave Chinner */ 186d8b79cfSDave Chinner #include "xfs.h" 196d8b79cfSDave Chinner #include "xfs_fs.h" 206d8b79cfSDave Chinner #include "xfs_types.h" 216d8b79cfSDave Chinner #include "xfs_log.h" 226d8b79cfSDave Chinner #include "xfs_log_priv.h" 236d8b79cfSDave Chinner #include "xfs_inum.h" 246d8b79cfSDave Chinner #include "xfs_trans.h" 256d8b79cfSDave Chinner #include "xfs_trans_priv.h" 266d8b79cfSDave Chinner #include "xfs_sb.h" 276d8b79cfSDave Chinner #include "xfs_ag.h" 286d8b79cfSDave Chinner #include "xfs_mount.h" 296d8b79cfSDave Chinner #include "xfs_bmap_btree.h" 306d8b79cfSDave Chinner #include "xfs_inode.h" 316d8b79cfSDave Chinner #include "xfs_dinode.h" 326d8b79cfSDave Chinner #include "xfs_error.h" 336d8b79cfSDave Chinner #include "xfs_filestream.h" 346d8b79cfSDave Chinner #include "xfs_vnodeops.h" 356d8b79cfSDave Chinner #include "xfs_inode_item.h" 366d8b79cfSDave Chinner #include "xfs_quota.h" 376d8b79cfSDave Chinner #include "xfs_trace.h" 386d8b79cfSDave Chinner #include "xfs_fsops.h" 396d8b79cfSDave Chinner #include "xfs_icache.h" 406d8b79cfSDave Chinner 416d8b79cfSDave Chinner #include <linux/kthread.h> 426d8b79cfSDave Chinner #include <linux/freezer.h> 436d8b79cfSDave Chinner 4433479e05SDave Chinner STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, 4533479e05SDave Chinner struct xfs_perag *pag, struct xfs_inode *ip); 4633479e05SDave Chinner 4733479e05SDave Chinner /* 4833479e05SDave Chinner * Allocate and initialise an xfs_inode. 4933479e05SDave Chinner */ 5033479e05SDave Chinner STATIC struct xfs_inode * 5133479e05SDave Chinner xfs_inode_alloc( 5233479e05SDave Chinner struct xfs_mount *mp, 5333479e05SDave Chinner xfs_ino_t ino) 5433479e05SDave Chinner { 5533479e05SDave Chinner struct xfs_inode *ip; 5633479e05SDave Chinner 5733479e05SDave Chinner /* 5833479e05SDave Chinner * if this didn't occur in transactions, we could use 5933479e05SDave Chinner * KM_MAYFAIL and return NULL here on ENOMEM. Set the 6033479e05SDave Chinner * code up to do this anyway. 
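 *
 * That is also why the !ip check below is kept: a KM_SLEEP allocation is
 * not expected to return NULL today, but the check is harmless and is what
 * a KM_MAYFAIL conversion would rely on.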
6133479e05SDave Chinner */ 6233479e05SDave Chinner ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 6333479e05SDave Chinner if (!ip) 6433479e05SDave Chinner return NULL; 6533479e05SDave Chinner if (inode_init_always(mp->m_super, VFS_I(ip))) { 6633479e05SDave Chinner kmem_zone_free(xfs_inode_zone, ip); 6733479e05SDave Chinner return NULL; 6833479e05SDave Chinner } 6933479e05SDave Chinner 7033479e05SDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 7133479e05SDave Chinner ASSERT(!spin_is_locked(&ip->i_flags_lock)); 7233479e05SDave Chinner ASSERT(!xfs_isiflocked(ip)); 7333479e05SDave Chinner ASSERT(ip->i_ino == 0); 7433479e05SDave Chinner 7533479e05SDave Chinner mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 7633479e05SDave Chinner 7733479e05SDave Chinner /* initialise the xfs inode */ 7833479e05SDave Chinner ip->i_ino = ino; 7933479e05SDave Chinner ip->i_mount = mp; 8033479e05SDave Chinner memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 8133479e05SDave Chinner ip->i_afp = NULL; 8233479e05SDave Chinner memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 8333479e05SDave Chinner ip->i_flags = 0; 8433479e05SDave Chinner ip->i_delayed_blks = 0; 8533479e05SDave Chinner memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); 8633479e05SDave Chinner 8733479e05SDave Chinner return ip; 8833479e05SDave Chinner } 8933479e05SDave Chinner 9033479e05SDave Chinner STATIC void 9133479e05SDave Chinner xfs_inode_free_callback( 9233479e05SDave Chinner struct rcu_head *head) 9333479e05SDave Chinner { 9433479e05SDave Chinner struct inode *inode = container_of(head, struct inode, i_rcu); 9533479e05SDave Chinner struct xfs_inode *ip = XFS_I(inode); 9633479e05SDave Chinner 9733479e05SDave Chinner kmem_zone_free(xfs_inode_zone, ip); 9833479e05SDave Chinner } 9933479e05SDave Chinner 10033479e05SDave Chinner STATIC void 10133479e05SDave Chinner xfs_inode_free( 10233479e05SDave Chinner struct xfs_inode *ip) 10333479e05SDave Chinner { 10433479e05SDave Chinner switch (ip->i_d.di_mode & S_IFMT) { 10533479e05SDave Chinner case S_IFREG: 10633479e05SDave Chinner case S_IFDIR: 10733479e05SDave Chinner case S_IFLNK: 10833479e05SDave Chinner xfs_idestroy_fork(ip, XFS_DATA_FORK); 10933479e05SDave Chinner break; 11033479e05SDave Chinner } 11133479e05SDave Chinner 11233479e05SDave Chinner if (ip->i_afp) 11333479e05SDave Chinner xfs_idestroy_fork(ip, XFS_ATTR_FORK); 11433479e05SDave Chinner 11533479e05SDave Chinner if (ip->i_itemp) { 11633479e05SDave Chinner ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); 11733479e05SDave Chinner xfs_inode_item_destroy(ip); 11833479e05SDave Chinner ip->i_itemp = NULL; 11933479e05SDave Chinner } 12033479e05SDave Chinner 12133479e05SDave Chinner /* asserts to verify all state is correct here */ 12233479e05SDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 12333479e05SDave Chinner ASSERT(!spin_is_locked(&ip->i_flags_lock)); 12433479e05SDave Chinner ASSERT(!xfs_isiflocked(ip)); 12533479e05SDave Chinner 12633479e05SDave Chinner /* 12733479e05SDave Chinner * Because we use RCU freeing we need to ensure the inode always 12833479e05SDave Chinner * appears to be reclaimed with an invalid inode number when in the 12933479e05SDave Chinner * free state. The ip->i_flags_lock provides the barrier against lookup 13033479e05SDave Chinner * races. 
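 *
 * Concretely, the lookup side (xfs_iget_cache_hit() and
 * xfs_inode_ag_walk_grab() below) takes ip->i_flags_lock and then checks
 * ip->i_ino and the XFS_IRECLAIM flag, so a racing lookup either runs
 * before we zero the inode number here or sees the zeroed number and
 * backs off.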
 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(xs_ig_frecycle);
		error = EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 * wait_on_inode to wait for these flags to be cleared
	 * instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(xs_ig_frecycle);
		error = EAGAIN;
		goto out_error;
	}

	/*
	 * If lookup is racing with unlink return an error immediately.
	 */
	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
		error = ENOENT;
		goto out_error;
	}

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into useable state.
19933479e05SDave Chinner */ 20033479e05SDave Chinner if (ip->i_flags & XFS_IRECLAIMABLE) { 20133479e05SDave Chinner trace_xfs_iget_reclaim(ip); 20233479e05SDave Chinner 20333479e05SDave Chinner /* 20433479e05SDave Chinner * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode 20533479e05SDave Chinner * from stomping over us while we recycle the inode. We can't 20633479e05SDave Chinner * clear the radix tree reclaimable tag yet as it requires 20733479e05SDave Chinner * pag_ici_lock to be held exclusive. 20833479e05SDave Chinner */ 20933479e05SDave Chinner ip->i_flags |= XFS_IRECLAIM; 21033479e05SDave Chinner 21133479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 21233479e05SDave Chinner rcu_read_unlock(); 21333479e05SDave Chinner 21433479e05SDave Chinner error = -inode_init_always(mp->m_super, inode); 21533479e05SDave Chinner if (error) { 21633479e05SDave Chinner /* 21733479e05SDave Chinner * Re-initializing the inode failed, and we are in deep 21833479e05SDave Chinner * trouble. Try to re-add it to the reclaim list. 21933479e05SDave Chinner */ 22033479e05SDave Chinner rcu_read_lock(); 22133479e05SDave Chinner spin_lock(&ip->i_flags_lock); 22233479e05SDave Chinner 22333479e05SDave Chinner ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 22433479e05SDave Chinner ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 22533479e05SDave Chinner trace_xfs_iget_reclaim_fail(ip); 22633479e05SDave Chinner goto out_error; 22733479e05SDave Chinner } 22833479e05SDave Chinner 22933479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 23033479e05SDave Chinner spin_lock(&ip->i_flags_lock); 23133479e05SDave Chinner 23233479e05SDave Chinner /* 23333479e05SDave Chinner * Clear the per-lifetime state in the inode as we are now 23433479e05SDave Chinner * effectively a new inode and need to return to the initial 23533479e05SDave Chinner * state before reuse occurs. 23633479e05SDave Chinner */ 23733479e05SDave Chinner ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 23833479e05SDave Chinner ip->i_flags |= XFS_INEW; 23933479e05SDave Chinner __xfs_inode_clear_reclaim_tag(mp, pag, ip); 24033479e05SDave Chinner inode->i_state = I_NEW; 24133479e05SDave Chinner 24233479e05SDave Chinner ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 24333479e05SDave Chinner mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 24433479e05SDave Chinner 24533479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 24633479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 24733479e05SDave Chinner } else { 24833479e05SDave Chinner /* If the VFS inode is being torn down, pause and try again. */ 24933479e05SDave Chinner if (!igrab(inode)) { 25033479e05SDave Chinner trace_xfs_iget_skip(ip); 25133479e05SDave Chinner error = EAGAIN; 25233479e05SDave Chinner goto out_error; 25333479e05SDave Chinner } 25433479e05SDave Chinner 25533479e05SDave Chinner /* We've got a live one. 
*/ 25633479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 25733479e05SDave Chinner rcu_read_unlock(); 25833479e05SDave Chinner trace_xfs_iget_hit(ip); 25933479e05SDave Chinner } 26033479e05SDave Chinner 26133479e05SDave Chinner if (lock_flags != 0) 26233479e05SDave Chinner xfs_ilock(ip, lock_flags); 26333479e05SDave Chinner 26433479e05SDave Chinner xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); 26533479e05SDave Chinner XFS_STATS_INC(xs_ig_found); 26633479e05SDave Chinner 26733479e05SDave Chinner return 0; 26833479e05SDave Chinner 26933479e05SDave Chinner out_error: 27033479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 27133479e05SDave Chinner rcu_read_unlock(); 27233479e05SDave Chinner return error; 27333479e05SDave Chinner } 27433479e05SDave Chinner 27533479e05SDave Chinner 27633479e05SDave Chinner static int 27733479e05SDave Chinner xfs_iget_cache_miss( 27833479e05SDave Chinner struct xfs_mount *mp, 27933479e05SDave Chinner struct xfs_perag *pag, 28033479e05SDave Chinner xfs_trans_t *tp, 28133479e05SDave Chinner xfs_ino_t ino, 28233479e05SDave Chinner struct xfs_inode **ipp, 28333479e05SDave Chinner int flags, 28433479e05SDave Chinner int lock_flags) 28533479e05SDave Chinner { 28633479e05SDave Chinner struct xfs_inode *ip; 28733479e05SDave Chinner int error; 28833479e05SDave Chinner xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 28933479e05SDave Chinner int iflags; 29033479e05SDave Chinner 29133479e05SDave Chinner ip = xfs_inode_alloc(mp, ino); 29233479e05SDave Chinner if (!ip) 29333479e05SDave Chinner return ENOMEM; 29433479e05SDave Chinner 29533479e05SDave Chinner error = xfs_iread(mp, tp, ip, flags); 29633479e05SDave Chinner if (error) 29733479e05SDave Chinner goto out_destroy; 29833479e05SDave Chinner 29933479e05SDave Chinner trace_xfs_iget_miss(ip); 30033479e05SDave Chinner 30133479e05SDave Chinner if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { 30233479e05SDave Chinner error = ENOENT; 30333479e05SDave Chinner goto out_destroy; 30433479e05SDave Chinner } 30533479e05SDave Chinner 30633479e05SDave Chinner /* 30733479e05SDave Chinner * Preload the radix tree so we can insert safely under the 30833479e05SDave Chinner * write spinlock. Note that we cannot sleep inside the preload 30933479e05SDave Chinner * region. Since we can be called from transaction context, don't 31033479e05SDave Chinner * recurse into the file system. 31133479e05SDave Chinner */ 31233479e05SDave Chinner if (radix_tree_preload(GFP_NOFS)) { 31333479e05SDave Chinner error = EAGAIN; 31433479e05SDave Chinner goto out_destroy; 31533479e05SDave Chinner } 31633479e05SDave Chinner 31733479e05SDave Chinner /* 31833479e05SDave Chinner * Because the inode hasn't been added to the radix-tree yet it can't 31933479e05SDave Chinner * be found by another thread, so we can do the non-sleeping lock here. 32033479e05SDave Chinner */ 32133479e05SDave Chinner if (lock_flags) { 32233479e05SDave Chinner if (!xfs_ilock_nowait(ip, lock_flags)) 32333479e05SDave Chinner BUG(); 32433479e05SDave Chinner } 32533479e05SDave Chinner 32633479e05SDave Chinner /* 32733479e05SDave Chinner * These values must be set before inserting the inode into the radix 32833479e05SDave Chinner * tree as the moment it is inserted a concurrent lookup (allowed by the 32933479e05SDave Chinner * RCU locking mechanism) can find it and that lookup must see that this 33033479e05SDave Chinner * is an inode currently under construction (i.e. that XFS_INEW is set). 
33133479e05SDave Chinner * The ip->i_flags_lock that protects the XFS_INEW flag forms the 33233479e05SDave Chinner * memory barrier that ensures this detection works correctly at lookup 33333479e05SDave Chinner * time. 33433479e05SDave Chinner */ 33533479e05SDave Chinner iflags = XFS_INEW; 33633479e05SDave Chinner if (flags & XFS_IGET_DONTCACHE) 33733479e05SDave Chinner iflags |= XFS_IDONTCACHE; 33833479e05SDave Chinner ip->i_udquot = ip->i_gdquot = NULL; 33933479e05SDave Chinner xfs_iflags_set(ip, iflags); 34033479e05SDave Chinner 34133479e05SDave Chinner /* insert the new inode */ 34233479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 34333479e05SDave Chinner error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 34433479e05SDave Chinner if (unlikely(error)) { 34533479e05SDave Chinner WARN_ON(error != -EEXIST); 34633479e05SDave Chinner XFS_STATS_INC(xs_ig_dup); 34733479e05SDave Chinner error = EAGAIN; 34833479e05SDave Chinner goto out_preload_end; 34933479e05SDave Chinner } 35033479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 35133479e05SDave Chinner radix_tree_preload_end(); 35233479e05SDave Chinner 35333479e05SDave Chinner *ipp = ip; 35433479e05SDave Chinner return 0; 35533479e05SDave Chinner 35633479e05SDave Chinner out_preload_end: 35733479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 35833479e05SDave Chinner radix_tree_preload_end(); 35933479e05SDave Chinner if (lock_flags) 36033479e05SDave Chinner xfs_iunlock(ip, lock_flags); 36133479e05SDave Chinner out_destroy: 36233479e05SDave Chinner __destroy_inode(VFS_I(ip)); 36333479e05SDave Chinner xfs_inode_free(ip); 36433479e05SDave Chinner return error; 36533479e05SDave Chinner } 36633479e05SDave Chinner 36733479e05SDave Chinner /* 36833479e05SDave Chinner * Look up an inode by number in the given file system. 36933479e05SDave Chinner * The inode is looked up in the cache held in each AG. 37033479e05SDave Chinner * If the inode is found in the cache, initialise the vfs inode 37133479e05SDave Chinner * if necessary. 37233479e05SDave Chinner * 37333479e05SDave Chinner * If it is not in core, read it in from the file system's device, 37433479e05SDave Chinner * add it to the cache and initialise the vfs inode. 37533479e05SDave Chinner * 37633479e05SDave Chinner * The inode is locked according to the value of the lock_flags parameter. 37733479e05SDave Chinner * This flag parameter indicates how and if the inode's IO lock and inode lock 37833479e05SDave Chinner * should be taken. 37933479e05SDave Chinner * 38033479e05SDave Chinner * mp -- the mount point structure for the current file system. It points 38133479e05SDave Chinner * to the inode hash table. 38233479e05SDave Chinner * tp -- a pointer to the current transaction if there is one. This is 38333479e05SDave Chinner * simply passed through to the xfs_iread() call. 38433479e05SDave Chinner * ino -- the number of the inode desired. This is the unique identifier 38533479e05SDave Chinner * within the file system for the inode being requested. 38633479e05SDave Chinner * lock_flags -- flags indicating how to lock the inode. See the comment 38733479e05SDave Chinner * for xfs_ilock() for a list of valid values. 
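 *
 * A minimal, illustrative call from lookup context (no transaction, no
 * inode locks taken by xfs_iget itself) would look roughly like:
 *
 *	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
 *	if (error)
 *		return XFS_ERROR(error);
 *	...
 *	IRELE(ip);
 *
 * This is a sketch only; real callers pick flags and lock_flags to suit
 * their context.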
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here. It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return EINVAL;

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		XFS_STATS_INC(xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
	 * now. If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
		xfs_setup_inode(ip);
	return 0;

out_error_or_again:
	if (error == EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway. If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim.
*/ 4986d8b79cfSDave Chinner if (!igrab(inode)) 4996d8b79cfSDave Chinner return ENOENT; 5006d8b79cfSDave Chinner 5016d8b79cfSDave Chinner if (is_bad_inode(inode)) { 5026d8b79cfSDave Chinner IRELE(ip); 5036d8b79cfSDave Chinner return ENOENT; 5046d8b79cfSDave Chinner } 5056d8b79cfSDave Chinner 5066d8b79cfSDave Chinner /* inode is valid */ 5076d8b79cfSDave Chinner return 0; 5086d8b79cfSDave Chinner 5096d8b79cfSDave Chinner out_unlock_noent: 5106d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 5116d8b79cfSDave Chinner return ENOENT; 5126d8b79cfSDave Chinner } 5136d8b79cfSDave Chinner 5146d8b79cfSDave Chinner STATIC int 5156d8b79cfSDave Chinner xfs_inode_ag_walk( 5166d8b79cfSDave Chinner struct xfs_mount *mp, 5176d8b79cfSDave Chinner struct xfs_perag *pag, 5186d8b79cfSDave Chinner int (*execute)(struct xfs_inode *ip, 519a454f742SBrian Foster struct xfs_perag *pag, int flags, 520a454f742SBrian Foster void *args), 521a454f742SBrian Foster int flags, 522a454f742SBrian Foster void *args, 523a454f742SBrian Foster int tag) 5246d8b79cfSDave Chinner { 5256d8b79cfSDave Chinner uint32_t first_index; 5266d8b79cfSDave Chinner int last_error = 0; 5276d8b79cfSDave Chinner int skipped; 5286d8b79cfSDave Chinner int done; 5296d8b79cfSDave Chinner int nr_found; 5306d8b79cfSDave Chinner 5316d8b79cfSDave Chinner restart: 5326d8b79cfSDave Chinner done = 0; 5336d8b79cfSDave Chinner skipped = 0; 5346d8b79cfSDave Chinner first_index = 0; 5356d8b79cfSDave Chinner nr_found = 0; 5366d8b79cfSDave Chinner do { 5376d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 5386d8b79cfSDave Chinner int error = 0; 5396d8b79cfSDave Chinner int i; 5406d8b79cfSDave Chinner 5416d8b79cfSDave Chinner rcu_read_lock(); 542a454f742SBrian Foster 543a454f742SBrian Foster if (tag == -1) 5446d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 5456d8b79cfSDave Chinner (void **)batch, first_index, 5466d8b79cfSDave Chinner XFS_LOOKUP_BATCH); 547a454f742SBrian Foster else 548a454f742SBrian Foster nr_found = radix_tree_gang_lookup_tag( 549a454f742SBrian Foster &pag->pag_ici_root, 550a454f742SBrian Foster (void **) batch, first_index, 551a454f742SBrian Foster XFS_LOOKUP_BATCH, tag); 552a454f742SBrian Foster 5536d8b79cfSDave Chinner if (!nr_found) { 5546d8b79cfSDave Chinner rcu_read_unlock(); 5556d8b79cfSDave Chinner break; 5566d8b79cfSDave Chinner } 5576d8b79cfSDave Chinner 5586d8b79cfSDave Chinner /* 5596d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 5606d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 5616d8b79cfSDave Chinner */ 5626d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 5636d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 5646d8b79cfSDave Chinner 5656d8b79cfSDave Chinner if (done || xfs_inode_ag_walk_grab(ip)) 5666d8b79cfSDave Chinner batch[i] = NULL; 5676d8b79cfSDave Chinner 5686d8b79cfSDave Chinner /* 5696d8b79cfSDave Chinner * Update the index for the next lookup. Catch 5706d8b79cfSDave Chinner * overflows into the next AG range which can occur if 5716d8b79cfSDave Chinner * we have inodes in the last block of the AG and we 5726d8b79cfSDave Chinner * are currently pointing to the last inode. 5736d8b79cfSDave Chinner * 5746d8b79cfSDave Chinner * Because we may see inodes that are from the wrong AG 5756d8b79cfSDave Chinner * due to RCU freeing and reallocation, only update the 5766d8b79cfSDave Chinner * index if it lies in this AG. 
It was a race that lead 5776d8b79cfSDave Chinner * us to see this inode, so another lookup from the 5786d8b79cfSDave Chinner * same index will not find it again. 5796d8b79cfSDave Chinner */ 5806d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 5816d8b79cfSDave Chinner continue; 5826d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 5836d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 5846d8b79cfSDave Chinner done = 1; 5856d8b79cfSDave Chinner } 5866d8b79cfSDave Chinner 5876d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 5886d8b79cfSDave Chinner rcu_read_unlock(); 5896d8b79cfSDave Chinner 5906d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 5916d8b79cfSDave Chinner if (!batch[i]) 5926d8b79cfSDave Chinner continue; 593a454f742SBrian Foster error = execute(batch[i], pag, flags, args); 5946d8b79cfSDave Chinner IRELE(batch[i]); 5956d8b79cfSDave Chinner if (error == EAGAIN) { 5966d8b79cfSDave Chinner skipped++; 5976d8b79cfSDave Chinner continue; 5986d8b79cfSDave Chinner } 5996d8b79cfSDave Chinner if (error && last_error != EFSCORRUPTED) 6006d8b79cfSDave Chinner last_error = error; 6016d8b79cfSDave Chinner } 6026d8b79cfSDave Chinner 6036d8b79cfSDave Chinner /* bail out if the filesystem is corrupted. */ 6046d8b79cfSDave Chinner if (error == EFSCORRUPTED) 6056d8b79cfSDave Chinner break; 6066d8b79cfSDave Chinner 6076d8b79cfSDave Chinner cond_resched(); 6086d8b79cfSDave Chinner 6096d8b79cfSDave Chinner } while (nr_found && !done); 6106d8b79cfSDave Chinner 6116d8b79cfSDave Chinner if (skipped) { 6126d8b79cfSDave Chinner delay(1); 6136d8b79cfSDave Chinner goto restart; 6146d8b79cfSDave Chinner } 6156d8b79cfSDave Chinner return last_error; 6166d8b79cfSDave Chinner } 6176d8b79cfSDave Chinner 6186d8b79cfSDave Chinner int 6196d8b79cfSDave Chinner xfs_inode_ag_iterator( 6206d8b79cfSDave Chinner struct xfs_mount *mp, 6216d8b79cfSDave Chinner int (*execute)(struct xfs_inode *ip, 622a454f742SBrian Foster struct xfs_perag *pag, int flags, 623a454f742SBrian Foster void *args), 624a454f742SBrian Foster int flags, 625a454f742SBrian Foster void *args) 6266d8b79cfSDave Chinner { 6276d8b79cfSDave Chinner struct xfs_perag *pag; 6286d8b79cfSDave Chinner int error = 0; 6296d8b79cfSDave Chinner int last_error = 0; 6306d8b79cfSDave Chinner xfs_agnumber_t ag; 6316d8b79cfSDave Chinner 6326d8b79cfSDave Chinner ag = 0; 6336d8b79cfSDave Chinner while ((pag = xfs_perag_get(mp, ag))) { 6346d8b79cfSDave Chinner ag = pag->pag_agno + 1; 635a454f742SBrian Foster error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); 636a454f742SBrian Foster xfs_perag_put(pag); 637a454f742SBrian Foster if (error) { 638a454f742SBrian Foster last_error = error; 639a454f742SBrian Foster if (error == EFSCORRUPTED) 640a454f742SBrian Foster break; 641a454f742SBrian Foster } 642a454f742SBrian Foster } 643a454f742SBrian Foster return XFS_ERROR(last_error); 644a454f742SBrian Foster } 645a454f742SBrian Foster 646a454f742SBrian Foster int 647a454f742SBrian Foster xfs_inode_ag_iterator_tag( 648a454f742SBrian Foster struct xfs_mount *mp, 649a454f742SBrian Foster int (*execute)(struct xfs_inode *ip, 650a454f742SBrian Foster struct xfs_perag *pag, int flags, 651a454f742SBrian Foster void *args), 652a454f742SBrian Foster int flags, 653a454f742SBrian Foster void *args, 654a454f742SBrian Foster int tag) 655a454f742SBrian Foster { 656a454f742SBrian Foster struct xfs_perag *pag; 657a454f742SBrian Foster int error = 0; 658a454f742SBrian Foster int last_error = 0; 
	xfs_agnumber_t	ag;

	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
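 *
 * (With the default xfs_syncd_centisecs of 3000, the requeue interval used
 * in xfs_reclaim_work_queue() above works out to 3000 / 6 * 10 = 5000ms,
 * i.e. the 5 second period mentioned there.)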
7016d8b79cfSDave Chinner */ 7026d8b79cfSDave Chinner void 7036d8b79cfSDave Chinner xfs_reclaim_worker( 7046d8b79cfSDave Chinner struct work_struct *work) 7056d8b79cfSDave Chinner { 7066d8b79cfSDave Chinner struct xfs_mount *mp = container_of(to_delayed_work(work), 7076d8b79cfSDave Chinner struct xfs_mount, m_reclaim_work); 7086d8b79cfSDave Chinner 7096d8b79cfSDave Chinner xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 7106d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 7116d8b79cfSDave Chinner } 7126d8b79cfSDave Chinner 71333479e05SDave Chinner static void 7146d8b79cfSDave Chinner __xfs_inode_set_reclaim_tag( 7156d8b79cfSDave Chinner struct xfs_perag *pag, 7166d8b79cfSDave Chinner struct xfs_inode *ip) 7176d8b79cfSDave Chinner { 7186d8b79cfSDave Chinner radix_tree_tag_set(&pag->pag_ici_root, 7196d8b79cfSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 7206d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 7216d8b79cfSDave Chinner 7226d8b79cfSDave Chinner if (!pag->pag_ici_reclaimable) { 7236d8b79cfSDave Chinner /* propagate the reclaim tag up into the perag radix tree */ 7246d8b79cfSDave Chinner spin_lock(&ip->i_mount->m_perag_lock); 7256d8b79cfSDave Chinner radix_tree_tag_set(&ip->i_mount->m_perag_tree, 7266d8b79cfSDave Chinner XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 7276d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 7286d8b79cfSDave Chinner spin_unlock(&ip->i_mount->m_perag_lock); 7296d8b79cfSDave Chinner 7306d8b79cfSDave Chinner /* schedule periodic background inode reclaim */ 7316d8b79cfSDave Chinner xfs_reclaim_work_queue(ip->i_mount); 7326d8b79cfSDave Chinner 7336d8b79cfSDave Chinner trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, 7346d8b79cfSDave Chinner -1, _RET_IP_); 7356d8b79cfSDave Chinner } 7366d8b79cfSDave Chinner pag->pag_ici_reclaimable++; 7376d8b79cfSDave Chinner } 7386d8b79cfSDave Chinner 7396d8b79cfSDave Chinner /* 7406d8b79cfSDave Chinner * We set the inode flag atomically with the radix tree tag. 7416d8b79cfSDave Chinner * Once we get tag lookups on the radix tree, this inode flag 7426d8b79cfSDave Chinner * can go away. 
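 *
 * Note that xfs_inode_set_reclaim_tag() below holds both pag->pag_ici_lock
 * and ip->i_flags_lock across the tag and flag updates, which is what lets
 * the lookup and reclaim paths treat the two as a single atomic change.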
7436d8b79cfSDave Chinner */ 7446d8b79cfSDave Chinner void 7456d8b79cfSDave Chinner xfs_inode_set_reclaim_tag( 7466d8b79cfSDave Chinner xfs_inode_t *ip) 7476d8b79cfSDave Chinner { 7486d8b79cfSDave Chinner struct xfs_mount *mp = ip->i_mount; 7496d8b79cfSDave Chinner struct xfs_perag *pag; 7506d8b79cfSDave Chinner 7516d8b79cfSDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 7526d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 7536d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 7546d8b79cfSDave Chinner __xfs_inode_set_reclaim_tag(pag, ip); 7556d8b79cfSDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 7566d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 7576d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 7586d8b79cfSDave Chinner xfs_perag_put(pag); 7596d8b79cfSDave Chinner } 7606d8b79cfSDave Chinner 7616d8b79cfSDave Chinner STATIC void 7626d8b79cfSDave Chinner __xfs_inode_clear_reclaim( 7636d8b79cfSDave Chinner xfs_perag_t *pag, 7646d8b79cfSDave Chinner xfs_inode_t *ip) 7656d8b79cfSDave Chinner { 7666d8b79cfSDave Chinner pag->pag_ici_reclaimable--; 7676d8b79cfSDave Chinner if (!pag->pag_ici_reclaimable) { 7686d8b79cfSDave Chinner /* clear the reclaim tag from the perag radix tree */ 7696d8b79cfSDave Chinner spin_lock(&ip->i_mount->m_perag_lock); 7706d8b79cfSDave Chinner radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 7716d8b79cfSDave Chinner XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 7726d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 7736d8b79cfSDave Chinner spin_unlock(&ip->i_mount->m_perag_lock); 7746d8b79cfSDave Chinner trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno, 7756d8b79cfSDave Chinner -1, _RET_IP_); 7766d8b79cfSDave Chinner } 7776d8b79cfSDave Chinner } 7786d8b79cfSDave Chinner 77933479e05SDave Chinner STATIC void 7806d8b79cfSDave Chinner __xfs_inode_clear_reclaim_tag( 7816d8b79cfSDave Chinner xfs_mount_t *mp, 7826d8b79cfSDave Chinner xfs_perag_t *pag, 7836d8b79cfSDave Chinner xfs_inode_t *ip) 7846d8b79cfSDave Chinner { 7856d8b79cfSDave Chinner radix_tree_tag_clear(&pag->pag_ici_root, 7866d8b79cfSDave Chinner XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); 7876d8b79cfSDave Chinner __xfs_inode_clear_reclaim(pag, ip); 7886d8b79cfSDave Chinner } 7896d8b79cfSDave Chinner 7906d8b79cfSDave Chinner /* 7916d8b79cfSDave Chinner * Grab the inode for reclaim exclusively. 7926d8b79cfSDave Chinner * Return 0 if we grabbed it, non-zero otherwise. 7936d8b79cfSDave Chinner */ 7946d8b79cfSDave Chinner STATIC int 7956d8b79cfSDave Chinner xfs_reclaim_inode_grab( 7966d8b79cfSDave Chinner struct xfs_inode *ip, 7976d8b79cfSDave Chinner int flags) 7986d8b79cfSDave Chinner { 7996d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 8006d8b79cfSDave Chinner 8016d8b79cfSDave Chinner /* quick check for stale RCU freed inode */ 8026d8b79cfSDave Chinner if (!ip->i_ino) 8036d8b79cfSDave Chinner return 1; 8046d8b79cfSDave Chinner 8056d8b79cfSDave Chinner /* 8066d8b79cfSDave Chinner * If we are asked for non-blocking operation, do unlocked checks to 8076d8b79cfSDave Chinner * see if the inode already is being flushed or in reclaim to avoid 8086d8b79cfSDave Chinner * lock traffic. 
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode. Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check the
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies. Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting. For background reclaim, we only
 * bother to reclaim clean inodes anyway.
8646d8b79cfSDave Chinner * 8656d8b79cfSDave Chinner * Hence the order of actions after gaining the locks should be: 8666d8b79cfSDave Chinner * bad => reclaim 8676d8b79cfSDave Chinner * shutdown => unpin and reclaim 8686d8b79cfSDave Chinner * pinned, async => requeue 8696d8b79cfSDave Chinner * pinned, sync => unpin 8706d8b79cfSDave Chinner * stale => reclaim 8716d8b79cfSDave Chinner * clean => reclaim 8726d8b79cfSDave Chinner * dirty, async => requeue 8736d8b79cfSDave Chinner * dirty, sync => flush, wait and reclaim 8746d8b79cfSDave Chinner */ 8756d8b79cfSDave Chinner STATIC int 8766d8b79cfSDave Chinner xfs_reclaim_inode( 8776d8b79cfSDave Chinner struct xfs_inode *ip, 8786d8b79cfSDave Chinner struct xfs_perag *pag, 8796d8b79cfSDave Chinner int sync_mode) 8806d8b79cfSDave Chinner { 8816d8b79cfSDave Chinner struct xfs_buf *bp = NULL; 8826d8b79cfSDave Chinner int error; 8836d8b79cfSDave Chinner 8846d8b79cfSDave Chinner restart: 8856d8b79cfSDave Chinner error = 0; 8866d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 8876d8b79cfSDave Chinner if (!xfs_iflock_nowait(ip)) { 8886d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 8896d8b79cfSDave Chinner goto out; 8906d8b79cfSDave Chinner xfs_iflock(ip); 8916d8b79cfSDave Chinner } 8926d8b79cfSDave Chinner 8936d8b79cfSDave Chinner if (is_bad_inode(VFS_I(ip))) 8946d8b79cfSDave Chinner goto reclaim; 8956d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 8966d8b79cfSDave Chinner xfs_iunpin_wait(ip); 8976d8b79cfSDave Chinner xfs_iflush_abort(ip, false); 8986d8b79cfSDave Chinner goto reclaim; 8996d8b79cfSDave Chinner } 9006d8b79cfSDave Chinner if (xfs_ipincount(ip)) { 9016d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 9026d8b79cfSDave Chinner goto out_ifunlock; 9036d8b79cfSDave Chinner xfs_iunpin_wait(ip); 9046d8b79cfSDave Chinner } 9056d8b79cfSDave Chinner if (xfs_iflags_test(ip, XFS_ISTALE)) 9066d8b79cfSDave Chinner goto reclaim; 9076d8b79cfSDave Chinner if (xfs_inode_clean(ip)) 9086d8b79cfSDave Chinner goto reclaim; 9096d8b79cfSDave Chinner 9106d8b79cfSDave Chinner /* 9116d8b79cfSDave Chinner * Never flush out dirty data during non-blocking reclaim, as it would 9126d8b79cfSDave Chinner * just contend with AIL pushing trying to do the same job. 9136d8b79cfSDave Chinner */ 9146d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 9156d8b79cfSDave Chinner goto out_ifunlock; 9166d8b79cfSDave Chinner 9176d8b79cfSDave Chinner /* 9186d8b79cfSDave Chinner * Now we have an inode that needs flushing. 9196d8b79cfSDave Chinner * 9206d8b79cfSDave Chinner * Note that xfs_iflush will never block on the inode buffer lock, as 9216d8b79cfSDave Chinner * xfs_ifree_cluster() can lock the inode buffer before it locks the 9226d8b79cfSDave Chinner * ip->i_lock, and we are doing the exact opposite here. As a result, 9236d8b79cfSDave Chinner * doing a blocking xfs_imap_to_bp() to get the cluster buffer would 9246d8b79cfSDave Chinner * result in an ABBA deadlock with xfs_ifree_cluster(). 9256d8b79cfSDave Chinner * 9266d8b79cfSDave Chinner * As xfs_ifree_cluser() must gather all inodes that are active in the 9276d8b79cfSDave Chinner * cache to mark them stale, if we hit this case we don't actually want 9286d8b79cfSDave Chinner * to do IO here - we want the inode marked stale so we can simply 9296d8b79cfSDave Chinner * reclaim it. Hence if we get an EAGAIN error here, just unlock the 9306d8b79cfSDave Chinner * inode, back off and try again. Hopefully the next pass through will 9316d8b79cfSDave Chinner * see the stale flag set on the inode. 
9326d8b79cfSDave Chinner */ 9336d8b79cfSDave Chinner error = xfs_iflush(ip, &bp); 9346d8b79cfSDave Chinner if (error == EAGAIN) { 9356d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 9366d8b79cfSDave Chinner /* backoff longer than in xfs_ifree_cluster */ 9376d8b79cfSDave Chinner delay(2); 9386d8b79cfSDave Chinner goto restart; 9396d8b79cfSDave Chinner } 9406d8b79cfSDave Chinner 9416d8b79cfSDave Chinner if (!error) { 9426d8b79cfSDave Chinner error = xfs_bwrite(bp); 9436d8b79cfSDave Chinner xfs_buf_relse(bp); 9446d8b79cfSDave Chinner } 9456d8b79cfSDave Chinner 9466d8b79cfSDave Chinner xfs_iflock(ip); 9476d8b79cfSDave Chinner reclaim: 9486d8b79cfSDave Chinner xfs_ifunlock(ip); 9496d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 9506d8b79cfSDave Chinner 9516d8b79cfSDave Chinner XFS_STATS_INC(xs_ig_reclaims); 9526d8b79cfSDave Chinner /* 9536d8b79cfSDave Chinner * Remove the inode from the per-AG radix tree. 9546d8b79cfSDave Chinner * 9556d8b79cfSDave Chinner * Because radix_tree_delete won't complain even if the item was never 9566d8b79cfSDave Chinner * added to the tree assert that it's been there before to catch 9576d8b79cfSDave Chinner * problems with the inode life time early on. 9586d8b79cfSDave Chinner */ 9596d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 9606d8b79cfSDave Chinner if (!radix_tree_delete(&pag->pag_ici_root, 9616d8b79cfSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino))) 9626d8b79cfSDave Chinner ASSERT(0); 9636d8b79cfSDave Chinner __xfs_inode_clear_reclaim(pag, ip); 9646d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 9656d8b79cfSDave Chinner 9666d8b79cfSDave Chinner /* 9676d8b79cfSDave Chinner * Here we do an (almost) spurious inode lock in order to coordinate 9686d8b79cfSDave Chinner * with inode cache radix tree lookups. This is because the lookup 9696d8b79cfSDave Chinner * can reference the inodes in the cache without taking references. 9706d8b79cfSDave Chinner * 9716d8b79cfSDave Chinner * We make that OK here by ensuring that we wait until the inode is 9726d8b79cfSDave Chinner * unlocked after the lookup before we go ahead and free it. 9736d8b79cfSDave Chinner */ 9746d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 9756d8b79cfSDave Chinner xfs_qm_dqdetach(ip); 9766d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 9776d8b79cfSDave Chinner 9786d8b79cfSDave Chinner xfs_inode_free(ip); 9796d8b79cfSDave Chinner return error; 9806d8b79cfSDave Chinner 9816d8b79cfSDave Chinner out_ifunlock: 9826d8b79cfSDave Chinner xfs_ifunlock(ip); 9836d8b79cfSDave Chinner out: 9846d8b79cfSDave Chinner xfs_iflags_clear(ip, XFS_IRECLAIM); 9856d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 9866d8b79cfSDave Chinner /* 9876d8b79cfSDave Chinner * We could return EAGAIN here to make reclaim rescan the inode tree in 9886d8b79cfSDave Chinner * a short while. However, this just burns CPU time scanning the tree 9896d8b79cfSDave Chinner * waiting for IO to complete and the reclaim work never goes back to 9906d8b79cfSDave Chinner * the idle state. Instead, return 0 to let the next scheduled 9916d8b79cfSDave Chinner * background reclaim attempt to reclaim the inode again. 9926d8b79cfSDave Chinner */ 9936d8b79cfSDave Chinner return 0; 9946d8b79cfSDave Chinner } 9956d8b79cfSDave Chinner 9966d8b79cfSDave Chinner /* 9976d8b79cfSDave Chinner * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 9986d8b79cfSDave Chinner * corrupted, we still want to try to reclaim all the inodes. 
If we don't, 9996d8b79cfSDave Chinner * then a shut down during filesystem unmount reclaim walk leak all the 10006d8b79cfSDave Chinner * unreclaimed inodes. 10016d8b79cfSDave Chinner */ 100233479e05SDave Chinner STATIC int 10036d8b79cfSDave Chinner xfs_reclaim_inodes_ag( 10046d8b79cfSDave Chinner struct xfs_mount *mp, 10056d8b79cfSDave Chinner int flags, 10066d8b79cfSDave Chinner int *nr_to_scan) 10076d8b79cfSDave Chinner { 10086d8b79cfSDave Chinner struct xfs_perag *pag; 10096d8b79cfSDave Chinner int error = 0; 10106d8b79cfSDave Chinner int last_error = 0; 10116d8b79cfSDave Chinner xfs_agnumber_t ag; 10126d8b79cfSDave Chinner int trylock = flags & SYNC_TRYLOCK; 10136d8b79cfSDave Chinner int skipped; 10146d8b79cfSDave Chinner 10156d8b79cfSDave Chinner restart: 10166d8b79cfSDave Chinner ag = 0; 10176d8b79cfSDave Chinner skipped = 0; 10186d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 10196d8b79cfSDave Chinner unsigned long first_index = 0; 10206d8b79cfSDave Chinner int done = 0; 10216d8b79cfSDave Chinner int nr_found = 0; 10226d8b79cfSDave Chinner 10236d8b79cfSDave Chinner ag = pag->pag_agno + 1; 10246d8b79cfSDave Chinner 10256d8b79cfSDave Chinner if (trylock) { 10266d8b79cfSDave Chinner if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 10276d8b79cfSDave Chinner skipped++; 10286d8b79cfSDave Chinner xfs_perag_put(pag); 10296d8b79cfSDave Chinner continue; 10306d8b79cfSDave Chinner } 10316d8b79cfSDave Chinner first_index = pag->pag_ici_reclaim_cursor; 10326d8b79cfSDave Chinner } else 10336d8b79cfSDave Chinner mutex_lock(&pag->pag_ici_reclaim_lock); 10346d8b79cfSDave Chinner 10356d8b79cfSDave Chinner do { 10366d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 10376d8b79cfSDave Chinner int i; 10386d8b79cfSDave Chinner 10396d8b79cfSDave Chinner rcu_read_lock(); 10406d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup_tag( 10416d8b79cfSDave Chinner &pag->pag_ici_root, 10426d8b79cfSDave Chinner (void **)batch, first_index, 10436d8b79cfSDave Chinner XFS_LOOKUP_BATCH, 10446d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 10456d8b79cfSDave Chinner if (!nr_found) { 10466d8b79cfSDave Chinner done = 1; 10476d8b79cfSDave Chinner rcu_read_unlock(); 10486d8b79cfSDave Chinner break; 10496d8b79cfSDave Chinner } 10506d8b79cfSDave Chinner 10516d8b79cfSDave Chinner /* 10526d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 10536d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 10546d8b79cfSDave Chinner */ 10556d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 10566d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 10576d8b79cfSDave Chinner 10586d8b79cfSDave Chinner if (done || xfs_reclaim_inode_grab(ip, flags)) 10596d8b79cfSDave Chinner batch[i] = NULL; 10606d8b79cfSDave Chinner 10616d8b79cfSDave Chinner /* 10626d8b79cfSDave Chinner * Update the index for the next lookup. Catch 10636d8b79cfSDave Chinner * overflows into the next AG range which can 10646d8b79cfSDave Chinner * occur if we have inodes in the last block of 10656d8b79cfSDave Chinner * the AG and we are currently pointing to the 10666d8b79cfSDave Chinner * last inode. 10676d8b79cfSDave Chinner * 10686d8b79cfSDave Chinner * Because we may see inodes that are from the 10696d8b79cfSDave Chinner * wrong AG due to RCU freeing and 10706d8b79cfSDave Chinner * reallocation, only update the index if it 10716d8b79cfSDave Chinner * lies in this AG. 
It was a race that lead us 10726d8b79cfSDave Chinner * to see this inode, so another lookup from 10736d8b79cfSDave Chinner * the same index will not find it again. 10746d8b79cfSDave Chinner */ 10756d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 10766d8b79cfSDave Chinner pag->pag_agno) 10776d8b79cfSDave Chinner continue; 10786d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 10796d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 10806d8b79cfSDave Chinner done = 1; 10816d8b79cfSDave Chinner } 10826d8b79cfSDave Chinner 10836d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 10846d8b79cfSDave Chinner rcu_read_unlock(); 10856d8b79cfSDave Chinner 10866d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 10876d8b79cfSDave Chinner if (!batch[i]) 10886d8b79cfSDave Chinner continue; 10896d8b79cfSDave Chinner error = xfs_reclaim_inode(batch[i], pag, flags); 10906d8b79cfSDave Chinner if (error && last_error != EFSCORRUPTED) 10916d8b79cfSDave Chinner last_error = error; 10926d8b79cfSDave Chinner } 10936d8b79cfSDave Chinner 10946d8b79cfSDave Chinner *nr_to_scan -= XFS_LOOKUP_BATCH; 10956d8b79cfSDave Chinner 10966d8b79cfSDave Chinner cond_resched(); 10976d8b79cfSDave Chinner 10986d8b79cfSDave Chinner } while (nr_found && !done && *nr_to_scan > 0); 10996d8b79cfSDave Chinner 11006d8b79cfSDave Chinner if (trylock && !done) 11016d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = first_index; 11026d8b79cfSDave Chinner else 11036d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = 0; 11046d8b79cfSDave Chinner mutex_unlock(&pag->pag_ici_reclaim_lock); 11056d8b79cfSDave Chinner xfs_perag_put(pag); 11066d8b79cfSDave Chinner } 11076d8b79cfSDave Chinner 11086d8b79cfSDave Chinner /* 11096d8b79cfSDave Chinner * if we skipped any AG, and we still have scan count remaining, do 11106d8b79cfSDave Chinner * another pass this time using blocking reclaim semantics (i.e 11116d8b79cfSDave Chinner * waiting on the reclaim locks and ignoring the reclaim cursors). This 11126d8b79cfSDave Chinner * ensure that when we get more reclaimers than AGs we block rather 11136d8b79cfSDave Chinner * than spin trying to execute reclaim. 11146d8b79cfSDave Chinner */ 11156d8b79cfSDave Chinner if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 11166d8b79cfSDave Chinner trylock = 0; 11176d8b79cfSDave Chinner goto restart; 11186d8b79cfSDave Chinner } 11196d8b79cfSDave Chinner return XFS_ERROR(last_error); 11206d8b79cfSDave Chinner } 11216d8b79cfSDave Chinner 11226d8b79cfSDave Chinner int 11236d8b79cfSDave Chinner xfs_reclaim_inodes( 11246d8b79cfSDave Chinner xfs_mount_t *mp, 11256d8b79cfSDave Chinner int mode) 11266d8b79cfSDave Chinner { 11276d8b79cfSDave Chinner int nr_to_scan = INT_MAX; 11286d8b79cfSDave Chinner 11296d8b79cfSDave Chinner return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 11306d8b79cfSDave Chinner } 11316d8b79cfSDave Chinner 11326d8b79cfSDave Chinner /* 11336d8b79cfSDave Chinner * Scan a certain number of inodes for reclaim. 11346d8b79cfSDave Chinner * 11356d8b79cfSDave Chinner * When called we make sure that there is a background (fast) inode reclaim in 11366d8b79cfSDave Chinner * progress, while we will throttle the speed of reclaim via doing synchronous 11376d8b79cfSDave Chinner * reclaim of inodes. That means if we come across dirty inodes, we wait for 11386d8b79cfSDave Chinner * them to be cleaned, which we hope will not be very long due to the 11396d8b79cfSDave Chinner * background walker having already kicked the IO off on those dirty inodes. 
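 *
 * Together with xfs_reclaim_inodes_count() below, this is the basis the
 * inode cache shrinker builds on: count the reclaimable inodes, then scan
 * a bounded number of them. Illustratively (not the actual shrinker
 * wiring):
 *
 *	if (xfs_reclaim_inodes_count(mp))
 *		xfs_reclaim_inodes_nr(mp, nr_to_scan);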
11406d8b79cfSDave Chinner */ 11416d8b79cfSDave Chinner void 11426d8b79cfSDave Chinner xfs_reclaim_inodes_nr( 11436d8b79cfSDave Chinner struct xfs_mount *mp, 11446d8b79cfSDave Chinner int nr_to_scan) 11456d8b79cfSDave Chinner { 11466d8b79cfSDave Chinner /* kick background reclaimer and push the AIL */ 11476d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 11486d8b79cfSDave Chinner xfs_ail_push_all(mp->m_ail); 11496d8b79cfSDave Chinner 11506d8b79cfSDave Chinner xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 11516d8b79cfSDave Chinner } 11526d8b79cfSDave Chinner 11536d8b79cfSDave Chinner /* 11546d8b79cfSDave Chinner * Return the number of reclaimable inodes in the filesystem for 11556d8b79cfSDave Chinner * the shrinker to determine how much to reclaim. 11566d8b79cfSDave Chinner */ 11576d8b79cfSDave Chinner int 11586d8b79cfSDave Chinner xfs_reclaim_inodes_count( 11596d8b79cfSDave Chinner struct xfs_mount *mp) 11606d8b79cfSDave Chinner { 11616d8b79cfSDave Chinner struct xfs_perag *pag; 11626d8b79cfSDave Chinner xfs_agnumber_t ag = 0; 11636d8b79cfSDave Chinner int reclaimable = 0; 11646d8b79cfSDave Chinner 11656d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 11666d8b79cfSDave Chinner ag = pag->pag_agno + 1; 11676d8b79cfSDave Chinner reclaimable += pag->pag_ici_reclaimable; 11686d8b79cfSDave Chinner xfs_perag_put(pag); 11696d8b79cfSDave Chinner } 11706d8b79cfSDave Chinner return reclaimable; 11716d8b79cfSDave Chinner } 11726d8b79cfSDave Chinner 117341176a68SBrian Foster STATIC int 11743e3f9f58SBrian Foster xfs_inode_match_id( 11753e3f9f58SBrian Foster struct xfs_inode *ip, 11763e3f9f58SBrian Foster struct xfs_eofblocks *eofb) 11773e3f9f58SBrian Foster { 11781b556048SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_UID && 11791b556048SBrian Foster ip->i_d.di_uid != eofb->eof_uid) 11803e3f9f58SBrian Foster return 0; 11811b556048SBrian Foster 11821b556048SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_GID && 11831b556048SBrian Foster ip->i_d.di_gid != eofb->eof_gid) 11841b556048SBrian Foster return 0; 11851b556048SBrian Foster 11861b556048SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && 11871b556048SBrian Foster xfs_get_projid(ip) != eofb->eof_prid) 11881b556048SBrian Foster return 0; 11891b556048SBrian Foster 11901b556048SBrian Foster return 1; 11913e3f9f58SBrian Foster } 11923e3f9f58SBrian Foster 11933e3f9f58SBrian Foster STATIC int 119441176a68SBrian Foster xfs_inode_free_eofblocks( 119541176a68SBrian Foster struct xfs_inode *ip, 119641176a68SBrian Foster struct xfs_perag *pag, 119741176a68SBrian Foster int flags, 119841176a68SBrian Foster void *args) 119941176a68SBrian Foster { 120041176a68SBrian Foster int ret; 12013e3f9f58SBrian Foster struct xfs_eofblocks *eofb = args; 120241176a68SBrian Foster 120341176a68SBrian Foster if (!xfs_can_free_eofblocks(ip, false)) { 120441176a68SBrian Foster /* inode could be preallocated or append-only */ 120541176a68SBrian Foster trace_xfs_inode_free_eofblocks_invalid(ip); 120641176a68SBrian Foster xfs_inode_clear_eofblocks_tag(ip); 120741176a68SBrian Foster return 0; 120841176a68SBrian Foster } 120941176a68SBrian Foster 121041176a68SBrian Foster /* 121141176a68SBrian Foster * If the mapping is dirty the operation can block and wait for some 121241176a68SBrian Foster * time. Unless we are waiting, skip it. 
STATIC int
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
	    ip->i_d.di_uid != eofb->eof_uid)
		return 0;

	if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
	    ip->i_d.di_gid != eofb->eof_gid)
		return 0;

	if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
	    xfs_get_projid(ip) != eofb->eof_prid)
		return 0;

	return 1;
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args)
{
	int ret;
	struct xfs_eofblocks *eofb = args;

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty, the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!(flags & SYNC_WAIT) &&
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (eofb) {
		if (!xfs_inode_match_id(ip, eofb))
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
	}

	ret = xfs_free_eofblocks(ip->i_mount, ip, true);

	/* don't revisit the inode if we're not waiting */
	if (ret == EAGAIN && !(flags & SYNC_WAIT))
		ret = 0;

	return ret;
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	int flags = SYNC_TRYLOCK;

	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
		flags = SYNC_WAIT;

	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
					 eofb, XFS_ICI_EOFBLOCKS_TAG);
}
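/*
 * Illustrative sketch only (hypothetical helper and values, not part of the
 * original file): a caller of xfs_icache_free_eofblocks() fills in an
 * xfs_eofblocks filter and sets a flag in eof_flags for each field it wants
 * matched. Here only inodes owned by uid 1000 that are at least 1MB in size
 * would have their post-EOF blocks trimmed, and XFS_EOF_FLAGS_SYNC makes the
 * scan use blocking (SYNC_WAIT) rather than trylock semantics.
 */
STATIC int
xfs_icache_free_eofblocks_example(
	struct xfs_mount	*mp)
{
	struct xfs_eofblocks	eofb;

	memset(&eofb, 0, sizeof(eofb));
	eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_MINFILESIZE |
			 XFS_EOF_FLAGS_SYNC;
	eofb.eof_uid = 1000;			/* hypothetical uid */
	eofb.eof_min_file_size = 1024 * 1024;	/* hypothetical 1MB cutoff */

	return xfs_icache_free_eofblocks(mp, &eofb);
}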
void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	trace_xfs_inode_set_eofblocks_tag(ip);

	tagged = radix_tree_tagged(&pag->pag_ici_root,
				   XFS_ICI_EOFBLOCKS_TAG);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_EOFBLOCKS_TAG);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   XFS_ICI_EOFBLOCKS_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);

		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
					      -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	trace_xfs_inode_clear_eofblocks_tag(ip);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			     XFS_ICI_EOFBLOCKS_TAG);
	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     XFS_ICI_EOFBLOCKS_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
						-1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}
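/*
 * Illustrative sketch only (hypothetical helper, not part of the original
 * file): the reason the two functions above propagate the per-inode
 * EOFBLOCKS tag up into the per-AG radix tree is so that scans can cheaply
 * skip AGs with no tagged inodes, by walking only tagged perags in the same
 * way xfs_reclaim_inodes_count() walks the reclaim tag.
 */
STATIC int
xfs_eofblocks_tagged_ag_count_example(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			count = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_EOFBLOCKS_TAG))) {
		ag = pag->pag_agno + 1;
		count++;
		xfs_perag_put(pag);
	}
	return count;
}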