/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_types.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
#include "xfs_dinode.h"
#include "xfs_error.h"
#include "xfs_filestream.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"

#include <linux/kthread.h>
#include <linux/freezer.h>

STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
				struct xfs_perag *pag, struct xfs_inode *ip);

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_zone_free(xfs_inode_zone, ip);
		return NULL;
	}

	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!spin_is_locked(&ip->i_flags_lock));
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	kmem_zone_free(xfs_inode_zone, ip);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	switch (ip->i_d.di_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
		break;
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	if (ip->i_itemp) {
		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!spin_is_locked(&ip->i_flags_lock));
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(xs_ig_frecycle);
		error = EAGAIN;
		goto out_error;
	}

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(xs_ig_frecycle);
		error = EAGAIN;
		goto out_error;
	}

	/*
	 * If lookup is racing with unlink return an error immediately.
	 */
	if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
		error = ENOENT;
		goto out_error;
	}

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		error = -inode_init_always(mp->m_super, inode);
		if (error) {
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble. Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);

			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		__xfs_inode_clear_reclaim_tag(mp, pag, ip);
		inode->i_state = I_NEW;

		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
	XFS_STATS_INC(xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}

/*
 * The inode was not found in the cache: allocate a new one, read it in from
 * disk and insert it into the per-AG inode radix tree.
 */
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return ENOMEM;

	error = xfs_iread(mp, tp, ip, flags);
	if (error)
		goto out_destroy;

	trace_xfs_iget_miss(ip);

	if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
		error = ENOENT;
		goto out_destroy;
	}

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		iflags |= XFS_IDONTCACHE;
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(xs_ig_dup);
		error = EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.
 * The inode is looked up in the cache held in each AG.
 * If the inode is found in the cache, initialise the vfs inode
 * if necessary.
 *
 * If it is not in core, read it in from the file system's device,
 * add it to the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * This flag parameter indicates how and if the inode's IO lock and inode lock
 * should be taken.
 *
 * mp -- the mount point structure for the current file system.  It points
 *       to the inode hash table.
 * tp -- a pointer to the current transaction if there is one.  This is
 *       simply passed through to the xfs_iread() call.
 * ino -- the number of the inode desired.  This is the unique identifier
 *        within the file system for the inode being requested.
 * lock_flags -- flags indicating how to lock the inode.  See the comment
 *		 for xfs_ilock() for a list of valid values.
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here.  It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return EINVAL;

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		XFS_STATS_INC(xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can set ops(&unlock)
	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
		xfs_setup_inode(ip);
	return 0;

out_error_or_again:
	if (error == EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and is still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return ENOENT;

	if (is_bad_inode(inode)) {
		IRELE(ip);
		return ENOENT;
	}

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return ENOENT;
}

/*
 * Walk all in-memory inodes in a single AG in batches and call @execute on
 * each inode that can be grabbed, optionally restricting the walk to inodes
 * carrying the given radix tree @tag (pass -1 to walk all inodes).
 */
STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == -1)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. if we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = execute(batch[i], pag, flags, args);
			IRELE(batch[i]);
			if (error == EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
STATIC void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);
	xfs_icache_free_eofblocks(mp, NULL);
	xfs_queue_eofblocks(mp);
}

/* Call @execute on every grabbable in-memory inode in every AG. */
int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags,
					   void *args),
	int			flags,
	void			*args)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

/* As xfs_inode_ag_iterator(), but only visit inodes carrying @tag. */
int
xfs_inode_ag_iterator_tag(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip,
					   struct xfs_perag *pag, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == EFSCORRUPTED)
				break;
		}
	}
	return XFS_ERROR(last_error);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

static void
__xfs_inode_set_reclaim_tag(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip)
{
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);

	if (!pag->pag_ici_reclaimable) {
		/* propagate the reclaim tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* schedule periodic background inode reclaim */
		xfs_reclaim_work_queue(ip->i_mount);

		trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
	pag->pag_ici_reclaimable++;
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	xfs_inode_t	*ip)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
__xfs_inode_clear_reclaim(
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	pag->pag_ici_reclaimable--;
	if (!pag->pag_ici_reclaimable) {
		/* clear the reclaim tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				XFS_ICI_RECLAIM_TAG);
		spin_unlock(&ip->i_mount->m_perag_lock);
		trace_xfs_perag_clear_reclaim(ip->i_mount, pag->pag_agno,
							-1, _RET_IP_);
	}
}

STATIC void
__xfs_inode_clear_reclaim_tag(
	xfs_mount_t	*mp,
	xfs_perag_t	*pag,
	xfs_inode_t	*ip)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
	__xfs_inode_clear_reclaim(pag, ip);
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check the
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (is_bad_inode(VFS_I(ip)))
		goto reclaim;
	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		xfs_iflush_abort(ip, false);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE))
		goto reclaim;
	if (xfs_inode_clean(ip))
		goto reclaim;

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	 * result in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
	 * inode, back off and try again.  Hopefully the next pass through will
	 * see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, &bp);
	if (error == EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

	xfs_iflock(ip);
reclaim:
	xfs_ifunlock(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino)))
		ASSERT(0);
	__xfs_inode_clear_reclaim(pag, ip);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and the reclaim work never goes back to
	 * the idle state. Instead, return 0 to let the next scheduled
	 * background reclaim attempt to reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shut down during filesystem unmount reclaim walk will leak all the
 * unreclaimed inodes.
 */
STATIC int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. if we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * if we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return XFS_ERROR(last_error);
}

int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we will throttle the speed of reclaim via doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
12016d8b79cfSDave Chinner
120241176a68SBrian Foster STATIC int
12033e3f9f58SBrian Foster xfs_inode_match_id(
12043e3f9f58SBrian Foster         struct xfs_inode *ip,
12053e3f9f58SBrian Foster         struct xfs_eofblocks *eofb)
12063e3f9f58SBrian Foster {
1207b9fe5052SDwight Engen         if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
1208b9fe5052SDwight Engen             !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
12093e3f9f58SBrian Foster                 return 0;
12101b556048SBrian Foster
1211b9fe5052SDwight Engen         if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
1212b9fe5052SDwight Engen             !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
12131b556048SBrian Foster                 return 0;
12141b556048SBrian Foster
1215b9fe5052SDwight Engen         if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
12161b556048SBrian Foster             xfs_get_projid(ip) != eofb->eof_prid)
12171b556048SBrian Foster                 return 0;
12181b556048SBrian Foster
12191b556048SBrian Foster         return 1;
12203e3f9f58SBrian Foster }
12213e3f9f58SBrian Foster
12223e3f9f58SBrian Foster STATIC int
122341176a68SBrian Foster xfs_inode_free_eofblocks(
122441176a68SBrian Foster         struct xfs_inode *ip,
122541176a68SBrian Foster         struct xfs_perag *pag,
122641176a68SBrian Foster         int flags,
122741176a68SBrian Foster         void *args)
122841176a68SBrian Foster {
122941176a68SBrian Foster         int ret;
12303e3f9f58SBrian Foster         struct xfs_eofblocks *eofb = args;
123141176a68SBrian Foster
123241176a68SBrian Foster         if (!xfs_can_free_eofblocks(ip, false)) {
123341176a68SBrian Foster                 /* inode could be preallocated or append-only */
123441176a68SBrian Foster                 trace_xfs_inode_free_eofblocks_invalid(ip);
123541176a68SBrian Foster                 xfs_inode_clear_eofblocks_tag(ip);
123641176a68SBrian Foster                 return 0;
123741176a68SBrian Foster         }
123841176a68SBrian Foster
123941176a68SBrian Foster         /*
124041176a68SBrian Foster          * If the mapping is dirty, the operation can block and wait for some
124141176a68SBrian Foster          * time. Unless we are waiting, skip it.
124241176a68SBrian Foster          */
124341176a68SBrian Foster         if (!(flags & SYNC_WAIT) &&
124441176a68SBrian Foster             mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
124541176a68SBrian Foster                 return 0;
124641176a68SBrian Foster
124700ca79a0SBrian Foster         if (eofb) {
124800ca79a0SBrian Foster                 if (!xfs_inode_match_id(ip, eofb))
12493e3f9f58SBrian Foster                         return 0;
12503e3f9f58SBrian Foster
125100ca79a0SBrian Foster                 /* skip the inode if the file size is too small */
125200ca79a0SBrian Foster                 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
125300ca79a0SBrian Foster                     XFS_ISIZE(ip) < eofb->eof_min_file_size)
125400ca79a0SBrian Foster                         return 0;
125500ca79a0SBrian Foster         }
125600ca79a0SBrian Foster
125741176a68SBrian Foster         ret = xfs_free_eofblocks(ip->i_mount, ip, true);
125841176a68SBrian Foster
125941176a68SBrian Foster         /* don't revisit the inode if we're not waiting */
126041176a68SBrian Foster         if (ret == EAGAIN && !(flags & SYNC_WAIT))
126141176a68SBrian Foster                 ret = 0;
126241176a68SBrian Foster
126341176a68SBrian Foster         return ret;
126441176a68SBrian Foster }
126541176a68SBrian Foster
126641176a68SBrian Foster int
126741176a68SBrian Foster xfs_icache_free_eofblocks(
126841176a68SBrian Foster         struct xfs_mount *mp,
12698ca149deSBrian Foster         struct xfs_eofblocks *eofb)
127041176a68SBrian Foster {
12718ca149deSBrian Foster         int flags = SYNC_TRYLOCK;
12728ca149deSBrian Foster
12738ca149deSBrian Foster         if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
12748ca149deSBrian Foster                 flags = SYNC_WAIT;
12758ca149deSBrian Foster
127641176a68SBrian Foster         return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
12778ca149deSBrian Foster                         eofb, XFS_ICI_EOFBLOCKS_TAG);
127841176a68SBrian Foster }
127941176a68SBrian Foster
128027b52867SBrian Foster void
128127b52867SBrian Foster xfs_inode_set_eofblocks_tag(
128227b52867SBrian Foster         xfs_inode_t *ip)
128327b52867SBrian Foster {
128427b52867SBrian Foster         struct xfs_mount *mp = ip->i_mount;
128527b52867SBrian Foster         struct xfs_perag *pag;
128627b52867SBrian Foster         int tagged;
128727b52867SBrian Foster
128827b52867SBrian Foster         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
128927b52867SBrian Foster         spin_lock(&pag->pag_ici_lock);
129027b52867SBrian Foster         trace_xfs_inode_set_eofblocks_tag(ip);
129127b52867SBrian Foster
129227b52867SBrian Foster         tagged = radix_tree_tagged(&pag->pag_ici_root,
129327b52867SBrian Foster                                    XFS_ICI_EOFBLOCKS_TAG);
129427b52867SBrian Foster         radix_tree_tag_set(&pag->pag_ici_root,
129527b52867SBrian Foster                            XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
129627b52867SBrian Foster                            XFS_ICI_EOFBLOCKS_TAG);
129727b52867SBrian Foster         if (!tagged) {
129827b52867SBrian Foster                 /* propagate the eofblocks tag up into the perag radix tree */
129927b52867SBrian Foster                 spin_lock(&ip->i_mount->m_perag_lock);
130027b52867SBrian Foster                 radix_tree_tag_set(&ip->i_mount->m_perag_tree,
130127b52867SBrian Foster                                    XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
130227b52867SBrian Foster                                    XFS_ICI_EOFBLOCKS_TAG);
130327b52867SBrian Foster                 spin_unlock(&ip->i_mount->m_perag_lock);
130427b52867SBrian Foster
1305579b62faSBrian Foster                 /* kick off background trimming */
1306579b62faSBrian Foster                 xfs_queue_eofblocks(ip->i_mount);
1307579b62faSBrian Foster
130827b52867SBrian Foster                 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
130927b52867SBrian Foster                                               -1, _RET_IP_);
131027b52867SBrian Foster         }
131127b52867SBrian Foster
131227b52867SBrian Foster         spin_unlock(&pag->pag_ici_lock);
131327b52867SBrian Foster         xfs_perag_put(pag);
131427b52867SBrian Foster }
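/*
 * Illustrative sketch, not part of the file above: how a caller might build
 * an xfs_eofblocks filter and run the scan implemented by
 * xfs_icache_free_eofblocks(), trimming post-EOF blocks only for one user's
 * inodes above a minimum size and waiting on each inode (XFS_EOF_FLAGS_SYNC
 * selects SYNC_WAIT above). The helper name and the make_kuid() conversion
 * are assumptions for the example; only fields and flags already used by
 * xfs_inode_match_id() and xfs_inode_free_eofblocks() are relied on.
 */
#if 0
static int
example_trim_eofblocks_for_uid(
        struct xfs_mount        *mp,
        uid_t                   uid)
{
        struct xfs_eofblocks    eofb = { 0 };

        eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_MINFILESIZE |
                         XFS_EOF_FLAGS_SYNC;
        eofb.eof_uid = make_kuid(current_user_ns(), uid);
        eofb.eof_min_file_size = 1024 * 1024;   /* skip files under 1MB */

        return xfs_icache_free_eofblocks(mp, &eofb);
}
#endif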
131527b52867SBrian Foster
131627b52867SBrian Foster void
131727b52867SBrian Foster xfs_inode_clear_eofblocks_tag(
131827b52867SBrian Foster         xfs_inode_t *ip)
131927b52867SBrian Foster {
132027b52867SBrian Foster         struct xfs_mount *mp = ip->i_mount;
132127b52867SBrian Foster         struct xfs_perag *pag;
132227b52867SBrian Foster
132327b52867SBrian Foster         pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
132427b52867SBrian Foster         spin_lock(&pag->pag_ici_lock);
132527b52867SBrian Foster         trace_xfs_inode_clear_eofblocks_tag(ip);
132627b52867SBrian Foster
132727b52867SBrian Foster         radix_tree_tag_clear(&pag->pag_ici_root,
132827b52867SBrian Foster                              XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
132927b52867SBrian Foster                              XFS_ICI_EOFBLOCKS_TAG);
133027b52867SBrian Foster         if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
133127b52867SBrian Foster                 /* clear the eofblocks tag from the perag radix tree */
133227b52867SBrian Foster                 spin_lock(&ip->i_mount->m_perag_lock);
133327b52867SBrian Foster                 radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
133427b52867SBrian Foster                                      XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
133527b52867SBrian Foster                                      XFS_ICI_EOFBLOCKS_TAG);
133627b52867SBrian Foster                 spin_unlock(&ip->i_mount->m_perag_lock);
133727b52867SBrian Foster                 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
133827b52867SBrian Foster                                                 -1, _RET_IP_);
133927b52867SBrian Foster         }
134027b52867SBrian Foster
134127b52867SBrian Foster         spin_unlock(&pag->pag_ici_lock);
134227b52867SBrian Foster         xfs_perag_put(pag);
134327b52867SBrian Foster }
134427b52867SBrian Foster
1345
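/*
 * Illustrative sketch, not part of the file above: the set/clear helpers
 * keep a second copy of the EOFBLOCKS tag in the per-mount m_perag_tree so
 * that scans can skip whole AGs that contain no tagged inodes. The walker
 * below is a hypothetical, simplified consumer of that per-AG tag, modelled
 * on the xfs_perag_get_tag() loops used for reclaim earlier in this file;
 * the real scan goes through xfs_inode_ag_iterator_tag().
 */
#if 0
static void
example_walk_eofblocks_ags(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        xfs_agnumber_t          ag = 0;

        /* only AGs with at least one EOFBLOCKS-tagged inode are returned */
        while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_EOFBLOCKS_TAG))) {
                ag = pag->pag_agno + 1;
                /* ... look up tagged inodes in pag->pag_ici_root ... */
                xfs_perag_put(pag);
        }
}
#endif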