16d8b79cfSDave Chinner /* 26d8b79cfSDave Chinner * Copyright (c) 2000-2005 Silicon Graphics, Inc. 36d8b79cfSDave Chinner * All Rights Reserved. 46d8b79cfSDave Chinner * 56d8b79cfSDave Chinner * This program is free software; you can redistribute it and/or 66d8b79cfSDave Chinner * modify it under the terms of the GNU General Public License as 76d8b79cfSDave Chinner * published by the Free Software Foundation. 86d8b79cfSDave Chinner * 96d8b79cfSDave Chinner * This program is distributed in the hope that it would be useful, 106d8b79cfSDave Chinner * but WITHOUT ANY WARRANTY; without even the implied warranty of 116d8b79cfSDave Chinner * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 126d8b79cfSDave Chinner * GNU General Public License for more details. 136d8b79cfSDave Chinner * 146d8b79cfSDave Chinner * You should have received a copy of the GNU General Public License 156d8b79cfSDave Chinner * along with this program; if not, write the Free Software Foundation, 166d8b79cfSDave Chinner * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 176d8b79cfSDave Chinner */ 186d8b79cfSDave Chinner #include "xfs.h" 196d8b79cfSDave Chinner #include "xfs_fs.h" 206ca1c906SDave Chinner #include "xfs_format.h" 21239880efSDave Chinner #include "xfs_log_format.h" 22239880efSDave Chinner #include "xfs_trans_resv.h" 236d8b79cfSDave Chinner #include "xfs_sb.h" 246d8b79cfSDave Chinner #include "xfs_mount.h" 256d8b79cfSDave Chinner #include "xfs_inode.h" 266d8b79cfSDave Chinner #include "xfs_error.h" 27239880efSDave Chinner #include "xfs_trans.h" 28239880efSDave Chinner #include "xfs_trans_priv.h" 296d8b79cfSDave Chinner #include "xfs_inode_item.h" 306d8b79cfSDave Chinner #include "xfs_quota.h" 316d8b79cfSDave Chinner #include "xfs_trace.h" 326d8b79cfSDave Chinner #include "xfs_icache.h" 33c24b5dfaSDave Chinner #include "xfs_bmap_util.h" 34dc06f398SBrian Foster #include "xfs_dquot_item.h" 35dc06f398SBrian Foster #include "xfs_dquot.h" 366d8b79cfSDave Chinner 
#include <linux/kthread.h>
#include <linux/freezer.h>

/*
 * Allocate and initialise an xfs_inode.
 *
 * Returns the new inode with i_ino/i_mount set and all cached state zeroed,
 * or NULL if either the zone allocation or VFS inode initialisation fails.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_zone_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	/*
	 * The inode comes back from the zone with stale per-lifetime state;
	 * assert the bits a freed inode must have left behind are clear.
	 */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!spin_is_locked(&ip->i_flags_lock));
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));

	return ip;
}

/*
 * RCU callback that performs the final teardown of an inode: destroy the
 * data/attr forks and the inode log item, then return the inode to the zone.
 * Runs after a grace period has elapsed (queued via call_rcu() below), so no
 * lookup can still be dereferencing this inode.
 */
STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
		break;
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);

	if (ip->i_itemp) {
		/* the log item must no longer be in the AIL at this point */
		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_zone_free(xfs_inode_zone, ip);
}

/*
 * Queue the final freeing of the inode via RCU. The actual teardown happens
 * in xfs_inode_free_callback() after the grace period.
 */
static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!xfs_isiflocked(ip));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

/*
 * Mark the inode as being reclaimed with an invalid inode number, then hand
 * it to RCU for freeing.
 */
void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s.
Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

/*
 * Account for a newly reclaimable inode in this AG. On the 0 -> 1 transition
 * propagate the reclaim tag into the perag radix tree and kick the background
 * reclaim worker. Caller must hold pag->pag_ici_lock (asserted below).
 */
static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	ASSERT(spin_is_locked(&pag->pag_ici_lock));
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

/*
 * Drop the AG's reclaimable inode count; only when it reaches zero is the
 * reclaim tag cleared from the perag radix tree. Caller must hold
 * pag->pag_ici_lock (asserted below).
 */
static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	ASSERT(spin_is_locked(&pag->pag_ici_lock));
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}


/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 *
 * Lock order: pag_ici_lock outside i_flags_lock, matching the recycle path
 * in xfs_iget_cache_hit().
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

/*
 * Clear the per-inode reclaim tag and update the per-AG accounting.
 * Caller must hold pag->pag_ici_lock (required by the helper called here).
 */
STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int		error;
	/* state inode_init_always() would clobber but we must preserve */
	uint32_t	nlink = inode->i_nlink;
	uint32_t	generation = inode->i_generation;
	uint64_t	version = inode->i_version;
	umode_t		mode = inode->i_mode;

	error = inode_init_always(mp->m_super, inode);

	/* restored unconditionally, even on error -- see comment above */
	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode->i_version = version;
	inode->i_mode = mode;
	return error;
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		/* -EAGAIN makes the caller delay and retry the lookup */
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	wait_on_inode to wait for these flags to be cleared
	 *	instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * If lookup is racing with unlink return an error immediately.
	 */
	if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) {
		error = -ENOENT;
		goto out_error;
	}

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		error = xfs_reinit_inode(mp, inode);
		if (error) {
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble.  Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);

			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;

		ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
		mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


/*
 * The lookup missed the cache: allocate a new in-core inode, read it in from
 * disk via xfs_iread() and insert it into the per-AG radix tree. Returns
 * -EAGAIN (caller retries) if the radix tree preload fails or an insert race
 * is lost, -ENOENT for an unlinked inode without XFS_IGET_CREATE.
 */
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_iread(mp, tp, ip, flags);
	if (error)
		goto out_destroy;

	trace_xfs_iget_miss(ip);

	if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) {
		error = -ENOENT;
		goto out_destroy;
	}

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		iflags |= XFS_IDONTCACHE;
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		/* someone else inserted this inode first; retry the lookup */
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.
 * The inode is looked up in the cache held in each AG.
 * If the inode is found in the cache, initialise the vfs inode
 * if necessary.
 *
 * If it is not in core, read it in from the file system's device,
 * add it to the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * This flag parameter indicates how and if the inode's IO lock and inode lock
 * should be taken.
 *
 * mp -- the mount point structure for the current file system.  It points
 *       to the inode hash table.
 * tp -- a pointer to the current transaction if there is one.  This is
 *       simply passed through to the xfs_iread() call.
 * ino -- the number of the inode desired.  This is the unique identifier
 *        within the file system for the inode being requested.
 * lock_flags -- flags indicating how to lock the inode.  See the comment
 *               for xfs_ilock() for a list of valid values.
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here.  It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		/* cache_hit drops the RCU read lock on all paths */
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	/* transient races return -EAGAIN: back off briefly and retry */
	if (error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH 32

/*
 * Try to take a stable reference to an inode found during an AG walk.
 * Returns 0 with an igrab() reference held, or a negative errno if the
 * inode should be skipped. Must be called under rcu_read_lock() (asserted).
 */
STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return -ENOENT;

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return -ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
first_index = 0; 6816d8b79cfSDave Chinner nr_found = 0; 6826d8b79cfSDave Chinner do { 6836d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 6846d8b79cfSDave Chinner int error = 0; 6856d8b79cfSDave Chinner int i; 6866d8b79cfSDave Chinner 6876d8b79cfSDave Chinner rcu_read_lock(); 688a454f742SBrian Foster 689a454f742SBrian Foster if (tag == -1) 6906d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 6916d8b79cfSDave Chinner (void **)batch, first_index, 6926d8b79cfSDave Chinner XFS_LOOKUP_BATCH); 693a454f742SBrian Foster else 694a454f742SBrian Foster nr_found = radix_tree_gang_lookup_tag( 695a454f742SBrian Foster &pag->pag_ici_root, 696a454f742SBrian Foster (void **) batch, first_index, 697a454f742SBrian Foster XFS_LOOKUP_BATCH, tag); 698a454f742SBrian Foster 6996d8b79cfSDave Chinner if (!nr_found) { 7006d8b79cfSDave Chinner rcu_read_unlock(); 7016d8b79cfSDave Chinner break; 7026d8b79cfSDave Chinner } 7036d8b79cfSDave Chinner 7046d8b79cfSDave Chinner /* 7056d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 7066d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 7076d8b79cfSDave Chinner */ 7086d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 7096d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 7106d8b79cfSDave Chinner 7116d8b79cfSDave Chinner if (done || xfs_inode_ag_walk_grab(ip)) 7126d8b79cfSDave Chinner batch[i] = NULL; 7136d8b79cfSDave Chinner 7146d8b79cfSDave Chinner /* 7156d8b79cfSDave Chinner * Update the index for the next lookup. Catch 7166d8b79cfSDave Chinner * overflows into the next AG range which can occur if 7176d8b79cfSDave Chinner * we have inodes in the last block of the AG and we 7186d8b79cfSDave Chinner * are currently pointing to the last inode. 
7196d8b79cfSDave Chinner * 7206d8b79cfSDave Chinner * Because we may see inodes that are from the wrong AG 7216d8b79cfSDave Chinner * due to RCU freeing and reallocation, only update the 7226d8b79cfSDave Chinner * index if it lies in this AG. It was a race that lead 7236d8b79cfSDave Chinner * us to see this inode, so another lookup from the 7246d8b79cfSDave Chinner * same index will not find it again. 7256d8b79cfSDave Chinner */ 7266d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 7276d8b79cfSDave Chinner continue; 7286d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 7296d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 7306d8b79cfSDave Chinner done = 1; 7316d8b79cfSDave Chinner } 7326d8b79cfSDave Chinner 7336d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 7346d8b79cfSDave Chinner rcu_read_unlock(); 7356d8b79cfSDave Chinner 7366d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 7376d8b79cfSDave Chinner if (!batch[i]) 7386d8b79cfSDave Chinner continue; 739e0094008SEric Sandeen error = execute(batch[i], flags, args); 7406d8b79cfSDave Chinner IRELE(batch[i]); 7412451337dSDave Chinner if (error == -EAGAIN) { 7426d8b79cfSDave Chinner skipped++; 7436d8b79cfSDave Chinner continue; 7446d8b79cfSDave Chinner } 7452451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 7466d8b79cfSDave Chinner last_error = error; 7476d8b79cfSDave Chinner } 7486d8b79cfSDave Chinner 7496d8b79cfSDave Chinner /* bail out if the filesystem is corrupted. 
*/ 7502451337dSDave Chinner if (error == -EFSCORRUPTED) 7516d8b79cfSDave Chinner break; 7526d8b79cfSDave Chinner 7536d8b79cfSDave Chinner cond_resched(); 7546d8b79cfSDave Chinner 7556d8b79cfSDave Chinner } while (nr_found && !done); 7566d8b79cfSDave Chinner 7576d8b79cfSDave Chinner if (skipped) { 7586d8b79cfSDave Chinner delay(1); 7596d8b79cfSDave Chinner goto restart; 7606d8b79cfSDave Chinner } 7616d8b79cfSDave Chinner return last_error; 7626d8b79cfSDave Chinner } 7636d8b79cfSDave Chinner 764579b62faSBrian Foster /* 765579b62faSBrian Foster * Background scanning to trim post-EOF preallocated space. This is queued 766b9fe5052SDwight Engen * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 767579b62faSBrian Foster */ 768fa5a4f57SBrian Foster void 769579b62faSBrian Foster xfs_queue_eofblocks( 770579b62faSBrian Foster struct xfs_mount *mp) 771579b62faSBrian Foster { 772579b62faSBrian Foster rcu_read_lock(); 773579b62faSBrian Foster if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) 774579b62faSBrian Foster queue_delayed_work(mp->m_eofblocks_workqueue, 775579b62faSBrian Foster &mp->m_eofblocks_work, 776579b62faSBrian Foster msecs_to_jiffies(xfs_eofb_secs * 1000)); 777579b62faSBrian Foster rcu_read_unlock(); 778579b62faSBrian Foster } 779579b62faSBrian Foster 780579b62faSBrian Foster void 781579b62faSBrian Foster xfs_eofblocks_worker( 782579b62faSBrian Foster struct work_struct *work) 783579b62faSBrian Foster { 784579b62faSBrian Foster struct xfs_mount *mp = container_of(to_delayed_work(work), 785579b62faSBrian Foster struct xfs_mount, m_eofblocks_work); 786579b62faSBrian Foster xfs_icache_free_eofblocks(mp, NULL); 787579b62faSBrian Foster xfs_queue_eofblocks(mp); 788579b62faSBrian Foster } 789579b62faSBrian Foster 7906d8b79cfSDave Chinner int 7916d8b79cfSDave Chinner xfs_inode_ag_iterator( 7926d8b79cfSDave Chinner struct xfs_mount *mp, 793e0094008SEric Sandeen int (*execute)(struct xfs_inode *ip, int flags, 794a454f742SBrian Foster 
void *args), 795a454f742SBrian Foster int flags, 796a454f742SBrian Foster void *args) 7976d8b79cfSDave Chinner { 7986d8b79cfSDave Chinner struct xfs_perag *pag; 7996d8b79cfSDave Chinner int error = 0; 8006d8b79cfSDave Chinner int last_error = 0; 8016d8b79cfSDave Chinner xfs_agnumber_t ag; 8026d8b79cfSDave Chinner 8036d8b79cfSDave Chinner ag = 0; 8046d8b79cfSDave Chinner while ((pag = xfs_perag_get(mp, ag))) { 8056d8b79cfSDave Chinner ag = pag->pag_agno + 1; 806a454f742SBrian Foster error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); 807a454f742SBrian Foster xfs_perag_put(pag); 808a454f742SBrian Foster if (error) { 809a454f742SBrian Foster last_error = error; 8102451337dSDave Chinner if (error == -EFSCORRUPTED) 811a454f742SBrian Foster break; 812a454f742SBrian Foster } 813a454f742SBrian Foster } 814b474c7aeSEric Sandeen return last_error; 815a454f742SBrian Foster } 816a454f742SBrian Foster 817a454f742SBrian Foster int 818a454f742SBrian Foster xfs_inode_ag_iterator_tag( 819a454f742SBrian Foster struct xfs_mount *mp, 820e0094008SEric Sandeen int (*execute)(struct xfs_inode *ip, int flags, 821a454f742SBrian Foster void *args), 822a454f742SBrian Foster int flags, 823a454f742SBrian Foster void *args, 824a454f742SBrian Foster int tag) 825a454f742SBrian Foster { 826a454f742SBrian Foster struct xfs_perag *pag; 827a454f742SBrian Foster int error = 0; 828a454f742SBrian Foster int last_error = 0; 829a454f742SBrian Foster xfs_agnumber_t ag; 830a454f742SBrian Foster 831a454f742SBrian Foster ag = 0; 832a454f742SBrian Foster while ((pag = xfs_perag_get_tag(mp, ag, tag))) { 833a454f742SBrian Foster ag = pag->pag_agno + 1; 834a454f742SBrian Foster error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); 8356d8b79cfSDave Chinner xfs_perag_put(pag); 8366d8b79cfSDave Chinner if (error) { 8376d8b79cfSDave Chinner last_error = error; 8382451337dSDave Chinner if (error == -EFSCORRUPTED) 8396d8b79cfSDave Chinner break; 8406d8b79cfSDave Chinner } 8416d8b79cfSDave Chinner } 
842b474c7aeSEric Sandeen return last_error; 8436d8b79cfSDave Chinner } 8446d8b79cfSDave Chinner 8456d8b79cfSDave Chinner /* 8466d8b79cfSDave Chinner * Grab the inode for reclaim exclusively. 8476d8b79cfSDave Chinner * Return 0 if we grabbed it, non-zero otherwise. 8486d8b79cfSDave Chinner */ 8496d8b79cfSDave Chinner STATIC int 8506d8b79cfSDave Chinner xfs_reclaim_inode_grab( 8516d8b79cfSDave Chinner struct xfs_inode *ip, 8526d8b79cfSDave Chinner int flags) 8536d8b79cfSDave Chinner { 8546d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 8556d8b79cfSDave Chinner 8566d8b79cfSDave Chinner /* quick check for stale RCU freed inode */ 8576d8b79cfSDave Chinner if (!ip->i_ino) 8586d8b79cfSDave Chinner return 1; 8596d8b79cfSDave Chinner 8606d8b79cfSDave Chinner /* 8616d8b79cfSDave Chinner * If we are asked for non-blocking operation, do unlocked checks to 8626d8b79cfSDave Chinner * see if the inode already is being flushed or in reclaim to avoid 8636d8b79cfSDave Chinner * lock traffic. 8646d8b79cfSDave Chinner */ 8656d8b79cfSDave Chinner if ((flags & SYNC_TRYLOCK) && 8666d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) 8676d8b79cfSDave Chinner return 1; 8686d8b79cfSDave Chinner 8696d8b79cfSDave Chinner /* 8706d8b79cfSDave Chinner * The radix tree lock here protects a thread in xfs_iget from racing 8716d8b79cfSDave Chinner * with us starting reclaim on the inode. Once we have the 8726d8b79cfSDave Chinner * XFS_IRECLAIM flag set it will not touch us. 8736d8b79cfSDave Chinner * 8746d8b79cfSDave Chinner * Due to RCU lookup, we may find inodes that have been freed and only 8756d8b79cfSDave Chinner * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 8766d8b79cfSDave Chinner * aren't candidates for reclaim at all, so we must check the 8776d8b79cfSDave Chinner * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 
8786d8b79cfSDave Chinner */ 8796d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 8806d8b79cfSDave Chinner if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 8816d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IRECLAIM)) { 8826d8b79cfSDave Chinner /* not a reclaim candidate. */ 8836d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 8846d8b79cfSDave Chinner return 1; 8856d8b79cfSDave Chinner } 8866d8b79cfSDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIM); 8876d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 8886d8b79cfSDave Chinner return 0; 8896d8b79cfSDave Chinner } 8906d8b79cfSDave Chinner 8916d8b79cfSDave Chinner /* 8926d8b79cfSDave Chinner * Inodes in different states need to be treated differently. The following 8936d8b79cfSDave Chinner * table lists the inode states and the reclaim actions necessary: 8946d8b79cfSDave Chinner * 8956d8b79cfSDave Chinner * inode state iflush ret required action 8966d8b79cfSDave Chinner * --------------- ---------- --------------- 8976d8b79cfSDave Chinner * bad - reclaim 8986d8b79cfSDave Chinner * shutdown EIO unpin and reclaim 8996d8b79cfSDave Chinner * clean, unpinned 0 reclaim 9006d8b79cfSDave Chinner * stale, unpinned 0 reclaim 9016d8b79cfSDave Chinner * clean, pinned(*) 0 requeue 9026d8b79cfSDave Chinner * stale, pinned EAGAIN requeue 9036d8b79cfSDave Chinner * dirty, async - requeue 9046d8b79cfSDave Chinner * dirty, sync 0 reclaim 9056d8b79cfSDave Chinner * 9066d8b79cfSDave Chinner * (*) dgc: I don't think the clean, pinned state is possible but it gets 9076d8b79cfSDave Chinner * handled anyway given the order of checks implemented. 9086d8b79cfSDave Chinner * 9096d8b79cfSDave Chinner * Also, because we get the flush lock first, we know that any inode that has 9106d8b79cfSDave Chinner * been flushed delwri has had the flush completed by the time we check that 9116d8b79cfSDave Chinner * the inode is clean. 
9126d8b79cfSDave Chinner * 9136d8b79cfSDave Chinner * Note that because the inode is flushed delayed write by AIL pushing, the 9146d8b79cfSDave Chinner * flush lock may already be held here and waiting on it can result in very 9156d8b79cfSDave Chinner * long latencies. Hence for sync reclaims, where we wait on the flush lock, 9166d8b79cfSDave Chinner * the caller should push the AIL first before trying to reclaim inodes to 9176d8b79cfSDave Chinner * minimise the amount of time spent waiting. For background relaim, we only 9186d8b79cfSDave Chinner * bother to reclaim clean inodes anyway. 9196d8b79cfSDave Chinner * 9206d8b79cfSDave Chinner * Hence the order of actions after gaining the locks should be: 9216d8b79cfSDave Chinner * bad => reclaim 9226d8b79cfSDave Chinner * shutdown => unpin and reclaim 9236d8b79cfSDave Chinner * pinned, async => requeue 9246d8b79cfSDave Chinner * pinned, sync => unpin 9256d8b79cfSDave Chinner * stale => reclaim 9266d8b79cfSDave Chinner * clean => reclaim 9276d8b79cfSDave Chinner * dirty, async => requeue 9286d8b79cfSDave Chinner * dirty, sync => flush, wait and reclaim 9296d8b79cfSDave Chinner */ 9306d8b79cfSDave Chinner STATIC int 9316d8b79cfSDave Chinner xfs_reclaim_inode( 9326d8b79cfSDave Chinner struct xfs_inode *ip, 9336d8b79cfSDave Chinner struct xfs_perag *pag, 9346d8b79cfSDave Chinner int sync_mode) 9356d8b79cfSDave Chinner { 9366d8b79cfSDave Chinner struct xfs_buf *bp = NULL; 9378a17d7ddSDave Chinner xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 9386d8b79cfSDave Chinner int error; 9396d8b79cfSDave Chinner 9406d8b79cfSDave Chinner restart: 9416d8b79cfSDave Chinner error = 0; 9426d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 9436d8b79cfSDave Chinner if (!xfs_iflock_nowait(ip)) { 9446d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 9456d8b79cfSDave Chinner goto out; 9466d8b79cfSDave Chinner xfs_iflock(ip); 9476d8b79cfSDave Chinner } 9486d8b79cfSDave Chinner 9496d8b79cfSDave Chinner if 
(XFS_FORCED_SHUTDOWN(ip->i_mount)) { 9506d8b79cfSDave Chinner xfs_iunpin_wait(ip); 9516d8b79cfSDave Chinner xfs_iflush_abort(ip, false); 9526d8b79cfSDave Chinner goto reclaim; 9536d8b79cfSDave Chinner } 9546d8b79cfSDave Chinner if (xfs_ipincount(ip)) { 9556d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 9566d8b79cfSDave Chinner goto out_ifunlock; 9576d8b79cfSDave Chinner xfs_iunpin_wait(ip); 9586d8b79cfSDave Chinner } 9596d8b79cfSDave Chinner if (xfs_iflags_test(ip, XFS_ISTALE)) 9606d8b79cfSDave Chinner goto reclaim; 9616d8b79cfSDave Chinner if (xfs_inode_clean(ip)) 9626d8b79cfSDave Chinner goto reclaim; 9636d8b79cfSDave Chinner 9646d8b79cfSDave Chinner /* 9656d8b79cfSDave Chinner * Never flush out dirty data during non-blocking reclaim, as it would 9666d8b79cfSDave Chinner * just contend with AIL pushing trying to do the same job. 9676d8b79cfSDave Chinner */ 9686d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 9696d8b79cfSDave Chinner goto out_ifunlock; 9706d8b79cfSDave Chinner 9716d8b79cfSDave Chinner /* 9726d8b79cfSDave Chinner * Now we have an inode that needs flushing. 9736d8b79cfSDave Chinner * 9746d8b79cfSDave Chinner * Note that xfs_iflush will never block on the inode buffer lock, as 9756d8b79cfSDave Chinner * xfs_ifree_cluster() can lock the inode buffer before it locks the 9766d8b79cfSDave Chinner * ip->i_lock, and we are doing the exact opposite here. As a result, 9776d8b79cfSDave Chinner * doing a blocking xfs_imap_to_bp() to get the cluster buffer would 9786d8b79cfSDave Chinner * result in an ABBA deadlock with xfs_ifree_cluster(). 9796d8b79cfSDave Chinner * 9806d8b79cfSDave Chinner * As xfs_ifree_cluser() must gather all inodes that are active in the 9816d8b79cfSDave Chinner * cache to mark them stale, if we hit this case we don't actually want 9826d8b79cfSDave Chinner * to do IO here - we want the inode marked stale so we can simply 9836d8b79cfSDave Chinner * reclaim it. 
Hence if we get an EAGAIN error here, just unlock the 9846d8b79cfSDave Chinner * inode, back off and try again. Hopefully the next pass through will 9856d8b79cfSDave Chinner * see the stale flag set on the inode. 9866d8b79cfSDave Chinner */ 9876d8b79cfSDave Chinner error = xfs_iflush(ip, &bp); 9882451337dSDave Chinner if (error == -EAGAIN) { 9896d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 9906d8b79cfSDave Chinner /* backoff longer than in xfs_ifree_cluster */ 9916d8b79cfSDave Chinner delay(2); 9926d8b79cfSDave Chinner goto restart; 9936d8b79cfSDave Chinner } 9946d8b79cfSDave Chinner 9956d8b79cfSDave Chinner if (!error) { 9966d8b79cfSDave Chinner error = xfs_bwrite(bp); 9976d8b79cfSDave Chinner xfs_buf_relse(bp); 9986d8b79cfSDave Chinner } 9996d8b79cfSDave Chinner 10006d8b79cfSDave Chinner xfs_iflock(ip); 10016d8b79cfSDave Chinner reclaim: 10028a17d7ddSDave Chinner /* 10038a17d7ddSDave Chinner * Because we use RCU freeing we need to ensure the inode always appears 10048a17d7ddSDave Chinner * to be reclaimed with an invalid inode number when in the free state. 10058a17d7ddSDave Chinner * We do this as early as possible under the ILOCK and flush lock so 10068a17d7ddSDave Chinner * that xfs_iflush_cluster() can be guaranteed to detect races with us 10078a17d7ddSDave Chinner * here. By doing this, we guarantee that once xfs_iflush_cluster has 10088a17d7ddSDave Chinner * locked both the XFS_ILOCK and the flush lock that it will see either 10098a17d7ddSDave Chinner * a valid, flushable inode that will serialise correctly against the 10108a17d7ddSDave Chinner * locks below, or it will see a clean (and invalid) inode that it can 10118a17d7ddSDave Chinner * skip. 
10128a17d7ddSDave Chinner */ 10138a17d7ddSDave Chinner spin_lock(&ip->i_flags_lock); 10148a17d7ddSDave Chinner ip->i_flags = XFS_IRECLAIM; 10158a17d7ddSDave Chinner ip->i_ino = 0; 10168a17d7ddSDave Chinner spin_unlock(&ip->i_flags_lock); 10178a17d7ddSDave Chinner 10186d8b79cfSDave Chinner xfs_ifunlock(ip); 10196d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 10206d8b79cfSDave Chinner 1021ff6d6af2SBill O'Donnell XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 10226d8b79cfSDave Chinner /* 10236d8b79cfSDave Chinner * Remove the inode from the per-AG radix tree. 10246d8b79cfSDave Chinner * 10256d8b79cfSDave Chinner * Because radix_tree_delete won't complain even if the item was never 10266d8b79cfSDave Chinner * added to the tree assert that it's been there before to catch 10276d8b79cfSDave Chinner * problems with the inode life time early on. 10286d8b79cfSDave Chinner */ 10296d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 10306d8b79cfSDave Chinner if (!radix_tree_delete(&pag->pag_ici_root, 10318a17d7ddSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ino))) 10326d8b79cfSDave Chinner ASSERT(0); 1033545c0889SDave Chinner xfs_perag_clear_reclaim_tag(pag); 10346d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 10356d8b79cfSDave Chinner 10366d8b79cfSDave Chinner /* 10376d8b79cfSDave Chinner * Here we do an (almost) spurious inode lock in order to coordinate 10386d8b79cfSDave Chinner * with inode cache radix tree lookups. This is because the lookup 10396d8b79cfSDave Chinner * can reference the inodes in the cache without taking references. 10406d8b79cfSDave Chinner * 10416d8b79cfSDave Chinner * We make that OK here by ensuring that we wait until the inode is 10426d8b79cfSDave Chinner * unlocked after the lookup before we go ahead and free it. 
10436d8b79cfSDave Chinner */ 10446d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 10456d8b79cfSDave Chinner xfs_qm_dqdetach(ip); 10466d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 10476d8b79cfSDave Chinner 10488a17d7ddSDave Chinner __xfs_inode_free(ip); 10496d8b79cfSDave Chinner return error; 10506d8b79cfSDave Chinner 10516d8b79cfSDave Chinner out_ifunlock: 10526d8b79cfSDave Chinner xfs_ifunlock(ip); 10536d8b79cfSDave Chinner out: 10546d8b79cfSDave Chinner xfs_iflags_clear(ip, XFS_IRECLAIM); 10556d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 10566d8b79cfSDave Chinner /* 10572451337dSDave Chinner * We could return -EAGAIN here to make reclaim rescan the inode tree in 10586d8b79cfSDave Chinner * a short while. However, this just burns CPU time scanning the tree 10596d8b79cfSDave Chinner * waiting for IO to complete and the reclaim work never goes back to 10606d8b79cfSDave Chinner * the idle state. Instead, return 0 to let the next scheduled 10616d8b79cfSDave Chinner * background reclaim attempt to reclaim the inode again. 10626d8b79cfSDave Chinner */ 10636d8b79cfSDave Chinner return 0; 10646d8b79cfSDave Chinner } 10656d8b79cfSDave Chinner 10666d8b79cfSDave Chinner /* 10676d8b79cfSDave Chinner * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 10686d8b79cfSDave Chinner * corrupted, we still want to try to reclaim all the inodes. If we don't, 10696d8b79cfSDave Chinner * then a shut down during filesystem unmount reclaim walk leak all the 10706d8b79cfSDave Chinner * unreclaimed inodes. 
10716d8b79cfSDave Chinner */ 107233479e05SDave Chinner STATIC int 10736d8b79cfSDave Chinner xfs_reclaim_inodes_ag( 10746d8b79cfSDave Chinner struct xfs_mount *mp, 10756d8b79cfSDave Chinner int flags, 10766d8b79cfSDave Chinner int *nr_to_scan) 10776d8b79cfSDave Chinner { 10786d8b79cfSDave Chinner struct xfs_perag *pag; 10796d8b79cfSDave Chinner int error = 0; 10806d8b79cfSDave Chinner int last_error = 0; 10816d8b79cfSDave Chinner xfs_agnumber_t ag; 10826d8b79cfSDave Chinner int trylock = flags & SYNC_TRYLOCK; 10836d8b79cfSDave Chinner int skipped; 10846d8b79cfSDave Chinner 10856d8b79cfSDave Chinner restart: 10866d8b79cfSDave Chinner ag = 0; 10876d8b79cfSDave Chinner skipped = 0; 10886d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 10896d8b79cfSDave Chinner unsigned long first_index = 0; 10906d8b79cfSDave Chinner int done = 0; 10916d8b79cfSDave Chinner int nr_found = 0; 10926d8b79cfSDave Chinner 10936d8b79cfSDave Chinner ag = pag->pag_agno + 1; 10946d8b79cfSDave Chinner 10956d8b79cfSDave Chinner if (trylock) { 10966d8b79cfSDave Chinner if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 10976d8b79cfSDave Chinner skipped++; 10986d8b79cfSDave Chinner xfs_perag_put(pag); 10996d8b79cfSDave Chinner continue; 11006d8b79cfSDave Chinner } 11016d8b79cfSDave Chinner first_index = pag->pag_ici_reclaim_cursor; 11026d8b79cfSDave Chinner } else 11036d8b79cfSDave Chinner mutex_lock(&pag->pag_ici_reclaim_lock); 11046d8b79cfSDave Chinner 11056d8b79cfSDave Chinner do { 11066d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 11076d8b79cfSDave Chinner int i; 11086d8b79cfSDave Chinner 11096d8b79cfSDave Chinner rcu_read_lock(); 11106d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup_tag( 11116d8b79cfSDave Chinner &pag->pag_ici_root, 11126d8b79cfSDave Chinner (void **)batch, first_index, 11136d8b79cfSDave Chinner XFS_LOOKUP_BATCH, 11146d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 11156d8b79cfSDave Chinner if (!nr_found) { 11166d8b79cfSDave 
Chinner done = 1; 11176d8b79cfSDave Chinner rcu_read_unlock(); 11186d8b79cfSDave Chinner break; 11196d8b79cfSDave Chinner } 11206d8b79cfSDave Chinner 11216d8b79cfSDave Chinner /* 11226d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 11236d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 11246d8b79cfSDave Chinner */ 11256d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 11266d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 11276d8b79cfSDave Chinner 11286d8b79cfSDave Chinner if (done || xfs_reclaim_inode_grab(ip, flags)) 11296d8b79cfSDave Chinner batch[i] = NULL; 11306d8b79cfSDave Chinner 11316d8b79cfSDave Chinner /* 11326d8b79cfSDave Chinner * Update the index for the next lookup. Catch 11336d8b79cfSDave Chinner * overflows into the next AG range which can 11346d8b79cfSDave Chinner * occur if we have inodes in the last block of 11356d8b79cfSDave Chinner * the AG and we are currently pointing to the 11366d8b79cfSDave Chinner * last inode. 11376d8b79cfSDave Chinner * 11386d8b79cfSDave Chinner * Because we may see inodes that are from the 11396d8b79cfSDave Chinner * wrong AG due to RCU freeing and 11406d8b79cfSDave Chinner * reallocation, only update the index if it 11416d8b79cfSDave Chinner * lies in this AG. It was a race that lead us 11426d8b79cfSDave Chinner * to see this inode, so another lookup from 11436d8b79cfSDave Chinner * the same index will not find it again. 11446d8b79cfSDave Chinner */ 11456d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 11466d8b79cfSDave Chinner pag->pag_agno) 11476d8b79cfSDave Chinner continue; 11486d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 11496d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 11506d8b79cfSDave Chinner done = 1; 11516d8b79cfSDave Chinner } 11526d8b79cfSDave Chinner 11536d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. 
*/ 11546d8b79cfSDave Chinner rcu_read_unlock(); 11556d8b79cfSDave Chinner 11566d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 11576d8b79cfSDave Chinner if (!batch[i]) 11586d8b79cfSDave Chinner continue; 11596d8b79cfSDave Chinner error = xfs_reclaim_inode(batch[i], pag, flags); 11602451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 11616d8b79cfSDave Chinner last_error = error; 11626d8b79cfSDave Chinner } 11636d8b79cfSDave Chinner 11646d8b79cfSDave Chinner *nr_to_scan -= XFS_LOOKUP_BATCH; 11656d8b79cfSDave Chinner 11666d8b79cfSDave Chinner cond_resched(); 11676d8b79cfSDave Chinner 11686d8b79cfSDave Chinner } while (nr_found && !done && *nr_to_scan > 0); 11696d8b79cfSDave Chinner 11706d8b79cfSDave Chinner if (trylock && !done) 11716d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = first_index; 11726d8b79cfSDave Chinner else 11736d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = 0; 11746d8b79cfSDave Chinner mutex_unlock(&pag->pag_ici_reclaim_lock); 11756d8b79cfSDave Chinner xfs_perag_put(pag); 11766d8b79cfSDave Chinner } 11776d8b79cfSDave Chinner 11786d8b79cfSDave Chinner /* 11796d8b79cfSDave Chinner * if we skipped any AG, and we still have scan count remaining, do 11806d8b79cfSDave Chinner * another pass this time using blocking reclaim semantics (i.e 11816d8b79cfSDave Chinner * waiting on the reclaim locks and ignoring the reclaim cursors). This 11826d8b79cfSDave Chinner * ensure that when we get more reclaimers than AGs we block rather 11836d8b79cfSDave Chinner * than spin trying to execute reclaim. 
11846d8b79cfSDave Chinner */ 11856d8b79cfSDave Chinner if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 11866d8b79cfSDave Chinner trylock = 0; 11876d8b79cfSDave Chinner goto restart; 11886d8b79cfSDave Chinner } 1189b474c7aeSEric Sandeen return last_error; 11906d8b79cfSDave Chinner } 11916d8b79cfSDave Chinner 11926d8b79cfSDave Chinner int 11936d8b79cfSDave Chinner xfs_reclaim_inodes( 11946d8b79cfSDave Chinner xfs_mount_t *mp, 11956d8b79cfSDave Chinner int mode) 11966d8b79cfSDave Chinner { 11976d8b79cfSDave Chinner int nr_to_scan = INT_MAX; 11986d8b79cfSDave Chinner 11996d8b79cfSDave Chinner return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 12006d8b79cfSDave Chinner } 12016d8b79cfSDave Chinner 12026d8b79cfSDave Chinner /* 12036d8b79cfSDave Chinner * Scan a certain number of inodes for reclaim. 12046d8b79cfSDave Chinner * 12056d8b79cfSDave Chinner * When called we make sure that there is a background (fast) inode reclaim in 12066d8b79cfSDave Chinner * progress, while we will throttle the speed of reclaim via doing synchronous 12076d8b79cfSDave Chinner * reclaim of inodes. That means if we come across dirty inodes, we wait for 12086d8b79cfSDave Chinner * them to be cleaned, which we hope will not be very long due to the 12096d8b79cfSDave Chinner * background walker having already kicked the IO off on those dirty inodes. 
12106d8b79cfSDave Chinner */ 12110a234c6dSDave Chinner long 12126d8b79cfSDave Chinner xfs_reclaim_inodes_nr( 12136d8b79cfSDave Chinner struct xfs_mount *mp, 12146d8b79cfSDave Chinner int nr_to_scan) 12156d8b79cfSDave Chinner { 12166d8b79cfSDave Chinner /* kick background reclaimer and push the AIL */ 12176d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 12186d8b79cfSDave Chinner xfs_ail_push_all(mp->m_ail); 12196d8b79cfSDave Chinner 12200a234c6dSDave Chinner return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 12216d8b79cfSDave Chinner } 12226d8b79cfSDave Chinner 12236d8b79cfSDave Chinner /* 12246d8b79cfSDave Chinner * Return the number of reclaimable inodes in the filesystem for 12256d8b79cfSDave Chinner * the shrinker to determine how much to reclaim. 12266d8b79cfSDave Chinner */ 12276d8b79cfSDave Chinner int 12286d8b79cfSDave Chinner xfs_reclaim_inodes_count( 12296d8b79cfSDave Chinner struct xfs_mount *mp) 12306d8b79cfSDave Chinner { 12316d8b79cfSDave Chinner struct xfs_perag *pag; 12326d8b79cfSDave Chinner xfs_agnumber_t ag = 0; 12336d8b79cfSDave Chinner int reclaimable = 0; 12346d8b79cfSDave Chinner 12356d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 12366d8b79cfSDave Chinner ag = pag->pag_agno + 1; 12376d8b79cfSDave Chinner reclaimable += pag->pag_ici_reclaimable; 12386d8b79cfSDave Chinner xfs_perag_put(pag); 12396d8b79cfSDave Chinner } 12406d8b79cfSDave Chinner return reclaimable; 12416d8b79cfSDave Chinner } 12426d8b79cfSDave Chinner 124341176a68SBrian Foster STATIC int 12443e3f9f58SBrian Foster xfs_inode_match_id( 12453e3f9f58SBrian Foster struct xfs_inode *ip, 12463e3f9f58SBrian Foster struct xfs_eofblocks *eofb) 12473e3f9f58SBrian Foster { 1248b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1249b9fe5052SDwight Engen !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 12503e3f9f58SBrian Foster return 0; 12511b556048SBrian Foster 1252b9fe5052SDwight Engen if ((eofb->eof_flags & 
XFS_EOF_FLAGS_GID) && 1253b9fe5052SDwight Engen !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 12541b556048SBrian Foster return 0; 12551b556048SBrian Foster 1256b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 12571b556048SBrian Foster xfs_get_projid(ip) != eofb->eof_prid) 12581b556048SBrian Foster return 0; 12591b556048SBrian Foster 12601b556048SBrian Foster return 1; 12613e3f9f58SBrian Foster } 12623e3f9f58SBrian Foster 1263f4526397SBrian Foster /* 1264f4526397SBrian Foster * A union-based inode filtering algorithm. Process the inode if any of the 1265f4526397SBrian Foster * criteria match. This is for global/internal scans only. 1266f4526397SBrian Foster */ 1267f4526397SBrian Foster STATIC int 1268f4526397SBrian Foster xfs_inode_match_id_union( 1269f4526397SBrian Foster struct xfs_inode *ip, 1270f4526397SBrian Foster struct xfs_eofblocks *eofb) 1271f4526397SBrian Foster { 1272f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1273f4526397SBrian Foster uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1274f4526397SBrian Foster return 1; 1275f4526397SBrian Foster 1276f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1277f4526397SBrian Foster gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1278f4526397SBrian Foster return 1; 1279f4526397SBrian Foster 1280f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1281f4526397SBrian Foster xfs_get_projid(ip) == eofb->eof_prid) 1282f4526397SBrian Foster return 1; 1283f4526397SBrian Foster 1284f4526397SBrian Foster return 0; 1285f4526397SBrian Foster } 1286f4526397SBrian Foster 12873e3f9f58SBrian Foster STATIC int 128841176a68SBrian Foster xfs_inode_free_eofblocks( 128941176a68SBrian Foster struct xfs_inode *ip, 129041176a68SBrian Foster int flags, 129141176a68SBrian Foster void *args) 129241176a68SBrian Foster { 129341176a68SBrian Foster int ret; 12943e3f9f58SBrian Foster struct xfs_eofblocks *eofb = args; 12955400da7dSBrian Foster bool need_iolock = true; 1296f4526397SBrian 
Foster int match; 12975400da7dSBrian Foster 12985400da7dSBrian Foster ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); 129941176a68SBrian Foster 130041176a68SBrian Foster if (!xfs_can_free_eofblocks(ip, false)) { 130141176a68SBrian Foster /* inode could be preallocated or append-only */ 130241176a68SBrian Foster trace_xfs_inode_free_eofblocks_invalid(ip); 130341176a68SBrian Foster xfs_inode_clear_eofblocks_tag(ip); 130441176a68SBrian Foster return 0; 130541176a68SBrian Foster } 130641176a68SBrian Foster 130741176a68SBrian Foster /* 130841176a68SBrian Foster * If the mapping is dirty the operation can block and wait for some 130941176a68SBrian Foster * time. Unless we are waiting, skip it. 131041176a68SBrian Foster */ 131141176a68SBrian Foster if (!(flags & SYNC_WAIT) && 131241176a68SBrian Foster mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 131341176a68SBrian Foster return 0; 131441176a68SBrian Foster 131500ca79a0SBrian Foster if (eofb) { 1316f4526397SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1317f4526397SBrian Foster match = xfs_inode_match_id_union(ip, eofb); 1318f4526397SBrian Foster else 1319f4526397SBrian Foster match = xfs_inode_match_id(ip, eofb); 1320f4526397SBrian Foster if (!match) 13213e3f9f58SBrian Foster return 0; 13223e3f9f58SBrian Foster 132300ca79a0SBrian Foster /* skip the inode if the file size is too small */ 132400ca79a0SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 132500ca79a0SBrian Foster XFS_ISIZE(ip) < eofb->eof_min_file_size) 132600ca79a0SBrian Foster return 0; 13275400da7dSBrian Foster 13285400da7dSBrian Foster /* 13295400da7dSBrian Foster * A scan owner implies we already hold the iolock. Skip it in 13305400da7dSBrian Foster * xfs_free_eofblocks() to avoid deadlock. This also eliminates 13315400da7dSBrian Foster * the possibility of EAGAIN being returned. 
13325400da7dSBrian Foster */ 13335400da7dSBrian Foster if (eofb->eof_scan_owner == ip->i_ino) 13345400da7dSBrian Foster need_iolock = false; 133500ca79a0SBrian Foster } 133600ca79a0SBrian Foster 13375400da7dSBrian Foster ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); 133841176a68SBrian Foster 133941176a68SBrian Foster /* don't revisit the inode if we're not waiting */ 13402451337dSDave Chinner if (ret == -EAGAIN && !(flags & SYNC_WAIT)) 134141176a68SBrian Foster ret = 0; 134241176a68SBrian Foster 134341176a68SBrian Foster return ret; 134441176a68SBrian Foster } 134541176a68SBrian Foster 134641176a68SBrian Foster int 134741176a68SBrian Foster xfs_icache_free_eofblocks( 134841176a68SBrian Foster struct xfs_mount *mp, 13498ca149deSBrian Foster struct xfs_eofblocks *eofb) 135041176a68SBrian Foster { 13518ca149deSBrian Foster int flags = SYNC_TRYLOCK; 13528ca149deSBrian Foster 13538ca149deSBrian Foster if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) 13548ca149deSBrian Foster flags = SYNC_WAIT; 13558ca149deSBrian Foster 135641176a68SBrian Foster return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, 13578ca149deSBrian Foster eofb, XFS_ICI_EOFBLOCKS_TAG); 135841176a68SBrian Foster } 135941176a68SBrian Foster 1360dc06f398SBrian Foster /* 1361dc06f398SBrian Foster * Run eofblocks scans on the quotas applicable to the inode. For inodes with 1362dc06f398SBrian Foster * multiple quotas, we don't know exactly which quota caused an allocation 1363dc06f398SBrian Foster * failure. We make a best effort by including each quota under low free space 1364dc06f398SBrian Foster * conditions (less than 1% free space) in the scan. 
1365dc06f398SBrian Foster */ 1366dc06f398SBrian Foster int 1367dc06f398SBrian Foster xfs_inode_free_quota_eofblocks( 1368dc06f398SBrian Foster struct xfs_inode *ip) 1369dc06f398SBrian Foster { 1370dc06f398SBrian Foster int scan = 0; 1371dc06f398SBrian Foster struct xfs_eofblocks eofb = {0}; 1372dc06f398SBrian Foster struct xfs_dquot *dq; 1373dc06f398SBrian Foster 1374dc06f398SBrian Foster ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1375dc06f398SBrian Foster 1376dc06f398SBrian Foster /* 1377dc06f398SBrian Foster * Set the scan owner to avoid a potential livelock. Otherwise, the scan 1378dc06f398SBrian Foster * can repeatedly trylock on the inode we're currently processing. We 1379dc06f398SBrian Foster * run a sync scan to increase effectiveness and use the union filter to 1380dc06f398SBrian Foster * cover all applicable quotas in a single scan. 1381dc06f398SBrian Foster */ 1382dc06f398SBrian Foster eofb.eof_scan_owner = ip->i_ino; 1383dc06f398SBrian Foster eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1384dc06f398SBrian Foster 1385dc06f398SBrian Foster if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1386dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_USER); 1387dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1388dc06f398SBrian Foster eofb.eof_uid = VFS_I(ip)->i_uid; 1389dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1390dc06f398SBrian Foster scan = 1; 1391dc06f398SBrian Foster } 1392dc06f398SBrian Foster } 1393dc06f398SBrian Foster 1394dc06f398SBrian Foster if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { 1395dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); 1396dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1397dc06f398SBrian Foster eofb.eof_gid = VFS_I(ip)->i_gid; 1398dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1399dc06f398SBrian Foster scan = 1; 1400dc06f398SBrian Foster } 1401dc06f398SBrian Foster } 1402dc06f398SBrian Foster 1403dc06f398SBrian Foster if (scan) 1404dc06f398SBrian Foster 
xfs_icache_free_eofblocks(ip->i_mount, &eofb); 1405dc06f398SBrian Foster 1406dc06f398SBrian Foster return scan; 1407dc06f398SBrian Foster } 1408dc06f398SBrian Foster 140927b52867SBrian Foster void 141027b52867SBrian Foster xfs_inode_set_eofblocks_tag( 141127b52867SBrian Foster xfs_inode_t *ip) 141227b52867SBrian Foster { 141327b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 141427b52867SBrian Foster struct xfs_perag *pag; 141527b52867SBrian Foster int tagged; 141627b52867SBrian Foster 1417*85a6e764SChristoph Hellwig /* 1418*85a6e764SChristoph Hellwig * Don't bother locking the AG and looking up in the radix trees 1419*85a6e764SChristoph Hellwig * if we already know that we have the tag set. 1420*85a6e764SChristoph Hellwig */ 1421*85a6e764SChristoph Hellwig if (ip->i_flags & XFS_IEOFBLOCKS) 1422*85a6e764SChristoph Hellwig return; 1423*85a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 1424*85a6e764SChristoph Hellwig ip->i_flags |= XFS_IEOFBLOCKS; 1425*85a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 1426*85a6e764SChristoph Hellwig 142727b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 142827b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 142927b52867SBrian Foster trace_xfs_inode_set_eofblocks_tag(ip); 143027b52867SBrian Foster 143127b52867SBrian Foster tagged = radix_tree_tagged(&pag->pag_ici_root, 143227b52867SBrian Foster XFS_ICI_EOFBLOCKS_TAG); 143327b52867SBrian Foster radix_tree_tag_set(&pag->pag_ici_root, 143427b52867SBrian Foster XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 143527b52867SBrian Foster XFS_ICI_EOFBLOCKS_TAG); 143627b52867SBrian Foster if (!tagged) { 143727b52867SBrian Foster /* propagate the eofblocks tag up into the perag radix tree */ 143827b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 143927b52867SBrian Foster radix_tree_tag_set(&ip->i_mount->m_perag_tree, 144027b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 144127b52867SBrian Foster XFS_ICI_EOFBLOCKS_TAG); 
144227b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 144327b52867SBrian Foster 1444579b62faSBrian Foster /* kick off background trimming */ 1445579b62faSBrian Foster xfs_queue_eofblocks(ip->i_mount); 1446579b62faSBrian Foster 144727b52867SBrian Foster trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, 144827b52867SBrian Foster -1, _RET_IP_); 144927b52867SBrian Foster } 145027b52867SBrian Foster 145127b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 145227b52867SBrian Foster xfs_perag_put(pag); 145327b52867SBrian Foster } 145427b52867SBrian Foster 145527b52867SBrian Foster void 145627b52867SBrian Foster xfs_inode_clear_eofblocks_tag( 145727b52867SBrian Foster xfs_inode_t *ip) 145827b52867SBrian Foster { 145927b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 146027b52867SBrian Foster struct xfs_perag *pag; 146127b52867SBrian Foster 1462*85a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 1463*85a6e764SChristoph Hellwig ip->i_flags &= ~XFS_IEOFBLOCKS; 1464*85a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 1465*85a6e764SChristoph Hellwig 146627b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 146727b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 146827b52867SBrian Foster trace_xfs_inode_clear_eofblocks_tag(ip); 146927b52867SBrian Foster 147027b52867SBrian Foster radix_tree_tag_clear(&pag->pag_ici_root, 147127b52867SBrian Foster XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 147227b52867SBrian Foster XFS_ICI_EOFBLOCKS_TAG); 147327b52867SBrian Foster if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { 147427b52867SBrian Foster /* clear the eofblocks tag from the perag radix tree */ 147527b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 147627b52867SBrian Foster radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 147727b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 147827b52867SBrian Foster XFS_ICI_EOFBLOCKS_TAG); 147927b52867SBrian Foster 
spin_unlock(&ip->i_mount->m_perag_lock); 148027b52867SBrian Foster trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, 148127b52867SBrian Foster -1, _RET_IP_); 148227b52867SBrian Foster } 148327b52867SBrian Foster 148427b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 148527b52867SBrian Foster xfs_perag_put(pag); 148627b52867SBrian Foster } 148727b52867SBrian Foster 1488