// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"

#include <linux/iversion.h>

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, 0);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	ip->i_cnextents = 0;
	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
		break;
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	if (ip->i_cowfp)
		xfs_idestroy_fork(ip, XFS_COW_FORK);

	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

static void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 * wait_on_inode to wait for these flags to be cleared
	 * instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into useable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool	wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble. Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;
		ip->i_sick = 0;
		ip->i_checked = 0;

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_iread(mp, tp, ip, flags);
	if (error)
		goto out_destroy;

	if (!xfs_inode_verify_forks(ip)) {
		error = -EFSCORRUPTED;
		goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		iflags |= XFS_IDONTCACHE;
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.
 * The inode is looked up in the cache held in each AG.
 * If the inode is found in the cache, initialise the vfs inode
 * if necessary.
 *
 * If it is not in core, read it in from the file system's device,
 * add it to the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * This flag parameter indicates how and if the inode's IO lock and inode lock
 * should be taken.
 *
 * mp -- the mount point structure for the current file system.  It points
 *       to the inode hash table.
 * tp -- a pointer to the current transaction if there is one.  This is
 *       simply passed through to the xfs_iread() call.
 * ino -- the number of the inode desired.  This is the unique identifier
 *        within the file system for the inode being requested.
 * lock_flags -- flags indicating how to lock the inode.  See the comment
 *		 for xfs_ilock() for a list of valid values.
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here.  It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.  This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_AGITER_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return -ENOENT;

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return -ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag,
	int			iter_flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == -1)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], flags, args);
			xfs_irele(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted.  */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_eofblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space. This is queued
 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 */
void
xfs_queue_cowblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_cowblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_cowblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_cowblocks(mp);
}

int
xfs_inode_ag_iterator_flags(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			iter_flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
					  iter_flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args)
{
	return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
}

int
xfs_inode_ag_iterator_tag(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
					  0);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check the
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state	     iflush ret		required action
 *	---------------      ----------		---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		/* xfs_iflush_abort() drops the flush lock */
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto reclaim;
	}

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	 * result in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it.  Hence if we get an EAGAIN error here, just unlock the
	 * inode, back off and try again.  Hopefully the next pass through will
Hopefully the next pass through will 11636d8b79cfSDave Chinner * see the stale flag set on the inode. 11646d8b79cfSDave Chinner */ 11656d8b79cfSDave Chinner error = xfs_iflush(ip, &bp); 11662451337dSDave Chinner if (error == -EAGAIN) { 11676d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11686d8b79cfSDave Chinner /* backoff longer than in xfs_ifree_cluster */ 11696d8b79cfSDave Chinner delay(2); 11706d8b79cfSDave Chinner goto restart; 11716d8b79cfSDave Chinner } 11726d8b79cfSDave Chinner 11736d8b79cfSDave Chinner if (!error) { 11746d8b79cfSDave Chinner error = xfs_bwrite(bp); 11756d8b79cfSDave Chinner xfs_buf_relse(bp); 11766d8b79cfSDave Chinner } 11776d8b79cfSDave Chinner 11786d8b79cfSDave Chinner reclaim: 117998efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 118098efe8afSBrian Foster 11818a17d7ddSDave Chinner /* 11828a17d7ddSDave Chinner * Because we use RCU freeing we need to ensure the inode always appears 11838a17d7ddSDave Chinner * to be reclaimed with an invalid inode number when in the free state. 118498efe8afSBrian Foster * We do this as early as possible under the ILOCK so that 1185f2e9ad21SOmar Sandoval * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 1186f2e9ad21SOmar Sandoval * detect races with us here. By doing this, we guarantee that once 1187f2e9ad21SOmar Sandoval * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 1188f2e9ad21SOmar Sandoval * it will see either a valid inode that will serialise correctly, or it 1189f2e9ad21SOmar Sandoval * will see an invalid inode that it can skip. 11908a17d7ddSDave Chinner */ 11918a17d7ddSDave Chinner spin_lock(&ip->i_flags_lock); 11928a17d7ddSDave Chinner ip->i_flags = XFS_IRECLAIM; 11938a17d7ddSDave Chinner ip->i_ino = 0; 11948a17d7ddSDave Chinner spin_unlock(&ip->i_flags_lock); 11958a17d7ddSDave Chinner 11966d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11976d8b79cfSDave Chinner 1198ff6d6af2SBill O'Donnell XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 11996d8b79cfSDave Chinner /* 12006d8b79cfSDave Chinner * Remove the inode from the per-AG radix tree. 12016d8b79cfSDave Chinner * 12026d8b79cfSDave Chinner * Because radix_tree_delete won't complain even if the item was never 12036d8b79cfSDave Chinner * added to the tree assert that it's been there before to catch 12046d8b79cfSDave Chinner * problems with the inode life time early on. 12056d8b79cfSDave Chinner */ 12066d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 12076d8b79cfSDave Chinner if (!radix_tree_delete(&pag->pag_ici_root, 12088a17d7ddSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ino))) 12096d8b79cfSDave Chinner ASSERT(0); 1210545c0889SDave Chinner xfs_perag_clear_reclaim_tag(pag); 12116d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 12126d8b79cfSDave Chinner 12136d8b79cfSDave Chinner /* 12146d8b79cfSDave Chinner * Here we do an (almost) spurious inode lock in order to coordinate 12156d8b79cfSDave Chinner * with inode cache radix tree lookups. This is because the lookup 12166d8b79cfSDave Chinner * can reference the inodes in the cache without taking references. 12176d8b79cfSDave Chinner * 12186d8b79cfSDave Chinner * We make that OK here by ensuring that we wait until the inode is 12196d8b79cfSDave Chinner * unlocked after the lookup before we go ahead and free it. 
12206d8b79cfSDave Chinner */ 12216d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 12226d8b79cfSDave Chinner xfs_qm_dqdetach(ip); 12236d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12246d8b79cfSDave Chinner 12258a17d7ddSDave Chinner __xfs_inode_free(ip); 12266d8b79cfSDave Chinner return error; 12276d8b79cfSDave Chinner 12286d8b79cfSDave Chinner out_ifunlock: 12296d8b79cfSDave Chinner xfs_ifunlock(ip); 12306d8b79cfSDave Chinner out: 12316d8b79cfSDave Chinner xfs_iflags_clear(ip, XFS_IRECLAIM); 12326d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12336d8b79cfSDave Chinner /* 12342451337dSDave Chinner * We could return -EAGAIN here to make reclaim rescan the inode tree in 12356d8b79cfSDave Chinner * a short while. However, this just burns CPU time scanning the tree 12366d8b79cfSDave Chinner * waiting for IO to complete and the reclaim work never goes back to 12376d8b79cfSDave Chinner * the idle state. Instead, return 0 to let the next scheduled 12386d8b79cfSDave Chinner * background reclaim attempt to reclaim the inode again. 12396d8b79cfSDave Chinner */ 12406d8b79cfSDave Chinner return 0; 12416d8b79cfSDave Chinner } 12426d8b79cfSDave Chinner 12436d8b79cfSDave Chinner /* 12446d8b79cfSDave Chinner * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 12456d8b79cfSDave Chinner * corrupted, we still want to try to reclaim all the inodes. If we don't, 12466d8b79cfSDave Chinner * then a shutdown during the filesystem unmount reclaim walk would leak all 12476d8b79cfSDave Chinner * the unreclaimed inodes. 12486d8b79cfSDave Chinner */ 124933479e05SDave Chinner STATIC int 12506d8b79cfSDave Chinner xfs_reclaim_inodes_ag( 12516d8b79cfSDave Chinner struct xfs_mount *mp, 12526d8b79cfSDave Chinner int flags, 12536d8b79cfSDave Chinner int *nr_to_scan) 12546d8b79cfSDave Chinner { 12556d8b79cfSDave Chinner struct xfs_perag *pag; 12566d8b79cfSDave Chinner int error = 0; 12576d8b79cfSDave Chinner int last_error = 0; 12586d8b79cfSDave Chinner xfs_agnumber_t ag; 12596d8b79cfSDave Chinner int trylock = flags & SYNC_TRYLOCK; 12606d8b79cfSDave Chinner int skipped; 12616d8b79cfSDave Chinner 12626d8b79cfSDave Chinner restart: 12636d8b79cfSDave Chinner ag = 0; 12646d8b79cfSDave Chinner skipped = 0; 12656d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 12666d8b79cfSDave Chinner unsigned long first_index = 0; 12676d8b79cfSDave Chinner int done = 0; 12686d8b79cfSDave Chinner int nr_found = 0; 12696d8b79cfSDave Chinner 12706d8b79cfSDave Chinner ag = pag->pag_agno + 1; 12716d8b79cfSDave Chinner 12726d8b79cfSDave Chinner if (trylock) { 12736d8b79cfSDave Chinner if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 12746d8b79cfSDave Chinner skipped++; 12756d8b79cfSDave Chinner xfs_perag_put(pag); 12766d8b79cfSDave Chinner continue; 12776d8b79cfSDave Chinner } 12786d8b79cfSDave Chinner first_index = pag->pag_ici_reclaim_cursor; 12796d8b79cfSDave Chinner } else 12806d8b79cfSDave Chinner mutex_lock(&pag->pag_ici_reclaim_lock); 12816d8b79cfSDave Chinner 12826d8b79cfSDave Chinner do { 12836d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 12846d8b79cfSDave Chinner int i; 12856d8b79cfSDave Chinner 12866d8b79cfSDave Chinner rcu_read_lock(); 12876d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup_tag( 12886d8b79cfSDave Chinner &pag->pag_ici_root, 12896d8b79cfSDave Chinner (void **)batch, first_index, 12906d8b79cfSDave Chinner XFS_LOOKUP_BATCH, 12916d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 12926d8b79cfSDave Chinner if (!nr_found) { 12936d8b79cfSDave Chinner done = 1;
12946d8b79cfSDave Chinner rcu_read_unlock(); 12956d8b79cfSDave Chinner break; 12966d8b79cfSDave Chinner } 12976d8b79cfSDave Chinner 12986d8b79cfSDave Chinner /* 12996d8b79cfSDave Chinner * Grab the inodes before we drop the lock. If we found 13006d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 13016d8b79cfSDave Chinner */ 13026d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 13036d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 13046d8b79cfSDave Chinner 13056d8b79cfSDave Chinner if (done || xfs_reclaim_inode_grab(ip, flags)) 13066d8b79cfSDave Chinner batch[i] = NULL; 13076d8b79cfSDave Chinner 13086d8b79cfSDave Chinner /* 13096d8b79cfSDave Chinner * Update the index for the next lookup. Catch 13106d8b79cfSDave Chinner * overflows into the next AG range which can 13116d8b79cfSDave Chinner * occur if we have inodes in the last block of 13126d8b79cfSDave Chinner * the AG and we are currently pointing to the 13136d8b79cfSDave Chinner * last inode. 13146d8b79cfSDave Chinner * 13156d8b79cfSDave Chinner * Because we may see inodes that are from the 13166d8b79cfSDave Chinner * wrong AG due to RCU freeing and 13176d8b79cfSDave Chinner * reallocation, only update the index if it 13186d8b79cfSDave Chinner * lies in this AG. It was a race that led us 13196d8b79cfSDave Chinner * to see this inode, so another lookup from 13206d8b79cfSDave Chinner * the same index will not find it again. 13216d8b79cfSDave Chinner */ 13226d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 13236d8b79cfSDave Chinner pag->pag_agno) 13246d8b79cfSDave Chinner continue; 13256d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 13266d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 13276d8b79cfSDave Chinner done = 1; 13286d8b79cfSDave Chinner } 13296d8b79cfSDave Chinner 13306d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 13316d8b79cfSDave Chinner rcu_read_unlock(); 13326d8b79cfSDave Chinner 13336d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 13346d8b79cfSDave Chinner if (!batch[i]) 13356d8b79cfSDave Chinner continue; 13366d8b79cfSDave Chinner error = xfs_reclaim_inode(batch[i], pag, flags); 13372451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 13386d8b79cfSDave Chinner last_error = error; 13396d8b79cfSDave Chinner } 13406d8b79cfSDave Chinner 13416d8b79cfSDave Chinner *nr_to_scan -= XFS_LOOKUP_BATCH; 13426d8b79cfSDave Chinner 13436d8b79cfSDave Chinner cond_resched(); 13446d8b79cfSDave Chinner 13456d8b79cfSDave Chinner } while (nr_found && !done && *nr_to_scan > 0); 13466d8b79cfSDave Chinner 13476d8b79cfSDave Chinner if (trylock && !done) 13486d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = first_index; 13496d8b79cfSDave Chinner else 13506d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = 0; 13516d8b79cfSDave Chinner mutex_unlock(&pag->pag_ici_reclaim_lock); 13526d8b79cfSDave Chinner xfs_perag_put(pag); 13536d8b79cfSDave Chinner } 13546d8b79cfSDave Chinner 13556d8b79cfSDave Chinner /* 13566d8b79cfSDave Chinner * If we skipped any AG, and we still have scan count remaining, do 13576d8b79cfSDave Chinner * another pass this time using blocking reclaim semantics (i.e. 13586d8b79cfSDave Chinner * waiting on the reclaim locks and ignoring the reclaim cursors). This 13596d8b79cfSDave Chinner * ensures that when we get more reclaimers than AGs we block rather 13606d8b79cfSDave Chinner * than spin trying to execute reclaim.
13616d8b79cfSDave Chinner */ 13626d8b79cfSDave Chinner if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 13636d8b79cfSDave Chinner trylock = 0; 13646d8b79cfSDave Chinner goto restart; 13656d8b79cfSDave Chinner } 1366b474c7aeSEric Sandeen return last_error; 13676d8b79cfSDave Chinner } 13686d8b79cfSDave Chinner 13696d8b79cfSDave Chinner int 13706d8b79cfSDave Chinner xfs_reclaim_inodes( 13716d8b79cfSDave Chinner xfs_mount_t *mp, 13726d8b79cfSDave Chinner int mode) 13736d8b79cfSDave Chinner { 13746d8b79cfSDave Chinner int nr_to_scan = INT_MAX; 13756d8b79cfSDave Chinner 13766d8b79cfSDave Chinner return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 13776d8b79cfSDave Chinner } 13786d8b79cfSDave Chinner 13796d8b79cfSDave Chinner /* 13806d8b79cfSDave Chinner * Scan a certain number of inodes for reclaim. 13816d8b79cfSDave Chinner * 13826d8b79cfSDave Chinner * When called we make sure that there is a background (fast) inode reclaim in 13836d8b79cfSDave Chinner * progress, while we will throttle the speed of reclaim via doing synchronous 13846d8b79cfSDave Chinner * reclaim of inodes. That means if we come across dirty inodes, we wait for 13856d8b79cfSDave Chinner * them to be cleaned, which we hope will not be very long due to the 13866d8b79cfSDave Chinner * background walker having already kicked the IO off on those dirty inodes. 13876d8b79cfSDave Chinner */ 13880a234c6dSDave Chinner long 13896d8b79cfSDave Chinner xfs_reclaim_inodes_nr( 13906d8b79cfSDave Chinner struct xfs_mount *mp, 13916d8b79cfSDave Chinner int nr_to_scan) 13926d8b79cfSDave Chinner { 13936d8b79cfSDave Chinner /* kick background reclaimer and push the AIL */ 13946d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 13956d8b79cfSDave Chinner xfs_ail_push_all(mp->m_ail); 13966d8b79cfSDave Chinner 13970a234c6dSDave Chinner return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 13986d8b79cfSDave Chinner } 13996d8b79cfSDave Chinner 14006d8b79cfSDave Chinner /* 14016d8b79cfSDave Chinner * Return the number of reclaimable inodes in the filesystem for 14026d8b79cfSDave Chinner * the shrinker to determine how much to reclaim. 
14036d8b79cfSDave Chinner */ 14046d8b79cfSDave Chinner int 14056d8b79cfSDave Chinner xfs_reclaim_inodes_count( 14066d8b79cfSDave Chinner struct xfs_mount *mp) 14076d8b79cfSDave Chinner { 14086d8b79cfSDave Chinner struct xfs_perag *pag; 14096d8b79cfSDave Chinner xfs_agnumber_t ag = 0; 14106d8b79cfSDave Chinner int reclaimable = 0; 14116d8b79cfSDave Chinner 14126d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 14136d8b79cfSDave Chinner ag = pag->pag_agno + 1; 14146d8b79cfSDave Chinner reclaimable += pag->pag_ici_reclaimable; 14156d8b79cfSDave Chinner xfs_perag_put(pag); 14166d8b79cfSDave Chinner } 14176d8b79cfSDave Chinner return reclaimable; 14186d8b79cfSDave Chinner } 14196d8b79cfSDave Chinner 142041176a68SBrian Foster STATIC int 14213e3f9f58SBrian Foster xfs_inode_match_id( 14223e3f9f58SBrian Foster struct xfs_inode *ip, 14233e3f9f58SBrian Foster struct xfs_eofblocks *eofb) 14243e3f9f58SBrian Foster { 1425b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1426b9fe5052SDwight Engen !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 14273e3f9f58SBrian Foster return 0; 14281b556048SBrian Foster 1429b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1430b9fe5052SDwight Engen !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 14311b556048SBrian Foster return 0; 14321b556048SBrian Foster 1433b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1434de7a866fSChristoph Hellwig ip->i_d.di_projid != eofb->eof_prid) 14351b556048SBrian Foster return 0; 14361b556048SBrian Foster 14371b556048SBrian Foster return 1; 14383e3f9f58SBrian Foster } 14393e3f9f58SBrian Foster 1440f4526397SBrian Foster /* 1441f4526397SBrian Foster * A union-based inode filtering algorithm. Process the inode if any of the 1442f4526397SBrian Foster * criteria match. This is for global/internal scans only. 
1443f4526397SBrian Foster */ 1444f4526397SBrian Foster STATIC int 1445f4526397SBrian Foster xfs_inode_match_id_union( 1446f4526397SBrian Foster struct xfs_inode *ip, 1447f4526397SBrian Foster struct xfs_eofblocks *eofb) 1448f4526397SBrian Foster { 1449f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1450f4526397SBrian Foster uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1451f4526397SBrian Foster return 1; 1452f4526397SBrian Foster 1453f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1454f4526397SBrian Foster gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1455f4526397SBrian Foster return 1; 1456f4526397SBrian Foster 1457f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1458de7a866fSChristoph Hellwig ip->i_d.di_projid == eofb->eof_prid) 1459f4526397SBrian Foster return 1; 1460f4526397SBrian Foster 1461f4526397SBrian Foster return 0; 1462f4526397SBrian Foster } 1463f4526397SBrian Foster 14643e3f9f58SBrian Foster STATIC int 146541176a68SBrian Foster xfs_inode_free_eofblocks( 146641176a68SBrian Foster struct xfs_inode *ip, 146741176a68SBrian Foster int flags, 146841176a68SBrian Foster void *args) 146941176a68SBrian Foster { 1470a36b9261SBrian Foster int ret = 0; 14713e3f9f58SBrian Foster struct xfs_eofblocks *eofb = args; 1472f4526397SBrian Foster int match; 14735400da7dSBrian Foster 147441176a68SBrian Foster if (!xfs_can_free_eofblocks(ip, false)) { 147541176a68SBrian Foster /* inode could be preallocated or append-only */ 147641176a68SBrian Foster trace_xfs_inode_free_eofblocks_invalid(ip); 147741176a68SBrian Foster xfs_inode_clear_eofblocks_tag(ip); 147841176a68SBrian Foster return 0; 147941176a68SBrian Foster } 148041176a68SBrian Foster 148141176a68SBrian Foster /* 148241176a68SBrian Foster * If the mapping is dirty the operation can block and wait for some 148341176a68SBrian Foster * time. Unless we are waiting, skip it. 148441176a68SBrian Foster */ 148541176a68SBrian Foster if (!(flags & SYNC_WAIT) && 148641176a68SBrian Foster mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 148741176a68SBrian Foster return 0; 148841176a68SBrian Foster 148900ca79a0SBrian Foster if (eofb) { 1490f4526397SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1491f4526397SBrian Foster match = xfs_inode_match_id_union(ip, eofb); 1492f4526397SBrian Foster else 1493f4526397SBrian Foster match = xfs_inode_match_id(ip, eofb); 1494f4526397SBrian Foster if (!match) 14953e3f9f58SBrian Foster return 0; 14963e3f9f58SBrian Foster 149700ca79a0SBrian Foster /* skip the inode if the file size is too small */ 149800ca79a0SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 149900ca79a0SBrian Foster XFS_ISIZE(ip) < eofb->eof_min_file_size) 150000ca79a0SBrian Foster return 0; 150100ca79a0SBrian Foster } 150200ca79a0SBrian Foster 1503a36b9261SBrian Foster /* 1504a36b9261SBrian Foster * If the caller is waiting, return -EAGAIN to keep the background 1505a36b9261SBrian Foster * scanner moving and revisit the inode in a subsequent pass. 1506a36b9261SBrian Foster */ 1507c3155097SBrian Foster if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1508a36b9261SBrian Foster if (flags & SYNC_WAIT) 1509a36b9261SBrian Foster ret = -EAGAIN; 1510a36b9261SBrian Foster return ret; 1511a36b9261SBrian Foster } 1512a36b9261SBrian Foster ret = xfs_free_eofblocks(ip); 1513a36b9261SBrian Foster xfs_iunlock(ip, XFS_IOLOCK_EXCL); 151441176a68SBrian Foster 151541176a68SBrian Foster return ret; 151641176a68SBrian Foster } 151741176a68SBrian Foster 151883104d44SDarrick J. 
Wong static int 151983104d44SDarrick J. Wong __xfs_icache_free_eofblocks( 152041176a68SBrian Foster struct xfs_mount *mp, 152183104d44SDarrick J. Wong struct xfs_eofblocks *eofb, 152283104d44SDarrick J. Wong int (*execute)(struct xfs_inode *ip, int flags, 152383104d44SDarrick J. Wong void *args), 152483104d44SDarrick J. Wong int tag) 152541176a68SBrian Foster { 15268ca149deSBrian Foster int flags = SYNC_TRYLOCK; 15278ca149deSBrian Foster 15288ca149deSBrian Foster if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) 15298ca149deSBrian Foster flags = SYNC_WAIT; 15308ca149deSBrian Foster 153183104d44SDarrick J. Wong return xfs_inode_ag_iterator_tag(mp, execute, flags, 153283104d44SDarrick J. Wong eofb, tag); 153383104d44SDarrick J. Wong } 153483104d44SDarrick J. Wong 153583104d44SDarrick J. Wong int 153683104d44SDarrick J. Wong xfs_icache_free_eofblocks( 153783104d44SDarrick J. Wong struct xfs_mount *mp, 153883104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 153983104d44SDarrick J. Wong { 154083104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, 154183104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 154241176a68SBrian Foster } 154341176a68SBrian Foster 1544dc06f398SBrian Foster /* 1545dc06f398SBrian Foster * Run eofblocks scans on the quotas applicable to the inode. For inodes with 1546dc06f398SBrian Foster * multiple quotas, we don't know exactly which quota caused an allocation 1547dc06f398SBrian Foster * failure. We make a best effort by including each quota under low free space 1548dc06f398SBrian Foster * conditions (less than 1% free space) in the scan. 1549dc06f398SBrian Foster */ 155083104d44SDarrick J. Wong static int 155183104d44SDarrick J. Wong __xfs_inode_free_quota_eofblocks( 155283104d44SDarrick J. Wong struct xfs_inode *ip, 155383104d44SDarrick J. Wong int (*execute)(struct xfs_mount *mp, 155483104d44SDarrick J. Wong struct xfs_eofblocks *eofb)) 1555dc06f398SBrian Foster { 1556dc06f398SBrian Foster int scan = 0; 1557dc06f398SBrian Foster struct xfs_eofblocks eofb = {0}; 1558dc06f398SBrian Foster struct xfs_dquot *dq; 1559dc06f398SBrian Foster 1560dc06f398SBrian Foster /* 1561c3155097SBrian Foster * Run a sync scan to increase effectiveness and use the union filter to 1562dc06f398SBrian Foster * cover all applicable quotas in a single scan. 1563dc06f398SBrian Foster */ 1564dc06f398SBrian Foster eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1565dc06f398SBrian Foster 1566dc06f398SBrian Foster if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1567dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_USER); 1568dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1569dc06f398SBrian Foster eofb.eof_uid = VFS_I(ip)->i_uid; 1570dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1571dc06f398SBrian Foster scan = 1; 1572dc06f398SBrian Foster } 1573dc06f398SBrian Foster } 1574dc06f398SBrian Foster 1575dc06f398SBrian Foster if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { 1576dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); 1577dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1578dc06f398SBrian Foster eofb.eof_gid = VFS_I(ip)->i_gid; 1579dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1580dc06f398SBrian Foster scan = 1; 1581dc06f398SBrian Foster } 1582dc06f398SBrian Foster } 1583dc06f398SBrian Foster 1584dc06f398SBrian Foster if (scan) 158583104d44SDarrick J. 
Wong execute(ip->i_mount, &eofb); 1586dc06f398SBrian Foster 1587dc06f398SBrian Foster return scan; 1588dc06f398SBrian Foster } 1589dc06f398SBrian Foster 159083104d44SDarrick J. Wong int 159183104d44SDarrick J. Wong xfs_inode_free_quota_eofblocks( 159283104d44SDarrick J. Wong struct xfs_inode *ip) 159383104d44SDarrick J. Wong { 159483104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); 159583104d44SDarrick J. Wong } 159683104d44SDarrick J. Wong 159791aae6beSDarrick J. Wong static inline unsigned long 159891aae6beSDarrick J. Wong xfs_iflag_for_tag( 159991aae6beSDarrick J. Wong int tag) 160091aae6beSDarrick J. Wong { 160191aae6beSDarrick J. Wong switch (tag) { 160291aae6beSDarrick J. Wong case XFS_ICI_EOFBLOCKS_TAG: 160391aae6beSDarrick J. Wong return XFS_IEOFBLOCKS; 160491aae6beSDarrick J. Wong case XFS_ICI_COWBLOCKS_TAG: 160591aae6beSDarrick J. Wong return XFS_ICOWBLOCKS; 160691aae6beSDarrick J. Wong default: 160791aae6beSDarrick J. Wong ASSERT(0); 160891aae6beSDarrick J. Wong return 0; 160991aae6beSDarrick J. Wong } 161091aae6beSDarrick J. Wong } 161191aae6beSDarrick J. Wong 161283104d44SDarrick J. Wong static void 161391aae6beSDarrick J. Wong __xfs_inode_set_blocks_tag( 161483104d44SDarrick J. Wong xfs_inode_t *ip, 161583104d44SDarrick J. Wong void (*execute)(struct xfs_mount *mp), 161683104d44SDarrick J. Wong void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 161783104d44SDarrick J. Wong int error, unsigned long caller_ip), 161883104d44SDarrick J. Wong int tag) 161927b52867SBrian Foster { 162027b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 162127b52867SBrian Foster struct xfs_perag *pag; 162227b52867SBrian Foster int tagged; 162327b52867SBrian Foster 162485a6e764SChristoph Hellwig /* 162585a6e764SChristoph Hellwig * Don't bother locking the AG and looking up in the radix trees 162685a6e764SChristoph Hellwig * if we already know that we have the tag set. 162785a6e764SChristoph Hellwig */ 162891aae6beSDarrick J. Wong if (ip->i_flags & xfs_iflag_for_tag(tag)) 162985a6e764SChristoph Hellwig return; 163085a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 163191aae6beSDarrick J. Wong ip->i_flags |= xfs_iflag_for_tag(tag); 163285a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 163385a6e764SChristoph Hellwig 163427b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 163527b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 163627b52867SBrian Foster 163783104d44SDarrick J. Wong tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 163827b52867SBrian Foster radix_tree_tag_set(&pag->pag_ici_root, 163983104d44SDarrick J. Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 164027b52867SBrian Foster if (!tagged) { 164127b52867SBrian Foster /* propagate the eofblocks tag up into the perag radix tree */ 164227b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 164327b52867SBrian Foster radix_tree_tag_set(&ip->i_mount->m_perag_tree, 164427b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 164583104d44SDarrick J. Wong tag); 164627b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 164727b52867SBrian Foster 1648579b62faSBrian Foster /* kick off background trimming */ 164983104d44SDarrick J. Wong execute(ip->i_mount); 1650579b62faSBrian Foster 165183104d44SDarrick J. 
Wong set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 165227b52867SBrian Foster } 165327b52867SBrian Foster 165427b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 165527b52867SBrian Foster xfs_perag_put(pag); 165627b52867SBrian Foster } 165727b52867SBrian Foster 165827b52867SBrian Foster void 165983104d44SDarrick J. Wong xfs_inode_set_eofblocks_tag( 166027b52867SBrian Foster xfs_inode_t *ip) 166127b52867SBrian Foster { 166283104d44SDarrick J. Wong trace_xfs_inode_set_eofblocks_tag(ip); 166391aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, 166483104d44SDarrick J. Wong trace_xfs_perag_set_eofblocks, 166583104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 166683104d44SDarrick J. Wong } 166783104d44SDarrick J. Wong 166883104d44SDarrick J. Wong static void 166991aae6beSDarrick J. Wong __xfs_inode_clear_blocks_tag( 167083104d44SDarrick J. Wong xfs_inode_t *ip, 167183104d44SDarrick J. Wong void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 167283104d44SDarrick J. Wong int error, unsigned long caller_ip), 167383104d44SDarrick J. Wong int tag) 167483104d44SDarrick J. Wong { 167527b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 167627b52867SBrian Foster struct xfs_perag *pag; 167727b52867SBrian Foster 167885a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 167991aae6beSDarrick J. Wong ip->i_flags &= ~xfs_iflag_for_tag(tag); 168085a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 168185a6e764SChristoph Hellwig 168227b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 168327b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 168427b52867SBrian Foster 168527b52867SBrian Foster radix_tree_tag_clear(&pag->pag_ici_root, 168683104d44SDarrick J. Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 168783104d44SDarrick J. Wong if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { 168827b52867SBrian Foster /* clear the eofblocks tag from the perag radix tree */ 168927b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 169027b52867SBrian Foster radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 169127b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 169283104d44SDarrick J. Wong tag); 169327b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 169483104d44SDarrick J. Wong clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 169527b52867SBrian Foster } 169627b52867SBrian Foster 169727b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 169827b52867SBrian Foster xfs_perag_put(pag); 169927b52867SBrian Foster } 170027b52867SBrian Foster 170183104d44SDarrick J. Wong void 170283104d44SDarrick J. Wong xfs_inode_clear_eofblocks_tag( 170383104d44SDarrick J. Wong xfs_inode_t *ip) 170483104d44SDarrick J. Wong { 170583104d44SDarrick J. Wong trace_xfs_inode_clear_eofblocks_tag(ip); 170691aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 170783104d44SDarrick J. Wong trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); 170883104d44SDarrick J. Wong } 170983104d44SDarrick J. Wong 171083104d44SDarrick J. Wong /* 1711be78ff0eSDarrick J. Wong * Set ourselves up to free CoW blocks from this file. If it's already clean 1712be78ff0eSDarrick J. Wong * then we can bail out quickly, but otherwise we must back off if the file 1713be78ff0eSDarrick J. Wong * is undergoing some kind of write. 1714be78ff0eSDarrick J. Wong */ 1715be78ff0eSDarrick J. Wong static bool 1716be78ff0eSDarrick J. Wong xfs_prep_free_cowblocks( 171751d62690SChristoph Hellwig struct xfs_inode *ip) 1718be78ff0eSDarrick J. 
Wong { 1719be78ff0eSDarrick J. Wong /* 1720be78ff0eSDarrick J. Wong * Just clear the tag if we have an empty cow fork or none at all. It's 1721be78ff0eSDarrick J. Wong * possible the inode was fully unshared since it was originally tagged. 1722be78ff0eSDarrick J. Wong */ 172351d62690SChristoph Hellwig if (!xfs_inode_has_cow_data(ip)) { 1724be78ff0eSDarrick J. Wong trace_xfs_inode_free_cowblocks_invalid(ip); 1725be78ff0eSDarrick J. Wong xfs_inode_clear_cowblocks_tag(ip); 1726be78ff0eSDarrick J. Wong return false; 1727be78ff0eSDarrick J. Wong } 1728be78ff0eSDarrick J. Wong 1729be78ff0eSDarrick J. Wong /* 1730be78ff0eSDarrick J. Wong * If the mapping is dirty or under writeback we cannot touch the 1731be78ff0eSDarrick J. Wong * CoW fork. Leave it alone if we're in the midst of a directio. 1732be78ff0eSDarrick J. Wong */ 1733be78ff0eSDarrick J. Wong if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1734be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1735be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1736be78ff0eSDarrick J. Wong atomic_read(&VFS_I(ip)->i_dio_count)) 1737be78ff0eSDarrick J. Wong return false; 1738be78ff0eSDarrick J. Wong 1739be78ff0eSDarrick J. Wong return true; 1740be78ff0eSDarrick J. Wong } 1741be78ff0eSDarrick J. Wong 1742be78ff0eSDarrick J. Wong /* 174383104d44SDarrick J. Wong * Automatic CoW Reservation Freeing 174483104d44SDarrick J. Wong * 174583104d44SDarrick J. Wong * These functions automatically garbage collect leftover CoW reservations 174683104d44SDarrick J. Wong * that were made on behalf of a cowextsize hint when we start to run out 174783104d44SDarrick J. Wong * of quota or when the reservations sit around for too long. If the file 174883104d44SDarrick J. Wong * has dirty pages or is undergoing writeback, its CoW reservations will 174983104d44SDarrick J. Wong * be retained. 175083104d44SDarrick J. Wong * 175183104d44SDarrick J. Wong * The actual garbage collection piggybacks off the same code that runs 175283104d44SDarrick J. Wong * the speculative EOF preallocation garbage collector. 175383104d44SDarrick J. Wong */ 175483104d44SDarrick J. Wong STATIC int 175583104d44SDarrick J. Wong xfs_inode_free_cowblocks( 175683104d44SDarrick J. Wong struct xfs_inode *ip, 175783104d44SDarrick J. Wong int flags, 175883104d44SDarrick J. Wong void *args) 175983104d44SDarrick J. Wong { 176083104d44SDarrick J. Wong struct xfs_eofblocks *eofb = args; 1761be78ff0eSDarrick J. Wong int match; 1762be78ff0eSDarrick J. Wong int ret = 0; 176383104d44SDarrick J. Wong 176451d62690SChristoph Hellwig if (!xfs_prep_free_cowblocks(ip)) 176583104d44SDarrick J. Wong return 0; 176683104d44SDarrick J. Wong 176783104d44SDarrick J. Wong if (eofb) { 176883104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 176983104d44SDarrick J. Wong match = xfs_inode_match_id_union(ip, eofb); 177083104d44SDarrick J. Wong else 177183104d44SDarrick J. Wong match = xfs_inode_match_id(ip, eofb); 177283104d44SDarrick J. Wong if (!match) 177383104d44SDarrick J. Wong return 0; 177483104d44SDarrick J. Wong 177583104d44SDarrick J. Wong /* skip the inode if the file size is too small */ 177683104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 177783104d44SDarrick J. Wong XFS_ISIZE(ip) < eofb->eof_min_file_size) 177883104d44SDarrick J. Wong return 0; 177983104d44SDarrick J. Wong } 178083104d44SDarrick J. Wong 178183104d44SDarrick J. Wong /* Free the CoW blocks */ 178283104d44SDarrick J. 
Wong xfs_ilock(ip, XFS_IOLOCK_EXCL); 178383104d44SDarrick J. Wong xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 178483104d44SDarrick J. Wong 1785be78ff0eSDarrick J. Wong /* 1786be78ff0eSDarrick J. Wong * Check again, nobody else should be able to dirty blocks or change 1787be78ff0eSDarrick J. Wong * the reflink iflag now that we have the first two locks held. 1788be78ff0eSDarrick J. Wong */ 178951d62690SChristoph Hellwig if (xfs_prep_free_cowblocks(ip)) 17903802a345SChristoph Hellwig ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 179183104d44SDarrick J. Wong 179283104d44SDarrick J. Wong xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 179383104d44SDarrick J. Wong xfs_iunlock(ip, XFS_IOLOCK_EXCL); 179483104d44SDarrick J. Wong 179583104d44SDarrick J. Wong return ret; 179683104d44SDarrick J. Wong } 179783104d44SDarrick J. Wong 179883104d44SDarrick J. Wong int 179983104d44SDarrick J. Wong xfs_icache_free_cowblocks( 180083104d44SDarrick J. Wong struct xfs_mount *mp, 180183104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 180283104d44SDarrick J. Wong { 180383104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, 180483104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 180583104d44SDarrick J. Wong } 180683104d44SDarrick J. Wong 180783104d44SDarrick J. Wong int 180883104d44SDarrick J. Wong xfs_inode_free_quota_cowblocks( 180983104d44SDarrick J. Wong struct xfs_inode *ip) 181083104d44SDarrick J. Wong { 181183104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); 181283104d44SDarrick J. Wong } 181383104d44SDarrick J. Wong 181483104d44SDarrick J. Wong void 181583104d44SDarrick J. Wong xfs_inode_set_cowblocks_tag( 181683104d44SDarrick J. Wong xfs_inode_t *ip) 181783104d44SDarrick J. Wong { 18187b7381f0SBrian Foster trace_xfs_inode_set_cowblocks_tag(ip); 181991aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, 18207b7381f0SBrian Foster trace_xfs_perag_set_cowblocks, 182183104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 182283104d44SDarrick J. Wong } 182383104d44SDarrick J. Wong 182483104d44SDarrick J. Wong void 182583104d44SDarrick J. Wong xfs_inode_clear_cowblocks_tag( 182683104d44SDarrick J. Wong xfs_inode_t *ip) 182783104d44SDarrick J. Wong { 18287b7381f0SBrian Foster trace_xfs_inode_clear_cowblocks_tag(ip); 182991aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 18307b7381f0SBrian Foster trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); 183183104d44SDarrick J. Wong } 1832d6b636ebSDarrick J. Wong 1833d6b636ebSDarrick J. Wong /* Disable post-EOF and CoW block auto-reclamation. */ 1834d6b636ebSDarrick J. Wong void 1835ed30dcbdSDarrick J. Wong xfs_stop_block_reaping( 1836d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1837d6b636ebSDarrick J. Wong { 1838d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_eofblocks_work); 1839d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_cowblocks_work); 1840d6b636ebSDarrick J. Wong } 1841d6b636ebSDarrick J. Wong 1842d6b636ebSDarrick J. Wong /* Enable post-EOF and CoW block auto-reclamation. */ 1843d6b636ebSDarrick J. Wong void 1844ed30dcbdSDarrick J. Wong xfs_start_block_reaping( 1845d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1846d6b636ebSDarrick J. Wong { 1847d6b636ebSDarrick J. Wong xfs_queue_eofblocks(mp); 1848d6b636ebSDarrick J. Wong xfs_queue_cowblocks(mp); 1849d6b636ebSDarrick J. Wong } 1850
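
/*
 * Illustrative sketch only (not part of the original source): one way a
 * caller could combine the scan entry points above to synchronously free
 * both speculative post-EOF blocks and leftover CoW reservations for a
 * single user, similar to what __xfs_inode_free_quota_eofblocks() sets up
 * when a quota runs low.  The function name below is hypothetical; the
 * struct xfs_eofblocks fields, the XFS_EOF_FLAGS_* values and the
 * xfs_icache_free_*() helpers are the ones used in this file.
 */
static int
example_free_speculative_blocks_for_uid(
	struct xfs_mount	*mp,
	kuid_t			uid)
{
	struct xfs_eofblocks	eofb = {0};
	int			error;

	/* filter on the uid and wait on dirty mappings rather than skip them */
	eofb.eof_flags = XFS_EOF_FLAGS_UID | XFS_EOF_FLAGS_SYNC;
	eofb.eof_uid = uid;

	/* trim post-EOF preallocations first, then any leftover CoW blocks */
	error = xfs_icache_free_eofblocks(mp, &eofb);
	if (error)
		return error;
	return xfs_icache_free_cowblocks(mp, &eofb);
}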