10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0 26d8b79cfSDave Chinner /* 36d8b79cfSDave Chinner * Copyright (c) 2000-2005 Silicon Graphics, Inc. 46d8b79cfSDave Chinner * All Rights Reserved. 56d8b79cfSDave Chinner */ 66d8b79cfSDave Chinner #include "xfs.h" 76d8b79cfSDave Chinner #include "xfs_fs.h" 85467b34bSDarrick J. Wong #include "xfs_shared.h" 96ca1c906SDave Chinner #include "xfs_format.h" 10239880efSDave Chinner #include "xfs_log_format.h" 11239880efSDave Chinner #include "xfs_trans_resv.h" 126d8b79cfSDave Chinner #include "xfs_sb.h" 136d8b79cfSDave Chinner #include "xfs_mount.h" 146d8b79cfSDave Chinner #include "xfs_inode.h" 15239880efSDave Chinner #include "xfs_trans.h" 16239880efSDave Chinner #include "xfs_trans_priv.h" 176d8b79cfSDave Chinner #include "xfs_inode_item.h" 186d8b79cfSDave Chinner #include "xfs_quota.h" 196d8b79cfSDave Chinner #include "xfs_trace.h" 206d8b79cfSDave Chinner #include "xfs_icache.h" 21c24b5dfaSDave Chinner #include "xfs_bmap_util.h" 22dc06f398SBrian Foster #include "xfs_dquot_item.h" 23dc06f398SBrian Foster #include "xfs_dquot.h" 2483104d44SDarrick J. Wong #include "xfs_reflink.h" 256d8b79cfSDave Chinner 26f0e28280SJeff Layton #include <linux/iversion.h> 276d8b79cfSDave Chinner 2833479e05SDave Chinner /* 2933479e05SDave Chinner * Allocate and initialise an xfs_inode. 3033479e05SDave Chinner */ 31638f4416SDave Chinner struct xfs_inode * 3233479e05SDave Chinner xfs_inode_alloc( 3333479e05SDave Chinner struct xfs_mount *mp, 3433479e05SDave Chinner xfs_ino_t ino) 3533479e05SDave Chinner { 3633479e05SDave Chinner struct xfs_inode *ip; 3733479e05SDave Chinner 3833479e05SDave Chinner /* 3933479e05SDave Chinner * if this didn't occur in transactions, we could use 4033479e05SDave Chinner * KM_MAYFAIL and return NULL here on ENOMEM. Set the 4133479e05SDave Chinner * code up to do this anyway. 4233479e05SDave Chinner */ 43*707e0ddaSTetsuo Handa ip = kmem_zone_alloc(xfs_inode_zone, 0); 4433479e05SDave Chinner if (!ip) 4533479e05SDave Chinner return NULL; 4633479e05SDave Chinner if (inode_init_always(mp->m_super, VFS_I(ip))) { 4733479e05SDave Chinner kmem_zone_free(xfs_inode_zone, ip); 4833479e05SDave Chinner return NULL; 4933479e05SDave Chinner } 5033479e05SDave Chinner 51c19b3b05SDave Chinner /* VFS doesn't initialise i_mode! */ 52c19b3b05SDave Chinner VFS_I(ip)->i_mode = 0; 53c19b3b05SDave Chinner 54ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, vn_active); 5533479e05SDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 5633479e05SDave Chinner ASSERT(!xfs_isiflocked(ip)); 5733479e05SDave Chinner ASSERT(ip->i_ino == 0); 5833479e05SDave Chinner 5933479e05SDave Chinner /* initialise the xfs inode */ 6033479e05SDave Chinner ip->i_ino = ino; 6133479e05SDave Chinner ip->i_mount = mp; 6233479e05SDave Chinner memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 6333479e05SDave Chinner ip->i_afp = NULL; 643993baebSDarrick J. Wong ip->i_cowfp = NULL; 653993baebSDarrick J. Wong ip->i_cnextents = 0; 663993baebSDarrick J. Wong ip->i_cformat = XFS_DINODE_FMT_EXTENTS; 673ba738dfSChristoph Hellwig memset(&ip->i_df, 0, sizeof(ip->i_df)); 6833479e05SDave Chinner ip->i_flags = 0; 6933479e05SDave Chinner ip->i_delayed_blks = 0; 70f8d55aa0SDave Chinner memset(&ip->i_d, 0, sizeof(ip->i_d)); 716772c1f1SDarrick J. Wong ip->i_sick = 0; 726772c1f1SDarrick J. Wong ip->i_checked = 0; 73cb357bf3SDarrick J. Wong INIT_WORK(&ip->i_ioend_work, xfs_end_io); 74cb357bf3SDarrick J. Wong INIT_LIST_HEAD(&ip->i_ioend_list); 75cb357bf3SDarrick J. 
Wong spin_lock_init(&ip->i_ioend_lock); 7633479e05SDave Chinner 7733479e05SDave Chinner return ip; 7833479e05SDave Chinner } 7933479e05SDave Chinner 8033479e05SDave Chinner STATIC void 8133479e05SDave Chinner xfs_inode_free_callback( 8233479e05SDave Chinner struct rcu_head *head) 8333479e05SDave Chinner { 8433479e05SDave Chinner struct inode *inode = container_of(head, struct inode, i_rcu); 8533479e05SDave Chinner struct xfs_inode *ip = XFS_I(inode); 8633479e05SDave Chinner 87c19b3b05SDave Chinner switch (VFS_I(ip)->i_mode & S_IFMT) { 8833479e05SDave Chinner case S_IFREG: 8933479e05SDave Chinner case S_IFDIR: 9033479e05SDave Chinner case S_IFLNK: 9133479e05SDave Chinner xfs_idestroy_fork(ip, XFS_DATA_FORK); 9233479e05SDave Chinner break; 9333479e05SDave Chinner } 9433479e05SDave Chinner 9533479e05SDave Chinner if (ip->i_afp) 9633479e05SDave Chinner xfs_idestroy_fork(ip, XFS_ATTR_FORK); 973993baebSDarrick J. Wong if (ip->i_cowfp) 983993baebSDarrick J. Wong xfs_idestroy_fork(ip, XFS_COW_FORK); 9933479e05SDave Chinner 10033479e05SDave Chinner if (ip->i_itemp) { 10122525c17SDave Chinner ASSERT(!test_bit(XFS_LI_IN_AIL, 10222525c17SDave Chinner &ip->i_itemp->ili_item.li_flags)); 10333479e05SDave Chinner xfs_inode_item_destroy(ip); 10433479e05SDave Chinner ip->i_itemp = NULL; 10533479e05SDave Chinner } 10633479e05SDave Chinner 1071f2dcfe8SDave Chinner kmem_zone_free(xfs_inode_zone, ip); 1081f2dcfe8SDave Chinner } 1091f2dcfe8SDave Chinner 1108a17d7ddSDave Chinner static void 1118a17d7ddSDave Chinner __xfs_inode_free( 1128a17d7ddSDave Chinner struct xfs_inode *ip) 1138a17d7ddSDave Chinner { 1148a17d7ddSDave Chinner /* asserts to verify all state is correct here */ 1158a17d7ddSDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 1168a17d7ddSDave Chinner XFS_STATS_DEC(ip->i_mount, vn_active); 1178a17d7ddSDave Chinner 1188a17d7ddSDave Chinner call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 1198a17d7ddSDave Chinner } 1208a17d7ddSDave Chinner 1211f2dcfe8SDave Chinner void 1221f2dcfe8SDave Chinner xfs_inode_free( 1231f2dcfe8SDave Chinner struct xfs_inode *ip) 1241f2dcfe8SDave Chinner { 12598efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 12698efe8afSBrian Foster 12733479e05SDave Chinner /* 12833479e05SDave Chinner * Because we use RCU freeing we need to ensure the inode always 12933479e05SDave Chinner * appears to be reclaimed with an invalid inode number when in the 13033479e05SDave Chinner * free state. The ip->i_flags_lock provides the barrier against lookup 13133479e05SDave Chinner * races. 13233479e05SDave Chinner */ 13333479e05SDave Chinner spin_lock(&ip->i_flags_lock); 13433479e05SDave Chinner ip->i_flags = XFS_IRECLAIM; 13533479e05SDave Chinner ip->i_ino = 0; 13633479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 13733479e05SDave Chinner 1388a17d7ddSDave Chinner __xfs_inode_free(ip); 13933479e05SDave Chinner } 14033479e05SDave Chinner 14133479e05SDave Chinner /* 142ad438c40SDave Chinner * Queue a new inode reclaim pass if there are reclaimable inodes and there 143ad438c40SDave Chinner * isn't a reclaim pass already in progress. By default it runs every 5s based 144ad438c40SDave Chinner * on the xfs periodic sync default of 30s. Perhaps this should have it's own 145ad438c40SDave Chinner * tunable, but that can be done if this method proves to be ineffective or too 146ad438c40SDave Chinner * aggressive. 
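 *
 * A worked example of the delay computed below, assuming the default
 * xfs_syncd_centisecs value of 3000 (the 30 second periodic sync):
 *
 *	msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)
 *		= msecs_to_jiffies(3000 / 6 * 10)
 *		= msecs_to_jiffies(5000)	-> requeue roughly every 5 seconds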
147ad438c40SDave Chinner */ 148ad438c40SDave Chinner static void 149ad438c40SDave Chinner xfs_reclaim_work_queue( 150ad438c40SDave Chinner struct xfs_mount *mp) 151ad438c40SDave Chinner { 152ad438c40SDave Chinner 153ad438c40SDave Chinner rcu_read_lock(); 154ad438c40SDave Chinner if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 155ad438c40SDave Chinner queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 156ad438c40SDave Chinner msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 157ad438c40SDave Chinner } 158ad438c40SDave Chinner rcu_read_unlock(); 159ad438c40SDave Chinner } 160ad438c40SDave Chinner 161ad438c40SDave Chinner /* 162ad438c40SDave Chinner * This is a fast pass over the inode cache to try to get reclaim moving on as 163ad438c40SDave Chinner * many inodes as possible in a short period of time. It kicks itself every few 164ad438c40SDave Chinner * seconds, as well as being kicked by the inode cache shrinker when memory 165ad438c40SDave Chinner * goes low. It scans as quickly as possible avoiding locked inodes or those 166ad438c40SDave Chinner * already being flushed, and once done schedules a future pass. 167ad438c40SDave Chinner */ 168ad438c40SDave Chinner void 169ad438c40SDave Chinner xfs_reclaim_worker( 170ad438c40SDave Chinner struct work_struct *work) 171ad438c40SDave Chinner { 172ad438c40SDave Chinner struct xfs_mount *mp = container_of(to_delayed_work(work), 173ad438c40SDave Chinner struct xfs_mount, m_reclaim_work); 174ad438c40SDave Chinner 175ad438c40SDave Chinner xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 176ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 177ad438c40SDave Chinner } 178ad438c40SDave Chinner 179ad438c40SDave Chinner static void 180ad438c40SDave Chinner xfs_perag_set_reclaim_tag( 181ad438c40SDave Chinner struct xfs_perag *pag) 182ad438c40SDave Chinner { 183ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 184ad438c40SDave Chinner 18595989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 186ad438c40SDave Chinner if (pag->pag_ici_reclaimable++) 187ad438c40SDave Chinner return; 188ad438c40SDave Chinner 189ad438c40SDave Chinner /* propagate the reclaim tag up into the perag radix tree */ 190ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 191ad438c40SDave Chinner radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, 192ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 193ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 194ad438c40SDave Chinner 195ad438c40SDave Chinner /* schedule periodic background inode reclaim */ 196ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 197ad438c40SDave Chinner 198ad438c40SDave Chinner trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 199ad438c40SDave Chinner } 200ad438c40SDave Chinner 201ad438c40SDave Chinner static void 202ad438c40SDave Chinner xfs_perag_clear_reclaim_tag( 203ad438c40SDave Chinner struct xfs_perag *pag) 204ad438c40SDave Chinner { 205ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 206ad438c40SDave Chinner 20795989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 208ad438c40SDave Chinner if (--pag->pag_ici_reclaimable) 209ad438c40SDave Chinner return; 210ad438c40SDave Chinner 211ad438c40SDave Chinner /* clear the reclaim tag from the perag radix tree */ 212ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 213ad438c40SDave Chinner radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, 214ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 215ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 216ad438c40SDave Chinner 
trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 217ad438c40SDave Chinner } 218ad438c40SDave Chinner 219ad438c40SDave Chinner 220ad438c40SDave Chinner /* 221ad438c40SDave Chinner * We set the inode flag atomically with the radix tree tag. 222ad438c40SDave Chinner * Once we get tag lookups on the radix tree, this inode flag 223ad438c40SDave Chinner * can go away. 224ad438c40SDave Chinner */ 225ad438c40SDave Chinner void 226ad438c40SDave Chinner xfs_inode_set_reclaim_tag( 227ad438c40SDave Chinner struct xfs_inode *ip) 228ad438c40SDave Chinner { 229ad438c40SDave Chinner struct xfs_mount *mp = ip->i_mount; 230ad438c40SDave Chinner struct xfs_perag *pag; 231ad438c40SDave Chinner 232ad438c40SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 233ad438c40SDave Chinner spin_lock(&pag->pag_ici_lock); 234ad438c40SDave Chinner spin_lock(&ip->i_flags_lock); 235ad438c40SDave Chinner 236ad438c40SDave Chinner radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), 237ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 238ad438c40SDave Chinner xfs_perag_set_reclaim_tag(pag); 239ad438c40SDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 240ad438c40SDave Chinner 241ad438c40SDave Chinner spin_unlock(&ip->i_flags_lock); 242ad438c40SDave Chinner spin_unlock(&pag->pag_ici_lock); 243ad438c40SDave Chinner xfs_perag_put(pag); 244ad438c40SDave Chinner } 245ad438c40SDave Chinner 246ad438c40SDave Chinner STATIC void 247ad438c40SDave Chinner xfs_inode_clear_reclaim_tag( 248ad438c40SDave Chinner struct xfs_perag *pag, 249ad438c40SDave Chinner xfs_ino_t ino) 250ad438c40SDave Chinner { 251ad438c40SDave Chinner radix_tree_tag_clear(&pag->pag_ici_root, 252ad438c40SDave Chinner XFS_INO_TO_AGINO(pag->pag_mount, ino), 253ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 254ad438c40SDave Chinner xfs_perag_clear_reclaim_tag(pag); 255ad438c40SDave Chinner } 256ad438c40SDave Chinner 257ae2c4ac2SBrian Foster static void 258ae2c4ac2SBrian Foster xfs_inew_wait( 259ae2c4ac2SBrian Foster struct xfs_inode *ip) 260ae2c4ac2SBrian Foster { 261ae2c4ac2SBrian Foster wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); 262ae2c4ac2SBrian Foster DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); 263ae2c4ac2SBrian Foster 264ae2c4ac2SBrian Foster do { 26521417136SIngo Molnar prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 266ae2c4ac2SBrian Foster if (!xfs_iflags_test(ip, XFS_INEW)) 267ae2c4ac2SBrian Foster break; 268ae2c4ac2SBrian Foster schedule(); 269ae2c4ac2SBrian Foster } while (true); 27021417136SIngo Molnar finish_wait(wq, &wait.wq_entry); 271ae2c4ac2SBrian Foster } 272ae2c4ac2SBrian Foster 273ad438c40SDave Chinner /* 27450997470SDave Chinner * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 27550997470SDave Chinner * part of the structure. This is made more complex by the fact we store 27650997470SDave Chinner * information about the on-disk values in the VFS inode and so we can't just 27783e06f21SDave Chinner * overwrite the values unconditionally. Hence we save the parameters we 27850997470SDave Chinner * need to retain across reinitialisation, and rewrite them into the VFS inode 27983e06f21SDave Chinner * after reinitialisation even if it fails. 
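 *
 * The fields saved and restored around inode_init_always() below are
 * i_nlink, i_generation, the i_version counter, i_mode and i_rdev, all of
 * which mirror on-disk inode state that must survive the VFS reset.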
28050997470SDave Chinner */ 28150997470SDave Chinner static int 28250997470SDave Chinner xfs_reinit_inode( 28350997470SDave Chinner struct xfs_mount *mp, 28450997470SDave Chinner struct inode *inode) 28550997470SDave Chinner { 28650997470SDave Chinner int error; 28754d7b5c1SDave Chinner uint32_t nlink = inode->i_nlink; 2889e9a2674SDave Chinner uint32_t generation = inode->i_generation; 289f0e28280SJeff Layton uint64_t version = inode_peek_iversion(inode); 290c19b3b05SDave Chinner umode_t mode = inode->i_mode; 291acd1d715SAmir Goldstein dev_t dev = inode->i_rdev; 29250997470SDave Chinner 29350997470SDave Chinner error = inode_init_always(mp->m_super, inode); 29450997470SDave Chinner 29554d7b5c1SDave Chinner set_nlink(inode, nlink); 2969e9a2674SDave Chinner inode->i_generation = generation; 297f0e28280SJeff Layton inode_set_iversion_queried(inode, version); 298c19b3b05SDave Chinner inode->i_mode = mode; 299acd1d715SAmir Goldstein inode->i_rdev = dev; 30050997470SDave Chinner return error; 30150997470SDave Chinner } 30250997470SDave Chinner 30350997470SDave Chinner /* 304afca6c5bSDave Chinner * If we are allocating a new inode, then check what was returned is 305afca6c5bSDave Chinner * actually a free, empty inode. If we are not allocating an inode, 306afca6c5bSDave Chinner * then check we didn't find a free inode. 307afca6c5bSDave Chinner * 308afca6c5bSDave Chinner * Returns: 309afca6c5bSDave Chinner * 0 if the inode free state matches the lookup context 310afca6c5bSDave Chinner * -ENOENT if the inode is free and we are not allocating 311afca6c5bSDave Chinner * -EFSCORRUPTED if there is any state mismatch at all 312afca6c5bSDave Chinner */ 313afca6c5bSDave Chinner static int 314afca6c5bSDave Chinner xfs_iget_check_free_state( 315afca6c5bSDave Chinner struct xfs_inode *ip, 316afca6c5bSDave Chinner int flags) 317afca6c5bSDave Chinner { 318afca6c5bSDave Chinner if (flags & XFS_IGET_CREATE) { 319afca6c5bSDave Chinner /* should be a free inode */ 320afca6c5bSDave Chinner if (VFS_I(ip)->i_mode != 0) { 321afca6c5bSDave Chinner xfs_warn(ip->i_mount, 322afca6c5bSDave Chinner "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 323afca6c5bSDave Chinner ip->i_ino, VFS_I(ip)->i_mode); 324afca6c5bSDave Chinner return -EFSCORRUPTED; 325afca6c5bSDave Chinner } 326afca6c5bSDave Chinner 327afca6c5bSDave Chinner if (ip->i_d.di_nblocks != 0) { 328afca6c5bSDave Chinner xfs_warn(ip->i_mount, 329afca6c5bSDave Chinner "Corruption detected! 
Free inode 0x%llx has blocks allocated!", 330afca6c5bSDave Chinner ip->i_ino); 331afca6c5bSDave Chinner return -EFSCORRUPTED; 332afca6c5bSDave Chinner } 333afca6c5bSDave Chinner return 0; 334afca6c5bSDave Chinner } 335afca6c5bSDave Chinner 336afca6c5bSDave Chinner /* should be an allocated inode */ 337afca6c5bSDave Chinner if (VFS_I(ip)->i_mode == 0) 338afca6c5bSDave Chinner return -ENOENT; 339afca6c5bSDave Chinner 340afca6c5bSDave Chinner return 0; 341afca6c5bSDave Chinner } 342afca6c5bSDave Chinner 343afca6c5bSDave Chinner /* 34433479e05SDave Chinner * Check the validity of the inode we just found in the cache 34533479e05SDave Chinner */ 34633479e05SDave Chinner static int 34733479e05SDave Chinner xfs_iget_cache_hit( 34833479e05SDave Chinner struct xfs_perag *pag, 34933479e05SDave Chinner struct xfs_inode *ip, 35033479e05SDave Chinner xfs_ino_t ino, 35133479e05SDave Chinner int flags, 35233479e05SDave Chinner int lock_flags) __releases(RCU) 35333479e05SDave Chinner { 35433479e05SDave Chinner struct inode *inode = VFS_I(ip); 35533479e05SDave Chinner struct xfs_mount *mp = ip->i_mount; 35633479e05SDave Chinner int error; 35733479e05SDave Chinner 35833479e05SDave Chinner /* 35933479e05SDave Chinner * check for re-use of an inode within an RCU grace period due to the 36033479e05SDave Chinner * radix tree nodes not being updated yet. We monitor for this by 36133479e05SDave Chinner * setting the inode number to zero before freeing the inode structure. 36233479e05SDave Chinner * If the inode has been reallocated and set up, then the inode number 36333479e05SDave Chinner * will not match, so check for that, too. 36433479e05SDave Chinner */ 36533479e05SDave Chinner spin_lock(&ip->i_flags_lock); 36633479e05SDave Chinner if (ip->i_ino != ino) { 36733479e05SDave Chinner trace_xfs_iget_skip(ip); 368ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3692451337dSDave Chinner error = -EAGAIN; 37033479e05SDave Chinner goto out_error; 37133479e05SDave Chinner } 37233479e05SDave Chinner 37333479e05SDave Chinner 37433479e05SDave Chinner /* 37533479e05SDave Chinner * If we are racing with another cache hit that is currently 37633479e05SDave Chinner * instantiating this inode or currently recycling it out of 37733479e05SDave Chinner * reclaimable state, wait for the initialisation to complete 37833479e05SDave Chinner * before continuing. 37933479e05SDave Chinner * 38033479e05SDave Chinner * XXX(hch): eventually we should do something equivalent to 38133479e05SDave Chinner * wait_on_inode to wait for these flags to be cleared 38233479e05SDave Chinner * instead of polling for it. 38333479e05SDave Chinner */ 38433479e05SDave Chinner if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 38533479e05SDave Chinner trace_xfs_iget_skip(ip); 386ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3872451337dSDave Chinner error = -EAGAIN; 38833479e05SDave Chinner goto out_error; 38933479e05SDave Chinner } 39033479e05SDave Chinner 39133479e05SDave Chinner /* 392afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 393afca6c5bSDave Chinner * racing with unlinks. 39433479e05SDave Chinner */ 395afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 396afca6c5bSDave Chinner if (error) 39733479e05SDave Chinner goto out_error; 39833479e05SDave Chinner 39933479e05SDave Chinner /* 40033479e05SDave Chinner * If IRECLAIMABLE is set, we've torn down the VFS inode already. 40133479e05SDave Chinner * Need to carefully get it back into usable state.
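 *
 * The recycle path below works as follows: mark the inode XFS_IRECLAIM so
 * xfs_reclaim_inode() keeps its hands off, drop the i_flags lock and RCU
 * read lock, reinitialise the VFS inode with xfs_reinit_inode(), then
 * retake the locks to clear the per-AG reclaim tag and mark the inode
 * XFS_INEW/I_NEW before handing it back to the caller.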
40233479e05SDave Chinner */ 40333479e05SDave Chinner if (ip->i_flags & XFS_IRECLAIMABLE) { 40433479e05SDave Chinner trace_xfs_iget_reclaim(ip); 40533479e05SDave Chinner 406378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 407378f681cSDarrick J. Wong error = -EAGAIN; 408378f681cSDarrick J. Wong goto out_error; 409378f681cSDarrick J. Wong } 410378f681cSDarrick J. Wong 41133479e05SDave Chinner /* 41233479e05SDave Chinner * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode 41333479e05SDave Chinner * from stomping over us while we recycle the inode. We can't 41433479e05SDave Chinner * clear the radix tree reclaimable tag yet as it requires 41533479e05SDave Chinner * pag_ici_lock to be held exclusive. 41633479e05SDave Chinner */ 41733479e05SDave Chinner ip->i_flags |= XFS_IRECLAIM; 41833479e05SDave Chinner 41933479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 42033479e05SDave Chinner rcu_read_unlock(); 42133479e05SDave Chinner 42250997470SDave Chinner error = xfs_reinit_inode(mp, inode); 42333479e05SDave Chinner if (error) { 424756baca2SBrian Foster bool wake; 42533479e05SDave Chinner /* 42633479e05SDave Chinner * Re-initializing the inode failed, and we are in deep 42733479e05SDave Chinner * trouble. Try to re-add it to the reclaim list. 42833479e05SDave Chinner */ 42933479e05SDave Chinner rcu_read_lock(); 43033479e05SDave Chinner spin_lock(&ip->i_flags_lock); 431756baca2SBrian Foster wake = !!__xfs_iflags_test(ip, XFS_INEW); 43233479e05SDave Chinner ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 433756baca2SBrian Foster if (wake) 434756baca2SBrian Foster wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); 43533479e05SDave Chinner ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 43633479e05SDave Chinner trace_xfs_iget_reclaim_fail(ip); 43733479e05SDave Chinner goto out_error; 43833479e05SDave Chinner } 43933479e05SDave Chinner 44033479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 44133479e05SDave Chinner spin_lock(&ip->i_flags_lock); 44233479e05SDave Chinner 44333479e05SDave Chinner /* 44433479e05SDave Chinner * Clear the per-lifetime state in the inode as we are now 44533479e05SDave Chinner * effectively a new inode and need to return to the initial 44633479e05SDave Chinner * state before reuse occurs. 44733479e05SDave Chinner */ 44833479e05SDave Chinner ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 44933479e05SDave Chinner ip->i_flags |= XFS_INEW; 450545c0889SDave Chinner xfs_inode_clear_reclaim_tag(pag, ip->i_ino); 45133479e05SDave Chinner inode->i_state = I_NEW; 4526772c1f1SDarrick J. Wong ip->i_sick = 0; 4536772c1f1SDarrick J. Wong ip->i_checked = 0; 45433479e05SDave Chinner 45565523218SChristoph Hellwig ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 45665523218SChristoph Hellwig init_rwsem(&inode->i_rwsem); 45733479e05SDave Chinner 45833479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 45933479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 46033479e05SDave Chinner } else { 46133479e05SDave Chinner /* If the VFS inode is being torn down, pause and try again. */ 46233479e05SDave Chinner if (!igrab(inode)) { 46333479e05SDave Chinner trace_xfs_iget_skip(ip); 4642451337dSDave Chinner error = -EAGAIN; 46533479e05SDave Chinner goto out_error; 46633479e05SDave Chinner } 46733479e05SDave Chinner 46833479e05SDave Chinner /* We've got a live one. 
*/ 46933479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 47033479e05SDave Chinner rcu_read_unlock(); 47133479e05SDave Chinner trace_xfs_iget_hit(ip); 47233479e05SDave Chinner } 47333479e05SDave Chinner 47433479e05SDave Chinner if (lock_flags != 0) 47533479e05SDave Chinner xfs_ilock(ip, lock_flags); 47633479e05SDave Chinner 477378f681cSDarrick J. Wong if (!(flags & XFS_IGET_INCORE)) 47833479e05SDave Chinner xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); 479ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_found); 48033479e05SDave Chinner 48133479e05SDave Chinner return 0; 48233479e05SDave Chinner 48333479e05SDave Chinner out_error: 48433479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 48533479e05SDave Chinner rcu_read_unlock(); 48633479e05SDave Chinner return error; 48733479e05SDave Chinner } 48833479e05SDave Chinner 48933479e05SDave Chinner 49033479e05SDave Chinner static int 49133479e05SDave Chinner xfs_iget_cache_miss( 49233479e05SDave Chinner struct xfs_mount *mp, 49333479e05SDave Chinner struct xfs_perag *pag, 49433479e05SDave Chinner xfs_trans_t *tp, 49533479e05SDave Chinner xfs_ino_t ino, 49633479e05SDave Chinner struct xfs_inode **ipp, 49733479e05SDave Chinner int flags, 49833479e05SDave Chinner int lock_flags) 49933479e05SDave Chinner { 50033479e05SDave Chinner struct xfs_inode *ip; 50133479e05SDave Chinner int error; 50233479e05SDave Chinner xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 50333479e05SDave Chinner int iflags; 50433479e05SDave Chinner 50533479e05SDave Chinner ip = xfs_inode_alloc(mp, ino); 50633479e05SDave Chinner if (!ip) 5072451337dSDave Chinner return -ENOMEM; 50833479e05SDave Chinner 50933479e05SDave Chinner error = xfs_iread(mp, tp, ip, flags); 51033479e05SDave Chinner if (error) 51133479e05SDave Chinner goto out_destroy; 51233479e05SDave Chinner 5139cfb9b47SDarrick J. Wong if (!xfs_inode_verify_forks(ip)) { 5149cfb9b47SDarrick J. Wong error = -EFSCORRUPTED; 5159cfb9b47SDarrick J. Wong goto out_destroy; 5169cfb9b47SDarrick J. Wong } 5179cfb9b47SDarrick J. Wong 51833479e05SDave Chinner trace_xfs_iget_miss(ip); 51933479e05SDave Chinner 520ee457001SDave Chinner 521ee457001SDave Chinner /* 522afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 523afca6c5bSDave Chinner * racing with unlinks. 524ee457001SDave Chinner */ 525afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 526afca6c5bSDave Chinner if (error) 527ee457001SDave Chinner goto out_destroy; 52833479e05SDave Chinner 52933479e05SDave Chinner /* 53033479e05SDave Chinner * Preload the radix tree so we can insert safely under the 53133479e05SDave Chinner * write spinlock. Note that we cannot sleep inside the preload 53233479e05SDave Chinner * region. Since we can be called from transaction context, don't 53333479e05SDave Chinner * recurse into the file system. 53433479e05SDave Chinner */ 53533479e05SDave Chinner if (radix_tree_preload(GFP_NOFS)) { 5362451337dSDave Chinner error = -EAGAIN; 53733479e05SDave Chinner goto out_destroy; 53833479e05SDave Chinner } 53933479e05SDave Chinner 54033479e05SDave Chinner /* 54133479e05SDave Chinner * Because the inode hasn't been added to the radix-tree yet it can't 54233479e05SDave Chinner * be found by another thread, so we can do the non-sleeping lock here. 
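 *
 * A failure of the non-blocking lock attempt below would mean some other
 * context already holds a lock on an inode that has not yet been
 * published in the radix tree, which is why it is treated as a BUG().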
54333479e05SDave Chinner */ 54433479e05SDave Chinner if (lock_flags) { 54533479e05SDave Chinner if (!xfs_ilock_nowait(ip, lock_flags)) 54633479e05SDave Chinner BUG(); 54733479e05SDave Chinner } 54833479e05SDave Chinner 54933479e05SDave Chinner /* 55033479e05SDave Chinner * These values must be set before inserting the inode into the radix 55133479e05SDave Chinner * tree as the moment it is inserted a concurrent lookup (allowed by the 55233479e05SDave Chinner * RCU locking mechanism) can find it and that lookup must see that this 55333479e05SDave Chinner * is an inode currently under construction (i.e. that XFS_INEW is set). 55433479e05SDave Chinner * The ip->i_flags_lock that protects the XFS_INEW flag forms the 55533479e05SDave Chinner * memory barrier that ensures this detection works correctly at lookup 55633479e05SDave Chinner * time. 55733479e05SDave Chinner */ 55833479e05SDave Chinner iflags = XFS_INEW; 55933479e05SDave Chinner if (flags & XFS_IGET_DONTCACHE) 56033479e05SDave Chinner iflags |= XFS_IDONTCACHE; 561113a5683SChandra Seetharaman ip->i_udquot = NULL; 562113a5683SChandra Seetharaman ip->i_gdquot = NULL; 56392f8ff73SChandra Seetharaman ip->i_pdquot = NULL; 56433479e05SDave Chinner xfs_iflags_set(ip, iflags); 56533479e05SDave Chinner 56633479e05SDave Chinner /* insert the new inode */ 56733479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 56833479e05SDave Chinner error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 56933479e05SDave Chinner if (unlikely(error)) { 57033479e05SDave Chinner WARN_ON(error != -EEXIST); 571ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_dup); 5722451337dSDave Chinner error = -EAGAIN; 57333479e05SDave Chinner goto out_preload_end; 57433479e05SDave Chinner } 57533479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 57633479e05SDave Chinner radix_tree_preload_end(); 57733479e05SDave Chinner 57833479e05SDave Chinner *ipp = ip; 57933479e05SDave Chinner return 0; 58033479e05SDave Chinner 58133479e05SDave Chinner out_preload_end: 58233479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 58333479e05SDave Chinner radix_tree_preload_end(); 58433479e05SDave Chinner if (lock_flags) 58533479e05SDave Chinner xfs_iunlock(ip, lock_flags); 58633479e05SDave Chinner out_destroy: 58733479e05SDave Chinner __destroy_inode(VFS_I(ip)); 58833479e05SDave Chinner xfs_inode_free(ip); 58933479e05SDave Chinner return error; 59033479e05SDave Chinner } 59133479e05SDave Chinner 59233479e05SDave Chinner /* 59333479e05SDave Chinner * Look up an inode by number in the given file system. 59433479e05SDave Chinner * The inode is looked up in the cache held in each AG. 59533479e05SDave Chinner * If the inode is found in the cache, initialise the vfs inode 59633479e05SDave Chinner * if necessary. 59733479e05SDave Chinner * 59833479e05SDave Chinner * If it is not in core, read it in from the file system's device, 59933479e05SDave Chinner * add it to the cache and initialise the vfs inode. 60033479e05SDave Chinner * 60133479e05SDave Chinner * The inode is locked according to the value of the lock_flags parameter. 60233479e05SDave Chinner * This flag parameter indicates how and if the inode's IO lock and inode lock 60333479e05SDave Chinner * should be taken. 60433479e05SDave Chinner * 60533479e05SDave Chinner * mp -- the mount point structure for the current file system. It points 60633479e05SDave Chinner * to the inode hash table. 60733479e05SDave Chinner * tp -- a pointer to the current transaction if there is one. 
This is 60833479e05SDave Chinner * simply passed through to the xfs_iread() call. 60933479e05SDave Chinner * ino -- the number of the inode desired. This is the unique identifier 61033479e05SDave Chinner * within the file system for the inode being requested. 61133479e05SDave Chinner * lock_flags -- flags indicating how to lock the inode. See the comment 61233479e05SDave Chinner * for xfs_ilock() for a list of valid values. 61333479e05SDave Chinner */ 61433479e05SDave Chinner int 61533479e05SDave Chinner xfs_iget( 61633479e05SDave Chinner xfs_mount_t *mp, 61733479e05SDave Chinner xfs_trans_t *tp, 61833479e05SDave Chinner xfs_ino_t ino, 61933479e05SDave Chinner uint flags, 62033479e05SDave Chinner uint lock_flags, 62133479e05SDave Chinner xfs_inode_t **ipp) 62233479e05SDave Chinner { 62333479e05SDave Chinner xfs_inode_t *ip; 62433479e05SDave Chinner int error; 62533479e05SDave Chinner xfs_perag_t *pag; 62633479e05SDave Chinner xfs_agino_t agino; 62733479e05SDave Chinner 62833479e05SDave Chinner /* 62933479e05SDave Chinner * xfs_reclaim_inode() uses the ILOCK to ensure an inode 63033479e05SDave Chinner * doesn't get freed while it's being referenced during a 63133479e05SDave Chinner * radix tree traversal here. It assumes this function 63233479e05SDave Chinner * aqcuires only the ILOCK (and therefore it has no need to 63333479e05SDave Chinner * involve the IOLOCK in this synchronization). 63433479e05SDave Chinner */ 63533479e05SDave Chinner ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 63633479e05SDave Chinner 63733479e05SDave Chinner /* reject inode numbers outside existing AGs */ 63833479e05SDave Chinner if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 6392451337dSDave Chinner return -EINVAL; 64033479e05SDave Chinner 641ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_attempts); 6428774cf8bSLucas Stach 64333479e05SDave Chinner /* get the perag structure and ensure that it's inode capable */ 64433479e05SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 64533479e05SDave Chinner agino = XFS_INO_TO_AGINO(mp, ino); 64633479e05SDave Chinner 64733479e05SDave Chinner again: 64833479e05SDave Chinner error = 0; 64933479e05SDave Chinner rcu_read_lock(); 65033479e05SDave Chinner ip = radix_tree_lookup(&pag->pag_ici_root, agino); 65133479e05SDave Chinner 65233479e05SDave Chinner if (ip) { 65333479e05SDave Chinner error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 65433479e05SDave Chinner if (error) 65533479e05SDave Chinner goto out_error_or_again; 65633479e05SDave Chinner } else { 65733479e05SDave Chinner rcu_read_unlock(); 658378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 659ed438b47SDarrick J. Wong error = -ENODATA; 660378f681cSDarrick J. Wong goto out_error_or_again; 661378f681cSDarrick J. Wong } 662ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_missed); 66333479e05SDave Chinner 66433479e05SDave Chinner error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 66533479e05SDave Chinner flags, lock_flags); 66633479e05SDave Chinner if (error) 66733479e05SDave Chinner goto out_error_or_again; 66833479e05SDave Chinner } 66933479e05SDave Chinner xfs_perag_put(pag); 67033479e05SDave Chinner 67133479e05SDave Chinner *ipp = ip; 67233479e05SDave Chinner 67333479e05SDave Chinner /* 67458c90473SDave Chinner * If we have a real type for an on-disk inode, we can setup the inode 67533479e05SDave Chinner * now. If it's a new inode being created, xfs_ialloc will handle it. 
67633479e05SDave Chinner */ 677c19b3b05SDave Chinner if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 67858c90473SDave Chinner xfs_setup_existing_inode(ip); 67933479e05SDave Chinner return 0; 68033479e05SDave Chinner 68133479e05SDave Chinner out_error_or_again: 682378f681cSDarrick J. Wong if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { 68333479e05SDave Chinner delay(1); 68433479e05SDave Chinner goto again; 68533479e05SDave Chinner } 68633479e05SDave Chinner xfs_perag_put(pag); 68733479e05SDave Chinner return error; 68833479e05SDave Chinner } 68933479e05SDave Chinner 6906d8b79cfSDave Chinner /* 691378f681cSDarrick J. Wong * "Is this a cached inode that's also allocated?" 692378f681cSDarrick J. Wong * 693378f681cSDarrick J. Wong * Look up an inode by number in the given file system. If the inode is 694378f681cSDarrick J. Wong * in cache and isn't in purgatory, return 1 if the inode is allocated 695378f681cSDarrick J. Wong * and 0 if it is not. For all other cases (not in cache, being torn 696378f681cSDarrick J. Wong * down, etc.), return a negative error code. 697378f681cSDarrick J. Wong * 698378f681cSDarrick J. Wong * The caller has to prevent inode allocation and freeing activity, 699378f681cSDarrick J. Wong * presumably by locking the AGI buffer. This is to ensure that an 700378f681cSDarrick J. Wong * inode cannot transition from allocated to freed until the caller is 701378f681cSDarrick J. Wong * ready to allow that. If the inode is in an intermediate state (new, 702378f681cSDarrick J. Wong * reclaimable, or being reclaimed), -EAGAIN will be returned; if the 703378f681cSDarrick J. Wong * inode is not in the cache, -ENOENT will be returned. The caller must 704378f681cSDarrick J. Wong * deal with these scenarios appropriately. 705378f681cSDarrick J. Wong * 706378f681cSDarrick J. Wong * This is a specialized use case for the online scrubber; if you're 707378f681cSDarrick J. Wong * reading this, you probably want xfs_iget. 708378f681cSDarrick J. Wong */ 709378f681cSDarrick J. Wong int 710378f681cSDarrick J. Wong xfs_icache_inode_is_allocated( 711378f681cSDarrick J. Wong struct xfs_mount *mp, 712378f681cSDarrick J. Wong struct xfs_trans *tp, 713378f681cSDarrick J. Wong xfs_ino_t ino, 714378f681cSDarrick J. Wong bool *inuse) 715378f681cSDarrick J. Wong { 716378f681cSDarrick J. Wong struct xfs_inode *ip; 717378f681cSDarrick J. Wong int error; 718378f681cSDarrick J. Wong 719378f681cSDarrick J. Wong error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); 720378f681cSDarrick J. Wong if (error) 721378f681cSDarrick J. Wong return error; 722378f681cSDarrick J. Wong 723378f681cSDarrick J. Wong *inuse = !!(VFS_I(ip)->i_mode); 72444a8736bSDarrick J. Wong xfs_irele(ip); 725378f681cSDarrick J. Wong return 0; 726378f681cSDarrick J. Wong } 727378f681cSDarrick J. Wong 728378f681cSDarrick J. Wong /* 7296d8b79cfSDave Chinner * The inode lookup is done in batches to keep the amount of lock traffic and 7306d8b79cfSDave Chinner * radix tree lookups to a minimum. The batch size is a trade off between 7316d8b79cfSDave Chinner * lookup reduction and stack usage. This is in the reclaim path, so we can't 7326d8b79cfSDave Chinner * be too greedy. 
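 *
 * For scale: with XFS_LOOKUP_BATCH at 32, the on-stack batch array below
 * costs 32 * sizeof(struct xfs_inode *), i.e. 256 bytes on a typical
 * 64-bit build - the "stack usage" side of the trade off.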
7336d8b79cfSDave Chinner */ 7346d8b79cfSDave Chinner #define XFS_LOOKUP_BATCH 32 7356d8b79cfSDave Chinner 7366d8b79cfSDave Chinner STATIC int 7376d8b79cfSDave Chinner xfs_inode_ag_walk_grab( 738ae2c4ac2SBrian Foster struct xfs_inode *ip, 739ae2c4ac2SBrian Foster int flags) 7406d8b79cfSDave Chinner { 7416d8b79cfSDave Chinner struct inode *inode = VFS_I(ip); 742ae2c4ac2SBrian Foster bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); 7436d8b79cfSDave Chinner 7446d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 7456d8b79cfSDave Chinner 7466d8b79cfSDave Chinner /* 7476d8b79cfSDave Chinner * check for stale RCU freed inode 7486d8b79cfSDave Chinner * 7496d8b79cfSDave Chinner * If the inode has been reallocated, it doesn't matter if it's not in 7506d8b79cfSDave Chinner * the AG we are walking - we are walking for writeback, so if it 7516d8b79cfSDave Chinner * passes all the "valid inode" checks and is dirty, then we'll write 7526d8b79cfSDave Chinner * it back anyway. If it has been reallocated and still being 7536d8b79cfSDave Chinner * initialised, the XFS_INEW check below will catch it. 7546d8b79cfSDave Chinner */ 7556d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 7566d8b79cfSDave Chinner if (!ip->i_ino) 7576d8b79cfSDave Chinner goto out_unlock_noent; 7586d8b79cfSDave Chinner 7596d8b79cfSDave Chinner /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ 760ae2c4ac2SBrian Foster if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || 761ae2c4ac2SBrian Foster __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) 7626d8b79cfSDave Chinner goto out_unlock_noent; 7636d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 7646d8b79cfSDave Chinner 7656d8b79cfSDave Chinner /* nothing to sync during shutdown */ 7666d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 7672451337dSDave Chinner return -EFSCORRUPTED; 7686d8b79cfSDave Chinner 7696d8b79cfSDave Chinner /* If we can't grab the inode, it must be on its way to reclaim.
*/ 7706d8b79cfSDave Chinner if (!igrab(inode)) 7712451337dSDave Chinner return -ENOENT; 7726d8b79cfSDave Chinner 7736d8b79cfSDave Chinner /* inode is valid */ 7746d8b79cfSDave Chinner return 0; 7756d8b79cfSDave Chinner 7766d8b79cfSDave Chinner out_unlock_noent: 7776d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 7782451337dSDave Chinner return -ENOENT; 7796d8b79cfSDave Chinner } 7806d8b79cfSDave Chinner 7816d8b79cfSDave Chinner STATIC int 7826d8b79cfSDave Chinner xfs_inode_ag_walk( 7836d8b79cfSDave Chinner struct xfs_mount *mp, 7846d8b79cfSDave Chinner struct xfs_perag *pag, 785e0094008SEric Sandeen int (*execute)(struct xfs_inode *ip, int flags, 786a454f742SBrian Foster void *args), 787a454f742SBrian Foster int flags, 788a454f742SBrian Foster void *args, 789ae2c4ac2SBrian Foster int tag, 790ae2c4ac2SBrian Foster int iter_flags) 7916d8b79cfSDave Chinner { 7926d8b79cfSDave Chinner uint32_t first_index; 7936d8b79cfSDave Chinner int last_error = 0; 7946d8b79cfSDave Chinner int skipped; 7956d8b79cfSDave Chinner int done; 7966d8b79cfSDave Chinner int nr_found; 7976d8b79cfSDave Chinner 7986d8b79cfSDave Chinner restart: 7996d8b79cfSDave Chinner done = 0; 8006d8b79cfSDave Chinner skipped = 0; 8016d8b79cfSDave Chinner first_index = 0; 8026d8b79cfSDave Chinner nr_found = 0; 8036d8b79cfSDave Chinner do { 8046d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 8056d8b79cfSDave Chinner int error = 0; 8066d8b79cfSDave Chinner int i; 8076d8b79cfSDave Chinner 8086d8b79cfSDave Chinner rcu_read_lock(); 809a454f742SBrian Foster 810a454f742SBrian Foster if (tag == -1) 8116d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 8126d8b79cfSDave Chinner (void **)batch, first_index, 8136d8b79cfSDave Chinner XFS_LOOKUP_BATCH); 814a454f742SBrian Foster else 815a454f742SBrian Foster nr_found = radix_tree_gang_lookup_tag( 816a454f742SBrian Foster &pag->pag_ici_root, 817a454f742SBrian Foster (void **) batch, first_index, 818a454f742SBrian Foster XFS_LOOKUP_BATCH, tag); 819a454f742SBrian Foster 8206d8b79cfSDave Chinner if (!nr_found) { 8216d8b79cfSDave Chinner rcu_read_unlock(); 8226d8b79cfSDave Chinner break; 8236d8b79cfSDave Chinner } 8246d8b79cfSDave Chinner 8256d8b79cfSDave Chinner /* 8266d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 8276d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 8286d8b79cfSDave Chinner */ 8296d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8306d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 8316d8b79cfSDave Chinner 832ae2c4ac2SBrian Foster if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) 8336d8b79cfSDave Chinner batch[i] = NULL; 8346d8b79cfSDave Chinner 8356d8b79cfSDave Chinner /* 8366d8b79cfSDave Chinner * Update the index for the next lookup. Catch 8376d8b79cfSDave Chinner * overflows into the next AG range which can occur if 8386d8b79cfSDave Chinner * we have inodes in the last block of the AG and we 8396d8b79cfSDave Chinner * are currently pointing to the last inode. 8406d8b79cfSDave Chinner * 8416d8b79cfSDave Chinner * Because we may see inodes that are from the wrong AG 8426d8b79cfSDave Chinner * due to RCU freeing and reallocation, only update the 8436d8b79cfSDave Chinner * index if it lies in this AG. It was a race that lead 8446d8b79cfSDave Chinner * us to see this inode, so another lookup from the 8456d8b79cfSDave Chinner * same index will not find it again. 
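 *
 * Concretely: first_index is advanced to the AG-relative number of
 * ip->i_ino + 1; if that wraps below the current inode's AG-relative
 * number we have walked off the end of this AG, so done is set and the
 * walk of this AG finishes after the current batch.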
8466d8b79cfSDave Chinner */ 8476d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 8486d8b79cfSDave Chinner continue; 8496d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 8506d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 8516d8b79cfSDave Chinner done = 1; 8526d8b79cfSDave Chinner } 8536d8b79cfSDave Chinner 8546d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 8556d8b79cfSDave Chinner rcu_read_unlock(); 8566d8b79cfSDave Chinner 8576d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8586d8b79cfSDave Chinner if (!batch[i]) 8596d8b79cfSDave Chinner continue; 860ae2c4ac2SBrian Foster if ((iter_flags & XFS_AGITER_INEW_WAIT) && 861ae2c4ac2SBrian Foster xfs_iflags_test(batch[i], XFS_INEW)) 862ae2c4ac2SBrian Foster xfs_inew_wait(batch[i]); 863e0094008SEric Sandeen error = execute(batch[i], flags, args); 86444a8736bSDarrick J. Wong xfs_irele(batch[i]); 8652451337dSDave Chinner if (error == -EAGAIN) { 8666d8b79cfSDave Chinner skipped++; 8676d8b79cfSDave Chinner continue; 8686d8b79cfSDave Chinner } 8692451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 8706d8b79cfSDave Chinner last_error = error; 8716d8b79cfSDave Chinner } 8726d8b79cfSDave Chinner 8736d8b79cfSDave Chinner /* bail out if the filesystem is corrupted. */ 8742451337dSDave Chinner if (error == -EFSCORRUPTED) 8756d8b79cfSDave Chinner break; 8766d8b79cfSDave Chinner 8776d8b79cfSDave Chinner cond_resched(); 8786d8b79cfSDave Chinner 8796d8b79cfSDave Chinner } while (nr_found && !done); 8806d8b79cfSDave Chinner 8816d8b79cfSDave Chinner if (skipped) { 8826d8b79cfSDave Chinner delay(1); 8836d8b79cfSDave Chinner goto restart; 8846d8b79cfSDave Chinner } 8856d8b79cfSDave Chinner return last_error; 8866d8b79cfSDave Chinner } 8876d8b79cfSDave Chinner 888579b62faSBrian Foster /* 889579b62faSBrian Foster * Background scanning to trim post-EOF preallocated space. This is queued 890b9fe5052SDwight Engen * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 891579b62faSBrian Foster */ 892fa5a4f57SBrian Foster void 893579b62faSBrian Foster xfs_queue_eofblocks( 894579b62faSBrian Foster struct xfs_mount *mp) 895579b62faSBrian Foster { 896579b62faSBrian Foster rcu_read_lock(); 897579b62faSBrian Foster if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) 898579b62faSBrian Foster queue_delayed_work(mp->m_eofblocks_workqueue, 899579b62faSBrian Foster &mp->m_eofblocks_work, 900579b62faSBrian Foster msecs_to_jiffies(xfs_eofb_secs * 1000)); 901579b62faSBrian Foster rcu_read_unlock(); 902579b62faSBrian Foster } 903579b62faSBrian Foster 904579b62faSBrian Foster void 905579b62faSBrian Foster xfs_eofblocks_worker( 906579b62faSBrian Foster struct work_struct *work) 907579b62faSBrian Foster { 908579b62faSBrian Foster struct xfs_mount *mp = container_of(to_delayed_work(work), 909579b62faSBrian Foster struct xfs_mount, m_eofblocks_work); 910579b62faSBrian Foster xfs_icache_free_eofblocks(mp, NULL); 911579b62faSBrian Foster xfs_queue_eofblocks(mp); 912579b62faSBrian Foster } 913579b62faSBrian Foster 91483104d44SDarrick J. Wong /* 91583104d44SDarrick J. Wong * Background scanning to trim preallocated CoW space. This is queued 91683104d44SDarrick J. Wong * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). 91783104d44SDarrick J. Wong * (We'll just piggyback on the post-EOF prealloc space workqueue.) 91883104d44SDarrick J. Wong */ 91910ddf64eSDarrick J. Wong void 92083104d44SDarrick J. 
Wong xfs_queue_cowblocks( 92183104d44SDarrick J. Wong struct xfs_mount *mp) 92283104d44SDarrick J. Wong { 92383104d44SDarrick J. Wong rcu_read_lock(); 92483104d44SDarrick J. Wong if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) 92583104d44SDarrick J. Wong queue_delayed_work(mp->m_eofblocks_workqueue, 92683104d44SDarrick J. Wong &mp->m_cowblocks_work, 92783104d44SDarrick J. Wong msecs_to_jiffies(xfs_cowb_secs * 1000)); 92883104d44SDarrick J. Wong rcu_read_unlock(); 92983104d44SDarrick J. Wong } 93083104d44SDarrick J. Wong 93183104d44SDarrick J. Wong void 93283104d44SDarrick J. Wong xfs_cowblocks_worker( 93383104d44SDarrick J. Wong struct work_struct *work) 93483104d44SDarrick J. Wong { 93583104d44SDarrick J. Wong struct xfs_mount *mp = container_of(to_delayed_work(work), 93683104d44SDarrick J. Wong struct xfs_mount, m_cowblocks_work); 93783104d44SDarrick J. Wong xfs_icache_free_cowblocks(mp, NULL); 93883104d44SDarrick J. Wong xfs_queue_cowblocks(mp); 93983104d44SDarrick J. Wong } 94083104d44SDarrick J. Wong 9416d8b79cfSDave Chinner int 942ae2c4ac2SBrian Foster xfs_inode_ag_iterator_flags( 9436d8b79cfSDave Chinner struct xfs_mount *mp, 944e0094008SEric Sandeen int (*execute)(struct xfs_inode *ip, int flags, 945a454f742SBrian Foster void *args), 946a454f742SBrian Foster int flags, 947ae2c4ac2SBrian Foster void *args, 948ae2c4ac2SBrian Foster int iter_flags) 9496d8b79cfSDave Chinner { 9506d8b79cfSDave Chinner struct xfs_perag *pag; 9516d8b79cfSDave Chinner int error = 0; 9526d8b79cfSDave Chinner int last_error = 0; 9536d8b79cfSDave Chinner xfs_agnumber_t ag; 9546d8b79cfSDave Chinner 9556d8b79cfSDave Chinner ag = 0; 9566d8b79cfSDave Chinner while ((pag = xfs_perag_get(mp, ag))) { 9576d8b79cfSDave Chinner ag = pag->pag_agno + 1; 958ae2c4ac2SBrian Foster error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, 959ae2c4ac2SBrian Foster iter_flags); 960a454f742SBrian Foster xfs_perag_put(pag); 961a454f742SBrian Foster if (error) { 962a454f742SBrian Foster last_error = error; 9632451337dSDave Chinner if (error == -EFSCORRUPTED) 964a454f742SBrian Foster break; 965a454f742SBrian Foster } 966a454f742SBrian Foster } 967b474c7aeSEric Sandeen return last_error; 968a454f742SBrian Foster } 969a454f742SBrian Foster 970a454f742SBrian Foster int 971ae2c4ac2SBrian Foster xfs_inode_ag_iterator( 972ae2c4ac2SBrian Foster struct xfs_mount *mp, 973ae2c4ac2SBrian Foster int (*execute)(struct xfs_inode *ip, int flags, 974ae2c4ac2SBrian Foster void *args), 975ae2c4ac2SBrian Foster int flags, 976ae2c4ac2SBrian Foster void *args) 977ae2c4ac2SBrian Foster { 978ae2c4ac2SBrian Foster return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); 979ae2c4ac2SBrian Foster } 980ae2c4ac2SBrian Foster 981ae2c4ac2SBrian Foster int 982a454f742SBrian Foster xfs_inode_ag_iterator_tag( 983a454f742SBrian Foster struct xfs_mount *mp, 984e0094008SEric Sandeen int (*execute)(struct xfs_inode *ip, int flags, 985a454f742SBrian Foster void *args), 986a454f742SBrian Foster int flags, 987a454f742SBrian Foster void *args, 988a454f742SBrian Foster int tag) 989a454f742SBrian Foster { 990a454f742SBrian Foster struct xfs_perag *pag; 991a454f742SBrian Foster int error = 0; 992a454f742SBrian Foster int last_error = 0; 993a454f742SBrian Foster xfs_agnumber_t ag; 994a454f742SBrian Foster 995a454f742SBrian Foster ag = 0; 996a454f742SBrian Foster while ((pag = xfs_perag_get_tag(mp, ag, tag))) { 997a454f742SBrian Foster ag = pag->pag_agno + 1; 998ae2c4ac2SBrian Foster error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, 
999ae2c4ac2SBrian Foster 0); 10006d8b79cfSDave Chinner xfs_perag_put(pag); 10016d8b79cfSDave Chinner if (error) { 10026d8b79cfSDave Chinner last_error = error; 10032451337dSDave Chinner if (error == -EFSCORRUPTED) 10046d8b79cfSDave Chinner break; 10056d8b79cfSDave Chinner } 10066d8b79cfSDave Chinner } 1007b474c7aeSEric Sandeen return last_error; 10086d8b79cfSDave Chinner } 10096d8b79cfSDave Chinner 10106d8b79cfSDave Chinner /* 10116d8b79cfSDave Chinner * Grab the inode for reclaim exclusively. 10126d8b79cfSDave Chinner * Return 0 if we grabbed it, non-zero otherwise. 10136d8b79cfSDave Chinner */ 10146d8b79cfSDave Chinner STATIC int 10156d8b79cfSDave Chinner xfs_reclaim_inode_grab( 10166d8b79cfSDave Chinner struct xfs_inode *ip, 10176d8b79cfSDave Chinner int flags) 10186d8b79cfSDave Chinner { 10196d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 10206d8b79cfSDave Chinner 10216d8b79cfSDave Chinner /* quick check for stale RCU freed inode */ 10226d8b79cfSDave Chinner if (!ip->i_ino) 10236d8b79cfSDave Chinner return 1; 10246d8b79cfSDave Chinner 10256d8b79cfSDave Chinner /* 10266d8b79cfSDave Chinner * If we are asked for non-blocking operation, do unlocked checks to 10276d8b79cfSDave Chinner * see if the inode already is being flushed or in reclaim to avoid 10286d8b79cfSDave Chinner * lock traffic. 10296d8b79cfSDave Chinner */ 10306d8b79cfSDave Chinner if ((flags & SYNC_TRYLOCK) && 10316d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) 10326d8b79cfSDave Chinner return 1; 10336d8b79cfSDave Chinner 10346d8b79cfSDave Chinner /* 10356d8b79cfSDave Chinner * The radix tree lock here protects a thread in xfs_iget from racing 10366d8b79cfSDave Chinner * with us starting reclaim on the inode. Once we have the 10376d8b79cfSDave Chinner * XFS_IRECLAIM flag set it will not touch us. 10386d8b79cfSDave Chinner * 10396d8b79cfSDave Chinner * Due to RCU lookup, we may find inodes that have been freed and only 10406d8b79cfSDave Chinner * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 10416d8b79cfSDave Chinner * aren't candidates for reclaim at all, so we must check the 10426d8b79cfSDave Chinner * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 10436d8b79cfSDave Chinner */ 10446d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 10456d8b79cfSDave Chinner if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 10466d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IRECLAIM)) { 10476d8b79cfSDave Chinner /* not a reclaim candidate. */ 10486d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10496d8b79cfSDave Chinner return 1; 10506d8b79cfSDave Chinner } 10516d8b79cfSDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIM); 10526d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10536d8b79cfSDave Chinner return 0; 10546d8b79cfSDave Chinner } 10556d8b79cfSDave Chinner 10566d8b79cfSDave Chinner /* 10576d8b79cfSDave Chinner * Inodes in different states need to be treated differently. 
The following 10586d8b79cfSDave Chinner * table lists the inode states and the reclaim actions necessary: 10596d8b79cfSDave Chinner * 10606d8b79cfSDave Chinner * inode state iflush ret required action 10616d8b79cfSDave Chinner * --------------- ---------- --------------- 10626d8b79cfSDave Chinner * bad - reclaim 10636d8b79cfSDave Chinner * shutdown EIO unpin and reclaim 10646d8b79cfSDave Chinner * clean, unpinned 0 reclaim 10656d8b79cfSDave Chinner * stale, unpinned 0 reclaim 10666d8b79cfSDave Chinner * clean, pinned(*) 0 requeue 10676d8b79cfSDave Chinner * stale, pinned EAGAIN requeue 10686d8b79cfSDave Chinner * dirty, async - requeue 10696d8b79cfSDave Chinner * dirty, sync 0 reclaim 10706d8b79cfSDave Chinner * 10716d8b79cfSDave Chinner * (*) dgc: I don't think the clean, pinned state is possible but it gets 10726d8b79cfSDave Chinner * handled anyway given the order of checks implemented. 10736d8b79cfSDave Chinner * 10746d8b79cfSDave Chinner * Also, because we get the flush lock first, we know that any inode that has 10756d8b79cfSDave Chinner * been flushed delwri has had the flush completed by the time we check that 10766d8b79cfSDave Chinner * the inode is clean. 10776d8b79cfSDave Chinner * 10786d8b79cfSDave Chinner * Note that because the inode is flushed delayed write by AIL pushing, the 10796d8b79cfSDave Chinner * flush lock may already be held here and waiting on it can result in very 10806d8b79cfSDave Chinner * long latencies. Hence for sync reclaims, where we wait on the flush lock, 10816d8b79cfSDave Chinner * the caller should push the AIL first before trying to reclaim inodes to 10826d8b79cfSDave Chinner * minimise the amount of time spent waiting. For background reclaim, we only 10836d8b79cfSDave Chinner * bother to reclaim clean inodes anyway.
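 *
 * (In this file the periodic reclaim worker passes SYNC_TRYLOCK without
 * SYNC_WAIT, so background reclaim takes the non-blocking "requeue" paths
 * below; only SYNC_WAIT callers end up waiting on the flush lock.)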
10846d8b79cfSDave Chinner * 10856d8b79cfSDave Chinner * Hence the order of actions after gaining the locks should be: 10866d8b79cfSDave Chinner * bad => reclaim 10876d8b79cfSDave Chinner * shutdown => unpin and reclaim 10886d8b79cfSDave Chinner * pinned, async => requeue 10896d8b79cfSDave Chinner * pinned, sync => unpin 10906d8b79cfSDave Chinner * stale => reclaim 10916d8b79cfSDave Chinner * clean => reclaim 10926d8b79cfSDave Chinner * dirty, async => requeue 10936d8b79cfSDave Chinner * dirty, sync => flush, wait and reclaim 10946d8b79cfSDave Chinner */ 10956d8b79cfSDave Chinner STATIC int 10966d8b79cfSDave Chinner xfs_reclaim_inode( 10976d8b79cfSDave Chinner struct xfs_inode *ip, 10986d8b79cfSDave Chinner struct xfs_perag *pag, 10996d8b79cfSDave Chinner int sync_mode) 11006d8b79cfSDave Chinner { 11016d8b79cfSDave Chinner struct xfs_buf *bp = NULL; 11028a17d7ddSDave Chinner xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 11036d8b79cfSDave Chinner int error; 11046d8b79cfSDave Chinner 11056d8b79cfSDave Chinner restart: 11066d8b79cfSDave Chinner error = 0; 11076d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 11086d8b79cfSDave Chinner if (!xfs_iflock_nowait(ip)) { 11096d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11106d8b79cfSDave Chinner goto out; 11116d8b79cfSDave Chinner xfs_iflock(ip); 11126d8b79cfSDave Chinner } 11136d8b79cfSDave Chinner 11146d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 11156d8b79cfSDave Chinner xfs_iunpin_wait(ip); 111698efe8afSBrian Foster /* xfs_iflush_abort() drops the flush lock */ 11176d8b79cfSDave Chinner xfs_iflush_abort(ip, false); 11186d8b79cfSDave Chinner goto reclaim; 11196d8b79cfSDave Chinner } 11206d8b79cfSDave Chinner if (xfs_ipincount(ip)) { 11216d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11226d8b79cfSDave Chinner goto out_ifunlock; 11236d8b79cfSDave Chinner xfs_iunpin_wait(ip); 11246d8b79cfSDave Chinner } 112598efe8afSBrian Foster if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { 112698efe8afSBrian Foster xfs_ifunlock(ip); 11276d8b79cfSDave Chinner goto reclaim; 112898efe8afSBrian Foster } 11296d8b79cfSDave Chinner 11306d8b79cfSDave Chinner /* 11316d8b79cfSDave Chinner * Never flush out dirty data during non-blocking reclaim, as it would 11326d8b79cfSDave Chinner * just contend with AIL pushing trying to do the same job. 11336d8b79cfSDave Chinner */ 11346d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11356d8b79cfSDave Chinner goto out_ifunlock; 11366d8b79cfSDave Chinner 11376d8b79cfSDave Chinner /* 11386d8b79cfSDave Chinner * Now we have an inode that needs flushing. 11396d8b79cfSDave Chinner * 11406d8b79cfSDave Chinner * Note that xfs_iflush will never block on the inode buffer lock, as 11416d8b79cfSDave Chinner * xfs_ifree_cluster() can lock the inode buffer before it locks the 11426d8b79cfSDave Chinner * ip->i_lock, and we are doing the exact opposite here. As a result, 11436d8b79cfSDave Chinner * doing a blocking xfs_imap_to_bp() to get the cluster buffer would 11446d8b79cfSDave Chinner * result in an ABBA deadlock with xfs_ifree_cluster(). 11456d8b79cfSDave Chinner * 11466d8b79cfSDave Chinner * As xfs_ifree_cluser() must gather all inodes that are active in the 11476d8b79cfSDave Chinner * cache to mark them stale, if we hit this case we don't actually want 11486d8b79cfSDave Chinner * to do IO here - we want the inode marked stale so we can simply 11496d8b79cfSDave Chinner * reclaim it. 
Hence if we get an EAGAIN error here, just unlock the 11506d8b79cfSDave Chinner * inode, back off and try again. Hopefully the next pass through will 11516d8b79cfSDave Chinner * see the stale flag set on the inode. 11526d8b79cfSDave Chinner */ 11536d8b79cfSDave Chinner error = xfs_iflush(ip, &bp); 11542451337dSDave Chinner if (error == -EAGAIN) { 11556d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11566d8b79cfSDave Chinner /* backoff longer than in xfs_ifree_cluster */ 11576d8b79cfSDave Chinner delay(2); 11586d8b79cfSDave Chinner goto restart; 11596d8b79cfSDave Chinner } 11606d8b79cfSDave Chinner 11616d8b79cfSDave Chinner if (!error) { 11626d8b79cfSDave Chinner error = xfs_bwrite(bp); 11636d8b79cfSDave Chinner xfs_buf_relse(bp); 11646d8b79cfSDave Chinner } 11656d8b79cfSDave Chinner 11666d8b79cfSDave Chinner reclaim: 116798efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 116898efe8afSBrian Foster 11698a17d7ddSDave Chinner /* 11708a17d7ddSDave Chinner * Because we use RCU freeing we need to ensure the inode always appears 11718a17d7ddSDave Chinner * to be reclaimed with an invalid inode number when in the free state. 117298efe8afSBrian Foster * We do this as early as possible under the ILOCK so that 1173f2e9ad21SOmar Sandoval * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 1174f2e9ad21SOmar Sandoval * detect races with us here. By doing this, we guarantee that once 1175f2e9ad21SOmar Sandoval * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 1176f2e9ad21SOmar Sandoval * it will see either a valid inode that will serialise correctly, or it 1177f2e9ad21SOmar Sandoval * will see an invalid inode that it can skip. 11788a17d7ddSDave Chinner */ 11798a17d7ddSDave Chinner spin_lock(&ip->i_flags_lock); 11808a17d7ddSDave Chinner ip->i_flags = XFS_IRECLAIM; 11818a17d7ddSDave Chinner ip->i_ino = 0; 11828a17d7ddSDave Chinner spin_unlock(&ip->i_flags_lock); 11838a17d7ddSDave Chinner 11846d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11856d8b79cfSDave Chinner 1186ff6d6af2SBill O'Donnell XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 11876d8b79cfSDave Chinner /* 11886d8b79cfSDave Chinner * Remove the inode from the per-AG radix tree. 11896d8b79cfSDave Chinner * 11906d8b79cfSDave Chinner * Because radix_tree_delete won't complain even if the item was never 11916d8b79cfSDave Chinner * added to the tree assert that it's been there before to catch 11926d8b79cfSDave Chinner * problems with the inode life time early on. 11936d8b79cfSDave Chinner */ 11946d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 11956d8b79cfSDave Chinner if (!radix_tree_delete(&pag->pag_ici_root, 11968a17d7ddSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ino))) 11976d8b79cfSDave Chinner ASSERT(0); 1198545c0889SDave Chinner xfs_perag_clear_reclaim_tag(pag); 11996d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 12006d8b79cfSDave Chinner 12016d8b79cfSDave Chinner /* 12026d8b79cfSDave Chinner * Here we do an (almost) spurious inode lock in order to coordinate 12036d8b79cfSDave Chinner * with inode cache radix tree lookups. This is because the lookup 12046d8b79cfSDave Chinner * can reference the inodes in the cache without taking references. 12056d8b79cfSDave Chinner * 12066d8b79cfSDave Chinner * We make that OK here by ensuring that we wait until the inode is 12076d8b79cfSDave Chinner * unlocked after the lookup before we go ahead and free it. 
12086d8b79cfSDave Chinner */ 12096d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 12106d8b79cfSDave Chinner xfs_qm_dqdetach(ip); 12116d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12126d8b79cfSDave Chinner 12138a17d7ddSDave Chinner __xfs_inode_free(ip); 12146d8b79cfSDave Chinner return error; 12156d8b79cfSDave Chinner 12166d8b79cfSDave Chinner out_ifunlock: 12176d8b79cfSDave Chinner xfs_ifunlock(ip); 12186d8b79cfSDave Chinner out: 12196d8b79cfSDave Chinner xfs_iflags_clear(ip, XFS_IRECLAIM); 12206d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12216d8b79cfSDave Chinner /* 12222451337dSDave Chinner * We could return -EAGAIN here to make reclaim rescan the inode tree in 12236d8b79cfSDave Chinner * a short while. However, this just burns CPU time scanning the tree 12246d8b79cfSDave Chinner * waiting for IO to complete and the reclaim work never goes back to 12256d8b79cfSDave Chinner * the idle state. Instead, return 0 to let the next scheduled 12266d8b79cfSDave Chinner * background reclaim attempt to reclaim the inode again. 12276d8b79cfSDave Chinner */ 12286d8b79cfSDave Chinner return 0; 12296d8b79cfSDave Chinner } 12306d8b79cfSDave Chinner 12316d8b79cfSDave Chinner /* 12326d8b79cfSDave Chinner * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 12336d8b79cfSDave Chinner * corrupted, we still want to try to reclaim all the inodes. If we don't, 12346d8b79cfSDave Chinner * then a shut down during the filesystem unmount reclaim walk would leak all 12356d8b79cfSDave Chinner * the unreclaimed inodes. 12366d8b79cfSDave Chinner */ 123733479e05SDave Chinner STATIC int 12386d8b79cfSDave Chinner xfs_reclaim_inodes_ag( 12396d8b79cfSDave Chinner struct xfs_mount *mp, 12406d8b79cfSDave Chinner int flags, 12416d8b79cfSDave Chinner int *nr_to_scan) 12426d8b79cfSDave Chinner { 12436d8b79cfSDave Chinner struct xfs_perag *pag; 12446d8b79cfSDave Chinner int error = 0; 12456d8b79cfSDave Chinner int last_error = 0; 12466d8b79cfSDave Chinner xfs_agnumber_t ag; 12476d8b79cfSDave Chinner int trylock = flags & SYNC_TRYLOCK; 12486d8b79cfSDave Chinner int skipped; 12496d8b79cfSDave Chinner 12506d8b79cfSDave Chinner restart: 12516d8b79cfSDave Chinner ag = 0; 12526d8b79cfSDave Chinner skipped = 0; 12536d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 12546d8b79cfSDave Chinner unsigned long first_index = 0; 12556d8b79cfSDave Chinner int done = 0; 12566d8b79cfSDave Chinner int nr_found = 0; 12576d8b79cfSDave Chinner 12586d8b79cfSDave Chinner ag = pag->pag_agno + 1; 12596d8b79cfSDave Chinner 12606d8b79cfSDave Chinner if (trylock) { 12616d8b79cfSDave Chinner if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 12626d8b79cfSDave Chinner skipped++; 12636d8b79cfSDave Chinner xfs_perag_put(pag); 12646d8b79cfSDave Chinner continue; 12656d8b79cfSDave Chinner } 12666d8b79cfSDave Chinner first_index = pag->pag_ici_reclaim_cursor; 12676d8b79cfSDave Chinner } else 12686d8b79cfSDave Chinner mutex_lock(&pag->pag_ici_reclaim_lock); 12696d8b79cfSDave Chinner 12706d8b79cfSDave Chinner do { 12716d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 12726d8b79cfSDave Chinner int i; 12736d8b79cfSDave Chinner 12746d8b79cfSDave Chinner rcu_read_lock(); 12756d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup_tag( 12766d8b79cfSDave Chinner &pag->pag_ici_root, 12776d8b79cfSDave Chinner (void **)batch, first_index, 12786d8b79cfSDave Chinner XFS_LOOKUP_BATCH, 12796d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 12806d8b79cfSDave Chinner if (!nr_found) { 12816d8b79cfSDave
Chinner done = 1; 12826d8b79cfSDave Chinner rcu_read_unlock(); 12836d8b79cfSDave Chinner break; 12846d8b79cfSDave Chinner } 12856d8b79cfSDave Chinner 12866d8b79cfSDave Chinner /* 12876d8b79cfSDave Chinner * Grab the inodes before we drop the lock. If we found 12886d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 12896d8b79cfSDave Chinner */ 12906d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 12916d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 12926d8b79cfSDave Chinner 12936d8b79cfSDave Chinner if (done || xfs_reclaim_inode_grab(ip, flags)) 12946d8b79cfSDave Chinner batch[i] = NULL; 12956d8b79cfSDave Chinner 12966d8b79cfSDave Chinner /* 12976d8b79cfSDave Chinner * Update the index for the next lookup. Catch 12986d8b79cfSDave Chinner * overflows into the next AG range which can 12996d8b79cfSDave Chinner * occur if we have inodes in the last block of 13006d8b79cfSDave Chinner * the AG and we are currently pointing to the 13016d8b79cfSDave Chinner * last inode. 13026d8b79cfSDave Chinner * 13036d8b79cfSDave Chinner * Because we may see inodes that are from the 13046d8b79cfSDave Chinner * wrong AG due to RCU freeing and 13056d8b79cfSDave Chinner * reallocation, only update the index if it 13066d8b79cfSDave Chinner * lies in this AG. It was a race that led us 13076d8b79cfSDave Chinner * to see this inode, so another lookup from 13086d8b79cfSDave Chinner * the same index will not find it again. 13096d8b79cfSDave Chinner */ 13106d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 13116d8b79cfSDave Chinner pag->pag_agno) 13126d8b79cfSDave Chinner continue; 13136d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 13146d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 13156d8b79cfSDave Chinner done = 1; 13166d8b79cfSDave Chinner } 13176d8b79cfSDave Chinner 13186d8b79cfSDave Chinner /* unlock now that we've grabbed the inodes. */ 13196d8b79cfSDave Chinner rcu_read_unlock(); 13206d8b79cfSDave Chinner 13216d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 13226d8b79cfSDave Chinner if (!batch[i]) 13236d8b79cfSDave Chinner continue; 13246d8b79cfSDave Chinner error = xfs_reclaim_inode(batch[i], pag, flags); 13252451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 13266d8b79cfSDave Chinner last_error = error; 13276d8b79cfSDave Chinner } 13286d8b79cfSDave Chinner 13296d8b79cfSDave Chinner *nr_to_scan -= XFS_LOOKUP_BATCH; 13306d8b79cfSDave Chinner 13316d8b79cfSDave Chinner cond_resched(); 13326d8b79cfSDave Chinner 13336d8b79cfSDave Chinner } while (nr_found && !done && *nr_to_scan > 0); 13346d8b79cfSDave Chinner 13356d8b79cfSDave Chinner if (trylock && !done) 13366d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = first_index; 13376d8b79cfSDave Chinner else 13386d8b79cfSDave Chinner pag->pag_ici_reclaim_cursor = 0; 13396d8b79cfSDave Chinner mutex_unlock(&pag->pag_ici_reclaim_lock); 13406d8b79cfSDave Chinner xfs_perag_put(pag); 13416d8b79cfSDave Chinner } 13426d8b79cfSDave Chinner 13436d8b79cfSDave Chinner /* 13446d8b79cfSDave Chinner * If we skipped any AG, and we still have scan count remaining, do 13456d8b79cfSDave Chinner * another pass, this time using blocking reclaim semantics (i.e. 13466d8b79cfSDave Chinner * waiting on the reclaim locks and ignoring the reclaim cursors). This 13476d8b79cfSDave Chinner * ensures that when we get more reclaimers than AGs we block rather 13486d8b79cfSDave Chinner * than spin trying to execute reclaim.
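 *
 * Note that skipped is only incremented when the trylock on a perag's
 * pag_ici_reclaim_lock fails above, so the blocking pass amounts to making
 * surplus reclaimers wait on those locks rather than repeatedly rescanning
 * the tree.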
13496d8b79cfSDave Chinner */ 13506d8b79cfSDave Chinner if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 13516d8b79cfSDave Chinner trylock = 0; 13526d8b79cfSDave Chinner goto restart; 13536d8b79cfSDave Chinner } 1354b474c7aeSEric Sandeen return last_error; 13556d8b79cfSDave Chinner } 13566d8b79cfSDave Chinner 13576d8b79cfSDave Chinner int 13586d8b79cfSDave Chinner xfs_reclaim_inodes( 13596d8b79cfSDave Chinner xfs_mount_t *mp, 13606d8b79cfSDave Chinner int mode) 13616d8b79cfSDave Chinner { 13626d8b79cfSDave Chinner int nr_to_scan = INT_MAX; 13636d8b79cfSDave Chinner 13646d8b79cfSDave Chinner return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 13656d8b79cfSDave Chinner } 13666d8b79cfSDave Chinner 13676d8b79cfSDave Chinner /* 13686d8b79cfSDave Chinner * Scan a certain number of inodes for reclaim. 13696d8b79cfSDave Chinner * 13706d8b79cfSDave Chinner * When called we make sure that there is a background (fast) inode reclaim in 13716d8b79cfSDave Chinner * progress, while we will throttle the speed of reclaim via doing synchronous 13726d8b79cfSDave Chinner * reclaim of inodes. That means if we come across dirty inodes, we wait for 13736d8b79cfSDave Chinner * them to be cleaned, which we hope will not be very long due to the 13746d8b79cfSDave Chinner * background walker having already kicked the IO off on those dirty inodes. 13756d8b79cfSDave Chinner */ 13760a234c6dSDave Chinner long 13776d8b79cfSDave Chinner xfs_reclaim_inodes_nr( 13786d8b79cfSDave Chinner struct xfs_mount *mp, 13796d8b79cfSDave Chinner int nr_to_scan) 13806d8b79cfSDave Chinner { 13816d8b79cfSDave Chinner /* kick background reclaimer and push the AIL */ 13826d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 13836d8b79cfSDave Chinner xfs_ail_push_all(mp->m_ail); 13846d8b79cfSDave Chinner 13850a234c6dSDave Chinner return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 13866d8b79cfSDave Chinner } 13876d8b79cfSDave Chinner 13886d8b79cfSDave Chinner /* 13896d8b79cfSDave Chinner * Return the number of reclaimable inodes in the filesystem for 13906d8b79cfSDave Chinner * the shrinker to determine how much to reclaim. 
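 *
 * As a rough sketch of how xfs_reclaim_inodes_count() and
 * xfs_reclaim_inodes_nr() are meant to be paired by a superblock shrinker
 * (the callback names below are illustrative only, not the actual super.c
 * code; XFS_M() maps a super_block to its struct xfs_mount):
 *
 *	static long
 *	example_nr_cached_objects(struct super_block *sb, struct shrink_control *sc)
 *	{
 *		return xfs_reclaim_inodes_count(XFS_M(sb));
 *	}
 *
 *	static long
 *	example_free_cached_objects(struct super_block *sb, struct shrink_control *sc)
 *	{
 *		return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 *	}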
13916d8b79cfSDave Chinner */ 13926d8b79cfSDave Chinner int 13936d8b79cfSDave Chinner xfs_reclaim_inodes_count( 13946d8b79cfSDave Chinner struct xfs_mount *mp) 13956d8b79cfSDave Chinner { 13966d8b79cfSDave Chinner struct xfs_perag *pag; 13976d8b79cfSDave Chinner xfs_agnumber_t ag = 0; 13986d8b79cfSDave Chinner int reclaimable = 0; 13996d8b79cfSDave Chinner 14006d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 14016d8b79cfSDave Chinner ag = pag->pag_agno + 1; 14026d8b79cfSDave Chinner reclaimable += pag->pag_ici_reclaimable; 14036d8b79cfSDave Chinner xfs_perag_put(pag); 14046d8b79cfSDave Chinner } 14056d8b79cfSDave Chinner return reclaimable; 14066d8b79cfSDave Chinner } 14076d8b79cfSDave Chinner 140841176a68SBrian Foster STATIC int 14093e3f9f58SBrian Foster xfs_inode_match_id( 14103e3f9f58SBrian Foster struct xfs_inode *ip, 14113e3f9f58SBrian Foster struct xfs_eofblocks *eofb) 14123e3f9f58SBrian Foster { 1413b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1414b9fe5052SDwight Engen !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 14153e3f9f58SBrian Foster return 0; 14161b556048SBrian Foster 1417b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1418b9fe5052SDwight Engen !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 14191b556048SBrian Foster return 0; 14201b556048SBrian Foster 1421b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 14221b556048SBrian Foster xfs_get_projid(ip) != eofb->eof_prid) 14231b556048SBrian Foster return 0; 14241b556048SBrian Foster 14251b556048SBrian Foster return 1; 14263e3f9f58SBrian Foster } 14273e3f9f58SBrian Foster 1428f4526397SBrian Foster /* 1429f4526397SBrian Foster * A union-based inode filtering algorithm. Process the inode if any of the 1430f4526397SBrian Foster * criteria match. This is for global/internal scans only. 
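 *
 * For example (illustrative values only), a scan set up with:
 *
 *	struct xfs_eofblocks	eofb = { 0 };
 *
 *	eofb.eof_flags = XFS_EOF_FLAGS_UNION | XFS_EOF_FLAGS_UID;
 *	eofb.eof_uid = VFS_I(ip)->i_uid;
 *
 * visits every tagged inode owned by that uid even when none of the other
 * criteria match.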
1431f4526397SBrian Foster */ 1432f4526397SBrian Foster STATIC int 1433f4526397SBrian Foster xfs_inode_match_id_union( 1434f4526397SBrian Foster struct xfs_inode *ip, 1435f4526397SBrian Foster struct xfs_eofblocks *eofb) 1436f4526397SBrian Foster { 1437f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1438f4526397SBrian Foster uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1439f4526397SBrian Foster return 1; 1440f4526397SBrian Foster 1441f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1442f4526397SBrian Foster gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1443f4526397SBrian Foster return 1; 1444f4526397SBrian Foster 1445f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1446f4526397SBrian Foster xfs_get_projid(ip) == eofb->eof_prid) 1447f4526397SBrian Foster return 1; 1448f4526397SBrian Foster 1449f4526397SBrian Foster return 0; 1450f4526397SBrian Foster } 1451f4526397SBrian Foster 14523e3f9f58SBrian Foster STATIC int 145341176a68SBrian Foster xfs_inode_free_eofblocks( 145441176a68SBrian Foster struct xfs_inode *ip, 145541176a68SBrian Foster int flags, 145641176a68SBrian Foster void *args) 145741176a68SBrian Foster { 1458a36b9261SBrian Foster int ret = 0; 14593e3f9f58SBrian Foster struct xfs_eofblocks *eofb = args; 1460f4526397SBrian Foster int match; 14615400da7dSBrian Foster 146241176a68SBrian Foster if (!xfs_can_free_eofblocks(ip, false)) { 146341176a68SBrian Foster /* inode could be preallocated or append-only */ 146441176a68SBrian Foster trace_xfs_inode_free_eofblocks_invalid(ip); 146541176a68SBrian Foster xfs_inode_clear_eofblocks_tag(ip); 146641176a68SBrian Foster return 0; 146741176a68SBrian Foster } 146841176a68SBrian Foster 146941176a68SBrian Foster /* 147041176a68SBrian Foster * If the mapping is dirty the operation can block and wait for some 147141176a68SBrian Foster * time. Unless we are waiting, skip it. 147241176a68SBrian Foster */ 147341176a68SBrian Foster if (!(flags & SYNC_WAIT) && 147441176a68SBrian Foster mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 147541176a68SBrian Foster return 0; 147641176a68SBrian Foster 147700ca79a0SBrian Foster if (eofb) { 1478f4526397SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1479f4526397SBrian Foster match = xfs_inode_match_id_union(ip, eofb); 1480f4526397SBrian Foster else 1481f4526397SBrian Foster match = xfs_inode_match_id(ip, eofb); 1482f4526397SBrian Foster if (!match) 14833e3f9f58SBrian Foster return 0; 14843e3f9f58SBrian Foster 148500ca79a0SBrian Foster /* skip the inode if the file size is too small */ 148600ca79a0SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 148700ca79a0SBrian Foster XFS_ISIZE(ip) < eofb->eof_min_file_size) 148800ca79a0SBrian Foster return 0; 148900ca79a0SBrian Foster } 149000ca79a0SBrian Foster 1491a36b9261SBrian Foster /* 1492a36b9261SBrian Foster * If the caller is waiting, return -EAGAIN to keep the background 1493a36b9261SBrian Foster * scanner moving and revisit the inode in a subsequent pass. 1494a36b9261SBrian Foster */ 1495c3155097SBrian Foster if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1496a36b9261SBrian Foster if (flags & SYNC_WAIT) 1497a36b9261SBrian Foster ret = -EAGAIN; 1498a36b9261SBrian Foster return ret; 1499a36b9261SBrian Foster } 1500a36b9261SBrian Foster ret = xfs_free_eofblocks(ip); 1501a36b9261SBrian Foster xfs_iunlock(ip, XFS_IOLOCK_EXCL); 150241176a68SBrian Foster 150341176a68SBrian Foster return ret; 150441176a68SBrian Foster } 150541176a68SBrian Foster 150683104d44SDarrick J. 
Wong static int 150783104d44SDarrick J. Wong __xfs_icache_free_eofblocks( 150841176a68SBrian Foster struct xfs_mount *mp, 150983104d44SDarrick J. Wong struct xfs_eofblocks *eofb, 151083104d44SDarrick J. Wong int (*execute)(struct xfs_inode *ip, int flags, 151183104d44SDarrick J. Wong void *args), 151283104d44SDarrick J. Wong int tag) 151341176a68SBrian Foster { 15148ca149deSBrian Foster int flags = SYNC_TRYLOCK; 15158ca149deSBrian Foster 15168ca149deSBrian Foster if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) 15178ca149deSBrian Foster flags = SYNC_WAIT; 15188ca149deSBrian Foster 151983104d44SDarrick J. Wong return xfs_inode_ag_iterator_tag(mp, execute, flags, 152083104d44SDarrick J. Wong eofb, tag); 152183104d44SDarrick J. Wong } 152283104d44SDarrick J. Wong 152383104d44SDarrick J. Wong int 152483104d44SDarrick J. Wong xfs_icache_free_eofblocks( 152583104d44SDarrick J. Wong struct xfs_mount *mp, 152683104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 152783104d44SDarrick J. Wong { 152883104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, 152983104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 153041176a68SBrian Foster } 153141176a68SBrian Foster 1532dc06f398SBrian Foster /* 1533dc06f398SBrian Foster * Run eofblocks scans on the quotas applicable to the inode. For inodes with 1534dc06f398SBrian Foster * multiple quotas, we don't know exactly which quota caused an allocation 1535dc06f398SBrian Foster * failure. We make a best effort by including each quota under low free space 1536dc06f398SBrian Foster * conditions (less than 1% free space) in the scan. 1537dc06f398SBrian Foster */ 153883104d44SDarrick J. Wong static int 153983104d44SDarrick J. Wong __xfs_inode_free_quota_eofblocks( 154083104d44SDarrick J. Wong struct xfs_inode *ip, 154183104d44SDarrick J. Wong int (*execute)(struct xfs_mount *mp, 154283104d44SDarrick J. Wong struct xfs_eofblocks *eofb)) 1543dc06f398SBrian Foster { 1544dc06f398SBrian Foster int scan = 0; 1545dc06f398SBrian Foster struct xfs_eofblocks eofb = {0}; 1546dc06f398SBrian Foster struct xfs_dquot *dq; 1547dc06f398SBrian Foster 1548dc06f398SBrian Foster /* 1549c3155097SBrian Foster * Run a sync scan to increase effectiveness and use the union filter to 1550dc06f398SBrian Foster * cover all applicable quotas in a single scan. 1551dc06f398SBrian Foster */ 1552dc06f398SBrian Foster eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1553dc06f398SBrian Foster 1554dc06f398SBrian Foster if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1555dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_USER); 1556dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1557dc06f398SBrian Foster eofb.eof_uid = VFS_I(ip)->i_uid; 1558dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1559dc06f398SBrian Foster scan = 1; 1560dc06f398SBrian Foster } 1561dc06f398SBrian Foster } 1562dc06f398SBrian Foster 1563dc06f398SBrian Foster if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { 1564dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); 1565dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1566dc06f398SBrian Foster eofb.eof_gid = VFS_I(ip)->i_gid; 1567dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1568dc06f398SBrian Foster scan = 1; 1569dc06f398SBrian Foster } 1570dc06f398SBrian Foster } 1571dc06f398SBrian Foster 1572dc06f398SBrian Foster if (scan) 157383104d44SDarrick J. 
Wong execute(ip->i_mount, &eofb); 1574dc06f398SBrian Foster 1575dc06f398SBrian Foster return scan; 1576dc06f398SBrian Foster } 1577dc06f398SBrian Foster 157883104d44SDarrick J. Wong int 157983104d44SDarrick J. Wong xfs_inode_free_quota_eofblocks( 158083104d44SDarrick J. Wong struct xfs_inode *ip) 158183104d44SDarrick J. Wong { 158283104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); 158383104d44SDarrick J. Wong } 158483104d44SDarrick J. Wong 158591aae6beSDarrick J. Wong static inline unsigned long 158691aae6beSDarrick J. Wong xfs_iflag_for_tag( 158791aae6beSDarrick J. Wong int tag) 158891aae6beSDarrick J. Wong { 158991aae6beSDarrick J. Wong switch (tag) { 159091aae6beSDarrick J. Wong case XFS_ICI_EOFBLOCKS_TAG: 159191aae6beSDarrick J. Wong return XFS_IEOFBLOCKS; 159291aae6beSDarrick J. Wong case XFS_ICI_COWBLOCKS_TAG: 159391aae6beSDarrick J. Wong return XFS_ICOWBLOCKS; 159491aae6beSDarrick J. Wong default: 159591aae6beSDarrick J. Wong ASSERT(0); 159691aae6beSDarrick J. Wong return 0; 159791aae6beSDarrick J. Wong } 159891aae6beSDarrick J. Wong } 159991aae6beSDarrick J. Wong 160083104d44SDarrick J. Wong static void 160191aae6beSDarrick J. Wong __xfs_inode_set_blocks_tag( 160283104d44SDarrick J. Wong xfs_inode_t *ip, 160383104d44SDarrick J. Wong void (*execute)(struct xfs_mount *mp), 160483104d44SDarrick J. Wong void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 160583104d44SDarrick J. Wong int error, unsigned long caller_ip), 160683104d44SDarrick J. Wong int tag) 160727b52867SBrian Foster { 160827b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 160927b52867SBrian Foster struct xfs_perag *pag; 161027b52867SBrian Foster int tagged; 161127b52867SBrian Foster 161285a6e764SChristoph Hellwig /* 161385a6e764SChristoph Hellwig * Don't bother locking the AG and looking up in the radix trees 161485a6e764SChristoph Hellwig * if we already know that we have the tag set. 161585a6e764SChristoph Hellwig */ 161691aae6beSDarrick J. Wong if (ip->i_flags & xfs_iflag_for_tag(tag)) 161785a6e764SChristoph Hellwig return; 161885a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 161991aae6beSDarrick J. Wong ip->i_flags |= xfs_iflag_for_tag(tag); 162085a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 162185a6e764SChristoph Hellwig 162227b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 162327b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 162427b52867SBrian Foster 162583104d44SDarrick J. Wong tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 162627b52867SBrian Foster radix_tree_tag_set(&pag->pag_ici_root, 162783104d44SDarrick J. Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 162827b52867SBrian Foster if (!tagged) { 162927b52867SBrian Foster /* propagate the eofblocks tag up into the perag radix tree */ 163027b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 163127b52867SBrian Foster radix_tree_tag_set(&ip->i_mount->m_perag_tree, 163227b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 163383104d44SDarrick J. Wong tag); 163427b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 163527b52867SBrian Foster 1636579b62faSBrian Foster /* kick off background trimming */ 163783104d44SDarrick J. Wong execute(ip->i_mount); 1638579b62faSBrian Foster 163983104d44SDarrick J. 
Wong set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 164027b52867SBrian Foster } 164127b52867SBrian Foster 164227b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 164327b52867SBrian Foster xfs_perag_put(pag); 164427b52867SBrian Foster } 164527b52867SBrian Foster 164627b52867SBrian Foster void 164783104d44SDarrick J. Wong xfs_inode_set_eofblocks_tag( 164827b52867SBrian Foster xfs_inode_t *ip) 164927b52867SBrian Foster { 165083104d44SDarrick J. Wong trace_xfs_inode_set_eofblocks_tag(ip); 165191aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, 165283104d44SDarrick J. Wong trace_xfs_perag_set_eofblocks, 165383104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 165483104d44SDarrick J. Wong } 165583104d44SDarrick J. Wong 165683104d44SDarrick J. Wong static void 165791aae6beSDarrick J. Wong __xfs_inode_clear_blocks_tag( 165883104d44SDarrick J. Wong xfs_inode_t *ip, 165983104d44SDarrick J. Wong void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 166083104d44SDarrick J. Wong int error, unsigned long caller_ip), 166183104d44SDarrick J. Wong int tag) 166283104d44SDarrick J. Wong { 166327b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 166427b52867SBrian Foster struct xfs_perag *pag; 166527b52867SBrian Foster 166685a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 166791aae6beSDarrick J. Wong ip->i_flags &= ~xfs_iflag_for_tag(tag); 166885a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 166985a6e764SChristoph Hellwig 167027b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 167127b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 167227b52867SBrian Foster 167327b52867SBrian Foster radix_tree_tag_clear(&pag->pag_ici_root, 167483104d44SDarrick J. Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 167583104d44SDarrick J. Wong if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { 167627b52867SBrian Foster /* clear the eofblocks tag from the perag radix tree */ 167727b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 167827b52867SBrian Foster radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 167927b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 168083104d44SDarrick J. Wong tag); 168127b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 168283104d44SDarrick J. Wong clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 168327b52867SBrian Foster } 168427b52867SBrian Foster 168527b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 168627b52867SBrian Foster xfs_perag_put(pag); 168727b52867SBrian Foster } 168827b52867SBrian Foster 168983104d44SDarrick J. Wong void 169083104d44SDarrick J. Wong xfs_inode_clear_eofblocks_tag( 169183104d44SDarrick J. Wong xfs_inode_t *ip) 169283104d44SDarrick J. Wong { 169383104d44SDarrick J. Wong trace_xfs_inode_clear_eofblocks_tag(ip); 169491aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 169583104d44SDarrick J. Wong trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); 169683104d44SDarrick J. Wong } 169783104d44SDarrick J. Wong 169883104d44SDarrick J. Wong /* 1699be78ff0eSDarrick J. Wong * Set ourselves up to free CoW blocks from this file. If it's already clean 1700be78ff0eSDarrick J. Wong * then we can bail out quickly, but otherwise we must back off if the file 1701be78ff0eSDarrick J. Wong * is undergoing some kind of write. 1702be78ff0eSDarrick J. Wong */ 1703be78ff0eSDarrick J. Wong static bool 1704be78ff0eSDarrick J. Wong xfs_prep_free_cowblocks( 170551d62690SChristoph Hellwig struct xfs_inode *ip) 1706be78ff0eSDarrick J. 
Wong { 1707be78ff0eSDarrick J. Wong /* 1708be78ff0eSDarrick J. Wong * Just clear the tag if we have an empty cow fork or none at all. It's 1709be78ff0eSDarrick J. Wong * possible the inode was fully unshared since it was originally tagged. 1710be78ff0eSDarrick J. Wong */ 171151d62690SChristoph Hellwig if (!xfs_inode_has_cow_data(ip)) { 1712be78ff0eSDarrick J. Wong trace_xfs_inode_free_cowblocks_invalid(ip); 1713be78ff0eSDarrick J. Wong xfs_inode_clear_cowblocks_tag(ip); 1714be78ff0eSDarrick J. Wong return false; 1715be78ff0eSDarrick J. Wong } 1716be78ff0eSDarrick J. Wong 1717be78ff0eSDarrick J. Wong /* 1718be78ff0eSDarrick J. Wong * If the mapping is dirty or under writeback we cannot touch the 1719be78ff0eSDarrick J. Wong * CoW fork. Leave it alone if we're in the midst of a directio. 1720be78ff0eSDarrick J. Wong */ 1721be78ff0eSDarrick J. Wong if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1722be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1723be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1724be78ff0eSDarrick J. Wong atomic_read(&VFS_I(ip)->i_dio_count)) 1725be78ff0eSDarrick J. Wong return false; 1726be78ff0eSDarrick J. Wong 1727be78ff0eSDarrick J. Wong return true; 1728be78ff0eSDarrick J. Wong } 1729be78ff0eSDarrick J. Wong 1730be78ff0eSDarrick J. Wong /* 173183104d44SDarrick J. Wong * Automatic CoW Reservation Freeing 173283104d44SDarrick J. Wong * 173383104d44SDarrick J. Wong * These functions automatically garbage collect leftover CoW reservations 173483104d44SDarrick J. Wong * that were made on behalf of a cowextsize hint when we start to run out 173583104d44SDarrick J. Wong * of quota or when the reservations sit around for too long. If the file 173683104d44SDarrick J. Wong * has dirty pages or is undergoing writeback, its CoW reservations will 173783104d44SDarrick J. Wong * be retained. 173883104d44SDarrick J. Wong * 173983104d44SDarrick J. Wong * The actual garbage collection piggybacks off the same code that runs 174083104d44SDarrick J. Wong * the speculative EOF preallocation garbage collector. 174183104d44SDarrick J. Wong */ 174283104d44SDarrick J. Wong STATIC int 174383104d44SDarrick J. Wong xfs_inode_free_cowblocks( 174483104d44SDarrick J. Wong struct xfs_inode *ip, 174583104d44SDarrick J. Wong int flags, 174683104d44SDarrick J. Wong void *args) 174783104d44SDarrick J. Wong { 174883104d44SDarrick J. Wong struct xfs_eofblocks *eofb = args; 1749be78ff0eSDarrick J. Wong int match; 1750be78ff0eSDarrick J. Wong int ret = 0; 175183104d44SDarrick J. Wong 175251d62690SChristoph Hellwig if (!xfs_prep_free_cowblocks(ip)) 175383104d44SDarrick J. Wong return 0; 175483104d44SDarrick J. Wong 175583104d44SDarrick J. Wong if (eofb) { 175683104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 175783104d44SDarrick J. Wong match = xfs_inode_match_id_union(ip, eofb); 175883104d44SDarrick J. Wong else 175983104d44SDarrick J. Wong match = xfs_inode_match_id(ip, eofb); 176083104d44SDarrick J. Wong if (!match) 176183104d44SDarrick J. Wong return 0; 176283104d44SDarrick J. Wong 176383104d44SDarrick J. Wong /* skip the inode if the file size is too small */ 176483104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 176583104d44SDarrick J. Wong XFS_ISIZE(ip) < eofb->eof_min_file_size) 176683104d44SDarrick J. Wong return 0; 176783104d44SDarrick J. Wong } 176883104d44SDarrick J. Wong 176983104d44SDarrick J. Wong /* Free the CoW blocks */ 177083104d44SDarrick J. 
Wong xfs_ilock(ip, XFS_IOLOCK_EXCL); 177183104d44SDarrick J. Wong xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 177283104d44SDarrick J. Wong 1773be78ff0eSDarrick J. Wong /* 1774be78ff0eSDarrick J. Wong * Check again, nobody else should be able to dirty blocks or change 1775be78ff0eSDarrick J. Wong * the reflink iflag now that we have the first two locks held. 1776be78ff0eSDarrick J. Wong */ 177751d62690SChristoph Hellwig if (xfs_prep_free_cowblocks(ip)) 17783802a345SChristoph Hellwig ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 177983104d44SDarrick J. Wong 178083104d44SDarrick J. Wong xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 178183104d44SDarrick J. Wong xfs_iunlock(ip, XFS_IOLOCK_EXCL); 178283104d44SDarrick J. Wong 178383104d44SDarrick J. Wong return ret; 178483104d44SDarrick J. Wong } 178583104d44SDarrick J. Wong 178683104d44SDarrick J. Wong int 178783104d44SDarrick J. Wong xfs_icache_free_cowblocks( 178883104d44SDarrick J. Wong struct xfs_mount *mp, 178983104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 179083104d44SDarrick J. Wong { 179183104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, 179283104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 179383104d44SDarrick J. Wong } 179483104d44SDarrick J. Wong 179583104d44SDarrick J. Wong int 179683104d44SDarrick J. Wong xfs_inode_free_quota_cowblocks( 179783104d44SDarrick J. Wong struct xfs_inode *ip) 179883104d44SDarrick J. Wong { 179983104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); 180083104d44SDarrick J. Wong } 180183104d44SDarrick J. Wong 180283104d44SDarrick J. Wong void 180383104d44SDarrick J. Wong xfs_inode_set_cowblocks_tag( 180483104d44SDarrick J. Wong xfs_inode_t *ip) 180583104d44SDarrick J. Wong { 18067b7381f0SBrian Foster trace_xfs_inode_set_cowblocks_tag(ip); 180791aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, 18087b7381f0SBrian Foster trace_xfs_perag_set_cowblocks, 180983104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 181083104d44SDarrick J. Wong } 181183104d44SDarrick J. Wong 181283104d44SDarrick J. Wong void 181383104d44SDarrick J. Wong xfs_inode_clear_cowblocks_tag( 181483104d44SDarrick J. Wong xfs_inode_t *ip) 181583104d44SDarrick J. Wong { 18167b7381f0SBrian Foster trace_xfs_inode_clear_cowblocks_tag(ip); 181791aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 18187b7381f0SBrian Foster trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); 181983104d44SDarrick J. Wong } 1820d6b636ebSDarrick J. Wong 1821d6b636ebSDarrick J. Wong /* Disable post-EOF and CoW block auto-reclamation. */ 1822d6b636ebSDarrick J. Wong void 1823ed30dcbdSDarrick J. Wong xfs_stop_block_reaping( 1824d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1825d6b636ebSDarrick J. Wong { 1826d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_eofblocks_work); 1827d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_cowblocks_work); 1828d6b636ebSDarrick J. Wong } 1829d6b636ebSDarrick J. Wong 1830d6b636ebSDarrick J. Wong /* Enable post-EOF and CoW block auto-reclamation. */ 1831d6b636ebSDarrick J. Wong void 1832ed30dcbdSDarrick J. Wong xfs_start_block_reaping( 1833d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1834d6b636ebSDarrick J. Wong { 1835d6b636ebSDarrick J. Wong xfs_queue_eofblocks(mp); 1836d6b636ebSDarrick J. Wong xfs_queue_cowblocks(mp); 1837d6b636ebSDarrick J. Wong } 1838
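
/*
 * Note: the stop/start pair above is intended to bracket periods in which the
 * EOF blocks and CoW blocks workers must not run (for instance while the
 * filesystem is being frozen or quiesced); the actual call sites live outside
 * this file.
 */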