10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0 26d8b79cfSDave Chinner /* 36d8b79cfSDave Chinner * Copyright (c) 2000-2005 Silicon Graphics, Inc. 46d8b79cfSDave Chinner * All Rights Reserved. 56d8b79cfSDave Chinner */ 66d8b79cfSDave Chinner #include "xfs.h" 76d8b79cfSDave Chinner #include "xfs_fs.h" 85467b34bSDarrick J. Wong #include "xfs_shared.h" 96ca1c906SDave Chinner #include "xfs_format.h" 10239880efSDave Chinner #include "xfs_log_format.h" 11239880efSDave Chinner #include "xfs_trans_resv.h" 126d8b79cfSDave Chinner #include "xfs_sb.h" 136d8b79cfSDave Chinner #include "xfs_mount.h" 146d8b79cfSDave Chinner #include "xfs_inode.h" 15239880efSDave Chinner #include "xfs_trans.h" 16239880efSDave Chinner #include "xfs_trans_priv.h" 176d8b79cfSDave Chinner #include "xfs_inode_item.h" 186d8b79cfSDave Chinner #include "xfs_quota.h" 196d8b79cfSDave Chinner #include "xfs_trace.h" 206d8b79cfSDave Chinner #include "xfs_icache.h" 21c24b5dfaSDave Chinner #include "xfs_bmap_util.h" 22dc06f398SBrian Foster #include "xfs_dquot_item.h" 23dc06f398SBrian Foster #include "xfs_dquot.h" 2483104d44SDarrick J. Wong #include "xfs_reflink.h" 25bb8a66afSChristoph Hellwig #include "xfs_ialloc.h" 266d8b79cfSDave Chinner 27f0e28280SJeff Layton #include <linux/iversion.h> 286d8b79cfSDave Chinner 2933479e05SDave Chinner /* 3033479e05SDave Chinner * Allocate and initialise an xfs_inode. 3133479e05SDave Chinner */ 32638f4416SDave Chinner struct xfs_inode * 3333479e05SDave Chinner xfs_inode_alloc( 3433479e05SDave Chinner struct xfs_mount *mp, 3533479e05SDave Chinner xfs_ino_t ino) 3633479e05SDave Chinner { 3733479e05SDave Chinner struct xfs_inode *ip; 3833479e05SDave Chinner 3933479e05SDave Chinner /* 4033479e05SDave Chinner * if this didn't occur in transactions, we could use 4133479e05SDave Chinner * KM_MAYFAIL and return NULL here on ENOMEM. Set the 4233479e05SDave Chinner * code up to do this anyway. 4333479e05SDave Chinner */ 44707e0ddaSTetsuo Handa ip = kmem_zone_alloc(xfs_inode_zone, 0); 4533479e05SDave Chinner if (!ip) 4633479e05SDave Chinner return NULL; 4733479e05SDave Chinner if (inode_init_always(mp->m_super, VFS_I(ip))) { 48377bcd5fSCarlos Maiolino kmem_cache_free(xfs_inode_zone, ip); 4933479e05SDave Chinner return NULL; 5033479e05SDave Chinner } 5133479e05SDave Chinner 52c19b3b05SDave Chinner /* VFS doesn't initialise i_mode! */ 53c19b3b05SDave Chinner VFS_I(ip)->i_mode = 0; 54c19b3b05SDave Chinner 55ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, vn_active); 5633479e05SDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 5733479e05SDave Chinner ASSERT(!xfs_isiflocked(ip)); 5833479e05SDave Chinner ASSERT(ip->i_ino == 0); 5933479e05SDave Chinner 6033479e05SDave Chinner /* initialise the xfs inode */ 6133479e05SDave Chinner ip->i_ino = ino; 6233479e05SDave Chinner ip->i_mount = mp; 6333479e05SDave Chinner memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 6433479e05SDave Chinner ip->i_afp = NULL; 653993baebSDarrick J. Wong ip->i_cowfp = NULL; 663ba738dfSChristoph Hellwig memset(&ip->i_df, 0, sizeof(ip->i_df)); 6733479e05SDave Chinner ip->i_flags = 0; 6833479e05SDave Chinner ip->i_delayed_blks = 0; 69f8d55aa0SDave Chinner memset(&ip->i_d, 0, sizeof(ip->i_d)); 706772c1f1SDarrick J. Wong ip->i_sick = 0; 716772c1f1SDarrick J. Wong ip->i_checked = 0; 72cb357bf3SDarrick J. Wong INIT_WORK(&ip->i_ioend_work, xfs_end_io); 73cb357bf3SDarrick J. Wong INIT_LIST_HEAD(&ip->i_ioend_list); 74cb357bf3SDarrick J. 
Wong spin_lock_init(&ip->i_ioend_lock); 7533479e05SDave Chinner 7633479e05SDave Chinner return ip; 7733479e05SDave Chinner } 7833479e05SDave Chinner 7933479e05SDave Chinner STATIC void 8033479e05SDave Chinner xfs_inode_free_callback( 8133479e05SDave Chinner struct rcu_head *head) 8233479e05SDave Chinner { 8333479e05SDave Chinner struct inode *inode = container_of(head, struct inode, i_rcu); 8433479e05SDave Chinner struct xfs_inode *ip = XFS_I(inode); 8533479e05SDave Chinner 86c19b3b05SDave Chinner switch (VFS_I(ip)->i_mode & S_IFMT) { 8733479e05SDave Chinner case S_IFREG: 8833479e05SDave Chinner case S_IFDIR: 8933479e05SDave Chinner case S_IFLNK: 90ef838512SChristoph Hellwig xfs_idestroy_fork(&ip->i_df); 9133479e05SDave Chinner break; 9233479e05SDave Chinner } 9333479e05SDave Chinner 94ef838512SChristoph Hellwig if (ip->i_afp) { 95ef838512SChristoph Hellwig xfs_idestroy_fork(ip->i_afp); 96ef838512SChristoph Hellwig kmem_cache_free(xfs_ifork_zone, ip->i_afp); 97ef838512SChristoph Hellwig } 98ef838512SChristoph Hellwig if (ip->i_cowfp) { 99ef838512SChristoph Hellwig xfs_idestroy_fork(ip->i_cowfp); 100ef838512SChristoph Hellwig kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); 101ef838512SChristoph Hellwig } 10233479e05SDave Chinner if (ip->i_itemp) { 10322525c17SDave Chinner ASSERT(!test_bit(XFS_LI_IN_AIL, 10422525c17SDave Chinner &ip->i_itemp->ili_item.li_flags)); 10533479e05SDave Chinner xfs_inode_item_destroy(ip); 10633479e05SDave Chinner ip->i_itemp = NULL; 10733479e05SDave Chinner } 10833479e05SDave Chinner 109377bcd5fSCarlos Maiolino kmem_cache_free(xfs_inode_zone, ip); 1101f2dcfe8SDave Chinner } 1111f2dcfe8SDave Chinner 1128a17d7ddSDave Chinner static void 1138a17d7ddSDave Chinner __xfs_inode_free( 1148a17d7ddSDave Chinner struct xfs_inode *ip) 1158a17d7ddSDave Chinner { 1168a17d7ddSDave Chinner /* asserts to verify all state is correct here */ 1178a17d7ddSDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 1188a17d7ddSDave Chinner XFS_STATS_DEC(ip->i_mount, vn_active); 1198a17d7ddSDave Chinner 1208a17d7ddSDave Chinner call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 1218a17d7ddSDave Chinner } 1228a17d7ddSDave Chinner 1231f2dcfe8SDave Chinner void 1241f2dcfe8SDave Chinner xfs_inode_free( 1251f2dcfe8SDave Chinner struct xfs_inode *ip) 1261f2dcfe8SDave Chinner { 12798efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 12898efe8afSBrian Foster 12933479e05SDave Chinner /* 13033479e05SDave Chinner * Because we use RCU freeing we need to ensure the inode always 13133479e05SDave Chinner * appears to be reclaimed with an invalid inode number when in the 13233479e05SDave Chinner * free state. The ip->i_flags_lock provides the barrier against lookup 13333479e05SDave Chinner * races. 13433479e05SDave Chinner */ 13533479e05SDave Chinner spin_lock(&ip->i_flags_lock); 13633479e05SDave Chinner ip->i_flags = XFS_IRECLAIM; 13733479e05SDave Chinner ip->i_ino = 0; 13833479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 13933479e05SDave Chinner 1408a17d7ddSDave Chinner __xfs_inode_free(ip); 14133479e05SDave Chinner } 14233479e05SDave Chinner 14333479e05SDave Chinner /* 144ad438c40SDave Chinner * Queue a new inode reclaim pass if there are reclaimable inodes and there 145ad438c40SDave Chinner * isn't a reclaim pass already in progress. By default it runs every 5s based 146ad438c40SDave Chinner * on the xfs periodic sync default of 30s. 
Perhaps this should have it's own 147ad438c40SDave Chinner * tunable, but that can be done if this method proves to be ineffective or too 148ad438c40SDave Chinner * aggressive. 149ad438c40SDave Chinner */ 150ad438c40SDave Chinner static void 151ad438c40SDave Chinner xfs_reclaim_work_queue( 152ad438c40SDave Chinner struct xfs_mount *mp) 153ad438c40SDave Chinner { 154ad438c40SDave Chinner 155ad438c40SDave Chinner rcu_read_lock(); 156ad438c40SDave Chinner if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 157ad438c40SDave Chinner queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 158ad438c40SDave Chinner msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 159ad438c40SDave Chinner } 160ad438c40SDave Chinner rcu_read_unlock(); 161ad438c40SDave Chinner } 162ad438c40SDave Chinner 163ad438c40SDave Chinner /* 164ad438c40SDave Chinner * This is a fast pass over the inode cache to try to get reclaim moving on as 165ad438c40SDave Chinner * many inodes as possible in a short period of time. It kicks itself every few 166ad438c40SDave Chinner * seconds, as well as being kicked by the inode cache shrinker when memory 167ad438c40SDave Chinner * goes low. It scans as quickly as possible avoiding locked inodes or those 168ad438c40SDave Chinner * already being flushed, and once done schedules a future pass. 169ad438c40SDave Chinner */ 170ad438c40SDave Chinner void 171ad438c40SDave Chinner xfs_reclaim_worker( 172ad438c40SDave Chinner struct work_struct *work) 173ad438c40SDave Chinner { 174ad438c40SDave Chinner struct xfs_mount *mp = container_of(to_delayed_work(work), 175ad438c40SDave Chinner struct xfs_mount, m_reclaim_work); 176ad438c40SDave Chinner 177ad438c40SDave Chinner xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 178ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 179ad438c40SDave Chinner } 180ad438c40SDave Chinner 181ad438c40SDave Chinner static void 182ad438c40SDave Chinner xfs_perag_set_reclaim_tag( 183ad438c40SDave Chinner struct xfs_perag *pag) 184ad438c40SDave Chinner { 185ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 186ad438c40SDave Chinner 18795989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 188ad438c40SDave Chinner if (pag->pag_ici_reclaimable++) 189ad438c40SDave Chinner return; 190ad438c40SDave Chinner 191ad438c40SDave Chinner /* propagate the reclaim tag up into the perag radix tree */ 192ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 193ad438c40SDave Chinner radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, 194ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 195ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 196ad438c40SDave Chinner 197ad438c40SDave Chinner /* schedule periodic background inode reclaim */ 198ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 199ad438c40SDave Chinner 200ad438c40SDave Chinner trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 201ad438c40SDave Chinner } 202ad438c40SDave Chinner 203ad438c40SDave Chinner static void 204ad438c40SDave Chinner xfs_perag_clear_reclaim_tag( 205ad438c40SDave Chinner struct xfs_perag *pag) 206ad438c40SDave Chinner { 207ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 208ad438c40SDave Chinner 20995989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 210ad438c40SDave Chinner if (--pag->pag_ici_reclaimable) 211ad438c40SDave Chinner return; 212ad438c40SDave Chinner 213ad438c40SDave Chinner /* clear the reclaim tag from the perag radix tree */ 214ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 215ad438c40SDave Chinner 
radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, 216ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 217ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 218ad438c40SDave Chinner trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 219ad438c40SDave Chinner } 220ad438c40SDave Chinner 221ad438c40SDave Chinner 222ad438c40SDave Chinner /* 223ad438c40SDave Chinner * We set the inode flag atomically with the radix tree tag. 224ad438c40SDave Chinner * Once we get tag lookups on the radix tree, this inode flag 225ad438c40SDave Chinner * can go away. 226ad438c40SDave Chinner */ 227ad438c40SDave Chinner void 228ad438c40SDave Chinner xfs_inode_set_reclaim_tag( 229ad438c40SDave Chinner struct xfs_inode *ip) 230ad438c40SDave Chinner { 231ad438c40SDave Chinner struct xfs_mount *mp = ip->i_mount; 232ad438c40SDave Chinner struct xfs_perag *pag; 233ad438c40SDave Chinner 234ad438c40SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 235ad438c40SDave Chinner spin_lock(&pag->pag_ici_lock); 236ad438c40SDave Chinner spin_lock(&ip->i_flags_lock); 237ad438c40SDave Chinner 238ad438c40SDave Chinner radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), 239ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 240ad438c40SDave Chinner xfs_perag_set_reclaim_tag(pag); 241ad438c40SDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 242ad438c40SDave Chinner 243ad438c40SDave Chinner spin_unlock(&ip->i_flags_lock); 244ad438c40SDave Chinner spin_unlock(&pag->pag_ici_lock); 245ad438c40SDave Chinner xfs_perag_put(pag); 246ad438c40SDave Chinner } 247ad438c40SDave Chinner 248ad438c40SDave Chinner STATIC void 249ad438c40SDave Chinner xfs_inode_clear_reclaim_tag( 250ad438c40SDave Chinner struct xfs_perag *pag, 251ad438c40SDave Chinner xfs_ino_t ino) 252ad438c40SDave Chinner { 253ad438c40SDave Chinner radix_tree_tag_clear(&pag->pag_ici_root, 254ad438c40SDave Chinner XFS_INO_TO_AGINO(pag->pag_mount, ino), 255ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 256ad438c40SDave Chinner xfs_perag_clear_reclaim_tag(pag); 257ad438c40SDave Chinner } 258ad438c40SDave Chinner 259ae2c4ac2SBrian Foster static void 260ae2c4ac2SBrian Foster xfs_inew_wait( 261ae2c4ac2SBrian Foster struct xfs_inode *ip) 262ae2c4ac2SBrian Foster { 263ae2c4ac2SBrian Foster wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); 264ae2c4ac2SBrian Foster DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); 265ae2c4ac2SBrian Foster 266ae2c4ac2SBrian Foster do { 26721417136SIngo Molnar prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 268ae2c4ac2SBrian Foster if (!xfs_iflags_test(ip, XFS_INEW)) 269ae2c4ac2SBrian Foster break; 270ae2c4ac2SBrian Foster schedule(); 271ae2c4ac2SBrian Foster } while (true); 27221417136SIngo Molnar finish_wait(wq, &wait.wq_entry); 273ae2c4ac2SBrian Foster } 274ae2c4ac2SBrian Foster 275ad438c40SDave Chinner /* 27650997470SDave Chinner * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 27750997470SDave Chinner * part of the structure. This is made more complex by the fact we store 27850997470SDave Chinner * information about the on-disk values in the VFS inode and so we can't just 27983e06f21SDave Chinner * overwrite the values unconditionally. Hence we save the parameters we 28050997470SDave Chinner * need to retain across reinitialisation, and rewrite them into the VFS inode 28183e06f21SDave Chinner * after reinitialisation even if it fails. 
28250997470SDave Chinner */ 28350997470SDave Chinner static int 28450997470SDave Chinner xfs_reinit_inode( 28550997470SDave Chinner struct xfs_mount *mp, 28650997470SDave Chinner struct inode *inode) 28750997470SDave Chinner { 28850997470SDave Chinner int error; 28954d7b5c1SDave Chinner uint32_t nlink = inode->i_nlink; 2909e9a2674SDave Chinner uint32_t generation = inode->i_generation; 291f0e28280SJeff Layton uint64_t version = inode_peek_iversion(inode); 292c19b3b05SDave Chinner umode_t mode = inode->i_mode; 293acd1d715SAmir Goldstein dev_t dev = inode->i_rdev; 2943d8f2821SChristoph Hellwig kuid_t uid = inode->i_uid; 2953d8f2821SChristoph Hellwig kgid_t gid = inode->i_gid; 29650997470SDave Chinner 29750997470SDave Chinner error = inode_init_always(mp->m_super, inode); 29850997470SDave Chinner 29954d7b5c1SDave Chinner set_nlink(inode, nlink); 3009e9a2674SDave Chinner inode->i_generation = generation; 301f0e28280SJeff Layton inode_set_iversion_queried(inode, version); 302c19b3b05SDave Chinner inode->i_mode = mode; 303acd1d715SAmir Goldstein inode->i_rdev = dev; 3043d8f2821SChristoph Hellwig inode->i_uid = uid; 3053d8f2821SChristoph Hellwig inode->i_gid = gid; 30650997470SDave Chinner return error; 30750997470SDave Chinner } 30850997470SDave Chinner 30950997470SDave Chinner /* 310afca6c5bSDave Chinner * If we are allocating a new inode, then check what was returned is 311afca6c5bSDave Chinner * actually a free, empty inode. If we are not allocating an inode, 312afca6c5bSDave Chinner * then check we didn't find a free inode. 313afca6c5bSDave Chinner * 314afca6c5bSDave Chinner * Returns: 315afca6c5bSDave Chinner * 0 if the inode free state matches the lookup context 316afca6c5bSDave Chinner * -ENOENT if the inode is free and we are not allocating 317afca6c5bSDave Chinner * -EFSCORRUPTED if there is any state mismatch at all 318afca6c5bSDave Chinner */ 319afca6c5bSDave Chinner static int 320afca6c5bSDave Chinner xfs_iget_check_free_state( 321afca6c5bSDave Chinner struct xfs_inode *ip, 322afca6c5bSDave Chinner int flags) 323afca6c5bSDave Chinner { 324afca6c5bSDave Chinner if (flags & XFS_IGET_CREATE) { 325afca6c5bSDave Chinner /* should be a free inode */ 326afca6c5bSDave Chinner if (VFS_I(ip)->i_mode != 0) { 327afca6c5bSDave Chinner xfs_warn(ip->i_mount, 328afca6c5bSDave Chinner "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 329afca6c5bSDave Chinner ip->i_ino, VFS_I(ip)->i_mode); 330afca6c5bSDave Chinner return -EFSCORRUPTED; 331afca6c5bSDave Chinner } 332afca6c5bSDave Chinner 333afca6c5bSDave Chinner if (ip->i_d.di_nblocks != 0) { 334afca6c5bSDave Chinner xfs_warn(ip->i_mount, 335afca6c5bSDave Chinner "Corruption detected! 
Free inode 0x%llx has blocks allocated!", 336afca6c5bSDave Chinner ip->i_ino); 337afca6c5bSDave Chinner return -EFSCORRUPTED; 338afca6c5bSDave Chinner } 339afca6c5bSDave Chinner return 0; 340afca6c5bSDave Chinner } 341afca6c5bSDave Chinner 342afca6c5bSDave Chinner /* should be an allocated inode */ 343afca6c5bSDave Chinner if (VFS_I(ip)->i_mode == 0) 344afca6c5bSDave Chinner return -ENOENT; 345afca6c5bSDave Chinner 346afca6c5bSDave Chinner return 0; 347afca6c5bSDave Chinner } 348afca6c5bSDave Chinner 349afca6c5bSDave Chinner /* 35033479e05SDave Chinner * Check the validity of the inode we just found it the cache 35133479e05SDave Chinner */ 35233479e05SDave Chinner static int 35333479e05SDave Chinner xfs_iget_cache_hit( 35433479e05SDave Chinner struct xfs_perag *pag, 35533479e05SDave Chinner struct xfs_inode *ip, 35633479e05SDave Chinner xfs_ino_t ino, 35733479e05SDave Chinner int flags, 35833479e05SDave Chinner int lock_flags) __releases(RCU) 35933479e05SDave Chinner { 36033479e05SDave Chinner struct inode *inode = VFS_I(ip); 36133479e05SDave Chinner struct xfs_mount *mp = ip->i_mount; 36233479e05SDave Chinner int error; 36333479e05SDave Chinner 36433479e05SDave Chinner /* 36533479e05SDave Chinner * check for re-use of an inode within an RCU grace period due to the 36633479e05SDave Chinner * radix tree nodes not being updated yet. We monitor for this by 36733479e05SDave Chinner * setting the inode number to zero before freeing the inode structure. 36833479e05SDave Chinner * If the inode has been reallocated and set up, then the inode number 36933479e05SDave Chinner * will not match, so check for that, too. 37033479e05SDave Chinner */ 37133479e05SDave Chinner spin_lock(&ip->i_flags_lock); 37233479e05SDave Chinner if (ip->i_ino != ino) { 37333479e05SDave Chinner trace_xfs_iget_skip(ip); 374ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3752451337dSDave Chinner error = -EAGAIN; 37633479e05SDave Chinner goto out_error; 37733479e05SDave Chinner } 37833479e05SDave Chinner 37933479e05SDave Chinner 38033479e05SDave Chinner /* 38133479e05SDave Chinner * If we are racing with another cache hit that is currently 38233479e05SDave Chinner * instantiating this inode or currently recycling it out of 38333479e05SDave Chinner * reclaimabe state, wait for the initialisation to complete 38433479e05SDave Chinner * before continuing. 38533479e05SDave Chinner * 38633479e05SDave Chinner * XXX(hch): eventually we should do something equivalent to 38733479e05SDave Chinner * wait_on_inode to wait for these flags to be cleared 38833479e05SDave Chinner * instead of polling for it. 38933479e05SDave Chinner */ 39033479e05SDave Chinner if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 39133479e05SDave Chinner trace_xfs_iget_skip(ip); 392ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3932451337dSDave Chinner error = -EAGAIN; 39433479e05SDave Chinner goto out_error; 39533479e05SDave Chinner } 39633479e05SDave Chinner 39733479e05SDave Chinner /* 398afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 399afca6c5bSDave Chinner * racing with unlinks. 40033479e05SDave Chinner */ 401afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 402afca6c5bSDave Chinner if (error) 40333479e05SDave Chinner goto out_error; 40433479e05SDave Chinner 40533479e05SDave Chinner /* 40633479e05SDave Chinner * If IRECLAIMABLE is set, we've torn down the VFS inode already. 40733479e05SDave Chinner * Need to carefully get it back into useable state. 
40833479e05SDave Chinner */ 40933479e05SDave Chinner if (ip->i_flags & XFS_IRECLAIMABLE) { 41033479e05SDave Chinner trace_xfs_iget_reclaim(ip); 41133479e05SDave Chinner 412378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 413378f681cSDarrick J. Wong error = -EAGAIN; 414378f681cSDarrick J. Wong goto out_error; 415378f681cSDarrick J. Wong } 416378f681cSDarrick J. Wong 41733479e05SDave Chinner /* 41833479e05SDave Chinner * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode 41933479e05SDave Chinner * from stomping over us while we recycle the inode. We can't 42033479e05SDave Chinner * clear the radix tree reclaimable tag yet as it requires 42133479e05SDave Chinner * pag_ici_lock to be held exclusive. 42233479e05SDave Chinner */ 42333479e05SDave Chinner ip->i_flags |= XFS_IRECLAIM; 42433479e05SDave Chinner 42533479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 42633479e05SDave Chinner rcu_read_unlock(); 42733479e05SDave Chinner 428d45344d6SIra Weiny ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 42950997470SDave Chinner error = xfs_reinit_inode(mp, inode); 43033479e05SDave Chinner if (error) { 431756baca2SBrian Foster bool wake; 43233479e05SDave Chinner /* 43333479e05SDave Chinner * Re-initializing the inode failed, and we are in deep 43433479e05SDave Chinner * trouble. Try to re-add it to the reclaim list. 43533479e05SDave Chinner */ 43633479e05SDave Chinner rcu_read_lock(); 43733479e05SDave Chinner spin_lock(&ip->i_flags_lock); 438756baca2SBrian Foster wake = !!__xfs_iflags_test(ip, XFS_INEW); 43933479e05SDave Chinner ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 440756baca2SBrian Foster if (wake) 441756baca2SBrian Foster wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); 44233479e05SDave Chinner ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 44333479e05SDave Chinner trace_xfs_iget_reclaim_fail(ip); 44433479e05SDave Chinner goto out_error; 44533479e05SDave Chinner } 44633479e05SDave Chinner 44733479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 44833479e05SDave Chinner spin_lock(&ip->i_flags_lock); 44933479e05SDave Chinner 45033479e05SDave Chinner /* 45133479e05SDave Chinner * Clear the per-lifetime state in the inode as we are now 45233479e05SDave Chinner * effectively a new inode and need to return to the initial 45333479e05SDave Chinner * state before reuse occurs. 45433479e05SDave Chinner */ 45533479e05SDave Chinner ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 45633479e05SDave Chinner ip->i_flags |= XFS_INEW; 457545c0889SDave Chinner xfs_inode_clear_reclaim_tag(pag, ip->i_ino); 45833479e05SDave Chinner inode->i_state = I_NEW; 4596772c1f1SDarrick J. Wong ip->i_sick = 0; 4606772c1f1SDarrick J. Wong ip->i_checked = 0; 46133479e05SDave Chinner 46233479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 46333479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 46433479e05SDave Chinner } else { 46533479e05SDave Chinner /* If the VFS inode is being torn down, pause and try again. */ 46633479e05SDave Chinner if (!igrab(inode)) { 46733479e05SDave Chinner trace_xfs_iget_skip(ip); 4682451337dSDave Chinner error = -EAGAIN; 46933479e05SDave Chinner goto out_error; 47033479e05SDave Chinner } 47133479e05SDave Chinner 47233479e05SDave Chinner /* We've got a live one. */ 47333479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 47433479e05SDave Chinner rcu_read_unlock(); 47533479e05SDave Chinner trace_xfs_iget_hit(ip); 47633479e05SDave Chinner } 47733479e05SDave Chinner 47833479e05SDave Chinner if (lock_flags != 0) 47933479e05SDave Chinner xfs_ilock(ip, lock_flags); 48033479e05SDave Chinner 481378f681cSDarrick J. 
Wong if (!(flags & XFS_IGET_INCORE)) 482dae2f8edSIra Weiny xfs_iflags_clear(ip, XFS_ISTALE); 483ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_found); 48433479e05SDave Chinner 48533479e05SDave Chinner return 0; 48633479e05SDave Chinner 48733479e05SDave Chinner out_error: 48833479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 48933479e05SDave Chinner rcu_read_unlock(); 49033479e05SDave Chinner return error; 49133479e05SDave Chinner } 49233479e05SDave Chinner 49333479e05SDave Chinner 49433479e05SDave Chinner static int 49533479e05SDave Chinner xfs_iget_cache_miss( 49633479e05SDave Chinner struct xfs_mount *mp, 49733479e05SDave Chinner struct xfs_perag *pag, 49833479e05SDave Chinner xfs_trans_t *tp, 49933479e05SDave Chinner xfs_ino_t ino, 50033479e05SDave Chinner struct xfs_inode **ipp, 50133479e05SDave Chinner int flags, 50233479e05SDave Chinner int lock_flags) 50333479e05SDave Chinner { 50433479e05SDave Chinner struct xfs_inode *ip; 50533479e05SDave Chinner int error; 50633479e05SDave Chinner xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 50733479e05SDave Chinner int iflags; 50833479e05SDave Chinner 50933479e05SDave Chinner ip = xfs_inode_alloc(mp, ino); 51033479e05SDave Chinner if (!ip) 5112451337dSDave Chinner return -ENOMEM; 51233479e05SDave Chinner 513bb8a66afSChristoph Hellwig error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); 51433479e05SDave Chinner if (error) 51533479e05SDave Chinner goto out_destroy; 51633479e05SDave Chinner 517bb8a66afSChristoph Hellwig /* 518bb8a66afSChristoph Hellwig * For version 5 superblocks, if we are initialising a new inode and we 519bb8a66afSChristoph Hellwig * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can 520bb8a66afSChristoph Hellwig * simply build the new inode core with a random generation number. 521bb8a66afSChristoph Hellwig * 522bb8a66afSChristoph Hellwig * For version 4 (and older) superblocks, log recovery is dependent on 523bb8a66afSChristoph Hellwig * the di_flushiter field being initialised from the current on-disk 524bb8a66afSChristoph Hellwig * value and hence we must also read the inode off disk even when 525bb8a66afSChristoph Hellwig * initializing new inodes. 526bb8a66afSChristoph Hellwig */ 527bb8a66afSChristoph Hellwig if (xfs_sb_version_has_v3inode(&mp->m_sb) && 528bb8a66afSChristoph Hellwig (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { 529bb8a66afSChristoph Hellwig VFS_I(ip)->i_generation = prandom_u32(); 530bb8a66afSChristoph Hellwig } else { 531bb8a66afSChristoph Hellwig struct xfs_dinode *dip; 532bb8a66afSChristoph Hellwig struct xfs_buf *bp; 533bb8a66afSChristoph Hellwig 534bb8a66afSChristoph Hellwig error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); 535bb8a66afSChristoph Hellwig if (error) 536bb8a66afSChristoph Hellwig goto out_destroy; 537bb8a66afSChristoph Hellwig 538bb8a66afSChristoph Hellwig error = xfs_inode_from_disk(ip, dip); 539bb8a66afSChristoph Hellwig if (!error) 540bb8a66afSChristoph Hellwig xfs_buf_set_ref(bp, XFS_INO_REF); 541bb8a66afSChristoph Hellwig xfs_trans_brelse(tp, bp); 542bb8a66afSChristoph Hellwig 543bb8a66afSChristoph Hellwig if (error) 544bb8a66afSChristoph Hellwig goto out_destroy; 545bb8a66afSChristoph Hellwig } 546bb8a66afSChristoph Hellwig 54733479e05SDave Chinner trace_xfs_iget_miss(ip); 54833479e05SDave Chinner 549ee457001SDave Chinner /* 550afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 551afca6c5bSDave Chinner * racing with unlinks. 
552ee457001SDave Chinner */ 553afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 554afca6c5bSDave Chinner if (error) 555ee457001SDave Chinner goto out_destroy; 55633479e05SDave Chinner 55733479e05SDave Chinner /* 55833479e05SDave Chinner * Preload the radix tree so we can insert safely under the 55933479e05SDave Chinner * write spinlock. Note that we cannot sleep inside the preload 56033479e05SDave Chinner * region. Since we can be called from transaction context, don't 56133479e05SDave Chinner * recurse into the file system. 56233479e05SDave Chinner */ 56333479e05SDave Chinner if (radix_tree_preload(GFP_NOFS)) { 5642451337dSDave Chinner error = -EAGAIN; 56533479e05SDave Chinner goto out_destroy; 56633479e05SDave Chinner } 56733479e05SDave Chinner 56833479e05SDave Chinner /* 56933479e05SDave Chinner * Because the inode hasn't been added to the radix-tree yet it can't 57033479e05SDave Chinner * be found by another thread, so we can do the non-sleeping lock here. 57133479e05SDave Chinner */ 57233479e05SDave Chinner if (lock_flags) { 57333479e05SDave Chinner if (!xfs_ilock_nowait(ip, lock_flags)) 57433479e05SDave Chinner BUG(); 57533479e05SDave Chinner } 57633479e05SDave Chinner 57733479e05SDave Chinner /* 57833479e05SDave Chinner * These values must be set before inserting the inode into the radix 57933479e05SDave Chinner * tree as the moment it is inserted a concurrent lookup (allowed by the 58033479e05SDave Chinner * RCU locking mechanism) can find it and that lookup must see that this 58133479e05SDave Chinner * is an inode currently under construction (i.e. that XFS_INEW is set). 58233479e05SDave Chinner * The ip->i_flags_lock that protects the XFS_INEW flag forms the 58333479e05SDave Chinner * memory barrier that ensures this detection works correctly at lookup 58433479e05SDave Chinner * time. 58533479e05SDave Chinner */ 58633479e05SDave Chinner iflags = XFS_INEW; 58733479e05SDave Chinner if (flags & XFS_IGET_DONTCACHE) 5882c567af4SIra Weiny d_mark_dontcache(VFS_I(ip)); 589113a5683SChandra Seetharaman ip->i_udquot = NULL; 590113a5683SChandra Seetharaman ip->i_gdquot = NULL; 59192f8ff73SChandra Seetharaman ip->i_pdquot = NULL; 59233479e05SDave Chinner xfs_iflags_set(ip, iflags); 59333479e05SDave Chinner 59433479e05SDave Chinner /* insert the new inode */ 59533479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 59633479e05SDave Chinner error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 59733479e05SDave Chinner if (unlikely(error)) { 59833479e05SDave Chinner WARN_ON(error != -EEXIST); 599ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_dup); 6002451337dSDave Chinner error = -EAGAIN; 60133479e05SDave Chinner goto out_preload_end; 60233479e05SDave Chinner } 60333479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 60433479e05SDave Chinner radix_tree_preload_end(); 60533479e05SDave Chinner 60633479e05SDave Chinner *ipp = ip; 60733479e05SDave Chinner return 0; 60833479e05SDave Chinner 60933479e05SDave Chinner out_preload_end: 61033479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 61133479e05SDave Chinner radix_tree_preload_end(); 61233479e05SDave Chinner if (lock_flags) 61333479e05SDave Chinner xfs_iunlock(ip, lock_flags); 61433479e05SDave Chinner out_destroy: 61533479e05SDave Chinner __destroy_inode(VFS_I(ip)); 61633479e05SDave Chinner xfs_inode_free(ip); 61733479e05SDave Chinner return error; 61833479e05SDave Chinner } 61933479e05SDave Chinner 62033479e05SDave Chinner /* 62133479e05SDave Chinner * Look up an inode by number in the given file system. 
62233479e05SDave Chinner  * The inode is looked up in the cache held in each AG.
62333479e05SDave Chinner  * If the inode is found in the cache, initialise the vfs inode
62433479e05SDave Chinner  * if necessary.
62533479e05SDave Chinner  *
62633479e05SDave Chinner  * If it is not in core, read it in from the file system's device,
62733479e05SDave Chinner  * add it to the cache and initialise the vfs inode.
62833479e05SDave Chinner  *
62933479e05SDave Chinner  * The inode is locked according to the value of the lock_flags parameter.
63033479e05SDave Chinner  * This flag parameter indicates how and if the inode's IO lock and inode lock
63133479e05SDave Chinner  * should be taken.
63233479e05SDave Chinner  *
63333479e05SDave Chinner  * mp -- the mount point structure for the current file system. It points
63433479e05SDave Chinner  *       to the inode hash table.
63533479e05SDave Chinner  * tp -- a pointer to the current transaction if there is one. This is
63633479e05SDave Chinner  *       simply passed through to the xfs_iread() call.
63733479e05SDave Chinner  * ino -- the number of the inode desired. This is the unique identifier
63833479e05SDave Chinner  *        within the file system for the inode being requested.
63933479e05SDave Chinner  * lock_flags -- flags indicating how to lock the inode. See the comment
64033479e05SDave Chinner  *               for xfs_ilock() for a list of valid values.
64133479e05SDave Chinner  */
64233479e05SDave Chinner int
64333479e05SDave Chinner xfs_iget(
64433479e05SDave Chinner         xfs_mount_t     *mp,
64533479e05SDave Chinner         xfs_trans_t     *tp,
64633479e05SDave Chinner         xfs_ino_t       ino,
64733479e05SDave Chinner         uint            flags,
64833479e05SDave Chinner         uint            lock_flags,
64933479e05SDave Chinner         xfs_inode_t     **ipp)
65033479e05SDave Chinner {
65133479e05SDave Chinner         xfs_inode_t     *ip;
65233479e05SDave Chinner         int             error;
65333479e05SDave Chinner         xfs_perag_t     *pag;
65433479e05SDave Chinner         xfs_agino_t     agino;
65533479e05SDave Chinner 
65633479e05SDave Chinner         /*
65733479e05SDave Chinner          * xfs_reclaim_inode() uses the ILOCK to ensure an inode
65833479e05SDave Chinner          * doesn't get freed while it's being referenced during a
65933479e05SDave Chinner          * radix tree traversal here. It assumes this function
66033479e05SDave Chinner          * acquires only the ILOCK (and therefore it has no need to
66133479e05SDave Chinner          * involve the IOLOCK in this synchronization).
66233479e05SDave Chinner */ 66333479e05SDave Chinner ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 66433479e05SDave Chinner 66533479e05SDave Chinner /* reject inode numbers outside existing AGs */ 66633479e05SDave Chinner if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 6672451337dSDave Chinner return -EINVAL; 66833479e05SDave Chinner 669ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_attempts); 6708774cf8bSLucas Stach 67133479e05SDave Chinner /* get the perag structure and ensure that it's inode capable */ 67233479e05SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 67333479e05SDave Chinner agino = XFS_INO_TO_AGINO(mp, ino); 67433479e05SDave Chinner 67533479e05SDave Chinner again: 67633479e05SDave Chinner error = 0; 67733479e05SDave Chinner rcu_read_lock(); 67833479e05SDave Chinner ip = radix_tree_lookup(&pag->pag_ici_root, agino); 67933479e05SDave Chinner 68033479e05SDave Chinner if (ip) { 68133479e05SDave Chinner error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 68233479e05SDave Chinner if (error) 68333479e05SDave Chinner goto out_error_or_again; 68433479e05SDave Chinner } else { 68533479e05SDave Chinner rcu_read_unlock(); 686378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 687ed438b47SDarrick J. Wong error = -ENODATA; 688378f681cSDarrick J. Wong goto out_error_or_again; 689378f681cSDarrick J. Wong } 690ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_missed); 69133479e05SDave Chinner 69233479e05SDave Chinner error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 69333479e05SDave Chinner flags, lock_flags); 69433479e05SDave Chinner if (error) 69533479e05SDave Chinner goto out_error_or_again; 69633479e05SDave Chinner } 69733479e05SDave Chinner xfs_perag_put(pag); 69833479e05SDave Chinner 69933479e05SDave Chinner *ipp = ip; 70033479e05SDave Chinner 70133479e05SDave Chinner /* 70258c90473SDave Chinner * If we have a real type for an on-disk inode, we can setup the inode 70333479e05SDave Chinner * now. If it's a new inode being created, xfs_ialloc will handle it. 70433479e05SDave Chinner */ 705c19b3b05SDave Chinner if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 70658c90473SDave Chinner xfs_setup_existing_inode(ip); 70733479e05SDave Chinner return 0; 70833479e05SDave Chinner 70933479e05SDave Chinner out_error_or_again: 710378f681cSDarrick J. Wong if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { 71133479e05SDave Chinner delay(1); 71233479e05SDave Chinner goto again; 71333479e05SDave Chinner } 71433479e05SDave Chinner xfs_perag_put(pag); 71533479e05SDave Chinner return error; 71633479e05SDave Chinner } 71733479e05SDave Chinner 7186d8b79cfSDave Chinner /* 719378f681cSDarrick J. Wong * "Is this a cached inode that's also allocated?" 720378f681cSDarrick J. Wong * 721378f681cSDarrick J. Wong * Look up an inode by number in the given file system. If the inode is 722378f681cSDarrick J. Wong * in cache and isn't in purgatory, return 1 if the inode is allocated 723378f681cSDarrick J. Wong * and 0 if it is not. For all other cases (not in cache, being torn 724378f681cSDarrick J. Wong * down, etc.), return a negative error code. 725378f681cSDarrick J. Wong * 726378f681cSDarrick J. Wong * The caller has to prevent inode allocation and freeing activity, 727378f681cSDarrick J. Wong * presumably by locking the AGI buffer. This is to ensure that an 728378f681cSDarrick J. Wong * inode cannot transition from allocated to freed until the caller is 729378f681cSDarrick J. Wong * ready to allow that. 
If the inode is in an intermediate state (new, 730378f681cSDarrick J. Wong * reclaimable, or being reclaimed), -EAGAIN will be returned; if the 731378f681cSDarrick J. Wong * inode is not in the cache, -ENOENT will be returned. The caller must 732378f681cSDarrick J. Wong * deal with these scenarios appropriately. 733378f681cSDarrick J. Wong * 734378f681cSDarrick J. Wong * This is a specialized use case for the online scrubber; if you're 735378f681cSDarrick J. Wong * reading this, you probably want xfs_iget. 736378f681cSDarrick J. Wong */ 737378f681cSDarrick J. Wong int 738378f681cSDarrick J. Wong xfs_icache_inode_is_allocated( 739378f681cSDarrick J. Wong struct xfs_mount *mp, 740378f681cSDarrick J. Wong struct xfs_trans *tp, 741378f681cSDarrick J. Wong xfs_ino_t ino, 742378f681cSDarrick J. Wong bool *inuse) 743378f681cSDarrick J. Wong { 744378f681cSDarrick J. Wong struct xfs_inode *ip; 745378f681cSDarrick J. Wong int error; 746378f681cSDarrick J. Wong 747378f681cSDarrick J. Wong error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); 748378f681cSDarrick J. Wong if (error) 749378f681cSDarrick J. Wong return error; 750378f681cSDarrick J. Wong 751378f681cSDarrick J. Wong *inuse = !!(VFS_I(ip)->i_mode); 75244a8736bSDarrick J. Wong xfs_irele(ip); 753378f681cSDarrick J. Wong return 0; 754378f681cSDarrick J. Wong } 755378f681cSDarrick J. Wong 756378f681cSDarrick J. Wong /* 7576d8b79cfSDave Chinner * The inode lookup is done in batches to keep the amount of lock traffic and 7586d8b79cfSDave Chinner * radix tree lookups to a minimum. The batch size is a trade off between 7596d8b79cfSDave Chinner * lookup reduction and stack usage. This is in the reclaim path, so we can't 7606d8b79cfSDave Chinner * be too greedy. 7616d8b79cfSDave Chinner */ 7626d8b79cfSDave Chinner #define XFS_LOOKUP_BATCH 32 7636d8b79cfSDave Chinner 76439b1cfd7SDarrick J. Wong /* 76539b1cfd7SDarrick J. Wong * Decide if the given @ip is eligible to be a part of the inode walk, and 76639b1cfd7SDarrick J. Wong * grab it if so. Returns true if it's ready to go or false if we should just 76739b1cfd7SDarrick J. Wong * ignore it. 76839b1cfd7SDarrick J. Wong */ 76939b1cfd7SDarrick J. Wong STATIC bool 770042f65f4SDarrick J. Wong xfs_inode_walk_ag_grab( 771ae2c4ac2SBrian Foster struct xfs_inode *ip, 772ae2c4ac2SBrian Foster int flags) 7736d8b79cfSDave Chinner { 7746d8b79cfSDave Chinner struct inode *inode = VFS_I(ip); 775042f65f4SDarrick J. Wong bool newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT); 7766d8b79cfSDave Chinner 7776d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 7786d8b79cfSDave Chinner 7796d8b79cfSDave Chinner /* 7806d8b79cfSDave Chinner * check for stale RCU freed inode 7816d8b79cfSDave Chinner * 7826d8b79cfSDave Chinner * If the inode has been reallocated, it doesn't matter if it's not in 7836d8b79cfSDave Chinner * the AG we are walking - we are walking for writeback, so if it 7846d8b79cfSDave Chinner * passes all the "valid inode" checks and is dirty, then we'll write 7856d8b79cfSDave Chinner * it back anyway. If it has been reallocated and still being 7866d8b79cfSDave Chinner * initialised, the XFS_INEW check below will catch it. 7876d8b79cfSDave Chinner */ 7886d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 7896d8b79cfSDave Chinner if (!ip->i_ino) 7906d8b79cfSDave Chinner goto out_unlock_noent; 7916d8b79cfSDave Chinner 7926d8b79cfSDave Chinner /* avoid new or reclaimable inodes. 
Leave for reclaim code to flush */ 793ae2c4ac2SBrian Foster if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || 794ae2c4ac2SBrian Foster __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) 7956d8b79cfSDave Chinner goto out_unlock_noent; 7966d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 7976d8b79cfSDave Chinner 7986d8b79cfSDave Chinner /* nothing to sync during shutdown */ 7996d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 80039b1cfd7SDarrick J. Wong return false; 8016d8b79cfSDave Chinner 8026d8b79cfSDave Chinner /* If we can't grab the inode, it must on it's way to reclaim. */ 8036d8b79cfSDave Chinner if (!igrab(inode)) 80439b1cfd7SDarrick J. Wong return false; 8056d8b79cfSDave Chinner 8066d8b79cfSDave Chinner /* inode is valid */ 80739b1cfd7SDarrick J. Wong return true; 8086d8b79cfSDave Chinner 8096d8b79cfSDave Chinner out_unlock_noent: 8106d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 81139b1cfd7SDarrick J. Wong return false; 8126d8b79cfSDave Chinner } 8136d8b79cfSDave Chinner 8145662d38cSDarrick J. Wong /* 8155662d38cSDarrick J. Wong * For a given per-AG structure @pag, grab, @execute, and rele all incore 8165662d38cSDarrick J. Wong * inodes with the given radix tree @tag. 8175662d38cSDarrick J. Wong */ 8186d8b79cfSDave Chinner STATIC int 819042f65f4SDarrick J. Wong xfs_inode_walk_ag( 8206d8b79cfSDave Chinner struct xfs_perag *pag, 821964176bdSDarrick J. Wong int iter_flags, 822390600f8SDarrick J. Wong int (*execute)(struct xfs_inode *ip, void *args), 823a454f742SBrian Foster void *args, 824964176bdSDarrick J. Wong int tag) 8256d8b79cfSDave Chinner { 826964176bdSDarrick J. Wong struct xfs_mount *mp = pag->pag_mount; 8276d8b79cfSDave Chinner uint32_t first_index; 8286d8b79cfSDave Chinner int last_error = 0; 8296d8b79cfSDave Chinner int skipped; 8307e88d314SDarrick J. Wong bool done; 8316d8b79cfSDave Chinner int nr_found; 8326d8b79cfSDave Chinner 8336d8b79cfSDave Chinner restart: 8347e88d314SDarrick J. Wong done = false; 8356d8b79cfSDave Chinner skipped = 0; 8366d8b79cfSDave Chinner first_index = 0; 8376d8b79cfSDave Chinner nr_found = 0; 8386d8b79cfSDave Chinner do { 8396d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 8406d8b79cfSDave Chinner int error = 0; 8416d8b79cfSDave Chinner int i; 8426d8b79cfSDave Chinner 8436d8b79cfSDave Chinner rcu_read_lock(); 844a454f742SBrian Foster 845fc96be95SDarrick J. Wong if (tag == XFS_ICI_NO_TAG) 8466d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 8476d8b79cfSDave Chinner (void **)batch, first_index, 8486d8b79cfSDave Chinner XFS_LOOKUP_BATCH); 849a454f742SBrian Foster else 850a454f742SBrian Foster nr_found = radix_tree_gang_lookup_tag( 851a454f742SBrian Foster &pag->pag_ici_root, 852a454f742SBrian Foster (void **) batch, first_index, 853a454f742SBrian Foster XFS_LOOKUP_BATCH, tag); 854a454f742SBrian Foster 8556d8b79cfSDave Chinner if (!nr_found) { 8566d8b79cfSDave Chinner rcu_read_unlock(); 8576d8b79cfSDave Chinner break; 8586d8b79cfSDave Chinner } 8596d8b79cfSDave Chinner 8606d8b79cfSDave Chinner /* 8616d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 8626d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 8636d8b79cfSDave Chinner */ 8646d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8656d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 8666d8b79cfSDave Chinner 867042f65f4SDarrick J. 
Wong if (done || !xfs_inode_walk_ag_grab(ip, iter_flags)) 8686d8b79cfSDave Chinner batch[i] = NULL; 8696d8b79cfSDave Chinner 8706d8b79cfSDave Chinner /* 8716d8b79cfSDave Chinner * Update the index for the next lookup. Catch 8726d8b79cfSDave Chinner * overflows into the next AG range which can occur if 8736d8b79cfSDave Chinner * we have inodes in the last block of the AG and we 8746d8b79cfSDave Chinner * are currently pointing to the last inode. 8756d8b79cfSDave Chinner * 8766d8b79cfSDave Chinner * Because we may see inodes that are from the wrong AG 8776d8b79cfSDave Chinner * due to RCU freeing and reallocation, only update the 8786d8b79cfSDave Chinner * index if it lies in this AG. It was a race that lead 8796d8b79cfSDave Chinner * us to see this inode, so another lookup from the 8806d8b79cfSDave Chinner * same index will not find it again. 8816d8b79cfSDave Chinner */ 8826d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 8836d8b79cfSDave Chinner continue; 8846d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 8856d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 8867e88d314SDarrick J. Wong done = true; 8876d8b79cfSDave Chinner } 8886d8b79cfSDave Chinner 8896d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 8906d8b79cfSDave Chinner rcu_read_unlock(); 8916d8b79cfSDave Chinner 8926d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8936d8b79cfSDave Chinner if (!batch[i]) 8946d8b79cfSDave Chinner continue; 895042f65f4SDarrick J. Wong if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) && 896ae2c4ac2SBrian Foster xfs_iflags_test(batch[i], XFS_INEW)) 897ae2c4ac2SBrian Foster xfs_inew_wait(batch[i]); 898390600f8SDarrick J. Wong error = execute(batch[i], args); 89944a8736bSDarrick J. Wong xfs_irele(batch[i]); 9002451337dSDave Chinner if (error == -EAGAIN) { 9016d8b79cfSDave Chinner skipped++; 9026d8b79cfSDave Chinner continue; 9036d8b79cfSDave Chinner } 9042451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 9056d8b79cfSDave Chinner last_error = error; 9066d8b79cfSDave Chinner } 9076d8b79cfSDave Chinner 9086d8b79cfSDave Chinner /* bail out if the filesystem is corrupted. */ 9092451337dSDave Chinner if (error == -EFSCORRUPTED) 9106d8b79cfSDave Chinner break; 9116d8b79cfSDave Chinner 9126d8b79cfSDave Chinner cond_resched(); 9136d8b79cfSDave Chinner 9146d8b79cfSDave Chinner } while (nr_found && !done); 9156d8b79cfSDave Chinner 9166d8b79cfSDave Chinner if (skipped) { 9176d8b79cfSDave Chinner delay(1); 9186d8b79cfSDave Chinner goto restart; 9196d8b79cfSDave Chinner } 9206d8b79cfSDave Chinner return last_error; 9216d8b79cfSDave Chinner } 9226d8b79cfSDave Chinner 9235662d38cSDarrick J. Wong /* Fetch the next (possibly tagged) per-AG structure. */ 9245662d38cSDarrick J. Wong static inline struct xfs_perag * 9255662d38cSDarrick J. Wong xfs_inode_walk_get_perag( 9265662d38cSDarrick J. Wong struct xfs_mount *mp, 9275662d38cSDarrick J. Wong xfs_agnumber_t agno, 9285662d38cSDarrick J. Wong int tag) 9295662d38cSDarrick J. Wong { 9305662d38cSDarrick J. Wong if (tag == XFS_ICI_NO_TAG) 9315662d38cSDarrick J. Wong return xfs_perag_get(mp, agno); 9325662d38cSDarrick J. Wong return xfs_perag_get_tag(mp, agno, tag); 9335662d38cSDarrick J. Wong } 9345662d38cSDarrick J. Wong 9355662d38cSDarrick J. Wong /* 9365662d38cSDarrick J. Wong * Call the @execute function on all incore inodes matching the radix tree 9375662d38cSDarrick J. Wong * @tag. 9385662d38cSDarrick J. Wong */ 9395662d38cSDarrick J. Wong int 940042f65f4SDarrick J. 
Wong xfs_inode_walk( 9415662d38cSDarrick J. Wong struct xfs_mount *mp, 9425662d38cSDarrick J. Wong int iter_flags, 9435662d38cSDarrick J. Wong int (*execute)(struct xfs_inode *ip, void *args), 9445662d38cSDarrick J. Wong void *args, 9455662d38cSDarrick J. Wong int tag) 9465662d38cSDarrick J. Wong { 9475662d38cSDarrick J. Wong struct xfs_perag *pag; 9485662d38cSDarrick J. Wong int error = 0; 9495662d38cSDarrick J. Wong int last_error = 0; 9505662d38cSDarrick J. Wong xfs_agnumber_t ag; 9515662d38cSDarrick J. Wong 9525662d38cSDarrick J. Wong ag = 0; 9535662d38cSDarrick J. Wong while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { 9545662d38cSDarrick J. Wong ag = pag->pag_agno + 1; 955964176bdSDarrick J. Wong error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag); 9565662d38cSDarrick J. Wong xfs_perag_put(pag); 9575662d38cSDarrick J. Wong if (error) { 9585662d38cSDarrick J. Wong last_error = error; 9595662d38cSDarrick J. Wong if (error == -EFSCORRUPTED) 9605662d38cSDarrick J. Wong break; 9615662d38cSDarrick J. Wong } 9625662d38cSDarrick J. Wong } 9635662d38cSDarrick J. Wong return last_error; 9645662d38cSDarrick J. Wong } 9655662d38cSDarrick J. Wong 966579b62faSBrian Foster /* 967579b62faSBrian Foster * Background scanning to trim post-EOF preallocated space. This is queued 968b9fe5052SDwight Engen * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 969579b62faSBrian Foster */ 970fa5a4f57SBrian Foster void 971579b62faSBrian Foster xfs_queue_eofblocks( 972579b62faSBrian Foster struct xfs_mount *mp) 973579b62faSBrian Foster { 974579b62faSBrian Foster rcu_read_lock(); 975579b62faSBrian Foster if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) 976579b62faSBrian Foster queue_delayed_work(mp->m_eofblocks_workqueue, 977579b62faSBrian Foster &mp->m_eofblocks_work, 978579b62faSBrian Foster msecs_to_jiffies(xfs_eofb_secs * 1000)); 979579b62faSBrian Foster rcu_read_unlock(); 980579b62faSBrian Foster } 981579b62faSBrian Foster 982579b62faSBrian Foster void 983579b62faSBrian Foster xfs_eofblocks_worker( 984579b62faSBrian Foster struct work_struct *work) 985579b62faSBrian Foster { 986579b62faSBrian Foster struct xfs_mount *mp = container_of(to_delayed_work(work), 987579b62faSBrian Foster struct xfs_mount, m_eofblocks_work); 9884b674b9aSBrian Foster 9894b674b9aSBrian Foster if (!sb_start_write_trylock(mp->m_super)) 9904b674b9aSBrian Foster return; 991579b62faSBrian Foster xfs_icache_free_eofblocks(mp, NULL); 9924b674b9aSBrian Foster sb_end_write(mp->m_super); 9934b674b9aSBrian Foster 994579b62faSBrian Foster xfs_queue_eofblocks(mp); 995579b62faSBrian Foster } 996579b62faSBrian Foster 99783104d44SDarrick J. Wong /* 99883104d44SDarrick J. Wong * Background scanning to trim preallocated CoW space. This is queued 99983104d44SDarrick J. Wong * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). 100083104d44SDarrick J. Wong * (We'll just piggyback on the post-EOF prealloc space workqueue.) 100183104d44SDarrick J. Wong */ 100210ddf64eSDarrick J. Wong void 100383104d44SDarrick J. Wong xfs_queue_cowblocks( 100483104d44SDarrick J. Wong struct xfs_mount *mp) 100583104d44SDarrick J. Wong { 100683104d44SDarrick J. Wong rcu_read_lock(); 100783104d44SDarrick J. Wong if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) 100883104d44SDarrick J. Wong queue_delayed_work(mp->m_eofblocks_workqueue, 100983104d44SDarrick J. Wong &mp->m_cowblocks_work, 101083104d44SDarrick J. Wong msecs_to_jiffies(xfs_cowb_secs * 1000)); 101183104d44SDarrick J. 
Wong rcu_read_unlock(); 101283104d44SDarrick J. Wong } 101383104d44SDarrick J. Wong 101483104d44SDarrick J. Wong void 101583104d44SDarrick J. Wong xfs_cowblocks_worker( 101683104d44SDarrick J. Wong struct work_struct *work) 101783104d44SDarrick J. Wong { 101883104d44SDarrick J. Wong struct xfs_mount *mp = container_of(to_delayed_work(work), 101983104d44SDarrick J. Wong struct xfs_mount, m_cowblocks_work); 10204b674b9aSBrian Foster 10214b674b9aSBrian Foster if (!sb_start_write_trylock(mp->m_super)) 10224b674b9aSBrian Foster return; 102383104d44SDarrick J. Wong xfs_icache_free_cowblocks(mp, NULL); 10244b674b9aSBrian Foster sb_end_write(mp->m_super); 10254b674b9aSBrian Foster 102683104d44SDarrick J. Wong xfs_queue_cowblocks(mp); 102783104d44SDarrick J. Wong } 102883104d44SDarrick J. Wong 10296d8b79cfSDave Chinner /* 10306d8b79cfSDave Chinner * Grab the inode for reclaim exclusively. 10316d8b79cfSDave Chinner * Return 0 if we grabbed it, non-zero otherwise. 10326d8b79cfSDave Chinner */ 10336d8b79cfSDave Chinner STATIC int 10346d8b79cfSDave Chinner xfs_reclaim_inode_grab( 10356d8b79cfSDave Chinner struct xfs_inode *ip, 10366d8b79cfSDave Chinner int flags) 10376d8b79cfSDave Chinner { 10386d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 10396d8b79cfSDave Chinner 10406d8b79cfSDave Chinner /* quick check for stale RCU freed inode */ 10416d8b79cfSDave Chinner if (!ip->i_ino) 10426d8b79cfSDave Chinner return 1; 10436d8b79cfSDave Chinner 10446d8b79cfSDave Chinner /* 10456d8b79cfSDave Chinner * If we are asked for non-blocking operation, do unlocked checks to 10466d8b79cfSDave Chinner * see if the inode already is being flushed or in reclaim to avoid 10476d8b79cfSDave Chinner * lock traffic. 10486d8b79cfSDave Chinner */ 10496d8b79cfSDave Chinner if ((flags & SYNC_TRYLOCK) && 10506d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) 10516d8b79cfSDave Chinner return 1; 10526d8b79cfSDave Chinner 10536d8b79cfSDave Chinner /* 10546d8b79cfSDave Chinner * The radix tree lock here protects a thread in xfs_iget from racing 10556d8b79cfSDave Chinner * with us starting reclaim on the inode. Once we have the 10566d8b79cfSDave Chinner * XFS_IRECLAIM flag set it will not touch us. 10576d8b79cfSDave Chinner * 10586d8b79cfSDave Chinner * Due to RCU lookup, we may find inodes that have been freed and only 10596d8b79cfSDave Chinner * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 10606d8b79cfSDave Chinner * aren't candidates for reclaim at all, so we must check the 10616d8b79cfSDave Chinner * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 10626d8b79cfSDave Chinner */ 10636d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 10646d8b79cfSDave Chinner if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 10656d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IRECLAIM)) { 10666d8b79cfSDave Chinner /* not a reclaim candidate. */ 10676d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10686d8b79cfSDave Chinner return 1; 10696d8b79cfSDave Chinner } 10706d8b79cfSDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIM); 10716d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10726d8b79cfSDave Chinner return 0; 10736d8b79cfSDave Chinner } 10746d8b79cfSDave Chinner 10756d8b79cfSDave Chinner /* 10766d8b79cfSDave Chinner * Inodes in different states need to be treated differently. 
The following
10776d8b79cfSDave Chinner  * table lists the inode states and the reclaim actions necessary:
10786d8b79cfSDave Chinner  *
10796d8b79cfSDave Chinner  *      inode state          iflush ret     required action
10806d8b79cfSDave Chinner  *      ---------------      ----------     ---------------
10816d8b79cfSDave Chinner  *      bad                  -              reclaim
10826d8b79cfSDave Chinner  *      shutdown             EIO            unpin and reclaim
10836d8b79cfSDave Chinner  *      clean, unpinned      0              reclaim
10846d8b79cfSDave Chinner  *      stale, unpinned      0              reclaim
10856d8b79cfSDave Chinner  *      clean, pinned(*)     0              requeue
10866d8b79cfSDave Chinner  *      stale, pinned        EAGAIN         requeue
10876d8b79cfSDave Chinner  *      dirty, async         -              requeue
10886d8b79cfSDave Chinner  *      dirty, sync          0              reclaim
10896d8b79cfSDave Chinner  *
10906d8b79cfSDave Chinner  * (*) dgc: I don't think the clean, pinned state is possible but it gets
10916d8b79cfSDave Chinner  * handled anyway given the order of checks implemented.
10926d8b79cfSDave Chinner  *
10936d8b79cfSDave Chinner  * Also, because we get the flush lock first, we know that any inode that has
10946d8b79cfSDave Chinner  * been flushed delwri has had the flush completed by the time we check that
10956d8b79cfSDave Chinner  * the inode is clean.
10966d8b79cfSDave Chinner  *
10976d8b79cfSDave Chinner  * Note that because the inode is flushed delayed write by AIL pushing, the
10986d8b79cfSDave Chinner  * flush lock may already be held here and waiting on it can result in very
10996d8b79cfSDave Chinner  * long latencies. Hence for sync reclaims, where we wait on the flush lock,
11006d8b79cfSDave Chinner  * the caller should push the AIL first before trying to reclaim inodes to
11016d8b79cfSDave Chinner  * minimise the amount of time spent waiting. For background reclaim, we only
11026d8b79cfSDave Chinner  * bother to reclaim clean inodes anyway.
11036d8b79cfSDave Chinner  *
11046d8b79cfSDave Chinner  * Hence the order of actions after gaining the locks should be:
11056d8b79cfSDave Chinner  *      bad             => reclaim
11066d8b79cfSDave Chinner  *      shutdown        => unpin and reclaim
11076d8b79cfSDave Chinner  *      pinned, async   => requeue
11086d8b79cfSDave Chinner  *      pinned, sync    => unpin
11096d8b79cfSDave Chinner  *      stale           => reclaim
11106d8b79cfSDave Chinner  *      clean           => reclaim
11116d8b79cfSDave Chinner  *      dirty, async    => requeue
11126d8b79cfSDave Chinner  *      dirty, sync     => flush, wait and reclaim
11136d8b79cfSDave Chinner  */
11146d8b79cfSDave Chinner STATIC int
11156d8b79cfSDave Chinner xfs_reclaim_inode(
11166d8b79cfSDave Chinner         struct xfs_inode        *ip,
11176d8b79cfSDave Chinner         struct xfs_perag        *pag,
11186d8b79cfSDave Chinner         int                     sync_mode)
11196d8b79cfSDave Chinner {
11206d8b79cfSDave Chinner         struct xfs_buf          *bp = NULL;
11218a17d7ddSDave Chinner         xfs_ino_t               ino = ip->i_ino; /* for radix_tree_delete */
11226d8b79cfSDave Chinner         int                     error;
11236d8b79cfSDave Chinner 
11246d8b79cfSDave Chinner restart:
11256d8b79cfSDave Chinner         error = 0;
11266d8b79cfSDave Chinner         xfs_ilock(ip, XFS_ILOCK_EXCL);
11276d8b79cfSDave Chinner         if (!xfs_iflock_nowait(ip)) {
11286d8b79cfSDave Chinner                 if (!(sync_mode & SYNC_WAIT))
11296d8b79cfSDave Chinner                         goto out;
11306d8b79cfSDave Chinner                 xfs_iflock(ip);
11316d8b79cfSDave Chinner         }
11326d8b79cfSDave Chinner 
11336d8b79cfSDave Chinner         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
11346d8b79cfSDave Chinner                 xfs_iunpin_wait(ip);
113598efe8afSBrian Foster                 /* xfs_iflush_abort() drops the flush lock */
113688fc1879SBrian Foster                 xfs_iflush_abort(ip);
11376d8b79cfSDave Chinner                 goto reclaim;
11386d8b79cfSDave Chinner         }
11396d8b79cfSDave Chinner         if (xfs_ipincount(ip)) {
11406d8b79cfSDave Chinner                 if (!(sync_mode & SYNC_WAIT))
11416d8b79cfSDave Chinner                         goto out_ifunlock;
11426d8b79cfSDave Chinner                 xfs_iunpin_wait(ip);
11436d8b79cfSDave Chinner         }
1144*96355d5aSDave Chinner         if (xfs_inode_clean(ip)) {
114598efe8afSBrian Foster                 xfs_ifunlock(ip);
11466d8b79cfSDave Chinner                 goto reclaim;
114798efe8afSBrian Foster         }
11486d8b79cfSDave Chinner 
11496d8b79cfSDave Chinner         /*
11506d8b79cfSDave Chinner          * Never flush out dirty data during non-blocking reclaim, as it would
11516d8b79cfSDave Chinner          * just contend with AIL pushing trying to do the same job.
11526d8b79cfSDave Chinner          */
11536d8b79cfSDave Chinner         if (!(sync_mode & SYNC_WAIT))
11546d8b79cfSDave Chinner                 goto out_ifunlock;
11556d8b79cfSDave Chinner 
11566d8b79cfSDave Chinner         /*
11576d8b79cfSDave Chinner          * Now we have an inode that needs flushing.
11586d8b79cfSDave Chinner          *
11596d8b79cfSDave Chinner          * Note that xfs_iflush will never block on the inode buffer lock, as
11606d8b79cfSDave Chinner          * xfs_ifree_cluster() can lock the inode buffer before it locks the
11616d8b79cfSDave Chinner          * ip->i_lock, and we are doing the exact opposite here. As a result,
11626d8b79cfSDave Chinner          * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
11636d8b79cfSDave Chinner          * result in an ABBA deadlock with xfs_ifree_cluster().
11646d8b79cfSDave Chinner          *
11656d8b79cfSDave Chinner          * As xfs_ifree_cluster() must gather all inodes that are active in the
11666d8b79cfSDave Chinner          * cache to mark them stale, if we hit this case we don't actually want
11676d8b79cfSDave Chinner          * to do IO here - we want the inode marked stale so we can simply
11686d8b79cfSDave Chinner          * reclaim it. Hence if we get an EAGAIN error here, just unlock the
11696d8b79cfSDave Chinner          * inode, back off and try again.
	error = xfs_iflush(ip, &bp);
	if (error == -EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

reclaim:
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state.  We do this as early as possible under the ILOCK so
	 * that xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed
	 * to detect races with us here.  By doing this, we guarantee that
	 * once xfs_iflush_cluster() or xfs_ifree_cluster() has locked
	 * XFS_ILOCK that it will see either a valid inode that will serialise
	 * correctly, or it will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return -EAGAIN here to make reclaim rescan the inode tree
	 * in a short while. However, this just burns CPU time scanning the
	 * tree waiting for IO to complete and the reclaim work never goes
	 * back to the idle state. Instead, return 0 to let the next scheduled
	 * background reclaim attempt reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during the filesystem unmount reclaim walk will leak all
 * the unreclaimed inodes.
 */
STATIC int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we
			 * found nothing, nr == 0 and the loop will be
			 * skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != -EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * If we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors).
	 * This ensures that when we get more reclaimers than AGs we block
	 * rather than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return last_error;
}
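
/*
 * Reclaim all reclaimable inodes in the filesystem.  The @mode flags
 * (SYNC_TRYLOCK/SYNC_WAIT) are passed straight through to
 * xfs_reclaim_inodes_ag() with an effectively unlimited scan count.
 */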
int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim
 * in progress, while we throttle the speed of this caller's reclaim by doing
 * synchronous reclaim of inodes. That means if we come across dirty inodes,
 * we wait for them to be cleaned, which we hope will not be very long due to
 * the background walker having already kicked the IO off on those dirty
 * inodes.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}
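
/*
 * An intersection-based inode filter: the inode matches only if every ID
 * criterion (uid/gid/prid) set in @eofb matches the inode.  Compare with
 * xfs_inode_match_id_union() below, which matches if any criterion matches.
 */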
STATIC bool
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid != eofb->eof_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_inode_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid == eofb->eof_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @eofb?  The inode is eligible if @eofb is null or
 * if the predicate functions match.
 */
static bool
xfs_inode_matches_eofb(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	bool			match;

	if (!eofb)
		return true;

	if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
		match = xfs_inode_match_id_union(ip, eofb);
	else
		match = xfs_inode_match_id(ip, eofb);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
	    XFS_ISIZE(ip) < eofb->eof_min_file_size)
		return false;

	return true;
}
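
/*
 * xfs_inode_walk() callback that trims speculative post-EOF preallocations
 * from a single inode.  Dirty mappings are skipped unless this is a waiting
 * (XFS_EOF_FLAGS_SYNC) scan, and a waiting scan returns -EAGAIN if the
 * IOLOCK cannot be taken immediately so the inode is revisited on a later
 * pass.
 */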
STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	bool			wait;
	int			ret;

	wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}

	ret = xfs_free_eofblocks(ip);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
			XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
 * multiple quotas, we don't know exactly which quota caused an allocation
 * failure. We make a best effort by including each quota under low free
 * space conditions (less than 1% free space) in the scan.
 */
static int
__xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip,
	int (*execute)(struct xfs_mount *mp,
		       struct xfs_eofblocks *eofb))
{
	int scan = 0;
	struct xfs_eofblocks eofb = {0};
	struct xfs_dquot *dq;

	/*
	 * Run a sync scan to increase effectiveness and use the union filter
	 * to cover all applicable quotas in a single scan.
	 */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_uid = VFS_I(ip)->i_uid;
			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
			scan = 1;
		}
	}

	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_gid = VFS_I(ip)->i_gid;
			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
			scan = 1;
		}
	}

	if (scan)
		execute(ip->i_mount, &eofb);

	return scan;
}

int
xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}
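
/* Convert a per-AG inode radix tree tag into the matching in-core inode flag. */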
static inline unsigned long
xfs_iflag_for_tag(
	int		tag)
{
	switch (tag) {
	case XFS_ICI_EOFBLOCKS_TAG:
		return XFS_IEOFBLOCKS;
	case XFS_ICI_COWBLOCKS_TAG:
		return XFS_ICOWBLOCKS;
	default:
		ASSERT(0);
		return 0;
	}
}
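
/*
 * Tag an inode for speculative preallocation (post-EOF or CoW blocks)
 * garbage collection.  The tag is set on the inode itself and in the per-AG
 * radix tree; if this is the first such inode in the AG, the tag is also
 * propagated to the per-mount perag tree, the background worker is kicked
 * off via @execute and the @set_tp tracepoint fires.
 */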
static void
__xfs_inode_set_blocks_tag(
	xfs_inode_t	*ip,
	void		(*execute)(struct xfs_mount *mp),
	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				  int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & xfs_iflag_for_tag(tag))
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   tag);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* kick off background trimming */
		execute(ip->i_mount);

		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
			trace_xfs_perag_set_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}
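
/*
 * Clear a speculative preallocation tag from an inode and from the per-AG
 * radix tree.  If this was the last tagged inode in the AG, the tag is also
 * cleared from the per-mount perag tree and the @clear_tp tracepoint fires.
 */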
static void
__xfs_inode_clear_blocks_tag(
	xfs_inode_t	*ip,
	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				    int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     tag);
		spin_unlock(&ip->i_mount->m_perag_lock);
		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long. If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	int			ret = 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/* Free the CoW blocks */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_cowblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
			XFS_ICI_COWBLOCKS_TAG);
}

int
xfs_inode_free_quota_cowblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}
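
/*
 * Tag/untag an inode for CoW block garbage collection.  These are thin
 * wrappers around the shared __xfs_inode_{set,clear}_blocks_tag() helpers,
 * supplying the cowblocks radix tree tag and tracepoints (and, on the set
 * side, the xfs_queue_cowblocks() background worker).
 */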
void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
			trace_xfs_perag_set_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_stop_block_reaping(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_eofblocks_work);
	cancel_delayed_work_sync(&mp->m_cowblocks_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_start_block_reaping(
	struct xfs_mount	*mp)
{
	xfs_queue_eofblocks(mp);
	xfs_queue_cowblocks(mp);
}
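
/*
 * Illustrative sketch only, not part of the code above: a caller that wants
 * to trim post-EOF preallocations synchronously for a single user, and only
 * for files larger than 64k, could fill in an xfs_eofblocks filter along
 * these lines before handing it to xfs_icache_free_eofblocks().  The local
 * variable names (uid, error) are hypothetical; only the eof_* fields and
 * XFS_EOF_FLAGS_* values come from this file's interfaces.
 *
 *	struct xfs_eofblocks	eofb = { 0 };
 *
 *	eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID |
 *			 XFS_EOF_FLAGS_MINFILESIZE;
 *	eofb.eof_uid = uid;			// kuid_t of interest
 *	eofb.eof_min_file_size = SZ_64K;
 *	error = xfs_icache_free_eofblocks(mp, &eofb);
 */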