10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0 26d8b79cfSDave Chinner /* 36d8b79cfSDave Chinner * Copyright (c) 2000-2005 Silicon Graphics, Inc. 46d8b79cfSDave Chinner * All Rights Reserved. 56d8b79cfSDave Chinner */ 66d8b79cfSDave Chinner #include "xfs.h" 76d8b79cfSDave Chinner #include "xfs_fs.h" 85467b34bSDarrick J. Wong #include "xfs_shared.h" 96ca1c906SDave Chinner #include "xfs_format.h" 10239880efSDave Chinner #include "xfs_log_format.h" 11239880efSDave Chinner #include "xfs_trans_resv.h" 126d8b79cfSDave Chinner #include "xfs_sb.h" 136d8b79cfSDave Chinner #include "xfs_mount.h" 146d8b79cfSDave Chinner #include "xfs_inode.h" 15239880efSDave Chinner #include "xfs_trans.h" 16239880efSDave Chinner #include "xfs_trans_priv.h" 176d8b79cfSDave Chinner #include "xfs_inode_item.h" 186d8b79cfSDave Chinner #include "xfs_quota.h" 196d8b79cfSDave Chinner #include "xfs_trace.h" 206d8b79cfSDave Chinner #include "xfs_icache.h" 21c24b5dfaSDave Chinner #include "xfs_bmap_util.h" 22dc06f398SBrian Foster #include "xfs_dquot_item.h" 23dc06f398SBrian Foster #include "xfs_dquot.h" 2483104d44SDarrick J. Wong #include "xfs_reflink.h" 25bb8a66afSChristoph Hellwig #include "xfs_ialloc.h" 266d8b79cfSDave Chinner 27f0e28280SJeff Layton #include <linux/iversion.h> 286d8b79cfSDave Chinner 2933479e05SDave Chinner /* 3033479e05SDave Chinner * Allocate and initialise an xfs_inode. 3133479e05SDave Chinner */ 32638f4416SDave Chinner struct xfs_inode * 3333479e05SDave Chinner xfs_inode_alloc( 3433479e05SDave Chinner struct xfs_mount *mp, 3533479e05SDave Chinner xfs_ino_t ino) 3633479e05SDave Chinner { 3733479e05SDave Chinner struct xfs_inode *ip; 3833479e05SDave Chinner 3933479e05SDave Chinner /* 4033479e05SDave Chinner * if this didn't occur in transactions, we could use 4133479e05SDave Chinner * KM_MAYFAIL and return NULL here on ENOMEM. Set the 4233479e05SDave Chinner * code up to do this anyway. 4333479e05SDave Chinner */ 44707e0ddaSTetsuo Handa ip = kmem_zone_alloc(xfs_inode_zone, 0); 4533479e05SDave Chinner if (!ip) 4633479e05SDave Chinner return NULL; 4733479e05SDave Chinner if (inode_init_always(mp->m_super, VFS_I(ip))) { 48377bcd5fSCarlos Maiolino kmem_cache_free(xfs_inode_zone, ip); 4933479e05SDave Chinner return NULL; 5033479e05SDave Chinner } 5133479e05SDave Chinner 52c19b3b05SDave Chinner /* VFS doesn't initialise i_mode! */ 53c19b3b05SDave Chinner VFS_I(ip)->i_mode = 0; 54c19b3b05SDave Chinner 55ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, vn_active); 5633479e05SDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 5733479e05SDave Chinner ASSERT(!xfs_isiflocked(ip)); 5833479e05SDave Chinner ASSERT(ip->i_ino == 0); 5933479e05SDave Chinner 6033479e05SDave Chinner /* initialise the xfs inode */ 6133479e05SDave Chinner ip->i_ino = ino; 6233479e05SDave Chinner ip->i_mount = mp; 6333479e05SDave Chinner memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 6433479e05SDave Chinner ip->i_afp = NULL; 653993baebSDarrick J. Wong ip->i_cowfp = NULL; 663ba738dfSChristoph Hellwig memset(&ip->i_df, 0, sizeof(ip->i_df)); 6733479e05SDave Chinner ip->i_flags = 0; 6833479e05SDave Chinner ip->i_delayed_blks = 0; 69f8d55aa0SDave Chinner memset(&ip->i_d, 0, sizeof(ip->i_d)); 706772c1f1SDarrick J. Wong ip->i_sick = 0; 716772c1f1SDarrick J. Wong ip->i_checked = 0; 72cb357bf3SDarrick J. Wong INIT_WORK(&ip->i_ioend_work, xfs_end_io); 73cb357bf3SDarrick J. Wong INIT_LIST_HEAD(&ip->i_ioend_list); 74cb357bf3SDarrick J. 
Wong spin_lock_init(&ip->i_ioend_lock); 7533479e05SDave Chinner 7633479e05SDave Chinner return ip; 7733479e05SDave Chinner } 7833479e05SDave Chinner 7933479e05SDave Chinner STATIC void 8033479e05SDave Chinner xfs_inode_free_callback( 8133479e05SDave Chinner struct rcu_head *head) 8233479e05SDave Chinner { 8333479e05SDave Chinner struct inode *inode = container_of(head, struct inode, i_rcu); 8433479e05SDave Chinner struct xfs_inode *ip = XFS_I(inode); 8533479e05SDave Chinner 86c19b3b05SDave Chinner switch (VFS_I(ip)->i_mode & S_IFMT) { 8733479e05SDave Chinner case S_IFREG: 8833479e05SDave Chinner case S_IFDIR: 8933479e05SDave Chinner case S_IFLNK: 90ef838512SChristoph Hellwig xfs_idestroy_fork(&ip->i_df); 9133479e05SDave Chinner break; 9233479e05SDave Chinner } 9333479e05SDave Chinner 94ef838512SChristoph Hellwig if (ip->i_afp) { 95ef838512SChristoph Hellwig xfs_idestroy_fork(ip->i_afp); 96ef838512SChristoph Hellwig kmem_cache_free(xfs_ifork_zone, ip->i_afp); 97ef838512SChristoph Hellwig } 98ef838512SChristoph Hellwig if (ip->i_cowfp) { 99ef838512SChristoph Hellwig xfs_idestroy_fork(ip->i_cowfp); 100ef838512SChristoph Hellwig kmem_cache_free(xfs_ifork_zone, ip->i_cowfp); 101ef838512SChristoph Hellwig } 10233479e05SDave Chinner if (ip->i_itemp) { 10322525c17SDave Chinner ASSERT(!test_bit(XFS_LI_IN_AIL, 10422525c17SDave Chinner &ip->i_itemp->ili_item.li_flags)); 10533479e05SDave Chinner xfs_inode_item_destroy(ip); 10633479e05SDave Chinner ip->i_itemp = NULL; 10733479e05SDave Chinner } 10833479e05SDave Chinner 109377bcd5fSCarlos Maiolino kmem_cache_free(xfs_inode_zone, ip); 1101f2dcfe8SDave Chinner } 1111f2dcfe8SDave Chinner 1128a17d7ddSDave Chinner static void 1138a17d7ddSDave Chinner __xfs_inode_free( 1148a17d7ddSDave Chinner struct xfs_inode *ip) 1158a17d7ddSDave Chinner { 1168a17d7ddSDave Chinner /* asserts to verify all state is correct here */ 1178a17d7ddSDave Chinner ASSERT(atomic_read(&ip->i_pincount) == 0); 1188a17d7ddSDave Chinner XFS_STATS_DEC(ip->i_mount, vn_active); 1198a17d7ddSDave Chinner 1208a17d7ddSDave Chinner call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 1218a17d7ddSDave Chinner } 1228a17d7ddSDave Chinner 1231f2dcfe8SDave Chinner void 1241f2dcfe8SDave Chinner xfs_inode_free( 1251f2dcfe8SDave Chinner struct xfs_inode *ip) 1261f2dcfe8SDave Chinner { 12798efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 12898efe8afSBrian Foster 12933479e05SDave Chinner /* 13033479e05SDave Chinner * Because we use RCU freeing we need to ensure the inode always 13133479e05SDave Chinner * appears to be reclaimed with an invalid inode number when in the 13233479e05SDave Chinner * free state. The ip->i_flags_lock provides the barrier against lookup 13333479e05SDave Chinner * races. 13433479e05SDave Chinner */ 13533479e05SDave Chinner spin_lock(&ip->i_flags_lock); 13633479e05SDave Chinner ip->i_flags = XFS_IRECLAIM; 13733479e05SDave Chinner ip->i_ino = 0; 13833479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 13933479e05SDave Chinner 1408a17d7ddSDave Chinner __xfs_inode_free(ip); 14133479e05SDave Chinner } 14233479e05SDave Chinner 14333479e05SDave Chinner /* 144ad438c40SDave Chinner * Queue a new inode reclaim pass if there are reclaimable inodes and there 145ad438c40SDave Chinner * isn't a reclaim pass already in progress. By default it runs every 5s based 146ad438c40SDave Chinner * on the xfs periodic sync default of 30s. 
Perhaps this should have it's own 147ad438c40SDave Chinner * tunable, but that can be done if this method proves to be ineffective or too 148ad438c40SDave Chinner * aggressive. 149ad438c40SDave Chinner */ 150ad438c40SDave Chinner static void 151ad438c40SDave Chinner xfs_reclaim_work_queue( 152ad438c40SDave Chinner struct xfs_mount *mp) 153ad438c40SDave Chinner { 154ad438c40SDave Chinner 155ad438c40SDave Chinner rcu_read_lock(); 156ad438c40SDave Chinner if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 157ad438c40SDave Chinner queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 158ad438c40SDave Chinner msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 159ad438c40SDave Chinner } 160ad438c40SDave Chinner rcu_read_unlock(); 161ad438c40SDave Chinner } 162ad438c40SDave Chinner 163ad438c40SDave Chinner /* 164ad438c40SDave Chinner * This is a fast pass over the inode cache to try to get reclaim moving on as 165ad438c40SDave Chinner * many inodes as possible in a short period of time. It kicks itself every few 166ad438c40SDave Chinner * seconds, as well as being kicked by the inode cache shrinker when memory 167ad438c40SDave Chinner * goes low. It scans as quickly as possible avoiding locked inodes or those 168ad438c40SDave Chinner * already being flushed, and once done schedules a future pass. 169ad438c40SDave Chinner */ 170ad438c40SDave Chinner void 171ad438c40SDave Chinner xfs_reclaim_worker( 172ad438c40SDave Chinner struct work_struct *work) 173ad438c40SDave Chinner { 174ad438c40SDave Chinner struct xfs_mount *mp = container_of(to_delayed_work(work), 175ad438c40SDave Chinner struct xfs_mount, m_reclaim_work); 176ad438c40SDave Chinner 177ad438c40SDave Chinner xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 178ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 179ad438c40SDave Chinner } 180ad438c40SDave Chinner 181ad438c40SDave Chinner static void 182ad438c40SDave Chinner xfs_perag_set_reclaim_tag( 183ad438c40SDave Chinner struct xfs_perag *pag) 184ad438c40SDave Chinner { 185ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 186ad438c40SDave Chinner 18795989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 188ad438c40SDave Chinner if (pag->pag_ici_reclaimable++) 189ad438c40SDave Chinner return; 190ad438c40SDave Chinner 191ad438c40SDave Chinner /* propagate the reclaim tag up into the perag radix tree */ 192ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 193ad438c40SDave Chinner radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, 194ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 195ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 196ad438c40SDave Chinner 197ad438c40SDave Chinner /* schedule periodic background inode reclaim */ 198ad438c40SDave Chinner xfs_reclaim_work_queue(mp); 199ad438c40SDave Chinner 200ad438c40SDave Chinner trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 201ad438c40SDave Chinner } 202ad438c40SDave Chinner 203ad438c40SDave Chinner static void 204ad438c40SDave Chinner xfs_perag_clear_reclaim_tag( 205ad438c40SDave Chinner struct xfs_perag *pag) 206ad438c40SDave Chinner { 207ad438c40SDave Chinner struct xfs_mount *mp = pag->pag_mount; 208ad438c40SDave Chinner 20995989c46SBrian Foster lockdep_assert_held(&pag->pag_ici_lock); 210ad438c40SDave Chinner if (--pag->pag_ici_reclaimable) 211ad438c40SDave Chinner return; 212ad438c40SDave Chinner 213ad438c40SDave Chinner /* clear the reclaim tag from the perag radix tree */ 214ad438c40SDave Chinner spin_lock(&mp->m_perag_lock); 215ad438c40SDave Chinner 
radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, 216ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 217ad438c40SDave Chinner spin_unlock(&mp->m_perag_lock); 218ad438c40SDave Chinner trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 219ad438c40SDave Chinner } 220ad438c40SDave Chinner 221ad438c40SDave Chinner 222ad438c40SDave Chinner /* 223ad438c40SDave Chinner * We set the inode flag atomically with the radix tree tag. 224ad438c40SDave Chinner * Once we get tag lookups on the radix tree, this inode flag 225ad438c40SDave Chinner * can go away. 226ad438c40SDave Chinner */ 227ad438c40SDave Chinner void 228ad438c40SDave Chinner xfs_inode_set_reclaim_tag( 229ad438c40SDave Chinner struct xfs_inode *ip) 230ad438c40SDave Chinner { 231ad438c40SDave Chinner struct xfs_mount *mp = ip->i_mount; 232ad438c40SDave Chinner struct xfs_perag *pag; 233ad438c40SDave Chinner 234ad438c40SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 235ad438c40SDave Chinner spin_lock(&pag->pag_ici_lock); 236ad438c40SDave Chinner spin_lock(&ip->i_flags_lock); 237ad438c40SDave Chinner 238ad438c40SDave Chinner radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), 239ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 240ad438c40SDave Chinner xfs_perag_set_reclaim_tag(pag); 241ad438c40SDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 242ad438c40SDave Chinner 243ad438c40SDave Chinner spin_unlock(&ip->i_flags_lock); 244ad438c40SDave Chinner spin_unlock(&pag->pag_ici_lock); 245ad438c40SDave Chinner xfs_perag_put(pag); 246ad438c40SDave Chinner } 247ad438c40SDave Chinner 248ad438c40SDave Chinner STATIC void 249ad438c40SDave Chinner xfs_inode_clear_reclaim_tag( 250ad438c40SDave Chinner struct xfs_perag *pag, 251ad438c40SDave Chinner xfs_ino_t ino) 252ad438c40SDave Chinner { 253ad438c40SDave Chinner radix_tree_tag_clear(&pag->pag_ici_root, 254ad438c40SDave Chinner XFS_INO_TO_AGINO(pag->pag_mount, ino), 255ad438c40SDave Chinner XFS_ICI_RECLAIM_TAG); 256ad438c40SDave Chinner xfs_perag_clear_reclaim_tag(pag); 257ad438c40SDave Chinner } 258ad438c40SDave Chinner 259ae2c4ac2SBrian Foster static void 260ae2c4ac2SBrian Foster xfs_inew_wait( 261ae2c4ac2SBrian Foster struct xfs_inode *ip) 262ae2c4ac2SBrian Foster { 263ae2c4ac2SBrian Foster wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); 264ae2c4ac2SBrian Foster DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); 265ae2c4ac2SBrian Foster 266ae2c4ac2SBrian Foster do { 26721417136SIngo Molnar prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); 268ae2c4ac2SBrian Foster if (!xfs_iflags_test(ip, XFS_INEW)) 269ae2c4ac2SBrian Foster break; 270ae2c4ac2SBrian Foster schedule(); 271ae2c4ac2SBrian Foster } while (true); 27221417136SIngo Molnar finish_wait(wq, &wait.wq_entry); 273ae2c4ac2SBrian Foster } 274ae2c4ac2SBrian Foster 275ad438c40SDave Chinner /* 27650997470SDave Chinner * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 27750997470SDave Chinner * part of the structure. This is made more complex by the fact we store 27850997470SDave Chinner * information about the on-disk values in the VFS inode and so we can't just 27983e06f21SDave Chinner * overwrite the values unconditionally. Hence we save the parameters we 28050997470SDave Chinner * need to retain across reinitialisation, and rewrite them into the VFS inode 28183e06f21SDave Chinner * after reinitialisation even if it fails. 
28250997470SDave Chinner */ 28350997470SDave Chinner static int 28450997470SDave Chinner xfs_reinit_inode( 28550997470SDave Chinner struct xfs_mount *mp, 28650997470SDave Chinner struct inode *inode) 28750997470SDave Chinner { 28850997470SDave Chinner int error; 28954d7b5c1SDave Chinner uint32_t nlink = inode->i_nlink; 2909e9a2674SDave Chinner uint32_t generation = inode->i_generation; 291f0e28280SJeff Layton uint64_t version = inode_peek_iversion(inode); 292c19b3b05SDave Chinner umode_t mode = inode->i_mode; 293acd1d715SAmir Goldstein dev_t dev = inode->i_rdev; 2943d8f2821SChristoph Hellwig kuid_t uid = inode->i_uid; 2953d8f2821SChristoph Hellwig kgid_t gid = inode->i_gid; 29650997470SDave Chinner 29750997470SDave Chinner error = inode_init_always(mp->m_super, inode); 29850997470SDave Chinner 29954d7b5c1SDave Chinner set_nlink(inode, nlink); 3009e9a2674SDave Chinner inode->i_generation = generation; 301f0e28280SJeff Layton inode_set_iversion_queried(inode, version); 302c19b3b05SDave Chinner inode->i_mode = mode; 303acd1d715SAmir Goldstein inode->i_rdev = dev; 3043d8f2821SChristoph Hellwig inode->i_uid = uid; 3053d8f2821SChristoph Hellwig inode->i_gid = gid; 30650997470SDave Chinner return error; 30750997470SDave Chinner } 30850997470SDave Chinner 30950997470SDave Chinner /* 310afca6c5bSDave Chinner * If we are allocating a new inode, then check what was returned is 311afca6c5bSDave Chinner * actually a free, empty inode. If we are not allocating an inode, 312afca6c5bSDave Chinner * then check we didn't find a free inode. 313afca6c5bSDave Chinner * 314afca6c5bSDave Chinner * Returns: 315afca6c5bSDave Chinner * 0 if the inode free state matches the lookup context 316afca6c5bSDave Chinner * -ENOENT if the inode is free and we are not allocating 317afca6c5bSDave Chinner * -EFSCORRUPTED if there is any state mismatch at all 318afca6c5bSDave Chinner */ 319afca6c5bSDave Chinner static int 320afca6c5bSDave Chinner xfs_iget_check_free_state( 321afca6c5bSDave Chinner struct xfs_inode *ip, 322afca6c5bSDave Chinner int flags) 323afca6c5bSDave Chinner { 324afca6c5bSDave Chinner if (flags & XFS_IGET_CREATE) { 325afca6c5bSDave Chinner /* should be a free inode */ 326afca6c5bSDave Chinner if (VFS_I(ip)->i_mode != 0) { 327afca6c5bSDave Chinner xfs_warn(ip->i_mount, 328afca6c5bSDave Chinner "Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)", 329afca6c5bSDave Chinner ip->i_ino, VFS_I(ip)->i_mode); 330afca6c5bSDave Chinner return -EFSCORRUPTED; 331afca6c5bSDave Chinner } 332afca6c5bSDave Chinner 333afca6c5bSDave Chinner if (ip->i_d.di_nblocks != 0) { 334afca6c5bSDave Chinner xfs_warn(ip->i_mount, 335afca6c5bSDave Chinner "Corruption detected! 
Free inode 0x%llx has blocks allocated!", 336afca6c5bSDave Chinner ip->i_ino); 337afca6c5bSDave Chinner return -EFSCORRUPTED; 338afca6c5bSDave Chinner } 339afca6c5bSDave Chinner return 0; 340afca6c5bSDave Chinner } 341afca6c5bSDave Chinner 342afca6c5bSDave Chinner /* should be an allocated inode */ 343afca6c5bSDave Chinner if (VFS_I(ip)->i_mode == 0) 344afca6c5bSDave Chinner return -ENOENT; 345afca6c5bSDave Chinner 346afca6c5bSDave Chinner return 0; 347afca6c5bSDave Chinner } 348afca6c5bSDave Chinner 349afca6c5bSDave Chinner /* 35033479e05SDave Chinner * Check the validity of the inode we just found it the cache 35133479e05SDave Chinner */ 35233479e05SDave Chinner static int 35333479e05SDave Chinner xfs_iget_cache_hit( 35433479e05SDave Chinner struct xfs_perag *pag, 35533479e05SDave Chinner struct xfs_inode *ip, 35633479e05SDave Chinner xfs_ino_t ino, 35733479e05SDave Chinner int flags, 35833479e05SDave Chinner int lock_flags) __releases(RCU) 35933479e05SDave Chinner { 36033479e05SDave Chinner struct inode *inode = VFS_I(ip); 36133479e05SDave Chinner struct xfs_mount *mp = ip->i_mount; 36233479e05SDave Chinner int error; 36333479e05SDave Chinner 36433479e05SDave Chinner /* 36533479e05SDave Chinner * check for re-use of an inode within an RCU grace period due to the 36633479e05SDave Chinner * radix tree nodes not being updated yet. We monitor for this by 36733479e05SDave Chinner * setting the inode number to zero before freeing the inode structure. 36833479e05SDave Chinner * If the inode has been reallocated and set up, then the inode number 36933479e05SDave Chinner * will not match, so check for that, too. 37033479e05SDave Chinner */ 37133479e05SDave Chinner spin_lock(&ip->i_flags_lock); 37233479e05SDave Chinner if (ip->i_ino != ino) { 37333479e05SDave Chinner trace_xfs_iget_skip(ip); 374ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3752451337dSDave Chinner error = -EAGAIN; 37633479e05SDave Chinner goto out_error; 37733479e05SDave Chinner } 37833479e05SDave Chinner 37933479e05SDave Chinner 38033479e05SDave Chinner /* 38133479e05SDave Chinner * If we are racing with another cache hit that is currently 38233479e05SDave Chinner * instantiating this inode or currently recycling it out of 38333479e05SDave Chinner * reclaimabe state, wait for the initialisation to complete 38433479e05SDave Chinner * before continuing. 38533479e05SDave Chinner * 38633479e05SDave Chinner * XXX(hch): eventually we should do something equivalent to 38733479e05SDave Chinner * wait_on_inode to wait for these flags to be cleared 38833479e05SDave Chinner * instead of polling for it. 38933479e05SDave Chinner */ 39033479e05SDave Chinner if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 39133479e05SDave Chinner trace_xfs_iget_skip(ip); 392ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_frecycle); 3932451337dSDave Chinner error = -EAGAIN; 39433479e05SDave Chinner goto out_error; 39533479e05SDave Chinner } 39633479e05SDave Chinner 39733479e05SDave Chinner /* 398afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 399afca6c5bSDave Chinner * racing with unlinks. 40033479e05SDave Chinner */ 401afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 402afca6c5bSDave Chinner if (error) 40333479e05SDave Chinner goto out_error; 40433479e05SDave Chinner 40533479e05SDave Chinner /* 40633479e05SDave Chinner * If IRECLAIMABLE is set, we've torn down the VFS inode already. 40733479e05SDave Chinner * Need to carefully get it back into useable state. 
40833479e05SDave Chinner */ 40933479e05SDave Chinner if (ip->i_flags & XFS_IRECLAIMABLE) { 41033479e05SDave Chinner trace_xfs_iget_reclaim(ip); 41133479e05SDave Chinner 412378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 413378f681cSDarrick J. Wong error = -EAGAIN; 414378f681cSDarrick J. Wong goto out_error; 415378f681cSDarrick J. Wong } 416378f681cSDarrick J. Wong 41733479e05SDave Chinner /* 41833479e05SDave Chinner * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode 41933479e05SDave Chinner * from stomping over us while we recycle the inode. We can't 42033479e05SDave Chinner * clear the radix tree reclaimable tag yet as it requires 42133479e05SDave Chinner * pag_ici_lock to be held exclusive. 42233479e05SDave Chinner */ 42333479e05SDave Chinner ip->i_flags |= XFS_IRECLAIM; 42433479e05SDave Chinner 42533479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 42633479e05SDave Chinner rcu_read_unlock(); 42733479e05SDave Chinner 428d45344d6SIra Weiny ASSERT(!rwsem_is_locked(&inode->i_rwsem)); 42950997470SDave Chinner error = xfs_reinit_inode(mp, inode); 43033479e05SDave Chinner if (error) { 431756baca2SBrian Foster bool wake; 43233479e05SDave Chinner /* 43333479e05SDave Chinner * Re-initializing the inode failed, and we are in deep 43433479e05SDave Chinner * trouble. Try to re-add it to the reclaim list. 43533479e05SDave Chinner */ 43633479e05SDave Chinner rcu_read_lock(); 43733479e05SDave Chinner spin_lock(&ip->i_flags_lock); 438756baca2SBrian Foster wake = !!__xfs_iflags_test(ip, XFS_INEW); 43933479e05SDave Chinner ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 440756baca2SBrian Foster if (wake) 441756baca2SBrian Foster wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); 44233479e05SDave Chinner ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 44333479e05SDave Chinner trace_xfs_iget_reclaim_fail(ip); 44433479e05SDave Chinner goto out_error; 44533479e05SDave Chinner } 44633479e05SDave Chinner 44733479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 44833479e05SDave Chinner spin_lock(&ip->i_flags_lock); 44933479e05SDave Chinner 45033479e05SDave Chinner /* 45133479e05SDave Chinner * Clear the per-lifetime state in the inode as we are now 45233479e05SDave Chinner * effectively a new inode and need to return to the initial 45333479e05SDave Chinner * state before reuse occurs. 45433479e05SDave Chinner */ 45533479e05SDave Chinner ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 45633479e05SDave Chinner ip->i_flags |= XFS_INEW; 457545c0889SDave Chinner xfs_inode_clear_reclaim_tag(pag, ip->i_ino); 45833479e05SDave Chinner inode->i_state = I_NEW; 4596772c1f1SDarrick J. Wong ip->i_sick = 0; 4606772c1f1SDarrick J. Wong ip->i_checked = 0; 46133479e05SDave Chinner 46233479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 46333479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 46433479e05SDave Chinner } else { 46533479e05SDave Chinner /* If the VFS inode is being torn down, pause and try again. */ 46633479e05SDave Chinner if (!igrab(inode)) { 46733479e05SDave Chinner trace_xfs_iget_skip(ip); 4682451337dSDave Chinner error = -EAGAIN; 46933479e05SDave Chinner goto out_error; 47033479e05SDave Chinner } 47133479e05SDave Chinner 47233479e05SDave Chinner /* We've got a live one. */ 47333479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 47433479e05SDave Chinner rcu_read_unlock(); 47533479e05SDave Chinner trace_xfs_iget_hit(ip); 47633479e05SDave Chinner } 47733479e05SDave Chinner 47833479e05SDave Chinner if (lock_flags != 0) 47933479e05SDave Chinner xfs_ilock(ip, lock_flags); 48033479e05SDave Chinner 481378f681cSDarrick J. 
Wong if (!(flags & XFS_IGET_INCORE)) 48233479e05SDave Chinner xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); 483ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_found); 48433479e05SDave Chinner 48533479e05SDave Chinner return 0; 48633479e05SDave Chinner 48733479e05SDave Chinner out_error: 48833479e05SDave Chinner spin_unlock(&ip->i_flags_lock); 48933479e05SDave Chinner rcu_read_unlock(); 49033479e05SDave Chinner return error; 49133479e05SDave Chinner } 49233479e05SDave Chinner 49333479e05SDave Chinner 49433479e05SDave Chinner static int 49533479e05SDave Chinner xfs_iget_cache_miss( 49633479e05SDave Chinner struct xfs_mount *mp, 49733479e05SDave Chinner struct xfs_perag *pag, 49833479e05SDave Chinner xfs_trans_t *tp, 49933479e05SDave Chinner xfs_ino_t ino, 50033479e05SDave Chinner struct xfs_inode **ipp, 50133479e05SDave Chinner int flags, 50233479e05SDave Chinner int lock_flags) 50333479e05SDave Chinner { 50433479e05SDave Chinner struct xfs_inode *ip; 50533479e05SDave Chinner int error; 50633479e05SDave Chinner xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 50733479e05SDave Chinner int iflags; 50833479e05SDave Chinner 50933479e05SDave Chinner ip = xfs_inode_alloc(mp, ino); 51033479e05SDave Chinner if (!ip) 5112451337dSDave Chinner return -ENOMEM; 51233479e05SDave Chinner 513bb8a66afSChristoph Hellwig error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags); 51433479e05SDave Chinner if (error) 51533479e05SDave Chinner goto out_destroy; 51633479e05SDave Chinner 517bb8a66afSChristoph Hellwig /* 518bb8a66afSChristoph Hellwig * For version 5 superblocks, if we are initialising a new inode and we 519bb8a66afSChristoph Hellwig * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can 520bb8a66afSChristoph Hellwig * simply build the new inode core with a random generation number. 521bb8a66afSChristoph Hellwig * 522bb8a66afSChristoph Hellwig * For version 4 (and older) superblocks, log recovery is dependent on 523bb8a66afSChristoph Hellwig * the di_flushiter field being initialised from the current on-disk 524bb8a66afSChristoph Hellwig * value and hence we must also read the inode off disk even when 525bb8a66afSChristoph Hellwig * initializing new inodes. 526bb8a66afSChristoph Hellwig */ 527bb8a66afSChristoph Hellwig if (xfs_sb_version_has_v3inode(&mp->m_sb) && 528bb8a66afSChristoph Hellwig (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) { 529bb8a66afSChristoph Hellwig VFS_I(ip)->i_generation = prandom_u32(); 530bb8a66afSChristoph Hellwig } else { 531bb8a66afSChristoph Hellwig struct xfs_dinode *dip; 532bb8a66afSChristoph Hellwig struct xfs_buf *bp; 533bb8a66afSChristoph Hellwig 534bb8a66afSChristoph Hellwig error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0); 535bb8a66afSChristoph Hellwig if (error) 536bb8a66afSChristoph Hellwig goto out_destroy; 537bb8a66afSChristoph Hellwig 538bb8a66afSChristoph Hellwig error = xfs_inode_from_disk(ip, dip); 539bb8a66afSChristoph Hellwig if (!error) 540bb8a66afSChristoph Hellwig xfs_buf_set_ref(bp, XFS_INO_REF); 541bb8a66afSChristoph Hellwig xfs_trans_brelse(tp, bp); 542bb8a66afSChristoph Hellwig 543bb8a66afSChristoph Hellwig if (error) 544bb8a66afSChristoph Hellwig goto out_destroy; 545bb8a66afSChristoph Hellwig } 546bb8a66afSChristoph Hellwig 54733479e05SDave Chinner trace_xfs_iget_miss(ip); 54833479e05SDave Chinner 549ee457001SDave Chinner /* 550afca6c5bSDave Chinner * Check the inode free state is valid. This also detects lookup 551afca6c5bSDave Chinner * racing with unlinks. 
552ee457001SDave Chinner */ 553afca6c5bSDave Chinner error = xfs_iget_check_free_state(ip, flags); 554afca6c5bSDave Chinner if (error) 555ee457001SDave Chinner goto out_destroy; 55633479e05SDave Chinner 55733479e05SDave Chinner /* 55833479e05SDave Chinner * Preload the radix tree so we can insert safely under the 55933479e05SDave Chinner * write spinlock. Note that we cannot sleep inside the preload 56033479e05SDave Chinner * region. Since we can be called from transaction context, don't 56133479e05SDave Chinner * recurse into the file system. 56233479e05SDave Chinner */ 56333479e05SDave Chinner if (radix_tree_preload(GFP_NOFS)) { 5642451337dSDave Chinner error = -EAGAIN; 56533479e05SDave Chinner goto out_destroy; 56633479e05SDave Chinner } 56733479e05SDave Chinner 56833479e05SDave Chinner /* 56933479e05SDave Chinner * Because the inode hasn't been added to the radix-tree yet it can't 57033479e05SDave Chinner * be found by another thread, so we can do the non-sleeping lock here. 57133479e05SDave Chinner */ 57233479e05SDave Chinner if (lock_flags) { 57333479e05SDave Chinner if (!xfs_ilock_nowait(ip, lock_flags)) 57433479e05SDave Chinner BUG(); 57533479e05SDave Chinner } 57633479e05SDave Chinner 57733479e05SDave Chinner /* 57833479e05SDave Chinner * These values must be set before inserting the inode into the radix 57933479e05SDave Chinner * tree as the moment it is inserted a concurrent lookup (allowed by the 58033479e05SDave Chinner * RCU locking mechanism) can find it and that lookup must see that this 58133479e05SDave Chinner * is an inode currently under construction (i.e. that XFS_INEW is set). 58233479e05SDave Chinner * The ip->i_flags_lock that protects the XFS_INEW flag forms the 58333479e05SDave Chinner * memory barrier that ensures this detection works correctly at lookup 58433479e05SDave Chinner * time. 58533479e05SDave Chinner */ 58633479e05SDave Chinner iflags = XFS_INEW; 58733479e05SDave Chinner if (flags & XFS_IGET_DONTCACHE) 58833479e05SDave Chinner iflags |= XFS_IDONTCACHE; 589113a5683SChandra Seetharaman ip->i_udquot = NULL; 590113a5683SChandra Seetharaman ip->i_gdquot = NULL; 59192f8ff73SChandra Seetharaman ip->i_pdquot = NULL; 59233479e05SDave Chinner xfs_iflags_set(ip, iflags); 59333479e05SDave Chinner 59433479e05SDave Chinner /* insert the new inode */ 59533479e05SDave Chinner spin_lock(&pag->pag_ici_lock); 59633479e05SDave Chinner error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 59733479e05SDave Chinner if (unlikely(error)) { 59833479e05SDave Chinner WARN_ON(error != -EEXIST); 599ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_dup); 6002451337dSDave Chinner error = -EAGAIN; 60133479e05SDave Chinner goto out_preload_end; 60233479e05SDave Chinner } 60333479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 60433479e05SDave Chinner radix_tree_preload_end(); 60533479e05SDave Chinner 60633479e05SDave Chinner *ipp = ip; 60733479e05SDave Chinner return 0; 60833479e05SDave Chinner 60933479e05SDave Chinner out_preload_end: 61033479e05SDave Chinner spin_unlock(&pag->pag_ici_lock); 61133479e05SDave Chinner radix_tree_preload_end(); 61233479e05SDave Chinner if (lock_flags) 61333479e05SDave Chinner xfs_iunlock(ip, lock_flags); 61433479e05SDave Chinner out_destroy: 61533479e05SDave Chinner __destroy_inode(VFS_I(ip)); 61633479e05SDave Chinner xfs_inode_free(ip); 61733479e05SDave Chinner return error; 61833479e05SDave Chinner } 61933479e05SDave Chinner 62033479e05SDave Chinner /* 62133479e05SDave Chinner * Look up an inode by number in the given file system. 
62233479e05SDave Chinner * The inode is looked up in the cache held in each AG. 62333479e05SDave Chinner * If the inode is found in the cache, initialise the vfs inode 62433479e05SDave Chinner * if necessary. 62533479e05SDave Chinner * 62633479e05SDave Chinner * If it is not in core, read it in from the file system's device, 62733479e05SDave Chinner * add it to the cache and initialise the vfs inode. 62833479e05SDave Chinner * 62933479e05SDave Chinner * The inode is locked according to the value of the lock_flags parameter. 63033479e05SDave Chinner * This flag parameter indicates how and if the inode's IO lock and inode lock 63133479e05SDave Chinner * should be taken. 63233479e05SDave Chinner * 63333479e05SDave Chinner * mp -- the mount point structure for the current file system. It points 63433479e05SDave Chinner * to the inode hash table. 63533479e05SDave Chinner * tp -- a pointer to the current transaction if there is one. This is 63633479e05SDave Chinner * simply passed through to the xfs_iread() call. 63733479e05SDave Chinner * ino -- the number of the inode desired. This is the unique identifier 63833479e05SDave Chinner * within the file system for the inode being requested. 63933479e05SDave Chinner * lock_flags -- flags indicating how to lock the inode. See the comment 64033479e05SDave Chinner * for xfs_ilock() for a list of valid values. 64133479e05SDave Chinner */ 64233479e05SDave Chinner int 64333479e05SDave Chinner xfs_iget( 64433479e05SDave Chinner xfs_mount_t *mp, 64533479e05SDave Chinner xfs_trans_t *tp, 64633479e05SDave Chinner xfs_ino_t ino, 64733479e05SDave Chinner uint flags, 64833479e05SDave Chinner uint lock_flags, 64933479e05SDave Chinner xfs_inode_t **ipp) 65033479e05SDave Chinner { 65133479e05SDave Chinner xfs_inode_t *ip; 65233479e05SDave Chinner int error; 65333479e05SDave Chinner xfs_perag_t *pag; 65433479e05SDave Chinner xfs_agino_t agino; 65533479e05SDave Chinner 65633479e05SDave Chinner /* 65733479e05SDave Chinner * xfs_reclaim_inode() uses the ILOCK to ensure an inode 65833479e05SDave Chinner * doesn't get freed while it's being referenced during a 65933479e05SDave Chinner * radix tree traversal here. It assumes this function 66033479e05SDave Chinner * aqcuires only the ILOCK (and therefore it has no need to 66133479e05SDave Chinner * involve the IOLOCK in this synchronization). 
66233479e05SDave Chinner */ 66333479e05SDave Chinner ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 66433479e05SDave Chinner 66533479e05SDave Chinner /* reject inode numbers outside existing AGs */ 66633479e05SDave Chinner if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 6672451337dSDave Chinner return -EINVAL; 66833479e05SDave Chinner 669ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_attempts); 6708774cf8bSLucas Stach 67133479e05SDave Chinner /* get the perag structure and ensure that it's inode capable */ 67233479e05SDave Chinner pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 67333479e05SDave Chinner agino = XFS_INO_TO_AGINO(mp, ino); 67433479e05SDave Chinner 67533479e05SDave Chinner again: 67633479e05SDave Chinner error = 0; 67733479e05SDave Chinner rcu_read_lock(); 67833479e05SDave Chinner ip = radix_tree_lookup(&pag->pag_ici_root, agino); 67933479e05SDave Chinner 68033479e05SDave Chinner if (ip) { 68133479e05SDave Chinner error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 68233479e05SDave Chinner if (error) 68333479e05SDave Chinner goto out_error_or_again; 68433479e05SDave Chinner } else { 68533479e05SDave Chinner rcu_read_unlock(); 686378f681cSDarrick J. Wong if (flags & XFS_IGET_INCORE) { 687ed438b47SDarrick J. Wong error = -ENODATA; 688378f681cSDarrick J. Wong goto out_error_or_again; 689378f681cSDarrick J. Wong } 690ff6d6af2SBill O'Donnell XFS_STATS_INC(mp, xs_ig_missed); 69133479e05SDave Chinner 69233479e05SDave Chinner error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 69333479e05SDave Chinner flags, lock_flags); 69433479e05SDave Chinner if (error) 69533479e05SDave Chinner goto out_error_or_again; 69633479e05SDave Chinner } 69733479e05SDave Chinner xfs_perag_put(pag); 69833479e05SDave Chinner 69933479e05SDave Chinner *ipp = ip; 70033479e05SDave Chinner 70133479e05SDave Chinner /* 70258c90473SDave Chinner * If we have a real type for an on-disk inode, we can setup the inode 70333479e05SDave Chinner * now. If it's a new inode being created, xfs_ialloc will handle it. 70433479e05SDave Chinner */ 705c19b3b05SDave Chinner if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 70658c90473SDave Chinner xfs_setup_existing_inode(ip); 70733479e05SDave Chinner return 0; 70833479e05SDave Chinner 70933479e05SDave Chinner out_error_or_again: 710378f681cSDarrick J. Wong if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) { 71133479e05SDave Chinner delay(1); 71233479e05SDave Chinner goto again; 71333479e05SDave Chinner } 71433479e05SDave Chinner xfs_perag_put(pag); 71533479e05SDave Chinner return error; 71633479e05SDave Chinner } 71733479e05SDave Chinner 7186d8b79cfSDave Chinner /* 719378f681cSDarrick J. Wong * "Is this a cached inode that's also allocated?" 720378f681cSDarrick J. Wong * 721378f681cSDarrick J. Wong * Look up an inode by number in the given file system. If the inode is 722378f681cSDarrick J. Wong * in cache and isn't in purgatory, return 1 if the inode is allocated 723378f681cSDarrick J. Wong * and 0 if it is not. For all other cases (not in cache, being torn 724378f681cSDarrick J. Wong * down, etc.), return a negative error code. 725378f681cSDarrick J. Wong * 726378f681cSDarrick J. Wong * The caller has to prevent inode allocation and freeing activity, 727378f681cSDarrick J. Wong * presumably by locking the AGI buffer. This is to ensure that an 728378f681cSDarrick J. Wong * inode cannot transition from allocated to freed until the caller is 729378f681cSDarrick J. Wong * ready to allow that. 
If the inode is in an intermediate state (new, 730378f681cSDarrick J. Wong * reclaimable, or being reclaimed), -EAGAIN will be returned; if the 731378f681cSDarrick J. Wong * inode is not in the cache, -ENOENT will be returned. The caller must 732378f681cSDarrick J. Wong * deal with these scenarios appropriately. 733378f681cSDarrick J. Wong * 734378f681cSDarrick J. Wong * This is a specialized use case for the online scrubber; if you're 735378f681cSDarrick J. Wong * reading this, you probably want xfs_iget. 736378f681cSDarrick J. Wong */ 737378f681cSDarrick J. Wong int 738378f681cSDarrick J. Wong xfs_icache_inode_is_allocated( 739378f681cSDarrick J. Wong struct xfs_mount *mp, 740378f681cSDarrick J. Wong struct xfs_trans *tp, 741378f681cSDarrick J. Wong xfs_ino_t ino, 742378f681cSDarrick J. Wong bool *inuse) 743378f681cSDarrick J. Wong { 744378f681cSDarrick J. Wong struct xfs_inode *ip; 745378f681cSDarrick J. Wong int error; 746378f681cSDarrick J. Wong 747378f681cSDarrick J. Wong error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip); 748378f681cSDarrick J. Wong if (error) 749378f681cSDarrick J. Wong return error; 750378f681cSDarrick J. Wong 751378f681cSDarrick J. Wong *inuse = !!(VFS_I(ip)->i_mode); 75244a8736bSDarrick J. Wong xfs_irele(ip); 753378f681cSDarrick J. Wong return 0; 754378f681cSDarrick J. Wong } 755378f681cSDarrick J. Wong 756378f681cSDarrick J. Wong /* 7576d8b79cfSDave Chinner * The inode lookup is done in batches to keep the amount of lock traffic and 7586d8b79cfSDave Chinner * radix tree lookups to a minimum. The batch size is a trade off between 7596d8b79cfSDave Chinner * lookup reduction and stack usage. This is in the reclaim path, so we can't 7606d8b79cfSDave Chinner * be too greedy. 7616d8b79cfSDave Chinner */ 7626d8b79cfSDave Chinner #define XFS_LOOKUP_BATCH 32 7636d8b79cfSDave Chinner 7646d8b79cfSDave Chinner STATIC int 7656d8b79cfSDave Chinner xfs_inode_ag_walk_grab( 766ae2c4ac2SBrian Foster struct xfs_inode *ip, 767ae2c4ac2SBrian Foster int flags) 7686d8b79cfSDave Chinner { 7696d8b79cfSDave Chinner struct inode *inode = VFS_I(ip); 770ae2c4ac2SBrian Foster bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); 7716d8b79cfSDave Chinner 7726d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 7736d8b79cfSDave Chinner 7746d8b79cfSDave Chinner /* 7756d8b79cfSDave Chinner * check for stale RCU freed inode 7766d8b79cfSDave Chinner * 7776d8b79cfSDave Chinner * If the inode has been reallocated, it doesn't matter if it's not in 7786d8b79cfSDave Chinner * the AG we are walking - we are walking for writeback, so if it 7796d8b79cfSDave Chinner * passes all the "valid inode" checks and is dirty, then we'll write 7806d8b79cfSDave Chinner * it back anyway. If it has been reallocated and still being 7816d8b79cfSDave Chinner * initialised, the XFS_INEW check below will catch it. 7826d8b79cfSDave Chinner */ 7836d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 7846d8b79cfSDave Chinner if (!ip->i_ino) 7856d8b79cfSDave Chinner goto out_unlock_noent; 7866d8b79cfSDave Chinner 7876d8b79cfSDave Chinner /* avoid new or reclaimable inodes. 
Leave for reclaim code to flush */ 788ae2c4ac2SBrian Foster if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || 789ae2c4ac2SBrian Foster __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) 7906d8b79cfSDave Chinner goto out_unlock_noent; 7916d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 7926d8b79cfSDave Chinner 7936d8b79cfSDave Chinner /* nothing to sync during shutdown */ 7946d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 7952451337dSDave Chinner return -EFSCORRUPTED; 7966d8b79cfSDave Chinner 7976d8b79cfSDave Chinner /* If we can't grab the inode, it must on it's way to reclaim. */ 7986d8b79cfSDave Chinner if (!igrab(inode)) 7992451337dSDave Chinner return -ENOENT; 8006d8b79cfSDave Chinner 8016d8b79cfSDave Chinner /* inode is valid */ 8026d8b79cfSDave Chinner return 0; 8036d8b79cfSDave Chinner 8046d8b79cfSDave Chinner out_unlock_noent: 8056d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 8062451337dSDave Chinner return -ENOENT; 8076d8b79cfSDave Chinner } 8086d8b79cfSDave Chinner 8096d8b79cfSDave Chinner STATIC int 8106d8b79cfSDave Chinner xfs_inode_ag_walk( 8116d8b79cfSDave Chinner struct xfs_mount *mp, 8126d8b79cfSDave Chinner struct xfs_perag *pag, 813*390600f8SDarrick J. Wong int (*execute)(struct xfs_inode *ip, void *args), 814a454f742SBrian Foster void *args, 815ae2c4ac2SBrian Foster int tag, 816ae2c4ac2SBrian Foster int iter_flags) 8176d8b79cfSDave Chinner { 8186d8b79cfSDave Chinner uint32_t first_index; 8196d8b79cfSDave Chinner int last_error = 0; 8206d8b79cfSDave Chinner int skipped; 8216d8b79cfSDave Chinner int done; 8226d8b79cfSDave Chinner int nr_found; 8236d8b79cfSDave Chinner 8246d8b79cfSDave Chinner restart: 8256d8b79cfSDave Chinner done = 0; 8266d8b79cfSDave Chinner skipped = 0; 8276d8b79cfSDave Chinner first_index = 0; 8286d8b79cfSDave Chinner nr_found = 0; 8296d8b79cfSDave Chinner do { 8306d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 8316d8b79cfSDave Chinner int error = 0; 8326d8b79cfSDave Chinner int i; 8336d8b79cfSDave Chinner 8346d8b79cfSDave Chinner rcu_read_lock(); 835a454f742SBrian Foster 836fc96be95SDarrick J. Wong if (tag == XFS_ICI_NO_TAG) 8376d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 8386d8b79cfSDave Chinner (void **)batch, first_index, 8396d8b79cfSDave Chinner XFS_LOOKUP_BATCH); 840a454f742SBrian Foster else 841a454f742SBrian Foster nr_found = radix_tree_gang_lookup_tag( 842a454f742SBrian Foster &pag->pag_ici_root, 843a454f742SBrian Foster (void **) batch, first_index, 844a454f742SBrian Foster XFS_LOOKUP_BATCH, tag); 845a454f742SBrian Foster 8466d8b79cfSDave Chinner if (!nr_found) { 8476d8b79cfSDave Chinner rcu_read_unlock(); 8486d8b79cfSDave Chinner break; 8496d8b79cfSDave Chinner } 8506d8b79cfSDave Chinner 8516d8b79cfSDave Chinner /* 8526d8b79cfSDave Chinner * Grab the inodes before we drop the lock. if we found 8536d8b79cfSDave Chinner * nothing, nr == 0 and the loop will be skipped. 8546d8b79cfSDave Chinner */ 8556d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8566d8b79cfSDave Chinner struct xfs_inode *ip = batch[i]; 8576d8b79cfSDave Chinner 858ae2c4ac2SBrian Foster if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) 8596d8b79cfSDave Chinner batch[i] = NULL; 8606d8b79cfSDave Chinner 8616d8b79cfSDave Chinner /* 8626d8b79cfSDave Chinner * Update the index for the next lookup. 
Catch 8636d8b79cfSDave Chinner * overflows into the next AG range which can occur if 8646d8b79cfSDave Chinner * we have inodes in the last block of the AG and we 8656d8b79cfSDave Chinner * are currently pointing to the last inode. 8666d8b79cfSDave Chinner * 8676d8b79cfSDave Chinner * Because we may see inodes that are from the wrong AG 8686d8b79cfSDave Chinner * due to RCU freeing and reallocation, only update the 8696d8b79cfSDave Chinner * index if it lies in this AG. It was a race that lead 8706d8b79cfSDave Chinner * us to see this inode, so another lookup from the 8716d8b79cfSDave Chinner * same index will not find it again. 8726d8b79cfSDave Chinner */ 8736d8b79cfSDave Chinner if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 8746d8b79cfSDave Chinner continue; 8756d8b79cfSDave Chinner first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 8766d8b79cfSDave Chinner if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 8776d8b79cfSDave Chinner done = 1; 8786d8b79cfSDave Chinner } 8796d8b79cfSDave Chinner 8806d8b79cfSDave Chinner /* unlock now we've grabbed the inodes. */ 8816d8b79cfSDave Chinner rcu_read_unlock(); 8826d8b79cfSDave Chinner 8836d8b79cfSDave Chinner for (i = 0; i < nr_found; i++) { 8846d8b79cfSDave Chinner if (!batch[i]) 8856d8b79cfSDave Chinner continue; 886ae2c4ac2SBrian Foster if ((iter_flags & XFS_AGITER_INEW_WAIT) && 887ae2c4ac2SBrian Foster xfs_iflags_test(batch[i], XFS_INEW)) 888ae2c4ac2SBrian Foster xfs_inew_wait(batch[i]); 889*390600f8SDarrick J. Wong error = execute(batch[i], args); 89044a8736bSDarrick J. Wong xfs_irele(batch[i]); 8912451337dSDave Chinner if (error == -EAGAIN) { 8926d8b79cfSDave Chinner skipped++; 8936d8b79cfSDave Chinner continue; 8946d8b79cfSDave Chinner } 8952451337dSDave Chinner if (error && last_error != -EFSCORRUPTED) 8966d8b79cfSDave Chinner last_error = error; 8976d8b79cfSDave Chinner } 8986d8b79cfSDave Chinner 8996d8b79cfSDave Chinner /* bail out if the filesystem is corrupted. */ 9002451337dSDave Chinner if (error == -EFSCORRUPTED) 9016d8b79cfSDave Chinner break; 9026d8b79cfSDave Chinner 9036d8b79cfSDave Chinner cond_resched(); 9046d8b79cfSDave Chinner 9056d8b79cfSDave Chinner } while (nr_found && !done); 9066d8b79cfSDave Chinner 9076d8b79cfSDave Chinner if (skipped) { 9086d8b79cfSDave Chinner delay(1); 9096d8b79cfSDave Chinner goto restart; 9106d8b79cfSDave Chinner } 9116d8b79cfSDave Chinner return last_error; 9126d8b79cfSDave Chinner } 9136d8b79cfSDave Chinner 914579b62faSBrian Foster /* 915579b62faSBrian Foster * Background scanning to trim post-EOF preallocated space. This is queued 916b9fe5052SDwight Engen * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 
917579b62faSBrian Foster */ 918fa5a4f57SBrian Foster void 919579b62faSBrian Foster xfs_queue_eofblocks( 920579b62faSBrian Foster struct xfs_mount *mp) 921579b62faSBrian Foster { 922579b62faSBrian Foster rcu_read_lock(); 923579b62faSBrian Foster if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) 924579b62faSBrian Foster queue_delayed_work(mp->m_eofblocks_workqueue, 925579b62faSBrian Foster &mp->m_eofblocks_work, 926579b62faSBrian Foster msecs_to_jiffies(xfs_eofb_secs * 1000)); 927579b62faSBrian Foster rcu_read_unlock(); 928579b62faSBrian Foster } 929579b62faSBrian Foster 930579b62faSBrian Foster void 931579b62faSBrian Foster xfs_eofblocks_worker( 932579b62faSBrian Foster struct work_struct *work) 933579b62faSBrian Foster { 934579b62faSBrian Foster struct xfs_mount *mp = container_of(to_delayed_work(work), 935579b62faSBrian Foster struct xfs_mount, m_eofblocks_work); 9364b674b9aSBrian Foster 9374b674b9aSBrian Foster if (!sb_start_write_trylock(mp->m_super)) 9384b674b9aSBrian Foster return; 939579b62faSBrian Foster xfs_icache_free_eofblocks(mp, NULL); 9404b674b9aSBrian Foster sb_end_write(mp->m_super); 9414b674b9aSBrian Foster 942579b62faSBrian Foster xfs_queue_eofblocks(mp); 943579b62faSBrian Foster } 944579b62faSBrian Foster 94583104d44SDarrick J. Wong /* 94683104d44SDarrick J. Wong * Background scanning to trim preallocated CoW space. This is queued 94783104d44SDarrick J. Wong * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default). 94883104d44SDarrick J. Wong * (We'll just piggyback on the post-EOF prealloc space workqueue.) 94983104d44SDarrick J. Wong */ 95010ddf64eSDarrick J. Wong void 95183104d44SDarrick J. Wong xfs_queue_cowblocks( 95283104d44SDarrick J. Wong struct xfs_mount *mp) 95383104d44SDarrick J. Wong { 95483104d44SDarrick J. Wong rcu_read_lock(); 95583104d44SDarrick J. Wong if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG)) 95683104d44SDarrick J. Wong queue_delayed_work(mp->m_eofblocks_workqueue, 95783104d44SDarrick J. Wong &mp->m_cowblocks_work, 95883104d44SDarrick J. Wong msecs_to_jiffies(xfs_cowb_secs * 1000)); 95983104d44SDarrick J. Wong rcu_read_unlock(); 96083104d44SDarrick J. Wong } 96183104d44SDarrick J. Wong 96283104d44SDarrick J. Wong void 96383104d44SDarrick J. Wong xfs_cowblocks_worker( 96483104d44SDarrick J. Wong struct work_struct *work) 96583104d44SDarrick J. Wong { 96683104d44SDarrick J. Wong struct xfs_mount *mp = container_of(to_delayed_work(work), 96783104d44SDarrick J. Wong struct xfs_mount, m_cowblocks_work); 9684b674b9aSBrian Foster 9694b674b9aSBrian Foster if (!sb_start_write_trylock(mp->m_super)) 9704b674b9aSBrian Foster return; 97183104d44SDarrick J. Wong xfs_icache_free_cowblocks(mp, NULL); 9724b674b9aSBrian Foster sb_end_write(mp->m_super); 9734b674b9aSBrian Foster 97483104d44SDarrick J. Wong xfs_queue_cowblocks(mp); 97583104d44SDarrick J. Wong } 97683104d44SDarrick J. Wong 9779be05904SDarrick J. Wong /* Fetch the next (possibly tagged) per-AG structure. */ 9789be05904SDarrick J. Wong static inline struct xfs_perag * 9799be05904SDarrick J. Wong xfs_inode_walk_get_perag( 9806d8b79cfSDave Chinner struct xfs_mount *mp, 9819be05904SDarrick J. Wong xfs_agnumber_t agno, 9829be05904SDarrick J. Wong int tag) 9836d8b79cfSDave Chinner { 9849be05904SDarrick J. Wong if (tag == XFS_ICI_NO_TAG) 9859be05904SDarrick J. Wong return xfs_perag_get(mp, agno); 9869be05904SDarrick J. 
Wong return xfs_perag_get_tag(mp, agno, tag); 987a454f742SBrian Foster } 988a454f742SBrian Foster 989a454f742SBrian Foster int 9909be05904SDarrick J. Wong xfs_inode_ag_iterator( 991a454f742SBrian Foster struct xfs_mount *mp, 9929be05904SDarrick J. Wong int iter_flags, 993*390600f8SDarrick J. Wong int (*execute)(struct xfs_inode *ip, void *args), 994a454f742SBrian Foster void *args, 995a454f742SBrian Foster int tag) 996a454f742SBrian Foster { 997a454f742SBrian Foster struct xfs_perag *pag; 998a454f742SBrian Foster int error = 0; 999a454f742SBrian Foster int last_error = 0; 1000a454f742SBrian Foster xfs_agnumber_t ag; 1001a454f742SBrian Foster 1002a454f742SBrian Foster ag = 0; 10039be05904SDarrick J. Wong while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) { 1004a454f742SBrian Foster ag = pag->pag_agno + 1; 1005*390600f8SDarrick J. Wong error = xfs_inode_ag_walk(mp, pag, execute, args, tag, 10069be05904SDarrick J. Wong iter_flags); 10076d8b79cfSDave Chinner xfs_perag_put(pag); 10086d8b79cfSDave Chinner if (error) { 10096d8b79cfSDave Chinner last_error = error; 10102451337dSDave Chinner if (error == -EFSCORRUPTED) 10116d8b79cfSDave Chinner break; 10126d8b79cfSDave Chinner } 10136d8b79cfSDave Chinner } 1014b474c7aeSEric Sandeen return last_error; 10156d8b79cfSDave Chinner } 10166d8b79cfSDave Chinner 10176d8b79cfSDave Chinner /* 10186d8b79cfSDave Chinner * Grab the inode for reclaim exclusively. 10196d8b79cfSDave Chinner * Return 0 if we grabbed it, non-zero otherwise. 10206d8b79cfSDave Chinner */ 10216d8b79cfSDave Chinner STATIC int 10226d8b79cfSDave Chinner xfs_reclaim_inode_grab( 10236d8b79cfSDave Chinner struct xfs_inode *ip, 10246d8b79cfSDave Chinner int flags) 10256d8b79cfSDave Chinner { 10266d8b79cfSDave Chinner ASSERT(rcu_read_lock_held()); 10276d8b79cfSDave Chinner 10286d8b79cfSDave Chinner /* quick check for stale RCU freed inode */ 10296d8b79cfSDave Chinner if (!ip->i_ino) 10306d8b79cfSDave Chinner return 1; 10316d8b79cfSDave Chinner 10326d8b79cfSDave Chinner /* 10336d8b79cfSDave Chinner * If we are asked for non-blocking operation, do unlocked checks to 10346d8b79cfSDave Chinner * see if the inode already is being flushed or in reclaim to avoid 10356d8b79cfSDave Chinner * lock traffic. 10366d8b79cfSDave Chinner */ 10376d8b79cfSDave Chinner if ((flags & SYNC_TRYLOCK) && 10386d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) 10396d8b79cfSDave Chinner return 1; 10406d8b79cfSDave Chinner 10416d8b79cfSDave Chinner /* 10426d8b79cfSDave Chinner * The radix tree lock here protects a thread in xfs_iget from racing 10436d8b79cfSDave Chinner * with us starting reclaim on the inode. Once we have the 10446d8b79cfSDave Chinner * XFS_IRECLAIM flag set it will not touch us. 10456d8b79cfSDave Chinner * 10466d8b79cfSDave Chinner * Due to RCU lookup, we may find inodes that have been freed and only 10476d8b79cfSDave Chinner * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 10486d8b79cfSDave Chinner * aren't candidates for reclaim at all, so we must check the 10496d8b79cfSDave Chinner * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 10506d8b79cfSDave Chinner */ 10516d8b79cfSDave Chinner spin_lock(&ip->i_flags_lock); 10526d8b79cfSDave Chinner if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 10536d8b79cfSDave Chinner __xfs_iflags_test(ip, XFS_IRECLAIM)) { 10546d8b79cfSDave Chinner /* not a reclaim candidate. 
*/ 10556d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10566d8b79cfSDave Chinner return 1; 10576d8b79cfSDave Chinner } 10586d8b79cfSDave Chinner __xfs_iflags_set(ip, XFS_IRECLAIM); 10596d8b79cfSDave Chinner spin_unlock(&ip->i_flags_lock); 10606d8b79cfSDave Chinner return 0; 10616d8b79cfSDave Chinner } 10626d8b79cfSDave Chinner 10636d8b79cfSDave Chinner /* 10646d8b79cfSDave Chinner * Inodes in different states need to be treated differently. The following 10656d8b79cfSDave Chinner * table lists the inode states and the reclaim actions necessary: 10666d8b79cfSDave Chinner * 10676d8b79cfSDave Chinner * inode state iflush ret required action 10686d8b79cfSDave Chinner * --------------- ---------- --------------- 10696d8b79cfSDave Chinner * bad - reclaim 10706d8b79cfSDave Chinner * shutdown EIO unpin and reclaim 10716d8b79cfSDave Chinner * clean, unpinned 0 reclaim 10726d8b79cfSDave Chinner * stale, unpinned 0 reclaim 10736d8b79cfSDave Chinner * clean, pinned(*) 0 requeue 10746d8b79cfSDave Chinner * stale, pinned EAGAIN requeue 10756d8b79cfSDave Chinner * dirty, async - requeue 10766d8b79cfSDave Chinner * dirty, sync 0 reclaim 10776d8b79cfSDave Chinner * 10786d8b79cfSDave Chinner * (*) dgc: I don't think the clean, pinned state is possible but it gets 10796d8b79cfSDave Chinner * handled anyway given the order of checks implemented. 10806d8b79cfSDave Chinner * 10816d8b79cfSDave Chinner * Also, because we get the flush lock first, we know that any inode that has 10826d8b79cfSDave Chinner * been flushed delwri has had the flush completed by the time we check that 10836d8b79cfSDave Chinner * the inode is clean. 10846d8b79cfSDave Chinner * 10856d8b79cfSDave Chinner * Note that because the inode is flushed delayed write by AIL pushing, the 10866d8b79cfSDave Chinner * flush lock may already be held here and waiting on it can result in very 10876d8b79cfSDave Chinner * long latencies. Hence for sync reclaims, where we wait on the flush lock, 10886d8b79cfSDave Chinner * the caller should push the AIL first before trying to reclaim inodes to 10896d8b79cfSDave Chinner * minimise the amount of time spent waiting. For background relaim, we only 10906d8b79cfSDave Chinner * bother to reclaim clean inodes anyway. 
10916d8b79cfSDave Chinner * 10926d8b79cfSDave Chinner * Hence the order of actions after gaining the locks should be: 10936d8b79cfSDave Chinner * bad => reclaim 10946d8b79cfSDave Chinner * shutdown => unpin and reclaim 10956d8b79cfSDave Chinner * pinned, async => requeue 10966d8b79cfSDave Chinner * pinned, sync => unpin 10976d8b79cfSDave Chinner * stale => reclaim 10986d8b79cfSDave Chinner * clean => reclaim 10996d8b79cfSDave Chinner * dirty, async => requeue 11006d8b79cfSDave Chinner * dirty, sync => flush, wait and reclaim 11016d8b79cfSDave Chinner */ 11026d8b79cfSDave Chinner STATIC int 11036d8b79cfSDave Chinner xfs_reclaim_inode( 11046d8b79cfSDave Chinner struct xfs_inode *ip, 11056d8b79cfSDave Chinner struct xfs_perag *pag, 11066d8b79cfSDave Chinner int sync_mode) 11076d8b79cfSDave Chinner { 11086d8b79cfSDave Chinner struct xfs_buf *bp = NULL; 11098a17d7ddSDave Chinner xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 11106d8b79cfSDave Chinner int error; 11116d8b79cfSDave Chinner 11126d8b79cfSDave Chinner restart: 11136d8b79cfSDave Chinner error = 0; 11146d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 11156d8b79cfSDave Chinner if (!xfs_iflock_nowait(ip)) { 11166d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11176d8b79cfSDave Chinner goto out; 11186d8b79cfSDave Chinner xfs_iflock(ip); 11196d8b79cfSDave Chinner } 11206d8b79cfSDave Chinner 11216d8b79cfSDave Chinner if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 11226d8b79cfSDave Chinner xfs_iunpin_wait(ip); 112398efe8afSBrian Foster /* xfs_iflush_abort() drops the flush lock */ 112488fc1879SBrian Foster xfs_iflush_abort(ip); 11256d8b79cfSDave Chinner goto reclaim; 11266d8b79cfSDave Chinner } 11276d8b79cfSDave Chinner if (xfs_ipincount(ip)) { 11286d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11296d8b79cfSDave Chinner goto out_ifunlock; 11306d8b79cfSDave Chinner xfs_iunpin_wait(ip); 11316d8b79cfSDave Chinner } 113298efe8afSBrian Foster if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { 113398efe8afSBrian Foster xfs_ifunlock(ip); 11346d8b79cfSDave Chinner goto reclaim; 113598efe8afSBrian Foster } 11366d8b79cfSDave Chinner 11376d8b79cfSDave Chinner /* 11386d8b79cfSDave Chinner * Never flush out dirty data during non-blocking reclaim, as it would 11396d8b79cfSDave Chinner * just contend with AIL pushing trying to do the same job. 11406d8b79cfSDave Chinner */ 11416d8b79cfSDave Chinner if (!(sync_mode & SYNC_WAIT)) 11426d8b79cfSDave Chinner goto out_ifunlock; 11436d8b79cfSDave Chinner 11446d8b79cfSDave Chinner /* 11456d8b79cfSDave Chinner * Now we have an inode that needs flushing. 11466d8b79cfSDave Chinner * 11476d8b79cfSDave Chinner * Note that xfs_iflush will never block on the inode buffer lock, as 11486d8b79cfSDave Chinner * xfs_ifree_cluster() can lock the inode buffer before it locks the 11496d8b79cfSDave Chinner * ip->i_lock, and we are doing the exact opposite here. As a result, 11506d8b79cfSDave Chinner * doing a blocking xfs_imap_to_bp() to get the cluster buffer would 11516d8b79cfSDave Chinner * result in an ABBA deadlock with xfs_ifree_cluster(). 11526d8b79cfSDave Chinner * 11536d8b79cfSDave Chinner * As xfs_ifree_cluser() must gather all inodes that are active in the 11546d8b79cfSDave Chinner * cache to mark them stale, if we hit this case we don't actually want 11556d8b79cfSDave Chinner * to do IO here - we want the inode marked stale so we can simply 11566d8b79cfSDave Chinner * reclaim it. Hence if we get an EAGAIN error here, just unlock the 11576d8b79cfSDave Chinner * inode, back off and try again. 
Hopefully the next pass through will 11586d8b79cfSDave Chinner * see the stale flag set on the inode. 11596d8b79cfSDave Chinner */ 11606d8b79cfSDave Chinner error = xfs_iflush(ip, &bp); 11612451337dSDave Chinner if (error == -EAGAIN) { 11626d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11636d8b79cfSDave Chinner /* backoff longer than in xfs_ifree_cluster */ 11646d8b79cfSDave Chinner delay(2); 11656d8b79cfSDave Chinner goto restart; 11666d8b79cfSDave Chinner } 11676d8b79cfSDave Chinner 11686d8b79cfSDave Chinner if (!error) { 11696d8b79cfSDave Chinner error = xfs_bwrite(bp); 11706d8b79cfSDave Chinner xfs_buf_relse(bp); 11716d8b79cfSDave Chinner } 11726d8b79cfSDave Chinner 11736d8b79cfSDave Chinner reclaim: 117498efe8afSBrian Foster ASSERT(!xfs_isiflocked(ip)); 117598efe8afSBrian Foster 11768a17d7ddSDave Chinner /* 11778a17d7ddSDave Chinner * Because we use RCU freeing we need to ensure the inode always appears 11788a17d7ddSDave Chinner * to be reclaimed with an invalid inode number when in the free state. 117998efe8afSBrian Foster * We do this as early as possible under the ILOCK so that 1180f2e9ad21SOmar Sandoval * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 1181f2e9ad21SOmar Sandoval * detect races with us here. By doing this, we guarantee that once 1182f2e9ad21SOmar Sandoval * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 1183f2e9ad21SOmar Sandoval * it will see either a valid inode that will serialise correctly, or it 1184f2e9ad21SOmar Sandoval * will see an invalid inode that it can skip. 11858a17d7ddSDave Chinner */ 11868a17d7ddSDave Chinner spin_lock(&ip->i_flags_lock); 11878a17d7ddSDave Chinner ip->i_flags = XFS_IRECLAIM; 11888a17d7ddSDave Chinner ip->i_ino = 0; 11898a17d7ddSDave Chinner spin_unlock(&ip->i_flags_lock); 11908a17d7ddSDave Chinner 11916d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 11926d8b79cfSDave Chinner 1193ff6d6af2SBill O'Donnell XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 11946d8b79cfSDave Chinner /* 11956d8b79cfSDave Chinner * Remove the inode from the per-AG radix tree. 11966d8b79cfSDave Chinner * 11976d8b79cfSDave Chinner * Because radix_tree_delete won't complain even if the item was never 11986d8b79cfSDave Chinner * added to the tree assert that it's been there before to catch 11996d8b79cfSDave Chinner * problems with the inode life time early on. 12006d8b79cfSDave Chinner */ 12016d8b79cfSDave Chinner spin_lock(&pag->pag_ici_lock); 12026d8b79cfSDave Chinner if (!radix_tree_delete(&pag->pag_ici_root, 12038a17d7ddSDave Chinner XFS_INO_TO_AGINO(ip->i_mount, ino))) 12046d8b79cfSDave Chinner ASSERT(0); 1205545c0889SDave Chinner xfs_perag_clear_reclaim_tag(pag); 12066d8b79cfSDave Chinner spin_unlock(&pag->pag_ici_lock); 12076d8b79cfSDave Chinner 12086d8b79cfSDave Chinner /* 12096d8b79cfSDave Chinner * Here we do an (almost) spurious inode lock in order to coordinate 12106d8b79cfSDave Chinner * with inode cache radix tree lookups. This is because the lookup 12116d8b79cfSDave Chinner * can reference the inodes in the cache without taking references. 12126d8b79cfSDave Chinner * 12136d8b79cfSDave Chinner * We make that OK here by ensuring that we wait until the inode is 12146d8b79cfSDave Chinner * unlocked after the lookup before we go ahead and free it. 
12156d8b79cfSDave Chinner */ 12166d8b79cfSDave Chinner xfs_ilock(ip, XFS_ILOCK_EXCL); 12176d8b79cfSDave Chinner xfs_qm_dqdetach(ip); 12186d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12196d8b79cfSDave Chinner 12208a17d7ddSDave Chinner __xfs_inode_free(ip); 12216d8b79cfSDave Chinner return error; 12226d8b79cfSDave Chinner 12236d8b79cfSDave Chinner out_ifunlock: 12246d8b79cfSDave Chinner xfs_ifunlock(ip); 12256d8b79cfSDave Chinner out: 12266d8b79cfSDave Chinner xfs_iflags_clear(ip, XFS_IRECLAIM); 12276d8b79cfSDave Chinner xfs_iunlock(ip, XFS_ILOCK_EXCL); 12286d8b79cfSDave Chinner /* 12292451337dSDave Chinner * We could return -EAGAIN here to make reclaim rescan the inode tree in 12306d8b79cfSDave Chinner * a short while. However, this just burns CPU time scanning the tree 12316d8b79cfSDave Chinner * waiting for IO to complete and the reclaim work never goes back to 12326d8b79cfSDave Chinner * the idle state. Instead, return 0 to let the next scheduled 12336d8b79cfSDave Chinner * background reclaim attempt to reclaim the inode again. 12346d8b79cfSDave Chinner */ 12356d8b79cfSDave Chinner return 0; 12366d8b79cfSDave Chinner } 12376d8b79cfSDave Chinner 12386d8b79cfSDave Chinner /* 12396d8b79cfSDave Chinner * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 12406d8b79cfSDave Chinner * corrupted, we still want to try to reclaim all the inodes. If we don't, 12416d8b79cfSDave Chinner * then a shut down during filesystem unmount reclaim walk leak all the 12426d8b79cfSDave Chinner * unreclaimed inodes. 12436d8b79cfSDave Chinner */ 124433479e05SDave Chinner STATIC int 12456d8b79cfSDave Chinner xfs_reclaim_inodes_ag( 12466d8b79cfSDave Chinner struct xfs_mount *mp, 12476d8b79cfSDave Chinner int flags, 12486d8b79cfSDave Chinner int *nr_to_scan) 12496d8b79cfSDave Chinner { 12506d8b79cfSDave Chinner struct xfs_perag *pag; 12516d8b79cfSDave Chinner int error = 0; 12526d8b79cfSDave Chinner int last_error = 0; 12536d8b79cfSDave Chinner xfs_agnumber_t ag; 12546d8b79cfSDave Chinner int trylock = flags & SYNC_TRYLOCK; 12556d8b79cfSDave Chinner int skipped; 12566d8b79cfSDave Chinner 12576d8b79cfSDave Chinner restart: 12586d8b79cfSDave Chinner ag = 0; 12596d8b79cfSDave Chinner skipped = 0; 12606d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 12616d8b79cfSDave Chinner unsigned long first_index = 0; 12626d8b79cfSDave Chinner int done = 0; 12636d8b79cfSDave Chinner int nr_found = 0; 12646d8b79cfSDave Chinner 12656d8b79cfSDave Chinner ag = pag->pag_agno + 1; 12666d8b79cfSDave Chinner 12676d8b79cfSDave Chinner if (trylock) { 12686d8b79cfSDave Chinner if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 12696d8b79cfSDave Chinner skipped++; 12706d8b79cfSDave Chinner xfs_perag_put(pag); 12716d8b79cfSDave Chinner continue; 12726d8b79cfSDave Chinner } 12736d8b79cfSDave Chinner first_index = pag->pag_ici_reclaim_cursor; 12746d8b79cfSDave Chinner } else 12756d8b79cfSDave Chinner mutex_lock(&pag->pag_ici_reclaim_lock); 12766d8b79cfSDave Chinner 12776d8b79cfSDave Chinner do { 12786d8b79cfSDave Chinner struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 12796d8b79cfSDave Chinner int i; 12806d8b79cfSDave Chinner 12816d8b79cfSDave Chinner rcu_read_lock(); 12826d8b79cfSDave Chinner nr_found = radix_tree_gang_lookup_tag( 12836d8b79cfSDave Chinner &pag->pag_ici_root, 12846d8b79cfSDave Chinner (void **)batch, first_index, 12856d8b79cfSDave Chinner XFS_LOOKUP_BATCH, 12866d8b79cfSDave Chinner XFS_ICI_RECLAIM_TAG); 12876d8b79cfSDave Chinner if (!nr_found) { 12886d8b79cfSDave 
Chinner				done = 1;
12896d8b79cfSDave Chinner				rcu_read_unlock();
12906d8b79cfSDave Chinner				break;
12916d8b79cfSDave Chinner			}
12926d8b79cfSDave Chinner 
12936d8b79cfSDave Chinner			/*
12946d8b79cfSDave Chinner			 * Grab the inodes before we drop the lock. If we found
12956d8b79cfSDave Chinner			 * nothing, nr == 0 and the loop will be skipped.
12966d8b79cfSDave Chinner			 */
12976d8b79cfSDave Chinner			for (i = 0; i < nr_found; i++) {
12986d8b79cfSDave Chinner				struct xfs_inode *ip = batch[i];
12996d8b79cfSDave Chinner 
13006d8b79cfSDave Chinner				if (done || xfs_reclaim_inode_grab(ip, flags))
13016d8b79cfSDave Chinner					batch[i] = NULL;
13026d8b79cfSDave Chinner 
13036d8b79cfSDave Chinner				/*
13046d8b79cfSDave Chinner				 * Update the index for the next lookup. Catch
13056d8b79cfSDave Chinner				 * overflows into the next AG range which can
13066d8b79cfSDave Chinner				 * occur if we have inodes in the last block of
13076d8b79cfSDave Chinner				 * the AG and we are currently pointing to the
13086d8b79cfSDave Chinner				 * last inode.
13096d8b79cfSDave Chinner				 *
13106d8b79cfSDave Chinner				 * Because we may see inodes that are from the
13116d8b79cfSDave Chinner				 * wrong AG due to RCU freeing and
13126d8b79cfSDave Chinner				 * reallocation, only update the index if it
13136d8b79cfSDave Chinner				 * lies in this AG. It was a race that led us
13146d8b79cfSDave Chinner				 * to see this inode, so another lookup from
13156d8b79cfSDave Chinner				 * the same index will not find it again.
13166d8b79cfSDave Chinner				 */
13176d8b79cfSDave Chinner				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
13186d8b79cfSDave Chinner						pag->pag_agno)
13196d8b79cfSDave Chinner					continue;
13206d8b79cfSDave Chinner				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
13216d8b79cfSDave Chinner				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
13226d8b79cfSDave Chinner					done = 1;
13236d8b79cfSDave Chinner			}
13246d8b79cfSDave Chinner 
13256d8b79cfSDave Chinner			/* unlock now we've grabbed the inodes. */
13266d8b79cfSDave Chinner			rcu_read_unlock();
13276d8b79cfSDave Chinner 
13286d8b79cfSDave Chinner			for (i = 0; i < nr_found; i++) {
13296d8b79cfSDave Chinner				if (!batch[i])
13306d8b79cfSDave Chinner					continue;
13316d8b79cfSDave Chinner				error = xfs_reclaim_inode(batch[i], pag, flags);
13322451337dSDave Chinner				if (error && last_error != -EFSCORRUPTED)
13336d8b79cfSDave Chinner					last_error = error;
13346d8b79cfSDave Chinner			}
13356d8b79cfSDave Chinner 
13366d8b79cfSDave Chinner			*nr_to_scan -= XFS_LOOKUP_BATCH;
13376d8b79cfSDave Chinner 
13386d8b79cfSDave Chinner			cond_resched();
13396d8b79cfSDave Chinner 
13406d8b79cfSDave Chinner		} while (nr_found && !done && *nr_to_scan > 0);
13416d8b79cfSDave Chinner 
13426d8b79cfSDave Chinner		if (trylock && !done)
13436d8b79cfSDave Chinner			pag->pag_ici_reclaim_cursor = first_index;
13446d8b79cfSDave Chinner		else
13456d8b79cfSDave Chinner			pag->pag_ici_reclaim_cursor = 0;
13466d8b79cfSDave Chinner		mutex_unlock(&pag->pag_ici_reclaim_lock);
13476d8b79cfSDave Chinner		xfs_perag_put(pag);
13486d8b79cfSDave Chinner	}
13496d8b79cfSDave Chinner 
13506d8b79cfSDave Chinner	/*
13516d8b79cfSDave Chinner	 * If we skipped any AG, and we still have scan count remaining, do
13526d8b79cfSDave Chinner	 * another pass, this time using blocking reclaim semantics (i.e.
13536d8b79cfSDave Chinner	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
13546d8b79cfSDave Chinner	 * ensures that when we get more reclaimers than AGs we block rather
13556d8b79cfSDave Chinner	 * than spin trying to execute reclaim.
13566d8b79cfSDave Chinner */ 13576d8b79cfSDave Chinner if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 13586d8b79cfSDave Chinner trylock = 0; 13596d8b79cfSDave Chinner goto restart; 13606d8b79cfSDave Chinner } 1361b474c7aeSEric Sandeen return last_error; 13626d8b79cfSDave Chinner } 13636d8b79cfSDave Chinner 13646d8b79cfSDave Chinner int 13656d8b79cfSDave Chinner xfs_reclaim_inodes( 13666d8b79cfSDave Chinner xfs_mount_t *mp, 13676d8b79cfSDave Chinner int mode) 13686d8b79cfSDave Chinner { 13696d8b79cfSDave Chinner int nr_to_scan = INT_MAX; 13706d8b79cfSDave Chinner 13716d8b79cfSDave Chinner return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 13726d8b79cfSDave Chinner } 13736d8b79cfSDave Chinner 13746d8b79cfSDave Chinner /* 13756d8b79cfSDave Chinner * Scan a certain number of inodes for reclaim. 13766d8b79cfSDave Chinner * 13776d8b79cfSDave Chinner * When called we make sure that there is a background (fast) inode reclaim in 13786d8b79cfSDave Chinner * progress, while we will throttle the speed of reclaim via doing synchronous 13796d8b79cfSDave Chinner * reclaim of inodes. That means if we come across dirty inodes, we wait for 13806d8b79cfSDave Chinner * them to be cleaned, which we hope will not be very long due to the 13816d8b79cfSDave Chinner * background walker having already kicked the IO off on those dirty inodes. 13826d8b79cfSDave Chinner */ 13830a234c6dSDave Chinner long 13846d8b79cfSDave Chinner xfs_reclaim_inodes_nr( 13856d8b79cfSDave Chinner struct xfs_mount *mp, 13866d8b79cfSDave Chinner int nr_to_scan) 13876d8b79cfSDave Chinner { 13886d8b79cfSDave Chinner /* kick background reclaimer and push the AIL */ 13896d8b79cfSDave Chinner xfs_reclaim_work_queue(mp); 13906d8b79cfSDave Chinner xfs_ail_push_all(mp->m_ail); 13916d8b79cfSDave Chinner 13920a234c6dSDave Chinner return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 13936d8b79cfSDave Chinner } 13946d8b79cfSDave Chinner 13956d8b79cfSDave Chinner /* 13966d8b79cfSDave Chinner * Return the number of reclaimable inodes in the filesystem for 13976d8b79cfSDave Chinner * the shrinker to determine how much to reclaim. 
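 *
 * In practice this pairing is driven by the superblock shrinker:
 * ->nr_cached_objects() reports this count and ->free_cached_objects()
 * passes its nr_to_scan value to xfs_reclaim_inodes_nr() above (see the
 * super_operations wiring in xfs_super.c).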
13986d8b79cfSDave Chinner */ 13996d8b79cfSDave Chinner int 14006d8b79cfSDave Chinner xfs_reclaim_inodes_count( 14016d8b79cfSDave Chinner struct xfs_mount *mp) 14026d8b79cfSDave Chinner { 14036d8b79cfSDave Chinner struct xfs_perag *pag; 14046d8b79cfSDave Chinner xfs_agnumber_t ag = 0; 14056d8b79cfSDave Chinner int reclaimable = 0; 14066d8b79cfSDave Chinner 14076d8b79cfSDave Chinner while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 14086d8b79cfSDave Chinner ag = pag->pag_agno + 1; 14096d8b79cfSDave Chinner reclaimable += pag->pag_ici_reclaimable; 14106d8b79cfSDave Chinner xfs_perag_put(pag); 14116d8b79cfSDave Chinner } 14126d8b79cfSDave Chinner return reclaimable; 14136d8b79cfSDave Chinner } 14146d8b79cfSDave Chinner 141541176a68SBrian Foster STATIC int 14163e3f9f58SBrian Foster xfs_inode_match_id( 14173e3f9f58SBrian Foster struct xfs_inode *ip, 14183e3f9f58SBrian Foster struct xfs_eofblocks *eofb) 14193e3f9f58SBrian Foster { 1420b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1421b9fe5052SDwight Engen !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 14223e3f9f58SBrian Foster return 0; 14231b556048SBrian Foster 1424b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1425b9fe5052SDwight Engen !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 14261b556048SBrian Foster return 0; 14271b556048SBrian Foster 1428b9fe5052SDwight Engen if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1429de7a866fSChristoph Hellwig ip->i_d.di_projid != eofb->eof_prid) 14301b556048SBrian Foster return 0; 14311b556048SBrian Foster 14321b556048SBrian Foster return 1; 14333e3f9f58SBrian Foster } 14343e3f9f58SBrian Foster 1435f4526397SBrian Foster /* 1436f4526397SBrian Foster * A union-based inode filtering algorithm. Process the inode if any of the 1437f4526397SBrian Foster * criteria match. This is for global/internal scans only. 1438f4526397SBrian Foster */ 1439f4526397SBrian Foster STATIC int 1440f4526397SBrian Foster xfs_inode_match_id_union( 1441f4526397SBrian Foster struct xfs_inode *ip, 1442f4526397SBrian Foster struct xfs_eofblocks *eofb) 1443f4526397SBrian Foster { 1444f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1445f4526397SBrian Foster uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1446f4526397SBrian Foster return 1; 1447f4526397SBrian Foster 1448f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1449f4526397SBrian Foster gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1450f4526397SBrian Foster return 1; 1451f4526397SBrian Foster 1452f4526397SBrian Foster if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1453de7a866fSChristoph Hellwig ip->i_d.di_projid == eofb->eof_prid) 1454f4526397SBrian Foster return 1; 1455f4526397SBrian Foster 1456f4526397SBrian Foster return 0; 1457f4526397SBrian Foster } 1458f4526397SBrian Foster 14593e3f9f58SBrian Foster STATIC int 146041176a68SBrian Foster xfs_inode_free_eofblocks( 146141176a68SBrian Foster struct xfs_inode *ip, 146241176a68SBrian Foster void *args) 146341176a68SBrian Foster { 14643e3f9f58SBrian Foster struct xfs_eofblocks *eofb = args; 1465*390600f8SDarrick J. Wong bool wait; 1466f4526397SBrian Foster int match; 1467*390600f8SDarrick J. Wong int ret; 1468*390600f8SDarrick J. Wong 1469*390600f8SDarrick J. 
Wong wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC); 14705400da7dSBrian Foster 147141176a68SBrian Foster if (!xfs_can_free_eofblocks(ip, false)) { 147241176a68SBrian Foster /* inode could be preallocated or append-only */ 147341176a68SBrian Foster trace_xfs_inode_free_eofblocks_invalid(ip); 147441176a68SBrian Foster xfs_inode_clear_eofblocks_tag(ip); 147541176a68SBrian Foster return 0; 147641176a68SBrian Foster } 147741176a68SBrian Foster 147841176a68SBrian Foster /* 147941176a68SBrian Foster * If the mapping is dirty the operation can block and wait for some 148041176a68SBrian Foster * time. Unless we are waiting, skip it. 148141176a68SBrian Foster */ 1482*390600f8SDarrick J. Wong if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 148341176a68SBrian Foster return 0; 148441176a68SBrian Foster 148500ca79a0SBrian Foster if (eofb) { 1486f4526397SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1487f4526397SBrian Foster match = xfs_inode_match_id_union(ip, eofb); 1488f4526397SBrian Foster else 1489f4526397SBrian Foster match = xfs_inode_match_id(ip, eofb); 1490f4526397SBrian Foster if (!match) 14913e3f9f58SBrian Foster return 0; 14923e3f9f58SBrian Foster 149300ca79a0SBrian Foster /* skip the inode if the file size is too small */ 149400ca79a0SBrian Foster if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 149500ca79a0SBrian Foster XFS_ISIZE(ip) < eofb->eof_min_file_size) 149600ca79a0SBrian Foster return 0; 149700ca79a0SBrian Foster } 149800ca79a0SBrian Foster 1499a36b9261SBrian Foster /* 1500a36b9261SBrian Foster * If the caller is waiting, return -EAGAIN to keep the background 1501a36b9261SBrian Foster * scanner moving and revisit the inode in a subsequent pass. 1502a36b9261SBrian Foster */ 1503c3155097SBrian Foster if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { 1504*390600f8SDarrick J. Wong if (wait) 1505*390600f8SDarrick J. Wong return -EAGAIN; 1506*390600f8SDarrick J. Wong return 0; 1507a36b9261SBrian Foster } 1508*390600f8SDarrick J. Wong 1509a36b9261SBrian Foster ret = xfs_free_eofblocks(ip); 1510a36b9261SBrian Foster xfs_iunlock(ip, XFS_IOLOCK_EXCL); 151141176a68SBrian Foster 151241176a68SBrian Foster return ret; 151341176a68SBrian Foster } 151441176a68SBrian Foster 151583104d44SDarrick J. Wong static int 151683104d44SDarrick J. Wong __xfs_icache_free_eofblocks( 151741176a68SBrian Foster struct xfs_mount *mp, 151883104d44SDarrick J. Wong struct xfs_eofblocks *eofb, 1519*390600f8SDarrick J. Wong int (*execute)(struct xfs_inode *ip, void *args), 152083104d44SDarrick J. Wong int tag) 152141176a68SBrian Foster { 1522*390600f8SDarrick J. Wong return xfs_inode_ag_iterator(mp, 0, execute, eofb, tag); 152383104d44SDarrick J. Wong } 152483104d44SDarrick J. Wong 152583104d44SDarrick J. Wong int 152683104d44SDarrick J. Wong xfs_icache_free_eofblocks( 152783104d44SDarrick J. Wong struct xfs_mount *mp, 152883104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 152983104d44SDarrick J. Wong { 153083104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks, 153183104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 153241176a68SBrian Foster } 153341176a68SBrian Foster 1534dc06f398SBrian Foster /* 1535dc06f398SBrian Foster * Run eofblocks scans on the quotas applicable to the inode. For inodes with 1536dc06f398SBrian Foster * multiple quotas, we don't know exactly which quota caused an allocation 1537dc06f398SBrian Foster * failure. 
We make a best effort by including each quota under low free space 1538dc06f398SBrian Foster * conditions (less than 1% free space) in the scan. 1539dc06f398SBrian Foster */ 154083104d44SDarrick J. Wong static int 154183104d44SDarrick J. Wong __xfs_inode_free_quota_eofblocks( 154283104d44SDarrick J. Wong struct xfs_inode *ip, 154383104d44SDarrick J. Wong int (*execute)(struct xfs_mount *mp, 154483104d44SDarrick J. Wong struct xfs_eofblocks *eofb)) 1545dc06f398SBrian Foster { 1546dc06f398SBrian Foster int scan = 0; 1547dc06f398SBrian Foster struct xfs_eofblocks eofb = {0}; 1548dc06f398SBrian Foster struct xfs_dquot *dq; 1549dc06f398SBrian Foster 1550dc06f398SBrian Foster /* 1551c3155097SBrian Foster * Run a sync scan to increase effectiveness and use the union filter to 1552dc06f398SBrian Foster * cover all applicable quotas in a single scan. 1553dc06f398SBrian Foster */ 1554dc06f398SBrian Foster eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1555dc06f398SBrian Foster 1556dc06f398SBrian Foster if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1557dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_USER); 1558dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1559dc06f398SBrian Foster eofb.eof_uid = VFS_I(ip)->i_uid; 1560dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1561dc06f398SBrian Foster scan = 1; 1562dc06f398SBrian Foster } 1563dc06f398SBrian Foster } 1564dc06f398SBrian Foster 1565dc06f398SBrian Foster if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { 1566dc06f398SBrian Foster dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); 1567dc06f398SBrian Foster if (dq && xfs_dquot_lowsp(dq)) { 1568dc06f398SBrian Foster eofb.eof_gid = VFS_I(ip)->i_gid; 1569dc06f398SBrian Foster eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1570dc06f398SBrian Foster scan = 1; 1571dc06f398SBrian Foster } 1572dc06f398SBrian Foster } 1573dc06f398SBrian Foster 1574dc06f398SBrian Foster if (scan) 157583104d44SDarrick J. Wong execute(ip->i_mount, &eofb); 1576dc06f398SBrian Foster 1577dc06f398SBrian Foster return scan; 1578dc06f398SBrian Foster } 1579dc06f398SBrian Foster 158083104d44SDarrick J. Wong int 158183104d44SDarrick J. Wong xfs_inode_free_quota_eofblocks( 158283104d44SDarrick J. Wong struct xfs_inode *ip) 158383104d44SDarrick J. Wong { 158483104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks); 158583104d44SDarrick J. Wong } 158683104d44SDarrick J. Wong 158791aae6beSDarrick J. Wong static inline unsigned long 158891aae6beSDarrick J. Wong xfs_iflag_for_tag( 158991aae6beSDarrick J. Wong int tag) 159091aae6beSDarrick J. Wong { 159191aae6beSDarrick J. Wong switch (tag) { 159291aae6beSDarrick J. Wong case XFS_ICI_EOFBLOCKS_TAG: 159391aae6beSDarrick J. Wong return XFS_IEOFBLOCKS; 159491aae6beSDarrick J. Wong case XFS_ICI_COWBLOCKS_TAG: 159591aae6beSDarrick J. Wong return XFS_ICOWBLOCKS; 159691aae6beSDarrick J. Wong default: 159791aae6beSDarrick J. Wong ASSERT(0); 159891aae6beSDarrick J. Wong return 0; 159991aae6beSDarrick J. Wong } 160091aae6beSDarrick J. Wong } 160191aae6beSDarrick J. Wong 160283104d44SDarrick J. Wong static void 160391aae6beSDarrick J. Wong __xfs_inode_set_blocks_tag( 160483104d44SDarrick J. Wong xfs_inode_t *ip, 160583104d44SDarrick J. Wong void (*execute)(struct xfs_mount *mp), 160683104d44SDarrick J. Wong void (*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 160783104d44SDarrick J. Wong int error, unsigned long caller_ip), 160883104d44SDarrick J. 
Wong int tag) 160927b52867SBrian Foster { 161027b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 161127b52867SBrian Foster struct xfs_perag *pag; 161227b52867SBrian Foster int tagged; 161327b52867SBrian Foster 161485a6e764SChristoph Hellwig /* 161585a6e764SChristoph Hellwig * Don't bother locking the AG and looking up in the radix trees 161685a6e764SChristoph Hellwig * if we already know that we have the tag set. 161785a6e764SChristoph Hellwig */ 161891aae6beSDarrick J. Wong if (ip->i_flags & xfs_iflag_for_tag(tag)) 161985a6e764SChristoph Hellwig return; 162085a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 162191aae6beSDarrick J. Wong ip->i_flags |= xfs_iflag_for_tag(tag); 162285a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 162385a6e764SChristoph Hellwig 162427b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 162527b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 162627b52867SBrian Foster 162783104d44SDarrick J. Wong tagged = radix_tree_tagged(&pag->pag_ici_root, tag); 162827b52867SBrian Foster radix_tree_tag_set(&pag->pag_ici_root, 162983104d44SDarrick J. Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 163027b52867SBrian Foster if (!tagged) { 163127b52867SBrian Foster /* propagate the eofblocks tag up into the perag radix tree */ 163227b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 163327b52867SBrian Foster radix_tree_tag_set(&ip->i_mount->m_perag_tree, 163427b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 163583104d44SDarrick J. Wong tag); 163627b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 163727b52867SBrian Foster 1638579b62faSBrian Foster /* kick off background trimming */ 163983104d44SDarrick J. Wong execute(ip->i_mount); 1640579b62faSBrian Foster 164183104d44SDarrick J. Wong set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 164227b52867SBrian Foster } 164327b52867SBrian Foster 164427b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 164527b52867SBrian Foster xfs_perag_put(pag); 164627b52867SBrian Foster } 164727b52867SBrian Foster 164827b52867SBrian Foster void 164983104d44SDarrick J. Wong xfs_inode_set_eofblocks_tag( 165027b52867SBrian Foster xfs_inode_t *ip) 165127b52867SBrian Foster { 165283104d44SDarrick J. Wong trace_xfs_inode_set_eofblocks_tag(ip); 165391aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks, 165483104d44SDarrick J. Wong trace_xfs_perag_set_eofblocks, 165583104d44SDarrick J. Wong XFS_ICI_EOFBLOCKS_TAG); 165683104d44SDarrick J. Wong } 165783104d44SDarrick J. Wong 165883104d44SDarrick J. Wong static void 165991aae6beSDarrick J. Wong __xfs_inode_clear_blocks_tag( 166083104d44SDarrick J. Wong xfs_inode_t *ip, 166183104d44SDarrick J. Wong void (*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno, 166283104d44SDarrick J. Wong int error, unsigned long caller_ip), 166383104d44SDarrick J. Wong int tag) 166483104d44SDarrick J. Wong { 166527b52867SBrian Foster struct xfs_mount *mp = ip->i_mount; 166627b52867SBrian Foster struct xfs_perag *pag; 166727b52867SBrian Foster 166885a6e764SChristoph Hellwig spin_lock(&ip->i_flags_lock); 166991aae6beSDarrick J. Wong ip->i_flags &= ~xfs_iflag_for_tag(tag); 167085a6e764SChristoph Hellwig spin_unlock(&ip->i_flags_lock); 167185a6e764SChristoph Hellwig 167227b52867SBrian Foster pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 167327b52867SBrian Foster spin_lock(&pag->pag_ici_lock); 167427b52867SBrian Foster 167527b52867SBrian Foster radix_tree_tag_clear(&pag->pag_ici_root, 167683104d44SDarrick J. 
Wong XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag); 167783104d44SDarrick J. Wong if (!radix_tree_tagged(&pag->pag_ici_root, tag)) { 167827b52867SBrian Foster /* clear the eofblocks tag from the perag radix tree */ 167927b52867SBrian Foster spin_lock(&ip->i_mount->m_perag_lock); 168027b52867SBrian Foster radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 168127b52867SBrian Foster XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 168283104d44SDarrick J. Wong tag); 168327b52867SBrian Foster spin_unlock(&ip->i_mount->m_perag_lock); 168483104d44SDarrick J. Wong clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_); 168527b52867SBrian Foster } 168627b52867SBrian Foster 168727b52867SBrian Foster spin_unlock(&pag->pag_ici_lock); 168827b52867SBrian Foster xfs_perag_put(pag); 168927b52867SBrian Foster } 169027b52867SBrian Foster 169183104d44SDarrick J. Wong void 169283104d44SDarrick J. Wong xfs_inode_clear_eofblocks_tag( 169383104d44SDarrick J. Wong xfs_inode_t *ip) 169483104d44SDarrick J. Wong { 169583104d44SDarrick J. Wong trace_xfs_inode_clear_eofblocks_tag(ip); 169691aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 169783104d44SDarrick J. Wong trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG); 169883104d44SDarrick J. Wong } 169983104d44SDarrick J. Wong 170083104d44SDarrick J. Wong /* 1701be78ff0eSDarrick J. Wong * Set ourselves up to free CoW blocks from this file. If it's already clean 1702be78ff0eSDarrick J. Wong * then we can bail out quickly, but otherwise we must back off if the file 1703be78ff0eSDarrick J. Wong * is undergoing some kind of write. 1704be78ff0eSDarrick J. Wong */ 1705be78ff0eSDarrick J. Wong static bool 1706be78ff0eSDarrick J. Wong xfs_prep_free_cowblocks( 170751d62690SChristoph Hellwig struct xfs_inode *ip) 1708be78ff0eSDarrick J. Wong { 1709be78ff0eSDarrick J. Wong /* 1710be78ff0eSDarrick J. Wong * Just clear the tag if we have an empty cow fork or none at all. It's 1711be78ff0eSDarrick J. Wong * possible the inode was fully unshared since it was originally tagged. 1712be78ff0eSDarrick J. Wong */ 171351d62690SChristoph Hellwig if (!xfs_inode_has_cow_data(ip)) { 1714be78ff0eSDarrick J. Wong trace_xfs_inode_free_cowblocks_invalid(ip); 1715be78ff0eSDarrick J. Wong xfs_inode_clear_cowblocks_tag(ip); 1716be78ff0eSDarrick J. Wong return false; 1717be78ff0eSDarrick J. Wong } 1718be78ff0eSDarrick J. Wong 1719be78ff0eSDarrick J. Wong /* 1720be78ff0eSDarrick J. Wong * If the mapping is dirty or under writeback we cannot touch the 1721be78ff0eSDarrick J. Wong * CoW fork. Leave it alone if we're in the midst of a directio. 1722be78ff0eSDarrick J. Wong */ 1723be78ff0eSDarrick J. Wong if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || 1724be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || 1725be78ff0eSDarrick J. Wong mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || 1726be78ff0eSDarrick J. Wong atomic_read(&VFS_I(ip)->i_dio_count)) 1727be78ff0eSDarrick J. Wong return false; 1728be78ff0eSDarrick J. Wong 1729be78ff0eSDarrick J. Wong return true; 1730be78ff0eSDarrick J. Wong } 1731be78ff0eSDarrick J. Wong 1732be78ff0eSDarrick J. Wong /* 173383104d44SDarrick J. Wong * Automatic CoW Reservation Freeing 173483104d44SDarrick J. Wong * 173583104d44SDarrick J. Wong * These functions automatically garbage collect leftover CoW reservations 173683104d44SDarrick J. Wong * that were made on behalf of a cowextsize hint when we start to run out 173783104d44SDarrick J. Wong * of quota or when the reservations sit around for too long. 
If the file 173883104d44SDarrick J. Wong * has dirty pages or is undergoing writeback, its CoW reservations will 173983104d44SDarrick J. Wong * be retained. 174083104d44SDarrick J. Wong * 174183104d44SDarrick J. Wong * The actual garbage collection piggybacks off the same code that runs 174283104d44SDarrick J. Wong * the speculative EOF preallocation garbage collector. 174383104d44SDarrick J. Wong */ 174483104d44SDarrick J. Wong STATIC int 174583104d44SDarrick J. Wong xfs_inode_free_cowblocks( 174683104d44SDarrick J. Wong struct xfs_inode *ip, 174783104d44SDarrick J. Wong void *args) 174883104d44SDarrick J. Wong { 174983104d44SDarrick J. Wong struct xfs_eofblocks *eofb = args; 1750be78ff0eSDarrick J. Wong int match; 1751be78ff0eSDarrick J. Wong int ret = 0; 175283104d44SDarrick J. Wong 175351d62690SChristoph Hellwig if (!xfs_prep_free_cowblocks(ip)) 175483104d44SDarrick J. Wong return 0; 175583104d44SDarrick J. Wong 175683104d44SDarrick J. Wong if (eofb) { 175783104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 175883104d44SDarrick J. Wong match = xfs_inode_match_id_union(ip, eofb); 175983104d44SDarrick J. Wong else 176083104d44SDarrick J. Wong match = xfs_inode_match_id(ip, eofb); 176183104d44SDarrick J. Wong if (!match) 176283104d44SDarrick J. Wong return 0; 176383104d44SDarrick J. Wong 176483104d44SDarrick J. Wong /* skip the inode if the file size is too small */ 176583104d44SDarrick J. Wong if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 176683104d44SDarrick J. Wong XFS_ISIZE(ip) < eofb->eof_min_file_size) 176783104d44SDarrick J. Wong return 0; 176883104d44SDarrick J. Wong } 176983104d44SDarrick J. Wong 177083104d44SDarrick J. Wong /* Free the CoW blocks */ 177183104d44SDarrick J. Wong xfs_ilock(ip, XFS_IOLOCK_EXCL); 177283104d44SDarrick J. Wong xfs_ilock(ip, XFS_MMAPLOCK_EXCL); 177383104d44SDarrick J. Wong 1774be78ff0eSDarrick J. Wong /* 1775be78ff0eSDarrick J. Wong * Check again, nobody else should be able to dirty blocks or change 1776be78ff0eSDarrick J. Wong * the reflink iflag now that we have the first two locks held. 1777be78ff0eSDarrick J. Wong */ 177851d62690SChristoph Hellwig if (xfs_prep_free_cowblocks(ip)) 17793802a345SChristoph Hellwig ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false); 178083104d44SDarrick J. Wong 178183104d44SDarrick J. Wong xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); 178283104d44SDarrick J. Wong xfs_iunlock(ip, XFS_IOLOCK_EXCL); 178383104d44SDarrick J. Wong 178483104d44SDarrick J. Wong return ret; 178583104d44SDarrick J. Wong } 178683104d44SDarrick J. Wong 178783104d44SDarrick J. Wong int 178883104d44SDarrick J. Wong xfs_icache_free_cowblocks( 178983104d44SDarrick J. Wong struct xfs_mount *mp, 179083104d44SDarrick J. Wong struct xfs_eofblocks *eofb) 179183104d44SDarrick J. Wong { 179283104d44SDarrick J. Wong return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks, 179383104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 179483104d44SDarrick J. Wong } 179583104d44SDarrick J. Wong 179683104d44SDarrick J. Wong int 179783104d44SDarrick J. Wong xfs_inode_free_quota_cowblocks( 179883104d44SDarrick J. Wong struct xfs_inode *ip) 179983104d44SDarrick J. Wong { 180083104d44SDarrick J. Wong return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks); 180183104d44SDarrick J. Wong } 180283104d44SDarrick J. Wong 180383104d44SDarrick J. Wong void 180483104d44SDarrick J. Wong xfs_inode_set_cowblocks_tag( 180583104d44SDarrick J. Wong xfs_inode_t *ip) 180683104d44SDarrick J. 
Wong { 18077b7381f0SBrian Foster trace_xfs_inode_set_cowblocks_tag(ip); 180891aae6beSDarrick J. Wong return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks, 18097b7381f0SBrian Foster trace_xfs_perag_set_cowblocks, 181083104d44SDarrick J. Wong XFS_ICI_COWBLOCKS_TAG); 181183104d44SDarrick J. Wong } 181283104d44SDarrick J. Wong 181383104d44SDarrick J. Wong void 181483104d44SDarrick J. Wong xfs_inode_clear_cowblocks_tag( 181583104d44SDarrick J. Wong xfs_inode_t *ip) 181683104d44SDarrick J. Wong { 18177b7381f0SBrian Foster trace_xfs_inode_clear_cowblocks_tag(ip); 181891aae6beSDarrick J. Wong return __xfs_inode_clear_blocks_tag(ip, 18197b7381f0SBrian Foster trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG); 182083104d44SDarrick J. Wong } 1821d6b636ebSDarrick J. Wong 1822d6b636ebSDarrick J. Wong /* Disable post-EOF and CoW block auto-reclamation. */ 1823d6b636ebSDarrick J. Wong void 1824ed30dcbdSDarrick J. Wong xfs_stop_block_reaping( 1825d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1826d6b636ebSDarrick J. Wong { 1827d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_eofblocks_work); 1828d6b636ebSDarrick J. Wong cancel_delayed_work_sync(&mp->m_cowblocks_work); 1829d6b636ebSDarrick J. Wong } 1830d6b636ebSDarrick J. Wong 1831d6b636ebSDarrick J. Wong /* Enable post-EOF and CoW block auto-reclamation. */ 1832d6b636ebSDarrick J. Wong void 1833ed30dcbdSDarrick J. Wong xfs_start_block_reaping( 1834d6b636ebSDarrick J. Wong struct xfs_mount *mp) 1835d6b636ebSDarrick J. Wong { 1836d6b636ebSDarrick J. Wong xfs_queue_eofblocks(mp); 1837d6b636ebSDarrick J. Wong xfs_queue_cowblocks(mp); 1838d6b636ebSDarrick J. Wong } 1839
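/*
 * Illustrative sketch (standalone userspace C, not part of this file): the
 * reclaim decision table documented above xfs_reclaim_inode() can be read as
 * a small decision function. All names below are invented for illustration
 * only; they mirror the "order of actions after gaining the locks" list
 * rather than the kernel implementation itself.
 */
#include <stdbool.h>
#include <stdio.h>

enum reclaim_action {
	RA_RECLAIM,		/* tear the inode down now */
	RA_UNPIN_AND_RECLAIM,	/* wait for log I/O to unpin, then tear down */
	RA_REQUEUE,		/* leave it for a later reclaim pass */
	RA_FLUSH_WAIT_RECLAIM	/* write it back, wait, then tear down */
};

struct reclaim_state {
	bool bad;	/* inode is in a bad/unrecoverable state */
	bool shutdown;	/* filesystem has been shut down */
	bool pinned;	/* inode is still pinned by the log */
	bool stale;	/* inode cluster has already been freed */
	bool dirty;	/* inode has unflushed changes */
	bool sync;	/* SYNC_WAIT-style (blocking) scan */
};

static enum reclaim_action reclaim_decision(const struct reclaim_state *s)
{
	if (s->bad)
		return RA_RECLAIM;
	if (s->shutdown)
		return RA_UNPIN_AND_RECLAIM;
	if (s->pinned && !s->sync)
		return RA_REQUEUE;
	/* pinned + sync: unpin first, then fall through and re-evaluate */
	if (s->stale || !s->dirty)
		return RA_RECLAIM;
	return s->sync ? RA_FLUSH_WAIT_RECLAIM : RA_REQUEUE;
}

int main(void)
{
	/* dirty + async => requeue, matching the table above */
	struct reclaim_state s = { .dirty = true, .sync = false };

	printf("action = %d\n", reclaim_decision(&s));
	return 0;
}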