// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"
#include "xfs_ag.h"

#include <linux/iversion.h>

/* Radix tree tags for incore inode tree. */

/* inode is to be reclaimed */
#define XFS_ICI_RECLAIM_TAG	0
/* Inode has speculative preallocations (posteof or cow) to clean. */
#define XFS_ICI_BLOCKGC_TAG	1

/*
 * The goal for walking incore inodes.  These can correspond with incore inode
 * radix tree tags when convenient.  Avoid existing XFS_IWALK namespace.
 */
enum xfs_icwalk_goal {
	/* Goals directly associated with tagged inodes. */
	XFS_ICWALK_BLOCKGC	= XFS_ICI_BLOCKGC_TAG,
	XFS_ICWALK_RECLAIM	= XFS_ICI_RECLAIM_TAG,
};

static int xfs_icwalk(struct xfs_mount *mp,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);
static int xfs_icwalk_ag(struct xfs_perag *pag,
		enum xfs_icwalk_goal goal, struct xfs_icwalk *icw);

/*
 * Private inode cache walk flags for struct xfs_icwalk.  Must not
 * coincide with XFS_ICWALK_FLAGS_VALID.
 */

/* Stop scanning after icw_scan_limit inodes. */
#define XFS_ICWALK_FLAG_SCAN_LIMIT	(1U << 28)

#define XFS_ICWALK_FLAG_RECLAIM_SICK	(1U << 27)
#define XFS_ICWALK_FLAG_UNION		(1U << 26) /* union filter algorithm */

#define XFS_ICWALK_PRIVATE_FLAGS	(XFS_ICWALK_FLAG_SCAN_LIMIT | \
					 XFS_ICWALK_FLAG_RECLAIM_SICK | \
					 XFS_ICWALK_FLAG_UNION)
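
/*
 * Note: the non-overlap between the private flags above and
 * XFS_ICWALK_FLAGS_VALID is checked at compile time by the BUILD_BUG_ON()
 * at the bottom of xfs_icwalk().
 */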

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_cache, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_cache, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode or i_state! */
	VFS_I(ip)->i_mode = 0;
	VFS_I(ip)->i_state = 0;
	mapping_set_large_folios(VFS_I(ip)->i_mapping);

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	ip->i_diflags2 = mp->m_ino_geo.new_diflags2;
	ip->i_nblocks = 0;
	ip->i_forkoff = 0;
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	if (ip->i_afp) {
		xfs_idestroy_fork(ip->i_afp);
		kmem_cache_free(xfs_ifork_cache, ip->i_afp);
	}
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_cache, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_cache, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_iflags_test(ip, XFS_IFLUSHING));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state.  The ip->i_flags_lock provides the barrier against
	 * lookup races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and
 * there isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}
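
/*
 * The delay above is (xfs_syncd_centisecs / 6) converted to milliseconds,
 * i.e. background reclaim reschedules at one sixth of the syncd interval
 * (every 5 seconds with the default of 3000 centiseconds).
 */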

/*
 * Background scanning to trim preallocated space.  This is queued based on the
 * 'speculative_prealloc_lifetime' tunable (5m by default).
 */
static inline void
xfs_blockgc_queue(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	if (!xfs_is_blockgc_enabled(mp))
		return;

	rcu_read_lock();
	if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG))
		queue_delayed_work(pag->pag_mount->m_blockgc_wq,
				   &pag->pag_blockgc_work,
				   msecs_to_jiffies(xfs_blockgc_secs * 1000));
	rcu_read_unlock();
}

/* Set a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_set_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	bool			was_tagged;

	lockdep_assert_held(&pag->pag_ici_lock);

	was_tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root, agino, tag);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable++;

	if (was_tagged)
		return;

	/* propagate the tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	/* start background work */
	switch (tag) {
	case XFS_ICI_RECLAIM_TAG:
		xfs_reclaim_work_queue(mp);
		break;
	case XFS_ICI_BLOCKGC_TAG:
		xfs_blockgc_queue(pag);
		break;
	}

	trace_xfs_perag_set_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}

/* Clear a tag on both the AG incore inode tree and the AG radix tree. */
static void
xfs_perag_clear_inode_tag(
	struct xfs_perag	*pag,
	xfs_agino_t		agino,
	unsigned int		tag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);

	/*
	 * Reclaim can signal (with a null agino) that it cleared its own tag
	 * by removing the inode from the radix tree.
	 */
	if (agino != NULLAGINO)
		radix_tree_tag_clear(&pag->pag_ici_root, agino, tag);
	else
		ASSERT(tag == XFS_ICI_RECLAIM_TAG);

	if (tag == XFS_ICI_RECLAIM_TAG)
		pag->pag_ici_reclaimable--;

	if (radix_tree_tagged(&pag->pag_ici_root, tag))
		return;

	/* clear the tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, tag);
	spin_unlock(&mp->m_perag_lock);

	trace_xfs_perag_clear_inode_tag(mp, pag->pag_agno, tag, _RET_IP_);
}
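
/*
 * Callers of the two helpers above follow this pattern (illustrative sketch,
 * matching xfs_blockgc_set_iflag() and xfs_inodegc_set_reclaimable() below):
 *
 *	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 *	spin_lock(&pag->pag_ici_lock);
 *	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino), tag);
 *	spin_unlock(&pag->pag_ici_lock);
 *	xfs_perag_put(pag);
 */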

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure.  This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally.  Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	mapping_set_large_folios(inode->i_mapping);
	return error;
}

/*
 * Carefully nudge an inode whose VFS state has been torn down back into a
 * usable state.  Drops the i_flags_lock and the rcu read lock.
 */
static int
xfs_iget_recycle(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip) __releases(&ip->i_flags_lock)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct inode		*inode = VFS_I(ip);
	int			error;

	trace_xfs_iget_recycle(ip);

	/*
	 * We need to make it look like the inode is being reclaimed to prevent
	 * the actual reclaim workers from stomping over us while we recycle
	 * the inode.  We can't clear the radix tree tag yet as it requires
	 * pag_ici_lock to be held exclusive.
	 */
	ip->i_flags |= XFS_IRECLAIM;

	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();

	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
	error = xfs_reinit_inode(mp, inode);
	if (error) {
		/*
		 * Re-initializing the inode failed, and we are in deep
		 * trouble.  Try to re-add it to the reclaim list.
		 */
		rcu_read_lock();
		spin_lock(&ip->i_flags_lock);
		ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
		ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		trace_xfs_iget_recycle_fail(ip);
		return error;
	}

	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	/*
	 * Clear the per-lifetime state in the inode as we are now effectively
	 * a new inode and need to return to the initial state before reuse
	 * occurs.
	 */
	ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
	ip->i_flags |= XFS_INEW;
	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);
	inode->i_state = I_NEW;
	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);

	return 0;
}

/*
 * If we are allocating a new inode, then check that what was returned is
 * actually a free, empty inode.  If we are not allocating an inode, then
 * check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/* Make all pending inactivation work start immediately. */
static void
xfs_inodegc_queue_all(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
	}
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet.  We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino)
		goto out_skip;

	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * If we're racing with the inactivation worker we also want to wait.
	 * If we're creating a new file, it's possible that the worker
	 * previously marked the inode as free on disk but hasn't finished
	 * updating the incore state yet.  The AGI buffer will be dirty and
	 * locked to the icreate transaction, so a synchronous push of the
	 * inodegc workers would result in deadlock.  For a regular iget, the
	 * worker is running already, so we might as well wait.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW | XFS_IRECLAIM | XFS_INACTIVATING))
		goto out_skip;

	if (ip->i_flags & XFS_NEED_INACTIVE) {
		/* Unlinked inodes cannot be re-grabbed. */
		if (VFS_I(ip)->i_nlink == 0) {
			error = -ENOENT;
			goto out_error;
		}
		goto out_inodegc_flush;
	}

	/*
	 * Check the inode free state is valid.  This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/* Skip inodes that have no vfs state. */
	if ((flags & XFS_IGET_INCORE) &&
	    (ip->i_flags & XFS_IRECLAIMABLE))
		goto out_skip;

	/* The inode fits the selection criteria; process it. */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		/* Drops i_flags_lock and RCU read lock. */
		error = xfs_iget_recycle(pag, ip);
		if (error)
			return error;
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode))
			goto out_skip;

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_skip:
	trace_xfs_iget_skip(ip);
	XFS_STATS_INC(mp, xs_ig_frecycle);
	error = -EAGAIN;
out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;

out_inodegc_flush:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	/*
	 * Do not wait for the workers, because the caller could hold an AGI
	 * buffer lock.  We're just going to sleep in a loop anyway.
	 */
	if (xfs_is_inodegc_enabled(mp))
		xfs_inodegc_queue_all(mp);
	return -EAGAIN;
}

static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_FEAT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the i_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_has_v3inodes(mp) &&
	    (flags & XFS_IGET_CREATE) && !xfs_has_ikeep(mp)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_buf	*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip,
				xfs_buf_offset(bp, ip->i_imap.im_boffset));
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid.  This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock.  Note that we cannot sleep inside the preload
	 * region.  Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.  The inode is looked up
 * in the cache held in each AG.  If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path.  Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.  If it's a new inode being created, xfs_init_new_inode will
	 * handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}
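
/*
 * Typical caller pattern for xfs_iget() (illustrative sketch only; real
 * callers vary in the flags and lock_flags they pass):
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_SHARED, &ip);
 *	if (error)
 *		return error;
 *	...operate on ip...
 *	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 *	xfs_irele(ip);
 */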

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.  This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget().  In both cases, the inode will have XFS_IRECLAIM set.  If the
 * inode has been fully recycled by the time we get the i_flags_lock,
 * XFS_IRECLAIMABLE will not be set.  Hence we need to check for both these
 * flag conditions to avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided.  Those races should be resolved only after we
 * have ensured that we are able to reclaim this inode and the world can see
 * that we are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_igrab(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	/* Don't reclaim a sick inode unless the caller asked for it. */
	if (ip->i_sick &&
	    (!icw || !(icw->icw_flags & XFS_ICWALK_FLAG_RECLAIM_SICK))) {
		spin_unlock(&ip->i_flags_lock);
		return false;
	}

	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}
863 */ 864 static void 865 xfs_reclaim_inode( 866 struct xfs_inode *ip, 867 struct xfs_perag *pag) 868 { 869 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 870 871 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) 872 goto out; 873 if (xfs_iflags_test_and_set(ip, XFS_IFLUSHING)) 874 goto out_iunlock; 875 876 if (xfs_is_shutdown(ip->i_mount)) { 877 xfs_iunpin_wait(ip); 878 xfs_iflush_abort(ip); 879 goto reclaim; 880 } 881 if (xfs_ipincount(ip)) 882 goto out_clear_flush; 883 if (!xfs_inode_clean(ip)) 884 goto out_clear_flush; 885 886 xfs_iflags_clear(ip, XFS_IFLUSHING); 887 reclaim: 888 trace_xfs_inode_reclaiming(ip); 889 890 /* 891 * Because we use RCU freeing we need to ensure the inode always appears 892 * to be reclaimed with an invalid inode number when in the free state. 893 * We do this as early as possible under the ILOCK so that 894 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to 895 * detect races with us here. By doing this, we guarantee that once 896 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that 897 * it will see either a valid inode that will serialise correctly, or it 898 * will see an invalid inode that it can skip. 899 */ 900 spin_lock(&ip->i_flags_lock); 901 ip->i_flags = XFS_IRECLAIM; 902 ip->i_ino = 0; 903 ip->i_sick = 0; 904 ip->i_checked = 0; 905 spin_unlock(&ip->i_flags_lock); 906 907 xfs_iunlock(ip, XFS_ILOCK_EXCL); 908 909 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 910 /* 911 * Remove the inode from the per-AG radix tree. 912 * 913 * Because radix_tree_delete won't complain even if the item was never 914 * added to the tree assert that it's been there before to catch 915 * problems with the inode life time early on. 916 */ 917 spin_lock(&pag->pag_ici_lock); 918 if (!radix_tree_delete(&pag->pag_ici_root, 919 XFS_INO_TO_AGINO(ip->i_mount, ino))) 920 ASSERT(0); 921 xfs_perag_clear_inode_tag(pag, NULLAGINO, XFS_ICI_RECLAIM_TAG); 922 spin_unlock(&pag->pag_ici_lock); 923 924 /* 925 * Here we do an (almost) spurious inode lock in order to coordinate 926 * with inode cache radix tree lookups. This is because the lookup 927 * can reference the inodes in the cache without taking references. 928 * 929 * We make that OK here by ensuring that we wait until the inode is 930 * unlocked after the lookup before we go ahead and free it. 931 */ 932 xfs_ilock(ip, XFS_ILOCK_EXCL); 933 ASSERT(!ip->i_udquot && !ip->i_gdquot && !ip->i_pdquot); 934 xfs_iunlock(ip, XFS_ILOCK_EXCL); 935 ASSERT(xfs_inode_clean(ip)); 936 937 __xfs_inode_free(ip); 938 return; 939 940 out_clear_flush: 941 xfs_iflags_clear(ip, XFS_IFLUSHING); 942 out_iunlock: 943 xfs_iunlock(ip, XFS_ILOCK_EXCL); 944 out: 945 xfs_iflags_clear(ip, XFS_IRECLAIM); 946 } 947 948 /* Reclaim sick inodes if we're unmounting or the fs went down. */ 949 static inline bool 950 xfs_want_reclaim_sick( 951 struct xfs_mount *mp) 952 { 953 return xfs_is_unmounting(mp) || xfs_has_norecovery(mp) || 954 xfs_is_shutdown(mp); 955 } 956 957 void 958 xfs_reclaim_inodes( 959 struct xfs_mount *mp) 960 { 961 struct xfs_icwalk icw = { 962 .icw_flags = 0, 963 }; 964 965 if (xfs_want_reclaim_sick(mp)) 966 icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK; 967 968 while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 969 xfs_ail_push_all_sync(mp->m_ail); 970 xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw); 971 } 972 } 973 974 /* 975 * The shrinker infrastructure determines how many inodes we should scan for 976 * reclaim. We want as many clean inodes ready to reclaim as possible, so we 977 * push the AIL here. 

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim.  We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here.  We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do, so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	unsigned long		nr_to_scan)
{
	struct xfs_icwalk	icw = {
		.icw_flags	= XFS_ICWALK_FLAG_SCAN_LIMIT,
		.icw_scan_limit	= min_t(unsigned long, LONG_MAX, nr_to_scan),
	};

	if (xfs_want_reclaim_sick(mp))
		icw.icw_flags |= XFS_ICWALK_FLAG_RECLAIM_SICK;

	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, &icw);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
long
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	long			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC bool
xfs_icwalk_match_id(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return false;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid != icw->icw_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm.  Process the inode if any of the
 * criteria match.  This is for global/internal scans only.
 */
STATIC bool
xfs_icwalk_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	if ((icw->icw_flags & XFS_ICWALK_FLAG_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, icw->icw_uid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, icw->icw_gid))
		return true;

	if ((icw->icw_flags & XFS_ICWALK_FLAG_PRID) &&
	    ip->i_projid == icw->icw_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @icw?  The inode is eligible if @icw is null or
 * if the predicate functions match.
 */
static bool
xfs_icwalk_match(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	bool			match;

	if (!icw)
		return true;

	if (icw->icw_flags & XFS_ICWALK_FLAG_UNION)
		match = xfs_icwalk_match_id_union(ip, icw);
	else
		match = xfs_icwalk_match_id(ip, icw);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((icw->icw_flags & XFS_ICWALK_FLAG_MINFILESIZE) &&
	    XFS_ISIZE(ip) < icw->icw_min_file_size)
		return false;

	return true;
}
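
/*
 * Example filter (illustrative sketch): a union scan that matches any inode
 * owned by a given uid or gid and runs synchronously would be set up as:
 *
 *	struct xfs_icwalk	icw = {
 *		.icw_flags	= XFS_ICWALK_FLAG_UNION | XFS_ICWALK_FLAG_UID |
 *				  XFS_ICWALK_FLAG_GID | XFS_ICWALK_FLAG_SYNC,
 *		.icw_uid	= uid,
 *		.icw_gid	= gid,
 *	};
 *
 * xfs_blockgc_free_dquots() below builds exactly this kind of filter.
 */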

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time.  It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct	*work)
{
	struct xfs_mount	*mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_icwalk(mp, XFS_ICWALK_RECLAIM, NULL);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_IEOFBLOCKS))
		return 0;

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time.  Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (xfs_can_free_eofblocks(ip, false))
		return xfs_free_eofblocks(ip);

	/* inode could be preallocated or append-only */
	trace_xfs_inode_free_eofblocks_invalid(ip);
	xfs_inode_clear_eofblocks_tag(ip);
	return 0;
}

static void
xfs_blockgc_set_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & iflag)
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= iflag;
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_IEOFBLOCKS);
}

static void
xfs_blockgc_clear_iflag(
	struct xfs_inode	*ip,
	unsigned long		iflag)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;
	bool			clear_tag;

	ASSERT((iflag & ~(XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0);

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~iflag;
	clear_tag = (ip->i_flags & (XFS_IEOFBLOCKS | XFS_ICOWBLOCKS)) == 0;
	spin_unlock(&ip->i_flags_lock);

	if (!clear_tag)
		return;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	xfs_perag_clear_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_BLOCKGC_TAG);

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_IEOFBLOCKS);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all.  It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw,
	unsigned int		*lockflags)
{
	bool			wait;
	int			ret = 0;

	wait = icw && (icw->icw_flags & XFS_ICWALK_FLAG_SYNC);

	if (!xfs_iflags_test(ip, XFS_ICOWBLOCKS))
		return 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_icwalk_match(ip, icw))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!(*lockflags & XFS_IOLOCK_EXCL) &&
	    !xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_IOLOCK_EXCL;

	if (!xfs_ilock_nowait(ip, XFS_MMAPLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}
	*lockflags |= XFS_MMAPLOCK_EXCL;

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);
	return ret;
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return xfs_blockgc_set_iflag(ip, XFS_ICOWBLOCKS);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return xfs_blockgc_clear_iflag(ip, XFS_ICOWBLOCKS);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_stop(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (!xfs_clear_blockgc_enabled(mp))
		return;

	for_each_perag(mp, agno, pag)
		cancel_delayed_work_sync(&pag->pag_blockgc_work);
	trace_xfs_blockgc_stop(mp, __return_address);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_blockgc_start(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	if (xfs_set_blockgc_enabled(mp))
		return;

	trace_xfs_blockgc_start(mp, __return_address);
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		xfs_blockgc_queue(pag);
}

/* Don't try to run block gc on an inode that's in any of these states. */
#define XFS_BLOCKGC_NOGRAB_IFLAGS	(XFS_INEW | \
					 XFS_NEED_INACTIVE | \
					 XFS_INACTIVATING | \
					 XFS_IRECLAIMABLE | \
					 XFS_IRECLAIM)
/*
 * Decide if the given @ip is eligible for garbage collection of speculative
 * preallocations, and grab it if so.  Returns true if it's ready to go or
 * false if we should just ignore it.
 */
static bool
xfs_blockgc_igrab(
	struct xfs_inode	*ip)
{
	struct inode		*inode = VFS_I(ip);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	if (ip->i_flags & XFS_BLOCKGC_NOGRAB_IFLAGS)
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (xfs_is_shutdown(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/* Scan one incore inode for block preallocations that we can remove. */
static int
xfs_blockgc_scan_inode(
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	unsigned int		lockflags = 0;
	int			error;

	error = xfs_inode_free_eofblocks(ip, icw, &lockflags);
	if (error)
		goto unlock;

	error = xfs_inode_free_cowblocks(ip, icw, &lockflags);
unlock:
	if (lockflags)
		xfs_iunlock(ip, lockflags);
	xfs_irele(ip);
	return error;
}

/* Background worker that trims preallocated space. */
void
xfs_blockgc_worker(
	struct work_struct	*work)
{
	struct xfs_perag	*pag = container_of(to_delayed_work(work),
					struct xfs_perag, pag_blockgc_work);
	struct xfs_mount	*mp = pag->pag_mount;
	int			error;

	trace_xfs_blockgc_worker(mp, __return_address);

	error = xfs_icwalk_ag(pag, XFS_ICWALK_BLOCKGC, NULL);
	if (error)
		xfs_info(mp, "AG %u preallocation gc worker failed, err=%d",
				pag->pag_agno, error);
	xfs_blockgc_queue(pag);
}

/*
 * Try to free space in the filesystem by purging inactive inodes, eofblocks
 * and cowblocks.
 */
int
xfs_blockgc_free_space(
	struct xfs_mount	*mp,
	struct xfs_icwalk	*icw)
{
	int			error;

	trace_xfs_blockgc_free_space(mp, icw, _RET_IP_);

	error = xfs_icwalk(mp, XFS_ICWALK_BLOCKGC, icw);
	if (error)
		return error;

	xfs_inodegc_flush(mp);
	return 0;
}

/*
 * Reclaim all the free space that we can by scheduling the background blockgc
 * and inodegc workers immediately and waiting for them all to clear.
 */
void
xfs_blockgc_flush_all(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno;

	trace_xfs_blockgc_flush_all(mp, __return_address);

	/*
	 * For each blockgc worker, move its queue time up to now.  If it
	 * wasn't queued, it will not be requeued.  Then flush whatever's
	 * left.
	 */
	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		mod_delayed_work(pag->pag_mount->m_blockgc_wq,
				&pag->pag_blockgc_work, 0);

	for_each_perag_tag(mp, agno, pag, XFS_ICI_BLOCKGC_TAG)
		flush_delayed_work(&pag->pag_blockgc_work);

	xfs_inodegc_flush(mp);
}

/*
 * Run cow/eofblocks scans on the supplied dquots.  We don't know exactly which
 * quota caused an allocation failure, so we make a best effort by including
 * each quota under low free space conditions (less than 1% free space) in the
 * scan.
 *
 * Callers must not hold any inode's ILOCK.  If requesting a synchronous scan
 * (XFS_ICWALK_FLAG_SYNC), the caller also must not hold any inode's IOLOCK or
 * MMAPLOCK.
 */
int
xfs_blockgc_free_dquots(
	struct xfs_mount	*mp,
	struct xfs_dquot	*udqp,
	struct xfs_dquot	*gdqp,
	struct xfs_dquot	*pdqp,
	unsigned int		iwalk_flags)
{
	struct xfs_icwalk	icw = {0};
	bool			do_work = false;

	if (!udqp && !gdqp && !pdqp)
		return 0;

	/*
	 * Run a scan to free blocks using the union filter to cover all
	 * applicable quotas in a single scan.
	 */
	icw.icw_flags = XFS_ICWALK_FLAG_UNION | iwalk_flags;

	if (XFS_IS_UQUOTA_ENFORCED(mp) && udqp && xfs_dquot_lowsp(udqp)) {
		icw.icw_uid = make_kuid(mp->m_super->s_user_ns, udqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_UID;
		do_work = true;
	}

	if (XFS_IS_GQUOTA_ENFORCED(mp) && gdqp && xfs_dquot_lowsp(gdqp)) {
		icw.icw_gid = make_kgid(mp->m_super->s_user_ns, gdqp->q_id);
		icw.icw_flags |= XFS_ICWALK_FLAG_GID;
		do_work = true;
	}

	if (XFS_IS_PQUOTA_ENFORCED(mp) && pdqp && xfs_dquot_lowsp(pdqp)) {
		icw.icw_prid = pdqp->q_id;
		icw.icw_flags |= XFS_ICWALK_FLAG_PRID;
		do_work = true;
	}

	if (!do_work)
		return 0;

	return xfs_blockgc_free_space(mp, &icw);
}

/* Run cow/eofblocks scans on the quotas attached to the inode. */
int
xfs_blockgc_free_quota(
	struct xfs_inode	*ip,
	unsigned int		iwalk_flags)
{
	return xfs_blockgc_free_dquots(ip->i_mount,
			xfs_inode_dquot(ip, XFS_DQTYPE_USER),
			xfs_inode_dquot(ip, XFS_DQTYPE_GROUP),
			xfs_inode_dquot(ip, XFS_DQTYPE_PROJ), iwalk_flags);
}
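
/*
 * Illustrative caller sketch (not a function in this file): allocation paths
 * that hit -EDQUOT or -ENOSPC typically retry once after kicking blockgc, in
 * the spirit of the transaction allocation helpers:
 *
 *	error = do_allocation(...);
 *	if ((error == -EDQUOT || error == -ENOSPC) && !cleared_space) {
 *		xfs_blockgc_free_quota(ip, 0);
 *		cleared_space = true;
 *		goto retry;
 *	}
 */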

/* XFS Inode Cache Walking Code */

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum.  The batch size is a trade off between
 * lookup reduction and stack usage.  This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32


/*
 * Decide if we want to grab this inode in anticipation of doing work towards
 * the goal.
 */
static inline bool
xfs_icwalk_igrab(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_icwalk	*icw)
{
	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		return xfs_blockgc_igrab(ip);
	case XFS_ICWALK_RECLAIM:
		return xfs_reclaim_igrab(ip, icw);
	default:
		return false;
	}
}

/*
 * Process an inode.  Each processing function must handle any state changes
 * made by the icwalk igrab function.  Return -EAGAIN to skip an inode.
 */
static inline int
xfs_icwalk_process_inode(
	enum xfs_icwalk_goal	goal,
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	struct xfs_icwalk	*icw)
{
	int			error = 0;

	switch (goal) {
	case XFS_ICWALK_BLOCKGC:
		error = xfs_blockgc_scan_inode(ip, icw);
		break;
	case XFS_ICWALK_RECLAIM:
		xfs_reclaim_inode(ip, pag);
		break;
	}
	return error;
}

/*
 * For a given per-AG structure @pag and a goal, grab qualifying inodes and
 * process them in some manner.
 */
static int
xfs_icwalk_ag(
	struct xfs_perag	*pag,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	if (goal == XFS_ICWALK_RECLAIM)
		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
	else
		first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
				(void **) batch, first_index,
				XFS_LOOKUP_BATCH, goal);
		if (!nr_found) {
			done = true;
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock.  If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_icwalk_igrab(goal, ip, icw))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup.  Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG.  It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			error = xfs_icwalk_process_inode(goal, batch[i], pag,
					icw);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

		if (icw && (icw->icw_flags & XFS_ICWALK_FLAG_SCAN_LIMIT)) {
			icw->icw_scan_limit -= XFS_LOOKUP_BATCH;
			if (icw->icw_scan_limit <= 0)
				break;
		}
	} while (nr_found && !done);

	if (goal == XFS_ICWALK_RECLAIM) {
		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
	}

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Walk all incore inodes to achieve a given goal. */
static int
xfs_icwalk(
	struct xfs_mount	*mp,
	enum xfs_icwalk_goal	goal,
	struct xfs_icwalk	*icw)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		agno;

	for_each_perag_tag(mp, agno, pag, goal) {
		error = xfs_icwalk_ag(pag, goal, icw);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED) {
				xfs_perag_put(pag);
				break;
			}
		}
	}
	return last_error;
	BUILD_BUG_ON(XFS_ICWALK_PRIVATE_FLAGS & XFS_ICWALK_FLAGS_VALID);
}

#ifdef DEBUG
static void
xfs_check_delalloc(
	struct xfs_inode	*ip,
	int			whichfork)
{
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
	struct xfs_bmbt_irec	got;
	struct xfs_iext_cursor	icur;

	if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
		return;
	do {
		if (isnullstartblock(got.br_startblock)) {
			xfs_warn(ip->i_mount,
	"ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
				ip->i_ino,
				whichfork == XFS_DATA_FORK ? "data" : "cow",
				got.br_startoff, got.br_blockcount);
		}
	} while (xfs_iext_next_extent(ifp, &icur, &got));
}
#else
#define xfs_check_delalloc(ip, whichfork)	do { } while (0)
#endif

/* Schedule the inode for reclaim. */
static void
xfs_inodegc_set_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	if (!xfs_is_shutdown(mp) && ip->i_delayed_blks) {
		xfs_check_delalloc(ip, XFS_DATA_FORK);
		xfs_check_delalloc(ip, XFS_COW_FORK);
		ASSERT(0);
	}

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	trace_xfs_inode_set_reclaimable(ip);
	ip->i_flags &= ~(XFS_NEED_INACTIVE | XFS_INACTIVATING);
	ip->i_flags |= XFS_IRECLAIMABLE;
	xfs_perag_set_inode_tag(pag, XFS_INO_TO_AGINO(mp, ip->i_ino),
			XFS_ICI_RECLAIM_TAG);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

/*
 * Free all speculative preallocations and possibly even the inode itself.
 * This is the last chance to make changes to an otherwise unreferenced file
 * before incore reclamation happens.
 */
static void
xfs_inodegc_inactivate(
	struct xfs_inode	*ip)
{
	trace_xfs_inode_inactivating(ip);
	xfs_inactive(ip);
	xfs_inodegc_set_reclaimable(ip);
}

void
xfs_inodegc_worker(
	struct work_struct	*work)
{
	struct xfs_inodegc	*gc = container_of(work, struct xfs_inodegc,
							work);
	struct llist_node	*node = llist_del_all(&gc->list);
	struct xfs_inode	*ip, *n;

	WRITE_ONCE(gc->items, 0);

	if (!node)
		return;

	ip = llist_entry(node, struct xfs_inode, i_gclist);
	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));

	WRITE_ONCE(gc->shrinker_hits, 0);
	llist_for_each_entry_safe(ip, n, node, i_gclist) {
		xfs_iflags_set(ip, XFS_INACTIVATING);
		xfs_inodegc_inactivate(ip);
	}
}

/*
 * Force all currently queued inode inactivation work to run immediately, and
 * wait for the work to finish.  This is done in two passes: queue all the
 * work in the first pass, then wait for it in the second.
 */
void
xfs_inodegc_flush(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_flush(mp, __return_address);

	xfs_inodegc_queue_all(mp);

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		flush_work(&gc->work);
	}
}

/*
 * Flush all the pending work and then disable the inode inactivation background
 * workers and wait for them to stop.
 */
void
xfs_inodegc_stop(
	struct xfs_mount	*mp)
{
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_clear_inodegc_enabled(mp))
		return;

	xfs_inodegc_queue_all(mp);

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		cancel_work_sync(&gc->work);
	}
	trace_xfs_inodegc_stop(mp, __return_address);
}

/*
 * Enable the inode inactivation background workers and schedule deferred inode
 * inactivation work if there is any.
 */
void
xfs_inodegc_start(
	struct xfs_mount	*mp)
{
	if (xfs_set_inodegc_enabled(mp))
		return;

	trace_xfs_inodegc_start(mp, __return_address);
	xfs_inodegc_queue_all(mp);
}
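
/*
 * xfs_inodegc_stop() and xfs_inodegc_start() are meant to be paired by higher
 * level code, e.g. around a freeze or read-only remount (illustrative sketch
 * of such a caller, not a function in this file):
 *
 *	xfs_inodegc_stop(mp);	-- drain and disable background inactivation
 *	...quiesce the filesystem...
 *	xfs_inodegc_start(mp);	-- re-enable and kick any deferred work
 */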
1943 */ 1944 static inline bool 1945 xfs_inodegc_want_queue_work( 1946 struct xfs_inode *ip, 1947 unsigned int items) 1948 { 1949 struct xfs_mount *mp = ip->i_mount; 1950 1951 if (items > mp->m_ino_geo.inodes_per_cluster) 1952 return true; 1953 1954 if (__percpu_counter_compare(&mp->m_fdblocks, 1955 mp->m_low_space[XFS_LOWSP_5_PCNT], 1956 XFS_FDBLOCKS_BATCH) < 0) 1957 return true; 1958 1959 if (xfs_inodegc_want_queue_rt_file(ip)) 1960 return true; 1961 1962 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_USER)) 1963 return true; 1964 1965 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_GROUP)) 1966 return true; 1967 1968 if (xfs_inode_near_dquot_enforcement(ip, XFS_DQTYPE_PROJ)) 1969 return true; 1970 1971 return false; 1972 } 1973 1974 /* 1975 * Upper bound on the number of inodes in each AG that can be queued for 1976 * inactivation at any given time, to avoid monopolizing the workqueue. 1977 */ 1978 #define XFS_INODEGC_MAX_BACKLOG (4 * XFS_INODES_PER_CHUNK) 1979 1980 /* 1981 * Make the frontend wait for inactivations when: 1982 * 1983 * - Memory shrinkers queued the inactivation worker and it hasn't finished. 1984 * - The queue depth exceeds the maximum allowable percpu backlog. 1985 * 1986 * Note: If the current thread is running a transaction, we don't ever want to 1987 * wait for other transactions because that could introduce a deadlock. 1988 */ 1989 static inline bool 1990 xfs_inodegc_want_flush_work( 1991 struct xfs_inode *ip, 1992 unsigned int items, 1993 unsigned int shrinker_hits) 1994 { 1995 if (current->journal_info) 1996 return false; 1997 1998 if (shrinker_hits > 0) 1999 return true; 2000 2001 if (items > XFS_INODEGC_MAX_BACKLOG) 2002 return true; 2003 2004 return false; 2005 } 2006 2007 /* 2008 * Queue a background inactivation worker if there are inodes that need to be 2009 * inactivated and higher level xfs code hasn't disabled the background 2010 * workers. 2011 */ 2012 static void 2013 xfs_inodegc_queue( 2014 struct xfs_inode *ip) 2015 { 2016 struct xfs_mount *mp = ip->i_mount; 2017 struct xfs_inodegc *gc; 2018 int items; 2019 unsigned int shrinker_hits; 2020 2021 trace_xfs_inode_set_need_inactive(ip); 2022 spin_lock(&ip->i_flags_lock); 2023 ip->i_flags |= XFS_NEED_INACTIVE; 2024 spin_unlock(&ip->i_flags_lock); 2025 2026 gc = get_cpu_ptr(mp->m_inodegc); 2027 llist_add(&ip->i_gclist, &gc->list); 2028 items = READ_ONCE(gc->items); 2029 WRITE_ONCE(gc->items, items + 1); 2030 shrinker_hits = READ_ONCE(gc->shrinker_hits); 2031 put_cpu_ptr(gc); 2032 2033 if (!xfs_is_inodegc_enabled(mp)) 2034 return; 2035 2036 if (xfs_inodegc_want_queue_work(ip, items)) { 2037 trace_xfs_inodegc_queue(mp, __return_address); 2038 queue_work(mp->m_inodegc_wq, &gc->work); 2039 } 2040 2041 if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) { 2042 trace_xfs_inodegc_throttle(mp, __return_address); 2043 flush_work(&gc->work); 2044 } 2045 } 2046 2047 /* 2048 * Fold the dead CPU inodegc queue into the current CPUs queue. 

/*
 * Fold the dead CPU inodegc queue into the current CPUs queue.
 */
void
xfs_inodegc_cpu_dead(
	struct xfs_mount	*mp,
	unsigned int		dead_cpu)
{
	struct xfs_inodegc	*dead_gc, *gc;
	struct llist_node	*first, *last;
	unsigned int		count = 0;

	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
	cancel_work_sync(&dead_gc->work);

	if (llist_empty(&dead_gc->list))
		return;

	first = dead_gc->list.first;
	last = first;
	while (last->next) {
		last = last->next;
		count++;
	}
	dead_gc->list.first = NULL;
	dead_gc->items = 0;

	/* Add pending work to current CPU */
	gc = get_cpu_ptr(mp->m_inodegc);
	llist_add_batch(first, last, &gc->list);
	count += READ_ONCE(gc->items);
	WRITE_ONCE(gc->items, count);
	put_cpu_ptr(gc);

	if (xfs_is_inodegc_enabled(mp)) {
		trace_xfs_inodegc_queue(mp, __return_address);
		queue_work(mp->m_inodegc_wq, &gc->work);
	}
}

/*
 * We set the inode flag atomically with the radix tree tag.  Once we get tag
 * lookups on the radix tree, this inode flag can go away.
 *
 * We always use background reclaim here because even if the inode is clean, it
 * still may be under IO and hence we have to wait for IO completion to occur
 * before we can reclaim the inode.  The background reclaim path handles this
 * more efficiently than we can here, so simply let background reclaim tear down
 * all inodes.
 */
void
xfs_inode_mark_reclaimable(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	bool			need_inactive;

	XFS_STATS_INC(mp, vn_reclaim);

	/*
	 * We should never get here with any of the reclaim flags already set.
	 */
	ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_ALL_IRECLAIM_FLAGS));

	need_inactive = xfs_inode_needs_inactive(ip);
	if (need_inactive) {
		xfs_inodegc_queue(ip);
		return;
	}

	/* Going straight to reclaim, so drop the dquots. */
	xfs_qm_dqdetach(ip);
	xfs_inodegc_set_reclaimable(ip);
}

/*
 * Register a phony shrinker so that we can run background inodegc sooner when
 * there's memory pressure.  Inactivation does not itself free any memory but
 * it does make inodes reclaimable, which eventually frees memory.
 *
 * The count function, seek value, and batch value are crafted to trigger the
 * scan function during the second round of scanning.  Hopefully this means
 * that we reclaimed enough memory that initiating metadata transactions won't
 * make things worse.
 */
#define XFS_INODEGC_SHRINKER_COUNT	(1UL << DEF_PRIORITY)
#define XFS_INODEGC_SHRINKER_BATCH	((XFS_INODEGC_SHRINKER_COUNT / 2) + 1)
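
/*
 * With DEF_PRIORITY == 12, XFS_INODEGC_SHRINKER_COUNT works out to 4096 and
 * XFS_INODEGC_SHRINKER_BATCH to 2049, i.e. just over half the reported count;
 * with a seek value of zero the shrinker core accumulates roughly half the
 * count per pass, so the batch threshold is only reached on the second round.
 */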

static unsigned long
xfs_inodegc_shrinker_count(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
						   m_inodegc_shrinker);
	struct xfs_inodegc	*gc;
	int			cpu;

	if (!xfs_is_inodegc_enabled(mp))
		return 0;

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list))
			return XFS_INODEGC_SHRINKER_COUNT;
	}

	return 0;
}

static unsigned long
xfs_inodegc_shrinker_scan(
	struct shrinker		*shrink,
	struct shrink_control	*sc)
{
	struct xfs_mount	*mp = container_of(shrink, struct xfs_mount,
						   m_inodegc_shrinker);
	struct xfs_inodegc	*gc;
	int			cpu;
	bool			no_items = true;

	if (!xfs_is_inodegc_enabled(mp))
		return SHRINK_STOP;

	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);

	for_each_online_cpu(cpu) {
		gc = per_cpu_ptr(mp->m_inodegc, cpu);
		if (!llist_empty(&gc->list)) {
			unsigned int	h = READ_ONCE(gc->shrinker_hits);

			WRITE_ONCE(gc->shrinker_hits, h + 1);
			queue_work_on(cpu, mp->m_inodegc_wq, &gc->work);
			no_items = false;
		}
	}

	/*
	 * If there are no inodes to inactivate, we don't want the shrinker
	 * to think there's deferred work to call us back about.
	 */
	if (no_items)
		return LONG_MAX;

	return SHRINK_STOP;
}

/* Register a shrinker so we can accelerate inodegc and throttle queuing. */
int
xfs_inodegc_register_shrinker(
	struct xfs_mount	*mp)
{
	struct shrinker		*shrink = &mp->m_inodegc_shrinker;

	shrink->count_objects = xfs_inodegc_shrinker_count;
	shrink->scan_objects = xfs_inodegc_shrinker_scan;
	shrink->seeks = 0;
	shrink->flags = SHRINKER_NONSLAB;
	shrink->batch = XFS_INODEGC_SHRINKER_BATCH;

	return register_shrinker(shrink);
}