// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"
#include "xfs_ialloc.h"

#include <linux/iversion.h>

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * XXX: If this didn't occur in transactions, we could drop GFP_NOFAIL
	 * and return NULL here on ENOMEM.
	 */
	ip = kmem_cache_alloc(xfs_inode_zone, GFP_KERNEL | __GFP_NOFAIL);

	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_cache_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));
	ip->i_sick = 0;
	ip->i_checked = 0;
	INIT_WORK(&ip->i_ioend_work, xfs_end_io);
	INIT_LIST_HEAD(&ip->i_ioend_list);
	spin_lock_init(&ip->i_ioend_lock);

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(&ip->i_df);
		break;
	}

	if (ip->i_afp) {
		xfs_idestroy_fork(ip->i_afp);
		kmem_cache_free(xfs_ifork_zone, ip->i_afp);
	}
	if (ip->i_cowfp) {
		xfs_idestroy_fork(ip->i_cowfp);
		kmem_cache_free(xfs_ifork_zone, ip->i_cowfp);
	}
	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_cache_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!ip->i_itemp || list_empty(&ip->i_itemp->ili_item.li_bio_list));
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue background inode reclaim work if there are reclaimable inodes and
 * there isn't reclaim work already scheduled or in progress.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}


/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

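/*
 * Note on the tag hierarchy: reclaimable state is tracked at two levels.
 * Each per-AG inode radix tree (pag_ici_root) tags individual inodes with
 * XFS_ICI_RECLAIM_TAG, and the per-mount perag tree (m_perag_tree) tags the
 * AGs that contain at least one such inode, so the background worker and the
 * reclaim walkers can skip AGs with nothing to do.  pag_ici_reclaimable
 * counts the tagged inodes in an AG, which is why the perag tag is only set
 * on the 0 -> 1 transition and cleared on the 1 -> 0 transition above.
 */
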
static void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int			error;
	uint32_t		nlink = inode->i_nlink;
	uint32_t		generation = inode->i_generation;
	uint64_t		version = inode_peek_iversion(inode);
	umode_t			mode = inode->i_mode;
	dev_t			dev = inode->i_rdev;
	kuid_t			uid = inode->i_uid;
	kgid_t			gid = inode->i_gid;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	inode->i_uid = uid;
	inode->i_gid = gid;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache.
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool	wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble.  Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;
		ip->i_sick = 0;
		ip->i_checked = 0;

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}

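/*
 * Note on the retry protocol: both the cache hit path above and the cache
 * miss path below return -EAGAIN whenever they lose a race - the inode is
 * being recycled or torn down, has been reallocated within an RCU grace
 * period, or a duplicate radix tree insert is detected.  xfs_iget() treats
 * -EAGAIN as "back off and retry": it sleeps for a tick via delay(1) and
 * restarts the whole lookup.
 */
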
static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, flags);
	if (error)
		goto out_destroy;

	/*
	 * For version 5 superblocks, if we are initialising a new inode and we
	 * are not utilising the XFS_MOUNT_IKEEP inode cluster mode, we can
	 * simply build the new inode core with a random generation number.
	 *
	 * For version 4 (and older) superblocks, log recovery is dependent on
	 * the di_flushiter field being initialised from the current on-disk
	 * value and hence we must also read the inode off disk even when
	 * initializing new inodes.
	 */
	if (xfs_sb_version_has_v3inode(&mp->m_sb) &&
	    (flags & XFS_IGET_CREATE) && !(mp->m_flags & XFS_MOUNT_IKEEP)) {
		VFS_I(ip)->i_generation = prandom_u32();
	} else {
		struct xfs_dinode	*dip;
		struct xfs_buf		*bp;

		error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &dip, &bp, 0);
		if (error)
			goto out_destroy;

		error = xfs_inode_from_disk(ip, dip);
		if (!error)
			xfs_buf_set_ref(bp, XFS_INO_REF);
		xfs_trans_brelse(tp, bp);

		if (error)
			goto out_destroy;
	}

	trace_xfs_iget_miss(ip);

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		d_mark_dontcache(VFS_I(ip));
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system. The inode is looked up
 * in the cache held in each AG. If the inode is found in the cache, initialise
 * the vfs inode if necessary.
 *
 * If it is not in core, read it in from the file system's device, add it to the
 * cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * Inode lookup is only done during metadata operations and not as part of the
 * data IO path. Hence we only allow locking of the XFS_ILOCK during lookup.
 */
int
xfs_iget(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	uint			flags,
	uint			lock_flags,
	struct xfs_inode	**ipp)
{
	struct xfs_inode	*ip;
	struct xfs_perag	*pag;
	xfs_agino_t		agino;
	int			error;

	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

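/*
 * Illustrative example (hypothetical caller, not taken from this file): a
 * typical metadata operation looks the inode up, works on it with the ILOCK
 * held, then drops the lock and the reference.  The transaction pointer may
 * be NULL for lookups outside transaction context.
 *
 *	error = xfs_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
 *	if (error)
 *		return error;
 *	... operate on ip ...
 *	xfs_iunlock(ip, XFS_ILOCK_EXCL);
 *	xfs_irele(ip);
 */
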
/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.  This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade-off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

/*
 * Decide if the given @ip is eligible to be a part of the inode walk, and
 * grab it if so.  Returns true if it's ready to go or false if we should just
 * ignore it.
 */
STATIC bool
xfs_inode_walk_ag_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_INODE_WALK_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/* Check for stale RCU freed inode */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return false;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return false;

	/* inode is valid */
	return true;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return false;
}

/*
 * For a given per-AG structure @pag, grab, @execute, and rele all incore
 * inodes with the given radix tree @tag.
 */
STATIC int
xfs_inode_walk_ag(
	struct xfs_perag	*pag,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_mount	*mp = pag->pag_mount;
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	bool			done;
	int			nr_found;

restart:
	done = false;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == XFS_ICI_NO_TAG)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. if we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || !xfs_inode_walk_ag_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = true;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_INODE_WALK_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], args);
			xfs_irele(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted.  */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/* Fetch the next (possibly tagged) per-AG structure. */
static inline struct xfs_perag *
xfs_inode_walk_get_perag(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	int			tag)
{
	if (tag == XFS_ICI_NO_TAG)
		return xfs_perag_get(mp, agno);
	return xfs_perag_get_tag(mp, agno, tag);
}

/*
 * Call the @execute function on all incore inodes matching the radix tree
 * @tag.
 */
int
xfs_inode_walk(
	struct xfs_mount	*mp,
	int			iter_flags,
	int			(*execute)(struct xfs_inode *ip, void *args),
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_inode_walk_get_perag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_walk_ag(pag, iter_flags, execute, args, tag);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

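/*
 * Illustrative sketch of a walk callback (hypothetical, not taken from this
 * file).  The callback gets each grabbed inode plus the opaque @args pointer;
 * returning -EAGAIN makes xfs_inode_walk_ag() restart the AG walk after a
 * short delay, while -EFSCORRUPTED aborts the walk:
 *
 *	static int
 *	xfs_example_count_inode(
 *		struct xfs_inode	*ip,
 *		void			*args)
 *	{
 *		(*(unsigned long *)args)++;
 *		return 0;
 *	}
 *
 *	unsigned long	count = 0;
 *
 *	error = xfs_inode_walk(mp, 0, xfs_example_count_inode, &count,
 *			XFS_ICI_NO_TAG);
 */
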
/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_eofblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space. This is queued
 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 */
void
xfs_queue_cowblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_cowblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);

	if (!sb_start_write_trylock(mp->m_super))
		return;
	xfs_icache_free_cowblocks(mp, NULL);
	sb_end_write(mp->m_super);

	xfs_queue_cowblocks(mp);
}

/*
 * Grab the inode for reclaim exclusively.
 *
 * We have found this inode via a lookup under RCU, so the inode may have
 * already been freed, or it may be in the process of being recycled by
 * xfs_iget(). In both cases, the inode will have XFS_IRECLAIM set. If the inode
 * has been fully recycled by the time we get the i_flags_lock, XFS_IRECLAIMABLE
 * will not be set. Hence we need to check for both these flag conditions to
 * avoid inodes that are no longer reclaim candidates.
 *
 * Note: checking for other state flags here, under the i_flags_lock or not, is
 * racy and should be avoided. Those races should be resolved only after we have
 * ensured that we are able to reclaim this inode and the world can see that we
 * are going to reclaim it.
 *
 * Return true if we grabbed it, false otherwise.
 */
static bool
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip)
{
	ASSERT(rcu_read_lock_held());

	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return false;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return true;
}

/*
 * Inode reclaim is non-blocking, so the default action if progress cannot be
 * made is to "requeue" the inode for reclaim by unlocking it and clearing the
 * XFS_IRECLAIM flag. If we are in a shutdown state, we no longer care about
 * blocking and hence we can wait for the inode so that it can be reclaimed.
 *
 * We do no IO here - if callers require inodes to be cleaned they must push the
 * AIL first to trigger writeback of dirty inodes. This enables writeback to be
 * done in the background in a non-blocking manner, and enables memory reclaim
 * to make progress without blocking.
 */
static void
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag)
{
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
		goto out;
	if (!xfs_iflock_nowait(ip))
		goto out_iunlock;

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		/* xfs_iflush_abort() drops the flush lock */
		xfs_iflush_abort(ip);
		goto reclaim;
	}
	if (xfs_ipincount(ip))
		goto out_ifunlock;
	if (!xfs_inode_clean(ip))
		goto out_ifunlock;

	xfs_ifunlock(ip);
reclaim:
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	ASSERT(xfs_inode_clean(ip));

	__xfs_inode_free(ip);
	return;

out_ifunlock:
	xfs_ifunlock(ip);
out_iunlock:
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
}

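/*
 * Note on the reclaim cursor: each perag records where the previous partial
 * scan stopped in pag_ici_reclaim_cursor.  xfs_reclaim_inodes_ag() below
 * resumes from that cursor so shrinker-limited scans cover the whole AG over
 * time instead of repeatedly rescanning its start, and resets the cursor to
 * zero once a full pass over the AG completes.
 */
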
/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during the filesystem unmount reclaim walk will leak all
 * the unreclaimed inodes.
 *
 * This walk is non-blocking; callers that need every reclaimable inode to be
 * reclaimed (e.g. unmount) loop in xfs_reclaim_inodes() until no reclaimable
 * inodes remain.
 */
static void
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		first_index = READ_ONCE(pag->pag_ici_reclaim_cursor);
		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. if we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || !xfs_reclaim_inode_grab(ip))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (batch[i])
					xfs_reclaim_inode(batch[i], pag);
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;
			cond_resched();
		} while (nr_found && !done && *nr_to_scan > 0);

		if (done)
			first_index = 0;
		WRITE_ONCE(pag->pag_ici_reclaim_cursor, first_index);
		xfs_perag_put(pag);
	}
}

void
xfs_reclaim_inodes(
	struct xfs_mount	*mp)
{
	int		nr_to_scan = INT_MAX;

	while (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		xfs_ail_push_all_sync(mp->m_ail);
		xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	}
}

/*
 * The shrinker infrastructure determines how many inodes we should scan for
 * reclaim. We want as many clean inodes ready to reclaim as possible, so we
 * push the AIL here. We also want to proactively free up memory if we can to
 * minimise the amount of work memory reclaim has to do so we kick the
 * background reclaim if it isn't already scheduled.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	return 0;
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC bool
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return false;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid != eofb->eof_prid)
		return false;

	return true;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match. This is for global/internal scans only.
 */
STATIC bool
xfs_inode_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return true;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    ip->i_d.di_projid == eofb->eof_prid)
		return true;

	return false;
}

/*
 * Is this inode @ip eligible for eof/cow block reclamation, given some
 * filtering parameters @eofb?  The inode is eligible if @eofb is null or
 * if the predicate functions match.
 */
static bool
xfs_inode_matches_eofb(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	bool			match;

	if (!eofb)
		return true;

	if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
		match = xfs_inode_match_id_union(ip, eofb);
	else
		match = xfs_inode_match_id(ip, eofb);
	if (!match)
		return false;

	/* skip the inode if the file size is too small */
	if ((eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE) &&
	    XFS_ISIZE(ip) < eofb->eof_min_file_size)
		return false;

	return true;
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);
	int		nr_to_scan = INT_MAX;

	xfs_reclaim_inodes_ag(mp, &nr_to_scan);
	xfs_reclaim_work_queue(mp);
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	bool			wait;
	int			ret;

	wait = eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC);

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!wait && mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (wait)
			return -EAGAIN;
		return 0;
	}

	ret = xfs_free_eofblocks(ip);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_eofblocks, eofb,
			XFS_ICI_EOFBLOCKS_TAG);
}

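/*
 * Illustrative example (hypothetical caller, not taken from this file): a
 * caller wanting a synchronous scan limited to one user's speculative
 * preallocations could build a filter like this, whereas the background
 * worker passes a NULL filter so every tagged inode is scanned:
 *
 *	struct xfs_eofblocks	eofb = { 0 };
 *
 *	eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID;
 *	eofb.eof_uid = VFS_I(ip)->i_uid;
 *	error = xfs_icache_free_eofblocks(mp, &eofb);
 */
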
/*
 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
 * multiple quotas, we don't know exactly which quota caused an allocation
 * failure. We make a best effort by including each quota under low free space
 * conditions (less than 1% free space) in the scan.
 */
static int
__xfs_inode_free_quota_eofblocks(
	struct xfs_inode	*ip,
	int			(*execute)(struct xfs_mount *mp,
					   struct xfs_eofblocks	*eofb))
{
	int			scan = 0;
	struct xfs_eofblocks	eofb = {0};
	struct xfs_dquot	*dq;

	/*
	 * Run a sync scan to increase effectiveness and use the union filter to
	 * cover all applicable quotas in a single scan.
	 */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQTYPE_USER);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_uid = VFS_I(ip)->i_uid;
			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
			scan = 1;
		}
	}

	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQTYPE_GROUP);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_gid = VFS_I(ip)->i_gid;
			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
			scan = 1;
		}
	}

	if (scan)
		execute(ip->i_mount, &eofb);

	return scan;
}

int
xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}

static inline unsigned long
xfs_iflag_for_tag(
	int		tag)
{
	switch (tag) {
	case XFS_ICI_EOFBLOCKS_TAG:
		return XFS_IEOFBLOCKS;
	case XFS_ICI_COWBLOCKS_TAG:
		return XFS_ICOWBLOCKS;
	default:
		ASSERT(0);
		return 0;
	}
}

static void
__xfs_inode_set_blocks_tag(
	xfs_inode_t	*ip,
	void		(*execute)(struct xfs_mount *mp),
	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				  int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & xfs_iflag_for_tag(tag))
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   tag);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* kick off background trimming */
		execute(ip->i_mount);

		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
			trace_xfs_perag_set_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

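/*
 * Note on the blocks tags: setting a tag touches three levels of state - the
 * per-inode XFS_IEOFBLOCKS/XFS_ICOWBLOCKS flag, the per-AG inode radix tree
 * tag, and (for the first tagged inode in the AG) the per-mount perag tree
 * tag, at which point the matching background worker is kicked.  The
 * in-inode flag exists so that repeat calls can bail out early without
 * taking any of the locks above.
 */
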
static void
__xfs_inode_clear_blocks_tag(
	xfs_inode_t	*ip,
	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				    int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     tag);
		spin_unlock(&ip->i_mount->m_perag_lock);
		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	int			ret = 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (!xfs_inode_matches_eofb(ip, eofb))
		return 0;

	/* Free the CoW blocks */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_cowblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return xfs_inode_walk(mp, 0, xfs_inode_free_cowblocks, eofb,
			XFS_ICI_COWBLOCKS_TAG);
}

int
xfs_inode_free_quota_cowblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
			trace_xfs_perag_set_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_stop_block_reaping(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_eofblocks_work);
	cancel_delayed_work_sync(&mp->m_cowblocks_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_start_block_reaping(
	struct xfs_mount	*mp)
{
	xfs_queue_eofblocks(mp);
	xfs_queue_cowblocks(mp);
}

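/*
 * Usage note (describing intent, not calls made in this file): the
 * stop/start pair above is meant to bracket operations that quiesce the
 * filesystem, such as freeze and unmount, so the eofblocks and cowblocks
 * workers cannot start new work while the filesystem is quiesced;
 * xfs_start_block_reaping() re-arms the delayed work afterwards.
 */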