// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/iversion.h>

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_zone_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	ip->i_cnextents = 0;
	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(ip->i_df));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
		break;
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	if (ip->i_cowfp)
		xfs_idestroy_fork(ip, XFS_COW_FORK);

	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_zone_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s.
 * Perhaps this should have its own tunable, but that can be done if this
 * method proves to be ineffective or too aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}


/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
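 *
 * Both pag->pag_ici_lock and ip->i_flags_lock are held below across the
 * radix tree tag update and the XFS_IRECLAIMABLE flag update, which is
 * what keeps the two views consistent for RCU lookups.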
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

static void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int		error;
	uint32_t	nlink = inode->i_nlink;
	uint32_t	generation = inode->i_generation;
	uint64_t	version = inode_peek_iversion(inode);
	umode_t		mode = inode->i_mode;
	dev_t		dev = inode->i_rdev;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	wait_on_inode to wait for these flags to be cleared
	 *	instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool	wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble. Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		init_rwsem(&inode->i_rwsem);

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_iread(mp, tp, ip, flags);
	if (error)
		goto out_destroy;

	if (!xfs_inode_verify_forks(ip)) {
		error = -EFSCORRUPTED;
		goto out_destroy;
	}

	trace_xfs_iget_miss(ip);


	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		iflags |= XFS_IDONTCACHE;
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.
 * The inode is looked up in the cache held in each AG.
 * If the inode is found in the cache, initialise the vfs inode
 * if necessary.
 *
 * If it is not in core, read it in from the file system's device,
 * add it to the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * This flag parameter indicates how and if the inode's IO lock and inode lock
 * should be taken.
 *
 * mp -- the mount point structure for the current file system. It points
 *       to the inode hash table.
 * tp -- a pointer to the current transaction if there is one. This is
 *       simply passed through to the xfs_iread() call.
 * ino -- the number of the inode desired. This is the unique identifier
 *        within the file system for the inode being requested.
 * lock_flags -- flags indicating how to lock the inode. See the comment
 *		 for xfs_ilock() for a list of valid values.
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here. It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.
	 * If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system. If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not. For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer. This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that. If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned. The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	xfs_irele(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_AGITER_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway. If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim.
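	 * igrab() returns NULL once the VFS has started evicting the inode
	 * (I_FREEING or I_WILL_FREE is set), so failure here just means the
	 * inode is already being torn down elsewhere.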
	 */
	if (!igrab(inode))
		return -ENOENT;

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return -ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag,
	int			iter_flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == -1)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], flags, args);
			xfs_irele(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
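 *
 * The worker requeues itself from xfs_eofblocks_worker(), so scanning
 * continues for as long as XFS_ICI_EOFBLOCKS_TAG remains set on any AG in
 * the perag radix tree.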
 */
void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);
	xfs_icache_free_eofblocks(mp, NULL);
	xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space. This is queued
 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 */
void
xfs_queue_cowblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_cowblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);
	xfs_icache_free_cowblocks(mp, NULL);
	xfs_queue_cowblocks(mp);
}

int
xfs_inode_ag_iterator_flags(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			iter_flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
					  iter_flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args)
{
	return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
}

int
xfs_inode_ag_iterator_tag(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
					  0);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
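	 * These unlocked checks are racy by design; the result is confirmed
	 * under ip->i_flags_lock below before XFS_IRECLAIM is set.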
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode. Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check the
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state		iflush ret	required action
 *	---------------		----------	---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies. Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting. For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		/* xfs_iflush_abort() drops the flush lock */
		xfs_iflush_abort(ip, false);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto reclaim;
	}

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here. As a result,
	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	 * result in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it. Hence if we get an EAGAIN error here, just unlock the
	 * inode, back off and try again. Hopefully the next pass through will
	 * see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, &bp);
	if (error == -EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

reclaim:
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups. This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	__xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and the reclaim work never goes back to
	 * the idle state. Instead, return 0 to let the next scheduled
	 * background reclaim attempt to reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during the filesystem unmount reclaim walk would leak all
 * the unreclaimed inodes.
 */
STATIC int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG.
				 * It was a race that led us to see this
				 * inode, so another lookup from the same
				 * index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != -EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * if we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return last_error;
}

int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we will throttle the speed of reclaim via doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC int
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return 0;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return 0;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    xfs_get_projid(ip) != eofb->eof_prid)
		return 0;

	return 1;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match.
 * This is for global/internal scans only.
 */
STATIC int
xfs_inode_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return 1;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return 1;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    xfs_get_projid(ip) == eofb->eof_prid)
		return 1;

	return 0;
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	int ret = 0;
	struct xfs_eofblocks *eofb = args;
	int match;

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!(flags & SYNC_WAIT) &&
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (eofb) {
		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
			match = xfs_inode_match_id_union(ip, eofb);
		else
			match = xfs_inode_match_id(ip, eofb);
		if (!match)
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
	}

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (flags & SYNC_WAIT)
			ret = -EAGAIN;
		return ret;
	}
	ret = xfs_free_eofblocks(ip);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

static int
__xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			tag)
{
	int flags = SYNC_TRYLOCK;

	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
		flags = SYNC_WAIT;

	return xfs_inode_ag_iterator_tag(mp, execute, flags,
					 eofb, tag);
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
 * multiple quotas, we don't know exactly which quota caused an allocation
 * failure. We make a best effort by including each quota under low free space
 * conditions (less than 1% free space) in the scan.
 */
static int
__xfs_inode_free_quota_eofblocks(
	struct xfs_inode	*ip,
	int			(*execute)(struct xfs_mount *mp,
					   struct xfs_eofblocks	*eofb))
{
	int scan = 0;
	struct xfs_eofblocks eofb = {0};
	struct xfs_dquot *dq;

	/*
	 * Run a sync scan to increase effectiveness and use the union filter to
	 * cover all applicable quotas in a single scan.
	 */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_uid = VFS_I(ip)->i_uid;
			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
			scan = 1;
		}
	}

	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_gid = VFS_I(ip)->i_gid;
			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
			scan = 1;
		}
	}

	if (scan)
		execute(ip->i_mount, &eofb);

	return scan;
}

int
xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}

static inline unsigned long
xfs_iflag_for_tag(
	int		tag)
{
	switch (tag) {
	case XFS_ICI_EOFBLOCKS_TAG:
		return XFS_IEOFBLOCKS;
	case XFS_ICI_COWBLOCKS_TAG:
		return XFS_ICOWBLOCKS;
	default:
		ASSERT(0);
		return 0;
	}
}

static void
__xfs_inode_set_blocks_tag(
	xfs_inode_t	*ip,
	void		(*execute)(struct xfs_mount *mp),
	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				  int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & xfs_iflag_for_tag(tag))
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   tag);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* kick off background trimming */
		execute(ip->i_mount);

		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
			trace_xfs_perag_set_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

static void
__xfs_inode_clear_blocks_tag(
	xfs_inode_t	*ip,
	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				    int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     tag);
		spin_unlock(&ip->i_mount->m_perag_lock);
		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Set ourselves up to free CoW blocks from this file. If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_inode_has_cow_data(ip)) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork. Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long. If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	struct xfs_eofblocks *eofb = args;
	int match;
	int ret = 0;

	if (!xfs_prep_free_cowblocks(ip))
		return 0;

	if (eofb) {
		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
			match = xfs_inode_match_id_union(ip, eofb);
		else
			match = xfs_inode_match_id(ip, eofb);
		if (!match)
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
	}

	/* Free the CoW blocks */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
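	 * (The IOLOCK blocks new writes and the MMAPLOCK blocks page faults
	 * for the duration of the recheck.)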
	 */
	if (xfs_prep_free_cowblocks(ip))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_cowblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

int
xfs_inode_free_quota_cowblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
			trace_xfs_perag_set_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_icache_disable_reclaim(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_eofblocks_work);
	cancel_delayed_work_sync(&mp->m_cowblocks_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_icache_enable_reclaim(
	struct xfs_mount	*mp)
{
	xfs_queue_eofblocks(mp);
	xfs_queue_cowblocks(mp);
}