/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_inode_item.h"
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_bmap_util.h"
#include "xfs_dquot_item.h"
#include "xfs_dquot.h"
#include "xfs_reflink.h"

#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/iversion.h>

/*
 * Allocate and initialise an xfs_inode.
 */
struct xfs_inode *
xfs_inode_alloc(
	struct xfs_mount	*mp,
	xfs_ino_t		ino)
{
	struct xfs_inode	*ip;

	/*
	 * if this didn't occur in transactions, we could use
	 * KM_MAYFAIL and return NULL here on ENOMEM. Set the
	 * code up to do this anyway.
	 */
	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {
		kmem_zone_free(xfs_inode_zone, ip);
		return NULL;
	}

	/* VFS doesn't initialise i_mode! */
	VFS_I(ip)->i_mode = 0;

	XFS_STATS_INC(mp, vn_active);
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	ASSERT(!xfs_isiflocked(ip));
	ASSERT(ip->i_ino == 0);

	/* initialise the xfs inode */
	ip->i_ino = ino;
	ip->i_mount = mp;
	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
	ip->i_afp = NULL;
	ip->i_cowfp = NULL;
	ip->i_cnextents = 0;
	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
	ip->i_flags = 0;
	ip->i_delayed_blks = 0;
	memset(&ip->i_d, 0, sizeof(ip->i_d));

	return ip;
}

STATIC void
xfs_inode_free_callback(
	struct rcu_head		*head)
{
	struct inode		*inode = container_of(head, struct inode, i_rcu);
	struct xfs_inode	*ip = XFS_I(inode);

	switch (VFS_I(ip)->i_mode & S_IFMT) {
	case S_IFREG:
	case S_IFDIR:
	case S_IFLNK:
		xfs_idestroy_fork(ip, XFS_DATA_FORK);
		break;
	}

	if (ip->i_afp)
		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
	if (ip->i_cowfp)
		xfs_idestroy_fork(ip, XFS_COW_FORK);

	if (ip->i_itemp) {
		ASSERT(!test_bit(XFS_LI_IN_AIL,
				 &ip->i_itemp->ili_item.li_flags));
		xfs_inode_item_destroy(ip);
		ip->i_itemp = NULL;
	}

	kmem_zone_free(xfs_inode_zone, ip);
}

static void
__xfs_inode_free(
	struct xfs_inode	*ip)
{
	/* asserts to verify all state is correct here */
	ASSERT(atomic_read(&ip->i_pincount) == 0);
	XFS_STATS_DEC(ip->i_mount, vn_active);

	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
}

void
xfs_inode_free(
	struct xfs_inode	*ip)
{
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always
	 * appears to be reclaimed with an invalid inode number when in the
	 * free state. The ip->i_flags_lock provides the barrier against lookup
	 * races.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	__xfs_inode_free(ip);
}

/*
 * Queue a new inode reclaim pass if there are reclaimable inodes and there
 * isn't a reclaim pass already in progress. By default it runs every 5s based
 * on the xfs periodic sync default of 30s. Perhaps this should have its own
 * tunable, but that can be done if this method proves to be ineffective or too
 * aggressive.
 */
static void
xfs_reclaim_work_queue(
	struct xfs_mount	*mp)
{

	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
		queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
			msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
	}
	rcu_read_unlock();
}

/*
 * This is a fast pass over the inode cache to try to get reclaim moving on as
 * many inodes as possible in a short period of time. It kicks itself every few
 * seconds, as well as being kicked by the inode cache shrinker when memory
 * goes low. It scans as quickly as possible avoiding locked inodes or those
 * already being flushed, and once done schedules a future pass.
 */
void
xfs_reclaim_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
					struct xfs_mount, m_reclaim_work);

	xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
	xfs_reclaim_work_queue(mp);
}

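/*
 * Account a newly reclaimable inode in this AG. On the 0 -> 1 transition,
 * tag the AG in the per-mount radix tree and kick the background reclaim
 * worker.
 */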
static void
xfs_perag_set_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (pag->pag_ici_reclaimable++)
		return;

	/* propagate the reclaim tag up into the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno,
			   XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);

	/* schedule periodic background inode reclaim */
	xfs_reclaim_work_queue(mp);

	trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}

static void
xfs_perag_clear_reclaim_tag(
	struct xfs_perag	*pag)
{
	struct xfs_mount	*mp = pag->pag_mount;

	lockdep_assert_held(&pag->pag_ici_lock);
	if (--pag->pag_ici_reclaimable)
		return;

	/* clear the reclaim tag from the perag radix tree */
	spin_lock(&mp->m_perag_lock);
	radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno,
			     XFS_ICI_RECLAIM_TAG);
	spin_unlock(&mp->m_perag_lock);
	trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_);
}


/*
 * We set the inode flag atomically with the radix tree tag.
 * Once we get tag lookups on the radix tree, this inode flag
 * can go away.
 */
void
xfs_inode_set_reclaim_tag(
	struct xfs_inode	*ip)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_perag	*pag;

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);

	radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino),
			   XFS_ICI_RECLAIM_TAG);
	xfs_perag_set_reclaim_tag(pag);
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);

	spin_unlock(&ip->i_flags_lock);
	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

STATIC void
xfs_inode_clear_reclaim_tag(
	struct xfs_perag	*pag,
	xfs_ino_t		ino)
{
	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(pag->pag_mount, ino),
			     XFS_ICI_RECLAIM_TAG);
	xfs_perag_clear_reclaim_tag(pag);
}

static void
xfs_inew_wait(
	struct xfs_inode	*ip)
{
	wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT);
	DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT);

	do {
		prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
		if (!xfs_iflags_test(ip, XFS_INEW))
			break;
		schedule();
	} while (true);
	finish_wait(wq, &wait.wq_entry);
}

/*
 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode
 * part of the structure. This is made more complex by the fact we store
 * information about the on-disk values in the VFS inode and so we can't just
 * overwrite the values unconditionally. Hence we save the parameters we
 * need to retain across reinitialisation, and rewrite them into the VFS inode
 * after reinitialisation even if it fails.
 */
static int
xfs_reinit_inode(
	struct xfs_mount	*mp,
	struct inode		*inode)
{
	int		error;
	uint32_t	nlink = inode->i_nlink;
	uint32_t	generation = inode->i_generation;
	uint64_t	version = inode_peek_iversion(inode);
	umode_t		mode = inode->i_mode;
	dev_t		dev = inode->i_rdev;

	error = inode_init_always(mp->m_super, inode);

	set_nlink(inode, nlink);
	inode->i_generation = generation;
	inode_set_iversion_queried(inode, version);
	inode->i_mode = mode;
	inode->i_rdev = dev;
	return error;
}

/*
 * If we are allocating a new inode, then check what was returned is
 * actually a free, empty inode. If we are not allocating an inode,
 * then check we didn't find a free inode.
 *
 * Returns:
 *	0		if the inode free state matches the lookup context
 *	-ENOENT		if the inode is free and we are not allocating
 *	-EFSCORRUPTED	if there is any state mismatch at all
 */
static int
xfs_iget_check_free_state(
	struct xfs_inode	*ip,
	int			flags)
{
	if (flags & XFS_IGET_CREATE) {
		/* should be a free inode */
		if (VFS_I(ip)->i_mode != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx not marked free! (mode 0x%x)",
				ip->i_ino, VFS_I(ip)->i_mode);
			return -EFSCORRUPTED;
		}

		if (ip->i_d.di_nblocks != 0) {
			xfs_warn(ip->i_mount,
"Corruption detected! Free inode 0x%llx has blocks allocated!",
				ip->i_ino);
			return -EFSCORRUPTED;
		}
		return 0;
	}

	/* should be an allocated inode */
	if (VFS_I(ip)->i_mode == 0)
		return -ENOENT;

	return 0;
}

/*
 * Check the validity of the inode we just found in the cache
 */
static int
xfs_iget_cache_hit(
	struct xfs_perag	*pag,
	struct xfs_inode	*ip,
	xfs_ino_t		ino,
	int			flags,
	int			lock_flags) __releases(RCU)
{
	struct inode		*inode = VFS_I(ip);
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	/*
	 * check for re-use of an inode within an RCU grace period due to the
	 * radix tree nodes not being updated yet. We monitor for this by
	 * setting the inode number to zero before freeing the inode structure.
	 * If the inode has been reallocated and set up, then the inode number
	 * will not match, so check for that, too.
	 */
	spin_lock(&ip->i_flags_lock);
	if (ip->i_ino != ino) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}


	/*
	 * If we are racing with another cache hit that is currently
	 * instantiating this inode or currently recycling it out of
	 * reclaimable state, wait for the initialisation to complete
	 * before continuing.
	 *
	 * XXX(hch): eventually we should do something equivalent to
	 *	     wait_on_inode to wait for these flags to be cleared
	 *	     instead of polling for it.
	 */
	if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
		trace_xfs_iget_skip(ip);
		XFS_STATS_INC(mp, xs_ig_frecycle);
		error = -EAGAIN;
		goto out_error;
	}

	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_error;

	/*
	 * If IRECLAIMABLE is set, we've torn down the VFS inode already.
	 * Need to carefully get it back into usable state.
	 */
	if (ip->i_flags & XFS_IRECLAIMABLE) {
		trace_xfs_iget_reclaim(ip);

		if (flags & XFS_IGET_INCORE) {
			error = -EAGAIN;
			goto out_error;
		}

		/*
		 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
		 * from stomping over us while we recycle the inode. We can't
		 * clear the radix tree reclaimable tag yet as it requires
		 * pag_ici_lock to be held exclusive.
		 */
		ip->i_flags |= XFS_IRECLAIM;

		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();

		error = xfs_reinit_inode(mp, inode);
		if (error) {
			bool wake;
			/*
			 * Re-initializing the inode failed, and we are in deep
			 * trouble.  Try to re-add it to the reclaim list.
			 */
			rcu_read_lock();
			spin_lock(&ip->i_flags_lock);
			wake = !!__xfs_iflags_test(ip, XFS_INEW);
			ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
			if (wake)
				wake_up_bit(&ip->i_flags, __XFS_INEW_BIT);
			ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
			trace_xfs_iget_reclaim_fail(ip);
			goto out_error;
		}

		spin_lock(&pag->pag_ici_lock);
		spin_lock(&ip->i_flags_lock);

		/*
		 * Clear the per-lifetime state in the inode as we are now
		 * effectively a new inode and need to return to the initial
		 * state before reuse occurs.
		 */
		ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
		ip->i_flags |= XFS_INEW;
		xfs_inode_clear_reclaim_tag(pag, ip->i_ino);
		inode->i_state = I_NEW;

		ASSERT(!rwsem_is_locked(&inode->i_rwsem));
		init_rwsem(&inode->i_rwsem);

		spin_unlock(&ip->i_flags_lock);
		spin_unlock(&pag->pag_ici_lock);
	} else {
		/* If the VFS inode is being torn down, pause and try again. */
		if (!igrab(inode)) {
			trace_xfs_iget_skip(ip);
			error = -EAGAIN;
			goto out_error;
		}

		/* We've got a live one. */
		spin_unlock(&ip->i_flags_lock);
		rcu_read_unlock();
		trace_xfs_iget_hit(ip);
	}

	if (lock_flags != 0)
		xfs_ilock(ip, lock_flags);

	if (!(flags & XFS_IGET_INCORE))
		xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
	XFS_STATS_INC(mp, xs_ig_found);

	return 0;

out_error:
	spin_unlock(&ip->i_flags_lock);
	rcu_read_unlock();
	return error;
}


static int
xfs_iget_cache_miss(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	xfs_trans_t		*tp,
	xfs_ino_t		ino,
	struct xfs_inode	**ipp,
	int			flags,
	int			lock_flags)
{
	struct xfs_inode	*ip;
	int			error;
	xfs_agino_t		agino = XFS_INO_TO_AGINO(mp, ino);
	int			iflags;

	ip = xfs_inode_alloc(mp, ino);
	if (!ip)
		return -ENOMEM;

	error = xfs_iread(mp, tp, ip, flags);
	if (error)
		goto out_destroy;

	if (!xfs_inode_verify_forks(ip)) {
		error = -EFSCORRUPTED;
		goto out_destroy;
	}

	trace_xfs_iget_miss(ip);


	/*
	 * Check the inode free state is valid. This also detects lookup
	 * racing with unlinks.
	 */
	error = xfs_iget_check_free_state(ip, flags);
	if (error)
		goto out_destroy;

	/*
	 * Preload the radix tree so we can insert safely under the
	 * write spinlock. Note that we cannot sleep inside the preload
	 * region. Since we can be called from transaction context, don't
	 * recurse into the file system.
	 */
	if (radix_tree_preload(GFP_NOFS)) {
		error = -EAGAIN;
		goto out_destroy;
	}

	/*
	 * Because the inode hasn't been added to the radix-tree yet it can't
	 * be found by another thread, so we can do the non-sleeping lock here.
	 */
	if (lock_flags) {
		if (!xfs_ilock_nowait(ip, lock_flags))
			BUG();
	}

	/*
	 * These values must be set before inserting the inode into the radix
	 * tree as the moment it is inserted a concurrent lookup (allowed by the
	 * RCU locking mechanism) can find it and that lookup must see that this
	 * is an inode currently under construction (i.e. that XFS_INEW is set).
	 * The ip->i_flags_lock that protects the XFS_INEW flag forms the
	 * memory barrier that ensures this detection works correctly at lookup
	 * time.
	 */
	iflags = XFS_INEW;
	if (flags & XFS_IGET_DONTCACHE)
		iflags |= XFS_IDONTCACHE;
	ip->i_udquot = NULL;
	ip->i_gdquot = NULL;
	ip->i_pdquot = NULL;
	xfs_iflags_set(ip, iflags);

	/* insert the new inode */
	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	if (unlikely(error)) {
		WARN_ON(error != -EEXIST);
		XFS_STATS_INC(mp, xs_ig_dup);
		error = -EAGAIN;
		goto out_preload_end;
	}
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();

	*ipp = ip;
	return 0;

out_preload_end:
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();
	if (lock_flags)
		xfs_iunlock(ip, lock_flags);
out_destroy:
	__destroy_inode(VFS_I(ip));
	xfs_inode_free(ip);
	return error;
}

/*
 * Look up an inode by number in the given file system.
 * The inode is looked up in the cache held in each AG.
 * If the inode is found in the cache, initialise the vfs inode
 * if necessary.
 *
 * If it is not in core, read it in from the file system's device,
 * add it to the cache and initialise the vfs inode.
 *
 * The inode is locked according to the value of the lock_flags parameter.
 * This flag parameter indicates how and if the inode's IO lock and inode lock
 * should be taken.
 *
 * mp -- the mount point structure for the current file system.  It points
 *       to the inode hash table.
 * tp -- a pointer to the current transaction if there is one.  This is
 *       simply passed through to the xfs_iread() call.
 * ino -- the number of the inode desired.  This is the unique identifier
 *        within the file system for the inode being requested.
 * lock_flags -- flags indicating how to lock the inode.  See the comment
 *		 for xfs_ilock() for a list of valid values.
 */
int
xfs_iget(
	xfs_mount_t	*mp,
	xfs_trans_t	*tp,
	xfs_ino_t	ino,
	uint		flags,
	uint		lock_flags,
	xfs_inode_t	**ipp)
{
	xfs_inode_t	*ip;
	int		error;
	xfs_perag_t	*pag;
	xfs_agino_t	agino;

	/*
	 * xfs_reclaim_inode() uses the ILOCK to ensure an inode
	 * doesn't get freed while it's being referenced during a
	 * radix tree traversal here.  It assumes this function
	 * acquires only the ILOCK (and therefore it has no need to
	 * involve the IOLOCK in this synchronization).
	 */
	ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);

	/* reject inode numbers outside existing AGs */
	if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
		return -EINVAL;

	XFS_STATS_INC(mp, xs_ig_attempts);

	/* get the perag structure and ensure that it's inode capable */
	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
	agino = XFS_INO_TO_AGINO(mp, ino);

again:
	error = 0;
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);

	if (ip) {
		error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
		if (error)
			goto out_error_or_again;
	} else {
		rcu_read_unlock();
		if (flags & XFS_IGET_INCORE) {
			error = -ENODATA;
			goto out_error_or_again;
		}
		XFS_STATS_INC(mp, xs_ig_missed);

		error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
							flags, lock_flags);
		if (error)
			goto out_error_or_again;
	}
	xfs_perag_put(pag);

	*ipp = ip;

	/*
	 * If we have a real type for an on-disk inode, we can setup the inode
	 * now.	 If it's a new inode being created, xfs_ialloc will handle it.
	 */
	if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0)
		xfs_setup_existing_inode(ip);
	return 0;

out_error_or_again:
	if (!(flags & XFS_IGET_INCORE) && error == -EAGAIN) {
		delay(1);
		goto again;
	}
	xfs_perag_put(pag);
	return error;
}

/*
 * "Is this a cached inode that's also allocated?"
 *
 * Look up an inode by number in the given file system.  If the inode is
 * in cache and isn't in purgatory, return 1 if the inode is allocated
 * and 0 if it is not.  For all other cases (not in cache, being torn
 * down, etc.), return a negative error code.
 *
 * The caller has to prevent inode allocation and freeing activity,
 * presumably by locking the AGI buffer.   This is to ensure that an
 * inode cannot transition from allocated to freed until the caller is
 * ready to allow that.  If the inode is in an intermediate state (new,
 * reclaimable, or being reclaimed), -EAGAIN will be returned; if the
 * inode is not in the cache, -ENOENT will be returned.  The caller must
 * deal with these scenarios appropriately.
 *
 * This is a specialized use case for the online scrubber; if you're
 * reading this, you probably want xfs_iget.
 */
int
xfs_icache_inode_is_allocated(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	bool			*inuse)
{
	struct xfs_inode	*ip;
	int			error;

	error = xfs_iget(mp, tp, ino, XFS_IGET_INCORE, 0, &ip);
	if (error)
		return error;

	*inuse = !!(VFS_I(ip)->i_mode);
	IRELE(ip);
	return 0;
}

/*
 * The inode lookup is done in batches to keep the amount of lock traffic and
 * radix tree lookups to a minimum. The batch size is a trade off between
 * lookup reduction and stack usage. This is in the reclaim path, so we can't
 * be too greedy.
 */
#define XFS_LOOKUP_BATCH	32

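/*
 * Decide whether an inode found during an AG walk can be grabbed: skip
 * stale RCU-freed inodes and inodes in reclaim, then take a VFS reference
 * so the inode can't be reclaimed while the walk operates on it.
 */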
STATIC int
xfs_inode_ag_walk_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	struct inode		*inode = VFS_I(ip);
	bool			newinos = !!(flags & XFS_AGITER_INEW_WAIT);

	ASSERT(rcu_read_lock_held());

	/*
	 * check for stale RCU freed inode
	 *
	 * If the inode has been reallocated, it doesn't matter if it's not in
	 * the AG we are walking - we are walking for writeback, so if it
	 * passes all the "valid inode" checks and is dirty, then we'll write
	 * it back anyway.  If it has been reallocated and still being
	 * initialised, the XFS_INEW check below will catch it.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!ip->i_ino)
		goto out_unlock_noent;

	/* avoid new or reclaimable inodes. Leave for reclaim code to flush */
	if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM))
		goto out_unlock_noent;
	spin_unlock(&ip->i_flags_lock);

	/* nothing to sync during shutdown */
	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
		return -EFSCORRUPTED;

	/* If we can't grab the inode, it must be on its way to reclaim. */
	if (!igrab(inode))
		return -ENOENT;

	/* inode is valid */
	return 0;

out_unlock_noent:
	spin_unlock(&ip->i_flags_lock);
	return -ENOENT;
}

STATIC int
xfs_inode_ag_walk(
	struct xfs_mount	*mp,
	struct xfs_perag	*pag,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag,
	int			iter_flags)
{
	uint32_t		first_index;
	int			last_error = 0;
	int			skipped;
	int			done;
	int			nr_found;

restart:
	done = 0;
	skipped = 0;
	first_index = 0;
	nr_found = 0;
	do {
		struct xfs_inode *batch[XFS_LOOKUP_BATCH];
		int		error = 0;
		int		i;

		rcu_read_lock();

		if (tag == -1)
			nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH);
		else
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **) batch, first_index,
					XFS_LOOKUP_BATCH, tag);

		if (!nr_found) {
			rcu_read_unlock();
			break;
		}

		/*
		 * Grab the inodes before we drop the lock. If we found
		 * nothing, nr == 0 and the loop will be skipped.
		 */
		for (i = 0; i < nr_found; i++) {
			struct xfs_inode *ip = batch[i];

			if (done || xfs_inode_ag_walk_grab(ip, iter_flags))
				batch[i] = NULL;

			/*
			 * Update the index for the next lookup. Catch
			 * overflows into the next AG range which can occur if
			 * we have inodes in the last block of the AG and we
			 * are currently pointing to the last inode.
			 *
			 * Because we may see inodes that are from the wrong AG
			 * due to RCU freeing and reallocation, only update the
			 * index if it lies in this AG. It was a race that led
			 * us to see this inode, so another lookup from the
			 * same index will not find it again.
			 */
			if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno)
				continue;
			first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
			if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
				done = 1;
		}

		/* unlock now that we've grabbed the inodes. */
		rcu_read_unlock();

		for (i = 0; i < nr_found; i++) {
			if (!batch[i])
				continue;
			if ((iter_flags & XFS_AGITER_INEW_WAIT) &&
			    xfs_iflags_test(batch[i], XFS_INEW))
				xfs_inew_wait(batch[i]);
			error = execute(batch[i], flags, args);
			IRELE(batch[i]);
			if (error == -EAGAIN) {
				skipped++;
				continue;
			}
			if (error && last_error != -EFSCORRUPTED)
				last_error = error;
		}

		/* bail out if the filesystem is corrupted. */
		if (error == -EFSCORRUPTED)
			break;

		cond_resched();

	} while (nr_found && !done);

	if (skipped) {
		delay(1);
		goto restart;
	}
	return last_error;
}

/*
 * Background scanning to trim post-EOF preallocated space. This is queued
 * based on the 'speculative_prealloc_lifetime' tunable (5m by default).
 */
void
xfs_queue_eofblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_eofblocks_work,
				   msecs_to_jiffies(xfs_eofb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_eofblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_eofblocks_work);
	xfs_icache_free_eofblocks(mp, NULL);
	xfs_queue_eofblocks(mp);
}

/*
 * Background scanning to trim preallocated CoW space. This is queued
 * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
 * (We'll just piggyback on the post-EOF prealloc space workqueue.)
 */
void
xfs_queue_cowblocks(
	struct xfs_mount *mp)
{
	rcu_read_lock();
	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
		queue_delayed_work(mp->m_eofblocks_workqueue,
				   &mp->m_cowblocks_work,
				   msecs_to_jiffies(xfs_cowb_secs * 1000));
	rcu_read_unlock();
}

void
xfs_cowblocks_worker(
	struct work_struct *work)
{
	struct xfs_mount *mp = container_of(to_delayed_work(work),
				struct xfs_mount, m_cowblocks_work);
	xfs_icache_free_cowblocks(mp, NULL);
	xfs_queue_cowblocks(mp);
}

int
xfs_inode_ag_iterator_flags(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			iter_flags)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get(mp, ag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1,
					  iter_flags);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

int
xfs_inode_ag_iterator(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args)
{
	return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0);
}

int
xfs_inode_ag_iterator_tag(
	struct xfs_mount	*mp,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			flags,
	void			*args,
	int			tag)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;

	ag = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
		ag = pag->pag_agno + 1;
		error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag,
					  0);
		xfs_perag_put(pag);
		if (error) {
			last_error = error;
			if (error == -EFSCORRUPTED)
				break;
		}
	}
	return last_error;
}

/*
 * Grab the inode for reclaim exclusively.
 * Return 0 if we grabbed it, non-zero otherwise.
 */
STATIC int
xfs_reclaim_inode_grab(
	struct xfs_inode	*ip,
	int			flags)
{
	ASSERT(rcu_read_lock_held());

	/* quick check for stale RCU freed inode */
	if (!ip->i_ino)
		return 1;

	/*
	 * If we are asked for non-blocking operation, do unlocked checks to
	 * see if the inode already is being flushed or in reclaim to avoid
	 * lock traffic.
	 */
	if ((flags & SYNC_TRYLOCK) &&
	    __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM))
		return 1;

	/*
	 * The radix tree lock here protects a thread in xfs_iget from racing
	 * with us starting reclaim on the inode.  Once we have the
	 * XFS_IRECLAIM flag set it will not touch us.
	 *
	 * Due to RCU lookup, we may find inodes that have been freed and only
	 * have XFS_IRECLAIM set.  Indeed, we may see reallocated inodes that
	 * aren't candidates for reclaim at all, so we must check the
	 * XFS_IRECLAIMABLE is set first before proceeding to reclaim.
	 */
	spin_lock(&ip->i_flags_lock);
	if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) ||
	    __xfs_iflags_test(ip, XFS_IRECLAIM)) {
		/* not a reclaim candidate. */
		spin_unlock(&ip->i_flags_lock);
		return 1;
	}
	__xfs_iflags_set(ip, XFS_IRECLAIM);
	spin_unlock(&ip->i_flags_lock);
	return 0;
}

/*
 * Inodes in different states need to be treated differently. The following
 * table lists the inode states and the reclaim actions necessary:
 *
 *	inode state	     iflush ret		required action
 *      ---------------      ----------         ---------------
 *	bad			-		reclaim
 *	shutdown		EIO		unpin and reclaim
 *	clean, unpinned		0		reclaim
 *	stale, unpinned		0		reclaim
 *	clean, pinned(*)	0		requeue
 *	stale, pinned		EAGAIN		requeue
 *	dirty, async		-		requeue
 *	dirty, sync		0		reclaim
 *
 * (*) dgc: I don't think the clean, pinned state is possible but it gets
 * handled anyway given the order of checks implemented.
 *
 * Also, because we get the flush lock first, we know that any inode that has
 * been flushed delwri has had the flush completed by the time we check that
 * the inode is clean.
 *
 * Note that because the inode is flushed delayed write by AIL pushing, the
 * flush lock may already be held here and waiting on it can result in very
 * long latencies.  Hence for sync reclaims, where we wait on the flush lock,
 * the caller should push the AIL first before trying to reclaim inodes to
 * minimise the amount of time spent waiting.  For background reclaim, we only
 * bother to reclaim clean inodes anyway.
 *
 * Hence the order of actions after gaining the locks should be:
 *	bad		=> reclaim
 *	shutdown	=> unpin and reclaim
 *	pinned, async	=> requeue
 *	pinned, sync	=> unpin
 *	stale		=> reclaim
 *	clean		=> reclaim
 *	dirty, async	=> requeue
 *	dirty, sync	=> flush, wait and reclaim
 */
STATIC int
xfs_reclaim_inode(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			sync_mode)
{
	struct xfs_buf		*bp = NULL;
	xfs_ino_t		ino = ip->i_ino; /* for radix_tree_delete */
	int			error;

restart:
	error = 0;
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	if (!xfs_iflock_nowait(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out;
		xfs_iflock(ip);
	}

	if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
		xfs_iunpin_wait(ip);
		/* xfs_iflush_abort() drops the flush lock */
		xfs_iflush_abort(ip, false);
		goto reclaim;
	}
	if (xfs_ipincount(ip)) {
		if (!(sync_mode & SYNC_WAIT))
			goto out_ifunlock;
		xfs_iunpin_wait(ip);
	}
	if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) {
		xfs_ifunlock(ip);
		goto reclaim;
	}

	/*
	 * Never flush out dirty data during non-blocking reclaim, as it would
	 * just contend with AIL pushing trying to do the same job.
	 */
	if (!(sync_mode & SYNC_WAIT))
		goto out_ifunlock;

	/*
	 * Now we have an inode that needs flushing.
	 *
	 * Note that xfs_iflush will never block on the inode buffer lock, as
	 * xfs_ifree_cluster() can lock the inode buffer before it locks the
	 * ip->i_lock, and we are doing the exact opposite here.  As a result,
	 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would
	 * result in an ABBA deadlock with xfs_ifree_cluster().
	 *
	 * As xfs_ifree_cluster() must gather all inodes that are active in the
	 * cache to mark them stale, if we hit this case we don't actually want
	 * to do IO here - we want the inode marked stale so we can simply
	 * reclaim it.  Hence if we get an EAGAIN error here,  just unlock the
	 * inode, back off and try again.  Hopefully the next pass through will
	 * see the stale flag set on the inode.
	 */
	error = xfs_iflush(ip, &bp);
	if (error == -EAGAIN) {
		xfs_iunlock(ip, XFS_ILOCK_EXCL);
		/* backoff longer than in xfs_ifree_cluster */
		delay(2);
		goto restart;
	}

	if (!error) {
		error = xfs_bwrite(bp);
		xfs_buf_relse(bp);
	}

reclaim:
	ASSERT(!xfs_isiflocked(ip));

	/*
	 * Because we use RCU freeing we need to ensure the inode always appears
	 * to be reclaimed with an invalid inode number when in the free state.
	 * We do this as early as possible under the ILOCK so that
	 * xfs_iflush_cluster() and xfs_ifree_cluster() can be guaranteed to
	 * detect races with us here. By doing this, we guarantee that once
	 * xfs_iflush_cluster() or xfs_ifree_cluster() has locked XFS_ILOCK that
	 * it will see either a valid inode that will serialise correctly, or it
	 * will see an invalid inode that it can skip.
	 */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;
	spin_unlock(&ip->i_flags_lock);

	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	XFS_STATS_INC(ip->i_mount, xs_ig_reclaims);
	/*
	 * Remove the inode from the per-AG radix tree.
	 *
	 * Because radix_tree_delete won't complain even if the item was never
	 * added to the tree, assert that it's been there before to catch
	 * problems with the inode life time early on.
	 */
	spin_lock(&pag->pag_ici_lock);
	if (!radix_tree_delete(&pag->pag_ici_root,
				XFS_INO_TO_AGINO(ip->i_mount, ino)))
		ASSERT(0);
	xfs_perag_clear_reclaim_tag(pag);
	spin_unlock(&pag->pag_ici_lock);

	/*
	 * Here we do an (almost) spurious inode lock in order to coordinate
	 * with inode cache radix tree lookups.  This is because the lookup
	 * can reference the inodes in the cache without taking references.
	 *
	 * We make that OK here by ensuring that we wait until the inode is
	 * unlocked after the lookup before we go ahead and free it.
	 */
	xfs_ilock(ip, XFS_ILOCK_EXCL);
	xfs_qm_dqdetach(ip);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);

	__xfs_inode_free(ip);
	return error;

out_ifunlock:
	xfs_ifunlock(ip);
out:
	xfs_iflags_clear(ip, XFS_IRECLAIM);
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	/*
	 * We could return -EAGAIN here to make reclaim rescan the inode tree in
	 * a short while. However, this just burns CPU time scanning the tree
	 * waiting for IO to complete and the reclaim work never goes back to
	 * the idle state. Instead, return 0 to let the next scheduled
	 * background reclaim attempt to reclaim the inode again.
	 */
	return 0;
}

/*
 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is
 * corrupted, we still want to try to reclaim all the inodes. If we don't,
 * then a shutdown during a filesystem unmount reclaim walk will leak all the
 * unreclaimed inodes.
 */
STATIC int
xfs_reclaim_inodes_ag(
	struct xfs_mount	*mp,
	int			flags,
	int			*nr_to_scan)
{
	struct xfs_perag	*pag;
	int			error = 0;
	int			last_error = 0;
	xfs_agnumber_t		ag;
	int			trylock = flags & SYNC_TRYLOCK;
	int			skipped;

restart:
	ag = 0;
	skipped = 0;
	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		unsigned long	first_index = 0;
		int		done = 0;
		int		nr_found = 0;

		ag = pag->pag_agno + 1;

		if (trylock) {
			if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) {
				skipped++;
				xfs_perag_put(pag);
				continue;
			}
			first_index = pag->pag_ici_reclaim_cursor;
		} else
			mutex_lock(&pag->pag_ici_reclaim_lock);

		do {
			struct xfs_inode *batch[XFS_LOOKUP_BATCH];
			int	i;

			rcu_read_lock();
			nr_found = radix_tree_gang_lookup_tag(
					&pag->pag_ici_root,
					(void **)batch, first_index,
					XFS_LOOKUP_BATCH,
					XFS_ICI_RECLAIM_TAG);
			if (!nr_found) {
				done = 1;
				rcu_read_unlock();
				break;
			}

			/*
			 * Grab the inodes before we drop the lock. If we found
			 * nothing, nr == 0 and the loop will be skipped.
			 */
			for (i = 0; i < nr_found; i++) {
				struct xfs_inode *ip = batch[i];

				if (done || xfs_reclaim_inode_grab(ip, flags))
					batch[i] = NULL;

				/*
				 * Update the index for the next lookup. Catch
				 * overflows into the next AG range which can
				 * occur if we have inodes in the last block of
				 * the AG and we are currently pointing to the
				 * last inode.
				 *
				 * Because we may see inodes that are from the
				 * wrong AG due to RCU freeing and
				 * reallocation, only update the index if it
				 * lies in this AG. It was a race that led us
				 * to see this inode, so another lookup from
				 * the same index will not find it again.
				 */
				if (XFS_INO_TO_AGNO(mp, ip->i_ino) !=
								pag->pag_agno)
					continue;
				first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
				if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino))
					done = 1;
			}

			/* unlock now that we've grabbed the inodes. */
			rcu_read_unlock();

			for (i = 0; i < nr_found; i++) {
				if (!batch[i])
					continue;
				error = xfs_reclaim_inode(batch[i], pag, flags);
				if (error && last_error != -EFSCORRUPTED)
					last_error = error;
			}

			*nr_to_scan -= XFS_LOOKUP_BATCH;

			cond_resched();

		} while (nr_found && !done && *nr_to_scan > 0);

		if (trylock && !done)
			pag->pag_ici_reclaim_cursor = first_index;
		else
			pag->pag_ici_reclaim_cursor = 0;
		mutex_unlock(&pag->pag_ici_reclaim_lock);
		xfs_perag_put(pag);
	}

	/*
	 * if we skipped any AG, and we still have scan count remaining, do
	 * another pass this time using blocking reclaim semantics (i.e.
	 * waiting on the reclaim locks and ignoring the reclaim cursors). This
	 * ensures that when we get more reclaimers than AGs we block rather
	 * than spin trying to execute reclaim.
	 */
	if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) {
		trylock = 0;
		goto restart;
	}
	return last_error;
}

int
xfs_reclaim_inodes(
	xfs_mount_t	*mp,
	int		mode)
{
	int		nr_to_scan = INT_MAX;

	return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan);
}

/*
 * Scan a certain number of inodes for reclaim.
 *
 * When called we make sure that there is a background (fast) inode reclaim in
 * progress, while we will throttle the speed of reclaim via doing synchronous
 * reclaim of inodes. That means if we come across dirty inodes, we wait for
 * them to be cleaned, which we hope will not be very long due to the
 * background walker having already kicked the IO off on those dirty inodes.
 */
long
xfs_reclaim_inodes_nr(
	struct xfs_mount	*mp,
	int			nr_to_scan)
{
	/* kick background reclaimer and push the AIL */
	xfs_reclaim_work_queue(mp);
	xfs_ail_push_all(mp->m_ail);

	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
}

/*
 * Return the number of reclaimable inodes in the filesystem for
 * the shrinker to determine how much to reclaim.
 */
int
xfs_reclaim_inodes_count(
	struct xfs_mount	*mp)
{
	struct xfs_perag	*pag;
	xfs_agnumber_t		ag = 0;
	int			reclaimable = 0;

	while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) {
		ag = pag->pag_agno + 1;
		reclaimable += pag->pag_ici_reclaimable;
		xfs_perag_put(pag);
	}
	return reclaimable;
}

STATIC int
xfs_inode_match_id(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return 0;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return 0;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    xfs_get_projid(ip) != eofb->eof_prid)
		return 0;

	return 1;
}

/*
 * A union-based inode filtering algorithm. Process the inode if any of the
 * criteria match.
 * This is for global/internal scans only.
 */
STATIC int
xfs_inode_match_id_union(
	struct xfs_inode	*ip,
	struct xfs_eofblocks	*eofb)
{
	if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) &&
	    uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid))
		return 1;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) &&
	    gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid))
		return 1;

	if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) &&
	    xfs_get_projid(ip) == eofb->eof_prid)
		return 1;

	return 0;
}

STATIC int
xfs_inode_free_eofblocks(
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	int ret = 0;
	struct xfs_eofblocks *eofb = args;
	int match;

	if (!xfs_can_free_eofblocks(ip, false)) {
		/* inode could be preallocated or append-only */
		trace_xfs_inode_free_eofblocks_invalid(ip);
		xfs_inode_clear_eofblocks_tag(ip);
		return 0;
	}

	/*
	 * If the mapping is dirty the operation can block and wait for some
	 * time. Unless we are waiting, skip it.
	 */
	if (!(flags & SYNC_WAIT) &&
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
		return 0;

	if (eofb) {
		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
			match = xfs_inode_match_id_union(ip, eofb);
		else
			match = xfs_inode_match_id(ip, eofb);
		if (!match)
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
	}

	/*
	 * If the caller is waiting, return -EAGAIN to keep the background
	 * scanner moving and revisit the inode in a subsequent pass.
	 */
	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
		if (flags & SYNC_WAIT)
			ret = -EAGAIN;
		return ret;
	}
	ret = xfs_free_eofblocks(ip);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

static int
__xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb,
	int			(*execute)(struct xfs_inode *ip, int flags,
					   void *args),
	int			tag)
{
	int flags = SYNC_TRYLOCK;

	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
		flags = SYNC_WAIT;

	return xfs_inode_ag_iterator_tag(mp, execute, flags,
					 eofb, tag);
}

int
xfs_icache_free_eofblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Run eofblocks scans on the quotas applicable to the inode. For inodes with
 * multiple quotas, we don't know exactly which quota caused an allocation
 * failure. We make a best effort by including each quota under low free space
 * conditions (less than 1% free space) in the scan.
 */
static int
__xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip,
	int (*execute)(struct xfs_mount *mp,
		       struct xfs_eofblocks	*eofb))
{
	int scan = 0;
	struct xfs_eofblocks eofb = {0};
	struct xfs_dquot *dq;

	/*
	 * Run a sync scan to increase effectiveness and use the union filter to
	 * cover all applicable quotas in a single scan.
	 */
	eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC;

	if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_USER);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_uid = VFS_I(ip)->i_uid;
			eofb.eof_flags |= XFS_EOF_FLAGS_UID;
			scan = 1;
		}
	}

	if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) {
		dq = xfs_inode_dquot(ip, XFS_DQ_GROUP);
		if (dq && xfs_dquot_lowsp(dq)) {
			eofb.eof_gid = VFS_I(ip)->i_gid;
			eofb.eof_flags |= XFS_EOF_FLAGS_GID;
			scan = 1;
		}
	}

	if (scan)
		execute(ip->i_mount, &eofb);

	return scan;
}

int
xfs_inode_free_quota_eofblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
}

static inline unsigned long
xfs_iflag_for_tag(
	int		tag)
{
	switch (tag) {
	case XFS_ICI_EOFBLOCKS_TAG:
		return XFS_IEOFBLOCKS;
	case XFS_ICI_COWBLOCKS_TAG:
		return XFS_ICOWBLOCKS;
	default:
		ASSERT(0);
		return 0;
	}
}

static void
__xfs_inode_set_blocks_tag(
	xfs_inode_t	*ip,
	void		(*execute)(struct xfs_mount *mp),
	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				  int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;
	int tagged;

	/*
	 * Don't bother locking the AG and looking up in the radix trees
	 * if we already know that we have the tag set.
	 */
	if (ip->i_flags & xfs_iflag_for_tag(tag))
		return;
	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
	radix_tree_tag_set(&pag->pag_ici_root,
			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!tagged) {
		/* propagate the eofblocks tag up into the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				   tag);
		spin_unlock(&ip->i_mount->m_perag_lock);

		/* kick off background trimming */
		execute(ip->i_mount);

		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_set_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_eofblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_eofblocks,
			trace_xfs_perag_set_eofblocks,
			XFS_ICI_EOFBLOCKS_TAG);
}

static void
__xfs_inode_clear_blocks_tag(
	xfs_inode_t	*ip,
	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
				    int error, unsigned long caller_ip),
	int		tag)
{
	struct xfs_mount *mp = ip->i_mount;
	struct xfs_perag *pag;

	spin_lock(&ip->i_flags_lock);
	ip->i_flags &= ~xfs_iflag_for_tag(tag);
	spin_unlock(&ip->i_flags_lock);

	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
	spin_lock(&pag->pag_ici_lock);

	radix_tree_tag_clear(&pag->pag_ici_root,
			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
		/* clear the eofblocks tag from the perag radix tree */
		spin_lock(&ip->i_mount->m_perag_lock);
		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
				     tag);
		spin_unlock(&ip->i_mount->m_perag_lock);
		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
	}

	spin_unlock(&pag->pag_ici_lock);
	xfs_perag_put(pag);
}

void
xfs_inode_clear_eofblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_eofblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
}

/*
 * Set ourselves up to free CoW blocks from this file.  If it's already clean
 * then we can bail out quickly, but otherwise we must back off if the file
 * is undergoing some kind of write.
 */
static bool
xfs_prep_free_cowblocks(
	struct xfs_inode	*ip,
	struct xfs_ifork	*ifp)
{
	/*
	 * Just clear the tag if we have an empty cow fork or none at all. It's
	 * possible the inode was fully unshared since it was originally tagged.
	 */
	if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) {
		trace_xfs_inode_free_cowblocks_invalid(ip);
		xfs_inode_clear_cowblocks_tag(ip);
		return false;
	}

	/*
	 * If the mapping is dirty or under writeback we cannot touch the
	 * CoW fork.  Leave it alone if we're in the midst of a directio.
	 */
	if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
	    atomic_read(&VFS_I(ip)->i_dio_count))
		return false;

	return true;
}

/*
 * Automatic CoW Reservation Freeing
 *
 * These functions automatically garbage collect leftover CoW reservations
 * that were made on behalf of a cowextsize hint when we start to run out
 * of quota or when the reservations sit around for too long.  If the file
 * has dirty pages or is undergoing writeback, its CoW reservations will
 * be retained.
 *
 * The actual garbage collection piggybacks off the same code that runs
 * the speculative EOF preallocation garbage collector.
 */
STATIC int
xfs_inode_free_cowblocks(
	struct xfs_inode	*ip,
	int			flags,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;
	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
	int			match;
	int			ret = 0;

	if (!xfs_prep_free_cowblocks(ip, ifp))
		return 0;

	if (eofb) {
		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
			match = xfs_inode_match_id_union(ip, eofb);
		else
			match = xfs_inode_match_id(ip, eofb);
		if (!match)
			return 0;

		/* skip the inode if the file size is too small */
		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
			return 0;
	}

	/* Free the CoW blocks */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);

	/*
	 * Check again, nobody else should be able to dirty blocks or change
	 * the reflink iflag now that we have the first two locks held.
	 */
	if (xfs_prep_free_cowblocks(ip, ifp))
		ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF, false);

	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);

	return ret;
}

int
xfs_icache_free_cowblocks(
	struct xfs_mount	*mp,
	struct xfs_eofblocks	*eofb)
{
	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

int
xfs_inode_free_quota_cowblocks(
	struct xfs_inode *ip)
{
	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
}

void
xfs_inode_set_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_set_cowblocks_tag(ip);
	return __xfs_inode_set_blocks_tag(ip, xfs_queue_cowblocks,
			trace_xfs_perag_set_cowblocks,
			XFS_ICI_COWBLOCKS_TAG);
}

void
xfs_inode_clear_cowblocks_tag(
	xfs_inode_t	*ip)
{
	trace_xfs_inode_clear_cowblocks_tag(ip);
	return __xfs_inode_clear_blocks_tag(ip,
			trace_xfs_perag_clear_cowblocks, XFS_ICI_COWBLOCKS_TAG);
}

/* Disable post-EOF and CoW block auto-reclamation. */
void
xfs_icache_disable_reclaim(
	struct xfs_mount	*mp)
{
	cancel_delayed_work_sync(&mp->m_eofblocks_work);
	cancel_delayed_work_sync(&mp->m_cowblocks_work);
}

/* Enable post-EOF and CoW block auto-reclamation. */
void
xfs_icache_enable_reclaim(
	struct xfs_mount	*mp)
{
	xfs_queue_eofblocks(mp);
	xfs_queue_cowblocks(mp);
}