1 /* 2 * Copyright (c) 2000-2005 Silicon Graphics, Inc. 3 * All Rights Reserved. 4 * 5 * This program is free software; you can redistribute it and/or 6 * modify it under the terms of the GNU General Public License as 7 * published by the Free Software Foundation. 8 * 9 * This program is distributed in the hope that it would be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, write the Free Software Foundation, 16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 17 */ 18 #include "xfs.h" 19 #include "xfs_fs.h" 20 #include "xfs_format.h" 21 #include "xfs_log_format.h" 22 #include "xfs_trans_resv.h" 23 #include "xfs_sb.h" 24 #include "xfs_mount.h" 25 #include "xfs_inode.h" 26 #include "xfs_error.h" 27 #include "xfs_trans.h" 28 #include "xfs_trans_priv.h" 29 #include "xfs_inode_item.h" 30 #include "xfs_quota.h" 31 #include "xfs_trace.h" 32 #include "xfs_icache.h" 33 #include "xfs_bmap_util.h" 34 #include "xfs_dquot_item.h" 35 #include "xfs_dquot.h" 36 37 #include <linux/kthread.h> 38 #include <linux/freezer.h> 39 40 /* 41 * Allocate and initialise an xfs_inode. 42 */ 43 struct xfs_inode * 44 xfs_inode_alloc( 45 struct xfs_mount *mp, 46 xfs_ino_t ino) 47 { 48 struct xfs_inode *ip; 49 50 /* 51 * if this didn't occur in transactions, we could use 52 * KM_MAYFAIL and return NULL here on ENOMEM. Set the 53 * code up to do this anyway. 54 */ 55 ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); 56 if (!ip) 57 return NULL; 58 if (inode_init_always(mp->m_super, VFS_I(ip))) { 59 kmem_zone_free(xfs_inode_zone, ip); 60 return NULL; 61 } 62 63 /* VFS doesn't initialise i_mode! */ 64 VFS_I(ip)->i_mode = 0; 65 66 XFS_STATS_INC(mp, vn_active); 67 ASSERT(atomic_read(&ip->i_pincount) == 0); 68 ASSERT(!spin_is_locked(&ip->i_flags_lock)); 69 ASSERT(!xfs_isiflocked(ip)); 70 ASSERT(ip->i_ino == 0); 71 72 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 73 74 /* initialise the xfs inode */ 75 ip->i_ino = ino; 76 ip->i_mount = mp; 77 memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); 78 ip->i_afp = NULL; 79 memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); 80 ip->i_flags = 0; 81 ip->i_delayed_blks = 0; 82 memset(&ip->i_d, 0, sizeof(ip->i_d)); 83 84 return ip; 85 } 86 87 STATIC void 88 xfs_inode_free_callback( 89 struct rcu_head *head) 90 { 91 struct inode *inode = container_of(head, struct inode, i_rcu); 92 struct xfs_inode *ip = XFS_I(inode); 93 94 switch (VFS_I(ip)->i_mode & S_IFMT) { 95 case S_IFREG: 96 case S_IFDIR: 97 case S_IFLNK: 98 xfs_idestroy_fork(ip, XFS_DATA_FORK); 99 break; 100 } 101 102 if (ip->i_afp) 103 xfs_idestroy_fork(ip, XFS_ATTR_FORK); 104 105 if (ip->i_itemp) { 106 ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); 107 xfs_inode_item_destroy(ip); 108 ip->i_itemp = NULL; 109 } 110 111 kmem_zone_free(xfs_inode_zone, ip); 112 } 113 114 static void 115 __xfs_inode_free( 116 struct xfs_inode *ip) 117 { 118 /* asserts to verify all state is correct here */ 119 ASSERT(atomic_read(&ip->i_pincount) == 0); 120 ASSERT(!xfs_isiflocked(ip)); 121 XFS_STATS_DEC(ip->i_mount, vn_active); 122 123 call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); 124 } 125 126 void 127 xfs_inode_free( 128 struct xfs_inode *ip) 129 { 130 /* 131 * Because we use RCU freeing we need to ensure the inode always 132 * appears to be reclaimed with an invalid inode number when in the 133 * free state. The ip->i_flags_lock provides the barrier against lookup 134 * races. 135 */ 136 spin_lock(&ip->i_flags_lock); 137 ip->i_flags = XFS_IRECLAIM; 138 ip->i_ino = 0; 139 spin_unlock(&ip->i_flags_lock); 140 141 __xfs_inode_free(ip); 142 } 143 144 /* 145 * Queue a new inode reclaim pass if there are reclaimable inodes and there 146 * isn't a reclaim pass already in progress. By default it runs every 5s based 147 * on the xfs periodic sync default of 30s. Perhaps this should have it's own 148 * tunable, but that can be done if this method proves to be ineffective or too 149 * aggressive. 150 */ 151 static void 152 xfs_reclaim_work_queue( 153 struct xfs_mount *mp) 154 { 155 156 rcu_read_lock(); 157 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { 158 queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, 159 msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); 160 } 161 rcu_read_unlock(); 162 } 163 164 /* 165 * This is a fast pass over the inode cache to try to get reclaim moving on as 166 * many inodes as possible in a short period of time. It kicks itself every few 167 * seconds, as well as being kicked by the inode cache shrinker when memory 168 * goes low. It scans as quickly as possible avoiding locked inodes or those 169 * already being flushed, and once done schedules a future pass. 170 */ 171 void 172 xfs_reclaim_worker( 173 struct work_struct *work) 174 { 175 struct xfs_mount *mp = container_of(to_delayed_work(work), 176 struct xfs_mount, m_reclaim_work); 177 178 xfs_reclaim_inodes(mp, SYNC_TRYLOCK); 179 xfs_reclaim_work_queue(mp); 180 } 181 182 static void 183 xfs_perag_set_reclaim_tag( 184 struct xfs_perag *pag) 185 { 186 struct xfs_mount *mp = pag->pag_mount; 187 188 ASSERT(spin_is_locked(&pag->pag_ici_lock)); 189 if (pag->pag_ici_reclaimable++) 190 return; 191 192 /* propagate the reclaim tag up into the perag radix tree */ 193 spin_lock(&mp->m_perag_lock); 194 radix_tree_tag_set(&mp->m_perag_tree, pag->pag_agno, 195 XFS_ICI_RECLAIM_TAG); 196 spin_unlock(&mp->m_perag_lock); 197 198 /* schedule periodic background inode reclaim */ 199 xfs_reclaim_work_queue(mp); 200 201 trace_xfs_perag_set_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 202 } 203 204 static void 205 xfs_perag_clear_reclaim_tag( 206 struct xfs_perag *pag) 207 { 208 struct xfs_mount *mp = pag->pag_mount; 209 210 ASSERT(spin_is_locked(&pag->pag_ici_lock)); 211 if (--pag->pag_ici_reclaimable) 212 return; 213 214 /* clear the reclaim tag from the perag radix tree */ 215 spin_lock(&mp->m_perag_lock); 216 radix_tree_tag_clear(&mp->m_perag_tree, pag->pag_agno, 217 XFS_ICI_RECLAIM_TAG); 218 spin_unlock(&mp->m_perag_lock); 219 trace_xfs_perag_clear_reclaim(mp, pag->pag_agno, -1, _RET_IP_); 220 } 221 222 223 /* 224 * We set the inode flag atomically with the radix tree tag. 225 * Once we get tag lookups on the radix tree, this inode flag 226 * can go away. 227 */ 228 void 229 xfs_inode_set_reclaim_tag( 230 struct xfs_inode *ip) 231 { 232 struct xfs_mount *mp = ip->i_mount; 233 struct xfs_perag *pag; 234 235 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 236 spin_lock(&pag->pag_ici_lock); 237 spin_lock(&ip->i_flags_lock); 238 239 radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), 240 XFS_ICI_RECLAIM_TAG); 241 xfs_perag_set_reclaim_tag(pag); 242 __xfs_iflags_set(ip, XFS_IRECLAIMABLE); 243 244 spin_unlock(&ip->i_flags_lock); 245 spin_unlock(&pag->pag_ici_lock); 246 xfs_perag_put(pag); 247 } 248 249 STATIC void 250 xfs_inode_clear_reclaim_tag( 251 struct xfs_perag *pag, 252 xfs_ino_t ino) 253 { 254 radix_tree_tag_clear(&pag->pag_ici_root, 255 XFS_INO_TO_AGINO(pag->pag_mount, ino), 256 XFS_ICI_RECLAIM_TAG); 257 xfs_perag_clear_reclaim_tag(pag); 258 } 259 260 /* 261 * When we recycle a reclaimable inode, we need to re-initialise the VFS inode 262 * part of the structure. This is made more complex by the fact we store 263 * information about the on-disk values in the VFS inode and so we can't just 264 * overwrite the values unconditionally. Hence we save the parameters we 265 * need to retain across reinitialisation, and rewrite them into the VFS inode 266 * after reinitialisation even if it fails. 267 */ 268 static int 269 xfs_reinit_inode( 270 struct xfs_mount *mp, 271 struct inode *inode) 272 { 273 int error; 274 uint32_t nlink = inode->i_nlink; 275 uint32_t generation = inode->i_generation; 276 uint64_t version = inode->i_version; 277 umode_t mode = inode->i_mode; 278 279 error = inode_init_always(mp->m_super, inode); 280 281 set_nlink(inode, nlink); 282 inode->i_generation = generation; 283 inode->i_version = version; 284 inode->i_mode = mode; 285 return error; 286 } 287 288 /* 289 * Check the validity of the inode we just found it the cache 290 */ 291 static int 292 xfs_iget_cache_hit( 293 struct xfs_perag *pag, 294 struct xfs_inode *ip, 295 xfs_ino_t ino, 296 int flags, 297 int lock_flags) __releases(RCU) 298 { 299 struct inode *inode = VFS_I(ip); 300 struct xfs_mount *mp = ip->i_mount; 301 int error; 302 303 /* 304 * check for re-use of an inode within an RCU grace period due to the 305 * radix tree nodes not being updated yet. We monitor for this by 306 * setting the inode number to zero before freeing the inode structure. 307 * If the inode has been reallocated and set up, then the inode number 308 * will not match, so check for that, too. 309 */ 310 spin_lock(&ip->i_flags_lock); 311 if (ip->i_ino != ino) { 312 trace_xfs_iget_skip(ip); 313 XFS_STATS_INC(mp, xs_ig_frecycle); 314 error = -EAGAIN; 315 goto out_error; 316 } 317 318 319 /* 320 * If we are racing with another cache hit that is currently 321 * instantiating this inode or currently recycling it out of 322 * reclaimabe state, wait for the initialisation to complete 323 * before continuing. 324 * 325 * XXX(hch): eventually we should do something equivalent to 326 * wait_on_inode to wait for these flags to be cleared 327 * instead of polling for it. 328 */ 329 if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { 330 trace_xfs_iget_skip(ip); 331 XFS_STATS_INC(mp, xs_ig_frecycle); 332 error = -EAGAIN; 333 goto out_error; 334 } 335 336 /* 337 * If lookup is racing with unlink return an error immediately. 338 */ 339 if (VFS_I(ip)->i_mode == 0 && !(flags & XFS_IGET_CREATE)) { 340 error = -ENOENT; 341 goto out_error; 342 } 343 344 /* 345 * If IRECLAIMABLE is set, we've torn down the VFS inode already. 346 * Need to carefully get it back into useable state. 347 */ 348 if (ip->i_flags & XFS_IRECLAIMABLE) { 349 trace_xfs_iget_reclaim(ip); 350 351 /* 352 * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode 353 * from stomping over us while we recycle the inode. We can't 354 * clear the radix tree reclaimable tag yet as it requires 355 * pag_ici_lock to be held exclusive. 356 */ 357 ip->i_flags |= XFS_IRECLAIM; 358 359 spin_unlock(&ip->i_flags_lock); 360 rcu_read_unlock(); 361 362 error = xfs_reinit_inode(mp, inode); 363 if (error) { 364 /* 365 * Re-initializing the inode failed, and we are in deep 366 * trouble. Try to re-add it to the reclaim list. 367 */ 368 rcu_read_lock(); 369 spin_lock(&ip->i_flags_lock); 370 371 ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); 372 ASSERT(ip->i_flags & XFS_IRECLAIMABLE); 373 trace_xfs_iget_reclaim_fail(ip); 374 goto out_error; 375 } 376 377 spin_lock(&pag->pag_ici_lock); 378 spin_lock(&ip->i_flags_lock); 379 380 /* 381 * Clear the per-lifetime state in the inode as we are now 382 * effectively a new inode and need to return to the initial 383 * state before reuse occurs. 384 */ 385 ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; 386 ip->i_flags |= XFS_INEW; 387 xfs_inode_clear_reclaim_tag(pag, ip->i_ino); 388 inode->i_state = I_NEW; 389 390 ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); 391 mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); 392 393 spin_unlock(&ip->i_flags_lock); 394 spin_unlock(&pag->pag_ici_lock); 395 } else { 396 /* If the VFS inode is being torn down, pause and try again. */ 397 if (!igrab(inode)) { 398 trace_xfs_iget_skip(ip); 399 error = -EAGAIN; 400 goto out_error; 401 } 402 403 /* We've got a live one. */ 404 spin_unlock(&ip->i_flags_lock); 405 rcu_read_unlock(); 406 trace_xfs_iget_hit(ip); 407 } 408 409 if (lock_flags != 0) 410 xfs_ilock(ip, lock_flags); 411 412 xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); 413 XFS_STATS_INC(mp, xs_ig_found); 414 415 return 0; 416 417 out_error: 418 spin_unlock(&ip->i_flags_lock); 419 rcu_read_unlock(); 420 return error; 421 } 422 423 424 static int 425 xfs_iget_cache_miss( 426 struct xfs_mount *mp, 427 struct xfs_perag *pag, 428 xfs_trans_t *tp, 429 xfs_ino_t ino, 430 struct xfs_inode **ipp, 431 int flags, 432 int lock_flags) 433 { 434 struct xfs_inode *ip; 435 int error; 436 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); 437 int iflags; 438 439 ip = xfs_inode_alloc(mp, ino); 440 if (!ip) 441 return -ENOMEM; 442 443 error = xfs_iread(mp, tp, ip, flags); 444 if (error) 445 goto out_destroy; 446 447 trace_xfs_iget_miss(ip); 448 449 if ((VFS_I(ip)->i_mode == 0) && !(flags & XFS_IGET_CREATE)) { 450 error = -ENOENT; 451 goto out_destroy; 452 } 453 454 /* 455 * Preload the radix tree so we can insert safely under the 456 * write spinlock. Note that we cannot sleep inside the preload 457 * region. Since we can be called from transaction context, don't 458 * recurse into the file system. 459 */ 460 if (radix_tree_preload(GFP_NOFS)) { 461 error = -EAGAIN; 462 goto out_destroy; 463 } 464 465 /* 466 * Because the inode hasn't been added to the radix-tree yet it can't 467 * be found by another thread, so we can do the non-sleeping lock here. 468 */ 469 if (lock_flags) { 470 if (!xfs_ilock_nowait(ip, lock_flags)) 471 BUG(); 472 } 473 474 /* 475 * These values must be set before inserting the inode into the radix 476 * tree as the moment it is inserted a concurrent lookup (allowed by the 477 * RCU locking mechanism) can find it and that lookup must see that this 478 * is an inode currently under construction (i.e. that XFS_INEW is set). 479 * The ip->i_flags_lock that protects the XFS_INEW flag forms the 480 * memory barrier that ensures this detection works correctly at lookup 481 * time. 482 */ 483 iflags = XFS_INEW; 484 if (flags & XFS_IGET_DONTCACHE) 485 iflags |= XFS_IDONTCACHE; 486 ip->i_udquot = NULL; 487 ip->i_gdquot = NULL; 488 ip->i_pdquot = NULL; 489 xfs_iflags_set(ip, iflags); 490 491 /* insert the new inode */ 492 spin_lock(&pag->pag_ici_lock); 493 error = radix_tree_insert(&pag->pag_ici_root, agino, ip); 494 if (unlikely(error)) { 495 WARN_ON(error != -EEXIST); 496 XFS_STATS_INC(mp, xs_ig_dup); 497 error = -EAGAIN; 498 goto out_preload_end; 499 } 500 spin_unlock(&pag->pag_ici_lock); 501 radix_tree_preload_end(); 502 503 *ipp = ip; 504 return 0; 505 506 out_preload_end: 507 spin_unlock(&pag->pag_ici_lock); 508 radix_tree_preload_end(); 509 if (lock_flags) 510 xfs_iunlock(ip, lock_flags); 511 out_destroy: 512 __destroy_inode(VFS_I(ip)); 513 xfs_inode_free(ip); 514 return error; 515 } 516 517 /* 518 * Look up an inode by number in the given file system. 519 * The inode is looked up in the cache held in each AG. 520 * If the inode is found in the cache, initialise the vfs inode 521 * if necessary. 522 * 523 * If it is not in core, read it in from the file system's device, 524 * add it to the cache and initialise the vfs inode. 525 * 526 * The inode is locked according to the value of the lock_flags parameter. 527 * This flag parameter indicates how and if the inode's IO lock and inode lock 528 * should be taken. 529 * 530 * mp -- the mount point structure for the current file system. It points 531 * to the inode hash table. 532 * tp -- a pointer to the current transaction if there is one. This is 533 * simply passed through to the xfs_iread() call. 534 * ino -- the number of the inode desired. This is the unique identifier 535 * within the file system for the inode being requested. 536 * lock_flags -- flags indicating how to lock the inode. See the comment 537 * for xfs_ilock() for a list of valid values. 538 */ 539 int 540 xfs_iget( 541 xfs_mount_t *mp, 542 xfs_trans_t *tp, 543 xfs_ino_t ino, 544 uint flags, 545 uint lock_flags, 546 xfs_inode_t **ipp) 547 { 548 xfs_inode_t *ip; 549 int error; 550 xfs_perag_t *pag; 551 xfs_agino_t agino; 552 553 /* 554 * xfs_reclaim_inode() uses the ILOCK to ensure an inode 555 * doesn't get freed while it's being referenced during a 556 * radix tree traversal here. It assumes this function 557 * aqcuires only the ILOCK (and therefore it has no need to 558 * involve the IOLOCK in this synchronization). 559 */ 560 ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); 561 562 /* reject inode numbers outside existing AGs */ 563 if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) 564 return -EINVAL; 565 566 XFS_STATS_INC(mp, xs_ig_attempts); 567 568 /* get the perag structure and ensure that it's inode capable */ 569 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 570 agino = XFS_INO_TO_AGINO(mp, ino); 571 572 again: 573 error = 0; 574 rcu_read_lock(); 575 ip = radix_tree_lookup(&pag->pag_ici_root, agino); 576 577 if (ip) { 578 error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); 579 if (error) 580 goto out_error_or_again; 581 } else { 582 rcu_read_unlock(); 583 XFS_STATS_INC(mp, xs_ig_missed); 584 585 error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, 586 flags, lock_flags); 587 if (error) 588 goto out_error_or_again; 589 } 590 xfs_perag_put(pag); 591 592 *ipp = ip; 593 594 /* 595 * If we have a real type for an on-disk inode, we can setup the inode 596 * now. If it's a new inode being created, xfs_ialloc will handle it. 597 */ 598 if (xfs_iflags_test(ip, XFS_INEW) && VFS_I(ip)->i_mode != 0) 599 xfs_setup_existing_inode(ip); 600 return 0; 601 602 out_error_or_again: 603 if (error == -EAGAIN) { 604 delay(1); 605 goto again; 606 } 607 xfs_perag_put(pag); 608 return error; 609 } 610 611 /* 612 * The inode lookup is done in batches to keep the amount of lock traffic and 613 * radix tree lookups to a minimum. The batch size is a trade off between 614 * lookup reduction and stack usage. This is in the reclaim path, so we can't 615 * be too greedy. 616 */ 617 #define XFS_LOOKUP_BATCH 32 618 619 STATIC int 620 xfs_inode_ag_walk_grab( 621 struct xfs_inode *ip) 622 { 623 struct inode *inode = VFS_I(ip); 624 625 ASSERT(rcu_read_lock_held()); 626 627 /* 628 * check for stale RCU freed inode 629 * 630 * If the inode has been reallocated, it doesn't matter if it's not in 631 * the AG we are walking - we are walking for writeback, so if it 632 * passes all the "valid inode" checks and is dirty, then we'll write 633 * it back anyway. If it has been reallocated and still being 634 * initialised, the XFS_INEW check below will catch it. 635 */ 636 spin_lock(&ip->i_flags_lock); 637 if (!ip->i_ino) 638 goto out_unlock_noent; 639 640 /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ 641 if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) 642 goto out_unlock_noent; 643 spin_unlock(&ip->i_flags_lock); 644 645 /* nothing to sync during shutdown */ 646 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) 647 return -EFSCORRUPTED; 648 649 /* If we can't grab the inode, it must on it's way to reclaim. */ 650 if (!igrab(inode)) 651 return -ENOENT; 652 653 /* inode is valid */ 654 return 0; 655 656 out_unlock_noent: 657 spin_unlock(&ip->i_flags_lock); 658 return -ENOENT; 659 } 660 661 STATIC int 662 xfs_inode_ag_walk( 663 struct xfs_mount *mp, 664 struct xfs_perag *pag, 665 int (*execute)(struct xfs_inode *ip, int flags, 666 void *args), 667 int flags, 668 void *args, 669 int tag) 670 { 671 uint32_t first_index; 672 int last_error = 0; 673 int skipped; 674 int done; 675 int nr_found; 676 677 restart: 678 done = 0; 679 skipped = 0; 680 first_index = 0; 681 nr_found = 0; 682 do { 683 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 684 int error = 0; 685 int i; 686 687 rcu_read_lock(); 688 689 if (tag == -1) 690 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, 691 (void **)batch, first_index, 692 XFS_LOOKUP_BATCH); 693 else 694 nr_found = radix_tree_gang_lookup_tag( 695 &pag->pag_ici_root, 696 (void **) batch, first_index, 697 XFS_LOOKUP_BATCH, tag); 698 699 if (!nr_found) { 700 rcu_read_unlock(); 701 break; 702 } 703 704 /* 705 * Grab the inodes before we drop the lock. if we found 706 * nothing, nr == 0 and the loop will be skipped. 707 */ 708 for (i = 0; i < nr_found; i++) { 709 struct xfs_inode *ip = batch[i]; 710 711 if (done || xfs_inode_ag_walk_grab(ip)) 712 batch[i] = NULL; 713 714 /* 715 * Update the index for the next lookup. Catch 716 * overflows into the next AG range which can occur if 717 * we have inodes in the last block of the AG and we 718 * are currently pointing to the last inode. 719 * 720 * Because we may see inodes that are from the wrong AG 721 * due to RCU freeing and reallocation, only update the 722 * index if it lies in this AG. It was a race that lead 723 * us to see this inode, so another lookup from the 724 * same index will not find it again. 725 */ 726 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != pag->pag_agno) 727 continue; 728 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 729 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 730 done = 1; 731 } 732 733 /* unlock now we've grabbed the inodes. */ 734 rcu_read_unlock(); 735 736 for (i = 0; i < nr_found; i++) { 737 if (!batch[i]) 738 continue; 739 error = execute(batch[i], flags, args); 740 IRELE(batch[i]); 741 if (error == -EAGAIN) { 742 skipped++; 743 continue; 744 } 745 if (error && last_error != -EFSCORRUPTED) 746 last_error = error; 747 } 748 749 /* bail out if the filesystem is corrupted. */ 750 if (error == -EFSCORRUPTED) 751 break; 752 753 cond_resched(); 754 755 } while (nr_found && !done); 756 757 if (skipped) { 758 delay(1); 759 goto restart; 760 } 761 return last_error; 762 } 763 764 /* 765 * Background scanning to trim post-EOF preallocated space. This is queued 766 * based on the 'speculative_prealloc_lifetime' tunable (5m by default). 767 */ 768 void 769 xfs_queue_eofblocks( 770 struct xfs_mount *mp) 771 { 772 rcu_read_lock(); 773 if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) 774 queue_delayed_work(mp->m_eofblocks_workqueue, 775 &mp->m_eofblocks_work, 776 msecs_to_jiffies(xfs_eofb_secs * 1000)); 777 rcu_read_unlock(); 778 } 779 780 void 781 xfs_eofblocks_worker( 782 struct work_struct *work) 783 { 784 struct xfs_mount *mp = container_of(to_delayed_work(work), 785 struct xfs_mount, m_eofblocks_work); 786 xfs_icache_free_eofblocks(mp, NULL); 787 xfs_queue_eofblocks(mp); 788 } 789 790 int 791 xfs_inode_ag_iterator( 792 struct xfs_mount *mp, 793 int (*execute)(struct xfs_inode *ip, int flags, 794 void *args), 795 int flags, 796 void *args) 797 { 798 struct xfs_perag *pag; 799 int error = 0; 800 int last_error = 0; 801 xfs_agnumber_t ag; 802 803 ag = 0; 804 while ((pag = xfs_perag_get(mp, ag))) { 805 ag = pag->pag_agno + 1; 806 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); 807 xfs_perag_put(pag); 808 if (error) { 809 last_error = error; 810 if (error == -EFSCORRUPTED) 811 break; 812 } 813 } 814 return last_error; 815 } 816 817 int 818 xfs_inode_ag_iterator_tag( 819 struct xfs_mount *mp, 820 int (*execute)(struct xfs_inode *ip, int flags, 821 void *args), 822 int flags, 823 void *args, 824 int tag) 825 { 826 struct xfs_perag *pag; 827 int error = 0; 828 int last_error = 0; 829 xfs_agnumber_t ag; 830 831 ag = 0; 832 while ((pag = xfs_perag_get_tag(mp, ag, tag))) { 833 ag = pag->pag_agno + 1; 834 error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); 835 xfs_perag_put(pag); 836 if (error) { 837 last_error = error; 838 if (error == -EFSCORRUPTED) 839 break; 840 } 841 } 842 return last_error; 843 } 844 845 /* 846 * Grab the inode for reclaim exclusively. 847 * Return 0 if we grabbed it, non-zero otherwise. 848 */ 849 STATIC int 850 xfs_reclaim_inode_grab( 851 struct xfs_inode *ip, 852 int flags) 853 { 854 ASSERT(rcu_read_lock_held()); 855 856 /* quick check for stale RCU freed inode */ 857 if (!ip->i_ino) 858 return 1; 859 860 /* 861 * If we are asked for non-blocking operation, do unlocked checks to 862 * see if the inode already is being flushed or in reclaim to avoid 863 * lock traffic. 864 */ 865 if ((flags & SYNC_TRYLOCK) && 866 __xfs_iflags_test(ip, XFS_IFLOCK | XFS_IRECLAIM)) 867 return 1; 868 869 /* 870 * The radix tree lock here protects a thread in xfs_iget from racing 871 * with us starting reclaim on the inode. Once we have the 872 * XFS_IRECLAIM flag set it will not touch us. 873 * 874 * Due to RCU lookup, we may find inodes that have been freed and only 875 * have XFS_IRECLAIM set. Indeed, we may see reallocated inodes that 876 * aren't candidates for reclaim at all, so we must check the 877 * XFS_IRECLAIMABLE is set first before proceeding to reclaim. 878 */ 879 spin_lock(&ip->i_flags_lock); 880 if (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) || 881 __xfs_iflags_test(ip, XFS_IRECLAIM)) { 882 /* not a reclaim candidate. */ 883 spin_unlock(&ip->i_flags_lock); 884 return 1; 885 } 886 __xfs_iflags_set(ip, XFS_IRECLAIM); 887 spin_unlock(&ip->i_flags_lock); 888 return 0; 889 } 890 891 /* 892 * Inodes in different states need to be treated differently. The following 893 * table lists the inode states and the reclaim actions necessary: 894 * 895 * inode state iflush ret required action 896 * --------------- ---------- --------------- 897 * bad - reclaim 898 * shutdown EIO unpin and reclaim 899 * clean, unpinned 0 reclaim 900 * stale, unpinned 0 reclaim 901 * clean, pinned(*) 0 requeue 902 * stale, pinned EAGAIN requeue 903 * dirty, async - requeue 904 * dirty, sync 0 reclaim 905 * 906 * (*) dgc: I don't think the clean, pinned state is possible but it gets 907 * handled anyway given the order of checks implemented. 908 * 909 * Also, because we get the flush lock first, we know that any inode that has 910 * been flushed delwri has had the flush completed by the time we check that 911 * the inode is clean. 912 * 913 * Note that because the inode is flushed delayed write by AIL pushing, the 914 * flush lock may already be held here and waiting on it can result in very 915 * long latencies. Hence for sync reclaims, where we wait on the flush lock, 916 * the caller should push the AIL first before trying to reclaim inodes to 917 * minimise the amount of time spent waiting. For background relaim, we only 918 * bother to reclaim clean inodes anyway. 919 * 920 * Hence the order of actions after gaining the locks should be: 921 * bad => reclaim 922 * shutdown => unpin and reclaim 923 * pinned, async => requeue 924 * pinned, sync => unpin 925 * stale => reclaim 926 * clean => reclaim 927 * dirty, async => requeue 928 * dirty, sync => flush, wait and reclaim 929 */ 930 STATIC int 931 xfs_reclaim_inode( 932 struct xfs_inode *ip, 933 struct xfs_perag *pag, 934 int sync_mode) 935 { 936 struct xfs_buf *bp = NULL; 937 xfs_ino_t ino = ip->i_ino; /* for radix_tree_delete */ 938 int error; 939 940 restart: 941 error = 0; 942 xfs_ilock(ip, XFS_ILOCK_EXCL); 943 if (!xfs_iflock_nowait(ip)) { 944 if (!(sync_mode & SYNC_WAIT)) 945 goto out; 946 xfs_iflock(ip); 947 } 948 949 if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { 950 xfs_iunpin_wait(ip); 951 xfs_iflush_abort(ip, false); 952 goto reclaim; 953 } 954 if (xfs_ipincount(ip)) { 955 if (!(sync_mode & SYNC_WAIT)) 956 goto out_ifunlock; 957 xfs_iunpin_wait(ip); 958 } 959 if (xfs_iflags_test(ip, XFS_ISTALE)) 960 goto reclaim; 961 if (xfs_inode_clean(ip)) 962 goto reclaim; 963 964 /* 965 * Never flush out dirty data during non-blocking reclaim, as it would 966 * just contend with AIL pushing trying to do the same job. 967 */ 968 if (!(sync_mode & SYNC_WAIT)) 969 goto out_ifunlock; 970 971 /* 972 * Now we have an inode that needs flushing. 973 * 974 * Note that xfs_iflush will never block on the inode buffer lock, as 975 * xfs_ifree_cluster() can lock the inode buffer before it locks the 976 * ip->i_lock, and we are doing the exact opposite here. As a result, 977 * doing a blocking xfs_imap_to_bp() to get the cluster buffer would 978 * result in an ABBA deadlock with xfs_ifree_cluster(). 979 * 980 * As xfs_ifree_cluser() must gather all inodes that are active in the 981 * cache to mark them stale, if we hit this case we don't actually want 982 * to do IO here - we want the inode marked stale so we can simply 983 * reclaim it. Hence if we get an EAGAIN error here, just unlock the 984 * inode, back off and try again. Hopefully the next pass through will 985 * see the stale flag set on the inode. 986 */ 987 error = xfs_iflush(ip, &bp); 988 if (error == -EAGAIN) { 989 xfs_iunlock(ip, XFS_ILOCK_EXCL); 990 /* backoff longer than in xfs_ifree_cluster */ 991 delay(2); 992 goto restart; 993 } 994 995 if (!error) { 996 error = xfs_bwrite(bp); 997 xfs_buf_relse(bp); 998 } 999 1000 xfs_iflock(ip); 1001 reclaim: 1002 /* 1003 * Because we use RCU freeing we need to ensure the inode always appears 1004 * to be reclaimed with an invalid inode number when in the free state. 1005 * We do this as early as possible under the ILOCK and flush lock so 1006 * that xfs_iflush_cluster() can be guaranteed to detect races with us 1007 * here. By doing this, we guarantee that once xfs_iflush_cluster has 1008 * locked both the XFS_ILOCK and the flush lock that it will see either 1009 * a valid, flushable inode that will serialise correctly against the 1010 * locks below, or it will see a clean (and invalid) inode that it can 1011 * skip. 1012 */ 1013 spin_lock(&ip->i_flags_lock); 1014 ip->i_flags = XFS_IRECLAIM; 1015 ip->i_ino = 0; 1016 spin_unlock(&ip->i_flags_lock); 1017 1018 xfs_ifunlock(ip); 1019 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1020 1021 XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); 1022 /* 1023 * Remove the inode from the per-AG radix tree. 1024 * 1025 * Because radix_tree_delete won't complain even if the item was never 1026 * added to the tree assert that it's been there before to catch 1027 * problems with the inode life time early on. 1028 */ 1029 spin_lock(&pag->pag_ici_lock); 1030 if (!radix_tree_delete(&pag->pag_ici_root, 1031 XFS_INO_TO_AGINO(ip->i_mount, ino))) 1032 ASSERT(0); 1033 xfs_perag_clear_reclaim_tag(pag); 1034 spin_unlock(&pag->pag_ici_lock); 1035 1036 /* 1037 * Here we do an (almost) spurious inode lock in order to coordinate 1038 * with inode cache radix tree lookups. This is because the lookup 1039 * can reference the inodes in the cache without taking references. 1040 * 1041 * We make that OK here by ensuring that we wait until the inode is 1042 * unlocked after the lookup before we go ahead and free it. 1043 */ 1044 xfs_ilock(ip, XFS_ILOCK_EXCL); 1045 xfs_qm_dqdetach(ip); 1046 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1047 1048 __xfs_inode_free(ip); 1049 return error; 1050 1051 out_ifunlock: 1052 xfs_ifunlock(ip); 1053 out: 1054 xfs_iflags_clear(ip, XFS_IRECLAIM); 1055 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1056 /* 1057 * We could return -EAGAIN here to make reclaim rescan the inode tree in 1058 * a short while. However, this just burns CPU time scanning the tree 1059 * waiting for IO to complete and the reclaim work never goes back to 1060 * the idle state. Instead, return 0 to let the next scheduled 1061 * background reclaim attempt to reclaim the inode again. 1062 */ 1063 return 0; 1064 } 1065 1066 /* 1067 * Walk the AGs and reclaim the inodes in them. Even if the filesystem is 1068 * corrupted, we still want to try to reclaim all the inodes. If we don't, 1069 * then a shut down during filesystem unmount reclaim walk leak all the 1070 * unreclaimed inodes. 1071 */ 1072 STATIC int 1073 xfs_reclaim_inodes_ag( 1074 struct xfs_mount *mp, 1075 int flags, 1076 int *nr_to_scan) 1077 { 1078 struct xfs_perag *pag; 1079 int error = 0; 1080 int last_error = 0; 1081 xfs_agnumber_t ag; 1082 int trylock = flags & SYNC_TRYLOCK; 1083 int skipped; 1084 1085 restart: 1086 ag = 0; 1087 skipped = 0; 1088 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1089 unsigned long first_index = 0; 1090 int done = 0; 1091 int nr_found = 0; 1092 1093 ag = pag->pag_agno + 1; 1094 1095 if (trylock) { 1096 if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { 1097 skipped++; 1098 xfs_perag_put(pag); 1099 continue; 1100 } 1101 first_index = pag->pag_ici_reclaim_cursor; 1102 } else 1103 mutex_lock(&pag->pag_ici_reclaim_lock); 1104 1105 do { 1106 struct xfs_inode *batch[XFS_LOOKUP_BATCH]; 1107 int i; 1108 1109 rcu_read_lock(); 1110 nr_found = radix_tree_gang_lookup_tag( 1111 &pag->pag_ici_root, 1112 (void **)batch, first_index, 1113 XFS_LOOKUP_BATCH, 1114 XFS_ICI_RECLAIM_TAG); 1115 if (!nr_found) { 1116 done = 1; 1117 rcu_read_unlock(); 1118 break; 1119 } 1120 1121 /* 1122 * Grab the inodes before we drop the lock. if we found 1123 * nothing, nr == 0 and the loop will be skipped. 1124 */ 1125 for (i = 0; i < nr_found; i++) { 1126 struct xfs_inode *ip = batch[i]; 1127 1128 if (done || xfs_reclaim_inode_grab(ip, flags)) 1129 batch[i] = NULL; 1130 1131 /* 1132 * Update the index for the next lookup. Catch 1133 * overflows into the next AG range which can 1134 * occur if we have inodes in the last block of 1135 * the AG and we are currently pointing to the 1136 * last inode. 1137 * 1138 * Because we may see inodes that are from the 1139 * wrong AG due to RCU freeing and 1140 * reallocation, only update the index if it 1141 * lies in this AG. It was a race that lead us 1142 * to see this inode, so another lookup from 1143 * the same index will not find it again. 1144 */ 1145 if (XFS_INO_TO_AGNO(mp, ip->i_ino) != 1146 pag->pag_agno) 1147 continue; 1148 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); 1149 if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) 1150 done = 1; 1151 } 1152 1153 /* unlock now we've grabbed the inodes. */ 1154 rcu_read_unlock(); 1155 1156 for (i = 0; i < nr_found; i++) { 1157 if (!batch[i]) 1158 continue; 1159 error = xfs_reclaim_inode(batch[i], pag, flags); 1160 if (error && last_error != -EFSCORRUPTED) 1161 last_error = error; 1162 } 1163 1164 *nr_to_scan -= XFS_LOOKUP_BATCH; 1165 1166 cond_resched(); 1167 1168 } while (nr_found && !done && *nr_to_scan > 0); 1169 1170 if (trylock && !done) 1171 pag->pag_ici_reclaim_cursor = first_index; 1172 else 1173 pag->pag_ici_reclaim_cursor = 0; 1174 mutex_unlock(&pag->pag_ici_reclaim_lock); 1175 xfs_perag_put(pag); 1176 } 1177 1178 /* 1179 * if we skipped any AG, and we still have scan count remaining, do 1180 * another pass this time using blocking reclaim semantics (i.e 1181 * waiting on the reclaim locks and ignoring the reclaim cursors). This 1182 * ensure that when we get more reclaimers than AGs we block rather 1183 * than spin trying to execute reclaim. 1184 */ 1185 if (skipped && (flags & SYNC_WAIT) && *nr_to_scan > 0) { 1186 trylock = 0; 1187 goto restart; 1188 } 1189 return last_error; 1190 } 1191 1192 int 1193 xfs_reclaim_inodes( 1194 xfs_mount_t *mp, 1195 int mode) 1196 { 1197 int nr_to_scan = INT_MAX; 1198 1199 return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); 1200 } 1201 1202 /* 1203 * Scan a certain number of inodes for reclaim. 1204 * 1205 * When called we make sure that there is a background (fast) inode reclaim in 1206 * progress, while we will throttle the speed of reclaim via doing synchronous 1207 * reclaim of inodes. That means if we come across dirty inodes, we wait for 1208 * them to be cleaned, which we hope will not be very long due to the 1209 * background walker having already kicked the IO off on those dirty inodes. 1210 */ 1211 long 1212 xfs_reclaim_inodes_nr( 1213 struct xfs_mount *mp, 1214 int nr_to_scan) 1215 { 1216 /* kick background reclaimer and push the AIL */ 1217 xfs_reclaim_work_queue(mp); 1218 xfs_ail_push_all(mp->m_ail); 1219 1220 return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); 1221 } 1222 1223 /* 1224 * Return the number of reclaimable inodes in the filesystem for 1225 * the shrinker to determine how much to reclaim. 1226 */ 1227 int 1228 xfs_reclaim_inodes_count( 1229 struct xfs_mount *mp) 1230 { 1231 struct xfs_perag *pag; 1232 xfs_agnumber_t ag = 0; 1233 int reclaimable = 0; 1234 1235 while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { 1236 ag = pag->pag_agno + 1; 1237 reclaimable += pag->pag_ici_reclaimable; 1238 xfs_perag_put(pag); 1239 } 1240 return reclaimable; 1241 } 1242 1243 STATIC int 1244 xfs_inode_match_id( 1245 struct xfs_inode *ip, 1246 struct xfs_eofblocks *eofb) 1247 { 1248 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1249 !uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1250 return 0; 1251 1252 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1253 !gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1254 return 0; 1255 1256 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1257 xfs_get_projid(ip) != eofb->eof_prid) 1258 return 0; 1259 1260 return 1; 1261 } 1262 1263 /* 1264 * A union-based inode filtering algorithm. Process the inode if any of the 1265 * criteria match. This is for global/internal scans only. 1266 */ 1267 STATIC int 1268 xfs_inode_match_id_union( 1269 struct xfs_inode *ip, 1270 struct xfs_eofblocks *eofb) 1271 { 1272 if ((eofb->eof_flags & XFS_EOF_FLAGS_UID) && 1273 uid_eq(VFS_I(ip)->i_uid, eofb->eof_uid)) 1274 return 1; 1275 1276 if ((eofb->eof_flags & XFS_EOF_FLAGS_GID) && 1277 gid_eq(VFS_I(ip)->i_gid, eofb->eof_gid)) 1278 return 1; 1279 1280 if ((eofb->eof_flags & XFS_EOF_FLAGS_PRID) && 1281 xfs_get_projid(ip) == eofb->eof_prid) 1282 return 1; 1283 1284 return 0; 1285 } 1286 1287 STATIC int 1288 xfs_inode_free_eofblocks( 1289 struct xfs_inode *ip, 1290 int flags, 1291 void *args) 1292 { 1293 int ret; 1294 struct xfs_eofblocks *eofb = args; 1295 bool need_iolock = true; 1296 int match; 1297 1298 ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); 1299 1300 if (!xfs_can_free_eofblocks(ip, false)) { 1301 /* inode could be preallocated or append-only */ 1302 trace_xfs_inode_free_eofblocks_invalid(ip); 1303 xfs_inode_clear_eofblocks_tag(ip); 1304 return 0; 1305 } 1306 1307 /* 1308 * If the mapping is dirty the operation can block and wait for some 1309 * time. Unless we are waiting, skip it. 1310 */ 1311 if (!(flags & SYNC_WAIT) && 1312 mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) 1313 return 0; 1314 1315 if (eofb) { 1316 if (eofb->eof_flags & XFS_EOF_FLAGS_UNION) 1317 match = xfs_inode_match_id_union(ip, eofb); 1318 else 1319 match = xfs_inode_match_id(ip, eofb); 1320 if (!match) 1321 return 0; 1322 1323 /* skip the inode if the file size is too small */ 1324 if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && 1325 XFS_ISIZE(ip) < eofb->eof_min_file_size) 1326 return 0; 1327 1328 /* 1329 * A scan owner implies we already hold the iolock. Skip it in 1330 * xfs_free_eofblocks() to avoid deadlock. This also eliminates 1331 * the possibility of EAGAIN being returned. 1332 */ 1333 if (eofb->eof_scan_owner == ip->i_ino) 1334 need_iolock = false; 1335 } 1336 1337 ret = xfs_free_eofblocks(ip->i_mount, ip, need_iolock); 1338 1339 /* don't revisit the inode if we're not waiting */ 1340 if (ret == -EAGAIN && !(flags & SYNC_WAIT)) 1341 ret = 0; 1342 1343 return ret; 1344 } 1345 1346 int 1347 xfs_icache_free_eofblocks( 1348 struct xfs_mount *mp, 1349 struct xfs_eofblocks *eofb) 1350 { 1351 int flags = SYNC_TRYLOCK; 1352 1353 if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) 1354 flags = SYNC_WAIT; 1355 1356 return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, 1357 eofb, XFS_ICI_EOFBLOCKS_TAG); 1358 } 1359 1360 /* 1361 * Run eofblocks scans on the quotas applicable to the inode. For inodes with 1362 * multiple quotas, we don't know exactly which quota caused an allocation 1363 * failure. We make a best effort by including each quota under low free space 1364 * conditions (less than 1% free space) in the scan. 1365 */ 1366 int 1367 xfs_inode_free_quota_eofblocks( 1368 struct xfs_inode *ip) 1369 { 1370 int scan = 0; 1371 struct xfs_eofblocks eofb = {0}; 1372 struct xfs_dquot *dq; 1373 1374 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); 1375 1376 /* 1377 * Set the scan owner to avoid a potential livelock. Otherwise, the scan 1378 * can repeatedly trylock on the inode we're currently processing. We 1379 * run a sync scan to increase effectiveness and use the union filter to 1380 * cover all applicable quotas in a single scan. 1381 */ 1382 eofb.eof_scan_owner = ip->i_ino; 1383 eofb.eof_flags = XFS_EOF_FLAGS_UNION|XFS_EOF_FLAGS_SYNC; 1384 1385 if (XFS_IS_UQUOTA_ENFORCED(ip->i_mount)) { 1386 dq = xfs_inode_dquot(ip, XFS_DQ_USER); 1387 if (dq && xfs_dquot_lowsp(dq)) { 1388 eofb.eof_uid = VFS_I(ip)->i_uid; 1389 eofb.eof_flags |= XFS_EOF_FLAGS_UID; 1390 scan = 1; 1391 } 1392 } 1393 1394 if (XFS_IS_GQUOTA_ENFORCED(ip->i_mount)) { 1395 dq = xfs_inode_dquot(ip, XFS_DQ_GROUP); 1396 if (dq && xfs_dquot_lowsp(dq)) { 1397 eofb.eof_gid = VFS_I(ip)->i_gid; 1398 eofb.eof_flags |= XFS_EOF_FLAGS_GID; 1399 scan = 1; 1400 } 1401 } 1402 1403 if (scan) 1404 xfs_icache_free_eofblocks(ip->i_mount, &eofb); 1405 1406 return scan; 1407 } 1408 1409 void 1410 xfs_inode_set_eofblocks_tag( 1411 xfs_inode_t *ip) 1412 { 1413 struct xfs_mount *mp = ip->i_mount; 1414 struct xfs_perag *pag; 1415 int tagged; 1416 1417 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1418 spin_lock(&pag->pag_ici_lock); 1419 trace_xfs_inode_set_eofblocks_tag(ip); 1420 1421 tagged = radix_tree_tagged(&pag->pag_ici_root, 1422 XFS_ICI_EOFBLOCKS_TAG); 1423 radix_tree_tag_set(&pag->pag_ici_root, 1424 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 1425 XFS_ICI_EOFBLOCKS_TAG); 1426 if (!tagged) { 1427 /* propagate the eofblocks tag up into the perag radix tree */ 1428 spin_lock(&ip->i_mount->m_perag_lock); 1429 radix_tree_tag_set(&ip->i_mount->m_perag_tree, 1430 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 1431 XFS_ICI_EOFBLOCKS_TAG); 1432 spin_unlock(&ip->i_mount->m_perag_lock); 1433 1434 /* kick off background trimming */ 1435 xfs_queue_eofblocks(ip->i_mount); 1436 1437 trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, 1438 -1, _RET_IP_); 1439 } 1440 1441 spin_unlock(&pag->pag_ici_lock); 1442 xfs_perag_put(pag); 1443 } 1444 1445 void 1446 xfs_inode_clear_eofblocks_tag( 1447 xfs_inode_t *ip) 1448 { 1449 struct xfs_mount *mp = ip->i_mount; 1450 struct xfs_perag *pag; 1451 1452 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 1453 spin_lock(&pag->pag_ici_lock); 1454 trace_xfs_inode_clear_eofblocks_tag(ip); 1455 1456 radix_tree_tag_clear(&pag->pag_ici_root, 1457 XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), 1458 XFS_ICI_EOFBLOCKS_TAG); 1459 if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { 1460 /* clear the eofblocks tag from the perag radix tree */ 1461 spin_lock(&ip->i_mount->m_perag_lock); 1462 radix_tree_tag_clear(&ip->i_mount->m_perag_tree, 1463 XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), 1464 XFS_ICI_EOFBLOCKS_TAG); 1465 spin_unlock(&ip->i_mount->m_perag_lock); 1466 trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, 1467 -1, _RET_IP_); 1468 } 1469 1470 spin_unlock(&pag->pag_ici_lock); 1471 xfs_perag_put(pag); 1472 } 1473 1474