xfs_inode.c: 3b5d1afd1f13bcab85eaa28223ad396694f929e3 vs. b63da6c8dfa9b2ab3554e8c59ef294d1f28bb9bd
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4 * All Rights Reserved.
5 */
6#include <linux/iversion.h>
7
8#include "xfs.h"

--- 30 unchanged lines hidden ---

39kmem_zone_t *xfs_inode_zone;
40
41/*
42 * Used in xfs_itruncate_extents(). This is the maximum number of extents
43 * freed from a file in a single transaction.
44 */
45#define XFS_ITRUNC_MAX_EXTENTS 2
46
47STATIC int xfs_iflush_int(struct xfs_inode *, struct xfs_buf *);
48STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
49STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
50
51/*
52 * helper function to extract extent size hint from inode
53 */
54xfs_extlen_t
55xfs_get_extsz_hint(

--- 391 unchanged lines hidden ---

447 uint lock_mode)
448{
449 int attempts = 0, i, j, try_lock;
450 struct xfs_log_item *lp;
451
452 /*
453 * Currently supports between 2 and 5 inodes with exclusive locking. We
454 * support an arbitrary depth of locking here, but absolute limits on
47STATIC int xfs_iunlink(struct xfs_trans *, struct xfs_inode *);
48STATIC int xfs_iunlink_remove(struct xfs_trans *, struct xfs_inode *);
455 * inodes depend on the the type of locking and the limits placed by
454 * inodes depend on the type of locking and the limits placed by
456 * lockdep annotations in xfs_lock_inumorder. These are all checked by
457 * the asserts.
458 */
459 ASSERT(ips && inodes >= 2 && inodes <= 5);
460 ASSERT(lock_mode & (XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL |
461 XFS_ILOCK_EXCL));
462 ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED |
463 XFS_ILOCK_SHARED)));

--- 1271 unchanged lines hidden ---

1735 "Failed to remove inode(s) from unlinked list. "
1736 "Please free space, unmount and run xfs_repair.");
1737 } else {
1738 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1739 }
1740 return error;
1741 }
1742
1742 /*
1743 * We do not hold the inode locked across the entire rolling transaction
1744 * here. We only need to hold it for the first transaction that
1745 * xfs_ifree() builds, which may mark the inode XFS_ISTALE if the
1746 * underlying cluster buffer is freed. Relogging an XFS_ISTALE inode
1747 * here breaks the relationship between cluster buffer invalidation and
1748 * stale inode invalidation on cluster buffer item journal commit
1749 * completion, and can result in leaving dirty stale inodes hanging
1750 * around in memory.
1751 *
1752 * We have no need for serialising this inode operation against other
1753 * operations - we freed the inode and hence reallocation is required
1754 * and that will serialise on reallocating the space the deferops need
1755 * to free. Hence we can unlock the inode on the first commit of
1756 * the transaction rather than roll it right through the deferops. This
1757 * avoids relogging the XFS_ISTALE inode.
1758 *
1759 * We check that xfs_ifree() hasn't grown an internal transaction roll
1760 * by asserting that the inode is still locked when it returns.
1761 */
1743 xfs_ilock(ip, XFS_ILOCK_EXCL);
1762 xfs_ilock(ip, XFS_ILOCK_EXCL);
1744 xfs_trans_ijoin(tp, ip, 0);
1763 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
1745
1746 error = xfs_ifree(tp, ip);
1764
1765 error = xfs_ifree(tp, ip);
1766 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
1747 if (error) {
1748 /*
1749 * If we fail to free the inode, shut down. The cancel
1750 * might do that, we need to make sure. Otherwise the
1751 * inode might be lost for a long time or forever.
1752 */
1753 if (!XFS_FORCED_SHUTDOWN(mp)) {
1754 xfs_notice(mp, "%s: xfs_ifree returned error %d",
1755 __func__, error);
1756 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
1757 }
1758 xfs_trans_cancel(tp);
1759 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1760 return error;
1761 }
1762
1763 /*
1764 * Credit the quota account(s). The inode is gone.
1765 */
1766 xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1767
1768 /*
1769 * Just ignore errors at this point. There is nothing we can do except
1770 * to try to keep going. Make sure it's not a silent error.
1771 */
1772 error = xfs_trans_commit(tp);
1773 if (error)
1774 xfs_notice(mp, "%s: xfs_trans_commit returned error %d",
1775 __func__, error);
1776
1777 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1778 return 0;
1779}
1780
1781/*
1782 * xfs_inactive
1783 *
1784 * This is called when the vnode reference count for the vnode
1785 * goes to zero. If the file has been unlinked, then it must

--- 356 unchanged lines hidden ---

2142 dip->di_next_unlinked = cpu_to_be32(next_agino);
2143 offset = imap->im_boffset +
2144 offsetof(struct xfs_dinode, di_next_unlinked);
2145
2146 /* need to recalc the inode CRC if appropriate */
2147 xfs_dinode_calc_crc(mp, dip);
2148 xfs_trans_inode_buf(tp, ibp);
2149 xfs_trans_log_buf(tp, ibp, offset, offset + sizeof(xfs_agino_t) - 1);
2150 xfs_inobp_check(mp, ibp);
2151}
2152
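The helper above logs only the bytes of di_next_unlinked inside the cluster buffer. Below is a minimal userspace sketch of that range arithmetic; the struct layout and names are invented for illustration and do not match the real struct xfs_dinode.

#include <stddef.h>
#include <stdint.h>

/* invented layout standing in for struct xfs_dinode */
struct model_dinode {
	uint16_t	magic;
	uint16_t	mode;
	uint32_t	next_unlinked;	/* stands in for di_next_unlinked */
};

/*
 * Compute the inclusive byte range to log, mirroring
 * offset = imap->im_boffset + offsetof(struct xfs_dinode, di_next_unlinked)
 * and the [offset, offset + sizeof(xfs_agino_t) - 1] span used above.
 */
static void model_unlinked_log_range(unsigned int im_boffset,
				     unsigned int *first, unsigned int *last)
{
	unsigned int offset = im_boffset +
			      offsetof(struct model_dinode, next_unlinked);

	*first = offset;
	*last = offset + sizeof(uint32_t) - 1;	/* inclusive last byte */
}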
2153/* Set an in-core inode's unlinked pointer and return the old value. */
2154STATIC int
2155xfs_iunlink_update_inode(
2156 struct xfs_trans *tp,
2157 struct xfs_inode *ip,
2158 xfs_agnumber_t agno,

--- 84 unchanged lines hidden ---

2243 next_agino = be32_to_cpu(agi->agi_unlinked[bucket_index]);
2244 if (next_agino == agino ||
2245 !xfs_verify_agino_or_null(mp, agno, next_agino)) {
2246 xfs_buf_mark_corrupt(agibp);
2247 return -EFSCORRUPTED;
2248 }
2249
2250 if (next_agino != NULLAGINO) {
2251 struct xfs_perag *pag;
2252 xfs_agino_t old_agino;
2253
2254 /*
2255 * There is already another inode in the bucket, so point this
2256 * inode to the current head of the list.
2257 */
2258 error = xfs_iunlink_update_inode(tp, ip, agno, next_agino,
2259 &old_agino);
2260 if (error)
2261 return error;
2262 ASSERT(old_agino == NULLAGINO);
2263
2264 /*
2265 * agino has been unlinked, add a backref from the next inode
2266 * back to agino.
2267 */
2268 pag = xfs_perag_get(mp, agno);
2269 error = xfs_iunlink_add_backref(pag, agino, next_agino);
2270 xfs_perag_put(pag);
2284 error = xfs_iunlink_add_backref(agibp->b_pag, agino, next_agino);
2271 if (error)
2272 return error;
2273 }
2274
2275 /* Point the head of the list to point to this inode. */
2276 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index, agino);
2277}
2278
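xfs_iunlink() threads the inode onto a short on-disk list: the AGI bucket (selected by agino % XFS_AGI_UNLINKED_BUCKETS) holds the head, each inode's di_next_unlinked points at the next entry, and xfs_iunlink_add_backref() caches the predecessor so a later removal does not have to walk the list. A minimal userspace model of that insert follows; all types and names are invented and this is not XFS code.

#define MODEL_NULLINO	((unsigned int)-1)
#define MODEL_NINODES	256

struct model_inode {
	unsigned int	ino;
	unsigned int	next_unlinked;	/* models di_next_unlinked */
};

struct model_bucket {
	unsigned int	head;		/* models agi_unlinked[bucket_index] */
};

/* models the backref cache: ino -> previous ino; slots start as MODEL_NULLINO */
static unsigned int model_prev[MODEL_NINODES];

static void model_iunlink(struct model_bucket *bucket, struct model_inode *ip)
{
	unsigned int old_head = bucket->head;

	/* point the new entry at the current head (which may be empty) */
	ip->next_unlinked = old_head;

	/* remember that the old head's predecessor is now this inode */
	if (old_head != MODEL_NULLINO)
		model_prev[old_head] = ip->ino;

	/* finally make the bucket point at the new entry */
	bucket->head = ip->ino;
}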

--- 119 unchanged lines hidden ---

2398 struct xfs_trans *tp,
2399 struct xfs_inode *ip)
2400{
2401 struct xfs_mount *mp = tp->t_mountp;
2402 struct xfs_agi *agi;
2403 struct xfs_buf *agibp;
2404 struct xfs_buf *last_ibp;
2405 struct xfs_dinode *last_dip = NULL;
2406 struct xfs_perag *pag = NULL;
2407 xfs_agnumber_t agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
2408 xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ip->i_ino);
2409 xfs_agino_t next_agino;
2410 xfs_agino_t head_agino;
2411 short bucket_index = agino % XFS_AGI_UNLINKED_BUCKETS;
2412 int error;
2413
2414 trace_xfs_iunlink_remove(ip);

--- 27 unchanged lines hidden ---

2442 /*
2443 * If there was a backref pointing from the next inode back to this
2444 * one, remove it because we've removed this inode from the list.
2445 *
2446 * Later, if this inode was in the middle of the list we'll update
2447 * this inode's backref to point from the next inode.
2448 */
2449 if (next_agino != NULLAGINO) {
2450 pag = xfs_perag_get(mp, agno);
2451 error = xfs_iunlink_change_backref(pag, next_agino,
2463 error = xfs_iunlink_change_backref(agibp->b_pag, next_agino,
2452 NULLAGINO);
2453 if (error)
2464 NULLAGINO);
2465 if (error)
2454 goto out;
2466 return error;
2455 }
2456
2467 }
2468
2457 if (head_agino == agino) {
2458 /* Point the head of the list to the next unlinked inode. */
2459 error = xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2460 next_agino);
2461 if (error)
2462 goto out;
2463 } else {
2469 if (head_agino != agino) {
2464 struct xfs_imap imap;
2465 xfs_agino_t prev_agino;
2466
2470 struct xfs_imap imap;
2471 xfs_agino_t prev_agino;
2472
2467 if (!pag)
2468 pag = xfs_perag_get(mp, agno);
2469
2470 /* We need to search the list for the inode being freed. */
2471 error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2472 &prev_agino, &imap, &last_dip, &last_ibp,
2473 /* We need to search the list for the inode being freed. */
2474 error = xfs_iunlink_map_prev(tp, agno, head_agino, agino,
2475 &prev_agino, &imap, &last_dip, &last_ibp,
2473 pag);
2476 agibp->b_pag);
2474 if (error)
2477 if (error)
2475 goto out;
2478 return error;
2476
2477 /* Point the previous inode on the list to the next inode. */
2478 xfs_iunlink_update_dinode(tp, agno, prev_agino, last_ibp,
2479 last_dip, &imap, next_agino);
2480
2481 /*
2482 * Now we deal with the backref for this inode. If this inode
2483 * pointed at a real inode, change the backref that pointed to
2484 * us to point to our old next. If this inode was the end of
2485 * the list, delete the backref that pointed to us. Note that
2486 * change_backref takes care of deleting the backref if
2487 * next_agino is NULLAGINO.
2488 */
2489 error = xfs_iunlink_change_backref(pag, agino, next_agino);
2490 if (error)
2491 goto out;
2492 return xfs_iunlink_change_backref(agibp->b_pag, agino,
2493 next_agino);
2492 }
2493
2494 }
2495
2494out:
2495 if (pag)
2496 xfs_perag_put(pag);
2497 return error;
2496 /* Point the head of the list to the next unlinked inode. */
2497 return xfs_iunlink_update_bucket(tp, agno, agibp, bucket_index,
2498 next_agino);
2498}
2499
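The removal path above has two cases: if the inode is the bucket head, the bucket is repointed at the successor; otherwise the predecessor (taken from the back-reference cache, or found by walking with xfs_iunlink_map_prev()) is rewired to skip it. A sketch continuing the invented userspace model from the xfs_iunlink() example above; inodes[] is assumed to be indexed by inode number.

static void model_iunlink_remove(struct model_bucket *bucket,
				 struct model_inode *inodes,
				 struct model_inode *ip)
{
	unsigned int next = ip->next_unlinked;

	if (bucket->head == ip->ino) {
		/* head of the list: the bucket now points at the successor */
		bucket->head = next;
		if (next != MODEL_NULLINO)
			model_prev[next] = MODEL_NULLINO;
	} else {
		/* middle or tail: the predecessor skips over this inode */
		unsigned int prev = model_prev[ip->ino];

		inodes[prev].next_unlinked = next;
		if (next != MODEL_NULLINO)
			model_prev[next] = prev;
	}

	model_prev[ip->ino] = MODEL_NULLINO;
	ip->next_unlinked = MODEL_NULLINO;
}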
2500/*
2501 * Look up the inode number specified and mark it stale if it is found. If it is
2502 * dirty, return the inode so it can be attached to the cluster buffer so it can
2503 * be processed appropriately when the cluster free transaction completes.
2502 * Look up the inode number specified and if it is not already marked XFS_ISTALE
2503 * mark it stale. We should only find clean inodes in this lookup that aren't
2504 * already stale.
2504 */
2505 */
2505static struct xfs_inode *
2506xfs_ifree_get_one_inode(
2507 struct xfs_perag *pag,
2506static void
2507xfs_ifree_mark_inode_stale(
2508 struct xfs_buf *bp,
2508 struct xfs_inode *free_ip,
2509 xfs_ino_t inum)
2510{
2509 struct xfs_inode *free_ip,
2510 xfs_ino_t inum)
2511{
2511 struct xfs_mount *mp = pag->pag_mount;
2512 struct xfs_mount *mp = bp->b_mount;
2513 struct xfs_perag *pag = bp->b_pag;
2514 struct xfs_inode_log_item *iip;
2512 struct xfs_inode *ip;
2513
2514retry:
2515 rcu_read_lock();
2516 ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, inum));
2517
2518 /* Inode not in memory, nothing to do */
2519 if (!ip)
2520 goto out_rcu_unlock;
2522 if (!ip) {
2523 rcu_read_unlock();
2524 return;
2525 }
2521
2522 /*
2523 * because this is an RCU protected lookup, we could find a recently
2524 * freed or even reallocated inode during the lookup. We need to check
2525 * under the i_flags_lock for a valid inode here. Skip it if it is not
2526 * valid, the wrong inode or stale.
2527 */
2528 spin_lock(&ip->i_flags_lock);
2529 if (ip->i_ino != inum || __xfs_iflags_test(ip, XFS_ISTALE)) {
2530 spin_unlock(&ip->i_flags_lock);
2531 goto out_rcu_unlock;
2536 rcu_read_unlock();
2537 return;
2532 }
2538 }
2533 spin_unlock(&ip->i_flags_lock);
2534
2535 /*
2536 * Don't try to lock/unlock the current inode, but we _cannot_ skip the
2537 * other inodes that we did not find in the list attached to the buffer
2538 * and are not already marked stale. If we can't lock it, back off and
2539 * retry.
2540 */
2541 if (ip != free_ip) {
2542 if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
2548 spin_unlock(&ip->i_flags_lock);
2543 rcu_read_unlock();
2544 delay(1);
2545 goto retry;
2546 }
2547
2548 /*
2549 * Check the inode number again in case we're racing with
2550 * freeing in xfs_reclaim_inode(). See the comments in that
2551 * function for more information as to why the initial check is
2552 * not sufficient.
2553 */
2554 if (ip->i_ino != inum) {
2555 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2556 goto out_rcu_unlock;
2557 }
2558 }
2553 }
2554 ip->i_flags |= XFS_ISTALE;
2555 spin_unlock(&ip->i_flags_lock);
2559 rcu_read_unlock();
2560
2556 rcu_read_unlock();
2557
2561 xfs_iflock(ip);
2562 xfs_iflags_set(ip, XFS_ISTALE);
2558 /*
2559 * If we can't get the flush lock, the inode is already attached. All
2560 * we needed to do here is mark the inode stale so buffer IO completion
2561 * will remove it from the AIL.
2562 */
2563 iip = ip->i_itemp;
2564 if (!xfs_iflock_nowait(ip)) {
2565 ASSERT(!list_empty(&iip->ili_item.li_bio_list));
2566 ASSERT(iip->ili_last_fields);
2567 goto out_iunlock;
2568 }
2563
2564 /*
2569
2570 /*
2565 * We don't need to attach clean inodes or those only with unlogged
2566 * changes (which we throw away, anyway).
2571 * Inodes not attached to the buffer can be released immediately.
2572 * Everything else has to go through xfs_iflush_abort() on journal
2573 * commit as the flock synchronises removal of the inode from the
2574 * cluster buffer against inode reclaim.
2567 */
2575 */
2568 if (!ip->i_itemp || xfs_inode_clean(ip)) {
2569 ASSERT(ip != free_ip);
2576 if (!iip || list_empty(&iip->ili_item.li_bio_list)) {
2570 xfs_ifunlock(ip);
2577 xfs_ifunlock(ip);
2571 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2572 goto out_no_inode;
2578 goto out_iunlock;
2573 }
2579 }
2574 return ip;
2575
2580
2576out_rcu_unlock:
2577 rcu_read_unlock();
2578out_no_inode:
2579 return NULL;
2581 /* we have a dirty inode in memory that has not yet been flushed. */
2582 spin_lock(&iip->ili_lock);
2583 iip->ili_last_fields = iip->ili_fields;
2584 iip->ili_fields = 0;
2585 iip->ili_fsync_fields = 0;
2586 spin_unlock(&iip->ili_lock);
2587 ASSERT(iip->ili_last_fields);
2588
2589out_iunlock:
2590 if (ip != free_ip)
2591 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2580}
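xfs_ifree_mark_inode_stale() above depends on a lockless, RCU-protected radix tree lookup, so the inode it finds may be in the middle of being freed or reused; its identity is therefore re-checked under i_flags_lock, and if the ILOCK cannot be taken without blocking, everything is dropped and the lookup is retried after delay(1). A rough userspace model of that lookup/revalidate/back-off pattern, using pthreads in place of RCU and spinlocks; all names are invented.

#include <pthread.h>
#include <sched.h>

#define MODEL_NSLOTS	64

struct model_slot {
	pthread_mutex_t	lock;		/* stands in for the ILOCK */
	unsigned long	id;		/* 0 means the slot is free or reused */
};

/* slots are assumed to have had pthread_mutex_init() run at startup */
static struct model_slot model_slots[MODEL_NSLOTS];

/* lockless, hash-style lookup: the slot may be reused underneath us */
static struct model_slot *model_lookup(unsigned long id)
{
	struct model_slot *slot = &model_slots[id % MODEL_NSLOTS];

	return slot->id ? slot : NULL;	/* racy check; caller must revalidate */
}

static struct model_slot *model_find_and_lock(unsigned long id)
{
	struct model_slot *slot;

retry:
	slot = model_lookup(id);
	if (!slot)
		return NULL;

	/* don't block while the lookup result may already be stale */
	if (pthread_mutex_trylock(&slot->lock) != 0) {
		sched_yield();		/* back off, like delay(1) above */
		goto retry;
	}

	/* revalidate the identity now that the object cannot change */
	if (slot->id != id) {
		pthread_mutex_unlock(&slot->lock);
		goto retry;
	}
	return slot;			/* locked, identity confirmed */
}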
2581
2582/*
2583 * A big issue when freeing the inode cluster is that we _cannot_ skip any
2584 * inodes that are in memory - they all must be marked stale and attached to
2585 * the cluster buffer.
2586 */
2587STATIC int
2588xfs_ifree_cluster(
2589 xfs_inode_t *free_ip,
2590 xfs_trans_t *tp,
2601 struct xfs_inode *free_ip,
2602 struct xfs_trans *tp,
2591 struct xfs_icluster *xic)
2592{
2603 struct xfs_icluster *xic)
2604{
2593 xfs_mount_t *mp = free_ip->i_mount;
2605 struct xfs_mount *mp = free_ip->i_mount;
2606 struct xfs_ino_geometry *igeo = M_IGEO(mp);
2607 struct xfs_buf *bp;
2608 xfs_daddr_t blkno;
2609 xfs_ino_t inum = xic->first_ino;
2594 int nbufs;
2595 int i, j;
2596 int ioffset;
2610 int nbufs;
2611 int i, j;
2612 int ioffset;
2597 xfs_daddr_t blkno;
2598 xfs_buf_t *bp;
2599 xfs_inode_t *ip;
2600 struct xfs_inode_log_item *iip;
2601 struct xfs_log_item *lip;
2602 struct xfs_perag *pag;
2603 struct xfs_ino_geometry *igeo = M_IGEO(mp);
2604 xfs_ino_t inum;
2605 int error;
2606
2613 int error;
2614
2607 inum = xic->first_ino;
2608 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum));
2609 nbufs = igeo->ialloc_blks / igeo->blocks_per_cluster;
2610
2611 for (j = 0; j < nbufs; j++, inum += igeo->inodes_per_cluster) {
2612 /*
2613 * The allocation bitmap tells us which inodes of the chunk were
2614 * physically allocated. Skip the cluster if an inode falls into
2615 * a sparse region.
2616 */

--- 12 unchanged lines hidden ---

2629 * can't get the flush lock on is attached to the buffer.
2630 * If we scan the in-memory inodes first, then buffer IO can
2631 * complete before we get a lock on it, and hence we may fail
2632 * to mark all the active inodes on the buffer stale.
2633 */
2634 error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
2635 mp->m_bsize * igeo->blocks_per_cluster,
2636 XBF_UNMAPPED, &bp);
2637 if (error) {
2638 xfs_perag_put(pag);
2643 if (error)
2639 return error;
2644 return error;
2640 }
2641
2642 /*
2643 * This buffer may not have been correctly initialised as we
2644 * didn't read it from disk. That's not important because we are
2645 * only using to mark the buffer as stale in the log, and to
2646 * attach stale cached inodes on it. That means it will never be
2647 * dispatched for IO. If it is, we want to know about it, and we
2648 * want it to fail. We can acheive this by adding a write
2649 * verifier to the buffer.
2650 */
2651 bp->b_ops = &xfs_inode_buf_ops;
2652
2653 /*
2654 * Walk the inodes already attached to the buffer and mark them
2655 * stale. These will all have the flush locks held, so an
2656 * in-memory inode walk can't lock them. By marking them all
2657 * stale first, we will not attempt to lock them in the loop
2658 * below as the XFS_ISTALE flag will be set.
2658 * Now we need to set all the cached clean inodes as XFS_ISTALE,
2659 * too. This requires lookups, and will skip inodes that we've
2660 * already marked XFS_ISTALE.
2659 */
2661 */
2660 list_for_each_entry(lip, &bp->b_li_list, li_bio_list) {
2661 if (lip->li_type == XFS_LI_INODE) {
2662 iip = (struct xfs_inode_log_item *)lip;
2663 ASSERT(iip->ili_logged == 1);
2664 lip->li_cb = xfs_istale_done;
2665 xfs_trans_ail_copy_lsn(mp->m_ail,
2666 &iip->ili_flush_lsn,
2667 &iip->ili_item.li_lsn);
2668 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2669 }
2670 }
2662 for (i = 0; i < igeo->inodes_per_cluster; i++)
2663 xfs_ifree_mark_inode_stale(bp, free_ip, inum + i);
2671
2664
2672
2673 /*
2674 * For each inode in memory attempt to add it to the inode
2675 * buffer and set it up for being staled on buffer IO
2676 * completion. This is safe as we've locked out tail pushing
2677 * and flushing by locking the buffer.
2678 *
2679 * We have already marked every inode that was part of a
2680 * transaction stale above, which means there is no point in
2681 * even trying to lock them.
2682 */
2683 for (i = 0; i < igeo->inodes_per_cluster; i++) {
2684 ip = xfs_ifree_get_one_inode(pag, free_ip, inum + i);
2685 if (!ip)
2686 continue;
2687
2688 iip = ip->i_itemp;
2689 iip->ili_last_fields = iip->ili_fields;
2690 iip->ili_fields = 0;
2691 iip->ili_fsync_fields = 0;
2692 iip->ili_logged = 1;
2693 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
2694 &iip->ili_item.li_lsn);
2695
2696 xfs_buf_attach_iodone(bp, xfs_istale_done,
2697 &iip->ili_item);
2698
2699 if (ip != free_ip)
2700 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2701 }
2702
2703 xfs_trans_stale_inode_buf(tp, bp);
2704 xfs_trans_binval(tp, bp);
2705 }
2665 xfs_trans_stale_inode_buf(tp, bp);
2666 xfs_trans_binval(tp, bp);
2667 }
2706
2707 xfs_perag_put(pag);
2708 return 0;
2709}
2710
2711/*
2712 * This is called to return an inode to the inode free list.
2713 * The inode should already be truncated to 0 length and have
2714 * no pages associated with it. This routine also assumes that
2715 * the inode is already a part of the transaction.

--- 4 unchanged lines hidden ---

2720 */
2721int
2722xfs_ifree(
2723 struct xfs_trans *tp,
2724 struct xfs_inode *ip)
2725{
2726 int error;
2727 struct xfs_icluster xic = { 0 };
2688 struct xfs_inode_log_item *iip = ip->i_itemp;
2728
2729 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
2730 ASSERT(VFS_I(ip)->i_nlink == 0);
2731 ASSERT(ip->i_df.if_nextents == 0);
2732 ASSERT(ip->i_d.di_size == 0 || !S_ISREG(VFS_I(ip)->i_mode));
2733 ASSERT(ip->i_d.di_nblocks == 0);
2734
2735 /*

--- 21 unchanged lines hidden ---

2757 VFS_I(ip)->i_mode = 0; /* mark incore inode as free */
2758 ip->i_d.di_flags = 0;
2759 ip->i_d.di_flags2 = 0;
2760 ip->i_d.di_dmevmask = 0;
2761 ip->i_d.di_forkoff = 0; /* mark the attr fork not in use */
2762 ip->i_df.if_format = XFS_DINODE_FMT_EXTENTS;
2763
2764 /* Don't attempt to replay owner changes for a deleted inode */
2765 ip->i_itemp->ili_fields &= ~(XFS_ILOG_AOWNER|XFS_ILOG_DOWNER);
2726 spin_lock(&iip->ili_lock);
2727 iip->ili_fields &= ~(XFS_ILOG_AOWNER | XFS_ILOG_DOWNER);
2728 spin_unlock(&iip->ili_lock);
2766
2767 /*
2768 * Bump the generation count so no one will be confused
2769 * by reincarnations of this inode.
2770 */
2771 VFS_I(ip)->i_generation++;
2772 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2773

--- 363 unchanged lines hidden ---

3137out_trans_abort:
3138 xfs_trans_cancel(tp);
3139 return error;
3140}
3141
3142/*
3143 * xfs_rename_alloc_whiteout()
3144 *
3145 * Return a referenced, unlinked, unlocked inode that that can be used as a
3108 * Return a referenced, unlinked, unlocked inode that can be used as a
3146 * whiteout in a rename transaction. We use a tmpfile inode here so that if we
3147 * crash between allocating the inode and linking it into the rename transaction
3148 * recovery will free the inode and we won't leak it.
3149 */
3150static int
3151xfs_rename_alloc_whiteout(
3152 struct xfs_inode *dp,
3153 struct xfs_inode **wip)

--- 310 unchanged lines hidden ---

3464out_trans_cancel:
3465 xfs_trans_cancel(tp);
3466out_release_wip:
3467 if (wip)
3468 xfs_irele(wip);
3469 return error;
3470}
3471
3472STATIC int
3473xfs_iflush_cluster(
3474 struct xfs_inode *ip,
3475 struct xfs_buf *bp)
3476{
3477 struct xfs_mount *mp = ip->i_mount;
3478 struct xfs_perag *pag;
3479 unsigned long first_index, mask;
3480 int cilist_size;
3481 struct xfs_inode **cilist;
3482 struct xfs_inode *cip;
3483 struct xfs_ino_geometry *igeo = M_IGEO(mp);
3484 int error = 0;
3485 int nr_found;
3486 int clcount = 0;
3487 int i;
3488
3489 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
3490
3491 cilist_size = igeo->inodes_per_cluster * sizeof(struct xfs_inode *);
3492 cilist = kmem_alloc(cilist_size, KM_MAYFAIL|KM_NOFS);
3493 if (!cilist)
3494 goto out_put;
3495
3496 mask = ~(igeo->inodes_per_cluster - 1);
3497 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
3498 rcu_read_lock();
3499 /* really need a gang lookup range call here */
3500 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)cilist,
3501 first_index, igeo->inodes_per_cluster);
3502 if (nr_found == 0)
3503 goto out_free;
3504
3505 for (i = 0; i < nr_found; i++) {
3506 cip = cilist[i];
3507 if (cip == ip)
3508 continue;
3509
3510 /*
3511 * because this is an RCU protected lookup, we could find a
3512 * recently freed or even reallocated inode during the lookup.
3513 * We need to check under the i_flags_lock for a valid inode
3514 * here. Skip it if it is not valid or the wrong inode.
3515 */
3516 spin_lock(&cip->i_flags_lock);
3517 if (!cip->i_ino ||
3518 __xfs_iflags_test(cip, XFS_ISTALE)) {
3519 spin_unlock(&cip->i_flags_lock);
3520 continue;
3521 }
3522
3523 /*
3524 * Once we fall off the end of the cluster, no point checking
3525 * any more inodes in the list because they will also all be
3526 * outside the cluster.
3527 */
3528 if ((XFS_INO_TO_AGINO(mp, cip->i_ino) & mask) != first_index) {
3529 spin_unlock(&cip->i_flags_lock);
3530 break;
3531 }
3532 spin_unlock(&cip->i_flags_lock);
3533
3534 /*
3535 * Do an un-protected check to see if the inode is dirty and
3536 * is a candidate for flushing. These checks will be repeated
3537 * later after the appropriate locks are acquired.
3538 */
3539 if (xfs_inode_clean(cip) && xfs_ipincount(cip) == 0)
3540 continue;
3541
3542 /*
3543 * Try to get locks. If any are unavailable or it is pinned,
3544 * then this inode cannot be flushed and is skipped.
3545 */
3546
3547 if (!xfs_ilock_nowait(cip, XFS_ILOCK_SHARED))
3548 continue;
3549 if (!xfs_iflock_nowait(cip)) {
3550 xfs_iunlock(cip, XFS_ILOCK_SHARED);
3551 continue;
3552 }
3553 if (xfs_ipincount(cip)) {
3554 xfs_ifunlock(cip);
3555 xfs_iunlock(cip, XFS_ILOCK_SHARED);
3556 continue;
3557 }
3558
3559
3560 /*
3561 * Check the inode number again, just to be certain we are not
3562 * racing with freeing in xfs_reclaim_inode(). See the comments
3563 * in that function for more information as to why the initial
3564 * check is not sufficient.
3565 */
3566 if (!cip->i_ino) {
3567 xfs_ifunlock(cip);
3568 xfs_iunlock(cip, XFS_ILOCK_SHARED);
3569 continue;
3570 }
3571
3572 /*
3573 * arriving here means that this inode can be flushed. First
3574 * re-check that it's dirty before flushing.
3575 */
3576 if (!xfs_inode_clean(cip)) {
3577 error = xfs_iflush_int(cip, bp);
3578 if (error) {
3579 xfs_iunlock(cip, XFS_ILOCK_SHARED);
3580 goto out_free;
3581 }
3582 clcount++;
3583 } else {
3584 xfs_ifunlock(cip);
3585 }
3586 xfs_iunlock(cip, XFS_ILOCK_SHARED);
3587 }
3588
3589 if (clcount) {
3590 XFS_STATS_INC(mp, xs_icluster_flushcnt);
3591 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3592 }
3593
3594out_free:
3595 rcu_read_unlock();
3596 kmem_free(cilist);
3597out_put:
3598 xfs_perag_put(pag);
3599 return error;
3600}
3601
3602/*
3603 * Flush dirty inode metadata into the backing buffer.
3604 *
3605 * The caller must have the inode lock and the inode flush lock held. The
3606 * inode lock will still be held upon return to the caller, and the inode
3607 * flush lock will be released after the inode has reached the disk.
3608 *
3609 * The caller must write out the buffer returned in *bpp and release it.
3610 */
3611int
3435static int
3612xfs_iflush(
3613 struct xfs_inode *ip,
3436xfs_iflush(
3437 struct xfs_inode *ip,
3614 struct xfs_buf **bpp)
3615{
3616 struct xfs_mount *mp = ip->i_mount;
3617 struct xfs_buf *bp = NULL;
3618 struct xfs_dinode *dip;
3619 int error;
3620
3621 XFS_STATS_INC(mp, xs_iflush_count);
3622
3623 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3624 ASSERT(xfs_isiflocked(ip));
3625 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3626 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3627
3628 *bpp = NULL;
3629
3630 xfs_iunpin_wait(ip);
3631
3632 /*
3633 * For stale inodes we cannot rely on the backing buffer remaining
3634 * stale in cache for the remaining life of the stale inode and so
3635 * xfs_imap_to_bp() below may give us a buffer that no longer contains
3636 * inodes below. We have to check this after ensuring the inode is
3637 * unpinned so that it is safe to reclaim the stale inode after the
3638 * flush call.
3639 */
3640 if (xfs_iflags_test(ip, XFS_ISTALE)) {
3641 xfs_ifunlock(ip);
3642 return 0;
3643 }
3644
3645 /*
3646 * Get the buffer containing the on-disk inode. We are doing a try-lock
3647 * operation here, so we may get an EAGAIN error. In that case, return
3648 * leaving the inode dirty.
3649 *
3650 * If we get any other error, we effectively have a corruption situation
3651 * and we cannot flush the inode. Abort the flush and shut down.
3652 */
3653 error = xfs_imap_to_bp(mp, NULL, &ip->i_imap, &dip, &bp, XBF_TRYLOCK);
3654 if (error == -EAGAIN) {
3655 xfs_ifunlock(ip);
3656 return error;
3657 }
3658 if (error)
3659 goto abort;
3660
3661 /*
3662 * If the buffer is pinned then push on the log now so we won't
3663 * get stuck waiting in the write for too long.
3664 */
3665 if (xfs_buf_ispinned(bp))
3666 xfs_log_force(mp, 0);
3667
3668 /*
3669 * Flush the provided inode then attempt to gather others from the
3670 * cluster into the write.
3671 *
3672 * Note: Once we attempt to flush an inode, we must run buffer
3673 * completion callbacks on any failure. If this fails, simulate an I/O
3674 * failure on the buffer and shut down.
3675 */
3676 error = xfs_iflush_int(ip, bp);
3677 if (!error)
3678 error = xfs_iflush_cluster(ip, bp);
3679 if (error) {
3680 bp->b_flags |= XBF_ASYNC;
3681 xfs_buf_ioend_fail(bp);
3682 goto shutdown;
3683 }
3684
3685 *bpp = bp;
3686 return 0;
3687
3688abort:
3689 xfs_iflush_abort(ip);
3690shutdown:
3691 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3692 return error;
3693}
3694
3695STATIC int
3696xfs_iflush_int(
3697 struct xfs_inode *ip,
3698 struct xfs_buf *bp)
3699{
3700 struct xfs_inode_log_item *iip = ip->i_itemp;
3701 struct xfs_dinode *dip;
3702 struct xfs_mount *mp = ip->i_mount;
3703 int error;
3704
3705 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3706 ASSERT(xfs_isiflocked(ip));
3707 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3708 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3438 struct xfs_buf *bp)
3439{
3440 struct xfs_inode_log_item *iip = ip->i_itemp;
3441 struct xfs_dinode *dip;
3442 struct xfs_mount *mp = ip->i_mount;
3443 int error;
3444
3445 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
3446 ASSERT(xfs_isiflocked(ip));
3447 ASSERT(ip->i_df.if_format != XFS_DINODE_FMT_BTREE ||
3448 ip->i_df.if_nextents > XFS_IFORK_MAXEXT(ip, XFS_DATA_FORK));
3709 ASSERT(iip != NULL && iip->ili_fields != 0);
3449 ASSERT(iip->ili_item.li_buf == bp);
3710
3711 dip = xfs_buf_offset(bp, ip->i_imap.im_boffset);
3712
3713 /*
3714 * We don't flush the inode if any of the following checks fail, but we
3715 * do still update the log item and attach to the backing buffer as if
3716 * the flush happened. This is a formality to facilitate predictable
3717 * error handling as the caller will shutdown and fail the buffer.

--- 78 unchanged lines hidden ---

3796
3797 /* Wrap, we never let the log put out DI_MAX_FLUSH */
3798 if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
3799 ip->i_d.di_flushiter = 0;
3800
3801 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK);
3802 if (XFS_IFORK_Q(ip))
3803 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK);
3804 xfs_inobp_check(mp, bp);
3805
3806 /*
3807 * We've recorded everything logged in the inode, so we'd like to clear
3808 * the ili_fields bits so we don't log and flush things unnecessarily.
3809 * However, we can't stop logging all this information until the data
3810 * we've copied into the disk buffer is written to disk. If we did we
3811 * might overwrite the copy of the inode in the log with all the data
3812 * after re-logging only part of it, and in the face of a crash we
3813 * wouldn't have all the data we need to recover.
3814 *
3815 * What we do is move the bits to the ili_last_fields field. When
3816 * logging the inode, these bits are moved back to the ili_fields field.
3817 * In the xfs_iflush_done() routine we clear ili_last_fields, since we
3818 * know that the information those bits represent is permanently on
3819 * disk. As long as the flush completes before the inode is logged
3820 * again, then both ili_fields and ili_last_fields will be cleared.
3821 *
3822 * We can play with the ili_fields bits here, because the inode lock
3823 * must be held exclusively in order to set bits there and the flush
3824 * lock protects the ili_last_fields bits. Set ili_logged so the flush
3825 * done routine can tell whether or not to look in the AIL. Also, store
3826 * the current LSN of the inode so that we can tell whether the item has
3827 * moved in the AIL from xfs_iflush_done(). In order to read the lsn we
3828 * need the AIL lock, because it is a 64 bit value that cannot be read
3829 * atomically.
3830 */
3831 error = 0;
3832flush_out:
3560 */
3561 error = 0;
3562flush_out:
3563 spin_lock(&iip->ili_lock);
3833 iip->ili_last_fields = iip->ili_fields;
3834 iip->ili_fields = 0;
3835 iip->ili_fsync_fields = 0;
3564 iip->ili_last_fields = iip->ili_fields;
3565 iip->ili_fields = 0;
3566 iip->ili_fsync_fields = 0;
3836 iip->ili_logged = 1;
3567 spin_unlock(&iip->ili_lock);
3837
3568
3569 /*
3570 * Store the current LSN of the inode so that we can tell whether the
3571 * item has moved in the AIL from xfs_iflush_done().
3572 */
3838 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3839 &iip->ili_item.li_lsn);
3840
3573 xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
3574 &iip->ili_item.li_lsn);
3575
3576 /* generate the checksum. */
3577 xfs_dinode_calc_crc(mp, dip);
3578 return error;
3579}
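The comment above flush_out describes a two-generation scheme for the inode log item's dirty bits: at flush time the pending bits move from ili_fields to ili_last_fields (now under ili_lock), relogging makes them dirty again, and only I/O completion in xfs_iflush_done() clears ili_last_fields. A small self-contained model of that handoff, with invented names:

struct model_log_item {
	unsigned int	fields;		/* dirtied since the last flush began */
	unsigned int	last_fields;	/* captured by the flush in progress */
};

/* mirror of the flush_out section above: capture and reset the dirty bits */
static void model_flush_start(struct model_log_item *iip)
{
	iip->last_fields = iip->fields;
	iip->fields = 0;
}

/* relogging before the flush completes: the in-flight bits are dirty again */
static void model_log(struct model_log_item *iip, unsigned int bits)
{
	iip->fields |= iip->last_fields | bits;
}

/* mirror of xfs_iflush_done(): the flushed bits are now safely on disk */
static void model_flush_done(struct model_log_item *iip)
{
	iip->last_fields = 0;
}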
3580
3581/*
3582 * Non-blocking flush of dirty inode metadata into the backing buffer.
3583 *
3584 * The caller must have a reference to the inode and hold the cluster buffer
3585 * locked. The function will walk across all the inodes on the cluster buffer it
3586 * can find and lock without blocking, and flush them to the cluster buffer.
3587 *
3588 * On successful flushing of at least one inode, the caller must write out the
3589 * buffer and release it. If no inodes are flushed, -EAGAIN will be returned and
3590 * the caller needs to release the buffer. On failure, the filesystem will be
3591 * shut down, the buffer will have been unlocked and released, and EFSCORRUPTED
3592 * will be returned.
3593 */
3594int
3595xfs_iflush_cluster(
3596 struct xfs_buf *bp)
3597{
3598 struct xfs_mount *mp = bp->b_mount;
3599 struct xfs_log_item *lip, *n;
3600 struct xfs_inode *ip;
3601 struct xfs_inode_log_item *iip;
3602 int clcount = 0;
3603 int error = 0;
3604
3841 /*
3605 /*
3842 * Attach the inode item callback to the buffer whether the flush
3843 * succeeded or not. If not, the caller will shut down and fail I/O
3844 * completion on the buffer to remove the inode from the AIL and release
3845 * the flush lock.
3606 * We must use the safe variant here as on shutdown xfs_iflush_abort()
3607 * can remove itself from the list.
3846 */
3608 */
3847 xfs_buf_attach_iodone(bp, xfs_iflush_done, &iip->ili_item);
3609 list_for_each_entry_safe(lip, n, &bp->b_li_list, li_bio_list) {
3610 iip = (struct xfs_inode_log_item *)lip;
3611 ip = iip->ili_inode;
3848
3612
3849 /* generate the checksum. */
3850 xfs_dinode_calc_crc(mp, dip);
3613 /*
3614 * Quick and dirty check to avoid locks if possible.
3615 */
3616 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK))
3617 continue;
3618 if (xfs_ipincount(ip))
3619 continue;
3851
3620
3852 ASSERT(!list_empty(&bp->b_li_list));
3853 ASSERT(bp->b_iodone != NULL);
3854 return error;
3621 /*
3622 * The inode is still attached to the buffer, which means it is
3623 * dirty but reclaim might try to grab it. Check carefully for
3624 * that, and grab the ilock while still holding the i_flags_lock
3625 * to guarantee reclaim will not be able to reclaim this inode
3626 * once we drop the i_flags_lock.
3627 */
3628 spin_lock(&ip->i_flags_lock);
3629 ASSERT(!__xfs_iflags_test(ip, XFS_ISTALE));
3630 if (__xfs_iflags_test(ip, XFS_IRECLAIM | XFS_IFLOCK)) {
3631 spin_unlock(&ip->i_flags_lock);
3632 continue;
3633 }
3634
3635 /*
3636 * ILOCK will pin the inode against reclaim and prevent
3637 * concurrent transactions modifying the inode while we are
3638 * flushing the inode.
3639 */
3640 if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3641 spin_unlock(&ip->i_flags_lock);
3642 continue;
3643 }
3644 spin_unlock(&ip->i_flags_lock);
3645
3646 /*
3647 * Skip inodes that are already flush locked as they have
3648 * already been written to the buffer.
3649 */
3650 if (!xfs_iflock_nowait(ip)) {
3651 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3652 continue;
3653 }
3654
3655 /*
3656 * Abort flushing this inode if we are shut down because the
3657 * inode may not currently be in the AIL. This can occur when
3658 * log I/O failure unpins the inode without inserting into the
3659 * AIL, leaving a dirty/unpinned inode attached to the buffer
3660 * that otherwise looks like it should be flushed.
3661 */
3662 if (XFS_FORCED_SHUTDOWN(mp)) {
3663 xfs_iunpin_wait(ip);
3664 /* xfs_iflush_abort() drops the flush lock */
3665 xfs_iflush_abort(ip);
3666 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3667 error = -EIO;
3668 continue;
3669 }
3670
3671 /* don't block waiting on a log force to unpin dirty inodes */
3672 if (xfs_ipincount(ip)) {
3673 xfs_ifunlock(ip);
3674 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3675 continue;
3676 }
3677
3678 if (!xfs_inode_clean(ip))
3679 error = xfs_iflush(ip, bp);
3680 else
3681 xfs_ifunlock(ip);
3682 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3683 if (error)
3684 break;
3685 clcount++;
3686 }
3687
3688 if (error) {
3689 bp->b_flags |= XBF_ASYNC;
3690 xfs_buf_ioend_fail(bp);
3691 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3692 return error;
3693 }
3694
3695 if (!clcount)
3696 return -EAGAIN;
3697
3698 XFS_STATS_INC(mp, xs_icluster_flushcnt);
3699 XFS_STATS_ADD(mp, xs_icluster_flushinode, clcount);
3700 return 0;
3701
3855}
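The comment above xfs_iflush_cluster() leaves buffer submission to the caller. The sketch below is a hypothetical caller, not code from this file; it assumes the standard xfs_bwrite() and xfs_buf_relse() helpers and only shows how the three documented outcomes would be handled.

static int example_flush_inode_buffer(struct xfs_buf *bp)
{
	int error = xfs_iflush_cluster(bp);

	if (error == -EAGAIN) {
		/* nothing could be flushed: just drop the locked buffer */
		xfs_buf_relse(bp);
		return 0;
	}
	if (error) {
		/* filesystem is shut down; the buffer was already released */
		return error;
	}

	/* at least one inode was flushed: write the buffer out */
	return xfs_bwrite(bp);
}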
3856
3857/* Release an inode. */
3858void
3859xfs_irele(
3860 struct xfs_inode *ip)
3861{
3862 trace_xfs_irele(ip, _RET_IP_);

--- 13 unchanged lines hidden ---

3876 if (xfs_ipincount(ip))
3877 lsn = ip->i_itemp->ili_last_lsn;
3878 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3879
3880 if (!lsn)
3881 return 0;
3882 return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
3883}
3731
3732/*
3733 * Grab the exclusive iolock for a data copy from src to dest, making sure to
3734 * abide vfs locking order (lowest pointer value goes first) and breaking the
3735 * layout leases before proceeding. The loop is needed because we cannot call
3736 * the blocking break_layout() with the iolocks held, and therefore have to
3737 * back out both locks.
3738 */
3739static int
3740xfs_iolock_two_inodes_and_break_layout(
3741 struct inode *src,
3742 struct inode *dest)
3743{
3744 int error;
3745
3746 if (src > dest)
3747 swap(src, dest);
3748
3749retry:
3750 /* Wait to break both inodes' layouts before we start locking. */
3751 error = break_layout(src, true);
3752 if (error)
3753 return error;
3754 if (src != dest) {
3755 error = break_layout(dest, true);
3756 if (error)
3757 return error;
3758 }
3759
3760 /* Lock one inode and make sure nobody got in and leased it. */
3761 inode_lock(src);
3762 error = break_layout(src, false);
3763 if (error) {
3764 inode_unlock(src);
3765 if (error == -EWOULDBLOCK)
3766 goto retry;
3767 return error;
3768 }
3769
3770 if (src == dest)
3771 return 0;
3772
3773 /* Lock the other inode and make sure nobody got in and leased it. */
3774 inode_lock_nested(dest, I_MUTEX_NONDIR2);
3775 error = break_layout(dest, false);
3776 if (error) {
3777 inode_unlock(src);
3778 inode_unlock(dest);
3779 if (error == -EWOULDBLOCK)
3780 goto retry;
3781 return error;
3782 }
3783
3784 return 0;
3785}
3786
3787/*
3788 * Lock two inodes so that userspace cannot initiate I/O via file syscalls or
3789 * mmap activity.
3790 */
3791int
3792xfs_ilock2_io_mmap(
3793 struct xfs_inode *ip1,
3794 struct xfs_inode *ip2)
3795{
3796 int ret;
3797
3798 ret = xfs_iolock_two_inodes_and_break_layout(VFS_I(ip1), VFS_I(ip2));
3799 if (ret)
3800 return ret;
3801 if (ip1 == ip2)
3802 xfs_ilock(ip1, XFS_MMAPLOCK_EXCL);
3803 else
3804 xfs_lock_two_inodes(ip1, XFS_MMAPLOCK_EXCL,
3805 ip2, XFS_MMAPLOCK_EXCL);
3806 return 0;
3807}
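xfs_iolock_two_inodes_and_break_layout() above encodes two rules: the two inode locks are always taken in one global order (lowest pointer value first), and any step that would have to block while a lock is held backs out and restarts instead. A compact userspace model of those rules using pthreads; the types and the lease flag are invented stand-ins, and a real implementation would wait on a condition variable rather than spin.

#include <pthread.h>
#include <sched.h>

struct model_file {
	pthread_mutex_t	iolock;
	int		leased;		/* stand-in for an outstanding layout lease */
};

/* blocking wait, only ever done while no locks are held */
static void model_wait_for_lease(struct model_file *f)
{
	while (f->leased)
		sched_yield();
}

static void model_lock_pair(struct model_file *src, struct model_file *dest)
{
	if (src > dest) {		/* global order: lowest pointer first */
		struct model_file *tmp = src;

		src = dest;
		dest = tmp;
	}

retry:
	model_wait_for_lease(src);
	if (src != dest)
		model_wait_for_lease(dest);

	pthread_mutex_lock(&src->iolock);
	if (src->leased) {		/* a new lease raced in: back out */
		pthread_mutex_unlock(&src->iolock);
		goto retry;
	}
	if (src == dest)
		return;

	pthread_mutex_lock(&dest->iolock);
	if (dest->leased) {
		pthread_mutex_unlock(&dest->iolock);
		pthread_mutex_unlock(&src->iolock);
		goto retry;
	}
}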
3808
3809/* Unlock both inodes to allow IO and mmap activity. */
3810void
3811xfs_iunlock2_io_mmap(
3812 struct xfs_inode *ip1,
3813 struct xfs_inode *ip2)
3814{
3815 bool same_inode = (ip1 == ip2);
3816
3817 xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL);
3818 if (!same_inode)
3819 xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL);
3820 inode_unlock(VFS_I(ip2));
3821 if (!same_inode)
3822 inode_unlock(VFS_I(ip1));
3823}