// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_trans_priv.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_error.h"
#include "xfs_buf_item.h"
#include "xfs_ag.h"
#include "xfs_quota.h"
#include "xfs_reflink.h"

#define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
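/*
 * For example, BLK_AVG(8, 15) == 11; this midpoint is the probe point
 * used by the binary searches below.
 */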

STATIC int
xlog_find_zeroed(
	struct xlog	*,
	xfs_daddr_t	*);
STATIC int
xlog_clear_stale_blocks(
	struct xlog	*,
	xfs_lsn_t);
STATIC int
xlog_do_recovery_pass(
	struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);

/*
 * Sector aligned buffer routines for buffer create/read/write/access
 */

/*
 * Verify the log-relative block number and length in basic blocks are valid for
 * an operation involving the given XFS log buffer. Returns true if the fields
 * are valid, false otherwise.
 */
static inline bool
xlog_verify_bno(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		bbcount)
{
	if (blk_no < 0 || blk_no >= log->l_logBBsize)
		return false;
	if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
		return false;
	return true;
}

/*
 * Allocate a buffer to hold log data. The buffer needs to be able to map to
 * a range of nbblks basic blocks at any valid offset within the log.
 */
static char *
xlog_alloc_buffer(
	struct xlog	*log,
	int		nbblks)
{
	/*
	 * Pass log block 0 since we don't have an addr yet, buffer will be
	 * verified on read.
	 */
	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
			nbblks);
		return NULL;
	}

	/*
	 * We do log I/O in units of log sectors (a power-of-2 multiple of the
	 * basic block size), so we round up the requested size to accommodate
	 * the basic blocks required for complete log sectors.
	 *
	 * In addition, the buffer may be used for a non-sector-aligned block
	 * offset, in which case an I/O of the requested size could extend
	 * beyond the end of the buffer. If the requested size is only 1 basic
	 * block it will never straddle a sector boundary, so this won't be an
	 * issue. Nor will this be a problem if the log I/O is done in basic
	 * blocks (sector size 1). But otherwise we extend the buffer by one
	 * extra log sector to ensure there's space to accommodate this
	 * possibility.
	 */
	if (nbblks > 1 && log->l_sectBBsize > 1)
		nbblks += log->l_sectBBsize;
	nbblks = round_up(nbblks, log->l_sectBBsize);
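	/*
	 * For example, with 4k log sectors (l_sectBBsize == 8) a request
	 * for nbblks == 5 is first padded to 13 to cover a misaligned
	 * start, then rounded up to 16 basic blocks (two full sectors).
	 */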
	return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
}

/*
 * Return the address of the start of the given block number's data
 * in a log buffer.  The buffer covers a log sector-aligned region.
 */
static inline unsigned int
xlog_align(
	struct xlog	*log,
	xfs_daddr_t	blk_no)
{
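	/*
	 * For example, with l_sectBBsize == 8 a read of block 11 starts
	 * at block 8 of the buffer, so the data for block 11 lives at
	 * byte offset BBTOB(11 & 7) == 3 * 512 == 1536.
	 */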
	return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
}

static int
xlog_do_io(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	unsigned int	nbblks,
	char		*data,
	enum req_op	op)
{
	int		error;

	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
		xfs_warn(log->l_mp,
			 "Invalid log block/length (0x%llx, 0x%x) for buffer",
			 blk_no, nbblks);
		return -EFSCORRUPTED;
	}

	blk_no = round_down(blk_no, log->l_sectBBsize);
	nbblks = round_up(nbblks, log->l_sectBBsize);
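	/*
	 * The aligned I/O always covers the caller's range: e.g. a read
	 * of block 11 with l_sectBBsize == 8 becomes a read of blocks
	 * [8, 16), and xlog_align() locates block 11 within that buffer.
	 */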
	ASSERT(nbblks > 0);

	error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
			BBTOB(nbblks), data, op);
	if (error && !xlog_is_shutdown(log)) {
		xfs_alert(log->l_mp,
			  "log recovery %s I/O error at daddr 0x%llx len %d error %d",
			  op == REQ_OP_WRITE ? "write" : "read",
			  blk_no, nbblks, error);
	}
	return error;
}

STATIC int
xlog_bread_noalign(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
}

STATIC int
xlog_bread(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data,
	char		**offset)
{
	int		error;

	error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
	if (!error)
		*offset = data + xlog_align(log, blk_no);
	return error;
}

STATIC int
xlog_bwrite(
	struct xlog	*log,
	xfs_daddr_t	blk_no,
	int		nbblks,
	char		*data)
{
	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
}

#ifdef DEBUG
/*
 * dump debug superblock and log record information
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
#define xlog_header_check_dump(mp, head)
#endif

/*
 * check log record header for recovery
 */
STATIC int
xlog_header_check_recover(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));

	/*
	 * IRIX doesn't write the h_fmt field and leaves it zeroed
	 * (XLOG_FMT_UNKNOWN).  This stops us from trying to recover
	 * a dirty log created in IRIX.
	 */
	if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
		xfs_warn(mp,
	"dirty log written in incompatible format - can't recover");
		xlog_header_check_dump(mp, head);
		return -EFSCORRUPTED;
	}
	if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
					   &head->h_fs_uuid))) {
		xfs_warn(mp,
	"dirty log entry has mismatched uuid - can't recover");
		xlog_header_check_dump(mp, head);
		return -EFSCORRUPTED;
	}
	return 0;
}

/*
 * read the head block of the log and check the header
 */
STATIC int
xlog_header_check_mount(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));

	if (uuid_is_null(&head->h_fs_uuid)) {
		/*
		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
		 * h_fs_uuid is null, we assume this log was last mounted
		 * by IRIX and continue.
		 */
		xfs_warn(mp, "null uuid in log - IRIX style log");
	} else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
						  &head->h_fs_uuid))) {
		xfs_warn(mp, "log has mismatched uuid - can't recover");
		xlog_header_check_dump(mp, head);
		return -EFSCORRUPTED;
	}
	return 0;
}
/*
 * This routine finds (to an approximation) the first block in the physical
 * log which contains the given cycle.  It uses a binary search algorithm.
 * Note that the algorithm cannot be perfect because the disk will not
 * necessarily be perfect.
 */
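/*
 * For example, if the cycle numbers on disk are 3 3 3 2 2 2 and we search
 * for cycle 2, the search converges on the 3|2 boundary and returns the
 * first block stamped with cycle 2; the callers then verify a window of
 * blocks around that answer.
 */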
STATIC int
xlog_find_cycle_start(
	struct xlog	*log,
	char		*buffer,
	xfs_daddr_t	first_blk,
	xfs_daddr_t	*last_blk,
	uint		cycle)
{
	char		*offset;
	xfs_daddr_t	mid_blk;
	xfs_daddr_t	end_blk;
	uint		mid_cycle;
	int		error;

	end_blk = *last_blk;
	mid_blk = BLK_AVG(first_blk, end_blk);
	while (mid_blk != first_blk && mid_blk != end_blk) {
		error = xlog_bread(log, mid_blk, 1, buffer, &offset);
		if (error)
			return error;
		mid_cycle = xlog_get_cycle(offset);
		if (mid_cycle == cycle)
			end_blk = mid_blk;	/* last_half_cycle == mid_cycle */
		else
			first_blk = mid_blk;	/* first_half_cycle == mid_cycle */
		mid_blk = BLK_AVG(first_blk, end_blk);
	}
	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
	       (mid_blk == end_blk && mid_blk-1 == first_blk));

	*last_blk = end_blk;

	return 0;
}

/*
 * Check that a range of blocks does not contain stop_on_cycle_no.
 * Fill in *new_blk with the block offset where such a block is
 * found, or with -1 (an invalid block number) if there is no such
 * block in the range.  The scan needs to occur from front to back
 * and the pointer into the region must be updated since a later
 * routine will need to perform another test.
 */
STATIC int
xlog_find_verify_cycle(
	struct xlog	*log,
	xfs_daddr_t	start_blk,
	int		nbblks,
	uint		stop_on_cycle_no,
	xfs_daddr_t	*new_blk)
{
	xfs_daddr_t	i, j;
	uint		cycle;
	char		*buffer;
	xfs_daddr_t	bufblks;
	char		*buf = NULL;
	int		error = 0;

	/*
	 * Greedily allocate a buffer big enough to handle the full
	 * range of basic blocks we'll be examining.  If that fails,
	 * try a smaller size.  We need to be able to read at least
	 * a log sector, or we're out of luck.
	 */
	bufblks = roundup_pow_of_two(nbblks);
	while (bufblks > log->l_logBBsize)
		bufblks >>= 1;
	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
		bufblks >>= 1;
		if (bufblks < log->l_sectBBsize)
			return -ENOMEM;
	}

	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
		int	bcount;

		bcount = min(bufblks, (start_blk + nbblks - i));

		error = xlog_bread(log, i, bcount, buffer, &buf);
		if (error)
			goto out;

		for (j = 0; j < bcount; j++) {
			cycle = xlog_get_cycle(buf);
			if (cycle == stop_on_cycle_no) {
				*new_blk = i+j;
				goto out;
			}

			buf += BBSIZE;
		}
	}

	*new_blk = -1;

out:
	kmem_free(buffer);
	return error;
}

static inline int
xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
{
	if (xfs_has_logv2(log->l_mp)) {
		int	h_size = be32_to_cpu(rh->h_size);

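		/*
		 * For example, a v2 log with 256k iclogs records
		 * h_size == 256k, so each record is preceded by
		 * DIV_ROUND_UP(256k, 32k) == 8 header blocks rather
		 * than one.
		 */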
		if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
		    h_size > XLOG_HEADER_CYCLE_SIZE)
			return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
	}
	return 1;
}

/*
 * Potentially backup over partial log record write.
 *
 * In the typical case, last_blk is the number of the block directly after
 * a good log record.  Therefore, we subtract one to get the block number
 * of the last block in the given buffer.  extra_bblks contains the number
 * of blocks we would have read on a previous read.  This happens when the
 * last log record is split over the end of the physical log.
 *
 * extra_bblks is the number of blocks potentially verified on a previous
 * call to this routine.
 */
STATIC int
xlog_find_verify_log_record(
	struct xlog	*log,
	xfs_daddr_t	start_blk,
	xfs_daddr_t	*last_blk,
	int		extra_bblks)
{
	xfs_daddr_t	i;
	char		*buffer;
	char		*offset = NULL;
	xlog_rec_header_t	*head = NULL;
	int		error = 0;
	int		smallmem = 0;
	int		num_blks = *last_blk - start_blk;
	int		xhdrs;

	ASSERT(start_blk != 0 || *last_blk != start_blk);

	buffer = xlog_alloc_buffer(log, num_blks);
	if (!buffer) {
		buffer = xlog_alloc_buffer(log, 1);
		if (!buffer)
			return -ENOMEM;
		smallmem = 1;
	} else {
		error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
		if (error)
			goto out;
		offset += ((num_blks - 1) << BBSHIFT);
	}

	for (i = (*last_blk) - 1; i >= 0; i--) {
		if (i < start_blk) {
			/* valid log record not found */
			xfs_warn(log->l_mp,
		"Log inconsistent (didn't find previous header)");
			ASSERT(0);
			error = -EFSCORRUPTED;
			goto out;
		}

		if (smallmem) {
			error = xlog_bread(log, i, 1, buffer, &offset);
			if (error)
				goto out;
		}

		head = (xlog_rec_header_t *)offset;

		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
			break;

		if (!smallmem)
			offset -= BBSIZE;
	}

	/*
	 * We hit the beginning of the physical log & still no header.  Return
	 * to caller.  If caller can handle a return of -1, then this routine
	 * will be called again for the end of the physical log.
	 */
	if (i == -1) {
		error = 1;
		goto out;
	}

	/*
	 * We have the final block of the good log (the first block
	 * of the log record _before_ the head).  So we check the uuid.
	 */
	if ((error = xlog_header_check_mount(log->l_mp, head)))
		goto out;

	/*
	 * We may have found a log record header before we expected one.
	 * last_blk will be the 1st block # with a given cycle #.  We may end
	 * up reading an entire log record.  In this case, we don't want to
	 * reset last_blk.  Only when last_blk points in the middle of a log
	 * record do we update last_blk.
	 */
	xhdrs = xlog_logrec_hblks(log, head);

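	/*
	 * A record that ends exactly at *last_blk satisfies
	 * *last_blk - i + extra_bblks == BTOBB(h_len) + xhdrs; e.g. a 16k
	 * record with one header block found 33 blocks back (and no
	 * extra_bblks) gives 33 == 32 + 1.  Anything else means *last_blk
	 * points into the middle of the record, so pull it back to the
	 * record header.
	 */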
	if (*last_blk - i + extra_bblks !=
	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
		*last_blk = i;

out:
	kmem_free(buffer);
	return error;
}

/*
 * Head is defined to be the point of the log where the next log write
 * could go.  This means that incomplete LR writes at the end are
 * eliminated when calculating the head.  We aren't guaranteed that previous
 * LRs have complete transactions.  We only know that a cycle number of
 * current cycle number -1 won't be present in the log if we start writing
 * from our current block number.
 *
 * last_blk contains the block number of the first block with a given
 * cycle number.
 *
 * Return: zero if normal, non-zero if error.
 */
STATIC int
xlog_find_head(
	struct xlog	*log,
	xfs_daddr_t	*return_head_blk)
{
	char		*buffer;
	char		*offset;
	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
	int		num_scan_bblks;
	uint		first_half_cycle, last_half_cycle;
	uint		stop_on_cycle;
	int		error, log_bbnum = log->l_logBBsize;

	/* Is the end of the log device zeroed? */
	error = xlog_find_zeroed(log, &first_blk);
	if (error < 0) {
		xfs_warn(log->l_mp, "empty log check failed");
		return error;
	}
	if (error == 1) {
		*return_head_blk = first_blk;

		/* Is the whole lot zeroed? */
		if (!first_blk) {
			/* Linux XFS shouldn't generate totally zeroed logs -
			 * mkfs etc write a dummy unmount record to a fresh
			 * log so we can store the uuid in there
			 */
			xfs_warn(log->l_mp, "totally zeroed log");
		}

		return 0;
	}

	first_blk = 0;			/* get cycle # of 1st block */
	buffer = xlog_alloc_buffer(log, 1);
	if (!buffer)
		return -ENOMEM;

	error = xlog_bread(log, 0, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	first_half_cycle = xlog_get_cycle(offset);

	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
	error = xlog_bread(log, last_blk, 1, buffer, &offset);
	if (error)
		goto out_free_buffer;

	last_half_cycle = xlog_get_cycle(offset);
	ASSERT(last_half_cycle != 0);

	/*
	 * If the 1st half cycle number is equal to the last half cycle number,
	 * then the entire log is stamped with the same cycle number.  In this
	 * case, head_blk can't be set to zero (which makes sense).  The below
	 * math doesn't work out properly with head_blk equal to zero.  Instead,
	 * we set it to log_bbnum which is an invalid block number, but this
	 * value makes the math correct.  If head_blk doesn't change through
	 * all the tests below, *head_blk is set to zero at the very end rather
	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
	 * in a circular file.
	 */
	if (first_half_cycle == last_half_cycle) {
		/*
		 * In this case we believe that the entire log should have
		 * cycle number last_half_cycle.  We need to scan backwards
		 * from the end verifying that there are no holes still
		 * containing last_half_cycle - 1.  If we find such a hole,
		 * then the start of that hole will be the new head.  The
		 * simple case looks like
		 *        x | x ... | x - 1 | x
		 * Another case that fits this picture would be
		 *        x | x + 1 | x ... | x
		 * In this case the head really is somewhere at the end of the
		 * log, as one of the latest writes at the beginning was
		 * incomplete.
		 * One more case is
		 *        x | x + 1 | x ... | x - 1 | x
		 * This is really the combination of the above two cases, and
		 * the head has to end up at the start of the x-1 hole at the
		 * end of the log.
		 *
		 * In the 256k log case, we will read from the beginning to the
		 * end of the log and search for cycle numbers equal to x-1.
		 * We don't worry about the x+1 blocks that we encounter,
		 * because we know that they cannot be the head since the log
		 * started with x.
		 */
		head_blk = log_bbnum;
		stop_on_cycle = last_half_cycle - 1;
	} else {
		/*
		 * In this case we want to find the first block with cycle
		 * number matching last_half_cycle.  We expect the log to be
		 * some variation on
		 *        x + 1 ... | x ... | x
		 * The first block with cycle number x (last_half_cycle) will
		 * be where the new head belongs.  First we do a binary search
		 * for the first occurrence of last_half_cycle.  The binary
		 * search may not be totally accurate, so then we scan back
		 * from there looking for occurrences of last_half_cycle before
		 * us.  If that backwards scan wraps around the beginning of
		 * the log, then we look for occurrences of last_half_cycle - 1
		 * at the end of the log.  The cases we're looking for look
		 * like
		 *                               v binary search stopped here
		 *        x + 1 ... | x | x + 1 | x ... | x
		 *                   ^ but we want to locate this spot
		 * or
		 *        <---------> less than scan distance
		 *        x + 1 ... | x ... | x - 1 | x
		 *                           ^ we want to locate this spot
		 */
		stop_on_cycle = last_half_cycle;
		error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
				last_half_cycle);
		if (error)
			goto out_free_buffer;
	}

	/*
	 * Now validate the answer.  Scan back some number of maximum possible
	 * blocks and make sure each one has the expected cycle number.  The
	 * maximum is determined by the total possible amount of buffering
	 * in the in-core log.  The following number can be made tighter if
	 * we actually look at the block size of the filesystem.
	 */
	num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
	if (head_blk >= num_scan_bblks) {
		/*
		 * We are guaranteed that the entire check can be performed
		 * in one buffer.
		 */
		start_blk = head_blk - num_scan_bblks;
		if ((error = xlog_find_verify_cycle(log,
						start_blk, num_scan_bblks,
						stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	} else {		/* need to read 2 parts of log */
		/*
		 * We are going to scan backwards in the log in two parts.
		 * First we scan the physical end of the log.  In this part
		 * of the log, we are looking for blocks with cycle number
		 * last_half_cycle - 1.
		 * If we find one, then we know that the log starts there, as
		 * we've found a hole that didn't get written in going around
		 * the end of the physical log.  The simple case for this is
		 *        x + 1 ... | x ... | x - 1 | x
		 *        <---------> less than scan distance
		 * If all of the blocks at the end of the log have cycle number
		 * last_half_cycle, then we check the blocks at the start of
		 * the log looking for occurrences of last_half_cycle.  If we
		 * find one, then our current estimate for the location of the
		 * first occurrence of last_half_cycle is wrong and we move
		 * back to the hole we've found.  This case looks like
		 *        x + 1 ... | x | x + 1 | x ...
		 *                       ^ binary search stopped here
		 * Another case we need to handle that only occurs in 256k
		 * logs is
		 *        x + 1 ... | x ... | x+1 | x ...
		 *                   ^ binary search stops here
		 * In a 256k log, the scan at the end of the log will see the
		 * x + 1 blocks.  We need to skip past those since that is
		 * certainly not the head of the log.  By searching for
		 * last_half_cycle-1 we accomplish that.
		 */
		ASSERT(head_blk <= INT_MAX &&
			(xfs_daddr_t) num_scan_bblks >= head_blk);
		start_blk = log_bbnum - (num_scan_bblks - head_blk);
		if ((error = xlog_find_verify_cycle(log, start_blk,
					num_scan_bblks - (int)head_blk,
					(stop_on_cycle - 1), &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1) {
			head_blk = new_blk;
			goto validate_head;
		}

		/*
		 * Scan beginning of log now.  The last part of the physical
		 * log is good.  This scan needs to verify that it doesn't find
		 * the last_half_cycle.
		 */
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		if ((error = xlog_find_verify_cycle(log,
					start_blk, (int)head_blk,
					stop_on_cycle, &new_blk)))
			goto out_free_buffer;
		if (new_blk != -1)
			head_blk = new_blk;
	}

validate_head:
	/*
	 * Now we need to make sure head_blk is not pointing to a block in
	 * the middle of a log record.
	 */
	num_scan_bblks = XLOG_REC_SHIFT(log);
	if (head_blk >= num_scan_bblks) {
		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */

		/* start ptr at last block ptr before head_blk */
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		if (error == 1)
			error = -EIO;
		if (error)
			goto out_free_buffer;
	} else {
		start_blk = 0;
		ASSERT(head_blk <= INT_MAX);
		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
		if (error < 0)
			goto out_free_buffer;
		if (error == 1) {
			/* We hit the beginning of the log during our search */
			start_blk = log_bbnum - (num_scan_bblks - head_blk);
			new_blk = log_bbnum;
			ASSERT(start_blk <= INT_MAX &&
				(xfs_daddr_t) log_bbnum-start_blk >= 0);
			ASSERT(head_blk <= INT_MAX);
			error = xlog_find_verify_log_record(log, start_blk,
							&new_blk, (int)head_blk);
			if (error == 1)
				error = -EIO;
			if (error)
				goto out_free_buffer;
			if (new_blk != log_bbnum)
				head_blk = new_blk;
		} else if (error)
			goto out_free_buffer;
	}

	kmem_free(buffer);
	if (head_blk == log_bbnum)
		*return_head_blk = 0;
	else
		*return_head_blk = head_blk;
	/*
	 * When returning here, we have a good block number.  Bad block
	 * means that during a previous crash, we didn't have a clean break
	 * from cycle number N to cycle number N-1.  In this case, we need
	 * to find the first block with cycle number N-1.
	 */
	return 0;

out_free_buffer:
	kmem_free(buffer);
	if (error)
		xfs_warn(log->l_mp, "failed to find log head");
	return error;
}

/*
 * Seek backwards in the log for log record headers.
 *
 * Given a starting log block, walk backwards until we find the provided number
 * of records or hit the provided tail block. The return value is the number of
 * records encountered or a negative error code. The log block and buffer
 * pointer of the last record seen are returned in rblk and rhead respectively.
 */
STATIC int
xlog_rseek_logrec_hdr(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			count,
	char			*buffer,
	xfs_daddr_t		*rblk,
	struct xlog_rec_header	**rhead,
	bool			*wrapped)
{
	int			i;
	int			error;
	int			found = 0;
	char			*offset = NULL;
	xfs_daddr_t		end_blk;

	*wrapped = false;

	/*
	 * Walk backwards from the head block until we hit the tail or the first
	 * block in the log.
	 */
	end_blk = head_blk > tail_blk ? tail_blk : 0;
	for (i = (int) head_blk - 1; i >= end_blk; i--) {
		error = xlog_bread(log, i, 1, buffer, &offset);
		if (error)
			goto out_error;

		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
			*rblk = i;
			*rhead = (struct xlog_rec_header *) offset;
			if (++found == count)
				break;
		}
	}

	/*
	 * If we haven't hit the tail block or the log record header count,
	 * start looking again from the end of the physical log. Note that
	 * callers can pass head == tail if the tail is not yet known.
	 */
	if (tail_blk >= head_blk && found != count) {
		for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
			error = xlog_bread(log, i, 1, buffer, &offset);
			if (error)
				goto out_error;

			if (*(__be32 *)offset ==
			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
				*wrapped = true;
				*rblk = i;
				*rhead = (struct xlog_rec_header *) offset;
				if (++found == count)
					break;
			}
		}
	}

	return found;

out_error:
	return error;
}

/*
 * Seek forward in the log for log record headers.
 *
 * Given head and tail blocks, walk forward from the tail block until we find
 * the provided number of records or hit the head block. The return value is the
 * number of records encountered or a negative error code. The log block and
 * buffer pointer of the last record seen are returned in rblk and rhead
 * respectively.
 */
STATIC int
xlog_seek_logrec_hdr(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			count,
	char			*buffer,
	xfs_daddr_t		*rblk,
	struct xlog_rec_header	**rhead,
	bool			*wrapped)
{
	int			i;
	int			error;
	int			found = 0;
	char			*offset = NULL;
	xfs_daddr_t		end_blk;

	*wrapped = false;

	/*
	 * Walk forward from the tail block until we hit the head or the last
	 * block in the log.
	 */
	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
	for (i = (int) tail_blk; i <= end_blk; i++) {
		error = xlog_bread(log, i, 1, buffer, &offset);
		if (error)
			goto out_error;

		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
			*rblk = i;
			*rhead = (struct xlog_rec_header *) offset;
			if (++found == count)
				break;
		}
	}

	/*
	 * If we haven't hit the head block or the log record header count,
	 * start looking again from the start of the physical log.
	 */
	if (tail_blk > head_blk && found != count) {
		for (i = 0; i < (int) head_blk; i++) {
			error = xlog_bread(log, i, 1, buffer, &offset);
			if (error)
				goto out_error;

			if (*(__be32 *)offset ==
			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
				*wrapped = true;
				*rblk = i;
				*rhead = (struct xlog_rec_header *) offset;
				if (++found == count)
					break;
			}
		}
	}

	return found;

out_error:
	return error;
}

/*
 * Calculate distance from head to tail (i.e., unused space in the log).
 */
static inline int
xlog_tail_distance(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
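	/*
	 * For example, in a 100-block log with head_blk == 90 and
	 * tail_blk == 10, the free space wraps around the end of the
	 * log: 10 + (100 - 90) == 20 blocks.
	 */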
	if (head_blk < tail_blk)
		return tail_blk - head_blk;

	return tail_blk + (log->l_logBBsize - head_blk);
}

/*
 * Verify the log tail. This is particularly important when torn or incomplete
 * writes have been detected near the front of the log and the head has been
 * walked back accordingly.
 *
 * We also have to handle the case where the tail was pinned and the head
 * blocked behind the tail right before a crash. If the tail had been pushed
 * immediately prior to the crash and the subsequent checkpoint was only
 * partially written, it's possible it overwrote the last referenced tail in the
 * log with garbage. This is not a coherency problem because the tail must have
 * been pushed before it can be overwritten, but appears as log corruption to
 * recovery because we have no way to know the tail was updated if the
 * subsequent checkpoint didn't write successfully.
 *
 * Therefore, CRC check the log from tail to head. If a failure occurs and the
 * offending record is within max iclog bufs from the head, walk the tail
 * forward and retry until a valid tail is found or corruption is detected out
 * of the range of a possible overwrite.
 */
STATIC int
xlog_verify_tail(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		*tail_blk,
	int			hsize)
{
	struct xlog_rec_header	*thead;
	char			*buffer;
	xfs_daddr_t		first_bad;
	int			error = 0;
	bool			wrapped;
	xfs_daddr_t		tmp_tail;
	xfs_daddr_t		orig_tail = *tail_blk;

	buffer = xlog_alloc_buffer(log, 1);
	if (!buffer)
		return -ENOMEM;

	/*
	 * Make sure the tail points to a record (returns positive count on
	 * success).
	 */
	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
			&tmp_tail, &thead, &wrapped);
	if (error < 0)
		goto out;
	if (*tail_blk != tmp_tail)
		*tail_blk = tmp_tail;

	/*
	 * Run a CRC check from the tail to the head. We can't just check
	 * MAX_ICLOGS records past the tail because the tail may point to stale
	 * blocks cleared during the search for the head/tail. These blocks are
	 * overwritten with zero-length records and thus record count is not a
	 * reliable indicator of the iclog state before a crash.
	 */
	first_bad = 0;
	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
				      XLOG_RECOVER_CRCPASS, &first_bad);
	while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
		int	tail_distance;

		/*
		 * Is corruption within range of the head? If so, retry from
		 * the next record. Otherwise return an error.
		 */
		tail_distance = xlog_tail_distance(log, head_blk, first_bad);
		if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
			break;

		/* skip to the next record; returns positive count on success */
		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
				buffer, &tmp_tail, &thead, &wrapped);
		if (error < 0)
			goto out;

		*tail_blk = tmp_tail;
		first_bad = 0;
		error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
					      XLOG_RECOVER_CRCPASS, &first_bad);
	}

	if (!error && *tail_blk != orig_tail)
		xfs_warn(log->l_mp,
		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
			 orig_tail, *tail_blk);
out:
	kmem_free(buffer);
	return error;
}

/*
 * Detect and trim torn writes from the head of the log.
 *
 * Storage without sector atomicity guarantees can result in torn writes in the
 * log in the event of a crash. Our only means to detect this scenario is via
 * CRC verification. While we can't always be certain that CRC verification
 * failure is due to a torn write vs. an unrelated corruption, we do know that
 * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
 * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
 * the log and treat failures in this range as torn writes as a matter of
 * policy. In the event of CRC failure, the head is walked back to the last good
 * record in the log and the tail is updated from that record and verified.
 */
STATIC int
xlog_verify_head(
	struct xlog		*log,
	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
	xfs_daddr_t		*tail_blk,	/* out: tail block */
	char			*buffer,
	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
	struct xlog_rec_header	**rhead,	/* ptr to last record */
	bool			*wrapped)	/* last rec. wraps phys. log */
{
	struct xlog_rec_header	*tmp_rhead;
	char			*tmp_buffer;
	xfs_daddr_t		first_bad;
	xfs_daddr_t		tmp_rhead_blk;
	int			found;
	int			error;
	bool			tmp_wrapped;

	/*
	 * Check the head of the log for torn writes. Search backwards from the
	 * head until we hit the tail or the maximum number of log record I/Os
	 * that could have been in flight at one time. Use a temporary buffer so
	 * we don't trash the rhead/buffer pointers from the caller.
	 */
	tmp_buffer = xlog_alloc_buffer(log, 1);
	if (!tmp_buffer)
		return -ENOMEM;
	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
				      XLOG_MAX_ICLOGS, tmp_buffer,
				      &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
	kmem_free(tmp_buffer);
	if (error < 0)
		return error;

	/*
	 * Now run a CRC verification pass over the records starting at the
	 * block found above to the current head. If a CRC failure occurs, the
	 * log block of the first bad record is saved in first_bad.
	 */
	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
				      XLOG_RECOVER_CRCPASS, &first_bad);
	if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
		/*
		 * We've hit a potential torn write. Reset the error and warn
		 * about it.
		 */
		error = 0;
		xfs_warn(log->l_mp,
"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
			 first_bad, *head_blk);

		/*
		 * Get the header block and buffer pointer for the last good
		 * record before the bad record.
		 *
		 * Note that xlog_find_tail() clears the blocks at the new head
		 * (i.e., the records with invalid CRC) if the cycle number
		 * matches the current cycle.
		 */
		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
				buffer, rhead_blk, rhead, wrapped);
		if (found < 0)
			return found;
		if (found == 0)		/* XXX: right thing to do here? */
			return -EIO;

		/*
		 * Reset the head block to the starting block of the first bad
		 * log record and set the tail block based on the last good
		 * record.
		 *
		 * Bail out if the updated head/tail match as this indicates
		 * possible corruption outside of the acceptable
		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
		 */
		*head_blk = first_bad;
		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
		if (*head_blk == *tail_blk) {
			ASSERT(0);
			return 0;
		}
	}
	if (error)
		return error;
11035297ac1fSBrian Foster
11044a4f66eaSBrian Foster return xlog_verify_tail(log, *head_blk, tail_blk,
11054a4f66eaSBrian Foster be32_to_cpu((*rhead)->h_size));
11067088c413SBrian Foster }
11077088c413SBrian Foster
11087088c413SBrian Foster /*
11090703a8e1SDave Chinner  * We need to handle log wrapping properly, so we can't use the calculated
11100703a8e1SDave Chinner  * logbno directly. Wrap it back to the correct bno inside the log (a worked
11110703a8e1SDave Chinner  * example follows the function below).
11120703a8e1SDave Chinner *
11130703a8e1SDave Chinner * The log is limited to 32 bit sizes, so we use the appropriate modulus
11140703a8e1SDave Chinner * operation here and cast it back to a 64 bit daddr on return.
11150703a8e1SDave Chinner */
11160703a8e1SDave Chinner static inline xfs_daddr_t
11170703a8e1SDave Chinner xlog_wrap_logbno(
11180703a8e1SDave Chinner struct xlog *log,
11190703a8e1SDave Chinner xfs_daddr_t bno)
11200703a8e1SDave Chinner {
11210703a8e1SDave Chinner int mod;
11220703a8e1SDave Chinner
11230703a8e1SDave Chinner div_s64_rem(bno, log->l_logBBsize, &mod);
11240703a8e1SDave Chinner return mod;
11250703a8e1SDave Chinner }
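
/*
 * Worked example for xlog_wrap_logbno() above (illustrative figures, not
 * taken from the code): with a 1000 block physical log (l_logBBsize ==
 * 1000), a calculated bno of 1005 wraps to block 5, while an in-range
 * bno of 998 is returned unchanged:
 *
 *	xlog_wrap_logbno(log, 1005) == 5
 *	xlog_wrap_logbno(log, 998)  == 998
 *
 * div_s64_rem() is used rather than a plain '%' because bno is a 64 bit
 * xfs_daddr_t and open-coded 64 bit modulus is not available on all
 * 32 bit platforms.
 */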
11260703a8e1SDave Chinner
11270703a8e1SDave Chinner /*
112865b99a08SBrian Foster * Check whether the head of the log points to an unmount record. In other
112965b99a08SBrian Foster * words, determine whether the log is clean. If so, update the in-core state
113065b99a08SBrian Foster * appropriately.
113165b99a08SBrian Foster */
113265b99a08SBrian Foster static int
113365b99a08SBrian Foster xlog_check_unmount_rec(
113465b99a08SBrian Foster struct xlog *log,
113565b99a08SBrian Foster xfs_daddr_t *head_blk,
113665b99a08SBrian Foster xfs_daddr_t *tail_blk,
113765b99a08SBrian Foster struct xlog_rec_header *rhead,
113865b99a08SBrian Foster xfs_daddr_t rhead_blk,
11396e9b3dd8SChristoph Hellwig char *buffer,
114065b99a08SBrian Foster bool *clean)
114165b99a08SBrian Foster {
114265b99a08SBrian Foster struct xlog_op_header *op_head;
114365b99a08SBrian Foster xfs_daddr_t umount_data_blk;
114465b99a08SBrian Foster xfs_daddr_t after_umount_blk;
114565b99a08SBrian Foster int hblks;
114665b99a08SBrian Foster int error;
114765b99a08SBrian Foster char *offset;
114865b99a08SBrian Foster
114965b99a08SBrian Foster *clean = false;
115065b99a08SBrian Foster
115165b99a08SBrian Foster /*
115265b99a08SBrian Foster * Look for unmount record. If we find it, then we know there was a
115365b99a08SBrian Foster 	 * clean unmount. Since the block after the unmount record could land past
115465b99a08SBrian Foster 	 * the end of the physical log, we wrap it back before comparing to head_blk.
115565b99a08SBrian Foster *
115665b99a08SBrian Foster 	 * Save the current tail lsn so we can pass it to xlog_clear_stale_blocks()
115765b99a08SBrian Foster 	 * below. We don't want to clear the unmount record if there is one, so
115865b99a08SBrian Foster * we pass the lsn of the unmount record rather than the block after it.
115965b99a08SBrian Foster */
11600c771b99SGao Xiang hblks = xlog_logrec_hblks(log, rhead);
11610703a8e1SDave Chinner after_umount_blk = xlog_wrap_logbno(log,
11620703a8e1SDave Chinner rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
11630703a8e1SDave Chinner
116465b99a08SBrian Foster if (*head_blk == after_umount_blk &&
116565b99a08SBrian Foster be32_to_cpu(rhead->h_num_logops) == 1) {
11660703a8e1SDave Chinner umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
11676e9b3dd8SChristoph Hellwig error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
116865b99a08SBrian Foster if (error)
116965b99a08SBrian Foster return error;
117065b99a08SBrian Foster
117165b99a08SBrian Foster op_head = (struct xlog_op_header *)offset;
117265b99a08SBrian Foster if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
117365b99a08SBrian Foster /*
117465b99a08SBrian Foster * Set tail and last sync so that newly written log
117565b99a08SBrian Foster * records will point recovery to after the current
117665b99a08SBrian Foster * unmount record.
117765b99a08SBrian Foster */
117865b99a08SBrian Foster xlog_assign_atomic_lsn(&log->l_tail_lsn,
117965b99a08SBrian Foster log->l_curr_cycle, after_umount_blk);
118065b99a08SBrian Foster xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
118165b99a08SBrian Foster log->l_curr_cycle, after_umount_blk);
118265b99a08SBrian Foster *tail_blk = after_umount_blk;
118365b99a08SBrian Foster
118465b99a08SBrian Foster *clean = true;
118565b99a08SBrian Foster }
118665b99a08SBrian Foster }
118765b99a08SBrian Foster
118865b99a08SBrian Foster return 0;
118965b99a08SBrian Foster }
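
/*
 * Worked example for the layout check above (illustrative figures): if
 * the last record header sits at rhead_blk == 98 of a 100 block log,
 * uses a single header block (hblks == 1) and covers 512 bytes of data
 * (BTOBB(h_len) == 1), then
 *
 *	umount_data_blk  = xlog_wrap_logbno(log, 98 + 1)     == 99
 *	after_umount_blk = xlog_wrap_logbno(log, 98 + 1 + 1) == 0
 *
 * i.e. the record wraps the physical log, and the log is only
 * considered clean if the head points at block 0, the record contains
 * exactly one log operation and that op header carries
 * XLOG_UNMOUNT_TRANS.
 */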
119065b99a08SBrian Foster
1191717bc0ebSBrian Foster static void
1192717bc0ebSBrian Foster xlog_set_state(
1193717bc0ebSBrian Foster struct xlog *log,
1194717bc0ebSBrian Foster xfs_daddr_t head_blk,
1195717bc0ebSBrian Foster struct xlog_rec_header *rhead,
1196717bc0ebSBrian Foster xfs_daddr_t rhead_blk,
1197717bc0ebSBrian Foster bool bump_cycle)
1198717bc0ebSBrian Foster {
1199717bc0ebSBrian Foster /*
1200717bc0ebSBrian Foster * Reset log values according to the state of the log when we
1201717bc0ebSBrian Foster 	 * crashed. In the case where head_blk == 0, we bump curr_cycle by
1202717bc0ebSBrian Foster 	 * one because the next write starts a new cycle rather than
1203717bc0ebSBrian Foster * continuing the cycle of the last good log record. At this
1204717bc0ebSBrian Foster * point we have guaranteed that all partial log records have been
1205717bc0ebSBrian Foster * accounted for. Therefore, we know that the last good log record
1206717bc0ebSBrian Foster * written was complete and ended exactly on the end boundary
1207717bc0ebSBrian Foster * of the physical log.
1208717bc0ebSBrian Foster */
1209717bc0ebSBrian Foster log->l_prev_block = rhead_blk;
1210717bc0ebSBrian Foster log->l_curr_block = (int)head_blk;
1211717bc0ebSBrian Foster log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1212717bc0ebSBrian Foster if (bump_cycle)
1213717bc0ebSBrian Foster log->l_curr_cycle++;
1214717bc0ebSBrian Foster atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1215717bc0ebSBrian Foster atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1216717bc0ebSBrian Foster xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1217717bc0ebSBrian Foster BBTOB(log->l_curr_block));
1218717bc0ebSBrian Foster xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1219717bc0ebSBrian Foster BBTOB(log->l_curr_block));
1220717bc0ebSBrian Foster }
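
/*
 * Illustrative example (hypothetical numbers): if recovery determines the
 * head is block 64 of cycle 7, both grant heads are initialised to cycle 7
 * at byte offset BBTOB(64) == 32768, i.e. reservation and write space
 * accounting resume exactly at the physical position of the log head.
 */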
1221717bc0ebSBrian Foster
122265b99a08SBrian Foster /*
12231da177e4SLinus Torvalds * Find the sync block number or the tail of the log.
12241da177e4SLinus Torvalds *
12251da177e4SLinus Torvalds * This will be the block number of the last record to have its
12261da177e4SLinus Torvalds * associated buffers synced to disk. Every log record header has
12271da177e4SLinus Torvalds * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
12281da177e4SLinus Torvalds * to get a sync block number. The only concern is to figure out which
12291da177e4SLinus Torvalds * log record header to believe.
12301da177e4SLinus Torvalds *
12311da177e4SLinus Torvalds * The following algorithm uses the log record header with the largest
12321da177e4SLinus Torvalds * lsn. The entire log record does not need to be valid. We only care
12331da177e4SLinus Torvalds * that the header is valid.
12341da177e4SLinus Torvalds *
12351da177e4SLinus Torvalds  * We could speed up the search by using the current head_blk buffer, but
12361da177e4SLinus Torvalds  * it is not available.
12371da177e4SLinus Torvalds */
12385d77c0dcSEric Sandeen STATIC int
12391da177e4SLinus Torvalds xlog_find_tail(
12409a8d2fdbSMark Tinguely struct xlog *log,
12411da177e4SLinus Torvalds xfs_daddr_t *head_blk,
124265be6054SEric Sandeen xfs_daddr_t *tail_blk)
12431da177e4SLinus Torvalds {
12441da177e4SLinus Torvalds xlog_rec_header_t *rhead;
1245b2a922cdSChristoph Hellwig char *offset = NULL;
12466e9b3dd8SChristoph Hellwig char *buffer;
12477088c413SBrian Foster int error;
12487088c413SBrian Foster xfs_daddr_t rhead_blk;
12491da177e4SLinus Torvalds xfs_lsn_t tail_lsn;
1250eed6b462SBrian Foster bool wrapped = false;
125165b99a08SBrian Foster bool clean = false;
12521da177e4SLinus Torvalds
12531da177e4SLinus Torvalds /*
12541da177e4SLinus Torvalds * Find previous log record
12551da177e4SLinus Torvalds */
12561da177e4SLinus Torvalds if ((error = xlog_find_head(log, head_blk)))
12571da177e4SLinus Torvalds return error;
125882ff6cc2SBrian Foster ASSERT(*head_blk < INT_MAX);
12591da177e4SLinus Torvalds
12606e9b3dd8SChristoph Hellwig buffer = xlog_alloc_buffer(log, 1);
12616e9b3dd8SChristoph Hellwig if (!buffer)
12622451337dSDave Chinner return -ENOMEM;
12631da177e4SLinus Torvalds if (*head_blk == 0) { /* special case */
12646e9b3dd8SChristoph Hellwig error = xlog_bread(log, 0, 1, buffer, &offset);
1265076e6acbSChristoph Hellwig if (error)
12669db127edSAlex Elder goto done;
1267076e6acbSChristoph Hellwig
126803bea6feSChristoph Hellwig if (xlog_get_cycle(offset) == 0) {
12691da177e4SLinus Torvalds *tail_blk = 0;
12701da177e4SLinus Torvalds /* leave all other log inited values alone */
12719db127edSAlex Elder goto done;
12721da177e4SLinus Torvalds }
12731da177e4SLinus Torvalds }
12741da177e4SLinus Torvalds
12751da177e4SLinus Torvalds /*
127682ff6cc2SBrian Foster * Search backwards through the log looking for the log record header
127782ff6cc2SBrian Foster 	 * block. The search wraps all the way back around to the head, so
127882ff6cc2SBrian Foster 	 * something is seriously wrong if we can't find it.
12791da177e4SLinus Torvalds */
12806e9b3dd8SChristoph Hellwig error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
128182ff6cc2SBrian Foster &rhead_blk, &rhead, &wrapped);
128282ff6cc2SBrian Foster if (error < 0)
1283050552cbSDarrick J. Wong goto done;
128482ff6cc2SBrian Foster if (!error) {
128582ff6cc2SBrian Foster xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1286050552cbSDarrick J. Wong error = -EFSCORRUPTED;
1287050552cbSDarrick J. Wong goto done;
128882ff6cc2SBrian Foster }
128982ff6cc2SBrian Foster *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
12901da177e4SLinus Torvalds
12911da177e4SLinus Torvalds /*
1292717bc0ebSBrian Foster * Set the log state based on the current head record.
12931da177e4SLinus Torvalds */
1294717bc0ebSBrian Foster xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
12951c3cb9ecSDave Chinner tail_lsn = atomic64_read(&log->l_tail_lsn);
12961da177e4SLinus Torvalds
12971da177e4SLinus Torvalds /*
129865b99a08SBrian Foster * Look for an unmount record at the head of the log. This sets the log
129965b99a08SBrian Foster * state to determine whether recovery is necessary.
13001da177e4SLinus Torvalds */
130165b99a08SBrian Foster error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
13026e9b3dd8SChristoph Hellwig rhead_blk, buffer, &clean);
1303076e6acbSChristoph Hellwig if (error)
13049db127edSAlex Elder goto done;
1305076e6acbSChristoph Hellwig
13061da177e4SLinus Torvalds /*
13077f6aff3aSBrian Foster 	 * Verify the log head if the log is not clean (i.e., we have anything
13087f6aff3aSBrian Foster * but an unmount record at the head). This uses CRC verification to
13097f6aff3aSBrian Foster * detect and trim torn writes. If discovered, CRC failures are
13107f6aff3aSBrian Foster * considered torn writes and the log head is trimmed accordingly.
13117f6aff3aSBrian Foster *
13127f6aff3aSBrian Foster * Note that we can only run CRC verification when the log is dirty
13137f6aff3aSBrian Foster * because there's no guarantee that the log data behind an unmount
13147f6aff3aSBrian Foster * record is compatible with the current architecture.
13151da177e4SLinus Torvalds */
13167f6aff3aSBrian Foster if (!clean) {
13177f6aff3aSBrian Foster xfs_daddr_t orig_head = *head_blk;
13187f6aff3aSBrian Foster
13196e9b3dd8SChristoph Hellwig error = xlog_verify_head(log, head_blk, tail_blk, buffer,
13207f6aff3aSBrian Foster &rhead_blk, &rhead, &wrapped);
13217f6aff3aSBrian Foster if (error)
13227f6aff3aSBrian Foster goto done;
13237f6aff3aSBrian Foster
13247f6aff3aSBrian Foster /* update in-core state again if the head changed */
13257f6aff3aSBrian Foster if (*head_blk != orig_head) {
13267f6aff3aSBrian Foster xlog_set_state(log, *head_blk, rhead, rhead_blk,
13277f6aff3aSBrian Foster wrapped);
13287f6aff3aSBrian Foster tail_lsn = atomic64_read(&log->l_tail_lsn);
13297f6aff3aSBrian Foster error = xlog_check_unmount_rec(log, head_blk, tail_blk,
13306e9b3dd8SChristoph Hellwig rhead, rhead_blk, buffer,
13317f6aff3aSBrian Foster &clean);
13327f6aff3aSBrian Foster if (error)
13337f6aff3aSBrian Foster goto done;
13347f6aff3aSBrian Foster }
13357f6aff3aSBrian Foster }
133692821e2bSDavid Chinner
133792821e2bSDavid Chinner /*
133865b99a08SBrian Foster * Note that the unmount was clean. If the unmount was not clean, we
133965b99a08SBrian Foster * need to know this to rebuild the superblock counters from the perag
134065b99a08SBrian Foster * headers if we have a filesystem using non-persistent counters.
134192821e2bSDavid Chinner */
134265b99a08SBrian Foster if (clean)
13432e973b2cSDave Chinner set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
13441da177e4SLinus Torvalds
13451da177e4SLinus Torvalds /*
13461da177e4SLinus Torvalds * Make sure that there are no blocks in front of the head
13471da177e4SLinus Torvalds * with the same cycle number as the head. This can happen
13481da177e4SLinus Torvalds * because we allow multiple outstanding log writes concurrently,
13491da177e4SLinus Torvalds * and the later writes might make it out before earlier ones.
13501da177e4SLinus Torvalds *
13511da177e4SLinus Torvalds * We use the lsn from before modifying it so that we'll never
13521da177e4SLinus Torvalds * overwrite the unmount record after a clean unmount.
13531da177e4SLinus Torvalds *
13541da177e4SLinus Torvalds * Do this only if we are going to recover the filesystem
13551da177e4SLinus Torvalds *
13561da177e4SLinus Torvalds * NOTE: This used to say "if (!readonly)"
13571da177e4SLinus Torvalds * However on Linux, we can & do recover a read-only filesystem.
13581da177e4SLinus Torvalds * We only skip recovery if NORECOVERY is specified on mount,
13591da177e4SLinus Torvalds * in which case we would not be here.
13601da177e4SLinus Torvalds *
13611da177e4SLinus Torvalds * But... if the -device- itself is readonly, just skip this.
13621da177e4SLinus Torvalds * We can't recover this device anyway, so it won't matter.
13631da177e4SLinus Torvalds */
13642d15d2c0SChristoph Hellwig if (!xfs_readonly_buftarg(log->l_targ))
13651da177e4SLinus Torvalds error = xlog_clear_stale_blocks(log, tail_lsn);
13661da177e4SLinus Torvalds
13679db127edSAlex Elder done:
13686e9b3dd8SChristoph Hellwig kmem_free(buffer);
13691da177e4SLinus Torvalds
13701da177e4SLinus Torvalds if (error)
1371a0fa2b67SDave Chinner xfs_warn(log->l_mp, "failed to locate log tail");
13721da177e4SLinus Torvalds return error;
13731da177e4SLinus Torvalds }
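
/*
 * A short sketch of the lsn packing relied on above (shown for
 * illustration): an xfs_lsn_t carries the cycle number in the high
 * 32 bits and the basic block number in the low 32 bits, so
 *
 *	lsn = xlog_assign_lsn(5, 1234);
 *	CYCLE_LSN(lsn);		returns 5
 *	BLOCK_LSN(lsn);		returns 1234
 *
 * which is why the tail block can be pulled straight out of h_tail_lsn
 * with BLOCK_LSN() once the last record header has been found.
 */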
13741da177e4SLinus Torvalds
13751da177e4SLinus Torvalds /*
13761da177e4SLinus Torvalds * Is the log zeroed at all?
13771da177e4SLinus Torvalds *
13781da177e4SLinus Torvalds * The last binary search should be changed to perform an X block read
13791da177e4SLinus Torvalds * once X becomes small enough. You can then search linearly through
13801da177e4SLinus Torvalds * the X blocks. This will cut down on the number of reads we need to do.
13811da177e4SLinus Torvalds *
13821da177e4SLinus Torvalds * If the log is partially zeroed, this routine will pass back the blkno
13831da177e4SLinus Torvalds * of the first block with cycle number 0. It won't have a complete LR
13841da177e4SLinus Torvalds * preceding it.
13851da177e4SLinus Torvalds *
13861da177e4SLinus Torvalds * Return:
13871da177e4SLinus Torvalds * 0 => the log is completely written to
13882451337dSDave Chinner * 1 => use *blk_no as the first block of the log
13892451337dSDave Chinner * <0 => error has occurred
13901da177e4SLinus Torvalds */
1391a8272ce0SDavid Chinner STATIC int
13921da177e4SLinus Torvalds xlog_find_zeroed(
13939a8d2fdbSMark Tinguely struct xlog *log,
13941da177e4SLinus Torvalds xfs_daddr_t *blk_no)
13951da177e4SLinus Torvalds {
13966e9b3dd8SChristoph Hellwig char *buffer;
1397b2a922cdSChristoph Hellwig char *offset;
13981da177e4SLinus Torvalds uint first_cycle, last_cycle;
13991da177e4SLinus Torvalds xfs_daddr_t new_blk, last_blk, start_blk;
14001da177e4SLinus Torvalds xfs_daddr_t num_scan_bblks;
14011da177e4SLinus Torvalds int error, log_bbnum = log->l_logBBsize;
14021da177e4SLinus Torvalds
14036fdf8cccSNathan Scott *blk_no = 0;
14046fdf8cccSNathan Scott
14051da177e4SLinus Torvalds /* check totally zeroed log */
14066e9b3dd8SChristoph Hellwig buffer = xlog_alloc_buffer(log, 1);
14076e9b3dd8SChristoph Hellwig if (!buffer)
14082451337dSDave Chinner return -ENOMEM;
14096e9b3dd8SChristoph Hellwig error = xlog_bread(log, 0, 1, buffer, &offset);
1410076e6acbSChristoph Hellwig if (error)
14116e9b3dd8SChristoph Hellwig goto out_free_buffer;
1412076e6acbSChristoph Hellwig
141303bea6feSChristoph Hellwig first_cycle = xlog_get_cycle(offset);
14141da177e4SLinus Torvalds if (first_cycle == 0) { /* completely zeroed log */
14151da177e4SLinus Torvalds *blk_no = 0;
14166e9b3dd8SChristoph Hellwig kmem_free(buffer);
14172451337dSDave Chinner return 1;
14181da177e4SLinus Torvalds }
14191da177e4SLinus Torvalds
14201da177e4SLinus Torvalds /* check partially zeroed log */
14216e9b3dd8SChristoph Hellwig error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1422076e6acbSChristoph Hellwig if (error)
14236e9b3dd8SChristoph Hellwig goto out_free_buffer;
1424076e6acbSChristoph Hellwig
142503bea6feSChristoph Hellwig last_cycle = xlog_get_cycle(offset);
14261da177e4SLinus Torvalds if (last_cycle != 0) { /* log completely written to */
14276e9b3dd8SChristoph Hellwig kmem_free(buffer);
14281da177e4SLinus Torvalds return 0;
14291da177e4SLinus Torvalds }
14301da177e4SLinus Torvalds
14311da177e4SLinus Torvalds /* we have a partially zeroed log */
14321da177e4SLinus Torvalds last_blk = log_bbnum-1;
14336e9b3dd8SChristoph Hellwig error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
14346e9b3dd8SChristoph Hellwig if (error)
14356e9b3dd8SChristoph Hellwig goto out_free_buffer;
14361da177e4SLinus Torvalds
14371da177e4SLinus Torvalds /*
14381da177e4SLinus Torvalds * Validate the answer. Because there is no way to guarantee that
14391da177e4SLinus Torvalds * the entire log is made up of log records which are the same size,
14401da177e4SLinus Torvalds * we scan over the defined maximum blocks. At this point, the maximum
14411da177e4SLinus Torvalds * is not chosen to mean anything special. XXXmiken
14421da177e4SLinus Torvalds */
14431da177e4SLinus Torvalds num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
14441da177e4SLinus Torvalds ASSERT(num_scan_bblks <= INT_MAX);
14451da177e4SLinus Torvalds
14461da177e4SLinus Torvalds if (last_blk < num_scan_bblks)
14471da177e4SLinus Torvalds num_scan_bblks = last_blk;
14481da177e4SLinus Torvalds start_blk = last_blk - num_scan_bblks;
14491da177e4SLinus Torvalds
14501da177e4SLinus Torvalds /*
14511da177e4SLinus Torvalds * We search for any instances of cycle number 0 that occur before
14521da177e4SLinus Torvalds * our current estimate of the head. What we're trying to detect is
14531da177e4SLinus Torvalds * 1 ... | 0 | 1 | 0...
14541da177e4SLinus Torvalds * ^ binary search ends here
14551da177e4SLinus Torvalds */
14561da177e4SLinus Torvalds if ((error = xlog_find_verify_cycle(log, start_blk,
14571da177e4SLinus Torvalds (int)num_scan_bblks, 0, &new_blk)))
14586e9b3dd8SChristoph Hellwig goto out_free_buffer;
14591da177e4SLinus Torvalds if (new_blk != -1)
14601da177e4SLinus Torvalds last_blk = new_blk;
14611da177e4SLinus Torvalds
14621da177e4SLinus Torvalds /*
14631da177e4SLinus Torvalds * Potentially backup over partial log record write. We don't need
14641da177e4SLinus Torvalds * to search the end of the log because we know it is zero.
14651da177e4SLinus Torvalds */
14662451337dSDave Chinner error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
14672451337dSDave Chinner if (error == 1)
14682451337dSDave Chinner error = -EIO;
14692451337dSDave Chinner if (error)
14706e9b3dd8SChristoph Hellwig goto out_free_buffer;
14711da177e4SLinus Torvalds
14721da177e4SLinus Torvalds *blk_no = last_blk;
14736e9b3dd8SChristoph Hellwig out_free_buffer:
14746e9b3dd8SChristoph Hellwig kmem_free(buffer);
14751da177e4SLinus Torvalds if (error)
14761da177e4SLinus Torvalds return error;
14772451337dSDave Chinner return 1;
14781da177e4SLinus Torvalds }
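
/*
 * Sketch of how a caller consumes the tristate return value documented
 * above (illustrative only; it mirrors the shape of the code in
 * xlog_find_head()):
 *
 *	error = xlog_find_zeroed(log, &first_blk);
 *	if (error < 0)
 *		return error;		real I/O or corruption error
 *	if (error == 1)
 *		head_blk = first_blk;	log is partially zeroed
 *	else
 *		...			log fully written, do cycle search
 */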
14791da177e4SLinus Torvalds
14801da177e4SLinus Torvalds /*
14811da177e4SLinus Torvalds * These are simple subroutines used by xlog_clear_stale_blocks() below
14821da177e4SLinus Torvalds * to initialize a buffer full of empty log record headers and write
14831da177e4SLinus Torvalds * them into the log.
14841da177e4SLinus Torvalds */
14851da177e4SLinus Torvalds STATIC void
14861da177e4SLinus Torvalds xlog_add_record(
14879a8d2fdbSMark Tinguely struct xlog *log,
1488b2a922cdSChristoph Hellwig char *buf,
14891da177e4SLinus Torvalds int cycle,
14901da177e4SLinus Torvalds int block,
14911da177e4SLinus Torvalds int tail_cycle,
14921da177e4SLinus Torvalds int tail_block)
14931da177e4SLinus Torvalds {
14941da177e4SLinus Torvalds xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
14951da177e4SLinus Torvalds
14961da177e4SLinus Torvalds memset(buf, 0, BBSIZE);
1497b53e675dSChristoph Hellwig recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1498b53e675dSChristoph Hellwig recp->h_cycle = cpu_to_be32(cycle);
1499b53e675dSChristoph Hellwig recp->h_version = cpu_to_be32(
150038c26bfdSDave Chinner xfs_has_logv2(log->l_mp) ? 2 : 1);
1501b53e675dSChristoph Hellwig recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1502b53e675dSChristoph Hellwig recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1503b53e675dSChristoph Hellwig recp->h_fmt = cpu_to_be32(XLOG_FMT);
15041da177e4SLinus Torvalds memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
15051da177e4SLinus Torvalds }
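
/*
 * Illustrative example: stamping block 100 of cycle 4 with a tail at
 * block 10 of the same cycle produces a header with
 *
 *	h_lsn      == xlog_assign_lsn(4, 100) == (4ULL << 32) | 100
 *	h_tail_lsn == xlog_assign_lsn(4, 10)  == (4ULL << 32) | 10
 *
 * i.e. a structurally valid but empty record (h_num_logops == 0) that
 * recovery will parse and skip without replaying anything.
 */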
15061da177e4SLinus Torvalds
15071da177e4SLinus Torvalds STATIC int
15081da177e4SLinus Torvalds xlog_write_log_records(
15099a8d2fdbSMark Tinguely struct xlog *log,
15101da177e4SLinus Torvalds int cycle,
15111da177e4SLinus Torvalds int start_block,
15121da177e4SLinus Torvalds int blocks,
15131da177e4SLinus Torvalds int tail_cycle,
15141da177e4SLinus Torvalds int tail_block)
15151da177e4SLinus Torvalds {
1516b2a922cdSChristoph Hellwig char *offset;
15176e9b3dd8SChristoph Hellwig char *buffer;
15181da177e4SLinus Torvalds int balign, ealign;
151969ce58f0SAlex Elder int sectbb = log->l_sectBBsize;
15201da177e4SLinus Torvalds int end_block = start_block + blocks;
15211da177e4SLinus Torvalds int bufblks;
15221da177e4SLinus Torvalds int error = 0;
15231da177e4SLinus Torvalds int i, j = 0;
15241da177e4SLinus Torvalds
15256881a229SAlex Elder /*
15266881a229SAlex Elder * Greedily allocate a buffer big enough to handle the full
15276881a229SAlex Elder * range of basic blocks to be written. If that fails, try
15286881a229SAlex Elder * a smaller size. We need to be able to write at least a
15296881a229SAlex Elder * log sector, or we're out of luck.
15306881a229SAlex Elder */
15318b010acbSWang Jianchao bufblks = roundup_pow_of_two(blocks);
153281158e0cSDave Chinner while (bufblks > log->l_logBBsize)
153381158e0cSDave Chinner bufblks >>= 1;
15346e9b3dd8SChristoph Hellwig while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
15351da177e4SLinus Torvalds bufblks >>= 1;
153669ce58f0SAlex Elder if (bufblks < sectbb)
15372451337dSDave Chinner return -ENOMEM;
15381da177e4SLinus Torvalds }
15391da177e4SLinus Torvalds
15401da177e4SLinus Torvalds /* We may need to do a read at the start to fill in part of
15411da177e4SLinus Torvalds * the buffer in the starting sector not covered by the first
15421da177e4SLinus Torvalds * write below.
15431da177e4SLinus Torvalds */
15445c17f533SAlex Elder balign = round_down(start_block, sectbb);
15451da177e4SLinus Torvalds if (balign != start_block) {
15466e9b3dd8SChristoph Hellwig error = xlog_bread_noalign(log, start_block, 1, buffer);
1547076e6acbSChristoph Hellwig if (error)
15486e9b3dd8SChristoph Hellwig goto out_free_buffer;
1549076e6acbSChristoph Hellwig
15501da177e4SLinus Torvalds j = start_block - balign;
15511da177e4SLinus Torvalds }
15521da177e4SLinus Torvalds
15531da177e4SLinus Torvalds for (i = start_block; i < end_block; i += bufblks) {
15541da177e4SLinus Torvalds int bcount, endcount;
15551da177e4SLinus Torvalds
15561da177e4SLinus Torvalds bcount = min(bufblks, end_block - start_block);
15571da177e4SLinus Torvalds endcount = bcount - j;
15581da177e4SLinus Torvalds
15591da177e4SLinus Torvalds /* We may need to do a read at the end to fill in part of
15601da177e4SLinus Torvalds * the buffer in the final sector not covered by the write.
15611da177e4SLinus Torvalds * If this is the same sector as the above read, skip it.
15621da177e4SLinus Torvalds */
15635c17f533SAlex Elder ealign = round_down(end_block, sectbb);
15641da177e4SLinus Torvalds if (j == 0 && (start_block + endcount > ealign)) {
15656ad5b325SChristoph Hellwig error = xlog_bread_noalign(log, ealign, sectbb,
15666e9b3dd8SChristoph Hellwig buffer + BBTOB(ealign - start_block));
1567076e6acbSChristoph Hellwig if (error)
1568076e6acbSChristoph Hellwig break;
1569076e6acbSChristoph Hellwig
15701da177e4SLinus Torvalds }
15711da177e4SLinus Torvalds
15726e9b3dd8SChristoph Hellwig offset = buffer + xlog_align(log, start_block);
15731da177e4SLinus Torvalds for (; j < endcount; j++) {
15741da177e4SLinus Torvalds xlog_add_record(log, offset, cycle, i+j,
15751da177e4SLinus Torvalds tail_cycle, tail_block);
15761da177e4SLinus Torvalds offset += BBSIZE;
15771da177e4SLinus Torvalds }
15786e9b3dd8SChristoph Hellwig error = xlog_bwrite(log, start_block, endcount, buffer);
15791da177e4SLinus Torvalds if (error)
15801da177e4SLinus Torvalds break;
15811da177e4SLinus Torvalds start_block += endcount;
15821da177e4SLinus Torvalds j = 0;
15831da177e4SLinus Torvalds }
1584076e6acbSChristoph Hellwig
15856e9b3dd8SChristoph Hellwig out_free_buffer:
15866e9b3dd8SChristoph Hellwig kmem_free(buffer);
15871da177e4SLinus Torvalds return error;
15881da177e4SLinus Torvalds }
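
/*
 * Worked example of the sector alignment handling above (illustrative
 * figures): with 4k sectors (sectbb == 8) and a request to stamp blocks
 * 16..35, there is no unaligned head (balign == start_block == 16), but
 * end_block == 36 is past ealign == 32, so the sector holding blocks
 * 32..39 is first read into the tail of the buffer. The loop then stamps
 * empty headers over blocks 16..35 and the final sector-rounded write
 * pushes blocks 16..39 back out, preserving the old contents of blocks
 * 36..39: a classic read-modify-write of the partial trailing sector.
 */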
15891da177e4SLinus Torvalds
15901da177e4SLinus Torvalds /*
15911da177e4SLinus Torvalds * This routine is called to blow away any incomplete log writes out
15921da177e4SLinus Torvalds * in front of the log head. We do this so that we won't become confused
15931da177e4SLinus Torvalds * if we come up, write only a little bit more, and then crash again.
15941da177e4SLinus Torvalds * If we leave the partial log records out there, this situation could
15951da177e4SLinus Torvalds * cause us to think those partial writes are valid blocks since they
15961da177e4SLinus Torvalds * have the current cycle number. We get rid of them by overwriting them
15971da177e4SLinus Torvalds * with empty log records with the old cycle number rather than the
15981da177e4SLinus Torvalds * current one.
15991da177e4SLinus Torvalds *
16001da177e4SLinus Torvalds * The tail lsn is passed in rather than taken from
16011da177e4SLinus Torvalds * the log so that we will not write over the unmount record after a
16021da177e4SLinus Torvalds * clean unmount in a 512 block log. Doing so would leave the log without
16031da177e4SLinus Torvalds * any valid log records in it until a new one was written. If we crashed
16041da177e4SLinus Torvalds * during that time we would not be able to recover.
16051da177e4SLinus Torvalds */
16061da177e4SLinus Torvalds STATIC int
16071da177e4SLinus Torvalds xlog_clear_stale_blocks(
16089a8d2fdbSMark Tinguely struct xlog *log,
16091da177e4SLinus Torvalds xfs_lsn_t tail_lsn)
16101da177e4SLinus Torvalds {
16111da177e4SLinus Torvalds int tail_cycle, head_cycle;
16121da177e4SLinus Torvalds int tail_block, head_block;
16131da177e4SLinus Torvalds int tail_distance, max_distance;
16141da177e4SLinus Torvalds int distance;
16151da177e4SLinus Torvalds int error;
16161da177e4SLinus Torvalds
16171da177e4SLinus Torvalds tail_cycle = CYCLE_LSN(tail_lsn);
16181da177e4SLinus Torvalds tail_block = BLOCK_LSN(tail_lsn);
16191da177e4SLinus Torvalds head_cycle = log->l_curr_cycle;
16201da177e4SLinus Torvalds head_block = log->l_curr_block;
16211da177e4SLinus Torvalds
16221da177e4SLinus Torvalds /*
16231da177e4SLinus Torvalds * Figure out the distance between the new head of the log
16241da177e4SLinus Torvalds * and the tail. We want to write over any blocks beyond the
16251da177e4SLinus Torvalds * head that we may have written just before the crash, but
16261da177e4SLinus Torvalds * we don't want to overwrite the tail of the log.
16271da177e4SLinus Torvalds */
16281da177e4SLinus Torvalds if (head_cycle == tail_cycle) {
16291da177e4SLinus Torvalds /*
16301da177e4SLinus Torvalds * The tail is behind the head in the physical log,
16311da177e4SLinus Torvalds * so the distance from the head to the tail is the
16321da177e4SLinus Torvalds * distance from the head to the end of the log plus
16331da177e4SLinus Torvalds * the distance from the beginning of the log to the
16341da177e4SLinus Torvalds * tail.
16351da177e4SLinus Torvalds */
1636a71895c5SDarrick J. Wong if (XFS_IS_CORRUPT(log->l_mp,
1637a71895c5SDarrick J. Wong head_block < tail_block ||
1638a71895c5SDarrick J. Wong head_block >= log->l_logBBsize))
16392451337dSDave Chinner return -EFSCORRUPTED;
16401da177e4SLinus Torvalds tail_distance = tail_block + (log->l_logBBsize - head_block);
16411da177e4SLinus Torvalds } else {
16421da177e4SLinus Torvalds /*
16431da177e4SLinus Torvalds * The head is behind the tail in the physical log,
16441da177e4SLinus Torvalds * so the distance from the head to the tail is just
16451da177e4SLinus Torvalds * the tail block minus the head block.
16461da177e4SLinus Torvalds */
1647a71895c5SDarrick J. Wong if (XFS_IS_CORRUPT(log->l_mp,
1648a71895c5SDarrick J. Wong head_block >= tail_block ||
1649a71895c5SDarrick J. Wong head_cycle != tail_cycle + 1))
16502451337dSDave Chinner return -EFSCORRUPTED;
16511da177e4SLinus Torvalds tail_distance = tail_block - head_block;
16521da177e4SLinus Torvalds }
16531da177e4SLinus Torvalds
16541da177e4SLinus Torvalds /*
16551da177e4SLinus Torvalds * If the head is right up against the tail, we can't clear
16561da177e4SLinus Torvalds * anything.
16571da177e4SLinus Torvalds */
16581da177e4SLinus Torvalds if (tail_distance <= 0) {
16591da177e4SLinus Torvalds ASSERT(tail_distance == 0);
16601da177e4SLinus Torvalds return 0;
16611da177e4SLinus Torvalds }
16621da177e4SLinus Torvalds
16631da177e4SLinus Torvalds max_distance = XLOG_TOTAL_REC_SHIFT(log);
16641da177e4SLinus Torvalds /*
16651da177e4SLinus Torvalds * Take the smaller of the maximum amount of outstanding I/O
16661da177e4SLinus Torvalds * we could have and the distance to the tail to clear out.
16671da177e4SLinus Torvalds * We take the smaller so that we don't overwrite the tail and
16681da177e4SLinus Torvalds * we don't waste all day writing from the head to the tail
16691da177e4SLinus Torvalds * for no reason.
16701da177e4SLinus Torvalds */
16719bb54cb5SDave Chinner max_distance = min(max_distance, tail_distance);
16721da177e4SLinus Torvalds
16731da177e4SLinus Torvalds if ((head_block + max_distance) <= log->l_logBBsize) {
16741da177e4SLinus Torvalds /*
16751da177e4SLinus Torvalds * We can stomp all the blocks we need to without
16761da177e4SLinus Torvalds * wrapping around the end of the log. Just do it
16771da177e4SLinus Torvalds * in a single write. Use the cycle number of the
16781da177e4SLinus Torvalds * current cycle minus one so that the log will look like:
16791da177e4SLinus Torvalds * n ... | n - 1 ...
16801da177e4SLinus Torvalds */
16811da177e4SLinus Torvalds error = xlog_write_log_records(log, (head_cycle - 1),
16821da177e4SLinus Torvalds head_block, max_distance, tail_cycle,
16831da177e4SLinus Torvalds tail_block);
16841da177e4SLinus Torvalds if (error)
16851da177e4SLinus Torvalds return error;
16861da177e4SLinus Torvalds } else {
16871da177e4SLinus Torvalds /*
16881da177e4SLinus Torvalds * We need to wrap around the end of the physical log in
16891da177e4SLinus Torvalds * order to clear all the blocks. Do it in two separate
16901da177e4SLinus Torvalds * I/Os. The first write should be from the head to the
16911da177e4SLinus Torvalds * end of the physical log, and it should use the current
16921da177e4SLinus Torvalds * cycle number minus one just like above.
16931da177e4SLinus Torvalds */
16941da177e4SLinus Torvalds distance = log->l_logBBsize - head_block;
16951da177e4SLinus Torvalds error = xlog_write_log_records(log, (head_cycle - 1),
16961da177e4SLinus Torvalds head_block, distance, tail_cycle,
16971da177e4SLinus Torvalds tail_block);
16981da177e4SLinus Torvalds
16991da177e4SLinus Torvalds if (error)
17001da177e4SLinus Torvalds return error;
17011da177e4SLinus Torvalds
17021da177e4SLinus Torvalds /*
17031da177e4SLinus Torvalds * Now write the blocks at the start of the physical log.
17041da177e4SLinus Torvalds * This writes the remainder of the blocks we want to clear.
17051da177e4SLinus Torvalds * It uses the current cycle number since we're now on the
17061da177e4SLinus Torvalds * same cycle as the head so that we get:
17071da177e4SLinus Torvalds * n ... n ... | n - 1 ...
17081da177e4SLinus Torvalds * ^^^^^ blocks we're writing
17091da177e4SLinus Torvalds */
17101da177e4SLinus Torvalds distance = max_distance - (log->l_logBBsize - head_block);
17111da177e4SLinus Torvalds error = xlog_write_log_records(log, head_cycle, 0, distance,
17121da177e4SLinus Torvalds tail_cycle, tail_block);
17131da177e4SLinus Torvalds if (error)
17141da177e4SLinus Torvalds return error;
17151da177e4SLinus Torvalds }
17161da177e4SLinus Torvalds
17171da177e4SLinus Torvalds return 0;
17181da177e4SLinus Torvalds }
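
/*
 * Worked example of the wrap case above (illustrative figures): in a
 * 1000 block log with the head at block 990 of cycle 5 and the tail at
 * block 200 of cycle 5, tail_distance == 200 + (1000 - 990) == 210 and
 * bounds max_distance. The first write stamps blocks 990..999 with
 * cycle 4 (head_cycle - 1) and the second stamps blocks 0..199 with
 * cycle 5, reproducing the "n ... n ... | n - 1 ..." pattern in the
 * comment above without ever touching the tail.
 */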
17191da177e4SLinus Torvalds
1720154c733aSDarrick J. Wong /*
1721154c733aSDarrick J. Wong * Release the recovered intent item in the AIL that matches the given intent
1722154c733aSDarrick J. Wong * type and intent id.
1723154c733aSDarrick J. Wong */
1724154c733aSDarrick J. Wong void
1725154c733aSDarrick J. Wong xlog_recover_release_intent(
1726154c733aSDarrick J. Wong struct xlog *log,
1727154c733aSDarrick J. Wong unsigned short intent_type,
1728154c733aSDarrick J. Wong uint64_t intent_id)
1729154c733aSDarrick J. Wong {
1730cd3c2cf3SDarrick J. Wong struct xfs_defer_pending *dfp, *n;
1731154c733aSDarrick J. Wong
1732cd3c2cf3SDarrick J. Wong list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
1733cd3c2cf3SDarrick J. Wong struct xfs_log_item *lip = dfp->dfp_intent;
1734cd3c2cf3SDarrick J. Wong
1735154c733aSDarrick J. Wong if (lip->li_type != intent_type)
1736154c733aSDarrick J. Wong continue;
1737154c733aSDarrick J. Wong if (!lip->li_ops->iop_match(lip, intent_id))
1738154c733aSDarrick J. Wong continue;
1739154c733aSDarrick J. Wong
1740cd3c2cf3SDarrick J. Wong ASSERT(xlog_item_is_intent(lip));
1741154c733aSDarrick J. Wong
1742cd3c2cf3SDarrick J. Wong xfs_defer_cancel_recovery(log->l_mp, dfp);
1743cd3c2cf3SDarrick J. Wong }
1744154c733aSDarrick J. Wong }
1745154c733aSDarrick J. Wong
17464bc61983SDarrick J. Wong int
17474bc61983SDarrick J. Wong xlog_recover_iget(
17484bc61983SDarrick J. Wong struct xfs_mount *mp,
17494bc61983SDarrick J. Wong xfs_ino_t ino,
17504bc61983SDarrick J. Wong struct xfs_inode **ipp)
17514bc61983SDarrick J. Wong {
17524bc61983SDarrick J. Wong int error;
17534bc61983SDarrick J. Wong
17544bc61983SDarrick J. Wong error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
17554bc61983SDarrick J. Wong if (error)
17564bc61983SDarrick J. Wong return error;
17574bc61983SDarrick J. Wong
17584bc61983SDarrick J. Wong error = xfs_qm_dqattach(*ipp);
17594bc61983SDarrick J. Wong if (error) {
17604bc61983SDarrick J. Wong xfs_irele(*ipp);
17614bc61983SDarrick J. Wong return error;
17624bc61983SDarrick J. Wong }
17634bc61983SDarrick J. Wong
17644bc61983SDarrick J. Wong if (VFS_I(*ipp)->i_nlink == 0)
17654bc61983SDarrick J. Wong xfs_iflags_set(*ipp, XFS_IRECOVERY);
17664bc61983SDarrick J. Wong
17674bc61983SDarrick J. Wong return 0;
17684bc61983SDarrick J. Wong }
17694bc61983SDarrick J. Wong
17701da177e4SLinus Torvalds /******************************************************************************
17711da177e4SLinus Torvalds *
17721da177e4SLinus Torvalds * Log recover routines
17731da177e4SLinus Torvalds *
17741da177e4SLinus Torvalds ******************************************************************************
17751da177e4SLinus Torvalds */
177686ffa471SDarrick J. Wong static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
177786ffa471SDarrick J. Wong &xlog_buf_item_ops,
177886ffa471SDarrick J. Wong &xlog_inode_item_ops,
177986ffa471SDarrick J. Wong &xlog_dquot_item_ops,
178086ffa471SDarrick J. Wong &xlog_quotaoff_item_ops,
178186ffa471SDarrick J. Wong &xlog_icreate_item_ops,
178286ffa471SDarrick J. Wong &xlog_efi_item_ops,
178386ffa471SDarrick J. Wong &xlog_efd_item_ops,
178486ffa471SDarrick J. Wong &xlog_rui_item_ops,
178586ffa471SDarrick J. Wong &xlog_rud_item_ops,
178686ffa471SDarrick J. Wong &xlog_cui_item_ops,
178786ffa471SDarrick J. Wong &xlog_cud_item_ops,
178886ffa471SDarrick J. Wong &xlog_bui_item_ops,
178986ffa471SDarrick J. Wong &xlog_bud_item_ops,
1790fd920008SAllison Henderson &xlog_attri_item_ops,
1791fd920008SAllison Henderson &xlog_attrd_item_ops,
179286ffa471SDarrick J. Wong };
179386ffa471SDarrick J. Wong
179486ffa471SDarrick J. Wong static const struct xlog_recover_item_ops *
179586ffa471SDarrick J. Wong xlog_find_item_ops(
179686ffa471SDarrick J. Wong struct xlog_recover_item *item)
179786ffa471SDarrick J. Wong {
179886ffa471SDarrick J. Wong unsigned int i;
179986ffa471SDarrick J. Wong
180086ffa471SDarrick J. Wong for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
180186ffa471SDarrick J. Wong if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
180286ffa471SDarrick J. Wong return xlog_recover_item_ops[i];
180386ffa471SDarrick J. Wong
180486ffa471SDarrick J. Wong return NULL;
180586ffa471SDarrick J. Wong }
18061da177e4SLinus Torvalds
1807f0a76953SDave Chinner /*
1808a775ad77SDave Chinner * Sort the log items in the transaction.
1809a775ad77SDave Chinner *
1810a775ad77SDave Chinner * The ordering constraints are defined by the inode allocation and unlink
1811a775ad77SDave Chinner * behaviour. The rules are:
1812a775ad77SDave Chinner *
1813a775ad77SDave Chinner * 1. Every item is only logged once in a given transaction. Hence it
1814a775ad77SDave Chinner  * represents the last logged state of the item. Ordering is therefore
1815a775ad77SDave Chinner  * dependent on the order in which operations need to be performed, so
1816a775ad77SDave Chinner  * that required initial conditions are always met.
1817a775ad77SDave Chinner *
1818a775ad77SDave Chinner * 2. Cancelled buffers are recorded in pass 1 in a separate table and
1819a775ad77SDave Chinner * there's nothing to replay from them so we can simply cull them
1820a775ad77SDave Chinner * from the transaction. However, we can't do that until after we've
1821a775ad77SDave Chinner * replayed all the other items because they may be dependent on the
1822a775ad77SDave Chinner * cancelled buffer and replaying the cancelled buffer can remove it
1823a775ad77SDave Chinner  * from the cancelled buffer table. Hence they have to be done last.
1824a775ad77SDave Chinner *
1825a775ad77SDave Chinner * 3. Inode allocation buffers must be replayed before inode items that
182628c8e41aSDave Chinner  * read the buffer and replay changes into it. For filesystems using
182728c8e41aSDave Chinner  * ICREATE transactions, this means XFS_LI_ICREATE objects need to get
182828c8e41aSDave Chinner  * treated the same as inode allocation buffers, since they create and
182928c8e41aSDave Chinner  * initialise the buffers directly.
1830a775ad77SDave Chinner *
1831a775ad77SDave Chinner * 4. Inode unlink buffers must be replayed after inode items are replayed.
1832a775ad77SDave Chinner * This ensures that inodes are completely flushed to the inode buffer
1833a775ad77SDave Chinner * in a "free" state before we remove the unlinked inode list pointer.
1834a775ad77SDave Chinner *
1835a775ad77SDave Chinner * Hence the ordering needs to be inode allocation buffers first, inode items
1836a775ad77SDave Chinner * second, inode unlink buffers third and cancelled buffers last.
1837a775ad77SDave Chinner *
1838a775ad77SDave Chinner * But there's a problem with that - we can't tell an inode allocation buffer
1839a775ad77SDave Chinner * apart from a regular buffer, so we can't separate them. We can, however,
1840a775ad77SDave Chinner * tell an inode unlink buffer from the others, and so we can separate them out
1841a775ad77SDave Chinner * from all the other buffers and move them to last.
1842a775ad77SDave Chinner *
1843a775ad77SDave Chinner * Hence, 4 lists, in order from head to tail:
1844a775ad77SDave Chinner * - buffer_list for all buffers except cancelled/inode unlink buffers
1845a775ad77SDave Chinner * - item_list for all non-buffer items
1846a775ad77SDave Chinner * - inode_buffer_list for inode unlink buffers
1847a775ad77SDave Chinner * - cancel_list for the cancelled buffers
184828c8e41aSDave Chinner *
184928c8e41aSDave Chinner * Note that we add objects to the tail of the lists so that first-to-last
185028c8e41aSDave Chinner * ordering is preserved within the lists. Adding objects to the head of the
185128c8e41aSDave Chinner * list means when we traverse from the head we walk them in last-to-first
185228c8e41aSDave Chinner * order. For cancelled buffers and inode unlink buffers this doesn't matter,
185328c8e41aSDave Chinner * but for all other items there may be specific ordering that we need to
185428c8e41aSDave Chinner * preserve.
1855f0a76953SDave Chinner */
18561da177e4SLinus Torvalds STATIC int
18571da177e4SLinus Torvalds xlog_recover_reorder_trans(
1858ad223e60SMark Tinguely struct xlog *log,
1859ad223e60SMark Tinguely struct xlog_recover *trans,
18609abbc539SDave Chinner int pass)
18611da177e4SLinus Torvalds {
186235f4521fSDarrick J. Wong struct xlog_recover_item *item, *n;
18632a84108fSMark Tinguely int error = 0;
1864f0a76953SDave Chinner LIST_HEAD(sort_list);
1865a775ad77SDave Chinner LIST_HEAD(cancel_list);
1866a775ad77SDave Chinner LIST_HEAD(buffer_list);
1867a775ad77SDave Chinner LIST_HEAD(inode_buffer_list);
18685ce70b77SChristoph Hellwig LIST_HEAD(item_list);
1869f0a76953SDave Chinner
1870f0a76953SDave Chinner list_splice_init(&trans->r_itemq, &sort_list);
1871f0a76953SDave Chinner list_for_each_entry_safe(item, n, &sort_list, ri_list) {
187286ffa471SDarrick J. Wong enum xlog_recover_reorder fate = XLOG_REORDER_ITEM_LIST;
18731da177e4SLinus Torvalds
187486ffa471SDarrick J. Wong item->ri_ops = xlog_find_item_ops(item);
187586ffa471SDarrick J. Wong if (!item->ri_ops) {
1876a0fa2b67SDave Chinner xfs_warn(log->l_mp,
18770d2d35a3SDarrick J. Wong "%s: unrecognized type of log operation (%d)",
18780d2d35a3SDarrick J. Wong __func__, ITEM_TYPE(item));
18791da177e4SLinus Torvalds ASSERT(0);
18802a84108fSMark Tinguely /*
18812a84108fSMark Tinguely * return the remaining items back to the transaction
18822a84108fSMark Tinguely * item list so they can be freed in caller.
18832a84108fSMark Tinguely */
18842a84108fSMark Tinguely if (!list_empty(&sort_list))
18852a84108fSMark Tinguely list_splice_init(&sort_list, &trans->r_itemq);
188686ffa471SDarrick J. Wong error = -EFSCORRUPTED;
188786ffa471SDarrick J. Wong break;
188886ffa471SDarrick J. Wong }
188986ffa471SDarrick J. Wong
189086ffa471SDarrick J. Wong if (item->ri_ops->reorder)
189186ffa471SDarrick J. Wong fate = item->ri_ops->reorder(item);
189286ffa471SDarrick J. Wong
189386ffa471SDarrick J. Wong switch (fate) {
189486ffa471SDarrick J. Wong case XLOG_REORDER_BUFFER_LIST:
189586ffa471SDarrick J. Wong list_move_tail(&item->ri_list, &buffer_list);
189686ffa471SDarrick J. Wong break;
189786ffa471SDarrick J. Wong case XLOG_REORDER_CANCEL_LIST:
189886ffa471SDarrick J. Wong trace_xfs_log_recover_item_reorder_head(log,
189986ffa471SDarrick J. Wong trans, item, pass);
190086ffa471SDarrick J. Wong list_move(&item->ri_list, &cancel_list);
190186ffa471SDarrick J. Wong break;
190286ffa471SDarrick J. Wong case XLOG_REORDER_INODE_BUFFER_LIST:
190386ffa471SDarrick J. Wong list_move(&item->ri_list, &inode_buffer_list);
190486ffa471SDarrick J. Wong break;
190586ffa471SDarrick J. Wong case XLOG_REORDER_ITEM_LIST:
190686ffa471SDarrick J. Wong trace_xfs_log_recover_item_reorder_tail(log,
190786ffa471SDarrick J. Wong trans, item, pass);
190886ffa471SDarrick J. Wong list_move_tail(&item->ri_list, &item_list);
190986ffa471SDarrick J. Wong break;
19101da177e4SLinus Torvalds }
1911f0a76953SDave Chinner }
191286ffa471SDarrick J. Wong
1913f0a76953SDave Chinner ASSERT(list_empty(&sort_list));
1914a775ad77SDave Chinner if (!list_empty(&buffer_list))
1915a775ad77SDave Chinner list_splice(&buffer_list, &trans->r_itemq);
19165ce70b77SChristoph Hellwig if (!list_empty(&item_list))
19175ce70b77SChristoph Hellwig list_splice_tail(&item_list, &trans->r_itemq);
1918a775ad77SDave Chinner if (!list_empty(&inode_buffer_list))
1919a775ad77SDave Chinner list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1920a775ad77SDave Chinner if (!list_empty(&cancel_list))
1921a775ad77SDave Chinner list_splice_tail(&cancel_list, &trans->r_itemq);
19222a84108fSMark Tinguely return error;
19231da177e4SLinus Torvalds }
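
/*
 * Illustrative example of the resulting order (hypothetical
 * transaction): if the logged items arrive as
 *
 *	[ inode A, cancelled buf B, regular buf C, unlink buf D, inode E ]
 *
 * the list splicing above rebuilds r_itemq as
 *
 *	[ regular buf C, inode A, inode E, unlink buf D, cancelled buf B ]
 *
 * i.e. replayable buffers first, then the non-buffer items in their
 * original relative order, then inode unlink buffers, and cancelled
 * buffers last.
 */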
19241da177e4SLinus Torvalds
19258ea5682dSDarrick J. Wong void
19267d4894b4SChristoph Hellwig xlog_buf_readahead(
19277d4894b4SChristoph Hellwig struct xlog *log,
19287d4894b4SChristoph Hellwig xfs_daddr_t blkno,
19297d4894b4SChristoph Hellwig uint len,
19307d4894b4SChristoph Hellwig const struct xfs_buf_ops *ops)
19317d4894b4SChristoph Hellwig {
19327d4894b4SChristoph Hellwig if (!xlog_is_buffer_cancelled(log, blkno, len))
19337d4894b4SChristoph Hellwig xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
19347d4894b4SChristoph Hellwig }
19357d4894b4SChristoph Hellwig
1936cd3c2cf3SDarrick J. Wong /*
1937cd3c2cf3SDarrick J. Wong * Create a deferred work structure for resuming and tracking the progress of a
1938cd3c2cf3SDarrick J. Wong * log intent item that was found during recovery.
1939cd3c2cf3SDarrick J. Wong */
1940cd3c2cf3SDarrick J. Wong void
1941cd3c2cf3SDarrick J. Wong xlog_recover_intent_item(
1942cd3c2cf3SDarrick J. Wong struct xlog *log,
1943cd3c2cf3SDarrick J. Wong struct xfs_log_item *lip,
1944cd3c2cf3SDarrick J. Wong xfs_lsn_t lsn,
1945cd3c2cf3SDarrick J. Wong unsigned int dfp_type)
1946cd3c2cf3SDarrick J. Wong {
1947cd3c2cf3SDarrick J. Wong ASSERT(xlog_item_is_intent(lip));
1948cd3c2cf3SDarrick J. Wong
1949cd3c2cf3SDarrick J. Wong xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops);
1950cd3c2cf3SDarrick J. Wong
1951cd3c2cf3SDarrick J. Wong /*
1952cd3c2cf3SDarrick J. Wong * Insert the intent into the AIL directly and drop one reference so
1953cd3c2cf3SDarrick J. Wong * that finishing or canceling the work will drop the other.
1954cd3c2cf3SDarrick J. Wong */
1955cd3c2cf3SDarrick J. Wong xfs_trans_ail_insert(log->l_ailp, lip, lsn);
1956cd3c2cf3SDarrick J. Wong lip->li_ops->iop_unpin(lip, 0);
1957cd3c2cf3SDarrick J. Wong }
1958cd3c2cf3SDarrick J. Wong
19591da177e4SLinus Torvalds STATIC int
196000574da1SZhi Yong Wu xlog_recover_items_pass2(
196100574da1SZhi Yong Wu struct xlog *log,
196200574da1SZhi Yong Wu struct xlog_recover *trans,
196300574da1SZhi Yong Wu struct list_head *buffer_list,
196400574da1SZhi Yong Wu struct list_head *item_list)
196500574da1SZhi Yong Wu {
196600574da1SZhi Yong Wu struct xlog_recover_item *item;
196700574da1SZhi Yong Wu int error = 0;
196800574da1SZhi Yong Wu
196900574da1SZhi Yong Wu list_for_each_entry(item, item_list, ri_list) {
19702565a11bSDarrick J. Wong trace_xfs_log_recover_item_recover(log, trans, item,
19712565a11bSDarrick J. Wong XLOG_RECOVER_PASS2);
19722565a11bSDarrick J. Wong
19732565a11bSDarrick J. Wong if (item->ri_ops->commit_pass2)
19742565a11bSDarrick J. Wong error = item->ri_ops->commit_pass2(log, buffer_list,
19752565a11bSDarrick J. Wong item, trans->r_lsn);
197600574da1SZhi Yong Wu if (error)
197700574da1SZhi Yong Wu return error;
197800574da1SZhi Yong Wu }
197900574da1SZhi Yong Wu
198000574da1SZhi Yong Wu return error;
198100574da1SZhi Yong Wu }
198200574da1SZhi Yong Wu
1983d0450948SChristoph Hellwig /*
1984d0450948SChristoph Hellwig * Perform the transaction.
1985d0450948SChristoph Hellwig *
1986d0450948SChristoph Hellwig * If the transaction modifies a buffer or inode, do it now. Otherwise,
1987d0450948SChristoph Hellwig * EFIs and EFDs get queued up by adding entries into the AIL for them.
1988d0450948SChristoph Hellwig */
1989d0450948SChristoph Hellwig STATIC int
1990d0450948SChristoph Hellwig xlog_recover_commit_trans(
1991ad223e60SMark Tinguely struct xlog *log,
1992d0450948SChristoph Hellwig struct xlog_recover *trans,
199312818d24SBrian Foster int pass,
199412818d24SBrian Foster struct list_head *buffer_list)
1995d0450948SChristoph Hellwig {
199600574da1SZhi Yong Wu int error = 0;
199700574da1SZhi Yong Wu int items_queued = 0;
199800574da1SZhi Yong Wu struct xlog_recover_item *item;
199900574da1SZhi Yong Wu struct xlog_recover_item *next;
200000574da1SZhi Yong Wu LIST_HEAD (ra_list);
200100574da1SZhi Yong Wu LIST_HEAD (done_list);
200200574da1SZhi Yong Wu
200300574da1SZhi Yong Wu #define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
20041da177e4SLinus Torvalds
200539775431SBrian Foster hlist_del_init(&trans->r_list);
2006d0450948SChristoph Hellwig
2007d0450948SChristoph Hellwig error = xlog_recover_reorder_trans(log, trans, pass);
2008d0450948SChristoph Hellwig if (error)
20091da177e4SLinus Torvalds return error;
2010d0450948SChristoph Hellwig
201100574da1SZhi Yong Wu list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
20123304a4faSDarrick J. Wong trace_xfs_log_recover_item_recover(log, trans, item, pass);
20133304a4faSDarrick J. Wong
201443ff2122SChristoph Hellwig switch (pass) {
201543ff2122SChristoph Hellwig case XLOG_RECOVER_PASS1:
20163304a4faSDarrick J. Wong if (item->ri_ops->commit_pass1)
20173304a4faSDarrick J. Wong error = item->ri_ops->commit_pass1(log, item);
201843ff2122SChristoph Hellwig break;
201943ff2122SChristoph Hellwig case XLOG_RECOVER_PASS2:
20208ea5682dSDarrick J. Wong if (item->ri_ops->ra_pass2)
20218ea5682dSDarrick J. Wong item->ri_ops->ra_pass2(log, item);
202200574da1SZhi Yong Wu list_move_tail(&item->ri_list, &ra_list);
202300574da1SZhi Yong Wu items_queued++;
202400574da1SZhi Yong Wu if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
202500574da1SZhi Yong Wu error = xlog_recover_items_pass2(log, trans,
202612818d24SBrian Foster buffer_list, &ra_list);
202700574da1SZhi Yong Wu list_splice_tail_init(&ra_list, &done_list);
202800574da1SZhi Yong Wu items_queued = 0;
202900574da1SZhi Yong Wu }
203000574da1SZhi Yong Wu
203143ff2122SChristoph Hellwig break;
203243ff2122SChristoph Hellwig default:
203343ff2122SChristoph Hellwig ASSERT(0);
203443ff2122SChristoph Hellwig }
203543ff2122SChristoph Hellwig
2036d0450948SChristoph Hellwig if (error)
203743ff2122SChristoph Hellwig goto out;
2038d0450948SChristoph Hellwig }
2039d0450948SChristoph Hellwig
204000574da1SZhi Yong Wu out:
204100574da1SZhi Yong Wu if (!list_empty(&ra_list)) {
204200574da1SZhi Yong Wu if (!error)
204300574da1SZhi Yong Wu error = xlog_recover_items_pass2(log, trans,
204412818d24SBrian Foster buffer_list, &ra_list);
204500574da1SZhi Yong Wu list_splice_tail_init(&ra_list, &done_list);
204600574da1SZhi Yong Wu }
204700574da1SZhi Yong Wu
204800574da1SZhi Yong Wu if (!list_empty(&done_list))
204900574da1SZhi Yong Wu list_splice_init(&done_list, &trans->r_itemq);
205000574da1SZhi Yong Wu
205112818d24SBrian Foster return error;
20521da177e4SLinus Torvalds }
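
/*
 * Illustrative walkthrough (hypothetical item count): a pass 2
 * transaction with 250 items is handled as three batches. Readahead is
 * queued for items 1-100, those items are then recovered against the
 * now-warming buffer cache, the same happens for items 101-200, and the
 * trailing 50 items are flushed through the !list_empty(&ra_list) case
 * at out:. XLOG_RECOVER_COMMIT_QUEUE_MAX therefore caps how far
 * readahead can run in front of recovery.
 */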
20531da177e4SLinus Torvalds
205476560669SDave Chinner STATIC void
205576560669SDave Chinner xlog_recover_add_item(
205676560669SDave Chinner struct list_head *head)
20571da177e4SLinus Torvalds {
205835f4521fSDarrick J. Wong struct xlog_recover_item *item;
205976560669SDave Chinner
206035f4521fSDarrick J. Wong item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
206176560669SDave Chinner INIT_LIST_HEAD(&item->ri_list);
206276560669SDave Chinner list_add_tail(&item->ri_list, head);
206376560669SDave Chinner }
206476560669SDave Chinner
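/*
 * A continuation op carries the remainder of a region that was split across
 * log records. Unless it is completing a split transaction header, append the
 * new bytes onto the tail region of the last item in the transaction, growing
 * the region buffer to fit.
 */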
206576560669SDave Chinner STATIC int
206676560669SDave Chinner xlog_recover_add_to_cont_trans(
206776560669SDave Chinner struct xlog *log,
206876560669SDave Chinner struct xlog_recover *trans,
2069b2a922cdSChristoph Hellwig char *dp,
207076560669SDave Chinner int len)
207176560669SDave Chinner {
207235f4521fSDarrick J. Wong struct xlog_recover_item *item;
2073b2a922cdSChristoph Hellwig char *ptr, *old_ptr;
207476560669SDave Chinner int old_len;
207576560669SDave Chinner
207689cebc84SBrian Foster /*
207789cebc84SBrian Foster * If the transaction is empty, the header was split across this and the
207889cebc84SBrian Foster * previous record. Copy the rest of the header.
207989cebc84SBrian Foster */
208076560669SDave Chinner if (list_empty(&trans->r_itemq)) {
2081848ccfc8SBrian Foster ASSERT(len <= sizeof(struct xfs_trans_header));
208289cebc84SBrian Foster if (len > sizeof(struct xfs_trans_header)) {
208389cebc84SBrian Foster xfs_warn(log->l_mp, "%s: bad header length", __func__);
2084895e196fSDarrick J. Wong return -EFSCORRUPTED;
208589cebc84SBrian Foster }
208689cebc84SBrian Foster
208776560669SDave Chinner xlog_recover_add_item(&trans->r_itemq);
2088b2a922cdSChristoph Hellwig ptr = (char *)&trans->r_theader +
208989cebc84SBrian Foster sizeof(struct xfs_trans_header) - len;
209076560669SDave Chinner memcpy(ptr, dp, len);
20911da177e4SLinus Torvalds return 0;
20921da177e4SLinus Torvalds }
209389cebc84SBrian Foster
209476560669SDave Chinner /* take the tail entry */
209535f4521fSDarrick J. Wong item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
209635f4521fSDarrick J. Wong ri_list);
209776560669SDave Chinner
209876560669SDave Chinner old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
209976560669SDave Chinner old_len = item->ri_buf[item->ri_cnt-1].i_len;
210076560669SDave Chinner
2101de2860f4SDave Chinner ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2102de2860f4SDave Chinner if (!ptr)
2103de2860f4SDave Chinner return -ENOMEM;
210476560669SDave Chinner memcpy(&ptr[old_len], dp, len);
210576560669SDave Chinner item->ri_buf[item->ri_cnt-1].i_len += len;
210676560669SDave Chinner item->ri_buf[item->ri_cnt-1].i_addr = ptr;
210776560669SDave Chinner trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
210876560669SDave Chinner return 0;
210976560669SDave Chinner }
211076560669SDave Chinner
211176560669SDave Chinner /*
211276560669SDave Chinner * The next region to add is the start of a new region. It could be
211376560669SDave Chinner * a whole region or just the first part of one. Because
211476560669SDave Chinner * of this, the assumption here is that the type and size fields of all
211576560669SDave Chinner * format structures fit into the first 32 bits of the structure.
211676560669SDave Chinner *
211776560669SDave Chinner * This works because all regions must be 32 bit aligned. Therefore, we
211876560669SDave Chinner * either have both fields or we have neither field. In the case where we
211976560669SDave Chinner * have neither field, the data part of the region is zero length. We only have
212076560669SDave Chinner * a log_op_header and can throw away the header since a new one will appear
212176560669SDave Chinner * later. If we have at least 4 bytes, then we can determine how many regions
212276560669SDave Chinner * will appear in the current log item.
212376560669SDave Chinner */
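/*
 * For instance, struct xfs_inode_log_format begins with "__u16 ilf_type;
 * __u16 ilf_size;", so the ilf_size read through the cast below is valid for
 * any region type whose header follows this convention.
 */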
212476560669SDave Chinner STATIC int
212576560669SDave Chinner xlog_recover_add_to_trans(
212676560669SDave Chinner struct xlog *log,
212776560669SDave Chinner struct xlog_recover *trans,
2128b2a922cdSChristoph Hellwig char *dp,
212976560669SDave Chinner int len)
213076560669SDave Chinner {
213106b11321SDarrick J. Wong struct xfs_inode_log_format *in_f; /* any will do */
213235f4521fSDarrick J. Wong struct xlog_recover_item *item;
2133b2a922cdSChristoph Hellwig char *ptr;
213476560669SDave Chinner
213576560669SDave Chinner if (!len)
213676560669SDave Chinner return 0;
213776560669SDave Chinner if (list_empty(&trans->r_itemq)) {
213876560669SDave Chinner /* we need to catch log corruptions here */
213976560669SDave Chinner if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
214076560669SDave Chinner xfs_warn(log->l_mp, "%s: bad header magic number",
214176560669SDave Chinner __func__);
214276560669SDave Chinner ASSERT(0);
2143895e196fSDarrick J. Wong return -EFSCORRUPTED;
214476560669SDave Chinner }
214589cebc84SBrian Foster
214689cebc84SBrian Foster if (len > sizeof(struct xfs_trans_header)) {
214789cebc84SBrian Foster xfs_warn(log->l_mp, "%s: bad header length", __func__);
214889cebc84SBrian Foster ASSERT(0);
2149895e196fSDarrick J. Wong return -EFSCORRUPTED;
215089cebc84SBrian Foster }
215189cebc84SBrian Foster
215289cebc84SBrian Foster /*
215389cebc84SBrian Foster * The transaction header can be arbitrarily split across op
215489cebc84SBrian Foster * records. If we don't have the whole thing here, copy what we
215589cebc84SBrian Foster * do have and handle the rest in the next record.
215689cebc84SBrian Foster */
215789cebc84SBrian Foster if (len == sizeof(struct xfs_trans_header))
215876560669SDave Chinner xlog_recover_add_item(&trans->r_itemq);
215976560669SDave Chinner memcpy(&trans->r_theader, dp, len);
216076560669SDave Chinner return 0;
216176560669SDave Chinner }
216276560669SDave Chinner
2163707e0ddaSTetsuo Handa ptr = kmem_alloc(len, 0);
216476560669SDave Chinner memcpy(ptr, dp, len);
216506b11321SDarrick J. Wong in_f = (struct xfs_inode_log_format *)ptr;
216676560669SDave Chinner
216776560669SDave Chinner /* take the tail entry */
216835f4521fSDarrick J. Wong item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
216935f4521fSDarrick J. Wong ri_list);
217076560669SDave Chinner if (item->ri_total != 0 &&
217176560669SDave Chinner item->ri_total == item->ri_cnt) {
217276560669SDave Chinner /* tail item is in use, get a new one */
217376560669SDave Chinner xlog_recover_add_item(&trans->r_itemq);
217476560669SDave Chinner item = list_entry(trans->r_itemq.prev,
217535f4521fSDarrick J. Wong struct xlog_recover_item, ri_list);
217676560669SDave Chinner }
217776560669SDave Chinner
217876560669SDave Chinner if (item->ri_total == 0) { /* first region to be added */
217976560669SDave Chinner if (in_f->ilf_size == 0 ||
218076560669SDave Chinner in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
218176560669SDave Chinner xfs_warn(log->l_mp,
218276560669SDave Chinner "bad number of regions (%d) in inode log format",
218376560669SDave Chinner in_f->ilf_size);
218476560669SDave Chinner ASSERT(0);
218576560669SDave Chinner kmem_free(ptr);
2186895e196fSDarrick J. Wong return -EFSCORRUPTED;
218776560669SDave Chinner }
218876560669SDave Chinner
218976560669SDave Chinner item->ri_total = in_f->ilf_size;
219076560669SDave Chinner item->ri_buf =
219176560669SDave Chinner kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2192707e0ddaSTetsuo Handa 0);
219376560669SDave Chinner }
2194d6abecb8SDarrick J. Wong
2195d6abecb8SDarrick J. Wong if (item->ri_total <= item->ri_cnt) {
2196d6abecb8SDarrick J. Wong xfs_warn(log->l_mp,
2197d6abecb8SDarrick J. Wong "log item region count (%d) overflowed size (%d)",
2198d6abecb8SDarrick J. Wong item->ri_cnt, item->ri_total);
2199d6abecb8SDarrick J. Wong ASSERT(0);
2200d6abecb8SDarrick J. Wong kmem_free(ptr);
2201d6abecb8SDarrick J. Wong return -EFSCORRUPTED;
2202d6abecb8SDarrick J. Wong }
2203d6abecb8SDarrick J. Wong
220476560669SDave Chinner /* Description region is ri_buf[0] */
220576560669SDave Chinner item->ri_buf[item->ri_cnt].i_addr = ptr;
220676560669SDave Chinner item->ri_buf[item->ri_cnt].i_len = len;
220776560669SDave Chinner item->ri_cnt++;
220876560669SDave Chinner trace_xfs_log_recover_item_add(log, trans, item, 0);
220976560669SDave Chinner return 0;
221076560669SDave Chinner }
2211b818cca1SDave Chinner
221276560669SDave Chinner /*
221376560669SDave Chinner * Free up any resources allocated by the transaction
221476560669SDave Chinner *
221576560669SDave Chinner * Remember that EFIs, EFDs, and IUNLINKs are handled later.
221676560669SDave Chinner */
221776560669SDave Chinner STATIC void
221876560669SDave Chinner xlog_recover_free_trans(
221976560669SDave Chinner struct xlog_recover *trans)
222076560669SDave Chinner {
222135f4521fSDarrick J. Wong struct xlog_recover_item *item, *n;
222276560669SDave Chinner int i;
222376560669SDave Chinner
222439775431SBrian Foster hlist_del_init(&trans->r_list);
222539775431SBrian Foster
222676560669SDave Chinner list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
222776560669SDave Chinner /* Free the regions in the item. */
222876560669SDave Chinner list_del(&item->ri_list);
222976560669SDave Chinner for (i = 0; i < item->ri_cnt; i++)
223076560669SDave Chinner kmem_free(item->ri_buf[i].i_addr);
223176560669SDave Chinner /* Free the item itself */
223276560669SDave Chinner kmem_free(item->ri_buf);
223376560669SDave Chinner kmem_free(item);
223476560669SDave Chinner }
223576560669SDave Chinner /* Free the transaction recover structure */
223676560669SDave Chinner kmem_free(trans);
223776560669SDave Chinner }
223876560669SDave Chinner
2239e9131e50SDave Chinner /*
2240e9131e50SDave Chinner * On error or completion, trans is freed.
2241e9131e50SDave Chinner */
22421da177e4SLinus Torvalds STATIC int
2243eeb11688SDave Chinner xlog_recovery_process_trans(
2244eeb11688SDave Chinner struct xlog *log,
2245eeb11688SDave Chinner struct xlog_recover *trans,
2246b2a922cdSChristoph Hellwig char *dp,
2247eeb11688SDave Chinner unsigned int len,
2248eeb11688SDave Chinner unsigned int flags,
224912818d24SBrian Foster int pass,
225012818d24SBrian Foster struct list_head *buffer_list)
22511da177e4SLinus Torvalds {
2252e9131e50SDave Chinner int error = 0;
2253e9131e50SDave Chinner bool freeit = false;
2254eeb11688SDave Chinner
2255eeb11688SDave Chinner /* mask off ophdr transaction container flags */
2256eeb11688SDave Chinner flags &= ~XLOG_END_TRANS;
2257eeb11688SDave Chinner if (flags & XLOG_WAS_CONT_TRANS)
2258eeb11688SDave Chinner flags &= ~XLOG_CONTINUE_TRANS;
2259eeb11688SDave Chinner
226088b863dbSDave Chinner /*
226188b863dbSDave Chinner * Callees must not free the trans structure. We'll decide whether we need to
226288b863dbSDave Chinner * free it based on the operation being done and its result.
226388b863dbSDave Chinner */
2264eeb11688SDave Chinner switch (flags) {
2265eeb11688SDave Chinner /* expected flag values */
2266eeb11688SDave Chinner case 0:
2267eeb11688SDave Chinner case XLOG_CONTINUE_TRANS:
2268eeb11688SDave Chinner error = xlog_recover_add_to_trans(log, trans, dp, len);
2269eeb11688SDave Chinner break;
2270eeb11688SDave Chinner case XLOG_WAS_CONT_TRANS:
2271eeb11688SDave Chinner error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2272eeb11688SDave Chinner break;
2273eeb11688SDave Chinner case XLOG_COMMIT_TRANS:
227412818d24SBrian Foster error = xlog_recover_commit_trans(log, trans, pass,
227512818d24SBrian Foster buffer_list);
227688b863dbSDave Chinner /* success or fail, we are now done with this transaction. */
227788b863dbSDave Chinner freeit = true;
2278eeb11688SDave Chinner break;
2279eeb11688SDave Chinner
2280eeb11688SDave Chinner /* unexpected flag values */
2281eeb11688SDave Chinner case XLOG_UNMOUNT_TRANS:
2282e9131e50SDave Chinner /* just skip trans */
22831da177e4SLinus Torvalds xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2284e9131e50SDave Chinner freeit = true;
2285eeb11688SDave Chinner break;
2286eeb11688SDave Chinner case XLOG_START_TRANS:
2287eeb11688SDave Chinner default:
2288eeb11688SDave Chinner xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2289eeb11688SDave Chinner ASSERT(0);
2290895e196fSDarrick J. Wong error = -EFSCORRUPTED;
2291eeb11688SDave Chinner break;
2292eeb11688SDave Chinner }
2293e9131e50SDave Chinner if (error || freeit)
2294e9131e50SDave Chinner xlog_recover_free_trans(trans);
2295eeb11688SDave Chinner return error;
2296eeb11688SDave Chinner }
2297eeb11688SDave Chinner
2298b818cca1SDave Chinner /*
2299b818cca1SDave Chinner * Lookup the transaction recovery structure associated with the ID in the
2300b818cca1SDave Chinner * current ophdr. If the transaction doesn't exist and the start flag is set in
2301b818cca1SDave Chinner * the ophdr, then allocate a new transaction for future ID matches to find.
2302b818cca1SDave Chinner * Either way, return what we found during the lookup - an existing transaction
2303b818cca1SDave Chinner * or nothing.
2304b818cca1SDave Chinner */
2305eeb11688SDave Chinner STATIC struct xlog_recover *
2306eeb11688SDave Chinner xlog_recover_ophdr_to_trans(
2307eeb11688SDave Chinner struct hlist_head rhash[],
2308eeb11688SDave Chinner struct xlog_rec_header *rhead,
2309eeb11688SDave Chinner struct xlog_op_header *ohead)
2310eeb11688SDave Chinner {
2311eeb11688SDave Chinner struct xlog_recover *trans;
2312eeb11688SDave Chinner xlog_tid_t tid;
2313eeb11688SDave Chinner struct hlist_head *rhp;
2314eeb11688SDave Chinner
2315eeb11688SDave Chinner tid = be32_to_cpu(ohead->oh_tid);
2316eeb11688SDave Chinner rhp = &rhash[XLOG_RHASH(tid)];
2317b818cca1SDave Chinner hlist_for_each_entry(trans, rhp, r_list) {
2318b818cca1SDave Chinner if (trans->r_log_tid == tid)
2319eeb11688SDave Chinner return trans;
2320b818cca1SDave Chinner }
2321eeb11688SDave Chinner
2322eeb11688SDave Chinner /*
2323b818cca1SDave Chinner * skip over non-start transaction headers - we could be
2324b818cca1SDave Chinner * processing slack space before the next transaction starts
2325eeb11688SDave Chinner */
2326b818cca1SDave Chinner if (!(ohead->oh_flags & XLOG_START_TRANS))
2327b818cca1SDave Chinner return NULL;
2328b818cca1SDave Chinner
2329eeb11688SDave Chinner ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2330b818cca1SDave Chinner
2331b818cca1SDave Chinner /*
2332b818cca1SDave Chinner * This is a new transaction so allocate a new recovery container to
2333b818cca1SDave Chinner * hold the recovery ops that will follow.
2334b818cca1SDave Chinner */
2335707e0ddaSTetsuo Handa trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2336b818cca1SDave Chinner trans->r_log_tid = tid;
2337b818cca1SDave Chinner trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2338b818cca1SDave Chinner INIT_LIST_HEAD(&trans->r_itemq);
2339b818cca1SDave Chinner INIT_HLIST_NODE(&trans->r_list);
2340b818cca1SDave Chinner hlist_add_head(&trans->r_list, rhp);
2341b818cca1SDave Chinner
2342b818cca1SDave Chinner /*
2343b818cca1SDave Chinner * Nothing more to do for this ophdr. Items to be added to this new
2344b818cca1SDave Chinner * transaction will be in subsequent ophdr containers.
2345b818cca1SDave Chinner */
2346eeb11688SDave Chinner return NULL;
2347eeb11688SDave Chinner }
2348eeb11688SDave Chinner
2349eeb11688SDave Chinner STATIC int
2350eeb11688SDave Chinner xlog_recover_process_ophdr(
2351eeb11688SDave Chinner struct xlog *log,
2352eeb11688SDave Chinner struct hlist_head rhash[],
2353eeb11688SDave Chinner struct xlog_rec_header *rhead,
2354eeb11688SDave Chinner struct xlog_op_header *ohead,
2355b2a922cdSChristoph Hellwig char *dp,
2356b2a922cdSChristoph Hellwig char *end,
235712818d24SBrian Foster int pass,
235812818d24SBrian Foster struct list_head *buffer_list)
2359eeb11688SDave Chinner {
2360eeb11688SDave Chinner struct xlog_recover *trans;
2361eeb11688SDave Chinner unsigned int len;
236212818d24SBrian Foster int error;
2363eeb11688SDave Chinner
2364eeb11688SDave Chinner /* Do we understand who wrote this op? */
2365eeb11688SDave Chinner if (ohead->oh_clientid != XFS_TRANSACTION &&
2366eeb11688SDave Chinner ohead->oh_clientid != XFS_LOG) {
2367eeb11688SDave Chinner xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2368eeb11688SDave Chinner __func__, ohead->oh_clientid);
2369eeb11688SDave Chinner ASSERT(0);
2370895e196fSDarrick J. Wong return -EFSCORRUPTED;
2371eeb11688SDave Chinner }
2372eeb11688SDave Chinner
2373eeb11688SDave Chinner /*
2374eeb11688SDave Chinner * Check the ophdr contains all the data it is supposed to contain.
2375eeb11688SDave Chinner */
2376eeb11688SDave Chinner len = be32_to_cpu(ohead->oh_len);
2377eeb11688SDave Chinner if (dp + len > end) {
2378eeb11688SDave Chinner xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2379eeb11688SDave Chinner WARN_ON(1);
2380895e196fSDarrick J. Wong return -EFSCORRUPTED;
2381eeb11688SDave Chinner }
2382eeb11688SDave Chinner
2383eeb11688SDave Chinner trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2384eeb11688SDave Chinner if (!trans) {
2385eeb11688SDave Chinner /* nothing to do, so skip over this ophdr */
23861da177e4SLinus Torvalds return 0;
23871da177e4SLinus Torvalds }
23881da177e4SLinus Torvalds
238912818d24SBrian Foster /*
239012818d24SBrian Foster * The recovered buffer queue is drained only once we know that all
239112818d24SBrian Foster * recovery items for the current LSN have been processed. This is
239212818d24SBrian Foster * required because:
239312818d24SBrian Foster *
239412818d24SBrian Foster * - Buffer write submission updates the metadata LSN of the buffer.
239512818d24SBrian Foster * - Log recovery skips items with a metadata LSN >= the current LSN of
239612818d24SBrian Foster * the recovery item.
239712818d24SBrian Foster * - Separate recovery items against the same metadata buffer can share
239812818d24SBrian Foster * a current LSN. I.e., consider that the LSN of a recovery item is
239912818d24SBrian Foster * defined as the starting LSN of the first record in which its
240012818d24SBrian Foster * transaction appears, that a record can hold multiple transactions,
240112818d24SBrian Foster * and/or that a transaction can span multiple records.
240212818d24SBrian Foster *
240312818d24SBrian Foster * In other words, we are allowed to submit a buffer from log recovery
240412818d24SBrian Foster * once per current LSN. Otherwise, we may incorrectly skip recovery
240512818d24SBrian Foster * items and cause corruption.
240612818d24SBrian Foster *
240712818d24SBrian Foster * We don't know up front whether buffers are updated multiple times per
240812818d24SBrian Foster * LSN. Therefore, track the current LSN of each commit log record as it
240912818d24SBrian Foster * is processed and drain the queue when it changes. Use commit records
241012818d24SBrian Foster * because they are ordered correctly by the logging code.
241112818d24SBrian Foster */
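/*
 * Hypothetical example: transactions T1 and T2 both modify buffer B and
 * commit in the same record, so their recovery items share a current LSN. If
 * B were written back (updating its metadata LSN) after replaying T1's item,
 * the LSN check would then wrongly skip T2's item.
 */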
241212818d24SBrian Foster if (log->l_recovery_lsn != trans->r_lsn &&
241312818d24SBrian Foster ohead->oh_flags & XLOG_COMMIT_TRANS) {
241412818d24SBrian Foster error = xfs_buf_delwri_submit(buffer_list);
241512818d24SBrian Foster if (error)
241612818d24SBrian Foster return error;
241712818d24SBrian Foster log->l_recovery_lsn = trans->r_lsn;
241812818d24SBrian Foster }
241912818d24SBrian Foster
2420e9131e50SDave Chinner return xlog_recovery_process_trans(log, trans, dp, len,
242112818d24SBrian Foster ohead->oh_flags, pass, buffer_list);
2422eeb11688SDave Chinner }
24231da177e4SLinus Torvalds
24241da177e4SLinus Torvalds /*
24251da177e4SLinus Torvalds * There are two valid states of the r_state field. 0 indicates that the
24261da177e4SLinus Torvalds * transaction structure is in a normal state. We have either seen the
24271da177e4SLinus Torvalds * start of the transaction or the last operation we added was not a partial
24281da177e4SLinus Torvalds * operation. If the last operation we added to the transaction was a
24291da177e4SLinus Torvalds * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
24301da177e4SLinus Torvalds *
24311da177e4SLinus Torvalds * NOTE: skip LRs with 0 data length.
24321da177e4SLinus Torvalds */
24331da177e4SLinus Torvalds STATIC int
24341da177e4SLinus Torvalds xlog_recover_process_data(
24359a8d2fdbSMark Tinguely struct xlog *log,
2436f0a76953SDave Chinner struct hlist_head rhash[],
24379a8d2fdbSMark Tinguely struct xlog_rec_header *rhead,
2438b2a922cdSChristoph Hellwig char *dp,
243912818d24SBrian Foster int pass,
244012818d24SBrian Foster struct list_head *buffer_list)
24411da177e4SLinus Torvalds {
2442eeb11688SDave Chinner struct xlog_op_header *ohead;
2443b2a922cdSChristoph Hellwig char *end;
24441da177e4SLinus Torvalds int num_logops;
24451da177e4SLinus Torvalds int error;
24461da177e4SLinus Torvalds
2447eeb11688SDave Chinner end = dp + be32_to_cpu(rhead->h_len);
2448b53e675dSChristoph Hellwig num_logops = be32_to_cpu(rhead->h_num_logops);
24491da177e4SLinus Torvalds
24501da177e4SLinus Torvalds /* check the log format matches our own - else we can't recover */
24511da177e4SLinus Torvalds if (xlog_header_check_recover(log->l_mp, rhead))
24522451337dSDave Chinner return -EIO;
24531da177e4SLinus Torvalds
24545cd9cee9SBrian Foster trace_xfs_log_recover_record(log, rhead, pass);
2455eeb11688SDave Chinner while ((dp < end) && num_logops) {
2456eeb11688SDave Chinner
2457eeb11688SDave Chinner ohead = (struct xlog_op_header *)dp;
2458eeb11688SDave Chinner dp += sizeof(*ohead);
2459*7cd9f0a3Slei lu if (dp > end) {
2460*7cd9f0a3Slei lu xfs_warn(log->l_mp, "%s: op header overrun", __func__);
2461*7cd9f0a3Slei lu return -EFSCORRUPTED;
2462*7cd9f0a3Slei lu }
2463eeb11688SDave Chinner
2464eeb11688SDave Chinner /* errors will abort recovery */
2465eeb11688SDave Chinner error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
246612818d24SBrian Foster dp, end, pass, buffer_list);
2467eeb11688SDave Chinner if (error)
24681da177e4SLinus Torvalds return error;
2469eeb11688SDave Chinner
247067fcb7bfSChristoph Hellwig dp += be32_to_cpu(ohead->oh_len);
24711da177e4SLinus Torvalds num_logops--;
24721da177e4SLinus Torvalds }
24731da177e4SLinus Torvalds return 0;
24741da177e4SLinus Torvalds }
24751da177e4SLinus Torvalds
247650995582SDarrick J. Wong /* Take all the collected deferred ops and finish them in order. */
247750995582SDarrick J. Wong static int
247850995582SDarrick J. Wong xlog_finish_defer_ops(
2479e6fff81eSDarrick J. Wong struct xfs_mount *mp,
2480e6fff81eSDarrick J. Wong struct list_head *capture_list)
248150995582SDarrick J. Wong {
2482e6fff81eSDarrick J. Wong struct xfs_defer_capture *dfc, *next;
248350995582SDarrick J. Wong struct xfs_trans *tp;
2484e6fff81eSDarrick J. Wong int error = 0;
248550995582SDarrick J. Wong
2486e6fff81eSDarrick J. Wong list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2487929b92f6SDarrick J. Wong struct xfs_trans_res resv;
2488512edfacSDarrick J. Wong struct xfs_defer_resources dres;
2489929b92f6SDarrick J. Wong
2490929b92f6SDarrick J. Wong /*
2491929b92f6SDarrick J. Wong * Create a new transaction reservation from the captured
2492929b92f6SDarrick J. Wong * information. Set logcount to 1 to force the new transaction
2493929b92f6SDarrick J. Wong * to regrant every roll so that we can make forward progress
2494929b92f6SDarrick J. Wong * in recovery no matter how full the log might be.
2495929b92f6SDarrick J. Wong */
2496929b92f6SDarrick J. Wong resv.tr_logres = dfc->dfc_logres;
2497929b92f6SDarrick J. Wong resv.tr_logcount = 1;
2498929b92f6SDarrick J. Wong resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2499929b92f6SDarrick J. Wong
2500929b92f6SDarrick J. Wong error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2501929b92f6SDarrick J. Wong dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
25024e6b8270SDarrick J. Wong if (error) {
2503b5f17becSDave Chinner xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
250450995582SDarrick J. Wong return error;
25054e6b8270SDarrick J. Wong }
250650995582SDarrick J. Wong
2507e6fff81eSDarrick J. Wong /*
2508e6fff81eSDarrick J. Wong * Transfer to this new transaction all the dfops we captured
2509e6fff81eSDarrick J. Wong * from recovering a single intent item.
2510e6fff81eSDarrick J. Wong */
2511e6fff81eSDarrick J. Wong list_del_init(&dfc->dfc_list);
2512512edfacSDarrick J. Wong xfs_defer_ops_continue(dfc, tp, &dres);
2513e6fff81eSDarrick J. Wong error = xfs_trans_commit(tp);
2514512edfacSDarrick J. Wong xfs_defer_resources_rele(&dres);
2515e6fff81eSDarrick J. Wong if (error)
2516e6fff81eSDarrick J. Wong return error;
251750995582SDarrick J. Wong }
251850995582SDarrick J. Wong
2519e6fff81eSDarrick J. Wong ASSERT(list_empty(capture_list));
2520e6fff81eSDarrick J. Wong return 0;
2521e6fff81eSDarrick J. Wong }
2522e6fff81eSDarrick J. Wong
2523e6fff81eSDarrick J. Wong /* Release all the captured defer ops and capture structures in this list. */
2524e6fff81eSDarrick J. Wong static void
2525e6fff81eSDarrick J. Wong xlog_abort_defer_ops(
2526e6fff81eSDarrick J. Wong struct xfs_mount *mp,
2527e6fff81eSDarrick J. Wong struct list_head *capture_list)
2528e6fff81eSDarrick J. Wong {
2529e6fff81eSDarrick J. Wong struct xfs_defer_capture *dfc;
2530e6fff81eSDarrick J. Wong struct xfs_defer_capture *next;
2531e6fff81eSDarrick J. Wong
2532e6fff81eSDarrick J. Wong list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2533e6fff81eSDarrick J. Wong list_del_init(&dfc->dfc_list);
2534005be668SLong Li xfs_defer_ops_capture_abort(mp, dfc);
2535e6fff81eSDarrick J. Wong }
2536e6fff81eSDarrick J. Wong }
2537ab9c81efSDave Chinner
2538dc42375dSDarrick J. Wong /*
2539dc42375dSDarrick J. Wong * When this is called, all of the log intent items which did not have
2540ab9c81efSDave Chinner * corresponding log done items should be in the AIL. What we do now is update
2541ab9c81efSDave Chinner * the data structures associated with each one.
2542dc42375dSDarrick J. Wong *
2543ab9c81efSDave Chinner * Since we process the log intent items in normal transactions, they will be
2544ab9c81efSDave Chinner * removed at some point after the commit. This prevents us from just walking
2545ab9c81efSDave Chinner * down the list processing each one. We'll use a flag in the intent item to
2546ab9c81efSDave Chinner * skip those that we've already processed and use the AIL iteration mechanism's
2547ab9c81efSDave Chinner * generation count to try to speed this up at least a bit.
2548dc42375dSDarrick J. Wong *
2549ab9c81efSDave Chinner * When we start, we know that the intents are the only things in the AIL. As we
2550ab9c81efSDave Chinner * process them, however, other items are added to the AIL. Hence we know we
2551ab9c81efSDave Chinner * have started recovery on all the pending intents when we find a non-intent
2552ab9c81efSDave Chinner * item in the AIL.
2553dc42375dSDarrick J. Wong */
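/*
 * Here the pending intents are tracked on log->r_dfops as struct
 * xfs_defer_pending entries; each is recovered through its log item's
 * ->iop_recover method, with any new deferred work captured on capture_list
 * to be finished in order afterwards.
 */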
2554dc42375dSDarrick J. Wong STATIC int
2555dc42375dSDarrick J. Wong xlog_recover_process_intents(
2556dc42375dSDarrick J. Wong struct xlog *log)
2557dc42375dSDarrick J. Wong {
2558e6fff81eSDarrick J. Wong LIST_HEAD(capture_list);
2559cd3c2cf3SDarrick J. Wong struct xfs_defer_pending *dfp, *n;
2560e6fff81eSDarrick J. Wong int error = 0;
25617bf7a193SDarrick J. Wong #if defined(DEBUG) || defined(XFS_WARN)
2562dc42375dSDarrick J. Wong xfs_lsn_t last_lsn;
2563dc42375dSDarrick J. Wong
2564dc42375dSDarrick J. Wong last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
25657bf7a193SDarrick J. Wong #endif
256697cf7967SDarrick J. Wong
2567cd3c2cf3SDarrick J. Wong list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2568cd3c2cf3SDarrick J. Wong struct xfs_log_item *lip = dfp->dfp_intent;
2569cd3c2cf3SDarrick J. Wong const struct xfs_item_ops *ops = lip->li_ops;
2570cd3c2cf3SDarrick J. Wong
2571cd3c2cf3SDarrick J. Wong ASSERT(xlog_item_is_intent(lip));
2572dc42375dSDarrick J. Wong
2573dc42375dSDarrick J. Wong /*
2574dc42375dSDarrick J. Wong * We should never see a redo item with a LSN higher than
2575dc42375dSDarrick J. Wong * the last transaction we found in the log at the start
2576dc42375dSDarrick J. Wong * of recovery.
2577dc42375dSDarrick J. Wong */
2578dc42375dSDarrick J. Wong ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2579dc42375dSDarrick J. Wong
258050995582SDarrick J. Wong /*
258150995582SDarrick J. Wong * NOTE: If your intent processing routine can create more
2582e6fff81eSDarrick J. Wong * deferred ops, you /must/ attach them to the capture list in
2583e6fff81eSDarrick J. Wong * the recover routine or else those subsequent intents will be
258450995582SDarrick J. Wong * replayed in the wrong order!
258597cf7967SDarrick J. Wong *
258697cf7967SDarrick J. Wong * The recovery function can free the log item, so we must not
258797cf7967SDarrick J. Wong * access lip after it returns.
258850995582SDarrick J. Wong */
258987db24c8SDarrick J. Wong error = ops->iop_recover(dfp, &capture_list);
259063370326SDarrick J. Wong if (error) {
259163370326SDarrick J. Wong trace_xlog_intent_recovery_failed(log->l_mp, error,
259297cf7967SDarrick J. Wong ops->iop_recover);
2593e6fff81eSDarrick J. Wong break;
25941da177e4SLinus Torvalds }
2595e6fff81eSDarrick J. Wong
2596cd3c2cf3SDarrick J. Wong xfs_defer_cancel_recovery(log->l_mp, dfp);
2597cd3c2cf3SDarrick J. Wong }
2598e6fff81eSDarrick J. Wong if (error)
2599e6fff81eSDarrick J. Wong goto err;
260050995582SDarrick J. Wong
2601e6fff81eSDarrick J. Wong error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2602e6fff81eSDarrick J. Wong if (error)
2603e6fff81eSDarrick J. Wong goto err;
2604e6fff81eSDarrick J. Wong
2605e6fff81eSDarrick J. Wong return 0;
2606e6fff81eSDarrick J. Wong err:
2607e6fff81eSDarrick J. Wong xlog_abort_defer_ops(log->l_mp, &capture_list);
26083c1e2bbeSDavid Chinner return error;
26091da177e4SLinus Torvalds }
26101da177e4SLinus Torvalds
26111da177e4SLinus Torvalds /*
2612ab9c81efSDave Chinner * A cancel occurs when the mount has failed and we're bailing out. Release all
2613ab9c81efSDave Chinner * pending log intent items that we haven't started recovery on so they don't
2614ab9c81efSDave Chinner * pin the AIL.
2615f0b2efadSBrian Foster */
2616a7a9250eSHariprasad Kelam STATIC void
2617dc42375dSDarrick J. Wong xlog_recover_cancel_intents(
2618f0b2efadSBrian Foster struct xlog *log)
2619f0b2efadSBrian Foster {
2620cd3c2cf3SDarrick J. Wong struct xfs_defer_pending *dfp, *n;
2621f0b2efadSBrian Foster
2622cd3c2cf3SDarrick J. Wong list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2623cd3c2cf3SDarrick J. Wong ASSERT(xlog_item_is_intent(dfp->dfp_intent));
2624f0b2efadSBrian Foster
2625cd3c2cf3SDarrick J. Wong xfs_defer_cancel_recovery(log->l_mp, dfp);
2626f0b2efadSBrian Foster }
2627f0b2efadSBrian Foster }
2628f0b2efadSBrian Foster
2629f0b2efadSBrian Foster /*
2630680776e5SDarrick J. Wong * Transfer ownership of the recovered log intent item to the recovery
2631680776e5SDarrick J. Wong * transaction.
2632680776e5SDarrick J. Wong */
2633680776e5SDarrick J. Wong void
2634680776e5SDarrick J. Wong xlog_recover_transfer_intent(
2635680776e5SDarrick J. Wong struct xfs_trans *tp,
2636680776e5SDarrick J. Wong struct xfs_defer_pending *dfp)
2637680776e5SDarrick J. Wong {
2638680776e5SDarrick J. Wong dfp->dfp_intent = NULL;
2639680776e5SDarrick J. Wong }
2640680776e5SDarrick J. Wong
2641680776e5SDarrick J. Wong /*
26421da177e4SLinus Torvalds * This routine performs a transaction to null out a bad inode pointer
26431da177e4SLinus Torvalds * in an agi unlinked inode hash bucket.
26441da177e4SLinus Torvalds */
26451da177e4SLinus Torvalds STATIC void
26461da177e4SLinus Torvalds xlog_recover_clear_agi_bucket(
264761021debSDave Chinner struct xfs_perag *pag,
26481da177e4SLinus Torvalds int bucket)
26491da177e4SLinus Torvalds {
265061021debSDave Chinner struct xfs_mount *mp = pag->pag_mount;
265161021debSDave Chinner struct xfs_trans *tp;
265261021debSDave Chinner struct xfs_agi *agi;
2653e8222613SDave Chinner struct xfs_buf *agibp;
26541da177e4SLinus Torvalds int offset;
26551da177e4SLinus Torvalds int error;
26561da177e4SLinus Torvalds
2657253f4911SChristoph Hellwig error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2658e5720eecSDavid Chinner if (error)
2659253f4911SChristoph Hellwig goto out_error;
26601da177e4SLinus Torvalds
266161021debSDave Chinner error = xfs_read_agi(pag, tp, &agibp);
26625e1be0fbSChristoph Hellwig if (error)
2663e5720eecSDavid Chinner goto out_abort;
26641da177e4SLinus Torvalds
2665370c782bSChristoph Hellwig agi = agibp->b_addr;
266616259e7dSChristoph Hellwig agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
26671da177e4SLinus Torvalds offset = offsetof(xfs_agi_t, agi_unlinked) +
26681da177e4SLinus Torvalds (sizeof(xfs_agino_t) * bucket);
26691da177e4SLinus Torvalds xfs_trans_log_buf(tp, agibp, offset,
26701da177e4SLinus Torvalds (offset + sizeof(xfs_agino_t) - 1));
26711da177e4SLinus Torvalds
267270393313SChristoph Hellwig error = xfs_trans_commit(tp);
2673e5720eecSDavid Chinner if (error)
2674e5720eecSDavid Chinner goto out_error;
2675e5720eecSDavid Chinner return;
2676e5720eecSDavid Chinner
2677e5720eecSDavid Chinner out_abort:
26784906e215SChristoph Hellwig xfs_trans_cancel(tp);
2679e5720eecSDavid Chinner out_error:
268061021debSDave Chinner xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
268161021debSDave Chinner pag->pag_agno);
2682e5720eecSDavid Chinner return;
26831da177e4SLinus Torvalds }
26841da177e4SLinus Torvalds
268504755d2eSDave Chinner static int
268604755d2eSDave Chinner xlog_recover_iunlink_bucket(
268761021debSDave Chinner struct xfs_perag *pag,
268804755d2eSDave Chinner struct xfs_agi *agi,
268923fac50fSChristoph Hellwig int bucket)
269023fac50fSChristoph Hellwig {
269104755d2eSDave Chinner struct xfs_mount *mp = pag->pag_mount;
26922fd26cc0SDave Chinner struct xfs_inode *prev_ip = NULL;
269323fac50fSChristoph Hellwig struct xfs_inode *ip;
26942fd26cc0SDave Chinner xfs_agino_t prev_agino, agino;
26952fd26cc0SDave Chinner int error = 0;
269623fac50fSChristoph Hellwig
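	/*
	 * Walk the on-disk singly linked list of unlinked inodes from the
	 * bucket head, bringing each inode into memory and rebuilding the
	 * in-memory back pointers (i_prev_unlinked) as we go.
	 */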
269704755d2eSDave Chinner agino = be32_to_cpu(agi->agi_unlinked[bucket]);
269804755d2eSDave Chinner while (agino != NULLAGINO) {
269904755d2eSDave Chinner error = xfs_iget(mp, NULL,
270004755d2eSDave Chinner XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
270104755d2eSDave Chinner 0, 0, &ip);
270223fac50fSChristoph Hellwig if (error)
27032fd26cc0SDave Chinner break;
270423fac50fSChristoph Hellwig
270554d7b5c1SDave Chinner ASSERT(VFS_I(ip)->i_nlink == 0);
2706c19b3b05SDave Chinner ASSERT(VFS_I(ip)->i_mode != 0);
270704755d2eSDave Chinner xfs_iflags_clear(ip, XFS_IRECOVERY);
27084fcc94d6SDave Chinner agino = ip->i_next_unlinked;
270923fac50fSChristoph Hellwig
27102fd26cc0SDave Chinner if (prev_ip) {
27112fd26cc0SDave Chinner ip->i_prev_unlinked = prev_agino;
27122fd26cc0SDave Chinner xfs_irele(prev_ip);
271323fac50fSChristoph Hellwig
271423fac50fSChristoph Hellwig /*
27152fd26cc0SDave Chinner * Ensure the inode is removed from the unlinked list
27162fd26cc0SDave Chinner * before we continue so that it won't race with
27172fd26cc0SDave Chinner * building the in-memory list here. This could be
27182fd26cc0SDave Chinner * serialised with the agibp lock, but that just
27192fd26cc0SDave Chinner * serialises via lockstepping and it's much simpler
27202fd26cc0SDave Chinner * just to flush the inodegc queue and wait for it to
27212fd26cc0SDave Chinner * complete.
272223fac50fSChristoph Hellwig */
2723d4d12c02SDave Chinner error = xfs_inodegc_flush(mp);
2724d4d12c02SDave Chinner if (error)
2725d4d12c02SDave Chinner break;
272604755d2eSDave Chinner }
27272fd26cc0SDave Chinner
27282fd26cc0SDave Chinner prev_agino = agino;
27292fd26cc0SDave Chinner prev_ip = ip;
27302fd26cc0SDave Chinner }
27312fd26cc0SDave Chinner
27322fd26cc0SDave Chinner if (prev_ip) {
2733d4d12c02SDave Chinner int error2;
2734d4d12c02SDave Chinner
27352fd26cc0SDave Chinner ip->i_prev_unlinked = prev_agino;
27362fd26cc0SDave Chinner xfs_irele(prev_ip);
2737d4d12c02SDave Chinner
2738d4d12c02SDave Chinner error2 = xfs_inodegc_flush(mp);
2739d4d12c02SDave Chinner if (error2 && !error)
2740d4d12c02SDave Chinner return error2;
27412fd26cc0SDave Chinner }
27422fd26cc0SDave Chinner return error;
274323fac50fSChristoph Hellwig }
274423fac50fSChristoph Hellwig
27451da177e4SLinus Torvalds /*
27468ab39f11SDave Chinner * Recover AGI unlinked lists
27471da177e4SLinus Torvalds *
27488ab39f11SDave Chinner * This is called during recovery to process any inodes which we unlinked but
27498ab39f11SDave Chinner * not freed when the system crashed. These inodes will be on the lists in the
27508ab39f11SDave Chinner * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
27518ab39f11SDave Chinner * any inodes found on the lists. Each inode is removed from the lists when it
27528ab39f11SDave Chinner * has been fully truncated and is freed. The freeing of the inode and its
27538ab39f11SDave Chinner * removal from the list must be atomic.
27548ab39f11SDave Chinner *
27558ab39f11SDave Chinner * If everything we touch in the agi processing loop is already in memory, this
27568ab39f11SDave Chinner * loop can hold the cpu for a long time. It runs without lock contention,
27578ab39f11SDave Chinner * memory allocation contention, the need to wait for IO, etc, and so will run
27588ab39f11SDave Chinner * until we either run out of inodes to process, run low on memory or run out
27598ab39f11SDave Chinner * of log space.
27608ab39f11SDave Chinner *
27618ab39f11SDave Chinner * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2762bd24a4f5SBhaskar Chowdhury * and can prevent other filesystem work (such as CIL pushes) from running. This
27638ab39f11SDave Chinner * can lead to deadlocks if the recovery process runs out of log reservation
27648ab39f11SDave Chinner * space. Hence we need to yield the CPU when there is other kernel work
27658ab39f11SDave Chinner * scheduled on this CPU to ensure other scheduled work can run without undue
27668ab39f11SDave Chinner * latency.
27671da177e4SLinus Torvalds */
276804755d2eSDave Chinner static void
276904755d2eSDave Chinner xlog_recover_iunlink_ag(
277004755d2eSDave Chinner struct xfs_perag *pag)
27711da177e4SLinus Torvalds {
2772934933c3SDave Chinner struct xfs_agi *agi;
2773e8222613SDave Chinner struct xfs_buf *agibp;
27741da177e4SLinus Torvalds int bucket;
27751da177e4SLinus Torvalds int error;
27761da177e4SLinus Torvalds
277761021debSDave Chinner error = xfs_read_agi(pag, NULL, &agibp);
27785e1be0fbSChristoph Hellwig if (error) {
27795e1be0fbSChristoph Hellwig /*
27805e1be0fbSChristoph Hellwig * AGI is b0rked. Don't process it.
27815e1be0fbSChristoph Hellwig *
278204755d2eSDave Chinner * We should probably mark the filesystem as corrupt after we've
278304755d2eSDave Chinner * recovered all the AGs we can....
27845e1be0fbSChristoph Hellwig */
278504755d2eSDave Chinner return;
27861da177e4SLinus Torvalds }
278704755d2eSDave Chinner
2788d97d32edSJan Kara /*
278904755d2eSDave Chinner * Unlock the buffer so that it can be acquired in the normal course of
279004755d2eSDave Chinner * the transaction to truncate and free each inode. Because we are not
279104755d2eSDave Chinner * racing with anyone else here for the AGI buffer, we don't even need
279204755d2eSDave Chinner * to hold it locked to read the initial unlinked bucket entries out of
279304755d2eSDave Chinner * the buffer. We keep the buffer reference, though, so that it stays pinned
279404755d2eSDave Chinner * in memory while we need the buffer.
2795d97d32edSJan Kara */
2796370c782bSChristoph Hellwig agi = agibp->b_addr;
2797d97d32edSJan Kara xfs_buf_unlock(agibp);
27981da177e4SLinus Torvalds
27991da177e4SLinus Torvalds for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
280004755d2eSDave Chinner error = xlog_recover_iunlink_bucket(pag, agi, bucket);
280104755d2eSDave Chinner if (error) {
280204755d2eSDave Chinner /*
280304755d2eSDave Chinner * Bucket is unrecoverable, so only a repair scan can
280404755d2eSDave Chinner * free the remaining unlinked inodes. Just empty the
280504755d2eSDave Chinner * bucket and leave the remaining inodes on it
280604755d2eSDave Chinner * unreferenced and unfreeable.
280704755d2eSDave Chinner */
280804755d2eSDave Chinner xlog_recover_clear_agi_bucket(pag, bucket);
28091da177e4SLinus Torvalds }
28101da177e4SLinus Torvalds }
281104755d2eSDave Chinner
2812d97d32edSJan Kara xfs_buf_rele(agibp);
28131da177e4SLinus Torvalds }
2814ab23a776SDave Chinner
281504755d2eSDave Chinner static void
281604755d2eSDave Chinner xlog_recover_process_iunlinks(
281704755d2eSDave Chinner struct xlog *log)
281804755d2eSDave Chinner {
281904755d2eSDave Chinner struct xfs_perag *pag;
282004755d2eSDave Chinner xfs_agnumber_t agno;
282104755d2eSDave Chinner
282204755d2eSDave Chinner for_each_perag(log->l_mp, agno, pag)
282304755d2eSDave Chinner xlog_recover_iunlink_ag(pag);
28241da177e4SLinus Torvalds }
28251da177e4SLinus Torvalds
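/*
 * Undo the cycle stamping applied when the record was written: the first
 * __be32 of every basic block in the record body was overwritten with the
 * cycle number so torn writes can be detected, and the displaced words were
 * stashed in h_cycle_data[] (and in the extended headers for v2 logs with
 * iclogs larger than 32k). Put the original words back.
 */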
282691083269SEric Sandeen STATIC void
28271da177e4SLinus Torvalds xlog_unpack_data(
28289a8d2fdbSMark Tinguely struct xlog_rec_header *rhead,
2829b2a922cdSChristoph Hellwig char *dp,
28309a8d2fdbSMark Tinguely struct xlog *log)
28311da177e4SLinus Torvalds {
28321da177e4SLinus Torvalds int i, j, k;
28331da177e4SLinus Torvalds
2834b53e675dSChristoph Hellwig for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
28351da177e4SLinus Torvalds i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2836b53e675dSChristoph Hellwig *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
28371da177e4SLinus Torvalds dp += BBSIZE;
28381da177e4SLinus Torvalds }
28391da177e4SLinus Torvalds
284038c26bfdSDave Chinner if (xfs_has_logv2(log->l_mp)) {
2841b28708d6SChristoph Hellwig xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2842b53e675dSChristoph Hellwig for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
28431da177e4SLinus Torvalds j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
28441da177e4SLinus Torvalds k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2845b53e675dSChristoph Hellwig *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
28461da177e4SLinus Torvalds dp += BBSIZE;
28471da177e4SLinus Torvalds }
28481da177e4SLinus Torvalds }
28491da177e4SLinus Torvalds }
28501da177e4SLinus Torvalds
28519d94901fSBrian Foster /*
2852b94fb2d1SBrian Foster * CRC check, unpack and process a log record.
28539d94901fSBrian Foster */
28549d94901fSBrian Foster STATIC int
28559d94901fSBrian Foster xlog_recover_process(
28569d94901fSBrian Foster struct xlog *log,
28579d94901fSBrian Foster struct hlist_head rhash[],
28589d94901fSBrian Foster struct xlog_rec_header *rhead,
28599d94901fSBrian Foster char *dp,
286012818d24SBrian Foster int pass,
286112818d24SBrian Foster struct list_head *buffer_list)
28629d94901fSBrian Foster {
2863cae028dfSDave Chinner __le32 old_crc = rhead->h_crc;
2864b94fb2d1SBrian Foster __le32 crc;
2865b94fb2d1SBrian Foster
2866b94fb2d1SBrian Foster crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
28676528250bSBrian Foster
28686528250bSBrian Foster /*
28696528250bSBrian Foster * Nothing else to do if this is a CRC verification pass. Just return
28706528250bSBrian Foster * if this is a record with a non-zero crc. Unfortunately, mkfs always
2871cae028dfSDave Chinner * writes h_crc as 0 so we must consider this valid even on v5 supers.
28726528250bSBrian Foster * Otherwise, return EFSBADCRC on failure so the callers up the stack
28736528250bSBrian Foster * know precisely what failed.
28746528250bSBrian Foster */
28756528250bSBrian Foster if (pass == XLOG_RECOVER_CRCPASS) {
2876cae028dfSDave Chinner if (old_crc && crc != old_crc)
28776528250bSBrian Foster return -EFSBADCRC;
28786528250bSBrian Foster return 0;
28796528250bSBrian Foster }
28806528250bSBrian Foster
28816528250bSBrian Foster /*
28826528250bSBrian Foster * We're in the normal recovery path. Issue a warning if and only if the
28836528250bSBrian Foster * CRC in the header is non-zero. This is an advisory warning and the
28846528250bSBrian Foster * zero CRC check prevents warnings from being emitted when upgrading
28856528250bSBrian Foster * the kernel from one that does not add CRCs by default.
28866528250bSBrian Foster */
2887cae028dfSDave Chinner if (crc != old_crc) {
288838c26bfdSDave Chinner if (old_crc || xfs_has_crc(log->l_mp)) {
2889b94fb2d1SBrian Foster xfs_alert(log->l_mp,
2890b94fb2d1SBrian Foster "log record CRC mismatch: found 0x%x, expected 0x%x.",
2891cae028dfSDave Chinner le32_to_cpu(old_crc),
2892b94fb2d1SBrian Foster le32_to_cpu(crc));
2893b94fb2d1SBrian Foster xfs_hex_dump(dp, 32);
2894b94fb2d1SBrian Foster }
2895b94fb2d1SBrian Foster
2896b94fb2d1SBrian Foster /*
2897b94fb2d1SBrian Foster * If the filesystem is CRC enabled, this mismatch becomes a
2898b94fb2d1SBrian Foster * fatal log corruption failure.
2899b94fb2d1SBrian Foster */
290038c26bfdSDave Chinner if (xfs_has_crc(log->l_mp)) {
2901a5155b87SDarrick J. Wong XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2902b94fb2d1SBrian Foster return -EFSCORRUPTED;
2903b94fb2d1SBrian Foster }
2904a5155b87SDarrick J. Wong }
29059d94901fSBrian Foster
290691083269SEric Sandeen xlog_unpack_data(rhead, dp, log);
29079d94901fSBrian Foster
290812818d24SBrian Foster return xlog_recover_process_data(log, rhash, rhead, dp, pass,
290912818d24SBrian Foster buffer_list);
29109d94901fSBrian Foster }
29119d94901fSBrian Foster
29121da177e4SLinus Torvalds STATIC int
29131da177e4SLinus Torvalds xlog_valid_rec_header(
29149a8d2fdbSMark Tinguely struct xlog *log,
29159a8d2fdbSMark Tinguely struct xlog_rec_header *rhead,
2916f692d09eSGao Xiang xfs_daddr_t blkno,
2917f692d09eSGao Xiang int bufsize)
29181da177e4SLinus Torvalds {
29191da177e4SLinus Torvalds int hlen;
29201da177e4SLinus Torvalds
2921a71895c5SDarrick J. Wong if (XFS_IS_CORRUPT(log->l_mp,
2922a71895c5SDarrick J. Wong rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
29232451337dSDave Chinner return -EFSCORRUPTED;
2924a71895c5SDarrick J. Wong if (XFS_IS_CORRUPT(log->l_mp,
29251da177e4SLinus Torvalds (!rhead->h_version ||
2926a71895c5SDarrick J. Wong (be32_to_cpu(rhead->h_version) &
2927a71895c5SDarrick J. Wong (~XLOG_VERSION_OKBITS))))) {
2928a0fa2b67SDave Chinner xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
292934a622b2SHarvey Harrison __func__, be32_to_cpu(rhead->h_version));
2930895e196fSDarrick J. Wong return -EFSCORRUPTED;
29311da177e4SLinus Torvalds }
29321da177e4SLinus Torvalds
2933f692d09eSGao Xiang /*
2934f692d09eSGao Xiang * LR body must have data (or it wouldn't have been written)
2935f692d09eSGao Xiang * and h_len must not be greater than LR buffer size.
2936f692d09eSGao Xiang */
2937b53e675dSChristoph Hellwig hlen = be32_to_cpu(rhead->h_len);
2938f692d09eSGao Xiang if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
29392451337dSDave Chinner return -EFSCORRUPTED;
2940f692d09eSGao Xiang
2941a71895c5SDarrick J. Wong if (XFS_IS_CORRUPT(log->l_mp,
2942a71895c5SDarrick J. Wong blkno > log->l_logBBsize || blkno > INT_MAX))
29432451337dSDave Chinner return -EFSCORRUPTED;
29441da177e4SLinus Torvalds return 0;
29451da177e4SLinus Torvalds }
29461da177e4SLinus Torvalds
29471da177e4SLinus Torvalds /*
29481da177e4SLinus Torvalds * Read the log from tail to head and process the log records found.
29491da177e4SLinus Torvalds * Handle the two cases where the tail and head are in the same cycle
29501da177e4SLinus Torvalds * and where the active portion of the log wraps around the end of
29511da177e4SLinus Torvalds * the physical log separately. The pass parameter is passed through
29521da177e4SLinus Torvalds * to the routines called to process the data and is not looked at
29531da177e4SLinus Torvalds * here.
29541da177e4SLinus Torvalds */
29551da177e4SLinus Torvalds STATIC int
29561da177e4SLinus Torvalds xlog_do_recovery_pass(
29579a8d2fdbSMark Tinguely struct xlog *log,
29581da177e4SLinus Torvalds xfs_daddr_t head_blk,
29591da177e4SLinus Torvalds xfs_daddr_t tail_blk,
2960d7f37692SBrian Foster int pass,
2961d7f37692SBrian Foster xfs_daddr_t *first_bad) /* out: first bad log rec */
29621da177e4SLinus Torvalds {
29631da177e4SLinus Torvalds xlog_rec_header_t *rhead;
2964284f1c2cSBrian Foster xfs_daddr_t blk_no, rblk_no;
2965d7f37692SBrian Foster xfs_daddr_t rhead_blk;
2966b2a922cdSChristoph Hellwig char *offset;
29676ad5b325SChristoph Hellwig char *hbp, *dbp;
2968a70f9fe5SBrian Foster int error = 0, h_size, h_len;
296912818d24SBrian Foster int error2 = 0;
29701da177e4SLinus Torvalds int bblks, split_bblks;
2971c2389c07SChristoph Hellwig int hblks = 1, split_hblks, wrapped_hblks;
297239775431SBrian Foster int i;
2973f0a76953SDave Chinner struct hlist_head rhash[XLOG_RHASH_SIZE];
297412818d24SBrian Foster LIST_HEAD (buffer_list);
29751da177e4SLinus Torvalds
29761da177e4SLinus Torvalds ASSERT(head_blk != tail_blk);
2977a4c9b34dSBrian Foster blk_no = rhead_blk = tail_blk;
29781da177e4SLinus Torvalds
297939775431SBrian Foster for (i = 0; i < XLOG_RHASH_SIZE; i++)
298039775431SBrian Foster INIT_HLIST_HEAD(&rhash[i]);
298139775431SBrian Foster
29821da177e4SLinus Torvalds /*
29831da177e4SLinus Torvalds * Read the header of the tail block and get the iclog buffer size from
29841da177e4SLinus Torvalds * h_size. Use this to tell how many sectors make up the log header.
29851da177e4SLinus Torvalds */
298638c26bfdSDave Chinner if (xfs_has_logv2(log->l_mp)) {
29871da177e4SLinus Torvalds /*
29881da177e4SLinus Torvalds * When using variable length iclogs, read the first sector of
29891da177e4SLinus Torvalds * iclog header and extract the header size from it. Get a
29901da177e4SLinus Torvalds * new hbp that is the correct size.
29911da177e4SLinus Torvalds */
29926e9b3dd8SChristoph Hellwig hbp = xlog_alloc_buffer(log, 1);
29931da177e4SLinus Torvalds if (!hbp)
29942451337dSDave Chinner return -ENOMEM;
2995076e6acbSChristoph Hellwig
2996076e6acbSChristoph Hellwig error = xlog_bread(log, tail_blk, 1, hbp, &offset);
2997076e6acbSChristoph Hellwig if (error)
29981da177e4SLinus Torvalds goto bread_err1;
2999076e6acbSChristoph Hellwig
30001da177e4SLinus Torvalds rhead = (xlog_rec_header_t *)offset;
3001a70f9fe5SBrian Foster
3002a70f9fe5SBrian Foster /*
3003a70f9fe5SBrian Foster * xfsprogs has a bug where record length is based on lsunit but
3004a70f9fe5SBrian Foster * h_size (iclog size) is hardcoded to 32k. Now that we
3005a70f9fe5SBrian Foster * unconditionally CRC verify the unmount record, this means the
3006a70f9fe5SBrian Foster * log buffer can be too small for the record and cause an
3007a70f9fe5SBrian Foster * overrun.
3008a70f9fe5SBrian Foster *
3009a70f9fe5SBrian Foster * Detect this condition here. Use lsunit for the buffer size as
3010a70f9fe5SBrian Foster * long as this looks like the mkfs case. Otherwise, return an
3011a70f9fe5SBrian Foster * error to avoid a buffer overrun.
3012a70f9fe5SBrian Foster */
3013b53e675dSChristoph Hellwig h_size = be32_to_cpu(rhead->h_size);
3014a70f9fe5SBrian Foster h_len = be32_to_cpu(rhead->h_len);
3015f692d09eSGao Xiang if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
3016f692d09eSGao Xiang rhead->h_num_logops == cpu_to_be32(1)) {
3017a70f9fe5SBrian Foster xfs_warn(log->l_mp,
3018a70f9fe5SBrian Foster "invalid iclog size (%d bytes), using lsunit (%d bytes)",
3019a70f9fe5SBrian Foster h_size, log->l_mp->m_logbsize);
3020a70f9fe5SBrian Foster h_size = log->l_mp->m_logbsize;
3021f692d09eSGao Xiang }
3022f692d09eSGao Xiang
3023f692d09eSGao Xiang error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
3024f692d09eSGao Xiang if (error)
3025050552cbSDarrick J. Wong goto bread_err1;
3026a70f9fe5SBrian Foster
3027c2389c07SChristoph Hellwig /*
3028c2389c07SChristoph Hellwig * This open codes xlog_logrec_hblks so that we can reuse the
3029c2389c07SChristoph Hellwig * fixed up h_size value calculated above. Without that we'd
3030c2389c07SChristoph Hellwig * still allocate the buffer based on the incorrect on-disk
3031c2389c07SChristoph Hellwig * size.
3032c2389c07SChristoph Hellwig */
3033c2389c07SChristoph Hellwig if (h_size > XLOG_HEADER_CYCLE_SIZE &&
3034c2389c07SChristoph Hellwig (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) {
3035c2389c07SChristoph Hellwig hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
3036c2389c07SChristoph Hellwig if (hblks > 1) {
30376ad5b325SChristoph Hellwig kmem_free(hbp);
30386e9b3dd8SChristoph Hellwig hbp = xlog_alloc_buffer(log, hblks);
30391da177e4SLinus Torvalds }
3040c2389c07SChristoph Hellwig }
30411da177e4SLinus Torvalds } else {
304269ce58f0SAlex Elder ASSERT(log->l_sectBBsize == 1);
30436e9b3dd8SChristoph Hellwig hbp = xlog_alloc_buffer(log, 1);
30441da177e4SLinus Torvalds h_size = XLOG_BIG_RECORD_BSIZE;
30451da177e4SLinus Torvalds }
30461da177e4SLinus Torvalds
30471da177e4SLinus Torvalds if (!hbp)
30482451337dSDave Chinner return -ENOMEM;
30496e9b3dd8SChristoph Hellwig dbp = xlog_alloc_buffer(log, BTOBB(h_size));
30501da177e4SLinus Torvalds if (!dbp) {
30516ad5b325SChristoph Hellwig kmem_free(hbp);
30522451337dSDave Chinner return -ENOMEM;
30531da177e4SLinus Torvalds }

	memset(rhash, 0, sizeof(rhash));
	if (tail_blk > head_blk) {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery.
		 */
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = hbp;
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = xlog_bread_noalign(log, 0,
						wrapped_hblks,
						offset + BBTOB(split_hblks));
				if (error)
					goto bread_err2;
			}
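			/*
			 * Worked example for the wrap handling above (numbers
			 * assumed): with a 1000-block log and hblks == 4, a
			 * header starting at block 998 is read as
			 * split_hblks == 2 blocks from block 998 plus
			 * wrapped_hblks == 2 blocks from block 0, placed at
			 * offset + BBTOB(2) so that the header image ends up
			 * contiguous in hbp.
			 */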
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
					split_hblks ? blk_no : 0, h_size);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/*
			 * Read the log record data in multiple reads if it
			 * wraps around the end of the log. Note that if the
			 * header already wrapped, blk_no could point past the
			 * end of the log. The record data is contiguous in
			 * that case.
			 */
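			/*
			 * xlog_wrap_logbno() (a helper in xfs_log_priv.h)
			 * maps a block number that ran past the physical end
			 * back into the log; roughly, as a sketch:
			 *
			 *	rblk_no = blk_no % log->l_logBBsize;
			 */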
			if (blk_no + bblks <= log->l_logBBsize ||
			    blk_no >= log->l_logBBsize) {
				rblk_no = xlog_wrap_logbno(log, blk_no);
				error = xlog_bread(log, rblk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/*
				 * This log record is split across the
				 * physical end of log.
				 */
				offset = dbp;
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/*
					 * Some data is before the physical
					 * end of log.
					 */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							   split_bblks, dbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = xlog_bread_noalign(log, 0,
						bblks - split_bblks,
						offset + BBTOB(split_bblks));
				if (error)
					goto bread_err2;
			}

			error = xlog_recover_process(log, rhash, rhead, offset,
						     pass, &buffer_list);
			if (error)
				goto bread_err2;

			blk_no += bblks;
			rhead_blk = blk_no;
		}

		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;
		rhead_blk = blk_no;
	}

	/* read first part of physical log */
	while (blk_no < head_blk) {
		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
		if (error)
			goto bread_err2;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
		if (error)
			goto bread_err2;

		/* blocks in data section */
		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
		error = xlog_bread(log, blk_no + hblks, bblks, dbp,
				   &offset);
		if (error)
			goto bread_err2;

		error = xlog_recover_process(log, rhash, rhead, offset, pass,
					     &buffer_list);
		if (error)
			goto bread_err2;

		blk_no += bblks + hblks;
		rhead_blk = blk_no;
	}

bread_err2:
	kmem_free(dbp);
bread_err1:
	kmem_free(hbp);

	/*
	 * Submit buffers that have been dirtied by the last record recovered.
	 */
	if (!list_empty(&buffer_list)) {
		if (error) {
			/*
			 * If there has been an item recovery error then we
			 * cannot allow partial checkpoint writeback to
			 * occur. We might have multiple checkpoints with the
			 * same start LSN in this buffer list, and partial
			 * writeback of a checkpoint in this situation can
			 * prevent future recovery of all the changes in the
			 * checkpoints at this start LSN.
			 *
			 * Note: Shutting down the filesystem will result in the
			 * delwri submission marking all the buffers stale,
			 * completing them and cleaning up _XBF_LOGRECOVERY
			 * state without doing any IO.
			 */
			xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
		}
		error2 = xfs_buf_delwri_submit(&buffer_list);
	}

	if (error && first_bad)
		*first_bad = rhead_blk;

	/*
	 * Transactions are freed at commit time but transactions without
	 * commit records on disk are never committed. Free any that may be
	 * left in the hash table; the _safe iterator is required because
	 * freeing a transaction removes it from the hash list mid-walk.
	 */
	for (i = 0; i < XLOG_RHASH_SIZE; i++) {
		struct hlist_node	*tmp;
		struct xlog_recover	*trans;

		hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
			xlog_recover_free_trans(trans);
	}

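	/* An item recovery error takes precedence over a delwri flush error. */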
	return error ? error : error2;
}

/*
 * Do the recovery of the log.  We actually do this in two phases.
 * The two passes are necessary in order to implement the function
 * of cancelling a record written into the log.  The first pass
 * determines those things which have been cancelled, and the
 * second pass replays log items normally except for those which
 * have been cancelled.  The handling of the replay and cancellations
 * takes place in the log item type specific routines.
 *
 * The table of items which have cancel records in the log is allocated
 * and freed at this level, since only here do we know when all of
 * the log recovery has been completed.
 */
STATIC int
xlog_do_log_recovery(
	struct xlog	*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;

	ASSERT(head_blk != tail_blk);

	/*
	 * First do a pass to find all of the cancelled buf log items.
	 * Store them in the buf_cancel_table for use in the second pass.
	 */
	error = xlog_alloc_buf_cancel_table(log);
	if (error)
		return error;

	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS1, NULL);
	if (error != 0)
		goto out_cancel;

	/*
	 * Then do a second pass to actually recover the items in the log.
	 * When it is complete free the table of buf cancel items.
	 */
	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
				      XLOG_RECOVER_PASS2, NULL);
	if (!error)
		xlog_check_buf_cancel_table(log);
out_cancel:
	xlog_free_buf_cancel_table(log);
	return error;
}
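
/*
 * Example of why two passes are needed (scenario assumed for illustration):
 * a metadata buffer is logged, and a later transaction in the same log frees
 * that buffer and marks it stale, leaving a cancel record.  Pass 1 notes the
 * cancellation in the buf cancel table so that pass 2 knows to skip replaying
 * the earlier, now-invalid images of that buffer.
 */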

/*
 * Do the actual recovery
 */
STATIC int
xlog_do_recover(
	struct xlog		*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk)
{
	struct xfs_mount	*mp = log->l_mp;
	struct xfs_buf		*bp = mp->m_sb_bp;
	struct xfs_sb		*sbp = &mp->m_sb;
	int			error;

	trace_xfs_log_recover(log, head_blk, tail_blk);

	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error)
		return error;

	if (xlog_is_shutdown(log))
		return -EIO;

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use.  If there were no extent
	 * or iunlinks, we can free up the entire log and set the tail_lsn to
	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
	 * lsn of the last known good LR on disk.  If there are extent frees
	 * or iunlinks they will have some entries in the AIL; so we look at
	 * the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(mp);

	/*
	 * Now that we've finished replaying all buffer and inode updates,
	 * re-read the superblock and reverify it.
	 */
3349b3f8e08cSChristoph Hellwig xfs_buf_lock(bp);
3350b3f8e08cSChristoph Hellwig xfs_buf_hold(bp);
335126e32875SChristoph Hellwig error = _xfs_buf_read(bp, XBF_READ);
3352d64e31a2SDavid Chinner if (error) {
33532039a272SDave Chinner if (!xlog_is_shutdown(log)) {
3354cdbcf82bSDarrick J. Wong xfs_buf_ioerror_alert(bp, __this_address);
33551da177e4SLinus Torvalds ASSERT(0);
3356595bff75SDave Chinner }
33571da177e4SLinus Torvalds xfs_buf_relse(bp);
33581da177e4SLinus Torvalds return error;
33591da177e4SLinus Torvalds }
33601da177e4SLinus Torvalds
33611da177e4SLinus Torvalds /* Convert superblock from on-disk format */
33623e6e8afdSChristoph Hellwig xfs_sb_from_disk(sbp, bp->b_addr);
33631da177e4SLinus Torvalds xfs_buf_relse(bp);
33641da177e4SLinus Torvalds
3365a798011cSDave Chinner /* re-initialise in-core superblock and geometry structures */
3366a1d86e8dSDave Chinner mp->m_features |= xfs_sb_version_to_features(sbp);
3367a798011cSDave Chinner xfs_reinit_percpu_counters(mp);
33685478eeadSLachlan McIlroy
33691da177e4SLinus Torvalds /* Normal transactions can now occur */
3370e1d06e5fSDave Chinner clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
33711da177e4SLinus Torvalds return 0;
33721da177e4SLinus Torvalds }

/*
 * Perform recovery and re-initialize some log variables in xlog_find_tail.
 *
 * Return error or zero.
 */
int
xlog_recover(
	struct xlog	*log)
{
	xfs_daddr_t	head_blk, tail_blk;
	int		error;

	/* find the tail of the log */
	error = xlog_find_tail(log, &head_blk, &tail_blk);
	if (error)
		return error;

	/*
	 * The superblock was read before the log was available and thus the
	 * LSN could not be verified. Check the superblock LSN against the
	 * current LSN now that it's known.
	 */
	if (xfs_has_crc(log->l_mp) &&
	    !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
		return -EINVAL;

	if (tail_blk != head_blk) {
		/*
		 * There used to be a comment here:
		 *
		 * disallow recovery on read-only mounts. note -- mount
		 * checks for ENOSPC and turns it into an intelligent
		 * error message.
		 * ...but this is no longer true. Now, unless you specify
		 * NORECOVERY (in which case this function would never be
		 * called), we just go ahead and recover. We do this all
		 * under the vfs layer, so we can get away with it unless
		 * the device itself is read-only, in which case we fail.
		 */
		error = xfs_dev_is_read_only(log->l_mp, "recovery");
		if (error)
			return error;

		/*
		 * Version 5 superblock log feature mask validation. We know
		 * the log is dirty so check if there are any unknown log
		 * features in what we need to recover. If there are unknown
		 * features (e.g. unsupported transactions), then simply
		 * reject the attempt at recovery before touching anything.
		 */
		if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
			xfs_warn(log->l_mp,
"Superblock has unknown incompatible log features (0x%x) enabled.",
				(log->l_mp->m_sb.sb_features_log_incompat &
					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
			xfs_warn(log->l_mp,
"The log cannot be fully and/or safely recovered by this kernel.");
			xfs_warn(log->l_mp,
"Please recover the log on a kernel that supports the unknown features.");
			return -EINVAL;
		}

		/*
		 * Delay log recovery if the debug hook is set. This is debug
		 * instrumentation to coordinate simulation of I/O failures
		 * with log recovery.
		 */
		if (xfs_globals.log_recovery_delay) {
			xfs_notice(log->l_mp,
				"Delaying log recovery for %d seconds.",
				xfs_globals.log_recovery_delay);
			msleep(xfs_globals.log_recovery_delay * 1000);
		}
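
		/*
		 * Example usage (debug kernels only; sysctl path assumed):
		 *
		 *	# echo 5 > /proc/sys/fs/xfs/log_recovery_delay
		 *
		 * holds the next recovery for five seconds so a test harness
		 * can line up injected I/O failures with it.
		 */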

		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
				log->l_mp->m_logname ? log->l_mp->m_logname
						     : "internal");

		error = xlog_do_recover(log, head_blk, tail_blk);
		set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
	}
	return error;
}
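
/*
 * Ordering note (call sites as assumed from the mount path): xlog_recover()
 * above is invoked from xfs_log_mount() early in mount, while
 * xlog_recover_finish() below runs later from xfs_log_mount_finish(), after
 * the root and realtime inodes have been read in.
 */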

/*
 * In the first part of recovery we replay inodes and buffers and build up the
 * list of intents which need to be processed.  Here we process the intents
 * and clean up the on disk unlinked inode lists.  This is separated from the
 * first part of recovery so that the root and real-time bitmap inodes can be
 * read in from disk in between the two stages.  This is necessary so that we
 * can free space in the real-time portion of the file system.
 */
int
xlog_recover_finish(
	struct xlog	*log)
{
	int		error;

	error = xlog_recover_process_intents(log);
	if (error) {
		/*
		 * Cancel all the unprocessed intent items now so that we
		 * don't leave them pinned in the AIL.  This can cause the
		 * AIL to livelock on the pinned item if anyone tries to push
		 * the AIL (inode reclaim does this) before we get around to
		 * xfs_log_mount_cancel.
		 */
		xlog_recover_cancel_intents(log);
		xfs_alert(log->l_mp, "Failed to recover intents");
		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
		return error;
	}

	/*
	 * Sync the log to get all the intents out of the AIL.  This isn't
	 * absolutely necessary, but it helps in case the unlink transactions
	 * would have problems pushing the intents out of the way.
	 */
	xfs_log_force(log->l_mp, XFS_LOG_SYNC);

	/*
	 * Now that we've recovered the log and all the intents, we can clear
	 * the log incompat feature bits in the superblock because there's no
	 * longer anything to protect.  We rely on the AIL push to write out
	 * the updated superblock after everything else.
	 */
	if (xfs_clear_incompat_log_features(log->l_mp)) {
		error = xfs_sync_sb(log->l_mp, false);
		if (error < 0) {
			xfs_alert(log->l_mp,
	"Failed to clear log incompat features on recovery");
			return error;
		}
	}

	xlog_recover_process_iunlinks(log);

	/*
	 * Recover any CoW staging blocks that are still referenced by the
	 * ondisk refcount metadata.  During mount there cannot be any live
	 * staging extents as we have not permitted any user modifications.
	 * Therefore, it is safe to free them all right now, even on a
	 * read-only mount.
	 */
	error = xfs_reflink_recover_cow(log->l_mp);
	if (error) {
		xfs_alert(log->l_mp,
	"Failed to recover leftover CoW staging extents, err %d.",
				error);
		/*
		 * If we get an error here, make sure the log is shut down
		 * but return zero so that any log items committed since the
		 * end of intents processing can be pushed through the CIL
		 * and AIL.
		 */
		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
	}

	return 0;
}

void
xlog_recover_cancel(
	struct xlog	*log)
{
	if (xlog_recovery_needed(log))
		xlog_recover_cancel_intents(log);
}