xref: /openbmc/linux/fs/xfs/xfs_log_recover.c (revision d37cf9b63113f13d742713881ce691fc615d8b3b)
10b61f8a4SDave Chinner // SPDX-License-Identifier: GPL-2.0
21da177e4SLinus Torvalds /*
387c199c2STim Shimmin  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
47b718769SNathan Scott  * All Rights Reserved.
51da177e4SLinus Torvalds  */
61da177e4SLinus Torvalds #include "xfs.h"
7a844f451SNathan Scott #include "xfs_fs.h"
870a9883cSDave Chinner #include "xfs_shared.h"
9239880efSDave Chinner #include "xfs_format.h"
10239880efSDave Chinner #include "xfs_log_format.h"
11239880efSDave Chinner #include "xfs_trans_resv.h"
12a844f451SNathan Scott #include "xfs_bit.h"
13a844f451SNathan Scott #include "xfs_sb.h"
141da177e4SLinus Torvalds #include "xfs_mount.h"
1550995582SDarrick J. Wong #include "xfs_defer.h"
161da177e4SLinus Torvalds #include "xfs_inode.h"
17239880efSDave Chinner #include "xfs_trans.h"
18239880efSDave Chinner #include "xfs_log.h"
191da177e4SLinus Torvalds #include "xfs_log_priv.h"
201da177e4SLinus Torvalds #include "xfs_log_recover.h"
211da177e4SLinus Torvalds #include "xfs_trans_priv.h"
22a4fbe6abSDave Chinner #include "xfs_alloc.h"
23a4fbe6abSDave Chinner #include "xfs_ialloc.h"
240b1b213fSChristoph Hellwig #include "xfs_trace.h"
2533479e05SDave Chinner #include "xfs_icache.h"
26a4fbe6abSDave Chinner #include "xfs_error.h"
2760a4a222SBrian Foster #include "xfs_buf_item.h"
289bbafc71SDave Chinner #include "xfs_ag.h"
294bc61983SDarrick J. Wong #include "xfs_quota.h"
307993f1a4SDarrick J. Wong #include "xfs_reflink.h"
311da177e4SLinus Torvalds 
32fc06c6d0SDave Chinner #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
33fc06c6d0SDave Chinner 
349a8d2fdbSMark Tinguely STATIC int
359a8d2fdbSMark Tinguely xlog_find_zeroed(
369a8d2fdbSMark Tinguely 	struct xlog	*,
379a8d2fdbSMark Tinguely 	xfs_daddr_t	*);
389a8d2fdbSMark Tinguely STATIC int
399a8d2fdbSMark Tinguely xlog_clear_stale_blocks(
409a8d2fdbSMark Tinguely 	struct xlog	*,
419a8d2fdbSMark Tinguely 	xfs_lsn_t);
427088c413SBrian Foster STATIC int
437088c413SBrian Foster xlog_do_recovery_pass(
447088c413SBrian Foster         struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
451da177e4SLinus Torvalds 
461da177e4SLinus Torvalds /*
471da177e4SLinus Torvalds  * Sector aligned buffer routines for buffer create/read/write/access
481da177e4SLinus Torvalds  */
491da177e4SLinus Torvalds 
50ff30a622SAlex Elder /*
5199c26595SBrian Foster  * Verify the log-relative block number and length in basic blocks are valid for
5299c26595SBrian Foster  * an operation involving the given XFS log buffer. Returns true if the fields
5399c26595SBrian Foster  * are valid, false otherwise.
54ff30a622SAlex Elder  */
5599c26595SBrian Foster static inline bool
xlog_verify_bno(struct xlog * log,xfs_daddr_t blk_no,int bbcount)566e9b3dd8SChristoph Hellwig xlog_verify_bno(
579a8d2fdbSMark Tinguely 	struct xlog	*log,
5899c26595SBrian Foster 	xfs_daddr_t	blk_no,
59ff30a622SAlex Elder 	int		bbcount)
60ff30a622SAlex Elder {
6199c26595SBrian Foster 	if (blk_no < 0 || blk_no >= log->l_logBBsize)
6299c26595SBrian Foster 		return false;
6399c26595SBrian Foster 	if (bbcount <= 0 || (blk_no + bbcount) > log->l_logBBsize)
6499c26595SBrian Foster 		return false;
6599c26595SBrian Foster 	return true;
66ff30a622SAlex Elder }
67ff30a622SAlex Elder 
6836adecffSAlex Elder /*
696ad5b325SChristoph Hellwig  * Allocate a buffer to hold log data.  The buffer needs to be able to map to
706ad5b325SChristoph Hellwig  * a range of nbblks basic blocks at any valid offset within the log.
7136adecffSAlex Elder  */
726ad5b325SChristoph Hellwig static char *
xlog_alloc_buffer(struct xlog * log,int nbblks)736e9b3dd8SChristoph Hellwig xlog_alloc_buffer(
749a8d2fdbSMark Tinguely 	struct xlog	*log,
753228149cSDave Chinner 	int		nbblks)
761da177e4SLinus Torvalds {
7799c26595SBrian Foster 	/*
7899c26595SBrian Foster 	 * Pass log block 0 since we don't have an addr yet, buffer will be
7999c26595SBrian Foster 	 * verified on read.
8099c26595SBrian Foster 	 */
81a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, 0, nbblks))) {
82a0fa2b67SDave Chinner 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
83ff30a622SAlex Elder 			nbblks);
843228149cSDave Chinner 		return NULL;
853228149cSDave Chinner 	}
861da177e4SLinus Torvalds 
8736adecffSAlex Elder 	/*
886ad5b325SChristoph Hellwig 	 * We do log I/O in units of log sectors (a power-of-2 multiple of the
896ad5b325SChristoph Hellwig 	 * basic block size), so we round up the requested size to accommodate
906ad5b325SChristoph Hellwig 	 * the basic blocks required for complete log sectors.
9136adecffSAlex Elder 	 *
926ad5b325SChristoph Hellwig 	 * In addition, the buffer may be used for a non-sector-aligned block
936ad5b325SChristoph Hellwig 	 * offset, in which case an I/O of the requested size could extend
946ad5b325SChristoph Hellwig 	 * beyond the end of the buffer.  If the requested size is only 1 basic
956ad5b325SChristoph Hellwig 	 * block it will never straddle a sector boundary, so this won't be an
966ad5b325SChristoph Hellwig 	 * issue.  Nor will this be a problem if the log I/O is done in basic
976ad5b325SChristoph Hellwig 	 * blocks (sector size 1).  But otherwise we extend the buffer by one
986ad5b325SChristoph Hellwig 	 * extra log sector to ensure there's space to accommodate this
996ad5b325SChristoph Hellwig 	 * possibility.
10036adecffSAlex Elder 	 */
10169ce58f0SAlex Elder 	if (nbblks > 1 && log->l_sectBBsize > 1)
10269ce58f0SAlex Elder 		nbblks += log->l_sectBBsize;
10369ce58f0SAlex Elder 	nbblks = round_up(nbblks, log->l_sectBBsize);
104d634525dSDave Chinner 	return kvzalloc(BBTOB(nbblks), GFP_KERNEL | __GFP_RETRY_MAYFAIL);
1051da177e4SLinus Torvalds }
1061da177e4SLinus Torvalds 
10748389ef1SAlex Elder /*
10848389ef1SAlex Elder  * Return the address of the start of the given block number's data
10948389ef1SAlex Elder  * in a log buffer.  The buffer covers a log sector-aligned region.
11048389ef1SAlex Elder  */
11118ffb8c3SChristoph Hellwig static inline unsigned int
xlog_align(struct xlog * log,xfs_daddr_t blk_no)112076e6acbSChristoph Hellwig xlog_align(
1139a8d2fdbSMark Tinguely 	struct xlog	*log,
11418ffb8c3SChristoph Hellwig 	xfs_daddr_t	blk_no)
115076e6acbSChristoph Hellwig {
11618ffb8c3SChristoph Hellwig 	return BBTOB(blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1));
117076e6acbSChristoph Hellwig }
118076e6acbSChristoph Hellwig 
1196ad5b325SChristoph Hellwig static int
xlog_do_io(struct xlog * log,xfs_daddr_t blk_no,unsigned int nbblks,char * data,enum req_op op)1206ad5b325SChristoph Hellwig xlog_do_io(
1219a8d2fdbSMark Tinguely 	struct xlog		*log,
1221da177e4SLinus Torvalds 	xfs_daddr_t		blk_no,
1236ad5b325SChristoph Hellwig 	unsigned int		nbblks,
1246ad5b325SChristoph Hellwig 	char			*data,
125d03025aeSBart Van Assche 	enum req_op		op)
1261da177e4SLinus Torvalds {
1271da177e4SLinus Torvalds 	int			error;
1281da177e4SLinus Torvalds 
129a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(log->l_mp, !xlog_verify_bno(log, blk_no, nbblks))) {
13099c26595SBrian Foster 		xfs_warn(log->l_mp,
13199c26595SBrian Foster 			 "Invalid log block/length (0x%llx, 0x%x) for buffer",
13299c26595SBrian Foster 			 blk_no, nbblks);
1332451337dSDave Chinner 		return -EFSCORRUPTED;
1343228149cSDave Chinner 	}
1353228149cSDave Chinner 
13669ce58f0SAlex Elder 	blk_no = round_down(blk_no, log->l_sectBBsize);
13769ce58f0SAlex Elder 	nbblks = round_up(nbblks, log->l_sectBBsize);
1381da177e4SLinus Torvalds 	ASSERT(nbblks > 0);
1391da177e4SLinus Torvalds 
1406ad5b325SChristoph Hellwig 	error = xfs_rw_bdev(log->l_targ->bt_bdev, log->l_logBBstart + blk_no,
1416ad5b325SChristoph Hellwig 			BBTOB(nbblks), data, op);
1422039a272SDave Chinner 	if (error && !xlog_is_shutdown(log)) {
1436ad5b325SChristoph Hellwig 		xfs_alert(log->l_mp,
1446ad5b325SChristoph Hellwig 			  "log recovery %s I/O error at daddr 0x%llx len %d error %d",
1456ad5b325SChristoph Hellwig 			  op == REQ_OP_WRITE ? "write" : "read",
1466ad5b325SChristoph Hellwig 			  blk_no, nbblks, error);
1476ad5b325SChristoph Hellwig 	}
1481da177e4SLinus Torvalds 	return error;
1491da177e4SLinus Torvalds }
1501da177e4SLinus Torvalds 
151076e6acbSChristoph Hellwig STATIC int
xlog_bread_noalign(struct xlog * log,xfs_daddr_t blk_no,int nbblks,char * data)1526ad5b325SChristoph Hellwig xlog_bread_noalign(
1536ad5b325SChristoph Hellwig 	struct xlog	*log,
1546ad5b325SChristoph Hellwig 	xfs_daddr_t	blk_no,
1556ad5b325SChristoph Hellwig 	int		nbblks,
1566ad5b325SChristoph Hellwig 	char		*data)
1576ad5b325SChristoph Hellwig {
1586ad5b325SChristoph Hellwig 	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
1596ad5b325SChristoph Hellwig }
1606ad5b325SChristoph Hellwig 
1616ad5b325SChristoph Hellwig STATIC int
xlog_bread(struct xlog * log,xfs_daddr_t blk_no,int nbblks,char * data,char ** offset)162076e6acbSChristoph Hellwig xlog_bread(
1639a8d2fdbSMark Tinguely 	struct xlog	*log,
164076e6acbSChristoph Hellwig 	xfs_daddr_t	blk_no,
165076e6acbSChristoph Hellwig 	int		nbblks,
1666ad5b325SChristoph Hellwig 	char		*data,
167b2a922cdSChristoph Hellwig 	char		**offset)
168076e6acbSChristoph Hellwig {
169076e6acbSChristoph Hellwig 	int		error;
170076e6acbSChristoph Hellwig 
1716ad5b325SChristoph Hellwig 	error = xlog_do_io(log, blk_no, nbblks, data, REQ_OP_READ);
1726ad5b325SChristoph Hellwig 	if (!error)
1736ad5b325SChristoph Hellwig 		*offset = data + xlog_align(log, blk_no);
174076e6acbSChristoph Hellwig 	return error;
175076e6acbSChristoph Hellwig }
176076e6acbSChristoph Hellwig 
177ba0f32d4SChristoph Hellwig STATIC int
xlog_bwrite(struct xlog * log,xfs_daddr_t blk_no,int nbblks,char * data)1781da177e4SLinus Torvalds xlog_bwrite(
1799a8d2fdbSMark Tinguely 	struct xlog	*log,
1801da177e4SLinus Torvalds 	xfs_daddr_t	blk_no,
1811da177e4SLinus Torvalds 	int		nbblks,
1826ad5b325SChristoph Hellwig 	char		*data)
1831da177e4SLinus Torvalds {
1846ad5b325SChristoph Hellwig 	return xlog_do_io(log, blk_no, nbblks, data, REQ_OP_WRITE);
1851da177e4SLinus Torvalds }
1861da177e4SLinus Torvalds 
1871da177e4SLinus Torvalds #ifdef DEBUG
1881da177e4SLinus Torvalds /*
1891da177e4SLinus Torvalds  * dump debug superblock and log record information
1901da177e4SLinus Torvalds  */
1911da177e4SLinus Torvalds STATIC void
xlog_header_check_dump(xfs_mount_t * mp,xlog_rec_header_t * head)1921da177e4SLinus Torvalds xlog_header_check_dump(
1931da177e4SLinus Torvalds 	xfs_mount_t		*mp,
1941da177e4SLinus Torvalds 	xlog_rec_header_t	*head)
1951da177e4SLinus Torvalds {
19608e96e1aSEric Sandeen 	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d",
19703daa57cSJoe Perches 		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
19808e96e1aSEric Sandeen 	xfs_debug(mp, "    log : uuid = %pU, fmt = %d",
19903daa57cSJoe Perches 		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
2001da177e4SLinus Torvalds }
2011da177e4SLinus Torvalds #else
2021da177e4SLinus Torvalds #define xlog_header_check_dump(mp, head)
2031da177e4SLinus Torvalds #endif
2041da177e4SLinus Torvalds 
2051da177e4SLinus Torvalds /*
2061da177e4SLinus Torvalds  * check log record header for recovery
2071da177e4SLinus Torvalds  */
2081da177e4SLinus Torvalds STATIC int
xlog_header_check_recover(xfs_mount_t * mp,xlog_rec_header_t * head)2091da177e4SLinus Torvalds xlog_header_check_recover(
2101da177e4SLinus Torvalds 	xfs_mount_t		*mp,
2111da177e4SLinus Torvalds 	xlog_rec_header_t	*head)
2121da177e4SLinus Torvalds {
21369ef921bSChristoph Hellwig 	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
2141da177e4SLinus Torvalds 
2151da177e4SLinus Torvalds 	/*
2161da177e4SLinus Torvalds 	 * IRIX doesn't write the h_fmt field and leaves it zeroed
2171da177e4SLinus Torvalds 	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
2181da177e4SLinus Torvalds 	 * a dirty log created in IRIX.
2191da177e4SLinus Torvalds 	 */
220a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(mp, head->h_fmt != cpu_to_be32(XLOG_FMT))) {
221a0fa2b67SDave Chinner 		xfs_warn(mp,
222a0fa2b67SDave Chinner 	"dirty log written in incompatible format - can't recover");
2231da177e4SLinus Torvalds 		xlog_header_check_dump(mp, head);
2242451337dSDave Chinner 		return -EFSCORRUPTED;
225a71895c5SDarrick J. Wong 	}
226a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
227a71895c5SDarrick J. Wong 					   &head->h_fs_uuid))) {
228a0fa2b67SDave Chinner 		xfs_warn(mp,
229a0fa2b67SDave Chinner 	"dirty log entry has mismatched uuid - can't recover");
2301da177e4SLinus Torvalds 		xlog_header_check_dump(mp, head);
2312451337dSDave Chinner 		return -EFSCORRUPTED;
2321da177e4SLinus Torvalds 	}
2331da177e4SLinus Torvalds 	return 0;
2341da177e4SLinus Torvalds }
2351da177e4SLinus Torvalds 
2361da177e4SLinus Torvalds /*
2371da177e4SLinus Torvalds  * read the head block of the log and check the header
2381da177e4SLinus Torvalds  */
2391da177e4SLinus Torvalds STATIC int
xlog_header_check_mount(xfs_mount_t * mp,xlog_rec_header_t * head)2401da177e4SLinus Torvalds xlog_header_check_mount(
2411da177e4SLinus Torvalds 	xfs_mount_t		*mp,
2421da177e4SLinus Torvalds 	xlog_rec_header_t	*head)
2431da177e4SLinus Torvalds {
24469ef921bSChristoph Hellwig 	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
2451da177e4SLinus Torvalds 
246d905fdaaSAmir Goldstein 	if (uuid_is_null(&head->h_fs_uuid)) {
2471da177e4SLinus Torvalds 		/*
2481da177e4SLinus Torvalds 		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
249d905fdaaSAmir Goldstein 		 * h_fs_uuid is null, we assume this log was last mounted
2501da177e4SLinus Torvalds 		 * by IRIX and continue.
2511da177e4SLinus Torvalds 		 */
252d905fdaaSAmir Goldstein 		xfs_warn(mp, "null uuid in log - IRIX style log");
253a71895c5SDarrick J. Wong 	} else if (XFS_IS_CORRUPT(mp, !uuid_equal(&mp->m_sb.sb_uuid,
254a71895c5SDarrick J. Wong 						  &head->h_fs_uuid))) {
255a0fa2b67SDave Chinner 		xfs_warn(mp, "log has mismatched uuid - can't recover");
2561da177e4SLinus Torvalds 		xlog_header_check_dump(mp, head);
2572451337dSDave Chinner 		return -EFSCORRUPTED;
2581da177e4SLinus Torvalds 	}
2591da177e4SLinus Torvalds 	return 0;
2601da177e4SLinus Torvalds }
2611da177e4SLinus Torvalds 
2621da177e4SLinus Torvalds /*
2631da177e4SLinus Torvalds  * This routine finds (to an approximation) the first block in the physical
2641da177e4SLinus Torvalds  * log which contains the given cycle.  It uses a binary search algorithm.
2651da177e4SLinus Torvalds  * Note that the algorithm can not be perfect because the disk will not
2661da177e4SLinus Torvalds  * necessarily be perfect.
2671da177e4SLinus Torvalds  */
268a8272ce0SDavid Chinner STATIC int
xlog_find_cycle_start(struct xlog * log,char * buffer,xfs_daddr_t first_blk,xfs_daddr_t * last_blk,uint cycle)2691da177e4SLinus Torvalds xlog_find_cycle_start(
2709a8d2fdbSMark Tinguely 	struct xlog	*log,
2716e9b3dd8SChristoph Hellwig 	char		*buffer,
2721da177e4SLinus Torvalds 	xfs_daddr_t	first_blk,
2731da177e4SLinus Torvalds 	xfs_daddr_t	*last_blk,
2741da177e4SLinus Torvalds 	uint		cycle)
2751da177e4SLinus Torvalds {
276b2a922cdSChristoph Hellwig 	char		*offset;
2771da177e4SLinus Torvalds 	xfs_daddr_t	mid_blk;
278e3bb2e30SAlex Elder 	xfs_daddr_t	end_blk;
2791da177e4SLinus Torvalds 	uint		mid_cycle;
2801da177e4SLinus Torvalds 	int		error;
2811da177e4SLinus Torvalds 
282e3bb2e30SAlex Elder 	end_blk = *last_blk;
283e3bb2e30SAlex Elder 	mid_blk = BLK_AVG(first_blk, end_blk);
284e3bb2e30SAlex Elder 	while (mid_blk != first_blk && mid_blk != end_blk) {
2856e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, mid_blk, 1, buffer, &offset);
286076e6acbSChristoph Hellwig 		if (error)
2871da177e4SLinus Torvalds 			return error;
28803bea6feSChristoph Hellwig 		mid_cycle = xlog_get_cycle(offset);
289e3bb2e30SAlex Elder 		if (mid_cycle == cycle)
290e3bb2e30SAlex Elder 			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
291e3bb2e30SAlex Elder 		else
292e3bb2e30SAlex Elder 			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
293e3bb2e30SAlex Elder 		mid_blk = BLK_AVG(first_blk, end_blk);
2941da177e4SLinus Torvalds 	}
295e3bb2e30SAlex Elder 	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
296e3bb2e30SAlex Elder 	       (mid_blk == end_blk && mid_blk-1 == first_blk));
297e3bb2e30SAlex Elder 
298e3bb2e30SAlex Elder 	*last_blk = end_blk;
2991da177e4SLinus Torvalds 
3001da177e4SLinus Torvalds 	return 0;
3011da177e4SLinus Torvalds }
3021da177e4SLinus Torvalds 
3031da177e4SLinus Torvalds /*
3043f943d85SAlex Elder  * Check that a range of blocks does not contain stop_on_cycle_no.
3053f943d85SAlex Elder  * Fill in *new_blk with the block offset where such a block is
3063f943d85SAlex Elder  * found, or with -1 (an invalid block number) if there is no such
3073f943d85SAlex Elder  * block in the range.  The scan needs to occur from front to back
3083f943d85SAlex Elder  * and the pointer into the region must be updated since a later
3093f943d85SAlex Elder  * routine will need to perform another test.
3101da177e4SLinus Torvalds  */
3111da177e4SLinus Torvalds STATIC int
xlog_find_verify_cycle(struct xlog * log,xfs_daddr_t start_blk,int nbblks,uint stop_on_cycle_no,xfs_daddr_t * new_blk)3121da177e4SLinus Torvalds xlog_find_verify_cycle(
3139a8d2fdbSMark Tinguely 	struct xlog	*log,
3141da177e4SLinus Torvalds 	xfs_daddr_t	start_blk,
3151da177e4SLinus Torvalds 	int		nbblks,
3161da177e4SLinus Torvalds 	uint		stop_on_cycle_no,
3171da177e4SLinus Torvalds 	xfs_daddr_t	*new_blk)
3181da177e4SLinus Torvalds {
3191da177e4SLinus Torvalds 	xfs_daddr_t	i, j;
3201da177e4SLinus Torvalds 	uint		cycle;
3216e9b3dd8SChristoph Hellwig 	char		*buffer;
3221da177e4SLinus Torvalds 	xfs_daddr_t	bufblks;
323b2a922cdSChristoph Hellwig 	char		*buf = NULL;
3241da177e4SLinus Torvalds 	int		error = 0;
3251da177e4SLinus Torvalds 
3266881a229SAlex Elder 	/*
3276881a229SAlex Elder 	 * Greedily allocate a buffer big enough to handle the full
3286881a229SAlex Elder 	 * range of basic blocks we'll be examining.  If that fails,
3296881a229SAlex Elder 	 * try a smaller size.  We need to be able to read at least
3306881a229SAlex Elder 	 * a log sector, or we're out of luck.
3316881a229SAlex Elder 	 */
3328b010acbSWang Jianchao 	bufblks = roundup_pow_of_two(nbblks);
33381158e0cSDave Chinner 	while (bufblks > log->l_logBBsize)
33481158e0cSDave Chinner 		bufblks >>= 1;
3356e9b3dd8SChristoph Hellwig 	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
3361da177e4SLinus Torvalds 		bufblks >>= 1;
33769ce58f0SAlex Elder 		if (bufblks < log->l_sectBBsize)
3382451337dSDave Chinner 			return -ENOMEM;
3391da177e4SLinus Torvalds 	}
3401da177e4SLinus Torvalds 
3411da177e4SLinus Torvalds 	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
3421da177e4SLinus Torvalds 		int	bcount;
3431da177e4SLinus Torvalds 
3441da177e4SLinus Torvalds 		bcount = min(bufblks, (start_blk + nbblks - i));
3451da177e4SLinus Torvalds 
3466e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, i, bcount, buffer, &buf);
347076e6acbSChristoph Hellwig 		if (error)
3481da177e4SLinus Torvalds 			goto out;
3491da177e4SLinus Torvalds 
3501da177e4SLinus Torvalds 		for (j = 0; j < bcount; j++) {
35103bea6feSChristoph Hellwig 			cycle = xlog_get_cycle(buf);
3521da177e4SLinus Torvalds 			if (cycle == stop_on_cycle_no) {
3531da177e4SLinus Torvalds 				*new_blk = i+j;
3541da177e4SLinus Torvalds 				goto out;
3551da177e4SLinus Torvalds 			}
3561da177e4SLinus Torvalds 
3571da177e4SLinus Torvalds 			buf += BBSIZE;
3581da177e4SLinus Torvalds 		}
3591da177e4SLinus Torvalds 	}
3601da177e4SLinus Torvalds 
3611da177e4SLinus Torvalds 	*new_blk = -1;
3621da177e4SLinus Torvalds 
3631da177e4SLinus Torvalds out:
3646e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
3651da177e4SLinus Torvalds 	return error;
3661da177e4SLinus Torvalds }
3671da177e4SLinus Torvalds 
3680c771b99SGao Xiang static inline int
xlog_logrec_hblks(struct xlog * log,struct xlog_rec_header * rh)3690c771b99SGao Xiang xlog_logrec_hblks(struct xlog *log, struct xlog_rec_header *rh)
3700c771b99SGao Xiang {
37138c26bfdSDave Chinner 	if (xfs_has_logv2(log->l_mp)) {
3720c771b99SGao Xiang 		int	h_size = be32_to_cpu(rh->h_size);
3730c771b99SGao Xiang 
3740c771b99SGao Xiang 		if ((be32_to_cpu(rh->h_version) & XLOG_VERSION_2) &&
3750c771b99SGao Xiang 		    h_size > XLOG_HEADER_CYCLE_SIZE)
3760c771b99SGao Xiang 			return DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
3770c771b99SGao Xiang 	}
3780c771b99SGao Xiang 	return 1;
3790c771b99SGao Xiang }
3800c771b99SGao Xiang 
3811da177e4SLinus Torvalds /*
3821da177e4SLinus Torvalds  * Potentially backup over partial log record write.
3831da177e4SLinus Torvalds  *
3841da177e4SLinus Torvalds  * In the typical case, last_blk is the number of the block directly after
3851da177e4SLinus Torvalds  * a good log record.  Therefore, we subtract one to get the block number
3861da177e4SLinus Torvalds  * of the last block in the given buffer.  extra_bblks contains the number
3871da177e4SLinus Torvalds  * of blocks we would have read on a previous read.  This happens when the
3881da177e4SLinus Torvalds  * last log record is split over the end of the physical log.
3891da177e4SLinus Torvalds  *
3901da177e4SLinus Torvalds  * extra_bblks is the number of blocks potentially verified on a previous
3911da177e4SLinus Torvalds  * call to this routine.
3921da177e4SLinus Torvalds  */
3931da177e4SLinus Torvalds STATIC int
xlog_find_verify_log_record(struct xlog * log,xfs_daddr_t start_blk,xfs_daddr_t * last_blk,int extra_bblks)3941da177e4SLinus Torvalds xlog_find_verify_log_record(
3959a8d2fdbSMark Tinguely 	struct xlog		*log,
3961da177e4SLinus Torvalds 	xfs_daddr_t		start_blk,
3971da177e4SLinus Torvalds 	xfs_daddr_t		*last_blk,
3981da177e4SLinus Torvalds 	int			extra_bblks)
3991da177e4SLinus Torvalds {
4001da177e4SLinus Torvalds 	xfs_daddr_t		i;
4016e9b3dd8SChristoph Hellwig 	char			*buffer;
402b2a922cdSChristoph Hellwig 	char			*offset = NULL;
4031da177e4SLinus Torvalds 	xlog_rec_header_t	*head = NULL;
4041da177e4SLinus Torvalds 	int			error = 0;
4051da177e4SLinus Torvalds 	int			smallmem = 0;
4061da177e4SLinus Torvalds 	int			num_blks = *last_blk - start_blk;
4071da177e4SLinus Torvalds 	int			xhdrs;
4081da177e4SLinus Torvalds 
4091da177e4SLinus Torvalds 	ASSERT(start_blk != 0 || *last_blk != start_blk);
4101da177e4SLinus Torvalds 
4116e9b3dd8SChristoph Hellwig 	buffer = xlog_alloc_buffer(log, num_blks);
4126e9b3dd8SChristoph Hellwig 	if (!buffer) {
4136e9b3dd8SChristoph Hellwig 		buffer = xlog_alloc_buffer(log, 1);
4146e9b3dd8SChristoph Hellwig 		if (!buffer)
4152451337dSDave Chinner 			return -ENOMEM;
4161da177e4SLinus Torvalds 		smallmem = 1;
4171da177e4SLinus Torvalds 	} else {
4186e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, start_blk, num_blks, buffer, &offset);
419076e6acbSChristoph Hellwig 		if (error)
4201da177e4SLinus Torvalds 			goto out;
4211da177e4SLinus Torvalds 		offset += ((num_blks - 1) << BBSHIFT);
4221da177e4SLinus Torvalds 	}
4231da177e4SLinus Torvalds 
4241da177e4SLinus Torvalds 	for (i = (*last_blk) - 1; i >= 0; i--) {
4251da177e4SLinus Torvalds 		if (i < start_blk) {
4261da177e4SLinus Torvalds 			/* valid log record not found */
427a0fa2b67SDave Chinner 			xfs_warn(log->l_mp,
428a0fa2b67SDave Chinner 		"Log inconsistent (didn't find previous header)");
4291da177e4SLinus Torvalds 			ASSERT(0);
430895e196fSDarrick J. Wong 			error = -EFSCORRUPTED;
4311da177e4SLinus Torvalds 			goto out;
4321da177e4SLinus Torvalds 		}
4331da177e4SLinus Torvalds 
4341da177e4SLinus Torvalds 		if (smallmem) {
4356e9b3dd8SChristoph Hellwig 			error = xlog_bread(log, i, 1, buffer, &offset);
436076e6acbSChristoph Hellwig 			if (error)
4371da177e4SLinus Torvalds 				goto out;
4381da177e4SLinus Torvalds 		}
4391da177e4SLinus Torvalds 
4401da177e4SLinus Torvalds 		head = (xlog_rec_header_t *)offset;
4411da177e4SLinus Torvalds 
44269ef921bSChristoph Hellwig 		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
4431da177e4SLinus Torvalds 			break;
4441da177e4SLinus Torvalds 
4451da177e4SLinus Torvalds 		if (!smallmem)
4461da177e4SLinus Torvalds 			offset -= BBSIZE;
4471da177e4SLinus Torvalds 	}
4481da177e4SLinus Torvalds 
4491da177e4SLinus Torvalds 	/*
4501da177e4SLinus Torvalds 	 * We hit the beginning of the physical log & still no header.  Return
4511da177e4SLinus Torvalds 	 * to caller.  If caller can handle a return of -1, then this routine
4521da177e4SLinus Torvalds 	 * will be called again for the end of the physical log.
4531da177e4SLinus Torvalds 	 */
4541da177e4SLinus Torvalds 	if (i == -1) {
4552451337dSDave Chinner 		error = 1;
4561da177e4SLinus Torvalds 		goto out;
4571da177e4SLinus Torvalds 	}
4581da177e4SLinus Torvalds 
4591da177e4SLinus Torvalds 	/*
4601da177e4SLinus Torvalds 	 * We have the final block of the good log (the first block
4611da177e4SLinus Torvalds 	 * of the log record _before_ the head. So we check the uuid.
4621da177e4SLinus Torvalds 	 */
4631da177e4SLinus Torvalds 	if ((error = xlog_header_check_mount(log->l_mp, head)))
4641da177e4SLinus Torvalds 		goto out;
4651da177e4SLinus Torvalds 
4661da177e4SLinus Torvalds 	/*
4671da177e4SLinus Torvalds 	 * We may have found a log record header before we expected one.
4681da177e4SLinus Torvalds 	 * last_blk will be the 1st block # with a given cycle #.  We may end
4691da177e4SLinus Torvalds 	 * up reading an entire log record.  In this case, we don't want to
4701da177e4SLinus Torvalds 	 * reset last_blk.  Only when last_blk points in the middle of a log
4711da177e4SLinus Torvalds 	 * record do we update last_blk.
4721da177e4SLinus Torvalds 	 */
4730c771b99SGao Xiang 	xhdrs = xlog_logrec_hblks(log, head);
4741da177e4SLinus Torvalds 
475b53e675dSChristoph Hellwig 	if (*last_blk - i + extra_bblks !=
476b53e675dSChristoph Hellwig 	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
4771da177e4SLinus Torvalds 		*last_blk = i;
4781da177e4SLinus Torvalds 
4791da177e4SLinus Torvalds out:
4806e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
4811da177e4SLinus Torvalds 	return error;
4821da177e4SLinus Torvalds }
4831da177e4SLinus Torvalds 
4841da177e4SLinus Torvalds /*
4851da177e4SLinus Torvalds  * Head is defined to be the point of the log where the next log write
4860a94da24SZhi Yong Wu  * could go.  This means that incomplete LR writes at the end are
4871da177e4SLinus Torvalds  * eliminated when calculating the head.  We aren't guaranteed that previous
4881da177e4SLinus Torvalds  * LR have complete transactions.  We only know that a cycle number of
4891da177e4SLinus Torvalds  * current cycle number -1 won't be present in the log if we start writing
4901da177e4SLinus Torvalds  * from our current block number.
4911da177e4SLinus Torvalds  *
4921da177e4SLinus Torvalds  * last_blk contains the block number of the first block with a given
4931da177e4SLinus Torvalds  * cycle number.
4941da177e4SLinus Torvalds  *
4951da177e4SLinus Torvalds  * Return: zero if normal, non-zero if error.
4961da177e4SLinus Torvalds  */
497ba0f32d4SChristoph Hellwig STATIC int
xlog_find_head(struct xlog * log,xfs_daddr_t * return_head_blk)4981da177e4SLinus Torvalds xlog_find_head(
4999a8d2fdbSMark Tinguely 	struct xlog	*log,
5001da177e4SLinus Torvalds 	xfs_daddr_t	*return_head_blk)
5011da177e4SLinus Torvalds {
5026e9b3dd8SChristoph Hellwig 	char		*buffer;
503b2a922cdSChristoph Hellwig 	char		*offset;
5041da177e4SLinus Torvalds 	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
5051da177e4SLinus Torvalds 	int		num_scan_bblks;
5061da177e4SLinus Torvalds 	uint		first_half_cycle, last_half_cycle;
5071da177e4SLinus Torvalds 	uint		stop_on_cycle;
5081da177e4SLinus Torvalds 	int		error, log_bbnum = log->l_logBBsize;
5091da177e4SLinus Torvalds 
5101da177e4SLinus Torvalds 	/* Is the end of the log device zeroed? */
5112451337dSDave Chinner 	error = xlog_find_zeroed(log, &first_blk);
5122451337dSDave Chinner 	if (error < 0) {
5132451337dSDave Chinner 		xfs_warn(log->l_mp, "empty log check failed");
5142451337dSDave Chinner 		return error;
5152451337dSDave Chinner 	}
5162451337dSDave Chinner 	if (error == 1) {
5171da177e4SLinus Torvalds 		*return_head_blk = first_blk;
5181da177e4SLinus Torvalds 
5191da177e4SLinus Torvalds 		/* Is the whole lot zeroed? */
5201da177e4SLinus Torvalds 		if (!first_blk) {
5211da177e4SLinus Torvalds 			/* Linux XFS shouldn't generate totally zeroed logs -
5221da177e4SLinus Torvalds 			 * mkfs etc write a dummy unmount record to a fresh
5231da177e4SLinus Torvalds 			 * log so we can store the uuid in there
5241da177e4SLinus Torvalds 			 */
525a0fa2b67SDave Chinner 			xfs_warn(log->l_mp, "totally zeroed log");
5261da177e4SLinus Torvalds 		}
5271da177e4SLinus Torvalds 
5281da177e4SLinus Torvalds 		return 0;
5291da177e4SLinus Torvalds 	}
5301da177e4SLinus Torvalds 
5311da177e4SLinus Torvalds 	first_blk = 0;			/* get cycle # of 1st block */
5326e9b3dd8SChristoph Hellwig 	buffer = xlog_alloc_buffer(log, 1);
5336e9b3dd8SChristoph Hellwig 	if (!buffer)
5342451337dSDave Chinner 		return -ENOMEM;
535076e6acbSChristoph Hellwig 
5366e9b3dd8SChristoph Hellwig 	error = xlog_bread(log, 0, 1, buffer, &offset);
537076e6acbSChristoph Hellwig 	if (error)
5386e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
539076e6acbSChristoph Hellwig 
54003bea6feSChristoph Hellwig 	first_half_cycle = xlog_get_cycle(offset);
5411da177e4SLinus Torvalds 
5421da177e4SLinus Torvalds 	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
5436e9b3dd8SChristoph Hellwig 	error = xlog_bread(log, last_blk, 1, buffer, &offset);
544076e6acbSChristoph Hellwig 	if (error)
5456e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
546076e6acbSChristoph Hellwig 
54703bea6feSChristoph Hellwig 	last_half_cycle = xlog_get_cycle(offset);
5481da177e4SLinus Torvalds 	ASSERT(last_half_cycle != 0);
5491da177e4SLinus Torvalds 
5501da177e4SLinus Torvalds 	/*
5511da177e4SLinus Torvalds 	 * If the 1st half cycle number is equal to the last half cycle number,
5521da177e4SLinus Torvalds 	 * then the entire log is stamped with the same cycle number.  In this
5531da177e4SLinus Torvalds 	 * case, head_blk can't be set to zero (which makes sense).  The below
5541da177e4SLinus Torvalds 	 * math doesn't work out properly with head_blk equal to zero.  Instead,
5551da177e4SLinus Torvalds 	 * we set it to log_bbnum which is an invalid block number, but this
5561da177e4SLinus Torvalds 	 * value makes the math correct.  If head_blk doesn't changed through
5571da177e4SLinus Torvalds 	 * all the tests below, *head_blk is set to zero at the very end rather
5581da177e4SLinus Torvalds 	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
5591da177e4SLinus Torvalds 	 * in a circular file.
5601da177e4SLinus Torvalds 	 */
5611da177e4SLinus Torvalds 	if (first_half_cycle == last_half_cycle) {
5621da177e4SLinus Torvalds 		/*
5631da177e4SLinus Torvalds 		 * In this case we believe that the entire log should have
5641da177e4SLinus Torvalds 		 * cycle number last_half_cycle.  We need to scan backwards
5651da177e4SLinus Torvalds 		 * from the end verifying that there are no holes still
5661da177e4SLinus Torvalds 		 * containing last_half_cycle - 1.  If we find such a hole,
5671da177e4SLinus Torvalds 		 * then the start of that hole will be the new head.  The
5681da177e4SLinus Torvalds 		 * simple case looks like
5691da177e4SLinus Torvalds 		 *        x | x ... | x - 1 | x
5701da177e4SLinus Torvalds 		 * Another case that fits this picture would be
5711da177e4SLinus Torvalds 		 *        x | x + 1 | x ... | x
572c41564b5SNathan Scott 		 * In this case the head really is somewhere at the end of the
5731da177e4SLinus Torvalds 		 * log, as one of the latest writes at the beginning was
5741da177e4SLinus Torvalds 		 * incomplete.
5751da177e4SLinus Torvalds 		 * One more case is
5761da177e4SLinus Torvalds 		 *        x | x + 1 | x ... | x - 1 | x
5771da177e4SLinus Torvalds 		 * This is really the combination of the above two cases, and
5781da177e4SLinus Torvalds 		 * the head has to end up at the start of the x-1 hole at the
5791da177e4SLinus Torvalds 		 * end of the log.
5801da177e4SLinus Torvalds 		 *
5811da177e4SLinus Torvalds 		 * In the 256k log case, we will read from the beginning to the
5821da177e4SLinus Torvalds 		 * end of the log and search for cycle numbers equal to x-1.
5831da177e4SLinus Torvalds 		 * We don't worry about the x+1 blocks that we encounter,
5841da177e4SLinus Torvalds 		 * because we know that they cannot be the head since the log
5851da177e4SLinus Torvalds 		 * started with x.
5861da177e4SLinus Torvalds 		 */
5871da177e4SLinus Torvalds 		head_blk = log_bbnum;
5881da177e4SLinus Torvalds 		stop_on_cycle = last_half_cycle - 1;
5891da177e4SLinus Torvalds 	} else {
5901da177e4SLinus Torvalds 		/*
5911da177e4SLinus Torvalds 		 * In this case we want to find the first block with cycle
5921da177e4SLinus Torvalds 		 * number matching last_half_cycle.  We expect the log to be
5931da177e4SLinus Torvalds 		 * some variation on
5943f943d85SAlex Elder 		 *        x + 1 ... | x ... | x
5951da177e4SLinus Torvalds 		 * The first block with cycle number x (last_half_cycle) will
5961da177e4SLinus Torvalds 		 * be where the new head belongs.  First we do a binary search
5971da177e4SLinus Torvalds 		 * for the first occurrence of last_half_cycle.  The binary
5981da177e4SLinus Torvalds 		 * search may not be totally accurate, so then we scan back
5991da177e4SLinus Torvalds 		 * from there looking for occurrences of last_half_cycle before
6001da177e4SLinus Torvalds 		 * us.  If that backwards scan wraps around the beginning of
6011da177e4SLinus Torvalds 		 * the log, then we look for occurrences of last_half_cycle - 1
6021da177e4SLinus Torvalds 		 * at the end of the log.  The cases we're looking for look
6031da177e4SLinus Torvalds 		 * like
6043f943d85SAlex Elder 		 *                               v binary search stopped here
6053f943d85SAlex Elder 		 *        x + 1 ... | x | x + 1 | x ... | x
6063f943d85SAlex Elder 		 *                   ^ but we want to locate this spot
6071da177e4SLinus Torvalds 		 * or
6081da177e4SLinus Torvalds 		 *        <---------> less than scan distance
6093f943d85SAlex Elder 		 *        x + 1 ... | x ... | x - 1 | x
6103f943d85SAlex Elder 		 *                           ^ we want to locate this spot
6111da177e4SLinus Torvalds 		 */
6121da177e4SLinus Torvalds 		stop_on_cycle = last_half_cycle;
6136e9b3dd8SChristoph Hellwig 		error = xlog_find_cycle_start(log, buffer, first_blk, &head_blk,
6146e9b3dd8SChristoph Hellwig 				last_half_cycle);
6156e9b3dd8SChristoph Hellwig 		if (error)
6166e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
6171da177e4SLinus Torvalds 	}
6181da177e4SLinus Torvalds 
6191da177e4SLinus Torvalds 	/*
6201da177e4SLinus Torvalds 	 * Now validate the answer.  Scan back some number of maximum possible
6211da177e4SLinus Torvalds 	 * blocks and make sure each one has the expected cycle number.  The
6221da177e4SLinus Torvalds 	 * maximum is determined by the total possible amount of buffering
6231da177e4SLinus Torvalds 	 * in the in-core log.  The following number can be made tighter if
6241da177e4SLinus Torvalds 	 * we actually look at the block size of the filesystem.
6251da177e4SLinus Torvalds 	 */
6269f2a4505SBrian Foster 	num_scan_bblks = min_t(int, log_bbnum, XLOG_TOTAL_REC_SHIFT(log));
6271da177e4SLinus Torvalds 	if (head_blk >= num_scan_bblks) {
6281da177e4SLinus Torvalds 		/*
6291da177e4SLinus Torvalds 		 * We are guaranteed that the entire check can be performed
6301da177e4SLinus Torvalds 		 * in one buffer.
6311da177e4SLinus Torvalds 		 */
6321da177e4SLinus Torvalds 		start_blk = head_blk - num_scan_bblks;
6331da177e4SLinus Torvalds 		if ((error = xlog_find_verify_cycle(log,
6341da177e4SLinus Torvalds 						start_blk, num_scan_bblks,
6351da177e4SLinus Torvalds 						stop_on_cycle, &new_blk)))
6366e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
6371da177e4SLinus Torvalds 		if (new_blk != -1)
6381da177e4SLinus Torvalds 			head_blk = new_blk;
6391da177e4SLinus Torvalds 	} else {		/* need to read 2 parts of log */
6401da177e4SLinus Torvalds 		/*
6411da177e4SLinus Torvalds 		 * We are going to scan backwards in the log in two parts.
6421da177e4SLinus Torvalds 		 * First we scan the physical end of the log.  In this part
6431da177e4SLinus Torvalds 		 * of the log, we are looking for blocks with cycle number
6441da177e4SLinus Torvalds 		 * last_half_cycle - 1.
6451da177e4SLinus Torvalds 		 * If we find one, then we know that the log starts there, as
6461da177e4SLinus Torvalds 		 * we've found a hole that didn't get written in going around
6471da177e4SLinus Torvalds 		 * the end of the physical log.  The simple case for this is
6481da177e4SLinus Torvalds 		 *        x + 1 ... | x ... | x - 1 | x
6491da177e4SLinus Torvalds 		 *        <---------> less than scan distance
6501da177e4SLinus Torvalds 		 * If all of the blocks at the end of the log have cycle number
6511da177e4SLinus Torvalds 		 * last_half_cycle, then we check the blocks at the start of
6521da177e4SLinus Torvalds 		 * the log looking for occurrences of last_half_cycle.  If we
6531da177e4SLinus Torvalds 		 * find one, then our current estimate for the location of the
6541da177e4SLinus Torvalds 		 * first occurrence of last_half_cycle is wrong and we move
6551da177e4SLinus Torvalds 		 * back to the hole we've found.  This case looks like
6561da177e4SLinus Torvalds 		 *        x + 1 ... | x | x + 1 | x ...
6571da177e4SLinus Torvalds 		 *                               ^ binary search stopped here
6581da177e4SLinus Torvalds 		 * Another case we need to handle that only occurs in 256k
6591da177e4SLinus Torvalds 		 * logs is
6601da177e4SLinus Torvalds 		 *        x + 1 ... | x ... | x+1 | x ...
6611da177e4SLinus Torvalds 		 *                   ^ binary search stops here
6621da177e4SLinus Torvalds 		 * In a 256k log, the scan at the end of the log will see the
6631da177e4SLinus Torvalds 		 * x + 1 blocks.  We need to skip past those since that is
6641da177e4SLinus Torvalds 		 * certainly not the head of the log.  By searching for
6651da177e4SLinus Torvalds 		 * last_half_cycle-1 we accomplish that.
6661da177e4SLinus Torvalds 		 */
6671da177e4SLinus Torvalds 		ASSERT(head_blk <= INT_MAX &&
6683f943d85SAlex Elder 			(xfs_daddr_t) num_scan_bblks >= head_blk);
6693f943d85SAlex Elder 		start_blk = log_bbnum - (num_scan_bblks - head_blk);
6701da177e4SLinus Torvalds 		if ((error = xlog_find_verify_cycle(log, start_blk,
6711da177e4SLinus Torvalds 					num_scan_bblks - (int)head_blk,
6721da177e4SLinus Torvalds 					(stop_on_cycle - 1), &new_blk)))
6736e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
6741da177e4SLinus Torvalds 		if (new_blk != -1) {
6751da177e4SLinus Torvalds 			head_blk = new_blk;
6769db127edSAlex Elder 			goto validate_head;
6771da177e4SLinus Torvalds 		}
6781da177e4SLinus Torvalds 
6791da177e4SLinus Torvalds 		/*
6801da177e4SLinus Torvalds 		 * Scan beginning of log now.  The last part of the physical
6811da177e4SLinus Torvalds 		 * log is good.  This scan needs to verify that it doesn't find
6821da177e4SLinus Torvalds 		 * the last_half_cycle.
6831da177e4SLinus Torvalds 		 */
6841da177e4SLinus Torvalds 		start_blk = 0;
6851da177e4SLinus Torvalds 		ASSERT(head_blk <= INT_MAX);
6861da177e4SLinus Torvalds 		if ((error = xlog_find_verify_cycle(log,
6871da177e4SLinus Torvalds 					start_blk, (int)head_blk,
6881da177e4SLinus Torvalds 					stop_on_cycle, &new_blk)))
6896e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
6901da177e4SLinus Torvalds 		if (new_blk != -1)
6911da177e4SLinus Torvalds 			head_blk = new_blk;
6921da177e4SLinus Torvalds 	}
6931da177e4SLinus Torvalds 
6949db127edSAlex Elder validate_head:
6951da177e4SLinus Torvalds 	/*
6961da177e4SLinus Torvalds 	 * Now we need to make sure head_blk is not pointing to a block in
6971da177e4SLinus Torvalds 	 * the middle of a log record.
6981da177e4SLinus Torvalds 	 */
6991da177e4SLinus Torvalds 	num_scan_bblks = XLOG_REC_SHIFT(log);
7001da177e4SLinus Torvalds 	if (head_blk >= num_scan_bblks) {
7011da177e4SLinus Torvalds 		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
7021da177e4SLinus Torvalds 
7031da177e4SLinus Torvalds 		/* start ptr at last block ptr before head_blk */
7042451337dSDave Chinner 		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
7052451337dSDave Chinner 		if (error == 1)
7062451337dSDave Chinner 			error = -EIO;
7072451337dSDave Chinner 		if (error)
7086e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
7091da177e4SLinus Torvalds 	} else {
7101da177e4SLinus Torvalds 		start_blk = 0;
7111da177e4SLinus Torvalds 		ASSERT(head_blk <= INT_MAX);
7122451337dSDave Chinner 		error = xlog_find_verify_log_record(log, start_blk, &head_blk, 0);
7132451337dSDave Chinner 		if (error < 0)
7146e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
7152451337dSDave Chinner 		if (error == 1) {
7161da177e4SLinus Torvalds 			/* We hit the beginning of the log during our search */
7173f943d85SAlex Elder 			start_blk = log_bbnum - (num_scan_bblks - head_blk);
7181da177e4SLinus Torvalds 			new_blk = log_bbnum;
7191da177e4SLinus Torvalds 			ASSERT(start_blk <= INT_MAX &&
7201da177e4SLinus Torvalds 				(xfs_daddr_t) log_bbnum-start_blk >= 0);
7211da177e4SLinus Torvalds 			ASSERT(head_blk <= INT_MAX);
7222451337dSDave Chinner 			error = xlog_find_verify_log_record(log, start_blk,
7232451337dSDave Chinner 							&new_blk, (int)head_blk);
7242451337dSDave Chinner 			if (error == 1)
7252451337dSDave Chinner 				error = -EIO;
7262451337dSDave Chinner 			if (error)
7276e9b3dd8SChristoph Hellwig 				goto out_free_buffer;
7281da177e4SLinus Torvalds 			if (new_blk != log_bbnum)
7291da177e4SLinus Torvalds 				head_blk = new_blk;
7301da177e4SLinus Torvalds 		} else if (error)
7316e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
7321da177e4SLinus Torvalds 	}
7331da177e4SLinus Torvalds 
7346e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
7351da177e4SLinus Torvalds 	if (head_blk == log_bbnum)
7361da177e4SLinus Torvalds 		*return_head_blk = 0;
7371da177e4SLinus Torvalds 	else
7381da177e4SLinus Torvalds 		*return_head_blk = head_blk;
7391da177e4SLinus Torvalds 	/*
7401da177e4SLinus Torvalds 	 * When returning here, we have a good block number.  Bad block
7411da177e4SLinus Torvalds 	 * means that during a previous crash, we didn't have a clean break
7421da177e4SLinus Torvalds 	 * from cycle number N to cycle number N-1.  In this case, we need
7431da177e4SLinus Torvalds 	 * to find the first block with cycle number N-1.
7441da177e4SLinus Torvalds 	 */
7451da177e4SLinus Torvalds 	return 0;
7461da177e4SLinus Torvalds 
7476e9b3dd8SChristoph Hellwig out_free_buffer:
7486e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
7491da177e4SLinus Torvalds 	if (error)
750a0fa2b67SDave Chinner 		xfs_warn(log->l_mp, "failed to find log head");
7511da177e4SLinus Torvalds 	return error;
7521da177e4SLinus Torvalds }
7531da177e4SLinus Torvalds 
7541da177e4SLinus Torvalds /*
755eed6b462SBrian Foster  * Seek backwards in the log for log record headers.
756eed6b462SBrian Foster  *
757eed6b462SBrian Foster  * Given a starting log block, walk backwards until we find the provided number
758eed6b462SBrian Foster  * of records or hit the provided tail block. The return value is the number of
759eed6b462SBrian Foster  * records encountered or a negative error code. The log block and buffer
760eed6b462SBrian Foster  * pointer of the last record seen are returned in rblk and rhead respectively.
761eed6b462SBrian Foster  */
762eed6b462SBrian Foster STATIC int
xlog_rseek_logrec_hdr(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk,int count,char * buffer,xfs_daddr_t * rblk,struct xlog_rec_header ** rhead,bool * wrapped)763eed6b462SBrian Foster xlog_rseek_logrec_hdr(
764eed6b462SBrian Foster 	struct xlog		*log,
765eed6b462SBrian Foster 	xfs_daddr_t		head_blk,
766eed6b462SBrian Foster 	xfs_daddr_t		tail_blk,
767eed6b462SBrian Foster 	int			count,
7686e9b3dd8SChristoph Hellwig 	char			*buffer,
769eed6b462SBrian Foster 	xfs_daddr_t		*rblk,
770eed6b462SBrian Foster 	struct xlog_rec_header	**rhead,
771eed6b462SBrian Foster 	bool			*wrapped)
772eed6b462SBrian Foster {
773eed6b462SBrian Foster 	int			i;
774eed6b462SBrian Foster 	int			error;
775eed6b462SBrian Foster 	int			found = 0;
776eed6b462SBrian Foster 	char			*offset = NULL;
777eed6b462SBrian Foster 	xfs_daddr_t		end_blk;
778eed6b462SBrian Foster 
779eed6b462SBrian Foster 	*wrapped = false;
780eed6b462SBrian Foster 
781eed6b462SBrian Foster 	/*
782eed6b462SBrian Foster 	 * Walk backwards from the head block until we hit the tail or the first
783eed6b462SBrian Foster 	 * block in the log.
784eed6b462SBrian Foster 	 */
785eed6b462SBrian Foster 	end_blk = head_blk > tail_blk ? tail_blk : 0;
786eed6b462SBrian Foster 	for (i = (int) head_blk - 1; i >= end_blk; i--) {
7876e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, i, 1, buffer, &offset);
788eed6b462SBrian Foster 		if (error)
789eed6b462SBrian Foster 			goto out_error;
790eed6b462SBrian Foster 
791eed6b462SBrian Foster 		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
792eed6b462SBrian Foster 			*rblk = i;
793eed6b462SBrian Foster 			*rhead = (struct xlog_rec_header *) offset;
794eed6b462SBrian Foster 			if (++found == count)
795eed6b462SBrian Foster 				break;
796eed6b462SBrian Foster 		}
797eed6b462SBrian Foster 	}
798eed6b462SBrian Foster 
799eed6b462SBrian Foster 	/*
800eed6b462SBrian Foster 	 * If we haven't hit the tail block or the log record header count,
801eed6b462SBrian Foster 	 * start looking again from the end of the physical log. Note that
802eed6b462SBrian Foster 	 * callers can pass head == tail if the tail is not yet known.
803eed6b462SBrian Foster 	 */
804eed6b462SBrian Foster 	if (tail_blk >= head_blk && found != count) {
805eed6b462SBrian Foster 		for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
8066e9b3dd8SChristoph Hellwig 			error = xlog_bread(log, i, 1, buffer, &offset);
807eed6b462SBrian Foster 			if (error)
808eed6b462SBrian Foster 				goto out_error;
809eed6b462SBrian Foster 
810eed6b462SBrian Foster 			if (*(__be32 *)offset ==
811eed6b462SBrian Foster 			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
812eed6b462SBrian Foster 				*wrapped = true;
813eed6b462SBrian Foster 				*rblk = i;
814eed6b462SBrian Foster 				*rhead = (struct xlog_rec_header *) offset;
815eed6b462SBrian Foster 				if (++found == count)
816eed6b462SBrian Foster 					break;
817eed6b462SBrian Foster 			}
818eed6b462SBrian Foster 		}
819eed6b462SBrian Foster 	}
820eed6b462SBrian Foster 
821eed6b462SBrian Foster 	return found;
822eed6b462SBrian Foster 
823eed6b462SBrian Foster out_error:
824eed6b462SBrian Foster 	return error;
825eed6b462SBrian Foster }
826eed6b462SBrian Foster 
827eed6b462SBrian Foster /*
8287088c413SBrian Foster  * Seek forward in the log for log record headers.
8297088c413SBrian Foster  *
8307088c413SBrian Foster  * Given head and tail blocks, walk forward from the tail block until we find
8317088c413SBrian Foster  * the provided number of records or hit the head block. The return value is the
8327088c413SBrian Foster  * number of records encountered or a negative error code. The log block and
8337088c413SBrian Foster  * buffer pointer of the last record seen are returned in rblk and rhead
8347088c413SBrian Foster  * respectively.
8357088c413SBrian Foster  */
8367088c413SBrian Foster STATIC int
xlog_seek_logrec_hdr(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk,int count,char * buffer,xfs_daddr_t * rblk,struct xlog_rec_header ** rhead,bool * wrapped)8377088c413SBrian Foster xlog_seek_logrec_hdr(
8387088c413SBrian Foster 	struct xlog		*log,
8397088c413SBrian Foster 	xfs_daddr_t		head_blk,
8407088c413SBrian Foster 	xfs_daddr_t		tail_blk,
8417088c413SBrian Foster 	int			count,
8426e9b3dd8SChristoph Hellwig 	char			*buffer,
8437088c413SBrian Foster 	xfs_daddr_t		*rblk,
8447088c413SBrian Foster 	struct xlog_rec_header	**rhead,
8457088c413SBrian Foster 	bool			*wrapped)
8467088c413SBrian Foster {
8477088c413SBrian Foster 	int			i;
8487088c413SBrian Foster 	int			error;
8497088c413SBrian Foster 	int			found = 0;
8507088c413SBrian Foster 	char			*offset = NULL;
8517088c413SBrian Foster 	xfs_daddr_t		end_blk;
8527088c413SBrian Foster 
8537088c413SBrian Foster 	*wrapped = false;
8547088c413SBrian Foster 
8557088c413SBrian Foster 	/*
8567088c413SBrian Foster 	 * Walk forward from the tail block until we hit the head or the last
8577088c413SBrian Foster 	 * block in the log.
8587088c413SBrian Foster 	 */
8597088c413SBrian Foster 	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
8607088c413SBrian Foster 	for (i = (int) tail_blk; i <= end_blk; i++) {
8616e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, i, 1, buffer, &offset);
8627088c413SBrian Foster 		if (error)
8637088c413SBrian Foster 			goto out_error;
8647088c413SBrian Foster 
8657088c413SBrian Foster 		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
8667088c413SBrian Foster 			*rblk = i;
8677088c413SBrian Foster 			*rhead = (struct xlog_rec_header *) offset;
8687088c413SBrian Foster 			if (++found == count)
8697088c413SBrian Foster 				break;
8707088c413SBrian Foster 		}
8717088c413SBrian Foster 	}
8727088c413SBrian Foster 
8737088c413SBrian Foster 	/*
8747088c413SBrian Foster 	 * If we haven't hit the head block or the log record header count,
8757088c413SBrian Foster 	 * start looking again from the start of the physical log.
8767088c413SBrian Foster 	 */
8777088c413SBrian Foster 	if (tail_blk > head_blk && found != count) {
8787088c413SBrian Foster 		for (i = 0; i < (int) head_blk; i++) {
8796e9b3dd8SChristoph Hellwig 			error = xlog_bread(log, i, 1, buffer, &offset);
8807088c413SBrian Foster 			if (error)
8817088c413SBrian Foster 				goto out_error;
8827088c413SBrian Foster 
8837088c413SBrian Foster 			if (*(__be32 *)offset ==
8847088c413SBrian Foster 			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
8857088c413SBrian Foster 				*wrapped = true;
8867088c413SBrian Foster 				*rblk = i;
8877088c413SBrian Foster 				*rhead = (struct xlog_rec_header *) offset;
8887088c413SBrian Foster 				if (++found == count)
8897088c413SBrian Foster 					break;
8907088c413SBrian Foster 			}
8917088c413SBrian Foster 		}
8927088c413SBrian Foster 	}
8937088c413SBrian Foster 
8947088c413SBrian Foster 	return found;
8957088c413SBrian Foster 
8967088c413SBrian Foster out_error:
8977088c413SBrian Foster 	return error;
8987088c413SBrian Foster }
8997088c413SBrian Foster 
9007088c413SBrian Foster /*
9014a4f66eaSBrian Foster  * Calculate distance from head to tail (i.e., unused space in the log).
9024a4f66eaSBrian Foster  */
9034a4f66eaSBrian Foster static inline int
xlog_tail_distance(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk)9044a4f66eaSBrian Foster xlog_tail_distance(
9054a4f66eaSBrian Foster 	struct xlog	*log,
9064a4f66eaSBrian Foster 	xfs_daddr_t	head_blk,
9074a4f66eaSBrian Foster 	xfs_daddr_t	tail_blk)
9084a4f66eaSBrian Foster {
9094a4f66eaSBrian Foster 	if (head_blk < tail_blk)
9104a4f66eaSBrian Foster 		return tail_blk - head_blk;
9114a4f66eaSBrian Foster 
9124a4f66eaSBrian Foster 	return tail_blk + (log->l_logBBsize - head_blk);
9134a4f66eaSBrian Foster }
9144a4f66eaSBrian Foster 
9154a4f66eaSBrian Foster /*
9164a4f66eaSBrian Foster  * Verify the log tail. This is particularly important when torn or incomplete
9174a4f66eaSBrian Foster  * writes have been detected near the front of the log and the head has been
9184a4f66eaSBrian Foster  * walked back accordingly.
9197088c413SBrian Foster  *
9204a4f66eaSBrian Foster  * We also have to handle the case where the tail was pinned and the head
9214a4f66eaSBrian Foster  * blocked behind the tail right before a crash. If the tail had been pushed
9224a4f66eaSBrian Foster  * immediately prior to the crash and the subsequent checkpoint was only
9234a4f66eaSBrian Foster  * partially written, it's possible it overwrote the last referenced tail in the
9244a4f66eaSBrian Foster  * log with garbage. This is not a coherency problem because the tail must have
9254a4f66eaSBrian Foster  * been pushed before it can be overwritten, but appears as log corruption to
9264a4f66eaSBrian Foster  * recovery because we have no way to know the tail was updated if the
9274a4f66eaSBrian Foster  * subsequent checkpoint didn't write successfully.
9284a4f66eaSBrian Foster  *
9294a4f66eaSBrian Foster  * Therefore, CRC check the log from tail to head. If a failure occurs and the
9304a4f66eaSBrian Foster  * offending record is within max iclog bufs from the head, walk the tail
9314a4f66eaSBrian Foster  * forward and retry until a valid tail is found or corruption is detected out
9324a4f66eaSBrian Foster  * of the range of a possible overwrite.
9337088c413SBrian Foster  */
9347088c413SBrian Foster STATIC int
xlog_verify_tail(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t * tail_blk,int hsize)9357088c413SBrian Foster xlog_verify_tail(
9367088c413SBrian Foster 	struct xlog		*log,
9377088c413SBrian Foster 	xfs_daddr_t		head_blk,
9384a4f66eaSBrian Foster 	xfs_daddr_t		*tail_blk,
9394a4f66eaSBrian Foster 	int			hsize)
9407088c413SBrian Foster {
9417088c413SBrian Foster 	struct xlog_rec_header	*thead;
9426e9b3dd8SChristoph Hellwig 	char			*buffer;
9437088c413SBrian Foster 	xfs_daddr_t		first_bad;
9447088c413SBrian Foster 	int			error = 0;
9457088c413SBrian Foster 	bool			wrapped;
9464a4f66eaSBrian Foster 	xfs_daddr_t		tmp_tail;
9474a4f66eaSBrian Foster 	xfs_daddr_t		orig_tail = *tail_blk;
9487088c413SBrian Foster 
9496e9b3dd8SChristoph Hellwig 	buffer = xlog_alloc_buffer(log, 1);
9506e9b3dd8SChristoph Hellwig 	if (!buffer)
9517088c413SBrian Foster 		return -ENOMEM;
9527088c413SBrian Foster 
9537088c413SBrian Foster 	/*
9544a4f66eaSBrian Foster 	 * Make sure the tail points to a record (returns positive count on
9554a4f66eaSBrian Foster 	 * success).
9567088c413SBrian Foster 	 */
9576e9b3dd8SChristoph Hellwig 	error = xlog_seek_logrec_hdr(log, head_blk, *tail_blk, 1, buffer,
9584a4f66eaSBrian Foster 			&tmp_tail, &thead, &wrapped);
9594a4f66eaSBrian Foster 	if (error < 0)
9607088c413SBrian Foster 		goto out;
9614a4f66eaSBrian Foster 	if (*tail_blk != tmp_tail)
9624a4f66eaSBrian Foster 		*tail_blk = tmp_tail;
9634a4f66eaSBrian Foster 
9644a4f66eaSBrian Foster 	/*
9654a4f66eaSBrian Foster 	 * Run a CRC check from the tail to the head. We can't just check
9664a4f66eaSBrian Foster 	 * MAX_ICLOGS records past the tail because the tail may point to stale
9674a4f66eaSBrian Foster 	 * blocks cleared during the search for the head/tail. These blocks are
9684a4f66eaSBrian Foster 	 * overwritten with zero-length records and thus record count is not a
9694a4f66eaSBrian Foster 	 * reliable indicator of the iclog state before a crash.
9704a4f66eaSBrian Foster 	 */
9714a4f66eaSBrian Foster 	first_bad = 0;
9724a4f66eaSBrian Foster 	error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
9734a4f66eaSBrian Foster 				      XLOG_RECOVER_CRCPASS, &first_bad);
974a4c9b34dSBrian Foster 	while ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
9754a4f66eaSBrian Foster 		int	tail_distance;
9764a4f66eaSBrian Foster 
9774a4f66eaSBrian Foster 		/*
9784a4f66eaSBrian Foster 		 * Is corruption within range of the head? If so, retry from
9794a4f66eaSBrian Foster 		 * the next record. Otherwise return an error.
9804a4f66eaSBrian Foster 		 */
9814a4f66eaSBrian Foster 		tail_distance = xlog_tail_distance(log, head_blk, first_bad);
9824a4f66eaSBrian Foster 		if (tail_distance > BTOBB(XLOG_MAX_ICLOGS * hsize))
9834a4f66eaSBrian Foster 			break;
9844a4f66eaSBrian Foster 
9854a4f66eaSBrian Foster 		/* skip to the next record; returns positive count on success */
9866e9b3dd8SChristoph Hellwig 		error = xlog_seek_logrec_hdr(log, head_blk, first_bad, 2,
9876e9b3dd8SChristoph Hellwig 				buffer, &tmp_tail, &thead, &wrapped);
9884a4f66eaSBrian Foster 		if (error < 0)
9894a4f66eaSBrian Foster 			goto out;
9904a4f66eaSBrian Foster 
9914a4f66eaSBrian Foster 		*tail_blk = tmp_tail;
9924a4f66eaSBrian Foster 		first_bad = 0;
9934a4f66eaSBrian Foster 		error = xlog_do_recovery_pass(log, head_blk, *tail_blk,
9944a4f66eaSBrian Foster 					      XLOG_RECOVER_CRCPASS, &first_bad);
9957088c413SBrian Foster 	}
9967088c413SBrian Foster 
9974a4f66eaSBrian Foster 	if (!error && *tail_blk != orig_tail)
9984a4f66eaSBrian Foster 		xfs_warn(log->l_mp,
9994a4f66eaSBrian Foster 		"Tail block (0x%llx) overwrite detected. Updated to 0x%llx",
10004a4f66eaSBrian Foster 			 orig_tail, *tail_blk);
10017088c413SBrian Foster out:
10026e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
10037088c413SBrian Foster 	return error;
10047088c413SBrian Foster }
10057088c413SBrian Foster 
10067088c413SBrian Foster /*
10077088c413SBrian Foster  * Detect and trim torn writes from the head of the log.
10087088c413SBrian Foster  *
10097088c413SBrian Foster  * Storage without sector atomicity guarantees can result in torn writes in the
10107088c413SBrian Foster  * log in the event of a crash. Our only means to detect this scenario is via
10117088c413SBrian Foster  * CRC verification. While we can't always be certain that CRC verification
10127088c413SBrian Foster  * failure is due to a torn write vs. an unrelated corruption, we do know that
10137088c413SBrian Foster  * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
10147088c413SBrian Foster  * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
10157088c413SBrian Foster  * the log and treat failures in this range as torn writes as a matter of
10167088c413SBrian Foster  * policy. In the event of CRC failure, the head is walked back to the last good
10177088c413SBrian Foster  * record in the log and the tail is updated from that record and verified.
10187088c413SBrian Foster  */
10197088c413SBrian Foster STATIC int
xlog_verify_head(struct xlog * log,xfs_daddr_t * head_blk,xfs_daddr_t * tail_blk,char * buffer,xfs_daddr_t * rhead_blk,struct xlog_rec_header ** rhead,bool * wrapped)10207088c413SBrian Foster xlog_verify_head(
10217088c413SBrian Foster 	struct xlog		*log,
10227088c413SBrian Foster 	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
10237088c413SBrian Foster 	xfs_daddr_t		*tail_blk,	/* out: tail block */
10246e9b3dd8SChristoph Hellwig 	char			*buffer,
10257088c413SBrian Foster 	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
10267088c413SBrian Foster 	struct xlog_rec_header	**rhead,	/* ptr to last record */
10277088c413SBrian Foster 	bool			*wrapped)	/* last rec. wraps phys. log */
10287088c413SBrian Foster {
10297088c413SBrian Foster 	struct xlog_rec_header	*tmp_rhead;
10306e9b3dd8SChristoph Hellwig 	char			*tmp_buffer;
10317088c413SBrian Foster 	xfs_daddr_t		first_bad;
10327088c413SBrian Foster 	xfs_daddr_t		tmp_rhead_blk;
10337088c413SBrian Foster 	int			found;
10347088c413SBrian Foster 	int			error;
10357088c413SBrian Foster 	bool			tmp_wrapped;
10367088c413SBrian Foster 
10377088c413SBrian Foster 	/*
103882ff6cc2SBrian Foster 	 * Check the head of the log for torn writes. Search backwards from the
103982ff6cc2SBrian Foster 	 * head until we hit the tail or the maximum number of log record I/Os
104082ff6cc2SBrian Foster 	 * that could have been in flight at one time. Use a temporary buffer so
10416e9b3dd8SChristoph Hellwig 	 * we don't trash the rhead/buffer pointers from the caller.
10427088c413SBrian Foster 	 */
10436e9b3dd8SChristoph Hellwig 	tmp_buffer = xlog_alloc_buffer(log, 1);
10446e9b3dd8SChristoph Hellwig 	if (!tmp_buffer)
10457088c413SBrian Foster 		return -ENOMEM;
10467088c413SBrian Foster 	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
10476e9b3dd8SChristoph Hellwig 				      XLOG_MAX_ICLOGS, tmp_buffer,
10486e9b3dd8SChristoph Hellwig 				      &tmp_rhead_blk, &tmp_rhead, &tmp_wrapped);
10496e9b3dd8SChristoph Hellwig 	kmem_free(tmp_buffer);
10507088c413SBrian Foster 	if (error < 0)
10517088c413SBrian Foster 		return error;
10527088c413SBrian Foster 
10537088c413SBrian Foster 	/*
10547088c413SBrian Foster 	 * Now run a CRC verification pass over the records starting at the
10557088c413SBrian Foster 	 * block found above to the current head. If a CRC failure occurs, the
10567088c413SBrian Foster 	 * log block of the first bad record is saved in first_bad.
10577088c413SBrian Foster 	 */
10587088c413SBrian Foster 	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
10597088c413SBrian Foster 				      XLOG_RECOVER_CRCPASS, &first_bad);
1060a4c9b34dSBrian Foster 	if ((error == -EFSBADCRC || error == -EFSCORRUPTED) && first_bad) {
10617088c413SBrian Foster 		/*
10627088c413SBrian Foster 		 * We've hit a potential torn write. Reset the error and warn
10637088c413SBrian Foster 		 * about it.
10647088c413SBrian Foster 		 */
10657088c413SBrian Foster 		error = 0;
10667088c413SBrian Foster 		xfs_warn(log->l_mp,
10677088c413SBrian Foster "Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
10687088c413SBrian Foster 			 first_bad, *head_blk);
10697088c413SBrian Foster 
10707088c413SBrian Foster 		/*
10717088c413SBrian Foster 		 * Get the header block and buffer pointer for the last good
10727088c413SBrian Foster 		 * record before the bad record.
10737088c413SBrian Foster 		 *
10747088c413SBrian Foster 		 * Note that xlog_find_tail() clears the blocks at the new head
10757088c413SBrian Foster 		 * (i.e., the records with invalid CRC) if the cycle number
1076b63da6c8SRandy Dunlap 		 * matches the current cycle.
10777088c413SBrian Foster 		 */
10786e9b3dd8SChristoph Hellwig 		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1,
10796e9b3dd8SChristoph Hellwig 				buffer, rhead_blk, rhead, wrapped);
10807088c413SBrian Foster 		if (found < 0)
10817088c413SBrian Foster 			return found;
10827088c413SBrian Foster 		if (found == 0)		/* XXX: right thing to do here? */
10837088c413SBrian Foster 			return -EIO;
10847088c413SBrian Foster 
10857088c413SBrian Foster 		/*
10867088c413SBrian Foster 		 * Reset the head block to the starting block of the first bad
10877088c413SBrian Foster 		 * log record and set the tail block based on the last good
10887088c413SBrian Foster 		 * record.
10897088c413SBrian Foster 		 *
10907088c413SBrian Foster 		 * Bail out if the updated head/tail match as this indicates
10917088c413SBrian Foster 		 * possible corruption outside of the acceptable
10927088c413SBrian Foster 		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
10937088c413SBrian Foster 		 */
10947088c413SBrian Foster 		*head_blk = first_bad;
10957088c413SBrian Foster 		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
10967088c413SBrian Foster 		if (*head_blk == *tail_blk) {
10977088c413SBrian Foster 			ASSERT(0);
10987088c413SBrian Foster 			return 0;
10997088c413SBrian Foster 		}
11007088c413SBrian Foster 	}
11015297ac1fSBrian Foster 	if (error)
11027088c413SBrian Foster 		return error;
11035297ac1fSBrian Foster 
11044a4f66eaSBrian Foster 	return xlog_verify_tail(log, *head_blk, tail_blk,
11054a4f66eaSBrian Foster 				be32_to_cpu((*rhead)->h_size));
11067088c413SBrian Foster }
11077088c413SBrian Foster 
11087088c413SBrian Foster /*
11090703a8e1SDave Chinner  * We need to make sure we handle log wrapping properly, so we can't use the
11100703a8e1SDave Chinner  * calculated logbno directly. Make sure it wraps to the correct bno inside the
11110703a8e1SDave Chinner  * log.
11120703a8e1SDave Chinner  *
11130703a8e1SDave Chinner  * The log is limited to 32 bit sizes, so we use the appropriate modulus
11140703a8e1SDave Chinner  * operation here and cast it back to a 64 bit daddr on return.
11150703a8e1SDave Chinner  */
11160703a8e1SDave Chinner static inline xfs_daddr_t
xlog_wrap_logbno(struct xlog * log,xfs_daddr_t bno)11170703a8e1SDave Chinner xlog_wrap_logbno(
11180703a8e1SDave Chinner 	struct xlog		*log,
11190703a8e1SDave Chinner 	xfs_daddr_t		bno)
11200703a8e1SDave Chinner {
11210703a8e1SDave Chinner 	int			mod;
11220703a8e1SDave Chinner 
11230703a8e1SDave Chinner 	div_s64_rem(bno, log->l_logBBsize, &mod);
11240703a8e1SDave Chinner 	return mod;
11250703a8e1SDave Chinner }
11260703a8e1SDave Chinner 
11270703a8e1SDave Chinner /*
112865b99a08SBrian Foster  * Check whether the head of the log points to an unmount record. In other
112965b99a08SBrian Foster  * words, determine whether the log is clean. If so, update the in-core state
113065b99a08SBrian Foster  * appropriately.
113165b99a08SBrian Foster  */
113265b99a08SBrian Foster static int
xlog_check_unmount_rec(struct xlog * log,xfs_daddr_t * head_blk,xfs_daddr_t * tail_blk,struct xlog_rec_header * rhead,xfs_daddr_t rhead_blk,char * buffer,bool * clean)113365b99a08SBrian Foster xlog_check_unmount_rec(
113465b99a08SBrian Foster 	struct xlog		*log,
113565b99a08SBrian Foster 	xfs_daddr_t		*head_blk,
113665b99a08SBrian Foster 	xfs_daddr_t		*tail_blk,
113765b99a08SBrian Foster 	struct xlog_rec_header	*rhead,
113865b99a08SBrian Foster 	xfs_daddr_t		rhead_blk,
11396e9b3dd8SChristoph Hellwig 	char			*buffer,
114065b99a08SBrian Foster 	bool			*clean)
114165b99a08SBrian Foster {
114265b99a08SBrian Foster 	struct xlog_op_header	*op_head;
114365b99a08SBrian Foster 	xfs_daddr_t		umount_data_blk;
114465b99a08SBrian Foster 	xfs_daddr_t		after_umount_blk;
114565b99a08SBrian Foster 	int			hblks;
114665b99a08SBrian Foster 	int			error;
114765b99a08SBrian Foster 	char			*offset;
114865b99a08SBrian Foster 
114965b99a08SBrian Foster 	*clean = false;
115065b99a08SBrian Foster 
115165b99a08SBrian Foster 	/*
115265b99a08SBrian Foster 	 * Look for unmount record. If we find it, then we know there was a
115365b99a08SBrian Foster 	 * clean unmount. Since 'i' could be the last block in the physical
115465b99a08SBrian Foster 	 * log, we convert to a log block before comparing to the head_blk.
115565b99a08SBrian Foster 	 *
115665b99a08SBrian Foster 	 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
115765b99a08SBrian Foster 	 * below. We won't want to clear the unmount record if there is one, so
115865b99a08SBrian Foster 	 * we pass the lsn of the unmount record rather than the block after it.
115965b99a08SBrian Foster 	 */
11600c771b99SGao Xiang 	hblks = xlog_logrec_hblks(log, rhead);
11610703a8e1SDave Chinner 	after_umount_blk = xlog_wrap_logbno(log,
11620703a8e1SDave Chinner 			rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)));
11630703a8e1SDave Chinner 
116465b99a08SBrian Foster 	if (*head_blk == after_umount_blk &&
116565b99a08SBrian Foster 	    be32_to_cpu(rhead->h_num_logops) == 1) {
11660703a8e1SDave Chinner 		umount_data_blk = xlog_wrap_logbno(log, rhead_blk + hblks);
11676e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, umount_data_blk, 1, buffer, &offset);
116865b99a08SBrian Foster 		if (error)
116965b99a08SBrian Foster 			return error;
117065b99a08SBrian Foster 
117165b99a08SBrian Foster 		op_head = (struct xlog_op_header *)offset;
117265b99a08SBrian Foster 		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
117365b99a08SBrian Foster 			/*
117465b99a08SBrian Foster 			 * Set tail and last sync so that newly written log
117565b99a08SBrian Foster 			 * records will point recovery to after the current
117665b99a08SBrian Foster 			 * unmount record.
117765b99a08SBrian Foster 			 */
117865b99a08SBrian Foster 			xlog_assign_atomic_lsn(&log->l_tail_lsn,
117965b99a08SBrian Foster 					log->l_curr_cycle, after_umount_blk);
118065b99a08SBrian Foster 			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
118165b99a08SBrian Foster 					log->l_curr_cycle, after_umount_blk);
118265b99a08SBrian Foster 			*tail_blk = after_umount_blk;
118365b99a08SBrian Foster 
118465b99a08SBrian Foster 			*clean = true;
118565b99a08SBrian Foster 		}
118665b99a08SBrian Foster 	}
118765b99a08SBrian Foster 
118865b99a08SBrian Foster 	return 0;
118965b99a08SBrian Foster }
119065b99a08SBrian Foster 
1191717bc0ebSBrian Foster static void
xlog_set_state(struct xlog * log,xfs_daddr_t head_blk,struct xlog_rec_header * rhead,xfs_daddr_t rhead_blk,bool bump_cycle)1192717bc0ebSBrian Foster xlog_set_state(
1193717bc0ebSBrian Foster 	struct xlog		*log,
1194717bc0ebSBrian Foster 	xfs_daddr_t		head_blk,
1195717bc0ebSBrian Foster 	struct xlog_rec_header	*rhead,
1196717bc0ebSBrian Foster 	xfs_daddr_t		rhead_blk,
1197717bc0ebSBrian Foster 	bool			bump_cycle)
1198717bc0ebSBrian Foster {
1199717bc0ebSBrian Foster 	/*
1200717bc0ebSBrian Foster 	 * Reset log values according to the state of the log when we
1201717bc0ebSBrian Foster 	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
1202717bc0ebSBrian Foster 	 * one because the next write starts a new cycle rather than
1203717bc0ebSBrian Foster 	 * continuing the cycle of the last good log record.  At this
1204717bc0ebSBrian Foster 	 * point we have guaranteed that all partial log records have been
1205717bc0ebSBrian Foster 	 * accounted for.  Therefore, we know that the last good log record
1206717bc0ebSBrian Foster 	 * written was complete and ended exactly on the end boundary
1207717bc0ebSBrian Foster 	 * of the physical log.
1208717bc0ebSBrian Foster 	 */
1209717bc0ebSBrian Foster 	log->l_prev_block = rhead_blk;
1210717bc0ebSBrian Foster 	log->l_curr_block = (int)head_blk;
1211717bc0ebSBrian Foster 	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
1212717bc0ebSBrian Foster 	if (bump_cycle)
1213717bc0ebSBrian Foster 		log->l_curr_cycle++;
1214717bc0ebSBrian Foster 	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
1215717bc0ebSBrian Foster 	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
1216717bc0ebSBrian Foster 	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
1217717bc0ebSBrian Foster 					BBTOB(log->l_curr_block));
1218717bc0ebSBrian Foster 	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
1219717bc0ebSBrian Foster 					BBTOB(log->l_curr_block));
1220717bc0ebSBrian Foster }
1221717bc0ebSBrian Foster 
122265b99a08SBrian Foster /*
12231da177e4SLinus Torvalds  * Find the sync block number or the tail of the log.
12241da177e4SLinus Torvalds  *
12251da177e4SLinus Torvalds  * This will be the block number of the last record to have its
12261da177e4SLinus Torvalds  * associated buffers synced to disk.  Every log record header has
12271da177e4SLinus Torvalds  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
12281da177e4SLinus Torvalds  * to get a sync block number.  The only concern is to figure out which
12291da177e4SLinus Torvalds  * log record header to believe.
12301da177e4SLinus Torvalds  *
12311da177e4SLinus Torvalds  * The following algorithm uses the log record header with the largest
12321da177e4SLinus Torvalds  * lsn.  The entire log record does not need to be valid.  We only care
12331da177e4SLinus Torvalds  * that the header is valid.
12341da177e4SLinus Torvalds  *
12351da177e4SLinus Torvalds  * We could speed up search by using current head_blk buffer, but it is not
12361da177e4SLinus Torvalds  * available.
12371da177e4SLinus Torvalds  */
12385d77c0dcSEric Sandeen STATIC int
xlog_find_tail(struct xlog * log,xfs_daddr_t * head_blk,xfs_daddr_t * tail_blk)12391da177e4SLinus Torvalds xlog_find_tail(
12409a8d2fdbSMark Tinguely 	struct xlog		*log,
12411da177e4SLinus Torvalds 	xfs_daddr_t		*head_blk,
124265be6054SEric Sandeen 	xfs_daddr_t		*tail_blk)
12431da177e4SLinus Torvalds {
12441da177e4SLinus Torvalds 	xlog_rec_header_t	*rhead;
1245b2a922cdSChristoph Hellwig 	char			*offset = NULL;
12466e9b3dd8SChristoph Hellwig 	char			*buffer;
12477088c413SBrian Foster 	int			error;
12487088c413SBrian Foster 	xfs_daddr_t		rhead_blk;
12491da177e4SLinus Torvalds 	xfs_lsn_t		tail_lsn;
1250eed6b462SBrian Foster 	bool			wrapped = false;
125165b99a08SBrian Foster 	bool			clean = false;
12521da177e4SLinus Torvalds 
12531da177e4SLinus Torvalds 	/*
12541da177e4SLinus Torvalds 	 * Find previous log record
12551da177e4SLinus Torvalds 	 */
12561da177e4SLinus Torvalds 	if ((error = xlog_find_head(log, head_blk)))
12571da177e4SLinus Torvalds 		return error;
125882ff6cc2SBrian Foster 	ASSERT(*head_blk < INT_MAX);
12591da177e4SLinus Torvalds 
12606e9b3dd8SChristoph Hellwig 	buffer = xlog_alloc_buffer(log, 1);
12616e9b3dd8SChristoph Hellwig 	if (!buffer)
12622451337dSDave Chinner 		return -ENOMEM;
12631da177e4SLinus Torvalds 	if (*head_blk == 0) {				/* special case */
12646e9b3dd8SChristoph Hellwig 		error = xlog_bread(log, 0, 1, buffer, &offset);
1265076e6acbSChristoph Hellwig 		if (error)
12669db127edSAlex Elder 			goto done;
1267076e6acbSChristoph Hellwig 
126803bea6feSChristoph Hellwig 		if (xlog_get_cycle(offset) == 0) {
12691da177e4SLinus Torvalds 			*tail_blk = 0;
12701da177e4SLinus Torvalds 			/* leave all other log inited values alone */
12719db127edSAlex Elder 			goto done;
12721da177e4SLinus Torvalds 		}
12731da177e4SLinus Torvalds 	}
12741da177e4SLinus Torvalds 
12751da177e4SLinus Torvalds 	/*
127682ff6cc2SBrian Foster 	 * Search backwards through the log looking for the log record header
127782ff6cc2SBrian Foster 	 * block. This wraps all the way back around to the head so something is
127882ff6cc2SBrian Foster 	 * seriously wrong if we can't find it.
12791da177e4SLinus Torvalds 	 */
12806e9b3dd8SChristoph Hellwig 	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, buffer,
128182ff6cc2SBrian Foster 				      &rhead_blk, &rhead, &wrapped);
128282ff6cc2SBrian Foster 	if (error < 0)
1283050552cbSDarrick J. Wong 		goto done;
128482ff6cc2SBrian Foster 	if (!error) {
128582ff6cc2SBrian Foster 		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
1286050552cbSDarrick J. Wong 		error = -EFSCORRUPTED;
1287050552cbSDarrick J. Wong 		goto done;
128882ff6cc2SBrian Foster 	}
128982ff6cc2SBrian Foster 	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
12901da177e4SLinus Torvalds 
12911da177e4SLinus Torvalds 	/*
1292717bc0ebSBrian Foster 	 * Set the log state based on the current head record.
12931da177e4SLinus Torvalds 	 */
1294717bc0ebSBrian Foster 	xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
12951c3cb9ecSDave Chinner 	tail_lsn = atomic64_read(&log->l_tail_lsn);
12961da177e4SLinus Torvalds 
12971da177e4SLinus Torvalds 	/*
129865b99a08SBrian Foster 	 * Look for an unmount record at the head of the log. This sets the log
129965b99a08SBrian Foster 	 * state to determine whether recovery is necessary.
13001da177e4SLinus Torvalds 	 */
130165b99a08SBrian Foster 	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
13026e9b3dd8SChristoph Hellwig 				       rhead_blk, buffer, &clean);
1303076e6acbSChristoph Hellwig 	if (error)
13049db127edSAlex Elder 		goto done;
1305076e6acbSChristoph Hellwig 
13061da177e4SLinus Torvalds 	/*
13077f6aff3aSBrian Foster 	 * Verify the log head if the log is not clean (e.g., we have anything
13087f6aff3aSBrian Foster 	 * but an unmount record at the head). This uses CRC verification to
13097f6aff3aSBrian Foster 	 * detect and trim torn writes. If discovered, CRC failures are
13107f6aff3aSBrian Foster 	 * considered torn writes and the log head is trimmed accordingly.
13117f6aff3aSBrian Foster 	 *
13127f6aff3aSBrian Foster 	 * Note that we can only run CRC verification when the log is dirty
13137f6aff3aSBrian Foster 	 * because there's no guarantee that the log data behind an unmount
13147f6aff3aSBrian Foster 	 * record is compatible with the current architecture.
13151da177e4SLinus Torvalds 	 */
13167f6aff3aSBrian Foster 	if (!clean) {
13177f6aff3aSBrian Foster 		xfs_daddr_t	orig_head = *head_blk;
13187f6aff3aSBrian Foster 
13196e9b3dd8SChristoph Hellwig 		error = xlog_verify_head(log, head_blk, tail_blk, buffer,
13207f6aff3aSBrian Foster 					 &rhead_blk, &rhead, &wrapped);
13217f6aff3aSBrian Foster 		if (error)
13227f6aff3aSBrian Foster 			goto done;
13237f6aff3aSBrian Foster 
13247f6aff3aSBrian Foster 		/* update in-core state again if the head changed */
13257f6aff3aSBrian Foster 		if (*head_blk != orig_head) {
13267f6aff3aSBrian Foster 			xlog_set_state(log, *head_blk, rhead, rhead_blk,
13277f6aff3aSBrian Foster 				       wrapped);
13287f6aff3aSBrian Foster 			tail_lsn = atomic64_read(&log->l_tail_lsn);
13297f6aff3aSBrian Foster 			error = xlog_check_unmount_rec(log, head_blk, tail_blk,
13306e9b3dd8SChristoph Hellwig 						       rhead, rhead_blk, buffer,
13317f6aff3aSBrian Foster 						       &clean);
13327f6aff3aSBrian Foster 			if (error)
13337f6aff3aSBrian Foster 				goto done;
13347f6aff3aSBrian Foster 		}
13357f6aff3aSBrian Foster 	}
133692821e2bSDavid Chinner 
133792821e2bSDavid Chinner 	/*
133865b99a08SBrian Foster 	 * Note that the unmount was clean. If the unmount was not clean, we
133965b99a08SBrian Foster 	 * need to know this to rebuild the superblock counters from the perag
134065b99a08SBrian Foster 	 * headers if we have a filesystem using non-persistent counters.
134192821e2bSDavid Chinner 	 */
134265b99a08SBrian Foster 	if (clean)
13432e973b2cSDave Chinner 		set_bit(XFS_OPSTATE_CLEAN, &log->l_mp->m_opstate);
13441da177e4SLinus Torvalds 
13451da177e4SLinus Torvalds 	/*
13461da177e4SLinus Torvalds 	 * Make sure that there are no blocks in front of the head
13471da177e4SLinus Torvalds 	 * with the same cycle number as the head.  This can happen
13481da177e4SLinus Torvalds 	 * because we allow multiple outstanding log writes concurrently,
13491da177e4SLinus Torvalds 	 * and the later writes might make it out before earlier ones.
13501da177e4SLinus Torvalds 	 *
13511da177e4SLinus Torvalds 	 * We use the lsn from before modifying it so that we'll never
13521da177e4SLinus Torvalds 	 * overwrite the unmount record after a clean unmount.
13531da177e4SLinus Torvalds 	 *
13541da177e4SLinus Torvalds 	 * Do this only if we are going to recover the filesystem
13551da177e4SLinus Torvalds 	 *
13561da177e4SLinus Torvalds 	 * NOTE: This used to say "if (!readonly)"
13571da177e4SLinus Torvalds 	 * However on Linux, we can & do recover a read-only filesystem.
13581da177e4SLinus Torvalds 	 * We only skip recovery if NORECOVERY is specified on mount,
13591da177e4SLinus Torvalds 	 * in which case we would not be here.
13601da177e4SLinus Torvalds 	 *
13611da177e4SLinus Torvalds 	 * But... if the -device- itself is readonly, just skip this.
13621da177e4SLinus Torvalds 	 * We can't recover this device anyway, so it won't matter.
13631da177e4SLinus Torvalds 	 */
13642d15d2c0SChristoph Hellwig 	if (!xfs_readonly_buftarg(log->l_targ))
13651da177e4SLinus Torvalds 		error = xlog_clear_stale_blocks(log, tail_lsn);
13661da177e4SLinus Torvalds 
13679db127edSAlex Elder done:
13686e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
13691da177e4SLinus Torvalds 
13701da177e4SLinus Torvalds 	if (error)
1371a0fa2b67SDave Chinner 		xfs_warn(log->l_mp, "failed to locate log tail");
13721da177e4SLinus Torvalds 	return error;
13731da177e4SLinus Torvalds }
13741da177e4SLinus Torvalds 
13751da177e4SLinus Torvalds /*
13761da177e4SLinus Torvalds  * Is the log zeroed at all?
13771da177e4SLinus Torvalds  *
13781da177e4SLinus Torvalds  * The last binary search should be changed to perform an X block read
13791da177e4SLinus Torvalds  * once X becomes small enough.  You can then search linearly through
13801da177e4SLinus Torvalds  * the X blocks.  This will cut down on the number of reads we need to do.
13811da177e4SLinus Torvalds  *
13821da177e4SLinus Torvalds  * If the log is partially zeroed, this routine will pass back the blkno
13831da177e4SLinus Torvalds  * of the first block with cycle number 0.  It won't have a complete LR
13841da177e4SLinus Torvalds  * preceding it.
13851da177e4SLinus Torvalds  *
13861da177e4SLinus Torvalds  * Return:
13871da177e4SLinus Torvalds  *	0  => the log is completely written to
13882451337dSDave Chinner  *	1 => use *blk_no as the first block of the log
13892451337dSDave Chinner  *	<0 => error has occurred
13901da177e4SLinus Torvalds  */
1391a8272ce0SDavid Chinner STATIC int
xlog_find_zeroed(struct xlog * log,xfs_daddr_t * blk_no)13921da177e4SLinus Torvalds xlog_find_zeroed(
13939a8d2fdbSMark Tinguely 	struct xlog	*log,
13941da177e4SLinus Torvalds 	xfs_daddr_t	*blk_no)
13951da177e4SLinus Torvalds {
13966e9b3dd8SChristoph Hellwig 	char		*buffer;
1397b2a922cdSChristoph Hellwig 	char		*offset;
13981da177e4SLinus Torvalds 	uint	        first_cycle, last_cycle;
13991da177e4SLinus Torvalds 	xfs_daddr_t	new_blk, last_blk, start_blk;
14001da177e4SLinus Torvalds 	xfs_daddr_t     num_scan_bblks;
14011da177e4SLinus Torvalds 	int	        error, log_bbnum = log->l_logBBsize;
14021da177e4SLinus Torvalds 
14036fdf8cccSNathan Scott 	*blk_no = 0;
14046fdf8cccSNathan Scott 
14051da177e4SLinus Torvalds 	/* check totally zeroed log */
14066e9b3dd8SChristoph Hellwig 	buffer = xlog_alloc_buffer(log, 1);
14076e9b3dd8SChristoph Hellwig 	if (!buffer)
14082451337dSDave Chinner 		return -ENOMEM;
14096e9b3dd8SChristoph Hellwig 	error = xlog_bread(log, 0, 1, buffer, &offset);
1410076e6acbSChristoph Hellwig 	if (error)
14116e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
1412076e6acbSChristoph Hellwig 
141303bea6feSChristoph Hellwig 	first_cycle = xlog_get_cycle(offset);
14141da177e4SLinus Torvalds 	if (first_cycle == 0) {		/* completely zeroed log */
14151da177e4SLinus Torvalds 		*blk_no = 0;
14166e9b3dd8SChristoph Hellwig 		kmem_free(buffer);
14172451337dSDave Chinner 		return 1;
14181da177e4SLinus Torvalds 	}
14191da177e4SLinus Torvalds 
14201da177e4SLinus Torvalds 	/* check partially zeroed log */
14216e9b3dd8SChristoph Hellwig 	error = xlog_bread(log, log_bbnum-1, 1, buffer, &offset);
1422076e6acbSChristoph Hellwig 	if (error)
14236e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
1424076e6acbSChristoph Hellwig 
142503bea6feSChristoph Hellwig 	last_cycle = xlog_get_cycle(offset);
14261da177e4SLinus Torvalds 	if (last_cycle != 0) {		/* log completely written to */
14276e9b3dd8SChristoph Hellwig 		kmem_free(buffer);
14281da177e4SLinus Torvalds 		return 0;
14291da177e4SLinus Torvalds 	}
14301da177e4SLinus Torvalds 
14311da177e4SLinus Torvalds 	/* we have a partially zeroed log */
14321da177e4SLinus Torvalds 	last_blk = log_bbnum-1;
14336e9b3dd8SChristoph Hellwig 	error = xlog_find_cycle_start(log, buffer, 0, &last_blk, 0);
14346e9b3dd8SChristoph Hellwig 	if (error)
14356e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
14361da177e4SLinus Torvalds 
14371da177e4SLinus Torvalds 	/*
14381da177e4SLinus Torvalds 	 * Validate the answer.  Because there is no way to guarantee that
14391da177e4SLinus Torvalds 	 * the entire log is made up of log records which are the same size,
14401da177e4SLinus Torvalds 	 * we scan over the defined maximum blocks.  At this point, the maximum
14411da177e4SLinus Torvalds 	 * is not chosen to mean anything special.   XXXmiken
14421da177e4SLinus Torvalds 	 */
14431da177e4SLinus Torvalds 	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
14441da177e4SLinus Torvalds 	ASSERT(num_scan_bblks <= INT_MAX);
14451da177e4SLinus Torvalds 
14461da177e4SLinus Torvalds 	if (last_blk < num_scan_bblks)
14471da177e4SLinus Torvalds 		num_scan_bblks = last_blk;
14481da177e4SLinus Torvalds 	start_blk = last_blk - num_scan_bblks;
14491da177e4SLinus Torvalds 
14501da177e4SLinus Torvalds 	/*
14511da177e4SLinus Torvalds 	 * We search for any instances of cycle number 0 that occur before
14521da177e4SLinus Torvalds 	 * our current estimate of the head.  What we're trying to detect is
14531da177e4SLinus Torvalds 	 *        1 ... | 0 | 1 | 0...
14541da177e4SLinus Torvalds 	 *                       ^ binary search ends here
14551da177e4SLinus Torvalds 	 */
14561da177e4SLinus Torvalds 	if ((error = xlog_find_verify_cycle(log, start_blk,
14571da177e4SLinus Torvalds 					 (int)num_scan_bblks, 0, &new_blk)))
14586e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
14591da177e4SLinus Torvalds 	if (new_blk != -1)
14601da177e4SLinus Torvalds 		last_blk = new_blk;
14611da177e4SLinus Torvalds 
14621da177e4SLinus Torvalds 	/*
14631da177e4SLinus Torvalds 	 * Potentially backup over partial log record write.  We don't need
14641da177e4SLinus Torvalds 	 * to search the end of the log because we know it is zero.
14651da177e4SLinus Torvalds 	 */
14662451337dSDave Chinner 	error = xlog_find_verify_log_record(log, start_blk, &last_blk, 0);
14672451337dSDave Chinner 	if (error == 1)
14682451337dSDave Chinner 		error = -EIO;
14692451337dSDave Chinner 	if (error)
14706e9b3dd8SChristoph Hellwig 		goto out_free_buffer;
14711da177e4SLinus Torvalds 
14721da177e4SLinus Torvalds 	*blk_no = last_blk;
14736e9b3dd8SChristoph Hellwig out_free_buffer:
14746e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
14751da177e4SLinus Torvalds 	if (error)
14761da177e4SLinus Torvalds 		return error;
14772451337dSDave Chinner 	return 1;
14781da177e4SLinus Torvalds }
14791da177e4SLinus Torvalds 
14801da177e4SLinus Torvalds /*
14811da177e4SLinus Torvalds  * These are simple subroutines used by xlog_clear_stale_blocks() below
14821da177e4SLinus Torvalds  * to initialize a buffer full of empty log record headers and write
14831da177e4SLinus Torvalds  * them into the log.
14841da177e4SLinus Torvalds  */
14851da177e4SLinus Torvalds STATIC void
xlog_add_record(struct xlog * log,char * buf,int cycle,int block,int tail_cycle,int tail_block)14861da177e4SLinus Torvalds xlog_add_record(
14879a8d2fdbSMark Tinguely 	struct xlog		*log,
1488b2a922cdSChristoph Hellwig 	char			*buf,
14891da177e4SLinus Torvalds 	int			cycle,
14901da177e4SLinus Torvalds 	int			block,
14911da177e4SLinus Torvalds 	int			tail_cycle,
14921da177e4SLinus Torvalds 	int			tail_block)
14931da177e4SLinus Torvalds {
14941da177e4SLinus Torvalds 	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
14951da177e4SLinus Torvalds 
14961da177e4SLinus Torvalds 	memset(buf, 0, BBSIZE);
1497b53e675dSChristoph Hellwig 	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1498b53e675dSChristoph Hellwig 	recp->h_cycle = cpu_to_be32(cycle);
1499b53e675dSChristoph Hellwig 	recp->h_version = cpu_to_be32(
150038c26bfdSDave Chinner 			xfs_has_logv2(log->l_mp) ? 2 : 1);
1501b53e675dSChristoph Hellwig 	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1502b53e675dSChristoph Hellwig 	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1503b53e675dSChristoph Hellwig 	recp->h_fmt = cpu_to_be32(XLOG_FMT);
15041da177e4SLinus Torvalds 	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
15051da177e4SLinus Torvalds }
15061da177e4SLinus Torvalds 
15071da177e4SLinus Torvalds STATIC int
xlog_write_log_records(struct xlog * log,int cycle,int start_block,int blocks,int tail_cycle,int tail_block)15081da177e4SLinus Torvalds xlog_write_log_records(
15099a8d2fdbSMark Tinguely 	struct xlog	*log,
15101da177e4SLinus Torvalds 	int		cycle,
15111da177e4SLinus Torvalds 	int		start_block,
15121da177e4SLinus Torvalds 	int		blocks,
15131da177e4SLinus Torvalds 	int		tail_cycle,
15141da177e4SLinus Torvalds 	int		tail_block)
15151da177e4SLinus Torvalds {
1516b2a922cdSChristoph Hellwig 	char		*offset;
15176e9b3dd8SChristoph Hellwig 	char		*buffer;
15181da177e4SLinus Torvalds 	int		balign, ealign;
151969ce58f0SAlex Elder 	int		sectbb = log->l_sectBBsize;
15201da177e4SLinus Torvalds 	int		end_block = start_block + blocks;
15211da177e4SLinus Torvalds 	int		bufblks;
15221da177e4SLinus Torvalds 	int		error = 0;
15231da177e4SLinus Torvalds 	int		i, j = 0;
15241da177e4SLinus Torvalds 
15256881a229SAlex Elder 	/*
15266881a229SAlex Elder 	 * Greedily allocate a buffer big enough to handle the full
15276881a229SAlex Elder 	 * range of basic blocks to be written.  If that fails, try
15286881a229SAlex Elder 	 * a smaller size.  We need to be able to write at least a
15296881a229SAlex Elder 	 * log sector, or we're out of luck.
15306881a229SAlex Elder 	 */
15318b010acbSWang Jianchao 	bufblks = roundup_pow_of_two(blocks);
153281158e0cSDave Chinner 	while (bufblks > log->l_logBBsize)
153381158e0cSDave Chinner 		bufblks >>= 1;
15346e9b3dd8SChristoph Hellwig 	while (!(buffer = xlog_alloc_buffer(log, bufblks))) {
15351da177e4SLinus Torvalds 		bufblks >>= 1;
153669ce58f0SAlex Elder 		if (bufblks < sectbb)
15372451337dSDave Chinner 			return -ENOMEM;
15381da177e4SLinus Torvalds 	}
15391da177e4SLinus Torvalds 
15401da177e4SLinus Torvalds 	/* We may need to do a read at the start to fill in part of
15411da177e4SLinus Torvalds 	 * the buffer in the starting sector not covered by the first
15421da177e4SLinus Torvalds 	 * write below.
15431da177e4SLinus Torvalds 	 */
15445c17f533SAlex Elder 	balign = round_down(start_block, sectbb);
15451da177e4SLinus Torvalds 	if (balign != start_block) {
15466e9b3dd8SChristoph Hellwig 		error = xlog_bread_noalign(log, start_block, 1, buffer);
1547076e6acbSChristoph Hellwig 		if (error)
15486e9b3dd8SChristoph Hellwig 			goto out_free_buffer;
1549076e6acbSChristoph Hellwig 
15501da177e4SLinus Torvalds 		j = start_block - balign;
15511da177e4SLinus Torvalds 	}
15521da177e4SLinus Torvalds 
15531da177e4SLinus Torvalds 	for (i = start_block; i < end_block; i += bufblks) {
15541da177e4SLinus Torvalds 		int		bcount, endcount;
15551da177e4SLinus Torvalds 
15561da177e4SLinus Torvalds 		bcount = min(bufblks, end_block - start_block);
15571da177e4SLinus Torvalds 		endcount = bcount - j;
15581da177e4SLinus Torvalds 
15591da177e4SLinus Torvalds 		/* We may need to do a read at the end to fill in part of
15601da177e4SLinus Torvalds 		 * the buffer in the final sector not covered by the write.
15611da177e4SLinus Torvalds 		 * If this is the same sector as the above read, skip it.
15621da177e4SLinus Torvalds 		 */
15635c17f533SAlex Elder 		ealign = round_down(end_block, sectbb);
15641da177e4SLinus Torvalds 		if (j == 0 && (start_block + endcount > ealign)) {
15656ad5b325SChristoph Hellwig 			error = xlog_bread_noalign(log, ealign, sectbb,
15666e9b3dd8SChristoph Hellwig 					buffer + BBTOB(ealign - start_block));
1567076e6acbSChristoph Hellwig 			if (error)
1568076e6acbSChristoph Hellwig 				break;
1569076e6acbSChristoph Hellwig 
15701da177e4SLinus Torvalds 		}
15711da177e4SLinus Torvalds 
15726e9b3dd8SChristoph Hellwig 		offset = buffer + xlog_align(log, start_block);
15731da177e4SLinus Torvalds 		for (; j < endcount; j++) {
15741da177e4SLinus Torvalds 			xlog_add_record(log, offset, cycle, i+j,
15751da177e4SLinus Torvalds 					tail_cycle, tail_block);
15761da177e4SLinus Torvalds 			offset += BBSIZE;
15771da177e4SLinus Torvalds 		}
15786e9b3dd8SChristoph Hellwig 		error = xlog_bwrite(log, start_block, endcount, buffer);
15791da177e4SLinus Torvalds 		if (error)
15801da177e4SLinus Torvalds 			break;
15811da177e4SLinus Torvalds 		start_block += endcount;
15821da177e4SLinus Torvalds 		j = 0;
15831da177e4SLinus Torvalds 	}
1584076e6acbSChristoph Hellwig 
15856e9b3dd8SChristoph Hellwig out_free_buffer:
15866e9b3dd8SChristoph Hellwig 	kmem_free(buffer);
15871da177e4SLinus Torvalds 	return error;
15881da177e4SLinus Torvalds }
15891da177e4SLinus Torvalds 
15901da177e4SLinus Torvalds /*
15911da177e4SLinus Torvalds  * This routine is called to blow away any incomplete log writes out
15921da177e4SLinus Torvalds  * in front of the log head.  We do this so that we won't become confused
15931da177e4SLinus Torvalds  * if we come up, write only a little bit more, and then crash again.
15941da177e4SLinus Torvalds  * If we leave the partial log records out there, this situation could
15951da177e4SLinus Torvalds  * cause us to think those partial writes are valid blocks since they
15961da177e4SLinus Torvalds  * have the current cycle number.  We get rid of them by overwriting them
15971da177e4SLinus Torvalds  * with empty log records with the old cycle number rather than the
15981da177e4SLinus Torvalds  * current one.
15991da177e4SLinus Torvalds  *
16001da177e4SLinus Torvalds  * The tail lsn is passed in rather than taken from
16011da177e4SLinus Torvalds  * the log so that we will not write over the unmount record after a
16021da177e4SLinus Torvalds  * clean unmount in a 512 block log.  Doing so would leave the log without
16031da177e4SLinus Torvalds  * any valid log records in it until a new one was written.  If we crashed
16041da177e4SLinus Torvalds  * during that time we would not be able to recover.
16051da177e4SLinus Torvalds  */
16061da177e4SLinus Torvalds STATIC int
xlog_clear_stale_blocks(struct xlog * log,xfs_lsn_t tail_lsn)16071da177e4SLinus Torvalds xlog_clear_stale_blocks(
16089a8d2fdbSMark Tinguely 	struct xlog	*log,
16091da177e4SLinus Torvalds 	xfs_lsn_t	tail_lsn)
16101da177e4SLinus Torvalds {
16111da177e4SLinus Torvalds 	int		tail_cycle, head_cycle;
16121da177e4SLinus Torvalds 	int		tail_block, head_block;
16131da177e4SLinus Torvalds 	int		tail_distance, max_distance;
16141da177e4SLinus Torvalds 	int		distance;
16151da177e4SLinus Torvalds 	int		error;
16161da177e4SLinus Torvalds 
16171da177e4SLinus Torvalds 	tail_cycle = CYCLE_LSN(tail_lsn);
16181da177e4SLinus Torvalds 	tail_block = BLOCK_LSN(tail_lsn);
16191da177e4SLinus Torvalds 	head_cycle = log->l_curr_cycle;
16201da177e4SLinus Torvalds 	head_block = log->l_curr_block;
16211da177e4SLinus Torvalds 
16221da177e4SLinus Torvalds 	/*
16231da177e4SLinus Torvalds 	 * Figure out the distance between the new head of the log
16241da177e4SLinus Torvalds 	 * and the tail.  We want to write over any blocks beyond the
16251da177e4SLinus Torvalds 	 * head that we may have written just before the crash, but
16261da177e4SLinus Torvalds 	 * we don't want to overwrite the tail of the log.
16271da177e4SLinus Torvalds 	 */
16281da177e4SLinus Torvalds 	if (head_cycle == tail_cycle) {
16291da177e4SLinus Torvalds 		/*
16301da177e4SLinus Torvalds 		 * The tail is behind the head in the physical log,
16311da177e4SLinus Torvalds 		 * so the distance from the head to the tail is the
16321da177e4SLinus Torvalds 		 * distance from the head to the end of the log plus
16331da177e4SLinus Torvalds 		 * the distance from the beginning of the log to the
16341da177e4SLinus Torvalds 		 * tail.
16351da177e4SLinus Torvalds 		 */
1636a71895c5SDarrick J. Wong 		if (XFS_IS_CORRUPT(log->l_mp,
1637a71895c5SDarrick J. Wong 				   head_block < tail_block ||
1638a71895c5SDarrick J. Wong 				   head_block >= log->l_logBBsize))
16392451337dSDave Chinner 			return -EFSCORRUPTED;
16401da177e4SLinus Torvalds 		tail_distance = tail_block + (log->l_logBBsize - head_block);
16411da177e4SLinus Torvalds 	} else {
16421da177e4SLinus Torvalds 		/*
16431da177e4SLinus Torvalds 		 * The head is behind the tail in the physical log,
16441da177e4SLinus Torvalds 		 * so the distance from the head to the tail is just
16451da177e4SLinus Torvalds 		 * the tail block minus the head block.
16461da177e4SLinus Torvalds 		 */
1647a71895c5SDarrick J. Wong 		if (XFS_IS_CORRUPT(log->l_mp,
1648a71895c5SDarrick J. Wong 				   head_block >= tail_block ||
1649a71895c5SDarrick J. Wong 				   head_cycle != tail_cycle + 1))
16502451337dSDave Chinner 			return -EFSCORRUPTED;
16511da177e4SLinus Torvalds 		tail_distance = tail_block - head_block;
16521da177e4SLinus Torvalds 	}
16531da177e4SLinus Torvalds 
16541da177e4SLinus Torvalds 	/*
16551da177e4SLinus Torvalds 	 * If the head is right up against the tail, we can't clear
16561da177e4SLinus Torvalds 	 * anything.
16571da177e4SLinus Torvalds 	 */
16581da177e4SLinus Torvalds 	if (tail_distance <= 0) {
16591da177e4SLinus Torvalds 		ASSERT(tail_distance == 0);
16601da177e4SLinus Torvalds 		return 0;
16611da177e4SLinus Torvalds 	}
16621da177e4SLinus Torvalds 
16631da177e4SLinus Torvalds 	max_distance = XLOG_TOTAL_REC_SHIFT(log);
16641da177e4SLinus Torvalds 	/*
16651da177e4SLinus Torvalds 	 * Take the smaller of the maximum amount of outstanding I/O
16661da177e4SLinus Torvalds 	 * we could have and the distance to the tail to clear out.
16671da177e4SLinus Torvalds 	 * We take the smaller so that we don't overwrite the tail and
16681da177e4SLinus Torvalds 	 * we don't waste all day writing from the head to the tail
16691da177e4SLinus Torvalds 	 * for no reason.
16701da177e4SLinus Torvalds 	 */
16719bb54cb5SDave Chinner 	max_distance = min(max_distance, tail_distance);
16721da177e4SLinus Torvalds 
16731da177e4SLinus Torvalds 	if ((head_block + max_distance) <= log->l_logBBsize) {
16741da177e4SLinus Torvalds 		/*
16751da177e4SLinus Torvalds 		 * We can stomp all the blocks we need to without
16761da177e4SLinus Torvalds 		 * wrapping around the end of the log.  Just do it
16771da177e4SLinus Torvalds 		 * in a single write.  Use the cycle number of the
16781da177e4SLinus Torvalds 		 * current cycle minus one so that the log will look like:
16791da177e4SLinus Torvalds 		 *     n ... | n - 1 ...
16801da177e4SLinus Torvalds 		 */
16811da177e4SLinus Torvalds 		error = xlog_write_log_records(log, (head_cycle - 1),
16821da177e4SLinus Torvalds 				head_block, max_distance, tail_cycle,
16831da177e4SLinus Torvalds 				tail_block);
16841da177e4SLinus Torvalds 		if (error)
16851da177e4SLinus Torvalds 			return error;
16861da177e4SLinus Torvalds 	} else {
16871da177e4SLinus Torvalds 		/*
16881da177e4SLinus Torvalds 		 * We need to wrap around the end of the physical log in
16891da177e4SLinus Torvalds 		 * order to clear all the blocks.  Do it in two separate
16901da177e4SLinus Torvalds 		 * I/Os.  The first write should be from the head to the
16911da177e4SLinus Torvalds 		 * end of the physical log, and it should use the current
16921da177e4SLinus Torvalds 		 * cycle number minus one just like above.
16931da177e4SLinus Torvalds 		 */
16941da177e4SLinus Torvalds 		distance = log->l_logBBsize - head_block;
16951da177e4SLinus Torvalds 		error = xlog_write_log_records(log, (head_cycle - 1),
16961da177e4SLinus Torvalds 				head_block, distance, tail_cycle,
16971da177e4SLinus Torvalds 				tail_block);
16981da177e4SLinus Torvalds 
16991da177e4SLinus Torvalds 		if (error)
17001da177e4SLinus Torvalds 			return error;
17011da177e4SLinus Torvalds 
17021da177e4SLinus Torvalds 		/*
17031da177e4SLinus Torvalds 		 * Now write the blocks at the start of the physical log.
17041da177e4SLinus Torvalds 		 * This writes the remainder of the blocks we want to clear.
17051da177e4SLinus Torvalds 		 * It uses the current cycle number since we're now on the
17061da177e4SLinus Torvalds 		 * same cycle as the head so that we get:
17071da177e4SLinus Torvalds 		 *    n ... n ... | n - 1 ...
17081da177e4SLinus Torvalds 		 *    ^^^^^ blocks we're writing
17091da177e4SLinus Torvalds 		 */
17101da177e4SLinus Torvalds 		distance = max_distance - (log->l_logBBsize - head_block);
17111da177e4SLinus Torvalds 		error = xlog_write_log_records(log, head_cycle, 0, distance,
17121da177e4SLinus Torvalds 				tail_cycle, tail_block);
17131da177e4SLinus Torvalds 		if (error)
17141da177e4SLinus Torvalds 			return error;
17151da177e4SLinus Torvalds 	}
17161da177e4SLinus Torvalds 
17171da177e4SLinus Torvalds 	return 0;
17181da177e4SLinus Torvalds }
17191da177e4SLinus Torvalds 
1720154c733aSDarrick J. Wong /*
1721154c733aSDarrick J. Wong  * Release the recovered intent item in the AIL that matches the given intent
1722154c733aSDarrick J. Wong  * type and intent id.
1723154c733aSDarrick J. Wong  */
1724154c733aSDarrick J. Wong void
xlog_recover_release_intent(struct xlog * log,unsigned short intent_type,uint64_t intent_id)1725154c733aSDarrick J. Wong xlog_recover_release_intent(
1726154c733aSDarrick J. Wong 	struct xlog			*log,
1727154c733aSDarrick J. Wong 	unsigned short			intent_type,
1728154c733aSDarrick J. Wong 	uint64_t			intent_id)
1729154c733aSDarrick J. Wong {
1730cd3c2cf3SDarrick J. Wong 	struct xfs_defer_pending	*dfp, *n;
1731154c733aSDarrick J. Wong 
1732cd3c2cf3SDarrick J. Wong 	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
1733cd3c2cf3SDarrick J. Wong 		struct xfs_log_item	*lip = dfp->dfp_intent;
1734cd3c2cf3SDarrick J. Wong 
1735154c733aSDarrick J. Wong 		if (lip->li_type != intent_type)
1736154c733aSDarrick J. Wong 			continue;
1737154c733aSDarrick J. Wong 		if (!lip->li_ops->iop_match(lip, intent_id))
1738154c733aSDarrick J. Wong 			continue;
1739154c733aSDarrick J. Wong 
1740cd3c2cf3SDarrick J. Wong 		ASSERT(xlog_item_is_intent(lip));
1741154c733aSDarrick J. Wong 
1742cd3c2cf3SDarrick J. Wong 		xfs_defer_cancel_recovery(log->l_mp, dfp);
1743cd3c2cf3SDarrick J. Wong 	}
1744154c733aSDarrick J. Wong }
1745154c733aSDarrick J. Wong 
17464bc61983SDarrick J. Wong int
xlog_recover_iget(struct xfs_mount * mp,xfs_ino_t ino,struct xfs_inode ** ipp)17474bc61983SDarrick J. Wong xlog_recover_iget(
17484bc61983SDarrick J. Wong 	struct xfs_mount	*mp,
17494bc61983SDarrick J. Wong 	xfs_ino_t		ino,
17504bc61983SDarrick J. Wong 	struct xfs_inode	**ipp)
17514bc61983SDarrick J. Wong {
17524bc61983SDarrick J. Wong 	int			error;
17534bc61983SDarrick J. Wong 
17544bc61983SDarrick J. Wong 	error = xfs_iget(mp, NULL, ino, 0, 0, ipp);
17554bc61983SDarrick J. Wong 	if (error)
17564bc61983SDarrick J. Wong 		return error;
17574bc61983SDarrick J. Wong 
17584bc61983SDarrick J. Wong 	error = xfs_qm_dqattach(*ipp);
17594bc61983SDarrick J. Wong 	if (error) {
17604bc61983SDarrick J. Wong 		xfs_irele(*ipp);
17614bc61983SDarrick J. Wong 		return error;
17624bc61983SDarrick J. Wong 	}
17634bc61983SDarrick J. Wong 
17644bc61983SDarrick J. Wong 	if (VFS_I(*ipp)->i_nlink == 0)
17654bc61983SDarrick J. Wong 		xfs_iflags_set(*ipp, XFS_IRECOVERY);
17664bc61983SDarrick J. Wong 
17674bc61983SDarrick J. Wong 	return 0;
17684bc61983SDarrick J. Wong }
17694bc61983SDarrick J. Wong 
17701da177e4SLinus Torvalds /******************************************************************************
17711da177e4SLinus Torvalds  *
17721da177e4SLinus Torvalds  *		Log recover routines
17731da177e4SLinus Torvalds  *
17741da177e4SLinus Torvalds  ******************************************************************************
17751da177e4SLinus Torvalds  */
177686ffa471SDarrick J. Wong static const struct xlog_recover_item_ops *xlog_recover_item_ops[] = {
177786ffa471SDarrick J. Wong 	&xlog_buf_item_ops,
177886ffa471SDarrick J. Wong 	&xlog_inode_item_ops,
177986ffa471SDarrick J. Wong 	&xlog_dquot_item_ops,
178086ffa471SDarrick J. Wong 	&xlog_quotaoff_item_ops,
178186ffa471SDarrick J. Wong 	&xlog_icreate_item_ops,
178286ffa471SDarrick J. Wong 	&xlog_efi_item_ops,
178386ffa471SDarrick J. Wong 	&xlog_efd_item_ops,
178486ffa471SDarrick J. Wong 	&xlog_rui_item_ops,
178586ffa471SDarrick J. Wong 	&xlog_rud_item_ops,
178686ffa471SDarrick J. Wong 	&xlog_cui_item_ops,
178786ffa471SDarrick J. Wong 	&xlog_cud_item_ops,
178886ffa471SDarrick J. Wong 	&xlog_bui_item_ops,
178986ffa471SDarrick J. Wong 	&xlog_bud_item_ops,
1790fd920008SAllison Henderson 	&xlog_attri_item_ops,
1791fd920008SAllison Henderson 	&xlog_attrd_item_ops,
179286ffa471SDarrick J. Wong };
179386ffa471SDarrick J. Wong 
179486ffa471SDarrick J. Wong static const struct xlog_recover_item_ops *
xlog_find_item_ops(struct xlog_recover_item * item)179586ffa471SDarrick J. Wong xlog_find_item_ops(
179686ffa471SDarrick J. Wong 	struct xlog_recover_item		*item)
179786ffa471SDarrick J. Wong {
179886ffa471SDarrick J. Wong 	unsigned int				i;
179986ffa471SDarrick J. Wong 
180086ffa471SDarrick J. Wong 	for (i = 0; i < ARRAY_SIZE(xlog_recover_item_ops); i++)
180186ffa471SDarrick J. Wong 		if (ITEM_TYPE(item) == xlog_recover_item_ops[i]->item_type)
180286ffa471SDarrick J. Wong 			return xlog_recover_item_ops[i];
180386ffa471SDarrick J. Wong 
180486ffa471SDarrick J. Wong 	return NULL;
180586ffa471SDarrick J. Wong }
18061da177e4SLinus Torvalds 
1807f0a76953SDave Chinner /*
1808a775ad77SDave Chinner  * Sort the log items in the transaction.
1809a775ad77SDave Chinner  *
1810a775ad77SDave Chinner  * The ordering constraints are defined by the inode allocation and unlink
1811a775ad77SDave Chinner  * behaviour. The rules are:
1812a775ad77SDave Chinner  *
1813a775ad77SDave Chinner  *	1. Every item is only logged once in a given transaction. Hence it
1814a775ad77SDave Chinner  *	   represents the last logged state of the item. Hence ordering is
1815a775ad77SDave Chinner  *	   dependent on the order in which operations need to be performed so
1816a775ad77SDave Chinner  *	   required initial conditions are always met.
1817a775ad77SDave Chinner  *
1818a775ad77SDave Chinner  *	2. Cancelled buffers are recorded in pass 1 in a separate table and
1819a775ad77SDave Chinner  *	   there's nothing to replay from them so we can simply cull them
1820a775ad77SDave Chinner  *	   from the transaction. However, we can't do that until after we've
1821a775ad77SDave Chinner  *	   replayed all the other items because they may be dependent on the
1822a775ad77SDave Chinner  *	   cancelled buffer and replaying the cancelled buffer can remove it
1823a775ad77SDave Chinner  *	   form the cancelled buffer table. Hence they have to be done last.
1824a775ad77SDave Chinner  *
1825a775ad77SDave Chinner  *	3. Inode allocation buffers must be replayed before inode items that
182628c8e41aSDave Chinner  *	   read the buffer and replay changes into it. For filesystems using the
182728c8e41aSDave Chinner  *	   ICREATE transactions, this means XFS_LI_ICREATE objects need to get
182828c8e41aSDave Chinner  *	   treated the same as inode allocation buffers as they create and
182928c8e41aSDave Chinner  *	   initialise the buffers directly.
1830a775ad77SDave Chinner  *
1831a775ad77SDave Chinner  *	4. Inode unlink buffers must be replayed after inode items are replayed.
1832a775ad77SDave Chinner  *	   This ensures that inodes are completely flushed to the inode buffer
1833a775ad77SDave Chinner  *	   in a "free" state before we remove the unlinked inode list pointer.
1834a775ad77SDave Chinner  *
1835a775ad77SDave Chinner  * Hence the ordering needs to be inode allocation buffers first, inode items
1836a775ad77SDave Chinner  * second, inode unlink buffers third and cancelled buffers last.
1837a775ad77SDave Chinner  *
1838a775ad77SDave Chinner  * But there's a problem with that - we can't tell an inode allocation buffer
1839a775ad77SDave Chinner  * apart from a regular buffer, so we can't separate them. We can, however,
1840a775ad77SDave Chinner  * tell an inode unlink buffer from the others, and so we can separate them out
1841a775ad77SDave Chinner  * from all the other buffers and move them to last.
1842a775ad77SDave Chinner  *
1843a775ad77SDave Chinner  * Hence, 4 lists, in order from head to tail:
1844a775ad77SDave Chinner  *	- buffer_list for all buffers except cancelled/inode unlink buffers
1845a775ad77SDave Chinner  *	- item_list for all non-buffer items
1846a775ad77SDave Chinner  *	- inode_buffer_list for inode unlink buffers
1847a775ad77SDave Chinner  *	- cancel_list for the cancelled buffers
184828c8e41aSDave Chinner  *
184928c8e41aSDave Chinner  * Note that we add objects to the tail of the lists so that first-to-last
185028c8e41aSDave Chinner  * ordering is preserved within the lists. Adding objects to the head of the
185128c8e41aSDave Chinner  * list means when we traverse from the head we walk them in last-to-first
185228c8e41aSDave Chinner  * order. For cancelled buffers and inode unlink buffers this doesn't matter,
185328c8e41aSDave Chinner  * but for all other items there may be specific ordering that we need to
185428c8e41aSDave Chinner  * preserve.
1855f0a76953SDave Chinner  */
18561da177e4SLinus Torvalds STATIC int
xlog_recover_reorder_trans(struct xlog * log,struct xlog_recover * trans,int pass)18571da177e4SLinus Torvalds xlog_recover_reorder_trans(
1858ad223e60SMark Tinguely 	struct xlog		*log,
1859ad223e60SMark Tinguely 	struct xlog_recover	*trans,
18609abbc539SDave Chinner 	int			pass)
18611da177e4SLinus Torvalds {
186235f4521fSDarrick J. Wong 	struct xlog_recover_item *item, *n;
18632a84108fSMark Tinguely 	int			error = 0;
1864f0a76953SDave Chinner 	LIST_HEAD(sort_list);
1865a775ad77SDave Chinner 	LIST_HEAD(cancel_list);
1866a775ad77SDave Chinner 	LIST_HEAD(buffer_list);
1867a775ad77SDave Chinner 	LIST_HEAD(inode_buffer_list);
18685ce70b77SChristoph Hellwig 	LIST_HEAD(item_list);
1869f0a76953SDave Chinner 
1870f0a76953SDave Chinner 	list_splice_init(&trans->r_itemq, &sort_list);
1871f0a76953SDave Chinner 	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
187286ffa471SDarrick J. Wong 		enum xlog_recover_reorder	fate = XLOG_REORDER_ITEM_LIST;
18731da177e4SLinus Torvalds 
187486ffa471SDarrick J. Wong 		item->ri_ops = xlog_find_item_ops(item);
187586ffa471SDarrick J. Wong 		if (!item->ri_ops) {
1876a0fa2b67SDave Chinner 			xfs_warn(log->l_mp,
18770d2d35a3SDarrick J. Wong 				"%s: unrecognized type of log operation (%d)",
18780d2d35a3SDarrick J. Wong 				__func__, ITEM_TYPE(item));
18791da177e4SLinus Torvalds 			ASSERT(0);
18802a84108fSMark Tinguely 			/*
18812a84108fSMark Tinguely 			 * return the remaining items back to the transaction
18822a84108fSMark Tinguely 			 * item list so they can be freed in caller.
18832a84108fSMark Tinguely 			 */
18842a84108fSMark Tinguely 			if (!list_empty(&sort_list))
18852a84108fSMark Tinguely 				list_splice_init(&sort_list, &trans->r_itemq);
188686ffa471SDarrick J. Wong 			error = -EFSCORRUPTED;
188786ffa471SDarrick J. Wong 			break;
188886ffa471SDarrick J. Wong 		}
188986ffa471SDarrick J. Wong 
189086ffa471SDarrick J. Wong 		if (item->ri_ops->reorder)
189186ffa471SDarrick J. Wong 			fate = item->ri_ops->reorder(item);
189286ffa471SDarrick J. Wong 
189386ffa471SDarrick J. Wong 		switch (fate) {
189486ffa471SDarrick J. Wong 		case XLOG_REORDER_BUFFER_LIST:
189586ffa471SDarrick J. Wong 			list_move_tail(&item->ri_list, &buffer_list);
189686ffa471SDarrick J. Wong 			break;
189786ffa471SDarrick J. Wong 		case XLOG_REORDER_CANCEL_LIST:
189886ffa471SDarrick J. Wong 			trace_xfs_log_recover_item_reorder_head(log,
189986ffa471SDarrick J. Wong 					trans, item, pass);
190086ffa471SDarrick J. Wong 			list_move(&item->ri_list, &cancel_list);
190186ffa471SDarrick J. Wong 			break;
190286ffa471SDarrick J. Wong 		case XLOG_REORDER_INODE_BUFFER_LIST:
190386ffa471SDarrick J. Wong 			list_move(&item->ri_list, &inode_buffer_list);
190486ffa471SDarrick J. Wong 			break;
190586ffa471SDarrick J. Wong 		case XLOG_REORDER_ITEM_LIST:
190686ffa471SDarrick J. Wong 			trace_xfs_log_recover_item_reorder_tail(log,
190786ffa471SDarrick J. Wong 							trans, item, pass);
190886ffa471SDarrick J. Wong 			list_move_tail(&item->ri_list, &item_list);
190986ffa471SDarrick J. Wong 			break;
19101da177e4SLinus Torvalds 		}
1911f0a76953SDave Chinner 	}
191286ffa471SDarrick J. Wong 
1913f0a76953SDave Chinner 	ASSERT(list_empty(&sort_list));
1914a775ad77SDave Chinner 	if (!list_empty(&buffer_list))
1915a775ad77SDave Chinner 		list_splice(&buffer_list, &trans->r_itemq);
19165ce70b77SChristoph Hellwig 	if (!list_empty(&item_list))
19175ce70b77SChristoph Hellwig 		list_splice_tail(&item_list, &trans->r_itemq);
1918a775ad77SDave Chinner 	if (!list_empty(&inode_buffer_list))
1919a775ad77SDave Chinner 		list_splice_tail(&inode_buffer_list, &trans->r_itemq);
1920a775ad77SDave Chinner 	if (!list_empty(&cancel_list))
1921a775ad77SDave Chinner 		list_splice_tail(&cancel_list, &trans->r_itemq);
19222a84108fSMark Tinguely 	return error;
19231da177e4SLinus Torvalds }
19241da177e4SLinus Torvalds 
19258ea5682dSDarrick J. Wong void
xlog_buf_readahead(struct xlog * log,xfs_daddr_t blkno,uint len,const struct xfs_buf_ops * ops)19267d4894b4SChristoph Hellwig xlog_buf_readahead(
19277d4894b4SChristoph Hellwig 	struct xlog		*log,
19287d4894b4SChristoph Hellwig 	xfs_daddr_t		blkno,
19297d4894b4SChristoph Hellwig 	uint			len,
19307d4894b4SChristoph Hellwig 	const struct xfs_buf_ops *ops)
19317d4894b4SChristoph Hellwig {
19327d4894b4SChristoph Hellwig 	if (!xlog_is_buffer_cancelled(log, blkno, len))
19337d4894b4SChristoph Hellwig 		xfs_buf_readahead(log->l_mp->m_ddev_targp, blkno, len, ops);
19347d4894b4SChristoph Hellwig }
19357d4894b4SChristoph Hellwig 
1936cd3c2cf3SDarrick J. Wong /*
1937cd3c2cf3SDarrick J. Wong  * Create a deferred work structure for resuming and tracking the progress of a
1938cd3c2cf3SDarrick J. Wong  * log intent item that was found during recovery.
1939cd3c2cf3SDarrick J. Wong  */
1940cd3c2cf3SDarrick J. Wong void
xlog_recover_intent_item(struct xlog * log,struct xfs_log_item * lip,xfs_lsn_t lsn,unsigned int dfp_type)1941cd3c2cf3SDarrick J. Wong xlog_recover_intent_item(
1942cd3c2cf3SDarrick J. Wong 	struct xlog			*log,
1943cd3c2cf3SDarrick J. Wong 	struct xfs_log_item		*lip,
1944cd3c2cf3SDarrick J. Wong 	xfs_lsn_t			lsn,
1945cd3c2cf3SDarrick J. Wong 	unsigned int			dfp_type)
1946cd3c2cf3SDarrick J. Wong {
1947cd3c2cf3SDarrick J. Wong 	ASSERT(xlog_item_is_intent(lip));
1948cd3c2cf3SDarrick J. Wong 
1949cd3c2cf3SDarrick J. Wong 	xfs_defer_start_recovery(lip, dfp_type, &log->r_dfops);
1950cd3c2cf3SDarrick J. Wong 
1951cd3c2cf3SDarrick J. Wong 	/*
1952cd3c2cf3SDarrick J. Wong 	 * Insert the intent into the AIL directly and drop one reference so
1953cd3c2cf3SDarrick J. Wong 	 * that finishing or canceling the work will drop the other.
1954cd3c2cf3SDarrick J. Wong 	 */
1955cd3c2cf3SDarrick J. Wong 	xfs_trans_ail_insert(log->l_ailp, lip, lsn);
1956cd3c2cf3SDarrick J. Wong 	lip->li_ops->iop_unpin(lip, 0);
1957cd3c2cf3SDarrick J. Wong }
1958cd3c2cf3SDarrick J. Wong 
19591da177e4SLinus Torvalds STATIC int
xlog_recover_items_pass2(struct xlog * log,struct xlog_recover * trans,struct list_head * buffer_list,struct list_head * item_list)196000574da1SZhi Yong Wu xlog_recover_items_pass2(
196100574da1SZhi Yong Wu 	struct xlog                     *log,
196200574da1SZhi Yong Wu 	struct xlog_recover             *trans,
196300574da1SZhi Yong Wu 	struct list_head                *buffer_list,
196400574da1SZhi Yong Wu 	struct list_head                *item_list)
196500574da1SZhi Yong Wu {
196600574da1SZhi Yong Wu 	struct xlog_recover_item	*item;
196700574da1SZhi Yong Wu 	int				error = 0;
196800574da1SZhi Yong Wu 
196900574da1SZhi Yong Wu 	list_for_each_entry(item, item_list, ri_list) {
19702565a11bSDarrick J. Wong 		trace_xfs_log_recover_item_recover(log, trans, item,
19712565a11bSDarrick J. Wong 				XLOG_RECOVER_PASS2);
19722565a11bSDarrick J. Wong 
19732565a11bSDarrick J. Wong 		if (item->ri_ops->commit_pass2)
19742565a11bSDarrick J. Wong 			error = item->ri_ops->commit_pass2(log, buffer_list,
19752565a11bSDarrick J. Wong 					item, trans->r_lsn);
197600574da1SZhi Yong Wu 		if (error)
197700574da1SZhi Yong Wu 			return error;
197800574da1SZhi Yong Wu 	}
197900574da1SZhi Yong Wu 
198000574da1SZhi Yong Wu 	return error;
198100574da1SZhi Yong Wu }
198200574da1SZhi Yong Wu 
1983d0450948SChristoph Hellwig /*
1984d0450948SChristoph Hellwig  * Perform the transaction.
1985d0450948SChristoph Hellwig  *
1986d0450948SChristoph Hellwig  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
1987d0450948SChristoph Hellwig  * EFIs and EFDs get queued up by adding entries into the AIL for them.
1988d0450948SChristoph Hellwig  */
1989d0450948SChristoph Hellwig STATIC int
xlog_recover_commit_trans(struct xlog * log,struct xlog_recover * trans,int pass,struct list_head * buffer_list)1990d0450948SChristoph Hellwig xlog_recover_commit_trans(
1991ad223e60SMark Tinguely 	struct xlog		*log,
1992d0450948SChristoph Hellwig 	struct xlog_recover	*trans,
199312818d24SBrian Foster 	int			pass,
199412818d24SBrian Foster 	struct list_head	*buffer_list)
1995d0450948SChristoph Hellwig {
199600574da1SZhi Yong Wu 	int				error = 0;
199700574da1SZhi Yong Wu 	int				items_queued = 0;
199800574da1SZhi Yong Wu 	struct xlog_recover_item	*item;
199900574da1SZhi Yong Wu 	struct xlog_recover_item	*next;
200000574da1SZhi Yong Wu 	LIST_HEAD			(ra_list);
200100574da1SZhi Yong Wu 	LIST_HEAD			(done_list);
200200574da1SZhi Yong Wu 
200300574da1SZhi Yong Wu 	#define XLOG_RECOVER_COMMIT_QUEUE_MAX 100
20041da177e4SLinus Torvalds 
200539775431SBrian Foster 	hlist_del_init(&trans->r_list);
2006d0450948SChristoph Hellwig 
2007d0450948SChristoph Hellwig 	error = xlog_recover_reorder_trans(log, trans, pass);
2008d0450948SChristoph Hellwig 	if (error)
20091da177e4SLinus Torvalds 		return error;
2010d0450948SChristoph Hellwig 
201100574da1SZhi Yong Wu 	list_for_each_entry_safe(item, next, &trans->r_itemq, ri_list) {
20123304a4faSDarrick J. Wong 		trace_xfs_log_recover_item_recover(log, trans, item, pass);
20133304a4faSDarrick J. Wong 
201443ff2122SChristoph Hellwig 		switch (pass) {
201543ff2122SChristoph Hellwig 		case XLOG_RECOVER_PASS1:
20163304a4faSDarrick J. Wong 			if (item->ri_ops->commit_pass1)
20173304a4faSDarrick J. Wong 				error = item->ri_ops->commit_pass1(log, item);
201843ff2122SChristoph Hellwig 			break;
201943ff2122SChristoph Hellwig 		case XLOG_RECOVER_PASS2:
20208ea5682dSDarrick J. Wong 			if (item->ri_ops->ra_pass2)
20218ea5682dSDarrick J. Wong 				item->ri_ops->ra_pass2(log, item);
202200574da1SZhi Yong Wu 			list_move_tail(&item->ri_list, &ra_list);
202300574da1SZhi Yong Wu 			items_queued++;
202400574da1SZhi Yong Wu 			if (items_queued >= XLOG_RECOVER_COMMIT_QUEUE_MAX) {
202500574da1SZhi Yong Wu 				error = xlog_recover_items_pass2(log, trans,
202612818d24SBrian Foster 						buffer_list, &ra_list);
202700574da1SZhi Yong Wu 				list_splice_tail_init(&ra_list, &done_list);
202800574da1SZhi Yong Wu 				items_queued = 0;
202900574da1SZhi Yong Wu 			}
203000574da1SZhi Yong Wu 
203143ff2122SChristoph Hellwig 			break;
203243ff2122SChristoph Hellwig 		default:
203343ff2122SChristoph Hellwig 			ASSERT(0);
203443ff2122SChristoph Hellwig 		}
203543ff2122SChristoph Hellwig 
2036d0450948SChristoph Hellwig 		if (error)
203743ff2122SChristoph Hellwig 			goto out;
2038d0450948SChristoph Hellwig 	}
2039d0450948SChristoph Hellwig 
204000574da1SZhi Yong Wu out:
204100574da1SZhi Yong Wu 	if (!list_empty(&ra_list)) {
204200574da1SZhi Yong Wu 		if (!error)
204300574da1SZhi Yong Wu 			error = xlog_recover_items_pass2(log, trans,
204412818d24SBrian Foster 					buffer_list, &ra_list);
204500574da1SZhi Yong Wu 		list_splice_tail_init(&ra_list, &done_list);
204600574da1SZhi Yong Wu 	}
204700574da1SZhi Yong Wu 
204800574da1SZhi Yong Wu 	if (!list_empty(&done_list))
204900574da1SZhi Yong Wu 		list_splice_init(&done_list, &trans->r_itemq);
205000574da1SZhi Yong Wu 
205112818d24SBrian Foster 	return error;
20521da177e4SLinus Torvalds }
20531da177e4SLinus Torvalds 
205476560669SDave Chinner STATIC void
xlog_recover_add_item(struct list_head * head)205576560669SDave Chinner xlog_recover_add_item(
205676560669SDave Chinner 	struct list_head	*head)
20571da177e4SLinus Torvalds {
205835f4521fSDarrick J. Wong 	struct xlog_recover_item *item;
205976560669SDave Chinner 
206035f4521fSDarrick J. Wong 	item = kmem_zalloc(sizeof(struct xlog_recover_item), 0);
206176560669SDave Chinner 	INIT_LIST_HEAD(&item->ri_list);
206276560669SDave Chinner 	list_add_tail(&item->ri_list, head);
206376560669SDave Chinner }
206476560669SDave Chinner 
206576560669SDave Chinner STATIC int
xlog_recover_add_to_cont_trans(struct xlog * log,struct xlog_recover * trans,char * dp,int len)206676560669SDave Chinner xlog_recover_add_to_cont_trans(
206776560669SDave Chinner 	struct xlog		*log,
206876560669SDave Chinner 	struct xlog_recover	*trans,
2069b2a922cdSChristoph Hellwig 	char			*dp,
207076560669SDave Chinner 	int			len)
207176560669SDave Chinner {
207235f4521fSDarrick J. Wong 	struct xlog_recover_item *item;
2073b2a922cdSChristoph Hellwig 	char			*ptr, *old_ptr;
207476560669SDave Chinner 	int			old_len;
207576560669SDave Chinner 
207689cebc84SBrian Foster 	/*
207789cebc84SBrian Foster 	 * If the transaction is empty, the header was split across this and the
207889cebc84SBrian Foster 	 * previous record. Copy the rest of the header.
207989cebc84SBrian Foster 	 */
208076560669SDave Chinner 	if (list_empty(&trans->r_itemq)) {
2081848ccfc8SBrian Foster 		ASSERT(len <= sizeof(struct xfs_trans_header));
208289cebc84SBrian Foster 		if (len > sizeof(struct xfs_trans_header)) {
208389cebc84SBrian Foster 			xfs_warn(log->l_mp, "%s: bad header length", __func__);
2084895e196fSDarrick J. Wong 			return -EFSCORRUPTED;
208589cebc84SBrian Foster 		}
208689cebc84SBrian Foster 
208776560669SDave Chinner 		xlog_recover_add_item(&trans->r_itemq);
2088b2a922cdSChristoph Hellwig 		ptr = (char *)&trans->r_theader +
208989cebc84SBrian Foster 				sizeof(struct xfs_trans_header) - len;
209076560669SDave Chinner 		memcpy(ptr, dp, len);
20911da177e4SLinus Torvalds 		return 0;
20921da177e4SLinus Torvalds 	}
209389cebc84SBrian Foster 
209476560669SDave Chinner 	/* take the tail entry */
209535f4521fSDarrick J. Wong 	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
209635f4521fSDarrick J. Wong 			  ri_list);
209776560669SDave Chinner 
209876560669SDave Chinner 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
209976560669SDave Chinner 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
210076560669SDave Chinner 
2101de2860f4SDave Chinner 	ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
2102de2860f4SDave Chinner 	if (!ptr)
2103de2860f4SDave Chinner 		return -ENOMEM;
210476560669SDave Chinner 	memcpy(&ptr[old_len], dp, len);
210576560669SDave Chinner 	item->ri_buf[item->ri_cnt-1].i_len += len;
210676560669SDave Chinner 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
210776560669SDave Chinner 	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
210876560669SDave Chinner 	return 0;
210976560669SDave Chinner }
211076560669SDave Chinner 
211176560669SDave Chinner /*
211276560669SDave Chinner  * The next region to add is the start of a new region.  It could be
211376560669SDave Chinner  * a whole region or it could be the first part of a new region.  Because
211476560669SDave Chinner  * of this, the assumption here is that the type and size fields of all
211576560669SDave Chinner  * format structures fit into the first 32 bits of the structure.
211676560669SDave Chinner  *
211776560669SDave Chinner  * This works because all regions must be 32 bit aligned.  Therefore, we
211876560669SDave Chinner  * either have both fields or we have neither field.  In the case we have
211976560669SDave Chinner  * neither field, the data part of the region is zero length.  We only have
212076560669SDave Chinner  * a log_op_header and can throw away the header since a new one will appear
212176560669SDave Chinner  * later.  If we have at least 4 bytes, then we can determine how many regions
212276560669SDave Chinner  * will appear in the current log item.
212376560669SDave Chinner  */
212476560669SDave Chinner STATIC int
xlog_recover_add_to_trans(struct xlog * log,struct xlog_recover * trans,char * dp,int len)212576560669SDave Chinner xlog_recover_add_to_trans(
212676560669SDave Chinner 	struct xlog		*log,
212776560669SDave Chinner 	struct xlog_recover	*trans,
2128b2a922cdSChristoph Hellwig 	char			*dp,
212976560669SDave Chinner 	int			len)
213076560669SDave Chinner {
213106b11321SDarrick J. Wong 	struct xfs_inode_log_format	*in_f;			/* any will do */
213235f4521fSDarrick J. Wong 	struct xlog_recover_item *item;
2133b2a922cdSChristoph Hellwig 	char			*ptr;
213476560669SDave Chinner 
213576560669SDave Chinner 	if (!len)
213676560669SDave Chinner 		return 0;
213776560669SDave Chinner 	if (list_empty(&trans->r_itemq)) {
213876560669SDave Chinner 		/* we need to catch log corruptions here */
213976560669SDave Chinner 		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
214076560669SDave Chinner 			xfs_warn(log->l_mp, "%s: bad header magic number",
214176560669SDave Chinner 				__func__);
214276560669SDave Chinner 			ASSERT(0);
2143895e196fSDarrick J. Wong 			return -EFSCORRUPTED;
214476560669SDave Chinner 		}
214589cebc84SBrian Foster 
214689cebc84SBrian Foster 		if (len > sizeof(struct xfs_trans_header)) {
214789cebc84SBrian Foster 			xfs_warn(log->l_mp, "%s: bad header length", __func__);
214889cebc84SBrian Foster 			ASSERT(0);
2149895e196fSDarrick J. Wong 			return -EFSCORRUPTED;
215089cebc84SBrian Foster 		}
215189cebc84SBrian Foster 
215289cebc84SBrian Foster 		/*
215389cebc84SBrian Foster 		 * The transaction header can be arbitrarily split across op
215489cebc84SBrian Foster 		 * records. If we don't have the whole thing here, copy what we
215589cebc84SBrian Foster 		 * do have and handle the rest in the next record.
215689cebc84SBrian Foster 		 */
215789cebc84SBrian Foster 		if (len == sizeof(struct xfs_trans_header))
215876560669SDave Chinner 			xlog_recover_add_item(&trans->r_itemq);
215976560669SDave Chinner 		memcpy(&trans->r_theader, dp, len);
216076560669SDave Chinner 		return 0;
216176560669SDave Chinner 	}
216276560669SDave Chinner 
2163707e0ddaSTetsuo Handa 	ptr = kmem_alloc(len, 0);
216476560669SDave Chinner 	memcpy(ptr, dp, len);
216506b11321SDarrick J. Wong 	in_f = (struct xfs_inode_log_format *)ptr;
216676560669SDave Chinner 
216776560669SDave Chinner 	/* take the tail entry */
216835f4521fSDarrick J. Wong 	item = list_entry(trans->r_itemq.prev, struct xlog_recover_item,
216935f4521fSDarrick J. Wong 			  ri_list);
217076560669SDave Chinner 	if (item->ri_total != 0 &&
217176560669SDave Chinner 	     item->ri_total == item->ri_cnt) {
217276560669SDave Chinner 		/* tail item is in use, get a new one */
217376560669SDave Chinner 		xlog_recover_add_item(&trans->r_itemq);
217476560669SDave Chinner 		item = list_entry(trans->r_itemq.prev,
217535f4521fSDarrick J. Wong 					struct xlog_recover_item, ri_list);
217676560669SDave Chinner 	}
217776560669SDave Chinner 
217876560669SDave Chinner 	if (item->ri_total == 0) {		/* first region to be added */
217976560669SDave Chinner 		if (in_f->ilf_size == 0 ||
218076560669SDave Chinner 		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
218176560669SDave Chinner 			xfs_warn(log->l_mp,
218276560669SDave Chinner 		"bad number of regions (%d) in inode log format",
218376560669SDave Chinner 				  in_f->ilf_size);
218476560669SDave Chinner 			ASSERT(0);
218576560669SDave Chinner 			kmem_free(ptr);
2186895e196fSDarrick J. Wong 			return -EFSCORRUPTED;
218776560669SDave Chinner 		}
218876560669SDave Chinner 
218976560669SDave Chinner 		item->ri_total = in_f->ilf_size;
219076560669SDave Chinner 		item->ri_buf =
219176560669SDave Chinner 			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
2192707e0ddaSTetsuo Handa 				    0);
219376560669SDave Chinner 	}
2194d6abecb8SDarrick J. Wong 
2195d6abecb8SDarrick J. Wong 	if (item->ri_total <= item->ri_cnt) {
2196d6abecb8SDarrick J. Wong 		xfs_warn(log->l_mp,
2197d6abecb8SDarrick J. Wong 	"log item region count (%d) overflowed size (%d)",
2198d6abecb8SDarrick J. Wong 				item->ri_cnt, item->ri_total);
2199d6abecb8SDarrick J. Wong 		ASSERT(0);
2200d6abecb8SDarrick J. Wong 		kmem_free(ptr);
2201d6abecb8SDarrick J. Wong 		return -EFSCORRUPTED;
2202d6abecb8SDarrick J. Wong 	}
2203d6abecb8SDarrick J. Wong 
220476560669SDave Chinner 	/* Description region is ri_buf[0] */
220576560669SDave Chinner 	item->ri_buf[item->ri_cnt].i_addr = ptr;
220676560669SDave Chinner 	item->ri_buf[item->ri_cnt].i_len  = len;
220776560669SDave Chinner 	item->ri_cnt++;
220876560669SDave Chinner 	trace_xfs_log_recover_item_add(log, trans, item, 0);
220976560669SDave Chinner 	return 0;
221076560669SDave Chinner }
2211b818cca1SDave Chinner 
221276560669SDave Chinner /*
221376560669SDave Chinner  * Free up any resources allocated by the transaction
221476560669SDave Chinner  *
221576560669SDave Chinner  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
221676560669SDave Chinner  */
221776560669SDave Chinner STATIC void
xlog_recover_free_trans(struct xlog_recover * trans)221876560669SDave Chinner xlog_recover_free_trans(
221976560669SDave Chinner 	struct xlog_recover	*trans)
222076560669SDave Chinner {
222135f4521fSDarrick J. Wong 	struct xlog_recover_item *item, *n;
222276560669SDave Chinner 	int			i;
222376560669SDave Chinner 
222439775431SBrian Foster 	hlist_del_init(&trans->r_list);
222539775431SBrian Foster 
222676560669SDave Chinner 	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
222776560669SDave Chinner 		/* Free the regions in the item. */
222876560669SDave Chinner 		list_del(&item->ri_list);
222976560669SDave Chinner 		for (i = 0; i < item->ri_cnt; i++)
223076560669SDave Chinner 			kmem_free(item->ri_buf[i].i_addr);
223176560669SDave Chinner 		/* Free the item itself */
223276560669SDave Chinner 		kmem_free(item->ri_buf);
223376560669SDave Chinner 		kmem_free(item);
223476560669SDave Chinner 	}
223576560669SDave Chinner 	/* Free the transaction recover structure */
223676560669SDave Chinner 	kmem_free(trans);
223776560669SDave Chinner }
223876560669SDave Chinner 
2239e9131e50SDave Chinner /*
2240e9131e50SDave Chinner  * On error or completion, trans is freed.
2241e9131e50SDave Chinner  */
22421da177e4SLinus Torvalds STATIC int
xlog_recovery_process_trans(struct xlog * log,struct xlog_recover * trans,char * dp,unsigned int len,unsigned int flags,int pass,struct list_head * buffer_list)2243eeb11688SDave Chinner xlog_recovery_process_trans(
2244eeb11688SDave Chinner 	struct xlog		*log,
2245eeb11688SDave Chinner 	struct xlog_recover	*trans,
2246b2a922cdSChristoph Hellwig 	char			*dp,
2247eeb11688SDave Chinner 	unsigned int		len,
2248eeb11688SDave Chinner 	unsigned int		flags,
224912818d24SBrian Foster 	int			pass,
225012818d24SBrian Foster 	struct list_head	*buffer_list)
22511da177e4SLinus Torvalds {
2252e9131e50SDave Chinner 	int			error = 0;
2253e9131e50SDave Chinner 	bool			freeit = false;
2254eeb11688SDave Chinner 
2255eeb11688SDave Chinner 	/* mask off ophdr transaction container flags */
2256eeb11688SDave Chinner 	flags &= ~XLOG_END_TRANS;
2257eeb11688SDave Chinner 	if (flags & XLOG_WAS_CONT_TRANS)
2258eeb11688SDave Chinner 		flags &= ~XLOG_CONTINUE_TRANS;
2259eeb11688SDave Chinner 
226088b863dbSDave Chinner 	/*
226188b863dbSDave Chinner 	 * Callees must not free the trans structure. We'll decide if we need to
226288b863dbSDave Chinner 	 * free it or not based on the operation being done and it's result.
226388b863dbSDave Chinner 	 */
2264eeb11688SDave Chinner 	switch (flags) {
2265eeb11688SDave Chinner 	/* expected flag values */
2266eeb11688SDave Chinner 	case 0:
2267eeb11688SDave Chinner 	case XLOG_CONTINUE_TRANS:
2268eeb11688SDave Chinner 		error = xlog_recover_add_to_trans(log, trans, dp, len);
2269eeb11688SDave Chinner 		break;
2270eeb11688SDave Chinner 	case XLOG_WAS_CONT_TRANS:
2271eeb11688SDave Chinner 		error = xlog_recover_add_to_cont_trans(log, trans, dp, len);
2272eeb11688SDave Chinner 		break;
2273eeb11688SDave Chinner 	case XLOG_COMMIT_TRANS:
227412818d24SBrian Foster 		error = xlog_recover_commit_trans(log, trans, pass,
227512818d24SBrian Foster 						  buffer_list);
227688b863dbSDave Chinner 		/* success or fail, we are now done with this transaction. */
227788b863dbSDave Chinner 		freeit = true;
2278eeb11688SDave Chinner 		break;
2279eeb11688SDave Chinner 
2280eeb11688SDave Chinner 	/* unexpected flag values */
2281eeb11688SDave Chinner 	case XLOG_UNMOUNT_TRANS:
2282e9131e50SDave Chinner 		/* just skip trans */
22831da177e4SLinus Torvalds 		xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2284e9131e50SDave Chinner 		freeit = true;
2285eeb11688SDave Chinner 		break;
2286eeb11688SDave Chinner 	case XLOG_START_TRANS:
2287eeb11688SDave Chinner 	default:
2288eeb11688SDave Chinner 		xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags);
2289eeb11688SDave Chinner 		ASSERT(0);
2290895e196fSDarrick J. Wong 		error = -EFSCORRUPTED;
2291eeb11688SDave Chinner 		break;
2292eeb11688SDave Chinner 	}
2293e9131e50SDave Chinner 	if (error || freeit)
2294e9131e50SDave Chinner 		xlog_recover_free_trans(trans);
2295eeb11688SDave Chinner 	return error;
2296eeb11688SDave Chinner }
2297eeb11688SDave Chinner 
2298b818cca1SDave Chinner /*
2299b818cca1SDave Chinner  * Lookup the transaction recovery structure associated with the ID in the
2300b818cca1SDave Chinner  * current ophdr. If the transaction doesn't exist and the start flag is set in
2301b818cca1SDave Chinner  * the ophdr, then allocate a new transaction for future ID matches to find.
2302b818cca1SDave Chinner  * Either way, return what we found during the lookup - an existing transaction
2303b818cca1SDave Chinner  * or nothing.
2304b818cca1SDave Chinner  */
2305eeb11688SDave Chinner STATIC struct xlog_recover *
xlog_recover_ophdr_to_trans(struct hlist_head rhash[],struct xlog_rec_header * rhead,struct xlog_op_header * ohead)2306eeb11688SDave Chinner xlog_recover_ophdr_to_trans(
2307eeb11688SDave Chinner 	struct hlist_head	rhash[],
2308eeb11688SDave Chinner 	struct xlog_rec_header	*rhead,
2309eeb11688SDave Chinner 	struct xlog_op_header	*ohead)
2310eeb11688SDave Chinner {
2311eeb11688SDave Chinner 	struct xlog_recover	*trans;
2312eeb11688SDave Chinner 	xlog_tid_t		tid;
2313eeb11688SDave Chinner 	struct hlist_head	*rhp;
2314eeb11688SDave Chinner 
2315eeb11688SDave Chinner 	tid = be32_to_cpu(ohead->oh_tid);
2316eeb11688SDave Chinner 	rhp = &rhash[XLOG_RHASH(tid)];
2317b818cca1SDave Chinner 	hlist_for_each_entry(trans, rhp, r_list) {
2318b818cca1SDave Chinner 		if (trans->r_log_tid == tid)
2319eeb11688SDave Chinner 			return trans;
2320b818cca1SDave Chinner 	}
2321eeb11688SDave Chinner 
2322eeb11688SDave Chinner 	/*
2323b818cca1SDave Chinner 	 * skip over non-start transaction headers - we could be
2324b818cca1SDave Chinner 	 * processing slack space before the next transaction starts
2325eeb11688SDave Chinner 	 */
2326b818cca1SDave Chinner 	if (!(ohead->oh_flags & XLOG_START_TRANS))
2327b818cca1SDave Chinner 		return NULL;
2328b818cca1SDave Chinner 
2329eeb11688SDave Chinner 	ASSERT(be32_to_cpu(ohead->oh_len) == 0);
2330b818cca1SDave Chinner 
2331b818cca1SDave Chinner 	/*
2332b818cca1SDave Chinner 	 * This is a new transaction so allocate a new recovery container to
2333b818cca1SDave Chinner 	 * hold the recovery ops that will follow.
2334b818cca1SDave Chinner 	 */
2335707e0ddaSTetsuo Handa 	trans = kmem_zalloc(sizeof(struct xlog_recover), 0);
2336b818cca1SDave Chinner 	trans->r_log_tid = tid;
2337b818cca1SDave Chinner 	trans->r_lsn = be64_to_cpu(rhead->h_lsn);
2338b818cca1SDave Chinner 	INIT_LIST_HEAD(&trans->r_itemq);
2339b818cca1SDave Chinner 	INIT_HLIST_NODE(&trans->r_list);
2340b818cca1SDave Chinner 	hlist_add_head(&trans->r_list, rhp);
2341b818cca1SDave Chinner 
2342b818cca1SDave Chinner 	/*
2343b818cca1SDave Chinner 	 * Nothing more to do for this ophdr. Items to be added to this new
2344b818cca1SDave Chinner 	 * transaction will be in subsequent ophdr containers.
2345b818cca1SDave Chinner 	 */
2346eeb11688SDave Chinner 	return NULL;
2347eeb11688SDave Chinner }
2348eeb11688SDave Chinner 
2349eeb11688SDave Chinner STATIC int
xlog_recover_process_ophdr(struct xlog * log,struct hlist_head rhash[],struct xlog_rec_header * rhead,struct xlog_op_header * ohead,char * dp,char * end,int pass,struct list_head * buffer_list)2350eeb11688SDave Chinner xlog_recover_process_ophdr(
2351eeb11688SDave Chinner 	struct xlog		*log,
2352eeb11688SDave Chinner 	struct hlist_head	rhash[],
2353eeb11688SDave Chinner 	struct xlog_rec_header	*rhead,
2354eeb11688SDave Chinner 	struct xlog_op_header	*ohead,
2355b2a922cdSChristoph Hellwig 	char			*dp,
2356b2a922cdSChristoph Hellwig 	char			*end,
235712818d24SBrian Foster 	int			pass,
235812818d24SBrian Foster 	struct list_head	*buffer_list)
2359eeb11688SDave Chinner {
2360eeb11688SDave Chinner 	struct xlog_recover	*trans;
2361eeb11688SDave Chinner 	unsigned int		len;
236212818d24SBrian Foster 	int			error;
2363eeb11688SDave Chinner 
2364eeb11688SDave Chinner 	/* Do we understand who wrote this op? */
2365eeb11688SDave Chinner 	if (ohead->oh_clientid != XFS_TRANSACTION &&
2366eeb11688SDave Chinner 	    ohead->oh_clientid != XFS_LOG) {
2367eeb11688SDave Chinner 		xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2368eeb11688SDave Chinner 			__func__, ohead->oh_clientid);
2369eeb11688SDave Chinner 		ASSERT(0);
2370895e196fSDarrick J. Wong 		return -EFSCORRUPTED;
2371eeb11688SDave Chinner 	}
2372eeb11688SDave Chinner 
2373eeb11688SDave Chinner 	/*
2374eeb11688SDave Chinner 	 * Check the ophdr contains all the data it is supposed to contain.
2375eeb11688SDave Chinner 	 */
2376eeb11688SDave Chinner 	len = be32_to_cpu(ohead->oh_len);
2377eeb11688SDave Chinner 	if (dp + len > end) {
2378eeb11688SDave Chinner 		xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len);
2379eeb11688SDave Chinner 		WARN_ON(1);
2380895e196fSDarrick J. Wong 		return -EFSCORRUPTED;
2381eeb11688SDave Chinner 	}
2382eeb11688SDave Chinner 
2383eeb11688SDave Chinner 	trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead);
2384eeb11688SDave Chinner 	if (!trans) {
2385eeb11688SDave Chinner 		/* nothing to do, so skip over this ophdr */
23861da177e4SLinus Torvalds 		return 0;
23871da177e4SLinus Torvalds 	}
23881da177e4SLinus Torvalds 
238912818d24SBrian Foster 	/*
239012818d24SBrian Foster 	 * The recovered buffer queue is drained only once we know that all
239112818d24SBrian Foster 	 * recovery items for the current LSN have been processed. This is
239212818d24SBrian Foster 	 * required because:
239312818d24SBrian Foster 	 *
239412818d24SBrian Foster 	 * - Buffer write submission updates the metadata LSN of the buffer.
239512818d24SBrian Foster 	 * - Log recovery skips items with a metadata LSN >= the current LSN of
239612818d24SBrian Foster 	 *   the recovery item.
239712818d24SBrian Foster 	 * - Separate recovery items against the same metadata buffer can share
239812818d24SBrian Foster 	 *   a current LSN. I.e., consider that the LSN of a recovery item is
239912818d24SBrian Foster 	 *   defined as the starting LSN of the first record in which its
240012818d24SBrian Foster 	 *   transaction appears, that a record can hold multiple transactions,
240112818d24SBrian Foster 	 *   and/or that a transaction can span multiple records.
240212818d24SBrian Foster 	 *
240312818d24SBrian Foster 	 * In other words, we are allowed to submit a buffer from log recovery
240412818d24SBrian Foster 	 * once per current LSN. Otherwise, we may incorrectly skip recovery
240512818d24SBrian Foster 	 * items and cause corruption.
240612818d24SBrian Foster 	 *
240712818d24SBrian Foster 	 * We don't know up front whether buffers are updated multiple times per
240812818d24SBrian Foster 	 * LSN. Therefore, track the current LSN of each commit log record as it
240912818d24SBrian Foster 	 * is processed and drain the queue when it changes. Use commit records
241012818d24SBrian Foster 	 * because they are ordered correctly by the logging code.
241112818d24SBrian Foster 	 */
241212818d24SBrian Foster 	if (log->l_recovery_lsn != trans->r_lsn &&
241312818d24SBrian Foster 	    ohead->oh_flags & XLOG_COMMIT_TRANS) {
241412818d24SBrian Foster 		error = xfs_buf_delwri_submit(buffer_list);
241512818d24SBrian Foster 		if (error)
241612818d24SBrian Foster 			return error;
241712818d24SBrian Foster 		log->l_recovery_lsn = trans->r_lsn;
241812818d24SBrian Foster 	}
241912818d24SBrian Foster 
2420e9131e50SDave Chinner 	return xlog_recovery_process_trans(log, trans, dp, len,
242112818d24SBrian Foster 					   ohead->oh_flags, pass, buffer_list);
2422eeb11688SDave Chinner }
24231da177e4SLinus Torvalds 
24241da177e4SLinus Torvalds /*
24251da177e4SLinus Torvalds  * There are two valid states of the r_state field.  0 indicates that the
24261da177e4SLinus Torvalds  * transaction structure is in a normal state.  We have either seen the
24271da177e4SLinus Torvalds  * start of the transaction or the last operation we added was not a partial
24281da177e4SLinus Torvalds  * operation.  If the last operation we added to the transaction was a
24291da177e4SLinus Torvalds  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
24301da177e4SLinus Torvalds  *
24311da177e4SLinus Torvalds  * NOTE: skip LRs with 0 data length.
24321da177e4SLinus Torvalds  */
24331da177e4SLinus Torvalds STATIC int
xlog_recover_process_data(struct xlog * log,struct hlist_head rhash[],struct xlog_rec_header * rhead,char * dp,int pass,struct list_head * buffer_list)24341da177e4SLinus Torvalds xlog_recover_process_data(
24359a8d2fdbSMark Tinguely 	struct xlog		*log,
2436f0a76953SDave Chinner 	struct hlist_head	rhash[],
24379a8d2fdbSMark Tinguely 	struct xlog_rec_header	*rhead,
2438b2a922cdSChristoph Hellwig 	char			*dp,
243912818d24SBrian Foster 	int			pass,
244012818d24SBrian Foster 	struct list_head	*buffer_list)
24411da177e4SLinus Torvalds {
2442eeb11688SDave Chinner 	struct xlog_op_header	*ohead;
2443b2a922cdSChristoph Hellwig 	char			*end;
24441da177e4SLinus Torvalds 	int			num_logops;
24451da177e4SLinus Torvalds 	int			error;
24461da177e4SLinus Torvalds 
2447eeb11688SDave Chinner 	end = dp + be32_to_cpu(rhead->h_len);
2448b53e675dSChristoph Hellwig 	num_logops = be32_to_cpu(rhead->h_num_logops);
24491da177e4SLinus Torvalds 
24501da177e4SLinus Torvalds 	/* check the log format matches our own - else we can't recover */
24511da177e4SLinus Torvalds 	if (xlog_header_check_recover(log->l_mp, rhead))
24522451337dSDave Chinner 		return -EIO;
24531da177e4SLinus Torvalds 
24545cd9cee9SBrian Foster 	trace_xfs_log_recover_record(log, rhead, pass);
2455eeb11688SDave Chinner 	while ((dp < end) && num_logops) {
2456eeb11688SDave Chinner 
2457eeb11688SDave Chinner 		ohead = (struct xlog_op_header *)dp;
2458eeb11688SDave Chinner 		dp += sizeof(*ohead);
2459*7cd9f0a3Slei lu 		if (dp > end) {
2460*7cd9f0a3Slei lu 			xfs_warn(log->l_mp, "%s: op header overrun", __func__);
2461*7cd9f0a3Slei lu 			return -EFSCORRUPTED;
2462*7cd9f0a3Slei lu 		}
2463eeb11688SDave Chinner 
2464eeb11688SDave Chinner 		/* errors will abort recovery */
2465eeb11688SDave Chinner 		error = xlog_recover_process_ophdr(log, rhash, rhead, ohead,
246612818d24SBrian Foster 						   dp, end, pass, buffer_list);
2467eeb11688SDave Chinner 		if (error)
24681da177e4SLinus Torvalds 			return error;
2469eeb11688SDave Chinner 
247067fcb7bfSChristoph Hellwig 		dp += be32_to_cpu(ohead->oh_len);
24711da177e4SLinus Torvalds 		num_logops--;
24721da177e4SLinus Torvalds 	}
24731da177e4SLinus Torvalds 	return 0;
24741da177e4SLinus Torvalds }
24751da177e4SLinus Torvalds 
247650995582SDarrick J. Wong /* Take all the collected deferred ops and finish them in order. */
247750995582SDarrick J. Wong static int
xlog_finish_defer_ops(struct xfs_mount * mp,struct list_head * capture_list)247850995582SDarrick J. Wong xlog_finish_defer_ops(
2479e6fff81eSDarrick J. Wong 	struct xfs_mount	*mp,
2480e6fff81eSDarrick J. Wong 	struct list_head	*capture_list)
248150995582SDarrick J. Wong {
2482e6fff81eSDarrick J. Wong 	struct xfs_defer_capture *dfc, *next;
248350995582SDarrick J. Wong 	struct xfs_trans	*tp;
2484e6fff81eSDarrick J. Wong 	int			error = 0;
248550995582SDarrick J. Wong 
2486e6fff81eSDarrick J. Wong 	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2487929b92f6SDarrick J. Wong 		struct xfs_trans_res	resv;
2488512edfacSDarrick J. Wong 		struct xfs_defer_resources dres;
2489929b92f6SDarrick J. Wong 
2490929b92f6SDarrick J. Wong 		/*
2491929b92f6SDarrick J. Wong 		 * Create a new transaction reservation from the captured
2492929b92f6SDarrick J. Wong 		 * information.  Set logcount to 1 to force the new transaction
2493929b92f6SDarrick J. Wong 		 * to regrant every roll so that we can make forward progress
2494929b92f6SDarrick J. Wong 		 * in recovery no matter how full the log might be.
2495929b92f6SDarrick J. Wong 		 */
2496929b92f6SDarrick J. Wong 		resv.tr_logres = dfc->dfc_logres;
2497929b92f6SDarrick J. Wong 		resv.tr_logcount = 1;
2498929b92f6SDarrick J. Wong 		resv.tr_logflags = XFS_TRANS_PERM_LOG_RES;
2499929b92f6SDarrick J. Wong 
2500929b92f6SDarrick J. Wong 		error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
2501929b92f6SDarrick J. Wong 				dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
25024e6b8270SDarrick J. Wong 		if (error) {
2503b5f17becSDave Chinner 			xlog_force_shutdown(mp->m_log, SHUTDOWN_LOG_IO_ERROR);
250450995582SDarrick J. Wong 			return error;
25054e6b8270SDarrick J. Wong 		}
250650995582SDarrick J. Wong 
2507e6fff81eSDarrick J. Wong 		/*
2508e6fff81eSDarrick J. Wong 		 * Transfer to this new transaction all the dfops we captured
2509e6fff81eSDarrick J. Wong 		 * from recovering a single intent item.
2510e6fff81eSDarrick J. Wong 		 */
2511e6fff81eSDarrick J. Wong 		list_del_init(&dfc->dfc_list);
2512512edfacSDarrick J. Wong 		xfs_defer_ops_continue(dfc, tp, &dres);
2513e6fff81eSDarrick J. Wong 		error = xfs_trans_commit(tp);
2514512edfacSDarrick J. Wong 		xfs_defer_resources_rele(&dres);
2515e6fff81eSDarrick J. Wong 		if (error)
2516e6fff81eSDarrick J. Wong 			return error;
251750995582SDarrick J. Wong 	}
251850995582SDarrick J. Wong 
2519e6fff81eSDarrick J. Wong 	ASSERT(list_empty(capture_list));
2520e6fff81eSDarrick J. Wong 	return 0;
2521e6fff81eSDarrick J. Wong }
2522e6fff81eSDarrick J. Wong 
2523e6fff81eSDarrick J. Wong /* Release all the captured defer ops and capture structures in this list. */
2524e6fff81eSDarrick J. Wong static void
xlog_abort_defer_ops(struct xfs_mount * mp,struct list_head * capture_list)2525e6fff81eSDarrick J. Wong xlog_abort_defer_ops(
2526e6fff81eSDarrick J. Wong 	struct xfs_mount		*mp,
2527e6fff81eSDarrick J. Wong 	struct list_head		*capture_list)
2528e6fff81eSDarrick J. Wong {
2529e6fff81eSDarrick J. Wong 	struct xfs_defer_capture	*dfc;
2530e6fff81eSDarrick J. Wong 	struct xfs_defer_capture	*next;
2531e6fff81eSDarrick J. Wong 
2532e6fff81eSDarrick J. Wong 	list_for_each_entry_safe(dfc, next, capture_list, dfc_list) {
2533e6fff81eSDarrick J. Wong 		list_del_init(&dfc->dfc_list);
2534005be668SLong Li 		xfs_defer_ops_capture_abort(mp, dfc);
2535e6fff81eSDarrick J. Wong 	}
2536e6fff81eSDarrick J. Wong }
2537ab9c81efSDave Chinner 
2538dc42375dSDarrick J. Wong /*
2539dc42375dSDarrick J. Wong  * When this is called, all of the log intent items which did not have
2540ab9c81efSDave Chinner  * corresponding log done items should be in the AIL.  What we do now is update
2541ab9c81efSDave Chinner  * the data structures associated with each one.
2542dc42375dSDarrick J. Wong  *
2543ab9c81efSDave Chinner  * Since we process the log intent items in normal transactions, they will be
2544ab9c81efSDave Chinner  * removed at some point after the commit.  This prevents us from just walking
2545ab9c81efSDave Chinner  * down the list processing each one.  We'll use a flag in the intent item to
2546ab9c81efSDave Chinner  * skip those that we've already processed and use the AIL iteration mechanism's
2547ab9c81efSDave Chinner  * generation count to try to speed this up at least a bit.
2548dc42375dSDarrick J. Wong  *
2549ab9c81efSDave Chinner  * When we start, we know that the intents are the only things in the AIL. As we
2550ab9c81efSDave Chinner  * process them, however, other items are added to the AIL. Hence we know we
2551ab9c81efSDave Chinner  * have started recovery on all the pending intents when we find an non-intent
2552ab9c81efSDave Chinner  * item in the AIL.
2553dc42375dSDarrick J. Wong  */
2554dc42375dSDarrick J. Wong STATIC int
xlog_recover_process_intents(struct xlog * log)2555dc42375dSDarrick J. Wong xlog_recover_process_intents(
2556dc42375dSDarrick J. Wong 	struct xlog			*log)
2557dc42375dSDarrick J. Wong {
2558e6fff81eSDarrick J. Wong 	LIST_HEAD(capture_list);
2559cd3c2cf3SDarrick J. Wong 	struct xfs_defer_pending	*dfp, *n;
2560e6fff81eSDarrick J. Wong 	int				error = 0;
25617bf7a193SDarrick J. Wong #if defined(DEBUG) || defined(XFS_WARN)
2562dc42375dSDarrick J. Wong 	xfs_lsn_t			last_lsn;
2563dc42375dSDarrick J. Wong 
2564dc42375dSDarrick J. Wong 	last_lsn = xlog_assign_lsn(log->l_curr_cycle, log->l_curr_block);
25657bf7a193SDarrick J. Wong #endif
256697cf7967SDarrick J. Wong 
2567cd3c2cf3SDarrick J. Wong 	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2568cd3c2cf3SDarrick J. Wong 		struct xfs_log_item	*lip = dfp->dfp_intent;
2569cd3c2cf3SDarrick J. Wong 		const struct xfs_item_ops *ops = lip->li_ops;
2570cd3c2cf3SDarrick J. Wong 
2571cd3c2cf3SDarrick J. Wong 		ASSERT(xlog_item_is_intent(lip));
2572dc42375dSDarrick J. Wong 
2573dc42375dSDarrick J. Wong 		/*
2574dc42375dSDarrick J. Wong 		 * We should never see a redo item with a LSN higher than
2575dc42375dSDarrick J. Wong 		 * the last transaction we found in the log at the start
2576dc42375dSDarrick J. Wong 		 * of recovery.
2577dc42375dSDarrick J. Wong 		 */
2578dc42375dSDarrick J. Wong 		ASSERT(XFS_LSN_CMP(last_lsn, lip->li_lsn) >= 0);
2579dc42375dSDarrick J. Wong 
258050995582SDarrick J. Wong 		/*
258150995582SDarrick J. Wong 		 * NOTE: If your intent processing routine can create more
2582e6fff81eSDarrick J. Wong 		 * deferred ops, you /must/ attach them to the capture list in
2583e6fff81eSDarrick J. Wong 		 * the recover routine or else those subsequent intents will be
258450995582SDarrick J. Wong 		 * replayed in the wrong order!
258597cf7967SDarrick J. Wong 		 *
258697cf7967SDarrick J. Wong 		 * The recovery function can free the log item, so we must not
258797cf7967SDarrick J. Wong 		 * access lip after it returns.
258850995582SDarrick J. Wong 		 */
258987db24c8SDarrick J. Wong 		error = ops->iop_recover(dfp, &capture_list);
259063370326SDarrick J. Wong 		if (error) {
259163370326SDarrick J. Wong 			trace_xlog_intent_recovery_failed(log->l_mp, error,
259297cf7967SDarrick J. Wong 					ops->iop_recover);
2593e6fff81eSDarrick J. Wong 			break;
25941da177e4SLinus Torvalds 		}
2595e6fff81eSDarrick J. Wong 
2596cd3c2cf3SDarrick J. Wong 		xfs_defer_cancel_recovery(log->l_mp, dfp);
2597cd3c2cf3SDarrick J. Wong 	}
2598e6fff81eSDarrick J. Wong 	if (error)
2599e6fff81eSDarrick J. Wong 		goto err;
260050995582SDarrick J. Wong 
2601e6fff81eSDarrick J. Wong 	error = xlog_finish_defer_ops(log->l_mp, &capture_list);
2602e6fff81eSDarrick J. Wong 	if (error)
2603e6fff81eSDarrick J. Wong 		goto err;
2604e6fff81eSDarrick J. Wong 
2605e6fff81eSDarrick J. Wong 	return 0;
2606e6fff81eSDarrick J. Wong err:
2607e6fff81eSDarrick J. Wong 	xlog_abort_defer_ops(log->l_mp, &capture_list);
26083c1e2bbeSDavid Chinner 	return error;
26091da177e4SLinus Torvalds }
26101da177e4SLinus Torvalds 
26111da177e4SLinus Torvalds /*
2612ab9c81efSDave Chinner  * A cancel occurs when the mount has failed and we're bailing out.  Release all
2613ab9c81efSDave Chinner  * pending log intent items that we haven't started recovery on so they don't
2614ab9c81efSDave Chinner  * pin the AIL.
2615f0b2efadSBrian Foster  */
2616a7a9250eSHariprasad Kelam STATIC void
xlog_recover_cancel_intents(struct xlog * log)2617dc42375dSDarrick J. Wong xlog_recover_cancel_intents(
2618f0b2efadSBrian Foster 	struct xlog			*log)
2619f0b2efadSBrian Foster {
2620cd3c2cf3SDarrick J. Wong 	struct xfs_defer_pending	*dfp, *n;
2621f0b2efadSBrian Foster 
2622cd3c2cf3SDarrick J. Wong 	list_for_each_entry_safe(dfp, n, &log->r_dfops, dfp_list) {
2623cd3c2cf3SDarrick J. Wong 		ASSERT(xlog_item_is_intent(dfp->dfp_intent));
2624f0b2efadSBrian Foster 
2625cd3c2cf3SDarrick J. Wong 		xfs_defer_cancel_recovery(log->l_mp, dfp);
2626f0b2efadSBrian Foster 	}
2627f0b2efadSBrian Foster }
2628f0b2efadSBrian Foster 
2629f0b2efadSBrian Foster /*
2630680776e5SDarrick J. Wong  * Transfer ownership of the recovered log intent item to the recovery
2631680776e5SDarrick J. Wong  * transaction.
2632680776e5SDarrick J. Wong  */
2633680776e5SDarrick J. Wong void
xlog_recover_transfer_intent(struct xfs_trans * tp,struct xfs_defer_pending * dfp)2634680776e5SDarrick J. Wong xlog_recover_transfer_intent(
2635680776e5SDarrick J. Wong 	struct xfs_trans		*tp,
2636680776e5SDarrick J. Wong 	struct xfs_defer_pending	*dfp)
2637680776e5SDarrick J. Wong {
2638680776e5SDarrick J. Wong 	dfp->dfp_intent = NULL;
2639680776e5SDarrick J. Wong }
2640680776e5SDarrick J. Wong 
2641680776e5SDarrick J. Wong /*
26421da177e4SLinus Torvalds  * This routine performs a transaction to null out a bad inode pointer
26431da177e4SLinus Torvalds  * in an agi unlinked inode hash bucket.
26441da177e4SLinus Torvalds  */
26451da177e4SLinus Torvalds STATIC void
xlog_recover_clear_agi_bucket(struct xfs_perag * pag,int bucket)26461da177e4SLinus Torvalds xlog_recover_clear_agi_bucket(
264761021debSDave Chinner 	struct xfs_perag	*pag,
26481da177e4SLinus Torvalds 	int			bucket)
26491da177e4SLinus Torvalds {
265061021debSDave Chinner 	struct xfs_mount	*mp = pag->pag_mount;
265161021debSDave Chinner 	struct xfs_trans	*tp;
265261021debSDave Chinner 	struct xfs_agi		*agi;
2653e8222613SDave Chinner 	struct xfs_buf		*agibp;
26541da177e4SLinus Torvalds 	int			offset;
26551da177e4SLinus Torvalds 	int			error;
26561da177e4SLinus Torvalds 
2657253f4911SChristoph Hellwig 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_clearagi, 0, 0, 0, &tp);
2658e5720eecSDavid Chinner 	if (error)
2659253f4911SChristoph Hellwig 		goto out_error;
26601da177e4SLinus Torvalds 
266161021debSDave Chinner 	error = xfs_read_agi(pag, tp, &agibp);
26625e1be0fbSChristoph Hellwig 	if (error)
2663e5720eecSDavid Chinner 		goto out_abort;
26641da177e4SLinus Torvalds 
2665370c782bSChristoph Hellwig 	agi = agibp->b_addr;
266616259e7dSChristoph Hellwig 	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
26671da177e4SLinus Torvalds 	offset = offsetof(xfs_agi_t, agi_unlinked) +
26681da177e4SLinus Torvalds 		 (sizeof(xfs_agino_t) * bucket);
26691da177e4SLinus Torvalds 	xfs_trans_log_buf(tp, agibp, offset,
26701da177e4SLinus Torvalds 			  (offset + sizeof(xfs_agino_t) - 1));
26711da177e4SLinus Torvalds 
267270393313SChristoph Hellwig 	error = xfs_trans_commit(tp);
2673e5720eecSDavid Chinner 	if (error)
2674e5720eecSDavid Chinner 		goto out_error;
2675e5720eecSDavid Chinner 	return;
2676e5720eecSDavid Chinner 
2677e5720eecSDavid Chinner out_abort:
26784906e215SChristoph Hellwig 	xfs_trans_cancel(tp);
2679e5720eecSDavid Chinner out_error:
268061021debSDave Chinner 	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__,
268161021debSDave Chinner 			pag->pag_agno);
2682e5720eecSDavid Chinner 	return;
26831da177e4SLinus Torvalds }
26841da177e4SLinus Torvalds 
268504755d2eSDave Chinner static int
xlog_recover_iunlink_bucket(struct xfs_perag * pag,struct xfs_agi * agi,int bucket)268604755d2eSDave Chinner xlog_recover_iunlink_bucket(
268761021debSDave Chinner 	struct xfs_perag	*pag,
268804755d2eSDave Chinner 	struct xfs_agi		*agi,
268923fac50fSChristoph Hellwig 	int			bucket)
269023fac50fSChristoph Hellwig {
269104755d2eSDave Chinner 	struct xfs_mount	*mp = pag->pag_mount;
26922fd26cc0SDave Chinner 	struct xfs_inode	*prev_ip = NULL;
269323fac50fSChristoph Hellwig 	struct xfs_inode	*ip;
26942fd26cc0SDave Chinner 	xfs_agino_t		prev_agino, agino;
26952fd26cc0SDave Chinner 	int			error = 0;
269623fac50fSChristoph Hellwig 
269704755d2eSDave Chinner 	agino = be32_to_cpu(agi->agi_unlinked[bucket]);
269804755d2eSDave Chinner 	while (agino != NULLAGINO) {
269904755d2eSDave Chinner 		error = xfs_iget(mp, NULL,
270004755d2eSDave Chinner 				XFS_AGINO_TO_INO(mp, pag->pag_agno, agino),
270104755d2eSDave Chinner 				0, 0, &ip);
270223fac50fSChristoph Hellwig 		if (error)
27032fd26cc0SDave Chinner 			break;
270423fac50fSChristoph Hellwig 
270554d7b5c1SDave Chinner 		ASSERT(VFS_I(ip)->i_nlink == 0);
2706c19b3b05SDave Chinner 		ASSERT(VFS_I(ip)->i_mode != 0);
270704755d2eSDave Chinner 		xfs_iflags_clear(ip, XFS_IRECOVERY);
27084fcc94d6SDave Chinner 		agino = ip->i_next_unlinked;
270923fac50fSChristoph Hellwig 
27102fd26cc0SDave Chinner 		if (prev_ip) {
27112fd26cc0SDave Chinner 			ip->i_prev_unlinked = prev_agino;
27122fd26cc0SDave Chinner 			xfs_irele(prev_ip);
271323fac50fSChristoph Hellwig 
271423fac50fSChristoph Hellwig 			/*
27152fd26cc0SDave Chinner 			 * Ensure the inode is removed from the unlinked list
27162fd26cc0SDave Chinner 			 * before we continue so that it won't race with
27172fd26cc0SDave Chinner 			 * building the in-memory list here. This could be
27182fd26cc0SDave Chinner 			 * serialised with the agibp lock, but that just
27192fd26cc0SDave Chinner 			 * serialises via lockstepping and it's much simpler
27202fd26cc0SDave Chinner 			 * just to flush the inodegc queue and wait for it to
27212fd26cc0SDave Chinner 			 * complete.
272223fac50fSChristoph Hellwig 			 */
2723d4d12c02SDave Chinner 			error = xfs_inodegc_flush(mp);
2724d4d12c02SDave Chinner 			if (error)
2725d4d12c02SDave Chinner 				break;
272604755d2eSDave Chinner 		}
27272fd26cc0SDave Chinner 
27282fd26cc0SDave Chinner 		prev_agino = agino;
27292fd26cc0SDave Chinner 		prev_ip = ip;
27302fd26cc0SDave Chinner 	}
27312fd26cc0SDave Chinner 
27322fd26cc0SDave Chinner 	if (prev_ip) {
2733d4d12c02SDave Chinner 		int	error2;
2734d4d12c02SDave Chinner 
27352fd26cc0SDave Chinner 		ip->i_prev_unlinked = prev_agino;
27362fd26cc0SDave Chinner 		xfs_irele(prev_ip);
2737d4d12c02SDave Chinner 
2738d4d12c02SDave Chinner 		error2 = xfs_inodegc_flush(mp);
2739d4d12c02SDave Chinner 		if (error2 && !error)
2740d4d12c02SDave Chinner 			return error2;
27412fd26cc0SDave Chinner 	}
27422fd26cc0SDave Chinner 	return error;
274323fac50fSChristoph Hellwig }
274423fac50fSChristoph Hellwig 
27451da177e4SLinus Torvalds /*
27468ab39f11SDave Chinner  * Recover AGI unlinked lists
27471da177e4SLinus Torvalds  *
27488ab39f11SDave Chinner  * This is called during recovery to process any inodes which we unlinked but
27498ab39f11SDave Chinner  * not freed when the system crashed.  These inodes will be on the lists in the
27508ab39f11SDave Chinner  * AGI blocks. What we do here is scan all the AGIs and fully truncate and free
27518ab39f11SDave Chinner  * any inodes found on the lists. Each inode is removed from the lists when it
27528ab39f11SDave Chinner  * has been fully truncated and is freed. The freeing of the inode and its
27538ab39f11SDave Chinner  * removal from the list must be atomic.
27548ab39f11SDave Chinner  *
27558ab39f11SDave Chinner  * If everything we touch in the agi processing loop is already in memory, this
27568ab39f11SDave Chinner  * loop can hold the cpu for a long time. It runs without lock contention,
27578ab39f11SDave Chinner  * memory allocation contention, the need wait for IO, etc, and so will run
27588ab39f11SDave Chinner  * until we either run out of inodes to process, run low on memory or we run out
27598ab39f11SDave Chinner  * of log space.
27608ab39f11SDave Chinner  *
27618ab39f11SDave Chinner  * This behaviour is bad for latency on single CPU and non-preemptible kernels,
2762bd24a4f5SBhaskar Chowdhury  * and can prevent other filesystem work (such as CIL pushes) from running. This
27638ab39f11SDave Chinner  * can lead to deadlocks if the recovery process runs out of log reservation
27648ab39f11SDave Chinner  * space. Hence we need to yield the CPU when there is other kernel work
27658ab39f11SDave Chinner  * scheduled on this CPU to ensure other scheduled work can run without undue
27668ab39f11SDave Chinner  * latency.
27671da177e4SLinus Torvalds  */
276804755d2eSDave Chinner static void
xlog_recover_iunlink_ag(struct xfs_perag * pag)276904755d2eSDave Chinner xlog_recover_iunlink_ag(
277004755d2eSDave Chinner 	struct xfs_perag	*pag)
27711da177e4SLinus Torvalds {
2772934933c3SDave Chinner 	struct xfs_agi		*agi;
2773e8222613SDave Chinner 	struct xfs_buf		*agibp;
27741da177e4SLinus Torvalds 	int			bucket;
27751da177e4SLinus Torvalds 	int			error;
27761da177e4SLinus Torvalds 
277761021debSDave Chinner 	error = xfs_read_agi(pag, NULL, &agibp);
27785e1be0fbSChristoph Hellwig 	if (error) {
27795e1be0fbSChristoph Hellwig 		/*
27805e1be0fbSChristoph Hellwig 		 * AGI is b0rked. Don't process it.
27815e1be0fbSChristoph Hellwig 		 *
278204755d2eSDave Chinner 		 * We should probably mark the filesystem as corrupt after we've
278304755d2eSDave Chinner 		 * recovered all the ag's we can....
27845e1be0fbSChristoph Hellwig 		 */
278504755d2eSDave Chinner 		return;
27861da177e4SLinus Torvalds 	}
278704755d2eSDave Chinner 
2788d97d32edSJan Kara 	/*
278904755d2eSDave Chinner 	 * Unlock the buffer so that it can be acquired in the normal course of
279004755d2eSDave Chinner 	 * the transaction to truncate and free each inode.  Because we are not
279104755d2eSDave Chinner 	 * racing with anyone else here for the AGI buffer, we don't even need
279204755d2eSDave Chinner 	 * to hold it locked to read the initial unlinked bucket entries out of
279304755d2eSDave Chinner 	 * the buffer. We keep buffer reference though, so that it stays pinned
279404755d2eSDave Chinner 	 * in memory while we need the buffer.
2795d97d32edSJan Kara 	 */
2796370c782bSChristoph Hellwig 	agi = agibp->b_addr;
2797d97d32edSJan Kara 	xfs_buf_unlock(agibp);
27981da177e4SLinus Torvalds 
27991da177e4SLinus Torvalds 	for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
280004755d2eSDave Chinner 		error = xlog_recover_iunlink_bucket(pag, agi, bucket);
280104755d2eSDave Chinner 		if (error) {
280204755d2eSDave Chinner 			/*
280304755d2eSDave Chinner 			 * Bucket is unrecoverable, so only a repair scan can
280404755d2eSDave Chinner 			 * free the remaining unlinked inodes. Just empty the
280504755d2eSDave Chinner 			 * bucket and remaining inodes on it unreferenced and
280604755d2eSDave Chinner 			 * unfreeable.
280704755d2eSDave Chinner 			 */
280804755d2eSDave Chinner 			xlog_recover_clear_agi_bucket(pag, bucket);
28091da177e4SLinus Torvalds 		}
28101da177e4SLinus Torvalds 	}
281104755d2eSDave Chinner 
2812d97d32edSJan Kara 	xfs_buf_rele(agibp);
28131da177e4SLinus Torvalds }
2814ab23a776SDave Chinner 
281504755d2eSDave Chinner static void
xlog_recover_process_iunlinks(struct xlog * log)281604755d2eSDave Chinner xlog_recover_process_iunlinks(
281704755d2eSDave Chinner 	struct xlog	*log)
281804755d2eSDave Chinner {
281904755d2eSDave Chinner 	struct xfs_perag	*pag;
282004755d2eSDave Chinner 	xfs_agnumber_t		agno;
282104755d2eSDave Chinner 
282204755d2eSDave Chinner 	for_each_perag(log->l_mp, agno, pag)
282304755d2eSDave Chinner 		xlog_recover_iunlink_ag(pag);
28241da177e4SLinus Torvalds }
28251da177e4SLinus Torvalds 
282691083269SEric Sandeen STATIC void
xlog_unpack_data(struct xlog_rec_header * rhead,char * dp,struct xlog * log)28271da177e4SLinus Torvalds xlog_unpack_data(
28289a8d2fdbSMark Tinguely 	struct xlog_rec_header	*rhead,
2829b2a922cdSChristoph Hellwig 	char			*dp,
28309a8d2fdbSMark Tinguely 	struct xlog		*log)
28311da177e4SLinus Torvalds {
28321da177e4SLinus Torvalds 	int			i, j, k;
28331da177e4SLinus Torvalds 
2834b53e675dSChristoph Hellwig 	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
28351da177e4SLinus Torvalds 		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
2836b53e675dSChristoph Hellwig 		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
28371da177e4SLinus Torvalds 		dp += BBSIZE;
28381da177e4SLinus Torvalds 	}
28391da177e4SLinus Torvalds 
284038c26bfdSDave Chinner 	if (xfs_has_logv2(log->l_mp)) {
2841b28708d6SChristoph Hellwig 		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
2842b53e675dSChristoph Hellwig 		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
28431da177e4SLinus Torvalds 			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
28441da177e4SLinus Torvalds 			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
2845b53e675dSChristoph Hellwig 			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
28461da177e4SLinus Torvalds 			dp += BBSIZE;
28471da177e4SLinus Torvalds 		}
28481da177e4SLinus Torvalds 	}
28491da177e4SLinus Torvalds }
28501da177e4SLinus Torvalds 
28519d94901fSBrian Foster /*
2852b94fb2d1SBrian Foster  * CRC check, unpack and process a log record.
28539d94901fSBrian Foster  */
28549d94901fSBrian Foster STATIC int
xlog_recover_process(struct xlog * log,struct hlist_head rhash[],struct xlog_rec_header * rhead,char * dp,int pass,struct list_head * buffer_list)28559d94901fSBrian Foster xlog_recover_process(
28569d94901fSBrian Foster 	struct xlog		*log,
28579d94901fSBrian Foster 	struct hlist_head	rhash[],
28589d94901fSBrian Foster 	struct xlog_rec_header	*rhead,
28599d94901fSBrian Foster 	char			*dp,
286012818d24SBrian Foster 	int			pass,
286112818d24SBrian Foster 	struct list_head	*buffer_list)
28629d94901fSBrian Foster {
2863cae028dfSDave Chinner 	__le32			old_crc = rhead->h_crc;
2864b94fb2d1SBrian Foster 	__le32			crc;
2865b94fb2d1SBrian Foster 
2866b94fb2d1SBrian Foster 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
28676528250bSBrian Foster 
28686528250bSBrian Foster 	/*
28696528250bSBrian Foster 	 * Nothing else to do if this is a CRC verification pass. Just return
28706528250bSBrian Foster 	 * if this a record with a non-zero crc. Unfortunately, mkfs always
2871cae028dfSDave Chinner 	 * sets old_crc to 0 so we must consider this valid even on v5 supers.
28726528250bSBrian Foster 	 * Otherwise, return EFSBADCRC on failure so the callers up the stack
28736528250bSBrian Foster 	 * know precisely what failed.
28746528250bSBrian Foster 	 */
28756528250bSBrian Foster 	if (pass == XLOG_RECOVER_CRCPASS) {
2876cae028dfSDave Chinner 		if (old_crc && crc != old_crc)
28776528250bSBrian Foster 			return -EFSBADCRC;
28786528250bSBrian Foster 		return 0;
28796528250bSBrian Foster 	}
28806528250bSBrian Foster 
28816528250bSBrian Foster 	/*
28826528250bSBrian Foster 	 * We're in the normal recovery path. Issue a warning if and only if the
28836528250bSBrian Foster 	 * CRC in the header is non-zero. This is an advisory warning and the
28846528250bSBrian Foster 	 * zero CRC check prevents warnings from being emitted when upgrading
28856528250bSBrian Foster 	 * the kernel from one that does not add CRCs by default.
28866528250bSBrian Foster 	 */
2887cae028dfSDave Chinner 	if (crc != old_crc) {
288838c26bfdSDave Chinner 		if (old_crc || xfs_has_crc(log->l_mp)) {
2889b94fb2d1SBrian Foster 			xfs_alert(log->l_mp,
2890b94fb2d1SBrian Foster 		"log record CRC mismatch: found 0x%x, expected 0x%x.",
2891cae028dfSDave Chinner 					le32_to_cpu(old_crc),
2892b94fb2d1SBrian Foster 					le32_to_cpu(crc));
2893b94fb2d1SBrian Foster 			xfs_hex_dump(dp, 32);
2894b94fb2d1SBrian Foster 		}
2895b94fb2d1SBrian Foster 
2896b94fb2d1SBrian Foster 		/*
2897b94fb2d1SBrian Foster 		 * If the filesystem is CRC enabled, this mismatch becomes a
2898b94fb2d1SBrian Foster 		 * fatal log corruption failure.
2899b94fb2d1SBrian Foster 		 */
290038c26bfdSDave Chinner 		if (xfs_has_crc(log->l_mp)) {
2901a5155b87SDarrick J. Wong 			XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_LOW, log->l_mp);
2902b94fb2d1SBrian Foster 			return -EFSCORRUPTED;
2903b94fb2d1SBrian Foster 		}
2904a5155b87SDarrick J. Wong 	}
29059d94901fSBrian Foster 
290691083269SEric Sandeen 	xlog_unpack_data(rhead, dp, log);
29079d94901fSBrian Foster 
290812818d24SBrian Foster 	return xlog_recover_process_data(log, rhash, rhead, dp, pass,
290912818d24SBrian Foster 					 buffer_list);
29109d94901fSBrian Foster }
29119d94901fSBrian Foster 
29121da177e4SLinus Torvalds STATIC int
xlog_valid_rec_header(struct xlog * log,struct xlog_rec_header * rhead,xfs_daddr_t blkno,int bufsize)29131da177e4SLinus Torvalds xlog_valid_rec_header(
29149a8d2fdbSMark Tinguely 	struct xlog		*log,
29159a8d2fdbSMark Tinguely 	struct xlog_rec_header	*rhead,
2916f692d09eSGao Xiang 	xfs_daddr_t		blkno,
2917f692d09eSGao Xiang 	int			bufsize)
29181da177e4SLinus Torvalds {
29191da177e4SLinus Torvalds 	int			hlen;
29201da177e4SLinus Torvalds 
2921a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(log->l_mp,
2922a71895c5SDarrick J. Wong 			   rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)))
29232451337dSDave Chinner 		return -EFSCORRUPTED;
2924a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(log->l_mp,
29251da177e4SLinus Torvalds 			   (!rhead->h_version ||
2926a71895c5SDarrick J. Wong 			   (be32_to_cpu(rhead->h_version) &
2927a71895c5SDarrick J. Wong 			    (~XLOG_VERSION_OKBITS))))) {
2928a0fa2b67SDave Chinner 		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
292934a622b2SHarvey Harrison 			__func__, be32_to_cpu(rhead->h_version));
2930895e196fSDarrick J. Wong 		return -EFSCORRUPTED;
29311da177e4SLinus Torvalds 	}
29321da177e4SLinus Torvalds 
2933f692d09eSGao Xiang 	/*
2934f692d09eSGao Xiang 	 * LR body must have data (or it wouldn't have been written)
2935f692d09eSGao Xiang 	 * and h_len must not be greater than LR buffer size.
2936f692d09eSGao Xiang 	 */
2937b53e675dSChristoph Hellwig 	hlen = be32_to_cpu(rhead->h_len);
2938f692d09eSGao Xiang 	if (XFS_IS_CORRUPT(log->l_mp, hlen <= 0 || hlen > bufsize))
29392451337dSDave Chinner 		return -EFSCORRUPTED;
2940f692d09eSGao Xiang 
2941a71895c5SDarrick J. Wong 	if (XFS_IS_CORRUPT(log->l_mp,
2942a71895c5SDarrick J. Wong 			   blkno > log->l_logBBsize || blkno > INT_MAX))
29432451337dSDave Chinner 		return -EFSCORRUPTED;
29441da177e4SLinus Torvalds 	return 0;
29451da177e4SLinus Torvalds }
29461da177e4SLinus Torvalds 
29471da177e4SLinus Torvalds /*
29481da177e4SLinus Torvalds  * Read the log from tail to head and process the log records found.
29491da177e4SLinus Torvalds  * Handle the two cases where the tail and head are in the same cycle
29501da177e4SLinus Torvalds  * and where the active portion of the log wraps around the end of
29511da177e4SLinus Torvalds  * the physical log separately.  The pass parameter is passed through
29521da177e4SLinus Torvalds  * to the routines called to process the data and is not looked at
29531da177e4SLinus Torvalds  * here.
29541da177e4SLinus Torvalds  */
29551da177e4SLinus Torvalds STATIC int
xlog_do_recovery_pass(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk,int pass,xfs_daddr_t * first_bad)29561da177e4SLinus Torvalds xlog_do_recovery_pass(
29579a8d2fdbSMark Tinguely 	struct xlog		*log,
29581da177e4SLinus Torvalds 	xfs_daddr_t		head_blk,
29591da177e4SLinus Torvalds 	xfs_daddr_t		tail_blk,
2960d7f37692SBrian Foster 	int			pass,
2961d7f37692SBrian Foster 	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
29621da177e4SLinus Torvalds {
29631da177e4SLinus Torvalds 	xlog_rec_header_t	*rhead;
2964284f1c2cSBrian Foster 	xfs_daddr_t		blk_no, rblk_no;
2965d7f37692SBrian Foster 	xfs_daddr_t		rhead_blk;
2966b2a922cdSChristoph Hellwig 	char			*offset;
29676ad5b325SChristoph Hellwig 	char			*hbp, *dbp;
2968a70f9fe5SBrian Foster 	int			error = 0, h_size, h_len;
296912818d24SBrian Foster 	int			error2 = 0;
29701da177e4SLinus Torvalds 	int			bblks, split_bblks;
2971c2389c07SChristoph Hellwig 	int			hblks = 1, split_hblks, wrapped_hblks;
297239775431SBrian Foster 	int			i;
2973f0a76953SDave Chinner 	struct hlist_head	rhash[XLOG_RHASH_SIZE];
297412818d24SBrian Foster 	LIST_HEAD		(buffer_list);
29751da177e4SLinus Torvalds 
29761da177e4SLinus Torvalds 	ASSERT(head_blk != tail_blk);
2977a4c9b34dSBrian Foster 	blk_no = rhead_blk = tail_blk;
29781da177e4SLinus Torvalds 
297939775431SBrian Foster 	for (i = 0; i < XLOG_RHASH_SIZE; i++)
298039775431SBrian Foster 		INIT_HLIST_HEAD(&rhash[i]);
298139775431SBrian Foster 
29821da177e4SLinus Torvalds 	/*
29831da177e4SLinus Torvalds 	 * Read the header of the tail block and get the iclog buffer size from
29841da177e4SLinus Torvalds 	 * h_size.  Use this to tell how many sectors make up the log header.
29851da177e4SLinus Torvalds 	 */
298638c26bfdSDave Chinner 	if (xfs_has_logv2(log->l_mp)) {
29871da177e4SLinus Torvalds 		/*
29881da177e4SLinus Torvalds 		 * When using variable length iclogs, read first sector of
29891da177e4SLinus Torvalds 		 * iclog header and extract the header size from it.  Get a
29901da177e4SLinus Torvalds 		 * new hbp that is the correct size.
29911da177e4SLinus Torvalds 		 */
29926e9b3dd8SChristoph Hellwig 		hbp = xlog_alloc_buffer(log, 1);
29931da177e4SLinus Torvalds 		if (!hbp)
29942451337dSDave Chinner 			return -ENOMEM;
2995076e6acbSChristoph Hellwig 
2996076e6acbSChristoph Hellwig 		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
2997076e6acbSChristoph Hellwig 		if (error)
29981da177e4SLinus Torvalds 			goto bread_err1;
2999076e6acbSChristoph Hellwig 
30001da177e4SLinus Torvalds 		rhead = (xlog_rec_header_t *)offset;
3001a70f9fe5SBrian Foster 
3002a70f9fe5SBrian Foster 		/*
3003a70f9fe5SBrian Foster 		 * xfsprogs has a bug where record length is based on lsunit but
3004a70f9fe5SBrian Foster 		 * h_size (iclog size) is hardcoded to 32k. Now that we
3005a70f9fe5SBrian Foster 		 * unconditionally CRC verify the unmount record, this means the
3006a70f9fe5SBrian Foster 		 * log buffer can be too small for the record and cause an
3007a70f9fe5SBrian Foster 		 * overrun.
3008a70f9fe5SBrian Foster 		 *
3009a70f9fe5SBrian Foster 		 * Detect this condition here. Use lsunit for the buffer size as
3010a70f9fe5SBrian Foster 		 * long as this looks like the mkfs case. Otherwise, return an
3011a70f9fe5SBrian Foster 		 * error to avoid a buffer overrun.
3012a70f9fe5SBrian Foster 		 */
3013b53e675dSChristoph Hellwig 		h_size = be32_to_cpu(rhead->h_size);
3014a70f9fe5SBrian Foster 		h_len = be32_to_cpu(rhead->h_len);
3015f692d09eSGao Xiang 		if (h_len > h_size && h_len <= log->l_mp->m_logbsize &&
3016f692d09eSGao Xiang 		    rhead->h_num_logops == cpu_to_be32(1)) {
3017a70f9fe5SBrian Foster 			xfs_warn(log->l_mp,
3018a70f9fe5SBrian Foster 		"invalid iclog size (%d bytes), using lsunit (%d bytes)",
3019a70f9fe5SBrian Foster 				 h_size, log->l_mp->m_logbsize);
3020a70f9fe5SBrian Foster 			h_size = log->l_mp->m_logbsize;
3021f692d09eSGao Xiang 		}
3022f692d09eSGao Xiang 
3023f692d09eSGao Xiang 		error = xlog_valid_rec_header(log, rhead, tail_blk, h_size);
3024f692d09eSGao Xiang 		if (error)
3025050552cbSDarrick J. Wong 			goto bread_err1;
3026a70f9fe5SBrian Foster 
3027c2389c07SChristoph Hellwig 		/*
3028c2389c07SChristoph Hellwig 		 * This open codes xlog_logrec_hblks so that we can reuse the
3029c2389c07SChristoph Hellwig 		 * fixed up h_size value calculated above.  Without that we'd
3030c2389c07SChristoph Hellwig 		 * still allocate the buffer based on the incorrect on-disk
3031c2389c07SChristoph Hellwig 		 * size.
3032c2389c07SChristoph Hellwig 		 */
3033c2389c07SChristoph Hellwig 		if (h_size > XLOG_HEADER_CYCLE_SIZE &&
3034c2389c07SChristoph Hellwig 		    (rhead->h_version & cpu_to_be32(XLOG_VERSION_2))) {
3035c2389c07SChristoph Hellwig 			hblks = DIV_ROUND_UP(h_size, XLOG_HEADER_CYCLE_SIZE);
3036c2389c07SChristoph Hellwig 			if (hblks > 1) {
30376ad5b325SChristoph Hellwig 				kmem_free(hbp);
30386e9b3dd8SChristoph Hellwig 				hbp = xlog_alloc_buffer(log, hblks);
30391da177e4SLinus Torvalds 			}
3040c2389c07SChristoph Hellwig 		}
30411da177e4SLinus Torvalds 	} else {
304269ce58f0SAlex Elder 		ASSERT(log->l_sectBBsize == 1);
30436e9b3dd8SChristoph Hellwig 		hbp = xlog_alloc_buffer(log, 1);
30441da177e4SLinus Torvalds 		h_size = XLOG_BIG_RECORD_BSIZE;
30451da177e4SLinus Torvalds 	}
30461da177e4SLinus Torvalds 
30471da177e4SLinus Torvalds 	if (!hbp)
30482451337dSDave Chinner 		return -ENOMEM;
30496e9b3dd8SChristoph Hellwig 	dbp = xlog_alloc_buffer(log, BTOBB(h_size));
30501da177e4SLinus Torvalds 	if (!dbp) {
30516ad5b325SChristoph Hellwig 		kmem_free(hbp);
30522451337dSDave Chinner 		return -ENOMEM;
30531da177e4SLinus Torvalds 	}
30541da177e4SLinus Torvalds 
30551da177e4SLinus Torvalds 	memset(rhash, 0, sizeof(rhash));
3056970fd3f0SEric Sandeen 	if (tail_blk > head_blk) {
30571da177e4SLinus Torvalds 		/*
30581da177e4SLinus Torvalds 		 * Perform recovery around the end of the physical log.
30591da177e4SLinus Torvalds 		 * When the head is not on the same cycle number as the tail,
3060970fd3f0SEric Sandeen 		 * we can't do a sequential recovery.
30611da177e4SLinus Torvalds 		 */
30621da177e4SLinus Torvalds 		while (blk_no < log->l_logBBsize) {
30631da177e4SLinus Torvalds 			/*
30641da177e4SLinus Torvalds 			 * Check for header wrapping around physical end-of-log
30651da177e4SLinus Torvalds 			 */
30666ad5b325SChristoph Hellwig 			offset = hbp;
30671da177e4SLinus Torvalds 			split_hblks = 0;
30681da177e4SLinus Torvalds 			wrapped_hblks = 0;
30691da177e4SLinus Torvalds 			if (blk_no + hblks <= log->l_logBBsize) {
30701da177e4SLinus Torvalds 				/* Read header in one read */
3071076e6acbSChristoph Hellwig 				error = xlog_bread(log, blk_no, hblks, hbp,
3072076e6acbSChristoph Hellwig 						   &offset);
30731da177e4SLinus Torvalds 				if (error)
30741da177e4SLinus Torvalds 					goto bread_err2;
30751da177e4SLinus Torvalds 			} else {
30761da177e4SLinus Torvalds 				/* This LR is split across physical log end */
30771da177e4SLinus Torvalds 				if (blk_no != log->l_logBBsize) {
30781da177e4SLinus Torvalds 					/* some data before physical log end */
30791da177e4SLinus Torvalds 					ASSERT(blk_no <= INT_MAX);
30801da177e4SLinus Torvalds 					split_hblks = log->l_logBBsize - (int)blk_no;
30811da177e4SLinus Torvalds 					ASSERT(split_hblks > 0);
3082076e6acbSChristoph Hellwig 					error = xlog_bread(log, blk_no,
3083076e6acbSChristoph Hellwig 							   split_hblks, hbp,
3084076e6acbSChristoph Hellwig 							   &offset);
3085076e6acbSChristoph Hellwig 					if (error)
30861da177e4SLinus Torvalds 						goto bread_err2;
30871da177e4SLinus Torvalds 				}
3088076e6acbSChristoph Hellwig 
30891da177e4SLinus Torvalds 				/*
30901da177e4SLinus Torvalds 				 * Note: this black magic still works with
30911da177e4SLinus Torvalds 				 * large sector sizes (non-512) only because:
30921da177e4SLinus Torvalds 				 * - we increased the buffer size originally
30931da177e4SLinus Torvalds 				 *   by 1 sector giving us enough extra space
30941da177e4SLinus Torvalds 				 *   for the second read;
30951da177e4SLinus Torvalds 				 * - the log start is guaranteed to be sector
30961da177e4SLinus Torvalds 				 *   aligned;
30971da177e4SLinus Torvalds 				 * - we read the log end (LR header start)
30981da177e4SLinus Torvalds 				 *   _first_, then the log start (LR header end)
30991da177e4SLinus Torvalds 				 *   - order is important.
31001da177e4SLinus Torvalds 				 */
3101234f56acSDavid Chinner 				wrapped_hblks = hblks - split_hblks;
31026ad5b325SChristoph Hellwig 				error = xlog_bread_noalign(log, 0,
31036ad5b325SChristoph Hellwig 						wrapped_hblks,
310444396476SDave Chinner 						offset + BBTOB(split_hblks));
31051da177e4SLinus Torvalds 				if (error)
31061da177e4SLinus Torvalds 					goto bread_err2;
31071da177e4SLinus Torvalds 			}
31081da177e4SLinus Torvalds 			rhead = (xlog_rec_header_t *)offset;
31091da177e4SLinus Torvalds 			error = xlog_valid_rec_header(log, rhead,
3110f692d09eSGao Xiang 					split_hblks ? blk_no : 0, h_size);
31111da177e4SLinus Torvalds 			if (error)
31121da177e4SLinus Torvalds 				goto bread_err2;
31131da177e4SLinus Torvalds 
3114b53e675dSChristoph Hellwig 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
31151da177e4SLinus Torvalds 			blk_no += hblks;
31161da177e4SLinus Torvalds 
3117284f1c2cSBrian Foster 			/*
3118284f1c2cSBrian Foster 			 * Read the log record data in multiple reads if it
3119284f1c2cSBrian Foster 			 * wraps around the end of the log. Note that if the
3120284f1c2cSBrian Foster 			 * header already wrapped, blk_no could point past the
3121284f1c2cSBrian Foster 			 * end of the log. The record data is contiguous in
3122284f1c2cSBrian Foster 			 * that case.
3123284f1c2cSBrian Foster 			 */
3124284f1c2cSBrian Foster 			if (blk_no + bblks <= log->l_logBBsize ||
3125284f1c2cSBrian Foster 			    blk_no >= log->l_logBBsize) {
31260703a8e1SDave Chinner 				rblk_no = xlog_wrap_logbno(log, blk_no);
3127284f1c2cSBrian Foster 				error = xlog_bread(log, rblk_no, bblks, dbp,
3128076e6acbSChristoph Hellwig 						   &offset);
31291da177e4SLinus Torvalds 				if (error)
31301da177e4SLinus Torvalds 					goto bread_err2;
31311da177e4SLinus Torvalds 			} else {
31321da177e4SLinus Torvalds 				/* This log record is split across the
31331da177e4SLinus Torvalds 				 * physical end of log */
31346ad5b325SChristoph Hellwig 				offset = dbp;
31351da177e4SLinus Torvalds 				split_bblks = 0;
31361da177e4SLinus Torvalds 				if (blk_no != log->l_logBBsize) {
31371da177e4SLinus Torvalds 					/* some data is before the physical
31381da177e4SLinus Torvalds 					 * end of log */
31391da177e4SLinus Torvalds 					ASSERT(!wrapped_hblks);
31401da177e4SLinus Torvalds 					ASSERT(blk_no <= INT_MAX);
31411da177e4SLinus Torvalds 					split_bblks =
31421da177e4SLinus Torvalds 						log->l_logBBsize - (int)blk_no;
31431da177e4SLinus Torvalds 					ASSERT(split_bblks > 0);
3144076e6acbSChristoph Hellwig 					error = xlog_bread(log, blk_no,
3145076e6acbSChristoph Hellwig 							split_bblks, dbp,
3146076e6acbSChristoph Hellwig 							&offset);
3147076e6acbSChristoph Hellwig 					if (error)
31481da177e4SLinus Torvalds 						goto bread_err2;
31491da177e4SLinus Torvalds 				}
3150076e6acbSChristoph Hellwig 
31511da177e4SLinus Torvalds 				/*
31521da177e4SLinus Torvalds 				 * Note: this black magic still works with
31531da177e4SLinus Torvalds 				 * large sector sizes (non-512) only because:
31541da177e4SLinus Torvalds 				 * - we increased the buffer size originally
31551da177e4SLinus Torvalds 				 *   by 1 sector giving us enough extra space
31561da177e4SLinus Torvalds 				 *   for the second read;
31571da177e4SLinus Torvalds 				 * - the log start is guaranteed to be sector
31581da177e4SLinus Torvalds 				 *   aligned;
31591da177e4SLinus Torvalds 				 * - we read the log end (LR header start)
31601da177e4SLinus Torvalds 				 *   _first_, then the log start (LR header end)
31611da177e4SLinus Torvalds 				 *   - order is important.
31621da177e4SLinus Torvalds 				 */
31636ad5b325SChristoph Hellwig 				error = xlog_bread_noalign(log, 0,
31646ad5b325SChristoph Hellwig 						bblks - split_bblks,
316544396476SDave Chinner 						offset + BBTOB(split_bblks));
3166076e6acbSChristoph Hellwig 				if (error)
3167076e6acbSChristoph Hellwig 					goto bread_err2;
31681da177e4SLinus Torvalds 			}
31690e446be4SChristoph Hellwig 
31709d94901fSBrian Foster 			error = xlog_recover_process(log, rhash, rhead, offset,
317112818d24SBrian Foster 						     pass, &buffer_list);
31720e446be4SChristoph Hellwig 			if (error)
31731da177e4SLinus Torvalds 				goto bread_err2;
3174d7f37692SBrian Foster 
31751da177e4SLinus Torvalds 			blk_no += bblks;
3176d7f37692SBrian Foster 			rhead_blk = blk_no;
31771da177e4SLinus Torvalds 		}
31781da177e4SLinus Torvalds 
31791da177e4SLinus Torvalds 		ASSERT(blk_no >= log->l_logBBsize);
31801da177e4SLinus Torvalds 		blk_no -= log->l_logBBsize;
3181d7f37692SBrian Foster 		rhead_blk = blk_no;
3182970fd3f0SEric Sandeen 	}
31831da177e4SLinus Torvalds 
31841da177e4SLinus Torvalds 	/* read first part of physical log */
31851da177e4SLinus Torvalds 	while (blk_no < head_blk) {
3186076e6acbSChristoph Hellwig 		error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3187076e6acbSChristoph Hellwig 		if (error)
31881da177e4SLinus Torvalds 			goto bread_err2;
3189076e6acbSChristoph Hellwig 
31901da177e4SLinus Torvalds 		rhead = (xlog_rec_header_t *)offset;
3191f692d09eSGao Xiang 		error = xlog_valid_rec_header(log, rhead, blk_no, h_size);
31921da177e4SLinus Torvalds 		if (error)
31931da177e4SLinus Torvalds 			goto bread_err2;
3194076e6acbSChristoph Hellwig 
3195970fd3f0SEric Sandeen 		/* blocks in data section */
3196b53e675dSChristoph Hellwig 		bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3197076e6acbSChristoph Hellwig 		error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3198076e6acbSChristoph Hellwig 				   &offset);
3199076e6acbSChristoph Hellwig 		if (error)
32001da177e4SLinus Torvalds 			goto bread_err2;
3201076e6acbSChristoph Hellwig 
320212818d24SBrian Foster 		error = xlog_recover_process(log, rhash, rhead, offset, pass,
320312818d24SBrian Foster 					     &buffer_list);
32040e446be4SChristoph Hellwig 		if (error)
32051da177e4SLinus Torvalds 			goto bread_err2;
3206d7f37692SBrian Foster 
32071da177e4SLinus Torvalds 		blk_no += bblks + hblks;
3208d7f37692SBrian Foster 		rhead_blk = blk_no;
32091da177e4SLinus Torvalds 	}
32101da177e4SLinus Torvalds 
32111da177e4SLinus Torvalds  bread_err2:
32126ad5b325SChristoph Hellwig 	kmem_free(dbp);
32131da177e4SLinus Torvalds  bread_err1:
32146ad5b325SChristoph Hellwig 	kmem_free(hbp);
3215d7f37692SBrian Foster 
321612818d24SBrian Foster 	/*
3217ae609281SLong Li 	 * Submit buffers that have been dirtied by the last record recovered.
321812818d24SBrian Foster 	 */
3219ae609281SLong Li 	if (!list_empty(&buffer_list)) {
3220ae609281SLong Li 		if (error) {
3221ae609281SLong Li 			/*
3222ae609281SLong Li 			 * If there has been an item recovery error then we
3223ae609281SLong Li 			 * cannot allow partial checkpoint writeback to
3224ae609281SLong Li 			 * occur.  We might have multiple checkpoints with the
3225ae609281SLong Li 			 * same start LSN in this buffer list, and partial
3226ae609281SLong Li 			 * writeback of a checkpoint in this situation can
3227ae609281SLong Li 			 * prevent future recovery of all the changes in the
3228ae609281SLong Li 			 * checkpoints at this start LSN.
3229ae609281SLong Li 			 *
3230ae609281SLong Li 			 * Note: Shutting down the filesystem will result in the
3231ae609281SLong Li 			 * delwri submission marking all the buffers stale,
3232ae609281SLong Li 			 * completing them and cleaning up _XBF_LOGRECOVERY
3233ae609281SLong Li 			 * state without doing any IO.
3234ae609281SLong Li 			 */
3235ae609281SLong Li 			xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
3236ae609281SLong Li 		}
323712818d24SBrian Foster 		error2 = xfs_buf_delwri_submit(&buffer_list);
3238ae609281SLong Li 	}
323912818d24SBrian Foster 
3240d7f37692SBrian Foster 	if (error && first_bad)
3241d7f37692SBrian Foster 		*first_bad = rhead_blk;
3242d7f37692SBrian Foster 
324339775431SBrian Foster 	/*
324439775431SBrian Foster 	 * Transactions are freed at commit time but transactions without commit
324539775431SBrian Foster 	 * records on disk are never committed. Free any that may be left in the
324639775431SBrian Foster 	 * hash table.
324739775431SBrian Foster 	 */
324839775431SBrian Foster 	for (i = 0; i < XLOG_RHASH_SIZE; i++) {
324939775431SBrian Foster 		struct hlist_node	*tmp;
325039775431SBrian Foster 		struct xlog_recover	*trans;
325139775431SBrian Foster 
325239775431SBrian Foster 		hlist_for_each_entry_safe(trans, tmp, &rhash[i], r_list)
325339775431SBrian Foster 			xlog_recover_free_trans(trans);
325439775431SBrian Foster 	}
325539775431SBrian Foster 
325612818d24SBrian Foster 	return error ? error : error2;
32571da177e4SLinus Torvalds }
32581da177e4SLinus Torvalds 
32591da177e4SLinus Torvalds /*
32601da177e4SLinus Torvalds  * Do the recovery of the log.  We actually do this in two phases.
32611da177e4SLinus Torvalds  * The two passes are necessary in order to implement the function
32621da177e4SLinus Torvalds  * of cancelling a record written into the log.  The first pass
32631da177e4SLinus Torvalds  * determines those things which have been cancelled, and the
32641da177e4SLinus Torvalds  * second pass replays log items normally except for those which
32651da177e4SLinus Torvalds  * have been cancelled.  The handling of the replay and cancellations
32661da177e4SLinus Torvalds  * takes place in the log item type specific routines.
32671da177e4SLinus Torvalds  *
32681da177e4SLinus Torvalds  * The table of items which have cancel records in the log is allocated
32691da177e4SLinus Torvalds  * and freed at this level, since only here do we know when all of
32701da177e4SLinus Torvalds  * the log recovery has been completed.
32711da177e4SLinus Torvalds  */
32721da177e4SLinus Torvalds STATIC int
xlog_do_log_recovery(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk)32731da177e4SLinus Torvalds xlog_do_log_recovery(
32749a8d2fdbSMark Tinguely 	struct xlog	*log,
32751da177e4SLinus Torvalds 	xfs_daddr_t	head_blk,
32761da177e4SLinus Torvalds 	xfs_daddr_t	tail_blk)
32771da177e4SLinus Torvalds {
327827232349SDarrick J. Wong 	int		error;
32791da177e4SLinus Torvalds 
32801da177e4SLinus Torvalds 	ASSERT(head_blk != tail_blk);
32811da177e4SLinus Torvalds 
32821da177e4SLinus Torvalds 	/*
32831da177e4SLinus Torvalds 	 * First do a pass to find all of the cancelled buf log items.
32841da177e4SLinus Torvalds 	 * Store them in the buf_cancel_table for use in the second pass.
32851da177e4SLinus Torvalds 	 */
3286910bbdf2SDarrick J. Wong 	error = xlog_alloc_buf_cancel_table(log);
3287910bbdf2SDarrick J. Wong 	if (error)
3288910bbdf2SDarrick J. Wong 		return error;
3289d5689eaaSChristoph Hellwig 
32901da177e4SLinus Torvalds 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3291d7f37692SBrian Foster 				      XLOG_RECOVER_PASS1, NULL);
329227232349SDarrick J. Wong 	if (error != 0)
329327232349SDarrick J. Wong 		goto out_cancel;
329427232349SDarrick J. Wong 
32951da177e4SLinus Torvalds 	/*
32961da177e4SLinus Torvalds 	 * Then do a second pass to actually recover the items in the log.
32971da177e4SLinus Torvalds 	 * When it is complete free the table of buf cancel items.
32981da177e4SLinus Torvalds 	 */
32991da177e4SLinus Torvalds 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3300d7f37692SBrian Foster 				      XLOG_RECOVER_PASS2, NULL);
330127232349SDarrick J. Wong 	if (!error)
330227232349SDarrick J. Wong 		xlog_check_buf_cancel_table(log);
330327232349SDarrick J. Wong out_cancel:
330427232349SDarrick J. Wong 	xlog_free_buf_cancel_table(log);
33051da177e4SLinus Torvalds 	return error;
33061da177e4SLinus Torvalds }
33071da177e4SLinus Torvalds 
33081da177e4SLinus Torvalds /*
33091da177e4SLinus Torvalds  * Do the actual recovery
33101da177e4SLinus Torvalds  */
33111da177e4SLinus Torvalds STATIC int
xlog_do_recover(struct xlog * log,xfs_daddr_t head_blk,xfs_daddr_t tail_blk)33121da177e4SLinus Torvalds xlog_do_recover(
33139a8d2fdbSMark Tinguely 	struct xlog		*log,
33141da177e4SLinus Torvalds 	xfs_daddr_t		head_blk,
33151da177e4SLinus Torvalds 	xfs_daddr_t		tail_blk)
33161da177e4SLinus Torvalds {
3317a798011cSDave Chinner 	struct xfs_mount	*mp = log->l_mp;
3318b3f8e08cSChristoph Hellwig 	struct xfs_buf		*bp = mp->m_sb_bp;
3319b3f8e08cSChristoph Hellwig 	struct xfs_sb		*sbp = &mp->m_sb;
33201da177e4SLinus Torvalds 	int			error;
33211da177e4SLinus Torvalds 
3322e67d3d42SBrian Foster 	trace_xfs_log_recover(log, head_blk, tail_blk);
3323e67d3d42SBrian Foster 
33241da177e4SLinus Torvalds 	/*
33251da177e4SLinus Torvalds 	 * First replay the images in the log.
33261da177e4SLinus Torvalds 	 */
33271da177e4SLinus Torvalds 	error = xlog_do_log_recovery(log, head_blk, tail_blk);
332843ff2122SChristoph Hellwig 	if (error)
33291da177e4SLinus Torvalds 		return error;
33301da177e4SLinus Torvalds 
33312039a272SDave Chinner 	if (xlog_is_shutdown(log))
33322451337dSDave Chinner 		return -EIO;
33331da177e4SLinus Torvalds 
33341da177e4SLinus Torvalds 	/*
33351da177e4SLinus Torvalds 	 * We now update the tail_lsn since much of the recovery has completed
33361da177e4SLinus Torvalds 	 * and there may be space available to use.  If there were no extent
33371da177e4SLinus Torvalds 	 * or iunlinks, we can free up the entire log and set the tail_lsn to
33381da177e4SLinus Torvalds 	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
33391da177e4SLinus Torvalds 	 * lsn of the last known good LR on disk.  If there are extent frees
33401da177e4SLinus Torvalds 	 * or iunlinks they will have some entries in the AIL; so we look at
33411da177e4SLinus Torvalds 	 * the AIL to determine how to set the tail_lsn.
33421da177e4SLinus Torvalds 	 */
3343a798011cSDave Chinner 	xlog_assign_tail_lsn(mp);
33441da177e4SLinus Torvalds 
33451da177e4SLinus Torvalds 	/*
3346b3f8e08cSChristoph Hellwig 	 * Now that we've finished replaying all buffer and inode updates,
3347b3f8e08cSChristoph Hellwig 	 * re-read the superblock and reverify it.
33481da177e4SLinus Torvalds 	 */
3349b3f8e08cSChristoph Hellwig 	xfs_buf_lock(bp);
3350b3f8e08cSChristoph Hellwig 	xfs_buf_hold(bp);
335126e32875SChristoph Hellwig 	error = _xfs_buf_read(bp, XBF_READ);
3352d64e31a2SDavid Chinner 	if (error) {
33532039a272SDave Chinner 		if (!xlog_is_shutdown(log)) {
3354cdbcf82bSDarrick J. Wong 			xfs_buf_ioerror_alert(bp, __this_address);
33551da177e4SLinus Torvalds 			ASSERT(0);
3356595bff75SDave Chinner 		}
33571da177e4SLinus Torvalds 		xfs_buf_relse(bp);
33581da177e4SLinus Torvalds 		return error;
33591da177e4SLinus Torvalds 	}
33601da177e4SLinus Torvalds 
33611da177e4SLinus Torvalds 	/* Convert superblock from on-disk format */
33623e6e8afdSChristoph Hellwig 	xfs_sb_from_disk(sbp, bp->b_addr);
33631da177e4SLinus Torvalds 	xfs_buf_relse(bp);
33641da177e4SLinus Torvalds 
3365a798011cSDave Chinner 	/* re-initialise in-core superblock and geometry structures */
3366a1d86e8dSDave Chinner 	mp->m_features |= xfs_sb_version_to_features(sbp);
3367a798011cSDave Chinner 	xfs_reinit_percpu_counters(mp);
33685478eeadSLachlan McIlroy 
33691da177e4SLinus Torvalds 	/* Normal transactions can now occur */
3370e1d06e5fSDave Chinner 	clear_bit(XLOG_ACTIVE_RECOVERY, &log->l_opstate);
33711da177e4SLinus Torvalds 	return 0;
33721da177e4SLinus Torvalds }
33731da177e4SLinus Torvalds 
33741da177e4SLinus Torvalds /*
33751da177e4SLinus Torvalds  * Perform recovery and re-initialize some log variables in xlog_find_tail.
33761da177e4SLinus Torvalds  *
33771da177e4SLinus Torvalds  * Return error or zero.
33781da177e4SLinus Torvalds  */
33791da177e4SLinus Torvalds int
xlog_recover(struct xlog * log)33801da177e4SLinus Torvalds xlog_recover(
33819a8d2fdbSMark Tinguely 	struct xlog	*log)
33821da177e4SLinus Torvalds {
33831da177e4SLinus Torvalds 	xfs_daddr_t	head_blk, tail_blk;
33841da177e4SLinus Torvalds 	int		error;
33851da177e4SLinus Torvalds 
33861da177e4SLinus Torvalds 	/* find the tail of the log */
3387a45086e2SBrian Foster 	error = xlog_find_tail(log, &head_blk, &tail_blk);
3388a45086e2SBrian Foster 	if (error)
33891da177e4SLinus Torvalds 		return error;
33901da177e4SLinus Torvalds 
3391a45086e2SBrian Foster 	/*
3392a45086e2SBrian Foster 	 * The superblock was read before the log was available and thus the LSN
3393a45086e2SBrian Foster 	 * could not be verified. Check the superblock LSN against the current
3394a45086e2SBrian Foster 	 * LSN now that it's known.
3395a45086e2SBrian Foster 	 */
339638c26bfdSDave Chinner 	if (xfs_has_crc(log->l_mp) &&
3397a45086e2SBrian Foster 	    !xfs_log_check_lsn(log->l_mp, log->l_mp->m_sb.sb_lsn))
3398a45086e2SBrian Foster 		return -EINVAL;
3399a45086e2SBrian Foster 
34001da177e4SLinus Torvalds 	if (tail_blk != head_blk) {
34011da177e4SLinus Torvalds 		/* There used to be a comment here:
34021da177e4SLinus Torvalds 		 *
34031da177e4SLinus Torvalds 		 * disallow recovery on read-only mounts.  note -- mount
34041da177e4SLinus Torvalds 		 * checks for ENOSPC and turns it into an intelligent
34051da177e4SLinus Torvalds 		 * error message.
34061da177e4SLinus Torvalds 		 * ...but this is no longer true.  Now, unless you specify
34071da177e4SLinus Torvalds 		 * NORECOVERY (in which case this function would never be
34081da177e4SLinus Torvalds 		 * called), we just go ahead and recover.  We do this all
34091da177e4SLinus Torvalds 		 * under the vfs layer, so we can get away with it unless
34101da177e4SLinus Torvalds 		 * the device itself is read-only, in which case we fail.
34111da177e4SLinus Torvalds 		 */
34123a02ee18SUtako Kusaka 		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
34131da177e4SLinus Torvalds 			return error;
34141da177e4SLinus Torvalds 		}
34151da177e4SLinus Torvalds 
3416e721f504SDave Chinner 		/*
3417e721f504SDave Chinner 		 * Version 5 superblock log feature mask validation. We know the
3418e721f504SDave Chinner 		 * log is dirty so check if there are any unknown log features
3419e721f504SDave Chinner 		 * in what we need to recover. If there are unknown features
3420e721f504SDave Chinner 		 * (e.g. unsupported transactions, then simply reject the
3421e721f504SDave Chinner 		 * attempt at recovery before touching anything.
3422e721f504SDave Chinner 		 */
3423d6837c1aSDave Chinner 		if (xfs_sb_is_v5(&log->l_mp->m_sb) &&
3424e721f504SDave Chinner 		    xfs_sb_has_incompat_log_feature(&log->l_mp->m_sb,
3425e721f504SDave Chinner 					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN)) {
3426e721f504SDave Chinner 			xfs_warn(log->l_mp,
3427f41febd2SJoe Perches "Superblock has unknown incompatible log features (0x%x) enabled.",
3428e721f504SDave Chinner 				(log->l_mp->m_sb.sb_features_log_incompat &
3429e721f504SDave Chinner 					XFS_SB_FEAT_INCOMPAT_LOG_UNKNOWN));
3430f41febd2SJoe Perches 			xfs_warn(log->l_mp,
3431f41febd2SJoe Perches "The log can not be fully and/or safely recovered by this kernel.");
3432f41febd2SJoe Perches 			xfs_warn(log->l_mp,
3433f41febd2SJoe Perches "Please recover the log on a kernel that supports the unknown features.");
34342451337dSDave Chinner 			return -EINVAL;
3435e721f504SDave Chinner 		}
3436e721f504SDave Chinner 
34372e227178SBrian Foster 		/*
34382e227178SBrian Foster 		 * Delay log recovery if the debug hook is set. This is debug
3439bd24a4f5SBhaskar Chowdhury 		 * instrumentation to coordinate simulation of I/O failures with
34402e227178SBrian Foster 		 * log recovery.
34412e227178SBrian Foster 		 */
34422e227178SBrian Foster 		if (xfs_globals.log_recovery_delay) {
34432e227178SBrian Foster 			xfs_notice(log->l_mp,
34442e227178SBrian Foster 				"Delaying log recovery for %d seconds.",
34452e227178SBrian Foster 				xfs_globals.log_recovery_delay);
34462e227178SBrian Foster 			msleep(xfs_globals.log_recovery_delay * 1000);
34472e227178SBrian Foster 		}
34482e227178SBrian Foster 
3449a0fa2b67SDave Chinner 		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3450a0fa2b67SDave Chinner 				log->l_mp->m_logname ? log->l_mp->m_logname
3451a0fa2b67SDave Chinner 						     : "internal");
34521da177e4SLinus Torvalds 
34531da177e4SLinus Torvalds 		error = xlog_do_recover(log, head_blk, tail_blk);
3454e1d06e5fSDave Chinner 		set_bit(XLOG_RECOVERY_NEEDED, &log->l_opstate);
34551da177e4SLinus Torvalds 	}
34561da177e4SLinus Torvalds 	return error;
34571da177e4SLinus Torvalds }
34581da177e4SLinus Torvalds 
34591da177e4SLinus Torvalds /*
3460fd67d8a0SDave Chinner  * In the first part of recovery we replay inodes and buffers and build up the
3461fd67d8a0SDave Chinner  * list of intents which need to be processed. Here we process the intents and
3462fd67d8a0SDave Chinner  * clean up the on disk unlinked inode lists. This is separated from the first
3463fd67d8a0SDave Chinner  * part of recovery so that the root and real-time bitmap inodes can be read in
3464fd67d8a0SDave Chinner  * from disk in between the two stages.  This is necessary so that we can free
3465fd67d8a0SDave Chinner  * space in the real-time portion of the file system.
34661da177e4SLinus Torvalds  */
34671da177e4SLinus Torvalds int
xlog_recover_finish(struct xlog * log)34681da177e4SLinus Torvalds xlog_recover_finish(
34699a8d2fdbSMark Tinguely 	struct xlog	*log)
34701da177e4SLinus Torvalds {
34713c1e2bbeSDavid Chinner 	int	error;
3472fd67d8a0SDave Chinner 
3473dc42375dSDarrick J. Wong 	error = xlog_recover_process_intents(log);
34743c1e2bbeSDavid Chinner 	if (error) {
34752e76f188SDarrick J. Wong 		/*
3476fd67d8a0SDave Chinner 		 * Cancel all the unprocessed intent items now so that we don't
3477fd67d8a0SDave Chinner 		 * leave them pinned in the AIL.  This can cause the AIL to
3478fd67d8a0SDave Chinner 		 * livelock on the pinned item if anyone tries to push the AIL
3479fd67d8a0SDave Chinner 		 * (inode reclaim does this) before we get around to
3480fd67d8a0SDave Chinner 		 * xfs_log_mount_cancel.
34812e76f188SDarrick J. Wong 		 */
34822e76f188SDarrick J. Wong 		xlog_recover_cancel_intents(log);
3483dc42375dSDarrick J. Wong 		xfs_alert(log->l_mp, "Failed to recover intents");
3484b5f17becSDave Chinner 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
34853c1e2bbeSDavid Chinner 		return error;
34863c1e2bbeSDavid Chinner 	}
34879e88b5d8SDarrick J. Wong 
34881da177e4SLinus Torvalds 	/*
3489fd67d8a0SDave Chinner 	 * Sync the log to get all the intents out of the AIL.  This isn't
3490fd67d8a0SDave Chinner 	 * absolutely necessary, but it helps in case the unlink transactions
3491fd67d8a0SDave Chinner 	 * would have problems pushing the intents out of the way.
34921da177e4SLinus Torvalds 	 */
3493a14a348bSChristoph Hellwig 	xfs_log_force(log->l_mp, XFS_LOG_SYNC);
34941da177e4SLinus Torvalds 
3495908ce71eSDarrick J. Wong 	/*
3496fd67d8a0SDave Chinner 	 * Now that we've recovered the log and all the intents, we can clear
3497fd67d8a0SDave Chinner 	 * the log incompat feature bits in the superblock because there's no
3498fd67d8a0SDave Chinner 	 * longer anything to protect.  We rely on the AIL push to write out the
3499fd67d8a0SDave Chinner 	 * updated superblock after everything else.
3500908ce71eSDarrick J. Wong 	 */
3501908ce71eSDarrick J. Wong 	if (xfs_clear_incompat_log_features(log->l_mp)) {
3502908ce71eSDarrick J. Wong 		error = xfs_sync_sb(log->l_mp, false);
3503908ce71eSDarrick J. Wong 		if (error < 0) {
3504908ce71eSDarrick J. Wong 			xfs_alert(log->l_mp,
3505908ce71eSDarrick J. Wong 	"Failed to clear log incompat features on recovery");
3506908ce71eSDarrick J. Wong 			return error;
3507908ce71eSDarrick J. Wong 		}
3508908ce71eSDarrick J. Wong 	}
3509908ce71eSDarrick J. Wong 
35101da177e4SLinus Torvalds 	xlog_recover_process_iunlinks(log);
35117993f1a4SDarrick J. Wong 
35127993f1a4SDarrick J. Wong 	/*
35137993f1a4SDarrick J. Wong 	 * Recover any CoW staging blocks that are still referenced by the
35147993f1a4SDarrick J. Wong 	 * ondisk refcount metadata.  During mount there cannot be any live
35157993f1a4SDarrick J. Wong 	 * staging extents as we have not permitted any user modifications.
35167993f1a4SDarrick J. Wong 	 * Therefore, it is safe to free them all right now, even on a
35177993f1a4SDarrick J. Wong 	 * read-only mount.
35187993f1a4SDarrick J. Wong 	 */
35197993f1a4SDarrick J. Wong 	error = xfs_reflink_recover_cow(log->l_mp);
35207993f1a4SDarrick J. Wong 	if (error) {
35217993f1a4SDarrick J. Wong 		xfs_alert(log->l_mp,
35227993f1a4SDarrick J. Wong 	"Failed to recover leftover CoW staging extents, err %d.",
35237993f1a4SDarrick J. Wong 				error);
35247993f1a4SDarrick J. Wong 		/*
35257993f1a4SDarrick J. Wong 		 * If we get an error here, make sure the log is shut down
35267993f1a4SDarrick J. Wong 		 * but return zero so that any log items committed since the
35277993f1a4SDarrick J. Wong 		 * end of intents processing can be pushed through the CIL
35287993f1a4SDarrick J. Wong 		 * and AIL.
35297993f1a4SDarrick J. Wong 		 */
3530b5f17becSDave Chinner 		xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR);
35317993f1a4SDarrick J. Wong 	}
35327993f1a4SDarrick J. Wong 
35331da177e4SLinus Torvalds 	return 0;
35341da177e4SLinus Torvalds }
35351da177e4SLinus Torvalds 
3536a7a9250eSHariprasad Kelam void
xlog_recover_cancel(struct xlog * log)3537f0b2efadSBrian Foster xlog_recover_cancel(
3538f0b2efadSBrian Foster 	struct xlog	*log)
3539f0b2efadSBrian Foster {
3540e1d06e5fSDave Chinner 	if (xlog_recovery_needed(log))
3541a7a9250eSHariprasad Kelam 		xlog_recover_cancel_intents(log);
3542f0b2efadSBrian Foster }
35431da177e4SLinus Torvalds 
3544