xref: /openbmc/linux/fs/xfs/xfs_inode_item_recover.c (revision f8a11425075ff11b4b5784f077cb84f3d2dfb3f0)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_mount.h"
13 #include "xfs_inode.h"
14 #include "xfs_trans.h"
15 #include "xfs_inode_item.h"
16 #include "xfs_trace.h"
17 #include "xfs_trans_priv.h"
18 #include "xfs_buf_item.h"
19 #include "xfs_log.h"
20 #include "xfs_error.h"
21 #include "xfs_log_priv.h"
22 #include "xfs_log_recover.h"
23 #include "xfs_icache.h"
24 #include "xfs_bmap_btree.h"
25 
26 STATIC void
27 xlog_recover_inode_ra_pass2(
28 	struct xlog                     *log,
29 	struct xlog_recover_item        *item)
30 {
31 	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
32 		struct xfs_inode_log_format	*ilfp = item->ri_buf[0].i_addr;
33 
34 		xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
35 				   &xfs_inode_buf_ra_ops);
36 	} else {
37 		struct xfs_inode_log_format_32	*ilfp = item->ri_buf[0].i_addr;
38 
39 		xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
40 				   &xfs_inode_buf_ra_ops);
41 	}
42 }
43 
44 /*
45  * Inode fork owner changes
46  *
47  * If we have been told that we have to reparent the inode fork, it's because an
48  * extent swap operation on a CRC enabled filesystem has been done and we are
49  * replaying it. We need to walk the BMBT of the appropriate fork and change the
50  * owners of it.
51  *
52  * The complexity here is that we don't have an inode context to work with, so
53  * after we've replayed the inode we need to instantiate one.  This is where the
54  * fun begins.
55  *
56  * We are in the middle of log recovery, so we can't run transactions. That
57  * means we cannot use cache coherent inode instantiation via xfs_iget(), as
58  * that will result in the corresponding iput() running the inode through
59  * xfs_inactive(). If we've just replayed an inode core that changes the link
60  * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
61  * transactions (bad!).
62  *
63  * So, to avoid this, we instantiate an inode directly from the inode core we've
64  * just recovered. We have the buffer still locked, and all we really need to
65  * instantiate is the inode core and the forks being modified. We can do this
66  * manually, then run the inode btree owner change, and then tear down the
67  * xfs_inode without having to run any transactions at all.
68  *
69  * Also, because we don't have a transaction context available here but need to
70  * gather all the buffers we modify for writeback so we pass the buffer_list
71  * instead for the operation to use.
72  */
73 
74 STATIC int
75 xfs_recover_inode_owner_change(
76 	struct xfs_mount	*mp,
77 	struct xfs_dinode	*dip,
78 	struct xfs_inode_log_format *in_f,
79 	struct list_head	*buffer_list)
80 {
81 	struct xfs_inode	*ip;
82 	int			error;
83 
84 	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
85 
86 	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
87 	if (!ip)
88 		return -ENOMEM;
89 
90 	/* instantiate the inode */
91 	ASSERT(dip->di_version >= 3);
92 
93 	error = xfs_inode_from_disk(ip, dip);
94 	if (error)
95 		goto out_free_ip;
96 
97 	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
98 		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
99 		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
100 					      ip->i_ino, buffer_list);
101 		if (error)
102 			goto out_free_ip;
103 	}
104 
105 	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
106 		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
107 		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
108 					      ip->i_ino, buffer_list);
109 		if (error)
110 			goto out_free_ip;
111 	}
112 
113 out_free_ip:
114 	xfs_inode_free(ip);
115 	return error;
116 }
117 
118 static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
119 {
120 	return ld->di_version >= 3 &&
121 	       (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
122 }
123 
124 /* Convert a log timestamp to an ondisk timestamp. */
125 static inline xfs_timestamp_t
126 xfs_log_dinode_to_disk_ts(
127 	struct xfs_log_dinode		*from,
128 	const xfs_log_timestamp_t	its)
129 {
130 	struct xfs_legacy_timestamp	*lts;
131 	struct xfs_log_legacy_timestamp	*lits;
132 	xfs_timestamp_t			ts;
133 
134 	if (xfs_log_dinode_has_bigtime(from))
135 		return cpu_to_be64(its);
136 
137 	lts = (struct xfs_legacy_timestamp *)&ts;
138 	lits = (struct xfs_log_legacy_timestamp *)&its;
139 	lts->t_sec = cpu_to_be32(lits->t_sec);
140 	lts->t_nsec = cpu_to_be32(lits->t_nsec);
141 
142 	return ts;
143 }
144 
145 STATIC void
146 xfs_log_dinode_to_disk(
147 	struct xfs_log_dinode	*from,
148 	struct xfs_dinode	*to)
149 {
150 	to->di_magic = cpu_to_be16(from->di_magic);
151 	to->di_mode = cpu_to_be16(from->di_mode);
152 	to->di_version = from->di_version;
153 	to->di_format = from->di_format;
154 	to->di_onlink = 0;
155 	to->di_uid = cpu_to_be32(from->di_uid);
156 	to->di_gid = cpu_to_be32(from->di_gid);
157 	to->di_nlink = cpu_to_be32(from->di_nlink);
158 	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
159 	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
160 	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));
161 
162 	to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
163 	to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
164 	to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);
165 
166 	to->di_size = cpu_to_be64(from->di_size);
167 	to->di_nblocks = cpu_to_be64(from->di_nblocks);
168 	to->di_extsize = cpu_to_be32(from->di_extsize);
169 	to->di_nextents = cpu_to_be32(from->di_nextents);
170 	to->di_anextents = cpu_to_be16(from->di_anextents);
171 	to->di_forkoff = from->di_forkoff;
172 	to->di_aformat = from->di_aformat;
173 	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
174 	to->di_dmstate = cpu_to_be16(from->di_dmstate);
175 	to->di_flags = cpu_to_be16(from->di_flags);
176 	to->di_gen = cpu_to_be32(from->di_gen);
177 
178 	if (from->di_version == 3) {
179 		to->di_changecount = cpu_to_be64(from->di_changecount);
180 		to->di_crtime = xfs_log_dinode_to_disk_ts(from,
181 							  from->di_crtime);
182 		to->di_flags2 = cpu_to_be64(from->di_flags2);
183 		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
184 		to->di_ino = cpu_to_be64(from->di_ino);
185 		to->di_lsn = cpu_to_be64(from->di_lsn);
186 		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
187 		uuid_copy(&to->di_uuid, &from->di_uuid);
188 		to->di_flushiter = 0;
189 	} else {
190 		to->di_flushiter = cpu_to_be16(from->di_flushiter);
191 	}
192 }
193 
194 STATIC int
195 xlog_recover_inode_commit_pass2(
196 	struct xlog			*log,
197 	struct list_head		*buffer_list,
198 	struct xlog_recover_item	*item,
199 	xfs_lsn_t			current_lsn)
200 {
201 	struct xfs_inode_log_format	*in_f;
202 	struct xfs_mount		*mp = log->l_mp;
203 	struct xfs_buf			*bp;
204 	struct xfs_dinode		*dip;
205 	int				len;
206 	char				*src;
207 	char				*dest;
208 	int				error;
209 	int				attr_index;
210 	uint				fields;
211 	struct xfs_log_dinode		*ldip;
212 	uint				isize;
213 	int				need_free = 0;
214 
215 	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
216 		in_f = item->ri_buf[0].i_addr;
217 	} else {
218 		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
219 		need_free = 1;
220 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
221 		if (error)
222 			goto error;
223 	}
224 
225 	/*
226 	 * Inode buffers can be freed, look out for it,
227 	 * and do not replay the inode.
228 	 */
229 	if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
230 		error = 0;
231 		trace_xfs_log_recover_inode_cancel(log, in_f);
232 		goto error;
233 	}
234 	trace_xfs_log_recover_inode_recover(log, in_f);
235 
236 	error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
237 			0, &bp, &xfs_inode_buf_ops);
238 	if (error)
239 		goto error;
240 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
241 	dip = xfs_buf_offset(bp, in_f->ilf_boffset);
242 
243 	/*
244 	 * Make sure the place we're flushing out to really looks
245 	 * like an inode!
246 	 */
247 	if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
248 		xfs_alert(mp,
249 	"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
250 			__func__, dip, bp, in_f->ilf_ino);
251 		error = -EFSCORRUPTED;
252 		goto out_release;
253 	}
254 	ldip = item->ri_buf[1].i_addr;
255 	if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
256 		xfs_alert(mp,
257 			"%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
258 			__func__, item, in_f->ilf_ino);
259 		error = -EFSCORRUPTED;
260 		goto out_release;
261 	}
262 
263 	/*
264 	 * If the inode has an LSN in it, recover the inode only if it's less
265 	 * than the lsn of the transaction we are replaying. Note: we still
266 	 * need to replay an owner change even though the inode is more recent
267 	 * than the transaction as there is no guarantee that all the btree
268 	 * blocks are more recent than this transaction, too.
269 	 */
270 	if (dip->di_version >= 3) {
271 		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
272 
273 		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
274 			trace_xfs_log_recover_inode_skip(log, in_f);
275 			error = 0;
276 			goto out_owner_change;
277 		}
278 	}
279 
280 	/*
281 	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
282 	 * are transactional and if ordering is necessary we can determine that
283 	 * more accurately by the LSN field in the V3 inode core. Don't trust
284 	 * the inode versions we might be changing them here - use the
285 	 * superblock flag to determine whether we need to look at di_flushiter
286 	 * to skip replay when the on disk inode is newer than the log one
287 	 */
288 	if (!xfs_sb_version_has_v3inode(&mp->m_sb) &&
289 	    ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
290 		/*
291 		 * Deal with the wrap case, DI_MAX_FLUSH is less
292 		 * than smaller numbers
293 		 */
294 		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
295 		    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
296 			/* do nothing */
297 		} else {
298 			trace_xfs_log_recover_inode_skip(log, in_f);
299 			error = 0;
300 			goto out_release;
301 		}
302 	}
303 
304 	/* Take the opportunity to reset the flush iteration count */
305 	ldip->di_flushiter = 0;
306 
307 	if (unlikely(S_ISREG(ldip->di_mode))) {
308 		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
309 		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
310 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
311 					 XFS_ERRLEVEL_LOW, mp, ldip,
312 					 sizeof(*ldip));
313 			xfs_alert(mp,
314 		"%s: Bad regular inode log record, rec ptr "PTR_FMT", "
315 		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
316 				__func__, item, dip, bp, in_f->ilf_ino);
317 			error = -EFSCORRUPTED;
318 			goto out_release;
319 		}
320 	} else if (unlikely(S_ISDIR(ldip->di_mode))) {
321 		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
322 		    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
323 		    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
324 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
325 					     XFS_ERRLEVEL_LOW, mp, ldip,
326 					     sizeof(*ldip));
327 			xfs_alert(mp,
328 		"%s: Bad dir inode log record, rec ptr "PTR_FMT", "
329 		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
330 				__func__, item, dip, bp, in_f->ilf_ino);
331 			error = -EFSCORRUPTED;
332 			goto out_release;
333 		}
334 	}
335 	if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
336 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
337 				     XFS_ERRLEVEL_LOW, mp, ldip,
338 				     sizeof(*ldip));
339 		xfs_alert(mp,
340 	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
341 	"dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
342 			__func__, item, dip, bp, in_f->ilf_ino,
343 			ldip->di_nextents + ldip->di_anextents,
344 			ldip->di_nblocks);
345 		error = -EFSCORRUPTED;
346 		goto out_release;
347 	}
348 	if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
349 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
350 				     XFS_ERRLEVEL_LOW, mp, ldip,
351 				     sizeof(*ldip));
352 		xfs_alert(mp,
353 	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
354 	"dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
355 			item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
356 		error = -EFSCORRUPTED;
357 		goto out_release;
358 	}
359 	isize = xfs_log_dinode_size(mp);
360 	if (unlikely(item->ri_buf[1].i_len > isize)) {
361 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
362 				     XFS_ERRLEVEL_LOW, mp, ldip,
363 				     sizeof(*ldip));
364 		xfs_alert(mp,
365 			"%s: Bad inode log record length %d, rec ptr "PTR_FMT,
366 			__func__, item->ri_buf[1].i_len, item);
367 		error = -EFSCORRUPTED;
368 		goto out_release;
369 	}
370 
371 	/* recover the log dinode inode into the on disk inode */
372 	xfs_log_dinode_to_disk(ldip, dip);
373 
374 	fields = in_f->ilf_fields;
375 	if (fields & XFS_ILOG_DEV)
376 		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
377 
378 	if (in_f->ilf_size == 2)
379 		goto out_owner_change;
380 	len = item->ri_buf[2].i_len;
381 	src = item->ri_buf[2].i_addr;
382 	ASSERT(in_f->ilf_size <= 4);
383 	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
384 	ASSERT(!(fields & XFS_ILOG_DFORK) ||
385 	       (len == in_f->ilf_dsize));
386 
387 	switch (fields & XFS_ILOG_DFORK) {
388 	case XFS_ILOG_DDATA:
389 	case XFS_ILOG_DEXT:
390 		memcpy(XFS_DFORK_DPTR(dip), src, len);
391 		break;
392 
393 	case XFS_ILOG_DBROOT:
394 		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
395 				 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
396 				 XFS_DFORK_DSIZE(dip, mp));
397 		break;
398 
399 	default:
400 		/*
401 		 * There are no data fork flags set.
402 		 */
403 		ASSERT((fields & XFS_ILOG_DFORK) == 0);
404 		break;
405 	}
406 
407 	/*
408 	 * If we logged any attribute data, recover it.  There may or
409 	 * may not have been any other non-core data logged in this
410 	 * transaction.
411 	 */
412 	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
413 		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
414 			attr_index = 3;
415 		} else {
416 			attr_index = 2;
417 		}
418 		len = item->ri_buf[attr_index].i_len;
419 		src = item->ri_buf[attr_index].i_addr;
420 		ASSERT(len == in_f->ilf_asize);
421 
422 		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
423 		case XFS_ILOG_ADATA:
424 		case XFS_ILOG_AEXT:
425 			dest = XFS_DFORK_APTR(dip);
426 			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
427 			memcpy(dest, src, len);
428 			break;
429 
430 		case XFS_ILOG_ABROOT:
431 			dest = XFS_DFORK_APTR(dip);
432 			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
433 					 len, (struct xfs_bmdr_block *)dest,
434 					 XFS_DFORK_ASIZE(dip, mp));
435 			break;
436 
437 		default:
438 			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
439 			ASSERT(0);
440 			error = -EFSCORRUPTED;
441 			goto out_release;
442 		}
443 	}
444 
445 out_owner_change:
446 	/* Recover the swapext owner change unless inode has been deleted */
447 	if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
448 	    (dip->di_mode != 0))
449 		error = xfs_recover_inode_owner_change(mp, dip, in_f,
450 						       buffer_list);
451 	/* re-generate the checksum. */
452 	xfs_dinode_calc_crc(log->l_mp, dip);
453 
454 	ASSERT(bp->b_mount == mp);
455 	bp->b_flags |= _XBF_LOGRECOVERY;
456 	xfs_buf_delwri_queue(bp, buffer_list);
457 
458 out_release:
459 	xfs_buf_relse(bp);
460 error:
461 	if (need_free)
462 		kmem_free(in_f);
463 	return error;
464 }
465 
/*
 * Log recovery operations for XFS_LI_INODE items: the pass 2 readahead and
 * pass 2 commit handlers defined in this file.
 */
const struct xlog_recover_item_ops xlog_inode_item_ops = {
	.item_type		= XFS_LI_INODE,
	.ra_pass2		= xlog_recover_inode_ra_pass2,
	.commit_pass2		= xlog_recover_inode_commit_pass2,
};
471