// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_trans.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
#include "xfs_trans_priv.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_error.h"
#include "xfs_log_priv.h"
#include "xfs_log_recover.h"
#include "xfs_icache.h"
#include "xfs_bmap_btree.h"

/*
 * Start readahead on the buffer named by a logged inode item.
 *
 * The first region of the item (ri_buf[0]) holds the inode log format
 * header. Its length tells us which layout was logged: if it matches
 * sizeof(struct xfs_inode_log_format) the native structure is used, otherwise
 * the region is interpreted as a struct xfs_inode_log_format_32. In both cases
 * the buffer block number and length are taken from the header and handed to
 * xlog_buf_readahead() with the inode readahead buffer ops.
 */
STATIC void
xlog_recover_inode_ra_pass2(
	struct xlog			*log,
	struct xlog_recover_item	*item)
{
	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
		struct xfs_inode_log_format	*ilfp = item->ri_buf[0].i_addr;

		xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
				   &xfs_inode_buf_ra_ops);
	} else {
		struct xfs_inode_log_format_32	*ilfp = item->ri_buf[0].i_addr;

		xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len,
				   &xfs_inode_buf_ra_ops);
	}
}

/*
 * Inode fork owner changes
 *
 * If we have been told that we have to reparent the inode fork, it's because an
 * extent swap operation on a CRC enabled filesystem has been done and we are
 * replaying it.  We need to walk the BMBT of the appropriate fork and change the
 * owners of it.
 *
 * The complexity here is that we don't have an inode context to work with, so
 * after we've replayed the inode we need to instantiate one.  This is where the
 * fun begins.
 *
 * We are in the middle of log recovery, so we can't run transactions. That
 * means we cannot use cache coherent inode instantiation via xfs_iget(), as
 * that will result in the corresponding iput() running the inode through
 * xfs_inactive(). If we've just replayed an inode core that changes the link
 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
 * transactions (bad!).
 *
 * So, to avoid this, we instantiate an inode directly from the inode core we've
 * just recovered. We have the buffer still locked, and all we really need to
 * instantiate is the inode core and the forks being modified. We can do this
 * manually, then run the inode btree owner change, and then tear down the
 * xfs_inode without having to run any transactions at all.
 *
 * Also, because we don't have a transaction context available here but need to
 * gather all the buffers we modify for writeback, we pass the buffer_list
 * instead for the operation to use.
 *
 * Returns 0 on success, -ENOMEM if the temporary inode cannot be allocated, or
 * the error returned by xfs_inode_from_disk() / xfs_bmbt_change_owner(). The
 * temporary inode is freed on every path once it has been allocated.
 */

STATIC int
xfs_recover_inode_owner_change(
	struct xfs_mount	*mp,
	struct xfs_dinode	*dip,
	struct xfs_inode_log_format *in_f,
	struct list_head	*buffer_list)
{
	struct xfs_inode	*ip;
	int			error;

	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));

	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
	if (!ip)
		return -ENOMEM;

	/* instantiate the inode */
	ASSERT(dip->di_version >= 3);

	error = xfs_inode_from_disk(ip, dip);
	if (error)
		goto out_free_ip;

	/* Data fork owner change; only expected together with a logged root. */
	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
					      ip->i_ino, buffer_list);
		if (error)
			goto out_free_ip;
	}

	/* Attr fork owner change; only expected together with a logged root. */
	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
					      ip->i_ino, buffer_list);
		if (error)
			goto out_free_ip;
	}

out_free_ip:
	/* Success falls through to here too; error is 0 in that case. */
	xfs_inode_free(ip);
	return error;
}

/*
 * Return true if the log dinode is at least version 3 and has
 * XFS_DIFLAG2_BIGTIME set in di_flags2.
 */
static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld)
{
	return ld->di_version >= 3 &&
	       (ld->di_flags2 & XFS_DIFLAG2_BIGTIME);
}

/*
 * Convert a log timestamp to an ondisk timestamp.
 *
 * For a bigtime log dinode the timestamp is converted as a single 64-bit
 * value. Otherwise both the log and ondisk timestamps are viewed through their
 * legacy structures and the t_sec and t_nsec members are converted as two
 * separate 32-bit values.
 */
static inline xfs_timestamp_t
xfs_log_dinode_to_disk_ts(
	struct xfs_log_dinode		*from,
	const xfs_log_timestamp_t	its)
{
	struct xfs_legacy_timestamp	*lts;
	struct xfs_log_legacy_timestamp	*lits;
	xfs_timestamp_t			ts;

	if (xfs_log_dinode_has_bigtime(from))
		return cpu_to_be64(its);

	lts = (struct xfs_legacy_timestamp *)&ts;
	lits = (struct xfs_log_legacy_timestamp *)&its;
	lts->t_sec = cpu_to_be32(lits->t_sec);
	lts->t_nsec = cpu_to_be32(lits->t_nsec);

	return ts;
}

/*
 * Copy the inode core from a log dinode (@from) into an ondisk dinode (@to),
 * converting multi-byte fields to big endian as they are stored.
 *
 * di_onlink is always written as zero. For version 3 log dinodes the
 * v3-only fields are copied as well, di_lsn is set from @lsn rather than from
 * the log dinode, and di_flushiter is written as zero. For any other version
 * di_flushiter is copied from the log dinode and the v3-only fields are left
 * untouched.
 */
STATIC void
xfs_log_dinode_to_disk(
	struct xfs_log_dinode	*from,
	struct xfs_dinode	*to,
	xfs_lsn_t		lsn)
{
	to->di_magic = cpu_to_be16(from->di_magic);
	to->di_mode = cpu_to_be16(from->di_mode);
	to->di_version = from->di_version;
	to->di_format = from->di_format;
	to->di_onlink = 0;
	to->di_uid = cpu_to_be32(from->di_uid);
	to->di_gid = cpu_to_be32(from->di_gid);
	to->di_nlink = cpu_to_be32(from->di_nlink);
	to->di_projid_lo = cpu_to_be16(from->di_projid_lo);
	to->di_projid_hi = cpu_to_be16(from->di_projid_hi);
	memcpy(to->di_pad, from->di_pad, sizeof(to->di_pad));

	to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime);
	to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime);
	to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime);

	to->di_size = cpu_to_be64(from->di_size);
	to->di_nblocks = cpu_to_be64(from->di_nblocks);
	to->di_extsize = cpu_to_be32(from->di_extsize);
	to->di_nextents = cpu_to_be32(from->di_nextents);
	to->di_anextents = cpu_to_be16(from->di_anextents);
	to->di_forkoff = from->di_forkoff;
	to->di_aformat = from->di_aformat;
	to->di_dmevmask = cpu_to_be32(from->di_dmevmask);
	to->di_dmstate = cpu_to_be16(from->di_dmstate);
	to->di_flags = cpu_to_be16(from->di_flags);
	to->di_gen = cpu_to_be32(from->di_gen);

	if (from->di_version == 3) {
		to->di_changecount = cpu_to_be64(from->di_changecount);
		to->di_crtime = xfs_log_dinode_to_disk_ts(from,
							  from->di_crtime);
		to->di_flags2 = cpu_to_be64(from->di_flags2);
		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
		to->di_ino = cpu_to_be64(from->di_ino);
		/* Use the caller's LSN, not whatever the log dinode carries. */
		to->di_lsn = cpu_to_be64(lsn);
		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
		uuid_copy(&to->di_uuid, &from->di_uuid);
		to->di_flushiter = 0;
	} else {
		to->di_flushiter = cpu_to_be16(from->di_flushiter);
	}
}

/*
 * Replay a logged inode item into its inode buffer (recovery pass 2).
 *
 * @log:	 log being recovered; log->l_mp supplies the mount.
 * @buffer_list: delayed write list that the modified inode buffer (and any
 *		 buffers touched by an owner change) are queued on.
 * @item:	 recovery item. ri_buf[0] is the inode log format header,
 *		 ri_buf[1] is the log dinode, and the following regions hold
 *		 the logged data fork and/or attr fork contents.
 * @current_lsn: LSN of the transaction being replayed; compared against the
 *		 ondisk inode's LSN and stamped into recovered v3 inodes.
 *
 * Returns 0 when the inode was replayed, or when replay was deliberately
 * skipped (cancelled buffer, or ondisk inode newer than the logged change).
 * Returns a negative errno on read failure, format conversion failure, owner
 * change failure, or -EFSCORRUPTED when the ondisk inode or the log record
 * fails one of the sanity checks below.
 *
 * Exit labels, in unwind order:
 *   out_owner_change - run the owner change if requested, recompute the CRC
 *			and queue the buffer for delayed write;
 *   out_release      - release the inode buffer;
 *   error	      - free the converted format header if one was allocated.
 */
STATIC int
xlog_recover_inode_commit_pass2(
	struct xlog			*log,
	struct list_head		*buffer_list,
	struct xlog_recover_item	*item,
	xfs_lsn_t			current_lsn)
{
	struct xfs_inode_log_format	*in_f;
	struct xfs_mount		*mp = log->l_mp;
	struct xfs_buf			*bp;
	struct xfs_dinode		*dip;
	int				len;
	char				*src;
	char				*dest;
	int				error;
	int				attr_index;
	uint				fields;
	struct xfs_log_dinode		*ldip;
	uint				isize;
	int				need_free = 0;

	/*
	 * Use the logged format header in place if it has the native size,
	 * otherwise convert it into a freshly allocated native structure that
	 * must be freed on exit (need_free).
	 */
	if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) {
		in_f = item->ri_buf[0].i_addr;
	} else {
		in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0);
		need_free = 1;
		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
		if (error)
			goto error;
	}

	/*
	 * Inode buffers can be freed, look out for it,
	 * and do not replay the inode.
	 */
	if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) {
		error = 0;
		trace_xfs_log_recover_inode_cancel(log, in_f);
		goto error;
	}
	trace_xfs_log_recover_inode_recover(log, in_f);

	error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
			0, &bp, &xfs_inode_buf_ops);
	if (error)
		goto error;
	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
	/* Locate the ondisk inode inside the buffer we just read. */
	dip = xfs_buf_offset(bp, in_f->ilf_boffset);

	/*
	 * Make sure the place we're flushing out to really looks
	 * like an inode!
	 */
	if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) {
		xfs_alert(mp,
	"%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %Ld",
			__func__, dip, bp, in_f->ilf_ino);
		error = -EFSCORRUPTED;
		goto out_release;
	}
	/* The second region holds the logged inode core; check its magic. */
	ldip = item->ri_buf[1].i_addr;
	if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) {
		xfs_alert(mp,
			"%s: Bad inode log record, rec ptr "PTR_FMT", ino %Ld",
			__func__, item, in_f->ilf_ino);
		error = -EFSCORRUPTED;
		goto out_release;
	}

	/*
	 * If the inode has an LSN in it, recover the inode only if the on-disk
	 * inode's LSN is older than the lsn of the transaction we are
	 * replaying. We can have multiple checkpoints with the same start LSN,
	 * so the current LSN being equal to the on-disk LSN doesn't necessarily
	 * mean that the on-disk inode is more recent than the change being
	 * replayed.
	 *
	 * We must check the current_lsn against the on-disk inode
	 * here because we can't trust the log dinode to contain a valid LSN
	 * (see comment below before replaying the log dinode for details).
	 *
	 * Note: we still need to replay an owner change even though the inode
	 * is more recent than the transaction as there is no guarantee that all
	 * the btree blocks are more recent than this transaction, too.
	 */
	if (dip->di_version >= 3) {
		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);

		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
			trace_xfs_log_recover_inode_skip(log, in_f);
			error = 0;
			goto out_owner_change;
		}
	}

	/*
	 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes
	 * are transactional and if ordering is necessary we can determine that
	 * more accurately by the LSN field in the V3 inode core. Don't trust
	 * the inode versions, as we might be changing them here - use the
	 * superblock flag to determine whether we need to look at di_flushiter
	 * to skip replay when the on disk inode is newer than the log one.
	 */
	if (!xfs_has_v3inodes(mp) &&
	    ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
		/*
		 * Deal with the wrap case, DI_MAX_FLUSH is less
		 * than smaller numbers
		 */
		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
		    ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) {
			/* do nothing */
		} else {
			trace_xfs_log_recover_inode_skip(log, in_f);
			error = 0;
			goto out_release;
		}
	}

	/* Take the opportunity to reset the flush iteration count */
	ldip->di_flushiter = 0;

	/*
	 * Sanity check the logged inode core before copying it over the
	 * ondisk inode: fork format versus file type, extent counts versus
	 * block count, fork offset versus inode size, and the length of the
	 * logged core region itself.
	 */
	if (unlikely(S_ISREG(ldip->di_mode))) {
		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (ldip->di_format != XFS_DINODE_FMT_BTREE)) {
			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
					 XFS_ERRLEVEL_LOW, mp, ldip,
					 sizeof(*ldip));
			xfs_alert(mp,
		"%s: Bad regular inode log record, rec ptr "PTR_FMT", "
		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
				__func__, item, dip, bp, in_f->ilf_ino);
			error = -EFSCORRUPTED;
			goto out_release;
		}
	} else if (unlikely(S_ISDIR(ldip->di_mode))) {
		if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) &&
		    (ldip->di_format != XFS_DINODE_FMT_BTREE) &&
		    (ldip->di_format != XFS_DINODE_FMT_LOCAL)) {
			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
					     XFS_ERRLEVEL_LOW, mp, ldip,
					     sizeof(*ldip));
			xfs_alert(mp,
		"%s: Bad dir inode log record, rec ptr "PTR_FMT", "
		"ino ptr = "PTR_FMT", ino bp = "PTR_FMT", ino %Ld",
				__func__, item, dip, bp, in_f->ilf_ino);
			error = -EFSCORRUPTED;
			goto out_release;
		}
	}
	if (unlikely(ldip->di_nextents + ldip->di_anextents > ldip->di_nblocks)){
		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
				     XFS_ERRLEVEL_LOW, mp, ldip,
				     sizeof(*ldip));
		xfs_alert(mp,
	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
	"dino bp "PTR_FMT", ino %Ld, total extents = %d, nblocks = %Ld",
			__func__, item, dip, bp, in_f->ilf_ino,
			ldip->di_nextents + ldip->di_anextents,
			ldip->di_nblocks);
		error = -EFSCORRUPTED;
		goto out_release;
	}
	if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) {
		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
				     XFS_ERRLEVEL_LOW, mp, ldip,
				     sizeof(*ldip));
		xfs_alert(mp,
	"%s: Bad inode log record, rec ptr "PTR_FMT", dino ptr "PTR_FMT", "
	"dino bp "PTR_FMT", ino %Ld, forkoff 0x%x", __func__,
			item, dip, bp, in_f->ilf_ino, ldip->di_forkoff);
		error = -EFSCORRUPTED;
		goto out_release;
	}
	isize = xfs_log_dinode_size(mp);
	if (unlikely(item->ri_buf[1].i_len > isize)) {
		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
				     XFS_ERRLEVEL_LOW, mp, ldip,
				     sizeof(*ldip));
		xfs_alert(mp,
			"%s: Bad inode log record length %d, rec ptr "PTR_FMT,
			__func__, item->ri_buf[1].i_len, item);
		error = -EFSCORRUPTED;
		goto out_release;
	}

	/*
	 * Recover the log dinode inode into the on disk inode.
	 *
	 * The LSN in the log dinode is garbage - it can be zero or reflect
	 * stale in-memory runtime state that isn't coherent with the changes
	 * logged in this transaction or the changes written to the on-disk
	 * inode.  Hence we write the current LSN into the inode because that
	 * matches what xfs_iflush() would write into the inode when flushing
	 * the changes in this transaction.
	 */
	xfs_log_dinode_to_disk(ldip, dip, current_lsn);

	fields = in_f->ilf_fields;
	if (fields & XFS_ILOG_DEV)
		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);

	/*
	 * NOTE(review): ilf_size presumably counts the logged regions, so 2
	 * would mean only the format header and the core were logged and
	 * there is no fork data to replay - confirm against the log format
	 * definition.
	 */
	if (in_f->ilf_size == 2)
		goto out_owner_change;
	/* Third region: data fork contents if DFORK is set, else attr fork. */
	len = item->ri_buf[2].i_len;
	src = item->ri_buf[2].i_addr;
	ASSERT(in_f->ilf_size <= 4);
	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
	ASSERT(!(fields & XFS_ILOG_DFORK) ||
	       (len == in_f->ilf_dsize));

	switch (fields & XFS_ILOG_DFORK) {
	case XFS_ILOG_DDATA:
	case XFS_ILOG_DEXT:
		/* Local data or extent records are copied verbatim. */
		memcpy(XFS_DFORK_DPTR(dip), src, len);
		break;

	case XFS_ILOG_DBROOT:
		/* A logged btree root is converted into the ondisk root. */
		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
				 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip),
				 XFS_DFORK_DSIZE(dip, mp));
		break;

	default:
		/*
		 * There are no data fork flags set.
		 */
		ASSERT((fields & XFS_ILOG_DFORK) == 0);
		break;
	}

	/*
	 * If we logged any attribute data, recover it.  There may or
	 * may not have been any other non-core data logged in this
	 * transaction.
	 */
	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
		/* The attr region follows the data fork region, if any. */
		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
			attr_index = 3;
		} else {
			attr_index = 2;
		}
		len = item->ri_buf[attr_index].i_len;
		src = item->ri_buf[attr_index].i_addr;
		ASSERT(len == in_f->ilf_asize);

		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
		case XFS_ILOG_ADATA:
		case XFS_ILOG_AEXT:
			dest = XFS_DFORK_APTR(dip);
			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
			memcpy(dest, src, len);
			break;

		case XFS_ILOG_ABROOT:
			dest = XFS_DFORK_APTR(dip);
			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
					 len, (struct xfs_bmdr_block *)dest,
					 XFS_DFORK_ASIZE(dip, mp));
			break;

		default:
			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
			ASSERT(0);
			error = -EFSCORRUPTED;
			goto out_release;
		}
	}

out_owner_change:
	/*
	 * Recover the swapext owner change unless inode has been deleted.
	 *
	 * NOTE(review): a failure here does not skip the CRC update or the
	 * delayed write queueing below; the error is still returned to the
	 * caller at the end of the function.
	 */
	if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) &&
	    (dip->di_mode != 0))
		error = xfs_recover_inode_owner_change(mp, dip, in_f,
						       buffer_list);
	/* re-generate the checksum. */
	xfs_dinode_calc_crc(log->l_mp, dip);

	/* Flag the buffer with _XBF_LOGRECOVERY and queue it for delwri. */
	ASSERT(bp->b_mount == mp);
	bp->b_flags |= _XBF_LOGRECOVERY;
	xfs_buf_delwri_queue(bp, buffer_list);

out_release:
	xfs_buf_relse(bp);
error:
	/* Only free in_f when it was allocated for format conversion above. */
	if (need_free)
		kmem_free(in_f);
	return error;
}

/* Recovery operations for XFS_LI_INODE log items. */
const struct xlog_recover_item_ops xlog_inode_item_ops = {
	.item_type		= XFS_LI_INODE,
	.ra_pass2		= xlog_recover_inode_ra_pass2,
	.commit_pass2		= xlog_recover_inode_commit_pass2,
};