1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2006 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_log_format.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_mount.h" 13 #include "xfs_inode.h" 14 #include "xfs_trans.h" 15 #include "xfs_inode_item.h" 16 #include "xfs_trace.h" 17 #include "xfs_trans_priv.h" 18 #include "xfs_buf_item.h" 19 #include "xfs_log.h" 20 #include "xfs_error.h" 21 #include "xfs_log_priv.h" 22 #include "xfs_log_recover.h" 23 #include "xfs_icache.h" 24 #include "xfs_bmap_btree.h" 25 26 STATIC void 27 xlog_recover_inode_ra_pass2( 28 struct xlog *log, 29 struct xlog_recover_item *item) 30 { 31 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 32 struct xfs_inode_log_format *ilfp = item->ri_buf[0].i_addr; 33 34 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 35 &xfs_inode_buf_ra_ops); 36 } else { 37 struct xfs_inode_log_format_32 *ilfp = item->ri_buf[0].i_addr; 38 39 xlog_buf_readahead(log, ilfp->ilf_blkno, ilfp->ilf_len, 40 &xfs_inode_buf_ra_ops); 41 } 42 } 43 44 /* 45 * Inode fork owner changes 46 * 47 * If we have been told that we have to reparent the inode fork, it's because an 48 * extent swap operation on a CRC enabled filesystem has been done and we are 49 * replaying it. We need to walk the BMBT of the appropriate fork and change the 50 * owners of it. 51 * 52 * The complexity here is that we don't have an inode context to work with, so 53 * after we've replayed the inode we need to instantiate one. This is where the 54 * fun begins. 55 * 56 * We are in the middle of log recovery, so we can't run transactions. That 57 * means we cannot use cache coherent inode instantiation via xfs_iget(), as 58 * that will result in the corresponding iput() running the inode through 59 * xfs_inactive(). If we've just replayed an inode core that changes the link 60 * count to zero (i.e. it's been unlinked), then xfs_inactive() will run 61 * transactions (bad!). 62 * 63 * So, to avoid this, we instantiate an inode directly from the inode core we've 64 * just recovered. We have the buffer still locked, and all we really need to 65 * instantiate is the inode core and the forks being modified. We can do this 66 * manually, then run the inode btree owner change, and then tear down the 67 * xfs_inode without having to run any transactions at all. 68 * 69 * Also, because we don't have a transaction context available here but need to 70 * gather all the buffers we modify for writeback so we pass the buffer_list 71 * instead for the operation to use. 72 */ 73 74 STATIC int 75 xfs_recover_inode_owner_change( 76 struct xfs_mount *mp, 77 struct xfs_dinode *dip, 78 struct xfs_inode_log_format *in_f, 79 struct list_head *buffer_list) 80 { 81 struct xfs_inode *ip; 82 int error; 83 84 ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)); 85 86 ip = xfs_inode_alloc(mp, in_f->ilf_ino); 87 if (!ip) 88 return -ENOMEM; 89 90 /* instantiate the inode */ 91 ASSERT(dip->di_version >= 3); 92 93 error = xfs_inode_from_disk(ip, dip); 94 if (error) 95 goto out_free_ip; 96 97 if (in_f->ilf_fields & XFS_ILOG_DOWNER) { 98 ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT); 99 error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK, 100 ip->i_ino, buffer_list); 101 if (error) 102 goto out_free_ip; 103 } 104 105 if (in_f->ilf_fields & XFS_ILOG_AOWNER) { 106 ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT); 107 error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK, 108 ip->i_ino, buffer_list); 109 if (error) 110 goto out_free_ip; 111 } 112 113 out_free_ip: 114 xfs_inode_free(ip); 115 return error; 116 } 117 118 static inline bool xfs_log_dinode_has_bigtime(const struct xfs_log_dinode *ld) 119 { 120 return ld->di_version >= 3 && 121 (ld->di_flags2 & XFS_DIFLAG2_BIGTIME); 122 } 123 124 /* Convert a log timestamp to an ondisk timestamp. */ 125 static inline xfs_timestamp_t 126 xfs_log_dinode_to_disk_ts( 127 struct xfs_log_dinode *from, 128 const xfs_log_timestamp_t its) 129 { 130 struct xfs_legacy_timestamp *lts; 131 struct xfs_log_legacy_timestamp *lits; 132 xfs_timestamp_t ts; 133 134 if (xfs_log_dinode_has_bigtime(from)) 135 return cpu_to_be64(its); 136 137 lts = (struct xfs_legacy_timestamp *)&ts; 138 lits = (struct xfs_log_legacy_timestamp *)&its; 139 lts->t_sec = cpu_to_be32(lits->t_sec); 140 lts->t_nsec = cpu_to_be32(lits->t_nsec); 141 142 return ts; 143 } 144 145 static inline bool xfs_log_dinode_has_large_extent_counts( 146 const struct xfs_log_dinode *ld) 147 { 148 return ld->di_version >= 3 && 149 (ld->di_flags2 & XFS_DIFLAG2_NREXT64); 150 } 151 152 static inline void 153 xfs_log_dinode_to_disk_iext_counters( 154 struct xfs_log_dinode *from, 155 struct xfs_dinode *to) 156 { 157 if (xfs_log_dinode_has_large_extent_counts(from)) { 158 to->di_big_nextents = cpu_to_be64(from->di_big_nextents); 159 to->di_big_anextents = cpu_to_be32(from->di_big_anextents); 160 to->di_nrext64_pad = cpu_to_be16(from->di_nrext64_pad); 161 } else { 162 to->di_nextents = cpu_to_be32(from->di_nextents); 163 to->di_anextents = cpu_to_be16(from->di_anextents); 164 } 165 166 } 167 168 STATIC void 169 xfs_log_dinode_to_disk( 170 struct xfs_log_dinode *from, 171 struct xfs_dinode *to, 172 xfs_lsn_t lsn) 173 { 174 to->di_magic = cpu_to_be16(from->di_magic); 175 to->di_mode = cpu_to_be16(from->di_mode); 176 to->di_version = from->di_version; 177 to->di_format = from->di_format; 178 to->di_onlink = 0; 179 to->di_uid = cpu_to_be32(from->di_uid); 180 to->di_gid = cpu_to_be32(from->di_gid); 181 to->di_nlink = cpu_to_be32(from->di_nlink); 182 to->di_projid_lo = cpu_to_be16(from->di_projid_lo); 183 to->di_projid_hi = cpu_to_be16(from->di_projid_hi); 184 185 to->di_atime = xfs_log_dinode_to_disk_ts(from, from->di_atime); 186 to->di_mtime = xfs_log_dinode_to_disk_ts(from, from->di_mtime); 187 to->di_ctime = xfs_log_dinode_to_disk_ts(from, from->di_ctime); 188 189 to->di_size = cpu_to_be64(from->di_size); 190 to->di_nblocks = cpu_to_be64(from->di_nblocks); 191 to->di_extsize = cpu_to_be32(from->di_extsize); 192 to->di_forkoff = from->di_forkoff; 193 to->di_aformat = from->di_aformat; 194 to->di_dmevmask = cpu_to_be32(from->di_dmevmask); 195 to->di_dmstate = cpu_to_be16(from->di_dmstate); 196 to->di_flags = cpu_to_be16(from->di_flags); 197 to->di_gen = cpu_to_be32(from->di_gen); 198 199 if (from->di_version == 3) { 200 to->di_changecount = cpu_to_be64(from->di_changecount); 201 to->di_crtime = xfs_log_dinode_to_disk_ts(from, 202 from->di_crtime); 203 to->di_flags2 = cpu_to_be64(from->di_flags2); 204 to->di_cowextsize = cpu_to_be32(from->di_cowextsize); 205 to->di_ino = cpu_to_be64(from->di_ino); 206 to->di_lsn = cpu_to_be64(lsn); 207 memset(to->di_pad2, 0, sizeof(to->di_pad2)); 208 uuid_copy(&to->di_uuid, &from->di_uuid); 209 to->di_v3_pad = 0; 210 } else { 211 to->di_flushiter = cpu_to_be16(from->di_flushiter); 212 memset(to->di_v2_pad, 0, sizeof(to->di_v2_pad)); 213 } 214 215 xfs_log_dinode_to_disk_iext_counters(from, to); 216 } 217 218 STATIC int 219 xlog_dinode_verify_extent_counts( 220 struct xfs_mount *mp, 221 struct xfs_log_dinode *ldip) 222 { 223 xfs_extnum_t nextents; 224 xfs_aextnum_t anextents; 225 226 if (xfs_log_dinode_has_large_extent_counts(ldip)) { 227 if (!xfs_has_large_extent_counts(mp) || 228 (ldip->di_nrext64_pad != 0)) { 229 XFS_CORRUPTION_ERROR( 230 "Bad log dinode large extent count format", 231 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 232 xfs_alert(mp, 233 "Bad inode 0x%llx, large extent counts %d, padding 0x%x", 234 ldip->di_ino, xfs_has_large_extent_counts(mp), 235 ldip->di_nrext64_pad); 236 return -EFSCORRUPTED; 237 } 238 239 nextents = ldip->di_big_nextents; 240 anextents = ldip->di_big_anextents; 241 } else { 242 if (ldip->di_version == 3 && ldip->di_v3_pad != 0) { 243 XFS_CORRUPTION_ERROR( 244 "Bad log dinode di_v3_pad", 245 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 246 xfs_alert(mp, 247 "Bad inode 0x%llx, di_v3_pad 0x%llx", 248 ldip->di_ino, ldip->di_v3_pad); 249 return -EFSCORRUPTED; 250 } 251 252 nextents = ldip->di_nextents; 253 anextents = ldip->di_anextents; 254 } 255 256 if (unlikely(nextents + anextents > ldip->di_nblocks)) { 257 XFS_CORRUPTION_ERROR("Bad log dinode extent counts", 258 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 259 xfs_alert(mp, 260 "Bad inode 0x%llx, large extent counts %d, nextents 0x%llx, anextents 0x%x, nblocks 0x%llx", 261 ldip->di_ino, xfs_has_large_extent_counts(mp), nextents, 262 anextents, ldip->di_nblocks); 263 return -EFSCORRUPTED; 264 } 265 266 return 0; 267 } 268 269 STATIC int 270 xlog_recover_inode_commit_pass2( 271 struct xlog *log, 272 struct list_head *buffer_list, 273 struct xlog_recover_item *item, 274 xfs_lsn_t current_lsn) 275 { 276 struct xfs_inode_log_format *in_f; 277 struct xfs_mount *mp = log->l_mp; 278 struct xfs_buf *bp; 279 struct xfs_dinode *dip; 280 int len; 281 char *src; 282 char *dest; 283 int error; 284 int attr_index; 285 uint fields; 286 struct xfs_log_dinode *ldip; 287 uint isize; 288 int need_free = 0; 289 290 if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { 291 in_f = item->ri_buf[0].i_addr; 292 } else { 293 in_f = kmem_alloc(sizeof(struct xfs_inode_log_format), 0); 294 need_free = 1; 295 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f); 296 if (error) 297 goto error; 298 } 299 300 /* 301 * Inode buffers can be freed, look out for it, 302 * and do not replay the inode. 303 */ 304 if (xlog_is_buffer_cancelled(log, in_f->ilf_blkno, in_f->ilf_len)) { 305 error = 0; 306 trace_xfs_log_recover_inode_cancel(log, in_f); 307 goto error; 308 } 309 trace_xfs_log_recover_inode_recover(log, in_f); 310 311 error = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 312 0, &bp, &xfs_inode_buf_ops); 313 if (error) 314 goto error; 315 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); 316 dip = xfs_buf_offset(bp, in_f->ilf_boffset); 317 318 /* 319 * Make sure the place we're flushing out to really looks 320 * like an inode! 321 */ 322 if (XFS_IS_CORRUPT(mp, !xfs_verify_magic16(bp, dip->di_magic))) { 323 xfs_alert(mp, 324 "%s: Bad inode magic number, dip = "PTR_FMT", dino bp = "PTR_FMT", ino = %lld", 325 __func__, dip, bp, in_f->ilf_ino); 326 error = -EFSCORRUPTED; 327 goto out_release; 328 } 329 ldip = item->ri_buf[1].i_addr; 330 if (XFS_IS_CORRUPT(mp, ldip->di_magic != XFS_DINODE_MAGIC)) { 331 xfs_alert(mp, 332 "%s: Bad inode log record, rec ptr "PTR_FMT", ino %lld", 333 __func__, item, in_f->ilf_ino); 334 error = -EFSCORRUPTED; 335 goto out_release; 336 } 337 338 /* 339 * If the inode has an LSN in it, recover the inode only if the on-disk 340 * inode's LSN is older than the lsn of the transaction we are 341 * replaying. We can have multiple checkpoints with the same start LSN, 342 * so the current LSN being equal to the on-disk LSN doesn't necessarily 343 * mean that the on-disk inode is more recent than the change being 344 * replayed. 345 * 346 * We must check the current_lsn against the on-disk inode 347 * here because the we can't trust the log dinode to contain a valid LSN 348 * (see comment below before replaying the log dinode for details). 349 * 350 * Note: we still need to replay an owner change even though the inode 351 * is more recent than the transaction as there is no guarantee that all 352 * the btree blocks are more recent than this transaction, too. 353 */ 354 if (dip->di_version >= 3) { 355 xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn); 356 357 if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) { 358 trace_xfs_log_recover_inode_skip(log, in_f); 359 error = 0; 360 goto out_owner_change; 361 } 362 } 363 364 /* 365 * di_flushiter is only valid for v1/2 inodes. All changes for v3 inodes 366 * are transactional and if ordering is necessary we can determine that 367 * more accurately by the LSN field in the V3 inode core. Don't trust 368 * the inode versions we might be changing them here - use the 369 * superblock flag to determine whether we need to look at di_flushiter 370 * to skip replay when the on disk inode is newer than the log one 371 */ 372 if (!xfs_has_v3inodes(mp)) { 373 if (ldip->di_flushiter < be16_to_cpu(dip->di_flushiter)) { 374 /* 375 * Deal with the wrap case, DI_MAX_FLUSH is less 376 * than smaller numbers 377 */ 378 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH && 379 ldip->di_flushiter < (DI_MAX_FLUSH >> 1)) { 380 /* do nothing */ 381 } else { 382 trace_xfs_log_recover_inode_skip(log, in_f); 383 error = 0; 384 goto out_release; 385 } 386 } 387 388 /* Take the opportunity to reset the flush iteration count */ 389 ldip->di_flushiter = 0; 390 } 391 392 393 if (unlikely(S_ISREG(ldip->di_mode))) { 394 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 395 (ldip->di_format != XFS_DINODE_FMT_BTREE)) { 396 XFS_CORRUPTION_ERROR( 397 "Bad log dinode data fork format for regular file", 398 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 399 xfs_alert(mp, 400 "Bad inode 0x%llx, data fork format 0x%x", 401 in_f->ilf_ino, ldip->di_format); 402 error = -EFSCORRUPTED; 403 goto out_release; 404 } 405 } else if (unlikely(S_ISDIR(ldip->di_mode))) { 406 if ((ldip->di_format != XFS_DINODE_FMT_EXTENTS) && 407 (ldip->di_format != XFS_DINODE_FMT_BTREE) && 408 (ldip->di_format != XFS_DINODE_FMT_LOCAL)) { 409 XFS_CORRUPTION_ERROR( 410 "Bad log dinode data fork format for directory", 411 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 412 xfs_alert(mp, 413 "Bad inode 0x%llx, data fork format 0x%x", 414 in_f->ilf_ino, ldip->di_format); 415 error = -EFSCORRUPTED; 416 goto out_release; 417 } 418 } 419 420 error = xlog_dinode_verify_extent_counts(mp, ldip); 421 if (error) 422 goto out_release; 423 424 if (unlikely(ldip->di_forkoff > mp->m_sb.sb_inodesize)) { 425 XFS_CORRUPTION_ERROR("Bad log dinode fork offset", 426 XFS_ERRLEVEL_LOW, mp, ldip, sizeof(*ldip)); 427 xfs_alert(mp, 428 "Bad inode 0x%llx, di_forkoff 0x%x", 429 in_f->ilf_ino, ldip->di_forkoff); 430 error = -EFSCORRUPTED; 431 goto out_release; 432 } 433 isize = xfs_log_dinode_size(mp); 434 if (unlikely(item->ri_buf[1].i_len > isize)) { 435 XFS_CORRUPTION_ERROR("Bad log dinode size", XFS_ERRLEVEL_LOW, 436 mp, ldip, sizeof(*ldip)); 437 xfs_alert(mp, 438 "Bad inode 0x%llx log dinode size 0x%x", 439 in_f->ilf_ino, item->ri_buf[1].i_len); 440 error = -EFSCORRUPTED; 441 goto out_release; 442 } 443 444 /* 445 * Recover the log dinode inode into the on disk inode. 446 * 447 * The LSN in the log dinode is garbage - it can be zero or reflect 448 * stale in-memory runtime state that isn't coherent with the changes 449 * logged in this transaction or the changes written to the on-disk 450 * inode. Hence we write the current lSN into the inode because that 451 * matches what xfs_iflush() would write inode the inode when flushing 452 * the changes in this transaction. 453 */ 454 xfs_log_dinode_to_disk(ldip, dip, current_lsn); 455 456 fields = in_f->ilf_fields; 457 if (fields & XFS_ILOG_DEV) 458 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev); 459 460 if (in_f->ilf_size == 2) 461 goto out_owner_change; 462 len = item->ri_buf[2].i_len; 463 src = item->ri_buf[2].i_addr; 464 ASSERT(in_f->ilf_size <= 4); 465 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK)); 466 ASSERT(!(fields & XFS_ILOG_DFORK) || 467 (len == xlog_calc_iovec_len(in_f->ilf_dsize))); 468 469 switch (fields & XFS_ILOG_DFORK) { 470 case XFS_ILOG_DDATA: 471 case XFS_ILOG_DEXT: 472 memcpy(XFS_DFORK_DPTR(dip), src, len); 473 break; 474 475 case XFS_ILOG_DBROOT: 476 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len, 477 (struct xfs_bmdr_block *)XFS_DFORK_DPTR(dip), 478 XFS_DFORK_DSIZE(dip, mp)); 479 break; 480 481 default: 482 /* 483 * There are no data fork flags set. 484 */ 485 ASSERT((fields & XFS_ILOG_DFORK) == 0); 486 break; 487 } 488 489 /* 490 * If we logged any attribute data, recover it. There may or 491 * may not have been any other non-core data logged in this 492 * transaction. 493 */ 494 if (in_f->ilf_fields & XFS_ILOG_AFORK) { 495 if (in_f->ilf_fields & XFS_ILOG_DFORK) { 496 attr_index = 3; 497 } else { 498 attr_index = 2; 499 } 500 len = item->ri_buf[attr_index].i_len; 501 src = item->ri_buf[attr_index].i_addr; 502 ASSERT(len == xlog_calc_iovec_len(in_f->ilf_asize)); 503 504 switch (in_f->ilf_fields & XFS_ILOG_AFORK) { 505 case XFS_ILOG_ADATA: 506 case XFS_ILOG_AEXT: 507 dest = XFS_DFORK_APTR(dip); 508 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp)); 509 memcpy(dest, src, len); 510 break; 511 512 case XFS_ILOG_ABROOT: 513 dest = XFS_DFORK_APTR(dip); 514 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, 515 len, (struct xfs_bmdr_block *)dest, 516 XFS_DFORK_ASIZE(dip, mp)); 517 break; 518 519 default: 520 xfs_warn(log->l_mp, "%s: Invalid flag", __func__); 521 ASSERT(0); 522 error = -EFSCORRUPTED; 523 goto out_release; 524 } 525 } 526 527 out_owner_change: 528 /* Recover the swapext owner change unless inode has been deleted */ 529 if ((in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER)) && 530 (dip->di_mode != 0)) 531 error = xfs_recover_inode_owner_change(mp, dip, in_f, 532 buffer_list); 533 /* re-generate the checksum. */ 534 xfs_dinode_calc_crc(log->l_mp, dip); 535 536 ASSERT(bp->b_mount == mp); 537 bp->b_flags |= _XBF_LOGRECOVERY; 538 xfs_buf_delwri_queue(bp, buffer_list); 539 540 out_release: 541 xfs_buf_relse(bp); 542 error: 543 if (need_free) 544 kmem_free(in_f); 545 return error; 546 } 547 548 const struct xlog_recover_item_ops xlog_inode_item_ops = { 549 .item_type = XFS_LI_INODE, 550 .ra_pass2 = xlog_recover_inode_ra_pass2, 551 .commit_pass2 = xlog_recover_inode_commit_pass2, 552 }; 553