1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * Copyright (C) 2017-2023 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <djwong@kernel.org> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_btree.h" 13 #include "xfs_log_format.h" 14 #include "xfs_inode.h" 15 #include "xfs_ialloc.h" 16 #include "xfs_da_format.h" 17 #include "xfs_reflink.h" 18 #include "xfs_rmap.h" 19 #include "xfs_bmap_util.h" 20 #include "scrub/scrub.h" 21 #include "scrub/common.h" 22 #include "scrub/btree.h" 23 24 /* 25 * Grab total control of the inode metadata. It doesn't matter here if 26 * the file data is still changing; exclusive access to the metadata is 27 * the goal. 28 */ 29 int 30 xchk_setup_inode( 31 struct xfs_scrub *sc) 32 { 33 int error; 34 35 if (xchk_need_intent_drain(sc)) 36 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 37 38 /* 39 * Try to get the inode. If the verifiers fail, we try again 40 * in raw mode. 41 */ 42 error = xchk_get_inode(sc); 43 switch (error) { 44 case 0: 45 break; 46 case -EFSCORRUPTED: 47 case -EFSBADCRC: 48 return xchk_trans_alloc(sc, 0); 49 default: 50 return error; 51 } 52 53 /* Got the inode, lock it and we're ready to go. */ 54 sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 55 xfs_ilock(sc->ip, sc->ilock_flags); 56 error = xchk_trans_alloc(sc, 0); 57 if (error) 58 goto out; 59 sc->ilock_flags |= XFS_ILOCK_EXCL; 60 xfs_ilock(sc->ip, XFS_ILOCK_EXCL); 61 62 out: 63 /* scrub teardown will unlock and release the inode for us */ 64 return error; 65 } 66 67 /* Inode core */ 68 69 /* Validate di_extsize hint. */ 70 STATIC void 71 xchk_inode_extsize( 72 struct xfs_scrub *sc, 73 struct xfs_dinode *dip, 74 xfs_ino_t ino, 75 uint16_t mode, 76 uint16_t flags) 77 { 78 xfs_failaddr_t fa; 79 uint32_t value = be32_to_cpu(dip->di_extsize); 80 81 fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags); 82 if (fa) 83 xchk_ino_set_corrupt(sc, ino); 84 85 /* 86 * XFS allows a sysadmin to change the rt extent size when adding a rt 87 * section to a filesystem after formatting. If there are any 88 * directories with extszinherit and rtinherit set, the hint could 89 * become misaligned with the new rextsize. The verifier doesn't check 90 * this, because we allow rtinherit directories even without an rt 91 * device. Flag this as an administrative warning since we will clean 92 * this up eventually. 93 */ 94 if ((flags & XFS_DIFLAG_RTINHERIT) && 95 (flags & XFS_DIFLAG_EXTSZINHERIT) && 96 value % sc->mp->m_sb.sb_rextsize > 0) 97 xchk_ino_set_warning(sc, ino); 98 } 99 100 /* 101 * Validate di_cowextsize hint. 102 * 103 * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). 104 * These functions must be kept in sync with each other. 105 */ 106 STATIC void 107 xchk_inode_cowextsize( 108 struct xfs_scrub *sc, 109 struct xfs_dinode *dip, 110 xfs_ino_t ino, 111 uint16_t mode, 112 uint16_t flags, 113 uint64_t flags2) 114 { 115 xfs_failaddr_t fa; 116 117 fa = xfs_inode_validate_cowextsize(sc->mp, 118 be32_to_cpu(dip->di_cowextsize), mode, flags, 119 flags2); 120 if (fa) 121 xchk_ino_set_corrupt(sc, ino); 122 } 123 124 /* Make sure the di_flags make sense for the inode. */ 125 STATIC void 126 xchk_inode_flags( 127 struct xfs_scrub *sc, 128 struct xfs_dinode *dip, 129 xfs_ino_t ino, 130 uint16_t mode, 131 uint16_t flags) 132 { 133 struct xfs_mount *mp = sc->mp; 134 135 /* di_flags are all taken, last bit cannot be used */ 136 if (flags & ~XFS_DIFLAG_ANY) 137 goto bad; 138 139 /* rt flags require rt device */ 140 if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) 141 goto bad; 142 143 /* new rt bitmap flag only valid for rbmino */ 144 if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino) 145 goto bad; 146 147 /* directory-only flags */ 148 if ((flags & (XFS_DIFLAG_RTINHERIT | 149 XFS_DIFLAG_EXTSZINHERIT | 150 XFS_DIFLAG_PROJINHERIT | 151 XFS_DIFLAG_NOSYMLINKS)) && 152 !S_ISDIR(mode)) 153 goto bad; 154 155 /* file-only flags */ 156 if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) && 157 !S_ISREG(mode)) 158 goto bad; 159 160 /* filestreams and rt make no sense */ 161 if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) 162 goto bad; 163 164 return; 165 bad: 166 xchk_ino_set_corrupt(sc, ino); 167 } 168 169 /* Make sure the di_flags2 make sense for the inode. */ 170 STATIC void 171 xchk_inode_flags2( 172 struct xfs_scrub *sc, 173 struct xfs_dinode *dip, 174 xfs_ino_t ino, 175 uint16_t mode, 176 uint16_t flags, 177 uint64_t flags2) 178 { 179 struct xfs_mount *mp = sc->mp; 180 181 /* Unknown di_flags2 could be from a future kernel */ 182 if (flags2 & ~XFS_DIFLAG2_ANY) 183 xchk_ino_set_warning(sc, ino); 184 185 /* reflink flag requires reflink feature */ 186 if ((flags2 & XFS_DIFLAG2_REFLINK) && 187 !xfs_has_reflink(mp)) 188 goto bad; 189 190 /* cowextsize flag is checked w.r.t. mode separately */ 191 192 /* file/dir-only flags */ 193 if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode))) 194 goto bad; 195 196 /* file-only flags */ 197 if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) 198 goto bad; 199 200 /* realtime and reflink make no sense, currently */ 201 if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) 202 goto bad; 203 204 /* no bigtime iflag without the bigtime feature */ 205 if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) 206 goto bad; 207 208 return; 209 bad: 210 xchk_ino_set_corrupt(sc, ino); 211 } 212 213 static inline void 214 xchk_dinode_nsec( 215 struct xfs_scrub *sc, 216 xfs_ino_t ino, 217 struct xfs_dinode *dip, 218 const xfs_timestamp_t ts) 219 { 220 struct timespec64 tv; 221 222 tv = xfs_inode_from_disk_ts(dip, ts); 223 if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) 224 xchk_ino_set_corrupt(sc, ino); 225 } 226 227 /* Scrub all the ondisk inode fields. */ 228 STATIC void 229 xchk_dinode( 230 struct xfs_scrub *sc, 231 struct xfs_dinode *dip, 232 xfs_ino_t ino) 233 { 234 struct xfs_mount *mp = sc->mp; 235 size_t fork_recs; 236 unsigned long long isize; 237 uint64_t flags2; 238 xfs_extnum_t nextents; 239 xfs_extnum_t naextents; 240 prid_t prid; 241 uint16_t flags; 242 uint16_t mode; 243 244 flags = be16_to_cpu(dip->di_flags); 245 if (dip->di_version >= 3) 246 flags2 = be64_to_cpu(dip->di_flags2); 247 else 248 flags2 = 0; 249 250 /* di_mode */ 251 mode = be16_to_cpu(dip->di_mode); 252 switch (mode & S_IFMT) { 253 case S_IFLNK: 254 case S_IFREG: 255 case S_IFDIR: 256 case S_IFCHR: 257 case S_IFBLK: 258 case S_IFIFO: 259 case S_IFSOCK: 260 /* mode is recognized */ 261 break; 262 default: 263 xchk_ino_set_corrupt(sc, ino); 264 break; 265 } 266 267 /* v1/v2 fields */ 268 switch (dip->di_version) { 269 case 1: 270 /* 271 * We autoconvert v1 inodes into v2 inodes on writeout, 272 * so just mark this inode for preening. 273 */ 274 xchk_ino_set_preen(sc, ino); 275 prid = 0; 276 break; 277 case 2: 278 case 3: 279 if (dip->di_onlink != 0) 280 xchk_ino_set_corrupt(sc, ino); 281 282 if (dip->di_mode == 0 && sc->ip) 283 xchk_ino_set_corrupt(sc, ino); 284 285 if (dip->di_projid_hi != 0 && 286 !xfs_has_projid32(mp)) 287 xchk_ino_set_corrupt(sc, ino); 288 289 prid = be16_to_cpu(dip->di_projid_lo); 290 break; 291 default: 292 xchk_ino_set_corrupt(sc, ino); 293 return; 294 } 295 296 if (xfs_has_projid32(mp)) 297 prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; 298 299 /* 300 * di_uid/di_gid -- -1 isn't invalid, but there's no way that 301 * userspace could have created that. 302 */ 303 if (dip->di_uid == cpu_to_be32(-1U) || 304 dip->di_gid == cpu_to_be32(-1U)) 305 xchk_ino_set_warning(sc, ino); 306 307 /* 308 * project id of -1 isn't supposed to be valid, but the kernel didn't 309 * always validate that. 310 */ 311 if (prid == -1U) 312 xchk_ino_set_warning(sc, ino); 313 314 /* di_format */ 315 switch (dip->di_format) { 316 case XFS_DINODE_FMT_DEV: 317 if (!S_ISCHR(mode) && !S_ISBLK(mode) && 318 !S_ISFIFO(mode) && !S_ISSOCK(mode)) 319 xchk_ino_set_corrupt(sc, ino); 320 break; 321 case XFS_DINODE_FMT_LOCAL: 322 if (!S_ISDIR(mode) && !S_ISLNK(mode)) 323 xchk_ino_set_corrupt(sc, ino); 324 break; 325 case XFS_DINODE_FMT_EXTENTS: 326 if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) 327 xchk_ino_set_corrupt(sc, ino); 328 break; 329 case XFS_DINODE_FMT_BTREE: 330 if (!S_ISREG(mode) && !S_ISDIR(mode)) 331 xchk_ino_set_corrupt(sc, ino); 332 break; 333 case XFS_DINODE_FMT_UUID: 334 default: 335 xchk_ino_set_corrupt(sc, ino); 336 break; 337 } 338 339 /* di_[amc]time.nsec */ 340 xchk_dinode_nsec(sc, ino, dip, dip->di_atime); 341 xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); 342 xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); 343 344 /* 345 * di_size. xfs_dinode_verify checks for things that screw up 346 * the VFS such as the upper bit being set and zero-length 347 * symlinks/directories, but we can do more here. 348 */ 349 isize = be64_to_cpu(dip->di_size); 350 if (isize & (1ULL << 63)) 351 xchk_ino_set_corrupt(sc, ino); 352 353 /* Devices, fifos, and sockets must have zero size */ 354 if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) 355 xchk_ino_set_corrupt(sc, ino); 356 357 /* Directories can't be larger than the data section size (32G) */ 358 if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) 359 xchk_ino_set_corrupt(sc, ino); 360 361 /* Symlinks can't be larger than SYMLINK_MAXLEN */ 362 if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) 363 xchk_ino_set_corrupt(sc, ino); 364 365 /* 366 * Warn if the running kernel can't handle the kinds of offsets 367 * needed to deal with the file size. In other words, if the 368 * pagecache can't cache all the blocks in this file due to 369 * overly large offsets, flag the inode for admin review. 370 */ 371 if (isize > mp->m_super->s_maxbytes) 372 xchk_ino_set_warning(sc, ino); 373 374 /* di_nblocks */ 375 if (flags2 & XFS_DIFLAG2_REFLINK) { 376 ; /* nblocks can exceed dblocks */ 377 } else if (flags & XFS_DIFLAG_REALTIME) { 378 /* 379 * nblocks is the sum of data extents (in the rtdev), 380 * attr extents (in the datadev), and both forks' bmbt 381 * blocks (in the datadev). This clumsy check is the 382 * best we can do without cross-referencing with the 383 * inode forks. 384 */ 385 if (be64_to_cpu(dip->di_nblocks) >= 386 mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) 387 xchk_ino_set_corrupt(sc, ino); 388 } else { 389 if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) 390 xchk_ino_set_corrupt(sc, ino); 391 } 392 393 xchk_inode_flags(sc, dip, ino, mode, flags); 394 395 xchk_inode_extsize(sc, dip, ino, mode, flags); 396 397 nextents = xfs_dfork_data_extents(dip); 398 naextents = xfs_dfork_attr_extents(dip); 399 400 /* di_nextents */ 401 fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); 402 switch (dip->di_format) { 403 case XFS_DINODE_FMT_EXTENTS: 404 if (nextents > fork_recs) 405 xchk_ino_set_corrupt(sc, ino); 406 break; 407 case XFS_DINODE_FMT_BTREE: 408 if (nextents <= fork_recs) 409 xchk_ino_set_corrupt(sc, ino); 410 break; 411 default: 412 if (nextents != 0) 413 xchk_ino_set_corrupt(sc, ino); 414 break; 415 } 416 417 /* di_forkoff */ 418 if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) 419 xchk_ino_set_corrupt(sc, ino); 420 if (naextents != 0 && dip->di_forkoff == 0) 421 xchk_ino_set_corrupt(sc, ino); 422 if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) 423 xchk_ino_set_corrupt(sc, ino); 424 425 /* di_aformat */ 426 if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && 427 dip->di_aformat != XFS_DINODE_FMT_EXTENTS && 428 dip->di_aformat != XFS_DINODE_FMT_BTREE) 429 xchk_ino_set_corrupt(sc, ino); 430 431 /* di_anextents */ 432 fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); 433 switch (dip->di_aformat) { 434 case XFS_DINODE_FMT_EXTENTS: 435 if (naextents > fork_recs) 436 xchk_ino_set_corrupt(sc, ino); 437 break; 438 case XFS_DINODE_FMT_BTREE: 439 if (naextents <= fork_recs) 440 xchk_ino_set_corrupt(sc, ino); 441 break; 442 default: 443 if (naextents != 0) 444 xchk_ino_set_corrupt(sc, ino); 445 } 446 447 if (dip->di_version >= 3) { 448 xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); 449 xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); 450 xchk_inode_cowextsize(sc, dip, ino, mode, flags, 451 flags2); 452 } 453 } 454 455 /* 456 * Make sure the finobt doesn't think this inode is free. 457 * We don't have to check the inobt ourselves because we got the inode via 458 * IGET_UNTRUSTED, which checks the inobt for us. 459 */ 460 static void 461 xchk_inode_xref_finobt( 462 struct xfs_scrub *sc, 463 xfs_ino_t ino) 464 { 465 struct xfs_inobt_rec_incore rec; 466 xfs_agino_t agino; 467 int has_record; 468 int error; 469 470 if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) 471 return; 472 473 agino = XFS_INO_TO_AGINO(sc->mp, ino); 474 475 /* 476 * Try to get the finobt record. If we can't get it, then we're 477 * in good shape. 478 */ 479 error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE, 480 &has_record); 481 if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || 482 !has_record) 483 return; 484 485 error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record); 486 if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || 487 !has_record) 488 return; 489 490 /* 491 * Otherwise, make sure this record either doesn't cover this inode, 492 * or that it does but it's marked present. 493 */ 494 if (rec.ir_startino > agino || 495 rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) 496 return; 497 498 if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)) 499 xchk_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0); 500 } 501 502 /* Cross reference the inode fields with the forks. */ 503 STATIC void 504 xchk_inode_xref_bmap( 505 struct xfs_scrub *sc, 506 struct xfs_dinode *dip) 507 { 508 xfs_extnum_t nextents; 509 xfs_filblks_t count; 510 xfs_filblks_t acount; 511 int error; 512 513 if (xchk_skip_xref(sc->sm)) 514 return; 515 516 /* Walk all the extents to check nextents/naextents/nblocks. */ 517 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 518 &nextents, &count); 519 if (!xchk_should_check_xref(sc, &error, NULL)) 520 return; 521 if (nextents < xfs_dfork_data_extents(dip)) 522 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 523 524 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 525 &nextents, &acount); 526 if (!xchk_should_check_xref(sc, &error, NULL)) 527 return; 528 if (nextents != xfs_dfork_attr_extents(dip)) 529 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 530 531 /* Check nblocks against the inode. */ 532 if (count + acount != be64_to_cpu(dip->di_nblocks)) 533 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 534 } 535 536 /* Cross-reference with the other btrees. */ 537 STATIC void 538 xchk_inode_xref( 539 struct xfs_scrub *sc, 540 xfs_ino_t ino, 541 struct xfs_dinode *dip) 542 { 543 xfs_agnumber_t agno; 544 xfs_agblock_t agbno; 545 int error; 546 547 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 548 return; 549 550 agno = XFS_INO_TO_AGNO(sc->mp, ino); 551 agbno = XFS_INO_TO_AGBNO(sc->mp, ino); 552 553 error = xchk_ag_init_existing(sc, agno, &sc->sa); 554 if (!xchk_xref_process_error(sc, agno, agbno, &error)) 555 goto out_free; 556 557 xchk_xref_is_used_space(sc, agbno, 1); 558 xchk_inode_xref_finobt(sc, ino); 559 xchk_xref_is_only_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); 560 xchk_xref_is_not_shared(sc, agbno, 1); 561 xchk_xref_is_not_cow_staging(sc, agbno, 1); 562 xchk_inode_xref_bmap(sc, dip); 563 564 out_free: 565 xchk_ag_free(sc, &sc->sa); 566 } 567 568 /* 569 * If the reflink iflag disagrees with a scan for shared data fork extents, 570 * either flag an error (shared extents w/ no flag) or a preen (flag set w/o 571 * any shared extents). We already checked for reflink iflag set on a non 572 * reflink filesystem. 573 */ 574 static void 575 xchk_inode_check_reflink_iflag( 576 struct xfs_scrub *sc, 577 xfs_ino_t ino) 578 { 579 struct xfs_mount *mp = sc->mp; 580 bool has_shared; 581 int error; 582 583 if (!xfs_has_reflink(mp)) 584 return; 585 586 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, 587 &has_shared); 588 if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), 589 XFS_INO_TO_AGBNO(mp, ino), &error)) 590 return; 591 if (xfs_is_reflink_inode(sc->ip) && !has_shared) 592 xchk_ino_set_preen(sc, ino); 593 else if (!xfs_is_reflink_inode(sc->ip) && has_shared) 594 xchk_ino_set_corrupt(sc, ino); 595 } 596 597 /* Scrub an inode. */ 598 int 599 xchk_inode( 600 struct xfs_scrub *sc) 601 { 602 struct xfs_dinode di; 603 int error = 0; 604 605 /* 606 * If sc->ip is NULL, that means that the setup function called 607 * xfs_iget to look up the inode. xfs_iget returned a EFSCORRUPTED 608 * and a NULL inode, so flag the corruption error and return. 609 */ 610 if (!sc->ip) { 611 xchk_ino_set_corrupt(sc, sc->sm->sm_ino); 612 return 0; 613 } 614 615 /* Scrub the inode core. */ 616 xfs_inode_to_disk(sc->ip, &di, 0); 617 xchk_dinode(sc, &di, sc->ip->i_ino); 618 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 619 goto out; 620 621 /* 622 * Look for discrepancies between file's data blocks and the reflink 623 * iflag. We already checked the iflag against the file mode when 624 * we scrubbed the dinode. 625 */ 626 if (S_ISREG(VFS_I(sc->ip)->i_mode)) 627 xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); 628 629 xchk_inode_xref(sc, sc->ip->i_ino, &di); 630 out: 631 return error; 632 } 633