1 // SPDX-License-Identifier: GPL-2.0+ 2 /* 3 * Copyright (C) 2017 Oracle. All Rights Reserved. 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_shared.h" 9 #include "xfs_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_btree.h" 13 #include "xfs_log_format.h" 14 #include "xfs_inode.h" 15 #include "xfs_ialloc.h" 16 #include "xfs_da_format.h" 17 #include "xfs_reflink.h" 18 #include "xfs_rmap.h" 19 #include "xfs_bmap_util.h" 20 #include "scrub/scrub.h" 21 #include "scrub/common.h" 22 #include "scrub/btree.h" 23 24 /* 25 * Grab total control of the inode metadata. It doesn't matter here if 26 * the file data is still changing; exclusive access to the metadata is 27 * the goal. 28 */ 29 int 30 xchk_setup_inode( 31 struct xfs_scrub *sc) 32 { 33 int error; 34 35 /* 36 * Try to get the inode. If the verifiers fail, we try again 37 * in raw mode. 38 */ 39 error = xchk_get_inode(sc); 40 switch (error) { 41 case 0: 42 break; 43 case -EFSCORRUPTED: 44 case -EFSBADCRC: 45 return xchk_trans_alloc(sc, 0); 46 default: 47 return error; 48 } 49 50 /* Got the inode, lock it and we're ready to go. */ 51 sc->ilock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 52 xfs_ilock(sc->ip, sc->ilock_flags); 53 error = xchk_trans_alloc(sc, 0); 54 if (error) 55 goto out; 56 sc->ilock_flags |= XFS_ILOCK_EXCL; 57 xfs_ilock(sc->ip, XFS_ILOCK_EXCL); 58 59 out: 60 /* scrub teardown will unlock and release the inode for us */ 61 return error; 62 } 63 64 /* Inode core */ 65 66 /* Validate di_extsize hint. */ 67 STATIC void 68 xchk_inode_extsize( 69 struct xfs_scrub *sc, 70 struct xfs_dinode *dip, 71 xfs_ino_t ino, 72 uint16_t mode, 73 uint16_t flags) 74 { 75 xfs_failaddr_t fa; 76 uint32_t value = be32_to_cpu(dip->di_extsize); 77 78 fa = xfs_inode_validate_extsize(sc->mp, value, mode, flags); 79 if (fa) 80 xchk_ino_set_corrupt(sc, ino); 81 82 /* 83 * XFS allows a sysadmin to change the rt extent size when adding a rt 84 * section to a filesystem after formatting. If there are any 85 * directories with extszinherit and rtinherit set, the hint could 86 * become misaligned with the new rextsize. The verifier doesn't check 87 * this, because we allow rtinherit directories even without an rt 88 * device. Flag this as an administrative warning since we will clean 89 * this up eventually. 90 */ 91 if ((flags & XFS_DIFLAG_RTINHERIT) && 92 (flags & XFS_DIFLAG_EXTSZINHERIT) && 93 value % sc->mp->m_sb.sb_rextsize > 0) 94 xchk_ino_set_warning(sc, ino); 95 } 96 97 /* 98 * Validate di_cowextsize hint. 99 * 100 * The rules are documented at xfs_ioctl_setattr_check_cowextsize(). 101 * These functions must be kept in sync with each other. 102 */ 103 STATIC void 104 xchk_inode_cowextsize( 105 struct xfs_scrub *sc, 106 struct xfs_dinode *dip, 107 xfs_ino_t ino, 108 uint16_t mode, 109 uint16_t flags, 110 uint64_t flags2) 111 { 112 xfs_failaddr_t fa; 113 114 fa = xfs_inode_validate_cowextsize(sc->mp, 115 be32_to_cpu(dip->di_cowextsize), mode, flags, 116 flags2); 117 if (fa) 118 xchk_ino_set_corrupt(sc, ino); 119 } 120 121 /* Make sure the di_flags make sense for the inode. */ 122 STATIC void 123 xchk_inode_flags( 124 struct xfs_scrub *sc, 125 struct xfs_dinode *dip, 126 xfs_ino_t ino, 127 uint16_t mode, 128 uint16_t flags) 129 { 130 struct xfs_mount *mp = sc->mp; 131 132 /* di_flags are all taken, last bit cannot be used */ 133 if (flags & ~XFS_DIFLAG_ANY) 134 goto bad; 135 136 /* rt flags require rt device */ 137 if ((flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp) 138 goto bad; 139 140 /* new rt bitmap flag only valid for rbmino */ 141 if ((flags & XFS_DIFLAG_NEWRTBM) && ino != mp->m_sb.sb_rbmino) 142 goto bad; 143 144 /* directory-only flags */ 145 if ((flags & (XFS_DIFLAG_RTINHERIT | 146 XFS_DIFLAG_EXTSZINHERIT | 147 XFS_DIFLAG_PROJINHERIT | 148 XFS_DIFLAG_NOSYMLINKS)) && 149 !S_ISDIR(mode)) 150 goto bad; 151 152 /* file-only flags */ 153 if ((flags & (XFS_DIFLAG_REALTIME | FS_XFLAG_EXTSIZE)) && 154 !S_ISREG(mode)) 155 goto bad; 156 157 /* filestreams and rt make no sense */ 158 if ((flags & XFS_DIFLAG_FILESTREAM) && (flags & XFS_DIFLAG_REALTIME)) 159 goto bad; 160 161 return; 162 bad: 163 xchk_ino_set_corrupt(sc, ino); 164 } 165 166 /* Make sure the di_flags2 make sense for the inode. */ 167 STATIC void 168 xchk_inode_flags2( 169 struct xfs_scrub *sc, 170 struct xfs_dinode *dip, 171 xfs_ino_t ino, 172 uint16_t mode, 173 uint16_t flags, 174 uint64_t flags2) 175 { 176 struct xfs_mount *mp = sc->mp; 177 178 /* Unknown di_flags2 could be from a future kernel */ 179 if (flags2 & ~XFS_DIFLAG2_ANY) 180 xchk_ino_set_warning(sc, ino); 181 182 /* reflink flag requires reflink feature */ 183 if ((flags2 & XFS_DIFLAG2_REFLINK) && 184 !xfs_has_reflink(mp)) 185 goto bad; 186 187 /* cowextsize flag is checked w.r.t. mode separately */ 188 189 /* file/dir-only flags */ 190 if ((flags2 & XFS_DIFLAG2_DAX) && !(S_ISREG(mode) || S_ISDIR(mode))) 191 goto bad; 192 193 /* file-only flags */ 194 if ((flags2 & XFS_DIFLAG2_REFLINK) && !S_ISREG(mode)) 195 goto bad; 196 197 /* realtime and reflink make no sense, currently */ 198 if ((flags & XFS_DIFLAG_REALTIME) && (flags2 & XFS_DIFLAG2_REFLINK)) 199 goto bad; 200 201 /* no bigtime iflag without the bigtime feature */ 202 if (xfs_dinode_has_bigtime(dip) && !xfs_has_bigtime(mp)) 203 goto bad; 204 205 return; 206 bad: 207 xchk_ino_set_corrupt(sc, ino); 208 } 209 210 static inline void 211 xchk_dinode_nsec( 212 struct xfs_scrub *sc, 213 xfs_ino_t ino, 214 struct xfs_dinode *dip, 215 const xfs_timestamp_t ts) 216 { 217 struct timespec64 tv; 218 219 tv = xfs_inode_from_disk_ts(dip, ts); 220 if (tv.tv_nsec < 0 || tv.tv_nsec >= NSEC_PER_SEC) 221 xchk_ino_set_corrupt(sc, ino); 222 } 223 224 /* Scrub all the ondisk inode fields. */ 225 STATIC void 226 xchk_dinode( 227 struct xfs_scrub *sc, 228 struct xfs_dinode *dip, 229 xfs_ino_t ino) 230 { 231 struct xfs_mount *mp = sc->mp; 232 size_t fork_recs; 233 unsigned long long isize; 234 uint64_t flags2; 235 xfs_extnum_t nextents; 236 xfs_extnum_t naextents; 237 prid_t prid; 238 uint16_t flags; 239 uint16_t mode; 240 241 flags = be16_to_cpu(dip->di_flags); 242 if (dip->di_version >= 3) 243 flags2 = be64_to_cpu(dip->di_flags2); 244 else 245 flags2 = 0; 246 247 /* di_mode */ 248 mode = be16_to_cpu(dip->di_mode); 249 switch (mode & S_IFMT) { 250 case S_IFLNK: 251 case S_IFREG: 252 case S_IFDIR: 253 case S_IFCHR: 254 case S_IFBLK: 255 case S_IFIFO: 256 case S_IFSOCK: 257 /* mode is recognized */ 258 break; 259 default: 260 xchk_ino_set_corrupt(sc, ino); 261 break; 262 } 263 264 /* v1/v2 fields */ 265 switch (dip->di_version) { 266 case 1: 267 /* 268 * We autoconvert v1 inodes into v2 inodes on writeout, 269 * so just mark this inode for preening. 270 */ 271 xchk_ino_set_preen(sc, ino); 272 prid = 0; 273 break; 274 case 2: 275 case 3: 276 if (dip->di_onlink != 0) 277 xchk_ino_set_corrupt(sc, ino); 278 279 if (dip->di_mode == 0 && sc->ip) 280 xchk_ino_set_corrupt(sc, ino); 281 282 if (dip->di_projid_hi != 0 && 283 !xfs_has_projid32(mp)) 284 xchk_ino_set_corrupt(sc, ino); 285 286 prid = be16_to_cpu(dip->di_projid_lo); 287 break; 288 default: 289 xchk_ino_set_corrupt(sc, ino); 290 return; 291 } 292 293 if (xfs_has_projid32(mp)) 294 prid |= (prid_t)be16_to_cpu(dip->di_projid_hi) << 16; 295 296 /* 297 * di_uid/di_gid -- -1 isn't invalid, but there's no way that 298 * userspace could have created that. 299 */ 300 if (dip->di_uid == cpu_to_be32(-1U) || 301 dip->di_gid == cpu_to_be32(-1U)) 302 xchk_ino_set_warning(sc, ino); 303 304 /* 305 * project id of -1 isn't supposed to be valid, but the kernel didn't 306 * always validate that. 307 */ 308 if (prid == -1U) 309 xchk_ino_set_warning(sc, ino); 310 311 /* di_format */ 312 switch (dip->di_format) { 313 case XFS_DINODE_FMT_DEV: 314 if (!S_ISCHR(mode) && !S_ISBLK(mode) && 315 !S_ISFIFO(mode) && !S_ISSOCK(mode)) 316 xchk_ino_set_corrupt(sc, ino); 317 break; 318 case XFS_DINODE_FMT_LOCAL: 319 if (!S_ISDIR(mode) && !S_ISLNK(mode)) 320 xchk_ino_set_corrupt(sc, ino); 321 break; 322 case XFS_DINODE_FMT_EXTENTS: 323 if (!S_ISREG(mode) && !S_ISDIR(mode) && !S_ISLNK(mode)) 324 xchk_ino_set_corrupt(sc, ino); 325 break; 326 case XFS_DINODE_FMT_BTREE: 327 if (!S_ISREG(mode) && !S_ISDIR(mode)) 328 xchk_ino_set_corrupt(sc, ino); 329 break; 330 case XFS_DINODE_FMT_UUID: 331 default: 332 xchk_ino_set_corrupt(sc, ino); 333 break; 334 } 335 336 /* di_[amc]time.nsec */ 337 xchk_dinode_nsec(sc, ino, dip, dip->di_atime); 338 xchk_dinode_nsec(sc, ino, dip, dip->di_mtime); 339 xchk_dinode_nsec(sc, ino, dip, dip->di_ctime); 340 341 /* 342 * di_size. xfs_dinode_verify checks for things that screw up 343 * the VFS such as the upper bit being set and zero-length 344 * symlinks/directories, but we can do more here. 345 */ 346 isize = be64_to_cpu(dip->di_size); 347 if (isize & (1ULL << 63)) 348 xchk_ino_set_corrupt(sc, ino); 349 350 /* Devices, fifos, and sockets must have zero size */ 351 if (!S_ISDIR(mode) && !S_ISREG(mode) && !S_ISLNK(mode) && isize != 0) 352 xchk_ino_set_corrupt(sc, ino); 353 354 /* Directories can't be larger than the data section size (32G) */ 355 if (S_ISDIR(mode) && (isize == 0 || isize >= XFS_DIR2_SPACE_SIZE)) 356 xchk_ino_set_corrupt(sc, ino); 357 358 /* Symlinks can't be larger than SYMLINK_MAXLEN */ 359 if (S_ISLNK(mode) && (isize == 0 || isize >= XFS_SYMLINK_MAXLEN)) 360 xchk_ino_set_corrupt(sc, ino); 361 362 /* 363 * Warn if the running kernel can't handle the kinds of offsets 364 * needed to deal with the file size. In other words, if the 365 * pagecache can't cache all the blocks in this file due to 366 * overly large offsets, flag the inode for admin review. 367 */ 368 if (isize > mp->m_super->s_maxbytes) 369 xchk_ino_set_warning(sc, ino); 370 371 /* di_nblocks */ 372 if (flags2 & XFS_DIFLAG2_REFLINK) { 373 ; /* nblocks can exceed dblocks */ 374 } else if (flags & XFS_DIFLAG_REALTIME) { 375 /* 376 * nblocks is the sum of data extents (in the rtdev), 377 * attr extents (in the datadev), and both forks' bmbt 378 * blocks (in the datadev). This clumsy check is the 379 * best we can do without cross-referencing with the 380 * inode forks. 381 */ 382 if (be64_to_cpu(dip->di_nblocks) >= 383 mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks) 384 xchk_ino_set_corrupt(sc, ino); 385 } else { 386 if (be64_to_cpu(dip->di_nblocks) >= mp->m_sb.sb_dblocks) 387 xchk_ino_set_corrupt(sc, ino); 388 } 389 390 xchk_inode_flags(sc, dip, ino, mode, flags); 391 392 xchk_inode_extsize(sc, dip, ino, mode, flags); 393 394 nextents = xfs_dfork_data_extents(dip); 395 naextents = xfs_dfork_attr_extents(dip); 396 397 /* di_nextents */ 398 fork_recs = XFS_DFORK_DSIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); 399 switch (dip->di_format) { 400 case XFS_DINODE_FMT_EXTENTS: 401 if (nextents > fork_recs) 402 xchk_ino_set_corrupt(sc, ino); 403 break; 404 case XFS_DINODE_FMT_BTREE: 405 if (nextents <= fork_recs) 406 xchk_ino_set_corrupt(sc, ino); 407 break; 408 default: 409 if (nextents != 0) 410 xchk_ino_set_corrupt(sc, ino); 411 break; 412 } 413 414 /* di_forkoff */ 415 if (XFS_DFORK_APTR(dip) >= (char *)dip + mp->m_sb.sb_inodesize) 416 xchk_ino_set_corrupt(sc, ino); 417 if (naextents != 0 && dip->di_forkoff == 0) 418 xchk_ino_set_corrupt(sc, ino); 419 if (dip->di_forkoff == 0 && dip->di_aformat != XFS_DINODE_FMT_EXTENTS) 420 xchk_ino_set_corrupt(sc, ino); 421 422 /* di_aformat */ 423 if (dip->di_aformat != XFS_DINODE_FMT_LOCAL && 424 dip->di_aformat != XFS_DINODE_FMT_EXTENTS && 425 dip->di_aformat != XFS_DINODE_FMT_BTREE) 426 xchk_ino_set_corrupt(sc, ino); 427 428 /* di_anextents */ 429 fork_recs = XFS_DFORK_ASIZE(dip, mp) / sizeof(struct xfs_bmbt_rec); 430 switch (dip->di_aformat) { 431 case XFS_DINODE_FMT_EXTENTS: 432 if (naextents > fork_recs) 433 xchk_ino_set_corrupt(sc, ino); 434 break; 435 case XFS_DINODE_FMT_BTREE: 436 if (naextents <= fork_recs) 437 xchk_ino_set_corrupt(sc, ino); 438 break; 439 default: 440 if (naextents != 0) 441 xchk_ino_set_corrupt(sc, ino); 442 } 443 444 if (dip->di_version >= 3) { 445 xchk_dinode_nsec(sc, ino, dip, dip->di_crtime); 446 xchk_inode_flags2(sc, dip, ino, mode, flags, flags2); 447 xchk_inode_cowextsize(sc, dip, ino, mode, flags, 448 flags2); 449 } 450 } 451 452 /* 453 * Make sure the finobt doesn't think this inode is free. 454 * We don't have to check the inobt ourselves because we got the inode via 455 * IGET_UNTRUSTED, which checks the inobt for us. 456 */ 457 static void 458 xchk_inode_xref_finobt( 459 struct xfs_scrub *sc, 460 xfs_ino_t ino) 461 { 462 struct xfs_inobt_rec_incore rec; 463 xfs_agino_t agino; 464 int has_record; 465 int error; 466 467 if (!sc->sa.fino_cur || xchk_skip_xref(sc->sm)) 468 return; 469 470 agino = XFS_INO_TO_AGINO(sc->mp, ino); 471 472 /* 473 * Try to get the finobt record. If we can't get it, then we're 474 * in good shape. 475 */ 476 error = xfs_inobt_lookup(sc->sa.fino_cur, agino, XFS_LOOKUP_LE, 477 &has_record); 478 if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || 479 !has_record) 480 return; 481 482 error = xfs_inobt_get_rec(sc->sa.fino_cur, &rec, &has_record); 483 if (!xchk_should_check_xref(sc, &error, &sc->sa.fino_cur) || 484 !has_record) 485 return; 486 487 /* 488 * Otherwise, make sure this record either doesn't cover this inode, 489 * or that it does but it's marked present. 490 */ 491 if (rec.ir_startino > agino || 492 rec.ir_startino + XFS_INODES_PER_CHUNK <= agino) 493 return; 494 495 if (rec.ir_free & XFS_INOBT_MASK(agino - rec.ir_startino)) 496 xchk_btree_xref_set_corrupt(sc, sc->sa.fino_cur, 0); 497 } 498 499 /* Cross reference the inode fields with the forks. */ 500 STATIC void 501 xchk_inode_xref_bmap( 502 struct xfs_scrub *sc, 503 struct xfs_dinode *dip) 504 { 505 xfs_extnum_t nextents; 506 xfs_filblks_t count; 507 xfs_filblks_t acount; 508 int error; 509 510 if (xchk_skip_xref(sc->sm)) 511 return; 512 513 /* Walk all the extents to check nextents/naextents/nblocks. */ 514 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK, 515 &nextents, &count); 516 if (!xchk_should_check_xref(sc, &error, NULL)) 517 return; 518 if (nextents < xfs_dfork_data_extents(dip)) 519 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 520 521 error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK, 522 &nextents, &acount); 523 if (!xchk_should_check_xref(sc, &error, NULL)) 524 return; 525 if (nextents != xfs_dfork_attr_extents(dip)) 526 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 527 528 /* Check nblocks against the inode. */ 529 if (count + acount != be64_to_cpu(dip->di_nblocks)) 530 xchk_ino_xref_set_corrupt(sc, sc->ip->i_ino); 531 } 532 533 /* Cross-reference with the other btrees. */ 534 STATIC void 535 xchk_inode_xref( 536 struct xfs_scrub *sc, 537 xfs_ino_t ino, 538 struct xfs_dinode *dip) 539 { 540 xfs_agnumber_t agno; 541 xfs_agblock_t agbno; 542 int error; 543 544 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 545 return; 546 547 agno = XFS_INO_TO_AGNO(sc->mp, ino); 548 agbno = XFS_INO_TO_AGBNO(sc->mp, ino); 549 550 error = xchk_ag_init_existing(sc, agno, &sc->sa); 551 if (!xchk_xref_process_error(sc, agno, agbno, &error)) 552 goto out_free; 553 554 xchk_xref_is_used_space(sc, agbno, 1); 555 xchk_inode_xref_finobt(sc, ino); 556 xchk_xref_is_owned_by(sc, agbno, 1, &XFS_RMAP_OINFO_INODES); 557 xchk_xref_is_not_shared(sc, agbno, 1); 558 xchk_inode_xref_bmap(sc, dip); 559 560 out_free: 561 xchk_ag_free(sc, &sc->sa); 562 } 563 564 /* 565 * If the reflink iflag disagrees with a scan for shared data fork extents, 566 * either flag an error (shared extents w/ no flag) or a preen (flag set w/o 567 * any shared extents). We already checked for reflink iflag set on a non 568 * reflink filesystem. 569 */ 570 static void 571 xchk_inode_check_reflink_iflag( 572 struct xfs_scrub *sc, 573 xfs_ino_t ino) 574 { 575 struct xfs_mount *mp = sc->mp; 576 bool has_shared; 577 int error; 578 579 if (!xfs_has_reflink(mp)) 580 return; 581 582 error = xfs_reflink_inode_has_shared_extents(sc->tp, sc->ip, 583 &has_shared); 584 if (!xchk_xref_process_error(sc, XFS_INO_TO_AGNO(mp, ino), 585 XFS_INO_TO_AGBNO(mp, ino), &error)) 586 return; 587 if (xfs_is_reflink_inode(sc->ip) && !has_shared) 588 xchk_ino_set_preen(sc, ino); 589 else if (!xfs_is_reflink_inode(sc->ip) && has_shared) 590 xchk_ino_set_corrupt(sc, ino); 591 } 592 593 /* Scrub an inode. */ 594 int 595 xchk_inode( 596 struct xfs_scrub *sc) 597 { 598 struct xfs_dinode di; 599 int error = 0; 600 601 /* 602 * If sc->ip is NULL, that means that the setup function called 603 * xfs_iget to look up the inode. xfs_iget returned a EFSCORRUPTED 604 * and a NULL inode, so flag the corruption error and return. 605 */ 606 if (!sc->ip) { 607 xchk_ino_set_corrupt(sc, sc->sm->sm_ino); 608 return 0; 609 } 610 611 /* Scrub the inode core. */ 612 xfs_inode_to_disk(sc->ip, &di, 0); 613 xchk_dinode(sc, &di, sc->ip->i_ino); 614 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) 615 goto out; 616 617 /* 618 * Look for discrepancies between file's data blocks and the reflink 619 * iflag. We already checked the iflag against the file mode when 620 * we scrubbed the dinode. 621 */ 622 if (S_ISREG(VFS_I(sc->ip)->i_mode)) 623 xchk_inode_check_reflink_iflag(sc, sc->ip->i_ino); 624 625 xchk_inode_xref(sc, sc->ip->i_ino, &di); 626 out: 627 return error; 628 } 629