1 /* 2 * Copyright (C) 2017 Oracle. All Rights Reserved. 3 * 4 * Author: Darrick J. Wong <darrick.wong@oracle.com> 5 * 6 * This program is free software; you can redistribute it and/or 7 * modify it under the terms of the GNU General Public License 8 * as published by the Free Software Foundation; either version 2 9 * of the License, or (at your option) any later version. 10 * 11 * This program is distributed in the hope that it would be useful, 12 * but WITHOUT ANY WARRANTY; without even the implied warranty of 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 * GNU General Public License for more details. 15 * 16 * You should have received a copy of the GNU General Public License 17 * along with this program; if not, write the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 19 */ 20 #include "xfs.h" 21 #include "xfs_fs.h" 22 #include "xfs_shared.h" 23 #include "xfs_format.h" 24 #include "xfs_trans_resv.h" 25 #include "xfs_mount.h" 26 #include "xfs_defer.h" 27 #include "xfs_btree.h" 28 #include "xfs_bit.h" 29 #include "xfs_log_format.h" 30 #include "xfs_trans.h" 31 #include "xfs_sb.h" 32 #include "xfs_inode.h" 33 #include "xfs_icache.h" 34 #include "xfs_itable.h" 35 #include "xfs_alloc.h" 36 #include "xfs_alloc_btree.h" 37 #include "xfs_bmap.h" 38 #include "xfs_bmap_btree.h" 39 #include "xfs_ialloc.h" 40 #include "xfs_ialloc_btree.h" 41 #include "xfs_refcount.h" 42 #include "xfs_refcount_btree.h" 43 #include "xfs_rmap.h" 44 #include "xfs_rmap_btree.h" 45 #include "xfs_quota.h" 46 #include "xfs_qm.h" 47 #include "xfs_errortag.h" 48 #include "xfs_error.h" 49 #include "xfs_log.h" 50 #include "xfs_trans_priv.h" 51 #include "scrub/xfs_scrub.h" 52 #include "scrub/scrub.h" 53 #include "scrub/common.h" 54 #include "scrub/trace.h" 55 #include "scrub/btree.h" 56 #include "scrub/repair.h" 57 58 /* 59 * Online Scrub and Repair 60 * 61 * Traditionally, XFS (the kernel driver) did not know how to check or 62 * repair on-disk data structures. That task was left to the xfs_check 63 * and xfs_repair tools, both of which require taking the filesystem 64 * offline for a thorough but time consuming examination. Online 65 * scrub & repair, on the other hand, enables us to check the metadata 66 * for obvious errors while carefully stepping around the filesystem's 67 * ongoing operations, locking rules, etc. 68 * 69 * Given that most XFS metadata consist of records stored in a btree, 70 * most of the checking functions iterate the btree blocks themselves 71 * looking for irregularities. When a record block is encountered, each 72 * record can be checked for obviously bad values. Record values can 73 * also be cross-referenced against other btrees to look for potential 74 * misunderstandings between pieces of metadata. 75 * 76 * It is expected that the checkers responsible for per-AG metadata 77 * structures will lock the AG headers (AGI, AGF, AGFL), iterate the 78 * metadata structure, and perform any relevant cross-referencing before 79 * unlocking the AG and returning the results to userspace. These 80 * scrubbers must not keep an AG locked for too long to avoid tying up 81 * the block and inode allocators. 82 * 83 * Block maps and b-trees rooted in an inode present a special challenge 84 * because they can involve extents from any AG. The general scrubber 85 * structure of lock -> check -> xref -> unlock still holds, but AG 86 * locking order rules /must/ be obeyed to avoid deadlocks. The 87 * ordering rule, of course, is that we must lock in increasing AG 88 * order. Helper functions are provided to track which AG headers we've 89 * already locked. If we detect an imminent locking order violation, we 90 * can signal a potential deadlock, in which case the scrubber can jump 91 * out to the top level, lock all the AGs in order, and retry the scrub. 92 * 93 * For file data (directories, extended attributes, symlinks) scrub, we 94 * can simply lock the inode and walk the data. For btree data 95 * (directories and attributes) we follow the same btree-scrubbing 96 * strategy outlined previously to check the records. 97 * 98 * We use a bit of trickery with transactions to avoid buffer deadlocks 99 * if there is a cycle in the metadata. The basic problem is that 100 * travelling down a btree involves locking the current buffer at each 101 * tree level. If a pointer should somehow point back to a buffer that 102 * we've already examined, we will deadlock due to the second buffer 103 * locking attempt. Note however that grabbing a buffer in transaction 104 * context links the locked buffer to the transaction. If we try to 105 * re-grab the buffer in the context of the same transaction, we avoid 106 * the second lock attempt and continue. Between the verifier and the 107 * scrubber, something will notice that something is amiss and report 108 * the corruption. Therefore, each scrubber will allocate an empty 109 * transaction, attach buffers to it, and cancel the transaction at the 110 * end of the scrub run. Cancelling a non-dirty transaction simply 111 * unlocks the buffers. 112 * 113 * There are four pieces of data that scrub can communicate to 114 * userspace. The first is the error code (errno), which can be used to 115 * communicate operational errors in performing the scrub. There are 116 * also three flags that can be set in the scrub context. If the data 117 * structure itself is corrupt, the CORRUPT flag will be set. If 118 * the metadata is correct but otherwise suboptimal, the PREEN flag 119 * will be set. 120 * 121 * We perform secondary validation of filesystem metadata by 122 * cross-referencing every record with all other available metadata. 123 * For example, for block mapping extents, we verify that there are no 124 * records in the free space and inode btrees corresponding to that 125 * space extent and that there is a corresponding entry in the reverse 126 * mapping btree. Inconsistent metadata is noted by setting the 127 * XCORRUPT flag; btree query function errors are noted by setting the 128 * XFAIL flag and deleting the cursor to prevent further attempts to 129 * cross-reference with a defective btree. 130 * 131 * If a piece of metadata proves corrupt or suboptimal, the userspace 132 * program can ask the kernel to apply some tender loving care (TLC) to 133 * the metadata object by setting the REPAIR flag and re-calling the 134 * scrub ioctl. "Corruption" is defined by metadata violating the 135 * on-disk specification; operations cannot continue if the violation is 136 * left untreated. It is possible for XFS to continue if an object is 137 * "suboptimal", however performance may be degraded. Repairs are 138 * usually performed by rebuilding the metadata entirely out of 139 * redundant metadata. Optimizing, on the other hand, can sometimes be 140 * done without rebuilding entire structures. 141 * 142 * Generally speaking, the repair code has the following code structure: 143 * Lock -> scrub -> repair -> commit -> re-lock -> re-scrub -> unlock. 144 * The first check helps us figure out if we need to rebuild or simply 145 * optimize the structure so that the rebuild knows what to do. The 146 * second check evaluates the completeness of the repair; that is what 147 * is reported to userspace. 148 */ 149 150 /* 151 * Scrub probe -- userspace uses this to probe if we're willing to scrub 152 * or repair a given mountpoint. This will be used by xfs_scrub to 153 * probe the kernel's abilities to scrub (and repair) the metadata. We 154 * do this by validating the ioctl inputs from userspace, preparing the 155 * filesystem for a scrub (or a repair) operation, and immediately 156 * returning to userspace. Userspace can use the returned errno and 157 * structure state to decide (in broad terms) if scrub/repair are 158 * supported by the running kernel. 159 */ 160 static int 161 xfs_scrub_probe( 162 struct xfs_scrub_context *sc) 163 { 164 int error = 0; 165 166 if (xfs_scrub_should_terminate(sc, &error)) 167 return error; 168 169 return 0; 170 } 171 172 /* Scrub setup and teardown */ 173 174 /* Free all the resources and finish the transactions. */ 175 STATIC int 176 xfs_scrub_teardown( 177 struct xfs_scrub_context *sc, 178 struct xfs_inode *ip_in, 179 int error) 180 { 181 xfs_scrub_ag_free(sc, &sc->sa); 182 if (sc->tp) { 183 if (error == 0 && (sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR)) 184 error = xfs_trans_commit(sc->tp); 185 else 186 xfs_trans_cancel(sc->tp); 187 sc->tp = NULL; 188 } 189 if (sc->ip) { 190 if (sc->ilock_flags) 191 xfs_iunlock(sc->ip, sc->ilock_flags); 192 if (sc->ip != ip_in && 193 !xfs_internal_inum(sc->mp, sc->ip->i_ino)) 194 iput(VFS_I(sc->ip)); 195 sc->ip = NULL; 196 } 197 if (sc->has_quotaofflock) 198 mutex_unlock(&sc->mp->m_quotainfo->qi_quotaofflock); 199 if (sc->buf) { 200 kmem_free(sc->buf); 201 sc->buf = NULL; 202 } 203 return error; 204 } 205 206 /* Scrubbing dispatch. */ 207 208 static const struct xfs_scrub_meta_ops meta_scrub_ops[] = { 209 [XFS_SCRUB_TYPE_PROBE] = { /* ioctl presence test */ 210 .type = ST_NONE, 211 .setup = xfs_scrub_setup_fs, 212 .scrub = xfs_scrub_probe, 213 .repair = xfs_repair_probe, 214 }, 215 [XFS_SCRUB_TYPE_SB] = { /* superblock */ 216 .type = ST_PERAG, 217 .setup = xfs_scrub_setup_fs, 218 .scrub = xfs_scrub_superblock, 219 .repair = xfs_repair_superblock, 220 }, 221 [XFS_SCRUB_TYPE_AGF] = { /* agf */ 222 .type = ST_PERAG, 223 .setup = xfs_scrub_setup_fs, 224 .scrub = xfs_scrub_agf, 225 .repair = xfs_repair_notsupported, 226 }, 227 [XFS_SCRUB_TYPE_AGFL]= { /* agfl */ 228 .type = ST_PERAG, 229 .setup = xfs_scrub_setup_fs, 230 .scrub = xfs_scrub_agfl, 231 .repair = xfs_repair_notsupported, 232 }, 233 [XFS_SCRUB_TYPE_AGI] = { /* agi */ 234 .type = ST_PERAG, 235 .setup = xfs_scrub_setup_fs, 236 .scrub = xfs_scrub_agi, 237 .repair = xfs_repair_notsupported, 238 }, 239 [XFS_SCRUB_TYPE_BNOBT] = { /* bnobt */ 240 .type = ST_PERAG, 241 .setup = xfs_scrub_setup_ag_allocbt, 242 .scrub = xfs_scrub_bnobt, 243 .repair = xfs_repair_notsupported, 244 }, 245 [XFS_SCRUB_TYPE_CNTBT] = { /* cntbt */ 246 .type = ST_PERAG, 247 .setup = xfs_scrub_setup_ag_allocbt, 248 .scrub = xfs_scrub_cntbt, 249 .repair = xfs_repair_notsupported, 250 }, 251 [XFS_SCRUB_TYPE_INOBT] = { /* inobt */ 252 .type = ST_PERAG, 253 .setup = xfs_scrub_setup_ag_iallocbt, 254 .scrub = xfs_scrub_inobt, 255 .repair = xfs_repair_notsupported, 256 }, 257 [XFS_SCRUB_TYPE_FINOBT] = { /* finobt */ 258 .type = ST_PERAG, 259 .setup = xfs_scrub_setup_ag_iallocbt, 260 .scrub = xfs_scrub_finobt, 261 .has = xfs_sb_version_hasfinobt, 262 .repair = xfs_repair_notsupported, 263 }, 264 [XFS_SCRUB_TYPE_RMAPBT] = { /* rmapbt */ 265 .type = ST_PERAG, 266 .setup = xfs_scrub_setup_ag_rmapbt, 267 .scrub = xfs_scrub_rmapbt, 268 .has = xfs_sb_version_hasrmapbt, 269 .repair = xfs_repair_notsupported, 270 }, 271 [XFS_SCRUB_TYPE_REFCNTBT] = { /* refcountbt */ 272 .type = ST_PERAG, 273 .setup = xfs_scrub_setup_ag_refcountbt, 274 .scrub = xfs_scrub_refcountbt, 275 .has = xfs_sb_version_hasreflink, 276 .repair = xfs_repair_notsupported, 277 }, 278 [XFS_SCRUB_TYPE_INODE] = { /* inode record */ 279 .type = ST_INODE, 280 .setup = xfs_scrub_setup_inode, 281 .scrub = xfs_scrub_inode, 282 .repair = xfs_repair_notsupported, 283 }, 284 [XFS_SCRUB_TYPE_BMBTD] = { /* inode data fork */ 285 .type = ST_INODE, 286 .setup = xfs_scrub_setup_inode_bmap, 287 .scrub = xfs_scrub_bmap_data, 288 .repair = xfs_repair_notsupported, 289 }, 290 [XFS_SCRUB_TYPE_BMBTA] = { /* inode attr fork */ 291 .type = ST_INODE, 292 .setup = xfs_scrub_setup_inode_bmap, 293 .scrub = xfs_scrub_bmap_attr, 294 .repair = xfs_repair_notsupported, 295 }, 296 [XFS_SCRUB_TYPE_BMBTC] = { /* inode CoW fork */ 297 .type = ST_INODE, 298 .setup = xfs_scrub_setup_inode_bmap, 299 .scrub = xfs_scrub_bmap_cow, 300 .repair = xfs_repair_notsupported, 301 }, 302 [XFS_SCRUB_TYPE_DIR] = { /* directory */ 303 .type = ST_INODE, 304 .setup = xfs_scrub_setup_directory, 305 .scrub = xfs_scrub_directory, 306 .repair = xfs_repair_notsupported, 307 }, 308 [XFS_SCRUB_TYPE_XATTR] = { /* extended attributes */ 309 .type = ST_INODE, 310 .setup = xfs_scrub_setup_xattr, 311 .scrub = xfs_scrub_xattr, 312 .repair = xfs_repair_notsupported, 313 }, 314 [XFS_SCRUB_TYPE_SYMLINK] = { /* symbolic link */ 315 .type = ST_INODE, 316 .setup = xfs_scrub_setup_symlink, 317 .scrub = xfs_scrub_symlink, 318 .repair = xfs_repair_notsupported, 319 }, 320 [XFS_SCRUB_TYPE_PARENT] = { /* parent pointers */ 321 .type = ST_INODE, 322 .setup = xfs_scrub_setup_parent, 323 .scrub = xfs_scrub_parent, 324 .repair = xfs_repair_notsupported, 325 }, 326 [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ 327 .type = ST_FS, 328 .setup = xfs_scrub_setup_rt, 329 .scrub = xfs_scrub_rtbitmap, 330 .has = xfs_sb_version_hasrealtime, 331 .repair = xfs_repair_notsupported, 332 }, 333 [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ 334 .type = ST_FS, 335 .setup = xfs_scrub_setup_rt, 336 .scrub = xfs_scrub_rtsummary, 337 .has = xfs_sb_version_hasrealtime, 338 .repair = xfs_repair_notsupported, 339 }, 340 [XFS_SCRUB_TYPE_UQUOTA] = { /* user quota */ 341 .type = ST_FS, 342 .setup = xfs_scrub_setup_quota, 343 .scrub = xfs_scrub_quota, 344 .repair = xfs_repair_notsupported, 345 }, 346 [XFS_SCRUB_TYPE_GQUOTA] = { /* group quota */ 347 .type = ST_FS, 348 .setup = xfs_scrub_setup_quota, 349 .scrub = xfs_scrub_quota, 350 .repair = xfs_repair_notsupported, 351 }, 352 [XFS_SCRUB_TYPE_PQUOTA] = { /* project quota */ 353 .type = ST_FS, 354 .setup = xfs_scrub_setup_quota, 355 .scrub = xfs_scrub_quota, 356 .repair = xfs_repair_notsupported, 357 }, 358 }; 359 360 /* This isn't a stable feature, warn once per day. */ 361 static inline void 362 xfs_scrub_experimental_warning( 363 struct xfs_mount *mp) 364 { 365 static struct ratelimit_state scrub_warning = RATELIMIT_STATE_INIT( 366 "xfs_scrub_warning", 86400 * HZ, 1); 367 ratelimit_set_flags(&scrub_warning, RATELIMIT_MSG_ON_RELEASE); 368 369 if (__ratelimit(&scrub_warning)) 370 xfs_alert(mp, 371 "EXPERIMENTAL online scrub feature in use. Use at your own risk!"); 372 } 373 374 static int 375 xfs_scrub_validate_inputs( 376 struct xfs_mount *mp, 377 struct xfs_scrub_metadata *sm) 378 { 379 int error; 380 const struct xfs_scrub_meta_ops *ops; 381 382 error = -EINVAL; 383 /* Check our inputs. */ 384 sm->sm_flags &= ~XFS_SCRUB_FLAGS_OUT; 385 if (sm->sm_flags & ~XFS_SCRUB_FLAGS_IN) 386 goto out; 387 /* sm_reserved[] must be zero */ 388 if (memchr_inv(sm->sm_reserved, 0, sizeof(sm->sm_reserved))) 389 goto out; 390 391 error = -ENOENT; 392 /* Do we know about this type of metadata? */ 393 if (sm->sm_type >= XFS_SCRUB_TYPE_NR) 394 goto out; 395 ops = &meta_scrub_ops[sm->sm_type]; 396 if (ops->setup == NULL || ops->scrub == NULL) 397 goto out; 398 /* Does this fs even support this type of metadata? */ 399 if (ops->has && !ops->has(&mp->m_sb)) 400 goto out; 401 402 error = -EINVAL; 403 /* restricting fields must be appropriate for type */ 404 switch (ops->type) { 405 case ST_NONE: 406 case ST_FS: 407 if (sm->sm_ino || sm->sm_gen || sm->sm_agno) 408 goto out; 409 break; 410 case ST_PERAG: 411 if (sm->sm_ino || sm->sm_gen || 412 sm->sm_agno >= mp->m_sb.sb_agcount) 413 goto out; 414 break; 415 case ST_INODE: 416 if (sm->sm_agno || (sm->sm_gen && !sm->sm_ino)) 417 goto out; 418 break; 419 default: 420 goto out; 421 } 422 423 error = -EOPNOTSUPP; 424 /* 425 * We won't scrub any filesystem that doesn't have the ability 426 * to record unwritten extents. The option was made default in 427 * 2003, removed from mkfs in 2007, and cannot be disabled in 428 * v5, so if we find a filesystem without this flag it's either 429 * really old or totally unsupported. Avoid it either way. 430 * We also don't support v1-v3 filesystems, which aren't 431 * mountable. 432 */ 433 if (!xfs_sb_version_hasextflgbit(&mp->m_sb)) 434 goto out; 435 436 /* 437 * We only want to repair read-write v5+ filesystems. Defer the check 438 * for ops->repair until after our scrub confirms that we need to 439 * perform repairs so that we avoid failing due to not supporting 440 * repairing an object that doesn't need repairs. 441 */ 442 if (sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) { 443 error = -EOPNOTSUPP; 444 if (!xfs_sb_version_hascrc(&mp->m_sb)) 445 goto out; 446 447 error = -EROFS; 448 if (mp->m_flags & XFS_MOUNT_RDONLY) 449 goto out; 450 } 451 452 error = 0; 453 out: 454 return error; 455 } 456 457 #ifdef CONFIG_XFS_ONLINE_REPAIR 458 static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) 459 { 460 /* 461 * Userspace asked us to repair something, we repaired it, rescanned 462 * it, and the rescan says it's still broken. Scream about this in 463 * the system logs. 464 */ 465 if ((sc->sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && 466 (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 467 XFS_SCRUB_OFLAG_XCORRUPT))) 468 xfs_repair_failure(sc->mp); 469 } 470 #else 471 static inline void xfs_scrub_postmortem(struct xfs_scrub_context *sc) 472 { 473 /* 474 * Userspace asked us to scrub something, it's broken, and we have no 475 * way of fixing it. Scream in the logs. 476 */ 477 if (sc->sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 478 XFS_SCRUB_OFLAG_XCORRUPT)) 479 xfs_alert_ratelimited(sc->mp, 480 "Corruption detected during scrub."); 481 } 482 #endif /* CONFIG_XFS_ONLINE_REPAIR */ 483 484 /* Dispatch metadata scrubbing. */ 485 int 486 xfs_scrub_metadata( 487 struct xfs_inode *ip, 488 struct xfs_scrub_metadata *sm) 489 { 490 struct xfs_scrub_context sc; 491 struct xfs_mount *mp = ip->i_mount; 492 bool try_harder = false; 493 bool already_fixed = false; 494 int error = 0; 495 496 BUILD_BUG_ON(sizeof(meta_scrub_ops) != 497 (sizeof(struct xfs_scrub_meta_ops) * XFS_SCRUB_TYPE_NR)); 498 499 trace_xfs_scrub_start(ip, sm, error); 500 501 /* Forbidden if we are shut down or mounted norecovery. */ 502 error = -ESHUTDOWN; 503 if (XFS_FORCED_SHUTDOWN(mp)) 504 goto out; 505 error = -ENOTRECOVERABLE; 506 if (mp->m_flags & XFS_MOUNT_NORECOVERY) 507 goto out; 508 509 error = xfs_scrub_validate_inputs(mp, sm); 510 if (error) 511 goto out; 512 513 xfs_scrub_experimental_warning(mp); 514 515 retry_op: 516 /* Set up for the operation. */ 517 memset(&sc, 0, sizeof(sc)); 518 sc.mp = ip->i_mount; 519 sc.sm = sm; 520 sc.ops = &meta_scrub_ops[sm->sm_type]; 521 sc.try_harder = try_harder; 522 sc.sa.agno = NULLAGNUMBER; 523 error = sc.ops->setup(&sc, ip); 524 if (error) 525 goto out_teardown; 526 527 /* Scrub for errors. */ 528 error = sc.ops->scrub(&sc); 529 if (!try_harder && error == -EDEADLOCK) { 530 /* 531 * Scrubbers return -EDEADLOCK to mean 'try harder'. 532 * Tear down everything we hold, then set up again with 533 * preparation for worst-case scenarios. 534 */ 535 error = xfs_scrub_teardown(&sc, ip, 0); 536 if (error) 537 goto out; 538 try_harder = true; 539 goto retry_op; 540 } else if (error) 541 goto out_teardown; 542 543 if ((sc.sm->sm_flags & XFS_SCRUB_IFLAG_REPAIR) && !already_fixed) { 544 bool needs_fix; 545 546 /* Let debug users force us into the repair routines. */ 547 if (XFS_TEST_ERROR(false, mp, XFS_ERRTAG_FORCE_SCRUB_REPAIR)) 548 sc.sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; 549 550 needs_fix = (sc.sm->sm_flags & (XFS_SCRUB_OFLAG_CORRUPT | 551 XFS_SCRUB_OFLAG_XCORRUPT | 552 XFS_SCRUB_OFLAG_PREEN)); 553 /* 554 * If userspace asked for a repair but it wasn't necessary, 555 * report that back to userspace. 556 */ 557 if (!needs_fix) { 558 sc.sm->sm_flags |= XFS_SCRUB_OFLAG_NO_REPAIR_NEEDED; 559 goto out_nofix; 560 } 561 562 /* 563 * If it's broken, userspace wants us to fix it, and we haven't 564 * already tried to fix it, then attempt a repair. 565 */ 566 error = xfs_repair_attempt(ip, &sc, &already_fixed); 567 if (error == -EAGAIN) { 568 if (sc.try_harder) 569 try_harder = true; 570 error = xfs_scrub_teardown(&sc, ip, 0); 571 if (error) { 572 xfs_repair_failure(mp); 573 goto out; 574 } 575 goto retry_op; 576 } 577 } 578 579 out_nofix: 580 xfs_scrub_postmortem(&sc); 581 out_teardown: 582 error = xfs_scrub_teardown(&sc, ip, error); 583 out: 584 trace_xfs_scrub_done(ip, sm, error); 585 if (error == -EFSCORRUPTED || error == -EFSBADCRC) { 586 sm->sm_flags |= XFS_SCRUB_OFLAG_CORRUPT; 587 error = 0; 588 } 589 return error; 590 } 591