// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019-2023 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <djwong@kernel.org>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_log_format.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_health.h"
#include "xfs_btree.h"
#include "xfs_ag.h"
#include "xfs_rtalloc.h"
#include "xfs_inode.h"
#include "xfs_icache.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
#include "scrub/trace.h"

/*
 * FS Summary Counters
 * ===================
 *
 * The basics of filesystem summary counter checking are that we iterate the
 * AGs counting the number of free blocks, free space btree blocks, per-AG
 * reservations, inodes, delayed allocation reservations, and free inodes.
 * Then we compare what we computed against the in-core counters.
 *
 * However, the reality is that summary counters are a tricky beast to check.
 * While we /could/ freeze the filesystem and scramble around the AGs counting
 * the free blocks, in practice we prefer not to do that for a scan because
 * freezing is costly.  To get around this, we added a per-cpu counter of the
 * delalloc reservations so that we can rotor around the AGs relatively
 * quickly, and we allow the counts to be slightly off because we're not taking
 * any locks while we do this.
 *
 * So the first thing we do is warm up the buffer cache in the setup routine by
 * walking all the AGs to make sure the incore per-AG structure has been
 * initialized.  The expected value calculation then iterates the incore per-AG
 * structures as quickly as it can.
We snapshot the percpu counters before and 47 * after this operation and use the difference in counter values to guess at 48 * our tolerance for mismatch between expected and actual counter values. 49 */ 50 51 struct xchk_fscounters { 52 struct xfs_scrub *sc; 53 uint64_t icount; 54 uint64_t ifree; 55 uint64_t fdblocks; 56 uint64_t frextents; 57 unsigned long long icount_min; 58 unsigned long long icount_max; 59 bool frozen; 60 }; 61 62 /* 63 * Since the expected value computation is lockless but only browses incore 64 * values, the percpu counters should be fairly close to each other. However, 65 * we'll allow ourselves to be off by at least this (arbitrary) amount. 66 */ 67 #define XCHK_FSCOUNT_MIN_VARIANCE (512) 68 69 /* 70 * Make sure the per-AG structure has been initialized from the on-disk header 71 * contents and trust that the incore counters match the ondisk counters. (The 72 * AGF and AGI scrubbers check them, and a normal xfs_scrub run checks the 73 * summary counters after checking all AG headers). Do this from the setup 74 * function so that the inner AG aggregation loop runs as quickly as possible. 75 * 76 * This function runs during the setup phase /before/ we start checking any 77 * metadata. 78 */ 79 STATIC int 80 xchk_fscount_warmup( 81 struct xfs_scrub *sc) 82 { 83 struct xfs_mount *mp = sc->mp; 84 struct xfs_buf *agi_bp = NULL; 85 struct xfs_buf *agf_bp = NULL; 86 struct xfs_perag *pag = NULL; 87 xfs_agnumber_t agno; 88 int error = 0; 89 90 for_each_perag(mp, agno, pag) { 91 if (xchk_should_terminate(sc, &error)) 92 break; 93 if (xfs_perag_initialised_agi(pag) && 94 xfs_perag_initialised_agf(pag)) 95 continue; 96 97 /* Lock both AG headers. */ 98 error = xfs_ialloc_read_agi(pag, sc->tp, &agi_bp); 99 if (error) 100 break; 101 error = xfs_alloc_read_agf(pag, sc->tp, 0, &agf_bp); 102 if (error) 103 break; 104 105 /* 106 * These are supposed to be initialized by the header read 107 * function. 
108 */ 109 if (!xfs_perag_initialised_agi(pag) || 110 !xfs_perag_initialised_agf(pag)) { 111 error = -EFSCORRUPTED; 112 break; 113 } 114 115 xfs_buf_relse(agf_bp); 116 agf_bp = NULL; 117 xfs_buf_relse(agi_bp); 118 agi_bp = NULL; 119 } 120 121 if (agf_bp) 122 xfs_buf_relse(agf_bp); 123 if (agi_bp) 124 xfs_buf_relse(agi_bp); 125 if (pag) 126 xfs_perag_rele(pag); 127 return error; 128 } 129 130 static inline int 131 xchk_fsfreeze( 132 struct xfs_scrub *sc) 133 { 134 int error; 135 136 error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 137 trace_xchk_fsfreeze(sc, error); 138 return error; 139 } 140 141 static inline int 142 xchk_fsthaw( 143 struct xfs_scrub *sc) 144 { 145 int error; 146 147 /* This should always succeed, we have a kernel freeze */ 148 error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); 149 trace_xchk_fsthaw(sc, error); 150 return error; 151 } 152 153 /* 154 * We couldn't stabilize the filesystem long enough to sample all the variables 155 * that comprise the summary counters and compare them to the percpu counters. 156 * We need to disable all writer threads, which means taking the first two 157 * freeze levels to put userspace to sleep, and the third freeze level to 158 * prevent background threads from starting new transactions. Take one level 159 * more to prevent other callers from unfreezing the filesystem while we run. 160 */ 161 STATIC int 162 xchk_fscounters_freeze( 163 struct xfs_scrub *sc) 164 { 165 struct xchk_fscounters *fsc = sc->buf; 166 int error = 0; 167 168 if (sc->flags & XCHK_HAVE_FREEZE_PROT) { 169 sc->flags &= ~XCHK_HAVE_FREEZE_PROT; 170 mnt_drop_write_file(sc->file); 171 } 172 173 /* Try to grab a kernel freeze. */ 174 while ((error = xchk_fsfreeze(sc)) == -EBUSY) { 175 if (xchk_should_terminate(sc, &error)) 176 return error; 177 178 delay(HZ / 10); 179 } 180 if (error) 181 return error; 182 183 fsc->frozen = true; 184 return 0; 185 } 186 187 /* Thaw the filesystem after checking or repairing fscounters. 
*/ 188 STATIC void 189 xchk_fscounters_cleanup( 190 void *buf) 191 { 192 struct xchk_fscounters *fsc = buf; 193 struct xfs_scrub *sc = fsc->sc; 194 int error; 195 196 if (!fsc->frozen) 197 return; 198 199 error = xchk_fsthaw(sc); 200 if (error) 201 xfs_emerg(sc->mp, "still frozen after scrub, err=%d", error); 202 else 203 fsc->frozen = false; 204 } 205 206 int 207 xchk_setup_fscounters( 208 struct xfs_scrub *sc) 209 { 210 struct xchk_fscounters *fsc; 211 int error; 212 213 /* 214 * If the AGF doesn't track btreeblks, we have to lock the AGF to count 215 * btree block usage by walking the actual btrees. 216 */ 217 if (!xfs_has_lazysbcount(sc->mp)) 218 xchk_fsgates_enable(sc, XCHK_FSGATES_DRAIN); 219 220 sc->buf = kzalloc(sizeof(struct xchk_fscounters), XCHK_GFP_FLAGS); 221 if (!sc->buf) 222 return -ENOMEM; 223 sc->buf_cleanup = xchk_fscounters_cleanup; 224 fsc = sc->buf; 225 fsc->sc = sc; 226 227 xfs_icount_range(sc->mp, &fsc->icount_min, &fsc->icount_max); 228 229 /* We must get the incore counters set up before we can proceed. */ 230 error = xchk_fscount_warmup(sc); 231 if (error) 232 return error; 233 234 /* 235 * Pause all writer activity in the filesystem while we're scrubbing to 236 * reduce the likelihood of background perturbations to the counters 237 * throwing off our calculations. 238 */ 239 if (sc->flags & XCHK_TRY_HARDER) { 240 error = xchk_fscounters_freeze(sc); 241 if (error) 242 return error; 243 } 244 245 return xfs_trans_alloc_empty(sc->mp, &sc->tp); 246 } 247 248 /* 249 * Part 1: Collecting filesystem summary counts. For each AG, we add its 250 * summary counts (total inodes, free inodes, free data blocks) to an incore 251 * copy of the overall filesystem summary counts. 252 * 253 * To avoid false corruption reports in part 2, any failure in this part must 254 * set the INCOMPLETE flag even when a negative errno is returned. This care 255 * must be taken with certain errno values (i.e. 
EFSBADCRC, EFSCORRUPTED, 256 * ECANCELED) that are absorbed into a scrub state flag update by 257 * xchk_*_process_error. 258 */ 259 260 /* Count free space btree blocks manually for pre-lazysbcount filesystems. */ 261 static int 262 xchk_fscount_btreeblks( 263 struct xfs_scrub *sc, 264 struct xchk_fscounters *fsc, 265 xfs_agnumber_t agno) 266 { 267 xfs_extlen_t blocks; 268 int error; 269 270 error = xchk_ag_init_existing(sc, agno, &sc->sa); 271 if (error) 272 goto out_free; 273 274 error = xfs_btree_count_blocks(sc->sa.bno_cur, &blocks); 275 if (error) 276 goto out_free; 277 fsc->fdblocks += blocks - 1; 278 279 error = xfs_btree_count_blocks(sc->sa.cnt_cur, &blocks); 280 if (error) 281 goto out_free; 282 fsc->fdblocks += blocks - 1; 283 284 out_free: 285 xchk_ag_free(sc, &sc->sa); 286 return error; 287 } 288 289 /* 290 * Calculate what the global in-core counters ought to be from the incore 291 * per-AG structure. Callers can compare this to the actual in-core counters 292 * to estimate by how much both in-core and on-disk counters need to be 293 * adjusted. 294 */ 295 STATIC int 296 xchk_fscount_aggregate_agcounts( 297 struct xfs_scrub *sc, 298 struct xchk_fscounters *fsc) 299 { 300 struct xfs_mount *mp = sc->mp; 301 struct xfs_perag *pag; 302 uint64_t delayed; 303 xfs_agnumber_t agno; 304 int tries = 8; 305 int error = 0; 306 307 retry: 308 fsc->icount = 0; 309 fsc->ifree = 0; 310 fsc->fdblocks = 0; 311 312 for_each_perag(mp, agno, pag) { 313 if (xchk_should_terminate(sc, &error)) 314 break; 315 316 /* This somehow got unset since the warmup? 
*/ 317 if (!xfs_perag_initialised_agi(pag) || 318 !xfs_perag_initialised_agf(pag)) { 319 error = -EFSCORRUPTED; 320 break; 321 } 322 323 /* Count all the inodes */ 324 fsc->icount += pag->pagi_count; 325 fsc->ifree += pag->pagi_freecount; 326 327 /* Add up the free/freelist/bnobt/cntbt blocks */ 328 fsc->fdblocks += pag->pagf_freeblks; 329 fsc->fdblocks += pag->pagf_flcount; 330 if (xfs_has_lazysbcount(sc->mp)) { 331 fsc->fdblocks += pag->pagf_btreeblks; 332 } else { 333 error = xchk_fscount_btreeblks(sc, fsc, agno); 334 if (error) 335 break; 336 } 337 338 /* 339 * Per-AG reservations are taken out of the incore counters, 340 * so they must be left out of the free blocks computation. 341 */ 342 fsc->fdblocks -= pag->pag_meta_resv.ar_reserved; 343 fsc->fdblocks -= pag->pag_rmapbt_resv.ar_orig_reserved; 344 345 } 346 if (pag) 347 xfs_perag_rele(pag); 348 if (error) { 349 xchk_set_incomplete(sc); 350 return error; 351 } 352 353 /* 354 * The global incore space reservation is taken from the incore 355 * counters, so leave that out of the computation. 356 */ 357 fsc->fdblocks -= mp->m_resblks_avail; 358 359 /* 360 * Delayed allocation reservations are taken out of the incore counters 361 * but not recorded on disk, so leave them and their indlen blocks out 362 * of the computation. 363 */ 364 delayed = percpu_counter_sum(&mp->m_delalloc_blks); 365 fsc->fdblocks -= delayed; 366 367 trace_xchk_fscounters_calc(mp, fsc->icount, fsc->ifree, fsc->fdblocks, 368 delayed); 369 370 371 /* Bail out if the values we compute are totally nonsense. */ 372 if (fsc->icount < fsc->icount_min || fsc->icount > fsc->icount_max || 373 fsc->fdblocks > mp->m_sb.sb_dblocks || 374 fsc->ifree > fsc->icount_max) 375 return -EFSCORRUPTED; 376 377 /* 378 * If ifree > icount then we probably had some perturbation in the 379 * counters while we were calculating things. We'll try a few times 380 * to maintain ifree <= icount before giving up. 
381 */ 382 if (fsc->ifree > fsc->icount) { 383 if (tries--) 384 goto retry; 385 return -EDEADLOCK; 386 } 387 388 return 0; 389 } 390 391 #ifdef CONFIG_XFS_RT 392 STATIC int 393 xchk_fscount_add_frextent( 394 struct xfs_mount *mp, 395 struct xfs_trans *tp, 396 const struct xfs_rtalloc_rec *rec, 397 void *priv) 398 { 399 struct xchk_fscounters *fsc = priv; 400 int error = 0; 401 402 fsc->frextents += rec->ar_extcount; 403 404 xchk_should_terminate(fsc->sc, &error); 405 return error; 406 } 407 408 /* Calculate the number of free realtime extents from the realtime bitmap. */ 409 STATIC int 410 xchk_fscount_count_frextents( 411 struct xfs_scrub *sc, 412 struct xchk_fscounters *fsc) 413 { 414 struct xfs_mount *mp = sc->mp; 415 int error; 416 417 fsc->frextents = 0; 418 if (!xfs_has_realtime(mp)) 419 return 0; 420 421 xfs_ilock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 422 error = xfs_rtalloc_query_all(sc->mp, sc->tp, 423 xchk_fscount_add_frextent, fsc); 424 if (error) { 425 xchk_set_incomplete(sc); 426 goto out_unlock; 427 } 428 429 out_unlock: 430 xfs_iunlock(sc->mp->m_rbmip, XFS_ILOCK_SHARED | XFS_ILOCK_RTBITMAP); 431 return error; 432 } 433 #else 434 STATIC int 435 xchk_fscount_count_frextents( 436 struct xfs_scrub *sc, 437 struct xchk_fscounters *fsc) 438 { 439 fsc->frextents = 0; 440 return 0; 441 } 442 #endif /* CONFIG_XFS_RT */ 443 444 /* 445 * Part 2: Comparing filesystem summary counters. All we have to do here is 446 * sum the percpu counters and compare them to what we've observed. 447 */ 448 449 /* 450 * Is the @counter reasonably close to the @expected value? 451 * 452 * We neither locked nor froze anything in the filesystem while aggregating the 453 * per-AG data to compute the @expected value, which means that the counter 454 * could have changed. We know the @old_value of the summation of the counter 455 * before the aggregation, and we re-sum the counter now. If the expected 456 * value falls between the two summations, we're ok. 
457 * 458 * Otherwise, we /might/ have a problem. If the change in the summations is 459 * more than we want to tolerate, the filesystem is probably busy and we should 460 * just send back INCOMPLETE and see if userspace will try again. 461 * 462 * If we're repairing then we require an exact match. 463 */ 464 static inline bool 465 xchk_fscount_within_range( 466 struct xfs_scrub *sc, 467 const int64_t old_value, 468 struct percpu_counter *counter, 469 uint64_t expected) 470 { 471 int64_t min_value, max_value; 472 int64_t curr_value = percpu_counter_sum(counter); 473 474 trace_xchk_fscounters_within_range(sc->mp, expected, curr_value, 475 old_value); 476 477 /* Negative values are always wrong. */ 478 if (curr_value < 0) 479 return false; 480 481 /* Exact matches are always ok. */ 482 if (curr_value == expected) 483 return true; 484 485 min_value = min(old_value, curr_value); 486 max_value = max(old_value, curr_value); 487 488 /* Within the before-and-after range is ok. */ 489 if (expected >= min_value && expected <= max_value) 490 return true; 491 492 /* Everything else is bad. */ 493 return false; 494 } 495 496 /* Check the superblock counters. */ 497 int 498 xchk_fscounters( 499 struct xfs_scrub *sc) 500 { 501 struct xfs_mount *mp = sc->mp; 502 struct xchk_fscounters *fsc = sc->buf; 503 int64_t icount, ifree, fdblocks, frextents; 504 bool try_again = false; 505 int error; 506 507 /* Snapshot the percpu counters. */ 508 icount = percpu_counter_sum(&mp->m_icount); 509 ifree = percpu_counter_sum(&mp->m_ifree); 510 fdblocks = percpu_counter_sum(&mp->m_fdblocks); 511 frextents = percpu_counter_sum(&mp->m_frextents); 512 513 /* No negative values, please! 
*/ 514 if (icount < 0 || ifree < 0) 515 xchk_set_corrupt(sc); 516 517 /* 518 * If the filesystem is not frozen, the counter summation calls above 519 * can race with xfs_mod_freecounter, which subtracts a requested space 520 * reservation from the counter and undoes the subtraction if that made 521 * the counter go negative. Therefore, it's possible to see negative 522 * values here, and we should only flag that as a corruption if we 523 * froze the fs. This is much more likely to happen with frextents 524 * since there are no reserved pools. 525 */ 526 if (fdblocks < 0 || frextents < 0) { 527 if (!fsc->frozen) 528 return -EDEADLOCK; 529 530 xchk_set_corrupt(sc); 531 return 0; 532 } 533 534 /* See if icount is obviously wrong. */ 535 if (icount < fsc->icount_min || icount > fsc->icount_max) 536 xchk_set_corrupt(sc); 537 538 /* See if fdblocks is obviously wrong. */ 539 if (fdblocks > mp->m_sb.sb_dblocks) 540 xchk_set_corrupt(sc); 541 542 /* See if frextents is obviously wrong. */ 543 if (frextents > mp->m_sb.sb_rextents) 544 xchk_set_corrupt(sc); 545 546 /* 547 * If ifree exceeds icount by more than the minimum variance then 548 * something's probably wrong with the counters. 549 */ 550 if (ifree > icount && ifree - icount > XCHK_FSCOUNT_MIN_VARIANCE) 551 xchk_set_corrupt(sc); 552 553 /* Walk the incore AG headers to calculate the expected counters. */ 554 error = xchk_fscount_aggregate_agcounts(sc, fsc); 555 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 556 return error; 557 558 /* Count the free extents counter for rt volumes. */ 559 error = xchk_fscount_count_frextents(sc, fsc); 560 if (!xchk_process_error(sc, 0, XFS_SB_BLOCK(mp), &error)) 561 return error; 562 if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_INCOMPLETE) 563 return 0; 564 565 /* 566 * Compare the in-core counters with whatever we counted. If the fs is 567 * frozen, we treat the discrepancy as a corruption because the freeze 568 * should have stabilized the counter values. 
Otherwise, we need 569 * userspace to call us back having granted us freeze permission. 570 */ 571 if (!xchk_fscount_within_range(sc, icount, &mp->m_icount, 572 fsc->icount)) { 573 if (fsc->frozen) 574 xchk_set_corrupt(sc); 575 else 576 try_again = true; 577 } 578 579 if (!xchk_fscount_within_range(sc, ifree, &mp->m_ifree, fsc->ifree)) { 580 if (fsc->frozen) 581 xchk_set_corrupt(sc); 582 else 583 try_again = true; 584 } 585 586 if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, 587 fsc->fdblocks)) { 588 if (fsc->frozen) 589 xchk_set_corrupt(sc); 590 else 591 try_again = true; 592 } 593 594 if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, 595 fsc->frextents)) { 596 if (fsc->frozen) 597 xchk_set_corrupt(sc); 598 else 599 try_again = true; 600 } 601 602 if (try_again) 603 return -EDEADLOCK; 604 605 return 0; 606 } 607